From aeac2a40fa3cc19a9739be4a036714280e1d68c9 Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Thu, 23 Apr 2026 19:45:04 +0000 Subject: [PATCH 1/7] support named entries.yaml files --- .../datasets/DataSetLoaderSimpleMFD.java | 37 +++++++++++++++++-- .../datasets/DataSetLoaderSimpleMFDTest.java | 29 +++++++++++++++ 2 files changed, 62 insertions(+), 4 deletions(-) diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderSimpleMFD.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderSimpleMFD.java index 5582e27e8..ec26c2191 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderSimpleMFD.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderSimpleMFD.java @@ -628,10 +628,7 @@ private static CatalogEntry buildCatalogEntry(Map fields, Path c // expand ${VAR} references in all field values var resolved = resolveEnvVars(fields); - String baseUrl = resolved.get("base_url"); - if (baseUrl != null && !baseUrl.endsWith("/")) { - baseUrl = baseUrl + "/"; - } + String baseUrl = normalizeBaseUrl(resolved.get("base_url")); // resolve cache_dir: entry field > DATASET_CACHE_DIR env var > catalog file's directory Path cacheDir; @@ -650,6 +647,38 @@ private static CatalogEntry buildCatalogEntry(Map fields, Path c return new CatalogEntry(resolved, cacheDir, baseUrl, source); } + /// Normalizes a user-supplied {@code base_url} to a directory URL ending in {@code /}. + /// + /// Two forms are accepted: + /// - A directory URL (with or without trailing slash): {@code s3://bucket/path} or + /// {@code s3://bucket/path/} → {@code s3://bucket/path/} + /// - A URL whose final segment names a specific entries/catalog file (e.g. + /// {@code knn_entries.yaml}): the filename is stripped so the parent directory + /// becomes the path root for per-entry relative filenames. 
The "named file" + /// form is retained as a valid way to record which remote catalog the entries + /// correspond to. + /// + /// Heuristic: if the URL does not end with {@code /} and its final path segment + /// contains a {@code .}, that segment is treated as a filename and stripped. + /// Otherwise a trailing {@code /} is appended. + /// + /// Returns {@code null} unchanged. + static String normalizeBaseUrl(String baseUrl) { + if (baseUrl == null || baseUrl.endsWith("/")) { + return baseUrl; + } + int lastSlash = baseUrl.lastIndexOf('/'); + if (lastSlash < 0) { + return baseUrl + "/"; + } + String lastSegment = baseUrl.substring(lastSlash + 1); + if (lastSegment.indexOf('.') >= 0) { + // last segment looks like a filename; strip it and keep the directory prefix + return baseUrl.substring(0, lastSlash + 1); + } + return baseUrl + "/"; + } + /// Matches {@code ${VAR}} and {@code ${VAR:-default}} syntax. private static final java.util.regex.Pattern ENV_VAR_PATTERN = java.util.regex.Pattern.compile("\\$\\{([^:}]+)(?::-((?:[^}]*)?))?}"); diff --git a/jvector-examples/src/test/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderSimpleMFDTest.java b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderSimpleMFDTest.java index 33379dd57..fc38cc3fa 100644 --- a/jvector-examples/src/test/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderSimpleMFDTest.java +++ b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderSimpleMFDTest.java @@ -528,6 +528,35 @@ public void anyYamlFileIsDiscovered() throws IOException { "Any .yaml file should be discovered"); } + // ======================================================================== + // base_url normalization + // ======================================================================== + + @Test + public void normalizeBaseUrl_nullAndTrailingSlashArePreserved() { + 
assertNull(DataSetLoaderSimpleMFD.normalizeBaseUrl(null)); + assertEquals("s3://bucket/path/", + DataSetLoaderSimpleMFD.normalizeBaseUrl("s3://bucket/path/")); + } + + @Test + public void normalizeBaseUrl_directoryWithoutTrailingSlashGetsSlash() { + assertEquals("s3://bucket/path/", + DataSetLoaderSimpleMFD.normalizeBaseUrl("s3://bucket/path")); + assertEquals("https://example.com/data/", + DataSetLoaderSimpleMFD.normalizeBaseUrl("https://example.com/data")); + } + + @Test + public void normalizeBaseUrl_specificallyNamedEntriesFileIsStrippedToParent() { + // Pointing base_url at a specifically-named catalog file is a valid form; + // the loader resolves entry filenames against the parent directory. + assertEquals("s3://bucket/sift1m/", + DataSetLoaderSimpleMFD.normalizeBaseUrl("s3://bucket/sift1m/knn_entries.yaml")); + assertEquals("https://example.com/path/", + DataSetLoaderSimpleMFD.normalizeBaseUrl("https://example.com/path/catalog.yml")); + } + @Test public void ymlExtensionAlsoDiscovered() throws IOException { Files.writeString(cacheDir.resolve("datasets.yml"), From d6b52c95e63f218486235123a5146a2e1f607bf4 Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Thu, 23 Apr 2026 19:45:08 +0000 Subject: [PATCH 2/7] scaffolding for additional partitioned testing --- .../sift-128-euclidean-c5.yml | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 jvector-examples/yaml-configs/index-parameters/sift-128-euclidean-c5.yml diff --git a/jvector-examples/yaml-configs/index-parameters/sift-128-euclidean-c5.yml b/jvector-examples/yaml-configs/index-parameters/sift-128-euclidean-c5.yml new file mode 100644 index 000000000..f045bbab7 --- /dev/null +++ b/jvector-examples/yaml-configs/index-parameters/sift-128-euclidean-c5.yml @@ -0,0 +1,46 @@ +# This is here for loader testing purposes only, use other datasets for realistic test data +# This file captures the equivalent test parameters which would be used by C*5.0.7 with the version of jvector then. 
+# This is not a direct 1:1 test, and results should be taken as anecdotal at best, given changes across versions. +# to use this file, copy its contents into the file which matches the dataset name: sift-128-euclidean.yml + +yamlSchemaVersion: 1 +onDiskIndexVersion: 6 + +dataset: sift-128-euclidean + +construction: + outDegree: [16] + efConstruction: [100] + neighborOverflow: [1.2f] + alpha: [1.4f] + addHierarchy: [Yes] + refineFinalGraph: [Yes] + fusedGraph: [No] + compression: + - type: PQ + parameters: + # m: 192 # we can either specify the integer m or the integer mFactor. In this case, m will be set to the data dimensionality divided by mFactor + mFactor: 2 + # k: 256 # optional parameter. By default, k=256 + centerData: No + anisotropicThreshold: -1.0 # optional parameter. By default, anisotropicThreshold=-1 (i.e., no anisotropy) + reranking: + - NVQ + useSavedIndexIfExists: No + +search: + topKOverquery: +# optimize_for: LATENCY (K=10 -> topK=16, K=100 -> topK=110) +# 10: [1.68] +# 100: [1.10] +# optimize_for: RECALL (K=10 -> topK=42, K=100 -> topK=200) + 10: [4.27] + 100: [2.00] + useSearchPruning: [No] + compression: + - type: None + queryRuns: 3 +repetitions: 3 + +also_for: + - "sift1m:label_*" \ No newline at end of file From e189e9f395e1d52f2cbfa34e8673c403e706fbde Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Thu, 23 Apr 2026 19:45:12 +0000 Subject: [PATCH 3/7] multiple grid repetitions/query runs --- .../jvector/example/AutoBenchYAML.java | 34 ++- .../github/jbellis/jvector/example/Grid.java | 19 +- .../jvector/example/HelloVectorWorld.java | 4 + .../datasets/DataSetMetadataReader.java | 11 +- .../reporting/ExperimentsSchemaV1.java | 3 + .../example/reporting/RunArtifacts.java | 22 ++ .../example/reporting/RunSummaryPrinter.java | 239 ++++++++++++++++++ .../example/util/OnDiskGraphIndexCache.java | 40 ++- .../jvector/example/yaml/MultiConfig.java | 193 ++++++++++++-- .../example/yaml/SearchParameters.java | 13 + 
.../datasets/DataSetPropertiesTest.java | 56 ++++ .../yaml-configs/dataset-metadata.yml | 5 +- 12 files changed, 593 insertions(+), 46 deletions(-) create mode 100644 jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/RunSummaryPrinter.java diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java index 6e805d03a..fb2c5a930 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java @@ -140,18 +140,28 @@ public static void main(String[] args) throws IOException { } logger.info("Using configuration: {}", config); - List datasetResults = Grid.runAllAndCollectResults(ds, - config.construction.useSavedIndexIfExists, - config.construction.outDegree, - config.construction.efConstruction, - config.construction.neighborOverflow, - config.construction.addHierarchy, - config.construction.refineFinalGraph, - config.construction.getFeatureSets(), - config.construction.getCompressorParameters(), - config.search.getCompressorParameters(), - config.search.topKOverquery, - config.search.useSearchPruning); + List datasetResults = new ArrayList<>(); + int repetitions = config.repetitionsOrOne(); + for (int rep = 0; rep < repetitions; rep++) { + if (repetitions > 1) { + logger.info("{}: repetition {} of {}", datasetName, rep + 1, repetitions); + } + datasetResults.addAll(Grid.runAllAndCollectResults(ds, + config.construction.useSavedIndexIfExists, + config.construction.outDegree, + config.construction.efConstruction, + config.construction.neighborOverflow, + config.construction.alpha, + config.construction.addHierarchy, + config.construction.refineFinalGraph, + config.construction.getFeatureSets(), + config.construction.getCompressorParameters(), + config.search.getCompressorParameters(), + 
config.search.topKOverquery, + config.search.useSearchPruning, + config.search.queryRunsOrDefault(), + rep)); + } results.addAll(datasetResults); logger.info("Benchmark completed for dataset: {}", datasetName); diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java index ce9d62c1b..90a2fc8c4 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java @@ -107,6 +107,8 @@ static void runAll(DataSet ds, List> compressionGrid, Map> topKGrid, List usePruningGrid, + int queryRuns, + int repetition, RunArtifacts artifacts ) throws IOException { @@ -184,6 +186,8 @@ static void runAll(DataSet ds, compressionGrid, topKGrid, usePruningGrid, + SearchParameters.DEFAULT_QUERY_RUNS, + 0, // repetition RunArtifacts.disabled() // legacy callers do not use reporting ); } @@ -223,6 +227,8 @@ static void runOneGraph(OnDiskGraphIndexCache cache, List> compressionGrid, Map> topKGrid, List usePruningGrid, + int queryRuns, + int repetition, RunArtifacts artifacts, DataSet ds, Path workDirectory) throws IOException @@ -617,12 +623,13 @@ private static void testConfiguration(ConfiguredSystem cs, float neighborOverflow, boolean addHierarchy, boolean refineFinalGraph, + int queryRuns, + int repetition, Set featureSetForIndex, String buildCompressorString, RunArtifacts artifacts, ConstructionMetrics constructionMetrics, Path testDirectory) { - int queryRuns = 2; System.out.format("%s: Using %s:%n", cs.ds.getName(), cs.index); Map> benchmarksToCompute = artifacts.benchmarksToCompute(); @@ -656,7 +663,8 @@ private static void testConfiguration(ConfiguredSystem cs, "efConstruction", efConstruction, "neighborOverflow", neighborOverflow, "addHierarchy", addHierarchy, - "refineFinalGraph", refineFinalGraph + "refineFinalGraph", refineFinalGraph, + "repetition", repetition ), ordered( 
// Query configuration "usePruning", usePruning @@ -696,6 +704,7 @@ private static void testConfiguration(ConfiguredSystem cs, neighborOverflow, addHierarchy, refineFinalGraph, + repetition, featureSetForIndex, buildCompressorString, searchCompressorString, @@ -819,7 +828,9 @@ public static List runAllAndCollectResults( List> buildCompressors, List> compressionGrid, Map> topKGrid, - List usePruningGrid) throws IOException { + List usePruningGrid, + int queryRuns, + int repetition) throws IOException { // Initialize index caching (if enabled) final OnDiskGraphIndexCache cache = @@ -915,7 +926,6 @@ public static List runAllAndCollectResults( DiskUsageMonitor.MultiDirectorySnapshot buildDiskSnapshot = diagnostics.getLatestDiskSnapshot(); try (ConfiguredSystem cs = new ConfiguredSystem(ds, index, cvArg, features)) { - int queryRuns = 2; List benchmarks = List.of( (diagnostic_level > 0 ? ThroughputBenchmark.createDefault().withDiagnostics(getDiagnosticLevel()) : @@ -937,6 +947,7 @@ public static List runAllAndCollectResults( params.put("neighborOverflow", neighborOverflow); params.put("addHierarchy", addHierarchy); params.put("refineFinalGraph", refineFinalGraph); + params.put("repetition", repetition); params.put("features", features.toString()); params.put("buildCompressor", buildCompressor.toString()); params.put("searchCompressor", searchCompressor.toString()); diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java index ea4752e4b..bc1f12d77 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java @@ -57,6 +57,10 @@ public static void main(String[] args) throws IOException { config.search.getCompressorParameters(), config.search.topKOverquery, config.search.useSearchPruning, + 
config.search.queryRunsOrDefault(), + 0, artifacts); + + artifacts.printSummary(); } } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetMetadataReader.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetMetadataReader.java index 3207e492f..fabf9fb22 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetMetadataReader.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetMetadataReader.java @@ -24,9 +24,12 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.Optional; +import java.util.regex.Pattern; /// Reads dataset metadata from a multi-entry YAML file and provides keyed lookups /// for {@link DataSetProperties}. @@ -123,8 +126,12 @@ private Optional> findEntry(String datasetKey) { return Optional.of(entry); } - if (datasetKey.endsWith(".hdf5")) { - return Optional.ofNullable(metadata.get(datasetKey.substring(0, datasetKey.length() - ".hdf5".length()))); + String alternate = datasetKey.endsWith(".hdf5") + ? 
datasetKey.substring(0, datasetKey.length() - ".hdf5".length()) + : datasetKey + ".hdf5"; + entry = metadata.get(alternate); + if (entry != null) { + return Optional.of(entry); } return Optional.ofNullable(metadata.get(datasetKey + ".hdf5")); diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/ExperimentsSchemaV1.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/ExperimentsSchemaV1.java index f3e728ddf..7a3e53adc 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/ExperimentsSchemaV1.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/ExperimentsSchemaV1.java @@ -58,6 +58,7 @@ public static List fixedColumns() { "neighborOverflow", "addHierarchy", "refineFinalGraph", + "repetition", "feature_set", "build_compressor", "search_compressor", @@ -80,6 +81,7 @@ public static Map fixedValues(RunContext run, float neighborOverflow, boolean addHierarchy, boolean refineFinalGraph, + int repetition, Set featureSet, String buildCompressorString, String searchCompressorString, @@ -99,6 +101,7 @@ public static Map fixedValues(RunContext run, fixed.put("neighborOverflow", Float.toString(neighborOverflow)); fixed.put("addHierarchy", Boolean.toString(addHierarchy)); fixed.put("refineFinalGraph", Boolean.toString(refineFinalGraph)); + fixed.put("repetition", Integer.toString(repetition)); fixed.put("feature_set", featureSet == null ? 
"" : featureSet.toString()); fixed.put("build_compressor", csvEscape(buildCompressorString)); fixed.put("search_compressor", csvEscape(searchCompressorString)); diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/RunArtifacts.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/RunArtifacts.java index 5cb2ebcde..bddea244b 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/RunArtifacts.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/RunArtifacts.java @@ -190,6 +190,7 @@ public void logRow(String datasetName, float neighborOverflow, boolean addHierarchy, boolean refineFinalGraph, + int repetition, Set featureSetForIndex, String buildCompressorString, String searchCompressorString, @@ -210,6 +211,7 @@ public void logRow(String datasetName, neighborOverflow, addHierarchy, refineFinalGraph, + repetition, featureSetForIndex, buildCompressorString, searchCompressorString, @@ -243,4 +245,24 @@ public void registerDataset(String datasetName, DataSet ds) throws IOException { datasetInfoWriter.register(DatasetInfoWriter.fromDataSet(datasetName, "", "", "", ds)); } + + /** + * Print a console summary grouped by everything except {@code repetition}, + * aggregating each metric column into count/mean/stddev/cv. + * + *

<p>Reads from the experiments.csv written by this run. If logging is disabled + * (no writer), emits a one-line notice instead — per the "require logging" policy.</p>

+ */ + public void printSummary() { + if (experimentsWriter == null) { + System.out.println("Summary skipped: experiments logging is disabled. " + + "Enable the 'logging:' block in run-config.yml to see run summaries."); + return; + } + try { + RunSummaryPrinter.print(experimentsWriter.path(), System.out); + } catch (IOException e) { + throw new UncheckedIOException("Failed to produce run summary", e); + } + } } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/RunSummaryPrinter.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/RunSummaryPrinter.java new file mode 100644 index 000000000..2e658e1f6 --- /dev/null +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/RunSummaryPrinter.java @@ -0,0 +1,239 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.github.jbellis.jvector.example.reporting; + +import java.io.IOException; +import java.io.PrintStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; + +/** + * Reads a written {@code experiments.csv} file and prints a console summary + * that collapses rows across the {@code repetition} dimension. 
For each group + * (identical fixed columns excluding {@code repetition}), metric columns are + * aggregated into count, mean, stddev, and coefficient of variation. + * + *

<p>Only used when experiments logging is enabled; otherwise the caller should + * print a one-line notice instead.</p>

+ */ +public final class RunSummaryPrinter { + + /** Fixed columns that are NOT part of the grouping key. */ + private static final Set NON_GROUPING_FIXED = Set.of( + "schema_version", + "run_id", + "run_uuid", + "system_id", + "repetition" + ); + + private RunSummaryPrinter() {} + + /** + * Read {@code experiments.csv} and print a grouped summary to {@code out}. + */ + public static void print(Path experimentsCsv, PrintStream out) throws IOException { + if (!Files.exists(experimentsCsv)) { + out.println("Summary skipped: " + experimentsCsv + " does not exist."); + return; + } + + List lines = Files.readAllLines(experimentsCsv, StandardCharsets.UTF_8); + if (lines.size() < 2) { + out.println("Summary skipped: " + experimentsCsv + " has no data rows."); + return; + } + + List header = parseCsvRow(lines.get(0)); + Set fixedCols = new LinkedHashSet<>(ExperimentsSchemaV1.fixedColumns()); + + List groupingCols = new ArrayList<>(); + List metricCols = new ArrayList<>(); + for (String col : header) { + if (fixedCols.contains(col)) { + if (!NON_GROUPING_FIXED.contains(col)) { + groupingCols.add(col); + } + } else { + metricCols.add(col); + } + } + + // group-key -> (metric-col -> accumulator) + Map groups = new LinkedHashMap<>(); + + for (int i = 1; i < lines.size(); i++) { + String line = lines.get(i); + if (line.isBlank()) continue; + List row = parseCsvRow(line); + if (row.size() != header.size()) { + // Skip malformed rows rather than crash + continue; + } + + Map byCol = new LinkedHashMap<>(); + for (int c = 0; c < header.size(); c++) { + byCol.put(header.get(c), row.get(c)); + } + + StringBuilder keyBuf = new StringBuilder(); + Map keyMap = new LinkedHashMap<>(); + for (String gc : groupingCols) { + String v = byCol.getOrDefault(gc, ""); + keyBuf.append(gc).append('=').append(v).append('\u001f'); + keyMap.put(gc, v); + } + String key = keyBuf.toString(); + + Group g = groups.computeIfAbsent(key, k -> new Group(keyMap, metricCols)); + g.observedRows++; + + for (String 
mc : metricCols) { + String raw = byCol.getOrDefault(mc, ""); + if (raw == null || raw.isBlank()) continue; + try { + double val = Double.parseDouble(raw); + g.accumulate(mc, val); + } catch (NumberFormatException ignored) { + // non-numeric metric cell; skip + } + } + } + + if (groups.isEmpty()) { + out.println("Summary skipped: no data rows in " + experimentsCsv + "."); + return; + } + + out.println(); + out.println("=== Summary (aggregated across repetitions) ==="); + out.println("Source: " + experimentsCsv); + out.println(); + + int groupIdx = 0; + for (Group g : groups.values()) { + groupIdx++; + out.printf("Group %d (N=%d rows)%n", groupIdx, g.observedRows); + for (Map.Entry e : g.keyMap.entrySet()) { + out.printf(" %-20s %s%n", e.getKey(), e.getValue()); + } + for (String mc : metricCols) { + Accumulator acc = g.metrics.get(mc); + if (acc == null || acc.count == 0) continue; + double mean = acc.mean(); + double std = acc.sampleStddev(); + double cv = (mean != 0.0) ? (std / Math.abs(mean)) * 100.0 : 0.0; + out.printf(Locale.ROOT, + " %-40s n=%d mean=%.6g std=%.6g cv=%.2f%%%n", + mc, acc.count, mean, std, cv); + } + out.println(); + } + } + + private static final class Group { + final Map keyMap; + final Map metrics; + int observedRows; + + Group(Map keyMap, List metricCols) { + this.keyMap = keyMap; + this.metrics = new LinkedHashMap<>(); + for (String mc : metricCols) { + metrics.put(mc, new Accumulator()); + } + } + + void accumulate(String metricCol, double value) { + metrics.get(metricCol).add(value); + } + } + + /** Welford's online mean/variance. */ + private static final class Accumulator { + int count; + double mean; + double m2; + + void add(double x) { + count++; + double delta = x - mean; + mean += delta / count; + double delta2 = x - mean; + m2 += delta * delta2; + } + + double mean() { return mean; } + + double sampleStddev() { + return (count < 2) ? 
0.0 : Math.sqrt(m2 / (count - 1)); + } + } + + // --------------------------- + // Minimal CSV row parser + // --------------------------- + // Handles: + // - double-quote-escaped fields (with ""-doubling) + // - commas inside quoted fields + // Does NOT handle embedded newlines inside quoted fields, which the writer does not produce. + private static List parseCsvRow(String line) { + List out = new ArrayList<>(); + StringBuilder cur = new StringBuilder(); + boolean inQuotes = false; + int i = 0; + while (i < line.length()) { + char c = line.charAt(i); + if (inQuotes) { + if (c == '"') { + if (i + 1 < line.length() && line.charAt(i + 1) == '"') { + cur.append('"'); + i += 2; + continue; + } + inQuotes = false; + i++; + continue; + } + cur.append(c); + i++; + } else { + if (c == ',') { + out.add(cur.toString()); + cur.setLength(0); + i++; + } else if (c == '"') { + inQuotes = true; + i++; + } else { + cur.append(c); + i++; + } + } + } + out.add(cur.toString()); + return out; + } +} diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/OnDiskGraphIndexCache.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/OnDiskGraphIndexCache.java index ae4a57e1d..8b2484204 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/OnDiskGraphIndexCache.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/OnDiskGraphIndexCache.java @@ -29,6 +29,8 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.StandardCopyOption; +import java.util.ArrayList; +import java.util.List; import java.util.Objects; import java.util.Optional; import java.util.Set; @@ -128,6 +130,7 @@ public static final class CacheKey { public final float alpha; public final boolean addHierarchy; public final boolean refineFinalGraph; + public final int repetition; public final String compressorId; private CacheKey(String datasetName, @@ -138,6 +141,7 @@ private CacheKey(String 
datasetName, float alpha, boolean addHierarchy, boolean refineFinalGraph, + int repetition, String compressorId) { this.datasetName = datasetName; this.featureSet = featureSet; @@ -147,6 +151,7 @@ private CacheKey(String datasetName, this.alpha = alpha; this.addHierarchy = addHierarchy; this.refineFinalGraph = refineFinalGraph; + this.repetition = repetition; this.compressorId = compressorId; } } @@ -154,6 +159,10 @@ private CacheKey(String datasetName, /** * Convenience factory for a {@link CacheKey}. Uses {@code buildCompressor.toString()} (whitespace removed) * as the compressor id, matching existing naming conventions. + * + *

<p>The {@code repetition} index participates in the signature so that callers using + * {@code repetitions > 1} can force a fresh build per repetition even when the on-disk + * cache is otherwise enabled.</p>

*/ public CacheKey key(String datasetName, Set featureSet, @@ -163,12 +172,13 @@ public CacheKey key(String datasetName, float alpha, boolean addHierarchy, boolean refineFinalGraph, + int repetition, VectorCompressor buildCompressor) { Objects.requireNonNull(datasetName, "datasetName"); Objects.requireNonNull(featureSet, "featureSet"); Objects.requireNonNull(buildCompressor, "buildCompressor"); String compressorId = buildCompressor.toString().replaceAll("\\s+", ""); - return new CacheKey(datasetName, featureSet, M, efConstruction, neighborOverflow, alpha, addHierarchy, refineFinalGraph, compressorId); + return new CacheKey(datasetName, featureSet, M, efConstruction, neighborOverflow, alpha, addHierarchy, refineFinalGraph, repetition, compressorId); } /** @@ -336,17 +346,23 @@ static Entry compute(Path cacheDir, CacheKey key) { .sorted() .collect(Collectors.joining("-")); - String signature = String.join("_", - datasetBase, - featureSetName, - "M" + key.M, - "ef" + key.efConstruction, - "of" + key.neighborOverflow, - "alpha" + key.alpha, - key.addHierarchy ? "H1" : "H0", - key.refineFinalGraph ? "R1" : "R0", - key.compressorId - ); + // Include "rep" segment only when repetition > 0 so signatures collapse to + // the pre-repetition format when the repetitions knob is unused. This preserves + // compatibility with previously cached indexes. + List parts = new ArrayList<>(); + parts.add(datasetBase); + parts.add(featureSetName); + parts.add("M" + key.M); + parts.add("ef" + key.efConstruction); + parts.add("of" + key.neighborOverflow); + parts.add("alpha" + key.alpha); + parts.add(key.addHierarchy ? "H1" : "H0"); + parts.add(key.refineFinalGraph ? 
"R1" : "R0"); + if (key.repetition > 0) { + parts.add("rep" + key.repetition); + } + parts.add(key.compressorId); + String signature = String.join("_", parts); String finalName = sanitizePathComponent("graph_" + signature); Path finalPath = cacheDir.resolve(finalName); diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/MultiConfig.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/MultiConfig.java index 6e56a6e26..0092409e5 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/MultiConfig.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/MultiConfig.java @@ -37,6 +37,33 @@ public class MultiConfig { public SearchParameters search; public String dataset; + /** + * Number of times to repeat the full (build + search) run for this dataset. + * Absent or < 1 is treated as 1. Each repetition produces its own set of + * experiments.csv rows tagged with a 0-based repetition index. + */ + public Integer repetitions; + + public int repetitionsOrOne() { + return (repetitions == null || repetitions < 1) ? 1 : repetitions; + } + + /** + * Additional dataset names this config should apply to when no dedicated + * {@code .yml} file exists under the index-parameters directory. + * + *

<p>Each entry may be either an exact dataset name or a shell-style glob + * containing {@code *} (any characters) or {@code ?} (single character). + * Example: {@code "sift1m:label_*"} matches {@code sift1m:label_00} through + * {@code sift1m:label_11}.</p>

+ * + *

<p>Resolution order in {@link #getDefaultConfig(String)}: exact-filename config file + * first, then exact also_for match, then glob also_for match, then {@code default.yml}. + * The matching config's {@link #dataset} is overwritten with the requested name so + * downstream code loads the right dataset.</p>

+ */ + public List also_for; + private static final String defaultDirectory = "jvector-examples/yaml-configs/index-parameters/"; private static final java.util.regex.Pattern YAML_SCHEMA_VERSION_KEY = java.util.regex.Pattern.compile("(?m)^\\s*yamlSchemaVersion\\s*:"); @@ -51,27 +78,152 @@ public class MultiConfig { new java.util.concurrent.atomic.AtomicReference<>(); public static MultiConfig getDefaultConfig(String datasetName) throws FileNotFoundException { + // 1. Direct filename match: .yml var name = defaultDirectory + datasetName; if (!name.endsWith(".yml")) { name += ".yml"; } File configFile = new File(name); - boolean useDefault = !configFile.exists(); - if (useDefault) { - configFile = new File(defaultDirectory + "default.yml"); + if (configFile.exists()) { + return getConfig(configFile); + } - // Record fallback usage for a compact summary print later - DEFAULT_FILE_USED.compareAndSet(null, configFile); - DEFAULT_USED_FOR_DATASETS.add(datasetName); + // 2. Scan index-parameters/*.yml for a config whose also_for list contains datasetName + File aliased = findConfigByAlsoFor(datasetName); + if (aliased != null) { + var config = getConfig(aliased); + // Overwrite dataset so downstream code loads the requested dataset, not the source config's + config.dataset = datasetName; + ALSO_FOR_USAGE.put(datasetName, aliased); + return config; } + // 3. Fall back to default.yml + configFile = new File(defaultDirectory + "default.yml"); + DEFAULT_FILE_USED.compareAndSet(null, configFile); + DEFAULT_USED_FOR_DATASETS.add(datasetName); + var config = getConfig(configFile); + config.dataset = datasetName; + return config; + } - if (useDefault) { - config.dataset = datasetName; + /** + * Records which dataset names were served via {@code also_for} and which file provided the config. + * Exposed for logging/telemetry. 
+ */ + private static final java.util.concurrent.ConcurrentHashMap ALSO_FOR_USAGE = + new java.util.concurrent.ConcurrentHashMap<>(); + + /** + * Cached alias index: exact-match map plus an ordered list of glob patterns. + * Built lazily on first lookup; scans fail-soft so a single malformed yaml does not break the world. + */ + private static volatile AliasIndex ALIAS_INDEX; + + private static final class AliasIndex { + final Map exact; + final List globs; + + AliasIndex(Map exact, List globs) { + this.exact = exact; + this.globs = globs; } - return config; + File find(String datasetName) { + File hit = exact.get(datasetName); + if (hit != null) return hit; + for (GlobEntry g : globs) { + if (g.pattern.matcher(datasetName).matches()) { + return g.file; + } + } + return null; + } + } + + private static final class GlobEntry { + final String glob; + final java.util.regex.Pattern pattern; + final File file; + + GlobEntry(String glob, java.util.regex.Pattern pattern, File file) { + this.glob = glob; + this.pattern = pattern; + this.file = file; + } + } + + private static File findConfigByAlsoFor(String datasetName) { + AliasIndex idx = ALIAS_INDEX; + if (idx == null) { + synchronized (MultiConfig.class) { + idx = ALIAS_INDEX; + if (idx == null) { + idx = buildAliasIndex(); + ALIAS_INDEX = idx; + } + } + } + return idx.find(datasetName); + } + + private static boolean isGlob(String alias) { + return alias.indexOf('*') >= 0 || alias.indexOf('?') >= 0; + } + + /** + * Converts a shell-style glob containing {@code *} and {@code ?} wildcards + * into an anchored regex pattern. All other regex metacharacters are escaped. 
+ */ + static java.util.regex.Pattern globToPattern(String glob) { + StringBuilder sb = new StringBuilder(glob.length() + 4); + sb.append('^'); + for (int i = 0; i < glob.length(); i++) { + char c = glob.charAt(i); + switch (c) { + case '*': sb.append(".*"); break; + case '?': sb.append('.'); break; + case '.': case '\\': case '+': case '(': case ')': + case '|': case '^': case '$': case '[': case ']': + case '{': case '}': + sb.append('\\').append(c); break; + default: sb.append(c); + } + } + sb.append('$'); + return java.util.regex.Pattern.compile(sb.toString()); + } + + private static AliasIndex buildAliasIndex() { + Map exact = new java.util.HashMap<>(); + List globs = new ArrayList<>(); + File dir = new File(defaultDirectory); + File[] yamls = dir.listFiles((d, n) -> n.endsWith(".yml") || n.endsWith(".yaml")); + if (yamls == null) return new AliasIndex(exact, globs); + for (File f : yamls) { + final MultiConfig cfg; + try { + cfg = getConfig(f); + } catch (Exception e) { + // Fail-soft: one bad yaml shouldn't block alias lookups for the rest + System.err.println("WARNING: failed to parse " + f.getName() + " during also_for index build: " + e.getMessage()); + continue; + } + if (cfg.also_for == null) continue; + for (String alias : cfg.also_for) { + if (isGlob(alias)) { + globs.add(new GlobEntry(alias, globToPattern(alias), f)); + } else { + File prev = exact.putIfAbsent(alias, f); + if (prev != null && !prev.equals(f)) { + System.err.println("WARNING: also_for alias '" + alias + "' declared by both " + + prev.getName() + " and " + f.getName() + "; keeping " + prev.getName()); + } + } + } + } + return new AliasIndex(exact, globs); } public static MultiConfig getConfig(String configName) throws FileNotFoundException { @@ -198,13 +350,24 @@ public void setOnDiskIndexVersion(int onDiskIndexVersion) { public static void printDefaultConfigUsageSummary() { File f = DEFAULT_FILE_USED.get(); - if (f == null) return; - - var datasets = new 
java.util.ArrayList<>(DEFAULT_USED_FOR_DATASETS); - if (datasets.isEmpty()) return; + if (f != null) { + var datasets = new java.util.ArrayList<>(DEFAULT_USED_FOR_DATASETS); + if (!datasets.isEmpty()) { + System.out.println("Default YAML used for datasets: " + wrapList(datasets, 6, " ")); + } + } - // Print a wrapped bracket-list similar to "Executing the following datasets" - System.out.println("Default YAML used for datasets: " + wrapList(datasets, 6, " ")); + if (!ALSO_FOR_USAGE.isEmpty()) { + // Group "dataset -> source yaml" reports by the source file for compact output + Map> bySource = new java.util.LinkedHashMap<>(); + for (var e : ALSO_FOR_USAGE.entrySet()) { + bySource.computeIfAbsent(e.getValue().getName(), k -> new ArrayList<>()).add(e.getKey()); + } + for (var entry : bySource.entrySet()) { + System.out.println("also_for in " + entry.getKey() + " applied to: " + + wrapList(entry.getValue(), 6, " ")); + } + } } private static String wrapList(java.util.List items, int perLine, String indent) { diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/SearchParameters.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/SearchParameters.java index dabce268a..a42ba412c 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/SearchParameters.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/SearchParameters.java @@ -23,6 +23,19 @@ public class SearchParameters extends CommonParameters { public Map> topKOverquery; public List useSearchPruning; + /** + * Number of times to repeat the query phase against a built index. Each + * benchmark (latency, throughput, count, accuracy) aggregates across this many runs. + * Absent or < 1 falls back to {@link #DEFAULT_QUERY_RUNS}. + */ + public Integer queryRuns; + + public static final int DEFAULT_QUERY_RUNS = 2; + + public int queryRunsOrDefault() { + return (queryRuns == null || queryRuns < 1) ? 
DEFAULT_QUERY_RUNS : queryRuns; + } + // NOTE: benchmark compute + console/logging selection are now run-level (run-config.yml) // and are no longer recognized in dataset configs. } diff --git a/jvector-examples/src/test/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetPropertiesTest.java b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetPropertiesTest.java index 13f2136aa..15595a9a0 100644 --- a/jvector-examples/src/test/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetPropertiesTest.java +++ b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetPropertiesTest.java @@ -315,6 +315,62 @@ public void metadataReaderNonexistentFile() { DataSetMetadataReader.load("/no/such/file.yml")); } + @Test + public void metadataReaderMatchesGlobKey() { + var reader = DataSetMetadataReader.load(testResource("glob_entries.yml")); + for (String k : new String[]{"sift1m:label_00", "sift1m:label_01", "sift1m:label_11"}) { + var props = reader.getProperties(k); + assertTrue(props.isPresent(), "Expected glob match for " + k); + assertEquals(k, props.get().getName()); + assertEquals(VectorSimilarityFunction.EUCLIDEAN, props.get().similarityFunction().orElse(null)); + assertEquals(DataSetProperties.LoadBehavior.NO_SCRUB, props.get().loadBehavior()); + } + } + + @Test + public void metadataReaderExactKeyPreferredOverGlob() { + // sift-128-euclidean is both an exact key and a potential glob match for "sift-*"-style patterns; + // exact match must win regardless. 
+ var reader = DataSetMetadataReader.load(testResource("glob_entries.yml")); + var props = reader.getProperties("sift-128-euclidean"); + assertTrue(props.isPresent()); + assertEquals(128000, props.get().numVectors()); + } + + @Test + public void metadataReaderQuestionMarkGlob() { + var reader = DataSetMetadataReader.load(testResource("glob_entries.yml")); + assertTrue(reader.getProperties("cohere-en").isPresent()); + // "cohere-eng" has three chars after the dash and should NOT match "cohere-??" + assertTrue(reader.getProperties("cohere-eng").isEmpty()); + } + + @Test + public void metadataReaderGlobMissReturnsEmpty() { + var reader = DataSetMetadataReader.load(testResource("glob_entries.yml")); + assertTrue(reader.getProperties("totally-unrelated-dataset").isEmpty()); + } + + @Test + public void metadataReaderAmbiguousGlobsThrow() { + var reader = DataSetMetadataReader.load(testResource("ambiguous_globs.yml")); + // "sift1m:label_00" matches BOTH "sift1m:label_*" and "sift1m:*_00" + var ex = assertThrows(IllegalStateException.class, + () -> reader.getProperties("sift1m:label_00")); + assertTrue(ex.getMessage().contains("sift1m:label_00"), ex.getMessage()); + assertTrue(ex.getMessage().contains("sift1m:label_*"), ex.getMessage()); + assertTrue(ex.getMessage().contains("sift1m:*_00"), ex.getMessage()); + } + + @Test + public void metadataReaderAmbiguousGlobsDoNotThrowForUnambiguousKeys() { + // Same file, but a key that only one glob claims should resolve cleanly. 
+ var reader = DataSetMetadataReader.load(testResource("ambiguous_globs.yml")); + var props = reader.getProperties("sift1m:label_05"); + assertTrue(props.isPresent()); + assertEquals(VectorSimilarityFunction.EUCLIDEAN, props.get().similarityFunction().orElse(null)); + } + // ======================================================================== // DataSetInfo delegation // ======================================================================== diff --git a/jvector-examples/yaml-configs/dataset-metadata.yml b/jvector-examples/yaml-configs/dataset-metadata.yml index 076ac7075..e8d263d7b 100644 --- a/jvector-examples/yaml-configs/dataset-metadata.yml +++ b/jvector-examples/yaml-configs/dataset-metadata.yml @@ -102,4 +102,7 @@ nytimes-256-angular: load_behavior: NO_SCRUB sift-128-euclidean: similarity_function: EUCLIDEAN - load_behavior: NO_SCRUB \ No newline at end of file + load_behavior: NO_SCRUB +"sift1m:label_*": + similarity_function: EUCLIDEAN + load_behavior: NO_SCRUB From 2a4bea025ce305e3c550b9b81368a311fd60bda8 Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Thu, 23 Apr 2026 19:45:17 +0000 Subject: [PATCH 4/7] glob support in dataset metadata --- .../datasets/DataSetMetadataReader.java | 75 ++++++++++++++++++- .../benchmarks/datasets/glob_entries.yml | 22 ++++++ 2 files changed, 96 insertions(+), 1 deletion(-) create mode 100644 jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/glob_entries.yml diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetMetadataReader.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetMetadataReader.java index fabf9fb22..40f6d247f 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetMetadataReader.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetMetadataReader.java @@ -52,15 +52,30 @@ 
/// /// Keys may or may not include file extensions (e.g. {@code .hdf5}). The lookup tries /// the exact key first, then falls back to the key with {@code .hdf5} appended. +/// +/// Keys may also contain glob wildcards (`*` matches any run of characters, `?` matches a single +/// character) which are tried as a fallback after exact and hdf5-suffix lookups fail. For example, +/// a YAML key `"sift1m:label_*"` will match lookups of `sift1m:label_00`, `sift1m:label_01`, etc. +/// A single glob key is expected to match many concrete dataset names — that is the point. However, +/// if two or more glob keys in the same metadata file both match the same concrete lookup, the +/// glob set is ambiguous for that lookup and {@link IllegalStateException} is thrown so the +/// ambiguity can be resolved in the metadata file. public class DataSetMetadataReader { private static final String DEFAULT_FILE = "jvector-examples/yaml-configs/dataset-metadata.yml"; private static final String MODULE_RELATIVE_DEFAULT_FILE = "yaml-configs/dataset-metadata.yml"; private final Map> metadata; + private final List globEntries; private DataSetMetadataReader(Map> metadata) { this.metadata = metadata != null ? metadata : Map.of(); + this.globEntries = new ArrayList<>(); + for (Map.Entry> e : this.metadata.entrySet()) { + if (containsGlob(e.getKey())) { + this.globEntries.add(new GlobEntry(e.getKey(), globToPattern(e.getKey()), e.getValue())); + } + } } /// Loads dataset metadata from the default file ({@code jvector-examples/yaml-configs/dataset-metadata.yml}). @@ -134,6 +149,64 @@ private Optional> findEntry(String datasetKey) { return Optional.of(entry); } - return Optional.ofNullable(metadata.get(datasetKey + ".hdf5")); + // Collect every glob key that matches this one concrete lookup. + // A glob is expected to match many different concrete keys across calls; what is NOT + // acceptable is having >1 glob claim ownership of the same concrete key on one lookup. 
+ List matches = new ArrayList<>(); + for (GlobEntry g : globEntries) { + if (g.pattern.matcher(datasetKey).matches() || g.pattern.matcher(alternate).matches()) { + matches.add(g); + } + } + if (matches.size() > 1) { + List globs = new ArrayList<>(); + for (GlobEntry g : matches) globs.add(g.glob); + throw new IllegalStateException( + "Ambiguous glob match for dataset key '" + datasetKey + "': " + globs); + } + if (matches.size() == 1) { + return Optional.of(matches.get(0).value); + } + + return Optional.empty(); + } + + private static boolean containsGlob(String s) { + return s.indexOf('*') >= 0 || s.indexOf('?') >= 0; + } + + /// Converts a glob pattern (supporting `*` and `?`) to a regex that matches whole strings. + /// All other characters are treated as literals. + static Pattern globToPattern(String glob) { + StringBuilder sb = new StringBuilder(glob.length() + 8); + int i = 0; + int literalStart = 0; + while (i < glob.length()) { + char c = glob.charAt(i); + if (c == '*' || c == '?') { + if (i > literalStart) { + sb.append(Pattern.quote(glob.substring(literalStart, i))); + } + sb.append(c == '*' ? 
".*" : "."); + literalStart = i + 1; + } + i++; + } + if (literalStart < glob.length()) { + sb.append(Pattern.quote(glob.substring(literalStart))); + } + return Pattern.compile(sb.toString()); + } + + private static final class GlobEntry { + final String glob; + final Pattern pattern; + final Map value; + + GlobEntry(String glob, Pattern pattern, Map value) { + this.glob = glob; + this.pattern = pattern; + this.value = value; + } } } diff --git a/jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/glob_entries.yml b/jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/glob_entries.yml new file mode 100644 index 000000000..9d16c53db --- /dev/null +++ b/jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/glob_entries.yml @@ -0,0 +1,22 @@ +# Copyright DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +sift-128-euclidean: + similarity_function: EUCLIDEAN + num_vectors: 128000 +"sift1m:label_*": + similarity_function: EUCLIDEAN + load_behavior: NO_SCRUB +"cohere-??": + similarity_function: COSINE From fbbc855d27e8e64dac5ffce2075133678eda3058 Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Thu, 23 Apr 2026 19:45:21 +0000 Subject: [PATCH 5/7] allow index parameters to be also_for others by pattern --- .../jbellis/jvector/example/BenchYAML.java | 41 +++++--- .../example/util/BenchArgExpander.java | 80 ++++++++++++++++ .../example/util/BenchArgExpanderTest.java | 77 +++++++++++++++ .../example/yaml/MultiConfigAlsoForTest.java | 95 +++++++++++++++++++ .../benchmarks/datasets/ambiguous_globs.yml | 18 ++++ jvector-examples/yaml-configs/datasets.yml | 16 +++- 6 files changed, 312 insertions(+), 15 deletions(-) create mode 100644 jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/BenchArgExpander.java create mode 100644 jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/BenchArgExpanderTest.java create mode 100644 jvector-examples/src/test/java/io/github/jbellis/jvector/example/yaml/MultiConfigAlsoForTest.java create mode 100644 jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/ambiguous_globs.yml diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java index e066a34dc..d517b3e73 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java @@ -20,6 +20,7 @@ import io.github.jbellis.jvector.example.benchmarks.datasets.DataSets; import io.github.jbellis.jvector.example.reporting.RunArtifacts; import io.github.jbellis.jvector.example.reporting.SearchReportingCatalog; +import io.github.jbellis.jvector.example.util.BenchArgExpander; import 
io.github.jbellis.jvector.example.yaml.DatasetCollection; import io.github.jbellis.jvector.example.yaml.MultiConfig; import io.github.jbellis.jvector.example.yaml.RunConfig; @@ -43,9 +44,10 @@ public static void main(String[] args) throws IOException { if (args == null) { throw new InvalidParameterException("argv[] is null, check your maven exec config"); } + // Expand range shorthand like sift1m:label_[00..11] into 12 separate tokens + args = BenchArgExpander.expandAll(args); String regex = Arrays.stream(args) .filter(Objects::nonNull) - .flatMap(s -> Arrays.stream(s.split("\\s"))) .filter(s -> !s.isEmpty()) .collect(Collectors.joining("|")); var pattern = Pattern.compile(regex.isEmpty() ? ".*" : regex); @@ -124,20 +126,31 @@ public static void main(String[] args) throws IOException { // Register dataset info the first time we actually load the dataset for benchmarking artifacts.registerDataset(datasetName, ds); - Grid.runAll(ds, - config.construction.useSavedIndexIfExists, - config.construction.outDegree, - config.construction.efConstruction, - config.construction.neighborOverflow, - config.construction.addHierarchy, - config.construction.refineFinalGraph, - config.construction.getFeatureSets(), - config.construction.getCompressorParameters(), - config.search.getCompressorParameters(), - config.search.topKOverquery, - config.search.useSearchPruning, - artifacts); + int repetitions = config.repetitionsOrOne(); + for (int rep = 0; rep < repetitions; rep++) { + if (repetitions > 1) { + System.out.printf("%s: repetition %d of %d%n", datasetName, rep + 1, repetitions); + } + Grid.runAll(ds, + config.construction.useSavedIndexIfExists, + config.construction.outDegree, + config.construction.efConstruction, + config.construction.neighborOverflow, + config.construction.alpha, + config.construction.addHierarchy, + config.construction.refineFinalGraph, + config.construction.getFeatureSets(), + config.construction.getCompressorParameters(), + 
config.search.getCompressorParameters(), + config.search.topKOverquery, + config.search.useSearchPruning, + config.search.queryRunsOrDefault(), + rep, + artifacts); + } } + + artifacts.printSummary(); } private static String wrapList(java.util.List items, int perLine, String indent) { diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/BenchArgExpander.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/BenchArgExpander.java new file mode 100644 index 000000000..75b063aa9 --- /dev/null +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/BenchArgExpander.java @@ -0,0 +1,80 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.github.jbellis.jvector.example.util; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Objects; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/// Expands shorthand numeric range patterns in benchArgs tokens. +/// +/// A token may embed one or more `[start..end]` ranges, e.g. +/// `sift1m:label_[00..11]` expands to `sift1m:label_00`, `sift1m:label_01`, ... `sift1m:label_11`. +/// The zero-padding width is the max of the two endpoint token lengths, so `[00..11]` pads to 2 +/// and `[1..9]` pads to 1. Ranges expand combinatorially when more than one appears in a token, +/// and reverse ranges (`[11..00]`) count down. 
+public final class BenchArgExpander { + + private static final Pattern RANGE = Pattern.compile("\\[(\\d+)\\.\\.(\\d+)\\]"); + + private BenchArgExpander() {} + + /// Expands a single token, returning one or more resulting tokens. + public static List expand(String token) { + Matcher m = RANGE.matcher(token); + if (!m.find()) { + return List.of(token); + } + String startStr = m.group(1); + String endStr = m.group(2); + int start = Integer.parseInt(startStr); + int end = Integer.parseInt(endStr); + int width = Math.max(startStr.length(), endStr.length()); + String prefix = token.substring(0, m.start()); + String suffix = token.substring(m.end()); + int step = start <= end ? 1 : -1; + List out = new ArrayList<>(); + for (int i = start; step > 0 ? i <= end : i >= end; i += step) { + String padded = String.format("%0" + width + "d", i); + out.addAll(expand(prefix + padded + suffix)); + } + return out; + } + + /// Splits each arg on whitespace, drops empties, expands range patterns, and returns the flat list. + public static String[] expandAll(String[] args) { + if (args == null) { + return new String[0]; + } + List out = new ArrayList<>(); + for (String arg : args) { + if (arg == null) continue; + for (String token : arg.split("\\s+")) { + if (token.isEmpty()) continue; + out.addAll(expand(token)); + } + } + return out.toArray(new String[0]); + } + + /// Convenience: expand, then filter to tokens for which {@code keep} holds. 
+ public static String[] expandAll(String[] args, java.util.function.Predicate keep) { + return Arrays.stream(expandAll(args)).filter(Objects::nonNull).filter(keep).toArray(String[]::new); + } +} diff --git a/jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/BenchArgExpanderTest.java b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/BenchArgExpanderTest.java new file mode 100644 index 000000000..fa8ed69d4 --- /dev/null +++ b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/BenchArgExpanderTest.java @@ -0,0 +1,77 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.github.jbellis.jvector.example.util; + +import org.junit.Test; + +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class BenchArgExpanderTest { + + @Test + public void singleRangeWithLeadingZeros() { + List out = BenchArgExpander.expand("sift1m:label_[00..11]"); + assertEquals(12, out.size()); + assertEquals("sift1m:label_00", out.get(0)); + assertEquals("sift1m:label_01", out.get(1)); + assertEquals("sift1m:label_11", out.get(11)); + } + + @Test + public void singleDigitRangeHasNoPadding() { + List out = BenchArgExpander.expand("foo[1..3]bar"); + assertEquals(List.of("foo1bar", "foo2bar", "foo3bar"), out); + } + + @Test + public void widthDerivesFromLongerEndpoint() { + List out = BenchArgExpander.expand("x[1..10]"); + assertEquals(10, out.size()); + assertEquals("x01", out.get(0)); + assertEquals("x10", out.get(9)); + } + + @Test + public void reverseRangeCountsDown() { + List out = BenchArgExpander.expand("r[03..01]"); + assertEquals(List.of("r03", "r02", "r01"), out); + } + + @Test + public void tokenWithoutRangeIsPassedThrough() { + assertEquals(List.of("plain-name"), BenchArgExpander.expand("plain-name")); + } + + @Test + public void multipleRangesExpandCombinatorially() { + List out = BenchArgExpander.expand("a[0..1]b[2..3]"); + assertEquals(List.of("a0b2", "a0b3", "a1b2", "a1b3"), out); + } + + @Test + public void expandAllSplitsWhitespaceAndFiltersEmpties() { + String[] out = BenchArgExpander.expandAll(new String[]{"glove [00..02] nytimes", null, ""}); + assertArrayEquals(new String[]{"glove", "00", "01", "02", "nytimes"}, out); + } + + @Test + public void expandAllHandlesNullArgs() { + assertArrayEquals(new String[0], BenchArgExpander.expandAll(null)); + } +} diff --git a/jvector-examples/src/test/java/io/github/jbellis/jvector/example/yaml/MultiConfigAlsoForTest.java 
b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/yaml/MultiConfigAlsoForTest.java new file mode 100644 index 000000000..adfb1bf5e --- /dev/null +++ b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/yaml/MultiConfigAlsoForTest.java @@ -0,0 +1,95 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.github.jbellis.jvector.example.yaml; + +import org.junit.Test; + +import java.io.File; +import java.nio.file.Files; +import java.nio.file.Path; + +import static org.junit.jupiter.api.Assertions.*; + +/// Verifies that a yaml index-parameters config declaring an {@code also_for} list +/// has that field populated correctly. Full directory-scan resolution via +/// {@link MultiConfig#getDefaultConfig(String)} depends on the repo-relative +/// index-parameters directory and is covered by running BenchYAML end-to-end. 
+public class MultiConfigAlsoForTest { + + @Test + public void alsoForFieldParsesFromYaml() throws Exception { + String yaml = "yamlSchemaVersion: 1\n" + + "onDiskIndexVersion: 6\n" + + "dataset: some-dataset\n" + + "also_for:\n" + + " - \"other-dataset:variant\"\n" + + " - plain-alias\n"; + + Path tmp = Files.createTempFile("multiconfig-alsofor-", ".yml"); + try { + Files.writeString(tmp, yaml); + MultiConfig cfg = MultiConfig.getConfig(tmp.toFile()); + assertNotNull(cfg.also_for, "also_for should be parsed"); + assertEquals(2, cfg.also_for.size()); + assertEquals("other-dataset:variant", cfg.also_for.get(0)); + assertEquals("plain-alias", cfg.also_for.get(1)); + } finally { + Files.deleteIfExists(tmp); + } + } + + @Test + public void globToPattern_starMatchesAcrossSegments() { + var p = MultiConfig.globToPattern("sift1m:label_*"); + assertTrue(p.matcher("sift1m:label_00").matches()); + assertTrue(p.matcher("sift1m:label_11").matches()); + assertTrue(p.matcher("sift1m:label_").matches()); // * allows empty + assertFalse(p.matcher("sift1m-label_00").matches()); + assertFalse(p.matcher("other:label_00").matches()); + } + + @Test + public void globToPattern_questionMarkMatchesSingleChar() { + var p = MultiConfig.globToPattern("ds?-v1"); + assertTrue(p.matcher("dsA-v1").matches()); + assertTrue(p.matcher("ds1-v1").matches()); + assertFalse(p.matcher("ds-v1").matches()); // ? requires one char + assertFalse(p.matcher("dsAB-v1").matches()); + } + + @Test + public void globToPattern_regexMetacharsAreEscaped() { + var p = MultiConfig.globToPattern("a.b+c(d)"); + assertTrue(p.matcher("a.b+c(d)").matches()); + assertFalse(p.matcher("aXb+c(d)").matches()); // '.' 
must be literal, not wildcard + } + + @Test + public void alsoForIsNullWhenAbsent() throws Exception { + String yaml = "yamlSchemaVersion: 1\n" + + "onDiskIndexVersion: 6\n" + + "dataset: some-dataset\n"; + + Path tmp = Files.createTempFile("multiconfig-noalso-", ".yml"); + try { + Files.writeString(tmp, yaml); + MultiConfig cfg = MultiConfig.getConfig(tmp.toFile()); + assertNull(cfg.also_for); + } finally { + Files.deleteIfExists(tmp); + } + } +} diff --git a/jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/ambiguous_globs.yml b/jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/ambiguous_globs.yml new file mode 100644 index 000000000..753daf673 --- /dev/null +++ b/jvector-examples/src/test/resources/io/github/jbellis/jvector/example/benchmarks/datasets/ambiguous_globs.yml @@ -0,0 +1,18 @@ +# Copyright DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"sift1m:label_*": + similarity_function: EUCLIDEAN +"sift1m:*_00": + similarity_function: COSINE diff --git a/jvector-examples/yaml-configs/datasets.yml b/jvector-examples/yaml-configs/datasets.yml index 64f488efe..23fbb6223 100644 --- a/jvector-examples/yaml-configs/datasets.yml +++ b/jvector-examples/yaml-configs/datasets.yml @@ -34,4 +34,18 @@ regression-tests: # - cohere-english-v3-1M # - cohere-english-v3-10M # - deep-image-96-angular # large files not yet supported -# - gist-960-euclidean # large files not yet supported \ No newline at end of file +# - gist-960-euclidean # large files not yet supported +#sift-testing: +# - "sift1m:label_00" +# - "sift1m:label_01" +# - "sift1m:label_02" +# - "sift1m:label_03" +# - "sift1m:label_04" +# - "sift1m:label_05" +# - "sift1m:label_06" +# - "sift1m:label_07" +# - "sift1m:label_08" +# - "sift1m:label_09" +# - "sift1m:label_10" +# - "sift1m:label_11" +# - "sift1m:default" From 20db6ebd25b9c138e0f702abbe9b8a5e97442db7 Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Thu, 23 Apr 2026 19:45:26 +0000 Subject: [PATCH 6/7] add alpha to index construction parameters --- .../github/jbellis/jvector/example/Bench.java | 7 +-- .../github/jbellis/jvector/example/Grid.java | 43 +++++++++++++------ .../jvector/example/HelloVectorWorld.java | 1 + .../reporting/ExperimentsSchemaV1.java | 3 ++ .../example/reporting/RunArtifacts.java | 2 + .../example/yaml/ConstructionParameters.java | 1 + .../yaml-configs/index-parameters/cap-10M.yml | 1 + .../yaml-configs/index-parameters/cap-1M.yml | 1 + .../cohere-english-v3-10M.yml | 1 + .../index-parameters/cohere-english-v3-1M.yml | 1 + .../index-parameters/colbert-1M.yml | 1 + .../yaml-configs/index-parameters/default.yml | 1 + .../index-parameters/dpr-gemma-10M.yml | 1 + .../index-parameters/dpr-gemma-1M.yml | 1 + .../index-parameters/glove-100-angular.yml | 1 + .../index-parameters/glove-200-angular.yml | 1 + .../index-parameters/glove-25-angular.yml | 1 + 
.../index-parameters/glove-50-angular.yml | 1 + .../index-parameters/lastfm-64-dot.yml | 1 + .../index-parameters/nytimes-256-angular.yml | 1 + 20 files changed, 55 insertions(+), 16 deletions(-) diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java index 78a85e1fc..d4aa6be1d 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java @@ -52,6 +52,7 @@ public static void main(String[] args) throws IOException { List.of(1.0, 2.0) // oq ); // rerankK = oq * topK var neighborOverflowGrid = List.of(1.2f); // List.of(1.2f, 2.0f); + var alphaGrid = List.of(1.2f); // List.of(1.0f, 1.2f, 1.4f); var addHierarchyGrid = List.of(true); // List.of(false, true); var refineFinalGraphGrid = List.of(true); // List.of(false, true); var usePruningGrid = List.of(true); // List.of(false, true); @@ -82,10 +83,10 @@ public static void main(String[] args) throws IOException { // compile regex and do substring matching using find var pattern = Pattern.compile(regex); - execute(pattern, enableIndexCache, buildCompression, featureSets, searchCompression, mGrid, efConstructionGrid, neighborOverflowGrid, addHierarchyGrid, refineFinalGraphGrid, topKGrid, usePruningGrid); + execute(pattern, enableIndexCache, buildCompression, featureSets, searchCompression, mGrid, efConstructionGrid, neighborOverflowGrid, alphaGrid, addHierarchyGrid, refineFinalGraphGrid, topKGrid, usePruningGrid); } - private static void execute(Pattern pattern, boolean enableIndexCache, List> buildCompression, List> featureSets, List> compressionGrid, List mGrid, List efConstructionGrid, List neighborOverflowGrid, List addHierarchyGrid, List refineFinalGraphGrid, Map> topKGrid, List usePruningGrid) throws IOException { + private static void execute(Pattern pattern, boolean enableIndexCache, List> 
buildCompression, List> featureSets, List> compressionGrid, List mGrid, List efConstructionGrid, List neighborOverflowGrid, List alphaGrid, List addHierarchyGrid, List refineFinalGraphGrid, Map> topKGrid, List usePruningGrid) throws IOException { var datasetCollection = DatasetCollection.load(); var datasetNames = datasetCollection.getAll().stream().filter(dn -> pattern.matcher(dn).find()).collect(Collectors.toList()); System.out.println("Executing the following datasets: " + datasetNames); @@ -94,7 +95,7 @@ private static void execute(Pattern pattern, boolean enableIndexCache, List new RuntimeException("Dataset " + datasetName + " not found") ).getDataSet(); - Grid.runAll(ds, enableIndexCache, mGrid, efConstructionGrid, neighborOverflowGrid, addHierarchyGrid, refineFinalGraphGrid, featureSets, buildCompression, compressionGrid, topKGrid, usePruningGrid); + Grid.runAll(ds, enableIndexCache, mGrid, efConstructionGrid, neighborOverflowGrid, alphaGrid, addHierarchyGrid, refineFinalGraphGrid, featureSets, buildCompression, compressionGrid, topKGrid, usePruningGrid); } } } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java index 90a2fc8c4..f1c314300 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java @@ -36,6 +36,7 @@ import io.github.jbellis.jvector.example.util.FilteredForkJoinPool; import io.github.jbellis.jvector.example.util.OnDiskGraphIndexCache; import io.github.jbellis.jvector.example.yaml.MetricSelection; +import io.github.jbellis.jvector.example.yaml.SearchParameters; import io.github.jbellis.jvector.graph.ImmutableGraphIndex; import io.github.jbellis.jvector.graph.GraphIndexBuilder; import io.github.jbellis.jvector.graph.GraphSearcher; @@ -100,6 +101,7 @@ static void runAll(DataSet ds, List mGrid, List efConstructionGrid, List 
neighborOverflowGrid, + List alphaGrid, List addHierarchyGrid, List refineFinalGraphGrid, List> featureSets, @@ -128,10 +130,12 @@ static void runAll(DataSet ds, for (var refineFinalGraph : refineFinalGraphGrid) { for (int M : mGrid) { for (float neighborOverflow : neighborOverflowGrid) { - for (int efC : efConstructionGrid) { - for (var bc : buildCompressors) { - runOneGraph(cache, featureSets, M, efC, neighborOverflow, addHierarchy, refineFinalGraph, - bc, compressionGrid, topKGrid, usePruningGrid, artifacts, ds, workDir); + for (float alpha : alphaGrid) { + for (int efC : efConstructionGrid) { + for (var bc : buildCompressors) { + runOneGraph(cache, featureSets, M, efC, neighborOverflow, alpha, addHierarchy, refineFinalGraph, + bc, compressionGrid, topKGrid, usePruningGrid, queryRuns, repetition, artifacts, ds, workDir); + } } } } @@ -166,6 +170,7 @@ static void runAll(DataSet ds, List mGrid, List efConstructionGrid, List neighborOverflowGrid, + List alphaGrid, List addHierarchyGrid, List refineFinalGraphGrid, List> featureSets, @@ -179,6 +184,7 @@ static void runAll(DataSet ds, mGrid, efConstructionGrid, neighborOverflowGrid, + alphaGrid, addHierarchyGrid, refineFinalGraphGrid, featureSets, @@ -221,6 +227,7 @@ static void runOneGraph(OnDiskGraphIndexCache cache, int M, int efConstruction, float neighborOverflow, + float alpha, boolean addHierarchy, boolean refineFinalGraph, Function buildCompressor, @@ -260,7 +267,7 @@ static void runOneGraph(OnDiskGraphIndexCache cache, Map, ImmutableGraphIndex> indexes = new HashMap<>(); if (buildCompressorObj == null) { - indexes = buildInMemory(featureSets, M, efConstruction, neighborOverflow, addHierarchy, refineFinalGraph, ds, workDirectory); + indexes = buildInMemory(featureSets, M, efConstruction, neighborOverflow, alpha, addHierarchy, refineFinalGraph, ds, workDirectory); } else { // If cache is disabled, we use the (tmp) workDirectory as the output Path outputDir = cache.isEnabled() ? 
cache.cacheDir().toAbsolutePath() : workDirectory; @@ -270,7 +277,7 @@ static void runOneGraph(OnDiskGraphIndexCache cache, Map, OnDiskGraphIndexCache.WriteHandle> handles = new HashMap<>(); for (Set fs : featureSets) { - var key = cache.key(ds.getName(), fs, M, efConstruction, neighborOverflow, 1.2f, addHierarchy, refineFinalGraph, buildCompressorObj); + var key = cache.key(ds.getName(), fs, M, efConstruction, neighborOverflow, alpha, addHierarchy, refineFinalGraph, repetition, buildCompressorObj); var cached = cache.tryLoad(key); if (cached.isPresent()) { @@ -293,7 +300,7 @@ static void runOneGraph(OnDiskGraphIndexCache cache, if (!missing.isEmpty()) { // At least one index needs to be built (b/c not in cache or cache is disabled) // We pass the handles map so buildOnDisk knows exactly where to write - var newIndexes = buildOnDisk(missing, M, efConstruction, neighborOverflow, addHierarchy, refineFinalGraph, + var newIndexes = buildOnDisk(missing, M, efConstruction, neighborOverflow, alpha, addHierarchy, refineFinalGraph, ds, outputDir, buildCompressorObj, handles, constructionMetrics); indexes.putAll(newIndexes); } @@ -340,7 +347,7 @@ static void runOneGraph(OnDiskGraphIndexCache cache, } try (var cs = new ConfiguredSystem(ds, index, cv, featureSetForIndex)) { - testConfiguration(cs, topKGrid, usePruningGrid, M, efConstruction, neighborOverflow, addHierarchy, refineFinalGraph, featureSetForIndex, + testConfiguration(cs, topKGrid, usePruningGrid, M, efConstruction, neighborOverflow, alpha, addHierarchy, refineFinalGraph, queryRuns, repetition, featureSetForIndex, buildCompressorString, artifacts, constructionMetrics, workDirectory); } catch (Exception e) { throw new RuntimeException(e); @@ -374,6 +381,7 @@ private static Map, ImmutableGraphIndex> buildOnDisk(List, ImmutableGraphIndex> buildOnDisk(List buildCompressor.encodeAll(floatVectors)) : buildCompressor.encodeAll(floatVectors)); var bsp = BuildScoreProvider.pqBuildScoreProvider(ds.getSimilarityFunction(), 
pq); - GraphIndexBuilder builder = new GraphIndexBuilder(bsp, floatVectors.dimension(), M, efConstruction, neighborOverflow, 1.2f, addHierarchy, refineFinalGraph); + GraphIndexBuilder builder = new GraphIndexBuilder(bsp, floatVectors.dimension(), M, efConstruction, neighborOverflow, alpha, addHierarchy, refineFinalGraph); // use the inline vectors index as the score provider for graph construction Map, OnDiskGraphIndexWriter> writers = new HashMap<>(); @@ -558,6 +566,7 @@ private static Map, ImmutableGraphIndex> buildInMemory(List, ImmutableGraphIndex> buildInMemory(List runAllAndCollectResults( List mGrid, List efConstructionGrid, List neighborOverflowGrid, + List alphaGrid, List addHierarchyGrid, List refineFinalGraphGrid, List> featureSets, @@ -842,6 +856,7 @@ public static List runAllAndCollectResults( for (int m : mGrid) { for (int ef : efConstructionGrid) { for (float neighborOverflow : neighborOverflowGrid) { + for (float alpha : alphaGrid) { for (boolean addHierarchy : addHierarchyGrid) { for (boolean refineFinalGraph : refineFinalGraphGrid) { for (Set features : featureSets) { @@ -888,7 +903,7 @@ public static List runAllAndCollectResults( Map, OnDiskGraphIndexCache.WriteHandle> handles = new HashMap<>(); for (Set fs : featureSets) { - var key = cache.key(ds.getName(), fs, m, ef, neighborOverflow, 1.2f, addHierarchy, refineFinalGraph, compressor); + var key = cache.key(ds.getName(), fs, m, ef, neighborOverflow, alpha, addHierarchy, refineFinalGraph, repetition, compressor); var cached = cache.tryLoad(key); if (cached.isPresent()) { @@ -912,7 +927,7 @@ public static List runAllAndCollectResults( if (!missing.isEmpty()) { // At least one index needs to be built (b/c not in cache or cache is disabled) // We pass the handles map so buildOnDisk knows exactly where to write - var newIndexes = buildOnDisk(missing, m, ef, neighborOverflow, addHierarchy, refineFinalGraph, + var newIndexes = buildOnDisk(missing, m, ef, neighborOverflow, alpha, addHierarchy, 
refineFinalGraph, ds, outputDir, compressor, handles, null); indexes.putAll(newIndexes); } @@ -945,6 +960,7 @@ public static List runAllAndCollectResults( params.put("M", m); params.put("efConstruction", ef); params.put("neighborOverflow", neighborOverflow); + params.put("alpha", alpha); params.put("addHierarchy", addHierarchy); params.put("refineFinalGraph", refineFinalGraph); params.put("repetition", repetition); @@ -1007,6 +1023,7 @@ public static List runAllAndCollectResults( } } } + } } } } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java index bc1f12d77..0bcc8ef12 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java @@ -50,6 +50,7 @@ public static void main(String[] args) throws IOException { config.construction.outDegree, config.construction.efConstruction, config.construction.neighborOverflow, + config.construction.alpha, config.construction.addHierarchy, config.construction.refineFinalGraph, config.construction.getFeatureSets(), diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/ExperimentsSchemaV1.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/ExperimentsSchemaV1.java index 7a3e53adc..9566ac048 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/ExperimentsSchemaV1.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/ExperimentsSchemaV1.java @@ -56,6 +56,7 @@ public static List fixedColumns() { "M", "efConstruction", "neighborOverflow", + "alpha", "addHierarchy", "refineFinalGraph", "repetition", @@ -79,6 +80,7 @@ public static Map fixedValues(RunContext run, int M, int efConstruction, float neighborOverflow, + float alpha, boolean 
addHierarchy, boolean refineFinalGraph, int repetition, @@ -99,6 +101,7 @@ public static Map fixedValues(RunContext run, fixed.put("M", Integer.toString(M)); fixed.put("efConstruction", Integer.toString(efConstruction)); fixed.put("neighborOverflow", Float.toString(neighborOverflow)); + fixed.put("alpha", Float.toString(alpha)); fixed.put("addHierarchy", Boolean.toString(addHierarchy)); fixed.put("refineFinalGraph", Boolean.toString(refineFinalGraph)); fixed.put("repetition", Integer.toString(repetition)); diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/RunArtifacts.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/RunArtifacts.java index bddea244b..0ea43dd38 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/RunArtifacts.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/reporting/RunArtifacts.java @@ -188,6 +188,7 @@ public void logRow(String datasetName, int M, int efConstruction, float neighborOverflow, + float alpha, boolean addHierarchy, boolean refineFinalGraph, int repetition, @@ -209,6 +210,7 @@ public void logRow(String datasetName, M, efConstruction, neighborOverflow, + alpha, addHierarchy, refineFinalGraph, repetition, diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/ConstructionParameters.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/ConstructionParameters.java index 5177fdf4a..f2e678b09 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/ConstructionParameters.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/ConstructionParameters.java @@ -27,6 +27,7 @@ public class ConstructionParameters extends CommonParameters { public List outDegree; public List efConstruction; public List neighborOverflow; + public List alpha; public List addHierarchy; public List refineFinalGraph; public List reranking; 
diff --git a/jvector-examples/yaml-configs/index-parameters/cap-10M.yml b/jvector-examples/yaml-configs/index-parameters/cap-10M.yml index c4c285d18..d79c0ba84 100644 --- a/jvector-examples/yaml-configs/index-parameters/cap-10M.yml +++ b/jvector-examples/yaml-configs/index-parameters/cap-10M.yml @@ -7,6 +7,7 @@ construction: outDegree: [32] efConstruction: [100] neighborOverflow: [1.2f] + alpha: [1.2f] addHierarchy: [Yes] refineFinalGraph: [Yes] fusedGraph: [Yes] diff --git a/jvector-examples/yaml-configs/index-parameters/cap-1M.yml b/jvector-examples/yaml-configs/index-parameters/cap-1M.yml index 0bd9d5de1..9b3e53e56 100644 --- a/jvector-examples/yaml-configs/index-parameters/cap-1M.yml +++ b/jvector-examples/yaml-configs/index-parameters/cap-1M.yml @@ -7,6 +7,7 @@ construction: outDegree: [32] efConstruction: [100] neighborOverflow: [1.2f] + alpha: [1.2f] addHierarchy: [Yes] refineFinalGraph: [Yes] fusedGraph: [Yes] diff --git a/jvector-examples/yaml-configs/index-parameters/cohere-english-v3-10M.yml b/jvector-examples/yaml-configs/index-parameters/cohere-english-v3-10M.yml index 7973ce0f8..2c63ac1a4 100644 --- a/jvector-examples/yaml-configs/index-parameters/cohere-english-v3-10M.yml +++ b/jvector-examples/yaml-configs/index-parameters/cohere-english-v3-10M.yml @@ -7,6 +7,7 @@ construction: outDegree: [32] efConstruction: [100] neighborOverflow: [1.2f] + alpha: [1.2f] addHierarchy: [Yes] refineFinalGraph: [Yes] fusedGraph: [Yes] diff --git a/jvector-examples/yaml-configs/index-parameters/cohere-english-v3-1M.yml b/jvector-examples/yaml-configs/index-parameters/cohere-english-v3-1M.yml index e053edfc4..789d9b12e 100644 --- a/jvector-examples/yaml-configs/index-parameters/cohere-english-v3-1M.yml +++ b/jvector-examples/yaml-configs/index-parameters/cohere-english-v3-1M.yml @@ -7,6 +7,7 @@ construction: outDegree: [32] efConstruction: [100] neighborOverflow: [1.2f] + alpha: [1.2f] addHierarchy: [Yes] refineFinalGraph: [Yes] fusedGraph: [Yes] diff --git 
a/jvector-examples/yaml-configs/index-parameters/colbert-1M.yml b/jvector-examples/yaml-configs/index-parameters/colbert-1M.yml index d0de5e0f4..f822b88dc 100644 --- a/jvector-examples/yaml-configs/index-parameters/colbert-1M.yml +++ b/jvector-examples/yaml-configs/index-parameters/colbert-1M.yml @@ -7,6 +7,7 @@ construction: outDegree: [32] efConstruction: [100] neighborOverflow: [1.2f] + alpha: [1.2f] addHierarchy: [Yes] refineFinalGraph: [Yes] fusedGraph: [No] diff --git a/jvector-examples/yaml-configs/index-parameters/default.yml b/jvector-examples/yaml-configs/index-parameters/default.yml index b56e27ed0..76ded18fa 100644 --- a/jvector-examples/yaml-configs/index-parameters/default.yml +++ b/jvector-examples/yaml-configs/index-parameters/default.yml @@ -7,6 +7,7 @@ construction: outDegree: [32] efConstruction: [100] neighborOverflow: [1.2f] + alpha: [1.2f] addHierarchy: [Yes] refineFinalGraph: [Yes] fusedGraph: [No] diff --git a/jvector-examples/yaml-configs/index-parameters/dpr-gemma-10M.yml b/jvector-examples/yaml-configs/index-parameters/dpr-gemma-10M.yml index f2370a063..1b1476b62 100644 --- a/jvector-examples/yaml-configs/index-parameters/dpr-gemma-10M.yml +++ b/jvector-examples/yaml-configs/index-parameters/dpr-gemma-10M.yml @@ -7,6 +7,7 @@ construction: outDegree: [32] efConstruction: [100] neighborOverflow: [1.2f] + alpha: [1.2f] addHierarchy: [Yes] refineFinalGraph: [Yes] fusedGraph: [Yes] diff --git a/jvector-examples/yaml-configs/index-parameters/dpr-gemma-1M.yml b/jvector-examples/yaml-configs/index-parameters/dpr-gemma-1M.yml index 96e92556e..ea1d51260 100644 --- a/jvector-examples/yaml-configs/index-parameters/dpr-gemma-1M.yml +++ b/jvector-examples/yaml-configs/index-parameters/dpr-gemma-1M.yml @@ -7,6 +7,7 @@ construction: outDegree: [32] efConstruction: [100] neighborOverflow: [1.2f] + alpha: [1.2f] addHierarchy: [Yes] refineFinalGraph: [Yes] fusedGraph: [Yes] diff --git a/jvector-examples/yaml-configs/index-parameters/glove-100-angular.yml 
b/jvector-examples/yaml-configs/index-parameters/glove-100-angular.yml index 5f80f4ae2..560913cac 100644 --- a/jvector-examples/yaml-configs/index-parameters/glove-100-angular.yml +++ b/jvector-examples/yaml-configs/index-parameters/glove-100-angular.yml @@ -8,6 +8,7 @@ construction: outDegree: [32] efConstruction: [100] neighborOverflow: [1.2f] + alpha: [1.2f] addHierarchy: [Yes] refineFinalGraph: [Yes] fusedGraph: [No] diff --git a/jvector-examples/yaml-configs/index-parameters/glove-200-angular.yml b/jvector-examples/yaml-configs/index-parameters/glove-200-angular.yml index faf5bf3fa..bb97586d1 100644 --- a/jvector-examples/yaml-configs/index-parameters/glove-200-angular.yml +++ b/jvector-examples/yaml-configs/index-parameters/glove-200-angular.yml @@ -8,6 +8,7 @@ construction: outDegree: [32] efConstruction: [100] neighborOverflow: [1.2f] + alpha: [1.2f] addHierarchy: [Yes] refineFinalGraph: [Yes] fusedGraph: [No] diff --git a/jvector-examples/yaml-configs/index-parameters/glove-25-angular.yml b/jvector-examples/yaml-configs/index-parameters/glove-25-angular.yml index 953f454e2..ac88f8e5a 100644 --- a/jvector-examples/yaml-configs/index-parameters/glove-25-angular.yml +++ b/jvector-examples/yaml-configs/index-parameters/glove-25-angular.yml @@ -8,6 +8,7 @@ construction: outDegree: [32] efConstruction: [100] neighborOverflow: [1.2f] + alpha: [1.2f] addHierarchy: [Yes] refineFinalGraph: [Yes] fusedGraph: [No] diff --git a/jvector-examples/yaml-configs/index-parameters/glove-50-angular.yml b/jvector-examples/yaml-configs/index-parameters/glove-50-angular.yml index a62eede12..ee3d61d52 100644 --- a/jvector-examples/yaml-configs/index-parameters/glove-50-angular.yml +++ b/jvector-examples/yaml-configs/index-parameters/glove-50-angular.yml @@ -8,6 +8,7 @@ construction: outDegree: [32] efConstruction: [100] neighborOverflow: [1.2f] + alpha: [1.2f] addHierarchy: [Yes] refineFinalGraph: [Yes] fusedGraph: [No] diff --git 
a/jvector-examples/yaml-configs/index-parameters/lastfm-64-dot.yml b/jvector-examples/yaml-configs/index-parameters/lastfm-64-dot.yml index 045963849..71a1eb04a 100644 --- a/jvector-examples/yaml-configs/index-parameters/lastfm-64-dot.yml +++ b/jvector-examples/yaml-configs/index-parameters/lastfm-64-dot.yml @@ -8,6 +8,7 @@ construction: outDegree: [32] efConstruction: [100] neighborOverflow: [1.2f] + alpha: [1.2f] addHierarchy: [Yes] refineFinalGraph: [Yes] fusedGraph: [No] diff --git a/jvector-examples/yaml-configs/index-parameters/nytimes-256-angular.yml b/jvector-examples/yaml-configs/index-parameters/nytimes-256-angular.yml index 755e99017..fec7d24ff 100644 --- a/jvector-examples/yaml-configs/index-parameters/nytimes-256-angular.yml +++ b/jvector-examples/yaml-configs/index-parameters/nytimes-256-angular.yml @@ -8,6 +8,7 @@ construction: outDegree: [32] efConstruction: [100] neighborOverflow: [1.2f] + alpha: [1.2f] addHierarchy: [Yes] refineFinalGraph: [Yes] fusedGraph: [No] From b28cfa0183928a9c85a902d46d7ed12525ac7c9e Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Thu, 23 Apr 2026 20:42:43 +0000 Subject: [PATCH 7/7] Fan-out on compression parameters --- .../example/yaml/CommonParameters.java | 6 +- .../jvector/example/yaml/Compression.java | 188 +++++++++++++++--- .../jvector/example/yaml/CompressionTest.java | 172 ++++++++++++++++ 3 files changed, 333 insertions(+), 33 deletions(-) create mode 100644 jvector-examples/src/test/java/io/github/jbellis/jvector/example/yaml/CompressionTest.java diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/CommonParameters.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/CommonParameters.java index 626efc591..d86664a68 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/CommonParameters.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/CommonParameters.java @@ -27,6 +27,10 @@ public class 
CommonParameters { public List compression; public List> getCompressorParameters() { - return compression.stream().map(Compression::getCompressorParameters).collect(Collectors.toList()); + // Each Compression entry may expand into multiple concrete compressor configs + // (Cartesian product of its list-valued parameters); flatten across all entries. + return compression.stream() + .flatMap(c -> c.getCompressorParameters().stream()) + .collect(Collectors.toList()); } } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/Compression.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/Compression.java index a8277508a..95fb77f8a 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/Compression.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/Compression.java @@ -20,49 +20,173 @@ import io.github.jbellis.jvector.example.benchmarks.datasets.DataSet; import io.github.jbellis.jvector.vector.VectorSimilarityFunction; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; import java.util.Map; import java.util.function.Function; +/// YAML-bound representation of a single {@code compression:} entry. +/// +/// Scalar parameter values (e.g. {@code mFactor: 2}) and list-valued parameters +/// (e.g. {@code mFactor: [2, 4]}) are both accepted; list-valued parameters cause +/// {@link #getCompressorParameters()} to return one entry per point in the +/// Cartesian product across all list-valued axes. A single compression entry with +/// {@code mFactor: [2, 4]} and {@code anisotropicThreshold: [-1.0, 0.2]} therefore +/// expands to four compressor configurations. public class Compression { public String type; - public Map parameters; + public Map parameters; - public Function getCompressorParameters() { + /// Expand this compression entry into one or more {@link CompressorParameters} + /// suppliers. 
Non-PQ types (None, BQ) always produce a single-element list; + /// PQ produces the Cartesian product across {@code m}/{@code mFactor}, + /// {@code k}, {@code centerData}, and {@code anisotropicThreshold}. + public List> getCompressorParameters() { switch (type) { case "None": - return __ -> CompressorParameters.NONE; - case "PQ": - int k = Integer.parseInt(parameters.getOrDefault("k", "256")); - String strCenterData = parameters.get("centerData"); - if (!(strCenterData == null || strCenterData.equals("Yes") || strCenterData.equals("No"))) { - throw new IllegalArgumentException("centerData must be Yes or No, or not specified at all."); - } - float anisotropicThreshold = Float.parseFloat(parameters.getOrDefault("anisotropicThreshold", "-1")); - - return ds -> { - boolean centerData; - if (strCenterData == null) { - centerData = ds.getSimilarityFunction() == VectorSimilarityFunction.EUCLIDEAN; - } else { - centerData = strCenterData.equals("Yes");; - } - - if (parameters.containsKey("m")) { - int m = Integer.parseInt(parameters.get("m")); - return new CompressorParameters.PQParameters(m, k, centerData, anisotropicThreshold); - } else if (parameters.containsKey("mFactor")) { - String strMFactor = parameters.get("mFactor"); - int mFactor = Integer.parseInt(strMFactor); - return new CompressorParameters.PQParameters(ds.getDimension() / mFactor, k, centerData, anisotropicThreshold); - } else { - throw new IllegalArgumentException("Need to specify either 'm' or 'mFactor'"); - } - }; + return List.of(__ -> CompressorParameters.NONE); case "BQ": - return ds -> new CompressorParameters.BQParameters(); + return List.of(ds -> new CompressorParameters.BQParameters()); + case "PQ": + return pqCombinations(); default: throw new IllegalArgumentException("Unsupported compression type: " + type); + } + } + + private List> pqCombinations() { + Map params = parameters == null ? 
Map.of() : parameters; + + List ks = asIntList(params.getOrDefault("k", 256)); + List thresholds = asFloatList(params.getOrDefault("anisotropicThreshold", -1.0f)); + + // centerData absent => use dataset-similarity-based default at resolve time (null sentinel) + List centerings = params.containsKey("centerData") + ? asBooleanList(params.get("centerData")) + : Collections.singletonList(null); + + boolean hasM = params.containsKey("m"); + boolean hasMFactor = params.containsKey("mFactor"); + if (!hasM && !hasMFactor) { + throw new IllegalArgumentException("PQ compression: need to specify either 'm' or 'mFactor'"); + } + + // 'm' takes precedence when both are present (matches prior behavior). + List mSources = new ArrayList<>(); + if (hasM) { + for (Integer mv : asIntList(params.get("m"))) { + mSources.add(MSource.exact(mv)); + } + } else { + for (Integer f : asIntList(params.get("mFactor"))) { + mSources.add(MSource.factor(f)); + } + } + + List> out = new ArrayList<>( + mSources.size() * ks.size() * centerings.size() * thresholds.size()); + for (MSource ms : mSources) { + for (Integer k : ks) { + for (Boolean cd : centerings) { + for (Float at : thresholds) { + out.add(pqFunction(ms, k, cd, at)); + } + } + } + } + return out; + } + + private static Function pqFunction(MSource ms, + int k, + Boolean centerDataSpec, + float anisotropicThreshold) { + return ds -> { + boolean centerData = (centerDataSpec != null) + ? centerDataSpec + : ds.getSimilarityFunction() == VectorSimilarityFunction.EUCLIDEAN; + int m = ms.resolve(ds); + return new CompressorParameters.PQParameters(m, k, centerData, anisotropicThreshold); + }; + } + + /// Source of the {@code m} dimension: either an exact value or a divisor applied to + /// the dataset dimensionality at resolve time. 
+ private static final class MSource { + private final Integer exactM; + private final Integer factor; + + private MSource(Integer exactM, Integer factor) { + this.exactM = exactM; + this.factor = factor; + } + + static MSource exact(int m) { return new MSource(m, null); } + static MSource factor(int f) { return new MSource(null, f); } + + int resolve(DataSet ds) { + return (exactM != null) ? exactM : ds.getDimension() / factor; + } + } + + // ------------------------------------------------------------------------ + // YAML value coercion: scalar or list -> typed list + // ------------------------------------------------------------------------ + + private static List asIntList(Object raw) { + if (raw instanceof List) { + List src = (List) raw; + List out = new ArrayList<>(src.size()); + for (Object o : src) out.add(toInt(o)); + return out; + } + return List.of(toInt(raw)); + } + + private static List asFloatList(Object raw) { + if (raw instanceof List) { + List src = (List) raw; + List out = new ArrayList<>(src.size()); + for (Object o : src) out.add(toFloat(o)); + return out; + } + return List.of(toFloat(raw)); + } + + private static List asBooleanList(Object raw) { + if (raw instanceof List) { + List src = (List) raw; + List out = new ArrayList<>(src.size()); + for (Object o : src) out.add(toBoolean(o)); + return out; + } + return List.of(toBoolean(raw)); + } + + private static int toInt(Object o) { + if (o instanceof Number) return ((Number) o).intValue(); + if (o instanceof String) return Integer.parseInt(((String) o).trim()); + throw new IllegalArgumentException("Cannot interpret as integer: " + o); + } + + private static float toFloat(Object o) { + if (o instanceof Number) return ((Number) o).floatValue(); + if (o instanceof String) return Float.parseFloat(((String) o).trim()); + throw new IllegalArgumentException("Cannot interpret as float: " + o); + } + /// Accepts YAML-native {@code Boolean} (from {@code true}/{@code false} or + /// {@code Yes}/{@code 
No}) as well as the literal strings {@code "Yes"}/{@code "No"}/ + /// {@code "true"}/{@code "false"} for configs that quote the value. + private static boolean toBoolean(Object o) { + if (o instanceof Boolean) return (Boolean) o; + if (o instanceof String) { + String s = ((String) o).trim(); + if (s.equalsIgnoreCase("Yes") || s.equalsIgnoreCase("true")) return true; + if (s.equalsIgnoreCase("No") || s.equalsIgnoreCase("false")) return false; + throw new IllegalArgumentException("Cannot interpret as boolean: " + o); } + throw new IllegalArgumentException("Cannot interpret as boolean: " + o); } -} \ No newline at end of file +} diff --git a/jvector-examples/src/test/java/io/github/jbellis/jvector/example/yaml/CompressionTest.java b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/yaml/CompressionTest.java new file mode 100644 index 000000000..37d5d5b8e --- /dev/null +++ b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/yaml/CompressionTest.java @@ -0,0 +1,172 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.github.jbellis.jvector.example.yaml; + +import io.github.jbellis.jvector.example.benchmarks.datasets.DataSet; +import io.github.jbellis.jvector.example.util.CompressorParameters; +import io.github.jbellis.jvector.graph.RandomAccessVectorValues; +import io.github.jbellis.jvector.vector.VectorSimilarityFunction; +import io.github.jbellis.jvector.vector.types.VectorFloat; +import org.junit.Test; + +import java.util.Arrays; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.function.Function; +import java.util.stream.Collectors; + +import static org.junit.jupiter.api.Assertions.*; + +/// Verifies that {@link Compression#getCompressorParameters()} expands list-valued +/// PQ parameters into a Cartesian product of {@link CompressorParameters}. +public class CompressionTest { + + @Test + public void noneReturnsSingleton() { + Compression c = new Compression(); + c.type = "None"; + assertEquals(1, c.getCompressorParameters().size()); + assertSame(CompressorParameters.NONE, c.getCompressorParameters().get(0).apply(stub(128))); + } + + @Test + public void bqReturnsSingleton() { + Compression c = new Compression(); + c.type = "BQ"; + assertEquals(1, c.getCompressorParameters().size()); + assertNotNull(c.getCompressorParameters().get(0).apply(stub(128))); + } + + @Test + public void pqScalarsProduceSingleConfig() { + Compression c = new Compression(); + c.type = "PQ"; + c.parameters = map( + "mFactor", 2, + "k", 256, + "centerData", "No", + "anisotropicThreshold", -1.0); + var list = c.getCompressorParameters(); + assertEquals(1, list.size()); + // mFactor=2 against dim=128 -> m=64 + assertEquals("PQ_ds_64_256_false_-1.0", list.get(0).apply(stub(128)).idStringFor(stub(128))); + } + + @Test + public void pqListParametersCartesian() { + Compression c = new Compression(); + c.type = "PQ"; + c.parameters = map( + "mFactor", Arrays.asList(2, 4), + "k", Arrays.asList(128, 256), + "centerData", Arrays.asList(true, 
false), + "anisotropicThreshold", Arrays.asList(-1.0, 0.2)); + var list = c.getCompressorParameters(); + assertEquals(2 * 2 * 2 * 2, list.size()); + } + + @Test + public void pqMFactorListExpands() { + Compression c = new Compression(); + c.type = "PQ"; + c.parameters = map("mFactor", Arrays.asList(2, 4)); + var list = c.getCompressorParameters(); + assertEquals(2, list.size()); + // Verify both mFactor values are represented + var ids = list.stream().map(f -> f.apply(stub(128)).idStringFor(stub(128))).collect(Collectors.toList()); + assertTrue(ids.stream().anyMatch(s -> s.contains("_64_")), ids.toString()); + assertTrue(ids.stream().anyMatch(s -> s.contains("_32_")), ids.toString()); + } + + @Test + public void pqCenterDataAbsentUsesDatasetDefault() { + Compression c = new Compression(); + c.type = "PQ"; + c.parameters = map("mFactor", 2); + + Function f = c.getCompressorParameters().get(0); + // EUCLIDEAN -> centered=true + assertTrue(f.apply(stub(128, VectorSimilarityFunction.EUCLIDEAN)) + .idStringFor(stub(128, VectorSimilarityFunction.EUCLIDEAN)).contains("_true_")); + // DOT_PRODUCT -> centered=false + assertTrue(f.apply(stub(128, VectorSimilarityFunction.DOT_PRODUCT)) + .idStringFor(stub(128, VectorSimilarityFunction.DOT_PRODUCT)).contains("_false_")); + } + + @Test + public void pqMPrecedenceOverMFactor() { + // Both provided -> 'm' wins, mFactor list is ignored (matches prior behavior). 
+ Compression c = new Compression(); + c.type = "PQ"; + c.parameters = map( + "m", 192, + "mFactor", Arrays.asList(2, 4)); + var list = c.getCompressorParameters(); + assertEquals(1, list.size()); + assertTrue(list.get(0).apply(stub(128)).idStringFor(stub(128)).contains("_192_")); + } + + @Test + public void pqRejectsMissingMAndMFactor() { + Compression c = new Compression(); + c.type = "PQ"; + c.parameters = map("k", 256); + assertThrows(IllegalArgumentException.class, c::getCompressorParameters); + } + + @Test + public void pqAcceptsYesNoStrings() { + Compression c = new Compression(); + c.type = "PQ"; + c.parameters = map("mFactor", 2, "centerData", "Yes"); + assertTrue(c.getCompressorParameters().get(0).apply(stub(128)).idStringFor(stub(128)).contains("_true_")); + } + + @Test + public void unsupportedTypeThrows() { + Compression c = new Compression(); + c.type = "ZQ"; + assertThrows(IllegalArgumentException.class, c::getCompressorParameters); + } + + // ------------------------------------------------------------------------ + + private static Map map(Object... 
kv) { + assertEquals(0, kv.length % 2); + Map m = new LinkedHashMap<>(); + for (int i = 0; i < kv.length; i += 2) { + m.put((String) kv[i], kv[i + 1]); + } + return m; + } + + private static DataSet stub(int dim) { + return stub(dim, VectorSimilarityFunction.EUCLIDEAN); + } + + private static DataSet stub(int dim, VectorSimilarityFunction sim) { + return new DataSet() { + @Override public int getDimension() { return dim; } + @Override public String getName() { return "ds"; } + @Override public VectorSimilarityFunction getSimilarityFunction() { return sim; } + @Override public RandomAccessVectorValues getBaseRavv() { throw new UnsupportedOperationException(); } + @Override public List> getBaseVectors() { throw new UnsupportedOperationException(); } + @Override public List> getQueryVectors() { throw new UnsupportedOperationException(); } + @Override public List> getGroundTruth() { throw new UnsupportedOperationException(); } + }; + } +}