From 918fa4469a8401119c82008bdbb9f2deaa21e731 Mon Sep 17 00:00:00 2001 From: prithvi Date: Sat, 25 Apr 2026 02:25:18 +0530 Subject: [PATCH] Fix undercounting of RAM used by vectors buffered in in-memory segments (#15901) --- lucene/CHANGES.txt | 7 ++ ...Lucene102BinaryQuantizedVectorsWriter.java | 23 +++++-- .../Lucene99ScalarQuantizedVectorsWriter.java | 15 +++- .../codecs/BufferingKnnVectorsWriter.java | 2 +- ...Lucene104ScalarQuantizedVectorsWriter.java | 24 +++++-- .../index/BaseKnnVectorsFormatTestCase.java | 69 +++++++++++++++++++ 6 files changed, 129 insertions(+), 11 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 278a4ae28b79..a0ce0393aa9b 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -148,6 +148,13 @@ Optimizations Bug Fixes --------------------- +* GITHUB#15901: Fix miscounting of RAM used by vectors buffered in in-memory segments. + BufferingKnnVectorsWriter hardcoded Float.BYTES for all vector encodings, overcounting byte + vectors by 4x. Quantized vector writers (Lucene104, Lucene99, Lucene102) did not + account for rawVectorDelegate RAM, making byte vector fields completely invisible in RAM + reporting. Also added missing dimensionSums array accounting in quantized writers. + (Prithvi S) + * GITHUB#14049: Randomize KNN codec params in RandomCodec. Fixes scalar quantization div-by-zero when all values are identical. 
(Mike Sokolov) diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene102/Lucene102BinaryQuantizedVectorsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene102/Lucene102BinaryQuantizedVectorsWriter.java index a782aaee0975..5480ec86058f 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene102/Lucene102BinaryQuantizedVectorsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene102/Lucene102BinaryQuantizedVectorsWriter.java @@ -55,6 +55,7 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.VectorUtil; import org.apache.lucene.util.hnsw.CloseableRandomVectorScorerSupplier; import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; @@ -637,9 +638,12 @@ static int calculateCentroid(MergeState mergeState, FieldInfo fieldInfo, float[] @Override public long ramBytesUsed() { long total = SHALLOW_RAM_BYTES_USED; + // The rawVectorDelegate tracks all vector data for both byte and float32 fields. + // For byte vector fields (which bypass our FieldWriter), this is the only accounting. + total += rawVectorDelegate.ramBytesUsed(); for (FieldWriter field : fields) { - // the field tracks the delegate field usage - total += field.ramBytesUsed(); + // Add quantization-specific overhead not tracked by the delegate + total += field.quantizationOverheadBytesUsed(); } return total; } @@ -714,11 +718,22 @@ public float[] copyValue(float[] vectorValue) { throw new UnsupportedOperationException(); } + /** + * Returns the RAM usage of quantization-specific state only (magnitudes, dimensionSums, shallow + * object overhead). The underlying flat vector data is tracked separately by the + * rawVectorDelegate at the writer level to avoid double-counting. 
+ */ + long quantizationOverheadBytesUsed() { + long size = SHALLOW_SIZE; + size += magnitudes.ramBytesUsed(); + size += RamUsageEstimator.sizeOf(dimensionSums); + return size; + } + @Override public long ramBytesUsed() { - long size = SHALLOW_SIZE; + long size = quantizationOverheadBytesUsed(); size += flatFieldVectorsWriter.ramBytesUsed(); - size += magnitudes.ramBytesUsed(); return size; } } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java index 54424b2915d1..2822e5443bf5 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java @@ -299,9 +299,12 @@ public void finish() throws IOException { @Override public long ramBytesUsed() { long total = SHALLOW_RAM_BYTES_USED; + // The rawVectorDelegate tracks all vector data for both byte and float32 fields. + // For byte vector fields (which bypass our FieldWriter), this is the only accounting. + total += rawVectorDelegate.ramBytesUsed(); for (FieldWriter field : fields) { - // the field tracks the delegate field usage - total += field.ramBytesUsed(); + // Add quantization-specific overhead not tracked by the delegate + total += field.quantizationOverheadBytesUsed(); } return total; } @@ -819,6 +822,14 @@ ScalarQuantizer createQuantizer() throws IOException { return quantizer; } + /** + * Returns the RAM usage of quantization-specific state only. The underlying flat vector data is + * tracked separately by the rawVectorDelegate at the writer level. 
+ */ + long quantizationOverheadBytesUsed() { + return SHALLOW_SIZE; + } + @Override public long ramBytesUsed() { long size = SHALLOW_SIZE; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/BufferingKnnVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/BufferingKnnVectorsWriter.java index 96b0f75a259f..65190acbc052 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/BufferingKnnVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/BufferingKnnVectorsWriter.java @@ -260,7 +260,7 @@ public final long ramBytesUsed() { * (long) (RamUsageEstimator.NUM_BYTES_OBJECT_REF + RamUsageEstimator.NUM_BYTES_ARRAY_HEADER) - + vectors.size() * (long) dim * Float.BYTES; + + vectors.size() * (long) dim * fieldInfo.getVectorEncoding().byteSize; } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene104/Lucene104ScalarQuantizedVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene104/Lucene104ScalarQuantizedVectorsWriter.java index d591aa0a7e7c..8dba8f3b95da 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene104/Lucene104ScalarQuantizedVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene104/Lucene104ScalarQuantizedVectorsWriter.java @@ -53,6 +53,7 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.VectorUtil; import org.apache.lucene.util.hnsw.CloseableRandomVectorScorerSupplier; import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; @@ -697,9 +698,13 @@ static int calculateCentroid(MergeState mergeState, FieldInfo fieldInfo, float[] @Override public long ramBytesUsed() { long total = SHALLOW_RAM_BYTES_USED; + // The rawVectorDelegate tracks all vector data for both byte and float32 fields. + // For byte vector fields (which bypass our FieldWriter), this is the only accounting. 
+ // For float32 fields, this covers the flat vector data; our FieldWriter adds the + // quantization-specific overhead (magnitudes, dimensionSums) on top. + total += rawVectorDelegate.ramBytesUsed(); for (FieldWriter field : fields) { - // the field tracks the delegate field usage - total += field.ramBytesUsed(); + total += field.quantizationOverheadBytesUsed(); } return total; } @@ -774,11 +779,22 @@ public float[] copyValue(float[] vectorValue) { throw new UnsupportedOperationException(); } + /** + * Returns the RAM usage of quantization-specific state only (magnitudes, dimensionSums, shallow + * object overhead). The underlying flat vector data is tracked separately by the + * rawVectorDelegate at the writer level to avoid double-counting. + */ + long quantizationOverheadBytesUsed() { + long size = SHALLOW_SIZE; + size += magnitudes.ramBytesUsed(); + size += RamUsageEstimator.sizeOf(dimensionSums); + return size; + } + @Override public long ramBytesUsed() { - long size = SHALLOW_SIZE; + long size = quantizationOverheadBytesUsed(); size += flatFieldVectorsWriter.ramBytesUsed(); - size += magnitudes.ramBytesUsed(); return size; } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java index 8d338bc9d494..a7f3532d9928 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java @@ -444,6 +444,75 @@ public void testWriterRamEstimate() throws Exception { dir.close(); } + @SuppressWarnings("unchecked") + public void testWriterByteVectorRamEstimate() throws Exception { + final FieldInfos fieldInfos = new FieldInfos(new FieldInfo[0]); + final Directory dir = newDirectory(); + Codec codec = Codec.getDefault(); + final SegmentInfo si = + new SegmentInfo( + dir, + 
Version.LATEST, + Version.LATEST, + "0", + 10000, + false, + false, + codec, + Collections.emptyMap(), + StringHelper.randomId(), + new HashMap<>(), + null); + final SegmentWriteState state = + new SegmentWriteState( + InfoStream.getDefault(), dir, si, fieldInfos, null, newIOContext(random())); + final KnnVectorsFormat format = codec.knnVectorsFormat(); + try (KnnVectorsWriter writer = format.fieldsWriter(state)) { + int dim = random().nextInt(64) + 1; + if (dim % 2 == 1) { + ++dim; + } + int numDocs = atLeast(100); + KnnFieldVectorsWriter fieldWriter = + (KnnFieldVectorsWriter) + writer.addField( + new FieldInfo( + "fieldA", + 0, + false, + false, + false, + IndexOptions.NONE, + DocValuesType.NONE, + DocValuesSkipIndexType.NONE, + -1, + Map.of(), + 0, + 0, + 0, + dim, + VectorEncoding.BYTE, + VectorSimilarityFunction.DOT_PRODUCT, + false, + false)); + for (int i = 0; i < numDocs; i++) { + fieldWriter.addValue(i, randomVector8(dim)); + } + // Validate the field-level RAM accounting uses correct byte sizes. + // The reported RAM must be at least the raw byte vector data. + final long fieldRamBytesUsed = fieldWriter.ramBytesUsed(); + final long rawByteVectorData = (long) dim * numDocs * Byte.BYTES; + assertTrue( + "Expected field ramBytesUsed (" + + fieldRamBytesUsed + + ") >= raw byte vector data size (" + + rawByteVectorData + + ")", + fieldRamBytesUsed >= rawByteVectorData); + } + dir.close(); + } + public void testIllegalSimilarityFunctionChangeTwoWriters() throws Exception { try (Directory dir = newDirectory()) { try (IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) {