diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 8d5496b9740a..761533548b03 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -264,6 +264,8 @@ API Changes * GITHUB#15584: Add support for termdoc fields that use custom term freqs (via IndexOptions.DOCS_AND_CUSTOM_FREQS). IndexWriter counts their terms rather than summing their freqs. Use +* GITHUB#15990: Add experimental api to IndexWriter for columnar indexing. + New Features --------------------- diff --git a/lucene/core/src/java/module-info.java b/lucene/core/src/java/module-info.java index 1358b9fe068d..eeed4020e7ae 100644 --- a/lucene/core/src/java/module-info.java +++ b/lucene/core/src/java/module-info.java @@ -76,6 +76,10 @@ exports org.apache.lucene.codecs.hnsw; exports org.apache.lucene.internal.vectorization to org.apache.lucene.benchmark.jmh; + exports org.apache.lucene.document.column; + + opens org.apache.lucene.document.column to + org.apache.lucene.test_framework; provides org.apache.lucene.analysis.TokenizerFactory with org.apache.lucene.analysis.standard.StandardTokenizerFactory; diff --git a/lucene/core/src/java/org/apache/lucene/document/column/BinaryColumn.java b/lucene/core/src/java/org/apache/lucene/document/column/BinaryColumn.java new file mode 100644 index 000000000000..e1a831d742c4 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/document/column/BinaryColumn.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.document.column; + +import org.apache.lucene.document.StoredValue; +import org.apache.lucene.index.IndexableFieldType; + +/** + * A {@link Column} that provides variable-size binary values via a tuple cursor. Used for {@link + * org.apache.lucene.index.DocValuesType#BINARY BINARY}, {@link + * org.apache.lucene.index.DocValuesType#SORTED SORTED}, and {@link + * org.apache.lucene.index.DocValuesType#SORTED_SET SORTED_SET} doc values, and for stored/indexed + * binary or text fields. Values fed to points are passed through unchanged, so callers are + * responsible for producing sort-encoded bytes of the correct total length. + * + *

Numeric doc values ({@link org.apache.lucene.index.DocValuesType#NUMERIC NUMERIC} / {@link + * org.apache.lucene.index.DocValuesType#SORTED_NUMERIC SORTED_NUMERIC}) and 1-D numeric points (int + * / long / float / double) are fed by {@link LongColumn} instead. + * + * @lucene.experimental + */ +public abstract class BinaryColumn extends Column { + + /** Creates a BinaryColumn with the given field name, type, and density. */ + protected BinaryColumn(String name, IndexableFieldType fieldType, Density density) { + super(name, fieldType, density); + } + + /** + * The {@link org.apache.lucene.document.StoredValue.Type} to emit when this column is written to + * stored fields. The default is {@link org.apache.lucene.document.StoredValue.Type#BINARY}. Only + * {@link org.apache.lucene.document.StoredValue.Type#BINARY} and {@link + * org.apache.lucene.document.StoredValue.Type#STRING} are supported; numeric stored types require + * {@link LongColumn}. + */ + public StoredValue.Type storedType() { + return StoredValue.Type.BINARY; + } + + /** Returns a fresh tuple cursor starting at the beginning of the batch. */ + public abstract BinaryTupleCursor tuples(); +} diff --git a/lucene/core/src/java/org/apache/lucene/document/column/BinaryTupleCursor.java b/lucene/core/src/java/org/apache/lucene/document/column/BinaryTupleCursor.java new file mode 100644 index 000000000000..f6fe58aacb27 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/document/column/BinaryTupleCursor.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.document.column; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.util.BytesRef; + +/** + * A tuple cursor over a {@link BinaryColumn}. Yields {@code (docID, binaryValue)} pairs. + * Batch-local doc-ids are returned in non-decreasing order; the same doc-id may repeat for + * multi-valued fields (e.g. {@link org.apache.lucene.index.DocValuesType#SORTED_SET SORTED_SET}). + * + * @lucene.experimental + */ +public abstract class BinaryTupleCursor { + + /** Sole constructor. */ + protected BinaryTupleCursor() {} + + /** + * Advances to the next doc-id that has a value and returns it, or {@link + * DocIdSetIterator#NO_MORE_DOCS} if exhausted. Doc-ids are batch-local (0 to {@code numDocs - + * 1}). + */ + public abstract int nextDoc(); + + /** + * Returns the value at the current cursor position. Only valid until the next call to {@link + * #nextDoc()}, and only after a {@code nextDoc()} that returned a value other than {@link + * DocIdSetIterator#NO_MORE_DOCS}. + */ + public abstract BytesRef binaryValue(); +} diff --git a/lucene/core/src/java/org/apache/lucene/document/column/Column.java b/lucene/core/src/java/org/apache/lucene/document/column/Column.java new file mode 100644 index 000000000000..143a82ed1750 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/document/column/Column.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.document.column; + +import java.util.Objects; +import org.apache.lucene.index.IndexableFieldType; + +/** + * A single field's values across multiple documents in a {@link ColumnBatch}. A Column carries only + * metadata (name, field type, and density); iteration is performed via cursors obtained from {@link + * LongColumn}, {@link BinaryColumn}, or {@link VectorColumn}. + * + *

Each call that requests a cursor returns a fresh cursor positioned at the first value, so + * columns can be consumed multiple times (for example, once in the row-oriented pass for stored + * fields and again in the column-oriented pass for doc values). + * + * @lucene.experimental + */ +public abstract class Column { + + /** + * Whether a column has a value for every document in the batch. This is a contract the column + * asserts up-front so the indexing chain can pick the right code path without probing the data. + */ + public enum Density { + /** The column has a value for every batch-local doc-id in {@code [0, numDocs)}, in order. */ + DENSE, + /** The column may be missing values or have multiple values for some doc-ids. */ + SPARSE, + } + + private final String name; + private final IndexableFieldType fieldType; + private final Density density; + + /** + * Creates a Column with the given field name, type, and density. + * + * @param name the field name + * @param fieldType describes how this field should be indexed + * @param density whether this column has a value for every document in the batch + */ + protected Column(String name, IndexableFieldType fieldType, Density density) { + this.name = Objects.requireNonNull(name, "field name must not be null"); + this.fieldType = Objects.requireNonNull(fieldType, "field type must not be null"); + this.density = Objects.requireNonNull(density, "density must not be null"); + } + + /** Returns the field name. */ + public String name() { + return name; + } + + /** Returns the field type describing how this field is indexed. */ + public IndexableFieldType fieldType() { + return fieldType; + } + + /** Returns the density of this column (whether every doc has a value). 
*/ + public Density density() { + return density; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/document/column/ColumnBatch.java b/lucene/core/src/java/org/apache/lucene/document/column/ColumnBatch.java new file mode 100644 index 000000000000..cd70ebcd8b4e --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/document/column/ColumnBatch.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.document.column; + +/** + * A column-oriented batch of documents for indexing. A Batch contains a collection of {@link + * Column}s, where each Column represents a single field across all documents in the batch. + * Documents are identified by batch-local IDs from 0 (inclusive) to {@link #numDocs()} (exclusive). + * + * @lucene.experimental + */ +public abstract class ColumnBatch { + + /** Sole constructor. (For invocation by subclass constructors, typically implicit.) */ + protected ColumnBatch() {} + + /** + * Returns the number of documents in this batch. All column doc-ids must be in the range [0, + * numDocs()). + */ + public abstract int numDocs(); + + /** + * Returns the columns in this batch. 
Each column represents a single field across the documents + * in the batch. + */ + public abstract Iterable<Column> columns(); +} diff --git a/lucene/core/src/java/org/apache/lucene/document/column/ColumnFieldAdapter.java b/lucene/core/src/java/org/apache/lucene/document/column/ColumnFieldAdapter.java new file mode 100644 index 000000000000..c118081b7a69 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/document/column/ColumnFieldAdapter.java @@ -0,0 +1,202 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.document.column; + +import java.nio.charset.StandardCharsets; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.InvertableType; +import org.apache.lucene.document.StoredValue; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.index.IndexableFieldType; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.NumericUtils; + +/** + * Lightweight adapter that presents a {@link Column}'s current cursor value as an {@link + * IndexableField} so it can be fed through the row-oriented indexing pass (stored fields and term + * inversion). Holds a fresh tuple cursor over the underlying column; one instance is created per + * column per batch. + * + * @lucene.internal + */ +public abstract class ColumnFieldAdapter extends Field { + + ColumnFieldAdapter(String name, IndexableFieldType fieldType) { + super(name, fieldType); + } + + /** Returns an adapter for the given column, dispatching on its concrete type. */ + public static ColumnFieldAdapter create(Column column) { + if (column instanceof LongColumn lc) { + return new LongColumnAdapter(lc); + } else if (column instanceof BinaryColumn bc) { + return new BinaryColumnAdapter(bc); + } else { + throw new IllegalArgumentException("Unknown column type: " + column.getClass().getName()); + } + } + + /** Advances to the next batch-local doc-id with a value. 
*/ + public abstract int nextDoc(); +} + +final class LongColumnAdapter extends ColumnFieldAdapter { + private final LongTupleCursor cursor; + private final StoredValue reusableStoredValue; + private final StoredValue.Type storedType; + + LongColumnAdapter(LongColumn column) { + super(column.name(), column.fieldType()); + this.cursor = column.tuples(); + if (column.fieldType().stored()) { + this.storedType = column.storedType(); + this.reusableStoredValue = newReusableLongStoredValue(storedType); + } else { + this.storedType = null; + this.reusableStoredValue = null; + } + } + + private static StoredValue newReusableLongStoredValue(StoredValue.Type type) { + return switch (type) { + case INTEGER -> new StoredValue(0); + case LONG -> new StoredValue(0L); + case FLOAT -> new StoredValue(0.0f); + case DOUBLE -> new StoredValue(0.0); + case STRING, BINARY, DATA_INPUT -> + throw new AssertionError("rejected by ColumnValidation.validateLongColumn"); + }; + } + + @Override + public int nextDoc() { + return cursor.nextDoc(); + } + + @Override + public Number numericValue() { + return cursor.longValue(); + } + + @Override + public StoredValue storedValue() { + if (reusableStoredValue == null) { + return null; + } + long raw = cursor.longValue(); + switch (storedType) { + case INTEGER -> reusableStoredValue.setIntValue((int) raw); + case LONG -> reusableStoredValue.setLongValue(raw); + case FLOAT -> reusableStoredValue.setFloatValue(NumericUtils.sortableIntToFloat((int) raw)); + case DOUBLE -> reusableStoredValue.setDoubleValue(NumericUtils.sortableLongToDouble(raw)); + case STRING, BINARY, DATA_INPUT -> + throw new IllegalArgumentException("rejected by ColumnValidation.validateLongColumn"); + } + return reusableStoredValue; + } + + @Override + public InvertableType invertableType() { + return null; + } +} + +final class BinaryColumnAdapter extends ColumnFieldAdapter { + private final BinaryTupleCursor cursor; + private final StoredValue reusableStoredValue; + private final 
StoredValue.Type storedType; + private final boolean tokenized; + private final boolean indexed; + + BinaryColumnAdapter(BinaryColumn column) { + super(column.name(), column.fieldType()); + this.cursor = column.tuples(); + this.tokenized = column.fieldType().tokenized(); + this.indexed = column.fieldType().indexOptions() != IndexOptions.NONE; + if (column.fieldType().stored()) { + this.storedType = column.storedType(); + this.reusableStoredValue = newReusableStoredValue(storedType); + } else { + this.storedType = null; + this.reusableStoredValue = null; + } + } + + private static StoredValue newReusableStoredValue(StoredValue.Type type) { + return switch (type) { + case STRING -> new StoredValue(""); + case BINARY -> new StoredValue(new BytesRef()); + case INTEGER, LONG, FLOAT, DOUBLE, DATA_INPUT -> + throw new IllegalArgumentException("rejected by ColumnValidation.validateBinaryColumn"); + }; + } + + @Override + public int nextDoc() { + return cursor.nextDoc(); + } + + @Override + public BytesRef binaryValue() { + return cursor.binaryValue(); + } + + @Override + public String stringValue() { + if (tokenized) { + BytesRef ref = cursor.binaryValue(); + return new String(ref.bytes, ref.offset, ref.length, StandardCharsets.UTF_8); + } + return null; + } + + @Override + public StoredValue storedValue() { + if (reusableStoredValue == null) { + return null; + } + BytesRef value = cursor.binaryValue(); + switch (storedType) { + case STRING -> + reusableStoredValue.setStringValue( + new String(value.bytes, value.offset, value.length, StandardCharsets.UTF_8)); + case BINARY -> reusableStoredValue.setBinaryValue(value); + case INTEGER, LONG, FLOAT, DOUBLE, DATA_INPUT -> + throw new IllegalArgumentException("rejected by ColumnValidation.validateBinaryColumn"); + } + return reusableStoredValue; + } + + @Override + public InvertableType invertableType() { + if (indexed == false) { + return null; + } + return tokenized ? 
InvertableType.TOKEN_STREAM : InvertableType.BINARY; + } + + @Override + public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) { + if (tokenized) { + return analyzer.tokenStream(name(), stringValue()); + } + return null; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/document/column/ColumnValidation.java b/lucene/core/src/java/org/apache/lucene/document/column/ColumnValidation.java new file mode 100644 index 000000000000..6c1584f87ce1 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/document/column/ColumnValidation.java @@ -0,0 +1,215 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.document.column; + +import org.apache.lucene.document.StoredValue; +import org.apache.lucene.index.DocValuesType; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.IndexableFieldType; + +/** + * Static validation and bounds-checking helpers for the columnar indexing path. These helpers are + * pure functions over the public column API and {@link IndexableFieldType}; they do not touch any + * indexing-chain state. 
+ * + * @lucene.internal + */ +public final class ColumnValidation { + + private ColumnValidation() {} + + /** + * Throws {@link IllegalArgumentException} if {@code fieldType} declares no indexing feature (no + * doc values, no points, not stored, no index options, no vectors). + */ + public static void validateColumnHasIndexingFeature( + String fieldName, IndexableFieldType fieldType) { + if (fieldType.docValuesType() == DocValuesType.NONE + && fieldType.pointDimensionCount() == 0 + && fieldType.stored() == false + && fieldType.indexOptions() == IndexOptions.NONE + && fieldType.vectorDimension() == 0) { + throw new IllegalArgumentException( + "Column \"" + + fieldName + + "\" must have a non-NONE docValuesType, point dimensions, be stored," + + " have index options, or have vector dimensions"); + } + } + + /** Validates a {@link LongColumn} against the field type it will feed. */ + public static void validateLongColumn(LongColumn column, IndexableFieldType fieldType) { + final int pointDims = fieldType.pointDimensionCount(); + if (pointDims != 0) { + if (pointDims != 1) { + throw new IllegalArgumentException( + "LongColumn \"" + + column.name() + + "\" only supports 1-dimensional point fields, got pointDimensionCount=" + + pointDims); + } + final int expectedPointBytes = + (column.numericKind() == LongColumn.NumericKind.INT + || column.numericKind() == LongColumn.NumericKind.FLOAT) + ? Integer.BYTES + : Long.BYTES; + if (fieldType.pointNumBytes() != expectedPointBytes) { + throw new IllegalArgumentException( + "LongColumn \"" + + column.name() + + "\" numericKind=" + + column.numericKind() + + " requires pointNumBytes=" + + expectedPointBytes + + ", got " + + fieldType.pointNumBytes()); + } + } + if (fieldType.stored()) { + final StoredValue.Type storedType = column.storedType(); + switch (storedType) { + case INTEGER, LONG, FLOAT, DOUBLE -> { + // OK. 
+ } + case STRING, BINARY -> + throw new IllegalArgumentException( + "LongColumn \"" + + column.name() + + "\" storedType=" + + storedType + + " is not supported; use a BinaryColumn for non-numeric stored data"); + case DATA_INPUT -> + throw new IllegalArgumentException( + "LongColumn \"" + + column.name() + + "\" storedType DATA_INPUT is not supported for columns"); + } + } + } + + /** Validates a {@link BinaryColumn} against the field type it will feed. */ + public static void validateBinaryColumn(BinaryColumn column, IndexableFieldType fieldType) { + final DocValuesType dvType = fieldType.docValuesType(); + if (dvType == DocValuesType.NUMERIC || dvType == DocValuesType.SORTED_NUMERIC) { + throw new IllegalArgumentException( + "BinaryColumn \"" + + column.name() + + "\" cannot feed docValuesType=" + + dvType + + "; use a LongColumn"); + } + if (fieldType.stored()) { + final StoredValue.Type storedType = column.storedType(); + switch (storedType) { + case BINARY, STRING -> { + // OK. + } + case INTEGER, LONG, FLOAT, DOUBLE -> + throw new IllegalArgumentException( + "BinaryColumn \"" + + column.name() + + "\" storedType=" + + storedType + + " is not supported; use a LongColumn for numeric stored data"); + case DATA_INPUT -> + throw new IllegalArgumentException( + "BinaryColumn \"" + + column.name() + + "\" storedType DATA_INPUT is not supported for columns"); + } + } + } + + /** Validates a {@link VectorColumn} against the field type it will feed. 
*/ + public static void validateVectorColumn(VectorColumn column, IndexableFieldType fieldType) { + if (fieldType.vectorDimension() <= 0) { + throw new IllegalArgumentException( + "VectorColumn \"" + + column.name() + + "\" requires fieldType.vectorDimension() > 0; got " + + fieldType.vectorDimension()); + } + if (fieldType.docValuesType() != DocValuesType.NONE + || fieldType.pointDimensionCount() != 0 + || fieldType.stored() + || fieldType.indexOptions() != IndexOptions.NONE) { + throw new IllegalArgumentException( + "VectorColumn \"" + + column.name() + + "\" must be vector-only: docValuesType=NONE, pointDimensionCount=0," + + " stored=false, indexOptions=NONE"); + } + } + + /** Throws if {@code batchDocID} is outside {@code [0, numDocs)}. */ + public static void checkDocID(Column column, int batchDocID, int numDocs) { + if (batchDocID < 0 || batchDocID >= numDocs) { + throw new IllegalArgumentException( + "Column \"" + + column.name() + + "\" returned batch doc-id " + + batchDocID + + " which is out of range [0, " + + numDocs + + ")"); + } + } + + /** Throws if a dense column did not produce exactly {@code numDocs} values. */ + public static void checkDenseCount(Column column, int consumed, int numDocs) { + if (consumed != numDocs) { + throw new IllegalArgumentException( + "Dense column \"" + + column.name() + + "\" provided " + + consumed + + " values but batch has " + + numDocs + + " documents"); + } + } + + /** Throws if a vector cursor doc-id is not strictly greater than the previous one. */ + public static void checkVectorDocIDStrictlyIncreasing( + VectorColumn column, int batchDocID, int prevBatchDocID) { + if (batchDocID <= prevBatchDocID) { + throw new IllegalArgumentException( + "VectorColumn \"" + + column.name() + + "\" must yield strictly increasing batch doc-ids; got " + + batchDocID + + " after " + + prevBatchDocID); + } + } + + /** Throws if a vector value's length does not match the field's declared dimension. 
*/ + public static void checkVectorDimension( + VectorColumn column, int actual, int expected, int batchDocID) { + if (actual != expected) { + throw new IllegalArgumentException( + "VectorColumn \"" + + column.name() + + "\" expected dimension " + + expected + + " but got vector of length " + + actual + + " at batch doc " + + batchDocID); + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/document/column/LongColumn.java b/lucene/core/src/java/org/apache/lucene/document/column/LongColumn.java new file mode 100644 index 000000000000..732a89cc8e3e --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/document/column/LongColumn.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.document.column; + +import java.util.Objects; +import org.apache.lucene.document.StoredValue; +import org.apache.lucene.index.IndexableFieldType; + +/** + * A {@link Column} that provides long values. Used for {@link + * org.apache.lucene.index.DocValuesType#NUMERIC NUMERIC} and {@link + * org.apache.lucene.index.DocValuesType#SORTED_NUMERIC SORTED_NUMERIC} doc values and for stored + * numeric fields. + * + *

Iteration is performed via cursors. {@link #tuples()} is always available and yields {@code + * (docID, longValue)} pairs. {@link #values()} is a bulk cursor over consecutive doc-ids; it must + * be overridden when {@link #density()} is {@link Column.Density#DENSE DENSE} and is only consulted + * in that case. + * + *

{@link #numericKind()} marks how the long bits should be interpreted. Defaults to {@link + * NumericKind#LONG LONG}; pass {@link NumericKind#INT INT} (low 32 bits, sign-extended), {@link + * NumericKind#FLOAT FLOAT} (low 32 bits encoded via {@link + * org.apache.lucene.util.NumericUtils#floatToSortableInt}), or {@link NumericKind#DOUBLE DOUBLE} + * (full 64 bits encoded via {@link org.apache.lucene.util.NumericUtils#doubleToSortableLong}) to + * the constructor to select another interpretation. Callers are responsible for producing the + * sortable encoding; doc values writes the long unchanged, points consumes it as sortable bytes, + * and stored fields round-trips it back to {@code float}/{@code double} via {@link + * org.apache.lucene.util.NumericUtils#sortableIntToFloat} / {@link + * org.apache.lucene.util.NumericUtils#sortableLongToDouble}. The numeric kind drives the default + * {@link #storedType()}. + * + * @lucene.experimental + */ +public abstract class LongColumn extends Column { + + /** The numeric interpretation of the column's long values. */ + public enum NumericKind { + /** Type of integer values. */ + INT, + /** Type of long values. */ + LONG, + /** Type of float values. */ + FLOAT, + /** Type of double values. */ + DOUBLE, + } + + private final NumericKind numericKind; + + /** Creates a LongColumn with {@link NumericKind#LONG}. */ + protected LongColumn(String name, IndexableFieldType fieldType, Density density) { + this(name, fieldType, density, NumericKind.LONG); + } + + /** Creates a LongColumn with the given numeric interpretation. */ + protected LongColumn( + String name, IndexableFieldType fieldType, Density density, NumericKind numericKind) { + super(name, fieldType, density); + this.numericKind = Objects.requireNonNull(numericKind, "numericKind must not be null"); + } + + /** Returns a fresh tuple cursor starting at the beginning of the batch. 
*/ + public abstract LongTupleCursor tuples(); + + /** + * Returns a fresh values cursor iterating dense long values for doc-ids {@code [0, numDocs)}. + * Must be overridden when {@link Column#density()} is {@link Column.Density#DENSE DENSE}; the + * default implementation throws {@link UnsupportedOperationException} and is never called for + * {@link Column.Density#SPARSE SPARSE} columns. + */ + public LongValuesCursor values() { + throw new UnsupportedOperationException( + "values() requires density() == DENSE for column \"" + name() + "\""); + } + + /** The numeric interpretation of the column's long values. */ + public final NumericKind numericKind() { + return numericKind; + } + + /** + * The stored-field variant emitted for this column. The default derives from {@link + * #numericKind()} — {@code INT→INTEGER}, {@code LONG→LONG}, {@code FLOAT→FLOAT}, {@code + * DOUBLE→DOUBLE} — so a caller that wants the natural numeric variant does not need to override + * this method. Only numeric {@link org.apache.lucene.document.StoredValue.Type} values are + * permitted; non-numeric stored data should use a {@link BinaryColumn}. + */ + public StoredValue.Type storedType() { + return switch (numericKind) { + case INT -> StoredValue.Type.INTEGER; + case LONG -> StoredValue.Type.LONG; + case FLOAT -> StoredValue.Type.FLOAT; + case DOUBLE -> StoredValue.Type.DOUBLE; + }; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/document/column/LongTupleCursor.java b/lucene/core/src/java/org/apache/lucene/document/column/LongTupleCursor.java new file mode 100644 index 000000000000..25dee6aa7742 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/document/column/LongTupleCursor.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.document.column; + +import org.apache.lucene.search.DocIdSetIterator; + +/** + * A tuple cursor over a {@link LongColumn}. Yields {@code (docID, longValue)} pairs. Batch-local + * doc-ids are returned in non-decreasing order; the same doc-id may repeat for multi-valued fields + * (e.g. {@link org.apache.lucene.index.DocValuesType#SORTED_NUMERIC SORTED_NUMERIC}). + * + * @lucene.experimental + */ +public abstract class LongTupleCursor { + + /** Sole constructor. */ + protected LongTupleCursor() {} + + /** + * Advances to the next doc-id that has a value and returns it, or {@link + * DocIdSetIterator#NO_MORE_DOCS} if exhausted. Doc-ids are batch-local (0 to {@code numDocs - + * 1}). + */ + public abstract int nextDoc(); + + /** + * Returns the value at the current cursor position. Only valid after a successful {@link + * #nextDoc()} call that returned a value other than {@link DocIdSetIterator#NO_MORE_DOCS}. 
+ */ + public abstract long longValue(); +} diff --git a/lucene/core/src/java/org/apache/lucene/document/column/LongValuesCursor.java b/lucene/core/src/java/org/apache/lucene/document/column/LongValuesCursor.java new file mode 100644 index 000000000000..34aefeb5fdb9 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/document/column/LongValuesCursor.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.document.column; + +/** + * A values cursor over a dense {@link LongColumn}. The cursor produces exactly {@link #size()} + * values for consecutive batch-local doc-ids starting at 0, one per call to {@link #nextLong()}. + * + *
<p>
Implementations must throw an exception if {@link #nextLong()} is called more than {@link + * #size()} times. + * + * @lucene.experimental + */ +public abstract class LongValuesCursor { + + /** Sole constructor. */ + protected LongValuesCursor() {} + + /** Total number of values this cursor will produce. */ + public abstract int size(); + + /** Returns the next long value. Must not be called more than {@link #size()} times. */ + public abstract long nextLong(); + + /** + * Bulk-fill {@code length} values into {@code dst} starting at {@code offset}, advancing the + * cursor by {@code length}. Combined {@link #nextLong()} and {@code fill} calls must not consume + * more than {@link #size()} values; implementations must throw if they do. + * + *
<p>
The default implementation calls {@link #nextLong()} in a loop. Override to provide a more + * efficient bulk fill (for example a {@link System#arraycopy} from a backing array). + */ + public void fill(long[] dst, int offset, int length) { + for (int i = 0; i < length; i++) { + dst[offset + i] = nextLong(); + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/document/column/VectorColumn.java b/lucene/core/src/java/org/apache/lucene/document/column/VectorColumn.java new file mode 100644 index 000000000000..1df27fd02a38 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/document/column/VectorColumn.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.document.column; + +import org.apache.lucene.index.IndexableFieldType; +import org.apache.lucene.index.VectorEncoding; + +/** + * A {@link Column} that provides KNN vector values via a tuple cursor. Vector columns are + * vector-only: the field type must declare {@code vectorDimension() > 0}, and must not also set doc + * values, points, stored, or {@code indexOptions}. Vectors are single-valued, so the cursor yields + * strictly increasing batch-local doc-ids. + * + *
<p>
The type parameter {@code T} must match {@link IndexableFieldType#vectorEncoding()}: {@code + * float[]} for {@link VectorEncoding#FLOAT32 FLOAT32} and {@code byte[]} for {@link + * VectorEncoding#BYTE BYTE}. A mismatch is reported as a {@link ClassCastException} when values are + * consumed during indexing. + * + *
<p>
{@link Column.Density#DENSE DENSE} indicates that every batch-local doc has a vector; {@link + * Column.Density#SPARSE SPARSE} allows gaps. Both densities use the same tuple cursor — there is no + * dense bulk-fill fast path for vectors. + * + * @param the vector array type, either {@code float[]} or {@code byte[]} + * @lucene.experimental + */ +public abstract class VectorColumn extends Column { + + /** + * Creates a VectorColumn with the given field name, type, and density. + * + * @throws IllegalArgumentException if {@code fieldType.vectorDimension() <= 0} + */ + protected VectorColumn(String name, IndexableFieldType fieldType, Density density) { + super(name, fieldType, density); + if (fieldType.vectorDimension() <= 0) { + throw new IllegalArgumentException( + "VectorColumn \"" + + name + + "\" requires fieldType.vectorDimension() > 0; got " + + fieldType.vectorDimension()); + } + } + + /** Returns a fresh tuple cursor starting at the beginning of the batch. */ + public abstract VectorTupleCursor tuples(); +} diff --git a/lucene/core/src/java/org/apache/lucene/document/column/VectorTupleCursor.java b/lucene/core/src/java/org/apache/lucene/document/column/VectorTupleCursor.java new file mode 100644 index 000000000000..d05b8d47ff69 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/document/column/VectorTupleCursor.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.document.column; + +import org.apache.lucene.search.DocIdSetIterator; + +/** + * A tuple cursor over a {@link VectorColumn}. Yields {@code (docID, vectorValue)} pairs. + * Batch-local doc-ids are returned in strictly increasing order (vectors are single-valued). + * + * @param the vector array type, either {@code float[]} or {@code byte[]} + * @lucene.experimental + */ +public abstract class VectorTupleCursor { + + /** Sole constructor. */ + protected VectorTupleCursor() {} + + /** + * Advances to the next doc-id that has a vector and returns it, or {@link + * DocIdSetIterator#NO_MORE_DOCS} if exhausted. Doc-ids are batch-local (0 to {@code numDocs - 1}) + * and strictly increasing. + */ + public abstract int nextDoc(); + + /** + * Returns the vector at the current cursor position. The returned array may be reused by the + * cursor on subsequent calls to {@link #nextDoc()} — the indexing chain copies the value before + * advancing. Only valid after a {@code nextDoc()} that returned a value other than {@link + * DocIdSetIterator#NO_MORE_DOCS}. 
+ */ + public abstract T vectorValue(); +} diff --git a/lucene/core/src/java/org/apache/lucene/index/DocsWithFieldSet.java b/lucene/core/src/java/org/apache/lucene/index/DocsWithFieldSet.java index 8c8b0cac26f6..a7d70b23ace7 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocsWithFieldSet.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocsWithFieldSet.java @@ -61,6 +61,31 @@ public void add(int docID) { cardinality++; } + /** + * Add a contiguous range of document IDs to the set. + * + * @param from first document ID (inclusive) + * @param toExclusive one past the last document ID (exclusive) + */ + public void addRange(int from, int toExclusive) { + if (from <= lastDocId) { + throw new IllegalArgumentException( + "Out of order doc ids: last=" + lastDocId + ", next=" + from); + } + int count = toExclusive - from; + if (set != null) { + set = FixedBitSet.ensureCapacity(set, toExclusive - 1); + set.set(from, toExclusive); + } else if (from != cardinality) { + // migrate to a sparse encoding using a bit set + set = new FixedBitSet(toExclusive); + set.set(0, cardinality); + set.set(from, toExclusive); + } + lastDocId = toExclusive - 1; + cardinality += count; + } + @Override public long ramBytesUsed() { return BASE_RAM_BYTES_USED + (set == null ? 
0 : set.ramBytesUsed()); diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriter.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriter.java index 3414b882f89f..c6cf1ee8dc2b 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriter.java @@ -27,6 +27,7 @@ import java.util.concurrent.atomic.AtomicLong; import java.util.function.Supplier; import java.util.function.ToLongFunction; +import org.apache.lucene.document.column.ColumnBatch; import org.apache.lucene.index.DocumentsWriterPerThread.FlushedSegment; import org.apache.lucene.search.Query; import org.apache.lucene.store.AlreadyClosedException; @@ -452,6 +453,45 @@ long updateDocuments( return seqNo; } + long updateBatch(final ColumnBatch columnBatch, final DocumentsWriterDeleteQueue.Node delNode) + throws IOException { + boolean hasEvents = preUpdate(); + + final DocumentsWriterPerThread dwpt = flushControl.obtainAndLock(); + final DocumentsWriterPerThread flushingDWPT; + long seqNo; + + try { + // This must happen after we've pulled the DWPT because IW.close + // waits for all DWPT to be released: + ensureOpen(); + try { + seqNo = + dwpt.updateBatch( + columnBatch, delNode, flushNotifications, numDocsInRAM::incrementAndGet); + } finally { + if (dwpt.isAborted()) { + flushControl.doOnAbort(dwpt); + } + } + flushingDWPT = flushControl.doAfterDocument(dwpt); + } finally { + synchronized (flushControl) { + if (dwpt.isFlushPending() || dwpt.isAborted() || dwpt.isQueueAdvanced()) { + dwpt.unlock(); + } else { + perThreadPool.marksAsFreeAndUnlock(dwpt); + } + } + assert dwpt.isHeldByCurrentThread() == false : "we didn't release the dwpt even on abort"; + } + + if (postUpdate(flushingDWPT, hasEvents)) { + seqNo = -seqNo; + } + return seqNo; + } + private boolean maybeFlush() throws IOException { final DocumentsWriterPerThread flushingDWPT = flushControl.nextPendingFlush(); if (flushingDWPT != null) { diff 
--git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java index e4cd3f04328c..2324dd6094f2 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java @@ -32,6 +32,7 @@ import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReentrantLock; import org.apache.lucene.codecs.Codec; +import org.apache.lucene.document.column.ColumnBatch; import org.apache.lucene.index.DocumentsWriterDeleteQueue.DeleteSlice; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.store.Directory; @@ -284,6 +285,60 @@ long updateDocuments( } } + long updateBatch( + ColumnBatch columnBatch, + DocumentsWriterDeleteQueue.Node deleteNode, + DocumentsWriter.FlushNotifications flushNotifications, + Runnable onNewDocOnRAM) + throws IOException { + try { + testPoint("DocumentsWriterPerThread addBatch start"); + assert abortingException == null : "DWPT has hit aborting exception but is still indexing"; + if (INFO_VERBOSE && infoStream.isEnabled("DWPT")) { + infoStream.message( + "DWPT", + Thread.currentThread().getName() + + " update batch" + + " docID=" + + numDocsInRAM + + " seg=" + + segmentInfo.name); + } + final int docsInRamBefore = numDocsInRAM; + final int numDocs = columnBatch.numDocs(); + boolean allDocsIndexed = false; + try { + // Reserve all doc IDs upfront and account for them in numDocsInRAM immediately, + // so that deleteLastDocs in the finally block can correctly clean up on failure. + // Even on exception, the documents are still added (but marked deleted), matching + // the document path semantics. 
+ for (int i = 0; i < numDocs; i++) { + reserveOneDoc(); + } + numDocsInRAM += numDocs; + for (int i = 0; i < numDocs; i++) { + onNewDocOnRAM.run(); + } + + indexingChain.processBatch(docsInRamBefore, columnBatch); + + if (numDocs > 1) { + segmentInfo.setHasBlocks(); + } + allDocsIndexed = true; + return finishDocuments(deleteNode, docsInRamBefore); + } finally { + if (!allDocsIndexed && !aborted) { + // the iterator threw an exception that is not aborting + // go and mark all docs from this block as deleted + deleteLastDocs(numDocsInRAM - docsInRamBefore); + } + } + } finally { + maybeAbort("updateBatch", flushNotifications); + } + } + private long finishDocuments(DocumentsWriterDeleteQueue.Node deleteNode, int docIdUpTo) { /* * here we actually finish the document in two steps 1. push the delete into diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java index 30ff07c3932f..79bada22d57b 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java @@ -55,6 +55,7 @@ import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.FieldInfosFormat; import org.apache.lucene.document.Field; +import org.apache.lucene.document.column.ColumnBatch; import org.apache.lucene.index.DocValuesUpdate.BinaryDocValuesUpdate; import org.apache.lucene.index.DocValuesUpdate.NumericDocValuesUpdate; import org.apache.lucene.index.FieldInfos.FieldNumbers; @@ -1523,6 +1524,41 @@ public long addDocuments(Iterable> return updateDocuments((DocumentsWriterDeleteQueue.Node) null, docs); } + /** + * Adds a batch of documents in column-oriented format. The batch's columns are processed + * field-by-field rather than document-by-document. 
+ * + * @param columnBatch the column-oriented batch of documents to add + * @return The sequence number for this operation + * @throws IOException if there is a low-level IO error + * @lucene.experimental + */ + public long addBatch(ColumnBatch columnBatch) throws IOException { + return updateBatch(null, columnBatch); + } + + private long updateBatch( + final DocumentsWriterDeleteQueue.Node delNode, ColumnBatch columnBatch) + throws IOException { + ensureOpen(); + boolean success = false; + try { + final long seqNo = maybeProcessEvents(docWriter.updateBatch(columnBatch, delNode)); + success = true; + return seqNo; + } catch (Error tragedy) { + tragicEvent(tragedy, "updateBatch"); + throw tragedy; + } finally { + if (success == false) { + if (infoStream.isEnabled("IW")) { + infoStream.message("IW", "hit exception adding batch"); + } + maybeCloseOnTragicEvent(); + } + } + } + /** * Atomically deletes documents matching the provided delTerm and adds a block of documents with * sequentially assigned document IDs, such that an external reader will see all or none of the diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java index d62100635415..1db2a8f2567b 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java @@ -43,6 +43,17 @@ import org.apache.lucene.document.KnnFloatVectorField; import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.document.StoredValue; +import org.apache.lucene.document.column.BinaryColumn; +import org.apache.lucene.document.column.BinaryTupleCursor; +import org.apache.lucene.document.column.Column; +import org.apache.lucene.document.column.ColumnBatch; +import org.apache.lucene.document.column.ColumnFieldAdapter; +import org.apache.lucene.document.column.ColumnValidation; +import org.apache.lucene.document.column.LongColumn; +import 
org.apache.lucene.document.column.LongTupleCursor; +import org.apache.lucene.document.column.LongValuesCursor; +import org.apache.lucene.document.column.VectorColumn; +import org.apache.lucene.document.column.VectorTupleCursor; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; @@ -59,6 +70,7 @@ import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.InfoStream; import org.apache.lucene.util.IntBlockPool; +import org.apache.lucene.util.NumericUtils; import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.Version; @@ -681,6 +693,435 @@ private void oversizeDocFields() { docFields = newDocFields; } + /** + * Process a column-oriented batch of documents. Iterates the batch's columns, validates each + * column's field type, and feeds values to the appropriate DocValuesWriter. + * + * @param baseDocID the segment-level doc ID for the first document in the batch (batch-local doc + * 0 maps to this value) + * @param columnBatch the column-oriented batch + */ + void processBatch(int baseDocID, ColumnBatch columnBatch) throws IOException { + final int numDocs = columnBatch.numDocs(); + boolean hasRowColumns = false; + + // First pass: validate all column schemas and initialize field infos + for (Column column : columnBatch.columns()) { + final String fieldName = column.name(); + final IndexableFieldType fieldType = column.fieldType(); + + ColumnValidation.validateColumnHasIndexingFeature(fieldName, fieldType); + + if (column instanceof BinaryColumn bc) { + ColumnValidation.validateBinaryColumn(bc, fieldType); + } else if (column instanceof LongColumn lc) { + ColumnValidation.validateLongColumn(lc, fieldType); + } else if (column instanceof VectorColumn vc) { + ColumnValidation.validateVectorColumn(vc, fieldType); + } + + if (fieldType.stored() || fieldType.indexOptions() != IndexOptions.NONE) { + hasRowColumns = true; + } + + PerField pf = 
getOrAddPerField(fieldName); + validateColumnSchema(fieldName, pf, fieldType); + } + + // Index the parent field for every document (each batch doc is an individual document, + // not part of a block, so every doc is its own parent). + if (parentPf != null) { + if (parentPf.fieldInfo == null) { + initializeFieldInfo(parentPf); + parentPf.trySetValidatedFrozenFieldType(); + } + final NumericDocValuesWriter parentWriter = (NumericDocValuesWriter) parentPf.docValuesWriter; + final long value = parentField.numericValue().longValue(); + for (int i = 0; i < numDocs; i++) { + parentWriter.addValue(baseDocID + i, value); + } + } + + // Row-oriented pass: stored fields and term inversion only. Uses fresh tuple cursors. + if (hasRowColumns) { + processRowColumns(baseDocID, numDocs, columnBatch.columns()); + } + + // Column-oriented pass: doc values, points, and vectors. Each column is asked for a fresh + // cursor. + for (Column column : columnBatch.columns()) { + final IndexableFieldType fieldType = column.fieldType(); + if (fieldType.docValuesType() == DocValuesType.NONE + && fieldType.pointDimensionCount() == 0 + && fieldType.vectorDimension() == 0) { + continue; // no column-oriented features + } + PerField pf = getOrAddPerField(column.name()); + + switch (column) { + case LongColumn longCol -> processLongColumn(baseDocID, numDocs, longCol, pf, fieldType); + case BinaryColumn binaryCol -> + processBinaryColumn(baseDocID, numDocs, binaryCol, pf, fieldType); + case VectorColumn vectorCol -> + processVectorColumn(baseDocID, numDocs, vectorCol, pf, fieldType); + default -> + throw new IllegalArgumentException( + "Unknown column type: " + column.getClass().getName()); + } + } + } + + /** + * Processes row-oriented features (stored fields and term inversion) for columns that have stored + * or indexed fields. 
The outer loop iterates every batch-local doc-id in {@code [0, numDocs)} so + * every reserved doc is framed with {@code startStoredFields}/{@code termsHash.startDocument}, + * matching the single-doc indexing path. For each doc, row columns are consumed while their + * cursor head equals the current doc. Doc values and points are handled separately in the + * column-oriented pass. + */ + private void processRowColumns(int baseDocID, int numDocs, Iterable columns) + throws IOException { + // Collect row-oriented columns. Per-field PerFields are cached in the shared docFields array + // (also used by processDocument) to avoid a per-batch allocation; adapters and cursor heads + // are local since they're column-specific. + int numRowCols = 0; + ColumnFieldAdapter[] adapters = new ColumnFieldAdapter[4]; + int[] heads = new int[4]; + boolean hasInverted = false; + + for (Column column : columns) { + IndexableFieldType fieldType = column.fieldType(); + if (fieldType.stored() == false && fieldType.indexOptions() == IndexOptions.NONE) { + continue; + } + if (numRowCols >= adapters.length) { + adapters = ArrayUtil.grow(adapters, numRowCols + 1); + heads = ArrayUtil.grow(heads, numRowCols + 1); + } + if (numRowCols >= docFields.length) { + oversizeDocFields(); + } + ColumnFieldAdapter adapter = ColumnFieldAdapter.create(column); + adapters[numRowCols] = adapter; + docFields[numRowCols] = getOrAddPerField(column.name()); + heads[numRowCols] = adapter.nextDoc(); + if (fieldType.indexOptions() != IndexOptions.NONE) { + hasInverted = true; + } + numRowCols++; + } + + // Row-dense outer loop: frame every doc in [0, numDocs). Column cursors stay sparse, but the + // per-doc framing is fixed so stored fields and termsHash stay aligned with the reserved doc + // ids even for docs that have no row-oriented values. 
+ for (int batchDocID = 0; batchDocID < numDocs; batchDocID++) { + int segDocID = baseDocID + batchDocID; + long fieldGen = nextFieldGen++; + int indexedFieldCount = 0; + + if (hasInverted) { + termsHash.startDocument(); + } + startStoredFields(segDocID); + try { + for (int i = 0; i < numRowCols; i++) { + int head = heads[i]; + if (head != DocIdSetIterator.NO_MORE_DOCS && head < batchDocID) { + throw new IllegalArgumentException( + "Row column \"" + + adapters[i].name() + + "\" returned out-of-order batch doc-id " + + head); + } + while (head == batchDocID) { + PerField pf = docFields[i]; + if (pf.fieldGen != fieldGen) { + pf.fieldGen = fieldGen; + pf.reset(segDocID, adapters[i].fieldType()); + } + if (invertAndStore(segDocID, adapters[i], pf)) { + fields[indexedFieldCount] = pf; + indexedFieldCount++; + } + head = adapters[i].nextDoc(); + } + heads[i] = head; + } + } finally { + if (hasHitAbortingException == false) { + for (int i = 0; i < indexedFieldCount; i++) { + fields[i].finish(segDocID); + } + finishStoredFields(); + if (hasInverted) { + try { + termsHash.finishDocument(segDocID); + } catch (Throwable th) { + abortingExceptionConsumer.accept(th); + throw th; + } + } + } + } + } + + // Any remaining cursor head after the outer loop is a doc-id >= numDocs. 
+ for (int i = 0; i < numRowCols; i++) { + if (heads[i] != DocIdSetIterator.NO_MORE_DOCS) { + throw new IllegalArgumentException( + "Row column \"" + + adapters[i].name() + + "\" returned batch doc-id " + + heads[i] + + " which is out of range [0, " + + numDocs + + ")"); + } + } + } + + private void validateColumnSchema(String fieldName, PerField pf, IndexableFieldType fieldType) + throws IOException { + updateDocFieldSchema(fieldName, pf.schema, fieldType); + if (pf.fieldInfo == null) { + initializeFieldInfo(pf); + pf.trySetValidatedFrozenFieldType(); + } else { + pf.schema.assertSameSchema(pf.fieldInfo); + } + } + + private static void processLongColumn( + int baseDocID, int numDocs, LongColumn column, PerField pf, IndexableFieldType fieldType) + throws IOException { + final DocValuesType dvType = fieldType.docValuesType(); + final boolean hasPoints = fieldType.pointDimensionCount() != 0; + + // DV-only path (no points): the bulk dense path remains available. + // TODO: can support dense fast path for points + if (hasPoints == false) { + if (column.density() == Column.Density.DENSE) { + processDenseLongColumn(baseDocID, numDocs, column, column.values(), pf, dvType); + return; + } + LongTupleCursor cursor = column.tuples(); + switch (dvType) { + case NUMERIC -> { + NumericDocValuesWriter writer = (NumericDocValuesWriter) pf.docValuesWriter; + int batchDocID; + while ((batchDocID = cursor.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + ColumnValidation.checkDocID(column, batchDocID, numDocs); + writer.addValue(baseDocID + batchDocID, cursor.longValue()); + } + } + case SORTED_NUMERIC -> { + SortedNumericDocValuesWriter writer = (SortedNumericDocValuesWriter) pf.docValuesWriter; + int batchDocID; + while ((batchDocID = cursor.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + ColumnValidation.checkDocID(column, batchDocID, numDocs); + writer.addValue(baseDocID + batchDocID, cursor.longValue()); + } + } + // $CASES-OMITTED$ + default -> + throw new 
IllegalArgumentException( + "LongColumn \"" + column.name() + "\" has incompatible docValuesType: " + dvType); + } + return; + } + + // Points (+ optional numeric DV). Always uses the tuple cursor. + final LongColumn.NumericKind kind = column.numericKind(); + final int byteWidth = + (kind == LongColumn.NumericKind.INT || kind == LongColumn.NumericKind.FLOAT) + ? Integer.BYTES + : Long.BYTES; + final byte[] pointScratch = new byte[byteWidth]; + final BytesRef pointBytesRef = new BytesRef(pointScratch); + final PointValuesWriter pointWriter = pf.pointValuesWriter; + final LongTupleCursor cursor = column.tuples(); + + switch (dvType) { + case NONE -> { + int batchDocID; + while ((batchDocID = cursor.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + ColumnValidation.checkDocID(column, batchDocID, numDocs); + encodeSortablePointBytes(cursor.longValue(), kind, pointScratch); + pointWriter.addPackedValue(baseDocID + batchDocID, pointBytesRef); + } + } + case NUMERIC -> { + NumericDocValuesWriter dvWriter = (NumericDocValuesWriter) pf.docValuesWriter; + int batchDocID; + while ((batchDocID = cursor.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + ColumnValidation.checkDocID(column, batchDocID, numDocs); + int segDocID = baseDocID + batchDocID; + long raw = cursor.longValue(); + dvWriter.addValue(segDocID, raw); + encodeSortablePointBytes(raw, kind, pointScratch); + pointWriter.addPackedValue(segDocID, pointBytesRef); + } + } + case SORTED_NUMERIC -> { + SortedNumericDocValuesWriter dvWriter = (SortedNumericDocValuesWriter) pf.docValuesWriter; + int batchDocID; + while ((batchDocID = cursor.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + ColumnValidation.checkDocID(column, batchDocID, numDocs); + int segDocID = baseDocID + batchDocID; + long raw = cursor.longValue(); + dvWriter.addValue(segDocID, raw); + encodeSortablePointBytes(raw, kind, pointScratch); + pointWriter.addPackedValue(segDocID, pointBytesRef); + } + } + // $CASES-OMITTED$ + default -> + throw new 
IllegalArgumentException( + "LongColumn \"" + column.name() + "\" has incompatible docValuesType: " + dvType); + } + } + + private static void encodeSortablePointBytes( + long raw, LongColumn.NumericKind kind, byte[] scratch) { + switch (kind) { + case INT, FLOAT -> NumericUtils.intToSortableBytes((int) raw, scratch, 0); + case LONG, DOUBLE -> NumericUtils.longToSortableBytes(raw, scratch, 0); + } + } + + private static void processDenseLongColumn( + int baseDocID, + int numDocs, + LongColumn column, + LongValuesCursor cursor, + PerField pf, + DocValuesType dvType) { + ColumnValidation.checkDenseCount(column, cursor.size(), numDocs); + switch (dvType) { + case NUMERIC -> { + NumericDocValuesWriter writer = (NumericDocValuesWriter) pf.docValuesWriter; + writer.addDenseValues(baseDocID, cursor); + } + case SORTED_NUMERIC -> { + SortedNumericDocValuesWriter writer = (SortedNumericDocValuesWriter) pf.docValuesWriter; + writer.addDenseValues(baseDocID, cursor); + } + // $CASES-OMITTED$ + default -> + throw new IllegalArgumentException( + "LongColumn \"" + column.name() + "\" has incompatible docValuesType: " + dvType); + } + } + + private static void processBinaryColumn( + int baseDocID, int numDocs, BinaryColumn column, PerField pf, IndexableFieldType fieldType) + throws IOException { + final DocValuesType dvType = fieldType.docValuesType(); + final boolean hasPoints = fieldType.pointDimensionCount() != 0; + final PointValuesWriter pointWriter = hasPoints ? pf.pointValuesWriter : null; + final BinaryTupleCursor cursor = column.tuples(); + + if (dvType == DocValuesType.NONE) { + // Points only: bytes are passed through unchanged (caller is responsible for producing + // sort-encoded bytes of the correct total length). 
+ int batchDocID; + while ((batchDocID = cursor.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + ColumnValidation.checkDocID(column, batchDocID, numDocs); + pointWriter.addPackedValue(baseDocID + batchDocID, cursor.binaryValue()); + } + return; + } + + switch (dvType) { + case BINARY -> { + BinaryDocValuesWriter writer = (BinaryDocValuesWriter) pf.docValuesWriter; + int batchDocID; + while ((batchDocID = cursor.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + ColumnValidation.checkDocID(column, batchDocID, numDocs); + int segDocID = baseDocID + batchDocID; + BytesRef value = cursor.binaryValue(); + writer.addValue(segDocID, value); + if (hasPoints) { + pointWriter.addPackedValue(segDocID, value); + } + } + } + case SORTED -> { + SortedDocValuesWriter writer = (SortedDocValuesWriter) pf.docValuesWriter; + int batchDocID; + while ((batchDocID = cursor.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + ColumnValidation.checkDocID(column, batchDocID, numDocs); + int segDocID = baseDocID + batchDocID; + BytesRef value = cursor.binaryValue(); + writer.addValue(segDocID, value); + if (hasPoints) { + pointWriter.addPackedValue(segDocID, value); + } + } + } + case SORTED_SET -> { + SortedSetDocValuesWriter writer = (SortedSetDocValuesWriter) pf.docValuesWriter; + int batchDocID; + while ((batchDocID = cursor.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + ColumnValidation.checkDocID(column, batchDocID, numDocs); + int segDocID = baseDocID + batchDocID; + BytesRef value = cursor.binaryValue(); + writer.addValue(segDocID, value); + if (hasPoints) { + pointWriter.addPackedValue(segDocID, value); + } + } + } + // $CASES-OMITTED$ + default -> + throw new IllegalArgumentException( + "BinaryColumn \"" + column.name() + "\" has incompatible docValuesType: " + dvType); + } + } + + @SuppressWarnings("unchecked") + private static void processVectorColumn( + int baseDocID, int numDocs, VectorColumn column, PerField pf, IndexableFieldType fieldType) + throws IOException { + final 
VectorEncoding encoding = fieldType.vectorEncoding(); + final int dimension = fieldType.vectorDimension(); + final VectorTupleCursor cursor = column.tuples(); + int prevBatchDocID = -1; + int consumed = 0; + int batchDocID; + switch (encoding) { + case FLOAT32 -> { + KnnFieldVectorsWriter writer = + (KnnFieldVectorsWriter) pf.knnFieldVectorsWriter; + while ((batchDocID = cursor.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + ColumnValidation.checkDocID(column, batchDocID, numDocs); + ColumnValidation.checkVectorDocIDStrictlyIncreasing(column, batchDocID, prevBatchDocID); + float[] vec = (float[]) cursor.vectorValue(); + ColumnValidation.checkVectorDimension(column, vec.length, dimension, batchDocID); + writer.addValue(baseDocID + batchDocID, vec); + prevBatchDocID = batchDocID; + consumed++; + } + } + case BYTE -> { + KnnFieldVectorsWriter writer = + (KnnFieldVectorsWriter) pf.knnFieldVectorsWriter; + while ((batchDocID = cursor.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + ColumnValidation.checkDocID(column, batchDocID, numDocs); + ColumnValidation.checkVectorDocIDStrictlyIncreasing(column, batchDocID, prevBatchDocID); + byte[] vec = (byte[]) cursor.vectorValue(); + ColumnValidation.checkVectorDimension(column, vec.length, dimension, batchDocID); + writer.addValue(baseDocID + batchDocID, vec); + prevBatchDocID = batchDocID; + consumed++; + } + } + } + if (column.density() == Column.Density.DENSE) { + ColumnValidation.checkDenseCount(column, consumed, numDocs); + } + } + private void initializeFieldInfo(PerField pf) throws IOException { // Create and add a new fieldInfo to fieldInfos for this segment. 
// During the creation of FieldInfo there is also verification of the correctness of all its @@ -764,10 +1205,30 @@ private void initializeFieldInfo(PerField pf) throws IOException { /** Index each field Returns {@code true}, if we are indexing a unique field with postings */ private boolean processField(int docID, IndexableField field, PerField pf) throws IOException { + boolean indexedField = invertAndStore(docID, field, pf); + IndexableFieldType fieldType = field.fieldType(); + DocValuesType dvType = fieldType.docValuesType(); + if (dvType != DocValuesType.NONE) { + indexDocValue(docID, pf, dvType, field); + } + if (fieldType.pointDimensionCount() != 0) { + pf.pointValuesWriter.addPackedValue(docID, field.binaryValue()); + } + if (fieldType.vectorDimension() != 0) { + indexVectorValue(docID, pf, fieldType.vectorEncoding(), field); + } + return indexedField; + } + + /** + * Inverts indexed fields and writes stored fields. Shared by the single-doc row path ({@link + * #processField}) and the column-batch row pass ({@link #processRowColumns}). Returns {@code + * true} if this is a unique indexed field with postings. 
+ */ + private boolean invertAndStore(int docID, IndexableField field, PerField pf) throws IOException { IndexableFieldType fieldType = field.fieldType(); boolean indexedField = false; - // Invert indexed fields if (fieldType.indexOptions() != IndexOptions.NONE) { if (pf.first) { // first time we see this field in this doc pf.invert(docID, field, true); @@ -778,7 +1239,6 @@ private boolean processField(int docID, IndexableField field, PerField pf) throw } } - // Add stored fields if (fieldType.stored()) { StoredValue storedValue = field.storedValue(); if (storedValue == null) { @@ -800,16 +1260,6 @@ private boolean processField(int docID, IndexableField field, PerField pf) throw } } - DocValuesType dvType = fieldType.docValuesType(); - if (dvType != DocValuesType.NONE) { - indexDocValue(docID, pf, dvType, field); - } - if (fieldType.pointDimensionCount() != 0) { - pf.pointValuesWriter.addPackedValue(docID, field.binaryValue()); - } - if (fieldType.vectorDimension() != 0) { - indexVectorValue(docID, pf, fieldType.vectorEncoding(), field); - } return indexedField; } @@ -1264,8 +1714,10 @@ private void invertTokenStream(int docID, IndexableField field, boolean first) throws IOException { final boolean analyzed = field.fieldType().tokenized() && analyzer != null; /* - * To assist people in tracking down problems in analysis components, we wish to write the field name to the infostream - * when we fail. We expect some caller to eventually deal with the real exception, so we don't want any 'catch' clauses, + * To assist people in tracking down problems in analysis components, we wish to write the field name to the + * infostream + * when we fail. We expect some caller to eventually deal with the real exception, so we don't want any 'catch' + * clauses, * but rather a finally that takes note of the problem. 
*/ boolean succeededInProcessingField = false; @@ -1327,7 +1779,8 @@ private void invertTokenStream(int docID, IndexableField field, boolean first) int endOffset = invertState.offset + invertState.offsetAttribute.endOffset(); if (startOffset < invertState.lastStartOffset || endOffset < startOffset) { throw new IllegalArgumentException( - "startOffset must be non-negative, and endOffset must be >= startOffset, and offsets must not go backwards " + "startOffset must be non-negative, and endOffset must be >= startOffset, and offsets must not go " + + "backwards " + "startOffset=" + startOffset + ",endOffset=" @@ -1372,7 +1825,8 @@ private void invertTokenStream(int docID, IndexableField field, boolean first) + fieldInfo.name + "\" (whose UTF8 encoding is longer than the max length " + IndexWriter.MAX_TERM_LENGTH - + "), all of which were skipped. Please correct the analyzer to not produce such terms. The prefix of the first immense term is: '" + + "), all of which were skipped. Please correct the analyzer to not produce such terms. 
The " + + "prefix of the first immense term is: '" + Arrays.toString(prefix) + "...', original message: " + e.getMessage(); diff --git a/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java index 09bef657b2df..a84414a7348a 100644 --- a/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java @@ -21,6 +21,7 @@ import java.io.IOException; import org.apache.lucene.codecs.DocValuesConsumer; import org.apache.lucene.codecs.DocValuesProducer; +import org.apache.lucene.document.column.LongValuesCursor; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.util.BitSet; import org.apache.lucene.util.Counter; @@ -64,6 +65,18 @@ public void addValue(int docID, long value) { lastDocID = docID; } + public void addDenseValues(int firstDocID, LongValuesCursor cursor) { + assert firstDocID > lastDocID; + + int numValues = cursor.size(); + pending.add(cursor); + docsWithField.addRange(firstDocID, firstDocID + numValues); + + updateBytesUsed(); + + lastDocID = firstDocID + numValues - 1; + } + private void updateBytesUsed() { final long newBytesUsed = pending.ramBytesUsed() + docsWithField.ramBytesUsed(); iwBytesUsed.addAndGet(newBytesUsed - bytesUsed); diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java index 364dfb023a68..4b6a3ddf9e9b 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java @@ -22,6 +22,7 @@ import java.util.Arrays; import org.apache.lucene.codecs.DocValuesConsumer; import org.apache.lucene.codecs.DocValuesProducer; +import org.apache.lucene.document.column.LongValuesCursor; import 
org.apache.lucene.index.NumericDocValuesWriter.BufferedNumericDocValues; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.util.ArrayUtil; @@ -68,11 +69,42 @@ public void addValue(int docID, long value) { updateBytesUsed(); } + public void addDenseValues(int firstDocID, LongValuesCursor cursor) { + assert firstDocID > currentDoc; + finishCurrentDoc(); + + int numValues = cursor.size(); + + // Write values directly to pending — each value is one doc, single-valued. + // No currentValues[] buffering, no sorting needed. + pending.add(cursor); + + // If pendingCounts is active (some earlier doc was multi-valued), + // record count=1 for each dense doc. + if (pendingCounts != null) { + for (int i = 0; i < numValues; i++) { + pendingCounts.add(1); + } + } + + // Bulk-add consecutive doc-ids + docsWithField.addRange(firstDocID, firstDocID + numValues); + + // Set currentDoc to last written doc so ordering is maintained. + // currentUpto stays 0 — nothing buffered. + currentDoc = firstDocID + numValues - 1; + + updateBytesUsed(); + } + // finalize currentDoc: this sorts the values in the current doc private void finishCurrentDoc() { if (currentDoc == -1) { return; } + if (currentUpto == 0) { + return; // doc already committed directly (e.g., via addDenseValues) + } if (currentUpto > 1) { Arrays.sort(currentValues, 0, currentUpto); } diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/PackedLongValues.java b/lucene/core/src/java/org/apache/lucene/util/packed/PackedLongValues.java index e996c2d52e1a..2abf8f29bb6d 100644 --- a/lucene/core/src/java/org/apache/lucene/util/packed/PackedLongValues.java +++ b/lucene/core/src/java/org/apache/lucene/util/packed/PackedLongValues.java @@ -18,6 +18,7 @@ import static org.apache.lucene.util.packed.PackedInts.checkBlockSize; +import org.apache.lucene.document.column.LongValuesCursor; import org.apache.lucene.util.Accountable; import org.apache.lucene.util.ArrayUtil; import 
org.apache.lucene.util.LongValues; @@ -234,17 +235,40 @@ public Builder add(long l) { if (pending == null) { throw new IllegalStateException("Cannot be reused after build()"); } + packIfFull(); + pending[pendingOff++] = l; + size += 1; + return this; + } + + /** + * Add all values produced by the given {@link LongValuesCursor} in bulk. The cursor's {@link + * LongValuesCursor#size()} is used as the bounds: exactly that many values are pulled. + */ + public Builder add(LongValuesCursor cursor) { + if (pending == null) { + throw new IllegalStateException("Cannot be reused after build()"); + } + int remaining = cursor.size(); + while (remaining > 0) { + packIfFull(); + int toFill = Math.min(remaining, pending.length - pendingOff); + cursor.fill(pending, pendingOff, toFill); + pendingOff += toFill; + remaining -= toFill; + size += toFill; + } + return this; + } + + private void packIfFull() { if (pendingOff == pending.length) { - // check size if (values.length == valuesOff) { final int newLength = ArrayUtil.oversize(valuesOff + 1, 8); grow(newLength); } pack(); } - pending[pendingOff++] = l; - size += 1; - return this; } final void finish() { diff --git a/lucene/core/src/test/org/apache/lucene/index/TestColumnBatchIndexing.java b/lucene/core/src/test/org/apache/lucene/index/TestColumnBatchIndexing.java new file mode 100644 index 000000000000..66e313528c88 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/index/TestColumnBatchIndexing.java @@ -0,0 +1,2752 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.index; + +import java.io.IOException; +import java.util.List; +import org.apache.lucene.document.BinaryDocValuesField; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.DoublePoint; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.FloatPoint; +import org.apache.lucene.document.IntPoint; +import org.apache.lucene.document.LongPoint; +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.document.SortedDocValuesField; +import org.apache.lucene.document.SortedNumericDocValuesField; +import org.apache.lucene.document.SortedSetDocValuesField; +import org.apache.lucene.document.StoredValue; +import org.apache.lucene.document.StringField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.document.column.BinaryColumn; +import org.apache.lucene.document.column.BinaryTupleCursor; +import org.apache.lucene.document.column.Column; +import org.apache.lucene.document.column.ColumnBatch; +import org.apache.lucene.document.column.LongColumn; +import org.apache.lucene.document.column.LongTupleCursor; +import org.apache.lucene.document.column.LongValuesCursor; +import org.apache.lucene.document.column.VectorColumn; +import org.apache.lucene.document.column.VectorTupleCursor; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.analysis.MockAnalyzer; +import 
org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.NumericUtils; + +/** Tests for column-oriented batch indexing via {@link IndexWriter#addBatch}. */ +public class TestColumnBatchIndexing extends LuceneTestCase { + + public void testNumericDocValues() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + long[] values = {10, 20, 30}; + int[] docIds = {0, 1, 2}; + w.addBatch( + simpleBatch(3, new ArrayLongColumn("numeric", NumericDocValuesField.TYPE, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + NumericDocValues dv = leaf.getNumericDocValues("numeric"); + for (int i = 0; i < values.length; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(values[i], dv.longValue()); + } + assertEquals(DocIdSetIterator.NO_MORE_DOCS, dv.nextDoc()); + + r.close(); + w.close(); + dir.close(); + } + + public void testSortedNumericDocValues() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // Doc 0 has two values, doc 1 has one value + int[] docIds = {0, 0, 1}; + long[] values = {5, 15, 25}; + w.addBatch( + simpleBatch( + 2, + new ArrayLongColumn( + "sortedNumeric", SortedNumericDocValuesField.TYPE, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + SortedNumericDocValues dv = leaf.getSortedNumericDocValues("sortedNumeric"); + + assertEquals(0, dv.nextDoc()); + assertEquals(2, dv.docValueCount()); + assertEquals(5, dv.nextValue()); + assertEquals(15, dv.nextValue()); + + assertEquals(1, dv.nextDoc()); + assertEquals(1, dv.docValueCount()); + assertEquals(25, dv.nextValue()); + + assertEquals(DocIdSetIterator.NO_MORE_DOCS, dv.nextDoc()); + + r.close(); + w.close(); + dir.close(); + } + + public void testBinaryDocValues() throws IOException { + Directory 
dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + BytesRef[] values = {newBytesRef("aaa"), newBytesRef("bbb"), newBytesRef("ccc")}; + int[] docIds = {0, 1, 2}; + w.addBatch( + simpleBatch(3, new ArrayBinaryColumn("binary", BinaryDocValuesField.TYPE, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + BinaryDocValues dv = leaf.getBinaryDocValues("binary"); + for (int i = 0; i < values.length; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(values[i], dv.binaryValue()); + } + assertEquals(DocIdSetIterator.NO_MORE_DOCS, dv.nextDoc()); + + r.close(); + w.close(); + dir.close(); + } + + public void testSortedDocValues() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + BytesRef[] values = {newBytesRef("x"), newBytesRef("y"), newBytesRef("x")}; + int[] docIds = {0, 1, 2}; + w.addBatch( + simpleBatch(3, new ArrayBinaryColumn("sorted", SortedDocValuesField.TYPE, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + SortedDocValues dv = leaf.getSortedDocValues("sorted"); + + assertEquals(0, dv.nextDoc()); + assertEquals(newBytesRef("x"), dv.lookupOrd(dv.ordValue())); + assertEquals(1, dv.nextDoc()); + assertEquals(newBytesRef("y"), dv.lookupOrd(dv.ordValue())); + assertEquals(2, dv.nextDoc()); + assertEquals(newBytesRef("x"), dv.lookupOrd(dv.ordValue())); + + // "x" and "y" should share ord space + assertEquals(2, dv.getValueCount()); + + r.close(); + w.close(); + dir.close(); + } + + public void testSortedSetDocValues() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // Doc 0 has two values, doc 1 has one value + int[] docIds = {0, 0, 1}; + BytesRef[] values = {newBytesRef("a"), newBytesRef("b"), newBytesRef("a")}; + w.addBatch( + simpleBatch( + 2, new 
ArrayBinaryColumn("sortedSet", SortedSetDocValuesField.TYPE, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + SortedSetDocValues dv = leaf.getSortedSetDocValues("sortedSet"); + + assertEquals(0, dv.nextDoc()); + assertEquals(2, dv.docValueCount()); + assertEquals(newBytesRef("a"), dv.lookupOrd(dv.nextOrd())); + assertEquals(newBytesRef("b"), dv.lookupOrd(dv.nextOrd())); + + assertEquals(1, dv.nextDoc()); + assertEquals(1, dv.docValueCount()); + assertEquals(newBytesRef("a"), dv.lookupOrd(dv.nextOrd())); + + assertEquals(DocIdSetIterator.NO_MORE_DOCS, dv.nextDoc()); + + r.close(); + w.close(); + dir.close(); + } + + public void testMultipleColumns() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + int[] allDocs = {0, 1, 2}; + long[] numericValues = {100, 200, 300}; + BytesRef[] sortedValues = {newBytesRef("a"), newBytesRef("b"), newBytesRef("c")}; + + w.addBatch( + simpleBatch( + 3, + new ArrayLongColumn("numeric", NumericDocValuesField.TYPE, allDocs, numericValues), + new ArrayBinaryColumn("sorted", SortedDocValuesField.TYPE, allDocs, sortedValues))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + NumericDocValues ndv = leaf.getNumericDocValues("numeric"); + SortedDocValues sdv = leaf.getSortedDocValues("sorted"); + for (int i = 0; i < 3; i++) { + assertEquals(i, ndv.nextDoc()); + assertEquals(numericValues[i], ndv.longValue()); + assertEquals(i, sdv.nextDoc()); + assertEquals(sortedValues[i], sdv.lookupOrd(sdv.ordValue())); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testSparseDocValues() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // Only doc 1 has a value (docs 0 and 2 are missing) + int[] docIds = {1}; + long[] values = {42}; + w.addBatch( + simpleBatch(3, new 
ArrayLongColumn("sparse", NumericDocValuesField.TYPE, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + NumericDocValues dv = leaf.getNumericDocValues("sparse"); + assertEquals(1, dv.nextDoc()); + assertEquals(42, dv.longValue()); + assertEquals(DocIdSetIterator.NO_MORE_DOCS, dv.nextDoc()); + + r.close(); + w.close(); + dir.close(); + } + + public void testParentFieldIndexed() throws IOException { + Directory dir = newDirectory(); + IndexWriterConfig config = newIndexWriterConfig(); + config.setParentField("_parent"); + IndexWriter w = new IndexWriter(dir, config); + + int[] docIds = {0, 1, 2}; + long[] values = {1, 2, 3}; + w.addBatch( + simpleBatch(3, new ArrayLongColumn("numeric", NumericDocValuesField.TYPE, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + // Every batch doc should have the parent field + NumericDocValues parentDv = leaf.getNumericDocValues("_parent"); + assertNotNull(parentDv); + for (int i = 0; i < 3; i++) { + assertEquals(i, parentDv.nextDoc()); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testPointsColumn() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // Create a points-only FieldType (1 dimension, Integer.BYTES) + FieldType pointType = new FieldType(); + pointType.setDimensions(1, Integer.BYTES); + pointType.freeze(); + + int[] raw = {10, 20, 30}; + int[] docIds = {0, 1, 2}; + long[] values = new long[raw.length]; + for (int i = 0; i < raw.length; i++) { + values[i] = raw[i]; + } + w.addBatch( + simpleBatch( + 3, + new ArrayLongColumn("point", pointType, LongColumn.NumericKind.INT, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + IndexSearcher searcher = new IndexSearcher(r); + assertEquals(1, searcher.count(IntPoint.newExactQuery("point", 10))); + assertEquals(1, 
searcher.count(IntPoint.newExactQuery("point", 20))); + assertEquals(1, searcher.count(IntPoint.newExactQuery("point", 30))); + assertEquals(0, searcher.count(IntPoint.newExactQuery("point", 99))); + assertEquals(3, searcher.count(IntPoint.newRangeQuery("point", 10, 30))); + + r.close(); + w.close(); + dir.close(); + } + + public void testPointsWithDocValues() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // 1D int points + SORTED_NUMERIC DV via the compat layer. + FieldType pointAndDvType = new FieldType(); + pointAndDvType.setDimensions(1, Integer.BYTES); + pointAndDvType.setDocValuesType(DocValuesType.SORTED_NUMERIC); + pointAndDvType.freeze(); + + int[] raw = {10, 20, 30}; + int[] docIds = {0, 1, 2}; + long[] values = new long[raw.length]; + for (int i = 0; i < raw.length; i++) { + values[i] = raw[i]; + } + w.addBatch( + simpleBatch( + 3, + new ArrayLongColumn( + "field", pointAndDvType, LongColumn.NumericKind.INT, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + IndexSearcher searcher = new IndexSearcher(r); + + assertEquals(1, searcher.count(IntPoint.newExactQuery("field", 10))); + assertEquals(3, searcher.count(IntPoint.newRangeQuery("field", 10, 30))); + + LeafReader leaf = getOnlyLeafReader(r); + SortedNumericDocValues dv = leaf.getSortedNumericDocValues("field"); + for (int i = 0; i < 3; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(raw[i], dv.nextValue()); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testSparsePointsColumn() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType pointType = new FieldType(); + pointType.setDimensions(1, Integer.BYTES); + pointType.freeze(); + + // Only doc 1 out of 3 has a point value + int[] docIds = {1}; + long[] values = {42}; + w.addBatch( + simpleBatch( + 3, + new ArrayLongColumn("point", pointType, 
LongColumn.NumericKind.INT, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + IndexSearcher searcher = new IndexSearcher(r); + assertEquals(1, searcher.count(IntPoint.newExactQuery("point", 42))); + assertEquals(0, searcher.count(IntPoint.newExactQuery("point", 0))); + + r.close(); + w.close(); + dir.close(); + } + + public void testStoredLongColumn() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // stored + NUMERIC doc values + FieldType storedNumericType = new FieldType(); + storedNumericType.setStored(true); + storedNumericType.setDocValuesType(DocValuesType.NUMERIC); + storedNumericType.freeze(); + + int[] docIds = {0, 1, 2}; + long[] values = {100, 200, 300}; + w.addBatch(simpleBatch(3, new ArrayLongColumn("val", storedNumericType, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + // Verify stored fields + StoredFields storedFields = leaf.storedFields(); + for (int i = 0; i < 3; i++) { + Document doc = storedFields.document(i); + assertEquals(values[i], doc.getField("val").numericValue().longValue()); + } + + // Verify doc values + NumericDocValues dv = leaf.getNumericDocValues("val"); + for (int i = 0; i < 3; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(values[i], dv.longValue()); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testStoredBinaryColumn() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // stored + SORTED doc values + FieldType storedSortedType = new FieldType(); + storedSortedType.setStored(true); + storedSortedType.setDocValuesType(DocValuesType.SORTED); + storedSortedType.freeze(); + + int[] docIds = {0, 1, 2}; + BytesRef[] values = {newBytesRef("aaa"), newBytesRef("bbb"), newBytesRef("ccc")}; + w.addBatch(simpleBatch(3, new ArrayBinaryColumn("val", storedSortedType, docIds, 
values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + // Verify stored fields + StoredFields storedFields = leaf.storedFields(); + for (int i = 0; i < 3; i++) { + Document doc = storedFields.document(i); + assertEquals(values[i], doc.getField("val").binaryValue()); + } + + // Verify doc values + SortedDocValues dv = leaf.getSortedDocValues("val"); + for (int i = 0; i < 3; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(values[i], dv.lookupOrd(dv.ordValue())); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testStoredOnlyColumn() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // stored only — no doc values, no points + FieldType storedOnlyType = new FieldType(); + storedOnlyType.setStored(true); + storedOnlyType.freeze(); + + int[] docIds = {0, 1, 2}; + long[] values = {10, 20, 30}; + w.addBatch(simpleBatch(3, new ArrayLongColumn("stored", storedOnlyType, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + StoredFields storedFields = leaf.storedFields(); + for (int i = 0; i < 3; i++) { + Document doc = storedFields.document(i); + assertEquals(values[i], doc.getField("stored").numericValue().longValue()); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testMixedStoredAndNonStoredColumns() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType storedNumericType = new FieldType(); + storedNumericType.setStored(true); + storedNumericType.setDocValuesType(DocValuesType.NUMERIC); + storedNumericType.freeze(); + + int[] allDocs = {0, 1, 2}; + long[] storedValues = {100, 200, 300}; + long[] dvOnlyValues = {1, 2, 3}; + w.addBatch( + simpleBatch( + 3, + new ArrayLongColumn("stored_field", storedNumericType, allDocs, storedValues), + new ArrayLongColumn("dv_only", 
NumericDocValuesField.TYPE, allDocs, dvOnlyValues))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + // Verify stored field + StoredFields storedFields = leaf.storedFields(); + for (int i = 0; i < 3; i++) { + Document doc = storedFields.document(i); + assertEquals(storedValues[i], doc.getField("stored_field").numericValue().longValue()); + assertNull(doc.getField("dv_only")); // non-stored column should not appear + } + + // Verify both doc values columns + NumericDocValues storedDv = leaf.getNumericDocValues("stored_field"); + NumericDocValues dvOnly = leaf.getNumericDocValues("dv_only"); + for (int i = 0; i < 3; i++) { + assertEquals(i, storedDv.nextDoc()); + assertEquals(storedValues[i], storedDv.longValue()); + assertEquals(i, dvOnly.nextDoc()); + assertEquals(dvOnlyValues[i], dvOnly.longValue()); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testStoredPointsColumn() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // stored + points + FieldType storedPointType = new FieldType(); + storedPointType.setStored(true); + storedPointType.setDimensions(1, Integer.BYTES); + storedPointType.freeze(); + + int[] raw = {10, 20, 30}; + int[] docIds = {0, 1, 2}; + long[] values = new long[raw.length]; + for (int i = 0; i < raw.length; i++) { + values[i] = raw[i]; + } + w.addBatch( + simpleBatch( + 3, + new ArrayLongColumn( + "pt", storedPointType, LongColumn.NumericKind.INT, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + // Verify stored fields — decoded as ints. 
+ StoredFields storedFields = leaf.storedFields(); + for (int i = 0; i < raw.length; i++) { + Document doc = storedFields.document(i); + assertEquals(raw[i], doc.getField("pt").numericValue().intValue()); + } + + // Verify points + IndexSearcher searcher = new IndexSearcher(r); + assertEquals(1, searcher.count(IntPoint.newExactQuery("pt", 10))); + assertEquals(3, searcher.count(IntPoint.newRangeQuery("pt", 10, 30))); + + r.close(); + w.close(); + dir.close(); + } + + public void testInvertedColumn() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // StringField-like: DOCS, omitNorms, non-tokenized + FieldType stringType = new FieldType(); + stringType.setIndexOptions(IndexOptions.DOCS); + stringType.setOmitNorms(true); + stringType.setTokenized(false); + stringType.freeze(); + + int[] docIds = {0, 1, 2}; + BytesRef[] values = {newBytesRef("alpha"), newBytesRef("beta"), newBytesRef("alpha")}; + w.addBatch(simpleBatch(3, new ArrayBinaryColumn("tag", stringType, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + IndexSearcher searcher = new IndexSearcher(r); + assertEquals(2, searcher.count(new TermQuery(new Term("tag", "alpha")))); + assertEquals(1, searcher.count(new TermQuery(new Term("tag", "beta")))); + assertEquals(0, searcher.count(new TermQuery(new Term("tag", "gamma")))); + + r.close(); + w.close(); + dir.close(); + } + + public void testInvertedWithDocValues() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // Inverted + SORTED doc values (like a StringField with doc values) + FieldType invertedDvType = new FieldType(); + invertedDvType.setIndexOptions(IndexOptions.DOCS); + invertedDvType.setOmitNorms(true); + invertedDvType.setTokenized(false); + invertedDvType.setDocValuesType(DocValuesType.SORTED); + invertedDvType.freeze(); + + int[] docIds = {0, 1, 2}; + BytesRef[] values = 
{newBytesRef("x"), newBytesRef("y"), newBytesRef("x")}; + w.addBatch(simpleBatch(3, new ArrayBinaryColumn("field", invertedDvType, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + IndexSearcher searcher = new IndexSearcher(r); + + // Verify inverted index + assertEquals(2, searcher.count(new TermQuery(new Term("field", "x")))); + assertEquals(1, searcher.count(new TermQuery(new Term("field", "y")))); + + // Verify doc values + LeafReader leaf = getOnlyLeafReader(r); + SortedDocValues dv = leaf.getSortedDocValues("field"); + for (int i = 0; i < 3; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(values[i], dv.lookupOrd(dv.ordValue())); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testInvertedWithStored() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // Inverted + stored (like StringField with Store.YES) + FieldType invertedStoredType = new FieldType(StringField.TYPE_STORED); + invertedStoredType.freeze(); + + int[] docIds = {0, 1, 2}; + BytesRef[] values = {newBytesRef("aaa"), newBytesRef("bbb"), newBytesRef("ccc")}; + w.addBatch(simpleBatch(3, new ArrayBinaryColumn("field", invertedStoredType, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + IndexSearcher searcher = new IndexSearcher(r); + + // Verify inverted index + assertEquals(1, searcher.count(new TermQuery(new Term("field", "aaa")))); + assertEquals(1, searcher.count(new TermQuery(new Term("field", "bbb")))); + + // Verify stored fields + LeafReader leaf = getOnlyLeafReader(r); + StoredFields storedFields = leaf.storedFields(); + for (int i = 0; i < 3; i++) { + Document doc = storedFields.document(i); + assertEquals(values[i], doc.getField("field").binaryValue()); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testInvertedWithStoredAndDocValues() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, 
newIndexWriterConfig()); + + // Inverted + stored + SORTED doc values + FieldType allType = new FieldType(); + allType.setIndexOptions(IndexOptions.DOCS); + allType.setOmitNorms(true); + allType.setTokenized(false); + allType.setStored(true); + allType.setDocValuesType(DocValuesType.SORTED); + allType.freeze(); + + int[] docIds = {0, 1, 2}; + BytesRef[] values = {newBytesRef("x"), newBytesRef("y"), newBytesRef("z")}; + w.addBatch(simpleBatch(3, new ArrayBinaryColumn("field", allType, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + IndexSearcher searcher = new IndexSearcher(r); + + // Verify inverted index + assertEquals(1, searcher.count(new TermQuery(new Term("field", "x")))); + + // Verify stored fields + StoredFields storedFields = leaf.storedFields(); + for (int i = 0; i < 3; i++) { + assertEquals(values[i], storedFields.document(i).getField("field").binaryValue()); + } + + // Verify doc values + SortedDocValues dv = leaf.getSortedDocValues("field"); + for (int i = 0; i < 3; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(values[i], dv.lookupOrd(dv.ordValue())); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testInvertedSparse() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType stringType = new FieldType(); + stringType.setIndexOptions(IndexOptions.DOCS); + stringType.setOmitNorms(true); + stringType.setTokenized(false); + stringType.freeze(); + + // Only doc 1 out of 3 has a term + int[] docIds = {1}; + BytesRef[] values = {newBytesRef("found")}; + w.addBatch(simpleBatch(3, new ArrayBinaryColumn("tag", stringType, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + IndexSearcher searcher = new IndexSearcher(r); + assertEquals(1, searcher.count(new TermQuery(new Term("tag", "found")))); + + r.close(); + w.close(); + dir.close(); + } + + public void testTokenizedColumn() 
throws IOException { + Directory dir = newDirectory(); + IndexWriterConfig config = newIndexWriterConfig(new MockAnalyzer(random())); + IndexWriter w = new IndexWriter(dir, config); + + // TextField-like: tokenized, DOCS_AND_FREQS_AND_POSITIONS + int[] docIds = {0, 1, 2}; + BytesRef[] values = { + newBytesRef("quick brown fox"), newBytesRef("lazy brown dog"), newBytesRef("quick fox jumps") + }; + w.addBatch( + simpleBatch(3, new ArrayBinaryColumn("text", TextField.TYPE_NOT_STORED, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + IndexSearcher searcher = new IndexSearcher(r); + + // Each word was tokenized — verify individual terms + assertEquals(2, searcher.count(new TermQuery(new Term("text", "quick")))); + assertEquals(2, searcher.count(new TermQuery(new Term("text", "brown")))); + assertEquals(2, searcher.count(new TermQuery(new Term("text", "fox")))); + assertEquals(1, searcher.count(new TermQuery(new Term("text", "lazy")))); + assertEquals(1, searcher.count(new TermQuery(new Term("text", "dog")))); + assertEquals(1, searcher.count(new TermQuery(new Term("text", "jumps")))); + assertEquals(0, searcher.count(new TermQuery(new Term("text", "missing")))); + + r.close(); + w.close(); + dir.close(); + } + + public void testTokenizedWithStored() throws IOException { + Directory dir = newDirectory(); + IndexWriterConfig config = newIndexWriterConfig(new MockAnalyzer(random())); + IndexWriter w = new IndexWriter(dir, config); + + int[] docIds = {0, 1}; + BytesRef[] values = {newBytesRef("hello world"), newBytesRef("goodbye world")}; + w.addBatch( + simpleBatch(2, new ArrayBinaryColumn("text", TextField.TYPE_STORED, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + IndexSearcher searcher = new IndexSearcher(r); + + // Verify tokenized search + assertEquals(2, searcher.count(new TermQuery(new Term("text", "world")))); + assertEquals(1, searcher.count(new TermQuery(new Term("text", 
"hello")))); + + // Verify stored fields + StoredFields storedFields = leaf.storedFields(); + assertEquals(values[0], storedFields.document(0).getField("text").binaryValue()); + assertEquals(values[1], storedFields.document(1).getField("text").binaryValue()); + + r.close(); + w.close(); + dir.close(); + } + + public void testColumnWithNoneDocValuesTypeAndNoPointsThrows() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // FieldType with NONE doc values type and no points + FieldType badType = new FieldType(); + badType.freeze(); + + int[] docIds = {0}; + long[] values = {1}; + expectThrows( + IllegalArgumentException.class, + () -> w.addBatch(simpleBatch(1, new ArrayLongColumn("bad", badType, docIds, values)))); + + // Writer should still be usable after the failure + w.addBatch( + simpleBatch( + 1, + new ArrayLongColumn( + "numeric", NumericDocValuesField.TYPE, new int[] {0}, new long[] {42}))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + NumericDocValues dv = leaf.getNumericDocValues("numeric"); + assertNotNull(dv); + // The failed batch's doc was marked deleted; the successful batch's doc is still live + int doc = dv.nextDoc(); + assertTrue(doc != DocIdSetIterator.NO_MORE_DOCS); + assertEquals(42, dv.longValue()); + + r.close(); + w.close(); + dir.close(); + } + + public void testStoredWithDocValuesAndPoints() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // stored + SORTED_NUMERIC DV + 4-byte points + FieldType allType = new FieldType(); + allType.setStored(true); + allType.setDocValuesType(DocValuesType.SORTED_NUMERIC); + allType.setDimensions(1, Integer.BYTES); + allType.freeze(); + + int[] raw = {10, 20, 30}; + int[] docIds = {0, 1, 2}; + long[] values = new long[raw.length]; + for (int i = 0; i < raw.length; i++) { + values[i] = raw[i]; + } + w.addBatch( + 
simpleBatch( + 3, new ArrayLongColumn("field", allType, LongColumn.NumericKind.INT, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + // Verify stored fields — decoded as ints. + StoredFields storedFields = leaf.storedFields(); + for (int i = 0; i < raw.length; i++) { + assertEquals(raw[i], storedFields.document(i).getField("field").numericValue().intValue()); + } + + // Verify doc values (raw int widened to long). + SortedNumericDocValues dv = leaf.getSortedNumericDocValues("field"); + for (int i = 0; i < raw.length; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(raw[i], dv.nextValue()); + } + + // Verify points + assertEquals(3, leaf.getPointValues("field").size()); + + r.close(); + w.close(); + dir.close(); + } + + public void testMultiValuedStoredWithDocValues() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // stored + SORTED_NUMERIC doc values (multi-valued) + FieldType storedSortedNumericType = new FieldType(); + storedSortedNumericType.setStored(true); + storedSortedNumericType.setDocValuesType(DocValuesType.SORTED_NUMERIC); + storedSortedNumericType.freeze(); + + // Doc 0 has two values (10, 20), doc 1 has one value (30) + int[] docIds = {0, 0, 1}; + long[] values = {10, 20, 30}; + w.addBatch(simpleBatch(2, new ArrayLongColumn("val", storedSortedNumericType, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + // Verify stored fields — each value occurrence is stored separately + StoredFields storedFields = leaf.storedFields(); + Document doc0 = storedFields.document(0); + assertEquals(2, doc0.getFields("val").length); + assertEquals(10L, doc0.getFields("val")[0].numericValue().longValue()); + assertEquals(20L, doc0.getFields("val")[1].numericValue().longValue()); + Document doc1 = storedFields.document(1); + assertEquals(1, 
doc1.getFields("val").length); + assertEquals(30L, doc1.getFields("val")[0].numericValue().longValue()); + + // Verify doc values + SortedNumericDocValues dv = leaf.getSortedNumericDocValues("val"); + assertEquals(0, dv.nextDoc()); + assertEquals(2, dv.docValueCount()); + assertEquals(10, dv.nextValue()); + assertEquals(20, dv.nextValue()); + assertEquals(1, dv.nextDoc()); + assertEquals(1, dv.docValueCount()); + assertEquals(30, dv.nextValue()); + + r.close(); + w.close(); + dir.close(); + } + + public void testDenseNumericDocValues() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + long[] values = {100, 200, 300}; + w.addBatch(simpleBatch(3, new ArrayDenseLongColumn("val", NumericDocValuesField.TYPE, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + NumericDocValues dv = leaf.getNumericDocValues("val"); + for (int i = 0; i < 3; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(values[i], dv.longValue()); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testDenseSortedNumericDocValues() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + long[] values = {10, 20, 30, 40, 50}; + w.addBatch( + simpleBatch(5, new ArrayDenseLongColumn("val", SortedNumericDocValuesField.TYPE, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + SortedNumericDocValues dv = leaf.getSortedNumericDocValues("val"); + for (int i = 0; i < 5; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(1, dv.docValueCount()); + assertEquals(values[i], dv.nextValue()); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testDenseColumnCountMismatchThrows() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // 2 values but batch expects 3 
documents + long[] values = {10, 20}; + expectThrows( + IllegalArgumentException.class, + () -> + w.addBatch( + simpleBatch( + 3, new ArrayDenseLongColumn("val", NumericDocValuesField.TYPE, values)))); + + // Writer should still be usable after the failure — use a different field to avoid + // the partially-written DV entries from the failed batch + w.addBatch( + simpleBatch( + 1, new ArrayDenseLongColumn("val2", NumericDocValuesField.TYPE, new long[] {42}))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + NumericDocValues dv = leaf.getNumericDocValues("val2"); + assertNotNull(dv); + int doc = dv.nextDoc(); + assertTrue(doc != DocIdSetIterator.NO_MORE_DOCS); + assertEquals(42, dv.longValue()); + + r.close(); + w.close(); + dir.close(); + } + + public void testDenseColumnTooManyValuesThrows() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // 5 values but batch only has 3 documents + long[] values = {10, 20, 30, 40, 50}; + expectThrows( + IllegalArgumentException.class, + () -> + w.addBatch( + simpleBatch( + 3, new ArrayDenseLongColumn("val", NumericDocValuesField.TYPE, values)))); + + // Writer should still be usable — no values were written past numDocs + w.addBatch( + simpleBatch( + 1, new ArrayDenseLongColumn("val2", NumericDocValuesField.TYPE, new long[] {42}))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + NumericDocValues dv = leaf.getNumericDocValues("val2"); + assertNotNull(dv); + int doc = dv.nextDoc(); + assertTrue(doc != DocIdSetIterator.NO_MORE_DOCS); + assertEquals(42, dv.longValue()); + + r.close(); + w.close(); + dir.close(); + } + + public void testIntSparseNumericDocValues() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // Sparse: only docs 0 and 2 have values. 
+ int[] docIds = {0, 2}; + int[] raw = {-7, 9}; + long[] values = new long[raw.length]; + for (int i = 0; i < raw.length; i++) { + values[i] = raw[i]; + } + w.addBatch( + simpleBatch( + 3, + new ArrayLongColumn( + "val", NumericDocValuesField.TYPE, LongColumn.NumericKind.INT, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + NumericDocValues dv = leaf.getNumericDocValues("val"); + assertEquals(0, dv.nextDoc()); + assertEquals(-7, dv.longValue()); + assertEquals(2, dv.nextDoc()); + assertEquals(9, dv.longValue()); + assertEquals(DocIdSetIterator.NO_MORE_DOCS, dv.nextDoc()); + + r.close(); + w.close(); + dir.close(); + } + + public void testLongColumnPointWidthMismatchThrows() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType pointType = new FieldType(); + pointType.setDimensions(1, Integer.BYTES); // expects 4 bytes + pointType.freeze(); + + // LONG kind implies 8-byte point bytes; should fail validation against a 4-byte point type. + expectThrows( + IllegalArgumentException.class, + () -> + w.addBatch( + simpleBatch( + 1, + new ArrayLongColumn( + "pt", + pointType, + LongColumn.NumericKind.LONG, + new int[] {0}, + new long[] {1})))); + + w.close(); + dir.close(); + } + + public void testBinaryColumnNumericDVBadFixedSizeThrows() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + // Variable-size binary into NUMERIC DV should fail validation (fixedSize=-1). 
+ expectThrows( + IllegalArgumentException.class, + () -> + w.addBatch( + simpleBatch( + 1, + new ArrayBinaryColumn( + "val", + NumericDocValuesField.TYPE, + new int[] {0}, + new BytesRef[] {newBytesRef("x")})))); + + w.close(); + dir.close(); + } + + public void testLongColumnMultiDimPointsThrows() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType pointType = new FieldType(); + pointType.setDimensions(2, Long.BYTES); + pointType.freeze(); + + expectThrows( + IllegalArgumentException.class, + () -> + w.addBatch( + simpleBatch( + 1, new ArrayLongColumn("pt", pointType, new int[] {0}, new long[] {1})))); + + w.close(); + dir.close(); + } + + public void testDenseLongColumnWithStoredFields() throws IOException { + // Covers the "single column consumed by both passes via fresh cursors" case: a dense + // LongColumn with stored+numeric DV. Row pass uses tuples(), column pass uses values(). + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType storedNumericType = new FieldType(); + storedNumericType.setStored(true); + storedNumericType.setDocValuesType(DocValuesType.NUMERIC); + storedNumericType.freeze(); + + long[] values = {100, 200, 300, 400}; + w.addBatch(simpleBatch(4, new ArrayDenseLongColumn("val", storedNumericType, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + StoredFields storedFields = leaf.storedFields(); + for (int i = 0; i < values.length; i++) { + assertEquals(values[i], storedFields.document(i).getField("val").numericValue().longValue()); + } + + NumericDocValues dv = leaf.getNumericDocValues("val"); + for (int i = 0; i < values.length; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(values[i], dv.longValue()); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testStoredTypeIntegerFromLongColumn() throws IOException { + Directory dir 
= newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType type = new FieldType(); + type.setStored(true); + type.freeze(); + + int[] raw = {1, -2, 3}; + long[] values = new long[raw.length]; + for (int i = 0; i < raw.length; i++) { + values[i] = raw[i]; + } + w.addBatch( + simpleBatch( + 3, + new ArrayLongColumn( + "val", type, LongColumn.NumericKind.INT, new int[] {0, 1, 2}, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + StoredFields storedFields = leaf.storedFields(); + for (int i = 0; i < raw.length; i++) { + assertEquals(raw[i], storedFields.document(i).getField("val").numericValue().intValue()); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testStoredTypeLongFromLongColumn() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType type = new FieldType(); + type.setStored(true); + type.freeze(); + + long[] raw = {Long.MIN_VALUE, 0L, Long.MAX_VALUE}; + w.addBatch(simpleBatch(3, new ArrayLongColumn("val", type, new int[] {0, 1, 2}, raw.clone()))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + StoredFields storedFields = leaf.storedFields(); + for (int i = 0; i < raw.length; i++) { + assertEquals(raw[i], storedFields.document(i).getField("val").numericValue().longValue()); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testStoredTypeFloatFromLongColumn() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType type = new FieldType(); + type.setStored(true); + type.freeze(); + + float[] raw = {1.5f, -2.25f, Float.MAX_VALUE}; + long[] values = new long[raw.length]; + for (int i = 0; i < raw.length; i++) { + values[i] = NumericUtils.floatToSortableInt(raw[i]); + } + w.addBatch( + simpleBatch( + 3, + new ArrayLongColumn( + "val", type, 
LongColumn.NumericKind.FLOAT, new int[] {0, 1, 2}, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + StoredFields storedFields = leaf.storedFields(); + for (int i = 0; i < raw.length; i++) { + assertEquals( + raw[i], storedFields.document(i).getField("val").numericValue().floatValue(), 0f); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testStoredTypeDoubleFromLongColumn() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType type = new FieldType(); + type.setStored(true); + type.freeze(); + + double[] raw = {1.5d, -2.25d, Double.MAX_VALUE}; + long[] values = new long[raw.length]; + for (int i = 0; i < raw.length; i++) { + values[i] = NumericUtils.doubleToSortableLong(raw[i]); + } + w.addBatch( + simpleBatch( + 3, + new ArrayLongColumn( + "val", type, LongColumn.NumericKind.DOUBLE, new int[] {0, 1, 2}, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + StoredFields storedFields = leaf.storedFields(); + for (int i = 0; i < raw.length; i++) { + assertEquals( + raw[i], storedFields.document(i).getField("val").numericValue().doubleValue(), 0d); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testStoredTypeStringFromBinaryColumn() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType type = new FieldType(); + type.setStored(true); + type.freeze(); + + String[] raw = {"hello", "wörld", "🦜"}; + BytesRef[] values = new BytesRef[raw.length]; + for (int i = 0; i < raw.length; i++) { + values[i] = newBytesRef(raw[i]); + } + w.addBatch( + simpleBatch( + 3, + new ArrayBinaryColumn( + "val", type, new int[] {0, 1, 2}, values, StoredValue.Type.STRING))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + StoredFields storedFields = 
leaf.storedFields(); + for (int i = 0; i < raw.length; i++) { + assertEquals(raw[i], storedFields.document(i).getField("val").stringValue()); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testStoredTypeFloatWithNumericDV() throws IOException { + // FLOAT kind on a LongColumn that also feeds NumericDV. + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType type = new FieldType(); + type.setStored(true); + type.setDocValuesType(DocValuesType.NUMERIC); + type.freeze(); + + float[] raw = {1.5f, -2.25f, 42.0f}; + long[] values = new long[raw.length]; + for (int i = 0; i < raw.length; i++) { + values[i] = NumericUtils.floatToSortableInt(raw[i]); + } + w.addBatch( + simpleBatch( + 3, + new ArrayLongColumn( + "val", type, LongColumn.NumericKind.FLOAT, new int[] {0, 1, 2}, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + // Stored values decoded as floats. + StoredFields storedFields = leaf.storedFields(); + for (int i = 0; i < raw.length; i++) { + assertEquals( + raw[i], storedFields.document(i).getField("val").numericValue().floatValue(), 0f); + } + + // NumericDV stores the sortable-int encoding sign-extended to long. + NumericDocValues dv = leaf.getNumericDocValues("val"); + for (int i = 0; i < raw.length; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(values[i], dv.longValue()); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testStoredTypeIntegerWithNumericDV() throws IOException { + // INT kind on a LongColumn that also feeds NumericDV. 
+ Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType type = new FieldType(); + type.setStored(true); + type.setDocValuesType(DocValuesType.NUMERIC); + type.freeze(); + + int[] raw = {Integer.MIN_VALUE, -1, 0, 42, Integer.MAX_VALUE}; + long[] values = new long[raw.length]; + int[] docIds = new int[raw.length]; + for (int i = 0; i < raw.length; i++) { + docIds[i] = i; + values[i] = raw[i]; + } + w.addBatch( + simpleBatch( + raw.length, + new ArrayLongColumn("val", type, LongColumn.NumericKind.INT, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + StoredFields storedFields = leaf.storedFields(); + for (int i = 0; i < raw.length; i++) { + assertEquals(raw[i], storedFields.document(i).getField("val").numericValue().intValue()); + } + + NumericDocValues dv = leaf.getNumericDocValues("val"); + for (int i = 0; i < raw.length; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(raw[i], dv.longValue()); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testStoredTypeDoubleWithNumericDV() throws IOException { + // DOUBLE kind on a LongColumn that also feeds NumericDV. 
+ Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType type = new FieldType(); + type.setStored(true); + type.setDocValuesType(DocValuesType.NUMERIC); + type.freeze(); + + double[] raw = {Double.NEGATIVE_INFINITY, -1.5d, 0.0d, 2.25d, Double.POSITIVE_INFINITY}; + long[] values = new long[raw.length]; + int[] docIds = new int[raw.length]; + for (int i = 0; i < raw.length; i++) { + docIds[i] = i; + values[i] = NumericUtils.doubleToSortableLong(raw[i]); + } + w.addBatch( + simpleBatch( + raw.length, + new ArrayLongColumn("val", type, LongColumn.NumericKind.DOUBLE, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + StoredFields storedFields = leaf.storedFields(); + for (int i = 0; i < raw.length; i++) { + assertEquals( + raw[i], storedFields.document(i).getField("val").numericValue().doubleValue(), 0d); + } + + NumericDocValues dv = leaf.getNumericDocValues("val"); + for (int i = 0; i < raw.length; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(values[i], dv.longValue()); + } + + r.close(); + w.close(); + dir.close(); + } + + public void testStoredTypeDataInputRejected() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType type = new FieldType(); + type.setStored(true); + type.freeze(); + + expectThrows( + IllegalArgumentException.class, + () -> + w.addBatch( + simpleBatch( + 1, + new ArrayBinaryColumn( + "val", + type, + new int[] {0}, + new BytesRef[] {newBytesRef("x")}, + StoredValue.Type.DATA_INPUT)))); + + w.close(); + dir.close(); + } + + public void testBinaryColumnMultiDimPointsOnly() throws IOException { + // Plain BinaryColumn with 2-D int points (fixedSize = 2 * 4 = 8). Caller pre-packs bytes via + // IntPoint.pack; the chain writes them to points unchanged. 
+ Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType pointType = new FieldType(); + pointType.setDimensions(2, Integer.BYTES); + pointType.freeze(); + + int[] docIds = {0, 1, 2}; + BytesRef[] values = {IntPoint.pack(1, 10), IntPoint.pack(2, 20), IntPoint.pack(3, 30)}; + w.addBatch(simpleBatch(3, new ArrayBinaryColumn("pt", pointType, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + IndexSearcher searcher = new IndexSearcher(r); + assertEquals( + 1, searcher.count(IntPoint.newRangeQuery("pt", new int[] {1, 10}, new int[] {1, 10}))); + assertEquals( + 3, searcher.count(IntPoint.newRangeQuery("pt", new int[] {0, 0}, new int[] {10, 100}))); + + r.close(); + w.close(); + dir.close(); + } + + public void testBinaryColumnPointsOnlyArbitraryWidth() throws IOException { + // 3-D int points (12 bytes) via plain BinaryColumn — arbitrary widths are fine for the + // opaque-bytes path since no numeric transform is applied. 
+ Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType type = new FieldType(); + type.setDimensions(3, Integer.BYTES); + type.freeze(); + + int[][] raw = {{1, 2, 3}, {4, 5, 6}, {10, 20, 30}}; + BytesRef[] values = new BytesRef[raw.length]; + int[] docIds = new int[raw.length]; + for (int i = 0; i < raw.length; i++) { + docIds[i] = i; + values[i] = IntPoint.pack(raw[i]); + } + w.addBatch(simpleBatch(raw.length, new ArrayBinaryColumn("pt", type, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + IndexSearcher searcher = new IndexSearcher(r); + assertEquals( + 1, searcher.count(IntPoint.newRangeQuery("pt", new int[] {1, 2, 3}, new int[] {1, 2, 3}))); + assertEquals( + 3, + searcher.count( + IntPoint.newRangeQuery("pt", new int[] {0, 0, 0}, new int[] {100, 100, 100}))); + + r.close(); + w.close(); + dir.close(); + } + + public void testBinaryColumnSortedDVAndPoints() throws IOException { + // Plain BinaryColumn with SORTED DV + 1-D int point. Same BytesRef goes to both writers. 
+ Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType type = new FieldType(); + type.setDimensions(1, Integer.BYTES); + type.setDocValuesType(DocValuesType.SORTED); + type.freeze(); + + int[] docIds = {0, 1, 2}; + BytesRef[] values = {IntPoint.pack(10), IntPoint.pack(20), IntPoint.pack(30)}; + w.addBatch(simpleBatch(3, new ArrayBinaryColumn("field", type, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + SortedDocValues dv = leaf.getSortedDocValues("field"); + for (int i = 0; i < 3; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(values[i], dv.lookupOrd(dv.ordValue())); + } + + IndexSearcher searcher = new IndexSearcher(r); + assertEquals(1, searcher.count(IntPoint.newExactQuery("field", 10))); + assertEquals(3, searcher.count(IntPoint.newRangeQuery("field", 10, 30))); + + r.close(); + w.close(); + dir.close(); + } + + public void testNumericKindIntPointsAndDV() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType type = new FieldType(); + type.setDimensions(1, Integer.BYTES); + type.setDocValuesType(DocValuesType.SORTED_NUMERIC); + type.freeze(); + + int[] raw = {-5, -1, 0, 7, Integer.MAX_VALUE}; + long[] values = new long[raw.length]; + int[] docIds = new int[raw.length]; + for (int i = 0; i < raw.length; i++) { + docIds[i] = i; + values[i] = raw[i]; + } + w.addBatch( + simpleBatch( + raw.length, + new ArrayLongColumn("val", type, LongColumn.NumericKind.INT, docIds, values))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + SortedNumericDocValues dv = leaf.getSortedNumericDocValues("val"); + for (int i = 0; i < raw.length; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(1, dv.docValueCount()); + assertEquals(raw[i], dv.nextValue()); + } + + IndexSearcher searcher = new IndexSearcher(r); + assertEquals( + 
raw.length, + searcher.count(IntPoint.newRangeQuery("val", Integer.MIN_VALUE, Integer.MAX_VALUE))); + assertEquals(1, searcher.count(IntPoint.newExactQuery("val", -5))); + assertEquals(3, searcher.count(IntPoint.newRangeQuery("val", -1, 7))); + + r.close(); + w.close(); + dir.close(); + } + + public void testNumericKindLongPointsAndDV() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType type = new FieldType(); + type.setDimensions(1, Long.BYTES); + type.setDocValuesType(DocValuesType.SORTED_NUMERIC); + type.freeze(); + + long[] raw = {Long.MIN_VALUE, -100L, 0L, 42L, Long.MAX_VALUE}; + int[] docIds = new int[raw.length]; + for (int i = 0; i < raw.length; i++) { + docIds[i] = i; + } + w.addBatch(simpleBatch(raw.length, new ArrayLongColumn("val", type, docIds, raw.clone()))); + + DirectoryReader r = DirectoryReader.open(w); + LeafReader leaf = getOnlyLeafReader(r); + + SortedNumericDocValues dv = leaf.getSortedNumericDocValues("val"); + for (int i = 0; i < raw.length; i++) { + assertEquals(i, dv.nextDoc()); + assertEquals(raw[i], dv.nextValue()); + } + + IndexSearcher searcher = new IndexSearcher(r); + assertEquals( + raw.length, searcher.count(LongPoint.newRangeQuery("val", Long.MIN_VALUE, Long.MAX_VALUE))); + assertEquals(1, searcher.count(LongPoint.newExactQuery("val", Long.MIN_VALUE))); + assertEquals(3, searcher.count(LongPoint.newRangeQuery("val", -100L, 42L))); + + r.close(); + w.close(); + dir.close(); + } + + public void testNumericKindFloatPointsAndDV() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType type = new FieldType(); + type.setDimensions(1, Float.BYTES); + type.setDocValuesType(DocValuesType.SORTED_NUMERIC); + type.freeze(); + + float[] raw = {Float.NEGATIVE_INFINITY, -1.5f, 0.0f, 2.25f, Float.POSITIVE_INFINITY}; + long[] values = new long[raw.length]; + int[] docIds = new 
int[raw.length];
    for (int i = 0; i < raw.length; i++) {
      docIds[i] = i;
      values[i] = NumericUtils.floatToSortableInt(raw[i]);
    }
    w.addBatch(
        simpleBatch(
            raw.length,
            new ArrayLongColumn("val", type, LongColumn.NumericKind.FLOAT, docIds, values)));

    DirectoryReader r = DirectoryReader.open(w);
    LeafReader leaf = getOnlyLeafReader(r);

    // DV stores the sortable-int encoding; decode via sortableIntToFloat.
    SortedNumericDocValues dv = leaf.getSortedNumericDocValues("val");
    for (int i = 0; i < raw.length; i++) {
      assertEquals(i, dv.nextDoc());
      assertEquals(raw[i], NumericUtils.sortableIntToFloat((int) dv.nextValue()), 0f);
    }

    // Points sort numerically.
    IndexSearcher searcher = new IndexSearcher(r);
    assertEquals(
        raw.length,
        searcher.count(
            FloatPoint.newRangeQuery("val", Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY)));
    assertEquals(1, searcher.count(FloatPoint.newExactQuery("val", -1.5f)));
    assertEquals(3, searcher.count(FloatPoint.newRangeQuery("val", -1.5f, 2.25f)));

    r.close();
    w.close();
    dir.close();
  }

  /**
   * DOUBLE kind with 1D points + SORTED_NUMERIC doc values: the DV stores the sortable-long
   * encoding (decoded via {@link NumericUtils#sortableLongToDouble}), while points answer numeric
   * range/exact queries directly.
   */
  public void testNumericKindDoublePointsAndDV() throws IOException {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());

    FieldType type = new FieldType();
    type.setDimensions(1, Double.BYTES);
    type.setDocValuesType(DocValuesType.SORTED_NUMERIC);
    type.freeze();

    double[] raw = {Double.NEGATIVE_INFINITY, -1.5d, 0.0d, 2.25d, Double.POSITIVE_INFINITY};
    long[] values = new long[raw.length];
    int[] docIds = new int[raw.length];
    for (int i = 0; i < raw.length; i++) {
      docIds[i] = i;
      values[i] = NumericUtils.doubleToSortableLong(raw[i]);
    }
    w.addBatch(
        simpleBatch(
            raw.length,
            new ArrayLongColumn("val", type, LongColumn.NumericKind.DOUBLE, docIds, values)));

    DirectoryReader r = DirectoryReader.open(w);
    LeafReader leaf = getOnlyLeafReader(r);

    SortedNumericDocValues dv = leaf.getSortedNumericDocValues("val");
    for (int i = 0; i < raw.length; i++) {
      assertEquals(i, dv.nextDoc());
      assertEquals(raw[i], NumericUtils.sortableLongToDouble(dv.nextValue()), 0d);
    }

    IndexSearcher searcher = new IndexSearcher(r);
    assertEquals(
        raw.length,
        searcher.count(
            DoublePoint.newRangeQuery("val", Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY)));
    assertEquals(1, searcher.count(DoublePoint.newExactQuery("val", -1.5d)));
    assertEquals(3, searcher.count(DoublePoint.newRangeQuery("val", -1.5d, 2.25d)));

    r.close();
    w.close();
    dir.close();
  }

  /** A multi-dimensional point field may not be combined with a numeric-kind long column. */
  public void testNumericKindPointsAndDVMultiDimRejected() throws IOException {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());

    // 2D int: scenario 3 requires 1D.
    FieldType type = new FieldType();
    type.setDimensions(2, Integer.BYTES);
    type.setDocValuesType(DocValuesType.SORTED_NUMERIC);
    type.freeze();

    expectThrows(
        IllegalArgumentException.class,
        () ->
            w.addBatch(
                simpleBatch(
                    1,
                    new ArrayLongColumn(
                        "val",
                        type,
                        LongColumn.NumericKind.LONG,
                        new int[] {0},
                        new long[] {1L}))));

    w.close();
    dir.close();
  }

  /** The declared point byte-width must match the numeric kind's encoding width. */
  public void testNumericKindPointsAndDVWidthMismatch() throws IOException {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());

    // FLOAT kind requires a 4-byte point field; declaring Long.BYTES should throw.
    FieldType type = new FieldType();
    type.setDimensions(1, Long.BYTES);
    type.setDocValuesType(DocValuesType.SORTED_NUMERIC);
    type.freeze();

    expectThrows(
        IllegalArgumentException.class,
        () ->
            w.addBatch(
                simpleBatch(
                    1,
                    new ArrayLongColumn(
                        "val",
                        type,
                        LongColumn.NumericKind.FLOAT,
                        new int[] {0},
                        new long[] {1L}))));

    w.close();
    dir.close();
  }

  public void testNumericKindFloatDVOnly() throws IOException {
    // DV only (no points): LongColumn stores the long value unchanged. For FLOAT, callers feed
    // sortable-int bits in the low 32 bits, and DV reads them back sign-extended to long.
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());

    FieldType type = new FieldType();
    type.setDocValuesType(DocValuesType.NUMERIC);
    type.freeze();

    float[] raw = {1.5f, -2.25f, Float.MAX_VALUE};
    long[] values = new long[raw.length];
    int[] docIds = new int[raw.length];
    for (int i = 0; i < raw.length; i++) {
      docIds[i] = i;
      values[i] = NumericUtils.floatToSortableInt(raw[i]);
    }
    w.addBatch(
        simpleBatch(
            raw.length,
            new ArrayLongColumn("val", type, LongColumn.NumericKind.FLOAT, docIds, values)));

    DirectoryReader r = DirectoryReader.open(w);
    LeafReader leaf = getOnlyLeafReader(r);
    NumericDocValues dv = leaf.getNumericDocValues("val");
    for (int i = 0; i < raw.length; i++) {
      assertEquals(i, dv.nextDoc());
      // Raw stored bits come back unchanged (no decoding on the DV-only path).
      assertEquals(values[i], dv.longValue());
    }

    r.close();
    w.close();
    dir.close();
  }

  /**
   * With a sparse row column, the batch must still produce {@code numDocs} documents in the
   * segment, and stored-fields for un-populated docs must be empty (not shifted, not missing). This
   * guards the row-dense framing contract: every doc-id in {@code [0, numDocs)} is framed
   * regardless of whether any row column has a value at that doc.
   */
  public void testSparseStoredFramingPreservesNumDocs() throws IOException {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());

    FieldType storedOnly = new FieldType();
    storedOnly.setStored(true);
    storedOnly.freeze();

    // 5 batch docs, but only docs 1 and 3 have a stored value.
    int[] docIds = {1, 3};
    BytesRef[] values = {newBytesRef("one"), newBytesRef("three")};
    w.addBatch(simpleBatch(5, new ArrayBinaryColumn("field", storedOnly, docIds, values)));

    DirectoryReader r = DirectoryReader.open(w);
    LeafReader leaf = getOnlyLeafReader(r);
    assertEquals(5, leaf.maxDoc());

    StoredFields storedFields = leaf.storedFields();
    assertNull(storedFields.document(0).getField("field"));
    assertEquals(newBytesRef("one"), storedFields.document(1).getField("field").binaryValue());
    assertNull(storedFields.document(2).getField("field"));
    assertEquals(newBytesRef("three"), storedFields.document(3).getField("field").binaryValue());
    assertNull(storedFields.document(4).getField("field"));

    r.close();
    w.close();
    dir.close();
  }

  /**
   * With a sparse indexed row column, the segment must still have {@code numDocs} documents, and
   * the inverted index must reflect only the populated docs. Guards termsHash framing alignment.
   */
  public void testSparseIndexedFramingPreservesNumDocs() throws IOException {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());

    FieldType indexedType = new FieldType();
    indexedType.setIndexOptions(IndexOptions.DOCS);
    indexedType.setOmitNorms(true);
    indexedType.setTokenized(false);
    indexedType.freeze();

    // 6 batch docs, only docs 2 and 5 have a term.
    int[] docIds = {2, 5};
    BytesRef[] values = {newBytesRef("a"), newBytesRef("b")};
    w.addBatch(simpleBatch(6, new ArrayBinaryColumn("tag", indexedType, docIds, values)));

    DirectoryReader r = DirectoryReader.open(w);
    LeafReader leaf = getOnlyLeafReader(r);
    assertEquals(6, leaf.maxDoc());

    IndexSearcher searcher = new IndexSearcher(r);
    assertEquals(1, searcher.count(new TermQuery(new Term("tag", "a"))));
    assertEquals(1, searcher.count(new TermQuery(new Term("tag", "b"))));

    r.close();
    w.close();
    dir.close();
  }

  /**
   * When some docs in the batch have only a DV column (no row column value), framing still happens
   * for every doc: stored fields must be empty for those docs, inverted index untouched, and DV
   * values align with their batch doc-ids.
   */
  public void testSparseRowMixedWithDenseDocValues() throws IOException {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());

    FieldType storedOnly = new FieldType();
    storedOnly.setStored(true);
    storedOnly.freeze();

    // Row-sparse stored column: only docs 0 and 3 have a stored value.
    int[] storedDocIds = {0, 3};
    BytesRef[] storedValues = {newBytesRef("first"), newBytesRef("fourth")};
    // Dense DV column covering every doc.
    long[] dvValues = {100, 200, 300, 400};

    w.addBatch(
        simpleBatch(
            4,
            new ArrayBinaryColumn("stored", storedOnly, storedDocIds, storedValues),
            new ArrayDenseLongColumn("dv", NumericDocValuesField.TYPE, dvValues)));

    DirectoryReader r = DirectoryReader.open(w);
    LeafReader leaf = getOnlyLeafReader(r);
    assertEquals(4, leaf.maxDoc());

    StoredFields storedFields = leaf.storedFields();
    assertEquals(newBytesRef("first"), storedFields.document(0).getField("stored").binaryValue());
    assertNull(storedFields.document(1).getField("stored"));
    assertNull(storedFields.document(2).getField("stored"));
    assertEquals(newBytesRef("fourth"), storedFields.document(3).getField("stored").binaryValue());

    NumericDocValues dv = leaf.getNumericDocValues("dv");
    for (int i = 0; i < dvValues.length; i++) {
      assertEquals(i, dv.nextDoc());
      assertEquals(dvValues[i], dv.longValue());
    }
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, dv.nextDoc());

    r.close();
    w.close();
    dir.close();
  }

  /**
   * Indexing the same logical docs via {@code addBatch} with a sparse row column vs. via {@code
   * addDocument} one doc at a time must produce segments with the same {@code maxDoc} and the same
   * stored-field / inverted-index visibility. This is the golden equivalence check.
   */
  public void testSparseBatchMatchesDocByDoc() throws IOException {
    FieldType storedIndexed = new FieldType(StringField.TYPE_STORED);
    storedIndexed.freeze();

    // 7 docs; only docs 1, 2, and 5 have values for the row column.
    int[] docIds = {1, 2, 5};
    String[] values = {"alpha", "beta", "gamma"};
    int totalDocs = 7;

    // --- Batch path ---
    Directory batchDir = newDirectory();
    try (IndexWriter batchW = new IndexWriter(batchDir, newIndexWriterConfig())) {
      BytesRef[] refs = new BytesRef[values.length];
      for (int i = 0; i < values.length; i++) {
        refs[i] = newBytesRef(values[i]);
      }
      // StringField stores as STRING — use the matching storedType so stored-value round-trip is
      // comparable between the two paths.
      batchW.addBatch(
          simpleBatch(
              totalDocs,
              new ArrayBinaryColumn(
                  "field", storedIndexed, docIds, refs, StoredValue.Type.STRING)));
    }

    // --- Doc-by-doc path ---
    Directory singleDir = newDirectory();
    try (IndexWriter singleW = new IndexWriter(singleDir, newIndexWriterConfig())) {
      int next = 0;
      for (int d = 0; d < totalDocs; d++) {
        Document doc = new Document();
        if (next < docIds.length && docIds[next] == d) {
          doc.add(
              new StringField("field", values[next], org.apache.lucene.document.Field.Store.YES));
          next++;
        }
        singleW.addDocument(doc);
      }
    }

    try (DirectoryReader batchR = DirectoryReader.open(batchDir);
        DirectoryReader singleR = DirectoryReader.open(singleDir)) {
      LeafReader batchLeaf = getOnlyLeafReader(batchR);
      LeafReader singleLeaf = getOnlyLeafReader(singleR);

      assertEquals(singleLeaf.maxDoc(), batchLeaf.maxDoc());
      assertEquals(totalDocs, batchLeaf.maxDoc());

      StoredFields batchStored = batchLeaf.storedFields();
      StoredFields singleStored = singleLeaf.storedFields();
      for (int d = 0; d < totalDocs; d++) {
        IndexableField bf = batchStored.document(d).getField("field");
        IndexableField sf = singleStored.document(d).getField("field");
        if (sf == null) {
          assertNull("doc " + d + " should have no stored field", bf);
        } else {
          assertNotNull("doc " + d + " should have a stored field", bf);
          assertEquals(sf.stringValue(), bf.stringValue());
        }
      }

      IndexSearcher batchSearcher = new IndexSearcher(batchR);
      IndexSearcher singleSearcher = new IndexSearcher(singleR);
      for (String v : values) {
        Term t = new Term("field", v);
        assertEquals(singleSearcher.count(new TermQuery(t)), batchSearcher.count(new TermQuery(t)));
      }
    }

    batchDir.close();
    singleDir.close();
  }

  /** A row column that returns an out-of-order batch doc-id must be rejected. */
  public void testRowColumnOutOfOrderDocIdThrows() throws IOException {
    Directory dir = newDirectory();
    try (IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) {
      FieldType storedOnly = new FieldType();
      storedOnly.setStored(true);
      storedOnly.freeze();

      // docIds intentionally not non-decreasing.
      int[] docIds = {2, 1};
      BytesRef[] values = {newBytesRef("a"), newBytesRef("b")};
      expectThrows(
          IllegalArgumentException.class,
          () -> w.addBatch(simpleBatch(3, new ArrayBinaryColumn("f", storedOnly, docIds, values))));
    }
    dir.close();
  }

  /** A row column that returns a batch doc-id {@code >= numDocs} must be rejected. */
  public void testRowColumnOutOfRangeDocIdThrows() throws IOException {
    Directory dir = newDirectory();
    try (IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) {
      FieldType storedOnly = new FieldType();
      storedOnly.setStored(true);
      storedOnly.freeze();

      // Batch size 3, but the column advertises a value at doc 5.
      int[] docIds = {5};
      BytesRef[] values = {newBytesRef("oob")};
      expectThrows(
          IllegalArgumentException.class,
          () -> w.addBatch(simpleBatch(3, new ArrayBinaryColumn("f", storedOnly, docIds, values))));
    }
    dir.close();
  }

  // --- Test Column implementations backed by arrays ---
  // (the array-backed Column implementations themselves are defined at the end of this class)

  // ---- VectorColumn tests ----

  /** A DENSE float vector column feeds one vector per batch doc, in doc order. */
  public void testDenseFloatVectorColumn() throws IOException {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());

    FieldType vectorType = floatVectorType(3, VectorSimilarityFunction.EUCLIDEAN);
    float[][] vectors = {
      {1f, 2f, 3f}, {4f, 5f, 6f}, {7f, 8f, 9f},
    };
    w.addBatch(simpleBatch(3, new ArrayDenseFloatVectorColumn("v", vectorType, vectors)));

    DirectoryReader r = DirectoryReader.open(w);
    LeafReader leaf = getOnlyLeafReader(r);
    FloatVectorValues values = leaf.getFloatVectorValues("v");
    assertNotNull(values);
    KnnVectorValues.DocIndexIterator it = values.iterator();
    for (int i = 0; i < vectors.length; i++) {
      assertEquals(i, it.nextDoc());
      assertArrayEquals(vectors[i], values.vectorValue(it.index()), 0f);
    }
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, it.nextDoc());

    r.close();
    w.close();
    dir.close();
  }

  /** Same as the dense float case, but with BYTE-encoded vectors. */
  public void testDenseByteVectorColumn() throws IOException {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());

    FieldType vectorType = byteVectorType(4, VectorSimilarityFunction.EUCLIDEAN);
    byte[][] vectors = {
      {1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12},
    };
    w.addBatch(simpleBatch(3, new ArrayDenseByteVectorColumn("v", vectorType, vectors)));

    DirectoryReader r = DirectoryReader.open(w);
    LeafReader leaf = getOnlyLeafReader(r);
    ByteVectorValues values = leaf.getByteVectorValues("v");
    assertNotNull(values);
    KnnVectorValues.DocIndexIterator it = values.iterator();
    for (int i = 0; i < vectors.length; i++) {
      assertEquals(i, it.nextDoc());
      assertArrayEquals(vectors[i], values.vectorValue(it.index()));
    }
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, it.nextDoc());

    r.close();
    w.close();
    dir.close();
  }

  /** Sparse float vectors: the iterator must surface exactly the populated batch doc-ids. */
  public void testSparseFloatVectorColumn() throws IOException {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());

    FieldType vectorType = floatVectorType(2, VectorSimilarityFunction.EUCLIDEAN);
    int[] docIds = {0, 2, 5, 9};
    float[][] vectors = {{1f, 1f}, {2f, 2f}, {3f, 3f}, {4f, 4f}};
    // pair with a sparse long column so the batch has a defined doc count > vector count
    int[] anchorIds = {0, 9};
    long[] anchorVals = {0L, 9L};
    w.addBatch(
        simpleBatch(
            10,
            new ArrayFloatVectorColumn("v", vectorType, docIds, vectors),
            new ArrayLongColumn("anchor", NumericDocValuesField.TYPE, anchorIds, anchorVals)));

    DirectoryReader r = DirectoryReader.open(w);
    LeafReader leaf = getOnlyLeafReader(r);
    FloatVectorValues values = leaf.getFloatVectorValues("v");
    assertNotNull(values);
    KnnVectorValues.DocIndexIterator it = values.iterator();
    for (int i = 0; i < docIds.length; i++) {
      assertEquals(docIds[i], it.nextDoc());
      assertArrayEquals(vectors[i], values.vectorValue(it.index()), 0f);
    }
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, it.nextDoc());

    r.close();
    w.close();
    dir.close();
  }

  /** Sparse byte vectors: same doc-id surfacing contract as the float case. */
  public void testSparseByteVectorColumn() throws IOException {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());

    FieldType vectorType = byteVectorType(2, VectorSimilarityFunction.EUCLIDEAN);
    int[] docIds = {1, 4};
    byte[][] vectors = {{1, 2}, {3, 4}};
    int[] anchorIds = {0, 5};
    long[] anchorVals = {0L, 5L};
    w.addBatch(
        simpleBatch(
            6,
            new ArrayByteVectorColumn("v", vectorType, docIds, vectors),
            new ArrayLongColumn("anchor", NumericDocValuesField.TYPE, anchorIds, anchorVals)));

    DirectoryReader r = DirectoryReader.open(w);
    LeafReader leaf = getOnlyLeafReader(r);
    ByteVectorValues values = leaf.getByteVectorValues("v");
    assertNotNull(values);
    KnnVectorValues.DocIndexIterator it = values.iterator();
    for (int i = 0; i < docIds.length; i++) {
      assertEquals(docIds[i], it.nextDoc());
      assertArrayEquals(vectors[i], values.vectorValue(it.index()));
    }
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, it.nextDoc());

    r.close();
    w.close();
    dir.close();
  }

  /** Vector, numeric-DV, and binary-DV columns in one batch must all stay doc-aligned. */
  public void testVectorMixedWithLongAndBinary() throws IOException {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());

    FieldType vectorType = floatVectorType(2, VectorSimilarityFunction.DOT_PRODUCT);
    float[][] vectors = {{0.6f, 0.8f}, {0.8f, 0.6f}, {1.0f, 0.0f}};
    long[] longs = {10, 20, 30};
    BytesRef[] bins = {newBytesRef("a"), newBytesRef("b"), newBytesRef("c")};
    int[] ids = {0, 1, 2};
    w.addBatch(
        simpleBatch(
            3,
            new ArrayDenseFloatVectorColumn("v", vectorType, vectors),
            new ArrayLongColumn("num", NumericDocValuesField.TYPE, ids, longs),
            new ArrayBinaryColumn("bin", BinaryDocValuesField.TYPE, ids, bins)));

    DirectoryReader r = DirectoryReader.open(w);
    LeafReader leaf = getOnlyLeafReader(r);
    NumericDocValues nums = leaf.getNumericDocValues("num");
    BinaryDocValues binDv = leaf.getBinaryDocValues("bin");
    FloatVectorValues vec = leaf.getFloatVectorValues("v");
    KnnVectorValues.DocIndexIterator it = vec.iterator();
    for (int i = 0; i < 3; i++) {
      assertEquals(i, nums.nextDoc());
      assertEquals(longs[i], nums.longValue());
      assertEquals(i, binDv.nextDoc());
      assertEquals(bins[i], binDv.binaryValue());
      assertEquals(i, it.nextDoc());
      assertArrayEquals(vectors[i], vec.vectorValue(it.index()), 0f);
    }
    r.close();
    w.close();
    dir.close();
  }

  /** Vectors from multiple batches concatenate in batch order within the merged segment. */
  public void testVectorAcrossMultipleBatches() throws IOException {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());

    FieldType vectorType = floatVectorType(2, VectorSimilarityFunction.EUCLIDEAN);
    float[][] firstBatch = {{1f, 1f}, {2f, 2f}};
    float[][] secondBatch = {{3f, 3f}, {4f, 4f}, {5f, 5f}};
    w.addBatch(simpleBatch(2, new ArrayDenseFloatVectorColumn("v", vectorType, firstBatch)));
    w.addBatch(simpleBatch(3, new ArrayDenseFloatVectorColumn("v", vectorType, secondBatch)));

    DirectoryReader r = DirectoryReader.open(w);
    LeafReader leaf = getOnlyLeafReader(r);
    FloatVectorValues values = leaf.getFloatVectorValues("v");
    KnnVectorValues.DocIndexIterator it = values.iterator();
    float[][] all = {firstBatch[0], firstBatch[1], secondBatch[0], secondBatch[1], secondBatch[2]};
    for (int i = 0; i < all.length; i++) {
      assertEquals(i, it.nextDoc());
      assertArrayEquals(all[i], values.vectorValue(it.index()), 0f);
    }
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, it.nextDoc());

    r.close();
    w.close();
    dir.close();
  }

  public void testEmptyVectorColumnRejected() throws IOException {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());

    // A field type alone is not enough — every batch must have at least one column with data,
    // and a vector-only column with no values is the equivalent of "no documents have this
    // vector". We pair it with a long anchor to make the batch valid; the vector cursor returns
    // NO_MORE_DOCS immediately.
    FieldType vectorType = floatVectorType(2, VectorSimilarityFunction.EUCLIDEAN);
    int[] anchorIds = {0, 1};
    long[] anchorVals = {0L, 1L};
    w.addBatch(
        simpleBatch(
            2,
            new ArrayFloatVectorColumn("v", vectorType, new int[0], new float[0][]),
            new ArrayLongColumn("anchor", NumericDocValuesField.TYPE, anchorIds, anchorVals)));

    DirectoryReader r = DirectoryReader.open(w);
    LeafReader leaf = getOnlyLeafReader(r);
    FloatVectorValues values = leaf.getFloatVectorValues("v");
    if (values != null) {
      assertEquals(DocIdSetIterator.NO_MORE_DOCS, values.iterator().nextDoc());
    }

    r.close();
    w.close();
    dir.close();
  }

  /** A configured parent field must be materialized for every batch doc alongside the vectors. */
  public void testParentFieldWithVectorBatch() throws IOException {
    Directory dir = newDirectory();
    IndexWriterConfig config = newIndexWriterConfig();
    config.setParentField("_parent");
    IndexWriter w = new IndexWriter(dir, config);

    FieldType vectorType = floatVectorType(2, VectorSimilarityFunction.EUCLIDEAN);
    float[][] vectors = {{1f, 0f}, {0f, 1f}, {1f, 1f}};
    w.addBatch(simpleBatch(3, new ArrayDenseFloatVectorColumn("v", vectorType, vectors)));

    DirectoryReader r = DirectoryReader.open(w);
    LeafReader leaf = getOnlyLeafReader(r);
    NumericDocValues parentDv = leaf.getNumericDocValues("_parent");
    assertNotNull(parentDv);
    for (int i = 0; i < 3; i++) {
      assertEquals(i, parentDv.nextDoc());
    }
    FloatVectorValues values = leaf.getFloatVectorValues("v");
    KnnVectorValues.DocIndexIterator it = values.iterator();
    for (int i = 0; i < 3; i++) {
      assertEquals(i, it.nextDoc());
      assertArrayEquals(vectors[i], values.vectorValue(it.index()), 0f);
    }

    r.close();
    w.close();
    dir.close();
  }

  public void testFloatVectorEncodingMismatchFails() throws IOException {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());

    // FieldType says FLOAT32 but column carries byte[] vectors.
    FieldType vectorType = floatVectorType(2, VectorSimilarityFunction.EUCLIDEAN);
    byte[][] vectors = {{1, 2}, {3, 4}};
    expectThrows(
        ClassCastException.class,
        () -> w.addBatch(simpleBatch(2, new ArrayDenseByteVectorColumn("v", vectorType, vectors))));
    w.rollback();
    dir.close();
  }

  /** A vector whose length disagrees with the declared dimension must be rejected. */
  public void testWrongDimensionFails() throws IOException {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());

    FieldType vectorType = floatVectorType(3, VectorSimilarityFunction.EUCLIDEAN);
    float[][] vectors = {{1f, 2f, 3f}, {4f, 5f}};
    IllegalArgumentException e =
        expectThrows(
            IllegalArgumentException.class,
            () ->
                w.addBatch(
                    simpleBatch(2, new ArrayDenseFloatVectorColumn("v", vectorType, vectors))));
    assertTrue(e.getMessage(), e.getMessage().contains("expected dimension 3"));
    w.rollback();
    dir.close();
  }

  public void testZeroDimensionFieldTypeFails() {
    FieldType bad = new FieldType();
    // No vector attributes set -> vectorDimension() == 0
    bad.setDocValuesType(DocValuesType.NUMERIC);
    bad.freeze();
    IllegalArgumentException e =
        expectThrows(
            IllegalArgumentException.class,
            () -> new ArrayDenseFloatVectorColumn("v", bad, new float[][] {{1f, 2f}}));
    assertTrue(e.getMessage(), e.getMessage().contains("vectorDimension() > 0"));
  }

  /** A vector column's field type may not additionally declare doc values. */
  public void testVectorWithDocValuesRejected() throws IOException {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());

    FieldType bad = new FieldType();
    bad.setVectorAttributes(2, VectorEncoding.FLOAT32, VectorSimilarityFunction.EUCLIDEAN);
    bad.setDocValuesType(DocValuesType.NUMERIC);
    bad.freeze();
    IllegalArgumentException e =
        expectThrows(
            IllegalArgumentException.class,
            () ->
                w.addBatch(
                    simpleBatch(
                        1, new ArrayDenseFloatVectorColumn("v", bad, new float[][] {{1f, 2f}}))));
    assertTrue(e.getMessage(), e.getMessage().contains("must be vector-only"));
    w.rollback();
    dir.close();
  }
+ public void testVectorWithStoredRejected() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType bad = new FieldType(); + bad.setVectorAttributes(2, VectorEncoding.FLOAT32, VectorSimilarityFunction.EUCLIDEAN); + bad.setStored(true); + bad.freeze(); + IllegalArgumentException e = + expectThrows( + IllegalArgumentException.class, + () -> + w.addBatch( + simpleBatch( + 1, new ArrayDenseFloatVectorColumn("v", bad, new float[][] {{1f, 2f}})))); + assertTrue(e.getMessage(), e.getMessage().contains("must be vector-only")); + w.rollback(); + dir.close(); + } + + public void testVectorWithIndexOptionsRejected() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType bad = new FieldType(); + bad.setVectorAttributes(2, VectorEncoding.FLOAT32, VectorSimilarityFunction.EUCLIDEAN); + bad.setIndexOptions(IndexOptions.DOCS); + bad.setTokenized(false); + bad.freeze(); + IllegalArgumentException e = + expectThrows( + IllegalArgumentException.class, + () -> + w.addBatch( + simpleBatch( + 1, new ArrayDenseFloatVectorColumn("v", bad, new float[][] {{1f, 2f}})))); + assertTrue(e.getMessage(), e.getMessage().contains("must be vector-only")); + w.rollback(); + dir.close(); + } + + public void testVectorWithPointsRejected() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType bad = new FieldType(); + bad.setVectorAttributes(2, VectorEncoding.FLOAT32, VectorSimilarityFunction.EUCLIDEAN); + bad.setDimensions(1, Integer.BYTES); + bad.freeze(); + IllegalArgumentException e = + expectThrows( + IllegalArgumentException.class, + () -> + w.addBatch( + simpleBatch( + 1, new ArrayDenseFloatVectorColumn("v", bad, new float[][] {{1f, 2f}})))); + assertTrue(e.getMessage(), e.getMessage().contains("must be vector-only")); + w.rollback(); + dir.close(); + } + + public 
void testDuplicateDocIDFails() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType vectorType = floatVectorType(2, VectorSimilarityFunction.EUCLIDEAN); + int[] docIds = {0, 0}; + float[][] vectors = {{1f, 2f}, {3f, 4f}}; + IllegalArgumentException e = + expectThrows( + IllegalArgumentException.class, + () -> + w.addBatch( + simpleBatch(2, new ArrayFloatVectorColumn("v", vectorType, docIds, vectors)))); + assertTrue(e.getMessage(), e.getMessage().contains("strictly increasing")); + w.rollback(); + dir.close(); + } + + public void testDecreasingDocIDFails() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType vectorType = floatVectorType(2, VectorSimilarityFunction.EUCLIDEAN); + int[] docIds = {3, 1}; + float[][] vectors = {{1f, 2f}, {3f, 4f}}; + IllegalArgumentException e = + expectThrows( + IllegalArgumentException.class, + () -> + w.addBatch( + simpleBatch(4, new ArrayFloatVectorColumn("v", vectorType, docIds, vectors)))); + assertTrue(e.getMessage(), e.getMessage().contains("strictly increasing")); + w.rollback(); + dir.close(); + } + + public void testVectorOutOfRangeDocIDFails() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType vectorType = floatVectorType(2, VectorSimilarityFunction.EUCLIDEAN); + int[] docIds = {0, 5}; + float[][] vectors = {{1f, 2f}, {3f, 4f}}; + IllegalArgumentException e = + expectThrows( + IllegalArgumentException.class, + () -> + w.addBatch( + simpleBatch(3, new ArrayFloatVectorColumn("v", vectorType, docIds, vectors)))); + assertTrue(e.getMessage(), e.getMessage().contains("out of range")); + w.rollback(); + dir.close(); + } + + public void testDenseVectorColumnTooFewValuesFails() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); 
+ + FieldType vectorType = floatVectorType(2, VectorSimilarityFunction.EUCLIDEAN); + // 2 values declared DENSE but the batch has 3 docs. + float[][] vectors = {{1f, 2f}, {3f, 4f}}; + IllegalArgumentException e = + expectThrows( + IllegalArgumentException.class, + () -> + w.addBatch( + simpleBatch(3, new ArrayDenseFloatVectorColumn("v", vectorType, vectors)))); + assertTrue(e.getMessage(), e.getMessage().contains("Dense column")); + w.rollback(); + dir.close(); + } + + public void testVectorColumnSchemaConsistencyAcrossBatches() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()); + + FieldType float32Type = floatVectorType(2, VectorSimilarityFunction.EUCLIDEAN); + w.addBatch( + simpleBatch( + 1, new ArrayDenseFloatVectorColumn("v", float32Type, new float[][] {{1f, 2f}}))); + + FieldType byteType = byteVectorType(2, VectorSimilarityFunction.EUCLIDEAN); + expectThrows( + IllegalArgumentException.class, + () -> + w.addBatch( + simpleBatch( + 1, new ArrayDenseByteVectorColumn("v", byteType, new byte[][] {{1, 2}})))); + w.rollback(); + dir.close(); + } + + private static FieldType floatVectorType(int dimension, VectorSimilarityFunction sim) { + FieldType type = new FieldType(); + type.setVectorAttributes(dimension, VectorEncoding.FLOAT32, sim); + type.freeze(); + return type; + } + + private static FieldType byteVectorType(int dimension, VectorSimilarityFunction sim) { + FieldType type = new FieldType(); + type.setVectorAttributes(dimension, VectorEncoding.BYTE, sim); + type.freeze(); + return type; + } + + private static ColumnBatch simpleBatch(int numDocs, Column... 
columns) { + return new ColumnBatch() { + @Override + public int numDocs() { + return numDocs; + } + + @Override + public Iterable columns() { + return List.of(columns); + } + }; + } + + private static class ArrayLongColumn extends LongColumn { + private final int[] docIds; + private final long[] values; + + ArrayLongColumn(String name, IndexableFieldType fieldType, int[] docIds, long[] values) { + super(name, fieldType, Density.SPARSE); + assert docIds.length == values.length; + this.docIds = docIds; + this.values = values; + } + + ArrayLongColumn( + String name, + IndexableFieldType fieldType, + NumericKind numericKind, + int[] docIds, + long[] values) { + super(name, fieldType, Density.SPARSE, numericKind); + assert docIds.length == values.length; + this.docIds = docIds; + this.values = values; + } + + @Override + public LongTupleCursor tuples() { + return new LongTupleCursor() { + int pos = -1; + + @Override + public int nextDoc() { + pos++; + return pos < docIds.length ? docIds[pos] : DocIdSetIterator.NO_MORE_DOCS; + } + + @Override + public long longValue() { + return values[pos]; + } + }; + } + } + + private static class ArrayBinaryColumn extends BinaryColumn { + private final int[] docIds; + private final BytesRef[] values; + private final StoredValue.Type storedType; + + ArrayBinaryColumn(String name, IndexableFieldType fieldType, int[] docIds, BytesRef[] values) { + this(name, fieldType, docIds, values, StoredValue.Type.BINARY); + } + + ArrayBinaryColumn( + String name, + IndexableFieldType fieldType, + int[] docIds, + BytesRef[] values, + StoredValue.Type storedType) { + super(name, fieldType, Density.SPARSE); + assert docIds.length == values.length; + this.docIds = docIds; + this.values = values; + this.storedType = storedType; + } + + @Override + public StoredValue.Type storedType() { + return storedType; + } + + @Override + public BinaryTupleCursor tuples() { + return new BinaryTupleCursor() { + int pos = -1; + + @Override + public int nextDoc() { + 
pos++; + return pos < docIds.length ? docIds[pos] : DocIdSetIterator.NO_MORE_DOCS; + } + + @Override + public BytesRef binaryValue() { + return values[pos]; + } + }; + } + } + + /** Dense {@link LongColumn} with an optional bulk values cursor. */ + private static class ArrayDenseLongColumn extends LongColumn { + private final long[] values; + + ArrayDenseLongColumn(String name, IndexableFieldType fieldType, long[] values) { + super(name, fieldType, Density.DENSE); + this.values = values; + } + + @Override + public LongTupleCursor tuples() { + return new LongTupleCursor() { + int pos = -1; + + @Override + public int nextDoc() { + pos++; + return pos < values.length ? pos : DocIdSetIterator.NO_MORE_DOCS; + } + + @Override + public long longValue() { + return values[pos]; + } + }; + } + + @Override + public LongValuesCursor values() { + return new LongValuesCursor() { + int pos = 0; + + @Override + public int size() { + return values.length; + } + + @Override + public long nextLong() { + if (pos >= values.length) { + throw new IllegalStateException("LongValuesCursor exhausted: size=" + values.length); + } + return values[pos++]; + } + + @Override + public void fill(long[] dst, int offset, int length) { + if (pos + length > values.length) { + throw new IllegalStateException("LongValuesCursor exhausted: size=" + values.length); + } + System.arraycopy(values, pos, dst, offset, length); + pos += length; + } + }; + } + } + + private static class ArrayFloatVectorColumn extends VectorColumn { + private final int[] docIds; + private final float[][] values; + + ArrayFloatVectorColumn( + String name, IndexableFieldType fieldType, int[] docIds, float[][] values) { + super(name, fieldType, Density.SPARSE); + assert docIds.length == values.length; + this.docIds = docIds; + this.values = values; + } + + @Override + public VectorTupleCursor tuples() { + return new VectorTupleCursor<>() { + int pos = -1; + + @Override + public int nextDoc() { + pos++; + return pos < docIds.length ? 
docIds[pos] : DocIdSetIterator.NO_MORE_DOCS; + } + + @Override + public float[] vectorValue() { + return values[pos]; + } + }; + } + } + + private static class ArrayByteVectorColumn extends VectorColumn { + private final int[] docIds; + private final byte[][] values; + + ArrayByteVectorColumn( + String name, IndexableFieldType fieldType, int[] docIds, byte[][] values) { + super(name, fieldType, Density.SPARSE); + assert docIds.length == values.length; + this.docIds = docIds; + this.values = values; + } + + @Override + public VectorTupleCursor tuples() { + return new VectorTupleCursor<>() { + int pos = -1; + + @Override + public int nextDoc() { + pos++; + return pos < docIds.length ? docIds[pos] : DocIdSetIterator.NO_MORE_DOCS; + } + + @Override + public byte[] vectorValue() { + return values[pos]; + } + }; + } + } + + private static class ArrayDenseFloatVectorColumn extends VectorColumn { + private final float[][] values; + + ArrayDenseFloatVectorColumn(String name, IndexableFieldType fieldType, float[][] values) { + super(name, fieldType, Density.DENSE); + this.values = values; + } + + @Override + public VectorTupleCursor tuples() { + return new VectorTupleCursor<>() { + int pos = -1; + + @Override + public int nextDoc() { + pos++; + return pos < values.length ? pos : DocIdSetIterator.NO_MORE_DOCS; + } + + @Override + public float[] vectorValue() { + return values[pos]; + } + }; + } + } + + private static class ArrayDenseByteVectorColumn extends VectorColumn { + private final byte[][] values; + + ArrayDenseByteVectorColumn(String name, IndexableFieldType fieldType, byte[][] values) { + super(name, fieldType, Density.DENSE); + this.values = values; + } + + @Override + public VectorTupleCursor tuples() { + return new VectorTupleCursor<>() { + int pos = -1; + + @Override + public int nextDoc() { + pos++; + return pos < values.length ? pos : DocIdSetIterator.NO_MORE_DOCS; + } + + @Override + public byte[] vectorValue() { + return values[pos]; + } + }; + } + } +}