Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,8 @@ API Changes
* GITHUB#15584: Add support for termdoc fields that use custom term freqs (via IndexOptions.DOCS_AND_CUSTOM_FREQS).
IndexWriter counts their terms rather than summing their freqs. Use

* GITHUB#15990: Add experimental API to IndexWriter for columnar indexing.

New Features
---------------------

Expand Down
4 changes: 4 additions & 0 deletions lucene/core/src/java/module-info.java
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,10 @@
exports org.apache.lucene.codecs.hnsw;
exports org.apache.lucene.internal.vectorization to
org.apache.lucene.benchmark.jmh;
exports org.apache.lucene.document.column;

opens org.apache.lucene.document.column to
org.apache.lucene.test_framework;

provides org.apache.lucene.analysis.TokenizerFactory with
org.apache.lucene.analysis.standard.StandardTokenizerFactory;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.document.column;

import org.apache.lucene.document.StoredValue;
import org.apache.lucene.index.IndexableFieldType;

/**
 * A {@link Column} whose values are variable-length byte sequences, exposed through a tuple
 * cursor. It serves {@link org.apache.lucene.index.DocValuesType#BINARY BINARY}, {@link
 * org.apache.lucene.index.DocValuesType#SORTED SORTED}, and {@link
 * org.apache.lucene.index.DocValuesType#SORTED_SET SORTED_SET} doc values, as well as
 * stored/indexed binary or text fields.
 *
 * <p>Bytes destined for points are handed over unchanged; it is the caller's responsibility to
 * supply sort-encoded bytes of the correct total length.
 *
 * <p>Use {@link LongColumn} instead for numeric doc values ({@link
 * org.apache.lucene.index.DocValuesType#NUMERIC NUMERIC} / {@link
 * org.apache.lucene.index.DocValuesType#SORTED_NUMERIC SORTED_NUMERIC}) and for 1-D numeric
 * points (int / long / float / double).
 *
 * @lucene.experimental
 */
public abstract class BinaryColumn extends Column {

  /**
   * Creates a BinaryColumn for the given field.
   *
   * @param name the field name
   * @param fieldType the type describing how the field is indexed
   * @param density the density contract of this column
   */
  protected BinaryColumn(String name, IndexableFieldType fieldType, Density density) {
    super(name, fieldType, density);
  }

  /**
   * Returns the {@link StoredValue.Type} used when this column is written to stored fields.
   *
   * <p>Unless overridden, this is {@link StoredValue.Type#BINARY}. The only supported results are
   * {@link StoredValue.Type#BINARY} and {@link StoredValue.Type#STRING}; numeric stored types
   * must go through {@link LongColumn}.
   *
   * @return the stored value type for this column
   */
  public StoredValue.Type storedType() {
    return StoredValue.Type.BINARY;
  }

  /**
   * Returns a new tuple cursor positioned at the beginning of the batch.
   *
   * @return a fresh cursor over this column's {@code (docID, value)} tuples
   */
  public abstract BinaryTupleCursor tuples();
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.document.column;

import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

/**
 * Cursor over the {@code (docID, binaryValue)} tuples of a {@link BinaryColumn}.
 *
 * <p>The batch-local doc-ids it yields never decrease. For multi-valued fields (e.g. {@link
 * org.apache.lucene.index.DocValuesType#SORTED_SET SORTED_SET}) the same doc-id may be returned
 * more than once, one time per value.
 *
 * @lucene.experimental
 */
public abstract class BinaryTupleCursor {

  /** Sole constructor. */
  protected BinaryTupleCursor() {}

  /**
   * Moves the cursor to the next tuple and returns its doc-id. Doc-ids are batch-local, i.e. in
   * the range 0 to {@code numDocs - 1}, and may repeat across consecutive calls when a document
   * has several values.
   *
   * @return the doc-id of the tuple the cursor now points at, or {@link
   *     DocIdSetIterator#NO_MORE_DOCS} once the cursor is exhausted
   */
  public abstract int nextDoc();

  /**
   * Returns the value of the tuple the cursor currently points at.
   *
   * <p>This may only be called after a {@link #nextDoc()} that returned something other than
   * {@link DocIdSetIterator#NO_MORE_DOCS}, and the returned value is only valid until the next
   * call to {@link #nextDoc()}.
   *
   * @return the binary value at the current cursor position
   */
  public abstract BytesRef binaryValue();
}
77 changes: 77 additions & 0 deletions lucene/core/src/java/org/apache/lucene/document/column/Column.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.document.column;

import java.util.Objects;
import org.apache.lucene.index.IndexableFieldType;

/**
 * The values of one field across the documents of a {@link ColumnBatch}. The Column itself holds
 * only metadata (name, field type, and density); the values are iterated through cursors obtained
 * from {@link LongColumn}, {@link BinaryColumn}, or {@link VectorColumn}.
 *
 * <p>Every request for a cursor yields a fresh one positioned at the first value, so a column can
 * be consumed more than once (for example, once in the row-oriented pass for stored fields and
 * again in the column-oriented pass for doc values).
 *
 * @lucene.experimental
 */
public abstract class Column {

  /**
   * Declares whether a column has a value for every document in the batch. The column asserts
   * this contract up-front so that the indexing chain can choose the right code path without
   * probing the data.
   */
  public enum Density {
    /** A value exists for every batch-local doc-id in {@code [0, numDocs)}, in order. */
    DENSE,
    /** Some doc-ids may have no value, or may have multiple values. */
    SPARSE,
  }

  private final String name;
  private final IndexableFieldType fieldType;
  private final Density density;

  /**
   * Creates a Column from its metadata.
   *
   * @param name the field name
   * @param fieldType describes how this field should be indexed
   * @param density whether this column has a value for every document in the batch
   * @throws NullPointerException if any argument is {@code null}
   */
  protected Column(String name, IndexableFieldType fieldType, Density density) {
    // Validate in declaration order so the first null argument determines the error message.
    Objects.requireNonNull(name, "field name must not be null");
    Objects.requireNonNull(fieldType, "field type must not be null");
    Objects.requireNonNull(density, "density must not be null");
    this.name = name;
    this.fieldType = fieldType;
    this.density = density;
  }

  /**
   * Returns the field name.
   *
   * @return the name of the field this column holds values for; never {@code null}
   */
  public String name() {
    return this.name;
  }

  /**
   * Returns the field type describing how this field is indexed.
   *
   * @return the field type; never {@code null}
   */
  public IndexableFieldType fieldType() {
    return this.fieldType;
  }

  /**
   * Returns the density of this column, i.e. whether every document has a value.
   *
   * @return the density; never {@code null}
   */
  public Density density() {
    return this.density;
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.document.column;

/**
 * A batch of documents laid out by column for indexing. The batch is made up of {@link Column}s,
 * each of which holds a single field's values across all documents in the batch. Documents are
 * addressed by batch-local IDs running from 0 (inclusive) to {@link #numDocs()} (exclusive).
 *
 * @lucene.experimental
 */
public abstract class ColumnBatch {

  /** Sole constructor. (For invocation by subclass constructors, typically implicit.) */
  protected ColumnBatch() {}

  /**
   * Returns how many documents this batch contains. Every doc-id used by a column must fall in
   * the range {@code [0, numDocs())}.
   *
   * @return the number of documents in this batch
   */
  public abstract int numDocs();

  /**
   * Returns the columns that make up this batch, one per field, each covering the documents in
   * the batch.
   *
   * @return the columns of this batch
   */
  public abstract Iterable<Column> columns();
}
Loading
Loading