diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 134d529a0385..0c14009c9806 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -129,6 +129,10 @@ Improvements Optimizations --------------------- +* GITHUB#15139: BooleanQuery rewrites so a FILTER on the primary index sort field can restrict + the remaining clauses' BulkScorer to a contiguous doc id range when safe (sorted numeric range + filters; single-valued sorted-set term filters with dense postings). (Prithvi S) + * GITHUB#15681, GITHUB#15833: Replace pre-sized array or empty array with lambda expression to call Collection#toArray. (Zhou Hui) * GITHUB#13782: Replace handwritten loops compare with Arrays.compareUnsigned in TermsEnum and TermsEnumFrame classes. (Zhou Hui) diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/PrimaryIndexSortFilterBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/PrimaryIndexSortFilterBenchmark.java new file mode 100644 index 000000000000..674926cd87bc --- /dev/null +++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/PrimaryIndexSortFilterBenchmark.java @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.lucene.benchmark.jmh;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Comparator;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Stream;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.KeywordField;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.search.SortedSetSelector;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.TopFieldCollectorManager;
+import org.apache.lucene.search.TopScoreDocCollectorManager;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.MMapDirectory;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.TearDown;
+import org.openjdk.jmh.annotations.Warmup;
+
+/**
+ * Benchmarks a boolean query whose FILTER clause is a term on the {@code category} field, once
+ * against an index sorted on {@code category} and once against an otherwise identical index
+ * without an index sort.
+ */
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@State(Scope.Benchmark)
+@Warmup(iterations = 5, time = 3)
+@Measurement(iterations = 10, time = 5)
+@Fork(
+    value = 3,
+    jvmArgsAppend = {"-Xmx2g", "-Xms2g"})
+public class PrimaryIndexSortFilterBenchmark {
+
+  private static final int NUM_DOCS = 1_000_000;
+  // The first FILTERED_DOCS documents get the "books" category that the query filters on.
+  private static final int FILTERED_DOCS = 10_000;
+  private static final int NUM_DESCRIPTION_TERMS = 20;
+  private static final int NUM_HITS = 10;
+
+  private Directory sortedDir;
+  private Directory unsortedDir;
+  private IndexReader sortedReader;
+  private IndexReader unsortedReader;
+  private Path sortedPath;
+  private Path unsortedPath;
+  private IndexSearcher sortedSearcher;
+  private IndexSearcher unsortedSearcher;
+  private Query sortedQuery;
+  private Query unsortedQuery;
+
+  /**
+   * Builds the sorted and unsorted indexes in fresh temporary directories, opens a searcher on
+   * each, and rewrites the benchmark query once per searcher so that the benchmark methods search
+   * with an already rewritten query.
+   */
+  @Setup(Level.Trial)
+  public void setUp() throws IOException {
+    sortedPath = Files.createTempDirectory("primary-sort-filter-sorted");
+    unsortedPath = Files.createTempDirectory("primary-sort-filter-unsorted");
+    sortedDir = new MMapDirectory(sortedPath);
+    unsortedDir = new MMapDirectory(unsortedPath);
+
+    buildIndex(sortedDir, true);
+    buildIndex(unsortedDir, false);
+
+    sortedReader = DirectoryReader.open(sortedDir);
+    unsortedReader = DirectoryReader.open(unsortedDir);
+    sortedSearcher = new IndexSearcher(sortedReader);
+    unsortedSearcher = new IndexSearcher(unsortedReader);
+
+    sortedQuery = sortedSearcher.rewrite(buildQuery());
+    unsortedQuery = unsortedSearcher.rewrite(buildQuery());
+  }
+
+  /**
+   * Writes {@code NUM_DOCS} documents into {@code dir} and force-merges them into one segment.
+   *
+   * @param dir directory that receives the index
+   * @param sorted whether to configure an index sort on the {@code category} field
+   */
+  private static void buildIndex(Directory dir, boolean sorted) throws IOException {
+    IndexWriterConfig config = new IndexWriterConfig();
+    if (sorted) {
+      config.setIndexSort(
+          new Sort(
+              KeywordField.newSortField(
+                  "category", false, SortedSetSelector.Type.MIN, SortField.STRING_LAST)));
+    }
+
+    try (IndexWriter writer = new IndexWriter(dir, config)) {
+      for (int i = 0; i < NUM_DOCS; i++) {
+        Document doc = new Document();
+        String category = i < FILTERED_DOCS ? "books" : (i < NUM_DOCS / 2 ? "music" : "software");
+        doc.add(new KeywordField("category", category, Field.Store.NO));
+        // Each document carries the description terms t<term> for which (i + term) % 3 == 0.
+        for (int term = 0; term < NUM_DESCRIPTION_TERMS; term++) {
+          if ((i + term) % 3 == 0) {
+            doc.add(new StringField("description", "t" + term, Field.Store.NO));
+          }
+        }
+        writer.addDocument(doc);
+      }
+      writer.forceMerge(1);
+    }
+  }
+
+  /**
+   * Builds the benchmark query: a MUST disjunction over all description terms combined with a
+   * FILTER on {@code category:books}.
+   */
+  private static Query buildQuery() {
+    BooleanQuery.Builder description = new BooleanQuery.Builder();
+    for (int term = 0; term < NUM_DESCRIPTION_TERMS; term++) {
+      description.add(
+          new TermQuery(new Term("description", "t" + term)), BooleanClause.Occur.SHOULD);
+    }
+
+    BooleanQuery.Builder query = new BooleanQuery.Builder();
+    query.add(description.build(), BooleanClause.Occur.MUST);
+    query.add(new TermQuery(new Term("category", "books")), BooleanClause.Occur.FILTER);
+    return query.build();
+  }
+
+  /** Closes both readers and directories, then deletes the temporary index directories. */
+  @TearDown(Level.Trial)
+  public void tearDown() throws IOException {
+    sortedReader.close();
+    unsortedReader.close();
+    sortedDir.close();
+    unsortedDir.close();
+    deleteDirectory(sortedPath);
+    deleteDirectory(unsortedPath);
+  }
+
+  /**
+   * Recursively deletes {@code path}; does nothing when it is {@code null} or does not exist.
+   *
+   * @param path root of the tree to delete
+   */
+  private static void deleteDirectory(Path path) throws IOException {
+    if (path != null && Files.exists(path)) {
+      // The stream must be typed: a raw Stream yields Object elements and breaks the loop below.
+      try (Stream<Path> walk = Files.walk(path)) {
+        // Reverse order visits children before their parent directories.
+        for (Path p : walk.sorted(Comparator.reverseOrder()).toList()) {
+          Files.delete(p);
+        }
+      }
+    }
+  }
+
+  /** Top {@code NUM_HITS} hits by score on the sorted index. */
+  @Benchmark
+  public TopDocs benchmarkSortedTopScores() throws IOException {
+    return sortedSearcher.search(sortedQuery, NUM_HITS);
+  }
+
+  /** Top {@code NUM_HITS} hits by score on the unsorted index. */
+  @Benchmark
+  public TopDocs benchmarkUnsortedTopScores() throws IOException {
+    return unsortedSearcher.search(unsortedQuery, NUM_HITS);
+  }
+
+  /** Top hits by score on the sorted index with the total-hits threshold set to the maximum. */
+  @Benchmark
+  public TopDocs benchmarkSortedComplete() throws IOException {
+    return sortedSearcher.search(
+        sortedQuery, new TopScoreDocCollectorManager(NUM_HITS, Integer.MAX_VALUE));
+  }
+
+  /** Like {@link #benchmarkSortedComplete()} but collecting in index order instead of by score. */
+  @Benchmark
+  public TopDocs benchmarkSortedCompleteNoScores() throws IOException {
+    return sortedSearcher.search(
+        sortedQuery, new TopFieldCollectorManager(Sort.INDEXORDER, NUM_HITS, Integer.MAX_VALUE));
+  }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/search/BooleanQuery.java b/lucene/core/src/java/org/apache/lucene/search/BooleanQuery.java
index c9678fa1f0fd..5b664b60de43 100644
--- a/lucene/core/src/java/org/apache/lucene/search/BooleanQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/BooleanQuery.java
@@ -598,9 +598,72 @@ public Query rewrite(IndexSearcher indexSearcher) throws IOException {
       }
     }
 
+    Query filteredOnPrimaryIndexSortField = rewriteFilteredOnPrimaryIndexSortField(indexSearcher);
+    if (filteredOnPrimaryIndexSortField != null) {
+      return filteredOnPrimaryIndexSortField;
+    }
+
     return super.rewrite(indexSearcher);
   }
 
+  /**
+   * Attempts to split off one FILTER clause on the primary index sort field.
+   *
+   * <p>The first FILTER clause accepted by {@code
+   * FilteredOnPrimaryIndexSortFieldQuery.canOptimize} is removed; every other clause is copied,
+   * together with {@code minimumNumberShouldMatch}, into a new boolean query that is rewritten and
+   * wrapped with the removed filter and this query as fallback.
+   *
+   * @return the wrapping query; a {@code MatchNoDocsQuery} when no required clause remains and
+   *     fewer SHOULD clauses remain than {@code minimumNumberShouldMatch}; or {@code null} when no
+   *     FILTER clause qualifies or when no required clause remains and {@code
+   *     minimumNumberShouldMatch} is 0
+   */
+  private Query rewriteFilteredOnPrimaryIndexSortField(IndexSearcher indexSearcher)
+      throws IOException {
+    BooleanClause selectedFilterClause = null;
+    for (BooleanClause clause : clauses) {
+      if (clause.occur() == Occur.FILTER
+          && FilteredOnPrimaryIndexSortFieldQuery.canOptimize(clause.query(), indexSearcher)) {
+        selectedFilterClause = clause;
+        break;
+      }
+    }
+    if (selectedFilterClause == null) {
+      return null;
+    }
+
+    BooleanQuery.Builder builder = new BooleanQuery.Builder();
+    builder.setMinimumNumberShouldMatch(minimumNumberShouldMatch);
+    boolean hasRequiredClause = false;
+    int shouldClauseCount = 0;
+    for (BooleanClause clause : clauses) {
+      if (clause == selectedFilterClause) {
+        continue;
+      }
+      if (clause.occur() == Occur.MUST || clause.occur() == Occur.FILTER) {
+        hasRequiredClause = true;
+      } else if (clause.occur() == Occur.SHOULD) {
+        shouldClauseCount++;
+      }
+      builder.add(clause);
+    }
+
+    // Without any remaining MUST/FILTER clause, dropping the filter is only attempted when
+    // minimumNumberShouldMatch already constrains the SHOULD clauses.
+    if (hasRequiredClause == false) {
+      if (minimumNumberShouldMatch == 0) {
+        return null;
+      } else if (shouldClauseCount < minimumNumberShouldMatch) {
+        return new MatchNoDocsQuery("SHOULD clause count less than minimumNumberShouldMatch");
+      }
+    }
+
+    Query filteredQuery = indexSearcher.rewrite(builder.build());
+    return new FilteredOnPrimaryIndexSortFieldQuery(
+        filteredQuery, selectedFilterClause.query(), this);
+  }
+
   @Override
   public void visit(QueryVisitor visitor) {
     QueryVisitor sub = visitor.getSubVisitor(Occur.MUST, this);
diff --git a/lucene/core/src/java/org/apache/lucene/search/DocIdRange.java b/lucene/core/src/java/org/apache/lucene/search/DocIdRange.java
new file mode 100644
index 000000000000..3cb5d24372e2
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/search/DocIdRange.java
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+/**
+ * A half-open interval {@code [minDoc, maxDoc)} of doc IDs.
+ *
+ * <p>Construction fails with {@link IllegalArgumentException} when {@code minDoc} is negative or
+ * {@code maxDoc} is smaller than {@code minDoc}; equal bounds are allowed and denote an empty
+ * interval.
+ */
+record DocIdRange(int minDoc, int maxDoc) {
+  DocIdRange {
+    final boolean wellFormed = 0 <= minDoc && minDoc <= maxDoc;
+    if (wellFormed == false) {
+      throw new IllegalArgumentException("Invalid doc ID range [" + minDoc + ", " + maxDoc + ")");
+    }
+  }
+
+  /** Returns {@code true} when both bounds coincide, i.e. the interval holds no doc ID. */
+  boolean isEmpty() {
+    return maxDoc == minDoc;
+  }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/search/FilteredOnPrimaryIndexSortFieldQuery.java b/lucene/core/src/java/org/apache/lucene/search/FilteredOnPrimaryIndexSortFieldQuery.java
new file mode 100644
index 000000000000..70b0e3727454
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/search/FilteredOnPrimaryIndexSortFieldQuery.java
@@ -0,0 +1,325 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.Objects;
+import org.apache.lucene.index.DocValues;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.SortedDocValues;
+import org.apache.lucene.index.SortedSetDocValues;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * Pairs a query with a filter on the primary index sort field so that the query can be bulk-scored
+ * over only the doc ID range that the filter covers in a segment.
+ *
+ * <p>{@code fallbackQuery} answers everything except {@link ScorerSupplier#bulkScorer()}:
+ * explanations, matches, counts, the regular scorer and the cost all come from it. Bulk scoring
+ * also falls back to it for any segment where no doc ID range can be derived for the filter.
+ */
+final class FilteredOnPrimaryIndexSortFieldQuery extends Query {
+  private final Query query;
+  private final Query filter;
+  private final Query fallbackQuery;
+
+  /**
+   * @param query the query to bulk-score inside the filter's doc ID range
+   * @param filter the filter whose matches define the doc ID range
+   * @param fallbackQuery the query used whenever the range restriction cannot be applied
+   * @throws NullPointerException if any argument is {@code null}
+   */
+  FilteredOnPrimaryIndexSortFieldQuery(Query query, Query filter, Query fallbackQuery) {
+    this.query = Objects.requireNonNull(query);
+    this.filter = Objects.requireNonNull(filter);
+    this.fallbackQuery = Objects.requireNonNull(fallbackQuery);
+  }
+
+  /**
+   * Tells whether {@code filter} can be turned into a doc ID range on at least one segment.
+   *
+   * <p>An {@link IndexSortSortedNumericDocValuesRangeQuery} qualifies on a segment whose primary
+   * index sort field is the filter's field. A {@link TermQuery} qualifies on a segment whose
+   * primary index sort is a {@link SortedSetSortField} on the term's field and whose sorted-set
+   * doc values unwrap to a single-valued view.
+   */
+  static boolean canOptimize(Query filter, IndexSearcher searcher) throws IOException {
+    String field = filterField(filter);
+    if (field == null) {
+      return false;
+    }
+    for (LeafReaderContext context : searcher.getIndexReader().leaves()) {
+      if (filter instanceof IndexSortSortedNumericDocValuesRangeQuery
+          && primaryIndexSortField(context, field) != null) {
+        return true;
+      } else if (filter instanceof TermQuery
+          && primaryIndexSortField(context, field) instanceof SortedSetSortField
+          && DocValues.unwrapSingleton(DocValues.getSortedSet(context.reader(), field)) != null) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /** Returns this query unchanged. */
+  @Override
+  public Query rewrite(IndexSearcher indexSearcher) throws IOException {
+    return this;
+  }
+
+  /**
+   * Creates weights for both the wrapped query and the fallback query and returns a weight that
+   * delegates to the fallback weight except for bulk scoring.
+   */
+  @Override
+  public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
+      throws IOException {
+    final Weight queryWeight = searcher.createWeight(query, scoreMode, boost);
+    final Weight fallbackWeight = searcher.createWeight(fallbackQuery, scoreMode, boost);
+
+    return new Weight(this) {
+      @Override
+      public Explanation explain(LeafReaderContext context, int doc) throws IOException {
+        return fallbackWeight.explain(context, doc);
+      }
+
+      @Override
+      public Matches matches(LeafReaderContext context, int doc) throws IOException {
+        return fallbackWeight.matches(context, doc);
+      }
+
+      @Override
+      public int count(LeafReaderContext context) throws IOException {
+        return fallbackWeight.count(context);
+      }
+
+      @Override
+      public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
+        final ScorerSupplier fallbackSupplier = fallbackWeight.scorerSupplier(context);
+        if (fallbackSupplier == null) {
+          return null;
+        }
+        final ScorerSupplier querySupplier = queryWeight.scorerSupplier(context);
+        if (querySupplier == null) {
+          return fallbackSupplier;
+        }
+        return new ScorerSupplier() {
+          @Override
+          public Scorer get(long leadCost) throws IOException {
+            return fallbackSupplier.get(leadCost);
+          }
+
+          @Override
+          public BulkScorer bulkScorer() throws IOException {
+            // Only bulk scoring is range-restricted; a null range means "cannot restrict here".
+            final DocIdRange range = getDocIdRange(context);
+            if (range == null) {
+              return fallbackSupplier.bulkScorer();
+            }
+            if (range.isEmpty()) {
+              return emptyBulkScorer();
+            }
+            return new RangeFilteredBulkScorer(querySupplier.bulkScorer(), range);
+          }
+
+          @Override
+          public long cost() {
+            return fallbackSupplier.cost();
+          }
+
+          @Override
+          public void setTopLevelScoringClause() {
+            // Either supplier may end up producing the scorer, so both are notified.
+            fallbackSupplier.setTopLevelScoringClause();
+            querySupplier.setTopLevelScoringClause();
+          }
+        };
+      }
+
+      @Override
+      public boolean isCacheable(LeafReaderContext ctx) {
+        return queryWeight.isCacheable(ctx) && fallbackWeight.isCacheable(ctx);
+      }
+    };
+  }
+
+  /**
+   * Resolves the doc ID range covered by the filter in the given segment.
+   *
+   * @return the range, or {@code null} when the filter type is unsupported or no range could be
+   *     derived for this segment
+   */
+  private DocIdRange getDocIdRange(LeafReaderContext context) throws IOException {
+    if (filter instanceof IndexSortSortedNumericDocValuesRangeQuery rangeQuery) {
+      return rangeQuery.getDenseDocIdRangeForPrimarySort(context);
+    } else if (filter instanceof TermQuery termQuery) {
+      return getTermDocIdRange(context, termQuery.getTerm());
+    }
+    return null;
+  }
+
+  /**
+   * Derives the doc ID range of a term filter from the segment's doc values and verifies it
+   * against the term's postings.
+   *
+   * <p>The range is only returned when the postings agree with it: {@code docFreq} equals the
+   * range length, the first posting is the range start, and no posting exists at or after the
+   * range end.
+   *
+   * @return the verified range (possibly empty), or {@code null} when the segment is not sorted
+   *     by a {@link SortedSetSortField} on the term's field, the doc values are not single-valued,
+   *     or postings and doc values disagree
+   */
+  private static DocIdRange getTermDocIdRange(LeafReaderContext context, Term term)
+      throws IOException {
+    if (primaryIndexSortField(context, term.field()) instanceof SortedSetSortField == false) {
+      return null;
+    }
+    SortedSetDocValues values = DocValues.getSortedSet(context.reader(), term.field());
+    SortedDocValues singleton = DocValues.unwrapSingleton(values);
+    if (singleton == null) {
+      return null;
+    }
+    int docFreq = context.reader().docFreq(term);
+    long ord = values.lookupTerm(term.bytes());
+    if (ord < 0) {
+      // The value is absent from doc values: only an empty postings list is consistent with that.
+      return docFreq == 0 ? new DocIdRange(0, 0) : null;
+    }
+    DocIdRange range = getDocIdRangeForPrimarySort(context, term.bytes());
+    // Compare with docFreq before the empty-range shortcut so that an empty range is never
+    // reported while the postings still contain documents for the term.
+    if (docFreq != range.maxDoc() - range.minDoc()) {
+      return null;
+    }
+    if (range.isEmpty()) {
+      return range;
+    }
+    PostingsEnum postings = context.reader().postings(term, PostingsEnum.NONE);
+    if (postings == null || postings.nextDoc() != range.minDoc()) {
+      return null;
+    }
+    if (postings.advance(range.maxDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
+      return null;
+    }
+    return range;
+  }
+
+  /**
+   * Returns the first sort field of the segment's index sort when it is on {@code field}, or
+   * {@code null} when the segment has no index sort or its first sort field is another field.
+   */
+  private static SortField primaryIndexSortField(LeafReaderContext context, String field) {
+    Sort indexSort = context.reader().getMetaData().sort();
+    if (indexSort != null
+        && indexSort.getSort().length > 0
+        && indexSort.getSort()[0].getField().equals(field)) {
+      return indexSort.getSort()[0];
+    }
+    return null;
+  }
+
+  /**
+   * Extracts the field a filter applies to, or {@code null} for unrecognized filter types.
+   *
+   * <p>More filter types are recognized here than {@link #canOptimize} currently accepts.
+   */
+  private static String filterField(Query filter) {
+    if (filter instanceof IndexSortSortedNumericDocValuesRangeQuery rangeQuery) {
+      return rangeQuery.getField();
+    } else if (filter instanceof NumericDocValuesRangeQuery rangeQuery) {
+      return rangeQuery.getField();
+    } else if (filter instanceof PointRangeQuery pointRangeQuery
+        && pointRangeQuery.getNumDims() == 1) {
+      return pointRangeQuery.getField();
+    } else if (filter instanceof TermQuery termQuery) {
+      return termQuery.getTerm().field();
+    }
+    return null;
+  }
+
+  /**
+   * Locates, with two binary searches over the segment's doc IDs, the half-open range of
+   * documents whose primary sort value equals {@code value}.
+   *
+   * <p>The caller must have checked that the segment has an index sort.
+   */
+  private static DocIdRange getDocIdRangeForPrimarySort(LeafReaderContext context, BytesRef value)
+      throws IOException {
+    SortField sortField = context.reader().getMetaData().sort().getSort()[0];
+    int maxDoc = context.reader().maxDoc();
+
+    // First search: smallest doc ID whose value is not ordered before the target.
+    ValueComparator comparator = loadComparator(sortField, value, context);
+    int low = 0;
+    int high = maxDoc - 1;
+    while (low <= high) {
+      int mid = (low + high) >>> 1;
+      if (comparator.compare(mid) <= 0) {
+        high = mid - 1;
+        // A fresh comparator is loaded before probing lower doc IDs, presumably because the
+        // underlying doc values can only be advanced forwards.
+        comparator = loadComparator(sortField, value, context);
+      } else {
+        low = mid + 1;
+      }
+    }
+    int firstDocIdInclusive = high + 1;
+
+    // Second search: smallest doc ID whose value is ordered after the target.
+    comparator = loadComparator(sortField, value, context);
+    low = firstDocIdInclusive;
+    high = maxDoc - 1;
+    while (low <= high) {
+      int mid = (low + high) >>> 1;
+      if (comparator.compare(mid) < 0) {
+        high = mid - 1;
+        comparator = loadComparator(sortField, value, context);
+      } else {
+        low = mid + 1;
+      }
+    }
+    int lastDocIdExclusive = high + 1;
+
+    return new DocIdRange(firstDocIdInclusive, lastDocIdExclusive);
+  }
+
+  /** Compares the target value with a document's value, in index-sort order. */
+  private interface ValueComparator {
+    int compare(int docID) throws IOException;
+  }
+
+  /**
+   * Creates a comparator for {@code topValue} against the documents of the given segment. The
+   * result of {@code compareTop} is negated for reversed sort fields.
+   */
+  private static ValueComparator loadComparator(
+      SortField sortField, BytesRef topValue, LeafReaderContext context) throws IOException {
+    // Callers only pass the sort field of a term filter, whose values are BytesRef.
+    @SuppressWarnings("unchecked")
+    FieldComparator<BytesRef> fieldComparator =
+        (FieldComparator<BytesRef>) sortField.getComparator(1, Pruning.NONE);
+    fieldComparator.setTopValue(topValue);
+    LeafFieldComparator leafFieldComparator = fieldComparator.getLeafComparator(context);
+    int direction = sortField.getReverse() ? -1 : 1;
+
+    return doc -> direction * leafFieldComparator.compareTop(doc);
+  }
+
+  /** Returns a bulk scorer that collects nothing and has zero cost. */
+  private static BulkScorer emptyBulkScorer() {
+    return new BulkScorer() {
+      @Override
+      public int score(LeafCollector collector, Bits acceptDocs, int min, int max) {
+        return DocIdSetIterator.NO_MORE_DOCS;
+      }
+
+      @Override
+      public long cost() {
+        return 0;
+      }
+    };
+  }
+
+  /** Bulk scorer that clamps every scoring window of a delegate to a fixed doc ID range. */
+  private static final class RangeFilteredBulkScorer extends BulkScorer {
+    private final BulkScorer in;
+    private final DocIdRange range;
+
+    private RangeFilteredBulkScorer(BulkScorer in, DocIdRange range) {
+      this.in = Objects.requireNonNull(in);
+      this.range = Objects.requireNonNull(range);
+    }
+
+    @Override
+    public int score(LeafCollector collector, Bits acceptDocs, int min, int max)
+        throws IOException {
+      final int filteredMin = Math.max(min, range.minDoc());
+      final int filteredMax = Math.min(max, range.maxDoc());
+      if (filteredMin >= filteredMax) {
+        // The window misses the range: point at the range start if it still lies ahead,
+        // otherwise nothing is left to score.
+        return max <= range.minDoc() ? range.minDoc() : DocIdSetIterator.NO_MORE_DOCS;
+      }
+
+      final int next = in.score(collector, acceptDocs, filteredMin, filteredMax);
+      if (range.maxDoc() <= max) {
+        // The window reached the end of the range, so no later document can match.
+        return DocIdSetIterator.NO_MORE_DOCS;
+      }
+      return next;
+    }
+
+    @Override
+    public long cost() {
+      return Math.min(in.cost(), range.maxDoc() - range.minDoc());
+    }
+  }
+
+  @Override
+  public void visit(QueryVisitor visitor) {
+    fallbackQuery.visit(visitor);
+  }
+
+  @Override
+  public String toString(String field) {
+    return "FilteredOnPrimaryIndexSortFieldQuery(query="
+        + query.toString(field)
+        + ", filter="
+        + filter.toString(field)
+        + ")";
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (sameClassAs(obj) == false) {
+      return false;
+    }
+    FilteredOnPrimaryIndexSortFieldQuery that = (FilteredOnPrimaryIndexSortFieldQuery) obj;
+    return query.equals(that.query)
+        && filter.equals(that.filter)
+        && fallbackQuery.equals(that.fallbackQuery);
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(classHash(), query, filter, fallbackQuery);
+  }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/search/IndexSortSortedNumericDocValuesRangeQuery.java b/lucene/core/src/java/org/apache/lucene/search/IndexSortSortedNumericDocValuesRangeQuery.java
index ff607f3b5b8e..37256f41d781 100644
--- a/lucene/core/src/java/org/apache/lucene/search/IndexSortSortedNumericDocValuesRangeQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/IndexSortSortedNumericDocValuesRangeQuery.java
@@ -544,6 +544,18 @@ private IteratorAndCount getDocIdSetIteratorOrNull(LeafReaderContext context) th
     return null;
   }
 
+  /**
+   * Returns the doc ID range covered by this query in the given segment, or {@code null} when no
+   * iterator is available for the segment or its exact count is unknown (negative count).
+   */
+  DocIdRange getDenseDocIdRangeForPrimarySort(LeafReaderContext context) throws IOException {
+    IteratorAndCount itAndCount = getDocIdSetIteratorOrNull(context);
+    if (itAndCount == null || itAndCount.count() < 0) {
+      return null;
+    }
+    return new DocIdRange(itAndCount.minDoc(), itAndCount.maxDoc());
+  }
+
   /**
    * Computes the document IDs that lie within the range [lowerValue, upperValue] by performing
    * binary search on the field's doc values.
@@ -658,22 +670,24 @@ private static SortField.Type getSortFieldType(SortField sortField) {
    * Provides a {@code DocIdSetIterator} along with an accurate count of documents provided by the
    * iterator (or {@code -1} if an accurate count is unknown).
*/ - private record IteratorAndCount(DocIdSetIterator it, int count) { + private record IteratorAndCount(DocIdSetIterator it, int count, int minDoc, int maxDoc) { static IteratorAndCount empty() { - return new IteratorAndCount(DocIdSetIterator.empty(), 0); + return new IteratorAndCount(DocIdSetIterator.empty(), 0, 0, 0); } static IteratorAndCount all(int maxDoc) { - return new IteratorAndCount(DocIdSetIterator.all(maxDoc), maxDoc); + return new IteratorAndCount(DocIdSetIterator.all(maxDoc), maxDoc, 0, maxDoc); } static IteratorAndCount denseRange(int minDoc, int maxDoc) { - return new IteratorAndCount(DocIdSetIterator.range(minDoc, maxDoc), maxDoc - minDoc); + return new IteratorAndCount( + DocIdSetIterator.range(minDoc, maxDoc), maxDoc - minDoc, minDoc, maxDoc); } static IteratorAndCount sparseRange(int minDoc, int maxDoc, DocIdSetIterator delegate) { - return new IteratorAndCount(new BoundedDocIdSetIterator(minDoc, maxDoc, delegate), -1); + return new IteratorAndCount( + new BoundedDocIdSetIterator(minDoc, maxDoc, delegate), -1, minDoc, maxDoc); } } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestIndexSortSortedNumericDocValuesRangeQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestIndexSortSortedNumericDocValuesRangeQuery.java index 4adb4a3cb3c1..24ef50b516c9 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestIndexSortSortedNumericDocValuesRangeQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestIndexSortSortedNumericDocValuesRangeQuery.java @@ -19,17 +19,24 @@ import static org.hamcrest.Matchers.instanceOf; import java.io.IOException; +import java.util.ArrayList; +import java.util.List; import java.util.Random; +import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.IntPoint; +import org.apache.lucene.document.KeywordField; +import org.apache.lucene.document.LongField; import 
org.apache.lucene.document.LongPoint; import org.apache.lucene.document.SortedNumericDocValuesField; +import org.apache.lucene.document.SortedSetDocValuesField; import org.apache.lucene.document.StringField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.NoMergePolicy; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.analysis.MockAnalyzer; import org.apache.lucene.tests.index.RandomIndexWriter; @@ -37,6 +44,8 @@ import org.apache.lucene.tests.search.QueryUtils; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; import org.hamcrest.MatcherAssert; @LuceneTestCase.SuppressCodecs(value = "SimpleText") @@ -166,6 +175,469 @@ private static void assertSameHits(IndexSearcher searcher, Query q1, Query q2, b } } + public void testBooleanFilterOnPrimaryIndexSortFieldRestrictsBulkScoringRange() + throws IOException { + Directory dir = newDirectory(); + + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + iwc.setIndexSort( + new Sort(LongField.newSortField("sort", false, SortedNumericSelector.Type.MIN))); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + for (int i = 99; i >= 0; --i) { + Document doc = new Document(); + doc.add(new LongField("sort", i, Field.Store.NO)); + iw.addDocument(doc); + } + iw.forceMerge(1); + + DirectoryReader reader = iw.getReader(); + IndexSearcher searcher = newSearcher(reader); + iw.close(); + + RecordingMatchAllQuery recordingQuery = new RecordingMatchAllQuery(); + Query filter = LongField.newRangeQuery("sort", 40, 59); + Query query = + new BooleanQuery.Builder() + .add(recordingQuery, BooleanClause.Occur.MUST) + .add(filter, BooleanClause.Occur.FILTER) + .build(); + 
MatcherAssert.assertThat( + searcher.rewrite(query), instanceOf(FilteredOnPrimaryIndexSortFieldQuery.class)); + + TopDocs topDocs = searcher.search(query, 100); + assertEquals(20, topDocs.totalHits.value()); + assertFalse(recordingQuery.scoredRanges.isEmpty()); + for (DocIdRange range : recordingQuery.scoredRanges) { + assertTrue(range.minDoc() >= 40); + assertTrue(range.maxDoc() <= 60); + } + + reader.close(); + dir.close(); + } + + public void testBooleanTermFilterOnPrimaryIndexSortFieldRestrictsBulkScoringRange() + throws IOException { + Directory dir = newDirectory(); + + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + iwc.setCodec(TestUtil.alwaysDocValuesFormat(new Lucene90DocValuesFormat(16))); + iwc.setIndexSort( + new Sort( + KeywordField.newSortField( + "category", false, SortedSetSelector.Type.MIN, SortField.STRING_LAST))); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + for (int i = 0; i < 100; ++i) { + Document doc = new Document(); + String category; + if (i < 30) { + category = "articles"; + } else if (i < 50) { + category = "books"; + } else { + category = "music"; + } + doc.add(new KeywordField("category", category, Field.Store.NO)); + iw.addDocument(doc); + } + iw.forceMerge(1); + + DirectoryReader reader = iw.getReader(); + IndexSearcher searcher = newSearcher(reader); + iw.close(); + + RecordingMatchAllQuery recordingQuery = new RecordingMatchAllQuery(); + Query filter = KeywordField.newExactQuery("category", "books"); + Query query = + new BooleanQuery.Builder() + .add(recordingQuery, BooleanClause.Occur.MUST) + .add(filter, BooleanClause.Occur.FILTER) + .build(); + MatcherAssert.assertThat( + searcher.rewrite(query), instanceOf(FilteredOnPrimaryIndexSortFieldQuery.class)); + + TopDocs topDocs = searcher.search(query, 100); + assertEquals(20, topDocs.totalHits.value()); + assertFalse(recordingQuery.scoredRanges.isEmpty()); + for (DocIdRange range : recordingQuery.scoredRanges) { + 
assertTrue(range.minDoc() >= 30); + assertTrue(range.maxDoc() <= 50); + } + + reader.close(); + dir.close(); + } + + public void testBooleanTermFilterOnReversePrimaryIndexSortFieldRestrictsBulkScoringRange() + throws IOException { + Directory dir = newDirectory(); + + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + iwc.setCodec(TestUtil.alwaysDocValuesFormat(new Lucene90DocValuesFormat(16))); + iwc.setIndexSort( + new Sort( + KeywordField.newSortField( + "category", true, SortedSetSelector.Type.MIN, SortField.STRING_LAST))); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + for (int i = 0; i < 100; ++i) { + Document doc = new Document(); + String category; + if (i < 30) { + category = "articles"; + } else if (i < 50) { + category = "books"; + } else { + category = "music"; + } + doc.add(new KeywordField("category", category, Field.Store.NO)); + iw.addDocument(doc); + } + iw.forceMerge(1); + + DirectoryReader reader = iw.getReader(); + IndexSearcher searcher = newSearcher(reader); + iw.close(); + + RecordingMatchAllQuery recordingQuery = new RecordingMatchAllQuery(); + Query filter = KeywordField.newExactQuery("category", "books"); + Query query = + new BooleanQuery.Builder() + .add(recordingQuery, BooleanClause.Occur.MUST) + .add(filter, BooleanClause.Occur.FILTER) + .build(); + MatcherAssert.assertThat( + searcher.rewrite(query), instanceOf(FilteredOnPrimaryIndexSortFieldQuery.class)); + + TopDocs topDocs = searcher.search(query, 100); + assertEquals(20, topDocs.totalHits.value()); + assertFalse(recordingQuery.scoredRanges.isEmpty()); + for (DocIdRange range : recordingQuery.scoredRanges) { + assertTrue(range.minDoc() >= 50); + assertTrue(range.maxDoc() <= 70); + } + + reader.close(); + dir.close(); + } + + public void testBooleanTermFilterOnPrimaryIndexSortFieldRespectsMustNot() throws IOException { + Directory dir = newDirectory(); + + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + 
iwc.setCodec(TestUtil.alwaysDocValuesFormat(new Lucene90DocValuesFormat(16))); + iwc.setIndexSort( + new Sort( + KeywordField.newSortField( + "category", false, SortedSetSelector.Type.MIN, SortField.STRING_LAST))); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + for (int i = 0; i < 100; ++i) { + Document doc = new Document(); + String category; + if (i < 30) { + category = "articles"; + } else if (i < 50) { + category = "books"; + if (i % 2 == 0) { + doc.add(new StringField("excluded", "yes", Field.Store.NO)); + } + } else { + category = "music"; + } + doc.add(new KeywordField("category", category, Field.Store.NO)); + iw.addDocument(doc); + } + iw.forceMerge(1); + + DirectoryReader reader = iw.getReader(); + IndexSearcher searcher = newSearcher(reader); + iw.close(); + + RecordingMatchAllQuery recordingQuery = new RecordingMatchAllQuery(); + Query query = + new BooleanQuery.Builder() + .add(recordingQuery, BooleanClause.Occur.MUST) + .add(KeywordField.newExactQuery("category", "books"), BooleanClause.Occur.FILTER) + .add( + new TermQuery(new org.apache.lucene.index.Term("excluded", "yes")), + BooleanClause.Occur.MUST_NOT) + .build(); + MatcherAssert.assertThat( + searcher.rewrite(query), instanceOf(FilteredOnPrimaryIndexSortFieldQuery.class)); + + TopDocs topDocs = searcher.search(query, 100); + assertEquals(10, topDocs.totalHits.value()); + assertFalse(recordingQuery.scoredRanges.isEmpty()); + for (DocIdRange range : recordingQuery.scoredRanges) { + assertTrue(range.minDoc() >= 30); + assertTrue(range.maxDoc() <= 50); + } + + reader.close(); + dir.close(); + } + + public void testBooleanTermFilterOnPrimaryIndexSortFieldWithMixedSegmentFallback() + throws IOException { + Directory dir = newDirectory(); + + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + iwc.setMergePolicy(NoMergePolicy.INSTANCE); + iwc.setCodec(TestUtil.alwaysDocValuesFormat(new Lucene90DocValuesFormat(16))); + iwc.setIndexSort( + new Sort( + 
KeywordField.newSortField( + "category", false, SortedSetSelector.Type.MIN, SortField.STRING_LAST))); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + for (int i = 0; i < 20; ++i) { + Document doc = new Document(); + String category = i < 10 ? "books" : "music"; + doc.add(new KeywordField("category", category, Field.Store.NO)); + iw.addDocument(doc); + } + iw.commit(false); + + for (int i = 0; i < 10; ++i) { + Document doc = new Document(); + doc.add(new SortedSetDocValuesField("category", new BytesRef("music"))); + if (i % 2 == 0) { + doc.add(new StringField("category", "books", Field.Store.NO)); + } else { + doc.add(new StringField("category", "music", Field.Store.NO)); + } + iw.addDocument(doc); + } + iw.commit(false); + + DirectoryReader reader = iw.getReader(); + IndexSearcher searcher = newSearcher(reader); + iw.close(); + + assertTrue(reader.leaves().size() >= 2); + RecordingMatchAllQuery recordingQuery = new RecordingMatchAllQuery(); + Query query = + new BooleanQuery.Builder() + .add(recordingQuery, BooleanClause.Occur.MUST) + .add( + new TermQuery(new org.apache.lucene.index.Term("category", "books")), + BooleanClause.Occur.FILTER) + .build(); + MatcherAssert.assertThat( + searcher.rewrite(query), instanceOf(FilteredOnPrimaryIndexSortFieldQuery.class)); + + TopDocs topDocs = searcher.search(query, 30); + assertEquals(15, topDocs.totalHits.value()); + + reader.close(); + dir.close(); + } + + public void testBooleanQueryWithTwoOptimizableFiltersOnPrimaryIndexSortField() + throws IOException { + Directory dir = newDirectory(); + + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + iwc.setIndexSort( + new Sort(LongField.newSortField("sort", false, SortedNumericSelector.Type.MIN))); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + for (int i = 99; i >= 0; --i) { + Document doc = new Document(); + doc.add(new LongField("sort", i, Field.Store.NO)); + iw.addDocument(doc); + } + 
iw.forceMerge(1); + + DirectoryReader reader = iw.getReader(); + IndexSearcher searcher = newSearcher(reader); + iw.close(); + + Query wideFilter = LongField.newRangeQuery("sort", 20, 79); + Query narrowFilter = LongField.newRangeQuery("sort", 40, 59); + assertTwoOptimizableFiltersMatch(searcher, wideFilter, narrowFilter); + assertTwoOptimizableFiltersMatch(searcher, narrowFilter, wideFilter); + + reader.close(); + dir.close(); + } + + private static void assertTwoOptimizableFiltersMatch( + IndexSearcher searcher, Query firstFilter, Query secondFilter) throws IOException { + RecordingMatchAllQuery recordingQuery = new RecordingMatchAllQuery(); + Query query = + new BooleanQuery.Builder() + .add(recordingQuery, BooleanClause.Occur.MUST) + .add(firstFilter, BooleanClause.Occur.FILTER) + .add(secondFilter, BooleanClause.Occur.FILTER) + .build(); + MatcherAssert.assertThat( + searcher.rewrite(query), instanceOf(FilteredOnPrimaryIndexSortFieldQuery.class)); + + TopDocs topDocs = searcher.search(query, 100); + assertEquals(20, topDocs.totalHits.value()); + assertFalse(recordingQuery.scoredRanges.isEmpty()); + } + + public void testBooleanTermFilterOnPrimaryIndexSortFieldStillAppliesPostingsFilter() + throws IOException { + Directory dir = newDirectory(); + + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + iwc.setIndexSort(new Sort(new SortedSetSortField("category", false))); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + for (int i = 0; i < 20; ++i) { + Document doc = new Document(); + doc.add(new SortedSetDocValuesField("category", new BytesRef("books"))); + if (i % 2 == 0) { + doc.add(new StringField("category", "books", Field.Store.NO)); + } else { + doc.add(new StringField("category", "other", Field.Store.NO)); + } + iw.addDocument(doc); + } + iw.forceMerge(1); + + DirectoryReader reader = iw.getReader(); + IndexSearcher searcher = newSearcher(reader); + iw.close(); + + RecordingMatchAllQuery recordingQuery = new 
RecordingMatchAllQuery(); + Query filter = new TermQuery(new org.apache.lucene.index.Term("category", "books")); + Query query = + new BooleanQuery.Builder() + .add(recordingQuery, BooleanClause.Occur.MUST) + .add(filter, BooleanClause.Occur.FILTER) + .build(); + TopDocs topDocs = searcher.search(query, 20); + assertEquals(10, topDocs.totalHits.value()); + + reader.close(); + dir.close(); + } + + public void + testBooleanTermFilterOnPrimaryIndexSortFieldFallsBackWhenPostingsTermMissingFromDocValues() + throws IOException { + Directory dir = newDirectory(); + + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + iwc.setIndexSort(new Sort(new SortedSetSortField("category", false))); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + for (int i = 0; i < 20; ++i) { + Document doc = new Document(); + doc.add(new SortedSetDocValuesField("category", new BytesRef("music"))); + if (i % 2 == 0) { + doc.add(new StringField("category", "books", Field.Store.NO)); + } else { + doc.add(new StringField("category", "music", Field.Store.NO)); + } + iw.addDocument(doc); + } + iw.forceMerge(1); + + DirectoryReader reader = iw.getReader(); + IndexSearcher searcher = newSearcher(reader); + iw.close(); + + RecordingMatchAllQuery recordingQuery = new RecordingMatchAllQuery(); + Query filter = new TermQuery(new org.apache.lucene.index.Term("category", "books")); + Query query = + new BooleanQuery.Builder() + .add(recordingQuery, BooleanClause.Occur.MUST) + .add(filter, BooleanClause.Occur.FILTER) + .build(); + MatcherAssert.assertThat( + searcher.rewrite(query), instanceOf(FilteredOnPrimaryIndexSortFieldQuery.class)); + + TopDocs topDocs = searcher.search(query, 20); + assertEquals(10, topDocs.totalHits.value()); + + reader.close(); + dir.close(); + } + + private static class RecordingMatchAllQuery extends Query { + final List<DocIdRange> scoredRanges = new ArrayList<>(); + + @Override + public Weight createWeight(IndexSearcher searcher, ScoreMode
scoreMode, float boost) { + return new ConstantScoreWeight(this, boost) { + @Override + public ScorerSupplier scorerSupplier(LeafReaderContext context) { + final int maxDoc = context.reader().maxDoc(); + final float queryScore = score(); + return new ScorerSupplier() { + @Override + public Scorer get(long leadCost) throws IOException { + return new ConstantScoreScorer(queryScore, scoreMode, DocIdSetIterator.all(maxDoc)); + } + + @Override + public BulkScorer bulkScorer() { + return new BulkScorer() { + @Override + public int score(LeafCollector collector, Bits acceptDocs, int min, int max) + throws IOException { + scoredRanges.add(new DocIdRange(min, max)); + SimpleScorable scorable = new SimpleScorable(); + scorable.setScore(queryScore); + collector.setScorer(scorable); + for (int doc = min; doc < max; ++doc) { + if (acceptDocs == null || acceptDocs.get(doc)) { + collector.collect(doc); + } + } + return max == maxDoc ? DocIdSetIterator.NO_MORE_DOCS : max; + } + + @Override + public long cost() { + return maxDoc; + } + }; + } + + @Override + public long cost() { + return maxDoc; + } + }; + } + + @Override + public boolean isCacheable(LeafReaderContext ctx) { + return true; + } + }; + } + + @Override + public String toString(String field) { + return "RecordingMatchAllQuery"; + } + + @Override + public void visit(QueryVisitor visitor) { + visitor.visitLeaf(this); + } + + @Override + public boolean equals(Object obj) { + return this == obj; + } + + @Override + public int hashCode() { + return System.identityHashCode(this); + } + } + public void testEquals() { Query q1 = createQuery("foo", 3, 5); QueryUtils.checkEqual(q1, createQuery("foo", 3, 5));