diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 31a7c5aa45d3..672abe124fa0 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -89,6 +89,8 @@ Improvements * GITHUB#15453: Avoid unnecessary sorting and instantiations in readMapOfStrings. (Benjamin Lerer) +* GITHUB#15653: Introduce DocValuesStatsCollectorManager to parallelize search when using DocValuesStatsCollector. (Binlong Gao) + Optimizations --------------------- * GITHUB#13782: Replace handwritten loops compare with Arrays.compareUnsigned in TermsEnum and TermsEnumFrame classes. (Zhou Hui) diff --git a/lucene/misc/src/java/org/apache/lucene/misc/search/DocValuesStats.java b/lucene/misc/src/java/org/apache/lucene/misc/search/DocValuesStats.java index 880536858780..f03e8d2ea7bc 100644 --- a/lucene/misc/src/java/org/apache/lucene/misc/search/DocValuesStats.java +++ b/lucene/misc/src/java/org/apache/lucene/misc/search/DocValuesStats.java @@ -28,8 +28,8 @@ /** Holds statistics for a DocValues field. */ public abstract class DocValuesStats { - private int missing = 0; - private int count = 0; + int missing = 0; + int count = 0; protected final String field; @@ -73,6 +73,43 @@ final void addMissing() { ++missing; } + void merge(DocValuesStats other) { + count += other.count; + missing += other.missing; + if (other.min != null && (min == null || compareMin(other.min, min) < 0)) { + copyMin(other); + } + if (other.max != null && (max == null || compareMax(other.max, max) > 0)) { + copyMax(other); + } + } + + @SuppressWarnings("unchecked") + void copyMin(DocValuesStats other) { + min = (T) other.min; + } + + @SuppressWarnings("unchecked") + void copyMax(DocValuesStats other) { + max = (T) other.max; + } + + @SuppressWarnings({"unchecked", "rawtypes"}) + int compareMin(Object a, Object b) { + if (a instanceof Number numA && b instanceof Number numB) { + return Double.compare(numA.doubleValue(), numB.doubleValue()); + } + return ((Comparable) a).compareTo(b); + } + + @SuppressWarnings({"unchecked", 
"rawtypes"}) + int compareMax(Object a, Object b) { + if (a instanceof Number numA && b instanceof Number numB) { + return Double.compare(numA.doubleValue(), numB.doubleValue()); + } + return ((Comparable) a).compareTo(b); + } + /** The field for which these stats were computed. */ public final String field() { return field; @@ -142,6 +179,47 @@ public final double stdev() { * might overflow. */ public abstract T sum(); + + @Override + void merge(DocValuesStats other) { + if (!(other instanceof NumericDocValuesStats o)) { + throw new IllegalArgumentException("Cannot merge different stat types"); + } + + this.missing += o.missing(); + + if (o.count() == 0) { + return; + } + if (this.count() == 0) { + this.count = o.count(); + copyMin(o); + copyMax(o); + this.mean = o.mean; + this.variance = o.variance; + return; + } + + int totalCount = this.count() + o.count(); + double combinedMean = (sum().doubleValue() + o.sum().doubleValue()) / totalCount; + double targetDelta = this.mean - combinedMean; + double sourceDelta = o.mean - combinedMean; + + this.variance = + this.variance + + o.variance + + targetDelta * targetDelta * this.count() + + sourceDelta * sourceDelta * o.count(); + this.mean = combinedMean; + this.count = totalCount; + + if (compareMin(o.min(), min()) < 0) { + copyMin(o); + } + if (compareMax(o.max(), max()) > 0) { + copyMax(o); + } + } } /** Holds DocValues statistics for a numeric field storing {@code long} values. */ @@ -173,6 +251,14 @@ protected void doAccumulate(int count) throws IOException { public Long sum() { return sum; } + + @Override + void merge(DocValuesStats other) { + super.merge(other); + if (other instanceof LongDocValuesStats o) { + sum += o.sum; + } + } } /** Holds DocValues statistics for a numeric field storing {@code double} values. 
*/ @@ -205,6 +291,14 @@ protected void doAccumulate(int count) throws IOException { public Double sum() { return sum; } + + @Override + void merge(DocValuesStats other) { + super.merge(other); + if (other instanceof DoubleDocValuesStats o) { + sum += o.sum; + } + } } /** Holds statistics for a sorted-numeric DocValues field. */ @@ -258,6 +352,49 @@ public final long valuesCount() { * might overflow. */ public abstract T sum(); + + @Override + void merge(DocValuesStats other) { + if (!(other instanceof SortedNumericDocValuesStats o)) { + throw new IllegalArgumentException("Cannot merge different stat types"); + } + + this.missing += o.missing(); + + if (o.count() == 0) { + return; + } + if (this.count() == 0) { + this.count = o.count(); + copyMin(o); + copyMax(o); + this.mean = o.mean; + this.variance = o.variance; + this.valuesCount = o.valuesCount; + return; + } + + long totalValuesCount = this.valuesCount + o.valuesCount; + double combinedMean = (sum().doubleValue() + o.sum().doubleValue()) / totalValuesCount; + double targetDelta = this.mean - combinedMean; + double sourceDelta = o.mean - combinedMean; + + this.variance = + this.variance + + o.variance + + targetDelta * targetDelta * this.valuesCount + + sourceDelta * sourceDelta * o.valuesCount; + this.mean = combinedMean; + this.valuesCount = totalValuesCount; + this.count += o.count(); + + if (compareMin(o.min(), min()) < 0) { + copyMin(o); + } + if (compareMax(o.max(), max()) > 0) { + copyMax(o); + } + } } /** Holds DocValues statistics for a sorted-numeric field storing {@code long} values. */ @@ -296,6 +433,14 @@ protected void doAccumulate(int count) throws IOException { public Long sum() { return sum; } + + @Override + void merge(DocValuesStats other) { + super.merge(other); + if (other instanceof SortedLongDocValuesStats o) { + sum += o.sum; + } + } } /** Holds DocValues statistics for a sorted-numeric field storing {@code double} values. 
*/ @@ -335,6 +480,14 @@ protected void doAccumulate(int count) throws IOException { public Double sum() { return sum; } + + @Override + void merge(DocValuesStats other) { + super.merge(other); + if (other instanceof SortedDoubleDocValuesStats o) { + sum += o.sum; + } + } } private static BytesRef copyFrom(BytesRef src, BytesRef dest) { diff --git a/lucene/misc/src/java/org/apache/lucene/misc/search/DocValuesStatsCollector.java b/lucene/misc/src/java/org/apache/lucene/misc/search/DocValuesStatsCollector.java index 29f31651e5d6..d4116251c2c4 100644 --- a/lucene/misc/src/java/org/apache/lucene/misc/search/DocValuesStatsCollector.java +++ b/lucene/misc/src/java/org/apache/lucene/misc/search/DocValuesStatsCollector.java @@ -68,4 +68,9 @@ public void collect(int doc) throws IOException { public ScoreMode scoreMode() { return ScoreMode.COMPLETE_NO_SCORES; } + + /** Returns the statistics computed by this collector. */ + public DocValuesStats getStats() { + return stats; + } } diff --git a/lucene/misc/src/java/org/apache/lucene/misc/search/DocValuesStatsCollectorManager.java b/lucene/misc/src/java/org/apache/lucene/misc/search/DocValuesStatsCollectorManager.java new file mode 100644 index 000000000000..3f99736c7dcc --- /dev/null +++ b/lucene/misc/src/java/org/apache/lucene/misc/search/DocValuesStatsCollectorManager.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.misc.search; + +import java.io.IOException; +import java.util.Collection; +import java.util.function.Supplier; +import org.apache.lucene.search.CollectorManager; + +/** + * A {@link CollectorManager} implementation for {@link DocValuesStatsCollector}. + * + * @param <S> the type of {@link DocValuesStats} + */ +public class DocValuesStatsCollectorManager> + implements CollectorManager { + + private final Supplier statsSupplier; + + /** + * Creates a new DocValuesStatsCollectorManager. + * + * @param statsSupplier a supplier that creates new stats instances for each collector + */ + public DocValuesStatsCollectorManager(Supplier statsSupplier) { + this.statsSupplier = statsSupplier; + } + + @Override + public DocValuesStatsCollector newCollector() throws IOException { + return new DocValuesStatsCollector(statsSupplier.get()); + } + + @Override + @SuppressWarnings("unchecked") + public S reduce(Collection collectors) throws IOException { + if (collectors.isEmpty()) { + return null; + } + + S merged = statsSupplier.get(); + for (DocValuesStatsCollector collector : collectors) { + S stats = (S) collector.getStats(); + merged.merge(stats); + } + return merged; + } +} diff --git a/lucene/misc/src/test/org/apache/lucene/misc/search/TestDocValuesStatsCollector.java b/lucene/misc/src/test/org/apache/lucene/misc/search/TestDocValuesStatsCollector.java index d02881836019..068e817ba809 100644 --- a/lucene/misc/src/test/org/apache/lucene/misc/search/TestDocValuesStatsCollector.java +++
b/lucene/misc/src/test/org/apache/lucene/misc/search/TestDocValuesStatsCollector.java @@ -35,7 +35,7 @@ import org.apache.lucene.document.StringField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; import org.apache.lucene.misc.search.DocValuesStats.DoubleDocValuesStats; import org.apache.lucene.misc.search.DocValuesStats.LongDocValuesStats; @@ -46,6 +46,7 @@ import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.index.RandomIndexWriter; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.BytesRef; @@ -55,16 +56,21 @@ public class TestDocValuesStatsCollector extends LuceneTestCase { public void testNoDocsWithField() throws IOException { try (Directory dir = newDirectory(); - IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) { - int numDocs = TestUtil.nextInt(random(), 1, 100); + RandomIndexWriter indexWriter = new RandomIndexWriter(random(), dir)) { + int numDocs = TestUtil.nextInt(random(), 50, 100); for (int i = 0; i < numDocs; i++) { indexWriter.addDocument(new Document()); + if (i % 10 == 0) { + indexWriter.commit(); + } } - try (DirectoryReader reader = DirectoryReader.open(indexWriter)) { - IndexSearcher searcher = new IndexSearcher(reader); - LongDocValuesStats stats = new LongDocValuesStats("foo"); - searcher.search(MatchAllDocsQuery.INSTANCE, new DocValuesStatsCollector(stats)); + try (DirectoryReader reader = indexWriter.getReader()) { + IndexSearcher searcher = newSearcher(reader); + LongDocValuesStats stats = + searcher.search( + MatchAllDocsQuery.INSTANCE, + new DocValuesStatsCollectorManager<>(() -> new LongDocValuesStats("foo"))); assertEquals(0, stats.count()); 
assertEquals(numDocs, stats.missing()); @@ -74,17 +80,19 @@ public void testNoDocsWithField() throws IOException { public void testOneDoc() throws IOException { try (Directory dir = newDirectory(); - IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) { + RandomIndexWriter indexWriter = new RandomIndexWriter(random(), dir)) { String field = "numeric"; Document doc = new Document(); doc.add(new NumericDocValuesField(field, 1)); doc.add(new StringField("id", "doc1", Store.NO)); indexWriter.addDocument(doc); - try (DirectoryReader reader = DirectoryReader.open(indexWriter)) { - IndexSearcher searcher = new IndexSearcher(reader); - LongDocValuesStats stats = new LongDocValuesStats(field); - searcher.search(MatchAllDocsQuery.INSTANCE, new DocValuesStatsCollector(stats)); + try (DirectoryReader reader = indexWriter.getReader()) { + IndexSearcher searcher = newSearcher(reader); + LongDocValuesStats stats = + searcher.search( + MatchAllDocsQuery.INSTANCE, + new DocValuesStatsCollectorManager<>(() -> new LongDocValuesStats(field))); assertEquals(1, stats.count()); assertEquals(0, stats.missing()); @@ -99,218 +107,235 @@ public void testOneDoc() throws IOException { } public void testDocsWithLongValues() throws IOException { - try (Directory dir = newDirectory(); - IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) { - String field = "numeric"; - int numDocs = TestUtil.nextInt(random(), 1, 100); - long[] docValues = new long[numDocs]; - int nextVal = 1; - for (int i = 0; i < numDocs; i++) { - Document doc = new Document(); - if (random().nextBoolean()) { // not all documents have a value - doc.add(new NumericDocValuesField(field, nextVal)); - doc.add(new StringField("id", "doc" + i, Store.NO)); - docValues[i] = nextVal; - ++nextVal; - } - indexWriter.addDocument(doc); - } - - // 20% of cases delete some docs - if (random().nextDouble() < 0.2) { + try (Directory dir = newDirectory()) { + IndexWriterConfig config = 
newIndexWriterConfig(); + config.setMaxBufferedDocs(10); + try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), dir, config)) { + String field = "numeric"; + int numDocs = TestUtil.nextInt(random(), 50, 100); + long[] docValues = new long[numDocs]; + int nextVal = 1; for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); if (random().nextBoolean()) { - indexWriter.deleteDocuments(new Term("id", "doc" + i)); - docValues[i] = 0; + doc.add(new NumericDocValuesField(field, nextVal)); + doc.add(new StringField("id", "doc" + i, Store.NO)); + docValues[i] = nextVal; + ++nextVal; } + indexWriter.addDocument(doc); } - } - try (DirectoryReader reader = DirectoryReader.open(indexWriter)) { - IndexSearcher searcher = new IndexSearcher(reader); - LongDocValuesStats stats = new LongDocValuesStats(field); - searcher.search(MatchAllDocsQuery.INSTANCE, new DocValuesStatsCollector(stats)); + if (random().nextDouble() < 0.2) { + for (int i = 0; i < numDocs; i++) { + if (random().nextBoolean()) { + indexWriter.deleteDocuments(new Term("id", "doc" + i)); + docValues[i] = 0; + } + } + } - int expCount = (int) Arrays.stream(docValues).filter(v -> v > 0).count(); - assertEquals(expCount, stats.count()); - int numDocsWithoutField = (int) getZeroValues(docValues).count(); - assertEquals(computeExpMissing(numDocsWithoutField, numDocs, reader), stats.missing()); - if (stats.count() > 0) { - LongSummaryStatistics sumStats = getPositiveValues(docValues).summaryStatistics(); - assertEquals(sumStats.getMax(), stats.max().longValue()); - assertEquals(sumStats.getMin(), stats.min().longValue()); - assertEquals(sumStats.getAverage(), stats.mean(), 0.00001); - assertEquals(sumStats.getSum(), stats.sum().longValue()); - double variance = computeVariance(docValues, stats.mean, stats.count()); - assertEquals(variance, stats.variance(), 0.00001); - assertEquals(Math.sqrt(variance), stats.stdev(), 0.00001); + try (DirectoryReader reader = indexWriter.getReader()) { + 
IndexSearcher searcher = newSearcher(reader); + LongDocValuesStats stats = + searcher.search( + MatchAllDocsQuery.INSTANCE, + new DocValuesStatsCollectorManager<>(() -> new LongDocValuesStats(field))); + + int expCount = (int) Arrays.stream(docValues).filter(v -> v > 0).count(); + assertEquals(expCount, stats.count()); + int numDocsWithoutField = (int) getZeroValues(docValues).count(); + assertEquals(computeExpMissing(numDocsWithoutField, numDocs, reader), stats.missing()); + if (stats.count() > 0) { + LongSummaryStatistics sumStats = getPositiveValues(docValues).summaryStatistics(); + assertEquals(sumStats.getMax(), stats.max().longValue()); + assertEquals(sumStats.getMin(), stats.min().longValue()); + assertEquals(sumStats.getAverage(), stats.mean(), 0.00001); + assertEquals(sumStats.getSum(), stats.sum().longValue()); + double variance = computeVariance(docValues, stats.mean, stats.count()); + assertEquals(variance, stats.variance(), 0.00001); + assertEquals(Math.sqrt(variance), stats.stdev(), 0.00001); + } } } } } public void testDocsWithDoubleValues() throws IOException { - try (Directory dir = newDirectory(); - IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) { - String field = "numeric"; - int numDocs = TestUtil.nextInt(random(), 1, 100); - double[] docValues = new double[numDocs]; - double nextVal = 1.0; - for (int i = 0; i < numDocs; i++) { - Document doc = new Document(); - if (random().nextBoolean()) { // not all documents have a value - doc.add(new DoubleDocValuesField(field, nextVal)); - doc.add(new StringField("id", "doc" + i, Store.NO)); - docValues[i] = nextVal; - ++nextVal; - } - indexWriter.addDocument(doc); - } - - // 20% of cases delete some docs - if (random().nextDouble() < 0.2) { + try (Directory dir = newDirectory()) { + IndexWriterConfig config = newIndexWriterConfig(); + config.setMaxBufferedDocs(10); + try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), dir, config)) { + String field = "numeric"; + 
int numDocs = TestUtil.nextInt(random(), 50, 100); + double[] docValues = new double[numDocs]; + double nextVal = 1.0; for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); if (random().nextBoolean()) { - indexWriter.deleteDocuments(new Term("id", "doc" + i)); - docValues[i] = 0; + doc.add(new DoubleDocValuesField(field, nextVal)); + doc.add(new StringField("id", "doc" + i, Store.NO)); + docValues[i] = nextVal; + ++nextVal; } + indexWriter.addDocument(doc); } - } - try (DirectoryReader reader = DirectoryReader.open(indexWriter)) { - IndexSearcher searcher = new IndexSearcher(reader); - DoubleDocValuesStats stats = new DoubleDocValuesStats(field); - searcher.search(MatchAllDocsQuery.INSTANCE, new DocValuesStatsCollector(stats)); + if (random().nextDouble() < 0.2) { + for (int i = 0; i < numDocs; i++) { + if (random().nextBoolean()) { + indexWriter.deleteDocuments(new Term("id", "doc" + i)); + docValues[i] = 0; + } + } + } - int expCount = (int) Arrays.stream(docValues).filter(v -> v > 0).count(); - assertEquals(expCount, stats.count()); - int numDocsWithoutField = (int) getZeroValues(docValues).count(); - assertEquals(computeExpMissing(numDocsWithoutField, numDocs, reader), stats.missing()); - if (stats.count() > 0) { - DoubleSummaryStatistics sumStats = getPositiveValues(docValues).summaryStatistics(); - assertEquals(sumStats.getMax(), stats.max().doubleValue(), 0.00001); - assertEquals(sumStats.getMin(), stats.min().doubleValue(), 0.00001); - assertEquals(sumStats.getAverage(), stats.mean(), 0.00001); - assertEquals(sumStats.getSum(), stats.sum(), 0.00001); - double variance = computeVariance(docValues, stats.mean, stats.count()); - assertEquals(variance, stats.variance(), 0.00001); - assertEquals(Math.sqrt(variance), stats.stdev(), 0.00001); + try (DirectoryReader reader = indexWriter.getReader()) { + IndexSearcher searcher = newSearcher(reader); + DoubleDocValuesStats stats = + searcher.search( + MatchAllDocsQuery.INSTANCE, + new 
DocValuesStatsCollectorManager<>(() -> new DoubleDocValuesStats(field))); + + int expCount = (int) Arrays.stream(docValues).filter(v -> v > 0).count(); + assertEquals(expCount, stats.count()); + int numDocsWithoutField = (int) getZeroValues(docValues).count(); + assertEquals(computeExpMissing(numDocsWithoutField, numDocs, reader), stats.missing()); + if (stats.count() > 0) { + DoubleSummaryStatistics sumStats = getPositiveValues(docValues).summaryStatistics(); + assertEquals(sumStats.getMax(), stats.max().doubleValue(), 0.00001); + assertEquals(sumStats.getMin(), stats.min().doubleValue(), 0.00001); + assertEquals(sumStats.getAverage(), stats.mean(), 0.00001); + assertEquals(sumStats.getSum(), stats.sum(), 0.00001); + double variance = computeVariance(docValues, stats.mean, stats.count()); + assertEquals(variance, stats.variance(), 0.00001); + assertEquals(Math.sqrt(variance), stats.stdev(), 0.00001); + } } } } } public void testDocsWithMultipleLongValues() throws IOException { - try (Directory dir = newDirectory(); - IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) { - String field = "numeric"; - int numDocs = TestUtil.nextInt(random(), 1, 100); - long[][] docValues = new long[numDocs][]; - long nextVal = 1; - for (int i = 0; i < numDocs; i++) { - Document doc = new Document(); - if (random().nextBoolean()) { // not all documents have a value - int numValues = TestUtil.nextInt(random(), 1, 5); - docValues[i] = new long[numValues]; - for (int j = 0; j < numValues; j++) { - doc.add(new SortedNumericDocValuesField(field, nextVal)); - docValues[i][j] = nextVal; - ++nextVal; - } - doc.add(new StringField("id", "doc" + i, Store.NO)); - } - indexWriter.addDocument(doc); - } - - // 20% of cases delete some docs - if (random().nextDouble() < 0.2) { + try (Directory dir = newDirectory()) { + IndexWriterConfig config = newIndexWriterConfig(); + config.setMaxBufferedDocs(10); + try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), dir, 
config)) { + String field = "numeric"; + int numDocs = TestUtil.nextInt(random(), 50, 100); + long[][] docValues = new long[numDocs][]; + long nextVal = 1; for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); if (random().nextBoolean()) { - indexWriter.deleteDocuments(new Term("id", "doc" + i)); - docValues[i] = null; + int numValues = TestUtil.nextInt(random(), 1, 5); + docValues[i] = new long[numValues]; + for (int j = 0; j < numValues; j++) { + doc.add(new SortedNumericDocValuesField(field, nextVal)); + docValues[i][j] = nextVal; + ++nextVal; + } + doc.add(new StringField("id", "doc" + i, Store.NO)); } + indexWriter.addDocument(doc); } - } - try (DirectoryReader reader = DirectoryReader.open(indexWriter)) { - IndexSearcher searcher = new IndexSearcher(reader); - SortedLongDocValuesStats stats = new SortedLongDocValuesStats(field); - searcher.search(MatchAllDocsQuery.INSTANCE, new DocValuesStatsCollector(stats)); + if (random().nextDouble() < 0.2) { + for (int i = 0; i < numDocs; i++) { + if (random().nextBoolean()) { + indexWriter.deleteDocuments(new Term("id", "doc" + i)); + docValues[i] = null; + } + } + } - assertEquals(nonNull(docValues).count(), stats.count()); - int numDocsWithoutField = (int) isNull(docValues).count(); - assertEquals(computeExpMissing(numDocsWithoutField, numDocs, reader), stats.missing()); - if (stats.count() > 0) { - LongSummaryStatistics sumStats = - filterAndFlatValues(docValues, (v) -> v != null).summaryStatistics(); - assertEquals(sumStats.getMax(), stats.max().longValue()); - assertEquals(sumStats.getMin(), stats.min().longValue()); - assertEquals(sumStats.getAverage(), stats.mean(), 0.00001); - assertEquals(sumStats.getSum(), stats.sum().longValue()); - assertEquals(sumStats.getCount(), stats.valuesCount()); - double variance = - computeVariance( - filterAndFlatValues(docValues, (v) -> v != null), stats.mean, stats.count()); - assertEquals(variance, stats.variance(), 0.00001); - assertEquals(Math.sqrt(variance), 
stats.stdev(), 0.00001); + try (DirectoryReader reader = indexWriter.getReader()) { + IndexSearcher searcher = newSearcher(reader); + SortedLongDocValuesStats stats = + searcher.search( + MatchAllDocsQuery.INSTANCE, + new DocValuesStatsCollectorManager<>(() -> new SortedLongDocValuesStats(field))); + + assertEquals(nonNull(docValues).count(), stats.count()); + int numDocsWithoutField = (int) isNull(docValues).count(); + assertEquals(computeExpMissing(numDocsWithoutField, numDocs, reader), stats.missing()); + if (stats.count() > 0) { + LongSummaryStatistics sumStats = + filterAndFlatValues(docValues, (v) -> v != null).summaryStatistics(); + assertEquals(sumStats.getMax(), stats.max().longValue()); + assertEquals(sumStats.getMin(), stats.min().longValue()); + assertEquals(sumStats.getAverage(), stats.mean(), 0.00001); + assertEquals(sumStats.getSum(), stats.sum().longValue()); + assertEquals(sumStats.getCount(), stats.valuesCount()); + double variance = + computeVariance( + filterAndFlatValues(docValues, (v) -> v != null), stats.mean, stats.count()); + assertEquals(variance, stats.variance(), 0.00001); + assertEquals(Math.sqrt(variance), stats.stdev(), 0.00001); + } } } } } public void testDocsWithMultipleDoubleValues() throws IOException { - try (Directory dir = newDirectory(); - IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) { - String field = "numeric"; - int numDocs = TestUtil.nextInt(random(), 1, 100); - double[][] docValues = new double[numDocs][]; - double nextVal = 1; - for (int i = 0; i < numDocs; i++) { - Document doc = new Document(); - if (random().nextBoolean()) { // not all documents have a value - int numValues = TestUtil.nextInt(random(), 1, 5); - docValues[i] = new double[numValues]; - for (int j = 0; j < numValues; j++) { - doc.add(new SortedNumericDocValuesField(field, Double.doubleToRawLongBits(nextVal))); - docValues[i][j] = nextVal; - ++nextVal; - } - doc.add(new StringField("id", "doc" + i, Store.NO)); - } - 
indexWriter.addDocument(doc); - } - - // 20% of cases delete some docs - if (random().nextDouble() < 0.2) { + try (Directory dir = newDirectory()) { + IndexWriterConfig config = newIndexWriterConfig(); + config.setMaxBufferedDocs(10); + try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), dir, config)) { + String field = "numeric"; + int numDocs = TestUtil.nextInt(random(), 50, 100); + double[][] docValues = new double[numDocs][]; + double nextVal = 1; for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); if (random().nextBoolean()) { - indexWriter.deleteDocuments(new Term("id", "doc" + i)); - docValues[i] = null; + int numValues = TestUtil.nextInt(random(), 1, 5); + docValues[i] = new double[numValues]; + for (int j = 0; j < numValues; j++) { + doc.add(new SortedNumericDocValuesField(field, Double.doubleToRawLongBits(nextVal))); + docValues[i][j] = nextVal; + ++nextVal; + } + doc.add(new StringField("id", "doc" + i, Store.NO)); } + indexWriter.addDocument(doc); } - } - try (DirectoryReader reader = DirectoryReader.open(indexWriter)) { - IndexSearcher searcher = new IndexSearcher(reader); - SortedDoubleDocValuesStats stats = new SortedDoubleDocValuesStats(field); - searcher.search(MatchAllDocsQuery.INSTANCE, new DocValuesStatsCollector(stats)); + if (random().nextDouble() < 0.2) { + for (int i = 0; i < numDocs; i++) { + if (random().nextBoolean()) { + indexWriter.deleteDocuments(new Term("id", "doc" + i)); + docValues[i] = null; + } + } + } - assertEquals(nonNull(docValues).count(), stats.count()); - int numDocsWithoutField = (int) isNull(docValues).count(); - assertEquals(computeExpMissing(numDocsWithoutField, numDocs, reader), stats.missing()); - if (stats.count() > 0) { - DoubleSummaryStatistics sumStats = - filterAndFlatValues(docValues, (v) -> v != null).summaryStatistics(); - assertEquals(sumStats.getMax(), stats.max(), 0.00001); - assertEquals(sumStats.getMin(), stats.min(), 0.00001); - assertEquals(sumStats.getAverage(), 
stats.mean(), 0.00001); - assertEquals(sumStats.getSum(), stats.sum().doubleValue(), 0.00001); - assertEquals(sumStats.getCount(), stats.valuesCount()); - double variance = - computeVariance( - filterAndFlatValues(docValues, (v) -> v != null), stats.mean, stats.count()); - assertEquals(variance, stats.variance(), 0.00001); - assertEquals(Math.sqrt(variance), stats.stdev(), 0.00001); + try (DirectoryReader reader = indexWriter.getReader()) { + IndexSearcher searcher = newSearcher(reader); + SortedDoubleDocValuesStats stats = + searcher.search( + MatchAllDocsQuery.INSTANCE, + new DocValuesStatsCollectorManager<>( + () -> new SortedDoubleDocValuesStats(field))); + + assertEquals(nonNull(docValues).count(), stats.count()); + int numDocsWithoutField = (int) isNull(docValues).count(); + assertEquals(computeExpMissing(numDocsWithoutField, numDocs, reader), stats.missing()); + if (stats.count() > 0) { + DoubleSummaryStatistics sumStats = + filterAndFlatValues(docValues, (v) -> v != null).summaryStatistics(); + assertEquals(sumStats.getMax(), stats.max(), 0.00001); + assertEquals(sumStats.getMin(), stats.min(), 0.00001); + assertEquals(sumStats.getAverage(), stats.mean(), 0.00001); + assertEquals(sumStats.getSum(), stats.sum().doubleValue(), 0.00001); + assertEquals(sumStats.getCount(), stats.valuesCount()); + double variance = + computeVariance( + filterAndFlatValues(docValues, (v) -> v != null), stats.mean, stats.count()); + assertEquals(variance, stats.variance(), 0.00001); + assertEquals(Math.sqrt(variance), stats.stdev(), 0.00001); + } } } } @@ -318,7 +343,7 @@ public void testDocsWithMultipleDoubleValues() throws IOException { public void testDocsWithSortedValues() throws IOException { try (Directory dir = newDirectory(); - IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) { + RandomIndexWriter indexWriter = new RandomIndexWriter(random(), dir)) { String field = "sorted"; int numDocs = TestUtil.nextInt(random(), 1, 100); BytesRef[] docValues = 
new BytesRef[numDocs]; @@ -343,10 +368,12 @@ public void testDocsWithSortedValues() throws IOException { } } - try (DirectoryReader reader = DirectoryReader.open(indexWriter)) { - IndexSearcher searcher = new IndexSearcher(reader); - SortedDocValuesStats stats = new SortedDocValuesStats(field); - searcher.search(MatchAllDocsQuery.INSTANCE, new DocValuesStatsCollector(stats)); + try (DirectoryReader reader = indexWriter.getReader()) { + IndexSearcher searcher = newSearcher(reader); + SortedDocValuesStats stats = + searcher.search( + MatchAllDocsQuery.INSTANCE, + new DocValuesStatsCollectorManager<>(() -> new SortedDocValuesStats(field))); int expCount = (int) nonNull(docValues).count(); assertEquals(expCount, stats.count()); @@ -362,7 +389,7 @@ public void testDocsWithSortedValues() throws IOException { public void testDocsWithSortedSetValues() throws IOException { try (Directory dir = newDirectory(); - IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) { + RandomIndexWriter indexWriter = new RandomIndexWriter(random(), dir)) { String field = "sorted"; int numDocs = TestUtil.nextInt(random(), 1, 100); BytesRef[][] docValues = new BytesRef[numDocs][]; @@ -391,11 +418,12 @@ public void testDocsWithSortedSetValues() throws IOException { } } - try (DirectoryReader reader = DirectoryReader.open(indexWriter)) { - IndexSearcher searcher = new IndexSearcher(reader); - SortedSetDocValuesStats stats = new SortedSetDocValuesStats(field); - - searcher.search(MatchAllDocsQuery.INSTANCE, new DocValuesStatsCollector(stats)); + try (DirectoryReader reader = indexWriter.getReader()) { + IndexSearcher searcher = newSearcher(reader); + SortedSetDocValuesStats stats = + searcher.search( + MatchAllDocsQuery.INSTANCE, + new DocValuesStatsCollectorManager<>(() -> new SortedSetDocValuesStats(field))); int expCount = (int) nonNull(docValues).count(); assertEquals(expCount, stats.count());