Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ Improvements

* GITHUB#15453: Avoid unnecessary sorting and instantiations in readMapOfStrings. (Benjamin Lerer)

* GITHUB#15653: Introduce DocValuesStatsCollectorManager to parallelize search when using DocValuesStatsCollector. (Binlong Gao)

Optimizations
---------------------
* GITHUB#13782: Replace handwritten loops compare with Arrays.compareUnsigned in TermsEnum and TermsEnumFrame classes. (Zhou Hui)
Expand Down
157 changes: 155 additions & 2 deletions lucene/misc/src/java/org/apache/lucene/misc/search/DocValuesStats.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@
/** Holds statistics for a DocValues field. */
public abstract class DocValuesStats<T> {

private int missing = 0;
private int count = 0;
int missing = 0;
int count = 0;

protected final String field;

Expand Down Expand Up @@ -73,6 +73,43 @@ final void addMissing() {
++missing;
}

void merge(DocValuesStats<?> other) {
count += other.count;
missing += other.missing;
if (other.min != null && (min == null || compareMin(other.min, min) < 0)) {
copyMin(other);
}
if (other.max != null && (max == null || compareMax(other.max, max) > 0)) {
copyMax(other);
}
}

@SuppressWarnings("unchecked")
void copyMin(DocValuesStats<?> other) {
min = (T) other.min;
}

@SuppressWarnings("unchecked")
void copyMax(DocValuesStats<?> other) {
max = (T) other.max;
}

@SuppressWarnings({"unchecked", "rawtypes"})
int compareMin(Object a, Object b) {
if (a instanceof Number numA && b instanceof Number numB) {
return Double.compare(numA.doubleValue(), numB.doubleValue());
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe this could lead to precision loss? is the special number case necessary?

}
return ((Comparable) a).compareTo(b);
}

@SuppressWarnings({"unchecked", "rawtypes"})
int compareMax(Object a, Object b) {
if (a instanceof Number numA && b instanceof Number numB) {
return Double.compare(numA.doubleValue(), numB.doubleValue());
}
return ((Comparable) a).compareTo(b);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the same as compareMin ?

}

/** The field for which these stats were computed. */
public final String field() {
return field;
Expand Down Expand Up @@ -142,6 +179,47 @@ public final double stdev() {
* might overflow.
*/
public abstract T sum();

@Override
void merge(DocValuesStats<?> other) {
if (!(other instanceof NumericDocValuesStats<?> o)) {
throw new IllegalArgumentException("Cannot merge different stat types");
}

this.missing += o.missing();

if (o.count() == 0) {
return;
}
if (this.count() == 0) {
this.count = o.count();
copyMin(o);
copyMax(o);
this.mean = o.mean;
this.variance = o.variance;
return;
}

int totalCount = this.count() + o.count();
double combinedMean = (sum().doubleValue() + o.sum().doubleValue()) / totalCount;
double targetDelta = this.mean - combinedMean;
double sourceDelta = o.mean - combinedMean;

this.variance =
this.variance
+ o.variance
+ targetDelta * targetDelta * this.count()
+ sourceDelta * sourceDelta * o.count();
this.mean = combinedMean;
this.count = totalCount;

if (compareMin(o.min(), min()) < 0) {
copyMin(o);
}
if (compareMax(o.max(), max()) > 0) {
copyMax(o);
}
}
}

/** Holds DocValues statistics for a numeric field storing {@code long} values. */
Expand Down Expand Up @@ -173,6 +251,14 @@ protected void doAccumulate(int count) throws IOException {
public Long sum() {
return sum;
}

@Override
void merge(DocValuesStats<?> other) {
super.merge(other);
if (other instanceof LongDocValuesStats o) {
sum += o.sum;
}
}
}

/** Holds DocValues statistics for a numeric field storing {@code double} values. */
Expand Down Expand Up @@ -205,6 +291,14 @@ protected void doAccumulate(int count) throws IOException {
public Double sum() {
return sum;
}

@Override
void merge(DocValuesStats<?> other) {
super.merge(other);
if (other instanceof DoubleDocValuesStats o) {
sum += o.sum;
}
}
}

/** Holds statistics for a sorted-numeric DocValues field. */
Expand Down Expand Up @@ -258,6 +352,49 @@ public final long valuesCount() {
* might overflow.
*/
public abstract T sum();

@Override
void merge(DocValuesStats<?> other) {
if (!(other instanceof SortedNumericDocValuesStats<?> o)) {
throw new IllegalArgumentException("Cannot merge different stat types");
}

this.missing += o.missing();

if (o.count() == 0) {
return;
}
if (this.count() == 0) {
this.count = o.count();
copyMin(o);
copyMax(o);
this.mean = o.mean;
this.variance = o.variance;
this.valuesCount = o.valuesCount;
return;
}

long totalValuesCount = this.valuesCount + o.valuesCount;
double combinedMean = (sum().doubleValue() + o.sum().doubleValue()) / totalValuesCount;
double targetDelta = this.mean - combinedMean;
double sourceDelta = o.mean - combinedMean;

this.variance =
this.variance
+ o.variance
+ targetDelta * targetDelta * this.valuesCount
+ sourceDelta * sourceDelta * o.valuesCount;
this.mean = combinedMean;
this.valuesCount = totalValuesCount;
this.count += o.count();

if (compareMin(o.min(), min()) < 0) {
copyMin(o);
}
if (compareMax(o.max(), max()) > 0) {
copyMax(o);
}
}
}

/** Holds DocValues statistics for a sorted-numeric field storing {@code long} values. */
Expand Down Expand Up @@ -296,6 +433,14 @@ protected void doAccumulate(int count) throws IOException {
public Long sum() {
return sum;
}

@Override
void merge(DocValuesStats<?> other) {
super.merge(other);
if (other instanceof SortedLongDocValuesStats o) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this type of conditional necessary? Does it ever happen that we get a different type than SortedLongDocValuesStats here? This applies to all merge methods.

sum += o.sum;
}
}
}

/** Holds DocValues statistics for a sorted-numeric field storing {@code double} values. */
Expand Down Expand Up @@ -335,6 +480,14 @@ protected void doAccumulate(int count) throws IOException {
public Double sum() {
return sum;
}

@Override
void merge(DocValuesStats<?> other) {
super.merge(other);
if (other instanceof SortedDoubleDocValuesStats o) {
sum += o.sum;
}
}
}

private static BytesRef copyFrom(BytesRef src, BytesRef dest) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,4 +68,9 @@ public void collect(int doc) throws IOException {
public ScoreMode scoreMode() {
return ScoreMode.COMPLETE_NO_SCORES;
}

/** Returns the statistics computed by this collector. */
public DocValuesStats<?> getStats() {
return stats;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.misc.search;

import java.io.IOException;
import java.util.Collection;
import java.util.function.Supplier;
import org.apache.lucene.search.CollectorManager;

/**
* A {@link CollectorManager} implementation for {@link DocValuesStatsCollector}.
*
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we expand javadocs to include a usage example for users?

* @param <S> the type of {@link DocValuesStats}
*/
public class DocValuesStatsCollectorManager<S extends DocValuesStats<?>>
implements CollectorManager<DocValuesStatsCollector, S> {

private final Supplier<S> statsSupplier;

/**
* Creates a new DocValuesStatsCollectorManager.
*
* @param statsSupplier a supplier that creates new stats instances for each collector
*/
public DocValuesStatsCollectorManager(Supplier<S> statsSupplier) {
this.statsSupplier = statsSupplier;
}

@Override
public DocValuesStatsCollector newCollector() throws IOException {
return new DocValuesStatsCollector(statsSupplier.get());
}

@Override
@SuppressWarnings("unchecked")
public S reduce(Collection<DocValuesStatsCollector> collectors) throws IOException {
if (collectors.isEmpty()) {
return null;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a potential bug that could cause problems down the line? Could we instead return empty stats? Is it even necessary to have this special case? You could instead just remove this conditional?

}

S merged = statsSupplier.get();
for (DocValuesStatsCollector collector : collectors) {
S stats = (S) collector.getStats();
merged.merge(stats);
}
return merged;
}
}
Loading
Loading