-
Notifications
You must be signed in to change notification settings - Fork 1.3k
Introduce DocValuesStatsCollectorManager #15653
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,8 +28,8 @@ | |
| /** Holds statistics for a DocValues field. */ | ||
| public abstract class DocValuesStats<T> { | ||
|
|
||
| private int missing = 0; | ||
| private int count = 0; | ||
| int missing = 0; | ||
| int count = 0; | ||
|
|
||
| protected final String field; | ||
|
|
||
|
|
@@ -73,6 +73,43 @@ final void addMissing() { | |
| ++missing; | ||
| } | ||
|
|
||
| void merge(DocValuesStats<?> other) { | ||
| count += other.count; | ||
| missing += other.missing; | ||
| if (other.min != null && (min == null || compareMin(other.min, min) < 0)) { | ||
| copyMin(other); | ||
| } | ||
| if (other.max != null && (max == null || compareMax(other.max, max) > 0)) { | ||
| copyMax(other); | ||
| } | ||
| } | ||
|
|
||
| @SuppressWarnings("unchecked") | ||
| void copyMin(DocValuesStats<?> other) { | ||
| min = (T) other.min; | ||
| } | ||
|
|
||
| @SuppressWarnings("unchecked") | ||
| void copyMax(DocValuesStats<?> other) { | ||
| max = (T) other.max; | ||
| } | ||
|
|
||
| @SuppressWarnings({"unchecked", "rawtypes"}) | ||
| int compareMin(Object a, Object b) { | ||
| if (a instanceof Number numA && b instanceof Number numB) { | ||
| return Double.compare(numA.doubleValue(), numB.doubleValue()); | ||
| } | ||
| return ((Comparable) a).compareTo(b); | ||
| } | ||
|
|
||
| @SuppressWarnings({"unchecked", "rawtypes"}) | ||
| int compareMax(Object a, Object b) { | ||
| if (a instanceof Number numA && b instanceof Number numB) { | ||
| return Double.compare(numA.doubleValue(), numB.doubleValue()); | ||
| } | ||
| return ((Comparable) a).compareTo(b); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the same as compareMin ? |
||
| } | ||
|
|
||
| /** The field for which these stats were computed. */ | ||
| public final String field() { | ||
| return field; | ||
|
|
@@ -142,6 +179,47 @@ public final double stdev() { | |
| * might overflow. | ||
| */ | ||
| public abstract T sum(); | ||
|
|
||
| @Override | ||
| void merge(DocValuesStats<?> other) { | ||
| if (!(other instanceof NumericDocValuesStats<?> o)) { | ||
| throw new IllegalArgumentException("Cannot merge different stat types"); | ||
| } | ||
|
|
||
| this.missing += o.missing(); | ||
|
|
||
| if (o.count() == 0) { | ||
| return; | ||
| } | ||
| if (this.count() == 0) { | ||
| this.count = o.count(); | ||
| copyMin(o); | ||
| copyMax(o); | ||
| this.mean = o.mean; | ||
| this.variance = o.variance; | ||
| return; | ||
| } | ||
|
|
||
| int totalCount = this.count() + o.count(); | ||
| double combinedMean = (sum().doubleValue() + o.sum().doubleValue()) / totalCount; | ||
| double targetDelta = this.mean - combinedMean; | ||
| double sourceDelta = o.mean - combinedMean; | ||
|
|
||
| this.variance = | ||
| this.variance | ||
| + o.variance | ||
| + targetDelta * targetDelta * this.count() | ||
| + sourceDelta * sourceDelta * o.count(); | ||
| this.mean = combinedMean; | ||
| this.count = totalCount; | ||
|
|
||
| if (compareMin(o.min(), min()) < 0) { | ||
| copyMin(o); | ||
| } | ||
| if (compareMax(o.max(), max()) > 0) { | ||
| copyMax(o); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /** Holds DocValues statistics for a numeric field storing {@code long} values. */ | ||
|
|
@@ -173,6 +251,14 @@ protected void doAccumulate(int count) throws IOException { | |
| public Long sum() { | ||
| return sum; | ||
| } | ||
|
|
||
| @Override | ||
| void merge(DocValuesStats<?> other) { | ||
| super.merge(other); | ||
| if (other instanceof LongDocValuesStats o) { | ||
| sum += o.sum; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /** Holds DocValues statistics for a numeric field storing {@code double} values. */ | ||
|
|
@@ -205,6 +291,14 @@ protected void doAccumulate(int count) throws IOException { | |
| public Double sum() { | ||
| return sum; | ||
| } | ||
|
|
||
| @Override | ||
| void merge(DocValuesStats<?> other) { | ||
| super.merge(other); | ||
| if (other instanceof DoubleDocValuesStats o) { | ||
| sum += o.sum; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /** Holds statistics for a sorted-numeric DocValues field. */ | ||
|
|
@@ -258,6 +352,49 @@ public final long valuesCount() { | |
| * might overflow. | ||
| */ | ||
| public abstract T sum(); | ||
|
|
||
| @Override | ||
| void merge(DocValuesStats<?> other) { | ||
| if (!(other instanceof SortedNumericDocValuesStats<?> o)) { | ||
| throw new IllegalArgumentException("Cannot merge different stat types"); | ||
| } | ||
|
|
||
| this.missing += o.missing(); | ||
|
|
||
| if (o.count() == 0) { | ||
| return; | ||
| } | ||
| if (this.count() == 0) { | ||
| this.count = o.count(); | ||
| copyMin(o); | ||
| copyMax(o); | ||
| this.mean = o.mean; | ||
| this.variance = o.variance; | ||
| this.valuesCount = o.valuesCount; | ||
| return; | ||
| } | ||
|
|
||
| long totalValuesCount = this.valuesCount + o.valuesCount; | ||
| double combinedMean = (sum().doubleValue() + o.sum().doubleValue()) / totalValuesCount; | ||
| double targetDelta = this.mean - combinedMean; | ||
| double sourceDelta = o.mean - combinedMean; | ||
|
|
||
| this.variance = | ||
| this.variance | ||
| + o.variance | ||
| + targetDelta * targetDelta * this.valuesCount | ||
| + sourceDelta * sourceDelta * o.valuesCount; | ||
| this.mean = combinedMean; | ||
| this.valuesCount = totalValuesCount; | ||
| this.count += o.count(); | ||
|
|
||
| if (compareMin(o.min(), min()) < 0) { | ||
| copyMin(o); | ||
| } | ||
| if (compareMax(o.max(), max()) > 0) { | ||
| copyMax(o); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /** Holds DocValues statistics for a sorted-numeric field storing {@code long} values. */ | ||
|
|
@@ -296,6 +433,14 @@ protected void doAccumulate(int count) throws IOException { | |
| public Long sum() { | ||
| return sum; | ||
| } | ||
|
|
||
| @Override | ||
| void merge(DocValuesStats<?> other) { | ||
| super.merge(other); | ||
| if (other instanceof SortedLongDocValuesStats o) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is this type of conditional necessary? Does it ever happen that we get a different type than |
||
| sum += o.sum; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /** Holds DocValues statistics for a sorted-numeric field storing {@code double} values. */ | ||
|
|
@@ -335,6 +480,14 @@ protected void doAccumulate(int count) throws IOException { | |
| public Double sum() { | ||
| return sum; | ||
| } | ||
|
|
||
| @Override | ||
| void merge(DocValuesStats<?> other) { | ||
| super.merge(other); | ||
| if (other instanceof SortedDoubleDocValuesStats o) { | ||
| sum += o.sum; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| private static BytesRef copyFrom(BytesRef src, BytesRef dest) { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,62 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
| package org.apache.lucene.misc.search; | ||
|
|
||
| import java.io.IOException; | ||
| import java.util.Collection; | ||
| import java.util.function.Supplier; | ||
| import org.apache.lucene.search.CollectorManager; | ||
|
|
||
| /** | ||
| * A {@link CollectorManager} implementation for {@link DocValuesStatsCollector}. | ||
| * | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could we expand javadocs to include a usage example for users? |
||
| * @param <S> the type of {@link DocValuesStats} | ||
| */ | ||
| public class DocValuesStatsCollectorManager<S extends DocValuesStats<?>> | ||
| implements CollectorManager<DocValuesStatsCollector, S> { | ||
|
|
||
| private final Supplier<S> statsSupplier; | ||
|
|
||
| /** | ||
| * Creates a new DocValuesStatsCollectorManager. | ||
| * | ||
| * @param statsSupplier a supplier that creates new stats instances for each collector | ||
| */ | ||
| public DocValuesStatsCollectorManager(Supplier<S> statsSupplier) { | ||
| this.statsSupplier = statsSupplier; | ||
| } | ||
|
|
||
| @Override | ||
| public DocValuesStatsCollector newCollector() throws IOException { | ||
| return new DocValuesStatsCollector(statsSupplier.get()); | ||
| } | ||
|
|
||
| @Override | ||
| @SuppressWarnings("unchecked") | ||
| public S reduce(Collection<DocValuesStatsCollector> collectors) throws IOException { | ||
| if (collectors.isEmpty()) { | ||
| return null; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a potential bug that could cause problems down the line? Could we instead return empty stats? Is it even necessary to have this special case? You could instead just remove this conditional? |
||
| } | ||
|
|
||
| S merged = statsSupplier.get(); | ||
| for (DocValuesStatsCollector collector : collectors) { | ||
| S stats = (S) collector.getStats(); | ||
| merged.merge(stats); | ||
| } | ||
| return merged; | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I believe this could lead to precision loss? is the special number case necessary?