-
Notifications
You must be signed in to change notification settings - Fork 2.1k
fix(search): align dataAsset aggregation counts with index=tableColumn totals #27846
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
bd4d566
f264c55
b2d40ff
6d079ab
201bc7f
caf8580
e882607
4a8011b
b3cff35
d81ecc3
718ceb9
90df4c2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -18,9 +18,11 @@ | |||||||||
| import es.co.elastic.clients.elasticsearch.core.search.Highlight; | ||||||||||
| import java.util.ArrayList; | ||||||||||
| import java.util.HashMap; | ||||||||||
| import java.util.HashSet; | ||||||||||
| import java.util.LinkedHashMap; | ||||||||||
| import java.util.List; | ||||||||||
| import java.util.Map; | ||||||||||
| import java.util.Set; | ||||||||||
| import java.util.stream.Collectors; | ||||||||||
| import org.openmetadata.schema.api.search.Aggregation; | ||||||||||
| import org.openmetadata.schema.api.search.AssetTypeConfiguration; | ||||||||||
|
|
@@ -53,6 +55,7 @@ public class ElasticSearchSourceBuilderFactory | |||||||||
| private static final String MATCH_TYPE_STANDARD = "standard"; | ||||||||||
| private static final String INDEX_ALL = "all"; | ||||||||||
| private static final String INDEX_DATA_ASSET = "dataAsset"; | ||||||||||
| private static final String ENTITY_TYPE_FIELD = "entityType"; | ||||||||||
| private static final String MINIMUM_SHOULD_MATCH = "2<70%"; | ||||||||||
| private static final float DEFAULT_TIE_BREAKER = 0.3f; | ||||||||||
| private static final float DEFAULT_BOOST = 1.0f; | ||||||||||
|
|
@@ -341,9 +344,9 @@ public ElasticSearchRequestBuilder getSearchSourceBuilderV2( | |||||||||
| indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations); | ||||||||||
| } | ||||||||||
|
|
||||||||||
| if (indexName.equals("all") || indexName.equals("dataAsset")) { | ||||||||||
| return buildDataAssetSearchBuilderV2( | ||||||||||
| indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations); | ||||||||||
| if (indexName.equals(INDEX_ALL) || indexName.equals(INDEX_DATA_ASSET)) { | ||||||||||
| return buildAllAssetsSearchBuilderV2( | ||||||||||
| searchQuery, fromOffset, size, includeExplain, includeAggregations); | ||||||||||
| } | ||||||||||
|
|
||||||||||
| return switch (indexName) { | ||||||||||
|
|
@@ -374,21 +377,33 @@ public ElasticSearchRequestBuilder buildColumnSearchBuilderV2(String query, int | |||||||||
| queryBuilder = | ||||||||||
| es.co.elastic.clients.elasticsearch._types.query_dsl.Query.of(q -> q.matchAll(m -> m)); | ||||||||||
| } else { | ||||||||||
| Map<String, Float> fields = ColumnSearchIndex.getFields(); | ||||||||||
| queryBuilder = | ||||||||||
| ElasticQueryBuilder.multiMatchQuery( | ||||||||||
| query, | ||||||||||
| fields, | ||||||||||
| es.co.elastic.clients.elasticsearch._types.query_dsl.TextQueryType.BestFields, | ||||||||||
| es.co.elastic.clients.elasticsearch._types.query_dsl.Operator.Or, | ||||||||||
| String.valueOf(DEFAULT_TIE_BREAKER), | ||||||||||
| "0"); | ||||||||||
| queryBuilder = buildColumnMultiMatchV2(query); | ||||||||||
| } | ||||||||||
| es.co.elastic.clients.elasticsearch.core.search.Highlight hb = | ||||||||||
| buildHighlightsV2(List.of("name", "displayName", "description")); | ||||||||||
| return searchBuilderV2(queryBuilder, hb, from, size); | ||||||||||
| } | ||||||||||
|
|
||||||||||
| /** | ||||||||||
| * Multi-match used both by {@code index=tableColumn} and the column-scoped should clause in the | ||||||||||
| * {@code dataAsset} composite query. Uses {@link | ||||||||||
| * es.co.elastic.clients.elasticsearch._types.query_dsl.Operator#And} so every sub-token produced | ||||||||||
| * by {@code om_analyzer} must hit some field. Without {@code And}, a query like {@code | ||||||||||
| * first_name} matches any column whose name contains just {@code first} or just {@code name}, | ||||||||||
| * which both inflates the column index hits and creates the dataAsset/tableColumn count | ||||||||||
| * mismatch tracked in github issue #3851. | ||||||||||
| */ | ||||||||||
| private es.co.elastic.clients.elasticsearch._types.query_dsl.Query buildColumnMultiMatchV2( | ||||||||||
| String query) { | ||||||||||
| return ElasticQueryBuilder.multiMatchQuery( | ||||||||||
| query, | ||||||||||
| ColumnSearchIndex.getFields(), | ||||||||||
| es.co.elastic.clients.elasticsearch._types.query_dsl.TextQueryType.BestFields, | ||||||||||
| es.co.elastic.clients.elasticsearch._types.query_dsl.Operator.And, | ||||||||||
| String.valueOf(DEFAULT_TIE_BREAKER), | ||||||||||
| "0"); | ||||||||||
| } | ||||||||||
|
|
||||||||||
| public ElasticSearchRequestBuilder buildServiceSearchBuilderV2(String query, int from, int size) { | ||||||||||
| es.co.elastic.clients.elasticsearch._types.query_dsl.Query queryBuilder = | ||||||||||
| buildSearchQueryBuilderV2(query, SearchIndex.getDefaultFields()); | ||||||||||
|
|
@@ -437,6 +452,87 @@ public ElasticSearchRequestBuilder buildDataAssetSearchBuilderV2( | |||||||||
| return searchRequestBuilder; | ||||||||||
| } | ||||||||||
|
|
||||||||||
| /** | ||||||||||
| * Build a search source for the {@code all} / {@code dataAsset} alias as a per-entity-type | ||||||||||
| * union: each asset type contributes a clause built with its own configuration (column docs go | ||||||||||
| * through {@link #buildColumnMultiMatchV2(String)}, every other type through {@link | ||||||||||
| * #buildBaseQueryV2(String, AssetTypeConfiguration)}), filtered by {@code entityType=<type>}. | ||||||||||
| * Each entity-type bucket in the aggregation therefore equals what the dedicated index returns | ||||||||||
| * for the same query, by construction. Avoids the composite-config divergence behind | ||||||||||
| * github.com/open-metadata/openmetadata-collate#3851. | ||||||||||
| */ | ||||||||||
| public ElasticSearchRequestBuilder buildAllAssetsSearchBuilderV2( | ||||||||||
| String query, int from, int size, boolean explain, boolean includeAggregations) { | ||||||||||
| AssetTypeConfiguration compositeConfig = buildCompositeAssetConfig(searchSettings); | ||||||||||
| es.co.elastic.clients.elasticsearch._types.query_dsl.Query baseQuery = | ||||||||||
| buildPerTypeUnionQueryV2(query); | ||||||||||
| es.co.elastic.clients.elasticsearch._types.query_dsl.Query finalQuery = | ||||||||||
| applyFunctionScoringV2(baseQuery, compositeConfig); | ||||||||||
| es.co.elastic.clients.elasticsearch.core.search.Highlight highlightBuilder = | ||||||||||
| buildHighlightingIfNeededV2(query, compositeConfig); | ||||||||||
|
|
||||||||||
| ElasticSearchRequestBuilder searchRequestBuilder = | ||||||||||
| createSearchSourceBuilderV2(finalQuery, from, size); | ||||||||||
| if (highlightBuilder != null) { | ||||||||||
| searchRequestBuilder.highlighter(highlightBuilder); | ||||||||||
| } | ||||||||||
| if (includeAggregations) { | ||||||||||
| addConfiguredAggregationsV2(searchRequestBuilder, compositeConfig); | ||||||||||
| } | ||||||||||
| searchRequestBuilder.explain(explain); | ||||||||||
| return searchRequestBuilder; | ||||||||||
| } | ||||||||||
|
|
||||||||||
| private es.co.elastic.clients.elasticsearch._types.query_dsl.Query buildPerTypeUnionQueryV2( | ||||||||||
| String query) { | ||||||||||
| if (isMatchAllQuery(query)) { | ||||||||||
| return ElasticQueryBuilder.boolQuery().must(ElasticQueryBuilder.matchAllQuery()).build(); | ||||||||||
| } | ||||||||||
| ElasticQueryBuilder.BoolQueryBuilder union = ElasticQueryBuilder.boolQuery(); | ||||||||||
| Set<String> configuredTypes = new HashSet<>(); | ||||||||||
| for (AssetTypeConfiguration typeConfig : searchSettings.getAssetTypeConfigurations()) { | ||||||||||
|
gitar-bot[bot] marked this conversation as resolved.
Outdated
|
||||||||||
| String assetType = typeConfig.getAssetType(); | ||||||||||
| if (assetType == null || assetType.equals(INDEX_ALL)) { | ||||||||||
| continue; | ||||||||||
| } | ||||||||||
| configuredTypes.add(assetType); | ||||||||||
| union.should(buildAssetTypeClauseV2(query, assetType, typeConfig)); | ||||||||||
| } | ||||||||||
| union.should(buildUnconfiguredAssetFallbackV2(query, configuredTypes)); | ||||||||||
| union.minimumShouldMatch(1); | ||||||||||
| return union.build(); | ||||||||||
|
Comment on lines
+457
to
+511
|
||||||||||
| } | ||||||||||
|
|
||||||||||
| private static boolean isMatchAllQuery(String query) { | ||||||||||
| return query == null || query.trim().isEmpty() || query.trim().equals("*"); | ||||||||||
| } | ||||||||||
|
|
||||||||||
| private es.co.elastic.clients.elasticsearch._types.query_dsl.Query buildAssetTypeClauseV2( | ||||||||||
| String query, String assetType, AssetTypeConfiguration typeConfig) { | ||||||||||
| es.co.elastic.clients.elasticsearch._types.query_dsl.Query inner = | ||||||||||
| Entity.TABLE_COLUMN.equals(assetType) | ||||||||||
| ? buildColumnMultiMatchV2(query) | ||||||||||
| : buildBaseQueryV2(query, typeConfig); | ||||||||||
| return ElasticQueryBuilder.boolQuery() | ||||||||||
| .filter(ElasticQueryBuilder.termQuery(ENTITY_TYPE_FIELD, assetType)) | ||||||||||
| .must(inner) | ||||||||||
| .build(); | ||||||||||
| } | ||||||||||
|
|
||||||||||
| /** | ||||||||||
| * Catches asset types that are part of the {@code dataAsset} alias but lack a dedicated entry in | ||||||||||
| * {@code searchSettings.assetTypeConfigurations} (e.g. {@code glossary}, {@code apiCollection}). | ||||||||||
| */ | ||||||||||
| private es.co.elastic.clients.elasticsearch._types.query_dsl.Query | ||||||||||
| buildUnconfiguredAssetFallbackV2(String query, Set<String> configuredTypes) { | ||||||||||
| ElasticQueryBuilder.BoolQueryBuilder fallback = | ||||||||||
| ElasticQueryBuilder.boolQuery().must(buildBaseQueryV2(query, getOrCreateDefaultConfig())); | ||||||||||
| for (String configured : configuredTypes) { | ||||||||||
| fallback.mustNot(ElasticQueryBuilder.termQuery(ENTITY_TYPE_FIELD, configured)); | ||||||||||
|
||||||||||
| for (String configured : configuredTypes) { | |
| fallback.mustNot(ElasticQueryBuilder.termQuery(ENTITY_TYPE_FIELD, configured)); | |
| if (!configuredTypes.isEmpty()) { | |
| fallback.mustNot(ElasticQueryBuilder.termsQuery(ENTITY_TYPE_FIELD, configuredTypes)); |
| Original file line number | Diff line number | Diff line change | ||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -14,9 +14,11 @@ | |||||||||||||||||
|
|
||||||||||||||||||
| import java.util.ArrayList; | ||||||||||||||||||
| import java.util.HashMap; | ||||||||||||||||||
| import java.util.HashSet; | ||||||||||||||||||
| import java.util.LinkedHashMap; | ||||||||||||||||||
| import java.util.List; | ||||||||||||||||||
| import java.util.Map; | ||||||||||||||||||
| import java.util.Set; | ||||||||||||||||||
| import java.util.stream.Collectors; | ||||||||||||||||||
| import lombok.extern.slf4j.Slf4j; | ||||||||||||||||||
| import org.openmetadata.schema.api.search.Aggregation; | ||||||||||||||||||
|
|
@@ -50,6 +52,7 @@ public class OpenSearchSourceBuilderFactory | |||||||||||||||||
| private static final String MATCH_TYPE_STANDARD = "standard"; | ||||||||||||||||||
| private static final String INDEX_ALL = "all"; | ||||||||||||||||||
| private static final String INDEX_DATA_ASSET = "dataAsset"; | ||||||||||||||||||
| private static final String ENTITY_TYPE_FIELD = "entityType"; | ||||||||||||||||||
| private static final String MINIMUM_SHOULD_MATCH = "2<70%"; | ||||||||||||||||||
| private static final float DEFAULT_TIE_BREAKER = 0.3f; | ||||||||||||||||||
| private static final float DEFAULT_BOOST = 1.0f; | ||||||||||||||||||
|
|
@@ -322,9 +325,9 @@ public OpenSearchRequestBuilder getSearchSourceBuilderV2( | |||||||||||||||||
| indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations); | ||||||||||||||||||
| } | ||||||||||||||||||
|
|
||||||||||||||||||
| if (indexName.equals("all") || indexName.equals("dataAsset")) { | ||||||||||||||||||
| return buildDataAssetSearchBuilderV2( | ||||||||||||||||||
| indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations); | ||||||||||||||||||
| if (indexName.equals(INDEX_ALL) || indexName.equals(INDEX_DATA_ASSET)) { | ||||||||||||||||||
| return buildAllAssetsSearchBuilderV2( | ||||||||||||||||||
| searchQuery, fromOffset, size, includeExplain, includeAggregations); | ||||||||||||||||||
| } | ||||||||||||||||||
|
|
||||||||||||||||||
| return switch (indexName) { | ||||||||||||||||||
|
|
@@ -374,21 +377,33 @@ public OpenSearchRequestBuilder buildColumnSearchBuilderV2(String query, int fro | |||||||||||||||||
| queryBuilder = | ||||||||||||||||||
| os.org.opensearch.client.opensearch._types.query_dsl.Query.of(q -> q.matchAll(m -> m)); | ||||||||||||||||||
| } else { | ||||||||||||||||||
| Map<String, Float> fields = ColumnSearchIndex.getFields(); | ||||||||||||||||||
| queryBuilder = | ||||||||||||||||||
| OpenSearchQueryBuilder.multiMatchQuery( | ||||||||||||||||||
| query, | ||||||||||||||||||
| fields, | ||||||||||||||||||
| os.org.opensearch.client.opensearch._types.query_dsl.TextQueryType.BestFields, | ||||||||||||||||||
| os.org.opensearch.client.opensearch._types.query_dsl.Operator.Or, | ||||||||||||||||||
| String.valueOf(DEFAULT_TIE_BREAKER), | ||||||||||||||||||
| "0"); | ||||||||||||||||||
| queryBuilder = buildColumnMultiMatchV2(query); | ||||||||||||||||||
| } | ||||||||||||||||||
| os.org.opensearch.client.opensearch.core.search.Highlight highlighter = | ||||||||||||||||||
| buildHighlightsV2(List.of("name", "displayName", "description")); | ||||||||||||||||||
| return searchBuilderV2(queryBuilder, highlighter, from, size); | ||||||||||||||||||
| } | ||||||||||||||||||
|
|
||||||||||||||||||
| /** | ||||||||||||||||||
| * Multi-match used both by {@code index=tableColumn} and the column-scoped should clause in the | ||||||||||||||||||
| * {@code dataAsset} composite query. Uses {@link | ||||||||||||||||||
| * os.org.opensearch.client.opensearch._types.query_dsl.Operator#And} so every sub-token produced | ||||||||||||||||||
| * by {@code om_analyzer} (which splits on letter/digit/underscore boundaries) must hit some | ||||||||||||||||||
| * field. Without {@code And}, a query like {@code first_name} matches any column whose name | ||||||||||||||||||
| * contains just {@code first} or just {@code name}, which both inflates the column index hits | ||||||||||||||||||
| * and creates the dataAsset/tableColumn count mismatch tracked in github issue #3851. | ||||||||||||||||||
|
||||||||||||||||||
| * field. Without {@code And}, a query like {@code first_name} matches any column whose name | |
| * contains just {@code first} or just {@code name}, which both inflates the column index hits | |
| * and creates the dataAsset/tableColumn count mismatch tracked in github issue #3851. | |
| * field. Previously this query used the same helper with {@code Operator.Or} and a last | |
| * argument of {@code "0"}; with that OR-style query, a search like {@code first_name} matched | |
| * any column whose name contained just {@code first} or just {@code name}, which inflated the | |
| * column index hits and created the dataAsset/tableColumn count mismatch tracked in github | |
| * issue #3851. |
Copilot
AI
Apr 30, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
buildUnconfiguredAssetFallbackV2 adds one mustNot term(entityType=...) clause per configured type. With many configured asset types this can bloat the query and slow execution. Prefer a single mustNot with a terms query over the configuredTypes set (the query builder already supports termsQuery).
| for (String configured : configuredTypes) { | |
| fallback.mustNot(OpenSearchQueryBuilder.termQuery(ENTITY_TYPE_FIELD, configured)); | |
| if (!configuredTypes.isEmpty()) { | |
| fallback.mustNot( | |
| OpenSearchQueryBuilder.termsQuery( | |
| ENTITY_TYPE_FIELD, | |
| configuredTypes.stream().map(FieldValue::of).toList())); |
Copilot
AI
Apr 30, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The new all/dataAsset per-type-union behavior and the stricter column multi-match are not covered by tests in this PR. Please add/extend tests to ensure (a) dataAsset entityType bucket counts stay in sync with dedicated-index totals (especially tableColumn), and (b) underscore-split identifier queries (e.g. first_name) require all sub-tokens to match (no overmatching on just one token).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The Javadoc mentions the previous column builder used
min_should_match=0, but the underlyingmultiMatchQuery(..., tieBreaker, fuzziness)helper’s last argument isfuzziness(and nominimum_should_matchis set when fuzziness is "0"). Please update the comment to reflect the actual previous behavior so it doesn’t mislead future debugging/tuning.