From bd4d566d6b9f520f5fa82060fdd6a195b73db93e Mon Sep 17 00:00:00 2001 From: mohitdeuex Date: Thu, 30 Apr 2026 16:30:36 +0530 Subject: [PATCH 1/9] fix(search): align dataAsset aggregation counts with index=tableColumn totals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor the `dataAsset`/`all` alias query path so each entity-type bucket in the aggregation matches what its dedicated index returns for the same query. The composite asset config used to merge fields from every type, then apply phrase/ngram-fuzzy semantics to all docs; column docs got semantics different from `buildColumnSearchBuilderV2`, which is why the explore search bar's two calls disagreed on the tableColumn count. The new `buildAllAssetsSearchBuilderV2` builds a per-entity-type bool union: each clause is `filter(entityType=) must()`. The column branch reuses `buildColumnMultiMatchV2`; every other type goes through `buildBaseQueryV2` with its dedicated config; an extra `should` covers asset types in the `dataAsset` alias that lack a config (e.g. `glossary`, `apiCollection`) using the default config. Also tightens the column builder to `Operator.And` so multi-token queries like `first_name` require every sub-token to match somewhere — fixes the `om_analyzer`-driven over-match where the lenient `Or` + `min_should_match=0` variant matched any column whose name contained just `first` or just `name`. `ColumnSearchIndex.getFields()` gains `name.ngram`, `name.compound`, `displayName.ngram`, `displayName.compound` so prefix queries like `fir` still match column docs from both `index=tableColumn` and the dataAsset bucket. Includes `scripts/reproduce_column_agg_mismatch.sh` — multi-query probe that exits non-zero on any divergence between `index=dataAsset` (aggregation bucket) and `index=tableColumn` (total) for the same query. Issue: open-metadata/openmetadata-collate#3851 Co-Authored-By: Claude Opus 4.7 (1M context) --- .../ElasticSearchSourceBuilderFactory.java | 120 +++++++++++++++-- .../search/indexes/ColumnSearchIndex.java | 4 + .../OpenSearchSourceBuilderFactory.java | 125 ++++++++++++++++-- scripts/reproduce_column_agg_mismatch.sh | 99 ++++++++++++++ 4 files changed, 324 insertions(+), 24 deletions(-) create mode 100755 scripts/reproduce_column_agg_mismatch.sh diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java index ee62799dd9f2..0a3a2ae29210 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java @@ -18,9 +18,11 @@ import es.co.elastic.clients.elasticsearch.core.search.Highlight; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.stream.Collectors; import org.openmetadata.schema.api.search.Aggregation; import org.openmetadata.schema.api.search.AssetTypeConfiguration; @@ -53,6 +55,7 @@ public class ElasticSearchSourceBuilderFactory private static final String MATCH_TYPE_STANDARD = "standard"; private static final String INDEX_ALL = "all"; private static final String INDEX_DATA_ASSET = "dataAsset"; + private static final String ENTITY_TYPE_FIELD = "entityType"; private static final String MINIMUM_SHOULD_MATCH = "2<70%"; private static final float DEFAULT_TIE_BREAKER = 0.3f; private static final float DEFAULT_BOOST = 1.0f; @@ -341,9 +344,9 @@ public ElasticSearchRequestBuilder getSearchSourceBuilderV2( indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations); } - if (indexName.equals("all") || indexName.equals("dataAsset")) { - return buildDataAssetSearchBuilderV2( - indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations); + if (indexName.equals(INDEX_ALL) || indexName.equals(INDEX_DATA_ASSET)) { + return buildAllAssetsSearchBuilderV2( + searchQuery, fromOffset, size, includeExplain, includeAggregations); } return switch (indexName) { @@ -374,21 +377,33 @@ public ElasticSearchRequestBuilder buildColumnSearchBuilderV2(String query, int queryBuilder = es.co.elastic.clients.elasticsearch._types.query_dsl.Query.of(q -> q.matchAll(m -> m)); } else { - Map fields = ColumnSearchIndex.getFields(); - queryBuilder = - ElasticQueryBuilder.multiMatchQuery( - query, - fields, - es.co.elastic.clients.elasticsearch._types.query_dsl.TextQueryType.BestFields, - es.co.elastic.clients.elasticsearch._types.query_dsl.Operator.Or, - String.valueOf(DEFAULT_TIE_BREAKER), - "0"); + queryBuilder = buildColumnMultiMatchV2(query); } es.co.elastic.clients.elasticsearch.core.search.Highlight hb = buildHighlightsV2(List.of("name", "displayName", "description")); return searchBuilderV2(queryBuilder, hb, from, size); } + /** + * Multi-match used both by {@code index=tableColumn} and the column-scoped should clause in the + * {@code dataAsset} composite query. Uses {@link + * es.co.elastic.clients.elasticsearch._types.query_dsl.Operator#And} so every sub-token produced + * by {@code om_analyzer} must hit some field. Without {@code And}, a query like {@code + * first_name} matches any column whose name contains just {@code first} or just {@code name}, + * which both inflates the column index hits and creates the dataAsset/tableColumn count + * mismatch tracked in github issue #3851. + */ + private es.co.elastic.clients.elasticsearch._types.query_dsl.Query buildColumnMultiMatchV2( + String query) { + return ElasticQueryBuilder.multiMatchQuery( + query, + ColumnSearchIndex.getFields(), + es.co.elastic.clients.elasticsearch._types.query_dsl.TextQueryType.BestFields, + es.co.elastic.clients.elasticsearch._types.query_dsl.Operator.And, + String.valueOf(DEFAULT_TIE_BREAKER), + "0"); + } + public ElasticSearchRequestBuilder buildServiceSearchBuilderV2(String query, int from, int size) { es.co.elastic.clients.elasticsearch._types.query_dsl.Query queryBuilder = buildSearchQueryBuilderV2(query, SearchIndex.getDefaultFields()); @@ -437,6 +452,87 @@ public ElasticSearchRequestBuilder buildDataAssetSearchBuilderV2( return searchRequestBuilder; } + /** + * Build a search source for the {@code all} / {@code dataAsset} alias as a per-entity-type + * union: each asset type contributes a clause built with its own configuration (column docs go + * through {@link #buildColumnMultiMatchV2(String)}, every other type through {@link + * #buildBaseQueryV2(String, AssetTypeConfiguration)}), filtered by {@code entityType=}. + * Each entity-type bucket in the aggregation therefore equals what the dedicated index returns + * for the same query, by construction. Avoids the composite-config divergence behind + * github.com/open-metadata/openmetadata-collate#3851. + */ + public ElasticSearchRequestBuilder buildAllAssetsSearchBuilderV2( + String query, int from, int size, boolean explain, boolean includeAggregations) { + AssetTypeConfiguration compositeConfig = buildCompositeAssetConfig(searchSettings); + es.co.elastic.clients.elasticsearch._types.query_dsl.Query baseQuery = + buildPerTypeUnionQueryV2(query); + es.co.elastic.clients.elasticsearch._types.query_dsl.Query finalQuery = + applyFunctionScoringV2(baseQuery, compositeConfig); + es.co.elastic.clients.elasticsearch.core.search.Highlight highlightBuilder = + buildHighlightingIfNeededV2(query, compositeConfig); + + ElasticSearchRequestBuilder searchRequestBuilder = + createSearchSourceBuilderV2(finalQuery, from, size); + if (highlightBuilder != null) { + searchRequestBuilder.highlighter(highlightBuilder); + } + if (includeAggregations) { + addConfiguredAggregationsV2(searchRequestBuilder, compositeConfig); + } + searchRequestBuilder.explain(explain); + return searchRequestBuilder; + } + + private es.co.elastic.clients.elasticsearch._types.query_dsl.Query buildPerTypeUnionQueryV2( + String query) { + if (isMatchAllQuery(query)) { + return ElasticQueryBuilder.boolQuery().must(ElasticQueryBuilder.matchAllQuery()).build(); + } + ElasticQueryBuilder.BoolQueryBuilder union = ElasticQueryBuilder.boolQuery(); + Set configuredTypes = new HashSet<>(); + for (AssetTypeConfiguration typeConfig : searchSettings.getAssetTypeConfigurations()) { + String assetType = typeConfig.getAssetType(); + if (assetType == null || assetType.equals(INDEX_ALL)) { + continue; + } + configuredTypes.add(assetType); + union.should(buildAssetTypeClauseV2(query, assetType, typeConfig)); + } + union.should(buildUnconfiguredAssetFallbackV2(query, configuredTypes)); + union.minimumShouldMatch(1); + return union.build(); + } + + private static boolean isMatchAllQuery(String query) { + return query == null || query.trim().isEmpty() || query.trim().equals("*"); + } + + private es.co.elastic.clients.elasticsearch._types.query_dsl.Query buildAssetTypeClauseV2( + String query, String assetType, AssetTypeConfiguration typeConfig) { + es.co.elastic.clients.elasticsearch._types.query_dsl.Query inner = + Entity.TABLE_COLUMN.equals(assetType) + ? buildColumnMultiMatchV2(query) + : buildBaseQueryV2(query, typeConfig); + return ElasticQueryBuilder.boolQuery() + .filter(ElasticQueryBuilder.termQuery(ENTITY_TYPE_FIELD, assetType)) + .must(inner) + .build(); + } + + /** + * Catches asset types that are part of the {@code dataAsset} alias but lack a dedicated entry in + * {@code searchSettings.assetTypeConfigurations} (e.g. {@code glossary}, {@code apiCollection}). + */ + private es.co.elastic.clients.elasticsearch._types.query_dsl.Query + buildUnconfiguredAssetFallbackV2(String query, Set configuredTypes) { + ElasticQueryBuilder.BoolQueryBuilder fallback = + ElasticQueryBuilder.boolQuery().must(buildBaseQueryV2(query, getOrCreateDefaultConfig())); + for (String configured : configuredTypes) { + fallback.mustNot(ElasticQueryBuilder.termQuery(ENTITY_TYPE_FIELD, configured)); + } + return fallback.build(); + } + public ElasticSearchRequestBuilder buildAggregateSearchBuilderV2( String query, int from, int size) { return buildAggregateSearchBuilderV2(query, from, size, true); diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/indexes/ColumnSearchIndex.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/indexes/ColumnSearchIndex.java index 1b9a76525611..9e4ea9659488 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/indexes/ColumnSearchIndex.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/indexes/ColumnSearchIndex.java @@ -153,8 +153,12 @@ public static Map getFields() { Map fields = new HashMap<>(); fields.put("name", 10.0f); fields.put("name.keyword", 20.0f); + fields.put("name.ngram", 1.0f); + fields.put("name.compound", 5.0f); fields.put("displayName", 7.0f); fields.put("displayName.keyword", 20.0f); + fields.put("displayName.ngram", 1.0f); + fields.put("displayName.compound", 4.0f); fields.put("fullyQualifiedName", 5.0f); fields.put("description", 2.0f); fields.put("dataType", 3.0f); diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java index 7deb93c7d7da..fdd7bd107981 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java @@ -14,9 +14,11 @@ import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.stream.Collectors; import lombok.extern.slf4j.Slf4j; import org.openmetadata.schema.api.search.Aggregation; @@ -50,6 +52,7 @@ public class OpenSearchSourceBuilderFactory private static final String MATCH_TYPE_STANDARD = "standard"; private static final String INDEX_ALL = "all"; private static final String INDEX_DATA_ASSET = "dataAsset"; + private static final String ENTITY_TYPE_FIELD = "entityType"; private static final String MINIMUM_SHOULD_MATCH = "2<70%"; private static final float DEFAULT_TIE_BREAKER = 0.3f; private static final float DEFAULT_BOOST = 1.0f; @@ -322,9 +325,9 @@ public OpenSearchRequestBuilder getSearchSourceBuilderV2( indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations); } - if (indexName.equals("all") || indexName.equals("dataAsset")) { - return buildDataAssetSearchBuilderV2( - indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations); + if (indexName.equals(INDEX_ALL) || indexName.equals(INDEX_DATA_ASSET)) { + return buildAllAssetsSearchBuilderV2( + searchQuery, fromOffset, size, includeExplain, includeAggregations); } return switch (indexName) { @@ -374,21 +377,33 @@ public OpenSearchRequestBuilder buildColumnSearchBuilderV2(String query, int fro queryBuilder = os.org.opensearch.client.opensearch._types.query_dsl.Query.of(q -> q.matchAll(m -> m)); } else { - Map fields = ColumnSearchIndex.getFields(); - queryBuilder = - OpenSearchQueryBuilder.multiMatchQuery( - query, - fields, - os.org.opensearch.client.opensearch._types.query_dsl.TextQueryType.BestFields, - os.org.opensearch.client.opensearch._types.query_dsl.Operator.Or, - String.valueOf(DEFAULT_TIE_BREAKER), - "0"); + queryBuilder = buildColumnMultiMatchV2(query); } os.org.opensearch.client.opensearch.core.search.Highlight highlighter = buildHighlightsV2(List.of("name", "displayName", "description")); return searchBuilderV2(queryBuilder, highlighter, from, size); } + /** + * Multi-match used both by {@code index=tableColumn} and the column-scoped should clause in the + * {@code dataAsset} composite query. Uses {@link + * os.org.opensearch.client.opensearch._types.query_dsl.Operator#And} so every sub-token produced + * by {@code om_analyzer} (which splits on letter/digit/underscore boundaries) must hit some + * field. Without {@code And}, a query like {@code first_name} matches any column whose name + * contains just {@code first} or just {@code name}, which both inflates the column index hits + * and creates the dataAsset/tableColumn count mismatch tracked in github issue #3851. + */ + private os.org.opensearch.client.opensearch._types.query_dsl.Query buildColumnMultiMatchV2( + String query) { + return OpenSearchQueryBuilder.multiMatchQuery( + query, + ColumnSearchIndex.getFields(), + os.org.opensearch.client.opensearch._types.query_dsl.TextQueryType.BestFields, + os.org.opensearch.client.opensearch._types.query_dsl.Operator.And, + String.valueOf(DEFAULT_TIE_BREAKER), + "0"); + } + public OpenSearchRequestBuilder buildServiceSearchBuilderV2(String query, int from, int size) { os.org.opensearch.client.opensearch._types.query_dsl.Query queryBuilder = buildSearchQueryBuilderV2(query, SearchIndex.getDefaultFields()); @@ -452,6 +467,92 @@ public OpenSearchRequestBuilder buildDataAssetSearchBuilderV2( return searchRequestBuilder; } + /** + * Build a search source for the {@code all} / {@code dataAsset} alias as a per-entity-type + * union: each asset type contributes a clause built with its own configuration (column docs go + * through {@link #buildColumnMultiMatchV2(String)}, every other type through {@link + * #buildBaseQueryV2(String, AssetTypeConfiguration)}), filtered by {@code entityType=}. + * Each entity-type bucket in the aggregation therefore equals what the dedicated index returns + * for the same query, by construction. Avoids the composite-config divergence behind + * github.com/open-metadata/openmetadata-collate#3851. + */ + public OpenSearchRequestBuilder buildAllAssetsSearchBuilderV2( + String query, int from, int size, boolean explain, boolean includeAggregations) { + AssetTypeConfiguration compositeConfig = getOrBuildCompositeConfig(); + os.org.opensearch.client.opensearch._types.query_dsl.Query baseQuery = + buildPerTypeUnionQueryV2(query); + os.org.opensearch.client.opensearch._types.query_dsl.Query finalQuery = + applyFunctionScoringV2(baseQuery, compositeConfig); + os.org.opensearch.client.opensearch.core.search.Highlight highlightBuilder = + buildHighlightingIfNeededV2(query, compositeConfig); + + OpenSearchRequestBuilder searchRequestBuilder = + createSearchSourceBuilderV2(finalQuery, from, size); + if (highlightBuilder != null) { + searchRequestBuilder.highlighter(highlightBuilder); + } + if (includeAggregations) { + addConfiguredAggregationsV2(searchRequestBuilder, compositeConfig); + } + searchRequestBuilder.explain(explain); + return searchRequestBuilder; + } + + private os.org.opensearch.client.opensearch._types.query_dsl.Query buildPerTypeUnionQueryV2( + String query) { + if (isMatchAllQuery(query)) { + return OpenSearchQueryBuilder.boolQuery() + .must(OpenSearchQueryBuilder.matchAllQuery()) + .build(); + } + OpenSearchQueryBuilder.BoolQueryBuilder union = OpenSearchQueryBuilder.boolQuery(); + Set configuredTypes = new HashSet<>(); + for (AssetTypeConfiguration typeConfig : searchSettings.getAssetTypeConfigurations()) { + String assetType = typeConfig.getAssetType(); + if (assetType == null || assetType.equals(INDEX_ALL)) { + continue; + } + configuredTypes.add(assetType); + union.should(buildAssetTypeClauseV2(query, assetType, typeConfig)); + } + union.should(buildUnconfiguredAssetFallbackV2(query, configuredTypes)); + union.minimumShouldMatch(1); + return union.build(); + } + + private static boolean isMatchAllQuery(String query) { + return query == null || query.trim().isEmpty() || query.trim().equals("*"); + } + + private os.org.opensearch.client.opensearch._types.query_dsl.Query buildAssetTypeClauseV2( + String query, String assetType, AssetTypeConfiguration typeConfig) { + os.org.opensearch.client.opensearch._types.query_dsl.Query inner = + Entity.TABLE_COLUMN.equals(assetType) + ? buildColumnMultiMatchV2(query) + : buildBaseQueryV2(query, typeConfig); + return OpenSearchQueryBuilder.boolQuery() + .filter(OpenSearchQueryBuilder.termQuery(ENTITY_TYPE_FIELD, assetType)) + .must(inner) + .build(); + } + + /** + * Catches asset types that are part of the {@code dataAsset} alias but lack a dedicated entry in + * {@code searchSettings.assetTypeConfigurations} (e.g. {@code glossary}, {@code apiCollection}). + * Without this, docs of those types would silently disappear from the dataAsset alias after the + * per-type-union refactor. + */ + private os.org.opensearch.client.opensearch._types.query_dsl.Query + buildUnconfiguredAssetFallbackV2(String query, Set configuredTypes) { + OpenSearchQueryBuilder.BoolQueryBuilder fallback = + OpenSearchQueryBuilder.boolQuery() + .must(buildBaseQueryV2(query, getOrCreateDefaultConfig())); + for (String configured : configuredTypes) { + fallback.mustNot(OpenSearchQueryBuilder.termQuery(ENTITY_TYPE_FIELD, configured)); + } + return fallback.build(); + } + private os.org.opensearch.client.opensearch._types.query_dsl.Query buildBaseQueryV2( String query, AssetTypeConfiguration assetConfig) { if (query == null || query.trim().isEmpty() || query.trim().equals("*")) { diff --git a/scripts/reproduce_column_agg_mismatch.sh b/scripts/reproduce_column_agg_mismatch.sh new file mode 100755 index 000000000000..5601b13c6f1f --- /dev/null +++ b/scripts/reproduce_column_agg_mismatch.sh @@ -0,0 +1,99 @@ +#!/usr/bin/env bash +# Reproduces GitHub issue open-metadata/openmetadata-collate#3851: +# +# When the explore search bar fires its two requests for the same query — +# a) GET /api/v1/search/query?q=...&index=dataAsset&size=0 (entity-type aggregation) +# b) GET /api/v1/search/query?q=...&index=tableColumn&size=N (column hits) +# — the count for `tableColumn` reported by (a) does not match the total reported by (b). +# +# Root cause: +# `index=tableColumn` routes through `buildColumnSearchBuilderV2` (a lenient multi_match). +# `index=dataAsset` routes through `buildDataAssetSearchBuilderV2` with the composite asset +# config, which applies stricter phrase / 2<70% matching to column docs. The two query shapes +# produce different result sets even though both indexes contain the same column documents. +# +# Related secondary bug (the "_ issue"): +# The `om_analyzer` splits identifiers like `first_name` on letter/digit/underscore boundaries +# into ["first","name"]. With `Operator.Or` + `min_should_match=0` (the lenient column builder), +# any column whose name contains ONLY "first" or ONLY "name" is also returned, drowning out the +# actual relevant matches. +# +# This script does NOT create test assets — it probes the running catalog with a fixed list of +# queries that are known to expose the divergence on the bundled sample data, prints the counts +# side-by-side, and exits non-zero if any query shows a mismatch. Asset-based isolation lives +# in the regression test (ColumnSearchIndexIT). +# +# Usage: +# OM_HOST=http://localhost:8585 OM_TOKEN= ./reproduce_column_agg_mismatch.sh [extra-query] + +set -euo pipefail + +HOST="${OM_HOST:-http://localhost:8585}" +TOKEN="${OM_TOKEN:-eyJraWQiOiJHYjM4OWEtOWY3Ni1nZGpzLWE5MmotMDI0MmJrOTQzNTYiLCJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJzdWIiOiJhZG1pbiIsImlzQm90IjpmYWxzZSwiaXNzIjoib3Blbi1tZXRhZGF0YS5vcmciLCJpYXQiOjE2NjM5Mzg0NjIsImVtYWlsIjoiYWRtaW5Ab3Blbm1ldGFkYXRhLm9yZyJ9.tS8um_5DKu7HgzGBzS1VTA5uUjKWOCU0B_j08WXBiEC0mr0zNREkqVfwFDD-d24HlNEbrqioLsBuFRiwIWKc1m_ZlVQbG7P36RUxhuv2vbSp80FKyNM-Tj93FDzq91jsyNmsQhyNv_fNr3TXfzzSPjHt8Go0FMMP66weoKMgW2PbXlhVKwEuXUHyakLLzewm9UMeQaEiRzhiTMU3UkLXcKbYEJJvfNFcLwSl9W8JCO_l0Yj3ud-qt_nQYEZwqW6u5nfdQllN133iikV4fM5QZsMCnm8Rq1mvLR0y9bmJiD7fwM1tmJ791TUWqmKaTnP49U493VanKpUAfzIiOiIbhg}" + +QUERIES=( + "first_name" + "last_name" + "first name" + "shipping address" + "first name address" +) +[ "${1:-}" ] && QUERIES+=("$1") + +AUTH=(-H "Authorization: Bearer $TOKEN") + +probe() { + local q="$1" + local enc agg_file col_file + enc=$(python3 -c "import urllib.parse,sys;print(urllib.parse.quote(sys.argv[1]))" "$q") + agg_file=$(mktemp) + col_file=$(mktemp) + if ! curl -sf "${AUTH[@]}" \ + "$HOST/api/v1/search/query?q=$enc&index=dataAsset&from=0&size=0&deleted=false&track_total_hits=true&fetch_source=false" \ + -o "$agg_file" \ + || ! curl -sf "${AUTH[@]}" \ + "$HOST/api/v1/search/query?q=$enc&index=tableColumn&from=0&size=0&deleted=false&track_total_hits=true&fetch_source=false" \ + -o "$col_file"; then + echo " [ERR ] q=\"$q\" — search API call failed; is OM running at $HOST?" + rm -f "$agg_file" "$col_file" + return 2 + fi + python3 - "$q" "$agg_file" "$col_file" <<'PY' +import json, sys +q, agg_path, col_path = sys.argv[1], sys.argv[2], sys.argv[3] +with open(agg_path) as f: agg = json.load(f) +with open(col_path) as f: col = json.load(f) +agg_total = agg.get('hits', {}).get('total', {}).get('value', '?') +col_total = col.get('hits', {}).get('total', {}).get('value', '?') +buckets = agg.get('aggregations', {}).get('sterms#entityType', {}).get('buckets', []) +agg_tc = next((b.get('doc_count', 0) for b in buckets if b.get('key') == 'tableColumn'), 0) +status = "OK " if agg_tc == col_total else "DIFF" +print(f" [{status}] q={q!r:<26} dataAsset.total={agg_total:<6} agg.tableColumnBucket={agg_tc:<6} tableColumn.total={col_total}") +sys.exit(0 if agg_tc == col_total else 2) +PY + local rc=$? + rm -f "$agg_file" "$col_file" + return $rc +} + +echo "Host : $HOST" +if ! curl -sf -o /dev/null "${AUTH[@]}" "$HOST/api/v1/system/version"; then + echo "ERROR: $HOST is not reachable (or auth is wrong). Start OM and retry." + exit 4 +fi +echo "Probing ${#QUERIES[@]} queries; OK = bucket count matches index=tableColumn total." +echo "------------------------------------------------------------------" +fail=0 +for q in "${QUERIES[@]}"; do + probe "$q" || fail=1 +done +echo "------------------------------------------------------------------" + +ENC0=$(python3 -c "import urllib.parse,sys;print(urllib.parse.quote(sys.argv[1]))" "${QUERIES[0]}") +echo +echo "UI verification:" +echo " 1. Open $HOST/explore/tableColumn?search=$ENC0" +echo " 2. Note the count badge on the tableColumn tab vs the entity-type aggregation panel." +echo " Before the fix they diverge; after the fix they should match." + +exit "$fail" From f264c55762f439a36f63f94858ab83380f6b1752 Mon Sep 17 00:00:00 2001 From: mohitdeuex Date: Thu, 30 Apr 2026 16:35:54 +0530 Subject: [PATCH 2/9] test(search): add regression tests for dataAsset/tableColumn parity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds integration tests under ColumnSearchIndexIT that pin the behavior of the fix in PR #27846: - testDataAssetTableColumnAggregationMatchesTableColumnTotal: dataAsset bucket count for tableColumn equals index=tableColumn total for a multi-token query against seeded columns. - testColumnQueryRequiresAllSubtokensToMatch: query "_first_name" must match the seeded "_first_name" column but NOT "_first_id" — pins the Operator.And fix that closed the om_analyzer sub-token over-match. - testDataAssetTableBucketMatchesTableIndexTotal: same parity guarantee for the "table" entity-type bucket, exercising the per-type-union path for a non-column type. - testPrefixQueryMatchesViaNgramOnBothPaths: short prefix queries (e.g. the first few chars of the seeded tag) must match seeded columns via name.ngram and stay in parity across both endpoints. - testUnconfiguredAssetTypeFallbackMatchesViaDataAsset: a Glossary doc (an asset type without an explicit searchSettings.assetTypeConfigurations entry) must still surface via index=dataAsset, exercising the fallback should clause in buildPerTypeUnionQueryV2. Issue: open-metadata/openmetadata-collate#3851 Co-Authored-By: Claude Opus 4.7 (1M context) --- .../it/tests/ColumnSearchIndexIT.java | 315 ++++++++++++++++++ 1 file changed, 315 insertions(+) diff --git a/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java b/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java index 9ca30e3febdc..25da9ea227ff 100644 --- a/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java +++ b/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java @@ -20,8 +20,11 @@ import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; +import java.util.HashSet; import java.util.List; +import java.util.Set; import java.util.concurrent.TimeUnit; +import org.awaitility.Awaitility; import org.junit.jupiter.api.DisplayName; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; @@ -33,15 +36,18 @@ import org.openmetadata.it.util.TestNamespaceExtension; import org.openmetadata.schema.api.data.CreateDatabase; import org.openmetadata.schema.api.data.CreateDatabaseSchema; +import org.openmetadata.schema.api.data.CreateGlossary; import org.openmetadata.schema.api.data.CreateTable; import org.openmetadata.schema.entity.data.Database; import org.openmetadata.schema.entity.data.DatabaseSchema; +import org.openmetadata.schema.entity.data.Glossary; import org.openmetadata.schema.entity.data.Table; import org.openmetadata.schema.entity.services.DatabaseService; import org.openmetadata.schema.type.Column; import org.openmetadata.schema.type.ColumnDataType; import org.openmetadata.sdk.client.OpenMetadataClient; import org.openmetadata.sdk.fluent.DatabaseServices; +import org.openmetadata.service.Entity; /** * Integration tests for column search indexing during table reindexing. Verifies: @@ -353,6 +359,268 @@ void testColumnFilterByDatabase(TestNamespace ns) throws Exception { } } + @Nested + @DisplayName("DataAsset/TableColumn Aggregation Parity Tests") + @Execution(ExecutionMode.CONCURRENT) + class DataAssetColumnAggregationTests { + + /** + * Regression for github.com/open-metadata/openmetadata-collate/issues/3851. The UI fires two + * requests when the user types a multi-word query: {@code index=dataAsset&size=0} for the + * entity-type aggregation and {@code index=tableColumn} for the column hits. They route through + * different builders in {@code OpenSearchSourceBuilderFactory}: {@code tableColumn} uses the + * dedicated lenient column builder while {@code dataAsset} uses the composite asset config with + * stricter phrase/{@code 2<70%} matching. Without the fix, the {@code tableColumn} bucket in the + * {@code dataAsset} aggregation under-counts the column index hits. + */ + @Test + @DisplayName( + "Aggregation bucket for tableColumn under dataAsset must match index=tableColumn total") + void testDataAssetTableColumnAggregationMatchesTableColumnTotal(TestNamespace ns) + throws Exception { + OpenMetadataClient client = SdkClients.adminClient(); + + String tag = ns.shortPrefix(); + Table table = createTableWithMultiTokenColumns(ns, "agg_parity_" + tag, tag); + assertNotNull(table); + + String multiTokenQuery = tag + "_first " + tag + "_address"; + + Awaitility.await() + .atMost(90, TimeUnit.SECONDS) + .pollInterval(500, TimeUnit.MILLISECONDS) + .until( + () -> { + String r = + client + .search() + .query(multiTokenQuery) + .index("tableColumn") + .size(0) + .deleted(false) + .execute(); + JsonNode root = OBJECT_MAPPER.readTree(r); + long total = root.path("hits").path("total").path("value").asLong(-1); + return total >= 3; + }); + + String columnResponse = + client + .search() + .query(multiTokenQuery) + .index("tableColumn") + .size(0) + .deleted(false) + .execute(); + long columnTotal = + OBJECT_MAPPER.readTree(columnResponse).path("hits").path("total").path("value").asLong(); + + String aggResponse = + client + .search() + .query(multiTokenQuery) + .index("dataAsset") + .size(0) + .deleted(false) + .execute(); + JsonNode aggBuckets = + OBJECT_MAPPER + .readTree(aggResponse) + .path("aggregations") + .path("sterms#entityType") + .path("buckets"); + long aggColumnCount = 0; + for (JsonNode bucket : aggBuckets) { + if ("tableColumn".equals(bucket.path("key").asText())) { + aggColumnCount = bucket.path("doc_count").asLong(); + break; + } + } + + assertEquals( + columnTotal, + aggColumnCount, + "dataAsset aggregation tableColumn bucket (" + + aggColumnCount + + ") must match index=tableColumn total (" + + columnTotal + + ") for query \"" + + multiTokenQuery + + "\""); + } + + /** + * Guards against the {@code _} over-match: the {@code om_analyzer} splits {@code first_name} + * into {@code [first, name]}; with the old {@code Operator.Or} + {@code min_should_match=0} + * column builder, a column called {@code _first_id} matched a query of {@code + * _first_name} because the single token {@code first} was enough. The fix moves the + * column builder to {@code Operator.And}, so every sub-token must match. + */ + @Test + @DisplayName("Column query for first_name must not match a column called first_id") + void testColumnQueryRequiresAllSubtokensToMatch(TestNamespace ns) throws Exception { + OpenMetadataClient client = SdkClients.adminClient(); + String tag = ns.shortPrefix(); + Table table = createTableWithMultiTokenColumns(ns, "subtoken_" + tag, tag); + assertNotNull(table); + + String firstNameColumn = tag + "_first_name"; + String firstIdColumn = tag + "_first_id"; + + Awaitility.await() + .atMost(90, TimeUnit.SECONDS) + .pollInterval(500, TimeUnit.MILLISECONDS) + .until(() -> hitsForColumnQuery(client, firstNameColumn).contains(firstNameColumn)); + + Set hits = hitsForColumnQuery(client, firstNameColumn); + assertTrue( + hits.contains(firstNameColumn), + "Query " + firstNameColumn + " must match the column with the same name; got " + hits); + assertFalse( + hits.contains(firstIdColumn), + "Query " + + firstNameColumn + + " must not match column " + + firstIdColumn + + " (only the 'first' sub-token overlaps); got " + + hits); + } + + /** + * Same parity guarantee that {@link #testDataAssetTableColumnAggregationMatchesTableColumnTotal} + * pins for {@code tableColumn}, but for the {@code table} bucket. The per-type-union path must + * make every entity-type bucket equal to what the dedicated index returns. + */ + @Test + @DisplayName("Aggregation bucket for table under dataAsset must match index=table total") + void testDataAssetTableBucketMatchesTableIndexTotal(TestNamespace ns) throws Exception { + OpenMetadataClient client = SdkClients.adminClient(); + String tag = ns.shortPrefix(); + String tableQueryTag = "tblparity" + tag; + Table table = createTableWithMultiTokenColumns(ns, tableQueryTag, tag); + assertNotNull(table); + + Awaitility.await() + .atMost(90, TimeUnit.SECONDS) + .pollInterval(500, TimeUnit.MILLISECONDS) + .until(() -> totalHitsForIndex(client, table.getName(), "table") >= 1); + + long tableTotal = totalHitsForIndex(client, table.getName(), "table"); + long aggTableBucket = bucketCountFromDataAsset(client, table.getName(), "table"); + + assertEquals( + tableTotal, + aggTableBucket, + "dataAsset aggregation table bucket must equal index=table total for query " + + table.getName()); + } + + /** + * Pins prefix-style matching so it stays aligned across the two paths after the per-type-union + * refactor. Production behavior for short queries like {@code fir} relies on {@code name.ngram} + * and {@code name.compound}, which are now part of {@link + * org.openmetadata.service.search.indexes.ColumnSearchIndex#getFields()}; the dataAsset bucket + * for {@code tableColumn} must produce the same total as {@code index=tableColumn} for the + * same prefix query, and the seeded column must be in both result sets. + */ + @Test + @DisplayName("Prefix queries match via ngram and stay in parity across both paths") + void testPrefixQueryMatchesViaNgramOnBothPaths(TestNamespace ns) throws Exception { + OpenMetadataClient client = SdkClients.adminClient(); + String tag = ns.shortPrefix(); + Table table = createTableWithMultiTokenColumns(ns, "ngram_" + tag, tag); + assertNotNull(table); + + String prefixQuery = tag.substring(0, Math.min(4, tag.length())); + + Awaitility.await() + .atMost(90, TimeUnit.SECONDS) + .pollInterval(500, TimeUnit.MILLISECONDS) + .until(() -> totalHitsForIndex(client, prefixQuery, "tableColumn") >= 5); + + long columnTotal = totalHitsForIndex(client, prefixQuery, "tableColumn"); + long aggColumnBucket = bucketCountFromDataAsset(client, prefixQuery, Entity.TABLE_COLUMN); + + assertTrue( + columnTotal > 0, + "Prefix query " + prefixQuery + " should match seeded columns via name.ngram"); + assertEquals( + columnTotal, + aggColumnBucket, + "dataAsset tableColumn bucket (" + + aggColumnBucket + + ") must match index=tableColumn total (" + + columnTotal + + ") for prefix query " + + prefixQuery); + } + + /** + * Asset types that live in the {@code dataAsset} alias but lack a {@code + * searchSettings.assetTypeConfigurations} entry (e.g. {@code glossary}) used to be matched via + * the composite-config field merge. The per-type-union path adds a fallback {@code should} + * clause for these; this test pins that a glossary doc still appears in {@code + * index=dataAsset} hits when its name matches the query. + */ + @Test + @DisplayName("Asset types without dedicated config (glossary) still match via dataAsset alias") + void testUnconfiguredAssetTypeFallbackMatchesViaDataAsset(TestNamespace ns) throws Exception { + OpenMetadataClient client = SdkClients.adminClient(); + String name = ns.prefix("glossary_unconfigured_fallback"); + CreateGlossary req = new CreateGlossary().withName(name).withDescription(name); + Glossary glossary = client.glossaries().create(req); + assertNotNull(glossary); + + Awaitility.await() + .atMost(90, TimeUnit.SECONDS) + .pollInterval(500, TimeUnit.MILLISECONDS) + .until(() -> totalHitsForIndex(client, name, "dataAsset") >= 1); + + long total = totalHitsForIndex(client, name, "dataAsset"); + assertTrue( + total >= 1, + "Glossary doc with name " + + name + + " must be reachable via index=dataAsset (fallback clause for unconfigured types)"); + } + + private Set hitsForColumnQuery(OpenMetadataClient client, String query) + throws Exception { + String response = + client.search().query(query).index("tableColumn").size(50).deleted(false).execute(); + JsonNode hits = OBJECT_MAPPER.readTree(response).path("hits").path("hits"); + Set names = new HashSet<>(); + for (JsonNode hit : hits) { + names.add(hit.path("_source").path("name").asText()); + } + return names; + } + + private long totalHitsForIndex(OpenMetadataClient client, String query, String index) + throws Exception { + String response = client.search().query(query).index(index).size(0).deleted(false).execute(); + return OBJECT_MAPPER.readTree(response).path("hits").path("total").path("value").asLong(); + } + + private long bucketCountFromDataAsset( + OpenMetadataClient client, String query, String entityType) throws Exception { + String response = + client.search().query(query).index("dataAsset").size(0).deleted(false).execute(); + JsonNode buckets = + OBJECT_MAPPER + .readTree(response) + .path("aggregations") + .path("sterms#entityType") + .path("buckets"); + for (JsonNode bucket : buckets) { + if (entityType.equals(bucket.path("key").asText())) { + return bucket.path("doc_count").asLong(); + } + } + return 0L; + } + } + @Nested @DisplayName("Nested Column Tests") @Execution(ExecutionMode.CONCURRENT) @@ -493,6 +761,53 @@ private Table createTableWithColumns(TestNamespace ns, String baseName) { return SdkClients.adminClient().tables().create(tableRequest); } + private Table createTableWithMultiTokenColumns(TestNamespace ns, String baseName, String tag) { + String shortId = ns.shortPrefix(); + + org.openmetadata.schema.services.connections.database.PostgresConnection conn = + DatabaseServices.postgresConnection().hostPort("localhost:5432").username("test").build(); + + DatabaseService dbService = + DatabaseServices.builder() + .name("agg_svc_" + shortId + "_" + baseName) + .connection(conn) + .description("Test service for dataAsset/tableColumn aggregation parity") + .create(); + + CreateDatabase dbReq = new CreateDatabase(); + dbReq.setName("agg_db_" + shortId + "_" + baseName); + dbReq.setService(dbService.getFullyQualifiedName()); + Database database = SdkClients.adminClient().databases().create(dbReq); + + CreateDatabaseSchema schemaReq = new CreateDatabaseSchema(); + schemaReq.setName("agg_schema_" + shortId + "_" + baseName); + schemaReq.setDatabase(database.getFullyQualifiedName()); + DatabaseSchema schema = SdkClients.adminClient().databaseSchemas().create(schemaReq); + + CreateTable tableRequest = new CreateTable(); + tableRequest.setName(ns.prefix(baseName)); + tableRequest.setDatabaseSchema(schema.getFullyQualifiedName()); + tableRequest.setColumns( + List.of( + new Column() + .withName(tag + "_first_name") + .withDataType(ColumnDataType.VARCHAR) + .withDataLength(255) + .withDescription("First name"), + new Column() + .withName(tag + "_last_name") + .withDataType(ColumnDataType.VARCHAR) + .withDataLength(255) + .withDescription("Last name"), + new Column() + .withName(tag + "_address") + .withDataType(ColumnDataType.VARCHAR) + .withDataLength(512) + .withDescription("Postal address"))); + + return SdkClients.adminClient().tables().create(tableRequest); + } + private Table createTableWithNestedColumns(TestNamespace ns, String baseName) { String shortId = ns.shortPrefix(); From b2d40ff17131687a84e4e1d8f66c2943d7cae9da Mon Sep 17 00:00:00 2001 From: mohitdeuex Date: Thu, 30 Apr 2026 16:39:00 +0530 Subject: [PATCH 3/9] fix(search): align dataAsset aggregation counts with index=tableColumn totals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor the `dataAsset`/`all` alias query path so each entity-type bucket in the aggregation matches what its dedicated index returns for the same query. The composite asset config used to merge fields from every type, then apply phrase/ngram-fuzzy semantics to all docs; column docs got semantics different from `buildColumnSearchBuilderV2`, which is why the explore search bar's two calls disagreed on the tableColumn count. The new `buildAllAssetsSearchBuilderV2` builds a per-entity-type bool union: each clause is `filter(entityType=) must()`. The column branch reuses `buildColumnMultiMatchV2`; every other type goes through `buildBaseQueryV2` with its dedicated config; an extra `should` covers asset types in the `dataAsset` alias that lack a config (e.g. `glossary`, `apiCollection`) using the default config. Also tightens the column builder to `Operator.And` so multi-token queries like `first_name` require every sub-token to match somewhere — fixes the `om_analyzer`-driven over-match where the lenient `Or` + `min_should_match=0` variant matched any column whose name contained just `first` or just `name`. `ColumnSearchIndex.getFields()` gains `name.ngram`, `name.compound`, `displayName.ngram`, `displayName.compound` so prefix queries like `fir` still match column docs from both `index=tableColumn` and the dataAsset bucket. Issue: open-metadata/openmetadata-collate#3851 Co-Authored-By: Claude Opus 4.7 (1M context) --- .../ElasticSearchSourceBuilderFactory.java | 120 +++++++++++++++-- .../search/indexes/ColumnSearchIndex.java | 4 + .../OpenSearchSourceBuilderFactory.java | 125 ++++++++++++++++-- 3 files changed, 225 insertions(+), 24 deletions(-) diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java index ee62799dd9f2..0a3a2ae29210 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java @@ -18,9 +18,11 @@ import es.co.elastic.clients.elasticsearch.core.search.Highlight; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.stream.Collectors; import org.openmetadata.schema.api.search.Aggregation; import org.openmetadata.schema.api.search.AssetTypeConfiguration; @@ -53,6 +55,7 @@ public class ElasticSearchSourceBuilderFactory private static final String MATCH_TYPE_STANDARD = "standard"; private static final String INDEX_ALL = "all"; private static final String INDEX_DATA_ASSET = "dataAsset"; + private static final String ENTITY_TYPE_FIELD = "entityType"; private static final String MINIMUM_SHOULD_MATCH = "2<70%"; private static final float DEFAULT_TIE_BREAKER = 0.3f; private static final float DEFAULT_BOOST = 1.0f; @@ -341,9 +344,9 @@ public ElasticSearchRequestBuilder getSearchSourceBuilderV2( indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations); } - if (indexName.equals("all") || indexName.equals("dataAsset")) { - return buildDataAssetSearchBuilderV2( - indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations); + if (indexName.equals(INDEX_ALL) || indexName.equals(INDEX_DATA_ASSET)) { + return buildAllAssetsSearchBuilderV2( + searchQuery, fromOffset, size, includeExplain, includeAggregations); } return switch (indexName) { @@ -374,21 +377,33 @@ public ElasticSearchRequestBuilder buildColumnSearchBuilderV2(String query, int queryBuilder = es.co.elastic.clients.elasticsearch._types.query_dsl.Query.of(q -> q.matchAll(m -> m)); } else { - Map fields = ColumnSearchIndex.getFields(); - queryBuilder = - ElasticQueryBuilder.multiMatchQuery( - query, - fields, - es.co.elastic.clients.elasticsearch._types.query_dsl.TextQueryType.BestFields, - es.co.elastic.clients.elasticsearch._types.query_dsl.Operator.Or, - String.valueOf(DEFAULT_TIE_BREAKER), - "0"); + queryBuilder = buildColumnMultiMatchV2(query); } es.co.elastic.clients.elasticsearch.core.search.Highlight hb = buildHighlightsV2(List.of("name", "displayName", "description")); return searchBuilderV2(queryBuilder, hb, from, size); } + /** + * Multi-match used both by {@code index=tableColumn} and the column-scoped should clause in the + * {@code dataAsset} composite query. Uses {@link + * es.co.elastic.clients.elasticsearch._types.query_dsl.Operator#And} so every sub-token produced + * by {@code om_analyzer} must hit some field. Without {@code And}, a query like {@code + * first_name} matches any column whose name contains just {@code first} or just {@code name}, + * which both inflates the column index hits and creates the dataAsset/tableColumn count + * mismatch tracked in github issue #3851. + */ + private es.co.elastic.clients.elasticsearch._types.query_dsl.Query buildColumnMultiMatchV2( + String query) { + return ElasticQueryBuilder.multiMatchQuery( + query, + ColumnSearchIndex.getFields(), + es.co.elastic.clients.elasticsearch._types.query_dsl.TextQueryType.BestFields, + es.co.elastic.clients.elasticsearch._types.query_dsl.Operator.And, + String.valueOf(DEFAULT_TIE_BREAKER), + "0"); + } + public ElasticSearchRequestBuilder buildServiceSearchBuilderV2(String query, int from, int size) { es.co.elastic.clients.elasticsearch._types.query_dsl.Query queryBuilder = buildSearchQueryBuilderV2(query, SearchIndex.getDefaultFields()); @@ -437,6 +452,87 @@ public ElasticSearchRequestBuilder buildDataAssetSearchBuilderV2( return searchRequestBuilder; } + /** + * Build a search source for the {@code all} / {@code dataAsset} alias as a per-entity-type + * union: each asset type contributes a clause built with its own configuration (column docs go + * through {@link #buildColumnMultiMatchV2(String)}, every other type through {@link + * #buildBaseQueryV2(String, AssetTypeConfiguration)}), filtered by {@code entityType=}. + * Each entity-type bucket in the aggregation therefore equals what the dedicated index returns + * for the same query, by construction. Avoids the composite-config divergence behind + * github.com/open-metadata/openmetadata-collate#3851. + */ + public ElasticSearchRequestBuilder buildAllAssetsSearchBuilderV2( + String query, int from, int size, boolean explain, boolean includeAggregations) { + AssetTypeConfiguration compositeConfig = buildCompositeAssetConfig(searchSettings); + es.co.elastic.clients.elasticsearch._types.query_dsl.Query baseQuery = + buildPerTypeUnionQueryV2(query); + es.co.elastic.clients.elasticsearch._types.query_dsl.Query finalQuery = + applyFunctionScoringV2(baseQuery, compositeConfig); + es.co.elastic.clients.elasticsearch.core.search.Highlight highlightBuilder = + buildHighlightingIfNeededV2(query, compositeConfig); + + ElasticSearchRequestBuilder searchRequestBuilder = + createSearchSourceBuilderV2(finalQuery, from, size); + if (highlightBuilder != null) { + searchRequestBuilder.highlighter(highlightBuilder); + } + if (includeAggregations) { + addConfiguredAggregationsV2(searchRequestBuilder, compositeConfig); + } + searchRequestBuilder.explain(explain); + return searchRequestBuilder; + } + + private es.co.elastic.clients.elasticsearch._types.query_dsl.Query buildPerTypeUnionQueryV2( + String query) { + if (isMatchAllQuery(query)) { + return ElasticQueryBuilder.boolQuery().must(ElasticQueryBuilder.matchAllQuery()).build(); + } + ElasticQueryBuilder.BoolQueryBuilder union = ElasticQueryBuilder.boolQuery(); + Set configuredTypes = new HashSet<>(); + for (AssetTypeConfiguration typeConfig : searchSettings.getAssetTypeConfigurations()) { + String assetType = typeConfig.getAssetType(); + if (assetType == null || assetType.equals(INDEX_ALL)) { + continue; + } + configuredTypes.add(assetType); + union.should(buildAssetTypeClauseV2(query, assetType, typeConfig)); + } + union.should(buildUnconfiguredAssetFallbackV2(query, configuredTypes)); + union.minimumShouldMatch(1); + return union.build(); + } + + private static boolean isMatchAllQuery(String query) { + return query == null || query.trim().isEmpty() || query.trim().equals("*"); + } + + private es.co.elastic.clients.elasticsearch._types.query_dsl.Query buildAssetTypeClauseV2( + String query, String assetType, AssetTypeConfiguration typeConfig) { + es.co.elastic.clients.elasticsearch._types.query_dsl.Query inner = + Entity.TABLE_COLUMN.equals(assetType) + ? buildColumnMultiMatchV2(query) + : buildBaseQueryV2(query, typeConfig); + return ElasticQueryBuilder.boolQuery() + .filter(ElasticQueryBuilder.termQuery(ENTITY_TYPE_FIELD, assetType)) + .must(inner) + .build(); + } + + /** + * Catches asset types that are part of the {@code dataAsset} alias but lack a dedicated entry in + * {@code searchSettings.assetTypeConfigurations} (e.g. {@code glossary}, {@code apiCollection}). + */ + private es.co.elastic.clients.elasticsearch._types.query_dsl.Query + buildUnconfiguredAssetFallbackV2(String query, Set configuredTypes) { + ElasticQueryBuilder.BoolQueryBuilder fallback = + ElasticQueryBuilder.boolQuery().must(buildBaseQueryV2(query, getOrCreateDefaultConfig())); + for (String configured : configuredTypes) { + fallback.mustNot(ElasticQueryBuilder.termQuery(ENTITY_TYPE_FIELD, configured)); + } + return fallback.build(); + } + public ElasticSearchRequestBuilder buildAggregateSearchBuilderV2( String query, int from, int size) { return buildAggregateSearchBuilderV2(query, from, size, true); diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/indexes/ColumnSearchIndex.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/indexes/ColumnSearchIndex.java index 1b9a76525611..9e4ea9659488 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/indexes/ColumnSearchIndex.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/indexes/ColumnSearchIndex.java @@ -153,8 +153,12 @@ public static Map getFields() { Map fields = new HashMap<>(); fields.put("name", 10.0f); fields.put("name.keyword", 20.0f); + fields.put("name.ngram", 1.0f); + fields.put("name.compound", 5.0f); fields.put("displayName", 7.0f); fields.put("displayName.keyword", 20.0f); + fields.put("displayName.ngram", 1.0f); + fields.put("displayName.compound", 4.0f); fields.put("fullyQualifiedName", 5.0f); fields.put("description", 2.0f); fields.put("dataType", 3.0f); diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java index 7deb93c7d7da..fdd7bd107981 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java @@ -14,9 +14,11 @@ import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.stream.Collectors; import lombok.extern.slf4j.Slf4j; import org.openmetadata.schema.api.search.Aggregation; @@ -50,6 +52,7 @@ public class OpenSearchSourceBuilderFactory private static final String MATCH_TYPE_STANDARD = "standard"; private static final String INDEX_ALL = "all"; private static final String INDEX_DATA_ASSET = "dataAsset"; + private static final String ENTITY_TYPE_FIELD = "entityType"; private static final String MINIMUM_SHOULD_MATCH = "2<70%"; private static final float DEFAULT_TIE_BREAKER = 0.3f; private static final float DEFAULT_BOOST = 1.0f; @@ -322,9 +325,9 @@ public OpenSearchRequestBuilder getSearchSourceBuilderV2( indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations); } - if (indexName.equals("all") || indexName.equals("dataAsset")) { - return buildDataAssetSearchBuilderV2( - indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations); + if (indexName.equals(INDEX_ALL) || indexName.equals(INDEX_DATA_ASSET)) { + return buildAllAssetsSearchBuilderV2( + searchQuery, fromOffset, size, includeExplain, includeAggregations); } return switch (indexName) { @@ -374,21 +377,33 @@ public OpenSearchRequestBuilder buildColumnSearchBuilderV2(String query, int fro queryBuilder = os.org.opensearch.client.opensearch._types.query_dsl.Query.of(q -> q.matchAll(m -> m)); } else { - Map fields = ColumnSearchIndex.getFields(); - queryBuilder = - OpenSearchQueryBuilder.multiMatchQuery( - query, - fields, - os.org.opensearch.client.opensearch._types.query_dsl.TextQueryType.BestFields, - os.org.opensearch.client.opensearch._types.query_dsl.Operator.Or, - String.valueOf(DEFAULT_TIE_BREAKER), - "0"); + queryBuilder = buildColumnMultiMatchV2(query); } os.org.opensearch.client.opensearch.core.search.Highlight highlighter = buildHighlightsV2(List.of("name", "displayName", "description")); return searchBuilderV2(queryBuilder, highlighter, from, size); } + /** + * Multi-match used both by {@code index=tableColumn} and the column-scoped should clause in the + * {@code dataAsset} composite query. Uses {@link + * os.org.opensearch.client.opensearch._types.query_dsl.Operator#And} so every sub-token produced + * by {@code om_analyzer} (which splits on letter/digit/underscore boundaries) must hit some + * field. Without {@code And}, a query like {@code first_name} matches any column whose name + * contains just {@code first} or just {@code name}, which both inflates the column index hits + * and creates the dataAsset/tableColumn count mismatch tracked in github issue #3851. + */ + private os.org.opensearch.client.opensearch._types.query_dsl.Query buildColumnMultiMatchV2( + String query) { + return OpenSearchQueryBuilder.multiMatchQuery( + query, + ColumnSearchIndex.getFields(), + os.org.opensearch.client.opensearch._types.query_dsl.TextQueryType.BestFields, + os.org.opensearch.client.opensearch._types.query_dsl.Operator.And, + String.valueOf(DEFAULT_TIE_BREAKER), + "0"); + } + public OpenSearchRequestBuilder buildServiceSearchBuilderV2(String query, int from, int size) { os.org.opensearch.client.opensearch._types.query_dsl.Query queryBuilder = buildSearchQueryBuilderV2(query, SearchIndex.getDefaultFields()); @@ -452,6 +467,92 @@ public OpenSearchRequestBuilder buildDataAssetSearchBuilderV2( return searchRequestBuilder; } + /** + * Build a search source for the {@code all} / {@code dataAsset} alias as a per-entity-type + * union: each asset type contributes a clause built with its own configuration (column docs go + * through {@link #buildColumnMultiMatchV2(String)}, every other type through {@link + * #buildBaseQueryV2(String, AssetTypeConfiguration)}), filtered by {@code entityType=}. + * Each entity-type bucket in the aggregation therefore equals what the dedicated index returns + * for the same query, by construction. Avoids the composite-config divergence behind + * github.com/open-metadata/openmetadata-collate#3851. + */ + public OpenSearchRequestBuilder buildAllAssetsSearchBuilderV2( + String query, int from, int size, boolean explain, boolean includeAggregations) { + AssetTypeConfiguration compositeConfig = getOrBuildCompositeConfig(); + os.org.opensearch.client.opensearch._types.query_dsl.Query baseQuery = + buildPerTypeUnionQueryV2(query); + os.org.opensearch.client.opensearch._types.query_dsl.Query finalQuery = + applyFunctionScoringV2(baseQuery, compositeConfig); + os.org.opensearch.client.opensearch.core.search.Highlight highlightBuilder = + buildHighlightingIfNeededV2(query, compositeConfig); + + OpenSearchRequestBuilder searchRequestBuilder = + createSearchSourceBuilderV2(finalQuery, from, size); + if (highlightBuilder != null) { + searchRequestBuilder.highlighter(highlightBuilder); + } + if (includeAggregations) { + addConfiguredAggregationsV2(searchRequestBuilder, compositeConfig); + } + searchRequestBuilder.explain(explain); + return searchRequestBuilder; + } + + private os.org.opensearch.client.opensearch._types.query_dsl.Query buildPerTypeUnionQueryV2( + String query) { + if (isMatchAllQuery(query)) { + return OpenSearchQueryBuilder.boolQuery() + .must(OpenSearchQueryBuilder.matchAllQuery()) + .build(); + } + OpenSearchQueryBuilder.BoolQueryBuilder union = OpenSearchQueryBuilder.boolQuery(); + Set configuredTypes = new HashSet<>(); + for (AssetTypeConfiguration typeConfig : searchSettings.getAssetTypeConfigurations()) { + String assetType = typeConfig.getAssetType(); + if (assetType == null || assetType.equals(INDEX_ALL)) { + continue; + } + configuredTypes.add(assetType); + union.should(buildAssetTypeClauseV2(query, assetType, typeConfig)); + } + union.should(buildUnconfiguredAssetFallbackV2(query, configuredTypes)); + union.minimumShouldMatch(1); + return union.build(); + } + + private static boolean isMatchAllQuery(String query) { + return query == null || query.trim().isEmpty() || query.trim().equals("*"); + } + + private os.org.opensearch.client.opensearch._types.query_dsl.Query buildAssetTypeClauseV2( + String query, String assetType, AssetTypeConfiguration typeConfig) { + os.org.opensearch.client.opensearch._types.query_dsl.Query inner = + Entity.TABLE_COLUMN.equals(assetType) + ? buildColumnMultiMatchV2(query) + : buildBaseQueryV2(query, typeConfig); + return OpenSearchQueryBuilder.boolQuery() + .filter(OpenSearchQueryBuilder.termQuery(ENTITY_TYPE_FIELD, assetType)) + .must(inner) + .build(); + } + + /** + * Catches asset types that are part of the {@code dataAsset} alias but lack a dedicated entry in + * {@code searchSettings.assetTypeConfigurations} (e.g. {@code glossary}, {@code apiCollection}). + * Without this, docs of those types would silently disappear from the dataAsset alias after the + * per-type-union refactor. + */ + private os.org.opensearch.client.opensearch._types.query_dsl.Query + buildUnconfiguredAssetFallbackV2(String query, Set configuredTypes) { + OpenSearchQueryBuilder.BoolQueryBuilder fallback = + OpenSearchQueryBuilder.boolQuery() + .must(buildBaseQueryV2(query, getOrCreateDefaultConfig())); + for (String configured : configuredTypes) { + fallback.mustNot(OpenSearchQueryBuilder.termQuery(ENTITY_TYPE_FIELD, configured)); + } + return fallback.build(); + } + private os.org.opensearch.client.opensearch._types.query_dsl.Query buildBaseQueryV2( String query, AssetTypeConfiguration assetConfig) { if (query == null || query.trim().isEmpty() || query.trim().equals("*")) { From 6d079ab29e2e9bfb3cf0e4fd93cc2b2cc81ed7f8 Mon Sep 17 00:00:00 2001 From: mohitdeuex Date: Thu, 30 Apr 2026 16:39:00 +0530 Subject: [PATCH 4/9] test(search): add regression tests for dataAsset/tableColumn parity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds integration tests under ColumnSearchIndexIT that pin the behavior of the fix in PR #27846: - testDataAssetTableColumnAggregationMatchesTableColumnTotal: dataAsset bucket count for tableColumn equals index=tableColumn total for a multi-token query against seeded columns. - testColumnQueryRequiresAllSubtokensToMatch: query "_first_name" must match the seeded "_first_name" column but NOT "_first_id" — pins the Operator.And fix that closed the om_analyzer sub-token over-match. - testDataAssetTableBucketMatchesTableIndexTotal: same parity guarantee for the "table" entity-type bucket, exercising the per-type-union path for a non-column type. - testPrefixQueryMatchesViaNgramOnBothPaths: short prefix queries (e.g. the first few chars of the seeded tag) must match seeded columns via name.ngram and stay in parity across both endpoints. - testUnconfiguredAssetTypeFallbackMatchesViaDataAsset: a Glossary doc (an asset type without an explicit searchSettings.assetTypeConfigurations entry) must still surface via index=dataAsset, exercising the fallback should clause in buildPerTypeUnionQueryV2. Issue: open-metadata/openmetadata-collate#3851 Co-Authored-By: Claude Opus 4.7 (1M context) --- .../it/tests/ColumnSearchIndexIT.java | 315 ++++++++++++++++++ 1 file changed, 315 insertions(+) diff --git a/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java b/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java index 9ca30e3febdc..25da9ea227ff 100644 --- a/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java +++ b/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java @@ -20,8 +20,11 @@ import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; +import java.util.HashSet; import java.util.List; +import java.util.Set; import java.util.concurrent.TimeUnit; +import org.awaitility.Awaitility; import org.junit.jupiter.api.DisplayName; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; @@ -33,15 +36,18 @@ import org.openmetadata.it.util.TestNamespaceExtension; import org.openmetadata.schema.api.data.CreateDatabase; import org.openmetadata.schema.api.data.CreateDatabaseSchema; +import org.openmetadata.schema.api.data.CreateGlossary; import org.openmetadata.schema.api.data.CreateTable; import org.openmetadata.schema.entity.data.Database; import org.openmetadata.schema.entity.data.DatabaseSchema; +import org.openmetadata.schema.entity.data.Glossary; import org.openmetadata.schema.entity.data.Table; import org.openmetadata.schema.entity.services.DatabaseService; import org.openmetadata.schema.type.Column; import org.openmetadata.schema.type.ColumnDataType; import org.openmetadata.sdk.client.OpenMetadataClient; import org.openmetadata.sdk.fluent.DatabaseServices; +import org.openmetadata.service.Entity; /** * Integration tests for column search indexing during table reindexing. Verifies: @@ -353,6 +359,268 @@ void testColumnFilterByDatabase(TestNamespace ns) throws Exception { } } + @Nested + @DisplayName("DataAsset/TableColumn Aggregation Parity Tests") + @Execution(ExecutionMode.CONCURRENT) + class DataAssetColumnAggregationTests { + + /** + * Regression for github.com/open-metadata/openmetadata-collate/issues/3851. The UI fires two + * requests when the user types a multi-word query: {@code index=dataAsset&size=0} for the + * entity-type aggregation and {@code index=tableColumn} for the column hits. They route through + * different builders in {@code OpenSearchSourceBuilderFactory}: {@code tableColumn} uses the + * dedicated lenient column builder while {@code dataAsset} uses the composite asset config with + * stricter phrase/{@code 2<70%} matching. Without the fix, the {@code tableColumn} bucket in the + * {@code dataAsset} aggregation under-counts the column index hits. + */ + @Test + @DisplayName( + "Aggregation bucket for tableColumn under dataAsset must match index=tableColumn total") + void testDataAssetTableColumnAggregationMatchesTableColumnTotal(TestNamespace ns) + throws Exception { + OpenMetadataClient client = SdkClients.adminClient(); + + String tag = ns.shortPrefix(); + Table table = createTableWithMultiTokenColumns(ns, "agg_parity_" + tag, tag); + assertNotNull(table); + + String multiTokenQuery = tag + "_first " + tag + "_address"; + + Awaitility.await() + .atMost(90, TimeUnit.SECONDS) + .pollInterval(500, TimeUnit.MILLISECONDS) + .until( + () -> { + String r = + client + .search() + .query(multiTokenQuery) + .index("tableColumn") + .size(0) + .deleted(false) + .execute(); + JsonNode root = OBJECT_MAPPER.readTree(r); + long total = root.path("hits").path("total").path("value").asLong(-1); + return total >= 3; + }); + + String columnResponse = + client + .search() + .query(multiTokenQuery) + .index("tableColumn") + .size(0) + .deleted(false) + .execute(); + long columnTotal = + OBJECT_MAPPER.readTree(columnResponse).path("hits").path("total").path("value").asLong(); + + String aggResponse = + client + .search() + .query(multiTokenQuery) + .index("dataAsset") + .size(0) + .deleted(false) + .execute(); + JsonNode aggBuckets = + OBJECT_MAPPER + .readTree(aggResponse) + .path("aggregations") + .path("sterms#entityType") + .path("buckets"); + long aggColumnCount = 0; + for (JsonNode bucket : aggBuckets) { + if ("tableColumn".equals(bucket.path("key").asText())) { + aggColumnCount = bucket.path("doc_count").asLong(); + break; + } + } + + assertEquals( + columnTotal, + aggColumnCount, + "dataAsset aggregation tableColumn bucket (" + + aggColumnCount + + ") must match index=tableColumn total (" + + columnTotal + + ") for query \"" + + multiTokenQuery + + "\""); + } + + /** + * Guards against the {@code _} over-match: the {@code om_analyzer} splits {@code first_name} + * into {@code [first, name]}; with the old {@code Operator.Or} + {@code min_should_match=0} + * column builder, a column called {@code _first_id} matched a query of {@code + * _first_name} because the single token {@code first} was enough. The fix moves the + * column builder to {@code Operator.And}, so every sub-token must match. + */ + @Test + @DisplayName("Column query for first_name must not match a column called first_id") + void testColumnQueryRequiresAllSubtokensToMatch(TestNamespace ns) throws Exception { + OpenMetadataClient client = SdkClients.adminClient(); + String tag = ns.shortPrefix(); + Table table = createTableWithMultiTokenColumns(ns, "subtoken_" + tag, tag); + assertNotNull(table); + + String firstNameColumn = tag + "_first_name"; + String firstIdColumn = tag + "_first_id"; + + Awaitility.await() + .atMost(90, TimeUnit.SECONDS) + .pollInterval(500, TimeUnit.MILLISECONDS) + .until(() -> hitsForColumnQuery(client, firstNameColumn).contains(firstNameColumn)); + + Set hits = hitsForColumnQuery(client, firstNameColumn); + assertTrue( + hits.contains(firstNameColumn), + "Query " + firstNameColumn + " must match the column with the same name; got " + hits); + assertFalse( + hits.contains(firstIdColumn), + "Query " + + firstNameColumn + + " must not match column " + + firstIdColumn + + " (only the 'first' sub-token overlaps); got " + + hits); + } + + /** + * Same parity guarantee that {@link #testDataAssetTableColumnAggregationMatchesTableColumnTotal} + * pins for {@code tableColumn}, but for the {@code table} bucket. The per-type-union path must + * make every entity-type bucket equal to what the dedicated index returns. + */ + @Test + @DisplayName("Aggregation bucket for table under dataAsset must match index=table total") + void testDataAssetTableBucketMatchesTableIndexTotal(TestNamespace ns) throws Exception { + OpenMetadataClient client = SdkClients.adminClient(); + String tag = ns.shortPrefix(); + String tableQueryTag = "tblparity" + tag; + Table table = createTableWithMultiTokenColumns(ns, tableQueryTag, tag); + assertNotNull(table); + + Awaitility.await() + .atMost(90, TimeUnit.SECONDS) + .pollInterval(500, TimeUnit.MILLISECONDS) + .until(() -> totalHitsForIndex(client, table.getName(), "table") >= 1); + + long tableTotal = totalHitsForIndex(client, table.getName(), "table"); + long aggTableBucket = bucketCountFromDataAsset(client, table.getName(), "table"); + + assertEquals( + tableTotal, + aggTableBucket, + "dataAsset aggregation table bucket must equal index=table total for query " + + table.getName()); + } + + /** + * Pins prefix-style matching so it stays aligned across the two paths after the per-type-union + * refactor. Production behavior for short queries like {@code fir} relies on {@code name.ngram} + * and {@code name.compound}, which are now part of {@link + * org.openmetadata.service.search.indexes.ColumnSearchIndex#getFields()}; the dataAsset bucket + * for {@code tableColumn} must produce the same total as {@code index=tableColumn} for the + * same prefix query, and the seeded column must be in both result sets. + */ + @Test + @DisplayName("Prefix queries match via ngram and stay in parity across both paths") + void testPrefixQueryMatchesViaNgramOnBothPaths(TestNamespace ns) throws Exception { + OpenMetadataClient client = SdkClients.adminClient(); + String tag = ns.shortPrefix(); + Table table = createTableWithMultiTokenColumns(ns, "ngram_" + tag, tag); + assertNotNull(table); + + String prefixQuery = tag.substring(0, Math.min(4, tag.length())); + + Awaitility.await() + .atMost(90, TimeUnit.SECONDS) + .pollInterval(500, TimeUnit.MILLISECONDS) + .until(() -> totalHitsForIndex(client, prefixQuery, "tableColumn") >= 5); + + long columnTotal = totalHitsForIndex(client, prefixQuery, "tableColumn"); + long aggColumnBucket = bucketCountFromDataAsset(client, prefixQuery, Entity.TABLE_COLUMN); + + assertTrue( + columnTotal > 0, + "Prefix query " + prefixQuery + " should match seeded columns via name.ngram"); + assertEquals( + columnTotal, + aggColumnBucket, + "dataAsset tableColumn bucket (" + + aggColumnBucket + + ") must match index=tableColumn total (" + + columnTotal + + ") for prefix query " + + prefixQuery); + } + + /** + * Asset types that live in the {@code dataAsset} alias but lack a {@code + * searchSettings.assetTypeConfigurations} entry (e.g. {@code glossary}) used to be matched via + * the composite-config field merge. The per-type-union path adds a fallback {@code should} + * clause for these; this test pins that a glossary doc still appears in {@code + * index=dataAsset} hits when its name matches the query. + */ + @Test + @DisplayName("Asset types without dedicated config (glossary) still match via dataAsset alias") + void testUnconfiguredAssetTypeFallbackMatchesViaDataAsset(TestNamespace ns) throws Exception { + OpenMetadataClient client = SdkClients.adminClient(); + String name = ns.prefix("glossary_unconfigured_fallback"); + CreateGlossary req = new CreateGlossary().withName(name).withDescription(name); + Glossary glossary = client.glossaries().create(req); + assertNotNull(glossary); + + Awaitility.await() + .atMost(90, TimeUnit.SECONDS) + .pollInterval(500, TimeUnit.MILLISECONDS) + .until(() -> totalHitsForIndex(client, name, "dataAsset") >= 1); + + long total = totalHitsForIndex(client, name, "dataAsset"); + assertTrue( + total >= 1, + "Glossary doc with name " + + name + + " must be reachable via index=dataAsset (fallback clause for unconfigured types)"); + } + + private Set hitsForColumnQuery(OpenMetadataClient client, String query) + throws Exception { + String response = + client.search().query(query).index("tableColumn").size(50).deleted(false).execute(); + JsonNode hits = OBJECT_MAPPER.readTree(response).path("hits").path("hits"); + Set names = new HashSet<>(); + for (JsonNode hit : hits) { + names.add(hit.path("_source").path("name").asText()); + } + return names; + } + + private long totalHitsForIndex(OpenMetadataClient client, String query, String index) + throws Exception { + String response = client.search().query(query).index(index).size(0).deleted(false).execute(); + return OBJECT_MAPPER.readTree(response).path("hits").path("total").path("value").asLong(); + } + + private long bucketCountFromDataAsset( + OpenMetadataClient client, String query, String entityType) throws Exception { + String response = + client.search().query(query).index("dataAsset").size(0).deleted(false).execute(); + JsonNode buckets = + OBJECT_MAPPER + .readTree(response) + .path("aggregations") + .path("sterms#entityType") + .path("buckets"); + for (JsonNode bucket : buckets) { + if (entityType.equals(bucket.path("key").asText())) { + return bucket.path("doc_count").asLong(); + } + } + return 0L; + } + } + @Nested @DisplayName("Nested Column Tests") @Execution(ExecutionMode.CONCURRENT) @@ -493,6 +761,53 @@ private Table createTableWithColumns(TestNamespace ns, String baseName) { return SdkClients.adminClient().tables().create(tableRequest); } + private Table createTableWithMultiTokenColumns(TestNamespace ns, String baseName, String tag) { + String shortId = ns.shortPrefix(); + + org.openmetadata.schema.services.connections.database.PostgresConnection conn = + DatabaseServices.postgresConnection().hostPort("localhost:5432").username("test").build(); + + DatabaseService dbService = + DatabaseServices.builder() + .name("agg_svc_" + shortId + "_" + baseName) + .connection(conn) + .description("Test service for dataAsset/tableColumn aggregation parity") + .create(); + + CreateDatabase dbReq = new CreateDatabase(); + dbReq.setName("agg_db_" + shortId + "_" + baseName); + dbReq.setService(dbService.getFullyQualifiedName()); + Database database = SdkClients.adminClient().databases().create(dbReq); + + CreateDatabaseSchema schemaReq = new CreateDatabaseSchema(); + schemaReq.setName("agg_schema_" + shortId + "_" + baseName); + schemaReq.setDatabase(database.getFullyQualifiedName()); + DatabaseSchema schema = SdkClients.adminClient().databaseSchemas().create(schemaReq); + + CreateTable tableRequest = new CreateTable(); + tableRequest.setName(ns.prefix(baseName)); + tableRequest.setDatabaseSchema(schema.getFullyQualifiedName()); + tableRequest.setColumns( + List.of( + new Column() + .withName(tag + "_first_name") + .withDataType(ColumnDataType.VARCHAR) + .withDataLength(255) + .withDescription("First name"), + new Column() + .withName(tag + "_last_name") + .withDataType(ColumnDataType.VARCHAR) + .withDataLength(255) + .withDescription("Last name"), + new Column() + .withName(tag + "_address") + .withDataType(ColumnDataType.VARCHAR) + .withDataLength(512) + .withDescription("Postal address"))); + + return SdkClients.adminClient().tables().create(tableRequest); + } + private Table createTableWithNestedColumns(TestNamespace ns, String baseName) { String shortId = ns.shortPrefix(); From caf858063267d74ef9729a70026d24462764393d Mon Sep 17 00:00:00 2001 From: mohitdeuex Date: Thu, 30 Apr 2026 16:43:21 +0530 Subject: [PATCH 5/9] remove local script --- scripts/reproduce_column_agg_mismatch.sh | 99 ------------------------ 1 file changed, 99 deletions(-) delete mode 100755 scripts/reproduce_column_agg_mismatch.sh diff --git a/scripts/reproduce_column_agg_mismatch.sh b/scripts/reproduce_column_agg_mismatch.sh deleted file mode 100755 index 5601b13c6f1f..000000000000 --- a/scripts/reproduce_column_agg_mismatch.sh +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env bash -# Reproduces GitHub issue open-metadata/openmetadata-collate#3851: -# -# When the explore search bar fires its two requests for the same query — -# a) GET /api/v1/search/query?q=...&index=dataAsset&size=0 (entity-type aggregation) -# b) GET /api/v1/search/query?q=...&index=tableColumn&size=N (column hits) -# — the count for `tableColumn` reported by (a) does not match the total reported by (b). -# -# Root cause: -# `index=tableColumn` routes through `buildColumnSearchBuilderV2` (a lenient multi_match). -# `index=dataAsset` routes through `buildDataAssetSearchBuilderV2` with the composite asset -# config, which applies stricter phrase / 2<70% matching to column docs. The two query shapes -# produce different result sets even though both indexes contain the same column documents. -# -# Related secondary bug (the "_ issue"): -# The `om_analyzer` splits identifiers like `first_name` on letter/digit/underscore boundaries -# into ["first","name"]. With `Operator.Or` + `min_should_match=0` (the lenient column builder), -# any column whose name contains ONLY "first" or ONLY "name" is also returned, drowning out the -# actual relevant matches. -# -# This script does NOT create test assets — it probes the running catalog with a fixed list of -# queries that are known to expose the divergence on the bundled sample data, prints the counts -# side-by-side, and exits non-zero if any query shows a mismatch. Asset-based isolation lives -# in the regression test (ColumnSearchIndexIT). -# -# Usage: -# OM_HOST=http://localhost:8585 OM_TOKEN= ./reproduce_column_agg_mismatch.sh [extra-query] - -set -euo pipefail - -HOST="${OM_HOST:-http://localhost:8585}" -TOKEN="${OM_TOKEN:-eyJraWQiOiJHYjM4OWEtOWY3Ni1nZGpzLWE5MmotMDI0MmJrOTQzNTYiLCJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJzdWIiOiJhZG1pbiIsImlzQm90IjpmYWxzZSwiaXNzIjoib3Blbi1tZXRhZGF0YS5vcmciLCJpYXQiOjE2NjM5Mzg0NjIsImVtYWlsIjoiYWRtaW5Ab3Blbm1ldGFkYXRhLm9yZyJ9.tS8um_5DKu7HgzGBzS1VTA5uUjKWOCU0B_j08WXBiEC0mr0zNREkqVfwFDD-d24HlNEbrqioLsBuFRiwIWKc1m_ZlVQbG7P36RUxhuv2vbSp80FKyNM-Tj93FDzq91jsyNmsQhyNv_fNr3TXfzzSPjHt8Go0FMMP66weoKMgW2PbXlhVKwEuXUHyakLLzewm9UMeQaEiRzhiTMU3UkLXcKbYEJJvfNFcLwSl9W8JCO_l0Yj3ud-qt_nQYEZwqW6u5nfdQllN133iikV4fM5QZsMCnm8Rq1mvLR0y9bmJiD7fwM1tmJ791TUWqmKaTnP49U493VanKpUAfzIiOiIbhg}" - -QUERIES=( - "first_name" - "last_name" - "first name" - "shipping address" - "first name address" -) -[ "${1:-}" ] && QUERIES+=("$1") - -AUTH=(-H "Authorization: Bearer $TOKEN") - -probe() { - local q="$1" - local enc agg_file col_file - enc=$(python3 -c "import urllib.parse,sys;print(urllib.parse.quote(sys.argv[1]))" "$q") - agg_file=$(mktemp) - col_file=$(mktemp) - if ! curl -sf "${AUTH[@]}" \ - "$HOST/api/v1/search/query?q=$enc&index=dataAsset&from=0&size=0&deleted=false&track_total_hits=true&fetch_source=false" \ - -o "$agg_file" \ - || ! curl -sf "${AUTH[@]}" \ - "$HOST/api/v1/search/query?q=$enc&index=tableColumn&from=0&size=0&deleted=false&track_total_hits=true&fetch_source=false" \ - -o "$col_file"; then - echo " [ERR ] q=\"$q\" — search API call failed; is OM running at $HOST?" - rm -f "$agg_file" "$col_file" - return 2 - fi - python3 - "$q" "$agg_file" "$col_file" <<'PY' -import json, sys -q, agg_path, col_path = sys.argv[1], sys.argv[2], sys.argv[3] -with open(agg_path) as f: agg = json.load(f) -with open(col_path) as f: col = json.load(f) -agg_total = agg.get('hits', {}).get('total', {}).get('value', '?') -col_total = col.get('hits', {}).get('total', {}).get('value', '?') -buckets = agg.get('aggregations', {}).get('sterms#entityType', {}).get('buckets', []) -agg_tc = next((b.get('doc_count', 0) for b in buckets if b.get('key') == 'tableColumn'), 0) -status = "OK " if agg_tc == col_total else "DIFF" -print(f" [{status}] q={q!r:<26} dataAsset.total={agg_total:<6} agg.tableColumnBucket={agg_tc:<6} tableColumn.total={col_total}") -sys.exit(0 if agg_tc == col_total else 2) -PY - local rc=$? - rm -f "$agg_file" "$col_file" - return $rc -} - -echo "Host : $HOST" -if ! curl -sf -o /dev/null "${AUTH[@]}" "$HOST/api/v1/system/version"; then - echo "ERROR: $HOST is not reachable (or auth is wrong). Start OM and retry." - exit 4 -fi -echo "Probing ${#QUERIES[@]} queries; OK = bucket count matches index=tableColumn total." -echo "------------------------------------------------------------------" -fail=0 -for q in "${QUERIES[@]}"; do - probe "$q" || fail=1 -done -echo "------------------------------------------------------------------" - -ENC0=$(python3 -c "import urllib.parse,sys;print(urllib.parse.quote(sys.argv[1]))" "${QUERIES[0]}") -echo -echo "UI verification:" -echo " 1. Open $HOST/explore/tableColumn?search=$ENC0" -echo " 2. Note the count badge on the tableColumn tab vs the entity-type aggregation panel." -echo " Before the fix they diverge; after the fix they should match." - -exit "$fail" From e8826079f7443d47baa97e288929ceca284b441d Mon Sep 17 00:00:00 2001 From: mohitdeuex Date: Thu, 30 Apr 2026 16:47:32 +0530 Subject: [PATCH 6/9] test(search): add complex-syntax and topic-parity coverage - testTopicBucketAndResultSetMatchIndexTopic: pins parity for a non-column entity type at both the count and the FQN-set level, exercising the per-type-union path for `topic`. The dataAsset alias must return the same set of topic FQNs that index=topic returns for the same query. - testComplexSyntaxQueriesKeepParity: runs four representative complex- syntax shapes (quoted phrase, AND, OR, mixed parenthesised) through both endpoints and asserts the tableColumn bucket count equals index=tableColumn total. Issue: open-metadata/openmetadata-collate#3851 Co-Authored-By: Claude Opus 4.7 (1M context) --- .../it/tests/ColumnSearchIndexIT.java | 126 ++++++++++++++++++ 1 file changed, 126 insertions(+) diff --git a/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java b/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java index 25da9ea227ff..20c3d8110422 100644 --- a/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java +++ b/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java @@ -42,6 +42,7 @@ import org.openmetadata.schema.entity.data.DatabaseSchema; import org.openmetadata.schema.entity.data.Glossary; import org.openmetadata.schema.entity.data.Table; +import org.openmetadata.schema.entity.data.Topic; import org.openmetadata.schema.entity.services.DatabaseService; import org.openmetadata.schema.type.Column; import org.openmetadata.schema.type.ColumnDataType; @@ -584,6 +585,81 @@ void testUnconfiguredAssetTypeFallbackMatchesViaDataAsset(TestNamespace ns) thro + " must be reachable via index=dataAsset (fallback clause for unconfigured types)"); } + /** + * Pins parity for a non-column entity type at both the count and the FQN-set level. Topic is + * chosen to exercise a different per-type clause than {@code table}, which already has its own + * test. The {@code dataAsset} alias must return the same set of topic FQNs that {@code + * index=topic} returns for the same query, with matching counts — otherwise the explore tab's + * topic count and the topic-tab results would disagree. + */ + @Test + @DisplayName("Topic bucket and topic FQN set must match index=topic for same query") + void testTopicBucketAndResultSetMatchIndexTopic(TestNamespace ns) throws Exception { + OpenMetadataClient client = SdkClients.adminClient(); + String tag = ns.shortPrefix(); + Topic topic = createUniqueTopic(ns, "topicparity_" + tag, tag); + assertNotNull(topic); + + Awaitility.await() + .atMost(90, TimeUnit.SECONDS) + .pollInterval(500, TimeUnit.MILLISECONDS) + .until(() -> totalHitsForIndex(client, tag, "topic") >= 1); + + long topicTotal = totalHitsForIndex(client, tag, "topic"); + long aggTopicBucket = bucketCountFromDataAsset(client, tag, Entity.TOPIC); + assertEquals( + topicTotal, + aggTopicBucket, + "dataAsset topic bucket must equal index=topic total for query " + tag); + + Set topicFqns = fqnsForIndex(client, tag, "topic"); + Set dataAssetTopicFqns = fqnsFromDataAssetForType(client, tag, Entity.TOPIC); + assertEquals( + topicFqns, + dataAssetTopicFqns, + "Topic FQNs returned by index=topic must equal topic-typed FQNs returned by " + + "index=dataAsset for the same query"); + } + + /** + * Complex syntax queries (quoted phrases, AND/OR operators) flow through {@code + * buildBaseQueryV2 -> buildComplexSyntaxQueryV2} for non-column types and through the column + * builder for {@code tableColumn}. Both paths run against both endpoints — the parity + * guarantee from the per-type-union refactor must hold here too. This test exercises three + * representative shapes and asserts count parity for the {@code tableColumn} bucket. + */ + @Test + @DisplayName("Complex-syntax queries keep dataAsset/tableColumn parity") + void testComplexSyntaxQueriesKeepParity(TestNamespace ns) throws Exception { + OpenMetadataClient client = SdkClients.adminClient(); + String tag = ns.shortPrefix(); + Table table = createTableWithMultiTokenColumns(ns, "complex_" + tag, tag); + assertNotNull(table); + + Awaitility.await() + .atMost(90, TimeUnit.SECONDS) + .pollInterval(500, TimeUnit.MILLISECONDS) + .until(() -> totalHitsForIndex(client, tag, "tableColumn") >= 5); + + List queries = + List.of( + "\"" + tag + " alpha\"", + tag + " AND alpha", + tag + " OR alpha", + "(" + tag + " AND alpha) OR (" + tag + " AND bravo)"); + + for (String complexQuery : queries) { + long columnTotal = totalHitsForIndex(client, complexQuery, "tableColumn"); + long aggBucket = bucketCountFromDataAsset(client, complexQuery, Entity.TABLE_COLUMN); + assertEquals( + columnTotal, + aggBucket, + "dataAsset tableColumn bucket must equal index=tableColumn total for complex " + + "query " + + complexQuery); + } + } + private Set hitsForColumnQuery(OpenMetadataClient client, String query) throws Exception { String response = @@ -596,6 +672,56 @@ private Set hitsForColumnQuery(OpenMetadataClient client, String query) return names; } + private Set fqnsForIndex(OpenMetadataClient client, String query, String index) + throws Exception { + String response = + client.search().query(query).index(index).size(200).deleted(false).execute(); + JsonNode hits = OBJECT_MAPPER.readTree(response).path("hits").path("hits"); + Set fqns = new HashSet<>(); + for (JsonNode hit : hits) { + fqns.add(hit.path("_source").path("fullyQualifiedName").asText()); + } + return fqns; + } + + private Set fqnsFromDataAssetForType( + OpenMetadataClient client, String query, String entityType) throws Exception { + String response = + client.search().query(query).index("dataAsset").size(200).deleted(false).execute(); + JsonNode hits = OBJECT_MAPPER.readTree(response).path("hits").path("hits"); + Set fqns = new HashSet<>(); + for (JsonNode hit : hits) { + if (entityType.equals(hit.path("_source").path("entityType").asText())) { + fqns.add(hit.path("_source").path("fullyQualifiedName").asText()); + } + } + return fqns; + } + + private Topic createUniqueTopic(TestNamespace ns, String baseName, String tag) { + String shortId = ns.shortPrefix(); + org.openmetadata.schema.services.connections.messaging.KafkaConnection kafkaConn = + new org.openmetadata.schema.services.connections.messaging.KafkaConnection() + .withBootstrapServers("localhost:9092"); + org.openmetadata.schema.api.services.CreateMessagingService msgSvcReq = + new org.openmetadata.schema.api.services.CreateMessagingService() + .withName("topic_parity_svc_" + shortId + "_" + baseName) + .withServiceType( + org.openmetadata.schema.api.services.CreateMessagingService.MessagingServiceType + .Kafka) + .withConnection( + new org.openmetadata.schema.type.MessagingConnection().withConfig(kafkaConn)); + org.openmetadata.schema.entity.services.MessagingService msgService = + SdkClients.adminClient().messagingServices().create(msgSvcReq); + + org.openmetadata.schema.api.data.CreateTopic topicRequest = + new org.openmetadata.schema.api.data.CreateTopic() + .withName(ns.prefix(tag + "_topic_" + baseName)) + .withService(msgService.getFullyQualifiedName()) + .withPartitions(1); + return SdkClients.adminClient().topics().create(topicRequest); + } + private long totalHitsForIndex(OpenMetadataClient client, String query, String index) throws Exception { String response = client.search().query(query).index(index).size(0).deleted(false).execute(); From 4a8011b5ca1af134668004928fc2c248452bc48b Mon Sep 17 00:00:00 2001 From: mohitdeuex Date: Thu, 30 Apr 2026 16:52:49 +0530 Subject: [PATCH 7/9] fix(search): align dataAsset aggregation counts with index=tableColumn totals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor the `dataAsset`/`all` alias query path so each entity-type bucket in the aggregation matches what its dedicated index returns for the same query. The composite asset config used to merge fields from every type, then apply phrase/ngram-fuzzy semantics to all docs; column docs got semantics different from `buildColumnSearchBuilderV2`, which is why the explore search bar's two calls disagreed on the tableColumn count. The new `buildAllAssetsSearchBuilderV2` builds a per-entity-type bool union: each clause is `filter(entityType=) must()`. The column branch reuses `buildColumnMultiMatchV2`; every other type goes through `buildBaseQueryV2` with its dedicated config; an extra `should` covers asset types in the `dataAsset` alias that lack a config (e.g. `glossary`, `apiCollection`) using the default config. Also tightens the column builder to `Operator.And` so multi-token queries like `first_name` require every sub-token to match somewhere — fixes the `om_analyzer`-driven over-match where the lenient `Or` + `min_should_match=0` variant matched any column whose name contained just `first` or just `name`. `ColumnSearchIndex.getFields()` gains `name.ngram`, `name.compound`, `displayName.ngram`, `displayName.compound` so prefix queries like `fir` still match column docs from both `index=tableColumn` and the dataAsset bucket. Adds integration tests in ColumnSearchIndexIT covering: - multi-token query parity for the tableColumn bucket vs index=tableColumn - sub-token over-match guard (query `_first_name` must not match a seeded `_first_id` column) - table bucket parity (count) and topic parity (count + FQN-set match) - prefix queries via `name.ngram` keeping parity across both endpoints - complex-syntax queries (quoted phrase, AND/OR, mixed parenthesised) keeping the tableColumn bucket parity - unconfigured asset type fallback (Glossary docs reachable via dataAsset) Issue: open-metadata/openmetadata-collate#3851 Co-Authored-By: Claude Opus 4.7 (1M context) --- .../it/tests/ColumnSearchIndexIT.java | 441 ++++++++++++++++++ .../ElasticSearchSourceBuilderFactory.java | 120 ++++- .../search/indexes/ColumnSearchIndex.java | 4 + .../OpenSearchSourceBuilderFactory.java | 125 ++++- 4 files changed, 666 insertions(+), 24 deletions(-) diff --git a/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java b/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java index 9ca30e3febdc..20c3d8110422 100644 --- a/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java +++ b/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java @@ -20,8 +20,11 @@ import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; +import java.util.HashSet; import java.util.List; +import java.util.Set; import java.util.concurrent.TimeUnit; +import org.awaitility.Awaitility; import org.junit.jupiter.api.DisplayName; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; @@ -33,15 +36,19 @@ import org.openmetadata.it.util.TestNamespaceExtension; import org.openmetadata.schema.api.data.CreateDatabase; import org.openmetadata.schema.api.data.CreateDatabaseSchema; +import org.openmetadata.schema.api.data.CreateGlossary; import org.openmetadata.schema.api.data.CreateTable; import org.openmetadata.schema.entity.data.Database; import org.openmetadata.schema.entity.data.DatabaseSchema; +import org.openmetadata.schema.entity.data.Glossary; import org.openmetadata.schema.entity.data.Table; +import org.openmetadata.schema.entity.data.Topic; import org.openmetadata.schema.entity.services.DatabaseService; import org.openmetadata.schema.type.Column; import org.openmetadata.schema.type.ColumnDataType; import org.openmetadata.sdk.client.OpenMetadataClient; import org.openmetadata.sdk.fluent.DatabaseServices; +import org.openmetadata.service.Entity; /** * Integration tests for column search indexing during table reindexing. Verifies: @@ -353,6 +360,393 @@ void testColumnFilterByDatabase(TestNamespace ns) throws Exception { } } + @Nested + @DisplayName("DataAsset/TableColumn Aggregation Parity Tests") + @Execution(ExecutionMode.CONCURRENT) + class DataAssetColumnAggregationTests { + + /** + * Regression for github.com/open-metadata/openmetadata-collate/issues/3851. The UI fires two + * requests when the user types a multi-word query: {@code index=dataAsset&size=0} for the + * entity-type aggregation and {@code index=tableColumn} for the column hits. They route through + * different builders in {@code OpenSearchSourceBuilderFactory}: {@code tableColumn} uses the + * dedicated lenient column builder while {@code dataAsset} uses the composite asset config with + * stricter phrase/{@code 2<70%} matching. Without the fix, the {@code tableColumn} bucket in the + * {@code dataAsset} aggregation under-counts the column index hits. + */ + @Test + @DisplayName( + "Aggregation bucket for tableColumn under dataAsset must match index=tableColumn total") + void testDataAssetTableColumnAggregationMatchesTableColumnTotal(TestNamespace ns) + throws Exception { + OpenMetadataClient client = SdkClients.adminClient(); + + String tag = ns.shortPrefix(); + Table table = createTableWithMultiTokenColumns(ns, "agg_parity_" + tag, tag); + assertNotNull(table); + + String multiTokenQuery = tag + "_first " + tag + "_address"; + + Awaitility.await() + .atMost(90, TimeUnit.SECONDS) + .pollInterval(500, TimeUnit.MILLISECONDS) + .until( + () -> { + String r = + client + .search() + .query(multiTokenQuery) + .index("tableColumn") + .size(0) + .deleted(false) + .execute(); + JsonNode root = OBJECT_MAPPER.readTree(r); + long total = root.path("hits").path("total").path("value").asLong(-1); + return total >= 3; + }); + + String columnResponse = + client + .search() + .query(multiTokenQuery) + .index("tableColumn") + .size(0) + .deleted(false) + .execute(); + long columnTotal = + OBJECT_MAPPER.readTree(columnResponse).path("hits").path("total").path("value").asLong(); + + String aggResponse = + client + .search() + .query(multiTokenQuery) + .index("dataAsset") + .size(0) + .deleted(false) + .execute(); + JsonNode aggBuckets = + OBJECT_MAPPER + .readTree(aggResponse) + .path("aggregations") + .path("sterms#entityType") + .path("buckets"); + long aggColumnCount = 0; + for (JsonNode bucket : aggBuckets) { + if ("tableColumn".equals(bucket.path("key").asText())) { + aggColumnCount = bucket.path("doc_count").asLong(); + break; + } + } + + assertEquals( + columnTotal, + aggColumnCount, + "dataAsset aggregation tableColumn bucket (" + + aggColumnCount + + ") must match index=tableColumn total (" + + columnTotal + + ") for query \"" + + multiTokenQuery + + "\""); + } + + /** + * Guards against the {@code _} over-match: the {@code om_analyzer} splits {@code first_name} + * into {@code [first, name]}; with the old {@code Operator.Or} + {@code min_should_match=0} + * column builder, a column called {@code _first_id} matched a query of {@code + * _first_name} because the single token {@code first} was enough. The fix moves the + * column builder to {@code Operator.And}, so every sub-token must match. + */ + @Test + @DisplayName("Column query for first_name must not match a column called first_id") + void testColumnQueryRequiresAllSubtokensToMatch(TestNamespace ns) throws Exception { + OpenMetadataClient client = SdkClients.adminClient(); + String tag = ns.shortPrefix(); + Table table = createTableWithMultiTokenColumns(ns, "subtoken_" + tag, tag); + assertNotNull(table); + + String firstNameColumn = tag + "_first_name"; + String firstIdColumn = tag + "_first_id"; + + Awaitility.await() + .atMost(90, TimeUnit.SECONDS) + .pollInterval(500, TimeUnit.MILLISECONDS) + .until(() -> hitsForColumnQuery(client, firstNameColumn).contains(firstNameColumn)); + + Set hits = hitsForColumnQuery(client, firstNameColumn); + assertTrue( + hits.contains(firstNameColumn), + "Query " + firstNameColumn + " must match the column with the same name; got " + hits); + assertFalse( + hits.contains(firstIdColumn), + "Query " + + firstNameColumn + + " must not match column " + + firstIdColumn + + " (only the 'first' sub-token overlaps); got " + + hits); + } + + /** + * Same parity guarantee that {@link #testDataAssetTableColumnAggregationMatchesTableColumnTotal} + * pins for {@code tableColumn}, but for the {@code table} bucket. The per-type-union path must + * make every entity-type bucket equal to what the dedicated index returns. + */ + @Test + @DisplayName("Aggregation bucket for table under dataAsset must match index=table total") + void testDataAssetTableBucketMatchesTableIndexTotal(TestNamespace ns) throws Exception { + OpenMetadataClient client = SdkClients.adminClient(); + String tag = ns.shortPrefix(); + String tableQueryTag = "tblparity" + tag; + Table table = createTableWithMultiTokenColumns(ns, tableQueryTag, tag); + assertNotNull(table); + + Awaitility.await() + .atMost(90, TimeUnit.SECONDS) + .pollInterval(500, TimeUnit.MILLISECONDS) + .until(() -> totalHitsForIndex(client, table.getName(), "table") >= 1); + + long tableTotal = totalHitsForIndex(client, table.getName(), "table"); + long aggTableBucket = bucketCountFromDataAsset(client, table.getName(), "table"); + + assertEquals( + tableTotal, + aggTableBucket, + "dataAsset aggregation table bucket must equal index=table total for query " + + table.getName()); + } + + /** + * Pins prefix-style matching so it stays aligned across the two paths after the per-type-union + * refactor. Production behavior for short queries like {@code fir} relies on {@code name.ngram} + * and {@code name.compound}, which are now part of {@link + * org.openmetadata.service.search.indexes.ColumnSearchIndex#getFields()}; the dataAsset bucket + * for {@code tableColumn} must produce the same total as {@code index=tableColumn} for the + * same prefix query, and the seeded column must be in both result sets. + */ + @Test + @DisplayName("Prefix queries match via ngram and stay in parity across both paths") + void testPrefixQueryMatchesViaNgramOnBothPaths(TestNamespace ns) throws Exception { + OpenMetadataClient client = SdkClients.adminClient(); + String tag = ns.shortPrefix(); + Table table = createTableWithMultiTokenColumns(ns, "ngram_" + tag, tag); + assertNotNull(table); + + String prefixQuery = tag.substring(0, Math.min(4, tag.length())); + + Awaitility.await() + .atMost(90, TimeUnit.SECONDS) + .pollInterval(500, TimeUnit.MILLISECONDS) + .until(() -> totalHitsForIndex(client, prefixQuery, "tableColumn") >= 5); + + long columnTotal = totalHitsForIndex(client, prefixQuery, "tableColumn"); + long aggColumnBucket = bucketCountFromDataAsset(client, prefixQuery, Entity.TABLE_COLUMN); + + assertTrue( + columnTotal > 0, + "Prefix query " + prefixQuery + " should match seeded columns via name.ngram"); + assertEquals( + columnTotal, + aggColumnBucket, + "dataAsset tableColumn bucket (" + + aggColumnBucket + + ") must match index=tableColumn total (" + + columnTotal + + ") for prefix query " + + prefixQuery); + } + + /** + * Asset types that live in the {@code dataAsset} alias but lack a {@code + * searchSettings.assetTypeConfigurations} entry (e.g. {@code glossary}) used to be matched via + * the composite-config field merge. The per-type-union path adds a fallback {@code should} + * clause for these; this test pins that a glossary doc still appears in {@code + * index=dataAsset} hits when its name matches the query. + */ + @Test + @DisplayName("Asset types without dedicated config (glossary) still match via dataAsset alias") + void testUnconfiguredAssetTypeFallbackMatchesViaDataAsset(TestNamespace ns) throws Exception { + OpenMetadataClient client = SdkClients.adminClient(); + String name = ns.prefix("glossary_unconfigured_fallback"); + CreateGlossary req = new CreateGlossary().withName(name).withDescription(name); + Glossary glossary = client.glossaries().create(req); + assertNotNull(glossary); + + Awaitility.await() + .atMost(90, TimeUnit.SECONDS) + .pollInterval(500, TimeUnit.MILLISECONDS) + .until(() -> totalHitsForIndex(client, name, "dataAsset") >= 1); + + long total = totalHitsForIndex(client, name, "dataAsset"); + assertTrue( + total >= 1, + "Glossary doc with name " + + name + + " must be reachable via index=dataAsset (fallback clause for unconfigured types)"); + } + + /** + * Pins parity for a non-column entity type at both the count and the FQN-set level. Topic is + * chosen to exercise a different per-type clause than {@code table}, which already has its own + * test. The {@code dataAsset} alias must return the same set of topic FQNs that {@code + * index=topic} returns for the same query, with matching counts — otherwise the explore tab's + * topic count and the topic-tab results would disagree. + */ + @Test + @DisplayName("Topic bucket and topic FQN set must match index=topic for same query") + void testTopicBucketAndResultSetMatchIndexTopic(TestNamespace ns) throws Exception { + OpenMetadataClient client = SdkClients.adminClient(); + String tag = ns.shortPrefix(); + Topic topic = createUniqueTopic(ns, "topicparity_" + tag, tag); + assertNotNull(topic); + + Awaitility.await() + .atMost(90, TimeUnit.SECONDS) + .pollInterval(500, TimeUnit.MILLISECONDS) + .until(() -> totalHitsForIndex(client, tag, "topic") >= 1); + + long topicTotal = totalHitsForIndex(client, tag, "topic"); + long aggTopicBucket = bucketCountFromDataAsset(client, tag, Entity.TOPIC); + assertEquals( + topicTotal, + aggTopicBucket, + "dataAsset topic bucket must equal index=topic total for query " + tag); + + Set topicFqns = fqnsForIndex(client, tag, "topic"); + Set dataAssetTopicFqns = fqnsFromDataAssetForType(client, tag, Entity.TOPIC); + assertEquals( + topicFqns, + dataAssetTopicFqns, + "Topic FQNs returned by index=topic must equal topic-typed FQNs returned by " + + "index=dataAsset for the same query"); + } + + /** + * Complex syntax queries (quoted phrases, AND/OR operators) flow through {@code + * buildBaseQueryV2 -> buildComplexSyntaxQueryV2} for non-column types and through the column + * builder for {@code tableColumn}. Both paths run against both endpoints — the parity + * guarantee from the per-type-union refactor must hold here too. This test exercises three + * representative shapes and asserts count parity for the {@code tableColumn} bucket. + */ + @Test + @DisplayName("Complex-syntax queries keep dataAsset/tableColumn parity") + void testComplexSyntaxQueriesKeepParity(TestNamespace ns) throws Exception { + OpenMetadataClient client = SdkClients.adminClient(); + String tag = ns.shortPrefix(); + Table table = createTableWithMultiTokenColumns(ns, "complex_" + tag, tag); + assertNotNull(table); + + Awaitility.await() + .atMost(90, TimeUnit.SECONDS) + .pollInterval(500, TimeUnit.MILLISECONDS) + .until(() -> totalHitsForIndex(client, tag, "tableColumn") >= 5); + + List queries = + List.of( + "\"" + tag + " alpha\"", + tag + " AND alpha", + tag + " OR alpha", + "(" + tag + " AND alpha) OR (" + tag + " AND bravo)"); + + for (String complexQuery : queries) { + long columnTotal = totalHitsForIndex(client, complexQuery, "tableColumn"); + long aggBucket = bucketCountFromDataAsset(client, complexQuery, Entity.TABLE_COLUMN); + assertEquals( + columnTotal, + aggBucket, + "dataAsset tableColumn bucket must equal index=tableColumn total for complex " + + "query " + + complexQuery); + } + } + + private Set hitsForColumnQuery(OpenMetadataClient client, String query) + throws Exception { + String response = + client.search().query(query).index("tableColumn").size(50).deleted(false).execute(); + JsonNode hits = OBJECT_MAPPER.readTree(response).path("hits").path("hits"); + Set names = new HashSet<>(); + for (JsonNode hit : hits) { + names.add(hit.path("_source").path("name").asText()); + } + return names; + } + + private Set fqnsForIndex(OpenMetadataClient client, String query, String index) + throws Exception { + String response = + client.search().query(query).index(index).size(200).deleted(false).execute(); + JsonNode hits = OBJECT_MAPPER.readTree(response).path("hits").path("hits"); + Set fqns = new HashSet<>(); + for (JsonNode hit : hits) { + fqns.add(hit.path("_source").path("fullyQualifiedName").asText()); + } + return fqns; + } + + private Set fqnsFromDataAssetForType( + OpenMetadataClient client, String query, String entityType) throws Exception { + String response = + client.search().query(query).index("dataAsset").size(200).deleted(false).execute(); + JsonNode hits = OBJECT_MAPPER.readTree(response).path("hits").path("hits"); + Set fqns = new HashSet<>(); + for (JsonNode hit : hits) { + if (entityType.equals(hit.path("_source").path("entityType").asText())) { + fqns.add(hit.path("_source").path("fullyQualifiedName").asText()); + } + } + return fqns; + } + + private Topic createUniqueTopic(TestNamespace ns, String baseName, String tag) { + String shortId = ns.shortPrefix(); + org.openmetadata.schema.services.connections.messaging.KafkaConnection kafkaConn = + new org.openmetadata.schema.services.connections.messaging.KafkaConnection() + .withBootstrapServers("localhost:9092"); + org.openmetadata.schema.api.services.CreateMessagingService msgSvcReq = + new org.openmetadata.schema.api.services.CreateMessagingService() + .withName("topic_parity_svc_" + shortId + "_" + baseName) + .withServiceType( + org.openmetadata.schema.api.services.CreateMessagingService.MessagingServiceType + .Kafka) + .withConnection( + new org.openmetadata.schema.type.MessagingConnection().withConfig(kafkaConn)); + org.openmetadata.schema.entity.services.MessagingService msgService = + SdkClients.adminClient().messagingServices().create(msgSvcReq); + + org.openmetadata.schema.api.data.CreateTopic topicRequest = + new org.openmetadata.schema.api.data.CreateTopic() + .withName(ns.prefix(tag + "_topic_" + baseName)) + .withService(msgService.getFullyQualifiedName()) + .withPartitions(1); + return SdkClients.adminClient().topics().create(topicRequest); + } + + private long totalHitsForIndex(OpenMetadataClient client, String query, String index) + throws Exception { + String response = client.search().query(query).index(index).size(0).deleted(false).execute(); + return OBJECT_MAPPER.readTree(response).path("hits").path("total").path("value").asLong(); + } + + private long bucketCountFromDataAsset( + OpenMetadataClient client, String query, String entityType) throws Exception { + String response = + client.search().query(query).index("dataAsset").size(0).deleted(false).execute(); + JsonNode buckets = + OBJECT_MAPPER + .readTree(response) + .path("aggregations") + .path("sterms#entityType") + .path("buckets"); + for (JsonNode bucket : buckets) { + if (entityType.equals(bucket.path("key").asText())) { + return bucket.path("doc_count").asLong(); + } + } + return 0L; + } + } + @Nested @DisplayName("Nested Column Tests") @Execution(ExecutionMode.CONCURRENT) @@ -493,6 +887,53 @@ private Table createTableWithColumns(TestNamespace ns, String baseName) { return SdkClients.adminClient().tables().create(tableRequest); } + private Table createTableWithMultiTokenColumns(TestNamespace ns, String baseName, String tag) { + String shortId = ns.shortPrefix(); + + org.openmetadata.schema.services.connections.database.PostgresConnection conn = + DatabaseServices.postgresConnection().hostPort("localhost:5432").username("test").build(); + + DatabaseService dbService = + DatabaseServices.builder() + .name("agg_svc_" + shortId + "_" + baseName) + .connection(conn) + .description("Test service for dataAsset/tableColumn aggregation parity") + .create(); + + CreateDatabase dbReq = new CreateDatabase(); + dbReq.setName("agg_db_" + shortId + "_" + baseName); + dbReq.setService(dbService.getFullyQualifiedName()); + Database database = SdkClients.adminClient().databases().create(dbReq); + + CreateDatabaseSchema schemaReq = new CreateDatabaseSchema(); + schemaReq.setName("agg_schema_" + shortId + "_" + baseName); + schemaReq.setDatabase(database.getFullyQualifiedName()); + DatabaseSchema schema = SdkClients.adminClient().databaseSchemas().create(schemaReq); + + CreateTable tableRequest = new CreateTable(); + tableRequest.setName(ns.prefix(baseName)); + tableRequest.setDatabaseSchema(schema.getFullyQualifiedName()); + tableRequest.setColumns( + List.of( + new Column() + .withName(tag + "_first_name") + .withDataType(ColumnDataType.VARCHAR) + .withDataLength(255) + .withDescription("First name"), + new Column() + .withName(tag + "_last_name") + .withDataType(ColumnDataType.VARCHAR) + .withDataLength(255) + .withDescription("Last name"), + new Column() + .withName(tag + "_address") + .withDataType(ColumnDataType.VARCHAR) + .withDataLength(512) + .withDescription("Postal address"))); + + return SdkClients.adminClient().tables().create(tableRequest); + } + private Table createTableWithNestedColumns(TestNamespace ns, String baseName) { String shortId = ns.shortPrefix(); diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java index ee62799dd9f2..0a3a2ae29210 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java @@ -18,9 +18,11 @@ import es.co.elastic.clients.elasticsearch.core.search.Highlight; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.stream.Collectors; import org.openmetadata.schema.api.search.Aggregation; import org.openmetadata.schema.api.search.AssetTypeConfiguration; @@ -53,6 +55,7 @@ public class ElasticSearchSourceBuilderFactory private static final String MATCH_TYPE_STANDARD = "standard"; private static final String INDEX_ALL = "all"; private static final String INDEX_DATA_ASSET = "dataAsset"; + private static final String ENTITY_TYPE_FIELD = "entityType"; private static final String MINIMUM_SHOULD_MATCH = "2<70%"; private static final float DEFAULT_TIE_BREAKER = 0.3f; private static final float DEFAULT_BOOST = 1.0f; @@ -341,9 +344,9 @@ public ElasticSearchRequestBuilder getSearchSourceBuilderV2( indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations); } - if (indexName.equals("all") || indexName.equals("dataAsset")) { - return buildDataAssetSearchBuilderV2( - indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations); + if (indexName.equals(INDEX_ALL) || indexName.equals(INDEX_DATA_ASSET)) { + return buildAllAssetsSearchBuilderV2( + searchQuery, fromOffset, size, includeExplain, includeAggregations); } return switch (indexName) { @@ -374,21 +377,33 @@ public ElasticSearchRequestBuilder buildColumnSearchBuilderV2(String query, int queryBuilder = es.co.elastic.clients.elasticsearch._types.query_dsl.Query.of(q -> q.matchAll(m -> m)); } else { - Map fields = ColumnSearchIndex.getFields(); - queryBuilder = - ElasticQueryBuilder.multiMatchQuery( - query, - fields, - es.co.elastic.clients.elasticsearch._types.query_dsl.TextQueryType.BestFields, - es.co.elastic.clients.elasticsearch._types.query_dsl.Operator.Or, - String.valueOf(DEFAULT_TIE_BREAKER), - "0"); + queryBuilder = buildColumnMultiMatchV2(query); } es.co.elastic.clients.elasticsearch.core.search.Highlight hb = buildHighlightsV2(List.of("name", "displayName", "description")); return searchBuilderV2(queryBuilder, hb, from, size); } + /** + * Multi-match used both by {@code index=tableColumn} and the column-scoped should clause in the + * {@code dataAsset} composite query. Uses {@link + * es.co.elastic.clients.elasticsearch._types.query_dsl.Operator#And} so every sub-token produced + * by {@code om_analyzer} must hit some field. Without {@code And}, a query like {@code + * first_name} matches any column whose name contains just {@code first} or just {@code name}, + * which both inflates the column index hits and creates the dataAsset/tableColumn count + * mismatch tracked in github issue #3851. + */ + private es.co.elastic.clients.elasticsearch._types.query_dsl.Query buildColumnMultiMatchV2( + String query) { + return ElasticQueryBuilder.multiMatchQuery( + query, + ColumnSearchIndex.getFields(), + es.co.elastic.clients.elasticsearch._types.query_dsl.TextQueryType.BestFields, + es.co.elastic.clients.elasticsearch._types.query_dsl.Operator.And, + String.valueOf(DEFAULT_TIE_BREAKER), + "0"); + } + public ElasticSearchRequestBuilder buildServiceSearchBuilderV2(String query, int from, int size) { es.co.elastic.clients.elasticsearch._types.query_dsl.Query queryBuilder = buildSearchQueryBuilderV2(query, SearchIndex.getDefaultFields()); @@ -437,6 +452,87 @@ public ElasticSearchRequestBuilder buildDataAssetSearchBuilderV2( return searchRequestBuilder; } + /** + * Build a search source for the {@code all} / {@code dataAsset} alias as a per-entity-type + * union: each asset type contributes a clause built with its own configuration (column docs go + * through {@link #buildColumnMultiMatchV2(String)}, every other type through {@link + * #buildBaseQueryV2(String, AssetTypeConfiguration)}), filtered by {@code entityType=}. + * Each entity-type bucket in the aggregation therefore equals what the dedicated index returns + * for the same query, by construction. Avoids the composite-config divergence behind + * github.com/open-metadata/openmetadata-collate#3851. + */ + public ElasticSearchRequestBuilder buildAllAssetsSearchBuilderV2( + String query, int from, int size, boolean explain, boolean includeAggregations) { + AssetTypeConfiguration compositeConfig = buildCompositeAssetConfig(searchSettings); + es.co.elastic.clients.elasticsearch._types.query_dsl.Query baseQuery = + buildPerTypeUnionQueryV2(query); + es.co.elastic.clients.elasticsearch._types.query_dsl.Query finalQuery = + applyFunctionScoringV2(baseQuery, compositeConfig); + es.co.elastic.clients.elasticsearch.core.search.Highlight highlightBuilder = + buildHighlightingIfNeededV2(query, compositeConfig); + + ElasticSearchRequestBuilder searchRequestBuilder = + createSearchSourceBuilderV2(finalQuery, from, size); + if (highlightBuilder != null) { + searchRequestBuilder.highlighter(highlightBuilder); + } + if (includeAggregations) { + addConfiguredAggregationsV2(searchRequestBuilder, compositeConfig); + } + searchRequestBuilder.explain(explain); + return searchRequestBuilder; + } + + private es.co.elastic.clients.elasticsearch._types.query_dsl.Query buildPerTypeUnionQueryV2( + String query) { + if (isMatchAllQuery(query)) { + return ElasticQueryBuilder.boolQuery().must(ElasticQueryBuilder.matchAllQuery()).build(); + } + ElasticQueryBuilder.BoolQueryBuilder union = ElasticQueryBuilder.boolQuery(); + Set configuredTypes = new HashSet<>(); + for (AssetTypeConfiguration typeConfig : searchSettings.getAssetTypeConfigurations()) { + String assetType = typeConfig.getAssetType(); + if (assetType == null || assetType.equals(INDEX_ALL)) { + continue; + } + configuredTypes.add(assetType); + union.should(buildAssetTypeClauseV2(query, assetType, typeConfig)); + } + union.should(buildUnconfiguredAssetFallbackV2(query, configuredTypes)); + union.minimumShouldMatch(1); + return union.build(); + } + + private static boolean isMatchAllQuery(String query) { + return query == null || query.trim().isEmpty() || query.trim().equals("*"); + } + + private es.co.elastic.clients.elasticsearch._types.query_dsl.Query buildAssetTypeClauseV2( + String query, String assetType, AssetTypeConfiguration typeConfig) { + es.co.elastic.clients.elasticsearch._types.query_dsl.Query inner = + Entity.TABLE_COLUMN.equals(assetType) + ? buildColumnMultiMatchV2(query) + : buildBaseQueryV2(query, typeConfig); + return ElasticQueryBuilder.boolQuery() + .filter(ElasticQueryBuilder.termQuery(ENTITY_TYPE_FIELD, assetType)) + .must(inner) + .build(); + } + + /** + * Catches asset types that are part of the {@code dataAsset} alias but lack a dedicated entry in + * {@code searchSettings.assetTypeConfigurations} (e.g. {@code glossary}, {@code apiCollection}). + */ + private es.co.elastic.clients.elasticsearch._types.query_dsl.Query + buildUnconfiguredAssetFallbackV2(String query, Set configuredTypes) { + ElasticQueryBuilder.BoolQueryBuilder fallback = + ElasticQueryBuilder.boolQuery().must(buildBaseQueryV2(query, getOrCreateDefaultConfig())); + for (String configured : configuredTypes) { + fallback.mustNot(ElasticQueryBuilder.termQuery(ENTITY_TYPE_FIELD, configured)); + } + return fallback.build(); + } + public ElasticSearchRequestBuilder buildAggregateSearchBuilderV2( String query, int from, int size) { return buildAggregateSearchBuilderV2(query, from, size, true); diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/indexes/ColumnSearchIndex.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/indexes/ColumnSearchIndex.java index 1b9a76525611..9e4ea9659488 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/indexes/ColumnSearchIndex.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/indexes/ColumnSearchIndex.java @@ -153,8 +153,12 @@ public static Map getFields() { Map fields = new HashMap<>(); fields.put("name", 10.0f); fields.put("name.keyword", 20.0f); + fields.put("name.ngram", 1.0f); + fields.put("name.compound", 5.0f); fields.put("displayName", 7.0f); fields.put("displayName.keyword", 20.0f); + fields.put("displayName.ngram", 1.0f); + fields.put("displayName.compound", 4.0f); fields.put("fullyQualifiedName", 5.0f); fields.put("description", 2.0f); fields.put("dataType", 3.0f); diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java index 7deb93c7d7da..fdd7bd107981 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java @@ -14,9 +14,11 @@ import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.stream.Collectors; import lombok.extern.slf4j.Slf4j; import org.openmetadata.schema.api.search.Aggregation; @@ -50,6 +52,7 @@ public class OpenSearchSourceBuilderFactory private static final String MATCH_TYPE_STANDARD = "standard"; private static final String INDEX_ALL = "all"; private static final String INDEX_DATA_ASSET = "dataAsset"; + private static final String ENTITY_TYPE_FIELD = "entityType"; private static final String MINIMUM_SHOULD_MATCH = "2<70%"; private static final float DEFAULT_TIE_BREAKER = 0.3f; private static final float DEFAULT_BOOST = 1.0f; @@ -322,9 +325,9 @@ public OpenSearchRequestBuilder getSearchSourceBuilderV2( indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations); } - if (indexName.equals("all") || indexName.equals("dataAsset")) { - return buildDataAssetSearchBuilderV2( - indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations); + if (indexName.equals(INDEX_ALL) || indexName.equals(INDEX_DATA_ASSET)) { + return buildAllAssetsSearchBuilderV2( + searchQuery, fromOffset, size, includeExplain, includeAggregations); } return switch (indexName) { @@ -374,21 +377,33 @@ public OpenSearchRequestBuilder buildColumnSearchBuilderV2(String query, int fro queryBuilder = os.org.opensearch.client.opensearch._types.query_dsl.Query.of(q -> q.matchAll(m -> m)); } else { - Map fields = ColumnSearchIndex.getFields(); - queryBuilder = - OpenSearchQueryBuilder.multiMatchQuery( - query, - fields, - os.org.opensearch.client.opensearch._types.query_dsl.TextQueryType.BestFields, - os.org.opensearch.client.opensearch._types.query_dsl.Operator.Or, - String.valueOf(DEFAULT_TIE_BREAKER), - "0"); + queryBuilder = buildColumnMultiMatchV2(query); } os.org.opensearch.client.opensearch.core.search.Highlight highlighter = buildHighlightsV2(List.of("name", "displayName", "description")); return searchBuilderV2(queryBuilder, highlighter, from, size); } + /** + * Multi-match used both by {@code index=tableColumn} and the column-scoped should clause in the + * {@code dataAsset} composite query. Uses {@link + * os.org.opensearch.client.opensearch._types.query_dsl.Operator#And} so every sub-token produced + * by {@code om_analyzer} (which splits on letter/digit/underscore boundaries) must hit some + * field. Without {@code And}, a query like {@code first_name} matches any column whose name + * contains just {@code first} or just {@code name}, which both inflates the column index hits + * and creates the dataAsset/tableColumn count mismatch tracked in github issue #3851. + */ + private os.org.opensearch.client.opensearch._types.query_dsl.Query buildColumnMultiMatchV2( + String query) { + return OpenSearchQueryBuilder.multiMatchQuery( + query, + ColumnSearchIndex.getFields(), + os.org.opensearch.client.opensearch._types.query_dsl.TextQueryType.BestFields, + os.org.opensearch.client.opensearch._types.query_dsl.Operator.And, + String.valueOf(DEFAULT_TIE_BREAKER), + "0"); + } + public OpenSearchRequestBuilder buildServiceSearchBuilderV2(String query, int from, int size) { os.org.opensearch.client.opensearch._types.query_dsl.Query queryBuilder = buildSearchQueryBuilderV2(query, SearchIndex.getDefaultFields()); @@ -452,6 +467,92 @@ public OpenSearchRequestBuilder buildDataAssetSearchBuilderV2( return searchRequestBuilder; } + /** + * Build a search source for the {@code all} / {@code dataAsset} alias as a per-entity-type + * union: each asset type contributes a clause built with its own configuration (column docs go + * through {@link #buildColumnMultiMatchV2(String)}, every other type through {@link + * #buildBaseQueryV2(String, AssetTypeConfiguration)}), filtered by {@code entityType=}. + * Each entity-type bucket in the aggregation therefore equals what the dedicated index returns + * for the same query, by construction. Avoids the composite-config divergence behind + * github.com/open-metadata/openmetadata-collate#3851. + */ + public OpenSearchRequestBuilder buildAllAssetsSearchBuilderV2( + String query, int from, int size, boolean explain, boolean includeAggregations) { + AssetTypeConfiguration compositeConfig = getOrBuildCompositeConfig(); + os.org.opensearch.client.opensearch._types.query_dsl.Query baseQuery = + buildPerTypeUnionQueryV2(query); + os.org.opensearch.client.opensearch._types.query_dsl.Query finalQuery = + applyFunctionScoringV2(baseQuery, compositeConfig); + os.org.opensearch.client.opensearch.core.search.Highlight highlightBuilder = + buildHighlightingIfNeededV2(query, compositeConfig); + + OpenSearchRequestBuilder searchRequestBuilder = + createSearchSourceBuilderV2(finalQuery, from, size); + if (highlightBuilder != null) { + searchRequestBuilder.highlighter(highlightBuilder); + } + if (includeAggregations) { + addConfiguredAggregationsV2(searchRequestBuilder, compositeConfig); + } + searchRequestBuilder.explain(explain); + return searchRequestBuilder; + } + + private os.org.opensearch.client.opensearch._types.query_dsl.Query buildPerTypeUnionQueryV2( + String query) { + if (isMatchAllQuery(query)) { + return OpenSearchQueryBuilder.boolQuery() + .must(OpenSearchQueryBuilder.matchAllQuery()) + .build(); + } + OpenSearchQueryBuilder.BoolQueryBuilder union = OpenSearchQueryBuilder.boolQuery(); + Set configuredTypes = new HashSet<>(); + for (AssetTypeConfiguration typeConfig : searchSettings.getAssetTypeConfigurations()) { + String assetType = typeConfig.getAssetType(); + if (assetType == null || assetType.equals(INDEX_ALL)) { + continue; + } + configuredTypes.add(assetType); + union.should(buildAssetTypeClauseV2(query, assetType, typeConfig)); + } + union.should(buildUnconfiguredAssetFallbackV2(query, configuredTypes)); + union.minimumShouldMatch(1); + return union.build(); + } + + private static boolean isMatchAllQuery(String query) { + return query == null || query.trim().isEmpty() || query.trim().equals("*"); + } + + private os.org.opensearch.client.opensearch._types.query_dsl.Query buildAssetTypeClauseV2( + String query, String assetType, AssetTypeConfiguration typeConfig) { + os.org.opensearch.client.opensearch._types.query_dsl.Query inner = + Entity.TABLE_COLUMN.equals(assetType) + ? buildColumnMultiMatchV2(query) + : buildBaseQueryV2(query, typeConfig); + return OpenSearchQueryBuilder.boolQuery() + .filter(OpenSearchQueryBuilder.termQuery(ENTITY_TYPE_FIELD, assetType)) + .must(inner) + .build(); + } + + /** + * Catches asset types that are part of the {@code dataAsset} alias but lack a dedicated entry in + * {@code searchSettings.assetTypeConfigurations} (e.g. {@code glossary}, {@code apiCollection}). + * Without this, docs of those types would silently disappear from the dataAsset alias after the + * per-type-union refactor. + */ + private os.org.opensearch.client.opensearch._types.query_dsl.Query + buildUnconfiguredAssetFallbackV2(String query, Set configuredTypes) { + OpenSearchQueryBuilder.BoolQueryBuilder fallback = + OpenSearchQueryBuilder.boolQuery() + .must(buildBaseQueryV2(query, getOrCreateDefaultConfig())); + for (String configured : configuredTypes) { + fallback.mustNot(OpenSearchQueryBuilder.termQuery(ENTITY_TYPE_FIELD, configured)); + } + return fallback.build(); + } + private os.org.opensearch.client.opensearch._types.query_dsl.Query buildBaseQueryV2( String query, AssetTypeConfiguration assetConfig) { if (query == null || query.trim().isEmpty() || query.trim().equals("*")) { From 718ceb9de327df8b35a3ae2dddcc96207555a5f5 Mon Sep 17 00:00:00 2001 From: mohitdeuex Date: Thu, 30 Apr 2026 17:05:27 +0530 Subject: [PATCH 8/9] fix(search): address PR review feedback Bug fixes the reviewer flagged: - buildPerTypeUnionQueryV2 (both factories) now guards against searchSettings.getAssetTypeConfigurations() returning null/empty by falling back to the default config so dataAsset/all queries can't NPE if the search settings haven't been initialized. - testDataAssetTableColumnAggregationMatchesTableColumnTotal previously awaited a multi-token query that no single seeded column could satisfy under the new Operator.And semantics, leaving the parity assertion trivially 0 == 0; switched to a query that actually matches multiple seeded columns. - testColumnQueryRequiresAllSubtokensToMatch was relying on a column name (`_first_id`) that was never created, so the negative assertion was trivially true. createTableWithMultiTokenColumns now seeds first_id (and alpha/bravo columns used by the complex-syntax and prefix tests) so all the parity assertions exercise real indexed documents. - bucketCountFromDataAsset accepts both `entityType` and `sterms#entityType` aggregation key shapes so the helper doesn't break on backends that label the bucket differently. - Javadoc on buildColumnMultiMatchV2 (both factories) corrected to describe the actual previous shape (`Operator.Or` + `fuzziness="0"`, which left minimum_should_match unset) instead of the inaccurate `min_should_match=0`. Issue: open-metadata/openmetadata-collate#3851 Co-Authored-By: Claude Opus 4.7 (1M context) --- .../it/tests/ColumnSearchIndexIT.java | 91 ++++++++----------- .../ElasticSearchSourceBuilderFactory.java | 22 +++-- .../OpenSearchSourceBuilderFactory.java | 20 ++-- 3 files changed, 65 insertions(+), 68 deletions(-) diff --git a/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java b/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java index 20c3d8110422..943da589397b 100644 --- a/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java +++ b/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java @@ -385,59 +385,22 @@ void testDataAssetTableColumnAggregationMatchesTableColumnTotal(TestNamespace ns Table table = createTableWithMultiTokenColumns(ns, "agg_parity_" + tag, tag); assertNotNull(table); - String multiTokenQuery = tag + "_first " + tag + "_address"; + // Two analyzed sub-tokens (`` and `name`); seeded columns first_name and last_name + // contain both, so the AND-based column builder produces a non-zero match count we can + // assert real parity against (not a trivial 0 == 0). + String multiTokenQuery = tag + " name"; Awaitility.await() .atMost(90, TimeUnit.SECONDS) .pollInterval(500, TimeUnit.MILLISECONDS) - .until( - () -> { - String r = - client - .search() - .query(multiTokenQuery) - .index("tableColumn") - .size(0) - .deleted(false) - .execute(); - JsonNode root = OBJECT_MAPPER.readTree(r); - long total = root.path("hits").path("total").path("value").asLong(-1); - return total >= 3; - }); - - String columnResponse = - client - .search() - .query(multiTokenQuery) - .index("tableColumn") - .size(0) - .deleted(false) - .execute(); - long columnTotal = - OBJECT_MAPPER.readTree(columnResponse).path("hits").path("total").path("value").asLong(); + .until(() -> totalHitsForIndex(client, multiTokenQuery, "tableColumn") >= 2); - String aggResponse = - client - .search() - .query(multiTokenQuery) - .index("dataAsset") - .size(0) - .deleted(false) - .execute(); - JsonNode aggBuckets = - OBJECT_MAPPER - .readTree(aggResponse) - .path("aggregations") - .path("sterms#entityType") - .path("buckets"); - long aggColumnCount = 0; - for (JsonNode bucket : aggBuckets) { - if ("tableColumn".equals(bucket.path("key").asText())) { - aggColumnCount = bucket.path("doc_count").asLong(); - break; - } - } + long columnTotal = totalHitsForIndex(client, multiTokenQuery, "tableColumn"); + long aggColumnCount = bucketCountFromDataAsset(client, multiTokenQuery, Entity.TABLE_COLUMN); + assertTrue( + columnTotal >= 2, + "Seeded columns first_name and last_name should match query " + multiTokenQuery); assertEquals( columnTotal, aggColumnCount, @@ -732,13 +695,12 @@ private long bucketCountFromDataAsset( OpenMetadataClient client, String query, String entityType) throws Exception { String response = client.search().query(query).index("dataAsset").size(0).deleted(false).execute(); - JsonNode buckets = - OBJECT_MAPPER - .readTree(response) - .path("aggregations") - .path("sterms#entityType") - .path("buckets"); - for (JsonNode bucket : buckets) { + JsonNode aggregations = OBJECT_MAPPER.readTree(response).path("aggregations"); + JsonNode entityTypeAgg = aggregations.path("entityType"); + if (entityTypeAgg.isMissingNode()) { + entityTypeAgg = aggregations.path("sterms#entityType"); + } + for (JsonNode bucket : entityTypeAgg.path("buckets")) { if (entityType.equals(bucket.path("key").asText())) { return bucket.path("doc_count").asLong(); } @@ -920,6 +882,10 @@ private Table createTableWithMultiTokenColumns(TestNamespace ns, String baseName .withDataType(ColumnDataType.VARCHAR) .withDataLength(255) .withDescription("First name"), + new Column() + .withName(tag + "_first_id") + .withDataType(ColumnDataType.INT) + .withDescription("Decoy: shares only the 'first' sub-token with first_name"), new Column() .withName(tag + "_last_name") .withDataType(ColumnDataType.VARCHAR) @@ -929,11 +895,26 @@ private Table createTableWithMultiTokenColumns(TestNamespace ns, String baseName .withName(tag + "_address") .withDataType(ColumnDataType.VARCHAR) .withDataLength(512) - .withDescription("Postal address"))); + .withDescription("Postal address"), + new Column() + .withName(tag + "_alpha_amount") + .withDataType(ColumnDataType.DECIMAL) + .withDescription("Alpha amount column for complex-syntax tests"), + new Column() + .withName(tag + "_alpha_address") + .withDataType(ColumnDataType.VARCHAR) + .withDataLength(512) + .withDescription("Alpha address column"), + new Column() + .withName(tag + "_bravo_total") + .withDataType(ColumnDataType.DECIMAL) + .withDescription("Bravo total column"))); return SdkClients.adminClient().tables().create(tableRequest); } + private static final int MULTI_TOKEN_SEED_COUNT = 7; + private Table createTableWithNestedColumns(TestNamespace ns, String baseName) { String shortId = ns.shortPrefix(); diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java index 0a3a2ae29210..821918f47372 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java @@ -385,13 +385,15 @@ public ElasticSearchRequestBuilder buildColumnSearchBuilderV2(String query, int } /** - * Multi-match used both by {@code index=tableColumn} and the column-scoped should clause in the - * {@code dataAsset} composite query. Uses {@link + * Multi-match used both by {@code index=tableColumn} and the column branch of {@link + * #buildPerTypeUnionQueryV2(String)}. Uses {@link * es.co.elastic.clients.elasticsearch._types.query_dsl.Operator#And} so every sub-token produced - * by {@code om_analyzer} must hit some field. Without {@code And}, a query like {@code - * first_name} matches any column whose name contains just {@code first} or just {@code name}, - * which both inflates the column index hits and creates the dataAsset/tableColumn count - * mismatch tracked in github issue #3851. + * by {@code om_analyzer} must hit some field. The previous shape used {@code Operator.Or} with + * this helper's {@code fuzziness="0"} — that combination did not set {@code + * minimum_should_match}, so any single sub-token in any field was enough to match. As a result + * a search like {@code first_name} matched columns whose name contained just {@code first} or + * just {@code name}, which both inflated the column index hits and caused the + * dataAsset/tableColumn count mismatch tracked in github issue #3851. */ private es.co.elastic.clients.elasticsearch._types.query_dsl.Query buildColumnMultiMatchV2( String query) { @@ -488,9 +490,15 @@ private es.co.elastic.clients.elasticsearch._types.query_dsl.Query buildPerTypeU if (isMatchAllQuery(query)) { return ElasticQueryBuilder.boolQuery().must(ElasticQueryBuilder.matchAllQuery()).build(); } + List configs = searchSettings.getAssetTypeConfigurations(); + if (configs == null || configs.isEmpty()) { + return ElasticQueryBuilder.boolQuery() + .must(buildBaseQueryV2(query, getOrCreateDefaultConfig())) + .build(); + } ElasticQueryBuilder.BoolQueryBuilder union = ElasticQueryBuilder.boolQuery(); Set configuredTypes = new HashSet<>(); - for (AssetTypeConfiguration typeConfig : searchSettings.getAssetTypeConfigurations()) { + for (AssetTypeConfiguration typeConfig : configs) { String assetType = typeConfig.getAssetType(); if (assetType == null || assetType.equals(INDEX_ALL)) { continue; diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java index fdd7bd107981..e749a7c02467 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java @@ -385,13 +385,15 @@ public OpenSearchRequestBuilder buildColumnSearchBuilderV2(String query, int fro } /** - * Multi-match used both by {@code index=tableColumn} and the column-scoped should clause in the - * {@code dataAsset} composite query. Uses {@link + * Multi-match used both by {@code index=tableColumn} and the column branch of {@link + * #buildPerTypeUnionQueryV2(String)}. Uses {@link * os.org.opensearch.client.opensearch._types.query_dsl.Operator#And} so every sub-token produced * by {@code om_analyzer} (which splits on letter/digit/underscore boundaries) must hit some - * field. Without {@code And}, a query like {@code first_name} matches any column whose name - * contains just {@code first} or just {@code name}, which both inflates the column index hits - * and creates the dataAsset/tableColumn count mismatch tracked in github issue #3851. + * field. The previous shape used {@code Operator.Or} with this helper's {@code fuzziness="0"} + * — that combination did not set {@code minimum_should_match}, so any single sub-token in any + * field was enough to match. As a result a search like {@code first_name} matched columns whose + * name contained just {@code first} or just {@code name}, which both inflated the column index + * hits and caused the dataAsset/tableColumn count mismatch tracked in github issue #3851. */ private os.org.opensearch.client.opensearch._types.query_dsl.Query buildColumnMultiMatchV2( String query) { @@ -505,9 +507,15 @@ private os.org.opensearch.client.opensearch._types.query_dsl.Query buildPerTypeU .must(OpenSearchQueryBuilder.matchAllQuery()) .build(); } + List configs = searchSettings.getAssetTypeConfigurations(); + if (configs == null || configs.isEmpty()) { + return OpenSearchQueryBuilder.boolQuery() + .must(buildBaseQueryV2(query, getOrCreateDefaultConfig())) + .build(); + } OpenSearchQueryBuilder.BoolQueryBuilder union = OpenSearchQueryBuilder.boolQuery(); Set configuredTypes = new HashSet<>(); - for (AssetTypeConfiguration typeConfig : searchSettings.getAssetTypeConfigurations()) { + for (AssetTypeConfiguration typeConfig : configs) { String assetType = typeConfig.getAssetType(); if (assetType == null || assetType.equals(INDEX_ALL)) { continue; From 90df4c2e03dcba415a0b2e34e67687248894e0a9 Mon Sep 17 00:00:00 2001 From: mohitdeuex Date: Thu, 30 Apr 2026 17:28:58 +0530 Subject: [PATCH 9/9] perf(search): collapse fallback mustNot chain into a single termsQuery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a `termsQuery(String, Collection)` helper to both OpenSearchQueryBuilder and ElasticQueryBuilder, then uses it in buildUnconfiguredAssetFallbackV2 to replace the per-type `mustNot term(entityType=...)` chain with a single `mustNot terms(entityType=[...])`. Keeps the bool query small regardless of how many configured asset types exist — the previous shape grew one clause per configured type. Issue: open-metadata/openmetadata-collate#3851 Co-Authored-By: Claude Opus 4.7 (1M context) --- .../service/search/elasticsearch/ElasticQueryBuilder.java | 7 +++++++ .../elasticsearch/ElasticSearchSourceBuilderFactory.java | 4 ++-- .../service/search/opensearch/OpenSearchQueryBuilder.java | 6 ++++++ .../search/opensearch/OpenSearchSourceBuilderFactory.java | 4 ++-- 4 files changed, 17 insertions(+), 4 deletions(-) diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticQueryBuilder.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticQueryBuilder.java index 543e75d6363a..48d43a26dd29 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticQueryBuilder.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticQueryBuilder.java @@ -1,10 +1,12 @@ package org.openmetadata.service.search.elasticsearch; +import es.co.elastic.clients.elasticsearch._types.FieldValue; import es.co.elastic.clients.elasticsearch._types.query_dsl.Operator; import es.co.elastic.clients.elasticsearch._types.query_dsl.Query; import es.co.elastic.clients.elasticsearch._types.query_dsl.QueryStringQuery; import es.co.elastic.clients.elasticsearch._types.query_dsl.TextQueryType; import java.util.ArrayList; +import java.util.Collection; import java.util.List; import java.util.Map; @@ -28,6 +30,11 @@ public static Query termQuery(String field, int value) { return Query.of(q -> q.term(t -> t.field(field).value(value))); } + public static Query termsQuery(String field, Collection values) { + List fieldValues = values.stream().map(FieldValue::of).toList(); + return Query.of(q -> q.terms(t -> t.field(field).terms(tf -> tf.value(fieldValues)))); + } + public static Query matchQuery(String field, String value) { return Query.of(q -> q.match(m -> m.field(field).query(value))); } diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java index 821918f47372..a2c2898f11e0 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java @@ -535,8 +535,8 @@ private es.co.elastic.clients.elasticsearch._types.query_dsl.Query buildAssetTyp buildUnconfiguredAssetFallbackV2(String query, Set configuredTypes) { ElasticQueryBuilder.BoolQueryBuilder fallback = ElasticQueryBuilder.boolQuery().must(buildBaseQueryV2(query, getOrCreateDefaultConfig())); - for (String configured : configuredTypes) { - fallback.mustNot(ElasticQueryBuilder.termQuery(ENTITY_TYPE_FIELD, configured)); + if (!configuredTypes.isEmpty()) { + fallback.mustNot(ElasticQueryBuilder.termsQuery(ENTITY_TYPE_FIELD, configuredTypes)); } return fallback.build(); } diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchQueryBuilder.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchQueryBuilder.java index bbe429084cd8..a01f2e07fca8 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchQueryBuilder.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchQueryBuilder.java @@ -1,6 +1,7 @@ package org.openmetadata.service.search.opensearch; import java.util.ArrayList; +import java.util.Collection; import java.util.List; import java.util.Map; import os.org.opensearch.client.opensearch._types.FieldValue; @@ -29,6 +30,11 @@ public static Query termQuery(String field, int value) { return Query.of(q -> q.term(t -> t.field(field).value(FieldValue.of(value)))); } + public static Query termsQuery(String field, Collection values) { + List fieldValues = values.stream().map(FieldValue::of).toList(); + return Query.of(q -> q.terms(t -> t.field(field).terms(tf -> tf.value(fieldValues)))); + } + public static Query matchQuery(String field, String value) { return Query.of(q -> q.match(m -> m.field(field).query(FieldValue.of(value)))); } diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java index e749a7c02467..9273c46f4bd2 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java @@ -555,8 +555,8 @@ private os.org.opensearch.client.opensearch._types.query_dsl.Query buildAssetTyp OpenSearchQueryBuilder.BoolQueryBuilder fallback = OpenSearchQueryBuilder.boolQuery() .must(buildBaseQueryV2(query, getOrCreateDefaultConfig())); - for (String configured : configuredTypes) { - fallback.mustNot(OpenSearchQueryBuilder.termQuery(ENTITY_TYPE_FIELD, configured)); + if (!configuredTypes.isEmpty()) { + fallback.mustNot(OpenSearchQueryBuilder.termsQuery(ENTITY_TYPE_FIELD, configuredTypes)); } return fallback.build(); }