From bd4d566d6b9f520f5fa82060fdd6a195b73db93e Mon Sep 17 00:00:00 2001
From: mohitdeuex <mohit.y@deuexsolutions.com>
Date: Thu, 30 Apr 2026 16:30:36 +0530
Subject: [PATCH 1/9] fix(search): align dataAsset aggregation counts with
 index=tableColumn totals
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Refactor the `dataAsset`/`all` alias query path so each entity-type bucket in
the aggregation matches what its dedicated index returns for the same query.
The composite asset config used to merge fields from every type, then apply
phrase/ngram-fuzzy semantics to all docs; column docs got semantics different
from `buildColumnSearchBuilderV2`, which is why the explore search bar's two
calls disagreed on the tableColumn count.

The new `buildAllAssetsSearchBuilderV2` builds a per-entity-type bool union:
each clause is `filter(entityType=<type>) must(<type's own query>)`. The
column branch reuses `buildColumnMultiMatchV2`; every other type goes through
`buildBaseQueryV2` with its dedicated config; an extra `should` covers asset
types in the `dataAsset` alias that lack a config (e.g. `glossary`,
`apiCollection`) using the default config.

Also tightens the column builder to `Operator.And` so multi-token queries
like `first_name` require every sub-token to match somewhere — fixes the
`om_analyzer`-driven over-match where the lenient `Or` + `min_should_match=0`
variant matched any column whose name contained just `first` or just `name`.

`ColumnSearchIndex.getFields()` gains `name.ngram`, `name.compound`,
`displayName.ngram`, `displayName.compound` so prefix queries like `fir`
still match column docs from both `index=tableColumn` and the dataAsset
bucket.

Includes `scripts/reproduce_column_agg_mismatch.sh` — multi-query probe that
exits non-zero on any divergence between `index=dataAsset` (aggregation
bucket) and `index=tableColumn` (total) for the same query.

Issue: open-metadata/openmetadata-collate#3851

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../ElasticSearchSourceBuilderFactory.java    | 120 +++++++++++++++--
 .../search/indexes/ColumnSearchIndex.java     |   4 +
 .../OpenSearchSourceBuilderFactory.java       | 125 ++++++++++++++++--
 scripts/reproduce_column_agg_mismatch.sh      |  99 ++++++++++++++
 4 files changed, 324 insertions(+), 24 deletions(-)
 create mode 100755 scripts/reproduce_column_agg_mismatch.sh
diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java
index ee62799dd9f2..0a3a2ae29210 100644
--- a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java
+++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java
@@ -18,9 +18,11 @@
 import es.co.elastic.clients.elasticsearch.core.search.Highlight;
 import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.stream.Collectors;
 import org.openmetadata.schema.api.search.Aggregation;
 import org.openmetadata.schema.api.search.AssetTypeConfiguration;
@@ -53,6 +55,7 @@ public class ElasticSearchSourceBuilderFactory
   private static final String MATCH_TYPE_STANDARD = "standard";
   private static final String INDEX_ALL = "all";
   private static final String INDEX_DATA_ASSET = "dataAsset";
+  private static final String ENTITY_TYPE_FIELD = "entityType";
   private static final String MINIMUM_SHOULD_MATCH = "2<70%";
   private static final float DEFAULT_TIE_BREAKER = 0.3f;
   private static final float DEFAULT_BOOST = 1.0f;
@@ -341,9 +344,9 @@ public ElasticSearchRequestBuilder getSearchSourceBuilderV2(
           indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations);
     }
 
-    if (indexName.equals("all") || indexName.equals("dataAsset")) {
-      return buildDataAssetSearchBuilderV2(
-          indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations);
+    if (indexName.equals(INDEX_ALL) || indexName.equals(INDEX_DATA_ASSET)) {
+      return buildAllAssetsSearchBuilderV2(
+          searchQuery, fromOffset, size, includeExplain, includeAggregations);
     }
 
     return switch (indexName) {
@@ -374,21 +377,33 @@ public ElasticSearchRequestBuilder buildColumnSearchBuilderV2(String query, int
       queryBuilder =
           es.co.elastic.clients.elasticsearch._types.query_dsl.Query.of(q -> q.matchAll(m -> m));
     } else {
-      Map<String, Float> fields = ColumnSearchIndex.getFields();
-      queryBuilder =
-          ElasticQueryBuilder.multiMatchQuery(
-              query,
-              fields,
-              es.co.elastic.clients.elasticsearch._types.query_dsl.TextQueryType.BestFields,
-              es.co.elastic.clients.elasticsearch._types.query_dsl.Operator.Or,
-              String.valueOf(DEFAULT_TIE_BREAKER),
-              "0");
+      queryBuilder = buildColumnMultiMatchV2(query);
     }
     es.co.elastic.clients.elasticsearch.core.search.Highlight hb =
         buildHighlightsV2(List.of("name", "displayName", "description"));
     return searchBuilderV2(queryBuilder, hb, from, size);
   }
 
+  /**
+   * Multi-match used both by {@code index=tableColumn} and the column-scoped should clause in the
+   * {@code dataAsset} composite query. Uses {@link
+   * es.co.elastic.clients.elasticsearch._types.query_dsl.Operator#And} so every sub-token produced
+   * by {@code om_analyzer} must hit some field. Without {@code And}, a query like {@code
+   * first_name} matches any column whose name contains just {@code first} or just {@code name},
+   * which both inflates the column index hits and creates the dataAsset/tableColumn count
+   * mismatch tracked in github issue #3851.
+   */
+  private es.co.elastic.clients.elasticsearch._types.query_dsl.Query buildColumnMultiMatchV2(
+      String query) {
+    return ElasticQueryBuilder.multiMatchQuery(
+        query,
+        ColumnSearchIndex.getFields(),
+        es.co.elastic.clients.elasticsearch._types.query_dsl.TextQueryType.BestFields,
+        es.co.elastic.clients.elasticsearch._types.query_dsl.Operator.And,
+        String.valueOf(DEFAULT_TIE_BREAKER),
+        "0");
+  }
+
   public ElasticSearchRequestBuilder buildServiceSearchBuilderV2(String query, int from, int size) {
     es.co.elastic.clients.elasticsearch._types.query_dsl.Query queryBuilder =
         buildSearchQueryBuilderV2(query, SearchIndex.getDefaultFields());
@@ -437,6 +452,87 @@ public ElasticSearchRequestBuilder buildDataAssetSearchBuilderV2(
     return searchRequestBuilder;
   }
 
+  /**
+   * Build a search source for the {@code all} / {@code dataAsset} alias as a per-entity-type
+   * union: each asset type contributes a clause built with its own configuration (column docs go
+   * through {@link #buildColumnMultiMatchV2(String)}, every other type through {@link
+   * #buildBaseQueryV2(String, AssetTypeConfiguration)}), filtered by {@code entityType=<type>}.
+   * Each entity-type bucket in the aggregation therefore equals what the dedicated index returns
+   * for the same query, by construction. Avoids the composite-config divergence behind
+   * github.com/open-metadata/openmetadata-collate#3851.
+   */
+  public ElasticSearchRequestBuilder buildAllAssetsSearchBuilderV2(
+      String query, int from, int size, boolean explain, boolean includeAggregations) {
+    AssetTypeConfiguration compositeConfig = buildCompositeAssetConfig(searchSettings);
+    es.co.elastic.clients.elasticsearch._types.query_dsl.Query baseQuery =
+        buildPerTypeUnionQueryV2(query);
+    es.co.elastic.clients.elasticsearch._types.query_dsl.Query finalQuery =
+        applyFunctionScoringV2(baseQuery, compositeConfig);
+    es.co.elastic.clients.elasticsearch.core.search.Highlight highlightBuilder =
+        buildHighlightingIfNeededV2(query, compositeConfig);
+
+    ElasticSearchRequestBuilder searchRequestBuilder =
+        createSearchSourceBuilderV2(finalQuery, from, size);
+    if (highlightBuilder != null) {
+      searchRequestBuilder.highlighter(highlightBuilder);
+    }
+    if (includeAggregations) {
+      addConfiguredAggregationsV2(searchRequestBuilder, compositeConfig);
+    }
+    searchRequestBuilder.explain(explain);
+    return searchRequestBuilder;
+  }
+
+  private es.co.elastic.clients.elasticsearch._types.query_dsl.Query buildPerTypeUnionQueryV2(
+      String query) {
+    if (isMatchAllQuery(query)) {
+      return ElasticQueryBuilder.boolQuery().must(ElasticQueryBuilder.matchAllQuery()).build();
+    }
+    ElasticQueryBuilder.BoolQueryBuilder union = ElasticQueryBuilder.boolQuery();
+    Set<String> configuredTypes = new HashSet<>();
+    for (AssetTypeConfiguration typeConfig : searchSettings.getAssetTypeConfigurations()) {
+      String assetType = typeConfig.getAssetType();
+      if (assetType == null || assetType.equals(INDEX_ALL)) {
+        continue;
+      }
+      configuredTypes.add(assetType);
+      union.should(buildAssetTypeClauseV2(query, assetType, typeConfig));
+    }
+    union.should(buildUnconfiguredAssetFallbackV2(query, configuredTypes));
+    union.minimumShouldMatch(1);
+    return union.build();
+  }
+
+  private static boolean isMatchAllQuery(String query) {
+    return query == null || query.trim().isEmpty() || query.trim().equals("*");
+  }
+
+  private es.co.elastic.clients.elasticsearch._types.query_dsl.Query buildAssetTypeClauseV2(
+      String query, String assetType, AssetTypeConfiguration typeConfig) {
+    es.co.elastic.clients.elasticsearch._types.query_dsl.Query inner =
+        Entity.TABLE_COLUMN.equals(assetType)
+            ? buildColumnMultiMatchV2(query)
+            : buildBaseQueryV2(query, typeConfig);
+    return ElasticQueryBuilder.boolQuery()
+        .filter(ElasticQueryBuilder.termQuery(ENTITY_TYPE_FIELD, assetType))
+        .must(inner)
+        .build();
+  }
+
+  /**
+   * Catches asset types that are part of the {@code dataAsset} alias but lack a dedicated entry in
+   * {@code searchSettings.assetTypeConfigurations} (e.g. {@code glossary}, {@code apiCollection}).
+   */
+  private es.co.elastic.clients.elasticsearch._types.query_dsl.Query
+      buildUnconfiguredAssetFallbackV2(String query, Set<String> configuredTypes) {
+    ElasticQueryBuilder.BoolQueryBuilder fallback =
+        ElasticQueryBuilder.boolQuery().must(buildBaseQueryV2(query, getOrCreateDefaultConfig()));
+    for (String configured : configuredTypes) {
+      fallback.mustNot(ElasticQueryBuilder.termQuery(ENTITY_TYPE_FIELD, configured));
+    }
+    return fallback.build();
+  }
+
   public ElasticSearchRequestBuilder buildAggregateSearchBuilderV2(
       String query, int from, int size) {
     return buildAggregateSearchBuilderV2(query, from, size, true);
diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/indexes/ColumnSearchIndex.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/indexes/ColumnSearchIndex.java
index 1b9a76525611..9e4ea9659488 100644
--- a/openmetadata-service/src/main/java/org/openmetadata/service/search/indexes/ColumnSearchIndex.java
+++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/indexes/ColumnSearchIndex.java
@@ -153,8 +153,12 @@ public static Map<String, Float> getFields() {
     Map<String, Float> fields = new HashMap<>();
     fields.put("name", 10.0f);
     fields.put("name.keyword", 20.0f);
+    fields.put("name.ngram", 1.0f);
+    fields.put("name.compound", 5.0f);
     fields.put("displayName", 7.0f);
     fields.put("displayName.keyword", 20.0f);
+    fields.put("displayName.ngram", 1.0f);
+    fields.put("displayName.compound", 4.0f);
     fields.put("fullyQualifiedName", 5.0f);
     fields.put("description", 2.0f);
     fields.put("dataType", 3.0f);
diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java
index 7deb93c7d7da..fdd7bd107981 100644
--- a/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java
+++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java
@@ -14,9 +14,11 @@
 
 import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.stream.Collectors;
 import lombok.extern.slf4j.Slf4j;
 import org.openmetadata.schema.api.search.Aggregation;
@@ -50,6 +52,7 @@ public class OpenSearchSourceBuilderFactory
   private static final String MATCH_TYPE_STANDARD = "standard";
   private static final String INDEX_ALL = "all";
   private static final String INDEX_DATA_ASSET = "dataAsset";
+  private static final String ENTITY_TYPE_FIELD = "entityType";
   private static final String MINIMUM_SHOULD_MATCH = "2<70%";
   private static final float DEFAULT_TIE_BREAKER = 0.3f;
   private static final float DEFAULT_BOOST = 1.0f;
@@ -322,9 +325,9 @@ public OpenSearchRequestBuilder getSearchSourceBuilderV2(
           indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations);
     }
 
-    if (indexName.equals("all") || indexName.equals("dataAsset")) {
-      return buildDataAssetSearchBuilderV2(
-          indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations);
+    if (indexName.equals(INDEX_ALL) || indexName.equals(INDEX_DATA_ASSET)) {
+      return buildAllAssetsSearchBuilderV2(
+          searchQuery, fromOffset, size, includeExplain, includeAggregations);
     }
 
     return switch (indexName) {
@@ -374,21 +377,33 @@ public OpenSearchRequestBuilder buildColumnSearchBuilderV2(String query, int fro
       queryBuilder =
           os.org.opensearch.client.opensearch._types.query_dsl.Query.of(q -> q.matchAll(m -> m));
     } else {
-      Map<String, Float> fields = ColumnSearchIndex.getFields();
-      queryBuilder =
-          OpenSearchQueryBuilder.multiMatchQuery(
-              query,
-              fields,
-              os.org.opensearch.client.opensearch._types.query_dsl.TextQueryType.BestFields,
-              os.org.opensearch.client.opensearch._types.query_dsl.Operator.Or,
-              String.valueOf(DEFAULT_TIE_BREAKER),
-              "0");
+      queryBuilder = buildColumnMultiMatchV2(query);
     }
     os.org.opensearch.client.opensearch.core.search.Highlight highlighter =
         buildHighlightsV2(List.of("name", "displayName", "description"));
     return searchBuilderV2(queryBuilder, highlighter, from, size);
   }
 
+  /**
+   * Multi-match used both by {@code index=tableColumn} and the column-scoped should clause in the
+   * {@code dataAsset} composite query. Uses {@link
+   * os.org.opensearch.client.opensearch._types.query_dsl.Operator#And} so every sub-token produced
+   * by {@code om_analyzer} (which splits on letter/digit/underscore boundaries) must hit some
+   * field. Without {@code And}, a query like {@code first_name} matches any column whose name
+   * contains just {@code first} or just {@code name}, which both inflates the column index hits
+   * and creates the dataAsset/tableColumn count mismatch tracked in github issue #3851.
+   */
+  private os.org.opensearch.client.opensearch._types.query_dsl.Query buildColumnMultiMatchV2(
+      String query) {
+    return OpenSearchQueryBuilder.multiMatchQuery(
+        query,
+        ColumnSearchIndex.getFields(),
+        os.org.opensearch.client.opensearch._types.query_dsl.TextQueryType.BestFields,
+        os.org.opensearch.client.opensearch._types.query_dsl.Operator.And,
+        String.valueOf(DEFAULT_TIE_BREAKER),
+        "0");
+  }
+
   public OpenSearchRequestBuilder buildServiceSearchBuilderV2(String query, int from, int size) {
     os.org.opensearch.client.opensearch._types.query_dsl.Query queryBuilder =
         buildSearchQueryBuilderV2(query, SearchIndex.getDefaultFields());
@@ -452,6 +467,92 @@ public OpenSearchRequestBuilder buildDataAssetSearchBuilderV2(
     return searchRequestBuilder;
   }
 
+  /**
+   * Build a search source for the {@code all} / {@code dataAsset} alias as a per-entity-type
+   * union: each asset type contributes a clause built with its own configuration (column docs go
+   * through {@link #buildColumnMultiMatchV2(String)}, every other type through {@link
+   * #buildBaseQueryV2(String, AssetTypeConfiguration)}), filtered by {@code entityType=<type>}.
+   * Each entity-type bucket in the aggregation therefore equals what the dedicated index returns
+   * for the same query, by construction. Avoids the composite-config divergence behind
+   * github.com/open-metadata/openmetadata-collate#3851.
+   */
+  public OpenSearchRequestBuilder buildAllAssetsSearchBuilderV2(
+      String query, int from, int size, boolean explain, boolean includeAggregations) {
+    AssetTypeConfiguration compositeConfig = getOrBuildCompositeConfig();
+    os.org.opensearch.client.opensearch._types.query_dsl.Query baseQuery =
+        buildPerTypeUnionQueryV2(query);
+    os.org.opensearch.client.opensearch._types.query_dsl.Query finalQuery =
+        applyFunctionScoringV2(baseQuery, compositeConfig);
+    os.org.opensearch.client.opensearch.core.search.Highlight highlightBuilder =
+        buildHighlightingIfNeededV2(query, compositeConfig);
+
+    OpenSearchRequestBuilder searchRequestBuilder =
+        createSearchSourceBuilderV2(finalQuery, from, size);
+    if (highlightBuilder != null) {
+      searchRequestBuilder.highlighter(highlightBuilder);
+    }
+    if (includeAggregations) {
+      addConfiguredAggregationsV2(searchRequestBuilder, compositeConfig);
+    }
+    searchRequestBuilder.explain(explain);
+    return searchRequestBuilder;
+  }
+
+  private os.org.opensearch.client.opensearch._types.query_dsl.Query buildPerTypeUnionQueryV2(
+      String query) {
+    if (isMatchAllQuery(query)) {
+      return OpenSearchQueryBuilder.boolQuery()
+          .must(OpenSearchQueryBuilder.matchAllQuery())
+          .build();
+    }
+    OpenSearchQueryBuilder.BoolQueryBuilder union = OpenSearchQueryBuilder.boolQuery();
+    Set<String> configuredTypes = new HashSet<>();
+    for (AssetTypeConfiguration typeConfig : searchSettings.getAssetTypeConfigurations()) {
+      String assetType = typeConfig.getAssetType();
+      if (assetType == null || assetType.equals(INDEX_ALL)) {
+        continue;
+      }
+      configuredTypes.add(assetType);
+      union.should(buildAssetTypeClauseV2(query, assetType, typeConfig));
+    }
+    union.should(buildUnconfiguredAssetFallbackV2(query, configuredTypes));
+    union.minimumShouldMatch(1);
+    return union.build();
+  }
+
+  private static boolean isMatchAllQuery(String query) {
+    return query == null || query.trim().isEmpty() || query.trim().equals("*");
+  }
+
+  private os.org.opensearch.client.opensearch._types.query_dsl.Query buildAssetTypeClauseV2(
+      String query, String assetType, AssetTypeConfiguration typeConfig) {
+    os.org.opensearch.client.opensearch._types.query_dsl.Query inner =
+        Entity.TABLE_COLUMN.equals(assetType)
+            ? buildColumnMultiMatchV2(query)
+            : buildBaseQueryV2(query, typeConfig);
+    return OpenSearchQueryBuilder.boolQuery()
+        .filter(OpenSearchQueryBuilder.termQuery(ENTITY_TYPE_FIELD, assetType))
+        .must(inner)
+        .build();
+  }
+
+  /**
+   * Catches asset types that are part of the {@code dataAsset} alias but lack a dedicated entry in
+   * {@code searchSettings.assetTypeConfigurations} (e.g. {@code glossary}, {@code apiCollection}).
+   * Without this, docs of those types would silently disappear from the dataAsset alias after the
+   * per-type-union refactor.
+   */
+  private os.org.opensearch.client.opensearch._types.query_dsl.Query
+      buildUnconfiguredAssetFallbackV2(String query, Set<String> configuredTypes) {
+    OpenSearchQueryBuilder.BoolQueryBuilder fallback =
+        OpenSearchQueryBuilder.boolQuery()
+            .must(buildBaseQueryV2(query, getOrCreateDefaultConfig()));
+    for (String configured : configuredTypes) {
+      fallback.mustNot(OpenSearchQueryBuilder.termQuery(ENTITY_TYPE_FIELD, configured));
+    }
+    return fallback.build();
+  }
+
   private os.org.opensearch.client.opensearch._types.query_dsl.Query buildBaseQueryV2(
       String query, AssetTypeConfiguration assetConfig) {
     if (query == null || query.trim().isEmpty() || query.trim().equals("*")) {
diff --git a/scripts/reproduce_column_agg_mismatch.sh b/scripts/reproduce_column_agg_mismatch.sh
new file mode 100755
index 000000000000..5601b13c6f1f
--- /dev/null
+++ b/scripts/reproduce_column_agg_mismatch.sh
@@ -0,0 +1,99 @@
+#!/usr/bin/env bash
+# Reproduces GitHub issue open-metadata/openmetadata-collate#3851:
+#
+#   When the explore search bar fires its two requests for the same query —
+#     a) GET /api/v1/search/query?q=...&index=dataAsset&size=0     (entity-type aggregation)
+#     b) GET /api/v1/search/query?q=...&index=tableColumn&size=N   (column hits)
+#   — the count for `tableColumn` reported by (a) does not match the total reported by (b).
+#
+# Root cause:
+#   `index=tableColumn` routes through `buildColumnSearchBuilderV2` (a lenient multi_match).
+#   `index=dataAsset`  routes through `buildDataAssetSearchBuilderV2` with the composite asset
+#   config, which applies stricter phrase / 2<70% matching to column docs. The two query shapes
+#   produce different result sets even though both indexes contain the same column documents.
+#
+# Related secondary bug (the "_ issue"):
+#   The `om_analyzer` splits identifiers like `first_name` on letter/digit/underscore boundaries
+#   into ["first","name"]. With `Operator.Or` + `min_should_match=0` (the lenient column builder),
+#   any column whose name contains ONLY "first" or ONLY "name" is also returned, drowning out the
+#   actual relevant matches.
+#
+# This script does NOT create test assets — it probes the running catalog with a fixed list of
+# queries that are known to expose the divergence on the bundled sample data, prints the counts
+# side-by-side, and exits non-zero if any query shows a mismatch. Asset-based isolation lives
+# in the regression test (ColumnSearchIndexIT).
+#
+# Usage:
+#   OM_HOST=http://localhost:8585 OM_TOKEN=<jwt> ./reproduce_column_agg_mismatch.sh [extra-query]
+
+set -euo pipefail
+
+HOST="${OM_HOST:-http://localhost:8585}"
+TOKEN="${OM_TOKEN:-eyJraWQiOiJHYjM4OWEtOWY3Ni1nZGpzLWE5MmotMDI0MmJrOTQzNTYiLCJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJzdWIiOiJhZG1pbiIsImlzQm90IjpmYWxzZSwiaXNzIjoib3Blbi1tZXRhZGF0YS5vcmciLCJpYXQiOjE2NjM5Mzg0NjIsImVtYWlsIjoiYWRtaW5Ab3Blbm1ldGFkYXRhLm9yZyJ9.tS8um_5DKu7HgzGBzS1VTA5uUjKWOCU0B_j08WXBiEC0mr0zNREkqVfwFDD-d24HlNEbrqioLsBuFRiwIWKc1m_ZlVQbG7P36RUxhuv2vbSp80FKyNM-Tj93FDzq91jsyNmsQhyNv_fNr3TXfzzSPjHt8Go0FMMP66weoKMgW2PbXlhVKwEuXUHyakLLzewm9UMeQaEiRzhiTMU3UkLXcKbYEJJvfNFcLwSl9W8JCO_l0Yj3ud-qt_nQYEZwqW6u5nfdQllN133iikV4fM5QZsMCnm8Rq1mvLR0y9bmJiD7fwM1tmJ791TUWqmKaTnP49U493VanKpUAfzIiOiIbhg}"
+
+QUERIES=(
+  "first_name"
+  "last_name"
+  "first name"
+  "shipping address"
+  "first name address"
+)
+[ "${1:-}" ] && QUERIES+=("$1")
+
+AUTH=(-H "Authorization: Bearer $TOKEN")
+
+probe() {
+  local q="$1"
+  local enc agg_file col_file
+  enc=$(python3 -c "import urllib.parse,sys;print(urllib.parse.quote(sys.argv[1]))" "$q")
+  agg_file=$(mktemp)
+  col_file=$(mktemp)
+  if ! curl -sf "${AUTH[@]}" \
+    "$HOST/api/v1/search/query?q=$enc&index=dataAsset&from=0&size=0&deleted=false&track_total_hits=true&fetch_source=false" \
+    -o "$agg_file" \
+    || ! curl -sf "${AUTH[@]}" \
+    "$HOST/api/v1/search/query?q=$enc&index=tableColumn&from=0&size=0&deleted=false&track_total_hits=true&fetch_source=false" \
+    -o "$col_file"; then
+    echo "  [ERR ] q=\"$q\" — search API call failed; is OM running at $HOST?"
+    rm -f "$agg_file" "$col_file"
+    return 2
+  fi
+  python3 - "$q" "$agg_file" "$col_file" <<'PY'
+import json, sys
+q, agg_path, col_path = sys.argv[1], sys.argv[2], sys.argv[3]
+with open(agg_path) as f: agg = json.load(f)
+with open(col_path) as f: col = json.load(f)
+agg_total = agg.get('hits', {}).get('total', {}).get('value', '?')
+col_total = col.get('hits', {}).get('total', {}).get('value', '?')
+buckets = agg.get('aggregations', {}).get('sterms#entityType', {}).get('buckets', [])
+agg_tc = next((b.get('doc_count', 0) for b in buckets if b.get('key') == 'tableColumn'), 0)
+status = "OK  " if agg_tc == col_total else "DIFF"
+print(f"  [{status}] q={q!r:<26} dataAsset.total={agg_total:<6} agg.tableColumnBucket={agg_tc:<6} tableColumn.total={col_total}")
+sys.exit(0 if agg_tc == col_total else 2)
+PY
+  local rc=$?
+  rm -f "$agg_file" "$col_file"
+  return $rc
+}
+
+echo "Host : $HOST"
+if ! curl -sf -o /dev/null "${AUTH[@]}" "$HOST/api/v1/system/version"; then
+  echo "ERROR: $HOST is not reachable (or auth is wrong). Start OM and retry."
+  exit 4
+fi
+echo "Probing ${#QUERIES[@]} queries; OK = bucket count matches index=tableColumn total."
+echo "------------------------------------------------------------------"
+fail=0
+for q in "${QUERIES[@]}"; do
+  probe "$q" || fail=1
+done
+echo "------------------------------------------------------------------"
+
+ENC0=$(python3 -c "import urllib.parse,sys;print(urllib.parse.quote(sys.argv[1]))" "${QUERIES[0]}")
+echo
+echo "UI verification:"
+echo "  1. Open $HOST/explore/tableColumn?search=$ENC0"
+echo "  2. Note the count badge on the tableColumn tab vs the entity-type aggregation panel."
+echo "     Before the fix they diverge; after the fix they should match."
+
+exit "$fail"

From f264c55762f439a36f63f94858ab83380f6b1752 Mon Sep 17 00:00:00 2001
From: mohitdeuex <mohit.y@deuexsolutions.com>
Date: Thu, 30 Apr 2026 16:35:54 +0530
Subject: [PATCH 2/9] test(search): add regression tests for
 dataAsset/tableColumn parity
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds integration tests under ColumnSearchIndexIT that pin the behavior of the
fix in PR #27846:

- testDataAssetTableColumnAggregationMatchesTableColumnTotal: dataAsset
  bucket count for tableColumn equals index=tableColumn total for a
  multi-token query against seeded columns.
- testColumnQueryRequiresAllSubtokensToMatch: query "<tag>_first_name" must
  match the seeded "<tag>_first_name" column but NOT "<tag>_first_id" — pins
  the Operator.And fix that closed the om_analyzer sub-token over-match.
- testDataAssetTableBucketMatchesTableIndexTotal: same parity guarantee for
  the "table" entity-type bucket, exercising the per-type-union path for a
  non-column type.
- testPrefixQueryMatchesViaNgramOnBothPaths: short prefix queries (e.g. the
  first few chars of the seeded tag) must match seeded columns via
  name.ngram and stay in parity across both endpoints.
- testUnconfiguredAssetTypeFallbackMatchesViaDataAsset: a Glossary doc (an
  asset type without an explicit searchSettings.assetTypeConfigurations
  entry) must still surface via index=dataAsset, exercising the fallback
  should clause in buildPerTypeUnionQueryV2.

Issue: open-metadata/openmetadata-collate#3851

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../it/tests/ColumnSearchIndexIT.java         | 315 ++++++++++++++++++
 1 file changed, 315 insertions(+)

diff --git a/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java b/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java
index 9ca30e3febdc..25da9ea227ff 100644
--- a/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java
+++ b/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java
@@ -20,8 +20,11 @@
 
 import com.fasterxml.jackson.databind.JsonNode;
 import com.fasterxml.jackson.databind.ObjectMapper;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
 import java.util.concurrent.TimeUnit;
+import org.awaitility.Awaitility;
 import org.junit.jupiter.api.DisplayName;
 import org.junit.jupiter.api.Nested;
 import org.junit.jupiter.api.Test;
@@ -33,15 +36,18 @@
 import org.openmetadata.it.util.TestNamespaceExtension;
 import org.openmetadata.schema.api.data.CreateDatabase;
 import org.openmetadata.schema.api.data.CreateDatabaseSchema;
+import org.openmetadata.schema.api.data.CreateGlossary;
 import org.openmetadata.schema.api.data.CreateTable;
 import org.openmetadata.schema.entity.data.Database;
 import org.openmetadata.schema.entity.data.DatabaseSchema;
+import org.openmetadata.schema.entity.data.Glossary;
 import org.openmetadata.schema.entity.data.Table;
 import org.openmetadata.schema.entity.services.DatabaseService;
 import org.openmetadata.schema.type.Column;
 import org.openmetadata.schema.type.ColumnDataType;
 import org.openmetadata.sdk.client.OpenMetadataClient;
 import org.openmetadata.sdk.fluent.DatabaseServices;
+import org.openmetadata.service.Entity;
 
 /**
  * Integration tests for column search indexing during table reindexing. Verifies:
@@ -353,6 +359,268 @@ void testColumnFilterByDatabase(TestNamespace ns) throws Exception {
     }
   }
 
+  @Nested
+  @DisplayName("DataAsset/TableColumn Aggregation Parity Tests")
+  @Execution(ExecutionMode.CONCURRENT)
+  class DataAssetColumnAggregationTests {
+
+    /**
+     * Regression for github.com/open-metadata/openmetadata-collate/issues/3851. The UI fires two
+     * requests when the user types a multi-word query: {@code index=dataAsset&size=0} for the
+     * entity-type aggregation and {@code index=tableColumn} for the column hits. They route through
+     * different builders in {@code OpenSearchSourceBuilderFactory}: {@code tableColumn} uses the
+     * dedicated lenient column builder while {@code dataAsset} uses the composite asset config with
+     * stricter phrase/{@code 2<70%} matching. Without the fix, the {@code tableColumn} bucket in the
+     * {@code dataAsset} aggregation under-counts the column index hits.
+     */
+    @Test
+    @DisplayName(
+        "Aggregation bucket for tableColumn under dataAsset must match index=tableColumn total")
+    void testDataAssetTableColumnAggregationMatchesTableColumnTotal(TestNamespace ns)
+        throws Exception {
+      OpenMetadataClient client = SdkClients.adminClient();
+
+      String tag = ns.shortPrefix();
+      Table table = createTableWithMultiTokenColumns(ns, "agg_parity_" + tag, tag);
+      assertNotNull(table);
+
+      String multiTokenQuery = tag + "_first " + tag + "_address";
+
+      Awaitility.await()
+          .atMost(90, TimeUnit.SECONDS)
+          .pollInterval(500, TimeUnit.MILLISECONDS)
+          .until(
+              () -> {
+                String r =
+                    client
+                        .search()
+                        .query(multiTokenQuery)
+                        .index("tableColumn")
+                        .size(0)
+                        .deleted(false)
+                        .execute();
+                JsonNode root = OBJECT_MAPPER.readTree(r);
+                long total = root.path("hits").path("total").path("value").asLong(-1);
+                return total >= 3;
+              });
+
+      String columnResponse =
+          client
+              .search()
+              .query(multiTokenQuery)
+              .index("tableColumn")
+              .size(0)
+              .deleted(false)
+              .execute();
+      long columnTotal =
+          OBJECT_MAPPER.readTree(columnResponse).path("hits").path("total").path("value").asLong();
+
+      String aggResponse =
+          client
+              .search()
+              .query(multiTokenQuery)
+              .index("dataAsset")
+              .size(0)
+              .deleted(false)
+              .execute();
+      JsonNode aggBuckets =
+          OBJECT_MAPPER
+              .readTree(aggResponse)
+              .path("aggregations")
+              .path("sterms#entityType")
+              .path("buckets");
+      long aggColumnCount = 0;
+      for (JsonNode bucket : aggBuckets) {
+        if ("tableColumn".equals(bucket.path("key").asText())) {
+          aggColumnCount = bucket.path("doc_count").asLong();
+          break;
+        }
+      }
+
+      assertEquals(
+          columnTotal,
+          aggColumnCount,
+          "dataAsset aggregation tableColumn bucket ("
+              + aggColumnCount
+              + ") must match index=tableColumn total ("
+              + columnTotal
+              + ") for query \""
+              + multiTokenQuery
+              + "\"");
+    }
+
+    /**
+     * Guards against the {@code _} over-match: the {@code om_analyzer} splits {@code first_name}
+     * into {@code [first, name]}; with the old {@code Operator.Or} + {@code min_should_match=0}
+     * column builder, a column called {@code <tag>_first_id} matched a query of {@code
+     * <tag>_first_name} because the single token {@code first} was enough. The fix moves the
+     * column builder to {@code Operator.And}, so every sub-token must match.
+     */
+    @Test
+    @DisplayName("Column query for first_name must not match a column called first_id")
+    void testColumnQueryRequiresAllSubtokensToMatch(TestNamespace ns) throws Exception {
+      OpenMetadataClient client = SdkClients.adminClient();
+      String tag = ns.shortPrefix();
+      Table table = createTableWithMultiTokenColumns(ns, "subtoken_" + tag, tag);
+      assertNotNull(table);
+
+      String firstNameColumn = tag + "_first_name";
+      String firstIdColumn = tag + "_first_id";
+
+      Awaitility.await()
+          .atMost(90, TimeUnit.SECONDS)
+          .pollInterval(500, TimeUnit.MILLISECONDS)
+          .until(() -> hitsForColumnQuery(client, firstNameColumn).contains(firstNameColumn));
+
+      Set<String> hits = hitsForColumnQuery(client, firstNameColumn);
+      assertTrue(
+          hits.contains(firstNameColumn),
+          "Query " + firstNameColumn + " must match the column with the same name; got " + hits);
+      assertFalse(
+          hits.contains(firstIdColumn),
+          "Query "
+              + firstNameColumn
+              + " must not match column "
+              + firstIdColumn
+              + " (only the 'first' sub-token overlaps); got "
+              + hits);
+    }
+
+    /**
+     * Same parity guarantee that {@link #testDataAssetTableColumnAggregationMatchesTableColumnTotal}
+     * pins for {@code tableColumn}, but for the {@code table} bucket. The per-type-union path must
+     * make every entity-type bucket equal to what the dedicated index returns.
+     */
+    @Test
+    @DisplayName("Aggregation bucket for table under dataAsset must match index=table total")
+    void testDataAssetTableBucketMatchesTableIndexTotal(TestNamespace ns) throws Exception {
+      OpenMetadataClient client = SdkClients.adminClient();
+      String tag = ns.shortPrefix();
+      String tableQueryTag = "tblparity" + tag;
+      Table table = createTableWithMultiTokenColumns(ns, tableQueryTag, tag);
+      assertNotNull(table);
+
+      Awaitility.await()
+          .atMost(90, TimeUnit.SECONDS)
+          .pollInterval(500, TimeUnit.MILLISECONDS)
+          .until(() -> totalHitsForIndex(client, table.getName(), "table") >= 1);
+
+      long tableTotal = totalHitsForIndex(client, table.getName(), "table");
+      long aggTableBucket = bucketCountFromDataAsset(client, table.getName(), "table");
+
+      assertEquals(
+          tableTotal,
+          aggTableBucket,
+          "dataAsset aggregation table bucket must equal index=table total for query "
+              + table.getName());
+    }
+
+    /**
+     * Pins prefix-style matching so it stays aligned across the two paths after the per-type-union
+     * refactor. Production behavior for short queries like {@code fir} relies on {@code name.ngram}
+     * and {@code name.compound}, which are now part of {@link
+     * org.openmetadata.service.search.indexes.ColumnSearchIndex#getFields()}; the dataAsset bucket
+     * for {@code tableColumn} must produce the same total as {@code index=tableColumn} for the
+     * same prefix query, and the seeded column must be in both result sets.
+     */
+    @Test
+    @DisplayName("Prefix queries match via ngram and stay in parity across both paths")
+    void testPrefixQueryMatchesViaNgramOnBothPaths(TestNamespace ns) throws Exception {
+      OpenMetadataClient client = SdkClients.adminClient();
+      String tag = ns.shortPrefix();
+      Table table = createTableWithMultiTokenColumns(ns, "ngram_" + tag, tag);
+      assertNotNull(table);
+
+      String prefixQuery = tag.substring(0, Math.min(4, tag.length()));
+
+      Awaitility.await()
+          .atMost(90, TimeUnit.SECONDS)
+          .pollInterval(500, TimeUnit.MILLISECONDS)
+          .until(() -> totalHitsForIndex(client, prefixQuery, "tableColumn") >= 5);
+
+      long columnTotal = totalHitsForIndex(client, prefixQuery, "tableColumn");
+      long aggColumnBucket = bucketCountFromDataAsset(client, prefixQuery, Entity.TABLE_COLUMN);
+
+      assertTrue(
+          columnTotal > 0,
+          "Prefix query " + prefixQuery + " should match seeded columns via name.ngram");
+      assertEquals(
+          columnTotal,
+          aggColumnBucket,
+          "dataAsset tableColumn bucket ("
+              + aggColumnBucket
+              + ") must match index=tableColumn total ("
+              + columnTotal
+              + ") for prefix query "
+              + prefixQuery);
+    }
+
+    /**
+     * Asset types that live in the {@code dataAsset} alias but lack a {@code
+     * searchSettings.assetTypeConfigurations} entry (e.g. {@code glossary}) used to be matched via
+     * the composite-config field merge. The per-type-union path adds a fallback {@code should}
+     * clause for these; this test pins that a glossary doc still appears in {@code
+     * index=dataAsset} hits when its name matches the query.
+     */
+    @Test
+    @DisplayName("Asset types without dedicated config (glossary) still match via dataAsset alias")
+    void testUnconfiguredAssetTypeFallbackMatchesViaDataAsset(TestNamespace ns) throws Exception {
+      OpenMetadataClient client = SdkClients.adminClient();
+      String name = ns.prefix("glossary_unconfigured_fallback");
+      CreateGlossary req = new CreateGlossary().withName(name).withDescription(name);
+      Glossary glossary = client.glossaries().create(req);
+      assertNotNull(glossary);
+
+      Awaitility.await()
+          .atMost(90, TimeUnit.SECONDS)
+          .pollInterval(500, TimeUnit.MILLISECONDS)
+          .until(() -> totalHitsForIndex(client, name, "dataAsset") >= 1);
+
+      long total = totalHitsForIndex(client, name, "dataAsset");
+      assertTrue(
+          total >= 1,
+          "Glossary doc with name "
+              + name
+              + " must be reachable via index=dataAsset (fallback clause for unconfigured types)");
+    }
+
+    private Set<String> hitsForColumnQuery(OpenMetadataClient client, String query)
+        throws Exception {
+      String response =
+          client.search().query(query).index("tableColumn").size(50).deleted(false).execute();
+      JsonNode hits = OBJECT_MAPPER.readTree(response).path("hits").path("hits");
+      Set<String> names = new HashSet<>();
+      for (JsonNode hit : hits) {
+        names.add(hit.path("_source").path("name").asText());
+      }
+      return names;
+    }
+
+    private long totalHitsForIndex(OpenMetadataClient client, String query, String index)
+        throws Exception {
+      String response = client.search().query(query).index(index).size(0).deleted(false).execute();
+      return OBJECT_MAPPER.readTree(response).path("hits").path("total").path("value").asLong();
+    }
+
+    private long bucketCountFromDataAsset(
+        OpenMetadataClient client, String query, String entityType) throws Exception {
+      String response =
+          client.search().query(query).index("dataAsset").size(0).deleted(false).execute();
+      JsonNode buckets =
+          OBJECT_MAPPER
+              .readTree(response)
+              .path("aggregations")
+              .path("sterms#entityType")
+              .path("buckets");
+      for (JsonNode bucket : buckets) {
+        if (entityType.equals(bucket.path("key").asText())) {
+          return bucket.path("doc_count").asLong();
+        }
+      }
+      return 0L;
+    }
+  }
+
   @Nested
   @DisplayName("Nested Column Tests")
   @Execution(ExecutionMode.CONCURRENT)
@@ -493,6 +761,53 @@ private Table createTableWithColumns(TestNamespace ns, String baseName) {
     return SdkClients.adminClient().tables().create(tableRequest);
   }
 
+  private Table createTableWithMultiTokenColumns(TestNamespace ns, String baseName, String tag) {
+    String shortId = ns.shortPrefix();
+
+    org.openmetadata.schema.services.connections.database.PostgresConnection conn =
+        DatabaseServices.postgresConnection().hostPort("localhost:5432").username("test").build();
+
+    DatabaseService dbService =
+        DatabaseServices.builder()
+            .name("agg_svc_" + shortId + "_" + baseName)
+            .connection(conn)
+            .description("Test service for dataAsset/tableColumn aggregation parity")
+            .create();
+
+    CreateDatabase dbReq = new CreateDatabase();
+    dbReq.setName("agg_db_" + shortId + "_" + baseName);
+    dbReq.setService(dbService.getFullyQualifiedName());
+    Database database = SdkClients.adminClient().databases().create(dbReq);
+
+    CreateDatabaseSchema schemaReq = new CreateDatabaseSchema();
+    schemaReq.setName("agg_schema_" + shortId + "_" + baseName);
+    schemaReq.setDatabase(database.getFullyQualifiedName());
+    DatabaseSchema schema = SdkClients.adminClient().databaseSchemas().create(schemaReq);
+
+    CreateTable tableRequest = new CreateTable();
+    tableRequest.setName(ns.prefix(baseName));
+    tableRequest.setDatabaseSchema(schema.getFullyQualifiedName());
+    tableRequest.setColumns(
+        List.of(
+            new Column()
+                .withName(tag + "_first_name")
+                .withDataType(ColumnDataType.VARCHAR)
+                .withDataLength(255)
+                .withDescription("First name"),
+            new Column()
+                .withName(tag + "_last_name")
+                .withDataType(ColumnDataType.VARCHAR)
+                .withDataLength(255)
+                .withDescription("Last name"),
+            new Column()
+                .withName(tag + "_address")
+                .withDataType(ColumnDataType.VARCHAR)
+                .withDataLength(512)
+                .withDescription("Postal address")));
+
+    return SdkClients.adminClient().tables().create(tableRequest);
+  }
+
   private Table createTableWithNestedColumns(TestNamespace ns, String baseName) {
     String shortId = ns.shortPrefix();
 

From b2d40ff17131687a84e4e1d8f66c2943d7cae9da Mon Sep 17 00:00:00 2001
From: mohitdeuex <mohit.y@deuexsolutions.com>
Date: Thu, 30 Apr 2026 16:39:00 +0530
Subject: [PATCH 3/9] fix(search): align dataAsset aggregation counts with
 index=tableColumn totals
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Refactor the `dataAsset`/`all` alias query path so each entity-type bucket in
the aggregation matches what its dedicated index returns for the same query.
The composite asset config used to merge fields from every type, then apply
phrase/ngram-fuzzy semantics to all docs; column docs got semantics different
from `buildColumnSearchBuilderV2`, which is why the explore search bar's two
calls disagreed on the tableColumn count.

The new `buildAllAssetsSearchBuilderV2` builds a per-entity-type bool union:
each clause is `filter(entityType=<type>) must(<type's own query>)`. The
column branch reuses `buildColumnMultiMatchV2`; every other type goes through
`buildBaseQueryV2` with its dedicated config; an extra `should` covers asset
types in the `dataAsset` alias that lack a config (e.g. `glossary`,
`apiCollection`) using the default config.

Also tightens the column builder to `Operator.And` so multi-token queries
like `first_name` require every sub-token to match somewhere — fixes the
`om_analyzer`-driven over-match where the lenient `Or` + `min_should_match=0`
variant matched any column whose name contained just `first` or just `name`.

`ColumnSearchIndex.getFields()` gains `name.ngram`, `name.compound`,
`displayName.ngram`, `displayName.compound` so prefix queries like `fir`
still match column docs from both `index=tableColumn` and the dataAsset
bucket.

Issue: open-metadata/openmetadata-collate#3851

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../ElasticSearchSourceBuilderFactory.java    | 120 +++++++++++++++--
 .../search/indexes/ColumnSearchIndex.java     |   4 +
 .../OpenSearchSourceBuilderFactory.java       | 125 ++++++++++++++++--
 3 files changed, 225 insertions(+), 24 deletions(-)

diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java
index ee62799dd9f2..0a3a2ae29210 100644
--- a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java
+++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java
@@ -18,9 +18,11 @@
 import es.co.elastic.clients.elasticsearch.core.search.Highlight;
 import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.stream.Collectors;
 import org.openmetadata.schema.api.search.Aggregation;
 import org.openmetadata.schema.api.search.AssetTypeConfiguration;
@@ -53,6 +55,7 @@ public class ElasticSearchSourceBuilderFactory
   private static final String MATCH_TYPE_STANDARD = "standard";
   private static final String INDEX_ALL = "all";
   private static final String INDEX_DATA_ASSET = "dataAsset";
+  private static final String ENTITY_TYPE_FIELD = "entityType";
   private static final String MINIMUM_SHOULD_MATCH = "2<70%";
   private static final float DEFAULT_TIE_BREAKER = 0.3f;
   private static final float DEFAULT_BOOST = 1.0f;
@@ -341,9 +344,9 @@ public ElasticSearchRequestBuilder getSearchSourceBuilderV2(
           indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations);
     }
 
-    if (indexName.equals("all") || indexName.equals("dataAsset")) {
-      return buildDataAssetSearchBuilderV2(
-          indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations);
+    if (indexName.equals(INDEX_ALL) || indexName.equals(INDEX_DATA_ASSET)) {
+      return buildAllAssetsSearchBuilderV2(
+          searchQuery, fromOffset, size, includeExplain, includeAggregations);
     }
 
     return switch (indexName) {
@@ -374,21 +377,33 @@ public ElasticSearchRequestBuilder buildColumnSearchBuilderV2(String query, int
       queryBuilder =
           es.co.elastic.clients.elasticsearch._types.query_dsl.Query.of(q -> q.matchAll(m -> m));
     } else {
-      Map<String, Float> fields = ColumnSearchIndex.getFields();
-      queryBuilder =
-          ElasticQueryBuilder.multiMatchQuery(
-              query,
-              fields,
-              es.co.elastic.clients.elasticsearch._types.query_dsl.TextQueryType.BestFields,
-              es.co.elastic.clients.elasticsearch._types.query_dsl.Operator.Or,
-              String.valueOf(DEFAULT_TIE_BREAKER),
-              "0");
+      queryBuilder = buildColumnMultiMatchV2(query);
     }
     es.co.elastic.clients.elasticsearch.core.search.Highlight hb =
         buildHighlightsV2(List.of("name", "displayName", "description"));
     return searchBuilderV2(queryBuilder, hb, from, size);
   }
 
+  /**
+   * Multi-match used both by {@code index=tableColumn} and the column-scoped should clause in the
+   * {@code dataAsset} composite query. Uses {@link
+   * es.co.elastic.clients.elasticsearch._types.query_dsl.Operator#And} so every sub-token produced
+   * by {@code om_analyzer} must hit some field. Without {@code And}, a query like {@code
+   * first_name} matches any column whose name contains just {@code first} or just {@code name},
+   * which both inflates the column index hits and creates the dataAsset/tableColumn count
+   * mismatch tracked in github issue #3851.
+   */
+  private es.co.elastic.clients.elasticsearch._types.query_dsl.Query buildColumnMultiMatchV2(
+      String query) {
+    return ElasticQueryBuilder.multiMatchQuery(
+        query,
+        ColumnSearchIndex.getFields(),
+        es.co.elastic.clients.elasticsearch._types.query_dsl.TextQueryType.BestFields,
+        es.co.elastic.clients.elasticsearch._types.query_dsl.Operator.And,
+        String.valueOf(DEFAULT_TIE_BREAKER),
+        "0");
+  }
+
   public ElasticSearchRequestBuilder buildServiceSearchBuilderV2(String query, int from, int size) {
     es.co.elastic.clients.elasticsearch._types.query_dsl.Query queryBuilder =
         buildSearchQueryBuilderV2(query, SearchIndex.getDefaultFields());
@@ -437,6 +452,87 @@ public ElasticSearchRequestBuilder buildDataAssetSearchBuilderV2(
     return searchRequestBuilder;
   }
 
+  /**
+   * Build a search source for the {@code all} / {@code dataAsset} alias as a per-entity-type
+   * union: each asset type contributes a clause built with its own configuration (column docs go
+   * through {@link #buildColumnMultiMatchV2(String)}, every other type through {@link
+   * #buildBaseQueryV2(String, AssetTypeConfiguration)}), filtered by {@code entityType=<type>}.
+   * Each entity-type bucket in the aggregation therefore equals what the dedicated index returns
+   * for the same query, by construction. Avoids the composite-config divergence behind
+   * github.com/open-metadata/openmetadata-collate#3851.
+   */
+  public ElasticSearchRequestBuilder buildAllAssetsSearchBuilderV2(
+      String query, int from, int size, boolean explain, boolean includeAggregations) {
+    AssetTypeConfiguration compositeConfig = buildCompositeAssetConfig(searchSettings);
+    es.co.elastic.clients.elasticsearch._types.query_dsl.Query baseQuery =
+        buildPerTypeUnionQueryV2(query);
+    es.co.elastic.clients.elasticsearch._types.query_dsl.Query finalQuery =
+        applyFunctionScoringV2(baseQuery, compositeConfig);
+    es.co.elastic.clients.elasticsearch.core.search.Highlight highlightBuilder =
+        buildHighlightingIfNeededV2(query, compositeConfig);
+
+    ElasticSearchRequestBuilder searchRequestBuilder =
+        createSearchSourceBuilderV2(finalQuery, from, size);
+    if (highlightBuilder != null) {
+      searchRequestBuilder.highlighter(highlightBuilder);
+    }
+    if (includeAggregations) {
+      addConfiguredAggregationsV2(searchRequestBuilder, compositeConfig);
+    }
+    searchRequestBuilder.explain(explain);
+    return searchRequestBuilder;
+  }
+
+  private es.co.elastic.clients.elasticsearch._types.query_dsl.Query buildPerTypeUnionQueryV2(
+      String query) {
+    if (isMatchAllQuery(query)) {
+      return ElasticQueryBuilder.boolQuery().must(ElasticQueryBuilder.matchAllQuery()).build();
+    }
+    ElasticQueryBuilder.BoolQueryBuilder union = ElasticQueryBuilder.boolQuery();
+    Set<String> configuredTypes = new HashSet<>();
+    for (AssetTypeConfiguration typeConfig : searchSettings.getAssetTypeConfigurations()) {
+      String assetType = typeConfig.getAssetType();
+      if (assetType == null || assetType.equals(INDEX_ALL)) {
+        continue;
+      }
+      configuredTypes.add(assetType);
+      union.should(buildAssetTypeClauseV2(query, assetType, typeConfig));
+    }
+    union.should(buildUnconfiguredAssetFallbackV2(query, configuredTypes));
+    union.minimumShouldMatch(1);
+    return union.build();
+  }
+
+  private static boolean isMatchAllQuery(String query) {
+    return query == null || query.trim().isEmpty() || query.trim().equals("*");
+  }
+
+  private es.co.elastic.clients.elasticsearch._types.query_dsl.Query buildAssetTypeClauseV2(
+      String query, String assetType, AssetTypeConfiguration typeConfig) {
+    es.co.elastic.clients.elasticsearch._types.query_dsl.Query inner =
+        Entity.TABLE_COLUMN.equals(assetType)
+            ? buildColumnMultiMatchV2(query)
+            : buildBaseQueryV2(query, typeConfig);
+    return ElasticQueryBuilder.boolQuery()
+        .filter(ElasticQueryBuilder.termQuery(ENTITY_TYPE_FIELD, assetType))
+        .must(inner)
+        .build();
+  }
+
+  /**
+   * Catches asset types that are part of the {@code dataAsset} alias but lack a dedicated entry in
+   * {@code searchSettings.assetTypeConfigurations} (e.g. {@code glossary}, {@code apiCollection}).
+   */
+  private es.co.elastic.clients.elasticsearch._types.query_dsl.Query
+      buildUnconfiguredAssetFallbackV2(String query, Set<String> configuredTypes) {
+    ElasticQueryBuilder.BoolQueryBuilder fallback =
+        ElasticQueryBuilder.boolQuery().must(buildBaseQueryV2(query, getOrCreateDefaultConfig()));
+    for (String configured : configuredTypes) {
+      fallback.mustNot(ElasticQueryBuilder.termQuery(ENTITY_TYPE_FIELD, configured));
+    }
+    return fallback.build();
+  }
+
   public ElasticSearchRequestBuilder buildAggregateSearchBuilderV2(
       String query, int from, int size) {
     return buildAggregateSearchBuilderV2(query, from, size, true);
diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/indexes/ColumnSearchIndex.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/indexes/ColumnSearchIndex.java
index 1b9a76525611..9e4ea9659488 100644
--- a/openmetadata-service/src/main/java/org/openmetadata/service/search/indexes/ColumnSearchIndex.java
+++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/indexes/ColumnSearchIndex.java
@@ -153,8 +153,12 @@ public static Map<String, Float> getFields() {
     Map<String, Float> fields = new HashMap<>();
     fields.put("name", 10.0f);
     fields.put("name.keyword", 20.0f);
+    fields.put("name.ngram", 1.0f);
+    fields.put("name.compound", 5.0f);
     fields.put("displayName", 7.0f);
     fields.put("displayName.keyword", 20.0f);
+    fields.put("displayName.ngram", 1.0f);
+    fields.put("displayName.compound", 4.0f);
     fields.put("fullyQualifiedName", 5.0f);
     fields.put("description", 2.0f);
     fields.put("dataType", 3.0f);
diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java
index 7deb93c7d7da..fdd7bd107981 100644
--- a/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java
+++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java
@@ -14,9 +14,11 @@
 
 import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.stream.Collectors;
 import lombok.extern.slf4j.Slf4j;
 import org.openmetadata.schema.api.search.Aggregation;
@@ -50,6 +52,7 @@ public class OpenSearchSourceBuilderFactory
   private static final String MATCH_TYPE_STANDARD = "standard";
   private static final String INDEX_ALL = "all";
   private static final String INDEX_DATA_ASSET = "dataAsset";
+  private static final String ENTITY_TYPE_FIELD = "entityType";
   private static final String MINIMUM_SHOULD_MATCH = "2<70%";
   private static final float DEFAULT_TIE_BREAKER = 0.3f;
   private static final float DEFAULT_BOOST = 1.0f;
@@ -322,9 +325,9 @@ public OpenSearchRequestBuilder getSearchSourceBuilderV2(
           indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations);
     }
 
-    if (indexName.equals("all") || indexName.equals("dataAsset")) {
-      return buildDataAssetSearchBuilderV2(
-          indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations);
+    if (indexName.equals(INDEX_ALL) || indexName.equals(INDEX_DATA_ASSET)) {
+      return buildAllAssetsSearchBuilderV2(
+          searchQuery, fromOffset, size, includeExplain, includeAggregations);
     }
 
     return switch (indexName) {
@@ -374,21 +377,33 @@ public OpenSearchRequestBuilder buildColumnSearchBuilderV2(String query, int fro
       queryBuilder =
           os.org.opensearch.client.opensearch._types.query_dsl.Query.of(q -> q.matchAll(m -> m));
     } else {
-      Map<String, Float> fields = ColumnSearchIndex.getFields();
-      queryBuilder =
-          OpenSearchQueryBuilder.multiMatchQuery(
-              query,
-              fields,
-              os.org.opensearch.client.opensearch._types.query_dsl.TextQueryType.BestFields,
-              os.org.opensearch.client.opensearch._types.query_dsl.Operator.Or,
-              String.valueOf(DEFAULT_TIE_BREAKER),
-              "0");
+      queryBuilder = buildColumnMultiMatchV2(query);
     }
     os.org.opensearch.client.opensearch.core.search.Highlight highlighter =
         buildHighlightsV2(List.of("name", "displayName", "description"));
     return searchBuilderV2(queryBuilder, highlighter, from, size);
   }
 
+  /**
+   * Multi-match used both by {@code index=tableColumn} and the column-scoped should clause in the
+   * {@code dataAsset} composite query. Uses {@link
+   * os.org.opensearch.client.opensearch._types.query_dsl.Operator#And} so every sub-token produced
+   * by {@code om_analyzer} (which splits on letter/digit/underscore boundaries) must hit some
+   * field. Without {@code And}, a query like {@code first_name} matches any column whose name
+   * contains just {@code first} or just {@code name}, which both inflates the column index hits
+   * and creates the dataAsset/tableColumn count mismatch tracked in github issue #3851.
+   */
+  private os.org.opensearch.client.opensearch._types.query_dsl.Query buildColumnMultiMatchV2(
+      String query) {
+    return OpenSearchQueryBuilder.multiMatchQuery(
+        query,
+        ColumnSearchIndex.getFields(),
+        os.org.opensearch.client.opensearch._types.query_dsl.TextQueryType.BestFields,
+        os.org.opensearch.client.opensearch._types.query_dsl.Operator.And,
+        String.valueOf(DEFAULT_TIE_BREAKER),
+        "0");
+  }
+
   public OpenSearchRequestBuilder buildServiceSearchBuilderV2(String query, int from, int size) {
     os.org.opensearch.client.opensearch._types.query_dsl.Query queryBuilder =
         buildSearchQueryBuilderV2(query, SearchIndex.getDefaultFields());
@@ -452,6 +467,92 @@ public OpenSearchRequestBuilder buildDataAssetSearchBuilderV2(
     return searchRequestBuilder;
   }
 
+  /**
+   * Build a search source for the {@code all} / {@code dataAsset} alias as a per-entity-type
+   * union: each asset type contributes a clause built with its own configuration (column docs go
+   * through {@link #buildColumnMultiMatchV2(String)}, every other type through {@link
+   * #buildBaseQueryV2(String, AssetTypeConfiguration)}), filtered by {@code entityType=<type>}.
+   * Each entity-type bucket in the aggregation therefore equals what the dedicated index returns
+   * for the same query, by construction. Avoids the composite-config divergence behind
+   * github.com/open-metadata/openmetadata-collate#3851.
+   */
+  public OpenSearchRequestBuilder buildAllAssetsSearchBuilderV2(
+      String query, int from, int size, boolean explain, boolean includeAggregations) {
+    AssetTypeConfiguration compositeConfig = getOrBuildCompositeConfig();
+    os.org.opensearch.client.opensearch._types.query_dsl.Query baseQuery =
+        buildPerTypeUnionQueryV2(query);
+    os.org.opensearch.client.opensearch._types.query_dsl.Query finalQuery =
+        applyFunctionScoringV2(baseQuery, compositeConfig);
+    os.org.opensearch.client.opensearch.core.search.Highlight highlightBuilder =
+        buildHighlightingIfNeededV2(query, compositeConfig);
+
+    OpenSearchRequestBuilder searchRequestBuilder =
+        createSearchSourceBuilderV2(finalQuery, from, size);
+    if (highlightBuilder != null) {
+      searchRequestBuilder.highlighter(highlightBuilder);
+    }
+    if (includeAggregations) {
+      addConfiguredAggregationsV2(searchRequestBuilder, compositeConfig);
+    }
+    searchRequestBuilder.explain(explain);
+    return searchRequestBuilder;
+  }
+
+  private os.org.opensearch.client.opensearch._types.query_dsl.Query buildPerTypeUnionQueryV2(
+      String query) {
+    if (isMatchAllQuery(query)) {
+      return OpenSearchQueryBuilder.boolQuery()
+          .must(OpenSearchQueryBuilder.matchAllQuery())
+          .build();
+    }
+    OpenSearchQueryBuilder.BoolQueryBuilder union = OpenSearchQueryBuilder.boolQuery();
+    Set<String> configuredTypes = new HashSet<>();
+    for (AssetTypeConfiguration typeConfig : searchSettings.getAssetTypeConfigurations()) {
+      String assetType = typeConfig.getAssetType();
+      if (assetType == null || assetType.equals(INDEX_ALL)) {
+        continue;
+      }
+      configuredTypes.add(assetType);
+      union.should(buildAssetTypeClauseV2(query, assetType, typeConfig));
+    }
+    union.should(buildUnconfiguredAssetFallbackV2(query, configuredTypes));
+    union.minimumShouldMatch(1);
+    return union.build();
+  }
+
+  private static boolean isMatchAllQuery(String query) {
+    return query == null || query.trim().isEmpty() || query.trim().equals("*");
+  }
+
+  private os.org.opensearch.client.opensearch._types.query_dsl.Query buildAssetTypeClauseV2(
+      String query, String assetType, AssetTypeConfiguration typeConfig) {
+    os.org.opensearch.client.opensearch._types.query_dsl.Query inner =
+        Entity.TABLE_COLUMN.equals(assetType)
+            ? buildColumnMultiMatchV2(query)
+            : buildBaseQueryV2(query, typeConfig);
+    return OpenSearchQueryBuilder.boolQuery()
+        .filter(OpenSearchQueryBuilder.termQuery(ENTITY_TYPE_FIELD, assetType))
+        .must(inner)
+        .build();
+  }
+
+  /**
+   * Catches asset types that are part of the {@code dataAsset} alias but lack a dedicated entry in
+   * {@code searchSettings.assetTypeConfigurations} (e.g. {@code glossary}, {@code apiCollection}).
+   * Without this, docs of those types would silently disappear from the dataAsset alias after the
+   * per-type-union refactor.
+   */
+  private os.org.opensearch.client.opensearch._types.query_dsl.Query
+      buildUnconfiguredAssetFallbackV2(String query, Set<String> configuredTypes) {
+    OpenSearchQueryBuilder.BoolQueryBuilder fallback =
+        OpenSearchQueryBuilder.boolQuery()
+            .must(buildBaseQueryV2(query, getOrCreateDefaultConfig()));
+    for (String configured : configuredTypes) {
+      fallback.mustNot(OpenSearchQueryBuilder.termQuery(ENTITY_TYPE_FIELD, configured));
+    }
+    return fallback.build();
+  }
+
   private os.org.opensearch.client.opensearch._types.query_dsl.Query buildBaseQueryV2(
       String query, AssetTypeConfiguration assetConfig) {
     if (query == null || query.trim().isEmpty() || query.trim().equals("*")) {

From 6d079ab29e2e9bfb3cf0e4fd93cc2b2cc81ed7f8 Mon Sep 17 00:00:00 2001
From: mohitdeuex <mohit.y@deuexsolutions.com>
Date: Thu, 30 Apr 2026 16:39:00 +0530
Subject: [PATCH 4/9] test(search): add regression tests for
 dataAsset/tableColumn parity
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds integration tests under ColumnSearchIndexIT that pin the behavior of the
fix in PR #27846:

- testDataAssetTableColumnAggregationMatchesTableColumnTotal: dataAsset
  bucket count for tableColumn equals index=tableColumn total for a
  multi-token query against seeded columns.
- testColumnQueryRequiresAllSubtokensToMatch: query "<tag>_first_name" must
  match the seeded "<tag>_first_name" column but NOT "<tag>_first_id" — pins
  the Operator.And fix that closed the om_analyzer sub-token over-match.
- testDataAssetTableBucketMatchesTableIndexTotal: same parity guarantee for
  the "table" entity-type bucket, exercising the per-type-union path for a
  non-column type.
- testPrefixQueryMatchesViaNgramOnBothPaths: short prefix queries (e.g. the
  first few chars of the seeded tag) must match seeded columns via
  name.ngram and stay in parity across both endpoints.
- testUnconfiguredAssetTypeFallbackMatchesViaDataAsset: a Glossary doc (an
  asset type without an explicit searchSettings.assetTypeConfigurations
  entry) must still surface via index=dataAsset, exercising the fallback
  should clause in buildPerTypeUnionQueryV2.

Issue: open-metadata/openmetadata-collate#3851

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../it/tests/ColumnSearchIndexIT.java         | 315 ++++++++++++++++++
 1 file changed, 315 insertions(+)

diff --git a/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java b/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java
index 9ca30e3febdc..25da9ea227ff 100644
--- a/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java
+++ b/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java
@@ -20,8 +20,11 @@
 
 import com.fasterxml.jackson.databind.JsonNode;
 import com.fasterxml.jackson.databind.ObjectMapper;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
 import java.util.concurrent.TimeUnit;
+import org.awaitility.Awaitility;
 import org.junit.jupiter.api.DisplayName;
 import org.junit.jupiter.api.Nested;
 import org.junit.jupiter.api.Test;
@@ -33,15 +36,18 @@
 import org.openmetadata.it.util.TestNamespaceExtension;
 import org.openmetadata.schema.api.data.CreateDatabase;
 import org.openmetadata.schema.api.data.CreateDatabaseSchema;
+import org.openmetadata.schema.api.data.CreateGlossary;
 import org.openmetadata.schema.api.data.CreateTable;
 import org.openmetadata.schema.entity.data.Database;
 import org.openmetadata.schema.entity.data.DatabaseSchema;
+import org.openmetadata.schema.entity.data.Glossary;
 import org.openmetadata.schema.entity.data.Table;
 import org.openmetadata.schema.entity.services.DatabaseService;
 import org.openmetadata.schema.type.Column;
 import org.openmetadata.schema.type.ColumnDataType;
 import org.openmetadata.sdk.client.OpenMetadataClient;
 import org.openmetadata.sdk.fluent.DatabaseServices;
+import org.openmetadata.service.Entity;
 
 /**
  * Integration tests for column search indexing during table reindexing. Verifies:
@@ -353,6 +359,268 @@ void testColumnFilterByDatabase(TestNamespace ns) throws Exception {
     }
   }
 
+  @Nested
+  @DisplayName("DataAsset/TableColumn Aggregation Parity Tests")
+  @Execution(ExecutionMode.CONCURRENT)
+  class DataAssetColumnAggregationTests {
+
+    /**
+     * Regression for github.com/open-metadata/openmetadata-collate/issues/3851. The UI fires two
+     * requests when the user types a multi-word query: {@code index=dataAsset&size=0} for the
+     * entity-type aggregation and {@code index=tableColumn} for the column hits. They route through
+     * different builders in {@code OpenSearchSourceBuilderFactory}: {@code tableColumn} uses the
+     * dedicated lenient column builder while {@code dataAsset} uses the composite asset config with
+     * stricter phrase/{@code 2<70%} matching. Without the fix, the {@code tableColumn} bucket in the
+     * {@code dataAsset} aggregation under-counts the column index hits.
+     */
+    @Test
+    @DisplayName(
+        "Aggregation bucket for tableColumn under dataAsset must match index=tableColumn total")
+    void testDataAssetTableColumnAggregationMatchesTableColumnTotal(TestNamespace ns)
+        throws Exception {
+      OpenMetadataClient client = SdkClients.adminClient();
+
+      String tag = ns.shortPrefix();
+      Table table = createTableWithMultiTokenColumns(ns, "agg_parity_" + tag, tag);
+      assertNotNull(table);
+
+      String multiTokenQuery = tag + "_first " + tag + "_address";
+
+      Awaitility.await()
+          .atMost(90, TimeUnit.SECONDS)
+          .pollInterval(500, TimeUnit.MILLISECONDS)
+          .until(
+              () -> {
+                String r =
+                    client
+                        .search()
+                        .query(multiTokenQuery)
+                        .index("tableColumn")
+                        .size(0)
+                        .deleted(false)
+                        .execute();
+                JsonNode root = OBJECT_MAPPER.readTree(r);
+                long total = root.path("hits").path("total").path("value").asLong(-1);
+                return total >= 3;
+              });
+
+      String columnResponse =
+          client
+              .search()
+              .query(multiTokenQuery)
+              .index("tableColumn")
+              .size(0)
+              .deleted(false)
+              .execute();
+      long columnTotal =
+          OBJECT_MAPPER.readTree(columnResponse).path("hits").path("total").path("value").asLong();
+
+      String aggResponse =
+          client
+              .search()
+              .query(multiTokenQuery)
+              .index("dataAsset")
+              .size(0)
+              .deleted(false)
+              .execute();
+      JsonNode aggBuckets =
+          OBJECT_MAPPER
+              .readTree(aggResponse)
+              .path("aggregations")
+              .path("sterms#entityType")
+              .path("buckets");
+      long aggColumnCount = 0;
+      for (JsonNode bucket : aggBuckets) {
+        if ("tableColumn".equals(bucket.path("key").asText())) {
+          aggColumnCount = bucket.path("doc_count").asLong();
+          break;
+        }
+      }
+
+      assertEquals(
+          columnTotal,
+          aggColumnCount,
+          "dataAsset aggregation tableColumn bucket ("
+              + aggColumnCount
+              + ") must match index=tableColumn total ("
+              + columnTotal
+              + ") for query \""
+              + multiTokenQuery
+              + "\"");
+    }
+
+    /**
+     * Guards against the {@code _} over-match: the {@code om_analyzer} splits {@code first_name}
+     * into {@code [first, name]}; with the old {@code Operator.Or} + {@code min_should_match=0}
+     * column builder, a column called {@code <tag>_first_id} matched a query of {@code
+     * <tag>_first_name} because the single token {@code first} was enough. The fix moves the
+     * column builder to {@code Operator.And}, so every sub-token must match.
+     */
+    @Test
+    @DisplayName("Column query for first_name must not match a column called first_id")
+    void testColumnQueryRequiresAllSubtokensToMatch(TestNamespace ns) throws Exception {
+      OpenMetadataClient client = SdkClients.adminClient();
+      String tag = ns.shortPrefix();
+      Table table = createTableWithMultiTokenColumns(ns, "subtoken_" + tag, tag);
+      assertNotNull(table);
+
+      String firstNameColumn = tag + "_first_name";
+      String firstIdColumn = tag + "_first_id";
+
+      Awaitility.await()
+          .atMost(90, TimeUnit.SECONDS)
+          .pollInterval(500, TimeUnit.MILLISECONDS)
+          .until(() -> hitsForColumnQuery(client, firstNameColumn).contains(firstNameColumn));
+
+      Set<String> hits = hitsForColumnQuery(client, firstNameColumn);
+      assertTrue(
+          hits.contains(firstNameColumn),
+          "Query " + firstNameColumn + " must match the column with the same name; got " + hits);
+      assertFalse(
+          hits.contains(firstIdColumn),
+          "Query "
+              + firstNameColumn
+              + " must not match column "
+              + firstIdColumn
+              + " (only the 'first' sub-token overlaps); got "
+              + hits);
+    }
+
+    /**
+     * Same parity guarantee that {@link #testDataAssetTableColumnAggregationMatchesTableColumnTotal}
+     * pins for {@code tableColumn}, but for the {@code table} bucket. The per-type-union path must
+     * make every entity-type bucket equal to what the dedicated index returns.
+     */
+    @Test
+    @DisplayName("Aggregation bucket for table under dataAsset must match index=table total")
+    void testDataAssetTableBucketMatchesTableIndexTotal(TestNamespace ns) throws Exception {
+      OpenMetadataClient client = SdkClients.adminClient();
+      String tag = ns.shortPrefix();
+      String tableQueryTag = "tblparity" + tag;
+      Table table = createTableWithMultiTokenColumns(ns, tableQueryTag, tag);
+      assertNotNull(table);
+
+      Awaitility.await()
+          .atMost(90, TimeUnit.SECONDS)
+          .pollInterval(500, TimeUnit.MILLISECONDS)
+          .until(() -> totalHitsForIndex(client, table.getName(), "table") >= 1);
+
+      long tableTotal = totalHitsForIndex(client, table.getName(), "table");
+      long aggTableBucket = bucketCountFromDataAsset(client, table.getName(), "table");
+
+      assertEquals(
+          tableTotal,
+          aggTableBucket,
+          "dataAsset aggregation table bucket must equal index=table total for query "
+              + table.getName());
+    }
+
+    /**
+     * Pins prefix-style matching so it stays aligned across the two paths after the per-type-union
+     * refactor. Production behavior for short queries like {@code fir} relies on {@code name.ngram}
+     * and {@code name.compound}, which are now part of {@link
+     * org.openmetadata.service.search.indexes.ColumnSearchIndex#getFields()}; the dataAsset bucket
+     * for {@code tableColumn} must produce the same total as {@code index=tableColumn} for the
+     * same prefix query, and the seeded column must be in both result sets.
+     */
+    @Test
+    @DisplayName("Prefix queries match via ngram and stay in parity across both paths")
+    void testPrefixQueryMatchesViaNgramOnBothPaths(TestNamespace ns) throws Exception {
+      OpenMetadataClient client = SdkClients.adminClient();
+      String tag = ns.shortPrefix();
+      Table table = createTableWithMultiTokenColumns(ns, "ngram_" + tag, tag);
+      assertNotNull(table);
+
+      String prefixQuery = tag.substring(0, Math.min(4, tag.length()));
+
+      Awaitility.await()
+          .atMost(90, TimeUnit.SECONDS)
+          .pollInterval(500, TimeUnit.MILLISECONDS)
+          .until(() -> totalHitsForIndex(client, prefixQuery, "tableColumn") >= 5);
+
+      long columnTotal = totalHitsForIndex(client, prefixQuery, "tableColumn");
+      long aggColumnBucket = bucketCountFromDataAsset(client, prefixQuery, Entity.TABLE_COLUMN);
+
+      assertTrue(
+          columnTotal > 0,
+          "Prefix query " + prefixQuery + " should match seeded columns via name.ngram");
+      assertEquals(
+          columnTotal,
+          aggColumnBucket,
+          "dataAsset tableColumn bucket ("
+              + aggColumnBucket
+              + ") must match index=tableColumn total ("
+              + columnTotal
+              + ") for prefix query "
+              + prefixQuery);
+    }
+
+    /**
+     * Asset types that live in the {@code dataAsset} alias but lack a {@code
+     * searchSettings.assetTypeConfigurations} entry (e.g. {@code glossary}) used to be matched via
+     * the composite-config field merge. The per-type-union path adds a fallback {@code should}
+     * clause for these; this test pins that a glossary doc still appears in {@code
+     * index=dataAsset} hits when its name matches the query.
+     */
+    @Test
+    @DisplayName("Asset types without dedicated config (glossary) still match via dataAsset alias")
+    void testUnconfiguredAssetTypeFallbackMatchesViaDataAsset(TestNamespace ns) throws Exception {
+      OpenMetadataClient client = SdkClients.adminClient();
+      String name = ns.prefix("glossary_unconfigured_fallback");
+      CreateGlossary req = new CreateGlossary().withName(name).withDescription(name);
+      Glossary glossary = client.glossaries().create(req);
+      assertNotNull(glossary);
+
+      Awaitility.await()
+          .atMost(90, TimeUnit.SECONDS)
+          .pollInterval(500, TimeUnit.MILLISECONDS)
+          .until(() -> totalHitsForIndex(client, name, "dataAsset") >= 1);
+
+      long total = totalHitsForIndex(client, name, "dataAsset");
+      assertTrue(
+          total >= 1,
+          "Glossary doc with name "
+              + name
+              + " must be reachable via index=dataAsset (fallback clause for unconfigured types)");
+    }
+
+    private Set<String> hitsForColumnQuery(OpenMetadataClient client, String query)
+        throws Exception {
+      String response =
+          client.search().query(query).index("tableColumn").size(50).deleted(false).execute();
+      JsonNode hits = OBJECT_MAPPER.readTree(response).path("hits").path("hits");
+      Set<String> names = new HashSet<>();
+      for (JsonNode hit : hits) {
+        names.add(hit.path("_source").path("name").asText());
+      }
+      return names;
+    }
+
+    private long totalHitsForIndex(OpenMetadataClient client, String query, String index)
+        throws Exception {
+      String response = client.search().query(query).index(index).size(0).deleted(false).execute();
+      return OBJECT_MAPPER.readTree(response).path("hits").path("total").path("value").asLong();
+    }
+
+    private long bucketCountFromDataAsset(
+        OpenMetadataClient client, String query, String entityType) throws Exception {
+      String response =
+          client.search().query(query).index("dataAsset").size(0).deleted(false).execute();
+      JsonNode buckets =
+          OBJECT_MAPPER
+              .readTree(response)
+              .path("aggregations")
+              .path("sterms#entityType")
+              .path("buckets");
+      for (JsonNode bucket : buckets) {
+        if (entityType.equals(bucket.path("key").asText())) {
+          return bucket.path("doc_count").asLong();
+        }
+      }
+      return 0L;
+    }
+  }
+
   @Nested
   @DisplayName("Nested Column Tests")
   @Execution(ExecutionMode.CONCURRENT)
@@ -493,6 +761,53 @@ private Table createTableWithColumns(TestNamespace ns, String baseName) {
     return SdkClients.adminClient().tables().create(tableRequest);
   }
 
+  private Table createTableWithMultiTokenColumns(TestNamespace ns, String baseName, String tag) {
+    String shortId = ns.shortPrefix();
+
+    org.openmetadata.schema.services.connections.database.PostgresConnection conn =
+        DatabaseServices.postgresConnection().hostPort("localhost:5432").username("test").build();
+
+    DatabaseService dbService =
+        DatabaseServices.builder()
+            .name("agg_svc_" + shortId + "_" + baseName)
+            .connection(conn)
+            .description("Test service for dataAsset/tableColumn aggregation parity")
+            .create();
+
+    CreateDatabase dbReq = new CreateDatabase();
+    dbReq.setName("agg_db_" + shortId + "_" + baseName);
+    dbReq.setService(dbService.getFullyQualifiedName());
+    Database database = SdkClients.adminClient().databases().create(dbReq);
+
+    CreateDatabaseSchema schemaReq = new CreateDatabaseSchema();
+    schemaReq.setName("agg_schema_" + shortId + "_" + baseName);
+    schemaReq.setDatabase(database.getFullyQualifiedName());
+    DatabaseSchema schema = SdkClients.adminClient().databaseSchemas().create(schemaReq);
+
+    CreateTable tableRequest = new CreateTable();
+    tableRequest.setName(ns.prefix(baseName));
+    tableRequest.setDatabaseSchema(schema.getFullyQualifiedName());
+    tableRequest.setColumns(
+        List.of(
+            new Column()
+                .withName(tag + "_first_name")
+                .withDataType(ColumnDataType.VARCHAR)
+                .withDataLength(255)
+                .withDescription("First name"),
+            new Column()
+                .withName(tag + "_last_name")
+                .withDataType(ColumnDataType.VARCHAR)
+                .withDataLength(255)
+                .withDescription("Last name"),
+            new Column()
+                .withName(tag + "_address")
+                .withDataType(ColumnDataType.VARCHAR)
+                .withDataLength(512)
+                .withDescription("Postal address")));
+
+    return SdkClients.adminClient().tables().create(tableRequest);
+  }
+
   private Table createTableWithNestedColumns(TestNamespace ns, String baseName) {
     String shortId = ns.shortPrefix();
 

From caf858063267d74ef9729a70026d24462764393d Mon Sep 17 00:00:00 2001
From: mohitdeuex <mohit.y@deuexsolutions.com>
Date: Thu, 30 Apr 2026 16:43:21 +0530
Subject: [PATCH 5/9] remove local script

---
 scripts/reproduce_column_agg_mismatch.sh | 99 ------------------------
 1 file changed, 99 deletions(-)
 delete mode 100755 scripts/reproduce_column_agg_mismatch.sh

diff --git a/scripts/reproduce_column_agg_mismatch.sh b/scripts/reproduce_column_agg_mismatch.sh
deleted file mode 100755
index 5601b13c6f1f..000000000000
--- a/scripts/reproduce_column_agg_mismatch.sh
+++ /dev/null
@@ -1,99 +0,0 @@
-#!/usr/bin/env bash
-# Reproduces GitHub issue open-metadata/openmetadata-collate#3851:
-#
-#   When the explore search bar fires its two requests for the same query —
-#     a) GET /api/v1/search/query?q=...&index=dataAsset&size=0     (entity-type aggregation)
-#     b) GET /api/v1/search/query?q=...&index=tableColumn&size=N   (column hits)
-#   — the count for `tableColumn` reported by (a) does not match the total reported by (b).
-#
-# Root cause:
-#   `index=tableColumn` routes through `buildColumnSearchBuilderV2` (a lenient multi_match).
-#   `index=dataAsset`  routes through `buildDataAssetSearchBuilderV2` with the composite asset
-#   config, which applies stricter phrase / 2<70% matching to column docs. The two query shapes
-#   produce different result sets even though both indexes contain the same column documents.
-#
-# Related secondary bug (the "_ issue"):
-#   The `om_analyzer` splits identifiers like `first_name` on letter/digit/underscore boundaries
-#   into ["first","name"]. With `Operator.Or` + `min_should_match=0` (the lenient column builder),
-#   any column whose name contains ONLY "first" or ONLY "name" is also returned, drowning out the
-#   actual relevant matches.
-#
-# This script does NOT create test assets — it probes the running catalog with a fixed list of
-# queries that are known to expose the divergence on the bundled sample data, prints the counts
-# side-by-side, and exits non-zero if any query shows a mismatch. Asset-based isolation lives
-# in the regression test (ColumnSearchIndexIT).
-#
-# Usage:
-#   OM_HOST=http://localhost:8585 OM_TOKEN=<jwt> ./reproduce_column_agg_mismatch.sh [extra-query]
-
-set -euo pipefail
-
-HOST="${OM_HOST:-http://localhost:8585}"
-TOKEN="${OM_TOKEN:-eyJraWQiOiJHYjM4OWEtOWY3Ni1nZGpzLWE5MmotMDI0MmJrOTQzNTYiLCJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJzdWIiOiJhZG1pbiIsImlzQm90IjpmYWxzZSwiaXNzIjoib3Blbi1tZXRhZGF0YS5vcmciLCJpYXQiOjE2NjM5Mzg0NjIsImVtYWlsIjoiYWRtaW5Ab3Blbm1ldGFkYXRhLm9yZyJ9.tS8um_5DKu7HgzGBzS1VTA5uUjKWOCU0B_j08WXBiEC0mr0zNREkqVfwFDD-d24HlNEbrqioLsBuFRiwIWKc1m_ZlVQbG7P36RUxhuv2vbSp80FKyNM-Tj93FDzq91jsyNmsQhyNv_fNr3TXfzzSPjHt8Go0FMMP66weoKMgW2PbXlhVKwEuXUHyakLLzewm9UMeQaEiRzhiTMU3UkLXcKbYEJJvfNFcLwSl9W8JCO_l0Yj3ud-qt_nQYEZwqW6u5nfdQllN133iikV4fM5QZsMCnm8Rq1mvLR0y9bmJiD7fwM1tmJ791TUWqmKaTnP49U493VanKpUAfzIiOiIbhg}"
-
-QUERIES=(
-  "first_name"
-  "last_name"
-  "first name"
-  "shipping address"
-  "first name address"
-)
-[ "${1:-}" ] && QUERIES+=("$1")
-
-AUTH=(-H "Authorization: Bearer $TOKEN")
-
-probe() {
-  local q="$1"
-  local enc agg_file col_file
-  enc=$(python3 -c "import urllib.parse,sys;print(urllib.parse.quote(sys.argv[1]))" "$q")
-  agg_file=$(mktemp)
-  col_file=$(mktemp)
-  if ! curl -sf "${AUTH[@]}" \
-    "$HOST/api/v1/search/query?q=$enc&index=dataAsset&from=0&size=0&deleted=false&track_total_hits=true&fetch_source=false" \
-    -o "$agg_file" \
-    || ! curl -sf "${AUTH[@]}" \
-    "$HOST/api/v1/search/query?q=$enc&index=tableColumn&from=0&size=0&deleted=false&track_total_hits=true&fetch_source=false" \
-    -o "$col_file"; then
-    echo "  [ERR ] q=\"$q\" — search API call failed; is OM running at $HOST?"
-    rm -f "$agg_file" "$col_file"
-    return 2
-  fi
-  python3 - "$q" "$agg_file" "$col_file" <<'PY'
-import json, sys
-q, agg_path, col_path = sys.argv[1], sys.argv[2], sys.argv[3]
-with open(agg_path) as f: agg = json.load(f)
-with open(col_path) as f: col = json.load(f)
-agg_total = agg.get('hits', {}).get('total', {}).get('value', '?')
-col_total = col.get('hits', {}).get('total', {}).get('value', '?')
-buckets = agg.get('aggregations', {}).get('sterms#entityType', {}).get('buckets', [])
-agg_tc = next((b.get('doc_count', 0) for b in buckets if b.get('key') == 'tableColumn'), 0)
-status = "OK  " if agg_tc == col_total else "DIFF"
-print(f"  [{status}] q={q!r:<26} dataAsset.total={agg_total:<6} agg.tableColumnBucket={agg_tc:<6} tableColumn.total={col_total}")
-sys.exit(0 if agg_tc == col_total else 2)
-PY
-  local rc=$?
-  rm -f "$agg_file" "$col_file"
-  return $rc
-}
-
-echo "Host : $HOST"
-if ! curl -sf -o /dev/null "${AUTH[@]}" "$HOST/api/v1/system/version"; then
-  echo "ERROR: $HOST is not reachable (or auth is wrong). Start OM and retry."
-  exit 4
-fi
-echo "Probing ${#QUERIES[@]} queries; OK = bucket count matches index=tableColumn total."
-echo "------------------------------------------------------------------"
-fail=0
-for q in "${QUERIES[@]}"; do
-  probe "$q" || fail=1
-done
-echo "------------------------------------------------------------------"
-
-ENC0=$(python3 -c "import urllib.parse,sys;print(urllib.parse.quote(sys.argv[1]))" "${QUERIES[0]}")
-echo
-echo "UI verification:"
-echo "  1. Open $HOST/explore/tableColumn?search=$ENC0"
-echo "  2. Note the count badge on the tableColumn tab vs the entity-type aggregation panel."
-echo "     Before the fix they diverge; after the fix they should match."
-
-exit "$fail"

From e8826079f7443d47baa97e288929ceca284b441d Mon Sep 17 00:00:00 2001
From: mohitdeuex <mohit.y@deuexsolutions.com>
Date: Thu, 30 Apr 2026 16:47:32 +0530
Subject: [PATCH 6/9] test(search): add complex-syntax and topic-parity
 coverage

- testTopicBucketAndResultSetMatchIndexTopic: pins parity for a non-column
  entity type at both the count and the FQN-set level, exercising the
  per-type-union path for `topic`. The dataAsset alias must return the same
  set of topic FQNs that index=topic returns for the same query.
- testComplexSyntaxQueriesKeepParity: runs four representative complex-
  syntax shapes (quoted phrase, AND, OR, mixed parenthesised) through both
  endpoints and asserts the tableColumn bucket count equals
  index=tableColumn total.

Issue: open-metadata/openmetadata-collate#3851

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../it/tests/ColumnSearchIndexIT.java         | 126 ++++++++++++++++++
 1 file changed, 126 insertions(+)

diff --git a/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java b/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java
index 25da9ea227ff..20c3d8110422 100644
--- a/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java
+++ b/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java
@@ -42,6 +42,7 @@
 import org.openmetadata.schema.entity.data.DatabaseSchema;
 import org.openmetadata.schema.entity.data.Glossary;
 import org.openmetadata.schema.entity.data.Table;
+import org.openmetadata.schema.entity.data.Topic;
 import org.openmetadata.schema.entity.services.DatabaseService;
 import org.openmetadata.schema.type.Column;
 import org.openmetadata.schema.type.ColumnDataType;
@@ -584,6 +585,81 @@ void testUnconfiguredAssetTypeFallbackMatchesViaDataAsset(TestNamespace ns) thro
               + " must be reachable via index=dataAsset (fallback clause for unconfigured types)");
     }
 
+    /**
+     * Pins parity for a non-column entity type at both the count and the FQN-set level. Topic is
+     * chosen to exercise a different per-type clause than {@code table}, which already has its own
+     * test. The {@code dataAsset} alias must return the same set of topic FQNs that {@code
+     * index=topic} returns for the same query, with matching counts — otherwise the explore tab's
+     * topic count and the topic-tab results would disagree.
+     */
+    @Test
+    @DisplayName("Topic bucket and topic FQN set must match index=topic for same query")
+    void testTopicBucketAndResultSetMatchIndexTopic(TestNamespace ns) throws Exception {
+      OpenMetadataClient client = SdkClients.adminClient();
+      String tag = ns.shortPrefix();
+      Topic topic = createUniqueTopic(ns, "topicparity_" + tag, tag);
+      assertNotNull(topic);
+
+      Awaitility.await()
+          .atMost(90, TimeUnit.SECONDS)
+          .pollInterval(500, TimeUnit.MILLISECONDS)
+          .until(() -> totalHitsForIndex(client, tag, "topic") >= 1);
+
+      long topicTotal = totalHitsForIndex(client, tag, "topic");
+      long aggTopicBucket = bucketCountFromDataAsset(client, tag, Entity.TOPIC);
+      assertEquals(
+          topicTotal,
+          aggTopicBucket,
+          "dataAsset topic bucket must equal index=topic total for query " + tag);
+
+      Set<String> topicFqns = fqnsForIndex(client, tag, "topic");
+      Set<String> dataAssetTopicFqns = fqnsFromDataAssetForType(client, tag, Entity.TOPIC);
+      assertEquals(
+          topicFqns,
+          dataAssetTopicFqns,
+          "Topic FQNs returned by index=topic must equal topic-typed FQNs returned by "
+              + "index=dataAsset for the same query");
+    }
+
+    /**
+     * Complex syntax queries (quoted phrases, AND/OR operators) flow through {@code
+     * buildBaseQueryV2 -> buildComplexSyntaxQueryV2} for non-column types and through the column
+     * builder for {@code tableColumn}. Both paths run against both endpoints — the parity
+     * guarantee from the per-type-union refactor must hold here too. This test exercises three
+     * representative shapes and asserts count parity for the {@code tableColumn} bucket.
+     */
+    @Test
+    @DisplayName("Complex-syntax queries keep dataAsset/tableColumn parity")
+    void testComplexSyntaxQueriesKeepParity(TestNamespace ns) throws Exception {
+      OpenMetadataClient client = SdkClients.adminClient();
+      String tag = ns.shortPrefix();
+      Table table = createTableWithMultiTokenColumns(ns, "complex_" + tag, tag);
+      assertNotNull(table);
+
+      Awaitility.await()
+          .atMost(90, TimeUnit.SECONDS)
+          .pollInterval(500, TimeUnit.MILLISECONDS)
+          .until(() -> totalHitsForIndex(client, tag, "tableColumn") >= 5);
+
+      List<String> queries =
+          List.of(
+              "\"" + tag + " alpha\"",
+              tag + " AND alpha",
+              tag + " OR alpha",
+              "(" + tag + " AND alpha) OR (" + tag + " AND bravo)");
+
+      for (String complexQuery : queries) {
+        long columnTotal = totalHitsForIndex(client, complexQuery, "tableColumn");
+        long aggBucket = bucketCountFromDataAsset(client, complexQuery, Entity.TABLE_COLUMN);
+        assertEquals(
+            columnTotal,
+            aggBucket,
+            "dataAsset tableColumn bucket must equal index=tableColumn total for complex "
+                + "query "
+                + complexQuery);
+      }
+    }
+
     private Set<String> hitsForColumnQuery(OpenMetadataClient client, String query)
         throws Exception {
       String response =
@@ -596,6 +672,56 @@ private Set<String> hitsForColumnQuery(OpenMetadataClient client, String query)
       return names;
     }
 
+    private Set<String> fqnsForIndex(OpenMetadataClient client, String query, String index)
+        throws Exception {
+      String response =
+          client.search().query(query).index(index).size(200).deleted(false).execute();
+      JsonNode hits = OBJECT_MAPPER.readTree(response).path("hits").path("hits");
+      Set<String> fqns = new HashSet<>();
+      for (JsonNode hit : hits) {
+        fqns.add(hit.path("_source").path("fullyQualifiedName").asText());
+      }
+      return fqns;
+    }
+
+    private Set<String> fqnsFromDataAssetForType(
+        OpenMetadataClient client, String query, String entityType) throws Exception {
+      String response =
+          client.search().query(query).index("dataAsset").size(200).deleted(false).execute();
+      JsonNode hits = OBJECT_MAPPER.readTree(response).path("hits").path("hits");
+      Set<String> fqns = new HashSet<>();
+      for (JsonNode hit : hits) {
+        if (entityType.equals(hit.path("_source").path("entityType").asText())) {
+          fqns.add(hit.path("_source").path("fullyQualifiedName").asText());
+        }
+      }
+      return fqns;
+    }
+
+    private Topic createUniqueTopic(TestNamespace ns, String baseName, String tag) {
+      String shortId = ns.shortPrefix();
+      org.openmetadata.schema.services.connections.messaging.KafkaConnection kafkaConn =
+          new org.openmetadata.schema.services.connections.messaging.KafkaConnection()
+              .withBootstrapServers("localhost:9092");
+      org.openmetadata.schema.api.services.CreateMessagingService msgSvcReq =
+          new org.openmetadata.schema.api.services.CreateMessagingService()
+              .withName("topic_parity_svc_" + shortId + "_" + baseName)
+              .withServiceType(
+                  org.openmetadata.schema.api.services.CreateMessagingService.MessagingServiceType
+                      .Kafka)
+              .withConnection(
+                  new org.openmetadata.schema.type.MessagingConnection().withConfig(kafkaConn));
+      org.openmetadata.schema.entity.services.MessagingService msgService =
+          SdkClients.adminClient().messagingServices().create(msgSvcReq);
+
+      org.openmetadata.schema.api.data.CreateTopic topicRequest =
+          new org.openmetadata.schema.api.data.CreateTopic()
+              .withName(ns.prefix(tag + "_topic_" + baseName))
+              .withService(msgService.getFullyQualifiedName())
+              .withPartitions(1);
+      return SdkClients.adminClient().topics().create(topicRequest);
+    }
+
     private long totalHitsForIndex(OpenMetadataClient client, String query, String index)
         throws Exception {
       String response = client.search().query(query).index(index).size(0).deleted(false).execute();

From 4a8011b5ca1af134668004928fc2c248452bc48b Mon Sep 17 00:00:00 2001
From: mohitdeuex <mohit.y@deuexsolutions.com>
Date: Thu, 30 Apr 2026 16:52:49 +0530
Subject: [PATCH 7/9] fix(search): align dataAsset aggregation counts with
 index=tableColumn totals
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Refactor the `dataAsset`/`all` alias query path so each entity-type bucket in
the aggregation matches what its dedicated index returns for the same query.
The composite asset config used to merge fields from every type, then apply
phrase/ngram-fuzzy semantics to all docs; column docs got semantics different
from `buildColumnSearchBuilderV2`, which is why the explore search bar's two
calls disagreed on the tableColumn count.

The new `buildAllAssetsSearchBuilderV2` builds a per-entity-type bool union:
each clause is `filter(entityType=<type>) must(<type's own query>)`. The
column branch reuses `buildColumnMultiMatchV2`; every other type goes through
`buildBaseQueryV2` with its dedicated config; an extra `should` covers asset
types in the `dataAsset` alias that lack a config (e.g. `glossary`,
`apiCollection`) using the default config.

Also tightens the column builder to `Operator.And` so multi-token queries
like `first_name` require every sub-token to match somewhere — fixes the
`om_analyzer`-driven over-match where the lenient `Or` + `min_should_match=0`
variant matched any column whose name contained just `first` or just `name`.

`ColumnSearchIndex.getFields()` gains `name.ngram`, `name.compound`,
`displayName.ngram`, `displayName.compound` so prefix queries like `fir`
still match column docs from both `index=tableColumn` and the dataAsset
bucket.

Adds integration tests in ColumnSearchIndexIT covering:
- multi-token query parity for the tableColumn bucket vs index=tableColumn
- sub-token over-match guard (query `<tag>_first_name` must not match a
  seeded `<tag>_first_id` column)
- table bucket parity (count) and topic parity (count + FQN-set match)
- prefix queries via `name.ngram` keeping parity across both endpoints
- complex-syntax queries (quoted phrase, AND/OR, mixed parenthesised)
  keeping the tableColumn bucket parity
- unconfigured asset type fallback (Glossary docs reachable via dataAsset)

Issue: open-metadata/openmetadata-collate#3851

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../it/tests/ColumnSearchIndexIT.java         | 441 ++++++++++++++++++
 .../ElasticSearchSourceBuilderFactory.java    | 120 ++++-
 .../search/indexes/ColumnSearchIndex.java     |   4 +
 .../OpenSearchSourceBuilderFactory.java       | 125 ++++-
 4 files changed, 666 insertions(+), 24 deletions(-)

diff --git a/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java b/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java
index 9ca30e3febdc..20c3d8110422 100644
--- a/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java
+++ b/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java
@@ -20,8 +20,11 @@
 
 import com.fasterxml.jackson.databind.JsonNode;
 import com.fasterxml.jackson.databind.ObjectMapper;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
 import java.util.concurrent.TimeUnit;
+import org.awaitility.Awaitility;
 import org.junit.jupiter.api.DisplayName;
 import org.junit.jupiter.api.Nested;
 import org.junit.jupiter.api.Test;
@@ -33,15 +36,19 @@
 import org.openmetadata.it.util.TestNamespaceExtension;
 import org.openmetadata.schema.api.data.CreateDatabase;
 import org.openmetadata.schema.api.data.CreateDatabaseSchema;
+import org.openmetadata.schema.api.data.CreateGlossary;
 import org.openmetadata.schema.api.data.CreateTable;
 import org.openmetadata.schema.entity.data.Database;
 import org.openmetadata.schema.entity.data.DatabaseSchema;
+import org.openmetadata.schema.entity.data.Glossary;
 import org.openmetadata.schema.entity.data.Table;
+import org.openmetadata.schema.entity.data.Topic;
 import org.openmetadata.schema.entity.services.DatabaseService;
 import org.openmetadata.schema.type.Column;
 import org.openmetadata.schema.type.ColumnDataType;
 import org.openmetadata.sdk.client.OpenMetadataClient;
 import org.openmetadata.sdk.fluent.DatabaseServices;
+import org.openmetadata.service.Entity;
 
 /**
  * Integration tests for column search indexing during table reindexing. Verifies:
@@ -353,6 +360,393 @@ void testColumnFilterByDatabase(TestNamespace ns) throws Exception {
     }
   }
 
+  @Nested
+  @DisplayName("DataAsset/TableColumn Aggregation Parity Tests")
+  @Execution(ExecutionMode.CONCURRENT)
+  class DataAssetColumnAggregationTests {
+
+    /**
+     * Regression for github.com/open-metadata/openmetadata-collate/issues/3851. The UI fires two
+     * requests when the user types a multi-word query: {@code index=dataAsset&size=0} for the
+     * entity-type aggregation and {@code index=tableColumn} for the column hits. They route through
+     * different builders in {@code OpenSearchSourceBuilderFactory}: {@code tableColumn} uses the
+     * dedicated lenient column builder while {@code dataAsset} uses the composite asset config with
+     * stricter phrase/{@code 2<70%} matching. Without the fix, the {@code tableColumn} bucket in the
+     * {@code dataAsset} aggregation under-counts the column index hits.
+     */
+    @Test
+    @DisplayName(
+        "Aggregation bucket for tableColumn under dataAsset must match index=tableColumn total")
+    void testDataAssetTableColumnAggregationMatchesTableColumnTotal(TestNamespace ns)
+        throws Exception {
+      OpenMetadataClient client = SdkClients.adminClient();
+
+      String tag = ns.shortPrefix();
+      Table table = createTableWithMultiTokenColumns(ns, "agg_parity_" + tag, tag);
+      assertNotNull(table);
+
+      String multiTokenQuery = tag + "_first " + tag + "_address";
+
+      Awaitility.await()
+          .atMost(90, TimeUnit.SECONDS)
+          .pollInterval(500, TimeUnit.MILLISECONDS)
+          .until(
+              () -> {
+                String r =
+                    client
+                        .search()
+                        .query(multiTokenQuery)
+                        .index("tableColumn")
+                        .size(0)
+                        .deleted(false)
+                        .execute();
+                JsonNode root = OBJECT_MAPPER.readTree(r);
+                long total = root.path("hits").path("total").path("value").asLong(-1);
+                return total >= 3;
+              });
+
+      String columnResponse =
+          client
+              .search()
+              .query(multiTokenQuery)
+              .index("tableColumn")
+              .size(0)
+              .deleted(false)
+              .execute();
+      long columnTotal =
+          OBJECT_MAPPER.readTree(columnResponse).path("hits").path("total").path("value").asLong();
+
+      String aggResponse =
+          client
+              .search()
+              .query(multiTokenQuery)
+              .index("dataAsset")
+              .size(0)
+              .deleted(false)
+              .execute();
+      JsonNode aggBuckets =
+          OBJECT_MAPPER
+              .readTree(aggResponse)
+              .path("aggregations")
+              .path("sterms#entityType")
+              .path("buckets");
+      long aggColumnCount = 0;
+      for (JsonNode bucket : aggBuckets) {
+        if ("tableColumn".equals(bucket.path("key").asText())) {
+          aggColumnCount = bucket.path("doc_count").asLong();
+          break;
+        }
+      }
+
+      assertEquals(
+          columnTotal,
+          aggColumnCount,
+          "dataAsset aggregation tableColumn bucket ("
+              + aggColumnCount
+              + ") must match index=tableColumn total ("
+              + columnTotal
+              + ") for query \""
+              + multiTokenQuery
+              + "\"");
+    }
+
+    /**
+     * Guards against the {@code _} over-match: the {@code om_analyzer} splits {@code first_name}
+     * into {@code [first, name]}; with the old {@code Operator.Or} + {@code min_should_match=0}
+     * column builder, a column called {@code <tag>_first_id} matched a query of {@code
+     * <tag>_first_name} because the single token {@code first} was enough. The fix moves the
+     * column builder to {@code Operator.And}, so every sub-token must match.
+     */
+    @Test
+    @DisplayName("Column query for first_name must not match a column called first_id")
+    void testColumnQueryRequiresAllSubtokensToMatch(TestNamespace ns) throws Exception {
+      OpenMetadataClient client = SdkClients.adminClient();
+      String tag = ns.shortPrefix();
+      Table table = createTableWithMultiTokenColumns(ns, "subtoken_" + tag, tag);
+      assertNotNull(table);
+
+      String firstNameColumn = tag + "_first_name";
+      String firstIdColumn = tag + "_first_id";
+
+      Awaitility.await()
+          .atMost(90, TimeUnit.SECONDS)
+          .pollInterval(500, TimeUnit.MILLISECONDS)
+          .until(() -> hitsForColumnQuery(client, firstNameColumn).contains(firstNameColumn));
+
+      Set<String> hits = hitsForColumnQuery(client, firstNameColumn);
+      assertTrue(
+          hits.contains(firstNameColumn),
+          "Query " + firstNameColumn + " must match the column with the same name; got " + hits);
+      assertFalse(
+          hits.contains(firstIdColumn),
+          "Query "
+              + firstNameColumn
+              + " must not match column "
+              + firstIdColumn
+              + " (only the 'first' sub-token overlaps); got "
+              + hits);
+    }
+
+    /**
+     * Same parity guarantee that {@link #testDataAssetTableColumnAggregationMatchesTableColumnTotal}
+     * pins for {@code tableColumn}, but for the {@code table} bucket. The per-type-union path must
+     * make every entity-type bucket equal to what the dedicated index returns.
+     */
+    @Test
+    @DisplayName("Aggregation bucket for table under dataAsset must match index=table total")
+    void testDataAssetTableBucketMatchesTableIndexTotal(TestNamespace ns) throws Exception {
+      OpenMetadataClient client = SdkClients.adminClient();
+      String tag = ns.shortPrefix();
+      String tableQueryTag = "tblparity" + tag;
+      Table table = createTableWithMultiTokenColumns(ns, tableQueryTag, tag);
+      assertNotNull(table);
+
+      Awaitility.await()
+          .atMost(90, TimeUnit.SECONDS)
+          .pollInterval(500, TimeUnit.MILLISECONDS)
+          .until(() -> totalHitsForIndex(client, table.getName(), "table") >= 1);
+
+      long tableTotal = totalHitsForIndex(client, table.getName(), "table");
+      long aggTableBucket = bucketCountFromDataAsset(client, table.getName(), "table");
+
+      assertEquals(
+          tableTotal,
+          aggTableBucket,
+          "dataAsset aggregation table bucket must equal index=table total for query "
+              + table.getName());
+    }
+
+    /**
+     * Pins prefix-style matching so it stays aligned across the two paths after the per-type-union
+     * refactor. Production behavior for short queries like {@code fir} relies on {@code name.ngram}
+     * and {@code name.compound}, which are now part of {@link
+     * org.openmetadata.service.search.indexes.ColumnSearchIndex#getFields()}; the dataAsset bucket
+     * for {@code tableColumn} must produce the same total as {@code index=tableColumn} for the
+     * same prefix query, and the seeded column must be in both result sets.
+     */
+    @Test
+    @DisplayName("Prefix queries match via ngram and stay in parity across both paths")
+    void testPrefixQueryMatchesViaNgramOnBothPaths(TestNamespace ns) throws Exception {
+      OpenMetadataClient client = SdkClients.adminClient();
+      String tag = ns.shortPrefix();
+      Table table = createTableWithMultiTokenColumns(ns, "ngram_" + tag, tag);
+      assertNotNull(table);
+
+      String prefixQuery = tag.substring(0, Math.min(4, tag.length()));
+
+      Awaitility.await()
+          .atMost(90, TimeUnit.SECONDS)
+          .pollInterval(500, TimeUnit.MILLISECONDS)
+          .until(() -> totalHitsForIndex(client, prefixQuery, "tableColumn") >= 5);
+
+      long columnTotal = totalHitsForIndex(client, prefixQuery, "tableColumn");
+      long aggColumnBucket = bucketCountFromDataAsset(client, prefixQuery, Entity.TABLE_COLUMN);
+
+      assertTrue(
+          columnTotal > 0,
+          "Prefix query " + prefixQuery + " should match seeded columns via name.ngram");
+      assertEquals(
+          columnTotal,
+          aggColumnBucket,
+          "dataAsset tableColumn bucket ("
+              + aggColumnBucket
+              + ") must match index=tableColumn total ("
+              + columnTotal
+              + ") for prefix query "
+              + prefixQuery);
+    }
+
+    /**
+     * Asset types that live in the {@code dataAsset} alias but lack a {@code
+     * searchSettings.assetTypeConfigurations} entry (e.g. {@code glossary}) used to be matched via
+     * the composite-config field merge. The per-type-union path adds a fallback {@code should}
+     * clause for these; this test pins that a glossary doc still appears in {@code
+     * index=dataAsset} hits when its name matches the query.
+     */
+    @Test
+    @DisplayName("Asset types without dedicated config (glossary) still match via dataAsset alias")
+    void testUnconfiguredAssetTypeFallbackMatchesViaDataAsset(TestNamespace ns) throws Exception {
+      OpenMetadataClient client = SdkClients.adminClient();
+      String name = ns.prefix("glossary_unconfigured_fallback");
+      CreateGlossary req = new CreateGlossary().withName(name).withDescription(name);
+      Glossary glossary = client.glossaries().create(req);
+      assertNotNull(glossary);
+
+      Awaitility.await()
+          .atMost(90, TimeUnit.SECONDS)
+          .pollInterval(500, TimeUnit.MILLISECONDS)
+          .until(() -> totalHitsForIndex(client, name, "dataAsset") >= 1);
+
+      long total = totalHitsForIndex(client, name, "dataAsset");
+      assertTrue(
+          total >= 1,
+          "Glossary doc with name "
+              + name
+              + " must be reachable via index=dataAsset (fallback clause for unconfigured types)");
+    }
+
+    /**
+     * Pins parity for a non-column entity type at both the count and the FQN-set level. Topic is
+     * chosen to exercise a different per-type clause than {@code table}, which already has its own
+     * test. The {@code dataAsset} alias must return the same set of topic FQNs that {@code
+     * index=topic} returns for the same query, with matching counts — otherwise the explore tab's
+     * topic count and the topic-tab results would disagree.
+     */
+    @Test
+    @DisplayName("Topic bucket and topic FQN set must match index=topic for same query")
+    void testTopicBucketAndResultSetMatchIndexTopic(TestNamespace ns) throws Exception {
+      OpenMetadataClient client = SdkClients.adminClient();
+      String tag = ns.shortPrefix();
+      Topic topic = createUniqueTopic(ns, "topicparity_" + tag, tag);
+      assertNotNull(topic);
+
+      Awaitility.await()
+          .atMost(90, TimeUnit.SECONDS)
+          .pollInterval(500, TimeUnit.MILLISECONDS)
+          .until(() -> totalHitsForIndex(client, tag, "topic") >= 1);
+
+      long topicTotal = totalHitsForIndex(client, tag, "topic");
+      long aggTopicBucket = bucketCountFromDataAsset(client, tag, Entity.TOPIC);
+      assertEquals(
+          topicTotal,
+          aggTopicBucket,
+          "dataAsset topic bucket must equal index=topic total for query " + tag);
+
+      Set<String> topicFqns = fqnsForIndex(client, tag, "topic");
+      Set<String> dataAssetTopicFqns = fqnsFromDataAssetForType(client, tag, Entity.TOPIC);
+      assertEquals(
+          topicFqns,
+          dataAssetTopicFqns,
+          "Topic FQNs returned by index=topic must equal topic-typed FQNs returned by "
+              + "index=dataAsset for the same query");
+    }
+
+    /**
+     * Complex syntax queries (quoted phrases, AND/OR operators) flow through {@code
+     * buildBaseQueryV2 -> buildComplexSyntaxQueryV2} for non-column types and through the column
+     * builder for {@code tableColumn}. Both paths run against both endpoints — the parity
+     * guarantee from the per-type-union refactor must hold here too. This test exercises three
+     * representative shapes and asserts count parity for the {@code tableColumn} bucket.
+     */
+    @Test
+    @DisplayName("Complex-syntax queries keep dataAsset/tableColumn parity")
+    void testComplexSyntaxQueriesKeepParity(TestNamespace ns) throws Exception {
+      OpenMetadataClient client = SdkClients.adminClient();
+      String tag = ns.shortPrefix();
+      Table table = createTableWithMultiTokenColumns(ns, "complex_" + tag, tag);
+      assertNotNull(table);
+
+      Awaitility.await()
+          .atMost(90, TimeUnit.SECONDS)
+          .pollInterval(500, TimeUnit.MILLISECONDS)
+          .until(() -> totalHitsForIndex(client, tag, "tableColumn") >= 5);
+
+      List<String> queries =
+          List.of(
+              "\"" + tag + " alpha\"",
+              tag + " AND alpha",
+              tag + " OR alpha",
+              "(" + tag + " AND alpha) OR (" + tag + " AND bravo)");
+
+      for (String complexQuery : queries) {
+        long columnTotal = totalHitsForIndex(client, complexQuery, "tableColumn");
+        long aggBucket = bucketCountFromDataAsset(client, complexQuery, Entity.TABLE_COLUMN);
+        assertEquals(
+            columnTotal,
+            aggBucket,
+            "dataAsset tableColumn bucket must equal index=tableColumn total for complex "
+                + "query "
+                + complexQuery);
+      }
+    }
+
+    private Set<String> hitsForColumnQuery(OpenMetadataClient client, String query)
+        throws Exception {
+      String response =
+          client.search().query(query).index("tableColumn").size(50).deleted(false).execute();
+      JsonNode hits = OBJECT_MAPPER.readTree(response).path("hits").path("hits");
+      Set<String> names = new HashSet<>();
+      for (JsonNode hit : hits) {
+        names.add(hit.path("_source").path("name").asText());
+      }
+      return names;
+    }
+
+    private Set<String> fqnsForIndex(OpenMetadataClient client, String query, String index)
+        throws Exception {
+      String response =
+          client.search().query(query).index(index).size(200).deleted(false).execute();
+      JsonNode hits = OBJECT_MAPPER.readTree(response).path("hits").path("hits");
+      Set<String> fqns = new HashSet<>();
+      for (JsonNode hit : hits) {
+        fqns.add(hit.path("_source").path("fullyQualifiedName").asText());
+      }
+      return fqns;
+    }
+
+    private Set<String> fqnsFromDataAssetForType(
+        OpenMetadataClient client, String query, String entityType) throws Exception {
+      String response =
+          client.search().query(query).index("dataAsset").size(200).deleted(false).execute();
+      JsonNode hits = OBJECT_MAPPER.readTree(response).path("hits").path("hits");
+      Set<String> fqns = new HashSet<>();
+      for (JsonNode hit : hits) {
+        if (entityType.equals(hit.path("_source").path("entityType").asText())) {
+          fqns.add(hit.path("_source").path("fullyQualifiedName").asText());
+        }
+      }
+      return fqns;
+    }
+
+    private Topic createUniqueTopic(TestNamespace ns, String baseName, String tag) {
+      String shortId = ns.shortPrefix();
+      org.openmetadata.schema.services.connections.messaging.KafkaConnection kafkaConn =
+          new org.openmetadata.schema.services.connections.messaging.KafkaConnection()
+              .withBootstrapServers("localhost:9092");
+      org.openmetadata.schema.api.services.CreateMessagingService msgSvcReq =
+          new org.openmetadata.schema.api.services.CreateMessagingService()
+              .withName("topic_parity_svc_" + shortId + "_" + baseName)
+              .withServiceType(
+                  org.openmetadata.schema.api.services.CreateMessagingService.MessagingServiceType
+                      .Kafka)
+              .withConnection(
+                  new org.openmetadata.schema.type.MessagingConnection().withConfig(kafkaConn));
+      org.openmetadata.schema.entity.services.MessagingService msgService =
+          SdkClients.adminClient().messagingServices().create(msgSvcReq);
+
+      org.openmetadata.schema.api.data.CreateTopic topicRequest =
+          new org.openmetadata.schema.api.data.CreateTopic()
+              .withName(ns.prefix(tag + "_topic_" + baseName))
+              .withService(msgService.getFullyQualifiedName())
+              .withPartitions(1);
+      return SdkClients.adminClient().topics().create(topicRequest);
+    }
+
+    private long totalHitsForIndex(OpenMetadataClient client, String query, String index)
+        throws Exception {
+      String response = client.search().query(query).index(index).size(0).deleted(false).execute();
+      return OBJECT_MAPPER.readTree(response).path("hits").path("total").path("value").asLong();
+    }
+
+    private long bucketCountFromDataAsset(
+        OpenMetadataClient client, String query, String entityType) throws Exception {
+      String response =
+          client.search().query(query).index("dataAsset").size(0).deleted(false).execute();
+      JsonNode buckets =
+          OBJECT_MAPPER
+              .readTree(response)
+              .path("aggregations")
+              .path("sterms#entityType")
+              .path("buckets");
+      for (JsonNode bucket : buckets) {
+        if (entityType.equals(bucket.path("key").asText())) {
+          return bucket.path("doc_count").asLong();
+        }
+      }
+      return 0L;
+    }
+  }
+
   @Nested
   @DisplayName("Nested Column Tests")
   @Execution(ExecutionMode.CONCURRENT)
@@ -493,6 +887,53 @@ private Table createTableWithColumns(TestNamespace ns, String baseName) {
     return SdkClients.adminClient().tables().create(tableRequest);
   }
 
+  private Table createTableWithMultiTokenColumns(TestNamespace ns, String baseName, String tag) {
+    String shortId = ns.shortPrefix();
+
+    org.openmetadata.schema.services.connections.database.PostgresConnection conn =
+        DatabaseServices.postgresConnection().hostPort("localhost:5432").username("test").build();
+
+    DatabaseService dbService =
+        DatabaseServices.builder()
+            .name("agg_svc_" + shortId + "_" + baseName)
+            .connection(conn)
+            .description("Test service for dataAsset/tableColumn aggregation parity")
+            .create();
+
+    CreateDatabase dbReq = new CreateDatabase();
+    dbReq.setName("agg_db_" + shortId + "_" + baseName);
+    dbReq.setService(dbService.getFullyQualifiedName());
+    Database database = SdkClients.adminClient().databases().create(dbReq);
+
+    CreateDatabaseSchema schemaReq = new CreateDatabaseSchema();
+    schemaReq.setName("agg_schema_" + shortId + "_" + baseName);
+    schemaReq.setDatabase(database.getFullyQualifiedName());
+    DatabaseSchema schema = SdkClients.adminClient().databaseSchemas().create(schemaReq);
+
+    CreateTable tableRequest = new CreateTable();
+    tableRequest.setName(ns.prefix(baseName));
+    tableRequest.setDatabaseSchema(schema.getFullyQualifiedName());
+    tableRequest.setColumns(
+        List.of(
+            new Column()
+                .withName(tag + "_first_name")
+                .withDataType(ColumnDataType.VARCHAR)
+                .withDataLength(255)
+                .withDescription("First name"),
+            new Column()
+                .withName(tag + "_last_name")
+                .withDataType(ColumnDataType.VARCHAR)
+                .withDataLength(255)
+                .withDescription("Last name"),
+            new Column()
+                .withName(tag + "_address")
+                .withDataType(ColumnDataType.VARCHAR)
+                .withDataLength(512)
+                .withDescription("Postal address")));
+
+    return SdkClients.adminClient().tables().create(tableRequest);
+  }
+
   private Table createTableWithNestedColumns(TestNamespace ns, String baseName) {
     String shortId = ns.shortPrefix();
 
diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java
index ee62799dd9f2..0a3a2ae29210 100644
--- a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java
+++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java
@@ -18,9 +18,11 @@
 import es.co.elastic.clients.elasticsearch.core.search.Highlight;
 import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.stream.Collectors;
 import org.openmetadata.schema.api.search.Aggregation;
 import org.openmetadata.schema.api.search.AssetTypeConfiguration;
@@ -53,6 +55,7 @@ public class ElasticSearchSourceBuilderFactory
   private static final String MATCH_TYPE_STANDARD = "standard";
   private static final String INDEX_ALL = "all";
   private static final String INDEX_DATA_ASSET = "dataAsset";
+  private static final String ENTITY_TYPE_FIELD = "entityType";
   private static final String MINIMUM_SHOULD_MATCH = "2<70%";
   private static final float DEFAULT_TIE_BREAKER = 0.3f;
   private static final float DEFAULT_BOOST = 1.0f;
@@ -341,9 +344,9 @@ public ElasticSearchRequestBuilder getSearchSourceBuilderV2(
           indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations);
     }
 
-    if (indexName.equals("all") || indexName.equals("dataAsset")) {
-      return buildDataAssetSearchBuilderV2(
-          indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations);
+    if (indexName.equals(INDEX_ALL) || indexName.equals(INDEX_DATA_ASSET)) {
+      return buildAllAssetsSearchBuilderV2(
+          searchQuery, fromOffset, size, includeExplain, includeAggregations);
     }
 
     return switch (indexName) {
@@ -374,21 +377,33 @@ public ElasticSearchRequestBuilder buildColumnSearchBuilderV2(String query, int
       queryBuilder =
           es.co.elastic.clients.elasticsearch._types.query_dsl.Query.of(q -> q.matchAll(m -> m));
     } else {
-      Map<String, Float> fields = ColumnSearchIndex.getFields();
-      queryBuilder =
-          ElasticQueryBuilder.multiMatchQuery(
-              query,
-              fields,
-              es.co.elastic.clients.elasticsearch._types.query_dsl.TextQueryType.BestFields,
-              es.co.elastic.clients.elasticsearch._types.query_dsl.Operator.Or,
-              String.valueOf(DEFAULT_TIE_BREAKER),
-              "0");
+      queryBuilder = buildColumnMultiMatchV2(query);
     }
     es.co.elastic.clients.elasticsearch.core.search.Highlight hb =
         buildHighlightsV2(List.of("name", "displayName", "description"));
     return searchBuilderV2(queryBuilder, hb, from, size);
   }
 
+  /**
+   * Multi-match used both by {@code index=tableColumn} and the column-scoped should clause in the
+   * {@code dataAsset} composite query. Uses {@link
+   * es.co.elastic.clients.elasticsearch._types.query_dsl.Operator#And} so every sub-token produced
+   * by {@code om_analyzer} must hit some field. Without {@code And}, a query like {@code
+   * first_name} matches any column whose name contains just {@code first} or just {@code name},
+   * which both inflates the column index hits and creates the dataAsset/tableColumn count
+   * mismatch tracked in github issue #3851.
+   */
+  private es.co.elastic.clients.elasticsearch._types.query_dsl.Query buildColumnMultiMatchV2(
+      String query) {
+    return ElasticQueryBuilder.multiMatchQuery(
+        query,
+        ColumnSearchIndex.getFields(),
+        es.co.elastic.clients.elasticsearch._types.query_dsl.TextQueryType.BestFields,
+        es.co.elastic.clients.elasticsearch._types.query_dsl.Operator.And,
+        String.valueOf(DEFAULT_TIE_BREAKER),
+        "0");
+  }
+
   public ElasticSearchRequestBuilder buildServiceSearchBuilderV2(String query, int from, int size) {
     es.co.elastic.clients.elasticsearch._types.query_dsl.Query queryBuilder =
         buildSearchQueryBuilderV2(query, SearchIndex.getDefaultFields());
@@ -437,6 +452,87 @@ public ElasticSearchRequestBuilder buildDataAssetSearchBuilderV2(
     return searchRequestBuilder;
   }
 
+  /**
+   * Build a search source for the {@code all} / {@code dataAsset} alias as a per-entity-type
+   * union: each asset type contributes a clause built with its own configuration (column docs go
+   * through {@link #buildColumnMultiMatchV2(String)}, every other type through {@link
+   * #buildBaseQueryV2(String, AssetTypeConfiguration)}), filtered by {@code entityType=<type>}.
+   * Each entity-type bucket in the aggregation therefore equals what the dedicated index returns
+   * for the same query, by construction. Avoids the composite-config divergence behind
+   * github.com/open-metadata/openmetadata-collate#3851.
+   */
+  public ElasticSearchRequestBuilder buildAllAssetsSearchBuilderV2(
+      String query, int from, int size, boolean explain, boolean includeAggregations) {
+    AssetTypeConfiguration compositeConfig = buildCompositeAssetConfig(searchSettings);
+    es.co.elastic.clients.elasticsearch._types.query_dsl.Query baseQuery =
+        buildPerTypeUnionQueryV2(query);
+    es.co.elastic.clients.elasticsearch._types.query_dsl.Query finalQuery =
+        applyFunctionScoringV2(baseQuery, compositeConfig);
+    es.co.elastic.clients.elasticsearch.core.search.Highlight highlightBuilder =
+        buildHighlightingIfNeededV2(query, compositeConfig);
+
+    ElasticSearchRequestBuilder searchRequestBuilder =
+        createSearchSourceBuilderV2(finalQuery, from, size);
+    if (highlightBuilder != null) {
+      searchRequestBuilder.highlighter(highlightBuilder);
+    }
+    if (includeAggregations) {
+      addConfiguredAggregationsV2(searchRequestBuilder, compositeConfig);
+    }
+    searchRequestBuilder.explain(explain);
+    return searchRequestBuilder;
+  }
+
+  private es.co.elastic.clients.elasticsearch._types.query_dsl.Query buildPerTypeUnionQueryV2(
+      String query) {
+    if (isMatchAllQuery(query)) {
+      return ElasticQueryBuilder.boolQuery().must(ElasticQueryBuilder.matchAllQuery()).build();
+    }
+    ElasticQueryBuilder.BoolQueryBuilder union = ElasticQueryBuilder.boolQuery();
+    Set<String> configuredTypes = new HashSet<>();
+    for (AssetTypeConfiguration typeConfig : searchSettings.getAssetTypeConfigurations()) {
+      String assetType = typeConfig.getAssetType();
+      if (assetType == null || assetType.equals(INDEX_ALL)) {
+        continue;
+      }
+      configuredTypes.add(assetType);
+      union.should(buildAssetTypeClauseV2(query, assetType, typeConfig));
+    }
+    union.should(buildUnconfiguredAssetFallbackV2(query, configuredTypes));
+    union.minimumShouldMatch(1);
+    return union.build();
+  }
+
+  private static boolean isMatchAllQuery(String query) {
+    return query == null || query.trim().isEmpty() || query.trim().equals("*");
+  }
+
+  private es.co.elastic.clients.elasticsearch._types.query_dsl.Query buildAssetTypeClauseV2(
+      String query, String assetType, AssetTypeConfiguration typeConfig) {
+    es.co.elastic.clients.elasticsearch._types.query_dsl.Query inner =
+        Entity.TABLE_COLUMN.equals(assetType)
+            ? buildColumnMultiMatchV2(query)
+            : buildBaseQueryV2(query, typeConfig);
+    return ElasticQueryBuilder.boolQuery()
+        .filter(ElasticQueryBuilder.termQuery(ENTITY_TYPE_FIELD, assetType))
+        .must(inner)
+        .build();
+  }
+
+  /**
+   * Catches asset types that are part of the {@code dataAsset} alias but lack a dedicated entry in
+   * {@code searchSettings.assetTypeConfigurations} (e.g. {@code glossary}, {@code apiCollection}).
+   */
+  private es.co.elastic.clients.elasticsearch._types.query_dsl.Query
+      buildUnconfiguredAssetFallbackV2(String query, Set<String> configuredTypes) {
+    ElasticQueryBuilder.BoolQueryBuilder fallback =
+        ElasticQueryBuilder.boolQuery().must(buildBaseQueryV2(query, getOrCreateDefaultConfig()));
+    for (String configured : configuredTypes) {
+      fallback.mustNot(ElasticQueryBuilder.termQuery(ENTITY_TYPE_FIELD, configured));
+    }
+    return fallback.build();
+  }
+
   public ElasticSearchRequestBuilder buildAggregateSearchBuilderV2(
       String query, int from, int size) {
     return buildAggregateSearchBuilderV2(query, from, size, true);
diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/indexes/ColumnSearchIndex.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/indexes/ColumnSearchIndex.java
index 1b9a76525611..9e4ea9659488 100644
--- a/openmetadata-service/src/main/java/org/openmetadata/service/search/indexes/ColumnSearchIndex.java
+++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/indexes/ColumnSearchIndex.java
@@ -153,8 +153,12 @@ public static Map<String, Float> getFields() {
     Map<String, Float> fields = new HashMap<>();
     fields.put("name", 10.0f);
     fields.put("name.keyword", 20.0f);
+    fields.put("name.ngram", 1.0f);
+    fields.put("name.compound", 5.0f);
     fields.put("displayName", 7.0f);
     fields.put("displayName.keyword", 20.0f);
+    fields.put("displayName.ngram", 1.0f);
+    fields.put("displayName.compound", 4.0f);
     fields.put("fullyQualifiedName", 5.0f);
     fields.put("description", 2.0f);
     fields.put("dataType", 3.0f);
diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java
index 7deb93c7d7da..fdd7bd107981 100644
--- a/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java
+++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java
@@ -14,9 +14,11 @@
 
 import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.stream.Collectors;
 import lombok.extern.slf4j.Slf4j;
 import org.openmetadata.schema.api.search.Aggregation;
@@ -50,6 +52,7 @@ public class OpenSearchSourceBuilderFactory
   private static final String MATCH_TYPE_STANDARD = "standard";
   private static final String INDEX_ALL = "all";
   private static final String INDEX_DATA_ASSET = "dataAsset";
+  private static final String ENTITY_TYPE_FIELD = "entityType";
   private static final String MINIMUM_SHOULD_MATCH = "2<70%";
   private static final float DEFAULT_TIE_BREAKER = 0.3f;
   private static final float DEFAULT_BOOST = 1.0f;
@@ -322,9 +325,9 @@ public OpenSearchRequestBuilder getSearchSourceBuilderV2(
           indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations);
     }
 
-    if (indexName.equals("all") || indexName.equals("dataAsset")) {
-      return buildDataAssetSearchBuilderV2(
-          indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations);
+    if (indexName.equals(INDEX_ALL) || indexName.equals(INDEX_DATA_ASSET)) {
+      return buildAllAssetsSearchBuilderV2(
+          searchQuery, fromOffset, size, includeExplain, includeAggregations);
     }
 
     return switch (indexName) {
@@ -374,21 +377,33 @@ public OpenSearchRequestBuilder buildColumnSearchBuilderV2(String query, int fro
       queryBuilder =
           os.org.opensearch.client.opensearch._types.query_dsl.Query.of(q -> q.matchAll(m -> m));
     } else {
-      Map<String, Float> fields = ColumnSearchIndex.getFields();
-      queryBuilder =
-          OpenSearchQueryBuilder.multiMatchQuery(
-              query,
-              fields,
-              os.org.opensearch.client.opensearch._types.query_dsl.TextQueryType.BestFields,
-              os.org.opensearch.client.opensearch._types.query_dsl.Operator.Or,
-              String.valueOf(DEFAULT_TIE_BREAKER),
-              "0");
+      queryBuilder = buildColumnMultiMatchV2(query);
     }
     os.org.opensearch.client.opensearch.core.search.Highlight highlighter =
         buildHighlightsV2(List.of("name", "displayName", "description"));
     return searchBuilderV2(queryBuilder, highlighter, from, size);
   }
 
+  /**
+   * Multi-match used both by {@code index=tableColumn} and the column-scoped should clause in the
+   * {@code dataAsset} composite query. Uses {@link
+   * os.org.opensearch.client.opensearch._types.query_dsl.Operator#And} so every sub-token produced
+   * by {@code om_analyzer} (which splits on letter/digit/underscore boundaries) must hit some
+   * field. Without {@code And}, a query like {@code first_name} matches any column whose name
+   * contains just {@code first} or just {@code name}, which both inflates the column index hits
+   * and creates the dataAsset/tableColumn count mismatch tracked in github issue #3851.
+   */
+  private os.org.opensearch.client.opensearch._types.query_dsl.Query buildColumnMultiMatchV2(
+      String query) {
+    return OpenSearchQueryBuilder.multiMatchQuery(
+        query,
+        ColumnSearchIndex.getFields(),
+        os.org.opensearch.client.opensearch._types.query_dsl.TextQueryType.BestFields,
+        os.org.opensearch.client.opensearch._types.query_dsl.Operator.And,
+        String.valueOf(DEFAULT_TIE_BREAKER),
+        "0");
+  }
+
   public OpenSearchRequestBuilder buildServiceSearchBuilderV2(String query, int from, int size) {
     os.org.opensearch.client.opensearch._types.query_dsl.Query queryBuilder =
         buildSearchQueryBuilderV2(query, SearchIndex.getDefaultFields());
@@ -452,6 +467,92 @@ public OpenSearchRequestBuilder buildDataAssetSearchBuilderV2(
     return searchRequestBuilder;
   }
 
+  /**
+   * Build a search source for the {@code all} / {@code dataAsset} alias as a per-entity-type
+   * union: each asset type contributes a clause built with its own configuration (column docs go
+   * through {@link #buildColumnMultiMatchV2(String)}, every other type through {@link
+   * #buildBaseQueryV2(String, AssetTypeConfiguration)}), filtered by {@code entityType=<type>}.
+   * Each entity-type bucket in the aggregation therefore equals what the dedicated index returns
+   * for the same query, by construction. Avoids the composite-config divergence behind
+   * github.com/open-metadata/openmetadata-collate#3851.
+   */
+  public OpenSearchRequestBuilder buildAllAssetsSearchBuilderV2(
+      String query, int from, int size, boolean explain, boolean includeAggregations) {
+    AssetTypeConfiguration compositeConfig = getOrBuildCompositeConfig();
+    os.org.opensearch.client.opensearch._types.query_dsl.Query baseQuery =
+        buildPerTypeUnionQueryV2(query);
+    os.org.opensearch.client.opensearch._types.query_dsl.Query finalQuery =
+        applyFunctionScoringV2(baseQuery, compositeConfig);
+    os.org.opensearch.client.opensearch.core.search.Highlight highlightBuilder =
+        buildHighlightingIfNeededV2(query, compositeConfig);
+
+    OpenSearchRequestBuilder searchRequestBuilder =
+        createSearchSourceBuilderV2(finalQuery, from, size);
+    if (highlightBuilder != null) {
+      searchRequestBuilder.highlighter(highlightBuilder);
+    }
+    if (includeAggregations) {
+      addConfiguredAggregationsV2(searchRequestBuilder, compositeConfig);
+    }
+    searchRequestBuilder.explain(explain);
+    return searchRequestBuilder;
+  }
+
+  private os.org.opensearch.client.opensearch._types.query_dsl.Query buildPerTypeUnionQueryV2(
+      String query) {
+    if (isMatchAllQuery(query)) {
+      return OpenSearchQueryBuilder.boolQuery()
+          .must(OpenSearchQueryBuilder.matchAllQuery())
+          .build();
+    }
+    OpenSearchQueryBuilder.BoolQueryBuilder union = OpenSearchQueryBuilder.boolQuery();
+    Set<String> configuredTypes = new HashSet<>();
+    for (AssetTypeConfiguration typeConfig : searchSettings.getAssetTypeConfigurations()) {
+      String assetType = typeConfig.getAssetType();
+      if (assetType == null || assetType.equals(INDEX_ALL)) {
+        continue;
+      }
+      configuredTypes.add(assetType);
+      union.should(buildAssetTypeClauseV2(query, assetType, typeConfig));
+    }
+    union.should(buildUnconfiguredAssetFallbackV2(query, configuredTypes));
+    union.minimumShouldMatch(1);
+    return union.build();
+  }
+
+  private static boolean isMatchAllQuery(String query) {
+    return query == null || query.trim().isEmpty() || query.trim().equals("*");
+  }
+
+  private os.org.opensearch.client.opensearch._types.query_dsl.Query buildAssetTypeClauseV2(
+      String query, String assetType, AssetTypeConfiguration typeConfig) {
+    os.org.opensearch.client.opensearch._types.query_dsl.Query inner =
+        Entity.TABLE_COLUMN.equals(assetType)
+            ? buildColumnMultiMatchV2(query)
+            : buildBaseQueryV2(query, typeConfig);
+    return OpenSearchQueryBuilder.boolQuery()
+        .filter(OpenSearchQueryBuilder.termQuery(ENTITY_TYPE_FIELD, assetType))
+        .must(inner)
+        .build();
+  }
+
+  /**
+   * Catches asset types that are part of the {@code dataAsset} alias but lack a dedicated entry in
+   * {@code searchSettings.assetTypeConfigurations} (e.g. {@code glossary}, {@code apiCollection}).
+   * Without this, docs of those types would silently disappear from the dataAsset alias after the
+   * per-type-union refactor.
+   */
+  private os.org.opensearch.client.opensearch._types.query_dsl.Query
+      buildUnconfiguredAssetFallbackV2(String query, Set<String> configuredTypes) {
+    OpenSearchQueryBuilder.BoolQueryBuilder fallback =
+        OpenSearchQueryBuilder.boolQuery()
+            .must(buildBaseQueryV2(query, getOrCreateDefaultConfig()));
+    for (String configured : configuredTypes) {
+      fallback.mustNot(OpenSearchQueryBuilder.termQuery(ENTITY_TYPE_FIELD, configured));
+    }
+    return fallback.build();
+  }
+
   private os.org.opensearch.client.opensearch._types.query_dsl.Query buildBaseQueryV2(
       String query, AssetTypeConfiguration assetConfig) {
     if (query == null || query.trim().isEmpty() || query.trim().equals("*")) {

From 718ceb9de327df8b35a3ae2dddcc96207555a5f5 Mon Sep 17 00:00:00 2001
From: mohitdeuex <mohit.y@deuexsolutions.com>
Date: Thu, 30 Apr 2026 17:05:27 +0530
Subject: [PATCH 8/9] fix(search): address PR review feedback

Bug fixes the reviewer flagged:

- buildPerTypeUnionQueryV2 (both factories) now guards against
  searchSettings.getAssetTypeConfigurations() returning null/empty by
  falling back to the default config so dataAsset/all queries can't NPE if
  the search settings haven't been initialized.
- testDataAssetTableColumnAggregationMatchesTableColumnTotal previously
  awaited a multi-token query that no single seeded column could satisfy
  under the new Operator.And semantics, leaving the parity assertion
  trivially 0 == 0; switched to a query that actually matches multiple
  seeded columns.
- testColumnQueryRequiresAllSubtokensToMatch was relying on a column name
  (`<tag>_first_id`) that was never created, so the negative assertion was
  trivially true. createTableWithMultiTokenColumns now seeds first_id (and
  alpha/bravo columns used by the complex-syntax and prefix tests) so all
  the parity assertions exercise real indexed documents.
- bucketCountFromDataAsset accepts both `entityType` and
  `sterms#entityType` aggregation key shapes so the helper doesn't break
  on backends that label the bucket differently.
- Javadoc on buildColumnMultiMatchV2 (both factories) corrected to
  describe the actual previous shape (`Operator.Or` + `fuzziness="0"`,
  which left minimum_should_match unset) instead of the inaccurate
  `min_should_match=0`.

Issue: open-metadata/openmetadata-collate#3851

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../it/tests/ColumnSearchIndexIT.java         | 91 ++++++++-----------
 .../ElasticSearchSourceBuilderFactory.java    | 22 +++--
 .../OpenSearchSourceBuilderFactory.java       | 20 ++--
 3 files changed, 65 insertions(+), 68 deletions(-)

diff --git a/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java b/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java
index 20c3d8110422..943da589397b 100644
--- a/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java
+++ b/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java
@@ -385,59 +385,22 @@ void testDataAssetTableColumnAggregationMatchesTableColumnTotal(TestNamespace ns
       Table table = createTableWithMultiTokenColumns(ns, "agg_parity_" + tag, tag);
       assertNotNull(table);
 
-      String multiTokenQuery = tag + "_first " + tag + "_address";
+      // Two analyzed sub-tokens (`<tag>` and `name`); seeded columns first_name and last_name
+      // contain both, so the AND-based column builder produces a non-zero match count we can
+      // assert real parity against (not a trivial 0 == 0).
+      String multiTokenQuery = tag + " name";
 
       Awaitility.await()
           .atMost(90, TimeUnit.SECONDS)
           .pollInterval(500, TimeUnit.MILLISECONDS)
-          .until(
-              () -> {
-                String r =
-                    client
-                        .search()
-                        .query(multiTokenQuery)
-                        .index("tableColumn")
-                        .size(0)
-                        .deleted(false)
-                        .execute();
-                JsonNode root = OBJECT_MAPPER.readTree(r);
-                long total = root.path("hits").path("total").path("value").asLong(-1);
-                return total >= 3;
-              });
-
-      String columnResponse =
-          client
-              .search()
-              .query(multiTokenQuery)
-              .index("tableColumn")
-              .size(0)
-              .deleted(false)
-              .execute();
-      long columnTotal =
-          OBJECT_MAPPER.readTree(columnResponse).path("hits").path("total").path("value").asLong();
+          .until(() -> totalHitsForIndex(client, multiTokenQuery, "tableColumn") >= 2);
 
-      String aggResponse =
-          client
-              .search()
-              .query(multiTokenQuery)
-              .index("dataAsset")
-              .size(0)
-              .deleted(false)
-              .execute();
-      JsonNode aggBuckets =
-          OBJECT_MAPPER
-              .readTree(aggResponse)
-              .path("aggregations")
-              .path("sterms#entityType")
-              .path("buckets");
-      long aggColumnCount = 0;
-      for (JsonNode bucket : aggBuckets) {
-        if ("tableColumn".equals(bucket.path("key").asText())) {
-          aggColumnCount = bucket.path("doc_count").asLong();
-          break;
-        }
-      }
+      long columnTotal = totalHitsForIndex(client, multiTokenQuery, "tableColumn");
+      long aggColumnCount = bucketCountFromDataAsset(client, multiTokenQuery, Entity.TABLE_COLUMN);
 
+      assertTrue(
+          columnTotal >= 2,
+          "Seeded columns first_name and last_name should match query " + multiTokenQuery);
       assertEquals(
           columnTotal,
           aggColumnCount,
@@ -732,13 +695,12 @@ private long bucketCountFromDataAsset(
         OpenMetadataClient client, String query, String entityType) throws Exception {
       String response =
           client.search().query(query).index("dataAsset").size(0).deleted(false).execute();
-      JsonNode buckets =
-          OBJECT_MAPPER
-              .readTree(response)
-              .path("aggregations")
-              .path("sterms#entityType")
-              .path("buckets");
-      for (JsonNode bucket : buckets) {
+      JsonNode aggregations = OBJECT_MAPPER.readTree(response).path("aggregations");
+      JsonNode entityTypeAgg = aggregations.path("entityType");
+      if (entityTypeAgg.isMissingNode()) {
+        entityTypeAgg = aggregations.path("sterms#entityType");
+      }
+      for (JsonNode bucket : entityTypeAgg.path("buckets")) {
         if (entityType.equals(bucket.path("key").asText())) {
           return bucket.path("doc_count").asLong();
         }
@@ -920,6 +882,10 @@ private Table createTableWithMultiTokenColumns(TestNamespace ns, String baseName
                 .withDataType(ColumnDataType.VARCHAR)
                 .withDataLength(255)
                 .withDescription("First name"),
+            new Column()
+                .withName(tag + "_first_id")
+                .withDataType(ColumnDataType.INT)
+                .withDescription("Decoy: shares only the 'first' sub-token with first_name"),
             new Column()
                 .withName(tag + "_last_name")
                 .withDataType(ColumnDataType.VARCHAR)
@@ -929,11 +895,26 @@ private Table createTableWithMultiTokenColumns(TestNamespace ns, String baseName
                 .withName(tag + "_address")
                 .withDataType(ColumnDataType.VARCHAR)
                 .withDataLength(512)
-                .withDescription("Postal address")));
+                .withDescription("Postal address"),
+            new Column()
+                .withName(tag + "_alpha_amount")
+                .withDataType(ColumnDataType.DECIMAL)
+                .withDescription("Alpha amount column for complex-syntax tests"),
+            new Column()
+                .withName(tag + "_alpha_address")
+                .withDataType(ColumnDataType.VARCHAR)
+                .withDataLength(512)
+                .withDescription("Alpha address column"),
+            new Column()
+                .withName(tag + "_bravo_total")
+                .withDataType(ColumnDataType.DECIMAL)
+                .withDescription("Bravo total column")));
 
     return SdkClients.adminClient().tables().create(tableRequest);
   }
 
+  private static final int MULTI_TOKEN_SEED_COUNT = 7;
+
   private Table createTableWithNestedColumns(TestNamespace ns, String baseName) {
     String shortId = ns.shortPrefix();
 
diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java
index 0a3a2ae29210..821918f47372 100644
--- a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java
+++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java
@@ -385,13 +385,15 @@ public ElasticSearchRequestBuilder buildColumnSearchBuilderV2(String query, int
   }
 
   /**
-   * Multi-match used both by {@code index=tableColumn} and the column-scoped should clause in the
-   * {@code dataAsset} composite query. Uses {@link
+   * Multi-match used both by {@code index=tableColumn} and the column branch of {@link
+   * #buildPerTypeUnionQueryV2(String)}. Uses {@link
    * es.co.elastic.clients.elasticsearch._types.query_dsl.Operator#And} so every sub-token produced
-   * by {@code om_analyzer} must hit some field. Without {@code And}, a query like {@code
-   * first_name} matches any column whose name contains just {@code first} or just {@code name},
-   * which both inflates the column index hits and creates the dataAsset/tableColumn count
-   * mismatch tracked in github issue #3851.
+   * by {@code om_analyzer} must hit some field. The previous shape used {@code Operator.Or} with
+   * this helper's {@code fuzziness="0"} — that combination did not set {@code
+   * minimum_should_match}, so any single sub-token in any field was enough to match. As a result
+   * a search like {@code first_name} matched columns whose name contained just {@code first} or
+   * just {@code name}, which both inflated the column index hits and caused the
+   * dataAsset/tableColumn count mismatch tracked in github issue #3851.
    */
   private es.co.elastic.clients.elasticsearch._types.query_dsl.Query buildColumnMultiMatchV2(
       String query) {
@@ -488,9 +490,15 @@ private es.co.elastic.clients.elasticsearch._types.query_dsl.Query buildPerTypeU
     if (isMatchAllQuery(query)) {
       return ElasticQueryBuilder.boolQuery().must(ElasticQueryBuilder.matchAllQuery()).build();
     }
+    List<AssetTypeConfiguration> configs = searchSettings.getAssetTypeConfigurations();
+    if (configs == null || configs.isEmpty()) {
+      return ElasticQueryBuilder.boolQuery()
+          .must(buildBaseQueryV2(query, getOrCreateDefaultConfig()))
+          .build();
+    }
     ElasticQueryBuilder.BoolQueryBuilder union = ElasticQueryBuilder.boolQuery();
     Set<String> configuredTypes = new HashSet<>();
-    for (AssetTypeConfiguration typeConfig : searchSettings.getAssetTypeConfigurations()) {
+    for (AssetTypeConfiguration typeConfig : configs) {
       String assetType = typeConfig.getAssetType();
       if (assetType == null || assetType.equals(INDEX_ALL)) {
         continue;
diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java
index fdd7bd107981..e749a7c02467 100644
--- a/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java
+++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java
@@ -385,13 +385,15 @@ public OpenSearchRequestBuilder buildColumnSearchBuilderV2(String query, int fro
   }
 
   /**
-   * Multi-match used both by {@code index=tableColumn} and the column-scoped should clause in the
-   * {@code dataAsset} composite query. Uses {@link
+   * Multi-match used both by {@code index=tableColumn} and the column branch of {@link
+   * #buildPerTypeUnionQueryV2(String)}. Uses {@link
    * os.org.opensearch.client.opensearch._types.query_dsl.Operator#And} so every sub-token produced
    * by {@code om_analyzer} (which splits on letter/digit/underscore boundaries) must hit some
-   * field. Without {@code And}, a query like {@code first_name} matches any column whose name
-   * contains just {@code first} or just {@code name}, which both inflates the column index hits
-   * and creates the dataAsset/tableColumn count mismatch tracked in github issue #3851.
+   * field. The previous shape used {@code Operator.Or} with this helper's {@code fuzziness="0"}
+   * — that combination did not set {@code minimum_should_match}, so any single sub-token in any
+   * field was enough to match. As a result a search like {@code first_name} matched columns whose
+   * name contained just {@code first} or just {@code name}, which both inflated the column index
+   * hits and caused the dataAsset/tableColumn count mismatch tracked in github issue #3851.
    */
   private os.org.opensearch.client.opensearch._types.query_dsl.Query buildColumnMultiMatchV2(
       String query) {
@@ -505,9 +507,15 @@ private os.org.opensearch.client.opensearch._types.query_dsl.Query buildPerTypeU
           .must(OpenSearchQueryBuilder.matchAllQuery())
           .build();
     }
+    List<AssetTypeConfiguration> configs = searchSettings.getAssetTypeConfigurations();
+    if (configs == null || configs.isEmpty()) {
+      return OpenSearchQueryBuilder.boolQuery()
+          .must(buildBaseQueryV2(query, getOrCreateDefaultConfig()))
+          .build();
+    }
     OpenSearchQueryBuilder.BoolQueryBuilder union = OpenSearchQueryBuilder.boolQuery();
     Set<String> configuredTypes = new HashSet<>();
-    for (AssetTypeConfiguration typeConfig : searchSettings.getAssetTypeConfigurations()) {
+    for (AssetTypeConfiguration typeConfig : configs) {
       String assetType = typeConfig.getAssetType();
       if (assetType == null || assetType.equals(INDEX_ALL)) {
         continue;

From 90df4c2e03dcba415a0b2e34e67687248894e0a9 Mon Sep 17 00:00:00 2001
From: mohitdeuex <mohit.y@deuexsolutions.com>
Date: Thu, 30 Apr 2026 17:28:58 +0530
Subject: [PATCH 9/9] perf(search): collapse fallback mustNot chain into a
 single termsQuery
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a `termsQuery(String, Collection<String>)` helper to both
OpenSearchQueryBuilder and ElasticQueryBuilder, then uses it in
buildUnconfiguredAssetFallbackV2 to replace the per-type `mustNot
term(entityType=...)` chain with a single `mustNot terms(entityType=[...])`.
Keeps the bool query small regardless of how many configured asset types
exist — the previous shape grew one clause per configured type.

Issue: open-metadata/openmetadata-collate#3851

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../service/search/elasticsearch/ElasticQueryBuilder.java  | 7 +++++++
 .../elasticsearch/ElasticSearchSourceBuilderFactory.java   | 4 ++--
 .../service/search/opensearch/OpenSearchQueryBuilder.java  | 6 ++++++
 .../search/opensearch/OpenSearchSourceBuilderFactory.java  | 4 ++--
 4 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticQueryBuilder.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticQueryBuilder.java
index 543e75d6363a..48d43a26dd29 100644
--- a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticQueryBuilder.java
+++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticQueryBuilder.java
@@ -1,10 +1,12 @@
 package org.openmetadata.service.search.elasticsearch;
 
+import es.co.elastic.clients.elasticsearch._types.FieldValue;
 import es.co.elastic.clients.elasticsearch._types.query_dsl.Operator;
 import es.co.elastic.clients.elasticsearch._types.query_dsl.Query;
 import es.co.elastic.clients.elasticsearch._types.query_dsl.QueryStringQuery;
 import es.co.elastic.clients.elasticsearch._types.query_dsl.TextQueryType;
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.List;
 import java.util.Map;
 
@@ -28,6 +30,11 @@ public static Query termQuery(String field, int value) {
     return Query.of(q -> q.term(t -> t.field(field).value(value)));
   }
 
+  public static Query termsQuery(String field, Collection<String> values) {
+    List<FieldValue> fieldValues = values.stream().map(FieldValue::of).toList();
+    return Query.of(q -> q.terms(t -> t.field(field).terms(tf -> tf.value(fieldValues))));
+  }
+
   public static Query matchQuery(String field, String value) {
     return Query.of(q -> q.match(m -> m.field(field).query(value)));
   }
diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java
index 821918f47372..a2c2898f11e0 100644
--- a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java
+++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java
@@ -535,8 +535,8 @@ private es.co.elastic.clients.elasticsearch._types.query_dsl.Query buildAssetTyp
       buildUnconfiguredAssetFallbackV2(String query, Set<String> configuredTypes) {
     ElasticQueryBuilder.BoolQueryBuilder fallback =
         ElasticQueryBuilder.boolQuery().must(buildBaseQueryV2(query, getOrCreateDefaultConfig()));
-    for (String configured : configuredTypes) {
-      fallback.mustNot(ElasticQueryBuilder.termQuery(ENTITY_TYPE_FIELD, configured));
+    if (!configuredTypes.isEmpty()) {
+      fallback.mustNot(ElasticQueryBuilder.termsQuery(ENTITY_TYPE_FIELD, configuredTypes));
     }
     return fallback.build();
   }
diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchQueryBuilder.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchQueryBuilder.java
index bbe429084cd8..a01f2e07fca8 100644
--- a/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchQueryBuilder.java
+++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchQueryBuilder.java
@@ -1,6 +1,7 @@
 package org.openmetadata.service.search.opensearch;
 
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.List;
 import java.util.Map;
 import os.org.opensearch.client.opensearch._types.FieldValue;
@@ -29,6 +30,11 @@ public static Query termQuery(String field, int value) {
     return Query.of(q -> q.term(t -> t.field(field).value(FieldValue.of(value))));
   }
 
+  public static Query termsQuery(String field, Collection<String> values) {
+    List<FieldValue> fieldValues = values.stream().map(FieldValue::of).toList();
+    return Query.of(q -> q.terms(t -> t.field(field).terms(tf -> tf.value(fieldValues))));
+  }
+
   public static Query matchQuery(String field, String value) {
     return Query.of(q -> q.match(m -> m.field(field).query(FieldValue.of(value))));
   }
diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java
index e749a7c02467..9273c46f4bd2 100644
--- a/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java
+++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java
@@ -555,8 +555,8 @@ private os.org.opensearch.client.opensearch._types.query_dsl.Query buildAssetTyp
     OpenSearchQueryBuilder.BoolQueryBuilder fallback =
         OpenSearchQueryBuilder.boolQuery()
             .must(buildBaseQueryV2(query, getOrCreateDefaultConfig()));
-    for (String configured : configuredTypes) {
-      fallback.mustNot(OpenSearchQueryBuilder.termQuery(ENTITY_TYPE_FIELD, configured));
+    if (!configuredTypes.isEmpty()) {
+      fallback.mustNot(OpenSearchQueryBuilder.termsQuery(ENTITY_TYPE_FIELD, configuredTypes));
     }
     return fallback.build();
   }