open-metadata · mohityadav766 · Apr 30, 2026 · Apr 30, 2026 · Apr 30, 2026 · Apr 30, 2026
diff --git a/...tadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java b/...tadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java
@@ -20,8 +20,11 @@
 
 import com.fasterxml.jackson.databind.JsonNode;
 import com.fasterxml.jackson.databind.ObjectMapper;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
 import java.util.concurrent.TimeUnit;
+import org.awaitility.Awaitility;
 import org.junit.jupiter.api.DisplayName;
 import org.junit.jupiter.api.Nested;
 import org.junit.jupiter.api.Test;
@@ -33,15 +36,18 @@
 import org.openmetadata.it.util.TestNamespaceExtension;
 import org.openmetadata.schema.api.data.CreateDatabase;
 import org.openmetadata.schema.api.data.CreateDatabaseSchema;
+import org.openmetadata.schema.api.data.CreateGlossary;
 import org.openmetadata.schema.api.data.CreateTable;
 import org.openmetadata.schema.entity.data.Database;
 import org.openmetadata.schema.entity.data.DatabaseSchema;
+import org.openmetadata.schema.entity.data.Glossary;
 import org.openmetadata.schema.entity.data.Table;
 import org.openmetadata.schema.entity.services.DatabaseService;
 import org.openmetadata.schema.type.Column;
 import org.openmetadata.schema.type.ColumnDataType;
 import org.openmetadata.sdk.client.OpenMetadataClient;
 import org.openmetadata.sdk.fluent.DatabaseServices;
+import org.openmetadata.service.Entity;
 
 /**
  * Integration tests for column search indexing during table reindexing. Verifies:
@@ -353,6 +359,268 @@ void testColumnFilterByDatabase(TestNamespace ns) throws Exception {
     }
   }
 
+  @Nested
+  @DisplayName("DataAsset/TableColumn Aggregation Parity Tests")
+  @Execution(ExecutionMode.CONCURRENT)
+  class DataAssetColumnAggregationTests {
+
+    /**
+     * Regression for github.com/open-metadata/openmetadata-collate/issues/3851. The UI fires two
+     * requests when the user types a multi-word query: {@code index=dataAsset&size=0} for the
+     * entity-type aggregation and {@code index=tableColumn} for the column hits. They route through
+     * different builders in {@code OpenSearchSourceBuilderFactory}: {@code tableColumn} uses the
+     * dedicated lenient column builder while {@code dataAsset} uses the composite asset config with
+     * stricter phrase/{@code 2<70%} matching. Without the fix, the {@code tableColumn} bucket in the
+     * {@code dataAsset} aggregation under-counts the column index hits.
+     */
+    @Test
+    @DisplayName(
+        "Aggregation bucket for tableColumn under dataAsset must match index=tableColumn total")
+    void testDataAssetTableColumnAggregationMatchesTableColumnTotal(TestNamespace ns)
+        throws Exception {
+      OpenMetadataClient client = SdkClients.adminClient();
+
+      String tag = ns.shortPrefix();
+      Table table = createTableWithMultiTokenColumns(ns, "agg_parity_" + tag, tag);
+      assertNotNull(table);
+
+      String multiTokenQuery = tag + "_first " + tag + "_address";
+
+      Awaitility.await()
+          .atMost(90, TimeUnit.SECONDS)
+          .pollInterval(500, TimeUnit.MILLISECONDS)
+          .until(
+              () -> {
+                String r =
+                    client
+                        .search()
+                        .query(multiTokenQuery)
+                        .index("tableColumn")
+                        .size(0)
+                        .deleted(false)
+                        .execute();
+                JsonNode root = OBJECT_MAPPER.readTree(r);
+                long total = root.path("hits").path("total").path("value").asLong(-1);
+                return total >= 3;
+              });
+
+      String columnResponse =
+          client
+              .search()
+              .query(multiTokenQuery)
+              .index("tableColumn")
+              .size(0)
+              .deleted(false)
+              .execute();
+      long columnTotal =
+          OBJECT_MAPPER.readTree(columnResponse).path("hits").path("total").path("value").asLong();
+
+      String aggResponse =
+          client
+              .search()
+              .query(multiTokenQuery)
+              .index("dataAsset")
+              .size(0)
+              .deleted(false)
+              .execute();
+      JsonNode aggBuckets =
+          OBJECT_MAPPER
+              .readTree(aggResponse)
+              .path("aggregations")
+              .path("sterms#entityType")
+              .path("buckets");
-      JsonNode aggBuckets =
-          OBJECT_MAPPER
-              .readTree(aggResponse)
-              .path("aggregations")
-              .path("sterms#entityType")
-              .path("buckets");
+      JsonNode aggregations = OBJECT_MAPPER.readTree(aggResponse).path("aggregations");
+      JsonNode entityTypeAggregation = aggregations.path("entityType");
+      if (entityTypeAggregation.isMissingNode()) {
+        entityTypeAggregation = aggregations.path("sterms#entityType");
+      }
+      JsonNode aggBuckets = entityTypeAggregation.path("buckets");
-      JsonNode aggBuckets =
-          OBJECT_MAPPER
-              .readTree(aggResponse)
-              .path("aggregations")
-              .path("sterms#entityType")
-              .path("buckets");
+      JsonNode aggregations = OBJECT_MAPPER.readTree(aggResponse).path("aggregations");
+      JsonNode entityTypeAggregation = aggregations.path("entityType");
+      if (entityTypeAggregation.isMissingNode()) {
+        entityTypeAggregation = aggregations.path("sterms#entityType");
+      }
+      JsonNode aggBuckets = entityTypeAggregation.path("buckets");
+      long aggColumnCount = 0;
+      for (JsonNode bucket : aggBuckets) {
+        if ("tableColumn".equals(bucket.path("key").asText())) {
+          aggColumnCount = bucket.path("doc_count").asLong();
+          break;
+        }
+      }
+
+      assertEquals(
+          columnTotal,
+          aggColumnCount,
+          "dataAsset aggregation tableColumn bucket ("
+              + aggColumnCount
+              + ") must match index=tableColumn total ("
+              + columnTotal
+              + ") for query \""
+              + multiTokenQuery
+              + "\"");
+    }
+
+    /**
+     * Guards against the {@code _} over-match: the {@code om_analyzer} splits {@code first_name}
+     * into {@code [first, name]}; with the old {@code Operator.Or} + {@code min_should_match=0}
+     * column builder, a column called {@code <tag>_first_id} matched a query of {@code
+     * <tag>_first_name} because the single token {@code first} was enough. The fix moves the
+     * column builder to {@code Operator.And}, so every sub-token must match.
-     * into {@code [first, name]}; with the old {@code Operator.Or} + {@code min_should_match=0}
-     * column builder, a column called {@code <tag>_first_id} matched a query of {@code
-     * <tag>_first_name} because the single token {@code first} was enough. The fix moves the
-     * column builder to {@code Operator.And}, so every sub-token must match.
+     * into {@code [first, name]}; with the old {@code Operator.Or} column builder, the string
+     * {@code "0"} was passed as the fuzziness value rather than setting {@code
+     * minimum_should_match}, so a column called {@code <tag>_first_id} matched a query of
+     * {@code <tag>_first_name} because the single token {@code first} was enough. The fix moves
+     * the column builder to {@code Operator.And}, so every sub-token must match.
-     * into {@code [first, name]}; with the old {@code Operator.Or} + {@code min_should_match=0}
-     * column builder, a column called {@code <tag>_first_id} matched a query of {@code
-     * <tag>_first_name} because the single token {@code first} was enough. The fix moves the
-     * column builder to {@code Operator.And}, so every sub-token must match.
+     * into {@code [first, name]}; with the old {@code Operator.Or} column builder, the string
+     * {@code "0"} was passed as the fuzziness value rather than setting {@code
+     * minimum_should_match}, so a column called {@code <tag>_first_id} matched a query of
+     * {@code <tag>_first_name} because the single token {@code first} was enough. The fix moves
+     * the column builder to {@code Operator.And}, so every sub-token must match.
+     */
+    @Test
+    @DisplayName("Column query for first_name must not match a column called first_id")
+    void testColumnQueryRequiresAllSubtokensToMatch(TestNamespace ns) throws Exception {
+      OpenMetadataClient client = SdkClients.adminClient();
+      String tag = ns.shortPrefix();
+      Table table = createTableWithMultiTokenColumns(ns, "subtoken_" + tag, tag);
+      assertNotNull(table);
+
+      String firstNameColumn = tag + "_first_name";
+      String firstIdColumn = tag + "_first_id";
+
+      Awaitility.await()
+          .atMost(90, TimeUnit.SECONDS)
+          .pollInterval(500, TimeUnit.MILLISECONDS)
+          .until(() -> hitsForColumnQuery(client, firstNameColumn).contains(firstNameColumn));
+
+      Set<String> hits = hitsForColumnQuery(client, firstNameColumn);
+      assertTrue(
+          hits.contains(firstNameColumn),
+          "Query " + firstNameColumn + " must match the column with the same name; got " + hits);
+      assertFalse(
+          hits.contains(firstIdColumn),
+          "Query "
+              + firstNameColumn
+              + " must not match column "
+              + firstIdColumn
+              + " (only the 'first' sub-token overlaps); got "
+              + hits);
+    }
+
+    /**
+     * Same parity guarantee that {@link #testDataAssetTableColumnAggregationMatchesTableColumnTotal}
+     * pins for {@code tableColumn}, but for the {@code table} bucket. The per-type-union path must
+     * make every entity-type bucket equal to what the dedicated index returns.
+     */
+    @Test
+    @DisplayName("Aggregation bucket for table under dataAsset must match index=table total")
+    void testDataAssetTableBucketMatchesTableIndexTotal(TestNamespace ns) throws Exception {
+      OpenMetadataClient client = SdkClients.adminClient();
+      String tag = ns.shortPrefix();
+      String tableQueryTag = "tblparity" + tag;
+      Table table = createTableWithMultiTokenColumns(ns, tableQueryTag, tag);
+      assertNotNull(table);
+
+      Awaitility.await()
+          .atMost(90, TimeUnit.SECONDS)
+          .pollInterval(500, TimeUnit.MILLISECONDS)
+          .until(() -> totalHitsForIndex(client, table.getName(), "table") >= 1);
+
+      long tableTotal = totalHitsForIndex(client, table.getName(), "table");
+      long aggTableBucket = bucketCountFromDataAsset(client, table.getName(), "table");
+
+      assertEquals(
+          tableTotal,
+          aggTableBucket,
+          "dataAsset aggregation table bucket must equal index=table total for query "
+              + table.getName());
+    }
+
+    /**
+     * Pins prefix-style matching so it stays aligned across the two paths after the per-type-union
+     * refactor. Production behavior for short queries like {@code fir} relies on {@code name.ngram}
+     * and {@code name.compound}, which are now part of {@link
+     * org.openmetadata.service.search.indexes.ColumnSearchIndex#getFields()}; the dataAsset bucket
+     * for {@code tableColumn} must produce the same total as {@code index=tableColumn} for the
+     * same prefix query, and the seeded column must be in both result sets.
+     */
+    @Test
+    @DisplayName("Prefix queries match via ngram and stay in parity across both paths")
+    void testPrefixQueryMatchesViaNgramOnBothPaths(TestNamespace ns) throws Exception {
+      OpenMetadataClient client = SdkClients.adminClient();
+      String tag = ns.shortPrefix();
+      Table table = createTableWithMultiTokenColumns(ns, "ngram_" + tag, tag);
+      assertNotNull(table);
+
+      String prefixQuery = tag.substring(0, Math.min(4, tag.length()));
+
+      Awaitility.await()
+          .atMost(90, TimeUnit.SECONDS)
+          .pollInterval(500, TimeUnit.MILLISECONDS)
+          .until(() -> totalHitsForIndex(client, prefixQuery, "tableColumn") >= 5);
-          .until(() -> totalHitsForIndex(client, prefixQuery, "tableColumn") >= 5);
+          .until(() -> totalHitsForIndex(client, prefixQuery, "tableColumn") >= 3);
-          .until(() -> totalHitsForIndex(client, prefixQuery, "tableColumn") >= 5);
+          .until(() -> totalHitsForIndex(client, prefixQuery, "tableColumn") >= 3);
+
+      long columnTotal = totalHitsForIndex(client, prefixQuery, "tableColumn");
+      long aggColumnBucket = bucketCountFromDataAsset(client, prefixQuery, Entity.TABLE_COLUMN);
+
+      assertTrue(
+          columnTotal > 0,
+          "Prefix query " + prefixQuery + " should match seeded columns via name.ngram");
+      assertEquals(
+          columnTotal,
+          aggColumnBucket,
+          "dataAsset tableColumn bucket ("
+              + aggColumnBucket
+              + ") must match index=tableColumn total ("
+              + columnTotal
+              + ") for prefix query "
+              + prefixQuery);
+    }
+
+    /**
+     * Asset types that live in the {@code dataAsset} alias but lack a {@code
+     * searchSettings.assetTypeConfigurations} entry (e.g. {@code glossary}) used to be matched via
+     * the composite-config field merge. The per-type-union path adds a fallback {@code should}
+     * clause for these; this test pins that a glossary doc still appears in {@code
+     * index=dataAsset} hits when its name matches the query.
+     */
+    @Test
+    @DisplayName("Asset types without dedicated config (glossary) still match via dataAsset alias")
+    void testUnconfiguredAssetTypeFallbackMatchesViaDataAsset(TestNamespace ns) throws Exception {
+      OpenMetadataClient client = SdkClients.adminClient();
+      String name = ns.prefix("glossary_unconfigured_fallback");
+      CreateGlossary req = new CreateGlossary().withName(name).withDescription(name);
+      Glossary glossary = client.glossaries().create(req);
+      assertNotNull(glossary);
+
+      Awaitility.await()
+          .atMost(90, TimeUnit.SECONDS)
+          .pollInterval(500, TimeUnit.MILLISECONDS)
+          .until(() -> totalHitsForIndex(client, name, "dataAsset") >= 1);
+
+      long total = totalHitsForIndex(client, name, "dataAsset");
+      assertTrue(
+          total >= 1,
+          "Glossary doc with name "
+              + name
+              + " must be reachable via index=dataAsset (fallback clause for unconfigured types)");
+    }
+
+    private Set<String> hitsForColumnQuery(OpenMetadataClient client, String query)
+        throws Exception {
+      String response =
+          client.search().query(query).index("tableColumn").size(50).deleted(false).execute();
+      JsonNode hits = OBJECT_MAPPER.readTree(response).path("hits").path("hits");
+      Set<String> names = new HashSet<>();
+      for (JsonNode hit : hits) {
+        names.add(hit.path("_source").path("name").asText());
+      }
+      return names;
+    }
+
+    private long totalHitsForIndex(OpenMetadataClient client, String query, String index)
+        throws Exception {
+      String response = client.search().query(query).index(index).size(0).deleted(false).execute();
+      return OBJECT_MAPPER.readTree(response).path("hits").path("total").path("value").asLong();
+    }
+
+    private long bucketCountFromDataAsset(
+        OpenMetadataClient client, String query, String entityType) throws Exception {
+      String response =
+          client.search().query(query).index("dataAsset").size(0).deleted(false).execute();
+      JsonNode buckets =
+          OBJECT_MAPPER
+              .readTree(response)
+              .path("aggregations")
+              .path("sterms#entityType")
+              .path("buckets");
-      JsonNode buckets =
-          OBJECT_MAPPER
-              .readTree(response)
-              .path("aggregations")
-              .path("sterms#entityType")
-              .path("buckets");
+      JsonNode root = OBJECT_MAPPER.readTree(response);
+      JsonNode aggregations = root.path("aggregations");
+      JsonNode buckets = aggregations.path("sterms#entityType").path("buckets");
+      if (buckets.isMissingNode() || !buckets.isArray()) {
+        buckets = aggregations.path("entityType").path("buckets");
+      }
-      JsonNode buckets =
-          OBJECT_MAPPER
-              .readTree(response)
-              .path("aggregations")
-              .path("sterms#entityType")
-              .path("buckets");
+      JsonNode root = OBJECT_MAPPER.readTree(response);
+      JsonNode aggregations = root.path("aggregations");
+      JsonNode buckets = aggregations.path("sterms#entityType").path("buckets");
+      if (buckets.isMissingNode() || !buckets.isArray()) {
+        buckets = aggregations.path("entityType").path("buckets");
+      }
+      for (JsonNode bucket : buckets) {
+        if (entityType.equals(bucket.path("key").asText())) {
+          return bucket.path("doc_count").asLong();
+        }
+      }
+      return 0L;
+    }
+  }
+
   @Nested
   @DisplayName("Nested Column Tests")
   @Execution(ExecutionMode.CONCURRENT)
@@ -493,6 +761,53 @@ private Table createTableWithColumns(TestNamespace ns, String baseName) {
     return SdkClients.adminClient().tables().create(tableRequest);
   }
 
+  private Table createTableWithMultiTokenColumns(TestNamespace ns, String baseName, String tag) {
+    String shortId = ns.shortPrefix();
+
+    org.openmetadata.schema.services.connections.database.PostgresConnection conn =
+        DatabaseServices.postgresConnection().hostPort("localhost:5432").username("test").build();
+
+    DatabaseService dbService =
+        DatabaseServices.builder()
+            .name("agg_svc_" + shortId + "_" + baseName)
+            .connection(conn)
+            .description("Test service for dataAsset/tableColumn aggregation parity")
+            .create();
+
+    CreateDatabase dbReq = new CreateDatabase();
+    dbReq.setName("agg_db_" + shortId + "_" + baseName);
+    dbReq.setService(dbService.getFullyQualifiedName());
+    Database database = SdkClients.adminClient().databases().create(dbReq);
+
+    CreateDatabaseSchema schemaReq = new CreateDatabaseSchema();
+    schemaReq.setName("agg_schema_" + shortId + "_" + baseName);
+    schemaReq.setDatabase(database.getFullyQualifiedName());
+    DatabaseSchema schema = SdkClients.adminClient().databaseSchemas().create(schemaReq);
+
+    CreateTable tableRequest = new CreateTable();
+    tableRequest.setName(ns.prefix(baseName));
+    tableRequest.setDatabaseSchema(schema.getFullyQualifiedName());
+    tableRequest.setColumns(
+        List.of(
+            new Column()
+                .withName(tag + "_first_name")
+                .withDataType(ColumnDataType.VARCHAR)
+                .withDataLength(255)
+                .withDescription("First name"),
+            new Column()
+                .withName(tag + "_last_name")
+                .withDataType(ColumnDataType.VARCHAR)
+                .withDataLength(255)
+                .withDescription("Last name"),
+            new Column()
+                .withName(tag + "_address")
+                .withDataType(ColumnDataType.VARCHAR)
+                .withDataLength(512)
+                .withDescription("Postal address")));
+
+    return SdkClients.adminClient().tables().create(tableRequest);
+  }
+
   private Table createTableWithNestedColumns(TestNamespace ns, String baseName) {
     String shortId = ns.shortPrefix();