diff --git a/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java b/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java index 9ca30e3febdc..943da589397b 100644 --- a/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java +++ b/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/ColumnSearchIndexIT.java @@ -20,8 +20,11 @@ import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; +import java.util.HashSet; import java.util.List; +import java.util.Set; import java.util.concurrent.TimeUnit; +import org.awaitility.Awaitility; import org.junit.jupiter.api.DisplayName; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; @@ -33,15 +36,19 @@ import org.openmetadata.it.util.TestNamespaceExtension; import org.openmetadata.schema.api.data.CreateDatabase; import org.openmetadata.schema.api.data.CreateDatabaseSchema; +import org.openmetadata.schema.api.data.CreateGlossary; import org.openmetadata.schema.api.data.CreateTable; import org.openmetadata.schema.entity.data.Database; import org.openmetadata.schema.entity.data.DatabaseSchema; +import org.openmetadata.schema.entity.data.Glossary; import org.openmetadata.schema.entity.data.Table; +import org.openmetadata.schema.entity.data.Topic; import org.openmetadata.schema.entity.services.DatabaseService; import org.openmetadata.schema.type.Column; import org.openmetadata.schema.type.ColumnDataType; import org.openmetadata.sdk.client.OpenMetadataClient; import org.openmetadata.sdk.fluent.DatabaseServices; +import org.openmetadata.service.Entity; /** * Integration tests for column search indexing during table reindexing. Verifies: @@ -353,6 +360,355 @@ void testColumnFilterByDatabase(TestNamespace ns) throws Exception { } } + @Nested + @DisplayName("DataAsset/TableColumn Aggregation Parity Tests") + @Execution(ExecutionMode.CONCURRENT) + class DataAssetColumnAggregationTests { + + /** + * Regression for github.com/open-metadata/openmetadata-collate/issues/3851. The UI fires two + * requests when the user types a multi-word query: {@code index=dataAsset&size=0} for the + * entity-type aggregation and {@code index=tableColumn} for the column hits. They route through + * different builders in {@code OpenSearchSourceBuilderFactory}: {@code tableColumn} uses the + * dedicated lenient column builder while {@code dataAsset} uses the composite asset config with + * stricter phrase/{@code 2<70%} matching. Without the fix, the {@code tableColumn} bucket in the + * {@code dataAsset} aggregation under-counts the column index hits. + */ + @Test + @DisplayName( + "Aggregation bucket for tableColumn under dataAsset must match index=tableColumn total") + void testDataAssetTableColumnAggregationMatchesTableColumnTotal(TestNamespace ns) + throws Exception { + OpenMetadataClient client = SdkClients.adminClient(); + + String tag = ns.shortPrefix(); + Table table = createTableWithMultiTokenColumns(ns, "agg_parity_" + tag, tag); + assertNotNull(table); + + // Two analyzed sub-tokens (`` and `name`); seeded columns first_name and last_name + // contain both, so the AND-based column builder produces a non-zero match count we can + // assert real parity against (not a trivial 0 == 0). + String multiTokenQuery = tag + " name"; + + Awaitility.await() + .atMost(90, TimeUnit.SECONDS) + .pollInterval(500, TimeUnit.MILLISECONDS) + .until(() -> totalHitsForIndex(client, multiTokenQuery, "tableColumn") >= 2); + + long columnTotal = totalHitsForIndex(client, multiTokenQuery, "tableColumn"); + long aggColumnCount = bucketCountFromDataAsset(client, multiTokenQuery, Entity.TABLE_COLUMN); + + assertTrue( + columnTotal >= 2, + "Seeded columns first_name and last_name should match query " + multiTokenQuery); + assertEquals( + columnTotal, + aggColumnCount, + "dataAsset aggregation tableColumn bucket (" + + aggColumnCount + + ") must match index=tableColumn total (" + + columnTotal + + ") for query \"" + + multiTokenQuery + + "\""); + } + + /** + * Guards against the {@code _} over-match: the {@code om_analyzer} splits {@code first_name} + * into {@code [first, name]}; with the old {@code Operator.Or} + {@code min_should_match=0} + * column builder, a column called {@code _first_id} matched a query of {@code + * _first_name} because the single token {@code first} was enough. The fix moves the + * column builder to {@code Operator.And}, so every sub-token must match. + */ + @Test + @DisplayName("Column query for first_name must not match a column called first_id") + void testColumnQueryRequiresAllSubtokensToMatch(TestNamespace ns) throws Exception { + OpenMetadataClient client = SdkClients.adminClient(); + String tag = ns.shortPrefix(); + Table table = createTableWithMultiTokenColumns(ns, "subtoken_" + tag, tag); + assertNotNull(table); + + String firstNameColumn = tag + "_first_name"; + String firstIdColumn = tag + "_first_id"; + + Awaitility.await() + .atMost(90, TimeUnit.SECONDS) + .pollInterval(500, TimeUnit.MILLISECONDS) + .until(() -> hitsForColumnQuery(client, firstNameColumn).contains(firstNameColumn)); + + Set hits = hitsForColumnQuery(client, firstNameColumn); + assertTrue( + hits.contains(firstNameColumn), + "Query " + firstNameColumn + " must match the column with the same name; got " + hits); + assertFalse( + hits.contains(firstIdColumn), + "Query " + + firstNameColumn + + " must not match column " + + firstIdColumn + + " (only the 'first' sub-token overlaps); got " + + hits); + } + + /** + * Same parity guarantee that {@link #testDataAssetTableColumnAggregationMatchesTableColumnTotal} + * pins for {@code tableColumn}, but for the {@code table} bucket. The per-type-union path must + * make every entity-type bucket equal to what the dedicated index returns. + */ + @Test + @DisplayName("Aggregation bucket for table under dataAsset must match index=table total") + void testDataAssetTableBucketMatchesTableIndexTotal(TestNamespace ns) throws Exception { + OpenMetadataClient client = SdkClients.adminClient(); + String tag = ns.shortPrefix(); + String tableQueryTag = "tblparity" + tag; + Table table = createTableWithMultiTokenColumns(ns, tableQueryTag, tag); + assertNotNull(table); + + Awaitility.await() + .atMost(90, TimeUnit.SECONDS) + .pollInterval(500, TimeUnit.MILLISECONDS) + .until(() -> totalHitsForIndex(client, table.getName(), "table") >= 1); + + long tableTotal = totalHitsForIndex(client, table.getName(), "table"); + long aggTableBucket = bucketCountFromDataAsset(client, table.getName(), "table"); + + assertEquals( + tableTotal, + aggTableBucket, + "dataAsset aggregation table bucket must equal index=table total for query " + + table.getName()); + } + + /** + * Pins prefix-style matching so it stays aligned across the two paths after the per-type-union + * refactor. Production behavior for short queries like {@code fir} relies on {@code name.ngram} + * and {@code name.compound}, which are now part of {@link + * org.openmetadata.service.search.indexes.ColumnSearchIndex#getFields()}; the dataAsset bucket + * for {@code tableColumn} must produce the same total as {@code index=tableColumn} for the + * same prefix query, and the seeded column must be in both result sets. + */ + @Test + @DisplayName("Prefix queries match via ngram and stay in parity across both paths") + void testPrefixQueryMatchesViaNgramOnBothPaths(TestNamespace ns) throws Exception { + OpenMetadataClient client = SdkClients.adminClient(); + String tag = ns.shortPrefix(); + Table table = createTableWithMultiTokenColumns(ns, "ngram_" + tag, tag); + assertNotNull(table); + + String prefixQuery = tag.substring(0, Math.min(4, tag.length())); + + Awaitility.await() + .atMost(90, TimeUnit.SECONDS) + .pollInterval(500, TimeUnit.MILLISECONDS) + .until(() -> totalHitsForIndex(client, prefixQuery, "tableColumn") >= 5); + + long columnTotal = totalHitsForIndex(client, prefixQuery, "tableColumn"); + long aggColumnBucket = bucketCountFromDataAsset(client, prefixQuery, Entity.TABLE_COLUMN); + + assertTrue( + columnTotal > 0, + "Prefix query " + prefixQuery + " should match seeded columns via name.ngram"); + assertEquals( + columnTotal, + aggColumnBucket, + "dataAsset tableColumn bucket (" + + aggColumnBucket + + ") must match index=tableColumn total (" + + columnTotal + + ") for prefix query " + + prefixQuery); + } + + /** + * Asset types that live in the {@code dataAsset} alias but lack a {@code + * searchSettings.assetTypeConfigurations} entry (e.g. {@code glossary}) used to be matched via + * the composite-config field merge. The per-type-union path adds a fallback {@code should} + * clause for these; this test pins that a glossary doc still appears in {@code + * index=dataAsset} hits when its name matches the query. + */ + @Test + @DisplayName("Asset types without dedicated config (glossary) still match via dataAsset alias") + void testUnconfiguredAssetTypeFallbackMatchesViaDataAsset(TestNamespace ns) throws Exception { + OpenMetadataClient client = SdkClients.adminClient(); + String name = ns.prefix("glossary_unconfigured_fallback"); + CreateGlossary req = new CreateGlossary().withName(name).withDescription(name); + Glossary glossary = client.glossaries().create(req); + assertNotNull(glossary); + + Awaitility.await() + .atMost(90, TimeUnit.SECONDS) + .pollInterval(500, TimeUnit.MILLISECONDS) + .until(() -> totalHitsForIndex(client, name, "dataAsset") >= 1); + + long total = totalHitsForIndex(client, name, "dataAsset"); + assertTrue( + total >= 1, + "Glossary doc with name " + + name + + " must be reachable via index=dataAsset (fallback clause for unconfigured types)"); + } + + /** + * Pins parity for a non-column entity type at both the count and the FQN-set level. Topic is + * chosen to exercise a different per-type clause than {@code table}, which already has its own + * test. The {@code dataAsset} alias must return the same set of topic FQNs that {@code + * index=topic} returns for the same query, with matching counts — otherwise the explore tab's + * topic count and the topic-tab results would disagree. + */ + @Test + @DisplayName("Topic bucket and topic FQN set must match index=topic for same query") + void testTopicBucketAndResultSetMatchIndexTopic(TestNamespace ns) throws Exception { + OpenMetadataClient client = SdkClients.adminClient(); + String tag = ns.shortPrefix(); + Topic topic = createUniqueTopic(ns, "topicparity_" + tag, tag); + assertNotNull(topic); + + Awaitility.await() + .atMost(90, TimeUnit.SECONDS) + .pollInterval(500, TimeUnit.MILLISECONDS) + .until(() -> totalHitsForIndex(client, tag, "topic") >= 1); + + long topicTotal = totalHitsForIndex(client, tag, "topic"); + long aggTopicBucket = bucketCountFromDataAsset(client, tag, Entity.TOPIC); + assertEquals( + topicTotal, + aggTopicBucket, + "dataAsset topic bucket must equal index=topic total for query " + tag); + + Set topicFqns = fqnsForIndex(client, tag, "topic"); + Set dataAssetTopicFqns = fqnsFromDataAssetForType(client, tag, Entity.TOPIC); + assertEquals( + topicFqns, + dataAssetTopicFqns, + "Topic FQNs returned by index=topic must equal topic-typed FQNs returned by " + + "index=dataAsset for the same query"); + } + + /** + * Complex syntax queries (quoted phrases, AND/OR operators) flow through {@code + * buildBaseQueryV2 -> buildComplexSyntaxQueryV2} for non-column types and through the column + * builder for {@code tableColumn}. Both paths run against both endpoints — the parity + * guarantee from the per-type-union refactor must hold here too. This test exercises three + * representative shapes and asserts count parity for the {@code tableColumn} bucket. + */ + @Test + @DisplayName("Complex-syntax queries keep dataAsset/tableColumn parity") + void testComplexSyntaxQueriesKeepParity(TestNamespace ns) throws Exception { + OpenMetadataClient client = SdkClients.adminClient(); + String tag = ns.shortPrefix(); + Table table = createTableWithMultiTokenColumns(ns, "complex_" + tag, tag); + assertNotNull(table); + + Awaitility.await() + .atMost(90, TimeUnit.SECONDS) + .pollInterval(500, TimeUnit.MILLISECONDS) + .until(() -> totalHitsForIndex(client, tag, "tableColumn") >= 5); + + List queries = + List.of( + "\"" + tag + " alpha\"", + tag + " AND alpha", + tag + " OR alpha", + "(" + tag + " AND alpha) OR (" + tag + " AND bravo)"); + + for (String complexQuery : queries) { + long columnTotal = totalHitsForIndex(client, complexQuery, "tableColumn"); + long aggBucket = bucketCountFromDataAsset(client, complexQuery, Entity.TABLE_COLUMN); + assertEquals( + columnTotal, + aggBucket, + "dataAsset tableColumn bucket must equal index=tableColumn total for complex " + + "query " + + complexQuery); + } + } + + private Set hitsForColumnQuery(OpenMetadataClient client, String query) + throws Exception { + String response = + client.search().query(query).index("tableColumn").size(50).deleted(false).execute(); + JsonNode hits = OBJECT_MAPPER.readTree(response).path("hits").path("hits"); + Set names = new HashSet<>(); + for (JsonNode hit : hits) { + names.add(hit.path("_source").path("name").asText()); + } + return names; + } + + private Set fqnsForIndex(OpenMetadataClient client, String query, String index) + throws Exception { + String response = + client.search().query(query).index(index).size(200).deleted(false).execute(); + JsonNode hits = OBJECT_MAPPER.readTree(response).path("hits").path("hits"); + Set fqns = new HashSet<>(); + for (JsonNode hit : hits) { + fqns.add(hit.path("_source").path("fullyQualifiedName").asText()); + } + return fqns; + } + + private Set fqnsFromDataAssetForType( + OpenMetadataClient client, String query, String entityType) throws Exception { + String response = + client.search().query(query).index("dataAsset").size(200).deleted(false).execute(); + JsonNode hits = OBJECT_MAPPER.readTree(response).path("hits").path("hits"); + Set fqns = new HashSet<>(); + for (JsonNode hit : hits) { + if (entityType.equals(hit.path("_source").path("entityType").asText())) { + fqns.add(hit.path("_source").path("fullyQualifiedName").asText()); + } + } + return fqns; + } + + private Topic createUniqueTopic(TestNamespace ns, String baseName, String tag) { + String shortId = ns.shortPrefix(); + org.openmetadata.schema.services.connections.messaging.KafkaConnection kafkaConn = + new org.openmetadata.schema.services.connections.messaging.KafkaConnection() + .withBootstrapServers("localhost:9092"); + org.openmetadata.schema.api.services.CreateMessagingService msgSvcReq = + new org.openmetadata.schema.api.services.CreateMessagingService() + .withName("topic_parity_svc_" + shortId + "_" + baseName) + .withServiceType( + org.openmetadata.schema.api.services.CreateMessagingService.MessagingServiceType + .Kafka) + .withConnection( + new org.openmetadata.schema.type.MessagingConnection().withConfig(kafkaConn)); + org.openmetadata.schema.entity.services.MessagingService msgService = + SdkClients.adminClient().messagingServices().create(msgSvcReq); + + org.openmetadata.schema.api.data.CreateTopic topicRequest = + new org.openmetadata.schema.api.data.CreateTopic() + .withName(ns.prefix(tag + "_topic_" + baseName)) + .withService(msgService.getFullyQualifiedName()) + .withPartitions(1); + return SdkClients.adminClient().topics().create(topicRequest); + } + + private long totalHitsForIndex(OpenMetadataClient client, String query, String index) + throws Exception { + String response = client.search().query(query).index(index).size(0).deleted(false).execute(); + return OBJECT_MAPPER.readTree(response).path("hits").path("total").path("value").asLong(); + } + + private long bucketCountFromDataAsset( + OpenMetadataClient client, String query, String entityType) throws Exception { + String response = + client.search().query(query).index("dataAsset").size(0).deleted(false).execute(); + JsonNode aggregations = OBJECT_MAPPER.readTree(response).path("aggregations"); + JsonNode entityTypeAgg = aggregations.path("entityType"); + if (entityTypeAgg.isMissingNode()) { + entityTypeAgg = aggregations.path("sterms#entityType"); + } + for (JsonNode bucket : entityTypeAgg.path("buckets")) { + if (entityType.equals(bucket.path("key").asText())) { + return bucket.path("doc_count").asLong(); + } + } + return 0L; + } + } + @Nested @DisplayName("Nested Column Tests") @Execution(ExecutionMode.CONCURRENT) @@ -493,6 +849,72 @@ private Table createTableWithColumns(TestNamespace ns, String baseName) { return SdkClients.adminClient().tables().create(tableRequest); } + private Table createTableWithMultiTokenColumns(TestNamespace ns, String baseName, String tag) { + String shortId = ns.shortPrefix(); + + org.openmetadata.schema.services.connections.database.PostgresConnection conn = + DatabaseServices.postgresConnection().hostPort("localhost:5432").username("test").build(); + + DatabaseService dbService = + DatabaseServices.builder() + .name("agg_svc_" + shortId + "_" + baseName) + .connection(conn) + .description("Test service for dataAsset/tableColumn aggregation parity") + .create(); + + CreateDatabase dbReq = new CreateDatabase(); + dbReq.setName("agg_db_" + shortId + "_" + baseName); + dbReq.setService(dbService.getFullyQualifiedName()); + Database database = SdkClients.adminClient().databases().create(dbReq); + + CreateDatabaseSchema schemaReq = new CreateDatabaseSchema(); + schemaReq.setName("agg_schema_" + shortId + "_" + baseName); + schemaReq.setDatabase(database.getFullyQualifiedName()); + DatabaseSchema schema = SdkClients.adminClient().databaseSchemas().create(schemaReq); + + CreateTable tableRequest = new CreateTable(); + tableRequest.setName(ns.prefix(baseName)); + tableRequest.setDatabaseSchema(schema.getFullyQualifiedName()); + tableRequest.setColumns( + List.of( + new Column() + .withName(tag + "_first_name") + .withDataType(ColumnDataType.VARCHAR) + .withDataLength(255) + .withDescription("First name"), + new Column() + .withName(tag + "_first_id") + .withDataType(ColumnDataType.INT) + .withDescription("Decoy: shares only the 'first' sub-token with first_name"), + new Column() + .withName(tag + "_last_name") + .withDataType(ColumnDataType.VARCHAR) + .withDataLength(255) + .withDescription("Last name"), + new Column() + .withName(tag + "_address") + .withDataType(ColumnDataType.VARCHAR) + .withDataLength(512) + .withDescription("Postal address"), + new Column() + .withName(tag + "_alpha_amount") + .withDataType(ColumnDataType.DECIMAL) + .withDescription("Alpha amount column for complex-syntax tests"), + new Column() + .withName(tag + "_alpha_address") + .withDataType(ColumnDataType.VARCHAR) + .withDataLength(512) + .withDescription("Alpha address column"), + new Column() + .withName(tag + "_bravo_total") + .withDataType(ColumnDataType.DECIMAL) + .withDescription("Bravo total column"))); + + return SdkClients.adminClient().tables().create(tableRequest); + } + + private static final int MULTI_TOKEN_SEED_COUNT = 7; + private Table createTableWithNestedColumns(TestNamespace ns, String baseName) { String shortId = ns.shortPrefix(); diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticQueryBuilder.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticQueryBuilder.java index 543e75d6363a..48d43a26dd29 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticQueryBuilder.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticQueryBuilder.java @@ -1,10 +1,12 @@ package org.openmetadata.service.search.elasticsearch; +import es.co.elastic.clients.elasticsearch._types.FieldValue; import es.co.elastic.clients.elasticsearch._types.query_dsl.Operator; import es.co.elastic.clients.elasticsearch._types.query_dsl.Query; import es.co.elastic.clients.elasticsearch._types.query_dsl.QueryStringQuery; import es.co.elastic.clients.elasticsearch._types.query_dsl.TextQueryType; import java.util.ArrayList; +import java.util.Collection; import java.util.List; import java.util.Map; @@ -28,6 +30,11 @@ public static Query termQuery(String field, int value) { return Query.of(q -> q.term(t -> t.field(field).value(value))); } + public static Query termsQuery(String field, Collection values) { + List fieldValues = values.stream().map(FieldValue::of).toList(); + return Query.of(q -> q.terms(t -> t.field(field).terms(tf -> tf.value(fieldValues)))); + } + public static Query matchQuery(String field, String value) { return Query.of(q -> q.match(m -> m.field(field).query(value))); } diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java index ee62799dd9f2..a2c2898f11e0 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchSourceBuilderFactory.java @@ -18,9 +18,11 @@ import es.co.elastic.clients.elasticsearch.core.search.Highlight; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.stream.Collectors; import org.openmetadata.schema.api.search.Aggregation; import org.openmetadata.schema.api.search.AssetTypeConfiguration; @@ -53,6 +55,7 @@ public class ElasticSearchSourceBuilderFactory private static final String MATCH_TYPE_STANDARD = "standard"; private static final String INDEX_ALL = "all"; private static final String INDEX_DATA_ASSET = "dataAsset"; + private static final String ENTITY_TYPE_FIELD = "entityType"; private static final String MINIMUM_SHOULD_MATCH = "2<70%"; private static final float DEFAULT_TIE_BREAKER = 0.3f; private static final float DEFAULT_BOOST = 1.0f; @@ -341,9 +344,9 @@ public ElasticSearchRequestBuilder getSearchSourceBuilderV2( indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations); } - if (indexName.equals("all") || indexName.equals("dataAsset")) { - return buildDataAssetSearchBuilderV2( - indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations); + if (indexName.equals(INDEX_ALL) || indexName.equals(INDEX_DATA_ASSET)) { + return buildAllAssetsSearchBuilderV2( + searchQuery, fromOffset, size, includeExplain, includeAggregations); } return switch (indexName) { @@ -374,21 +377,35 @@ public ElasticSearchRequestBuilder buildColumnSearchBuilderV2(String query, int queryBuilder = es.co.elastic.clients.elasticsearch._types.query_dsl.Query.of(q -> q.matchAll(m -> m)); } else { - Map fields = ColumnSearchIndex.getFields(); - queryBuilder = - ElasticQueryBuilder.multiMatchQuery( - query, - fields, - es.co.elastic.clients.elasticsearch._types.query_dsl.TextQueryType.BestFields, - es.co.elastic.clients.elasticsearch._types.query_dsl.Operator.Or, - String.valueOf(DEFAULT_TIE_BREAKER), - "0"); + queryBuilder = buildColumnMultiMatchV2(query); } es.co.elastic.clients.elasticsearch.core.search.Highlight hb = buildHighlightsV2(List.of("name", "displayName", "description")); return searchBuilderV2(queryBuilder, hb, from, size); } + /** + * Multi-match used both by {@code index=tableColumn} and the column branch of {@link + * #buildPerTypeUnionQueryV2(String)}. Uses {@link + * es.co.elastic.clients.elasticsearch._types.query_dsl.Operator#And} so every sub-token produced + * by {@code om_analyzer} must hit some field. The previous shape used {@code Operator.Or} with + * this helper's {@code fuzziness="0"} — that combination did not set {@code + * minimum_should_match}, so any single sub-token in any field was enough to match. As a result + * a search like {@code first_name} matched columns whose name contained just {@code first} or + * just {@code name}, which both inflated the column index hits and caused the + * dataAsset/tableColumn count mismatch tracked in github issue #3851. + */ + private es.co.elastic.clients.elasticsearch._types.query_dsl.Query buildColumnMultiMatchV2( + String query) { + return ElasticQueryBuilder.multiMatchQuery( + query, + ColumnSearchIndex.getFields(), + es.co.elastic.clients.elasticsearch._types.query_dsl.TextQueryType.BestFields, + es.co.elastic.clients.elasticsearch._types.query_dsl.Operator.And, + String.valueOf(DEFAULT_TIE_BREAKER), + "0"); + } + public ElasticSearchRequestBuilder buildServiceSearchBuilderV2(String query, int from, int size) { es.co.elastic.clients.elasticsearch._types.query_dsl.Query queryBuilder = buildSearchQueryBuilderV2(query, SearchIndex.getDefaultFields()); @@ -437,6 +454,93 @@ public ElasticSearchRequestBuilder buildDataAssetSearchBuilderV2( return searchRequestBuilder; } + /** + * Build a search source for the {@code all} / {@code dataAsset} alias as a per-entity-type + * union: each asset type contributes a clause built with its own configuration (column docs go + * through {@link #buildColumnMultiMatchV2(String)}, every other type through {@link + * #buildBaseQueryV2(String, AssetTypeConfiguration)}), filtered by {@code entityType=}. + * Each entity-type bucket in the aggregation therefore equals what the dedicated index returns + * for the same query, by construction. Avoids the composite-config divergence behind + * github.com/open-metadata/openmetadata-collate#3851. + */ + public ElasticSearchRequestBuilder buildAllAssetsSearchBuilderV2( + String query, int from, int size, boolean explain, boolean includeAggregations) { + AssetTypeConfiguration compositeConfig = buildCompositeAssetConfig(searchSettings); + es.co.elastic.clients.elasticsearch._types.query_dsl.Query baseQuery = + buildPerTypeUnionQueryV2(query); + es.co.elastic.clients.elasticsearch._types.query_dsl.Query finalQuery = + applyFunctionScoringV2(baseQuery, compositeConfig); + es.co.elastic.clients.elasticsearch.core.search.Highlight highlightBuilder = + buildHighlightingIfNeededV2(query, compositeConfig); + + ElasticSearchRequestBuilder searchRequestBuilder = + createSearchSourceBuilderV2(finalQuery, from, size); + if (highlightBuilder != null) { + searchRequestBuilder.highlighter(highlightBuilder); + } + if (includeAggregations) { + addConfiguredAggregationsV2(searchRequestBuilder, compositeConfig); + } + searchRequestBuilder.explain(explain); + return searchRequestBuilder; + } + + private es.co.elastic.clients.elasticsearch._types.query_dsl.Query buildPerTypeUnionQueryV2( + String query) { + if (isMatchAllQuery(query)) { + return ElasticQueryBuilder.boolQuery().must(ElasticQueryBuilder.matchAllQuery()).build(); + } + List configs = searchSettings.getAssetTypeConfigurations(); + if (configs == null || configs.isEmpty()) { + return ElasticQueryBuilder.boolQuery() + .must(buildBaseQueryV2(query, getOrCreateDefaultConfig())) + .build(); + } + ElasticQueryBuilder.BoolQueryBuilder union = ElasticQueryBuilder.boolQuery(); + Set configuredTypes = new HashSet<>(); + for (AssetTypeConfiguration typeConfig : configs) { + String assetType = typeConfig.getAssetType(); + if (assetType == null || assetType.equals(INDEX_ALL)) { + continue; + } + configuredTypes.add(assetType); + union.should(buildAssetTypeClauseV2(query, assetType, typeConfig)); + } + union.should(buildUnconfiguredAssetFallbackV2(query, configuredTypes)); + union.minimumShouldMatch(1); + return union.build(); + } + + private static boolean isMatchAllQuery(String query) { + return query == null || query.trim().isEmpty() || query.trim().equals("*"); + } + + private es.co.elastic.clients.elasticsearch._types.query_dsl.Query buildAssetTypeClauseV2( + String query, String assetType, AssetTypeConfiguration typeConfig) { + es.co.elastic.clients.elasticsearch._types.query_dsl.Query inner = + Entity.TABLE_COLUMN.equals(assetType) + ? buildColumnMultiMatchV2(query) + : buildBaseQueryV2(query, typeConfig); + return ElasticQueryBuilder.boolQuery() + .filter(ElasticQueryBuilder.termQuery(ENTITY_TYPE_FIELD, assetType)) + .must(inner) + .build(); + } + + /** + * Catches asset types that are part of the {@code dataAsset} alias but lack a dedicated entry in + * {@code searchSettings.assetTypeConfigurations} (e.g. {@code glossary}, {@code apiCollection}). + */ + private es.co.elastic.clients.elasticsearch._types.query_dsl.Query + buildUnconfiguredAssetFallbackV2(String query, Set configuredTypes) { + ElasticQueryBuilder.BoolQueryBuilder fallback = + ElasticQueryBuilder.boolQuery().must(buildBaseQueryV2(query, getOrCreateDefaultConfig())); + if (!configuredTypes.isEmpty()) { + fallback.mustNot(ElasticQueryBuilder.termsQuery(ENTITY_TYPE_FIELD, configuredTypes)); + } + return fallback.build(); + } + public ElasticSearchRequestBuilder buildAggregateSearchBuilderV2( String query, int from, int size) { return buildAggregateSearchBuilderV2(query, from, size, true); diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/indexes/ColumnSearchIndex.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/indexes/ColumnSearchIndex.java index 1b9a76525611..9e4ea9659488 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/indexes/ColumnSearchIndex.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/indexes/ColumnSearchIndex.java @@ -153,8 +153,12 @@ public static Map getFields() { Map fields = new HashMap<>(); fields.put("name", 10.0f); fields.put("name.keyword", 20.0f); + fields.put("name.ngram", 1.0f); + fields.put("name.compound", 5.0f); fields.put("displayName", 7.0f); fields.put("displayName.keyword", 20.0f); + fields.put("displayName.ngram", 1.0f); + fields.put("displayName.compound", 4.0f); fields.put("fullyQualifiedName", 5.0f); fields.put("description", 2.0f); fields.put("dataType", 3.0f); diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchQueryBuilder.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchQueryBuilder.java index bbe429084cd8..a01f2e07fca8 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchQueryBuilder.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchQueryBuilder.java @@ -1,6 +1,7 @@ package org.openmetadata.service.search.opensearch; import java.util.ArrayList; +import java.util.Collection; import java.util.List; import java.util.Map; import os.org.opensearch.client.opensearch._types.FieldValue; @@ -29,6 +30,11 @@ public static Query termQuery(String field, int value) { return Query.of(q -> q.term(t -> t.field(field).value(FieldValue.of(value)))); } + public static Query termsQuery(String field, Collection values) { + List fieldValues = values.stream().map(FieldValue::of).toList(); + return Query.of(q -> q.terms(t -> t.field(field).terms(tf -> tf.value(fieldValues)))); + } + public static Query matchQuery(String field, String value) { return Query.of(q -> q.match(m -> m.field(field).query(FieldValue.of(value)))); } diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java index 7deb93c7d7da..9273c46f4bd2 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/opensearch/OpenSearchSourceBuilderFactory.java @@ -14,9 +14,11 @@ import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.stream.Collectors; import lombok.extern.slf4j.Slf4j; import org.openmetadata.schema.api.search.Aggregation; @@ -50,6 +52,7 @@ public class OpenSearchSourceBuilderFactory private static final String MATCH_TYPE_STANDARD = "standard"; private static final String INDEX_ALL = "all"; private static final String INDEX_DATA_ASSET = "dataAsset"; + private static final String ENTITY_TYPE_FIELD = "entityType"; private static final String MINIMUM_SHOULD_MATCH = "2<70%"; private static final float DEFAULT_TIE_BREAKER = 0.3f; private static final float DEFAULT_BOOST = 1.0f; @@ -322,9 +325,9 @@ public OpenSearchRequestBuilder getSearchSourceBuilderV2( indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations); } - if (indexName.equals("all") || indexName.equals("dataAsset")) { - return buildDataAssetSearchBuilderV2( - indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations); + if (indexName.equals(INDEX_ALL) || indexName.equals(INDEX_DATA_ASSET)) { + return buildAllAssetsSearchBuilderV2( + searchQuery, fromOffset, size, includeExplain, includeAggregations); } return switch (indexName) { @@ -374,21 +377,35 @@ public OpenSearchRequestBuilder buildColumnSearchBuilderV2(String query, int fro queryBuilder = os.org.opensearch.client.opensearch._types.query_dsl.Query.of(q -> q.matchAll(m -> m)); } else { - Map fields = ColumnSearchIndex.getFields(); - queryBuilder = - OpenSearchQueryBuilder.multiMatchQuery( - query, - fields, - os.org.opensearch.client.opensearch._types.query_dsl.TextQueryType.BestFields, - os.org.opensearch.client.opensearch._types.query_dsl.Operator.Or, - String.valueOf(DEFAULT_TIE_BREAKER), - "0"); + queryBuilder = buildColumnMultiMatchV2(query); } os.org.opensearch.client.opensearch.core.search.Highlight highlighter = buildHighlightsV2(List.of("name", "displayName", "description")); return searchBuilderV2(queryBuilder, highlighter, from, size); } + /** + * Multi-match used both by {@code index=tableColumn} and the column branch of {@link + * #buildPerTypeUnionQueryV2(String)}. Uses {@link + * os.org.opensearch.client.opensearch._types.query_dsl.Operator#And} so every sub-token produced + * by {@code om_analyzer} (which splits on letter/digit/underscore boundaries) must hit some + * field. The previous shape used {@code Operator.Or} with this helper's {@code fuzziness="0"} + * — that combination did not set {@code minimum_should_match}, so any single sub-token in any + * field was enough to match. As a result a search like {@code first_name} matched columns whose + * name contained just {@code first} or just {@code name}, which both inflated the column index + * hits and caused the dataAsset/tableColumn count mismatch tracked in github issue #3851. + */ + private os.org.opensearch.client.opensearch._types.query_dsl.Query buildColumnMultiMatchV2( + String query) { + return OpenSearchQueryBuilder.multiMatchQuery( + query, + ColumnSearchIndex.getFields(), + os.org.opensearch.client.opensearch._types.query_dsl.TextQueryType.BestFields, + os.org.opensearch.client.opensearch._types.query_dsl.Operator.And, + String.valueOf(DEFAULT_TIE_BREAKER), + "0"); + } + public OpenSearchRequestBuilder buildServiceSearchBuilderV2(String query, int from, int size) { os.org.opensearch.client.opensearch._types.query_dsl.Query queryBuilder = buildSearchQueryBuilderV2(query, SearchIndex.getDefaultFields()); @@ -452,6 +469,98 @@ public OpenSearchRequestBuilder buildDataAssetSearchBuilderV2( return searchRequestBuilder; } + /** + * Build a search source for the {@code all} / {@code dataAsset} alias as a per-entity-type + * union: each asset type contributes a clause built with its own configuration (column docs go + * through {@link #buildColumnMultiMatchV2(String)}, every other type through {@link + * #buildBaseQueryV2(String, AssetTypeConfiguration)}), filtered by {@code entityType=}. + * Each entity-type bucket in the aggregation therefore equals what the dedicated index returns + * for the same query, by construction. Avoids the composite-config divergence behind + * github.com/open-metadata/openmetadata-collate#3851. + */ + public OpenSearchRequestBuilder buildAllAssetsSearchBuilderV2( + String query, int from, int size, boolean explain, boolean includeAggregations) { + AssetTypeConfiguration compositeConfig = getOrBuildCompositeConfig(); + os.org.opensearch.client.opensearch._types.query_dsl.Query baseQuery = + buildPerTypeUnionQueryV2(query); + os.org.opensearch.client.opensearch._types.query_dsl.Query finalQuery = + applyFunctionScoringV2(baseQuery, compositeConfig); + os.org.opensearch.client.opensearch.core.search.Highlight highlightBuilder = + buildHighlightingIfNeededV2(query, compositeConfig); + + OpenSearchRequestBuilder searchRequestBuilder = + createSearchSourceBuilderV2(finalQuery, from, size); + if (highlightBuilder != null) { + searchRequestBuilder.highlighter(highlightBuilder); + } + if (includeAggregations) { + addConfiguredAggregationsV2(searchRequestBuilder, compositeConfig); + } + searchRequestBuilder.explain(explain); + return searchRequestBuilder; + } + + private os.org.opensearch.client.opensearch._types.query_dsl.Query buildPerTypeUnionQueryV2( + String query) { + if (isMatchAllQuery(query)) { + return OpenSearchQueryBuilder.boolQuery() + .must(OpenSearchQueryBuilder.matchAllQuery()) + .build(); + } + List configs = searchSettings.getAssetTypeConfigurations(); + if (configs == null || configs.isEmpty()) { + return OpenSearchQueryBuilder.boolQuery() + .must(buildBaseQueryV2(query, getOrCreateDefaultConfig())) + .build(); + } + OpenSearchQueryBuilder.BoolQueryBuilder union = OpenSearchQueryBuilder.boolQuery(); + Set configuredTypes = new HashSet<>(); + for (AssetTypeConfiguration typeConfig : configs) { + String assetType = typeConfig.getAssetType(); + if (assetType == null || assetType.equals(INDEX_ALL)) { + continue; + } + configuredTypes.add(assetType); + union.should(buildAssetTypeClauseV2(query, assetType, typeConfig)); + } + union.should(buildUnconfiguredAssetFallbackV2(query, configuredTypes)); + union.minimumShouldMatch(1); + return union.build(); + } + + private static boolean isMatchAllQuery(String query) { + return query == null || query.trim().isEmpty() || query.trim().equals("*"); + } + + private os.org.opensearch.client.opensearch._types.query_dsl.Query buildAssetTypeClauseV2( + String query, String assetType, AssetTypeConfiguration typeConfig) { + os.org.opensearch.client.opensearch._types.query_dsl.Query inner = + Entity.TABLE_COLUMN.equals(assetType) + ? buildColumnMultiMatchV2(query) + : buildBaseQueryV2(query, typeConfig); + return OpenSearchQueryBuilder.boolQuery() + .filter(OpenSearchQueryBuilder.termQuery(ENTITY_TYPE_FIELD, assetType)) + .must(inner) + .build(); + } + + /** + * Catches asset types that are part of the {@code dataAsset} alias but lack a dedicated entry in + * {@code searchSettings.assetTypeConfigurations} (e.g. {@code glossary}, {@code apiCollection}). + * Without this, docs of those types would silently disappear from the dataAsset alias after the + * per-type-union refactor. + */ + private os.org.opensearch.client.opensearch._types.query_dsl.Query + buildUnconfiguredAssetFallbackV2(String query, Set configuredTypes) { + OpenSearchQueryBuilder.BoolQueryBuilder fallback = + OpenSearchQueryBuilder.boolQuery() + .must(buildBaseQueryV2(query, getOrCreateDefaultConfig())); + if (!configuredTypes.isEmpty()) { + fallback.mustNot(OpenSearchQueryBuilder.termsQuery(ENTITY_TYPE_FIELD, configuredTypes)); + } + return fallback.build(); + } + private os.org.opensearch.client.opensearch._types.query_dsl.Query buildBaseQueryV2( String query, AssetTypeConfiguration assetConfig) { if (query == null || query.trim().isEmpty() || query.trim().equals("*")) {