Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,11 @@
import es.co.elastic.clients.elasticsearch.core.search.Highlight;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.openmetadata.schema.api.search.Aggregation;
import org.openmetadata.schema.api.search.AssetTypeConfiguration;
Expand Down Expand Up @@ -53,6 +55,7 @@ public class ElasticSearchSourceBuilderFactory
private static final String MATCH_TYPE_STANDARD = "standard";
private static final String INDEX_ALL = "all";
private static final String INDEX_DATA_ASSET = "dataAsset";
private static final String ENTITY_TYPE_FIELD = "entityType";
private static final String MINIMUM_SHOULD_MATCH = "2<70%";
private static final float DEFAULT_TIE_BREAKER = 0.3f;
private static final float DEFAULT_BOOST = 1.0f;
Expand Down Expand Up @@ -341,9 +344,9 @@ public ElasticSearchRequestBuilder getSearchSourceBuilderV2(
indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations);
}

if (indexName.equals("all") || indexName.equals("dataAsset")) {
return buildDataAssetSearchBuilderV2(
indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations);
if (indexName.equals(INDEX_ALL) || indexName.equals(INDEX_DATA_ASSET)) {
return buildAllAssetsSearchBuilderV2(
searchQuery, fromOffset, size, includeExplain, includeAggregations);
}

return switch (indexName) {
Expand Down Expand Up @@ -374,21 +377,33 @@ public ElasticSearchRequestBuilder buildColumnSearchBuilderV2(String query, int
queryBuilder =
es.co.elastic.clients.elasticsearch._types.query_dsl.Query.of(q -> q.matchAll(m -> m));
} else {
Map<String, Float> fields = ColumnSearchIndex.getFields();
queryBuilder =
ElasticQueryBuilder.multiMatchQuery(
query,
fields,
es.co.elastic.clients.elasticsearch._types.query_dsl.TextQueryType.BestFields,
es.co.elastic.clients.elasticsearch._types.query_dsl.Operator.Or,
String.valueOf(DEFAULT_TIE_BREAKER),
"0");
queryBuilder = buildColumnMultiMatchV2(query);
}
es.co.elastic.clients.elasticsearch.core.search.Highlight hb =
buildHighlightsV2(List.of("name", "displayName", "description"));
return searchBuilderV2(queryBuilder, hb, from, size);
}

/**
* Multi-match used both by {@code index=tableColumn} and the column-scoped should clause in the
* {@code dataAsset} composite query. Uses {@link
* es.co.elastic.clients.elasticsearch._types.query_dsl.Operator#And} so every sub-token produced
* by {@code om_analyzer} must hit some field. Without {@code And}, a query like {@code
* first_name} matches any column whose name contains just {@code first} or just {@code name},
* which both inflates the column index hits and creates the dataAsset/tableColumn count
* mismatch tracked in github issue #3851.
Copy link

Copilot AI Apr 30, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Javadoc mentions the previous column builder used min_should_match=0, but the underlying multiMatchQuery(..., tieBreaker, fuzziness) helper’s last argument is fuzziness (and no minimum_should_match is set when fuzziness is "0"). Please update the comment to reflect the actual previous behavior so it doesn’t mislead future debugging/tuning.

Suggested change
* mismatch tracked in github issue #3851.
* mismatch tracked in github issue #3851. The previous builder behavior here was equivalent to
* passing {@code fuzziness="0"} to {@code multiMatchQuery(..., tieBreaker, fuzziness)}, which
* disables fuzziness; this helper invocation does not set {@code minimum_should_match}.

Copilot uses AI. Check for mistakes.
*/
private es.co.elastic.clients.elasticsearch._types.query_dsl.Query buildColumnMultiMatchV2(
    String query) {
  // Same boosted column field set that ColumnSearchIndex exposes.
  Map<String, Float> columnFields = ColumnSearchIndex.getFields();
  String tieBreaker = String.valueOf(DEFAULT_TIE_BREAKER);
  // NOTE(review): per the review thread on this change, the trailing "0" argument of
  // multiMatchQuery is the fuzziness setting (not minimum_should_match) - confirm against
  // ElasticQueryBuilder.multiMatchQuery.
  String lastArgument = "0";

  // Operator.And (rather than Or) is the deliberate choice here: see the Javadoc on this method.
  return ElasticQueryBuilder.multiMatchQuery(
      query,
      columnFields,
      es.co.elastic.clients.elasticsearch._types.query_dsl.TextQueryType.BestFields,
      es.co.elastic.clients.elasticsearch._types.query_dsl.Operator.And,
      tieBreaker,
      lastArgument);
}

public ElasticSearchRequestBuilder buildServiceSearchBuilderV2(String query, int from, int size) {
es.co.elastic.clients.elasticsearch._types.query_dsl.Query queryBuilder =
buildSearchQueryBuilderV2(query, SearchIndex.getDefaultFields());
Expand Down Expand Up @@ -437,6 +452,87 @@ public ElasticSearchRequestBuilder buildDataAssetSearchBuilderV2(
return searchRequestBuilder;
}

/**
 * Builds the search request for the {@code all} / {@code dataAsset} alias from a per-entity-type
 * union query.
 *
 * <p>Each asset type contributes its own clause, built from that type's configuration and
 * filtered by {@code entityType=<type>}: column docs go through {@link
 * #buildColumnMultiMatchV2(String)}, every other type through {@link #buildBaseQueryV2(String,
 * AssetTypeConfiguration)}. The intent is that each entity-type bucket in the aggregation equals
 * what the dedicated index returns for the same query, avoiding the composite-config divergence
 * behind github.com/open-metadata/openmetadata-collate#3851.
 *
 * <p>The composite asset configuration is still what drives function scoring, highlighting and
 * the configured aggregations.
 *
 * @param query the user search text
 * @param from offset handed to {@code createSearchSourceBuilderV2}
 * @param size size handed to {@code createSearchSourceBuilderV2}
 * @param explain value forwarded to the request builder's {@code explain} setting
 * @param includeAggregations whether the configured aggregations are added to the request
 * @return the assembled request builder
 */
public ElasticSearchRequestBuilder buildAllAssetsSearchBuilderV2(
    String query, int from, int size, boolean explain, boolean includeAggregations) {
  AssetTypeConfiguration composite = buildCompositeAssetConfig(searchSettings);

  // The union query supplies the matching; the composite config supplies the scoring functions.
  es.co.elastic.clients.elasticsearch._types.query_dsl.Query scoredQuery =
      applyFunctionScoringV2(buildPerTypeUnionQueryV2(query), composite);
  es.co.elastic.clients.elasticsearch.core.search.Highlight highlight =
      buildHighlightingIfNeededV2(query, composite);

  ElasticSearchRequestBuilder request = createSearchSourceBuilderV2(scoredQuery, from, size);
  // A null highlight means no highlighter is attached to the request.
  if (highlight != null) {
    request.highlighter(highlight);
  }
  if (includeAggregations) {
    addConfiguredAggregationsV2(request, composite);
  }
  request.explain(explain);
  return request;
}

private es.co.elastic.clients.elasticsearch._types.query_dsl.Query buildPerTypeUnionQueryV2(
String query) {
if (isMatchAllQuery(query)) {
return ElasticQueryBuilder.boolQuery().must(ElasticQueryBuilder.matchAllQuery()).build();
}
ElasticQueryBuilder.BoolQueryBuilder union = ElasticQueryBuilder.boolQuery();
Set<String> configuredTypes = new HashSet<>();
for (AssetTypeConfiguration typeConfig : searchSettings.getAssetTypeConfigurations()) {
Comment thread
gitar-bot[bot] marked this conversation as resolved.
Outdated
String assetType = typeConfig.getAssetType();
if (assetType == null || assetType.equals(INDEX_ALL)) {
continue;
}
configuredTypes.add(assetType);
union.should(buildAssetTypeClauseV2(query, assetType, typeConfig));
}
union.should(buildUnconfiguredAssetFallbackV2(query, configuredTypes));
union.minimumShouldMatch(1);
return union.build();
Comment on lines +457 to +511
Copy link

Copilot AI Apr 30, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This refactor changes the query semantics for the high-traffic all/dataAsset alias (per-entity-type bool union) and tightens column matching, but there are no automated tests added here to lock in the new bucket-parity and underscore sub-token behavior. Please add/extend tests (unit and/or integration) that assert: (1) index=dataAsset entityType bucket counts match the totals from the corresponding dedicated index for at least table and tableColumn, and (2) a query like first_name does not match columns that only contain first or only name.

Copilot uses AI. Check for mistakes.
}

/**
 * Tells whether the given search text should be treated as "match everything".
 *
 * @param query raw search text, may be {@code null}
 * @return {@code true} when {@code query} is {@code null}, empty after trimming, or exactly
 *     {@code *} after trimming
 */
private static boolean isMatchAllQuery(String query) {
  if (query == null) {
    return true;
  }
  String trimmed = query.trim();
  return trimmed.isEmpty() || "*".equals(trimmed);
}

/**
 * Builds the clause contributed by a single asset type: the type-specific text query, restricted
 * to documents whose {@code entityType} equals {@code assetType}.
 *
 * @param query the user search text
 * @param assetType the entity type the clause is filtered to
 * @param typeConfig the configuration used to build the query for non-column types
 * @return a bool query with an {@code entityType} term filter and the text query as {@code must}
 */
private es.co.elastic.clients.elasticsearch._types.query_dsl.Query buildAssetTypeClauseV2(
    String query, String assetType, AssetTypeConfiguration typeConfig) {
  final es.co.elastic.clients.elasticsearch._types.query_dsl.Query textQuery;
  if (Entity.TABLE_COLUMN.equals(assetType)) {
    // Column docs use the dedicated column multi-match instead of the per-type configuration.
    textQuery = buildColumnMultiMatchV2(query);
  } else {
    textQuery = buildBaseQueryV2(query, typeConfig);
  }

  return ElasticQueryBuilder.boolQuery()
      .filter(ElasticQueryBuilder.termQuery(ENTITY_TYPE_FIELD, assetType))
      .must(textQuery)
      .build();
}

/**
* Catches asset types that are part of the {@code dataAsset} alias but lack a dedicated entry in
* {@code searchSettings.assetTypeConfigurations} (e.g. {@code glossary}, {@code apiCollection}).
*/
private es.co.elastic.clients.elasticsearch._types.query_dsl.Query
buildUnconfiguredAssetFallbackV2(String query, Set<String> configuredTypes) {
ElasticQueryBuilder.BoolQueryBuilder fallback =
ElasticQueryBuilder.boolQuery().must(buildBaseQueryV2(query, getOrCreateDefaultConfig()));
for (String configured : configuredTypes) {
fallback.mustNot(ElasticQueryBuilder.termQuery(ENTITY_TYPE_FIELD, configured));
Copy link

Copilot AI Apr 30, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

buildUnconfiguredAssetFallbackV2 currently emits one mustNot term(entityType=...) per configured type. With many asset types this can create a large bool query and add overhead. Use a single mustNot with a terms query over configuredTypes instead (ElasticQueryBuilder supports termsQuery).

Suggested change
for (String configured : configuredTypes) {
fallback.mustNot(ElasticQueryBuilder.termQuery(ENTITY_TYPE_FIELD, configured));
if (!configuredTypes.isEmpty()) {
fallback.mustNot(ElasticQueryBuilder.termsQuery(ENTITY_TYPE_FIELD, configuredTypes));

Copilot uses AI. Check for mistakes.
}
return fallback.build();
}

public ElasticSearchRequestBuilder buildAggregateSearchBuilderV2(
String query, int from, int size) {
return buildAggregateSearchBuilderV2(query, from, size, true);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -153,8 +153,12 @@ public static Map<String, Float> getFields() {
Map<String, Float> fields = new HashMap<>();
fields.put("name", 10.0f);
fields.put("name.keyword", 20.0f);
fields.put("name.ngram", 1.0f);
fields.put("name.compound", 5.0f);
fields.put("displayName", 7.0f);
fields.put("displayName.keyword", 20.0f);
fields.put("displayName.ngram", 1.0f);
fields.put("displayName.compound", 4.0f);
fields.put("fullyQualifiedName", 5.0f);
fields.put("description", 2.0f);
fields.put("dataType", 3.0f);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,11 @@

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import lombok.extern.slf4j.Slf4j;
import org.openmetadata.schema.api.search.Aggregation;
Expand Down Expand Up @@ -50,6 +52,7 @@ public class OpenSearchSourceBuilderFactory
private static final String MATCH_TYPE_STANDARD = "standard";
private static final String INDEX_ALL = "all";
private static final String INDEX_DATA_ASSET = "dataAsset";
private static final String ENTITY_TYPE_FIELD = "entityType";
private static final String MINIMUM_SHOULD_MATCH = "2<70%";
private static final float DEFAULT_TIE_BREAKER = 0.3f;
private static final float DEFAULT_BOOST = 1.0f;
Expand Down Expand Up @@ -322,9 +325,9 @@ public OpenSearchRequestBuilder getSearchSourceBuilderV2(
indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations);
}

if (indexName.equals("all") || indexName.equals("dataAsset")) {
return buildDataAssetSearchBuilderV2(
indexName, searchQuery, fromOffset, size, includeExplain, includeAggregations);
if (indexName.equals(INDEX_ALL) || indexName.equals(INDEX_DATA_ASSET)) {
return buildAllAssetsSearchBuilderV2(
searchQuery, fromOffset, size, includeExplain, includeAggregations);
}

return switch (indexName) {
Expand Down Expand Up @@ -374,21 +377,33 @@ public OpenSearchRequestBuilder buildColumnSearchBuilderV2(String query, int fro
queryBuilder =
os.org.opensearch.client.opensearch._types.query_dsl.Query.of(q -> q.matchAll(m -> m));
} else {
Map<String, Float> fields = ColumnSearchIndex.getFields();
queryBuilder =
OpenSearchQueryBuilder.multiMatchQuery(
query,
fields,
os.org.opensearch.client.opensearch._types.query_dsl.TextQueryType.BestFields,
os.org.opensearch.client.opensearch._types.query_dsl.Operator.Or,
String.valueOf(DEFAULT_TIE_BREAKER),
"0");
queryBuilder = buildColumnMultiMatchV2(query);
}
os.org.opensearch.client.opensearch.core.search.Highlight highlighter =
buildHighlightsV2(List.of("name", "displayName", "description"));
return searchBuilderV2(queryBuilder, highlighter, from, size);
}

/**
* Multi-match used both by {@code index=tableColumn} and the column-scoped should clause in the
* {@code dataAsset} composite query. Uses {@link
* os.org.opensearch.client.opensearch._types.query_dsl.Operator#And} so every sub-token produced
* by {@code om_analyzer} (which splits on letter/digit/underscore boundaries) must hit some
* field. Without {@code And}, a query like {@code first_name} matches any column whose name
* contains just {@code first} or just {@code name}, which both inflates the column index hits
* and creates the dataAsset/tableColumn count mismatch tracked in github issue #3851.
Copy link

Copilot AI Apr 30, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Javadoc references the old behavior as Operator.Or + min_should_match=0, but the helper used here takes fuzziness as the last parameter and only sets minimum_should_match when fuzziness is enabled and operator is OR. Please adjust the comment to match the real previous query shape to avoid confusion.

Suggested change
* field. Without {@code And}, a query like {@code first_name} matches any column whose name
* contains just {@code first} or just {@code name}, which both inflates the column index hits
* and creates the dataAsset/tableColumn count mismatch tracked in github issue #3851.
* field. Previously this query used the same helper with {@code Operator.Or} and a last
* argument of {@code "0"}; with that OR-style query, a search like {@code first_name} matched
* any column whose name contained just {@code first} or just {@code name}, which inflated the
* column index hits and created the dataAsset/tableColumn count mismatch tracked in github
* issue #3851.

Copilot uses AI. Check for mistakes.
*/
private os.org.opensearch.client.opensearch._types.query_dsl.Query buildColumnMultiMatchV2(
    String query) {
  // Same boosted column field set that ColumnSearchIndex exposes.
  Map<String, Float> columnFields = ColumnSearchIndex.getFields();
  String tieBreaker = String.valueOf(DEFAULT_TIE_BREAKER);
  // NOTE(review): per the review thread on this change, the trailing "0" argument of
  // multiMatchQuery is the fuzziness setting (not minimum_should_match) - confirm against
  // OpenSearchQueryBuilder.multiMatchQuery.
  String lastArgument = "0";

  // Operator.And (rather than Or) is the deliberate choice here: see the Javadoc on this method.
  return OpenSearchQueryBuilder.multiMatchQuery(
      query,
      columnFields,
      os.org.opensearch.client.opensearch._types.query_dsl.TextQueryType.BestFields,
      os.org.opensearch.client.opensearch._types.query_dsl.Operator.And,
      tieBreaker,
      lastArgument);
}

public OpenSearchRequestBuilder buildServiceSearchBuilderV2(String query, int from, int size) {
os.org.opensearch.client.opensearch._types.query_dsl.Query queryBuilder =
buildSearchQueryBuilderV2(query, SearchIndex.getDefaultFields());
Expand Down Expand Up @@ -452,6 +467,92 @@ public OpenSearchRequestBuilder buildDataAssetSearchBuilderV2(
return searchRequestBuilder;
}

/**
 * Builds the search request for the {@code all} / {@code dataAsset} alias from a per-entity-type
 * union query.
 *
 * <p>Each asset type contributes its own clause, built from that type's configuration and
 * filtered by {@code entityType=<type>}: column docs go through {@link
 * #buildColumnMultiMatchV2(String)}, every other type through {@link #buildBaseQueryV2(String,
 * AssetTypeConfiguration)}. The intent is that each entity-type bucket in the aggregation equals
 * what the dedicated index returns for the same query, avoiding the composite-config divergence
 * behind github.com/open-metadata/openmetadata-collate#3851.
 *
 * <p>The composite asset configuration is still what drives function scoring, highlighting and
 * the configured aggregations.
 *
 * @param query the user search text
 * @param from offset handed to {@code createSearchSourceBuilderV2}
 * @param size size handed to {@code createSearchSourceBuilderV2}
 * @param explain value forwarded to the request builder's {@code explain} setting
 * @param includeAggregations whether the configured aggregations are added to the request
 * @return the assembled request builder
 */
public OpenSearchRequestBuilder buildAllAssetsSearchBuilderV2(
    String query, int from, int size, boolean explain, boolean includeAggregations) {
  AssetTypeConfiguration composite = getOrBuildCompositeConfig();

  // The union query supplies the matching; the composite config supplies the scoring functions.
  os.org.opensearch.client.opensearch._types.query_dsl.Query scoredQuery =
      applyFunctionScoringV2(buildPerTypeUnionQueryV2(query), composite);
  os.org.opensearch.client.opensearch.core.search.Highlight highlight =
      buildHighlightingIfNeededV2(query, composite);

  OpenSearchRequestBuilder request = createSearchSourceBuilderV2(scoredQuery, from, size);
  // A null highlight means no highlighter is attached to the request.
  if (highlight != null) {
    request.highlighter(highlight);
  }
  if (includeAggregations) {
    addConfiguredAggregationsV2(request, composite);
  }
  request.explain(explain);
  return request;
}

/**
 * Builds the union query for the {@code all} / {@code dataAsset} alias: one {@code should} clause
 * per configured asset type plus a fallback clause for types without a configuration, with at
 * least one clause required to match.
 *
 * @param query the user search text; {@code null}, blank or {@code *} yields a match-all query
 * @return the bool query combining the per-type clauses
 */
private os.org.opensearch.client.opensearch._types.query_dsl.Query buildPerTypeUnionQueryV2(
    String query) {
  if (isMatchAllQuery(query)) {
    OpenSearchQueryBuilder.BoolQueryBuilder everything =
        OpenSearchQueryBuilder.boolQuery().must(OpenSearchQueryBuilder.matchAllQuery());
    return everything.build();
  }

  OpenSearchQueryBuilder.BoolQueryBuilder perTypeUnion = OpenSearchQueryBuilder.boolQuery();
  Set<String> seenTypes = new HashSet<>();
  for (AssetTypeConfiguration config : searchSettings.getAssetTypeConfigurations()) {
    String type = config.getAssetType();
    // Entries without an asset type, and the "all" entry, do not get a per-type clause.
    boolean wantsOwnClause = type != null && !type.equals(INDEX_ALL);
    if (wantsOwnClause) {
      seenTypes.add(type);
      perTypeUnion.should(buildAssetTypeClauseV2(query, type, config));
    }
  }

  // Types with no configuration entry are covered by the fallback clause.
  perTypeUnion.should(buildUnconfiguredAssetFallbackV2(query, seenTypes));
  perTypeUnion.minimumShouldMatch(1);
  return perTypeUnion.build();
}

/**
 * Tells whether the given search text should be treated as "match everything".
 *
 * @param query raw search text, may be {@code null}
 * @return {@code true} when {@code query} is {@code null}, empty after trimming, or exactly
 *     {@code *} after trimming
 */
private static boolean isMatchAllQuery(String query) {
  if (query == null) {
    return true;
  }
  String trimmed = query.trim();
  return trimmed.isEmpty() || "*".equals(trimmed);
}

/**
 * Builds the clause contributed by a single asset type: the type-specific text query, restricted
 * to documents whose {@code entityType} equals {@code assetType}.
 *
 * @param query the user search text
 * @param assetType the entity type the clause is filtered to
 * @param typeConfig the configuration used to build the query for non-column types
 * @return a bool query with an {@code entityType} term filter and the text query as {@code must}
 */
private os.org.opensearch.client.opensearch._types.query_dsl.Query buildAssetTypeClauseV2(
    String query, String assetType, AssetTypeConfiguration typeConfig) {
  final os.org.opensearch.client.opensearch._types.query_dsl.Query textQuery;
  if (Entity.TABLE_COLUMN.equals(assetType)) {
    // Column docs use the dedicated column multi-match instead of the per-type configuration.
    textQuery = buildColumnMultiMatchV2(query);
  } else {
    textQuery = buildBaseQueryV2(query, typeConfig);
  }

  return OpenSearchQueryBuilder.boolQuery()
      .filter(OpenSearchQueryBuilder.termQuery(ENTITY_TYPE_FIELD, assetType))
      .must(textQuery)
      .build();
}

/**
* Catches asset types that are part of the {@code dataAsset} alias but lack a dedicated entry in
* {@code searchSettings.assetTypeConfigurations} (e.g. {@code glossary}, {@code apiCollection}).
* Without this, docs of those types would silently disappear from the dataAsset alias after the
* per-type-union refactor.
*/
private os.org.opensearch.client.opensearch._types.query_dsl.Query
buildUnconfiguredAssetFallbackV2(String query, Set<String> configuredTypes) {
OpenSearchQueryBuilder.BoolQueryBuilder fallback =
OpenSearchQueryBuilder.boolQuery()
.must(buildBaseQueryV2(query, getOrCreateDefaultConfig()));
for (String configured : configuredTypes) {
fallback.mustNot(OpenSearchQueryBuilder.termQuery(ENTITY_TYPE_FIELD, configured));
Copy link

Copilot AI Apr 30, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

buildUnconfiguredAssetFallbackV2 adds one mustNot term(entityType=...) clause per configured type. With many configured asset types this can bloat the query and slow execution. Prefer a single mustNot with a terms query over the configuredTypes set (the query builder already supports termsQuery).

Suggested change
for (String configured : configuredTypes) {
fallback.mustNot(OpenSearchQueryBuilder.termQuery(ENTITY_TYPE_FIELD, configured));
if (!configuredTypes.isEmpty()) {
fallback.mustNot(
OpenSearchQueryBuilder.termsQuery(
ENTITY_TYPE_FIELD,
configuredTypes.stream().map(FieldValue::of).toList()));

Copilot uses AI. Check for mistakes.
}
return fallback.build();
}
Comment on lines +472 to +562
Copy link

Copilot AI Apr 30, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new all/dataAsset per-type-union behavior and the stricter column multi-match are not covered by tests in this PR. Please add/extend tests to ensure (a) dataAsset entityType bucket counts stay in sync with dedicated-index totals (especially tableColumn), and (b) underscore-split identifier queries (e.g. first_name) require all sub-tokens to match (no overmatching on just one token).

Copilot uses AI. Check for mistakes.

private os.org.opensearch.client.opensearch._types.query_dsl.Query buildBaseQueryV2(
String query, AssetTypeConfiguration assetConfig) {
if (query == null || query.trim().isEmpty() || query.trim().equals("*")) {
Expand Down
Loading
Loading