-
Notifications
You must be signed in to change notification settings - Fork 2.1k
fix(search): align dataAsset aggregation counts with index=tableColumn totals #27846
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 2 commits
bd4d566
f264c55
b2d40ff
6d079ab
201bc7f
caf8580
e882607
4a8011b
b3cff35
d81ecc3
718ceb9
90df4c2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -20,8 +20,11 @@ | |||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| import com.fasterxml.jackson.databind.JsonNode; | ||||||||||||||||||||||||||
| import com.fasterxml.jackson.databind.ObjectMapper; | ||||||||||||||||||||||||||
| import java.util.HashSet; | ||||||||||||||||||||||||||
| import java.util.List; | ||||||||||||||||||||||||||
| import java.util.Set; | ||||||||||||||||||||||||||
| import java.util.concurrent.TimeUnit; | ||||||||||||||||||||||||||
| import org.awaitility.Awaitility; | ||||||||||||||||||||||||||
| import org.junit.jupiter.api.DisplayName; | ||||||||||||||||||||||||||
| import org.junit.jupiter.api.Nested; | ||||||||||||||||||||||||||
| import org.junit.jupiter.api.Test; | ||||||||||||||||||||||||||
|
|
@@ -33,15 +36,18 @@ | |||||||||||||||||||||||||
| import org.openmetadata.it.util.TestNamespaceExtension; | ||||||||||||||||||||||||||
| import org.openmetadata.schema.api.data.CreateDatabase; | ||||||||||||||||||||||||||
| import org.openmetadata.schema.api.data.CreateDatabaseSchema; | ||||||||||||||||||||||||||
| import org.openmetadata.schema.api.data.CreateGlossary; | ||||||||||||||||||||||||||
| import org.openmetadata.schema.api.data.CreateTable; | ||||||||||||||||||||||||||
| import org.openmetadata.schema.entity.data.Database; | ||||||||||||||||||||||||||
| import org.openmetadata.schema.entity.data.DatabaseSchema; | ||||||||||||||||||||||||||
| import org.openmetadata.schema.entity.data.Glossary; | ||||||||||||||||||||||||||
| import org.openmetadata.schema.entity.data.Table; | ||||||||||||||||||||||||||
| import org.openmetadata.schema.entity.services.DatabaseService; | ||||||||||||||||||||||||||
| import org.openmetadata.schema.type.Column; | ||||||||||||||||||||||||||
| import org.openmetadata.schema.type.ColumnDataType; | ||||||||||||||||||||||||||
| import org.openmetadata.sdk.client.OpenMetadataClient; | ||||||||||||||||||||||||||
| import org.openmetadata.sdk.fluent.DatabaseServices; | ||||||||||||||||||||||||||
| import org.openmetadata.service.Entity; | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| /** | ||||||||||||||||||||||||||
| * Integration tests for column search indexing during table reindexing. Verifies: | ||||||||||||||||||||||||||
|
|
@@ -353,6 +359,268 @@ void testColumnFilterByDatabase(TestNamespace ns) throws Exception { | |||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| @Nested | ||||||||||||||||||||||||||
| @DisplayName("DataAsset/TableColumn Aggregation Parity Tests") | ||||||||||||||||||||||||||
| @Execution(ExecutionMode.CONCURRENT) | ||||||||||||||||||||||||||
| class DataAssetColumnAggregationTests { | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| /** | ||||||||||||||||||||||||||
| * Regression for github.com/open-metadata/openmetadata-collate/issues/3851. The UI fires two | ||||||||||||||||||||||||||
| * requests when the user types a multi-word query: {@code index=dataAsset&size=0} for the | ||||||||||||||||||||||||||
| * entity-type aggregation and {@code index=tableColumn} for the column hits. They route through | ||||||||||||||||||||||||||
| * different builders in {@code OpenSearchSourceBuilderFactory}: {@code tableColumn} uses the | ||||||||||||||||||||||||||
| * dedicated lenient column builder while {@code dataAsset} uses the composite asset config with | ||||||||||||||||||||||||||
| * stricter phrase/{@code 2<70%} matching. Without the fix, the {@code tableColumn} bucket in the | ||||||||||||||||||||||||||
| * {@code dataAsset} aggregation under-counts the column index hits. | ||||||||||||||||||||||||||
| */ | ||||||||||||||||||||||||||
| @Test | ||||||||||||||||||||||||||
| @DisplayName( | ||||||||||||||||||||||||||
| "Aggregation bucket for tableColumn under dataAsset must match index=tableColumn total") | ||||||||||||||||||||||||||
| void testDataAssetTableColumnAggregationMatchesTableColumnTotal(TestNamespace ns) | ||||||||||||||||||||||||||
| throws Exception { | ||||||||||||||||||||||||||
| OpenMetadataClient client = SdkClients.adminClient(); | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| String tag = ns.shortPrefix(); | ||||||||||||||||||||||||||
| Table table = createTableWithMultiTokenColumns(ns, "agg_parity_" + tag, tag); | ||||||||||||||||||||||||||
| assertNotNull(table); | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| String multiTokenQuery = tag + "_first " + tag + "_address"; | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| Awaitility.await() | ||||||||||||||||||||||||||
| .atMost(90, TimeUnit.SECONDS) | ||||||||||||||||||||||||||
| .pollInterval(500, TimeUnit.MILLISECONDS) | ||||||||||||||||||||||||||
| .until( | ||||||||||||||||||||||||||
| () -> { | ||||||||||||||||||||||||||
| String r = | ||||||||||||||||||||||||||
| client | ||||||||||||||||||||||||||
| .search() | ||||||||||||||||||||||||||
| .query(multiTokenQuery) | ||||||||||||||||||||||||||
| .index("tableColumn") | ||||||||||||||||||||||||||
| .size(0) | ||||||||||||||||||||||||||
| .deleted(false) | ||||||||||||||||||||||||||
| .execute(); | ||||||||||||||||||||||||||
| JsonNode root = OBJECT_MAPPER.readTree(r); | ||||||||||||||||||||||||||
| long total = root.path("hits").path("total").path("value").asLong(-1); | ||||||||||||||||||||||||||
| return total >= 3; | ||||||||||||||||||||||||||
| }); | ||||||||||||||||||||||||||
|
||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| String columnResponse = | ||||||||||||||||||||||||||
| client | ||||||||||||||||||||||||||
| .search() | ||||||||||||||||||||||||||
| .query(multiTokenQuery) | ||||||||||||||||||||||||||
| .index("tableColumn") | ||||||||||||||||||||||||||
| .size(0) | ||||||||||||||||||||||||||
| .deleted(false) | ||||||||||||||||||||||||||
| .execute(); | ||||||||||||||||||||||||||
| long columnTotal = | ||||||||||||||||||||||||||
| OBJECT_MAPPER.readTree(columnResponse).path("hits").path("total").path("value").asLong(); | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| String aggResponse = | ||||||||||||||||||||||||||
| client | ||||||||||||||||||||||||||
| .search() | ||||||||||||||||||||||||||
| .query(multiTokenQuery) | ||||||||||||||||||||||||||
| .index("dataAsset") | ||||||||||||||||||||||||||
| .size(0) | ||||||||||||||||||||||||||
| .deleted(false) | ||||||||||||||||||||||||||
| .execute(); | ||||||||||||||||||||||||||
| JsonNode aggBuckets = | ||||||||||||||||||||||||||
| OBJECT_MAPPER | ||||||||||||||||||||||||||
| .readTree(aggResponse) | ||||||||||||||||||||||||||
| .path("aggregations") | ||||||||||||||||||||||||||
| .path("sterms#entityType") | ||||||||||||||||||||||||||
| .path("buckets"); | ||||||||||||||||||||||||||
|
||||||||||||||||||||||||||
| JsonNode aggBuckets = | |
| OBJECT_MAPPER | |
| .readTree(aggResponse) | |
| .path("aggregations") | |
| .path("sterms#entityType") | |
| .path("buckets"); | |
| JsonNode aggregations = OBJECT_MAPPER.readTree(aggResponse).path("aggregations"); | |
| JsonNode entityTypeAggregation = aggregations.path("entityType"); | |
| if (entityTypeAggregation.isMissingNode()) { | |
| entityTypeAggregation = aggregations.path("sterms#entityType"); | |
| } | |
| JsonNode aggBuckets = entityTypeAggregation.path("buckets"); |
Copilot
AI
Apr 30, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The comment says the old column builder used Operator.Or + min_should_match=0, but the implementation was passing the string "0" as the fuzziness parameter (see ElasticQueryBuilder/OpenSearchQueryBuilder.multiMatchQuery(..., fuzziness)). minimum_should_match was not set at all when fuzziness was "0". Please update the comment to reflect the actual behavior so future debugging isn’t misled.
| * into {@code [first, name]}; with the old {@code Operator.Or} + {@code min_should_match=0} | |
| * column builder, a column called {@code <tag>_first_id} matched a query of {@code | |
| * <tag>_first_name} because the single token {@code first} was enough. The fix moves the | |
| * column builder to {@code Operator.And}, so every sub-token must match. | |
| * into {@code [first, name]}; with the old {@code Operator.Or} column builder, the string | |
| * {@code "0"} was passed as the fuzziness value rather than setting {@code | |
| * minimum_should_match}, so a column called {@code <tag>_first_id} matched a query of | |
| * {@code <tag>_first_name} because the single token {@code first} was enough. The fix moves | |
| * the column builder to {@code Operator.And}, so every sub-token must match. |
Copilot
AI
Apr 30, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The Awaitility condition totalHitsForIndex(...) >= 5 looks too high for the data created in this test (the helper creates 3 columns). If the random tag is unique (likely), the prefix query may only ever match those seeded columns, causing an unnecessary timeout/flaky failure. Consider waiting for the expected minimum (e.g., >= 3) or simply > 0.
| .until(() -> totalHitsForIndex(client, prefixQuery, "tableColumn") >= 5); | |
| .until(() -> totalHitsForIndex(client, prefixQuery, "tableColumn") >= 3); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
💡 Quality: Glossary test doesn't clean up created glossary entity
testUnconfiguredAssetTypeFallbackMatchesViaDataAsset creates a glossary via client.glossaries().create(req) but never deletes it. The other helpers use TestNamespace-scoped names that presumably get cleaned up, but this glossary is created ad-hoc. If the test framework doesn't auto-clean glossaries, this is a resource leak across test runs. Verify the cleanup strategy or add an @AfterEach / try-finally deletion.
Was this helpful? React with 👍 / 👎 | Reply gitar fix to apply this suggestion
Copilot
AI
Apr 30, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
bucketCountFromDataAsset assumes the entity-type aggregation lives under aggregations.sterms#entityType, but other integration tests in this repo already need to support aggregations.entityType as well. This helper should check for both keys; otherwise it will silently return 0 and break the parity assertions depending on search backend/serialization.
| JsonNode buckets = | |
| OBJECT_MAPPER | |
| .readTree(response) | |
| .path("aggregations") | |
| .path("sterms#entityType") | |
| .path("buckets"); | |
| JsonNode root = OBJECT_MAPPER.readTree(response); | |
| JsonNode aggregations = root.path("aggregations"); | |
| JsonNode buckets = aggregations.path("sterms#entityType").path("buckets"); | |
| if (buckets.isMissingNode() || !buckets.isArray()) { | |
| buckets = aggregations.path("entityType").path("buckets"); | |
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
💡 Quality: createTableWithMultiTokenColumns duplicates boilerplate from other helpers
The new createTableWithMultiTokenColumns method (lines 764-809) repeats ~30 lines of service/database/schema creation that are identical to createTableWithColumns, createTableWithNestedColumns, and createTableWithDeeplyNestedColumns. Per the project guidelines (no duplication, extract shared logic), consider extracting the common infra setup into a shared helper that accepts a List<Column> and returns a Table.
Was this helpful? React with 👍 / 👎 | Reply gitar fix to apply this suggestion
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The awaited query
tag + "_first " + tag + "_address"is unlikely to ever reachtotal >= 3now that the column builder uses amulti_matchwithOperator.And(each hit must contain all analyzed tokens, so no single column will match both "first" and "*_address"). This can make the Awaitility wait time out and fail the test. Use a query that is expected to match at least one seeded column under the new AND semantics (e.g., target a single column name) and wait for that expected minimum instead of>= 3.