diff --git a/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/AppsResourceIT.java b/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/AppsResourceIT.java index d31ab9e95ae2..6bf2a452196b 100644 --- a/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/AppsResourceIT.java +++ b/openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/AppsResourceIT.java @@ -81,8 +81,15 @@ static void setup() { private void waitForAppJobCompletion(String appName) { HttpClient httpClient = SdkClients.adminClient().getHttpClient(); try { + // AppRunRecord.status is a lowercase enum (see appRunRecord.json: started, running, + // completed, failed, success, activeError, stopped, ...). Comparing with case-insensitive + // matchers — using uppercase here matches none of the real values and silently makes the + // wait a no-op. 5-minute ceiling covers an in-flight reindex from another test class + // (e.g. SearchIndexingFieldsParityIT triggers an "all entities" reindex that can take + // minutes); a 30s ceiling fell through to the catch and let the trigger Awaitility below + // hit its own 2-minute "already running" wall. Awaitility.await("Wait for app job completion: " + appName) - .atMost(Duration.ofSeconds(30)) + .atMost(Duration.ofMinutes(5)) .pollDelay(Duration.ofMillis(500)) .pollInterval(Duration.ofSeconds(2)) .ignoreExceptions() @@ -98,9 +105,7 @@ private void waitForAppJobCompletion(String appName) { return true; } String status = latestRun.getStatus().value(); - return "SUCCESS".equals(status) - || "FAILED".equals(status) - || "COMPLETED".equals(status); + return !"running".equalsIgnoreCase(status) && !"started".equalsIgnoreCase(status); }); } catch (org.awaitility.core.ConditionTimeoutException e) { // Best-effort wait — the app may be continuously running under parallel test load. diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/DefaultRecreateHandler.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/DefaultRecreateHandler.java index daba88d33742..20cc570a83fe 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/DefaultRecreateHandler.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/DefaultRecreateHandler.java @@ -171,6 +171,18 @@ public void finalizeReindex(EntityReindexContext context, boolean reindexSuccess } } + // After the first reindex, the canonical name is an alias on the previous staged, not a + // concrete index. OpenSearch's listIndicesByPrefix returns that alias name as one of its + // result keys, which then drives a delete-by-name attempt that fails with + // "matches an alias, specify the corresponding concrete indices" and burns ~31s of + // exponential backoff per entity (1+2+4+8+16s before giving up). With 60 entity types + // a full reindex wastes ~30 minutes in cleanup. Drop the alias name from the cleanup set + // when it is currently an alias — it does not need to be deleted; the swap moves the + // alias atomically and the underlying old concrete is in oldIndicesToDelete already. + if (!searchClient.getIndicesByAlias(canonicalIndex).isEmpty()) { + oldIndicesToDelete.remove(canonicalIndex); + } + LOG.debug( "finalizeReindex entity '{}': aliases={}, oldIndices={}, stagedIndex={}", entityType, diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/search/DefaultRecreateHandlerTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/search/DefaultRecreateHandlerTest.java index 935e1fc57ec4..646a0e837fe3 100644 --- a/openmetadata-service/src/test/java/org/openmetadata/service/search/DefaultRecreateHandlerTest.java +++ b/openmetadata-service/src/test/java/org/openmetadata/service/search/DefaultRecreateHandlerTest.java @@ -535,7 +535,9 @@ class FinalizeReindexTests { @DisplayName("Should promote partial data and record success when failed reindex has documents") void testFinalizeReindexPromotesPartialData() { AliasState aliasState = new AliasState(); - aliasState.put("table_search_index", Set.of("table_search_index")); + // Canonical is a concrete index with no aliases (the realistic first-reindex shape; OS/ES + // forbid an alias and a concrete sharing the same name). + aliasState.put("table_search_index", Set.of()); aliasState.put("table_search_index_rebuild_old", Set.of("stale")); aliasState.put("table_search_index_rebuild_new", new HashSet<>()); @@ -667,6 +669,62 @@ void testFinalizeReindexRecordsPromotionFailureOnException() { verify(metrics).recordPromotionFailure("table"); } + + @Test + @DisplayName( + "Should not delete-by-alias-name when canonical is currently an alias on a previous staged") + void testFinalizeReindexSkipsDeleteWhenCanonicalIsAlias() { + // After the first reindex, the canonical name (table_search_index) is an alias on the + // previous staged index, not a concrete one. OpenSearch's listIndicesByPrefix returns the + // alias name as one of its result keys; without the guard, finalizeReindex would attempt + // deleteIndexWithBackoff(canonicalIndex), fail with "matches an alias" and burn ~31s of + // exponential backoff per entity. The guard must drop the alias name from oldIndicesToDelete + // BEFORE the delete branch fires. + AliasState aliasState = new AliasState(); + aliasState.put( + "table_search_index_rebuild_old", + new HashSet<>(Set.of("table_search_index", "table", "all"))); + aliasState.put("table_search_index_rebuild_new", new HashSet<>()); + // Simulate the OpenSearch behavior where listIndicesByPrefix surfaces the alias name itself + // among its result keys (the key in our AliasState mock is what listIndicesByPrefix returns). + aliasState.put("table_search_index", Set.of()); + + SearchClient client = aliasState.toMock(); + SearchRepository repo = mock(SearchRepository.class); + when(repo.getSearchClient()).thenReturn(client); + + try (MockedStatic entityMock = mockStatic(Entity.class)) { + entityMock.when(Entity::getSearchRepository).thenReturn(repo); + + EntityReindexContext context = + EntityReindexContext.builder() + .entityType("table") + .canonicalIndex("table_search_index") + .activeIndex("table_search_index_rebuild_old") + .stagedIndex("table_search_index_rebuild_new") + .existingAliases(new HashSet<>(Set.of("table_search_index", "table", "all"))) + .canonicalAliases("table") + .parentAliases(new HashSet<>(Set.of("all"))) + .build(); + + new DefaultRecreateHandler().finalizeReindex(context, true); + } + + verify(client, never()).deleteIndexWithBackoff("table_search_index"); + assertTrue( + aliasState.deletedIndices.contains("table_search_index_rebuild_old"), + "Old concrete rebuild must still be cleaned up by the swap path"); + Set stagedAliases = aliasState.indexAliases.get("table_search_index_rebuild_new"); + assertTrue( + stagedAliases.contains("table_search_index"), + () -> "Canonical alias must end up on staged after promotion; got " + stagedAliases); + assertTrue( + stagedAliases.contains("table"), + () -> "Short alias must end up on staged after promotion; got " + stagedAliases); + assertTrue( + stagedAliases.contains("all"), + () -> "Parent alias must end up on staged after promotion; got " + stagedAliases); + } } @Nested