Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
1650bf0
clustermap: extend skip-bad-foreign-node logic to update path
snalli Apr 30, 2026
7d0d36a
ci: cap unit-test step at 2h and log STARTED test events
snalli May 1, 2026
cb3166a
ci: log per-test duration and per-suite totals
snalli May 1, 2026
5b1950c
ci: add concurrency group so PR pushes supersede stale runs
snalli May 1, 2026
574fbd6
ci: drop unit-test step timeout to capture full hang signature
snalli May 1, 2026
2f6bb75
ci: trim per-test log volume by ~67%
snalli May 1, 2026
a83d79d
ci: timestamp each test STARTED line so durations can be inferred
snalli May 1, 2026
81d5bd8
ci: timestamp failed tests and stop logging individual skips
snalli May 1, 2026
6af217a
Fix SSLSelectorTest hang and enforce serial test execution
snalli May 1, 2026
5abea96
Fix duplicatePartitionOnSameNodeSkipsNodeTest planting for [0]/[7]
snalli May 1, 2026
0996b44
Fix Process.exitValue() race in Utils.preAllocateFileIfNeeded
snalli May 1, 2026
682d2e3
Fix interrupt-flag leak from testGetFileCopyGetMetaDataResponseExpect…
snalli May 1, 2026
6014aea
Trim SSLSelectorTest cost: drop poolSize=0 params and tighten deadline
snalli May 1, 2026
b5c53ac
Ignore deferred SSL handshake test and staged file-copy test classes
snalli May 1, 2026
03279a6
Ignore vcr CloudBlobStoreTest — CosmosDB V1 path not on AmbryLI prod
snalli May 1, 2026
6d78317
Drop now-redundant test-helper guards; scrub internal references
snalli May 1, 2026
895b868
Bump Helix routing-table init wait to 10m + ignore AzureStorageContai…
snalli May 1, 2026
c46174b
Replace test band-aids with proper fixes from debug-PR diagnoses
snalli May 1, 2026
d3dc376
Revert overly-aggressive HelixClusterManagerTest @After cleanup
snalli May 1, 2026
4f277be
Fix HelixClusterManagerTest state-leak; revert Utils + Helix wait
snalli May 1, 2026
a791ac5
Ignore inconsistentReplicaCapacityTest + parallelize unit-test
snalli May 1, 2026
16987cf
Expand HelixClusterManagerTest @After to clean clustermap-config clus…
snalli May 1, 2026
d482dd2
Refactor duplicatePartition test into helpers; tighten SSL TODO
snalli May 1, 2026
2aec994
CI: 1h timeout per unit-test module + fail-fast on the matrix
snalli May 1, 2026
546396e
CI: add commented placeholder for ambry-file-transfer module
snalli May 2, 2026
d2fb947
Tune unit-test retry budget: maxRetries 3->2, maxFailures 10->5
snalli May 2, 2026
6da7090
Scope Netty leak detection + add forkEvery=1 for stateful modules
snalli May 2, 2026
462435c
Tune intTest retry budget to match unit-test: 3->2, 10->5
snalli May 2, 2026
2f01033
JVM startup tuning for test JVMs: C1-only + SerialGC
snalli May 2, 2026
5ef8019
Apply forkEvery=1 globally; drop statefulModules list
snalli May 2, 2026
036a7ff
Revert -XX:+UseSerialGC; keep TieredStopAtLevel=1
snalli May 2, 2026
ecaa508
Apply gradle/CI hacks to int-test, store-test, server-int-test
snalli May 2, 2026
9b97043
Exclude ambry-store from forkEvery=1; add timestamped events to intTest
snalli May 2, 2026
1790ef9
Fix StorageManagerTest brittleness; re-include ambry-store in forkEve…
snalli May 2, 2026
057b95a
Remove forkEvery=1 from intTest{}; keep it on test{} only
snalli May 2, 2026
17cd12d
Revert allocator-cache-disabling for intTest{}; keep paranoid leak de…
snalli May 2, 2026
d4d3bc2
@Ignore non-Azurite-using cloud/vcr tests
snalli May 2, 2026
45cde2e
Revert -XX:TieredStopAtLevel=1 from intTest{}; integration tests need C2
snalli May 2, 2026
76e42f0
Convert unit-test matrix to per-group top-level jobs via composite ac…
snalli May 2, 2026
6640704
Bump actions/checkout v2 -> v4 and actions/setup-java v2 -> v4
snalli May 2, 2026
b8e76fb
Tune fetch-depth: 0 -> 100 for all actions/checkout invocations
snalli May 2, 2026
268a09d
Fix composite action validation; rename heavy/light groups
snalli May 2, 2026
cfd798c
Remove ambry-store's test{} override; inherit global testLogging config
snalli May 2, 2026
f70e313
Standardize timeout-minutes to 60 across all CI jobs
snalli May 2, 2026
76aa9ef
Add fetch-tags: true to all checkouts; fixes shipkit-auto-version
snalli May 2, 2026
ab4d010
Run runner-spec diagnostic after setup-java, not before
snalli May 2, 2026
92bee23
Add --warning-mode=summary to all test gradle invocations
snalli May 2, 2026
4a33e42
Add GitHub Step Summary with test results table to composite action
snalli May 2, 2026
2f8cb52
Extract Tier 3 step-summary logic to a script file
snalli May 2, 2026
83f97a8
Log SKIPPED tests inline (in addition to FAILED)
snalli May 2, 2026
b3b4e69
Remove codecov upload + step summary from unit-test composite action
snalli May 2, 2026
fd18476
Move ambry-router to mysql-stack group (router needs MySQL too)
snalli May 2, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1974,9 +1974,31 @@ private void addOrUpdateInstanceInfos(Iterable<DataNodeConfig> dataNodeConfigs,
List<ReplicaId> totalAddedReplicas = new ArrayList<>();
List<ReplicaId> totalRemovedReplicas = new ArrayList<>();
for (DataNodeConfig dataNodeConfig : dataNodeConfigs) {
String instanceName = dataNodeConfig.getInstanceName();
Pair<List<ReplicaId>, List<ReplicaId>> addedAndRemovedReplicas;
if (instanceNameToAmbryDataNode.containsKey(dataNodeConfig.getInstanceName())) {
addedAndRemovedReplicas = updateInstanceInfo(dataNodeConfig, dcName);
if (instanceNameToAmbryDataNode.containsKey(instanceName)) {
// Update path. Mirrors createNewInstance's skip-foreign / fail-self policy: if validation
// throws (e.g. a duplicate partition or an inconsistent capacity arriving via an update to an
// already-known node), rethrow when the node is this server itself; otherwise drop the bad
// node from the cluster map instead of leaving stale state behind. createNewInstance has the
// same wrapper inline; the two code paths are kept in sync so both branches skip uniformly.
try {
addedAndRemovedReplicas = updateInstanceInfo(dataNodeConfig, dcName);
} catch (Exception e) {
if (instanceName.equals(selfInstanceName)) {
logger.error(
"Failed to update existing node {} (self) in datacenter {}. Failing initialization "
+ "since the server cannot operate with a broken local config.",
instanceName, dcName, e);
throw e;
}
logger.error(
"Failed to update existing node {} in datacenter {}, removing this node from the cluster map.",
instanceName, dcName, e);
handleDataNodeDelete(instanceName);
dataNodeInitializationFailureCount.incrementAndGet();
continue;
}
} else {
addedAndRemovedReplicas = new Pair<>(createNewInstance(dataNodeConfig, dcName), new ArrayList<>());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -525,34 +525,46 @@ public void duplicatePartitionOnSameNodeSkipsNodeTest() throws Exception {
new MockHelixCluster("AmbryTest-", testHardwareLayoutPath, testPartitionLayoutPath, testZkLayoutPath, localDc,
useAggregatedView, 100, fullAutoCompatible ? 10000 : -1);

// Pick a node in the local DC and inject a duplicate partition across two disks in its InstanceConfig
// Pick a node in the local DC that has at least two disks, at least one replica to duplicate,
// AND is not the current server (selfInstanceName). With 3 replicas spread across N nodes some
// nodes have no replicas, and picking instanceConfigs.get(0) was flaky in two ways: it could
// land on an empty node, or on the self-instance — either of which diverts the test from the
// foreign-node skip path it intends to exercise.
MockHelixAdmin localAdmin = testCluster.getHelixAdminFromDc(localDc);
List<InstanceConfig> instanceConfigs = localAdmin.getInstanceConfigs("AmbryTest-" + staticClusterName);
InstanceConfig targetConfig = instanceConfigs.get(0);
String targetInstanceName = targetConfig.getInstanceName();

// Find two disk mount paths on this node and a partition on the first disk
Map<String, Map<String, String>> mapFields = targetConfig.getRecord().getMapFields();
InstanceConfig targetConfig = null;
List<String> diskMountPaths = new ArrayList<>();
String duplicatePartitionEntry = null;
for (Map.Entry<String, Map<String, String>> entry : mapFields.entrySet()) {
if (entry.getValue().containsKey(DISK_STATE)) {
diskMountPaths.add(entry.getKey());
if (duplicatePartitionEntry == null) {
String replicasStr = entry.getValue().get(REPLICAS_STR);
if (replicasStr != null && !replicasStr.isEmpty()) {
// Take the first replica entry (e.g., "0:1073741824:defaultPartitionClass,")
duplicatePartitionEntry = replicasStr.split(REPLICAS_DELIM_STR)[0];
for (InstanceConfig candidate : instanceConfigs) {
if (candidate.getInstanceName().equals(selfInstanceName)) {
continue;
}
List<String> candidateDiskPaths = new ArrayList<>();
String candidateReplica = null;
for (Map.Entry<String, Map<String, String>> entry : candidate.getRecord().getMapFields().entrySet()) {
if (entry.getValue().containsKey(DISK_STATE)) {
candidateDiskPaths.add(entry.getKey());
if (candidateReplica == null) {
String replicasStr = entry.getValue().get(REPLICAS_STR);
if (replicasStr != null && !replicasStr.isEmpty()) {
candidateReplica = replicasStr.split(REPLICAS_DELIM_STR)[0];
}
}
}
}
if (candidateDiskPaths.size() >= 2 && candidateReplica != null) {
targetConfig = candidate;
diskMountPaths = candidateDiskPaths;
duplicatePartitionEntry = candidateReplica;
break;
}
}
assertTrue("Node should have at least 2 disks", diskMountPaths.size() >= 2);
assertNotNull("Should find a replica to duplicate", duplicatePartitionEntry);
assertNotNull("No non-self instance with >=2 disks and a replica found in localDc", targetConfig);
String targetInstanceName = targetConfig.getInstanceName();

// Add the duplicate partition to the second disk
String secondDisk = diskMountPaths.get(1);
Map<String, String> secondDiskProps = mapFields.get(secondDisk);
Map<String, String> secondDiskProps = targetConfig.getRecord().getMapFields().get(secondDisk);
String existingReplicas = secondDiskProps.get(REPLICAS_STR);
secondDiskProps.put(REPLICAS_STR, existingReplicas + duplicatePartitionEntry + REPLICAS_DELIM_STR);
localAdmin.setInstanceConfig("AmbryTest-" + staticClusterName, targetInstanceName, targetConfig);
Expand Down
Loading