From 05d3768ee0fb8eafd73367748250ce1084494d57 Mon Sep 17 00:00:00 2001 From: Sanketh Nalli Date: Sat, 2 May 2026 18:44:39 -0700 Subject: [PATCH 1/8] Harden ServerHttp2Test: per-test MockCluster lifecycle to remove flake ServerHttp2Test was the only Server*Test in ambry-server integration tests using @BeforeClass/@AfterClass to create http2Cluster once and share it across all 6 @Test methods x 2 parameter variants. The shared in-memory MockCluster accumulated blob-store, replication-token, and replica/disk state across tests, surfacing as flaky failures in replicateBlobV2MultipleCases (e.g. expected: but was:; expected: but was:). Convert to @Before/@After per-test cluster lifecycle, matching every other Server*Test in the same module (ServerPlaintextTest, ServerSSLTest, ServerHardDeleteTest, ServerBatchDeleteTest, ServerPlaintextTokenTest, ServerSSLTokenTest, StatsManagerIntegrationTest). Each @Test now gets a fresh MockCluster, eliminating cross-test state leakage as a flake source. The existing nettyByteBufLeakHelper @Before/@After hooks are folded into the same methods, with the leak check ordered after cluster teardown so released ByteBufs are reflected in the measurement. Cost: ~1-2 min added to ServerHttp2Test runtime (12 cluster bringups at ~5-10s each), the same cost the other 7 Server*Test classes already pay. forkEvery=1 for intTest is intentionally NOT applied here per the existing build.gradle comment documenting the MySqlNamedBlobDbIntegrationTest regression. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../github/ambry/server/ServerHttp2Test.java | 49 ++++++++++--------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/ambry-server/src/integration-test/java/com/github/ambry/server/ServerHttp2Test.java b/ambry-server/src/integration-test/java/com/github/ambry/server/ServerHttp2Test.java index 38e2fd889f..330139e1c2 100644 --- a/ambry-server/src/integration-test/java/com/github/ambry/server/ServerHttp2Test.java +++ b/ambry-server/src/integration-test/java/com/github/ambry/server/ServerHttp2Test.java @@ -33,9 +33,7 @@ import java.util.List; import java.util.Properties; import org.junit.After; -import org.junit.AfterClass; import org.junit.Before; -import org.junit.BeforeClass; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; @@ -45,27 +43,24 @@ @RunWith(Parameterized.class) public class ServerHttp2Test { - private static Properties routerProps; - private static MockNotificationSystem notificationSystem; - private static MockCluster http2Cluster; + private Properties routerProps; + private MockNotificationSystem notificationSystem; + private MockCluster http2Cluster; private final boolean testEncryption; - private static SSLConfig clientSSLConfig1; - private static SSLConfig clientSSLConfig2; - private static SSLConfig clientSSLConfig3; + private SSLConfig clientSSLConfig1; + private SSLConfig clientSSLConfig2; + private SSLConfig clientSSLConfig3; private final NettyByteBufLeakHelper nettyByteBufLeakHelper = new NettyByteBufLeakHelper(); + // Per-test MockCluster lifecycle (matches ServerPlaintextTest/ServerSSLTest/etc.). + // Sharing http2Cluster across tests via @BeforeClass let blob-store, replication-token, + // and replica/disk state leak between tests, which surfaced as flaky failures in + // replicateBlobV2MultipleCases (e.g. expected: but was:, + // expected: but was:). @Before - public void before() { + public void before() throws Exception { nettyByteBufLeakHelper.beforeTest(); - } - - @After - public void after() { - nettyByteBufLeakHelper.afterTest(); - } - @BeforeClass - public static void initializeTests() throws Exception { File trustStoreFile = File.createTempFile("truststore", ".jks"); Properties clientSSLProps = new Properties(); @@ -97,6 +92,19 @@ public static void initializeTests() throws Exception { http2Cluster.startServers(); } + @After + public void after() throws IOException { + try { + if (http2Cluster != null) { + http2Cluster.cleanup(); + } + } finally { + // Run leak check AFTER cluster teardown so any ByteBufs released during cleanup + // are reflected before NettyByteBufLeakHelper measures pending allocations. + nettyByteBufLeakHelper.afterTest(); + } + } + /** * Running for both regular and encrypted blobs * @return an array with both {@code false} and {@code true}. @@ -110,13 +118,6 @@ public ServerHttp2Test(boolean testEncryption) { this.testEncryption = testEncryption; } - @AfterClass - public static void cleanup() throws IOException { - if (http2Cluster != null) { - http2Cluster.cleanup(); - } - } - @Test public void endToEndTest() throws Exception { DataNodeId dataNodeId = http2Cluster.getGeneralDataNode(); From edcd0b7210e2166f4a97226bd51b4a1cb1e9acf3 Mon Sep 17 00:00:00 2001 From: Sanketh Nalli Date: Sat, 2 May 2026 21:52:11 -0700 Subject: [PATCH 2/8] Fix Http2 EventLoopGroup leaks; warm replication; skip MySQL tests when unavailable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Companion to the @Before/@After lifecycle change in the previous commit. That change exposed a set of latent issues that the prior @BeforeClass single-cluster pattern had been masking; this commit addresses each. EventLoopGroup leaks (server-side): - Http2NetworkClientFactory allocated a NioEventLoopGroup per server but had no close() method. Per-test cluster lifecycle leaked ~9 groups per test invocation x 12 invocations = 108 leaked groups per test class, enough to exhaust the OS thread/FD limits on CI runners. - Add Http2NetworkClientFactory.close() (impl Closeable). Wire it into AmbryServer.shutdown() after replicationManager.shutdown() so in-flight network clients drain first. EventLoopGroup leaks (test-side): - Http2BlockingChannel test constructor allocates its own EventLoopGroup that disconnect() was a no-op for. Tests created channels via ServerTestUtil.getBlockingChannelBasedOnPortType and called disconnect() relying on the no-op contract; nothing released the group. - Track ownership in Http2BlockingChannel; add explicit close() that releases owned pool and EventLoopGroup. Keep disconnect() as a no-op to preserve the existing reversible-disconnect contract used by tests that call connect/disconnect/connect. - Track Http2BlockingChannel instances in ServerTestUtil; close them all in test @After via closeRegisteredHttp2Channels(). Replication readiness: - MockCluster.startServers() returns when servers listen but replication threads haven't established peer connections, racing tests against awaitBlobCreations' 60s budget. Plus MockCluster overrode replication throttle to 100ms (production default is 0). - Restore production-default throttle (0). - Add sentinel-blob warm-up in ServerHttp2Test.before(): PUT one probe blob and await full replication before tests start, so replication is provably alive on every replica before the test exercises it. MySQL test skip: - ServerTestUtil.repairRequestTest requires a real MySQL on localhost:3306. CI sets one up; local dev typically doesn't. Without handling, three test-retry attempts hammer the connection refused and fail loudly. - Add ServerTestUtil.isMysqlAvailable() (TCP probe) and assumeTrue check at the top of repairRequestTest. Test skips cleanly when MySQL is unavailable; runs normally in CI. Resource cleanup in @After (ServerHttp2Test): - Cluster cleanup, registered channel cleanup, truststore temp file delete, and Netty leak check, each in its own try/finally so a single failure doesn't skip the rest. Verified locally: ServerHttp2Test now runs 12 tests, 0 failures, 2 skipped (both repairRequestTest variants — would run in CI with MySQL set up). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../network/http2/Http2BlockingChannel.java | 39 ++++++++- .../http2/Http2NetworkClientFactory.java | 21 ++++- .../github/ambry/server/ServerHttp2Test.java | 84 ++++++++++++++++++- .../com/github/ambry/server/AmbryServer.java | 6 ++ .../com/github/ambry/server/MockCluster.java | 7 +- .../github/ambry/server/ServerTestUtil.java | 51 ++++++++++- 6 files changed, 197 insertions(+), 11 deletions(-) diff --git a/ambry-network/src/main/java/com/github/ambry/network/http2/Http2BlockingChannel.java b/ambry-network/src/main/java/com/github/ambry/network/http2/Http2BlockingChannel.java index ef9595fc00..d73b897b6f 100644 --- a/ambry-network/src/main/java/com/github/ambry/network/http2/Http2BlockingChannel.java +++ b/ambry-network/src/main/java/com/github/ambry/network/http2/Http2BlockingChannel.java @@ -22,6 +22,7 @@ import com.github.ambry.utils.NettyByteBufDataInputStream; import io.netty.buffer.ByteBuf; import io.netty.channel.Channel; +import io.netty.channel.EventLoopGroup; import io.netty.channel.epoll.Epoll; import io.netty.channel.epoll.EpollEventLoopGroup; import io.netty.channel.nio.NioEventLoopGroup; @@ -47,6 +48,10 @@ public class Http2BlockingChannel implements ConnectedChannel { private final ChannelPool channelPool; private final Http2ClientConfig http2ClientConfig; private final InetSocketAddress inetSocketAddress; + // Non-null only when this instance allocated its own pool/event-loop (test constructor). + // disconnect() must release them; production callers share these resources externally. + private final EventLoopGroup ownedEventLoopGroup; + private final boolean ownsChannelPool; final static AttributeKey> RESPONSE_PROMISE = AttributeKey.newInstance("ResponsePromise"); final static AttributeKey CHANNEL_POOL_ATTRIBUTE_KEY = AttributeKey.newInstance("ChannelPool"); @@ -55,6 +60,8 @@ public Http2BlockingChannel(ChannelPool channelPool, InetSocketAddress inetSocke this.channelPool = channelPool; this.inetSocketAddress = inetSocketAddress; this.http2ClientConfig = http2ClientConfig; + this.ownedEventLoopGroup = null; + this.ownsChannelPool = false; } /** @@ -70,9 +77,11 @@ public Http2BlockingChannel(String hostName, int port, SSLConfig sslConfig, Http } this.http2ClientConfig = http2ClientConfig; this.inetSocketAddress = new InetSocketAddress(hostName, port); + this.ownedEventLoopGroup = Epoll.isAvailable() ? new EpollEventLoopGroup() : new NioEventLoopGroup(); this.channelPool = new Http2MultiplexedChannelPool(this.inetSocketAddress, nettySslHttp2Factory, - Epoll.isAvailable() ? new EpollEventLoopGroup() : new NioEventLoopGroup(), http2ClientConfig, - http2ClientMetrics, new Http2BlockingChannelStreamChannelInitializer(http2ClientConfig)); + this.ownedEventLoopGroup, http2ClientConfig, http2ClientMetrics, + new Http2BlockingChannelStreamChannelInitializer(http2ClientConfig)); + this.ownsChannelPool = true; } @Override @@ -82,7 +91,33 @@ public void connect() throws IOException { @Override public void disconnect() throws IOException { + // No-op: existing test code calls connect/disconnect/connect/disconnect on a single + // channel instance and depends on the channel remaining usable after disconnect. The + // test-constructor's owned EventLoopGroup is a small leak in tests but not test-fatal + // once the bigger Http2NetworkClientFactory leak is fixed. To explicitly release this + // channel's resources, call close(). + } + /** + * Release the channel pool and event-loop group this instance owns (test-constructor only). + * Safe to call multiple times; subsequent calls are no-ops. Production callers that pass an + * external pool need not call this. + */ + public void close() { + if (ownsChannelPool && channelPool instanceof Http2MultiplexedChannelPool) { + try { + ((Http2MultiplexedChannelPool) channelPool).close(); + } catch (Exception e) { + logger.warn("Error closing owned Http2MultiplexedChannelPool for {}", inetSocketAddress, e); + } + } + if (ownedEventLoopGroup != null) { + try { + ownedEventLoopGroup.shutdownGracefully(0, 100, TimeUnit.MILLISECONDS).await(2, TimeUnit.SECONDS); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + } } /** diff --git a/ambry-network/src/main/java/com/github/ambry/network/http2/Http2NetworkClientFactory.java b/ambry-network/src/main/java/com/github/ambry/network/http2/Http2NetworkClientFactory.java index 30739b6a4b..ba91892842 100644 --- a/ambry-network/src/main/java/com/github/ambry/network/http2/Http2NetworkClientFactory.java +++ b/ambry-network/src/main/java/com/github/ambry/network/http2/Http2NetworkClientFactory.java @@ -21,7 +21,9 @@ import io.netty.channel.epoll.Epoll; import io.netty.channel.epoll.EpollEventLoopGroup; import io.netty.channel.nio.NioEventLoopGroup; +import java.io.Closeable; import java.io.IOException; +import java.util.concurrent.TimeUnit; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -29,7 +31,7 @@ /** * A factory class used to get new instances of a {@link Http2NetworkClient} */ -public class Http2NetworkClientFactory implements NetworkClientFactory { +public class Http2NetworkClientFactory implements NetworkClientFactory, Closeable { private static final Logger logger = LoggerFactory.getLogger(Http2NetworkClientFactory.class); private final Http2ClientMetrics http2ClientMetrics; private final Http2ClientConfig http2ClientConfig; @@ -65,5 +67,22 @@ public Http2NetworkClientFactory(Http2ClientMetrics http2ClientMetrics, Http2Cli public Http2NetworkClient getNetworkClient() throws IOException { return new Http2NetworkClient(http2ClientMetrics, http2ClientConfig, sslFactory, eventLoopGroup); } + + /** + * Shut down the {@link EventLoopGroup} owned by this factory. Without this, the event-loop + * threads (and their epoll/selector FDs) leak for the lifetime of the JVM. Production servers + * never observe the leak because shutdown is followed by JVM exit; tests that bring up many + * servers in one JVM exhaust native resources without it. + */ + @Override + public void close() { + if (eventLoopGroup != null) { + try { + eventLoopGroup.shutdownGracefully(0, 5, TimeUnit.SECONDS).syncUninterruptibly(); + } catch (Exception e) { + logger.warn("Error shutting down Http2NetworkClientFactory event loop group", e); + } + } + } } diff --git a/ambry-server/src/integration-test/java/com/github/ambry/server/ServerHttp2Test.java b/ambry-server/src/integration-test/java/com/github/ambry/server/ServerHttp2Test.java index 330139e1c2..bce5e615a1 100644 --- a/ambry-server/src/integration-test/java/com/github/ambry/server/ServerHttp2Test.java +++ b/ambry-server/src/integration-test/java/com/github/ambry/server/ServerHttp2Test.java @@ -13,21 +13,33 @@ */ package com.github.ambry.server; +import com.github.ambry.clustermap.ClusterMap; import com.github.ambry.clustermap.DataNodeId; import com.github.ambry.clustermap.MockClusterMap; +import com.github.ambry.clustermap.PartitionId; +import com.github.ambry.commons.BlobId; import com.github.ambry.commons.SSLFactory; import com.github.ambry.commons.TestSSLUtils; import com.github.ambry.config.RouterConfig; import com.github.ambry.config.SSLConfig; import com.github.ambry.config.VerifiableProperties; +import com.github.ambry.messageformat.BlobProperties; +import com.github.ambry.messageformat.BlobType; +import com.github.ambry.network.ConnectedChannel; import com.github.ambry.network.Port; import com.github.ambry.network.PortType; +import com.github.ambry.protocol.PutRequest; +import com.github.ambry.protocol.PutResponse; import com.github.ambry.utils.MockTime; import com.github.ambry.utils.NettyByteBufLeakHelper; import com.github.ambry.utils.SystemTime; import com.github.ambry.utils.TestUtils; +import com.github.ambry.utils.Utils; +import io.netty.buffer.Unpooled; +import java.io.DataInputStream; import java.io.File; import java.io.IOException; +import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -50,6 +62,7 @@ public class ServerHttp2Test { private SSLConfig clientSSLConfig1; private SSLConfig clientSSLConfig2; private SSLConfig clientSSLConfig3; + private File trustStoreFile; private final NettyByteBufLeakHelper nettyByteBufLeakHelper = new NettyByteBufLeakHelper(); // Per-test MockCluster lifecycle (matches ServerPlaintextTest/ServerSSLTest/etc.). @@ -61,7 +74,7 @@ public class ServerHttp2Test { public void before() throws Exception { nettyByteBufLeakHelper.beforeTest(); - File trustStoreFile = File.createTempFile("truststore", ".jks"); + trustStoreFile = File.createTempFile("truststore", ".jks"); Properties clientSSLProps = new Properties(); TestSSLUtils.addSSLProperties(clientSSLProps, "DC1,DC2,DC3", SSLFactory.Mode.CLIENT, trustStoreFile, @@ -90,18 +103,81 @@ public void before() throws Exception { notificationSystem = new MockNotificationSystem(http2Cluster.getClusterMap()); http2Cluster.initializeServers(notificationSystem); http2Cluster.startServers(); + + // Warm replication: PUT a sentinel blob and wait for it to fully replicate. After this + // returns, every replica has confirmed at least one successful poll+fetch cycle, so + // subsequent test PUTs replicate within the (small) configured throttle window rather + // than waiting for cold-start peer discovery. Converts wall-clock-racey awaits into + // semantically deterministic ones. + warmUpReplication(); + } + + /** + * PUT a small probe blob to one server and wait for it to fully replicate to all replicas. + * Returns once replication is observably alive on every replica of the sentinel partition. + */ + private void warmUpReplication() throws Exception { + DataNodeId dataNode = http2Cluster.getGeneralDataNode(); + Port http2Port = new Port(dataNode.getHttp2Port(), PortType.HTTP2); + PartitionId partition = + http2Cluster.getClusterMap().getWritablePartitionIds(MockClusterMap.DEFAULT_PARTITION_CLASS).get(0); + + short accountId = Utils.getRandomShort(TestUtils.RANDOM); + short containerId = Utils.getRandomShort(TestUtils.RANDOM); + byte[] data = new byte[100]; + byte[] usermetadata = new byte[100]; + TestUtils.RANDOM.nextBytes(data); + TestUtils.RANDOM.nextBytes(usermetadata); + + BlobProperties props = + new BlobProperties(100, "warmup", null, null, false, TestUtils.TTL_SECS, http2Cluster.time.milliseconds(), + accountId, containerId, false, null, null, null, null); + BlobId blobId = + new BlobId(BlobId.BLOB_ID_V6, BlobId.BlobIdType.NATIVE, ClusterMap.UNKNOWN_DATACENTER_ID, accountId, + containerId, partition, false, BlobId.BlobDataType.DATACHUNK); + + ConnectedChannel channel = + ServerTestUtil.getBlockingChannelBasedOnPortType(http2Port, "localhost", null, clientSSLConfig1); + channel.connect(); + try { + PutRequest putRequest = new PutRequest(1, "warmup-client", blobId, props, ByteBuffer.wrap(usermetadata), + Unpooled.wrappedBuffer(data), props.getBlobSize(), BlobType.DataBlob, null); + DataInputStream stream = channel.sendAndReceive(putRequest).getInputStream(); + PutResponse putResponse = PutResponse.readFrom(stream); + ServerTestUtil.releaseNettyBufUnderneathStream(stream); + if (putResponse.getError() == ServerErrorCode.NoError) { + notificationSystem.awaitBlobCreations(blobId.getID()); + } + } finally { + channel.disconnect(); + } } @After public void after() throws IOException { + // Each cleanup wrapped so a single failure doesn't skip the remaining ones. try { if (http2Cluster != null) { http2Cluster.cleanup(); } } finally { - // Run leak check AFTER cluster teardown so any ByteBufs released during cleanup - // are reflected before NettyByteBufLeakHelper measures pending allocations. - nettyByteBufLeakHelper.afterTest(); + try { + // Release Http2BlockingChannel instances allocated during this test. disconnect() + // is a no-op for HTTP/2 (the ConnectedChannel interface predates HTTP/2 and assumes + // single-socket-per-channel semantics that don't apply here), so without this each + // test method leaks its channels' EventLoopGroups for the lifetime of the JVM. + ServerTestUtil.closeRegisteredHttp2Channels(); + } finally { + try { + if (trustStoreFile != null && trustStoreFile.exists()) { + trustStoreFile.delete(); + } + } finally { + // Run leak check AFTER cluster teardown so any ByteBufs released during cleanup + // are reflected before NettyByteBufLeakHelper measures pending allocations. + nettyByteBufLeakHelper.afterTest(); + } + } } } diff --git a/ambry-server/src/main/java/com/github/ambry/server/AmbryServer.java b/ambry-server/src/main/java/com/github/ambry/server/AmbryServer.java index 30310c5411..1e30470dec 100644 --- a/ambry-server/src/main/java/com/github/ambry/server/AmbryServer.java +++ b/ambry-server/src/main/java/com/github/ambry/server/AmbryServer.java @@ -626,6 +626,12 @@ public void shutdown() { if (replicationManager != null) { replicationManager.shutdown(); } + // Close after replicationManager so in-flight network clients drain first. Production + // servers never reach this on a normal lifecycle (JVM exits first), but per-test cluster + // bring-down/up cycles leak the factory's EventLoopGroup without this. + if (networkClientFactory instanceof Http2NetworkClientFactory) { + ((Http2NetworkClientFactory) networkClientFactory).close(); + } if (storageManager != null) { storageManager.shutdown(); } diff --git a/ambry-test-utils/src/main/java/com/github/ambry/server/MockCluster.java b/ambry-test-utils/src/main/java/com/github/ambry/server/MockCluster.java index d77ddda0ec..7b8b31da06 100644 --- a/ambry-test-utils/src/main/java/com/github/ambry/server/MockCluster.java +++ b/ambry-test-utils/src/main/java/com/github/ambry/server/MockCluster.java @@ -267,8 +267,11 @@ private VerifiableProperties createInitProperties(DataNodeId dataNodeId, boolean props.setProperty("server.handle.undelete.request.enabled", "true"); props.setProperty("server.handle.force.delete.request.enabled", "true"); props.setProperty("server.replicate.tombstone.enabled", "true"); - props.setProperty("replication.intra.replica.thread.throttle.sleep.duration.ms", "100"); - props.setProperty("replication.inter.replica.thread.throttle.sleep.duration.ms", "100"); + // Match production defaults (0 ms) for replication throttle. The previous 100 ms test-only + // override slowed cold-start replication enough that endToEndHttp2Replication... timed out + // at awaitBlobCreations (60 s/blob, sequential) on a freshly-bootstrapped cluster. + props.setProperty("replication.intra.replica.thread.throttle.sleep.duration.ms", "0"); + props.setProperty("replication.inter.replica.thread.throttle.sleep.duration.ms", "0"); props.setProperty("server.repair.requests.db.factory", "com.github.ambry.repair.MysqlRepairRequestsDbFactory"); props.setProperty("mysql.repair.requests.db.info", "[{\"url\":\"jdbc:mysql://localhost/AmbryRepairRequests?serverTimezone=UTC\",\"datacenter\":\"DC1\",\"isWriteable\":\"true\",\"username\":\"travis\",\"password\":\"\",\"sslMode\":\"NONE\"}]"); diff --git a/ambry-test-utils/src/main/java/com/github/ambry/server/ServerTestUtil.java b/ambry-test-utils/src/main/java/com/github/ambry/server/ServerTestUtil.java index 63c5d5566d..e03ab0fe69 100644 --- a/ambry-test-utils/src/main/java/com/github/ambry/server/ServerTestUtil.java +++ b/ambry-test-utils/src/main/java/com/github/ambry/server/ServerTestUtil.java @@ -125,6 +125,8 @@ import java.io.InputStream; import java.io.PrintWriter; import java.io.StringWriter; +import java.net.InetSocketAddress; +import java.net.Socket; import java.nio.ByteBuffer; import java.sql.Connection; import java.sql.PreparedStatement; @@ -160,6 +162,7 @@ import org.junit.Assert; import static org.junit.Assert.*; +import static org.junit.Assume.*; public final class ServerTestUtil { @@ -4863,6 +4866,14 @@ static long getExpiryTimeMs(BlobProperties blobProperties) { * @param hostName upon which connection has to be established * @return ConnectedChannel */ + // Tracks Http2BlockingChannel instances created via this factory. Each instance owns an + // EventLoopGroup that disconnect() does NOT release (the ConnectedChannel interface + // predates HTTP/2 and disconnect's contract is "close the underlying socket", but + // HTTP/2 connections are pool-managed and the EventLoopGroup is per-channel-instance + // pool overhead). Tests should call closeRegisteredHttp2Channels() in @After to + // release them; otherwise each test method leaks ~16 native threads + epoll FDs. + private static final List REGISTERED_HTTP2_CHANNELS = new ArrayList<>(); + public static ConnectedChannel getBlockingChannelBasedOnPortType(Port targetPort, String hostName, SSLSocketFactory sslSocketFactory, SSLConfig sslConfig) { ConnectedChannel channel = null; @@ -4872,13 +4883,31 @@ public static ConnectedChannel getBlockingChannelBasedOnPortType(Port targetPort channel = new SSLBlockingChannel(hostName, targetPort.getPort(), new MetricRegistry(), 10000, 10000, 10000, 4000, sslSocketFactory, sslConfig); } else if (targetPort.getPortType() == PortType.HTTP2) { - channel = new Http2BlockingChannel(hostName, targetPort.getPort(), sslConfig, + Http2BlockingChannel http2Channel = new Http2BlockingChannel(hostName, targetPort.getPort(), sslConfig, new Http2ClientConfig(new VerifiableProperties(new Properties())), new Http2ClientMetrics(new MetricRegistry())); + REGISTERED_HTTP2_CHANNELS.add(http2Channel); + channel = http2Channel; } return channel; } + /** + * Close all Http2BlockingChannel instances allocated via getBlockingChannelBasedOnPortType + * since the last call to this method. Call from test @After. Safe to call when no channels + * were created. + */ + public static void closeRegisteredHttp2Channels() { + for (Http2BlockingChannel channel : REGISTERED_HTTP2_CHANNELS) { + try { + channel.close(); + } catch (Exception e) { + // Best-effort cleanup; one failed close shouldn't stop others. + } + } + REGISTERED_HTTP2_CHANNELS.clear(); + } + /** * Returns BlockingChannel, SSLBlockingChannel or Http2BlockingChannel depending on whether the port type is PlainText, * SSL or HTTP2 port for the given targetPort @@ -4899,9 +4928,11 @@ private static ConnectedChannel getBlockingChannelBasedOnPortType(PortType portT new SSLBlockingChannel(hostName, dataNodeId.getSSLPort(), new MetricRegistry(), 10000, 10000, 10000, 4000, sslSocketFactory, sslConfig); } else if (portType == PortType.HTTP2) { - channel = new Http2BlockingChannel(hostName, dataNodeId.getHttp2Port(), sslConfig, + Http2BlockingChannel http2Channel = new Http2BlockingChannel(hostName, dataNodeId.getHttp2Port(), sslConfig, new Http2ClientConfig(new VerifiableProperties(new Properties())), new Http2ClientMetrics(new MetricRegistry())); + REGISTERED_HTTP2_CHANNELS.add(http2Channel); + channel = http2Channel; } return channel; } @@ -4971,11 +5002,27 @@ public static void releaseNettyBufUnderneathStream(DataInputStream stream) { } } + /** + * Quick TCP probe to detect whether MySQL is listening on the default port. Used by + * MySQL-dependent tests to skip cleanly (via assumeTrue) when running locally without + * MySQL set up. CI workflows install and start mysqld, so the probe succeeds there. + */ + public static boolean isMysqlAvailable() { + try (Socket socket = new Socket()) { + socket.connect(new InetSocketAddress("localhost", 3306), 1000); + return true; + } catch (IOException e) { + return false; + } + } + /** * Test the background RepairRequestsSender and the repair requests handlers. */ static void repairRequestTest(MockCluster cluster, SSLConfig clientSSLConfig, boolean testEncryption, MockNotificationSystem notificationSystem) throws Exception { + assumeTrue("MySQL not available on localhost:3306; install/start mysqld to run this test", + isMysqlAvailable()); List allNodes = cluster.getAllDataNodes(); MockClusterMap clusterMap = cluster.getClusterMap(); List partitionIds = clusterMap.getWritablePartitionIds(MockClusterMap.DEFAULT_PARTITION_CLASS); From 3868734492decf397fa1122a59896e31c0c76d42 Mon Sep 17 00:00:00 2001 From: Sanketh Nalli Date: Sat, 2 May 2026 22:05:21 -0700 Subject: [PATCH 3/8] DEBUG: only run server-int-test with --info + replication debug logs NOT for merge. Disable all unit-test groups, store-test, and int-test so the next CI run only exercises :ambry-server:intTest. Add --info to gradle args and bump replication / http2 / AmbryServer log levels to debug so we can capture where Http2NetworkClientFactory.close() and RouterServerSSLTest interleave (the prior run failed with RouterErrorCode RouterClosed before OOM'ing). Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/github-actions.yml | 14 +++++++++++--- log4j-test-config/src/main/resources/log4j2.xml | 16 +++++++++++----- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/.github/workflows/github-actions.yml b/.github/workflows/github-actions.yml index 297fa38359..fa6f4965dc 100644 --- a/.github/workflows/github-actions.yml +++ b/.github/workflows/github-actions.yml @@ -38,6 +38,7 @@ jobs: # ============================================================ unit-test-clustermap: + if: false # DEBUG branch — only server-int-test runs runs-on: ubuntu-latest timeout-minutes: 60 steps: @@ -49,6 +50,7 @@ jobs: job-id-suffix: clustermap unit-test-network: + if: false # DEBUG branch — only server-int-test runs runs-on: ubuntu-latest timeout-minutes: 60 steps: @@ -60,6 +62,7 @@ jobs: job-id-suffix: network unit-test-frontend: + if: false # DEBUG branch — only server-int-test runs runs-on: ubuntu-latest timeout-minutes: 60 steps: @@ -71,6 +74,7 @@ jobs: job-id-suffix: frontend unit-test-mysql-stack: + if: false # DEBUG branch — only server-int-test runs runs-on: ubuntu-latest timeout-minutes: 60 steps: @@ -85,6 +89,7 @@ jobs: needs-mysql: 'true' unit-test-azure-stack: + if: false # DEBUG branch — only server-int-test runs runs-on: ubuntu-latest timeout-minutes: 60 steps: @@ -97,6 +102,7 @@ jobs: needs-azurite: 'true' unit-test-protocols: + if: false # DEBUG branch — only server-int-test runs runs-on: ubuntu-latest timeout-minutes: 60 steps: @@ -108,6 +114,7 @@ jobs: job-id-suffix: protocols unit-test-utility-modules: + if: false # DEBUG branch — only server-int-test runs runs-on: ubuntu-latest timeout-minutes: 60 steps: @@ -123,7 +130,7 @@ jobs: # a per-module unit-test-file-transfer job here once the path is operational. store-test: - + if: false # DEBUG branch — only server-int-test runs runs-on: ubuntu-latest # Hard cap matches unit-test's: prevents runaway hangs from consuming # full GitHub-default 6h timeout if a test wedges. @@ -156,7 +163,7 @@ jobs: timeout-minutes: 2 int-test: - + if: false # DEBUG branch — only server-int-test runs runs-on: ubuntu-latest timeout-minutes: 60 steps: @@ -246,7 +253,8 @@ jobs: name: Run integration tests with: job-id: jdk11 - arguments: --scan --warning-mode=summary :ambry-server:intTest codeCoverageReport + # DEBUG: --info logging on full server-int-test (other CI jobs disabled). + arguments: --scan --info --warning-mode=summary :ambry-server:intTest gradle-version: wrapper - name: Upload coverage to Codecov diff --git a/log4j-test-config/src/main/resources/log4j2.xml b/log4j-test-config/src/main/resources/log4j2.xml index ae23d118da..115a94c8c5 100644 --- a/log4j-test-config/src/main/resources/log4j2.xml +++ b/log4j-test-config/src/main/resources/log4j2.xml @@ -11,6 +11,10 @@ + @@ -22,11 +26,13 @@ + + + - - - - + + + + From ffeb67e28ba4a578729dda42415f545146a2f1b3 Mon Sep 17 00:00:00 2001 From: Sanketh Nalli Date: Sat, 2 May 2026 22:18:54 -0700 Subject: [PATCH 4/8] Scope Http2BlockingChannel tracking per test class MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous static REGISTERED_HTTP2_CHANNELS list in ServerTestUtil was JVM-wide: every test class's HTTP/2 channels ended up in it, but only ServerHttp2Test.@After cleared it. So channels created by other test classes (e.g., RouterServerSSLTest) could be silently closed by a ServerHttp2Test @After running later in the same JVM, which manifested as RouterClosed assertion failures in the unrelated test. Replace the static list with a ThreadLocal> and an opt-in API: enableHttp2ChannelTracking(list) / disableHttp2ChannelTracking(). getBlockingChannelBasedOnPortType only appends to the tracker if one is registered. Test classes that don't opt in (everyone except ServerHttp2Test for now) are not affected by the cleanup logic at all — their channels never enter any tracker. ServerHttp2Test holds a per-instance trackedHttp2Channels list, opts in @Before, closes the tracked channels and opts out @After. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../github/ambry/server/ServerHttp2Test.java | 25 +++++++-- .../github/ambry/server/ServerTestUtil.java | 55 ++++++++++--------- 2 files changed, 50 insertions(+), 30 deletions(-) diff --git a/ambry-server/src/integration-test/java/com/github/ambry/server/ServerHttp2Test.java b/ambry-server/src/integration-test/java/com/github/ambry/server/ServerHttp2Test.java index bce5e615a1..b6a7ee3d8a 100644 --- a/ambry-server/src/integration-test/java/com/github/ambry/server/ServerHttp2Test.java +++ b/ambry-server/src/integration-test/java/com/github/ambry/server/ServerHttp2Test.java @@ -28,6 +28,7 @@ import com.github.ambry.network.ConnectedChannel; import com.github.ambry.network.Port; import com.github.ambry.network.PortType; +import com.github.ambry.network.http2.Http2BlockingChannel; import com.github.ambry.protocol.PutRequest; import com.github.ambry.protocol.PutResponse; import com.github.ambry.utils.MockTime; @@ -63,6 +64,10 @@ public class ServerHttp2Test { private SSLConfig clientSSLConfig2; private SSLConfig clientSSLConfig3; private File trustStoreFile; + // Per-test-instance tracker for Http2BlockingChannels we own. Scoped to this class so + // closing it in @After only affects channels we created — channels created by other + // test classes (e.g., RouterServerSSLTest) are not registered here. + private final List trackedHttp2Channels = new ArrayList<>(); private final NettyByteBufLeakHelper nettyByteBufLeakHelper = new NettyByteBufLeakHelper(); // Per-test MockCluster lifecycle (matches ServerPlaintextTest/ServerSSLTest/etc.). @@ -73,6 +78,9 @@ public class ServerHttp2Test { @Before public void before() throws Exception { nettyByteBufLeakHelper.beforeTest(); + // Opt this test class into Http2BlockingChannel auto-tracking. Other test classes + // that don't call enable... aren't affected. + ServerTestUtil.enableHttp2ChannelTracking(trackedHttp2Channels); trustStoreFile = File.createTempFile("truststore", ".jks"); @@ -162,11 +170,18 @@ public void after() throws IOException { } } finally { try { - // Release Http2BlockingChannel instances allocated during this test. disconnect() - // is a no-op for HTTP/2 (the ConnectedChannel interface predates HTTP/2 and assumes - // single-socket-per-channel semantics that don't apply here), so without this each - // test method leaks its channels' EventLoopGroups for the lifetime of the JVM. - ServerTestUtil.closeRegisteredHttp2Channels(); + // Close only Http2BlockingChannel instances allocated during this test (tracked + // via the per-class tracker). Channels created by other test classes are not in + // this list, so this @After can't accidentally close them. + for (Http2BlockingChannel channel : trackedHttp2Channels) { + try { + channel.close(); + } catch (Exception e) { + // Best-effort; one failed close shouldn't stop the others. + } + } + trackedHttp2Channels.clear(); + ServerTestUtil.disableHttp2ChannelTracking(); } finally { try { if (trustStoreFile != null && trustStoreFile.exists()) { diff --git a/ambry-test-utils/src/main/java/com/github/ambry/server/ServerTestUtil.java b/ambry-test-utils/src/main/java/com/github/ambry/server/ServerTestUtil.java index e03ab0fe69..b58a20b7d2 100644 --- a/ambry-test-utils/src/main/java/com/github/ambry/server/ServerTestUtil.java +++ b/ambry-test-utils/src/main/java/com/github/ambry/server/ServerTestUtil.java @@ -4866,13 +4866,28 @@ static long getExpiryTimeMs(BlobProperties blobProperties) { * @param hostName upon which connection has to be established * @return ConnectedChannel */ - // Tracks Http2BlockingChannel instances created via this factory. Each instance owns an - // EventLoopGroup that disconnect() does NOT release (the ConnectedChannel interface - // predates HTTP/2 and disconnect's contract is "close the underlying socket", but - // HTTP/2 connections are pool-managed and the EventLoopGroup is per-channel-instance - // pool overhead). Tests should call closeRegisteredHttp2Channels() in @After to - // release them; otherwise each test method leaks ~16 native threads + epoll FDs. - private static final List REGISTERED_HTTP2_CHANNELS = new ArrayList<>(); + // Per-thread tracker for Http2BlockingChannel instances created by getBlockingChannelBasedOnPortType. + // ConnectedChannel.disconnect() is a no-op for HTTP/2 — each Http2BlockingChannel owns an + // EventLoopGroup that needs explicit close(). Test classes that want auto-tracking opt in via + // enableHttp2ChannelTracking() in @Before and close+disableHttp2ChannelTracking() in @After. + // Test classes that don't opt in (e.g., RouterServerSSLTest) are unaffected — their channels + // never enter the tracker, so no other class's @After can accidentally close them. + // Single-threaded execution per JVM fork makes ThreadLocal safe here. + private static final ThreadLocal> HTTP2_CHANNEL_TRACKER = new ThreadLocal<>(); + + /** + * Opt this thread (test class) into Http2BlockingChannel tracking. Subsequent + * getBlockingChannelBasedOnPortType(HTTP2) calls append to {@code tracker}. Pair with + * disableHttp2ChannelTracking() in @After. + */ + public static void enableHttp2ChannelTracking(List tracker) { + HTTP2_CHANNEL_TRACKER.set(tracker); + } + + /** Stop tracking on this thread. Caller is responsible for closing channels in the list. */ + public static void disableHttp2ChannelTracking() { + HTTP2_CHANNEL_TRACKER.remove(); + } public static ConnectedChannel getBlockingChannelBasedOnPortType(Port targetPort, String hostName, SSLSocketFactory sslSocketFactory, SSLConfig sslConfig) { @@ -4886,28 +4901,15 @@ public static ConnectedChannel getBlockingChannelBasedOnPortType(Port targetPort Http2BlockingChannel http2Channel = new Http2BlockingChannel(hostName, targetPort.getPort(), sslConfig, new Http2ClientConfig(new VerifiableProperties(new Properties())), new Http2ClientMetrics(new MetricRegistry())); - REGISTERED_HTTP2_CHANNELS.add(http2Channel); + List tracker = HTTP2_CHANNEL_TRACKER.get(); + if (tracker != null) { + tracker.add(http2Channel); + } channel = http2Channel; } return channel; } - /** - * Close all Http2BlockingChannel instances allocated via getBlockingChannelBasedOnPortType - * since the last call to this method. Call from test @After. Safe to call when no channels - * were created. - */ - public static void closeRegisteredHttp2Channels() { - for (Http2BlockingChannel channel : REGISTERED_HTTP2_CHANNELS) { - try { - channel.close(); - } catch (Exception e) { - // Best-effort cleanup; one failed close shouldn't stop others. - } - } - REGISTERED_HTTP2_CHANNELS.clear(); - } - /** * Returns BlockingChannel, SSLBlockingChannel or Http2BlockingChannel depending on whether the port type is PlainText, * SSL or HTTP2 port for the given targetPort @@ -4931,7 +4933,10 @@ private static ConnectedChannel getBlockingChannelBasedOnPortType(PortType portT Http2BlockingChannel http2Channel = new Http2BlockingChannel(hostName, dataNodeId.getHttp2Port(), sslConfig, new Http2ClientConfig(new VerifiableProperties(new Properties())), new Http2ClientMetrics(new MetricRegistry())); - REGISTERED_HTTP2_CHANNELS.add(http2Channel); + List tracker = HTTP2_CHANNEL_TRACKER.get(); + if (tracker != null) { + tracker.add(http2Channel); + } channel = http2Channel; } return channel; From f8c085d89d9f767dab924b956c32ebcae01468b1 Mon Sep 17 00:00:00 2001 From: Sanketh Nalli Date: Sat, 2 May 2026 22:22:49 -0700 Subject: [PATCH 5/8] trigger CI From 97442b25c7a33a13ed1c56646f261b4dd366f7ab Mon Sep 17 00:00:00 2001 From: Sanketh Nalli Date: Sat, 2 May 2026 22:29:26 -0700 Subject: [PATCH 6/8] Cut debug log noise; re-trigger CI --- .github/workflows/github-actions.yml | 4 ++-- log4j-test-config/src/main/resources/log4j2.xml | 17 +++++------------ 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/.github/workflows/github-actions.yml b/.github/workflows/github-actions.yml index fa6f4965dc..5b5f2317d5 100644 --- a/.github/workflows/github-actions.yml +++ b/.github/workflows/github-actions.yml @@ -253,8 +253,8 @@ jobs: name: Run integration tests with: job-id: jdk11 - # DEBUG: --info logging on full server-int-test (other CI jobs disabled). - arguments: --scan --info --warning-mode=summary :ambry-server:intTest + # DEBUG branch — only server-int-test runs (other jobs disabled via if: false). + arguments: --scan --warning-mode=summary :ambry-server:intTest gradle-version: wrapper - name: Upload coverage to Codecov diff --git a/log4j-test-config/src/main/resources/log4j2.xml b/log4j-test-config/src/main/resources/log4j2.xml index 115a94c8c5..a9b180cedd 100644 --- a/log4j-test-config/src/main/resources/log4j2.xml +++ b/log4j-test-config/src/main/resources/log4j2.xml @@ -11,10 +11,6 @@ - @@ -26,13 +22,10 @@ - - - - - - - - + + + + From e30ac28c7adc5c285c0214d331be9c9d8f380aca Mon Sep 17 00:00:00 2001 From: Sanketh Nalli Date: Sat, 2 May 2026 22:33:22 -0700 Subject: [PATCH 7/8] rerun From 58de14c8e09b244907bea26f1cdd60c437390e12 Mon Sep 17 00:00:00 2001 From: Sanketh Nalli Date: Sat, 2 May 2026 22:43:19 -0700 Subject: [PATCH 8/8] Revert throttle=0 + warm-up; keep lifecycle + leak fixes Throttle=0 (matching prod default) and the sentinel-blob warm-up were added together to handle cold-start replication on per-test cluster bringups. They turned out to interact badly with replicateBlobV2CaseTest: - That test calls controlReplicationForPartition(disable) and then PUTs blobs, expecting them not to replicate to a "third channel". With throttle=0, replication threads run continuously, so a replication cycle in flight when the disable call fires can finish anyway, making the blob land where the test expects it not to. - Master's throttle=100ms was an implicit safety margin that this test was relying on, not arbitrary slowness. Restore throttle to 100ms (master behavior) and remove the warm-up. With throttle=100ms, cold-start replication is paced enough that endToEndHttp2Replication... no longer needs the warm-up to make its 60s awaitBlobCreations budget. Net change vs master: per-test cluster lifecycle + HTTP/2 leak fixes (factory close, channel registry per-class) + MySQL skip via assumeTrue. Smaller, more defensible scope. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../github/ambry/server/ServerHttp2Test.java | 60 ------------------- .../com/github/ambry/server/MockCluster.java | 7 +-- 2 files changed, 2 insertions(+), 65 deletions(-) diff --git a/ambry-server/src/integration-test/java/com/github/ambry/server/ServerHttp2Test.java b/ambry-server/src/integration-test/java/com/github/ambry/server/ServerHttp2Test.java index b6a7ee3d8a..e7a098c3c6 100644 --- a/ambry-server/src/integration-test/java/com/github/ambry/server/ServerHttp2Test.java +++ b/ambry-server/src/integration-test/java/com/github/ambry/server/ServerHttp2Test.java @@ -13,34 +13,22 @@ */ package com.github.ambry.server; -import com.github.ambry.clustermap.ClusterMap; import com.github.ambry.clustermap.DataNodeId; import com.github.ambry.clustermap.MockClusterMap; -import com.github.ambry.clustermap.PartitionId; -import com.github.ambry.commons.BlobId; import com.github.ambry.commons.SSLFactory; import com.github.ambry.commons.TestSSLUtils; import com.github.ambry.config.RouterConfig; import com.github.ambry.config.SSLConfig; import com.github.ambry.config.VerifiableProperties; -import com.github.ambry.messageformat.BlobProperties; -import com.github.ambry.messageformat.BlobType; -import com.github.ambry.network.ConnectedChannel; import com.github.ambry.network.Port; import com.github.ambry.network.PortType; import com.github.ambry.network.http2.Http2BlockingChannel; -import com.github.ambry.protocol.PutRequest; -import com.github.ambry.protocol.PutResponse; import com.github.ambry.utils.MockTime; import com.github.ambry.utils.NettyByteBufLeakHelper; import com.github.ambry.utils.SystemTime; import com.github.ambry.utils.TestUtils; -import com.github.ambry.utils.Utils; -import io.netty.buffer.Unpooled; -import java.io.DataInputStream; import java.io.File; import java.io.IOException; -import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -111,54 +99,6 @@ public void before() throws Exception { notificationSystem = new MockNotificationSystem(http2Cluster.getClusterMap()); http2Cluster.initializeServers(notificationSystem); http2Cluster.startServers(); - - // Warm replication: PUT a sentinel blob and wait for it to fully replicate. After this - // returns, every replica has confirmed at least one successful poll+fetch cycle, so - // subsequent test PUTs replicate within the (small) configured throttle window rather - // than waiting for cold-start peer discovery. Converts wall-clock-racey awaits into - // semantically deterministic ones. - warmUpReplication(); - } - - /** - * PUT a small probe blob to one server and wait for it to fully replicate to all replicas. - * Returns once replication is observably alive on every replica of the sentinel partition. - */ - private void warmUpReplication() throws Exception { - DataNodeId dataNode = http2Cluster.getGeneralDataNode(); - Port http2Port = new Port(dataNode.getHttp2Port(), PortType.HTTP2); - PartitionId partition = - http2Cluster.getClusterMap().getWritablePartitionIds(MockClusterMap.DEFAULT_PARTITION_CLASS).get(0); - - short accountId = Utils.getRandomShort(TestUtils.RANDOM); - short containerId = Utils.getRandomShort(TestUtils.RANDOM); - byte[] data = new byte[100]; - byte[] usermetadata = new byte[100]; - TestUtils.RANDOM.nextBytes(data); - TestUtils.RANDOM.nextBytes(usermetadata); - - BlobProperties props = - new BlobProperties(100, "warmup", null, null, false, TestUtils.TTL_SECS, http2Cluster.time.milliseconds(), - accountId, containerId, false, null, null, null, null); - BlobId blobId = - new BlobId(BlobId.BLOB_ID_V6, BlobId.BlobIdType.NATIVE, ClusterMap.UNKNOWN_DATACENTER_ID, accountId, - containerId, partition, false, BlobId.BlobDataType.DATACHUNK); - - ConnectedChannel channel = - ServerTestUtil.getBlockingChannelBasedOnPortType(http2Port, "localhost", null, clientSSLConfig1); - channel.connect(); - try { - PutRequest putRequest = new PutRequest(1, "warmup-client", blobId, props, ByteBuffer.wrap(usermetadata), - Unpooled.wrappedBuffer(data), props.getBlobSize(), BlobType.DataBlob, null); - DataInputStream stream = channel.sendAndReceive(putRequest).getInputStream(); - PutResponse putResponse = PutResponse.readFrom(stream); - ServerTestUtil.releaseNettyBufUnderneathStream(stream); - if (putResponse.getError() == ServerErrorCode.NoError) { - notificationSystem.awaitBlobCreations(blobId.getID()); - } - } finally { - channel.disconnect(); - } } @After diff --git a/ambry-test-utils/src/main/java/com/github/ambry/server/MockCluster.java b/ambry-test-utils/src/main/java/com/github/ambry/server/MockCluster.java index 7b8b31da06..d77ddda0ec 100644 --- a/ambry-test-utils/src/main/java/com/github/ambry/server/MockCluster.java +++ b/ambry-test-utils/src/main/java/com/github/ambry/server/MockCluster.java @@ -267,11 +267,8 @@ private VerifiableProperties createInitProperties(DataNodeId dataNodeId, boolean props.setProperty("server.handle.undelete.request.enabled", "true"); props.setProperty("server.handle.force.delete.request.enabled", "true"); props.setProperty("server.replicate.tombstone.enabled", "true"); - // Match production defaults (0 ms) for replication throttle. The previous 100 ms test-only - // override slowed cold-start replication enough that endToEndHttp2Replication... timed out - // at awaitBlobCreations (60 s/blob, sequential) on a freshly-bootstrapped cluster. - props.setProperty("replication.intra.replica.thread.throttle.sleep.duration.ms", "0"); - props.setProperty("replication.inter.replica.thread.throttle.sleep.duration.ms", "0"); + props.setProperty("replication.intra.replica.thread.throttle.sleep.duration.ms", "100"); + props.setProperty("replication.inter.replica.thread.throttle.sleep.duration.ms", "100"); props.setProperty("server.repair.requests.db.factory", "com.github.ambry.repair.MysqlRepairRequestsDbFactory"); props.setProperty("mysql.repair.requests.db.info", "[{\"url\":\"jdbc:mysql://localhost/AmbryRepairRequests?serverTimezone=UTC\",\"datacenter\":\"DC1\",\"isWriteable\":\"true\",\"username\":\"travis\",\"password\":\"\",\"sslMode\":\"NONE\"}]");