diff --git a/hbase-balancer/src/main/java/org/apache/hadoop/hbase/master/balancer/StochasticLoadBalancer.java b/hbase-balancer/src/main/java/org/apache/hadoop/hbase/master/balancer/StochasticLoadBalancer.java index f2b2240a174e..102e5c9c5f2b 100644 --- a/hbase-balancer/src/main/java/org/apache/hadoop/hbase/master/balancer/StochasticLoadBalancer.java +++ b/hbase-balancer/src/main/java/org/apache/hadoop/hbase/master/balancer/StochasticLoadBalancer.java @@ -581,7 +581,6 @@ protected List balanceTable(TableName tableName, rackManager, regionCacheRatioOnOldServerMap); long startTime = EnvironmentEdgeManager.currentTime(); - cluster.setStopRequestedAt(startTime + maxRunningTime); initCosts(cluster); balancerConditionals.loadClusterState(cluster); @@ -632,6 +631,10 @@ protected List balanceTable(TableName tableName, currentCost / sumMultiplier, functionCost(), computedMaxSteps); final String initFunctionTotalCosts = totalCostsPerFunc(); + long searchStartTime = EnvironmentEdgeManager.currentTime(); + // Budget maxRunningTime for the stochastic walk only; initialization (cluster costs, etc.) + // can be substantial on busy hosts and must not consume the search deadline. + cluster.setStopRequestedAt(searchStartTime + maxRunningTime); // Perform a stochastic walk to see if we can get a good fit. long step; boolean planImprovedConditionals = false; diff --git a/hbase-common/src/test/java/org/apache/hadoop/hbase/HBaseJupiterExtension.java b/hbase-common/src/test/java/org/apache/hadoop/hbase/HBaseJupiterExtension.java index 9d4ea87e0ec1..057c4642ffaf 100644 --- a/hbase-common/src/test/java/org/apache/hadoop/hbase/HBaseJupiterExtension.java +++ b/hbase-common/src/test/java/org/apache/hadoop/hbase/HBaseJupiterExtension.java @@ -84,7 +84,7 @@ public class HBaseJupiterExtension implements InvocationInterceptor, BeforeAllCa private static final Map TAG_TO_TIMEOUT = ImmutableMap.of(SmallTests.TAG, Duration.ofMinutes(3), MediumTests.TAG, Duration.ofMinutes(6), - LargeTests.TAG, Duration.ofMinutes(13), IntegrationTests.TAG, Duration.ZERO); + LargeTests.TAG, Duration.ofMinutes(20), IntegrationTests.TAG, Duration.ZERO); private static final String EXECUTOR = "executor"; diff --git a/hbase-compression/pom.xml b/hbase-compression/pom.xml index c2e4633b3987..f829c174a044 100644 --- a/hbase-compression/pom.xml +++ b/hbase-compression/pom.xml @@ -45,6 +45,11 @@ hbase-resource-bundle true + + org.junit.jupiter + junit-jupiter-api + test + diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtil.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtil.java index 365855bbd3cf..e3f1bc3a071b 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtil.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtil.java @@ -434,7 +434,13 @@ private void createSubDirAndSystemProperty(String propertyName, Path parent, Str String sysValue = System.getProperty(propertyName); - if (sysValue != null) { + // Check if directory sharing should be disabled for this test. + // Tests that run with high parallelism and don't need shared directories can set this + // to avoid race conditions where one test's tearDown() deletes directories another test + // is still using. + boolean disableSharing = conf.getBoolean("hbase.test.disable-directory-sharing", false); + + if (sysValue != null && !disableSharing) { // There is already a value set. So we do nothing but hope // that there will be no conflicts LOG.info("System.getProperty(\"" + propertyName + "\") already set to: " + sysValue @@ -447,7 +453,7 @@ private void createSubDirAndSystemProperty(String propertyName, Path parent, Str } conf.set(propertyName, sysValue); } else { - // Ok, it's not set, so we create it as a subdirectory + // Ok, it's not set (or sharing is disabled), so we create it as a subdirectory createSubDir(propertyName, parent, subDirName); System.setProperty(propertyName, conf.get(propertyName)); } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java index fc5822477937..8b9736aefb69 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java @@ -75,7 +75,6 @@ public static void setUpBeforeClass() throws Exception { conf.setInt(HConstants.ZK_SESSION_TIMEOUT, 1000); conf.setClass(HConstants.HBASE_MASTER_LOADBALANCER_CLASS, MockLoadBalancer.class, LoadBalancer.class); - TEST_UTIL.startMiniDFSCluster(2); } @AfterAll diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/AbstractTestAsyncTableScan.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/AbstractTestAsyncTableScan.java index 3797c5b5de5a..3f8acf4fd73a 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/AbstractTestAsyncTableScan.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/AbstractTestAsyncTableScan.java @@ -73,8 +73,19 @@ public abstract class AbstractTestAsyncTableScan { protected static final OpenTelemetryClassRule OTEL_CLASS_RULE = OpenTelemetryClassRule.create(); - protected static final MiniClusterRule MINI_CLUSTER_RULE = MiniClusterRule.newBuilder() - .setMiniClusterOption(StartTestingClusterOption.builder().numWorkers(3).build()).build(); + + private static Configuration createConfiguration() { + Configuration conf = new Configuration(); + // Disable directory sharing to prevent race conditions when tests run in parallel. + // Each test instance gets its own isolated directories to avoid one test's tearDown() + // deleting directories another parallel test is still using. + conf.setBoolean("hbase.test.disable-directory-sharing", true); + return conf; + } + + protected static final MiniClusterRule MINI_CLUSTER_RULE = + MiniClusterRule.newBuilder().setConfiguration(createConfiguration()) + .setMiniClusterOption(StartTestingClusterOption.builder().numWorkers(3).build()).build(); protected static final ConnectionRule CONN_RULE = ConnectionRule.createAsyncConnectionRule(MINI_CLUSTER_RULE::createAsyncConnection); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestConnection.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestConnection.java index 42d5a87a9cec..ee60c7af67d4 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestConnection.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestConnection.java @@ -87,6 +87,10 @@ public class TestConnection { @BeforeClass public static void setUpBeforeClass() throws Exception { ResourceLeakDetector.setLevel(Level.PARANOID); + // Disable directory sharing to prevent race conditions when tests run in parallel. + // Each test instance gets its own isolated directories to avoid one test's tearDown() + // deleting directories another parallel test is still using. + TEST_UTIL.getConfiguration().setBoolean("hbase.test.disable-directory-sharing", true); TEST_UTIL.getConfiguration().setBoolean(HConstants.STATUS_PUBLISHED, true); // Up the handlers; this test needs more than usual. TEST_UTIL.getConfiguration().setInt(HConstants.REGION_SERVER_HIGH_PRIORITY_HANDLER_COUNT, 10); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/io/hfile/bucket/TestPrefetchWithBucketCache.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/io/hfile/bucket/TestPrefetchWithBucketCache.java index 8e341979a592..b3dc54cba54a 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/io/hfile/bucket/TestPrefetchWithBucketCache.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/io/hfile/bucket/TestPrefetchWithBucketCache.java @@ -22,7 +22,6 @@ import static org.apache.hadoop.hbase.io.hfile.BlockCacheFactory.BUCKET_CACHE_BUCKETS_KEY; import static org.apache.hadoop.hbase.io.hfile.bucket.BucketCache.QUEUE_ADDITION_WAIT_TIME; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; @@ -328,7 +327,8 @@ public void testPrefetchRunTriggersEvictions() throws Exception { conf.setLong(QUEUE_ADDITION_WAIT_TIME, 0); blockCache = BlockCacheFactory.createBlockCache(conf); cacheConf = new CacheConfig(conf, blockCache); - Path storeFile = writeStoreFile("testPrefetchInterruptOnCapacity", 10000); + // Use 15000 KVs to ensure file reliably exceeds 1MB cache capacity even with size variance + Path storeFile = writeStoreFile("testPrefetchRunTriggersEvictions", 15000); // Prefetches the file blocks createReaderAndWaitForPrefetchInterruption(storeFile); Waiter.waitFor(conf, (PrefetchExecutor.getPrefetchDelay() + 1000), @@ -343,14 +343,16 @@ public void testPrefetchRunTriggersEvictions() throws Exception { } return true; }); - if (bc.getStats().getFailedInserts() == 0) { - // With no wait time configuration, prefetch should trigger evictions once it reaches - // cache capacity - assertNotEquals(0, bc.getStats().getEvictedCount()); - } else { - LOG.info("We had {} cache insert failures, which may cause cache usage " - + "to never reach capacity.", bc.getStats().getFailedInserts()); - } + // With no wait time configuration, prefetch will either trigger evictions when reaching + // cache capacity, or have failed inserts when the writer queue fills faster than it drains. + // Both outcomes are valid - test should only fail if NEITHER happens, which would indicate + // a problem with the capacity management logic. + long evictions = bc.getStats().getEvictedCount(); + long failedInserts = bc.getStats().getFailedInserts(); + assertTrue( + "Expected either evictions or failed inserts to demonstrate capacity management, " + + "but got evictions=" + evictions + ", failedInserts=" + failedInserts, + evictions > 0 || failedInserts > 0); } @Test diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestRollbackSCP.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestRollbackSCP.java index 14863515392f..e0b3cd175e8d 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestRollbackSCP.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestRollbackSCP.java @@ -133,6 +133,11 @@ public static void tearDownAfterClass() throws IOException { @BeforeEach public void setUp() throws IOException { UTIL.ensureSomeNonStoppedRegionServersAvailable(2); + // Surefire reruns failed tests in the same JVM without re-running @BeforeClass. Reset injection + // state so compareAndSet in persistToMeta can succeed again and kill-before-store flags clear. + INJECTED.set(false); + ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdateInRollback( + UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor(), false); } private ServerCrashProcedure getSCPForServer(ServerName serverName) throws IOException { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/quotas/TestBlockBytesScannedQuota.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/quotas/TestBlockBytesScannedQuota.java index 79e84349292c..67cafa53c5f4 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/quotas/TestBlockBytesScannedQuota.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/quotas/TestBlockBytesScannedQuota.java @@ -260,7 +260,7 @@ public void testBBSMultiGet() throws Exception { private void testTraffic(Callable trafficCallable, long expectedSuccess, long marginOfError) throws Exception { - TEST_UTIL.waitFor(5_000, () -> { + TEST_UTIL.waitFor(30_000, () -> { long actualSuccess; try { actualSuccess = trafficCallable.call(); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestHRegion.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestHRegion.java index d5de6418a8bc..aea71b02ff40 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestHRegion.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestHRegion.java @@ -265,6 +265,10 @@ public class TestHRegion { public void setup() throws IOException { TEST_UTIL = new HBaseTestingUtil(); CONF = TEST_UTIL.getConfiguration(); + // Disable directory sharing to prevent race conditions when tests run in parallel. + // Each test instance gets its own isolated directories to avoid one test's tearDown() + // deleting directories another parallel test is still using. + CONF.setBoolean("hbase.test.disable-directory-sharing", true); NettyAsyncFSWALConfigHelper.setEventLoopConfig(CONF, GROUP, NioSocketChannel.class); dir = TEST_UTIL.getDataTestDir("TestHRegion").toString(); method = name.getMethodName(); @@ -1351,18 +1355,23 @@ public void testGetWhileRegionClose() throws IOException { threads[i].start(); } } finally { + done.set(true); + for (GetTillDoneOrException t : threads) { + if (t != null) { + try { + t.join(5000); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + } if (this.region != null) { HBaseTestingUtil.closeRegionAndWAL(this.region); this.region = null; } } - done.set(true); + // Check for errors after threads have been stopped for (GetTillDoneOrException t : threads) { - try { - t.join(); - } catch (InterruptedException e) { - e.printStackTrace(); - } if (t.e != null) { LOG.info("Exception=" + t.e); assertFalse("Found a NPE in " + t.getName(), t.e instanceof NullPointerException); @@ -1390,7 +1399,7 @@ class GetTillDoneOrException extends Thread { @Override public void run() { - while (!this.done.get()) { + while (!this.done.get() && !Thread.currentThread().isInterrupted()) { try { assertTrue(region.get(g).size() > 0); this.count.incrementAndGet(); @@ -4574,7 +4583,7 @@ public void checkNoError() { @Override public void run() { done = false; - while (!done) { + while (!done && !Thread.currentThread().isInterrupted()) { synchronized (this) { try { wait(); @@ -4790,7 +4799,7 @@ public void checkNoError() { @Override public void run() { done = false; - while (!done) { + while (!done && !Thread.currentThread().isInterrupted()) { try { for (int r = 0; r < numRows; r++) { byte[] row = Bytes.toBytes("row" + r); @@ -5327,7 +5336,7 @@ public void testParallelIncrementWithMemStoreFlush() throws Exception { Runnable flusher = new Runnable() { @Override public void run() { - while (!incrementDone.get()) { + while (!incrementDone.get() && !Thread.currentThread().isInterrupted()) { try { region.flush(true); } catch (Exception e) { @@ -5343,28 +5352,39 @@ public void run() { long expected = (long) threadNum * incCounter; Thread[] incrementers = new Thread[threadNum]; Thread flushThread = new Thread(flusher); + flushThread.setName("FlushThread-" + method); for (int i = 0; i < threadNum; i++) { incrementers[i] = new Thread(new Incrementer(this.region, incCounter)); incrementers[i].start(); } flushThread.start(); - for (int i = 0; i < threadNum; i++) { - incrementers[i].join(); - } + try { + for (int i = 0; i < threadNum; i++) { + incrementers[i].join(); + } - incrementDone.set(true); - flushThread.join(); + incrementDone.set(true); + flushThread.join(); - Get get = new Get(Incrementer.incRow); - get.addColumn(Incrementer.family, Incrementer.qualifier); - get.readVersions(1); - Result res = this.region.get(get); - List kvs = res.getColumnCells(Incrementer.family, Incrementer.qualifier); + Get get = new Get(Incrementer.incRow); + get.addColumn(Incrementer.family, Incrementer.qualifier); + get.readVersions(1); + Result res = this.region.get(get); + List kvs = res.getColumnCells(Incrementer.family, Incrementer.qualifier); - // we just got the latest version - assertEquals(1, kvs.size()); - Cell kv = kvs.get(0); - assertEquals(expected, Bytes.toLong(kv.getValueArray(), kv.getValueOffset())); + // we just got the latest version + assertEquals(1, kvs.size()); + Cell kv = kvs.get(0); + assertEquals(expected, Bytes.toLong(kv.getValueArray(), kv.getValueOffset())); + } finally { + // Ensure flush thread is stopped even if test fails or times out + incrementDone.set(true); + flushThread.interrupt(); + flushThread.join(5000); // Wait up to 5 seconds for thread to stop + if (flushThread.isAlive()) { + LOG.warn("Flush thread did not stop within timeout for test " + method); + } + } } /** @@ -5412,7 +5432,7 @@ public void testParallelAppendWithMemStoreFlush() throws Exception { Runnable flusher = new Runnable() { @Override public void run() { - while (!appendDone.get()) { + while (!appendDone.get() && !Thread.currentThread().isInterrupted()) { try { region.flush(true); } catch (Exception e) { @@ -5432,30 +5452,42 @@ public void run() { } Thread[] appenders = new Thread[threadNum]; Thread flushThread = new Thread(flusher); + flushThread.setName("FlushThread-" + method); for (int i = 0; i < threadNum; i++) { appenders[i] = new Thread(new Appender(this.region, appendCounter)); appenders[i].start(); } flushThread.start(); - for (int i = 0; i < threadNum; i++) { - appenders[i].join(); - } - - appendDone.set(true); - flushThread.join(); - - Get get = new Get(Appender.appendRow); - get.addColumn(Appender.family, Appender.qualifier); - get.readVersions(1); - Result res = this.region.get(get); - List kvs = res.getColumnCells(Appender.family, Appender.qualifier); + try { + for (int i = 0; i < threadNum; i++) { + appenders[i].join(); + } - // we just got the latest version - assertEquals(1, kvs.size()); - Cell kv = kvs.get(0); - byte[] appendResult = new byte[kv.getValueLength()]; - System.arraycopy(kv.getValueArray(), kv.getValueOffset(), appendResult, 0, kv.getValueLength()); - assertArrayEquals(expected, appendResult); + appendDone.set(true); + flushThread.join(); + + Get get = new Get(Appender.appendRow); + get.addColumn(Appender.family, Appender.qualifier); + get.readVersions(1); + Result res = this.region.get(get); + List kvs = res.getColumnCells(Appender.family, Appender.qualifier); + + // we just got the latest version + assertEquals(1, kvs.size()); + Cell kv = kvs.get(0); + byte[] appendResult = new byte[kv.getValueLength()]; + System.arraycopy(kv.getValueArray(), kv.getValueOffset(), appendResult, 0, + kv.getValueLength()); + assertArrayEquals(expected, appendResult); + } finally { + // Ensure flush thread is stopped even if test fails or times out + appendDone.set(true); + flushThread.interrupt(); + flushThread.join(5000); // Wait up to 5 seconds for thread to stop + if (flushThread.isAlive()) { + LOG.warn("Flush thread did not stop within timeout for test " + method); + } + } } /** @@ -7489,7 +7521,7 @@ public void testMutateRowInParallel() throws Exception { // Writer thread Thread writerThread = new Thread(() -> { try { - while (true) { + while (!Thread.currentThread().isInterrupted()) { // If all the reader threads finish, then stop the writer thread if (latch.await(0, TimeUnit.MILLISECONDS)) { return; @@ -7514,15 +7546,19 @@ public void testMutateRowInParallel() throws Exception { .addColumn(fam1, q3, tsIncrement + 1, Bytes.toBytes(1L)) .addColumn(fam1, q4, tsAppend + 1, Bytes.toBytes("a")) }); } + } catch (InterruptedException e) { + // Test interrupted, exit gracefully + Thread.currentThread().interrupt(); } catch (Exception e) { assertionError.set(new AssertionError(e)); } }); + writerThread.setName("WriterThread-" + method); writerThread.start(); // Reader threads for (int i = 0; i < numReaderThreads; i++) { - new Thread(() -> { + Thread readerThread = new Thread(() -> { try { for (int j = 0; j < 10000; j++) { // Verify the values @@ -7551,13 +7587,24 @@ public void testMutateRowInParallel() throws Exception { } latch.countDown(); - }).start(); + }); + readerThread.setName("ReaderThread-" + i + "-" + method); + readerThread.start(); } - writerThread.join(); + try { + writerThread.join(); - if (assertionError.get() != null) { - throw assertionError.get(); + if (assertionError.get() != null) { + throw assertionError.get(); + } + } finally { + // Ensure writer thread is stopped on test timeout + writerThread.interrupt(); + writerThread.join(5000); + if (writerThread.isAlive()) { + LOG.warn("Writer thread did not stop within timeout for test " + method); + } } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationBase.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationBase.java index 5ab593b00ace..911d15c5d39a 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationBase.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationBase.java @@ -192,6 +192,10 @@ protected static void loadData(String prefix, byte[] row, byte[] familyName) thr protected static void setupConfig(HBaseTestingUtil util, String znodeParent) { Configuration conf = util.getConfiguration(); conf.set(HConstants.ZOOKEEPER_ZNODE_PARENT, znodeParent); + // Disable directory sharing to prevent race conditions when tests run in parallel. + // Each test instance gets its own isolated directories to avoid one test's tearDown() + // deleting directories another parallel test is still using. + conf.setBoolean("hbase.test.disable-directory-sharing", true); // We don't want too many edits per batch sent to the ReplicationEndpoint to trigger // sufficient number of events. But we don't want to go too low because // HBaseInterClusterReplicationEndpoint partitions entries into batches and we want