diff --git a/libs/storage/Tsavorite/cs/src/core/Index/Checkpointing/StateMachineDriver.cs b/libs/storage/Tsavorite/cs/src/core/Index/Checkpointing/StateMachineDriver.cs
index b6edc09960e..de614eca4fe 100644
--- a/libs/storage/Tsavorite/cs/src/core/Index/Checkpointing/StateMachineDriver.cs
+++ b/libs/storage/Tsavorite/cs/src/core/Index/Checkpointing/StateMachineDriver.cs
@@ -66,6 +66,12 @@ void DecrementActiveTransactions(long txnVersion)
 
         internal void TrackLastVersion(long version)
         {
+            // Only create and enqueue one semaphore per version. If we created a
+            // new one on each call, the earlier semaphore would be orphaned in the
+            // waitingList and never released, permanently blocking ProcessWaitingListAsync.
+            if (lastVersion == version)
+                return;
+
             if (GetNumActiveTransactions(version) > 0)
             {
                 // Set version number first, then create semaphore
diff --git a/libs/storage/Tsavorite/cs/test/StateMachineDriverTests.cs b/libs/storage/Tsavorite/cs/test/StateMachineDriverTests.cs
index 51c205b76b2..396423c818b 100644
--- a/libs/storage/Tsavorite/cs/test/StateMachineDriverTests.cs
+++ b/libs/storage/Tsavorite/cs/test/StateMachineDriverTests.cs
@@ -401,4 +401,57 @@ public async ValueTask GrowIndexVersionSwitchTxnTest(
             [Values] bool useTimingFuzzing)
             => await DoGrowIndexVersionSwitchEquivalenceCheck(indexSize, useTimingFuzzing).ConfigureAwait(false);
     }
+
+    /// <summary>
+    /// Regression test for checkpoint deadlock with two-store checkpoints.
+    ///
+    /// TrackLastVersion is called once per store during the IN_PROGRESS phase.
+    /// Without the fix, the second call overwrites lastVersionTransactionsDone,
+    /// orphaning the first semaphore in the waitingList. ProcessWaitingListAsync
+    /// then waits on it forever.
+    /// </summary>
+    [AllureNUnit]
+    [TestFixture]
+    public class TrackLastVersionTwoStoreDeadlock : AllureTestBase
+    {
+        [Test]
+        public async Task TrackLastVersionCalledTwiceDoesNotDeadlock()
+        {
+            var epoch = new LightEpoch();
+            try
+            {
+                var driver = new StateMachineDriver(epoch);
+
+                // Simulate an active transaction (e.g. Lua script touching both stores)
+                var txnVersion = driver.AcquireTransactionVersion();
+
+                // GlobalAfterEnteringState calls TrackLastVersion once per store
+                driver.TrackLastVersion(txnVersion); // MainStore
+                driver.TrackLastVersion(txnVersion); // ObjectStore
+
+                // Transaction completes
+                driver.EndTransaction(txnVersion);
+
+                // Verify all waitingList semaphores are released (not orphaned)
+                var waitingList = (System.Collections.Generic.List<System.Threading.SemaphoreSlim>)
+                    typeof(StateMachineDriver)
+                        .GetField("waitingList", System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance)
+                        .GetValue(driver);
+
+                ClassicAssert.AreEqual(1, waitingList.Count,
+                    "Expected 1 semaphore in waitingList, not 2. " +
+                    "Two means the second TrackLastVersion call created a new semaphore " +
+                    "that overwrote the first, orphaning it.");
+
+                var acquired = await waitingList[0].WaitAsync(System.TimeSpan.FromSeconds(5));
+                ClassicAssert.IsTrue(acquired,
+                    "Semaphore was not released after EndTransaction. " +
+                    "This causes ProcessWaitingListAsync to deadlock permanently.");
+            }
+            finally
+            {
+                epoch.Dispose();
+            }
+        }
+    }
 }
\ No newline at end of file