From e7353ef6ca9f17b4b06dc93ac5175e64ed672932 Mon Sep 17 00:00:00 2001
From: Christoph Sturm <me@christophsturm.com>
Date: Sat, 16 May 2026 16:10:33 +0200
Subject: [PATCH 1/2] Use scoped MLX stream in RoPE test

Swift Testing can run suites concurrently. The previous .serialized trait only serialized tests within RoPEApplicationTests, so it did not prevent gemma3nAttentionTest from running alongside KV cache and speculative decoding tests.

Those tests all enqueue and evaluate MLX work through StreamOrDevice.default. Without a task-local override, that resolves to the shared default GPU stream, so separate tests can interact through the same underlying command-buffer lifecycle and trip Metal's 'Completed handler provided after commit call' assertion.

Run the RoPE test under Stream.withNewDefaultStream and synchronize that scoped stream before returning. This isolates the MLX stream state for the test while preserving Swift Testing parallelism instead of serializing unrelated tests.
---
 Tests/MLXLMTests/RoPEApplicationTests.swift | 82 +++++++++++----------
 1 file changed, 43 insertions(+), 39 deletions(-)

diff --git a/Tests/MLXLMTests/RoPEApplicationTests.swift b/Tests/MLXLMTests/RoPEApplicationTests.swift
index 991a9acb2..8a9178132 100644
--- a/Tests/MLXLMTests/RoPEApplicationTests.swift
+++ b/Tests/MLXLMTests/RoPEApplicationTests.swift
@@ -6,51 +6,55 @@ import Testing
 @testable import MLXLLM
 @testable import MLXLMCommon
 
-@Suite(.serialized)
+@Suite
 struct RoPEApplicationTests {
 
     /// Gemma3nAttention applies rope, updates the cache and applies rope again.
     /// Ensure that it is correctly implemented.  We can observe prefill vs single token
     /// generaton and they should produce the same answers if implemented correctly.
     @Test func gemma3nAttentionTest() {
-        let config = Gemma3nTextConfiguration()
-        let attention = Gemma3nAttention(config, layerIdx: 3)
-
-        #expect(!attention.isKvSharedLayer)
-
-        let B = 1
-        let L = 4
-        let D = config.hiddenSize
-
-        MLXRandom.seed(42)
-        let x = MLXRandom.normal([B, L, D])
-        eval(x)
-
-        // Batch: process all L tokens at once with a causal mask
-        let cacheBatch = KVCacheSimple()
-        let causalMask = createAttentionMask(h: x, cache: cacheBatch)
-        let outputBatch = attention(x, mask: causalMask, cache: cacheBatch)
-        eval(outputBatch)
-
-        // Sequential: process one token at a time (mask=.none since L=1 with cache)
-        let cacheSeq = KVCacheSimple()
-        var seqOutputs: [MLXArray] = []
-        for i in 0 ..< L {
-            let token = x[0..., i ..< (i + 1), 0...]
-            let mask = createAttentionMask(h: token, cache: cacheSeq)
-            let out = attention(token, mask: mask, cache: cacheSeq)
-            seqOutputs.append(out)
+        Stream.withNewDefaultStream {
+            defer { StreamOrDevice.default.stream.synchronize() }
+
+            let config = Gemma3nTextConfiguration()
+            let attention = Gemma3nAttention(config, layerIdx: 3)
+
+            #expect(!attention.isKvSharedLayer)
+
+            let B = 1
+            let L = 4
+            let D = config.hiddenSize
+
+            MLXRandom.seed(42)
+            let x = MLXRandom.normal([B, L, D])
+            eval(x)
+
+            // Batch: process all L tokens at once with a causal mask
+            let cacheBatch = KVCacheSimple()
+            let causalMask = createAttentionMask(h: x, cache: cacheBatch)
+            let outputBatch = attention(x, mask: causalMask, cache: cacheBatch)
+            eval(outputBatch)
+
+            // Sequential: process one token at a time (mask=.none since L=1 with cache)
+            let cacheSeq = KVCacheSimple()
+            var seqOutputs: [MLXArray] = []
+            for i in 0 ..< L {
+                let token = x[0..., i ..< (i + 1), 0...]
+                let mask = createAttentionMask(h: token, cache: cacheSeq)
+                let out = attention(token, mask: mask, cache: cacheSeq)
+                seqOutputs.append(out)
+            }
+            let outputSeq = concatenated(seqOutputs, axis: 1)
+            eval(outputSeq)
+
+            // With correct RoPE these would match.  The buggy code would use
+            // different offsets for keys/queries.
+            let match = allClose(outputBatch, outputSeq, atol: 1e-4)
+            eval(match)
+            print(outputBatch)
+            print(outputSeq)
+            print(abs(outputSeq - outputBatch))
+            #expect(match.item(Bool.self))
         }
-        let outputSeq = concatenated(seqOutputs, axis: 1)
-        eval(outputSeq)
-
-        // With correct RoPE these would match.  The buggy code would use
-        // different offsets for keys/queries.
-        let match = allClose(outputBatch, outputSeq, atol: 1e-4)
-        eval(match)
-        print(outputBatch)
-        print(outputSeq)
-        print(abs(outputSeq - outputBatch))
-        #expect(match.item(Bool.self))
     }
 }

From 589975d015e8f1bb57789c884fa2155d96c3abc3 Mon Sep 17 00:00:00 2001
From: Christoph Sturm <me@christophsturm.com>
Date: Sat, 16 May 2026 16:24:28 +0200
Subject: [PATCH 2/2] Remove RoPE test debug output

---
 Tests/MLXLMTests/RoPEApplicationTests.swift | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/Tests/MLXLMTests/RoPEApplicationTests.swift b/Tests/MLXLMTests/RoPEApplicationTests.swift
index 8a9178132..6bf07cec6 100644
--- a/Tests/MLXLMTests/RoPEApplicationTests.swift
+++ b/Tests/MLXLMTests/RoPEApplicationTests.swift
@@ -51,9 +51,6 @@ struct RoPEApplicationTests {
             // different offsets for keys/queries.
             let match = allClose(outputBatch, outputSeq, atol: 1e-4)
             eval(match)
-            print(outputBatch)
-            print(outputSeq)
-            print(abs(outputSeq - outputBatch))
             #expect(match.item(Bool.self))
         }
     }