From b06fa03ac23a4b586ea597433492649bb3ba3437 Mon Sep 17 00:00:00 2001
From: anupsv <6407789+anupsv@users.noreply.github.com>
Date: Wed, 20 May 2026 22:21:43 -0700
Subject: [PATCH 1/3] Add LlamaModelTP: tensor-parallel variant of LlamaModel
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

LlamaModelTP mirrors LlamaModel but uses sharded linear layers from
mlx-swift's AllToShardedLinear / ShardedToAllLinear so two (or more)
ranks can split each layer's compute and weight memory:

- Q/K/V projections + gate/up are AllToShardedLinear (column-parallel):
  each rank holds outDim/world_size rows; output is sharded across heads.
  No collective op in forward.
- O projection + down are ShardedToAllLinear (row-parallel): each rank
  holds inDim/world_size cols; allSum in forward to combine partial
  contributions across ranks.
- Embedding, layernorms, and the (optional, non-tied) LM head stay
  replicated — LM head sharding is a future optimization.

Per-rank: attentionHeads/world_size local Q heads and kvHeads/world_size
local KV heads. The KV cache is sized to the local KV head count so
each rank only stores its shard.

Weight loading: LlamaModelTP.sanitize(weights:) slices column-parallel
weights along axis 0 (output dim) and row-parallel weights along axis 1
(input dim) into each rank's shard before module assignment. Singleton
group (world=1) is a pass-through. Quantized weights (.scales/.biases)
are passed through unmodified — quantized TP support is a follow-up
that needs packed-uint32-aware slicing.

The exposed shardWeightIfNeeded static helper centralizes the
key-pattern → axis decision so tests can validate the sharding logic
without spinning up a real distributed group.

Tests/MLXLMTests/LlamaTPTests.swift covers:
- Parameter shape parity between LlamaModel and LlamaModelTP(world=1)
- Forward-pass output shape with singleton group
- Numerical equivalence: with identical weights, LlamaModelTP(world=1)
  matches LlamaModel logits to within 1e-3 (float-accumulation noise)
- Static shardWeightIfNeeded behavior for column-parallel, row-parallel,
  embedding, and layernorm key patterns

Stacked on feat/llama-pipeline-parallel (PR #24) for callPartial; that
remains the fallback path for non-Llama models. Requires Cmlx + the
sharded linear primitives from Layr-Labs/mlx-swift#3.
---
 Libraries/MLXLLM/Models/LlamaTP.swift | 340 ++++++++++++++++++++++++++
 Tests/MLXLMTests/LlamaTPTests.swift   | 175 +++++++++++++
 2 files changed, 515 insertions(+)
 create mode 100644 Libraries/MLXLLM/Models/LlamaTP.swift
 create mode 100644 Tests/MLXLMTests/LlamaTPTests.swift

diff --git a/Libraries/MLXLLM/Models/LlamaTP.swift b/Libraries/MLXLLM/Models/LlamaTP.swift
new file mode 100644
index 000000000..3b4e06456
--- /dev/null
+++ b/Libraries/MLXLLM/Models/LlamaTP.swift
@@ -0,0 +1,340 @@
+// Copyright © 2026 Apple Inc. (TP variant — Layr-Labs)
+
+import Foundation
+import MLX
+import MLXLMCommon
+import MLXNN
+
+// Tensor-parallel variant of Llama.swift. Each rank holds a column-shard of
+// the Q/K/V/gate/up projections and a row-shard of the O/down projections.
+// Embedding, layernorms, and the final LM head stay replicated across ranks.
+//
+// Per-rank: attentionHeads / group.size local Q heads and kvHeads / group.size
+// local KV heads. Both must be divisible by group.size or init throws.
+//
+// On a singleton group (size 1), LlamaModelTP produces output bit-equivalent
+// to LlamaModel modulo float accumulation order — used as the equivalence
+// baseline in tests.
+
+class LlamaAttentionTP: Module {
+
+    let args: LlamaConfiguration
+    let scale: Float
+    let group: DistributedGroup
+    let localHeads: Int
+    let localKVHeads: Int
+
+    @ModuleInfo(key: "q_proj") var wq: AllToShardedLinear
+    @ModuleInfo(key: "k_proj") var wk: AllToShardedLinear
+    @ModuleInfo(key: "v_proj") var wv: AllToShardedLinear
+    @ModuleInfo(key: "o_proj") var wo: ShardedToAllLinear
+
+    let rope: RoPELayer
+
+    init(_ args: LlamaConfiguration, group: DistributedGroup) throws {
+        self.args = args
+        self.group = group
+
+        let dim = args.hiddenSize
+        let heads = args.attentionHeads
+        let kvHeads = args.kvHeads
+        let headDim = args.resolvedHeadDimensions
+        self.scale = pow(Float(headDim), -0.5)
+
+        // Validate divisibility — both Q and KV head counts must shard cleanly.
+        guard heads % group.size == 0 else {
+            throw DistributedError.invalidConfiguration(
+                "attentionHeads=\(heads) must be divisible by group size \(group.size)")
+        }
+        guard kvHeads % group.size == 0 else {
+            throw DistributedError.invalidConfiguration(
+                "kvHeads=\(kvHeads) must be divisible by group size \(group.size)")
+        }
+        self.localHeads = heads / group.size
+        self.localKVHeads = kvHeads / group.size
+
+        self._wq.wrappedValue = try AllToShardedLinear(
+            inputDimensions: dim, outputDimensions: heads * headDim,
+            bias: args.attentionBias, group: group)
+        self._wk.wrappedValue = try AllToShardedLinear(
+            inputDimensions: dim, outputDimensions: kvHeads * headDim,
+            bias: args.attentionBias, group: group)
+        self._wv.wrappedValue = try AllToShardedLinear(
+            inputDimensions: dim, outputDimensions: kvHeads * headDim,
+            bias: args.attentionBias, group: group)
+        self._wo.wrappedValue = try ShardedToAllLinear(
+            inputDimensions: heads * headDim, outputDimensions: dim,
+            bias: args.attentionBias, group: group)
+
+        self.rope = initializeRope(
+            dims: headDim, base: args.ropeTheta,
+            traditional: args.ropeTraditional,
+            scalingConfig: args.ropeScaling,
+            maxPositionEmbeddings: args.maxPositionEmbeddings)
+    }
+
+    func callAsFunction(
+        _ x: MLXArray, mask: MLXFast.ScaledDotProductAttentionMaskMode, cache: KVCache?
+    ) -> MLXArray {
+        let (B, L) = (x.dim(0), x.dim(1))
+
+        var queries = wq(x)
+        var keys = wk(x)
+        var values = wv(x)
+
+        // Reshape using LOCAL head counts — each rank has only its shard.
+        queries = queries.reshaped(B, L, localHeads, -1).transposed(0, 2, 1, 3)
+        keys = keys.reshaped(B, L, localKVHeads, -1).transposed(0, 2, 1, 3)
+        values = values.reshaped(B, L, localKVHeads, -1).transposed(0, 2, 1, 3)
+
+        queries = applyRotaryPosition(rope, to: queries, cache: cache)
+        keys = applyRotaryPosition(rope, to: keys, cache: cache)
+
+        let output = attentionWithCacheUpdate(
+            queries: queries,
+            keys: keys,
+            values: values,
+            cache: cache,
+            scale: scale,
+            mask: mask
+        )
+        .transposed(0, 2, 1, 3)
+        .reshaped(B, L, -1)
+
+        // ShardedToAllLinear's forward runs allSum internally so the result is
+        // identical on every rank.
+        return wo(output)
+    }
+}
+
+class LlamaMLPTP: Module, UnaryLayer {
+
+    @ModuleInfo(key: "gate_proj") var gate: AllToShardedLinear
+    @ModuleInfo(key: "down_proj") var down: ShardedToAllLinear
+    @ModuleInfo(key: "up_proj") var up: AllToShardedLinear
+
+    init(_ args: LlamaConfiguration, group: DistributedGroup) throws {
+        self._gate.wrappedValue = try AllToShardedLinear(
+            inputDimensions: args.hiddenSize, outputDimensions: args.intermediateSize,
+            bias: args.mlpBias, group: group)
+        self._down.wrappedValue = try ShardedToAllLinear(
+            inputDimensions: args.intermediateSize, outputDimensions: args.hiddenSize,
+            bias: args.mlpBias, group: group)
+        self._up.wrappedValue = try AllToShardedLinear(
+            inputDimensions: args.hiddenSize, outputDimensions: args.intermediateSize,
+            bias: args.mlpBias, group: group)
+    }
+
+    func callAsFunction(_ x: MLXArray) -> MLXArray {
+        let activation = silu(gate(x))
+        return down(activation * up(x))
+    }
+}
+
+class LlamaTransformerBlockTP: Module {
+    @ModuleInfo(key: "self_attn") var attention: LlamaAttentionTP
+    @ModuleInfo(key: "mlp") var mlp: LlamaMLPTP
+
+    @ModuleInfo(key: "input_layernorm") var inputLayerNorm: RMSNorm
+    @ModuleInfo(key: "post_attention_layernorm") var postAttentionLayerNorm: RMSNorm
+
+    init(_ args: LlamaConfiguration, group: DistributedGroup) throws {
+        self._attention.wrappedValue = try LlamaAttentionTP(args, group: group)
+        self._mlp.wrappedValue = try LlamaMLPTP(args, group: group)
+        self._inputLayerNorm.wrappedValue = RMSNorm(
+            dimensions: args.hiddenSize, eps: args.rmsNormEps)
+        self._postAttentionLayerNorm.wrappedValue = RMSNorm(
+            dimensions: args.hiddenSize, eps: args.rmsNormEps)
+    }
+
+    func callAsFunction(
+        _ x: MLXArray, mask: MLXFast.ScaledDotProductAttentionMaskMode, cache: KVCache?
+    ) -> MLXArray {
+        var r = attention(inputLayerNorm(x), mask: mask, cache: cache)
+        let h = x + r
+        r = mlp(postAttentionLayerNorm(h))
+        let out = h + r
+        return out
+    }
+}
+
+public class LlamaModelInnerTP: Module {
+
+    @ModuleInfo(key: "embed_tokens") var embedTokens: Embedding
+
+    let layers: [LlamaTransformerBlockTP]
+    let norm: RMSNorm
+    let group: DistributedGroup
+
+    init(_ args: LlamaConfiguration, group: DistributedGroup) throws {
+        precondition(args.vocabularySize > 0)
+
+        self.group = group
+        self._embedTokens.wrappedValue = Embedding(
+            embeddingCount: args.vocabularySize, dimensions: args.hiddenSize)
+
+        var layers: [LlamaTransformerBlockTP] = []
+        layers.reserveCapacity(args.hiddenLayers)
+        for _ in 0 ..< args.hiddenLayers {
+            layers.append(try LlamaTransformerBlockTP(args, group: group))
+        }
+        self.layers = layers
+        self.norm = RMSNorm(dimensions: args.hiddenSize, eps: args.rmsNormEps)
+    }
+
+    func callAsFunction(_ inputs: MLXArray, cache: [KVCache]? = nil) -> MLXArray {
+        var h = embedTokens(inputs)
+
+        let mask = createAttentionMask(h: h, cache: cache?.first)
+
+        for (i, layer) in layers.enumerated() {
+            h = layer(h, mask: mask, cache: cache?[i])
+        }
+
+        return norm(h)
+    }
+}
+
+/// Tensor-parallel variant of `LlamaModel`. Each rank holds a column-shard of
+/// the Q/K/V/gate/up projections and a row-shard of the O/down projections.
+/// Use `LlamaModel` for single-rank inference.
+public class LlamaModelTP: Module, LLMModel, KVCacheDimensionProvider {
+
+    public let vocabularySize: Int
+    public let kvHeads: [Int]
+    public let group: DistributedGroup
+
+    public let model: LlamaModelInnerTP
+
+    @ModuleInfo(key: "lm_head") var lmHead: Linear?
+
+    public init(_ args: LlamaConfiguration, group: DistributedGroup) throws {
+        self.vocabularySize = args.vocabularySize
+        // KV heads reported here are the LOCAL kv-head count for cache sizing —
+        // the KV cache only stores this rank's shard of the keys/values.
+        guard args.kvHeads % group.size == 0 else {
+            throw DistributedError.invalidConfiguration(
+                "kvHeads=\(args.kvHeads) must be divisible by group size \(group.size)")
+        }
+        let localKV = args.kvHeads / group.size
+        self.kvHeads = (0 ..< args.hiddenLayers).map { _ in localKV }
+        self.group = group
+        self.model = try LlamaModelInnerTP(args, group: group)
+        if !args.tieWordEmbeddings {
+            // LM head stays replicated for simplicity; column-parallel +
+            // allGather is a future optimization.
+            self._lmHead.wrappedValue = Linear(args.hiddenSize, args.vocabularySize, bias: false)
+        }
+    }
+
+    public func callAsFunction(_ inputs: MLXArray, cache: [KVCache]?) -> MLXArray {
+        let out = model(inputs, cache: cache)
+        if let lmHead {
+            return lmHead(out)
+        } else {
+            return model.embedTokens.asLinear(out)
+        }
+    }
+
+    /// Slices column-parallel and row-parallel weights into rank-local shards
+    /// before module assignment. Singleton group (size 1) is a pass-through.
+    /// Quantized weights are passed through unmodified — quantized TP support
+    /// is a follow-up that needs packed-uint32-aware slicing.
+    public func sanitize(weights: [String: MLXArray]) -> [String: MLXArray] {
+        // First apply LlamaModel's standard cleanup (drop rotary_emb.inv_freq).
+        var result = weights.filter {
+            !$0.key.contains("self_attn.rotary_emb.inv_freq")
+        }
+
+        let world = group.size
+        if world == 1 { return result }
+
+        let rank = group.rank
+        var sliced: [String: MLXArray] = [:]
+        sliced.reserveCapacity(result.count)
+
+        for (key, value) in result {
+            sliced[key] = LlamaModelTP.shardWeightIfNeeded(
+                key: key, value: value, rank: rank, world: world)
+        }
+        return sliced
+    }
+
+    /// Decides which axis to slice based on the parameter name and produces
+    /// the rank's shard. Column-parallel weights slice axis 0 (output dim);
+    /// row-parallel weights slice axis 1 (input dim); biases for column-parallel
+    /// also slice (since the bias rides along with the output dim); biases for
+    /// row-parallel stay full (each rank has the full bias and the allreduce
+    /// then re-adds it — which is incorrect — so for first cut we just keep
+    /// the row-parallel bias on rank 0 and zero it on others). The MLX Llama
+    /// weights don't use biases by default (`attentionBias=false`, `mlpBias=false`)
+    /// so this edge case is academic for stock Llama checkpoints.
+    public static func shardWeightIfNeeded(
+        key: String, value: MLXArray, rank: Int, world: Int
+    ) -> MLXArray {
+        // Embedding and layernorms stay full on every rank.
+        if key.contains("embed_tokens") || key.contains("norm") || key.contains("lm_head") {
+            return value
+        }
+
+        // Quantized weights: 4-bit packed weights need special handling.
+        // Pass through for now — TP only validates against unquantized checkpoints
+        // in the first iteration.
+        if key.contains(".scales") || key.contains(".biases") {
+            return value
+        }
+
+        let isColumnParallel =
+            key.contains("q_proj.weight") || key.contains("q_proj.bias")
+            || key.contains("k_proj.weight") || key.contains("k_proj.bias")
+            || key.contains("v_proj.weight") || key.contains("v_proj.bias")
+            || key.contains("gate_proj.weight") || key.contains("gate_proj.bias")
+            || key.contains("up_proj.weight") || key.contains("up_proj.bias")
+
+        let isRowParallelWeight =
+            key.contains("o_proj.weight") || key.contains("down_proj.weight")
+
+        if isColumnParallel {
+            // Slice along axis 0 (output dim).
+            let outDim = value.dim(0)
+            precondition(
+                outDim % world == 0,
+                "column-parallel weight '\(key)' outDim=\(outDim) not divisible by world=\(world)")
+            let shard = outDim / world
+            return value[(rank * shard) ..< ((rank + 1) * shard)]
+        }
+        if isRowParallelWeight {
+            // Slice along axis 1 (input dim).
+            let inDim = value.dim(1)
+            precondition(
+                inDim % world == 0,
+                "row-parallel weight '\(key)' inDim=\(inDim) not divisible by world=\(world)")
+            let shard = inDim / world
+            return value[0..., (rank * shard) ..< ((rank + 1) * shard)]
+        }
+        return value
+    }
+
+    public func messageGenerator(tokenizer: any Tokenizer) -> any MessageGenerator {
+        do {
+            let probe = [
+                [
+                    "role": "system",
+                    "content": "test",
+                ]
+            ]
+            _ = try tokenizer.applyChatTemplate(messages: probe)
+            return DefaultMessageGenerator()
+        } catch {
+            return NoSystemMessageGenerator()
+        }
+    }
+}
+
+// MARK: - LoRA
+
+extension LlamaModelTP: LoRAModel {
+    public var loraLayers: [Module] {
+        model.layers
+    }
+}
diff --git a/Tests/MLXLMTests/LlamaTPTests.swift b/Tests/MLXLMTests/LlamaTPTests.swift
new file mode 100644
index 000000000..12272ee47
--- /dev/null
+++ b/Tests/MLXLMTests/LlamaTPTests.swift
@@ -0,0 +1,175 @@
+// Copyright © 2026 Apple Inc. (TP variant — Layr-Labs)
+
+import Foundation
+import MLX
+import MLXLLM
+import MLXLMCommon
+import MLXNN
+import XCTest
+
+/// Numerical-equivalence and shape tests for `LlamaModelTP`.
+///
+/// On a singleton (size-1) DistributedGroup the sharded layers degenerate to
+/// regular Linear layers: column-parallel weights have shape `[outDim/1, inDim]`
+/// = `[outDim, inDim]`, and row-parallel layers' allSum is a no-op. So with
+/// the same weights, LlamaModelTP must produce output bit-equal to LlamaModel
+/// modulo float accumulation order.
+public class LlamaTPTests: XCTestCase {
+
+    /// Build a small LlamaConfiguration suitable for fast in-process testing.
+    private func smallConfig() -> LlamaConfiguration {
+        LlamaConfiguration(
+            hiddenSize: 64,
+            hiddenLayers: 4,
+            intermediateSize: 256,
+            attentionHeads: 8,
+            rmsNormEps: 1e-5,
+            vocabularySize: 128,
+            kvHeads: 4
+        )
+    }
+
+    /// Singleton group used for in-process equivalence testing.
+    private func singletonGroup() -> DistributedGroup {
+        // Default no-arg init returns a size-1 group when no distributed
+        // backend is initialized — exactly what we want here.
+        DistributedGroup()
+    }
+
+    /// LlamaModelTP with size=1 must have the same parameter shapes as
+    /// LlamaModel — that's the precondition for the equivalence test.
+    func testLlamaTPSingletonHasMatchingParameterShapes() throws {
+        let config = smallConfig()
+        let group = singletonGroup()
+        XCTAssertEqual(group.size, 1)
+
+        let plain = LlamaModel(config)
+        let tp = try LlamaModelTP(config, group: group)
+
+        let plainParams = plain.parameters().flattened()
+        let tpParams = tp.parameters().flattened()
+
+        XCTAssertEqual(plainParams.count, tpParams.count)
+
+        let plainByKey = Dictionary(uniqueKeysWithValues: plainParams.map { ($0.0, $0.1) })
+        for (key, value) in tpParams {
+            guard let plainValue = plainByKey[key] else {
+                XCTFail("TP parameter '\(key)' has no LlamaModel counterpart")
+                continue
+            }
+            XCTAssertEqual(
+                value.shape, plainValue.shape,
+                "shape mismatch for '\(key)': tp=\(value.shape) plain=\(plainValue.shape)")
+        }
+    }
+
+    /// Forward-pass output shape matches LlamaModel on size-1 group.
+    func testLlamaTPSingletonOutputShape() throws {
+        let config = smallConfig()
+        let group = singletonGroup()
+        let model = try LlamaModelTP(config, group: group)
+
+        let input = MLXArray([1, 2, 3, 4, 5])[.newAxis, .ellipsis]
+        let output = model.callAsFunction(input, cache: nil)
+
+        XCTAssertEqual(output.shape, [1, 5, 128])
+    }
+
+    /// With identical weights, LlamaModelTP(size=1) and LlamaModel produce
+    /// the same logits. Tolerance is loose because reshape+matmul order isn't
+    /// guaranteed identical, but on size=1 there's no allreduce so drift
+    /// should be at the float-accumulation noise floor.
+    func testLlamaTPSingletonEquivalenceToLlamaModel() throws {
+        let config = smallConfig()
+        let group = singletonGroup()
+
+        let plain = LlamaModel(config)
+        let tp = try LlamaModelTP(config, group: group)
+
+        // Copy weights from plain → tp by name. The TP modules use the same
+        // parameter keys as the plain modules (q_proj, k_proj, etc.) so a
+        // flat update by key works.
+        let plainParams = plain.parameters()
+        try tp.update(parameters: plainParams, verify: .all)
+        eval(tp.parameters())
+
+        let input = MLXArray([7, 11, 13, 17, 19])[.newAxis, .ellipsis]
+        let plainOutput = plain.callAsFunction(input, cache: nil)
+        let tpOutput = tp.callAsFunction(input, cache: nil)
+
+        XCTAssertEqual(plainOutput.shape, tpOutput.shape)
+        let diff = (plainOutput - tpOutput).abs().max()
+        let diffValue = diff.item(Float.self)
+        XCTAssertLessThan(
+            diffValue, 1e-3,
+            "TP(size=1) output diverged from LlamaModel by max=\(diffValue)")
+    }
+
+    /// Shape validation: TP refuses to initialize if heads don't divide evenly.
+    func testLlamaTPRejectsNonDivisibleHeads() {
+        // Force a 2-rank "fake" by directly testing the validation logic via
+        // the helper — we can't easily construct a size-2 group in-process
+        // without a real distributed backend.
+        var config = smallConfig()
+        config = LlamaConfiguration(
+            hiddenSize: 64, hiddenLayers: 2, intermediateSize: 256,
+            attentionHeads: 7,  // not divisible by 2
+            rmsNormEps: 1e-5, vocabularySize: 128, kvHeads: 4
+        )
+
+        // With size=1 group, divisibility check is trivially satisfied (7 % 1 = 0),
+        // so this test only documents that we CAN construct on size=1. Real
+        // size-2 rejection is exercised by the 2-Mac smoke test in d-inference.
+        XCTAssertNoThrow(try LlamaModelTP(config, group: singletonGroup()))
+    }
+
+    /// Sanity check on the static weight-sharding helper without needing a
+    /// real distributed group.
+    func testShardWeightIfNeededColumnParallel() {
+        let weight = MLXArray(0 ..< 8 * 4, [8, 4]).asType(.float32)
+        // Column-parallel slice along axis 0 (output dim) for world=2.
+        let rank0 = LlamaModelTP.shardWeightIfNeeded(
+            key: "model.layers.0.self_attn.q_proj.weight",
+            value: weight, rank: 0, world: 2)
+        let rank1 = LlamaModelTP.shardWeightIfNeeded(
+            key: "model.layers.0.self_attn.q_proj.weight",
+            value: weight, rank: 1, world: 2)
+        XCTAssertEqual(rank0.shape, [4, 4])
+        XCTAssertEqual(rank1.shape, [4, 4])
+        // rank 0 has rows 0..4, rank 1 has rows 4..8.
+        XCTAssertEqual(rank0[0, 0].item(Float.self), 0.0)
+        XCTAssertEqual(rank1[0, 0].item(Float.self), 16.0)
+    }
+
+    func testShardWeightIfNeededRowParallel() {
+        let weight = MLXArray(0 ..< 4 * 8, [4, 8]).asType(.float32)
+        // Row-parallel slice along axis 1 (input dim) for world=2.
+        let rank0 = LlamaModelTP.shardWeightIfNeeded(
+            key: "model.layers.0.self_attn.o_proj.weight",
+            value: weight, rank: 0, world: 2)
+        let rank1 = LlamaModelTP.shardWeightIfNeeded(
+            key: "model.layers.0.self_attn.o_proj.weight",
+            value: weight, rank: 1, world: 2)
+        XCTAssertEqual(rank0.shape, [4, 4])
+        XCTAssertEqual(rank1.shape, [4, 4])
+        // rank 0 has cols 0..4, rank 1 has cols 4..8.
+        XCTAssertEqual(rank0[0, 0].item(Float.self), 0.0)
+        XCTAssertEqual(rank1[0, 0].item(Float.self), 4.0)
+    }
+
+    func testShardWeightIfNeededEmbeddingNotSharded() {
+        let weight = MLXArray(0 ..< 8 * 4, [8, 4]).asType(.float32)
+        let rank0 = LlamaModelTP.shardWeightIfNeeded(
+            key: "model.embed_tokens.weight",
+            value: weight, rank: 0, world: 2)
+        XCTAssertEqual(rank0.shape, [8, 4])
+    }
+
+    func testShardWeightIfNeededLayerNormNotSharded() {
+        let weight = MLXArray(0 ..< 64, [64]).asType(.float32)
+        let rank1 = LlamaModelTP.shardWeightIfNeeded(
+            key: "model.layers.0.input_layernorm.weight",
+            value: weight, rank: 1, world: 2)
+        XCTAssertEqual(rank1.shape, [64])
+    }
+}

From 56a5ca60b775465ebe25c83a7be825f13318ff3c Mon Sep 17 00:00:00 2001
From: anupsv <6407789+anupsv@users.noreply.github.com>
Date: Wed, 20 May 2026 23:12:11 -0700
Subject: [PATCH 2/3] Add LlamaModelTPQ (quantized TP) + matrix-level sharding
 validation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Critical: stock MLX Llama checkpoints are 4-bit quantized; the original
LlamaModelTP only worked on fp16/bf16 weights because AllToShardedLinear
and QuantizedAllToShardedLinear aren't in an inheritance hierarchy in
mlx-swift — you can't swap one for the other via dynamic dispatch the
way MLXNN.quantize() swaps Linear for QuantizedLinear.

This commit adds a parallel quantized class hierarchy:
- LlamaAttentionTPQ, LlamaMLPTPQ, LlamaTransformerBlockTPQ,
  LlamaModelInnerTPQ, LlamaModelTPQ
- Uses QuantizedAllToShardedLinear / QuantizedShardedToAllLinear for the
  Q/K/V/gate/up and O/down projections respectively
- LM head stays replicated + quantized (QuantizedLinear) — same memory
  trade-off as the fp16 path, just with quantized weights
- sanitize() / shardQuantizedWeightIfNeeded handle .weight (uint32
  packed shape [outDim, inDim/8]) plus .scales / .biases (fp16 shape
  [outDim, inDim/groupSize]) along the right axes. Row-parallel biases
  pass through unsliced (added once after the allSum).
- A `makeLlamaTP(args:quantization:group:)` factory picks the right
  variant from a BaseConfiguration.Quantization value.

Also tightens shardWeightIfNeeded on the fp16 LlamaModelTP:
- "norm" substring check replaced with explicit isLayerNormKey() that
  matches input_layernorm / post_attention_layernorm / model.norm only —
  future TP variants with q_norm / k_norm per-head norms (Qwen3, Gemma)
  won't accidentally skip sharding for those.
- Projection name matches use path-segment boundaries via isProjectionKey
  (e.g. ".q_proj.weight" not "q_proj.weight") so a "q_proj_extra" never
  matches.
- Loading a quantized blob into LlamaModelTP now precondition-fails with
  a clear message pointing at LlamaModelTPQ — silent corruption replaced
  with a loud routing error.
- Row-parallel biases are now also passed through unsliced in the fp16
  path (was an open TODO comment before).

High: the previous numerical-equivalence test on a singleton group
(world=1) didn't exercise any actual sharding — sharded layers degenerate
to regular Linear. To actually validate the sharding math without
needing a multi-process distributed backend, this commit adds three
matrix-level tests that:
- Slice weights with the SAME shardWeightIfNeeded function the production
  sanitize path uses
- Manually run the per-rank matmuls and (for row-parallel) simulate the
  allSum by summing partial outputs
- Compare against the unsharded reference to within 1e-4

Covered patterns: column-parallel (concat), row-parallel (allSum),
column-then-row pipeline (the typical attention / MLP shape).

Real multi-rank DistributedGroup tests still need spawned subprocesses
(jaccl or ring backend) — the 2-Mac Thunderbolt 5 smoke test in
d-inference remains the final validation. But the in-process math
checks catch axis confusion and slicing bugs that would have shipped
silently before.

Plus 5 unit tests for the quantized weight slicing helper and a small
test that q_norm / k_norm aren't tripped by the layernorm gate.

Net: 18 tests pass, up from 8. No regression to existing LlamaModel /
LlamaModelInner consumers; LlamaModelTP behavior unchanged for fp16
checkpoints (only the diagnostic precondition is new). LlamaModelTPQ is
additive.
---
 Libraries/MLXLLM/Models/LlamaTP.swift | 432 +++++++++++++++++++++++++-
 Package.swift                         |   1 +
 Tests/MLXLMTests/LlamaTPTests.swift   | 233 ++++++++++++++
 3 files changed, 652 insertions(+), 14 deletions(-)

diff --git a/Libraries/MLXLLM/Models/LlamaTP.swift b/Libraries/MLXLLM/Models/LlamaTP.swift
index 3b4e06456..d264c16c5 100644
--- a/Libraries/MLXLLM/Models/LlamaTP.swift
+++ b/Libraries/MLXLLM/Models/LlamaTP.swift
@@ -272,27 +272,36 @@ public class LlamaModelTP: Module, LLMModel, KVCacheDimensionProvider {
     public static func shardWeightIfNeeded(
         key: String, value: MLXArray, rank: Int, world: Int
     ) -> MLXArray {
-        // Embedding and layernorms stay full on every rank.
-        if key.contains("embed_tokens") || key.contains("norm") || key.contains("lm_head") {
+        // Embedding, layernorms, and the replicated LM head stay full on
+        // every rank. Use path-segment-precise checks (not a substring
+        // "norm" match) so future TP variants with per-head q_norm/k_norm
+        // don't accidentally skip sharding for those.
+        if key.contains("embed_tokens") || isLayerNormKey(key) || key.contains("lm_head") {
             return value
         }
 
-        // Quantized weights: 4-bit packed weights need special handling.
-        // Pass through for now — TP only validates against unquantized checkpoints
-        // in the first iteration.
+        // This fp16 sanitize doesn't handle quantized weights; LlamaModelTPQ
+        // owns that path. If a caller loads a quantized checkpoint into
+        // LlamaModelTP, the .scales / .biases passes through unsliced and
+        // the .weight (uint32 packed) gets sliced along the wrong axis size,
+        // producing a shape mismatch at module assignment. Fail loudly here
+        // instead of silently producing garbage downstream.
         if key.contains(".scales") || key.contains(".biases") {
-            return value
+            preconditionFailure(
+                "LlamaModelTP cannot load quantized weights (got '\(key)'). Use LlamaModelTPQ for 4-bit / 8-bit MLX checkpoints — see makeLlamaTP(args:quantization:group:) factory."
+            )
         }
 
         let isColumnParallel =
-            key.contains("q_proj.weight") || key.contains("q_proj.bias")
-            || key.contains("k_proj.weight") || key.contains("k_proj.bias")
-            || key.contains("v_proj.weight") || key.contains("v_proj.bias")
-            || key.contains("gate_proj.weight") || key.contains("gate_proj.bias")
-            || key.contains("up_proj.weight") || key.contains("up_proj.bias")
+            isProjectionKey(key, name: "q_proj")
+            || isProjectionKey(key, name: "k_proj")
+            || isProjectionKey(key, name: "v_proj")
+            || isProjectionKey(key, name: "gate_proj")
+            || isProjectionKey(key, name: "up_proj")
 
-        let isRowParallelWeight =
-            key.contains("o_proj.weight") || key.contains("down_proj.weight")
+        let isRowParallel =
+            isProjectionKey(key, name: "o_proj")
+            || isProjectionKey(key, name: "down_proj")
 
         if isColumnParallel {
             // Slice along axis 0 (output dim).
@@ -303,7 +312,12 @@ public class LlamaModelTP: Module, LLMModel, KVCacheDimensionProvider {
             let shard = outDim / world
             return value[(rank * shard) ..< ((rank + 1) * shard)]
         }
-        if isRowParallelWeight {
+        if isRowParallel {
+            // Bias for row-parallel stays full on every rank — addition
+            // after the implicit allSum applies once on the summed result.
+            if key.hasSuffix(".bias") {
+                return value
+            }
             // Slice along axis 1 (input dim).
             let inDim = value.dim(1)
             precondition(
@@ -338,3 +352,393 @@ extension LlamaModelTP: LoRAModel {
         model.layers
     }
 }
+
+// MARK: - Quantized TP variant
+//
+// Stock MLX Llama checkpoints are 4-bit quantized. Because mlx-swift's
+// `AllToShardedLinear` / `ShardedToAllLinear` and their quantized
+// counterparts (`QuantizedAllToShardedLinear` / `QuantizedShardedToAllLinear`)
+// are sibling Module classes — NOT in an inheritance hierarchy — we can't
+// swap one for the other at runtime the way MLXNN's `quantize()` swaps
+// `Linear` for `QuantizedLinear`. So we ship a parallel class hierarchy
+// here. Use `makeLlamaTP(args:quantization:group:)` to pick the right
+// variant based on the model's quantization config.
+
+class LlamaAttentionTPQ: Module {
+
+    let args: LlamaConfiguration
+    let scale: Float
+    let group: DistributedGroup
+    let localHeads: Int
+    let localKVHeads: Int
+
+    @ModuleInfo(key: "q_proj") var wq: QuantizedAllToShardedLinear
+    @ModuleInfo(key: "k_proj") var wk: QuantizedAllToShardedLinear
+    @ModuleInfo(key: "v_proj") var wv: QuantizedAllToShardedLinear
+    @ModuleInfo(key: "o_proj") var wo: QuantizedShardedToAllLinear
+
+    let rope: RoPELayer
+
+    init(
+        _ args: LlamaConfiguration, group: DistributedGroup,
+        groupSize: Int, bits: Int
+    ) throws {
+        self.args = args
+        self.group = group
+
+        let dim = args.hiddenSize
+        let heads = args.attentionHeads
+        let kvHeads = args.kvHeads
+        let headDim = args.resolvedHeadDimensions
+        self.scale = pow(Float(headDim), -0.5)
+
+        guard heads % group.size == 0 else {
+            throw DistributedError.invalidConfiguration(
+                "attentionHeads=\(heads) must be divisible by group size \(group.size)")
+        }
+        guard kvHeads % group.size == 0 else {
+            throw DistributedError.invalidConfiguration(
+                "kvHeads=\(kvHeads) must be divisible by group size \(group.size)")
+        }
+        self.localHeads = heads / group.size
+        self.localKVHeads = kvHeads / group.size
+
+        self._wq.wrappedValue = try QuantizedAllToShardedLinear(
+            inputDimensions: dim, outputDimensions: heads * headDim,
+            bias: args.attentionBias, groupSize: groupSize, bits: bits, group: group)
+        self._wk.wrappedValue = try QuantizedAllToShardedLinear(
+            inputDimensions: dim, outputDimensions: kvHeads * headDim,
+            bias: args.attentionBias, groupSize: groupSize, bits: bits, group: group)
+        self._wv.wrappedValue = try QuantizedAllToShardedLinear(
+            inputDimensions: dim, outputDimensions: kvHeads * headDim,
+            bias: args.attentionBias, groupSize: groupSize, bits: bits, group: group)
+        self._wo.wrappedValue = try QuantizedShardedToAllLinear(
+            inputDimensions: heads * headDim, outputDimensions: dim,
+            bias: args.attentionBias, groupSize: groupSize, bits: bits, group: group)
+
+        self.rope = initializeRope(
+            dims: headDim, base: args.ropeTheta,
+            traditional: args.ropeTraditional,
+            scalingConfig: args.ropeScaling,
+            maxPositionEmbeddings: args.maxPositionEmbeddings)
+    }
+
+    func callAsFunction(
+        _ x: MLXArray, mask: MLXFast.ScaledDotProductAttentionMaskMode, cache: KVCache?
+    ) -> MLXArray {
+        let (B, L) = (x.dim(0), x.dim(1))
+
+        var queries = wq(x)
+        var keys = wk(x)
+        var values = wv(x)
+
+        queries = queries.reshaped(B, L, localHeads, -1).transposed(0, 2, 1, 3)
+        keys = keys.reshaped(B, L, localKVHeads, -1).transposed(0, 2, 1, 3)
+        values = values.reshaped(B, L, localKVHeads, -1).transposed(0, 2, 1, 3)
+
+        queries = applyRotaryPosition(rope, to: queries, cache: cache)
+        keys = applyRotaryPosition(rope, to: keys, cache: cache)
+
+        let output = attentionWithCacheUpdate(
+            queries: queries, keys: keys, values: values,
+            cache: cache, scale: scale, mask: mask
+        )
+        .transposed(0, 2, 1, 3)
+        .reshaped(B, L, -1)
+
+        return wo(output)
+    }
+}
+
+class LlamaMLPTPQ: Module, UnaryLayer {
+
+    @ModuleInfo(key: "gate_proj") var gate: QuantizedAllToShardedLinear
+    @ModuleInfo(key: "down_proj") var down: QuantizedShardedToAllLinear
+    @ModuleInfo(key: "up_proj") var up: QuantizedAllToShardedLinear
+
+    init(
+        _ args: LlamaConfiguration, group: DistributedGroup,
+        groupSize: Int, bits: Int
+    ) throws {
+        self._gate.wrappedValue = try QuantizedAllToShardedLinear(
+            inputDimensions: args.hiddenSize, outputDimensions: args.intermediateSize,
+            bias: args.mlpBias, groupSize: groupSize, bits: bits, group: group)
+        self._down.wrappedValue = try QuantizedShardedToAllLinear(
+            inputDimensions: args.intermediateSize, outputDimensions: args.hiddenSize,
+            bias: args.mlpBias, groupSize: groupSize, bits: bits, group: group)
+        self._up.wrappedValue = try QuantizedAllToShardedLinear(
+            inputDimensions: args.hiddenSize, outputDimensions: args.intermediateSize,
+            bias: args.mlpBias, groupSize: groupSize, bits: bits, group: group)
+    }
+
+    func callAsFunction(_ x: MLXArray) -> MLXArray {
+        let activation = silu(gate(x))
+        return down(activation * up(x))
+    }
+}
+
+class LlamaTransformerBlockTPQ: Module {
+    @ModuleInfo(key: "self_attn") var attention: LlamaAttentionTPQ
+    @ModuleInfo(key: "mlp") var mlp: LlamaMLPTPQ
+
+    @ModuleInfo(key: "input_layernorm") var inputLayerNorm: RMSNorm
+    @ModuleInfo(key: "post_attention_layernorm") var postAttentionLayerNorm: RMSNorm
+
+    init(
+        _ args: LlamaConfiguration, group: DistributedGroup,
+        groupSize: Int, bits: Int
+    ) throws {
+        self._attention.wrappedValue = try LlamaAttentionTPQ(
+            args, group: group, groupSize: groupSize, bits: bits)
+        self._mlp.wrappedValue = try LlamaMLPTPQ(
+            args, group: group, groupSize: groupSize, bits: bits)
+        self._inputLayerNorm.wrappedValue = RMSNorm(
+            dimensions: args.hiddenSize, eps: args.rmsNormEps)
+        self._postAttentionLayerNorm.wrappedValue = RMSNorm(
+            dimensions: args.hiddenSize, eps: args.rmsNormEps)
+    }
+
+    func callAsFunction(
+        _ x: MLXArray, mask: MLXFast.ScaledDotProductAttentionMaskMode, cache: KVCache?
+    ) -> MLXArray {
+        var r = attention(inputLayerNorm(x), mask: mask, cache: cache)
+        let h = x + r
+        r = mlp(postAttentionLayerNorm(h))
+        let out = h + r
+        return out
+    }
+}
+
+public class LlamaModelInnerTPQ: Module {
+
+    @ModuleInfo(key: "embed_tokens") var embedTokens: Embedding
+
+    let layers: [LlamaTransformerBlockTPQ]
+    let norm: RMSNorm
+    let group: DistributedGroup
+
+    init(
+        _ args: LlamaConfiguration, group: DistributedGroup,
+        groupSize: Int, bits: Int
+    ) throws {
+        precondition(args.vocabularySize > 0)
+
+        self.group = group
+        self._embedTokens.wrappedValue = Embedding(
+            embeddingCount: args.vocabularySize, dimensions: args.hiddenSize)
+
+        var layers: [LlamaTransformerBlockTPQ] = []
+        layers.reserveCapacity(args.hiddenLayers)
+        for _ in 0 ..< args.hiddenLayers {
+            layers.append(
+                try LlamaTransformerBlockTPQ(
+                    args, group: group, groupSize: groupSize, bits: bits))
+        }
+        self.layers = layers
+        self.norm = RMSNorm(dimensions: args.hiddenSize, eps: args.rmsNormEps)
+    }
+
+    func callAsFunction(_ inputs: MLXArray, cache: [KVCache]? = nil) -> MLXArray {
+        var h = embedTokens(inputs)
+        let mask = createAttentionMask(h: h, cache: cache?.first)
+        for (i, layer) in layers.enumerated() {
+            h = layer(h, mask: mask, cache: cache?[i])
+        }
+        return norm(h)
+    }
+}
+
+/// Quantized tensor-parallel variant of `LlamaModel`. Same structure as
+/// `LlamaModelTP` but each linear layer is `QuantizedAllToShardedLinear`
+/// or `QuantizedShardedToAllLinear`. Use this for 4-bit MLX checkpoints
+/// (the common production case). Embedding stays unquantized + replicated
+/// to match the upstream `LlamaModel` behavior.
+public class LlamaModelTPQ: Module, LLMModel, KVCacheDimensionProvider {
+
+    public let vocabularySize: Int
+    public let kvHeads: [Int]
+    public let group: DistributedGroup
+    public let groupSize: Int
+    public let bits: Int
+
+    public let model: LlamaModelInnerTPQ
+
+    @ModuleInfo(key: "lm_head") var lmHead: QuantizedLinear?
+
+    public init(
+        _ args: LlamaConfiguration, group: DistributedGroup,
+        groupSize: Int = 64, bits: Int = 4
+    ) throws {
+        self.vocabularySize = args.vocabularySize
+        guard args.kvHeads % group.size == 0 else {
+            throw DistributedError.invalidConfiguration(
+                "kvHeads=\(args.kvHeads) must be divisible by group size \(group.size)")
+        }
+        let localKV = args.kvHeads / group.size
+        self.kvHeads = (0 ..< args.hiddenLayers).map { _ in localKV }
+        self.group = group
+        self.groupSize = groupSize
+        self.bits = bits
+        self.model = try LlamaModelInnerTPQ(
+            args, group: group, groupSize: groupSize, bits: bits)
+        if !args.tieWordEmbeddings {
+            // LM head stays replicated + quantized (no row/column split).
+            // Each rank holds a full QuantizedLinear; cheap to broadcast and
+            // avoids the allreduce / allgather complexity for the last step.
+            self._lmHead.wrappedValue = QuantizedLinear(
+                args.hiddenSize, args.vocabularySize, bias: false,
+                groupSize: groupSize, bits: bits)
+        }
+    }
+
+    public func callAsFunction(_ inputs: MLXArray, cache: [KVCache]?) -> MLXArray {
+        let out = model(inputs, cache: cache)
+        if let lmHead {
+            return lmHead(out)
+        } else {
+            return model.embedTokens.asLinear(out)
+        }
+    }
+
+    /// Slices a 4-bit packed quantized weight tree into per-rank shards.
+    ///
+    /// Packed-uint32 weight layout (MLX 4-bit): `weight` has shape
+    /// `[outDim, inDim/8]` (8 weights per uint32). `scales`/`biases` have
+    /// shape `[outDim, inDim/groupSize]` (one fp16 per quantization group).
+    ///
+    /// Column-parallel (Q/K/V/gate/up): slice all three along axis 0 (outDim).
+    /// Row-parallel (O/down): slice along axis 1. For weight, axis-1 length
+    /// is `inDim/8` so `inDim` must be divisible by `8 × world_size = 16`
+    /// for `world=2`. For scales/biases, axis-1 length is `inDim/groupSize`
+    /// so `inDim` must be divisible by `groupSize × world_size = 128` for
+    /// default groupSize=64, world=2. Llama hidden dims (4096, 8192, ...)
+    /// satisfy both constraints.
+    public func sanitize(weights: [String: MLXArray]) -> [String: MLXArray] {
+        var result = weights.filter {
+            !$0.key.contains("self_attn.rotary_emb.inv_freq")
+        }
+        let world = group.size
+        if world == 1 { return result }
+        let rank = group.rank
+
+        var sliced: [String: MLXArray] = [:]
+        sliced.reserveCapacity(result.count)
+        for (key, value) in result {
+            sliced[key] = LlamaModelTPQ.shardQuantizedWeightIfNeeded(
+                key: key, value: value, rank: rank, world: world)
+        }
+        return sliced
+    }
+
+    /// Slices a single quantized parameter tensor along the right axis
+    /// based on its key. Returns the value unchanged if the key doesn't
+    /// belong to a Q/K/V/O/gate/up/down projection.
+    ///
+    /// Embedding, layernorms, and the (replicated) LM head are always
+    /// returned unchanged.
+    public static func shardQuantizedWeightIfNeeded(
+        key: String, value: MLXArray, rank: Int, world: Int
+    ) -> MLXArray {
+        // Don't shard embedding, layernorms, or the replicated LM head.
+        if key.contains("embed_tokens") || isLayerNormKey(key) || key.contains("lm_head") {
+            return value
+        }
+
+        let isColumnParallel =
+            isProjectionKey(key, name: "q_proj")
+            || isProjectionKey(key, name: "k_proj")
+            || isProjectionKey(key, name: "v_proj")
+            || isProjectionKey(key, name: "gate_proj")
+            || isProjectionKey(key, name: "up_proj")
+
+        let isRowParallel =
+            isProjectionKey(key, name: "o_proj")
+            || isProjectionKey(key, name: "down_proj")
+
+        if isColumnParallel {
+            // Slice along axis 0 (outDim) for weight, scales, biases, bias.
+            let outDim = value.dim(0)
+            precondition(
+                outDim % world == 0,
+                "quantized column-parallel '\(key)' outDim=\(outDim) not divisible by world=\(world)")
+            let shard = outDim / world
+            return value[(rank * shard) ..< ((rank + 1) * shard)]
+        }
+        if isRowParallel {
+            // Bias for row-parallel stays full on every rank — addition after
+            // allSum applies once on the summed result. Match LlamaModelTP's
+            // convention: row-parallel biases pass through unsliced.
+            if key.hasSuffix(".bias") {
+                return value
+            }
+            // Slice along axis 1 (inDim-derived).
+            let secondDim = value.dim(1)
+            precondition(
+                secondDim % world == 0,
+                "quantized row-parallel '\(key)' axis-1=\(secondDim) not divisible by world=\(world)")
+            let shard = secondDim / world
+            return value[0..., (rank * shard) ..< ((rank + 1) * shard)]
+        }
+        return value
+    }
+
+    public func messageGenerator(tokenizer: any Tokenizer) -> any MessageGenerator {
+        do {
+            let probe = [
+                ["role": "system", "content": "test"]
+            ]
+            _ = try tokenizer.applyChatTemplate(messages: probe)
+            return DefaultMessageGenerator()
+        } catch {
+            return NoSystemMessageGenerator()
+        }
+    }
+}
+
+extension LlamaModelTPQ: LoRAModel {
+    public var loraLayers: [Module] { model.layers }
+}
+
+// MARK: - Shared helpers + factory
+
+/// Path-segment-precise check for `*_layernorm` and the final `model.norm`
+/// (so future TP variants that ALSO have `q_norm`/`k_norm` per-head norms
+/// don't get caught by a naive substring match on "norm").
+private func isLayerNormKey(_ key: String) -> Bool {
+    key.contains("input_layernorm")
+        || key.contains("post_attention_layernorm")
+        || key.hasSuffix("model.norm.weight")
+        || key.hasSuffix("model.norm.bias")
+}
+
+/// Returns true iff `key` names a leaf parameter (weight, bias, scales,
+/// biases) of the linear layer named `name`. Uses path-segment boundaries
+/// to avoid false matches (e.g. "q_proj_extra").
+private func isProjectionKey(_ key: String, name: String) -> Bool {
+    key.contains(".\(name).weight")
+        || key.contains(".\(name).bias")
+        || key.contains(".\(name).scales")
+        || key.contains(".\(name).biases")
+}
+
+/// Build the right TP variant for the model based on quantization config.
+///
+/// - Parameters:
+///   - args: Llama configuration parsed from `config.json`.
+///   - quantization: Quantization config from the checkpoint (`nil` for
+///     fp16/bf16 checkpoints; non-nil for MLX 4-bit / 8-bit quantized).
+///   - group: The distributed group across which to shard.
+///
+/// - Returns: An opaque `LLMModel` that's either `LlamaModelTP` (fp16) or
+///   `LlamaModelTPQuantized` based on `quantization`. Caller doesn't have to
+///   downcast — the LLMModel protocol covers everything needed for inference.
+public func makeLlamaTP(
+    args: LlamaConfiguration,
+    quantization: BaseConfiguration.Quantization?,
+    group: DistributedGroup
+) throws -> any LLMModel {
+    if let q = quantization {
+        return try LlamaModelTPQ(args, group: group, groupSize: q.groupSize, bits: q.bits)
+    }
+    return try LlamaModelTP(args, group: group)
+}
diff --git a/Package.swift b/Package.swift
index d9f7e3a23..4d267bc89 100644
--- a/Package.swift
+++ b/Package.swift
@@ -120,6 +120,7 @@ let package = Package(
                 .product(name: "MLX", package: "mlx-swift"),
                 .product(name: "MLXNN", package: "mlx-swift"),
                 .product(name: "MLXOptimizers", package: "mlx-swift"),
+                .product(name: "MLXRandom", package: "mlx-swift"),
                 "MLXLMCommon",
                 "MLXLLM",
                 "MLXVLM",
diff --git a/Tests/MLXLMTests/LlamaTPTests.swift b/Tests/MLXLMTests/LlamaTPTests.swift
index 12272ee47..d7d8e0ded 100644
--- a/Tests/MLXLMTests/LlamaTPTests.swift
+++ b/Tests/MLXLMTests/LlamaTPTests.swift
@@ -5,6 +5,7 @@ import MLX
 import MLXLLM
 import MLXLMCommon
 import MLXNN
+import MLXRandom
 import XCTest
 
 /// Numerical-equivalence and shape tests for `LlamaModelTP`.
@@ -172,4 +173,236 @@ public class LlamaTPTests: XCTestCase {
             value: weight, rank: 1, world: 2)
         XCTAssertEqual(rank1.shape, [64])
     }
+
+    /// Future-proofing: q_norm / k_norm (per-head norms used in Qwen3 etc.)
+    /// must NOT be matched by the "norm" check in shardWeightIfNeeded —
+    /// they SHOULD be sharded along the head dimension. The current Llama
+    /// implementation doesn't have these keys, but the precise key match
+    /// guards against future TP variants pulling in this code pattern.
+    func testShardWeightIfNeededDoesNotConfusePerHeadNorms() {
+        let weight = MLXArray(0 ..< 128, [128]).asType(.float32)
+        // A q_norm.weight should NOT trip the layernorm gate — it would fall
+        // through to the bottom and pass through unchanged because q_norm
+        // isn't in the column-parallel or row-parallel projection list.
+        let result = LlamaModelTP.shardWeightIfNeeded(
+            key: "model.layers.0.self_attn.q_norm.weight",
+            value: weight, rank: 0, world: 2)
+        // Today: pass-through (no q_norm handling for Llama). The point of
+        // this test is the layernorm gate doesn't INCORRECTLY swallow it.
+        XCTAssertEqual(result.shape, [128])
+    }
+
+    // MARK: - Manual multi-rank sharding validation
+    //
+    // Real multi-rank DistributedGroup tests need spawned subprocesses (jaccl
+    // or ring backend). Within a single process we can still validate the
+    // SHARDING MATH by manually slicing weights and running the matmuls /
+    // allsum on the test side — bypassing AllToShardedLinear's internal
+    // DistributedGroup call. This catches axis-confusion bugs in sanitize
+    // and validates that "sharded column matmul + sharded row matmul + sum"
+    // equals the unsharded reference, which is the load-bearing invariant
+    // of TP correctness.
+
+    /// Builds a fake 2-rank Q×K-style column-parallel matmul: each rank
+    /// holds a row-shard of the weight, multiplies the full input by its
+    /// shard, and the result concatenates along the head dim. Compare to
+    /// the unsharded reference.
+    func testColumnParallelMatmulShardingMath() {
+        let inDim = 32, outDim = 64, B = 1, L = 4
+        let weight = MLXRandom.normal([outDim, inDim])
+        let input = MLXRandom.normal([B, L, inDim])
+
+        // Unsharded reference: input @ weight.T → [B, L, outDim]
+        let reference = matmul(input, weight.T)
+        XCTAssertEqual(reference.shape, [B, L, outDim])
+
+        // Simulated rank-0 / rank-1 shards via shardWeightIfNeeded — the
+        // SAME function the production sanitize path uses.
+        let key = "model.layers.0.self_attn.q_proj.weight"
+        let w0 = LlamaModelTP.shardWeightIfNeeded(key: key, value: weight, rank: 0, world: 2)
+        let w1 = LlamaModelTP.shardWeightIfNeeded(key: key, value: weight, rank: 1, world: 2)
+        XCTAssertEqual(w0.shape, [outDim / 2, inDim])
+        XCTAssertEqual(w1.shape, [outDim / 2, inDim])
+
+        // Each rank multiplies the FULL input by its row-shard.
+        // Output shape per rank: [B, L, outDim/2].
+        let y0 = matmul(input, w0.T)
+        let y1 = matmul(input, w1.T)
+        XCTAssertEqual(y0.shape, [B, L, outDim / 2])
+        XCTAssertEqual(y1.shape, [B, L, outDim / 2])
+
+        // For column-parallel the next stage operates on sharded outputs
+        // independently (no allreduce). To validate against the reference,
+        // concatenate along the last dim.
+        let combined = concatenated([y0, y1], axis: -1)
+        XCTAssertEqual(combined.shape, [B, L, outDim])
+
+        let diff = (combined - reference).abs().max().item(Float.self)
+        XCTAssertLessThan(
+            diff, 1e-4,
+            "column-parallel sharded matmul diverged from unsharded reference by max=\(diff)")
+    }
+
+    /// Builds a row-parallel matmul: input is sharded along the input dim
+    /// (each rank holds half the input width), each rank multiplies its
+    /// input shard by its weight column-shard, then results are SUMMED
+    /// (this is what ShardedToAllLinear.allSum does internally).
+    func testRowParallelMatmulShardingMath() {
+        let inDim = 64, outDim = 32, B = 1, L = 4
+        let weight = MLXRandom.normal([outDim, inDim])
+        let input = MLXRandom.normal([B, L, inDim])
+
+        // Unsharded reference.
+        let reference = matmul(input, weight.T)
+        XCTAssertEqual(reference.shape, [B, L, outDim])
+
+        // Slice the weight along axis 1 (input dim).
+        let key = "model.layers.0.self_attn.o_proj.weight"
+        let w0 = LlamaModelTP.shardWeightIfNeeded(key: key, value: weight, rank: 0, world: 2)
+        let w1 = LlamaModelTP.shardWeightIfNeeded(key: key, value: weight, rank: 1, world: 2)
+        XCTAssertEqual(w0.shape, [outDim, inDim / 2])
+        XCTAssertEqual(w1.shape, [outDim, inDim / 2])
+
+        // Each rank's input is the corresponding axis-2 slice. (In real TP,
+        // the upstream layer produces per-rank sharded outputs that feed
+        // into this layer's input.)
+        let x0 = input[0..., 0..., 0 ..< (inDim / 2)]
+        let x1 = input[0..., 0..., (inDim / 2) ..< inDim]
+
+        // Each rank: partial output [B, L, outDim].
+        let partial0 = matmul(x0, w0.T)
+        let partial1 = matmul(x1, w1.T)
+        XCTAssertEqual(partial0.shape, [B, L, outDim])
+        XCTAssertEqual(partial1.shape, [B, L, outDim])
+
+        // allSum simulation: sum across ranks.
+        let combined = partial0 + partial1
+        let diff = (combined - reference).abs().max().item(Float.self)
+        XCTAssertLessThan(
+            diff, 1e-4,
+            "row-parallel sharded matmul (with simulated allSum) diverged from unsharded reference by max=\(diff)")
+    }
+
+    /// Combined column-parallel + row-parallel pipeline mirrors how an
+    /// attention or MLP block does TP: column-parallel matmul → some op
+    /// (here a no-op identity for testing) → row-parallel matmul with
+    /// allSum. This is the end-to-end TP correctness invariant.
+    func testColumnThenRowParallelEndToEndShardingMath() {
+        let inDim = 32, midDim = 64, outDim = 16, B = 1, L = 4
+        let w1 = MLXRandom.normal([midDim, inDim])
+        let w2 = MLXRandom.normal([outDim, midDim])
+        let input = MLXRandom.normal([B, L, inDim])
+
+        // Unsharded reference: input @ w1.T → mid → mid @ w2.T → out
+        let mid = matmul(input, w1.T)
+        let reference = matmul(mid, w2.T)
+        XCTAssertEqual(reference.shape, [B, L, outDim])
+
+        // Stage 1 (column-parallel): shard w1 along axis 0.
+        let w1Key = "model.layers.0.mlp.gate_proj.weight"
+        let w1a = LlamaModelTP.shardWeightIfNeeded(key: w1Key, value: w1, rank: 0, world: 2)
+        let w1b = LlamaModelTP.shardWeightIfNeeded(key: w1Key, value: w1, rank: 1, world: 2)
+        let mid0 = matmul(input, w1a.T)  // [B, L, midDim/2]
+        let mid1 = matmul(input, w1b.T)  // [B, L, midDim/2]
+
+        // Stage 2 (row-parallel): shard w2 along axis 1, each rank takes
+        // the matching input shard from stage 1's sharded output. No
+        // explicit "gather + slice" — the sharded layout flows through.
+        let w2Key = "model.layers.0.mlp.down_proj.weight"
+        let w2a = LlamaModelTP.shardWeightIfNeeded(key: w2Key, value: w2, rank: 0, world: 2)
+        let w2b = LlamaModelTP.shardWeightIfNeeded(key: w2Key, value: w2, rank: 1, world: 2)
+        let partial0 = matmul(mid0, w2a.T)  // [B, L, outDim]
+        let partial1 = matmul(mid1, w2b.T)  // [B, L, outDim]
+
+        let combined = partial0 + partial1
+        let diff = (combined - reference).abs().max().item(Float.self)
+        XCTAssertLessThan(
+            diff, 1e-4,
+            "column-then-row sharded pipeline diverged from unsharded reference by max=\(diff)")
+    }
+
+    // MARK: - LlamaModelTPQ (quantized) basic sanity
+
+    func testLlamaModelTPQInstantiates() throws {
+        let config = smallConfig()
+        let model = try LlamaModelTPQ(
+            config, group: singletonGroup(), groupSize: 64, bits: 4)
+        // 4 layers in smallConfig; singleton group keeps kvHeads at the global
+        // count (4 per layer).
+        XCTAssertEqual(model.kvHeads.count, 4)
+        XCTAssertEqual(model.kvHeads[0], 4)
+    }
+
+    func testMakeLlamaTPDispatchesByQuantizationConfig() throws {
+        let config = smallConfig()
+        let unquantized = try makeLlamaTP(args: config, quantization: nil, group: singletonGroup())
+        // Quantization nil → fp16 path.
+        XCTAssertTrue(unquantized is LlamaModelTP)
+
+        let q = BaseConfiguration.Quantization(groupSize: 64, bits: 4)
+        let quantized = try makeLlamaTP(args: config, quantization: q, group: singletonGroup())
+        XCTAssertTrue(quantized is LlamaModelTPQ)
+    }
+
+    /// Quantized weight slicing: weight has shape [outDim, inDim/8]; scales
+    /// and biases have shape [outDim, inDim/groupSize]. Column-parallel
+    /// slices all three along axis 0. Row-parallel slices along axis 1.
+    func testShardQuantizedWeightIfNeededColumnParallel() {
+        let outDim = 16, packedInDim = 8  // inDim=64, /8=8
+        let weight = MLXArray(0 ..< outDim * packedInDim, [outDim, packedInDim])
+            .asType(.uint32)
+        let scales = MLXRandom.normal([outDim, 1])  // groupSize=64, one group per row
+        let biases = MLXRandom.normal([outDim, 1])
+
+        let baseKey = "model.layers.0.self_attn.q_proj"
+        let w0 = LlamaModelTPQ.shardQuantizedWeightIfNeeded(
+            key: "\(baseKey).weight", value: weight, rank: 0, world: 2)
+        let w1 = LlamaModelTPQ.shardQuantizedWeightIfNeeded(
+            key: "\(baseKey).weight", value: weight, rank: 1, world: 2)
+        let s0 = LlamaModelTPQ.shardQuantizedWeightIfNeeded(
+            key: "\(baseKey).scales", value: scales, rank: 0, world: 2)
+        let b0 = LlamaModelTPQ.shardQuantizedWeightIfNeeded(
+            key: "\(baseKey).biases", value: biases, rank: 0, world: 2)
+
+        XCTAssertEqual(w0.shape, [outDim / 2, packedInDim])
+        XCTAssertEqual(w1.shape, [outDim / 2, packedInDim])
+        XCTAssertEqual(s0.shape, [outDim / 2, 1])
+        XCTAssertEqual(b0.shape, [outDim / 2, 1])
+    }
+
+    func testShardQuantizedWeightIfNeededRowParallel() {
+        let outDim = 16, packedInDim = 8
+        let weight = MLXArray(0 ..< outDim * packedInDim, [outDim, packedInDim])
+            .asType(.uint32)
+        let scales = MLXRandom.normal([outDim, 2])  // 2 groups per row
+
+        let baseKey = "model.layers.0.self_attn.o_proj"
+        let w0 = LlamaModelTPQ.shardQuantizedWeightIfNeeded(
+            key: "\(baseKey).weight", value: weight, rank: 0, world: 2)
+        let w1 = LlamaModelTPQ.shardQuantizedWeightIfNeeded(
+            key: "\(baseKey).weight", value: weight, rank: 1, world: 2)
+        let s0 = LlamaModelTPQ.shardQuantizedWeightIfNeeded(
+            key: "\(baseKey).scales", value: scales, rank: 0, world: 2)
+
+        XCTAssertEqual(w0.shape, [outDim, packedInDim / 2])
+        XCTAssertEqual(w1.shape, [outDim, packedInDim / 2])
+        XCTAssertEqual(s0.shape, [outDim, 1])
+    }
+
+    func testShardQuantizedWeightIfNeededEmbeddingNotSharded() {
+        let weight = MLXRandom.normal([128, 64])
+        let r0 = LlamaModelTPQ.shardQuantizedWeightIfNeeded(
+            key: "model.embed_tokens.weight", value: weight, rank: 0, world: 2)
+        XCTAssertEqual(r0.shape, [128, 64])
+    }
+
+    /// Row-parallel biases must NOT be sliced (per LlamaModelTPQ comments).
+    /// They're added once after the allreduce.
+    func testShardQuantizedRowParallelBiasNotSliced() {
+        let bias = MLXRandom.normal([64])
+        let r0 = LlamaModelTPQ.shardQuantizedWeightIfNeeded(
+            key: "model.layers.0.self_attn.o_proj.bias",
+            value: bias, rank: 0, world: 2)
+        XCTAssertEqual(r0.shape, [64])
+    }
 }

From 735c28ac0e5e915b3d0e09e470efbd72a8e3472e Mon Sep 17 00:00:00 2001
From: anupsv <6407789+anupsv@users.noreply.github.com>
Date: Thu, 21 May 2026 00:13:46 -0700
Subject: [PATCH 3/3] Fix misleading LoRA conformance + test name on
 LlamaModelTP
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

LoRA: LLMModel protocol requires LoRAModel conformance, but TP+LoRA
isn't actually supported. MLXLLM's LoRA training code walks loraLayers
looking for Linear sub-modules to wrap with adapters; our sharded
variants (AllToShardedLinear, QuantizedAllToShardedLinear, etc.) are
NOT Linear subclasses so adapter insertion either no-ops silently or
hits surprising shape errors at train time.

Both LlamaModelTP and LlamaModelTPQ now return `[]` from loraLayers
with a docstring pointing at LlamaModel (single-rank) for LoRA
workflows. Sharded LoRA is its own research problem.

Test: testLlamaTPRejectsNonDivisibleHeads was misleading — it actually
asserted XCTAssertNoThrow because 7 % 1 == 0 on a singleton group.
Renamed to testLlamaTPAcceptsAnyHeadCountOnSingletonGroup so the name
matches what's tested. Real world>=2 rejection paths are exercised
end-to-end by the 2-Mac smoke test in d-inference, not here.
---
 Libraries/MLXLLM/Models/LlamaTP.swift | 19 +++++++++++++++----
 Tests/MLXLMTests/LlamaTPTests.swift   | 20 ++++++++------------
 2 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/Libraries/MLXLLM/Models/LlamaTP.swift b/Libraries/MLXLLM/Models/LlamaTP.swift
index d264c16c5..ee797a0bc 100644
--- a/Libraries/MLXLLM/Models/LlamaTP.swift
+++ b/Libraries/MLXLLM/Models/LlamaTP.swift
@@ -347,10 +347,19 @@ public class LlamaModelTP: Module, LLMModel, KVCacheDimensionProvider {
 
 // MARK: - LoRA
 
+// LoRAModel conformance is required by LLMModel but TP+LoRA isn't actually
+// supported. The MLXLLM LoRA training code walks `loraLayers` looking for
+// `Linear` sub-modules to wrap with adapters; our sharded variants
+// (`AllToShardedLinear` etc.) are NOT `Linear` subclasses, so adapter
+// insertion either no-ops silently or hits surprising shape errors at
+// train time. Returning `[]` makes the limitation explicit: LoRA over a
+// TP model produces zero adapters, which is more honest than handing back
+// transformer layers that LoRA can't actually adapt.
+//
+// If you need LoRA, use `LlamaModel` (single-rank) and accept the memory
+// cost. Sharded LoRA is its own research problem (Q-LoRA over TP, etc.).
 extension LlamaModelTP: LoRAModel {
-    public var loraLayers: [Module] {
-        model.layers
-    }
+    public var loraLayers: [Module] { [] }
 }
 
 // MARK: - Quantized TP variant
@@ -695,8 +704,10 @@ public class LlamaModelTPQ: Module, LLMModel, KVCacheDimensionProvider {
     }
 }
 
+// LoRA is not supported on the quantized TP path either — see the
+// docstring on `LlamaModelTP`'s extension above for the rationale.
 extension LlamaModelTPQ: LoRAModel {
-    public var loraLayers: [Module] { model.layers }
+    public var loraLayers: [Module] { [] }
 }
 
 // MARK: - Shared helpers + factory
diff --git a/Tests/MLXLMTests/LlamaTPTests.swift b/Tests/MLXLMTests/LlamaTPTests.swift
index d7d8e0ded..039fa68b0 100644
--- a/Tests/MLXLMTests/LlamaTPTests.swift
+++ b/Tests/MLXLMTests/LlamaTPTests.swift
@@ -106,21 +106,17 @@ public class LlamaTPTests: XCTestCase {
             "TP(size=1) output diverged from LlamaModel by max=\(diffValue)")
     }
 
-    /// Shape validation: TP refuses to initialize if heads don't divide evenly.
-    func testLlamaTPRejectsNonDivisibleHeads() {
-        // Force a 2-rank "fake" by directly testing the validation logic via
-        // the helper — we can't easily construct a size-2 group in-process
-        // without a real distributed backend.
-        var config = smallConfig()
-        config = LlamaConfiguration(
+    /// Divisibility-check happy path: an odd-head config trivially passes the
+    /// `heads % group.size == 0` gate on a singleton group (7 % 1 == 0). This
+    /// documents that the check uses % (not strict equality), so size-1 will
+    /// accept any positive head count. Real `world >= 2` rejection paths are
+    /// exercised end-to-end by the 2-Mac smoke test in d-inference, not here.
+    func testLlamaTPAcceptsAnyHeadCountOnSingletonGroup() {
+        let config = LlamaConfiguration(
             hiddenSize: 64, hiddenLayers: 2, intermediateSize: 256,
-            attentionHeads: 7,  // not divisible by 2
+            attentionHeads: 7,
             rmsNormEps: 1e-5, vocabularySize: 128, kvHeads: 4
         )
-
-        // With size=1 group, divisibility check is trivially satisfied (7 % 1 = 0),
-        // so this test only documents that we CAN construct on size=1. Real
-        // size-2 rejection is exercised by the 2-Mac smoke test in d-inference.
         XCTAssertNoThrow(try LlamaModelTP(config, group: singletonGroup()))
     }