diff --git a/docs/gpu_backend_plan.md b/docs/gpu_backend_plan.md new file mode 100644 index 00000000..a43dbf80 --- /dev/null +++ b/docs/gpu_backend_plan.md @@ -0,0 +1,381 @@ +# GPU Backend Plan for PTO Tile Lib + +## Selected Decisions (2026-03-31) + +- Git identity: use the already configured global Git identity +- Backend substrate: **CUDA C++ + inline PTX for hot kernels** +- Architecture scope: **keep all scaffolded SM dirs, implement `sm121` first** +- First implementation slice: **mixed** (`TLOAD`, `TSTORE`, `TADD`, `TMATMUL`) +- Event semantics in v1: **defer advanced semantics until compute kernels work** + +## Current Repo Analysis + +### 1. Existing backend split + +The public entry point is `include/pto/pto-inst.hpp`, which pulls in: + +- shared tile/type/instruction declarations from `include/pto/common/` +- CPU simulator support through `__CPU_SIM` +- NPU implementations through `include/pto/common/arch_macro.hpp` and `include/pto/common/pto_instr_impl.hpp` + +Today the repo has three real backend families: + +- **CPU simulator**: header-only reference/simulation path +- **NPU A2/A3**: `include/pto/npu/a2a3/` +- **NPU A5**: `include/pto/npu/a5/` + +There is **no NVIDIA GPU backend yet**: + +- no `include/pto/gpu/` +- no GPU dispatch macro in `arch_macro.hpp` +- no GPU include block in `pto_instr_impl.hpp` +- no `tests/gpu/` +- no `kernels/manual/gpu/` + +### 2. What the existing backends tell us + +#### CPU backend + +The CPU path is the functional/spec reference layer: + +- broad instruction surface area +- flexible tile/layout checks +- useful for correctness and fallback semantics +- `TSYNC` is effectively a no-op in CPU simulation + +This is the best source of **semantic truth** for a future GPU backend. + +#### NPU A2/A3 backend + +The A2/A3 path is the first real hardware mapping layer: + +- instruction-specific files such as `TAdd.hpp`, `TLoad.hpp`, `TMatmul.hpp` +- explicit pipeline/event handling +- memory-move kernels mapped to hardware copy instructions +- vector and cube paths separated by tile/layout constraints + +This is the best source of **how PTO instructions get lowered to hardware micro-ops**. + +#### NPU A5 backend + +The A5 path extends the same architecture with: + +- richer data type support +- more advanced matmul variants (FP8 / MX / FP4-related handling) +- evolved sync/event behavior +- more specialized utility helpers and hardware-specific tuning + +This is the best source of **how backend specialization grows with hardware generations**. + +### 3. Architectural implications for a GPU backend + +A CUDA/NVIDIA backend should not start as one monolithic implementation. It should follow the repo's existing pattern: + +- shared PTO API remains in `include/pto/common/` +- GPU-specific lowering lives under `include/pto/gpu/` +- arch-specific overrides live in per-SM subdirectories +- common CUDA helpers, tile iterators, layout adapters, and launch helpers live in `include/pto/gpu/common/` + +### 4. Backend bring-up priorities + +To make a GPU backend useful early, the instruction bring-up order should be: + +1. **Dispatch and toolchain plumbing** +2. **Memory movement**: `TLOAD`, `TSTORE`, `TPREFETCH` +3. **Elementwise vector ops**: `TADD`, `TSUB`, `TMUL`, `TMAX`, `TMIN`, `TEXPANDS`, comparisons/selects +4. **Reductions / reshapes / transforms** +5. **Matrix kernels**: `TMATMUL`, `TMATMUL_ACC`, `TMATMUL_BIAS`, `TMATMUL_MX` +6. **Synchronization / event semantics** +7. 
**Collective / comm semantics**, if GPU multi-device becomes in scope + +--- + +## Proposed Directory Layout + +```text +include/pto/gpu/ + README.md + common/ + README.md + sm70/ + README.md + microkernels/ + README.md + sm75/ + README.md + microkernels/ + README.md + sm80/ + README.md + microkernels/ + README.md + sm86/ + README.md + microkernels/ + README.md + sm87/ + README.md + microkernels/ + README.md + sm89/ + README.md + microkernels/ + README.md + sm90/ + README.md + microkernels/ + README.md + sm90a/ + README.md + microkernels/ + README.md + sm100/ + README.md + microkernels/ + README.md + sm100a/ + README.md + microkernels/ + README.md + sm103/ + README.md + microkernels/ + README.md + sm110/ + README.md + microkernels/ + README.md + sm120/ + README.md + microkernels/ + README.md + sm121/ + README.md + microkernels/ + README.md + +tests/gpu/ + README.md + +kernels/manual/gpu/ + README.md +``` + +### Why these SM targets + +These folders cover the modern NVIDIA targets that matter for PTO-style tile programming and current deployment: + +- `sm70`: Volta +- `sm75`: Turing +- `sm80`: Ampere datacenter +- `sm86`: Ampere client/workstation +- `sm87`: Orin +- `sm89`: Ada +- `sm90`: Hopper +- `sm90a`: Hopper architecture-conditional specialization +- `sm100`: Blackwell +- `sm100a`: Blackwell architecture-conditional specialization +- `sm103`: GB300 / B300 generation +- `sm110`: Thor / Jetson T-series generation +- `sm120`: Blackwell workstation / GeForce generation +- `sm121`: GB10 (DGX Spark) + +This machine is **GB10 / compute capability 12.1**, so `sm121` should be the first optimization target. + +--- + +## Phase Plan + +### Phase 0 — Scope freeze and backend contract + +**Goal:** Decide what “GPU backend” means in this repo before writing kernels. + +#### Checklist + +- [ ] Decide whether the first GPU backend is: + - [ ] correctness-first + - [ ] performance-first + - [ ] hybrid (correctness-first, optimize hot instructions first) +- [ ] Decide the implementation substrate: + - [ ] CUDA C++ + - [ ] CUDA C++ + inline PTX + - [ ] CUDA C++ + CUTLASS/CuTe + - [ ] CUDA C++ + generated microkernels +- [ ] Decide the minimum supported toolkit/driver baseline +- [ ] Decide the minimum supported GPU baseline (`sm80+` only vs broad support) +- [ ] Decide whether GPU backend must preserve exact PTO event semantics or allow a GPU-specific semantic subset first +- [ ] Decide whether multi-GPU comm is in phase 1 or deferred + +### Phase 1 — Dispatch, macros, and build plumbing + +**Goal:** Make GPU a first-class backend selection path. + +#### Checklist + +- [ ] Add GPU arch detection / selection macros +- [ ] Define a backend macro family such as: + - [ ] `PTO_GPU_BACKEND` + - [ ] `PTO_GPU_SM80` + - [ ] `PTO_GPU_SM90` + - [ ] `PTO_GPU_SM121` +- [ ] Add GPU include blocks to `include/pto/common/pto_instr_impl.hpp` +- [ ] Decide whether `include/pto/pto-inst.hpp` should expose GPU path under: + - [ ] `__CUDACC__` + - [ ] a project-defined macro + - [ ] both +- [ ] Introduce initial CMake path for GPU tests/builds +- [ ] Add one minimal compile-only smoke target for GPU backend headers + +### Phase 2 — Memory model and tile mapping + +**Goal:** Define how PTO tiles live on NVIDIA GPUs. 
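Before the checklist, a minimal illustrative sketch of the kind of mapping this phase is about, assuming a plain global-to-shared staging step; `StageTileToShared` and its template parameters are placeholders, not part of the current PTO API:

```cpp
// Illustrative only: one way a PTO tile could be staged from global memory into a
// block's shared-memory buffer before compute. Names here are placeholders.
template <typename T, int Rows, int Cols>
__device__ void StageTileToShared(const T* __restrict__ globalSrc, T* __restrict__ sharedDst)
{
    const int tid = threadIdx.y * blockDim.x + threadIdx.x;
    const int threads = blockDim.x * blockDim.y;
    // Stride over the tile elements by the block's thread count so any block shape works.
    for (int idx = tid; idx < Rows * Cols; idx += threads) {
        sharedDst[idx] = globalSrc[idx];  // plain row-major copy, no swizzle yet
    }
    __syncthreads();  // tile is now visible to every thread in the block
}
```

The checklist below tracks the decisions needed to turn that kind of ad-hoc copy into the backend's actual tile ownership and layout rules.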
+ +#### Checklist + +- [ ] Map PTO tile memory spaces onto CUDA memory spaces: + - [ ] global memory + - [ ] shared memory + - [ ] registers + - [ ] tensor memory / async staging abstractions where needed +- [ ] Define row-major / col-major / NZ-like tile layout adapters on GPU +- [ ] Decide whether PTO fractal layouts are represented as: + - [ ] compile-time layout descriptors + - [ ] iterator objects + - [ ] shared-memory swizzles +- [ ] Implement common helper layer in `include/pto/gpu/common/` +- [ ] Document tile-to-warp / tile-to-warpgroup ownership rules + +### Phase 3 — Memory instructions first + +**Goal:** Make data movement work before math. + +#### Checklist + +- [ ] Implement `TLOAD` +- [ ] Implement `TSTORE` +- [ ] Implement `TPREFETCH` +- [ ] Validate padding, stride, and layout conversions +- [ ] Add tests for ND / DN / NZ-like cases +- [ ] Add baseline performance checks for bandwidth-sensitive paths + +### Phase 4 — Core elementwise backend + +**Goal:** Bring up the wide surface-area instructions with reusable microkernel templates. + +#### Checklist + +- [ ] Implement binary elementwise template family +- [ ] Implement unary elementwise template family +- [ ] Prioritize: + - [ ] `TADD` + - [ ] `TSUB` + - [ ] `TMUL` + - [ ] `TDIV` + - [ ] `TMIN` + - [ ] `TMAX` + - [ ] `TCMP` / `TCMPS` + - [ ] `TSEL` / `TSELS` + - [ ] `TEXP` / `TLOG` / `TRSQRT` / `TSQRT` +- [ ] Split generic implementations from per-SM overrides +- [ ] Add correctness tests against CPU simulator outputs + +### Phase 5 — Reduction, reshape, and transform ops + +**Goal:** Cover the shape-changing and reduction-heavy instructions. + +#### Checklist + +- [ ] `TRow*` family +- [ ] `TCol*` family +- [ ] `TRESHAPE` +- [ ] `TTRANS` +- [ ] `TEXTRACT` +- [ ] `TFILLPAD` +- [ ] `TGATHER` / `TSCATTER` variants where practical +- [ ] Decide warp-level vs block-level reduction strategies per SM + +### Phase 6 — Matrix and tensor-core path + +**Goal:** Make PTO meaningful for high-performance GPU kernels. + +#### Checklist + +- [ ] Implement a generic `TMATMUL` path first +- [ ] Add accumulator forms: + - [ ] `TMATMUL_ACC` + - [ ] `TMATMUL_BIAS` + - [ ] `TGEMV` +- [ ] Decide whether `TMATMUL_MX` is phase-1 GPU scope or deferred +- [ ] Create per-SM tensor-core microkernel registry +- [ ] Define tile shape families by SM +- [ ] Add tuning metadata per instruction / dtype / tile shape / SM +- [ ] Prioritize `sm121`, `sm120`, `sm90`, `sm89`, `sm80` + +### Phase 7 — Sync/event semantics on GPU + +**Goal:** Reconcile PTO pipeline/event model with CUDA execution semantics. + +#### Checklist + +- [ ] Define what `TSYNC` means on GPU +- [ ] Decide whether `Event` is: + - [ ] fully modeled + - [ ] partially modeled + - [ ] translated into CUDA barriers / named barriers / cooperative groups +- [ ] Document unsupported or approximated event cases +- [ ] Add tests for ordering-sensitive instruction sequences + +### Phase 8 — Tests, perf harness, and examples + +**Goal:** Make the backend maintainable. + +#### Checklist + +- [ ] Add `tests/gpu/` structure +- [ ] Add CPU-vs-GPU oracle tests +- [ ] Add instruction-level microbenchmarks +- [ ] Add kernel-level examples under `kernels/manual/gpu/` +- [ ] Add CI matrix for at least compile coverage, even if hardware CI is unavailable +- [ ] Add documentation for supported instructions per SM + +--- + +## Recommended first implementation slice + +If the goal is fast progress with good signal, the first real slice should be: + +1. `sm121` + `sm120` + `sm90` shared bring-up path +2. 
`TLOAD` +3. `TSTORE` +4. `TADD` +5. `TMUL` +6. `TMAX` +7. `TMATMUL` +8. basic correctness tests vs CPU simulator + +That gets the backend from “directory scaffold” to “real execution path” with a minimal but meaningful kernel set. + +--- + +## Open design questions for the next step + +1. **Backend substrate** + - Option A: plain CUDA C++ first + - Option B: CUDA + inline PTX for hot kernels + - Option C: CUDA + CUTLASS/CuTe for matmul, custom kernels for the rest + +2. **Initial architecture scope** + - Option A: `sm121` only + - Option B: `sm121` + `sm120` + `sm90` + - Option C: all scaffolded SMs, but only `sm121` gets real kernels first + +3. **Bring-up priority** + - Option A: memory ops first + - Option B: matmul first + - Option C: elementwise first + - Option D: mixed (`TLOAD/TSTORE/TADD/TMATMUL`) + +4. **How strict should PTO event semantics be in v1?** + - Option A: exact where possible, fail-fast otherwise + - Option B: approximate with documented caveats + - Option C: defer advanced event semantics until after compute kernels land diff --git a/include/pto/common/arch_macro.hpp b/include/pto/common/arch_macro.hpp index b1eda679..73dd31ac 100644 --- a/include/pto/common/arch_macro.hpp +++ b/include/pto/common/arch_macro.hpp @@ -11,6 +11,38 @@ See LICENSE in the root of the software repository for the full text of the Lice #ifndef ARCH_MACRO_HPP #define ARCH_MACRO_HPP +#if defined(PTO_GPU_BACKEND) +#define PTO_COMM_NOT_SUPPORTED + +#if defined(__CUDA_ARCH__) +#if __CUDA_ARCH__ >= 1210 +#define PTO_GPU_SM121 +#elif __CUDA_ARCH__ >= 1200 +#define PTO_GPU_SM120 +#elif __CUDA_ARCH__ >= 1100 +#define PTO_GPU_SM110 +#elif __CUDA_ARCH__ >= 1030 +#define PTO_GPU_SM103 +#elif __CUDA_ARCH__ >= 1000 +#define PTO_GPU_SM100 +#elif __CUDA_ARCH__ >= 900 +#define PTO_GPU_SM90 +#elif __CUDA_ARCH__ >= 890 +#define PTO_GPU_SM89 +#elif __CUDA_ARCH__ >= 870 +#define PTO_GPU_SM87 +#elif __CUDA_ARCH__ >= 860 +#define PTO_GPU_SM86 +#elif __CUDA_ARCH__ >= 800 +#define PTO_GPU_SM80 +#elif __CUDA_ARCH__ >= 750 +#define PTO_GPU_SM75 +#elif __CUDA_ARCH__ >= 700 +#define PTO_GPU_SM70 +#endif +#endif +#endif + #if __NPU_ARCH__ == 2201 #define PTO_NPU_ARCH_A2A3 #elif (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 3510) diff --git a/include/pto/common/buffer_limits.hpp b/include/pto/common/buffer_limits.hpp index 8802d8c1..f0af220a 100644 --- a/include/pto/common/buffer_limits.hpp +++ b/include/pto/common/buffer_limits.hpp @@ -35,6 +35,8 @@ See LICENSE in the root of the software repository for the full text of the Lice #define PTO_UBUF_SIZE_BYTES (192u * 1024u) #elif defined(PTO_NPU_ARCH_KIRIN9030) || defined(PTO_NPU_ARCH_KIRINX90) #define PTO_UBUF_SIZE_BYTES (128u * 1024u) +#elif defined(PTO_GPU_BACKEND) +#define PTO_UBUF_SIZE_BYTES (256u * 1024u) #else #error \ "PTO_UBUF_SIZE_BYTES: unknown NPU architecture. Define one of PTO_NPU_ARCH_{A2A3,A5,KIRIN9030,KIRINX90} or set PTO_UBUF_SIZE_BYTES manually." @@ -51,6 +53,8 @@ See LICENSE in the root of the software repository for the full text of the Lice #define PTO_CBUF_SIZE_BYTES (1024u * 1024u) #elif defined(PTO_NPU_ARCH_A2A3) || defined(PTO_NPU_ARCH_A5) || defined(PTO_NPU_ARCH_KIRIN9030) #define PTO_CBUF_SIZE_BYTES (512u * 1024u) +#elif defined(PTO_GPU_BACKEND) +#define PTO_CBUF_SIZE_BYTES (256u * 1024u) #else #error \ "PTO_CBUF_SIZE_BYTES: unknown NPU architecture. Define one of PTO_NPU_ARCH_{A2A3,A5,KIRIN9030,KIRINX90} or set PTO_CBUF_SIZE_BYTES manually." 
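A brief aside before the remaining buffer-limit hunks: the `PTO_GPU_SM*` macros added in the `arch_macro.hpp` hunk above are meant to be consumed inside device code roughly as in the hedged sketch below (the kernel name and values are illustrative only, not part of this patch).

```cpp
// Illustrative only: device code branching on the PTO_GPU_SM* dispatch macros
// defined in arch_macro.hpp above. The kernel and its outputs are placeholders.
__global__ void ReportPtoGpuArch(int* out)
{
#if defined(PTO_GPU_SM121)
    *out = 121;  // GB10 / DGX Spark fast paths
#elif defined(PTO_GPU_SM90)
    *out = 90;   // Hopper-specific paths
#elif defined(PTO_GPU_SM80)
    *out = 80;   // Ampere datacenter baseline
#else
    *out = 0;    // generic CUDA fallback
#endif
}
```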
@@ -67,6 +71,8 @@ See LICENSE in the root of the software repository for the full text of the Lice #define PTO_L0A_SIZE_BYTES (32u * 1024u) #elif defined(PTO_NPU_ARCH_A2A3) || defined(PTO_NPU_ARCH_A5) || defined(PTO_NPU_ARCH_KIRINX90) #define PTO_L0A_SIZE_BYTES (64u * 1024u) +#elif defined(PTO_GPU_BACKEND) +#define PTO_L0A_SIZE_BYTES (128u * 1024u) #else #error \ "PTO_L0A_SIZE_BYTES: unknown NPU architecture. Define one of PTO_NPU_ARCH_{A2A3,A5,KIRIN9030,KIRINX90} or set PTO_L0A_SIZE_BYTES manually." @@ -83,6 +89,8 @@ See LICENSE in the root of the software repository for the full text of the Lice #define PTO_L0B_SIZE_BYTES (32u * 1024u) #elif defined(PTO_NPU_ARCH_A2A3) || defined(PTO_NPU_ARCH_A5) || defined(PTO_NPU_ARCH_KIRINX90) #define PTO_L0B_SIZE_BYTES (64u * 1024u) +#elif defined(PTO_GPU_BACKEND) +#define PTO_L0B_SIZE_BYTES (128u * 1024u) #else #error \ "PTO_L0B_SIZE_BYTES: unknown NPU architecture. Define one of PTO_NPU_ARCH_{A2A3,A5,KIRIN9030,KIRINX90} or set PTO_L0B_SIZE_BYTES manually." @@ -101,6 +109,8 @@ See LICENSE in the root of the software repository for the full text of the Lice #define PTO_L0C_SIZE_BYTES (128u * 1024u) #elif defined(PTO_NPU_ARCH_KIRIN9030) #define PTO_L0C_SIZE_BYTES (64u * 1024u) +#elif defined(PTO_GPU_BACKEND) +#define PTO_L0C_SIZE_BYTES (256u * 1024u) #else #error \ "PTO_L0C_SIZE_BYTES: unknown NPU architecture. Define one of PTO_NPU_ARCH_{A2A3,A5,KIRIN9030,KIRINX90} or set PTO_L0C_SIZE_BYTES manually." @@ -117,6 +127,8 @@ See LICENSE in the root of the software repository for the full text of the Lice #define PTO_BIAS_SIZE_BYTES (4u * 1024u) #elif defined(PTO_NPU_ARCH_A2A3) || defined(PTO_NPU_ARCH_KIRIN9030) || defined(PTO_NPU_ARCH_KIRINX90) #define PTO_BIAS_SIZE_BYTES (1u * 1024u) +#elif defined(PTO_GPU_BACKEND) +#define PTO_BIAS_SIZE_BYTES (8u * 1024u) #else #error \ "PTO_BIAS_SIZE_BYTES: unknown NPU architecture. Define one of PTO_NPU_ARCH_{A2A3,A5,KIRIN9030,KIRINX90} or set PTO_BIAS_SIZE_BYTES manually." @@ -137,6 +149,8 @@ See LICENSE in the root of the software repository for the full text of the Lice #define PTO_FBUF_SIZE_BYTES (4u * 1024u) #elif defined(PTO_NPU_ARCH_A2A3) #define PTO_FBUF_SIZE_BYTES (2u * 1024u) +#elif defined(PTO_GPU_BACKEND) +#define PTO_FBUF_SIZE_BYTES (8u * 1024u) #else #error \ "PTO_FBUF_SIZE_BYTES: unknown NPU architecture. Define one of PTO_NPU_ARCH_{A2A3,A5,KIRIN9030,KIRINX90} or set PTO_FBUF_SIZE_BYTES manually." @@ -149,6 +163,8 @@ See LICENSE in the root of the software repository for the full text of the Lice #define PTO_SCALELEFT_SIZE_BYTES (4u * 1024u) #elif defined(PTO_NPU_ARCH_A2A3) || defined(PTO_NPU_ARCH_KIRIN9030) || defined(PTO_NPU_ARCH_KIRINX90) #define PTO_SCALELEFT_SIZE_BYTES 0u +#elif defined(PTO_GPU_BACKEND) +#define PTO_SCALELEFT_SIZE_BYTES (8u * 1024u) #else #error \ "PTO_SCALELEFT_SIZE_BYTES: unknown NPU architecture. Define one of PTO_NPU_ARCH_{A2A3,A5,KIRIN9030,KIRINX90} or set PTO_SCALELEFT_SIZE_BYTES manually." @@ -161,6 +177,8 @@ See LICENSE in the root of the software repository for the full text of the Lice #define PTO_SCALERIGHT_SIZE_BYTES (4u * 1024u) #elif defined(PTO_NPU_ARCH_A2A3) || defined(PTO_NPU_ARCH_KIRIN9030) || defined(PTO_NPU_ARCH_KIRINX90) #define PTO_SCALERIGHT_SIZE_BYTES 0u +#elif defined(PTO_GPU_BACKEND) +#define PTO_SCALERIGHT_SIZE_BYTES (8u * 1024u) #else #error \ "PTO_SCALERIGHT_SIZE_BYTES: unknown NPU architecture. Define one of PTO_NPU_ARCH_{A2A3,A5,KIRIN9030,KIRINX90} or set PTO_SCALERIGHT_SIZE_BYTES manually." 
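The GPU values chosen above (for example the 256 KiB `PTO_UBUF_SIZE_BYTES`) are placeholders and can exceed what a device actually grants per block, so a launch helper will likely need to clamp them. A minimal sketch under that assumption follows; `ConfigureTileSharedMemory` is a hypothetical helper, while the CUDA runtime calls are standard.

```cpp
// Minimal sketch, assuming PTO_UBUF_SIZE_BYTES doubles as the per-block
// shared-memory staging budget on the GPU backend. ConfigureTileSharedMemory is a
// hypothetical helper, not part of this patch.
#include <algorithm>
#include <cuda_runtime.h>

inline cudaError_t ConfigureTileSharedMemory(const void* kernel, int device, int* grantedBytes)
{
    int maxOptIn = 0;
    cudaError_t err =
        cudaDeviceGetAttribute(&maxOptIn, cudaDevAttrMaxSharedMemoryPerBlockOptin, device);
    if (err != cudaSuccess) {
        return err;
    }
    // Clamp the configured budget to what this device can actually grant.
    *grantedBytes = std::min(static_cast<int>(PTO_UBUF_SIZE_BYTES), maxOptIn);
    return cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
                                *grantedBytes);
}
```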
diff --git a/include/pto/common/constants.hpp b/include/pto/common/constants.hpp index 1480a5c3..bb489980 100644 --- a/include/pto/common/constants.hpp +++ b/include/pto/common/constants.hpp @@ -386,6 +386,9 @@ PTO_INTERNAL constexpr TileLayoutCustom GetTileLayoutCustom() } else if constexpr (TileData::isRowMajor && (TileData::SFractal == SLayout::RowMajor) && TileData::SFractalSize == 512) { return TileLayoutCustom::ZZ; + } else if constexpr (TileData::isRowMajor && (TileData::SFractal == SLayout::GpuSwizzle128B) && + TileData::SFractalSize == 128) { + return TileLayoutCustom::GPU_SWIZZLE_128B; } else { return TileLayoutCustom::NONE; } diff --git a/include/pto/common/gpu_stub.hpp b/include/pto/common/gpu_stub.hpp new file mode 100644 index 00000000..5edc138b --- /dev/null +++ b/include/pto/common/gpu_stub.hpp @@ -0,0 +1,66 @@ +/** +Copyright (c) 2026 Huawei Technologies Co., Ltd. +This program is free software, you can redistribute it and/or modify it under the terms and conditions of +CANN Open Software License Agreement Version 2.0 (the "License"). +Please refer to the License for details. You may not use this file except in compliance with the License. +THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +See LICENSE in the root of the software repository for the full text of the License. +*/ + +#ifndef PTO_GPUSTUB_HPP +#define PTO_GPUSTUB_HPP + +#if defined(__CUDACC__) + +#include + +#define PTO_GPU_BACKEND +#define PTO_GPU_INLINE_PTX 1 + +#define __aicore__ __device__ +#define AICORE __device__ +#define PTO_INLINE __forceinline__ +#define PTO_INST AICORE PTO_INLINE +#define PTO_INTERNAL AICORE PTO_INLINE + +#define __gm__ +#define __out__ +#define __in__ +#define __ubuf__ +#define __cbuf__ +#define __ca__ +#define __cb__ +#define __cc__ +#define __fbuf__ +#define __tf__ + +#ifndef __cce_get_tile_ptr +#define __cce_get_tile_ptr(x) (x) +#endif + +typedef int pipe_t; +constexpr pipe_t PIPE_S = 0; +constexpr pipe_t PIPE_V = 1; +constexpr pipe_t PIPE_MTE1 = 2; +constexpr pipe_t PIPE_MTE2 = 3; +constexpr pipe_t PIPE_MTE3 = 4; +constexpr pipe_t PIPE_M = 5; +constexpr pipe_t PIPE_ALL = 6; +constexpr pipe_t PIPE_FIX = 7; + +using event_t = int; +constexpr event_t EVENT_ID0 = 0; + +PTO_INTERNAL void pipe_barrier(pipe_t) +{} + +PTO_INTERNAL void set_flag(pipe_t, pipe_t, event_t) +{} + +PTO_INTERNAL void wait_flag(pipe_t, pipe_t, event_t) +{} + +#endif + +#endif diff --git a/include/pto/common/memory.hpp b/include/pto/common/memory.hpp index cf5877a9..2e011ed1 100644 --- a/include/pto/common/memory.hpp +++ b/include/pto/common/memory.hpp @@ -43,6 +43,7 @@ enum class SLayout NoneBox = 0, RowMajor = 1, ColMajor = 2, + GpuSwizzle128B = 3, }; // returns the memory qualifier for a given TileType and data type. @@ -150,6 +151,8 @@ PTO_INTERNAL constexpr const __gm__ char *GetLayoutName(BLayout bType, SLayout s return (bType == BLayout::RowMajor) ? "Zz" : "Nz"; case SLayout::ColMajor: return (bType == BLayout::RowMajor) ? "Zn" : "Nn"; + case SLayout::GpuSwizzle128B: + return (bType == BLayout::RowMajor) ? 
"GS128" : "GS128C"; default: return "Unknown"; } diff --git a/include/pto/common/pto_instr.hpp b/include/pto/common/pto_instr.hpp index ad505d7e..e1237d6e 100644 --- a/include/pto/common/pto_instr.hpp +++ b/include/pto/common/pto_instr.hpp @@ -385,7 +385,7 @@ PTO_INST RecordEvent TSUBC(TileData &dst, TileData &src0, TileData &src1, TileDa return {}; } -#if defined(PTO_NPU_ARCH_A5) || defined(__CPU_SIM) +#if defined(PTO_NPU_ARCH_A5) || defined(__CPU_SIM) || defined(PTO_GPU_BACKEND) template PTO_INST RecordEvent TGEMV_MX(TileRes &cMatrix, TileLeft &aMatrix, TileLeftScale &aScaleMatrix, TileRight &bMatrix, diff --git a/include/pto/common/pto_instr_impl.hpp b/include/pto/common/pto_instr_impl.hpp index d751072a..f662f78b 100644 --- a/include/pto/common/pto_instr_impl.hpp +++ b/include/pto/common/pto_instr_impl.hpp @@ -227,6 +227,19 @@ See LICENSE in the root of the software repository for the full text of the Lice #include "pto/npu/kirin9030/header.hpp" #endif +#ifdef PTO_GPU_BACKEND +#include "pto/gpu/common/TSync.hpp" +#include "pto/gpu/common/TAdd.hpp" +#include "pto/gpu/common/TMrgSort.hpp" +#include "pto/gpu/common/TLoad.hpp" +#include "pto/gpu/common/TStore.hpp" +#ifdef PTO_GPU_SM121 +#include "pto/gpu/sm121/arch.hpp" +#include "pto/gpu/sm121/TMatmul.hpp" +#endif +#include "pto/gpu/common/TMatmul.hpp" +#endif + #ifdef __CPU_SIM #include "pto/cpu/ElementTileOp.h" #include "pto/cpu/ElementTileScalarOp.h" diff --git a/include/pto/common/pto_tile.hpp b/include/pto/common/pto_tile.hpp index 4b8cd6d0..5db5055b 100644 --- a/include/pto/common/pto_tile.hpp +++ b/include/pto/common/pto_tile.hpp @@ -812,6 +812,8 @@ static constexpr int fixedMxColSize = 2; static constexpr int fractalABSize = 512; static constexpr int fractalCSize = 1024; static constexpr int fractalMxSize = 32; +static constexpr int gpuSwizzle128BSize = 128; +static constexpr int gpuSwizzleRows = 8; } // namespace TileConfig namespace ConvTileDetail { @@ -1253,7 +1255,9 @@ struct Tile { static constexpr int getInnerRow() { - if constexpr (SFractalSize_ == TileConfig::fractalCSize) { + if constexpr (SFractal_ == SLayout::GpuSwizzle128B) { + return TileConfig::gpuSwizzleRows; + } else if constexpr (SFractalSize_ == TileConfig::fractalCSize) { return TileConfig::fixedRowSize; } else if constexpr (SFractalSize_ == TileConfig::fractalMxSize) { return TileConfig::fixedMxRowSize; @@ -1266,7 +1270,9 @@ struct Tile { static constexpr int getInnerCol() { - if constexpr (SFractalSize_ == TileConfig::fractalCSize) { + if constexpr (SFractal_ == SLayout::GpuSwizzle128B) { + return TileConfig::gpuSwizzle128BSize / (TileConfig::gpuSwizzleRows * sizeof(DType)); + } else if constexpr (SFractalSize_ == TileConfig::fractalCSize) { return TileConfig::fixedColSize; } else if constexpr (SFractalSize_ == TileConfig::fractalMxSize) { return TileConfig::fixedMxColSize; @@ -1373,6 +1379,7 @@ struct Tile { static constexpr bool isBoxedLayout = (SFractal != SLayout::NoneBox); static constexpr bool isInnerRowMajor = (SFractal == SLayout::RowMajor); static constexpr bool isInnerColMajor = (SFractal == SLayout::ColMajor); + static constexpr bool isGpuSwizzled = (SFractal == SLayout::GpuSwizzle128B); static constexpr int InnerRows = getInnerRow(); static constexpr int InnerCols = getInnerCol(); @@ -1380,6 +1387,8 @@ struct Tile { static constexpr int InnerNumel = InnerRows * InnerCols; static_assert(InnerRows != 0 && InnerCols != 0, "rows or cols of fractal size is 0."); + static_assert((SFractal_ != SLayout::GpuSwizzle128B) || (BFractal_ == BLayout::RowMajor), 
+ "GPU swizzle layout currently supports row-major tiles only."); static_assert((Loc == TileType::Vec) || (SFractalSize_ == TileConfig::fractalMxSize) || (Rows_ == 1) || (Rows % InnerRows == 0), "Layout rows must be divisible by inner box rows"); @@ -1398,7 +1407,7 @@ struct Tile { SFractal_ in not NoneBox: Rows/Cols must be integer multiple of InnerRows/InnerCols."); static_assert(SFractalSize_ == TileConfig::fractalABSize || SFractalSize_ == TileConfig::fractalCSize || - SFractalSize_ == TileConfig::fractalMxSize, + SFractalSize_ == TileConfig::fractalMxSize || SFractalSize_ == TileConfig::gpuSwizzle128BSize, "SFractalSize_ illegal"); #if defined(__CPU_SIM) || defined(__COSTMODEL) @@ -1605,6 +1614,23 @@ template ; +// GPU-only swizzled layouts. These are intentionally not tied to the NPU fractal conventions. +template +using TileVecGpuSwizzle = Tile; + +template +using TileLeftGpuSwizzle = Tile; + +template +using TileRightGpuSwizzle = Tile; + +template +using TileAccGpuSwizzle = Tile; + template struct is_global : std::false_type { }; diff --git a/include/pto/common/type.hpp b/include/pto/common/type.hpp index e1980f8f..16802f42 100644 --- a/include/pto/common/type.hpp +++ b/include/pto/common/type.hpp @@ -10,17 +10,27 @@ See LICENSE in the root of the software repository for the full text of the Lice #ifndef _PTO_INCLUDE_NPU_TYPE_H_ #define _PTO_INCLUDE_NPU_TYPE_H_ +#include +#ifndef AICORE #ifndef __CPU_SIM #define AICORE [aicore] #else #define AICORE #endif +#endif + +#ifndef PTO_INLINE #define PTO_INLINE inline __attribute__((always_inline)) +#endif // for pto instruction declaration +#ifndef PTO_INST #define PTO_INST AICORE PTO_INLINE __attribute__((visibility("default"))) +#endif // for pto internal implementation +#ifndef PTO_INTERNAL #define PTO_INTERNAL AICORE PTO_INLINE +#endif #define OP_NAME(Name) __attribute__((vf_name(#Name))) #define OP_TYPE(TypeName) __attribute__((vf_kind(#TypeName))) @@ -246,6 +256,7 @@ enum class TileLayoutCustom : uint8_t NZ, ZN, ZZ, + GPU_SWIZZLE_128B, NONE, }; @@ -264,7 +275,13 @@ using TRandomKey = uint32_t[PTO_RANDOM_KEY_SIZE]; using TRandomCounter = uint32_t[PTO_RANDOM_COUNTER_SIZE]; } // namespace pto -#if defined(__CPU_SIM) || defined(__COSTMODEL) +#if defined(PTO_GPU_BACKEND) && defined(__CUDACC__) +#include +#include +typedef __half half; +typedef __half aclFloat16; +typedef __nv_bfloat16 bfloat16_t; +#elif defined(__CPU_SIM) || defined(__COSTMODEL) typedef _Float16 half; typedef _Float16 aclFloat16; // Note: clang version should be >=15 and gcc version should be >=14 diff --git a/include/pto/cpu/tile_offsets.hpp b/include/pto/cpu/tile_offsets.hpp index c8fa1fff..87753e1e 100644 --- a/include/pto/cpu/tile_offsets.hpp +++ b/include/pto/cpu/tile_offsets.hpp @@ -13,6 +13,30 @@ See LICENSE in the root of the software repository for the full text of the Lice #include namespace pto { + +PTO_INTERNAL constexpr bool IsPowerOfTwo(size_t value) +{ + return value != 0 && ((value & (value - 1)) == 0); +} + +PTO_INTERNAL constexpr size_t PermuteSwizzleChunk(size_t chunk, size_t lane, size_t chunkCount) +{ + return IsPowerOfTwo(chunkCount) ? 
((chunk ^ lane) & (chunkCount - 1)) : ((chunk + lane) % chunkCount); +} + +template +size_t GetTileElementOffsetGpuSwizzle(size_t r, size_t c) +{ + constexpr size_t swizzleRows = TileData::InnerRows; + constexpr size_t swizzleCols = TileData::InnerCols; + const size_t rowBlock = r / swizzleRows; + const size_t rowInBlock = r % swizzleRows; + const size_t chunk = c / swizzleCols; + const size_t colInChunk = c % swizzleCols; + const size_t chunksPerRow = TileData::Cols / swizzleCols; + const size_t permutedChunk = PermuteSwizzleChunk(chunk, rowInBlock % chunksPerRow, chunksPerRow); + return rowBlock * swizzleRows * TileData::Cols + rowInBlock * TileData::Cols + permutedChunk * swizzleCols + colInChunk; +} template using TypeSum = std::conditional_t, float, typename TileData::DType>; @@ -49,11 +73,11 @@ size_t GetTileElementOffsetPlain(size_t r, size_t c) template size_t GetTileElementOffset(size_t r, size_t c) { - if constexpr (TileData::SFractal == SLayout::NoneBox) + if constexpr (TileData::SFractal == SLayout::NoneBox) { return GetTileElementOffsetPlain(r, c); - else { - size_t subTileR = r / TileData::InnerRows; - size_t innerR = r % TileData::InnerRows; + } else if constexpr (TileData::SFractal == SLayout::GpuSwizzle128B) { + return GetTileElementOffsetGpuSwizzle(r, c); + } else { return GetTileElementOffsetSubfractals(r / TileData::InnerRows, r % TileData::InnerRows, c / TileData::InnerCols, c % TileData::InnerCols); } diff --git a/include/pto/gpu/README.md b/include/pto/gpu/README.md new file mode 100644 index 00000000..e93f78ed --- /dev/null +++ b/include/pto/gpu/README.md @@ -0,0 +1,44 @@ +# include/pto/gpu/ + +Scaffold for a future NVIDIA GPU backend for PTO Tile Lib. + +## Purpose + +This directory mirrors the existing backend split used by the CPU simulator and Ascend NPU implementations: + +- `common/`: shared GPU helpers, tile adapters, launch abstractions, layout helpers +- `smXX/`: architecture-specific specializations and microkernel registries + +## Design intent + +The long-term goal is to provide: + +- a unified PTO API at `#include ` +- a GPU lowering path that keeps PTO semantics stable +- per-SM specializations for performance-critical PTO instructions +- dedicated microkernels for hot PTO ISA operations on NVIDIA GPUs + +## Initial status + +This backend is now past pure scaffold stage. + +Implemented groundwork includes: + +- backend dispatch macros +- inclusion from `pto_instr_impl.hpp` +- initial CUDA build/test plumbing +- early `sm121` matmul fast paths +- a GPU-specific swizzle layout family for row-major tiles + +Current GPU swizzle support: + +- layout enum: `SLayout::GpuSwizzle128B` +- aliases in `pto_tile.hpp`: + - `TileVecGpuSwizzle` + - `TileLeftGpuSwizzle` + - `TileRightGpuSwizzle` + - `TileAccGpuSwizzle` +- element offset mapping is GPU-specific and intentionally not tied to the NPU boxed layouts +- current swizzle implementation is row-major only and is intended as groundwork for future shared-memory / tensor-core friendly paths + +See `docs/gpu_backend_plan.md` for the phase plan and backend bring-up checklist. diff --git a/include/pto/gpu/common/README.md b/include/pto/gpu/common/README.md new file mode 100644 index 00000000..a15584e8 --- /dev/null +++ b/include/pto/gpu/common/README.md @@ -0,0 +1,15 @@ +# include/pto/gpu/common/ + +Shared building blocks for the future PTO NVIDIA GPU backend. 
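As one concrete example of the layout adapters that will live here, the 128-byte swizzle mapping introduced by this patch can be restated as a small standalone function. This is an illustrative host-side sketch mirroring `GetTileElementOffsetGpuSwizzle`, not an additional API; `SwizzledOffset` and `IsPow2` are placeholder names.

```cpp
// Illustrative restatement of the GpuSwizzle128B element-offset mapping added in this
// patch: inside each 8-row block, the column chunk is XOR-permuted by the row index
// (when the chunk count is a power of two), spreading rows across different chunks.
#include <cstddef>

constexpr bool IsPow2(std::size_t v) { return v != 0 && (v & (v - 1)) == 0; }

constexpr std::size_t SwizzledOffset(std::size_t r, std::size_t c,
                                     std::size_t innerRows,   // 8 for GpuSwizzle128B
                                     std::size_t innerCols,   // 128 / (8 * sizeof(T))
                                     std::size_t tileCols)
{
    const std::size_t rowBlock   = r / innerRows;
    const std::size_t rowInBlock = r % innerRows;
    const std::size_t chunk      = c / innerCols;
    const std::size_t colInChunk = c % innerCols;
    const std::size_t chunks     = tileCols / innerCols;
    const std::size_t lane       = rowInBlock % chunks;
    const std::size_t permuted   = IsPow2(chunks) ? ((chunk ^ lane) & (chunks - 1))
                                                  : ((chunk + lane) % chunks);
    return rowBlock * innerRows * tileCols + rowInBlock * tileCols
         + permuted * innerCols + colInChunk;
}

// Example: a 16x32 half tile has innerCols = 128 / (8 * 2) = 8, so each row of 32
// halfs splits into 4 chunks that are permuted by the row index within its 8-row block.
```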
+ +Expected contents over time: + +- tile layout adapters +- register/shared/global memory helpers +- warp / warpgroup scheduling helpers +- async copy abstractions +- tensor-core / MMA wrapper traits +- launch-time architecture dispatch helpers +- common validation and debug utilities + +This directory is intentionally created before implementation so the backend can evolve with the same structure as existing PTO backends. diff --git a/include/pto/gpu/common/TAdd.hpp b/include/pto/gpu/common/TAdd.hpp new file mode 100644 index 00000000..face8101 --- /dev/null +++ b/include/pto/gpu/common/TAdd.hpp @@ -0,0 +1,45 @@ +/** +Copyright (c) 2026 Huawei Technologies Co., Ltd. +This program is free software, you can redistribute it and/or modify it under the terms and conditions of +CANN Open Software License Agreement Version 2.0 (the "License"). +Please refer to the License for details. You may not use this file except in compliance with the License. +THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +See LICENSE in the root of the software repository for the full text of the License. +*/ + +#ifndef PTO_GPU_COMMON_TADD_HPP +#define PTO_GPU_COMMON_TADD_HPP + +#include +#include "pto/gpu/common/tile_offsets.hpp" + +namespace pto { + +template +PTO_INTERNAL void TADD_IMPL(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1) +{ + using T = typename TileDataDst::DType; + static_assert(std::is_same_v && std::is_same_v, + "Fix: TADD input/output dtypes must match for the GPU backend."); + + const unsigned validRows = dst.GetValidRow(); + const unsigned validCols = dst.GetValidCol(); + PTO_ASSERT(src0.GetValidRow() == validRows && src0.GetValidCol() == validCols, + "Fix: TADD src0 valid shape mismatch with dst in GPU backend."); + PTO_ASSERT(src1.GetValidRow() == validRows && src1.GetValidCol() == validCols, + "Fix: TADD src1 valid shape mismatch with dst in GPU backend."); + + for (unsigned r = 0; r < validRows; ++r) { + for (unsigned c = 0; c < validCols; ++c) { + const std::size_t idx = gpu::GetTileElementOffset(r, c); + const std::size_t src0Idx = gpu::GetTileElementOffset(r, c); + const std::size_t src1Idx = gpu::GetTileElementOffset(r, c); + dst.data()[idx] = src0.data()[src0Idx] + src1.data()[src1Idx]; + } + } +} + +} // namespace pto + +#endif diff --git a/include/pto/gpu/common/TLoad.hpp b/include/pto/gpu/common/TLoad.hpp new file mode 100644 index 00000000..d44c42c3 --- /dev/null +++ b/include/pto/gpu/common/TLoad.hpp @@ -0,0 +1,100 @@ +/** +Copyright (c) 2026 Huawei Technologies Co., Ltd. +This program is free software, you can redistribute it and/or modify it under the terms and conditions of +CANN Open Software License Agreement Version 2.0 (the "License"). +Please refer to the License for details. You may not use this file except in compliance with the License. +THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +See LICENSE in the root of the software repository for the full text of the License. 
+*/ + +#ifndef PTO_GPU_COMMON_TLOAD_HPP +#define PTO_GPU_COMMON_TLOAD_HPP + +#include +#include +#include "pto/gpu/common/tile_offsets.hpp" + +namespace pto { + +template +PTO_INTERNAL typename TileData::DType GpuPadValue() +{ + return GetPadValue(); +} + +template +PTO_INTERNAL void TLOAD_IMPL(TileData &dst, GlobalData &src) +{ + static_assert(GlobalData::layout == pto::Layout::ND || GlobalData::layout == pto::Layout::DN, + "Only ND and DN GlobalTensor layouts are supported by the v1 GPU backend."); + + const int gShape0 = src.GetShape(pto::GlobalTensorDim::DIM_0); + const int gShape1 = src.GetShape(pto::GlobalTensorDim::DIM_1); + const int gShape2 = src.GetShape(pto::GlobalTensorDim::DIM_2); + const int gShape3 = src.GetShape(pto::GlobalTensorDim::DIM_3); + const int gShape4 = src.GetShape(pto::GlobalTensorDim::DIM_4); + + const int gStride0 = src.GetStride(pto::GlobalTensorDim::DIM_0); + const int gStride1 = src.GetStride(pto::GlobalTensorDim::DIM_1); + const int gStride2 = src.GetStride(pto::GlobalTensorDim::DIM_2); + const int gStride3 = src.GetStride(pto::GlobalTensorDim::DIM_3); + const int gStride4 = src.GetStride(pto::GlobalTensorDim::DIM_4); + + const int validRow = dst.GetValidRow(); + const int validCol = dst.GetValidCol(); + + for (int r = 0; r < TileData::Rows; ++r) { + for (int c = 0; c < TileData::Cols; ++c) { + dst.data()[gpu::GetTileElementOffset(r, c)] = GpuPadValue(); + } + } + + if constexpr (GlobalData::layout == pto::Layout::ND) { + PTO_ASSERT(TileData::isRowMajor, "Fix: ND loads currently require row-major tiles in the GPU backend."); + PTO_ASSERT(gShape0 * gShape1 * gShape2 * gShape3 == validRow && gShape4 == validCol, + "Fix: ND TLOAD valid shape mismatch in GPU backend."); + + int rowBase = 0; + for (int i = 0; i < gShape0; ++i) { + for (int j = 0; j < gShape1; ++j) { + for (int k = 0; k < gShape2; ++k) { + for (int r = 0; r < gShape3; ++r) { + const int tileRow = rowBase + r; + const int srcBase = i * gStride0 + j * gStride1 + k * gStride2 + r * gStride3; + for (int c = 0; c < gShape4; ++c) { + const std::size_t dstIdx = gpu::GetTileElementOffset(tileRow, c); + dst.data()[dstIdx] = src.data()[srcBase + c * gStride4]; + } + } + rowBase += gShape3; + } + } + } + } else { + PTO_ASSERT(!TileData::isRowMajor, "Fix: DN loads currently require col-major tiles in the GPU backend."); + PTO_ASSERT(gShape0 * gShape1 * gShape2 * gShape4 == validCol && gShape3 == validRow, + "Fix: DN TLOAD valid shape mismatch in GPU backend."); + + int colBase = 0; + for (int i = 0; i < gShape0; ++i) { + for (int j = 0; j < gShape1; ++j) { + for (int k = 0; k < gShape2; ++k) { + for (int c = 0; c < gShape4; ++c) { + const int tileCol = colBase + c; + const int srcBase = i * gStride0 + j * gStride1 + k * gStride2 + c * gStride4; + for (int r = 0; r < gShape3; ++r) { + const std::size_t dstIdx = gpu::GetTileElementOffset(r, tileCol); + dst.data()[dstIdx] = src.data()[srcBase + r * gStride3]; + } + } + colBase += gShape4; + } + } + } + } +} + +} // namespace pto + +#endif diff --git a/include/pto/gpu/common/TMatmul.hpp b/include/pto/gpu/common/TMatmul.hpp new file mode 100644 index 00000000..cc509ee4 --- /dev/null +++ b/include/pto/gpu/common/TMatmul.hpp @@ -0,0 +1,217 @@ +/** +Copyright (c) 2026 Huawei Technologies Co., Ltd. +This program is free software, you can redistribute it and/or modify it under the terms and conditions of +CANN Open Software License Agreement Version 2.0 (the "License"). +Please refer to the License for details. 
You may not use this file except in compliance with the License. +THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +See LICENSE in the root of the software repository for the full text of the License. +*/ + +#ifndef PTO_GPU_COMMON_TMATMUL_HPP +#define PTO_GPU_COMMON_TMATMUL_HPP + +#include +#include "pto/gpu/common/tile_offsets.hpp" + +namespace pto { + +template +PTO_INTERNAL void CheckGpuMatmulValid() +{ + using AType = typename TileLeft::DType; + using BType = typename TileRight::DType; + using CType = typename TileAcc::DType; + static_assert( + (std::is_same_v && std::is_same_v && std::is_same_v) || + (std::is_same_v && std::is_same_v && std::is_same_v) || + (std::is_same_v && std::is_same_v && std::is_same_v) || + (std::is_same_v && std::is_same_v && std::is_same_v), + "Fix: GPU TMATMUL currently supports int8->int32, half->float, bfloat16->float, and float->float."); +} + +template +PTO_INTERNAL void TMATMUL_IMPL(TileAcc &cMatrix, TileLeft &aMatrix, TileRight &bMatrix) +{ + CheckGpuMatmulValid(); + +#ifdef PTO_GPU_SM121 + if (gpu::sm121::TrySm121TMATMUL(cMatrix, aMatrix, bMatrix)) { + return; + } +#endif + + const uint16_t m = aMatrix.GetValidRow(); + const uint16_t k = aMatrix.GetValidCol(); + const uint16_t n = bMatrix.GetValidCol(); + + for (uint16_t row = 0; row < m; ++row) { + for (uint16_t col = 0; col < n; ++col) { + typename TileAcc::DType acc = typename TileAcc::DType{}; + for (uint16_t kk = 0; kk < k; ++kk) { + const std::size_t aIdx = gpu::GetTileElementOffset(row, kk); + const std::size_t bIdx = gpu::GetTileElementOffset(kk, col); + acc += static_cast(aMatrix.data()[aIdx]) * + static_cast(bMatrix.data()[bIdx]); + } + cMatrix.data()[gpu::GetTileElementOffset(row, col)] = acc; + } + } +} + +template +PTO_INTERNAL void TMATMUL_ACC_IMPL(TileAcc &cOutMatrix, TileAcc &cInMatrix, TileLeft &aMatrix, TileRight &bMatrix) +{ + CheckGpuMatmulValid(); + +#ifdef PTO_GPU_SM121 + if (gpu::sm121::TrySm121TMATMULAcc(cOutMatrix, cInMatrix, aMatrix, bMatrix)) { + return; + } +#endif + + const uint16_t m = aMatrix.GetValidRow(); + const uint16_t k = aMatrix.GetValidCol(); + const uint16_t n = bMatrix.GetValidCol(); + + for (uint16_t row = 0; row < m; ++row) { + for (uint16_t col = 0; col < n; ++col) { + const std::size_t cIdx = gpu::GetTileElementOffset(row, col); + typename TileAcc::DType acc = cInMatrix.data()[cIdx]; + for (uint16_t kk = 0; kk < k; ++kk) { + const std::size_t aIdx = gpu::GetTileElementOffset(row, kk); + const std::size_t bIdx = gpu::GetTileElementOffset(kk, col); + acc += static_cast(aMatrix.data()[aIdx]) * + static_cast(bMatrix.data()[bIdx]); + } + cOutMatrix.data()[cIdx] = acc; + } + } +} + +template +PTO_INTERNAL void TMATMUL_ACC_IMPL(TileAcc &cMatrix, TileLeft &aMatrix, TileRight &bMatrix) +{ + CheckGpuMatmulValid(); + +#ifdef PTO_GPU_SM121 + if (gpu::sm121::TrySm121TMATMULAcc(cMatrix, aMatrix, bMatrix)) { + return; + } +#endif + + TMATMUL_ACC_IMPL(cMatrix, cMatrix, aMatrix, bMatrix); +} + +template +PTO_INTERNAL void TMATMUL_BIAS_IMPL(TileAcc &cMatrix, TileLeft &aMatrix, TileRight &bMatrix, TileBias &biasMatrix) +{ +#ifdef PTO_GPU_SM121 + if (gpu::sm121::TrySm121TMATMULBias(cMatrix, aMatrix, bMatrix, biasMatrix)) { + return; + } +#endif + + TMATMUL_IMPL(cMatrix, aMatrix, bMatrix); + const uint16_t m = aMatrix.GetValidRow(); + const uint16_t n = bMatrix.GetValidCol(); + for (uint16_t row = 0; row < m; ++row) { + 
for (uint16_t col = 0; col < n; ++col) { + const std::size_t cIdx = gpu::GetTileElementOffset(row, col); + const std::size_t bIdx = gpu::GetTileElementOffset(0, col); + cMatrix.data()[cIdx] += biasMatrix.data()[bIdx]; + } + } +} + +template +PTO_INTERNAL void TGEMV_IMPL(TileRes &cMatrix, TileLeft &aMatrix, TileRight &bMatrix) +{ + (void)Phase; + TMATMUL_IMPL(cMatrix, aMatrix, bMatrix); +} + +template +PTO_INTERNAL void TGEMV_ACC_IMPL(TileRes &cOutMatrix, TileRes &cInMatrix, TileLeft &aMatrix, TileRight &bMatrix) +{ + (void)Phase; + TMATMUL_ACC_IMPL(cOutMatrix, cInMatrix, aMatrix, bMatrix); +} + +template +PTO_INTERNAL void TGEMV_BIAS_IMPL(TileRes &cMatrix, TileLeft &aMatrix, TileRight &bMatrix, TileBias &biasData) +{ + (void)Phase; + TMATMUL_BIAS_IMPL(cMatrix, aMatrix, bMatrix, biasData); +} + +template +PTO_INTERNAL void TMATMUL_MX_IMPL(TileRes &cMatrix, TileLeft &aMatrix, TileLeftScale &aScaleMatrix, TileRight &bMatrix, + TileRightScale &bScaleMatrix) +{ + (void)Phase; + (void)aScaleMatrix; + (void)bScaleMatrix; + TMATMUL_IMPL(cMatrix, aMatrix, bMatrix); +} + +template +PTO_INTERNAL void TMATMUL_MX_IMPL(TileRes &cOutMatrix, TileRes &cInMatrix, TileLeft &aMatrix, + TileLeftScale &aScaleMatrix, TileRight &bMatrix, TileRightScale &bScaleMatrix) +{ + (void)Phase; + (void)aScaleMatrix; + (void)bScaleMatrix; + TMATMUL_ACC_IMPL(cOutMatrix, cInMatrix, aMatrix, bMatrix); +} + +template +PTO_INTERNAL void TMATMUL_MX_IMPL(TileRes &cMatrix, TileLeft &aMatrix, TileLeftScale &aScaleMatrix, TileRight &bMatrix, + TileRightScale &bScaleMatrix, TileBias &biasData) +{ + (void)Phase; + (void)aScaleMatrix; + (void)bScaleMatrix; + TMATMUL_BIAS_IMPL(cMatrix, aMatrix, bMatrix, biasData); +} + +template +PTO_INTERNAL void TGEMV_MX_IMPL(TileRes &cMatrix, TileLeft &aMatrix, TileLeftScale &aScaleMatrix, TileRight &bMatrix, + TileRightScale &bScaleMatrix) +{ + (void)Phase; + (void)aScaleMatrix; + (void)bScaleMatrix; + TGEMV_IMPL(cMatrix, aMatrix, bMatrix); +} + +template +PTO_INTERNAL void TGEMV_MX_IMPL(TileRes &cOutMatrix, TileRes &cInMatrix, TileLeft &aMatrix, TileLeftScale &aScaleMatrix, + TileRight &bMatrix, TileRightScale &bScaleMatrix) +{ + (void)Phase; + (void)aScaleMatrix; + (void)bScaleMatrix; + TGEMV_ACC_IMPL(cOutMatrix, cInMatrix, aMatrix, bMatrix); +} + +template +PTO_INTERNAL void TGEMV_MX_IMPL(TileRes &cMatrix, TileLeft &aMatrix, TileLeftScale &aScaleMatrix, TileRight &bMatrix, + TileRightScale &bScaleMatrix, TileBias &biasData) +{ + (void)Phase; + (void)aScaleMatrix; + (void)bScaleMatrix; + TGEMV_BIAS_IMPL(cMatrix, aMatrix, bMatrix, biasData); +} + +} // namespace pto + +#endif diff --git a/include/pto/gpu/common/TMrgSort.hpp b/include/pto/gpu/common/TMrgSort.hpp new file mode 100644 index 00000000..9ea80f4d --- /dev/null +++ b/include/pto/gpu/common/TMrgSort.hpp @@ -0,0 +1,42 @@ +/** +Copyright (c) 2026 Huawei Technologies Co., Ltd. +This program is free software, you can redistribute it and/or modify it under the terms and conditions of +CANN Open Software License Agreement Version 2.0 (the "License"). +Please refer to the License for details. You may not use this file except in compliance with the License. +THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +See LICENSE in the root of the software repository for the full text of the License. 
+*/ + +#ifndef PTO_GPU_COMMON_TMRGSORT_HPP +#define PTO_GPU_COMMON_TMRGSORT_HPP + +namespace pto { + +struct MrgSortExecutedNumList { +}; + +template +PTO_INTERNAL void TMRGSORT_IMPL(DstTileData &, MrgSortExecutedNumList &, TmpTileData &, Src0TileData &, Src1TileData &) +{ +} + +template +PTO_INTERNAL void TMRGSORT_IMPL(DstTileData &, MrgSortExecutedNumList &, TmpTileData &, Src0TileData &, Src1TileData &, + Src2TileData &, Src3TileData &) +{ +} + +template +PTO_INTERNAL void TMRGSORT_IMPL(DstTileData &, MrgSortExecutedNumList &, TmpTileData &, Src0TileData &, Src1TileData &, + Src2TileData &, Src3TileData &, Src4TileData &, Src5TileData &, Src6TileData &, + Src7TileData &) +{ +} + +} // namespace pto + +#endif diff --git a/include/pto/gpu/common/TStore.hpp b/include/pto/gpu/common/TStore.hpp new file mode 100644 index 00000000..bdac8df5 --- /dev/null +++ b/include/pto/gpu/common/TStore.hpp @@ -0,0 +1,105 @@ +/** +Copyright (c) 2026 Huawei Technologies Co., Ltd. +This program is free software, you can redistribute it and/or modify it under the terms and conditions of +CANN Open Software License Agreement Version 2.0 (the "License"). +Please refer to the License for details. You may not use this file except in compliance with the License. +THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +See LICENSE in the root of the software repository for the full text of the License. +*/ + +#ifndef PTO_GPU_COMMON_TSTORE_HPP +#define PTO_GPU_COMMON_TSTORE_HPP + +#include +#include "pto/gpu/common/tile_offsets.hpp" + +namespace pto { + +template +PTO_INTERNAL void TSTORE_IMPL(GlobalData &dst, TileData &src) +{ + static_assert(atomicType == AtomicType::AtomicNone, + "Atomic store modes are not implemented yet in the v1 GPU backend."); + static_assert(GlobalData::layout == pto::Layout::ND || GlobalData::layout == pto::Layout::DN, + "Only ND and DN GlobalTensor layouts are supported by the v1 GPU backend."); + + const int gShape0 = dst.GetShape(pto::GlobalTensorDim::DIM_0); + const int gShape1 = dst.GetShape(pto::GlobalTensorDim::DIM_1); + const int gShape2 = dst.GetShape(pto::GlobalTensorDim::DIM_2); + const int gShape3 = dst.GetShape(pto::GlobalTensorDim::DIM_3); + const int gShape4 = dst.GetShape(pto::GlobalTensorDim::DIM_4); + + const int gStride0 = dst.GetStride(pto::GlobalTensorDim::DIM_0); + const int gStride1 = dst.GetStride(pto::GlobalTensorDim::DIM_1); + const int gStride2 = dst.GetStride(pto::GlobalTensorDim::DIM_2); + const int gStride3 = dst.GetStride(pto::GlobalTensorDim::DIM_3); + const int gStride4 = dst.GetStride(pto::GlobalTensorDim::DIM_4); + + const int validRow = src.GetValidRow(); + const int validCol = src.GetValidCol(); + + if constexpr (GlobalData::layout == pto::Layout::ND) { + PTO_ASSERT(TileData::isRowMajor, "Fix: ND stores currently require row-major tiles in the GPU backend."); + PTO_ASSERT(gShape0 * gShape1 * gShape2 * gShape3 == validRow && gShape4 == validCol, + "Fix: ND TSTORE valid shape mismatch in GPU backend."); + + int rowBase = 0; + for (int i = 0; i < gShape0; ++i) { + for (int j = 0; j < gShape1; ++j) { + for (int k = 0; k < gShape2; ++k) { + for (int r = 0; r < gShape3; ++r) { + const int tileRow = rowBase + r; + const int dstBase = i * gStride0 + j * gStride1 + k * gStride2 + r * gStride3; + for (int c = 0; c < gShape4; ++c) { + const std::size_t srcIdx = gpu::GetTileElementOffset(tileRow, c); + 
dst.data()[dstBase + c * gStride4] = src.data()[srcIdx]; + } + } + rowBase += gShape3; + } + } + } + } else { + PTO_ASSERT(!TileData::isRowMajor, "Fix: DN stores currently require col-major tiles in the GPU backend."); + PTO_ASSERT(gShape0 * gShape1 * gShape2 * gShape4 == validCol && gShape3 == validRow, + "Fix: DN TSTORE valid shape mismatch in GPU backend."); + + int colBase = 0; + for (int i = 0; i < gShape0; ++i) { + for (int j = 0; j < gShape1; ++j) { + for (int k = 0; k < gShape2; ++k) { + for (int c = 0; c < gShape4; ++c) { + const int tileCol = colBase + c; + const int dstBase = i * gStride0 + j * gStride1 + k * gStride2 + c * gStride4; + for (int r = 0; r < gShape3; ++r) { + const std::size_t srcIdx = gpu::GetTileElementOffset(r, tileCol); + dst.data()[dstBase + r * gStride3] = src.data()[srcIdx]; + } + } + colBase += gShape4; + } + } + } + } +} + +template +PTO_INTERNAL void TSTORE_IMPL(GlobalData &dst, TileData &src, uint64_t preQuantScalar) +{ + (void)preQuantScalar; + TSTORE_IMPL(dst, src); +} + +template +PTO_INTERNAL void TSTORE_IMPL(GlobalData &dst, TileData &src, FpTileData &fp) +{ + (void)fp; + (void)reluPreMode; + TSTORE_IMPL(dst, src); +} + +} // namespace pto + +#endif diff --git a/include/pto/gpu/common/TSync.hpp b/include/pto/gpu/common/TSync.hpp new file mode 100644 index 00000000..f8816804 --- /dev/null +++ b/include/pto/gpu/common/TSync.hpp @@ -0,0 +1,28 @@ +/** +Copyright (c) 2026 Huawei Technologies Co., Ltd. +This program is free software, you can redistribute it and/or modify it under the terms and conditions of +CANN Open Software License Agreement Version 2.0 (the "License"). +Please refer to the License for details. You may not use this file except in compliance with the License. +THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +See LICENSE in the root of the software repository for the full text of the License. +*/ + +#ifndef PTO_GPU_COMMON_TSYNC_HPP +#define PTO_GPU_COMMON_TSYNC_HPP + +#include +#include + +namespace pto { + +template +PTO_INTERNAL void TSYNC_IMPL() +{ + (void)sizeof(OpCode); + // v1 GPU backend keeps advanced PTO event / pipeline semantics deferred. +} + +} // namespace pto + +#endif diff --git a/include/pto/gpu/common/tile_offsets.hpp b/include/pto/gpu/common/tile_offsets.hpp new file mode 100644 index 00000000..90747c69 --- /dev/null +++ b/include/pto/gpu/common/tile_offsets.hpp @@ -0,0 +1,81 @@ +/** +Copyright (c) 2026 Huawei Technologies Co., Ltd. +This program is free software, you can redistribute it and/or modify it under the terms and conditions of +CANN Open Software License Agreement Version 2.0 (the "License"). +Please refer to the License for details. You may not use this file except in compliance with the License. +THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +See LICENSE in the root of the software repository for the full text of the License. 
+*/ + +#ifndef PTO_GPU_COMMON_TILE_OFFSETS_HPP +#define PTO_GPU_COMMON_TILE_OFFSETS_HPP + +#include + +namespace pto::gpu { + +PTO_INTERNAL constexpr bool IsPowerOfTwo(std::size_t value) +{ + return value != 0 && ((value & (value - 1)) == 0); +} + +PTO_INTERNAL constexpr std::size_t PermuteSwizzleChunk(std::size_t chunk, std::size_t lane, std::size_t chunkCount) +{ + return IsPowerOfTwo(chunkCount) ? ((chunk ^ lane) & (chunkCount - 1)) : ((chunk + lane) % chunkCount); +} + +template +PTO_INTERNAL std::size_t GetTileElementOffsetGpuSwizzle(std::size_t r, std::size_t c) +{ + constexpr std::size_t swizzleRows = TileData::InnerRows; + constexpr std::size_t swizzleCols = TileData::InnerCols; + const std::size_t rowBlock = r / swizzleRows; + const std::size_t rowInBlock = r % swizzleRows; + const std::size_t chunk = c / swizzleCols; + const std::size_t colInChunk = c % swizzleCols; + const std::size_t chunksPerRow = TileData::Cols / swizzleCols; + const std::size_t permutedChunk = PermuteSwizzleChunk(chunk, rowInBlock % chunksPerRow, chunksPerRow); + return rowBlock * swizzleRows * TileData::Cols + rowInBlock * TileData::Cols + permutedChunk * swizzleCols + colInChunk; +} + +template +PTO_INTERNAL std::size_t GetTileElementOffsetSubfractals(std::size_t subTileR, std::size_t innerR, + std::size_t subTileC, std::size_t innerC) +{ + if constexpr (!TileData::isRowMajor && (TileData::SFractal == SLayout::RowMajor)) { + return subTileC * TileData::Rows * TileData::InnerCols + subTileR * TileData::InnerNumel + + innerR * TileData::InnerCols + innerC; + } else if constexpr (TileData::isRowMajor && (TileData::SFractal == SLayout::ColMajor)) { + return subTileR * TileData::Cols * TileData::InnerRows + subTileC * TileData::InnerNumel + + innerC * TileData::InnerRows + innerR; + } else { + return subTileR * TileData::Cols * TileData::InnerRows + subTileC * TileData::InnerNumel + + innerR * TileData::InnerCols + innerC; + } +} + +template +PTO_INTERNAL std::size_t GetTileElementOffsetPlain(std::size_t r, std::size_t c) +{ + if constexpr (TileData::isRowMajor) { + return r * TileData::Cols + c; + } + return c * TileData::Rows + r; +} + +template +PTO_INTERNAL std::size_t GetTileElementOffset(std::size_t r, std::size_t c) +{ + if constexpr (TileData::SFractal == SLayout::NoneBox) { + return GetTileElementOffsetPlain(r, c); + } else if constexpr (TileData::SFractal == SLayout::GpuSwizzle128B) { + return GetTileElementOffsetGpuSwizzle(r, c); + } + return GetTileElementOffsetSubfractals(r / TileData::InnerRows, r % TileData::InnerRows, + c / TileData::InnerCols, c % TileData::InnerCols); +} + +} // namespace pto::gpu + +#endif diff --git a/include/pto/gpu/sm100/README.md b/include/pto/gpu/sm100/README.md new file mode 100644 index 00000000..aaf26423 --- /dev/null +++ b/include/pto/gpu/sm100/README.md @@ -0,0 +1,24 @@ +# include/pto/gpu/sm100/ + +Architecture-specific PTO GPU specializations for **Blackwell**. + +## Role + +This directory is intended to hold per-SM overrides for instructions whose optimal implementation depends on: + +- MMA / tensor-core generation +- shared-memory banking and swizzle rules +- async copy behavior +- warpgroup scheduling details +- register pressure / occupancy trade-offs +- architecture-specific instructions or PTX variants + +## Expected contents later + +- instruction-level specializations (`TLoad`, `TStore`, `TAdd`, `TMatmul`, ...) 
+- tuning metadata +- tile-shape policy tables +- performance notes +- microkernel registry + +This folder is a scaffold for bring-up, not a live backend yet. diff --git a/include/pto/gpu/sm100/microkernels/README.md b/include/pto/gpu/sm100/microkernels/README.md new file mode 100644 index 00000000..3e0e1e1f --- /dev/null +++ b/include/pto/gpu/sm100/microkernels/README.md @@ -0,0 +1,22 @@ +# include/pto/gpu/sm100/microkernels/ + +Planned home for dedicated PTO GPU microkernels targeting **sm100**. + +The intent is to store instruction-family-specific kernels here, for example: + +- memory movement microkernels +- elementwise/vector microkernels +- reduction microkernels +- transpose / reshape helpers +- tensor-core / matmul kernels + +Recommended future organization: + +- `memory/` +- `elementwise/` +- `reduction/` +- `matrix/` +- `transform/` +- `sync/` + +Populate this directory only after the common GPU backend contract is defined. diff --git a/include/pto/gpu/sm100a/README.md b/include/pto/gpu/sm100a/README.md new file mode 100644 index 00000000..7200336e --- /dev/null +++ b/include/pto/gpu/sm100a/README.md @@ -0,0 +1,24 @@ +# include/pto/gpu/sm100a/ + +Architecture-specific PTO GPU specializations for **Blackwell architecture-conditional specialization**. + +## Role + +This directory is intended to hold per-SM overrides for instructions whose optimal implementation depends on: + +- MMA / tensor-core generation +- shared-memory banking and swizzle rules +- async copy behavior +- warpgroup scheduling details +- register pressure / occupancy trade-offs +- architecture-specific instructions or PTX variants + +## Expected contents later + +- instruction-level specializations (`TLoad`, `TStore`, `TAdd`, `TMatmul`, ...) +- tuning metadata +- tile-shape policy tables +- performance notes +- microkernel registry + +This folder is a scaffold for bring-up, not a live backend yet. diff --git a/include/pto/gpu/sm100a/microkernels/README.md b/include/pto/gpu/sm100a/microkernels/README.md new file mode 100644 index 00000000..ab2a6cb2 --- /dev/null +++ b/include/pto/gpu/sm100a/microkernels/README.md @@ -0,0 +1,22 @@ +# include/pto/gpu/sm100a/microkernels/ + +Planned home for dedicated PTO GPU microkernels targeting **sm100a**. + +The intent is to store instruction-family-specific kernels here, for example: + +- memory movement microkernels +- elementwise/vector microkernels +- reduction microkernels +- transpose / reshape helpers +- tensor-core / matmul kernels + +Recommended future organization: + +- `memory/` +- `elementwise/` +- `reduction/` +- `matrix/` +- `transform/` +- `sync/` + +Populate this directory only after the common GPU backend contract is defined. diff --git a/include/pto/gpu/sm103/README.md b/include/pto/gpu/sm103/README.md new file mode 100644 index 00000000..eb2fa400 --- /dev/null +++ b/include/pto/gpu/sm103/README.md @@ -0,0 +1,24 @@ +# include/pto/gpu/sm103/ + +Architecture-specific PTO GPU specializations for **GB300 / B300 generation**. + +## Role + +This directory is intended to hold per-SM overrides for instructions whose optimal implementation depends on: + +- MMA / tensor-core generation +- shared-memory banking and swizzle rules +- async copy behavior +- warpgroup scheduling details +- register pressure / occupancy trade-offs +- architecture-specific instructions or PTX variants + +## Expected contents later + +- instruction-level specializations (`TLoad`, `TStore`, `TAdd`, `TMatmul`, ...) 
+- tuning metadata +- tile-shape policy tables +- performance notes +- microkernel registry + +This folder is a scaffold for bring-up, not a live backend yet. diff --git a/include/pto/gpu/sm103/microkernels/README.md b/include/pto/gpu/sm103/microkernels/README.md new file mode 100644 index 00000000..ef686743 --- /dev/null +++ b/include/pto/gpu/sm103/microkernels/README.md @@ -0,0 +1,22 @@ +# include/pto/gpu/sm103/microkernels/ + +Planned home for dedicated PTO GPU microkernels targeting **sm103**. + +The intent is to store instruction-family-specific kernels here, for example: + +- memory movement microkernels +- elementwise/vector microkernels +- reduction microkernels +- transpose / reshape helpers +- tensor-core / matmul kernels + +Recommended future organization: + +- `memory/` +- `elementwise/` +- `reduction/` +- `matrix/` +- `transform/` +- `sync/` + +Populate this directory only after the common GPU backend contract is defined. diff --git a/include/pto/gpu/sm110/README.md b/include/pto/gpu/sm110/README.md new file mode 100644 index 00000000..7408a2bf --- /dev/null +++ b/include/pto/gpu/sm110/README.md @@ -0,0 +1,24 @@ +# include/pto/gpu/sm110/ + +Architecture-specific PTO GPU specializations for **Thor / Jetson T-series generation**. + +## Role + +This directory is intended to hold per-SM overrides for instructions whose optimal implementation depends on: + +- MMA / tensor-core generation +- shared-memory banking and swizzle rules +- async copy behavior +- warpgroup scheduling details +- register pressure / occupancy trade-offs +- architecture-specific instructions or PTX variants + +## Expected contents later + +- instruction-level specializations (`TLoad`, `TStore`, `TAdd`, `TMatmul`, ...) +- tuning metadata +- tile-shape policy tables +- performance notes +- microkernel registry + +This folder is a scaffold for bring-up, not a live backend yet. diff --git a/include/pto/gpu/sm110/microkernels/README.md b/include/pto/gpu/sm110/microkernels/README.md new file mode 100644 index 00000000..db61fa0f --- /dev/null +++ b/include/pto/gpu/sm110/microkernels/README.md @@ -0,0 +1,22 @@ +# include/pto/gpu/sm110/microkernels/ + +Planned home for dedicated PTO GPU microkernels targeting **sm110**. + +The intent is to store instruction-family-specific kernels here, for example: + +- memory movement microkernels +- elementwise/vector microkernels +- reduction microkernels +- transpose / reshape helpers +- tensor-core / matmul kernels + +Recommended future organization: + +- `memory/` +- `elementwise/` +- `reduction/` +- `matrix/` +- `transform/` +- `sync/` + +Populate this directory only after the common GPU backend contract is defined. diff --git a/include/pto/gpu/sm120/README.md b/include/pto/gpu/sm120/README.md new file mode 100644 index 00000000..9d33c453 --- /dev/null +++ b/include/pto/gpu/sm120/README.md @@ -0,0 +1,24 @@ +# include/pto/gpu/sm120/ + +Architecture-specific PTO GPU specializations for **Blackwell workstation / GeForce generation**. + +## Role + +This directory is intended to hold per-SM overrides for instructions whose optimal implementation depends on: + +- MMA / tensor-core generation +- shared-memory banking and swizzle rules +- async copy behavior +- warpgroup scheduling details +- register pressure / occupancy trade-offs +- architecture-specific instructions or PTX variants + +## Expected contents later + +- instruction-level specializations (`TLoad`, `TStore`, `TAdd`, `TMatmul`, ...) 
+- tuning metadata +- tile-shape policy tables +- performance notes +- microkernel registry + +This folder is a scaffold for bring-up, not a live backend yet. diff --git a/include/pto/gpu/sm120/microkernels/README.md b/include/pto/gpu/sm120/microkernels/README.md new file mode 100644 index 00000000..9c8e0f35 --- /dev/null +++ b/include/pto/gpu/sm120/microkernels/README.md @@ -0,0 +1,22 @@ +# include/pto/gpu/sm120/microkernels/ + +Planned home for dedicated PTO GPU microkernels targeting **sm120**. + +The intent is to store instruction-family-specific kernels here, for example: + +- memory movement microkernels +- elementwise/vector microkernels +- reduction microkernels +- transpose / reshape helpers +- tensor-core / matmul kernels + +Recommended future organization: + +- `memory/` +- `elementwise/` +- `reduction/` +- `matrix/` +- `transform/` +- `sync/` + +Populate this directory only after the common GPU backend contract is defined. diff --git a/include/pto/gpu/sm121/README.md b/include/pto/gpu/sm121/README.md new file mode 100644 index 00000000..e46c5840 --- /dev/null +++ b/include/pto/gpu/sm121/README.md @@ -0,0 +1,24 @@ +# include/pto/gpu/sm121/ + +Architecture-specific PTO GPU specializations for **GB10 / DGX Spark**. + +## Role + +This directory is intended to hold per-SM overrides for instructions whose optimal implementation depends on: + +- MMA / tensor-core generation +- shared-memory banking and swizzle rules +- async copy behavior +- warpgroup scheduling details +- register pressure / occupancy trade-offs +- architecture-specific instructions or PTX variants + +## Expected contents later + +- instruction-level specializations (`TLoad`, `TStore`, `TAdd`, `TMatmul`, ...) +- tuning metadata +- tile-shape policy tables +- performance notes +- microkernel registry + +This folder is a scaffold for bring-up, not a live backend yet. diff --git a/include/pto/gpu/sm121/TMatmul.hpp b/include/pto/gpu/sm121/TMatmul.hpp new file mode 100644 index 00000000..7e1ba10c --- /dev/null +++ b/include/pto/gpu/sm121/TMatmul.hpp @@ -0,0 +1,295 @@ +/** +Copyright (c) 2026 Huawei Technologies Co., Ltd. +This program is free software, you can redistribute it and/or modify it under the terms and conditions of +CANN Open Software License Agreement Version 2.0 (the "License"). +Please refer to the License for details. You may not use this file except in compliance with the License. +THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +See LICENSE in the root of the software repository for the full text of the License. 
+*/ + +#ifndef PTO_GPU_SM121_TMATMUL_HPP +#define PTO_GPU_SM121_TMATMUL_HPP + +#include +#include +#include +#include "pto/gpu/common/tile_offsets.hpp" + +namespace pto::gpu::sm121 { + +namespace wmma = nvcuda::wmma; + +constexpr int kMmaM = 16; +constexpr int kMmaN = 16; +constexpr int kMmaK = 16; + +template +PTO_INTERNAL float ToAccumFloat(T value) +{ + return static_cast(value); +} + +template <> +PTO_INTERNAL float ToAccumFloat(half value) +{ + return __half2float(value); +} + +template <> +PTO_INTERNAL float ToAccumFloat(bfloat16_t value) +{ + return __bfloat162float(value); +} + +PTO_INTERNAL float InlinePtxFma(float a, float b, float c) +{ + float out; + asm volatile("fma.rn.f32 %0, %1, %2, %3;" : "=f"(out) : "f"(a), "f"(b), "f"(c)); + return out; +} + +PTO_INTERNAL unsigned LinearThreadId() +{ + return threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z); +} + +PTO_INTERNAL unsigned ThreadsPerBlock() +{ + return blockDim.x * blockDim.y * blockDim.z; +} + +template +PTO_INTERNAL bool CanUseSm121TensorCoreCore(TileAccOut &cOutMatrix, TileLeft &aMatrix, TileRight &bMatrix) +{ + using CType = typename TileAccOut::DType; + using AType = typename TileLeft::DType; + using BType = typename TileRight::DType; + + constexpr bool supportedTypes = + std::is_same_v && + ((std::is_same_v && std::is_same_v) || + (std::is_same_v && std::is_same_v)); + + if constexpr (!supportedTypes) { + (void)cOutMatrix; + (void)aMatrix; + (void)bMatrix; + return false; + } else { + if constexpr (!TileLeft::isRowMajor || !TileRight::isRowMajor || !TileAccOut::isRowMajor) { + return false; + } + + const uint16_t m = aMatrix.GetValidRow(); + const uint16_t k = aMatrix.GetValidCol(); + const uint16_t n = bMatrix.GetValidCol(); + if (k != bMatrix.GetValidRow()) { + return false; + } + if ((m % kMmaM) != 0 || (n % kMmaN) != 0 || (k % kMmaK) != 0) { + return false; + } + return ThreadsPerBlock() >= warpSize; + } +} + +template +PTO_INTERNAL bool TryTensorCoreTMATMULCore(TileAccOut &cOutMatrix, TileLeft &aMatrix, TileRight &bMatrix, + TileAccIn *cInMatrix = nullptr, TileBias *biasMatrix = nullptr) +{ + using CType = typename TileAccOut::DType; + using AType = typename TileLeft::DType; + using BType = typename TileRight::DType; + + constexpr bool supportedTypes = + std::is_same_v && + ((std::is_same_v && std::is_same_v) || + (std::is_same_v && std::is_same_v)); + + if constexpr (!supportedTypes) { + (void)cOutMatrix; + (void)aMatrix; + (void)bMatrix; + (void)cInMatrix; + (void)biasMatrix; + return false; + } else { + if (!CanUseSm121TensorCoreCore(cOutMatrix, aMatrix, bMatrix)) { + return false; + } + if constexpr (UseAcc) { + if constexpr (!std::is_same_v || !TileAccIn::isRowMajor) { + return false; + } + } + if constexpr (UseBias) { + if constexpr (!std::is_same_v || (TileBias::Rows != 1) || + !TileBias::isRowMajor) { + return false; + } + } + + const unsigned linearTid = LinearThreadId(); + const unsigned warpId = linearTid / warpSize; + if (warpId != 0) { + return true; + } + + const uint16_t m = aMatrix.GetValidRow(); + const uint16_t k = aMatrix.GetValidCol(); + const uint16_t n = bMatrix.GetValidCol(); + const int lda = TileLeft::Cols; + const int ldb = TileRight::Cols; + const int ldc = TileAccOut::Cols; + + using FragA = wmma::fragment; + using FragB = wmma::fragment; + using FragC = wmma::fragment; + + for (uint16_t mBase = 0; mBase < m; mBase += kMmaM) { + for (uint16_t nBase = 0; nBase < n; nBase += kMmaN) { + FragC cFrag; + if constexpr (UseAcc) { + float *cInPtr = cInMatrix->data() + 
gpu::GetTileElementOffset(mBase, nBase); + wmma::load_matrix_sync(cFrag, cInPtr, ldc, wmma::mem_row_major); + } else { + wmma::fill_fragment(cFrag, 0.0f); + } + + for (uint16_t kBase = 0; kBase < k; kBase += kMmaK) { + FragA aFrag; + FragB bFrag; + AType *aPtr = aMatrix.data() + gpu::GetTileElementOffset(mBase, kBase); + BType *bPtr = bMatrix.data() + gpu::GetTileElementOffset(kBase, nBase); + wmma::load_matrix_sync(aFrag, aPtr, lda); + wmma::load_matrix_sync(bFrag, bPtr, ldb); + wmma::mma_sync(cFrag, aFrag, bFrag, cFrag); + } + + float *cOutPtr = cOutMatrix.data() + gpu::GetTileElementOffset(mBase, nBase); + wmma::store_matrix_sync(cOutPtr, cFrag, ldc, wmma::mem_row_major); + + if constexpr (UseBias) { + __syncwarp(); + if ((linearTid & (warpSize - 1)) == 0) { + for (uint16_t row = 0; row < kMmaM; ++row) { + for (uint16_t col = 0; col < kMmaN; ++col) { + const std::size_t outIdx = row * ldc + col; + const std::size_t biasIdx = gpu::GetTileElementOffset(0, nBase + col); + cOutPtr[outIdx] += biasMatrix->data()[biasIdx]; + } + } + } + __syncwarp(); + } + } + } + return true; + } +} + +template +PTO_INTERNAL bool TryTensorCoreTMATMUL(TileAcc &cMatrix, TileLeft &aMatrix, TileRight &bMatrix) +{ + return TryTensorCoreTMATMULCore(cMatrix, aMatrix, bMatrix); +} + +template +PTO_INTERNAL bool TryTensorCoreTMATMULAcc(TileAcc &cOutMatrix, TileAcc &cInMatrix, TileLeft &aMatrix, TileRight &bMatrix) +{ + return TryTensorCoreTMATMULCore( + cOutMatrix, aMatrix, bMatrix, &cInMatrix, static_cast(nullptr)); +} + +template +PTO_INTERNAL bool TryTensorCoreTMATMULAcc(TileAcc &cMatrix, TileLeft &aMatrix, TileRight &bMatrix) +{ + return TryTensorCoreTMATMULCore( + cMatrix, aMatrix, bMatrix, &cMatrix, static_cast(nullptr)); +} + +template +PTO_INTERNAL bool TryTensorCoreTMATMULBias(TileAcc &cMatrix, TileLeft &aMatrix, TileRight &bMatrix, TileBias &biasMatrix) +{ + return TryTensorCoreTMATMULCore( + cMatrix, aMatrix, bMatrix, static_cast(nullptr), &biasMatrix); +} + +template +PTO_INTERNAL bool TryInlinePtxF32TMATMUL(TileAcc &cMatrix, TileLeft &aMatrix, TileRight &bMatrix) +{ + using CType = typename TileAcc::DType; + using AType = typename TileLeft::DType; + using BType = typename TileRight::DType; + + constexpr bool supportedTypes = std::is_same_v && + ((std::is_same_v && std::is_same_v) || + (std::is_same_v && std::is_same_v) || + (std::is_same_v && std::is_same_v)); + if constexpr (!supportedTypes) { + (void)cMatrix; + (void)aMatrix; + (void)bMatrix; + return false; + } else { + const unsigned linearTid = LinearThreadId(); + if (linearTid != 0) { + return ThreadsPerBlock() >= 1; + } + + const uint16_t m = aMatrix.GetValidRow(); + const uint16_t k = aMatrix.GetValidCol(); + const uint16_t n = bMatrix.GetValidCol(); + if (k != bMatrix.GetValidRow()) { + return false; + } + +#pragma unroll 1 + for (uint16_t row = 0; row < m; ++row) { + for (uint16_t col = 0; col < n; ++col) { + float acc = 0.0f; +#pragma unroll 4 + for (uint16_t kk = 0; kk < k; ++kk) { + const std::size_t aIdx = gpu::GetTileElementOffset(row, kk); + const std::size_t bIdx = gpu::GetTileElementOffset(kk, col); + acc = InlinePtxFma(ToAccumFloat(aMatrix.data()[aIdx]), ToAccumFloat(bMatrix.data()[bIdx]), acc); + } + cMatrix.data()[gpu::GetTileElementOffset(row, col)] = acc; + } + } + return true; + } +} + +template +PTO_INTERNAL bool TrySm121TMATMUL(TileAcc &cMatrix, TileLeft &aMatrix, TileRight &bMatrix) +{ + if (TryTensorCoreTMATMUL(cMatrix, aMatrix, bMatrix)) { + return true; + } + return TryInlinePtxF32TMATMUL(cMatrix, aMatrix, bMatrix); +} + 
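+// Note on dispatch order: TrySm121TMATMUL above prefers the warp-level
+// tensor-core (WMMA) path and only falls back to the single-thread inline-PTX
+// FMA loop when the tile types, layouts, or shapes are not supported. The
+// accumulate / bias wrappers below attempt only the tensor-core path and
+// return false on unsupported inputs so the caller can fall back to the
+// generic implementation.
+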
+template +PTO_INTERNAL bool TrySm121TMATMULAcc(TileAcc &cOutMatrix, TileAcc &cInMatrix, TileLeft &aMatrix, TileRight &bMatrix) +{ + return TryTensorCoreTMATMULAcc(cOutMatrix, cInMatrix, aMatrix, bMatrix); +} + +template +PTO_INTERNAL bool TrySm121TMATMULAcc(TileAcc &cMatrix, TileLeft &aMatrix, TileRight &bMatrix) +{ + return TryTensorCoreTMATMULAcc(cMatrix, aMatrix, bMatrix); +} + +template +PTO_INTERNAL bool TrySm121TMATMULBias(TileAcc &cMatrix, TileLeft &aMatrix, TileRight &bMatrix, TileBias &biasMatrix) +{ + return TryTensorCoreTMATMULBias(cMatrix, aMatrix, bMatrix, biasMatrix); +} + +} // namespace pto::gpu::sm121 + +#endif diff --git a/include/pto/gpu/sm121/arch.hpp b/include/pto/gpu/sm121/arch.hpp new file mode 100644 index 00000000..0167fb01 --- /dev/null +++ b/include/pto/gpu/sm121/arch.hpp @@ -0,0 +1,24 @@ +/** +Copyright (c) 2026 Huawei Technologies Co., Ltd. +This program is free software, you can redistribute it and/or modify it under the terms and conditions of +CANN Open Software License Agreement Version 2.0 (the "License"). +Please refer to the License for details. You may not use this file except in compliance with the License. +THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +See LICENSE in the root of the software repository for the full text of the License. +*/ + +#ifndef PTO_GPU_SM121_ARCH_HPP +#define PTO_GPU_SM121_ARCH_HPP + +namespace pto::gpu::sm121 { + +constexpr int kComputeCapability = 121; +constexpr bool kPreferInlinePtx = true; +constexpr unsigned kDefaultWarpTileM = 16; +constexpr unsigned kDefaultWarpTileN = 16; +constexpr unsigned kDefaultWarpTileK = 16; + +} // namespace pto::gpu::sm121 + +#endif diff --git a/include/pto/gpu/sm121/microkernels/README.md b/include/pto/gpu/sm121/microkernels/README.md new file mode 100644 index 00000000..3b89e772 --- /dev/null +++ b/include/pto/gpu/sm121/microkernels/README.md @@ -0,0 +1,22 @@ +# include/pto/gpu/sm121/microkernels/ + +Planned home for dedicated PTO GPU microkernels targeting **sm121**. + +The intent is to store instruction-family-specific kernels here, for example: + +- memory movement microkernels +- elementwise/vector microkernels +- reduction microkernels +- transpose / reshape helpers +- tensor-core / matmul kernels + +Recommended future organization: + +- `memory/` +- `elementwise/` +- `reduction/` +- `matrix/` +- `transform/` +- `sync/` + +Populate this directory only after the common GPU backend contract is defined. diff --git a/include/pto/gpu/sm70/README.md b/include/pto/gpu/sm70/README.md new file mode 100644 index 00000000..1f51ded9 --- /dev/null +++ b/include/pto/gpu/sm70/README.md @@ -0,0 +1,24 @@ +# include/pto/gpu/sm70/ + +Architecture-specific PTO GPU specializations for **Volta**. + +## Role + +This directory is intended to hold per-SM overrides for instructions whose optimal implementation depends on: + +- MMA / tensor-core generation +- shared-memory banking and swizzle rules +- async copy behavior +- warpgroup scheduling details +- register pressure / occupancy trade-offs +- architecture-specific instructions or PTX variants + +## Expected contents later + +- instruction-level specializations (`TLoad`, `TStore`, `TAdd`, `TMatmul`, ...) +- tuning metadata +- tile-shape policy tables +- performance notes +- microkernel registry + +This folder is a scaffold for bring-up, not a live backend yet. 
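+
+As a rough illustration of how a per-SM override header can stay safe to include on every target, the sketch below gates a Volta-specific capability flag behind a compute-capability check. This is an assumption about the eventual structure, not existing code: the `pto::gpu::sm70` namespace and `kHasWmmaHalf16x16x16` constant are hypothetical and simply mirror the constant style used in `sm121/arch.hpp`; `__CUDA_ARCH__` is the standard NVCC device-compilation macro.
+
+```cpp
+// Illustrative guard only (hypothetical names, not existing code).
+namespace pto::gpu::sm70 {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700)
+constexpr bool kHasWmmaHalf16x16x16 = true;   // Volta+ device pass: WMMA half tiles available
+#else
+constexpr bool kHasWmmaHalf16x16x16 = false;  // host pass or older targets: use the common path
+#endif
+} // namespace pto::gpu::sm70
+```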
diff --git a/include/pto/gpu/sm70/microkernels/README.md b/include/pto/gpu/sm70/microkernels/README.md new file mode 100644 index 00000000..a902c0fd --- /dev/null +++ b/include/pto/gpu/sm70/microkernels/README.md @@ -0,0 +1,22 @@ +# include/pto/gpu/sm70/microkernels/ + +Planned home for dedicated PTO GPU microkernels targeting **sm70**. + +The intent is to store instruction-family-specific kernels here, for example: + +- memory movement microkernels +- elementwise/vector microkernels +- reduction microkernels +- transpose / reshape helpers +- tensor-core / matmul kernels + +Recommended future organization: + +- `memory/` +- `elementwise/` +- `reduction/` +- `matrix/` +- `transform/` +- `sync/` + +Populate this directory only after the common GPU backend contract is defined. diff --git a/include/pto/gpu/sm75/README.md b/include/pto/gpu/sm75/README.md new file mode 100644 index 00000000..2bc42975 --- /dev/null +++ b/include/pto/gpu/sm75/README.md @@ -0,0 +1,24 @@ +# include/pto/gpu/sm75/ + +Architecture-specific PTO GPU specializations for **Turing**. + +## Role + +This directory is intended to hold per-SM overrides for instructions whose optimal implementation depends on: + +- MMA / tensor-core generation +- shared-memory banking and swizzle rules +- async copy behavior +- warpgroup scheduling details +- register pressure / occupancy trade-offs +- architecture-specific instructions or PTX variants + +## Expected contents later + +- instruction-level specializations (`TLoad`, `TStore`, `TAdd`, `TMatmul`, ...) +- tuning metadata +- tile-shape policy tables +- performance notes +- microkernel registry + +This folder is a scaffold for bring-up, not a live backend yet. diff --git a/include/pto/gpu/sm75/microkernels/README.md b/include/pto/gpu/sm75/microkernels/README.md new file mode 100644 index 00000000..f37638b2 --- /dev/null +++ b/include/pto/gpu/sm75/microkernels/README.md @@ -0,0 +1,22 @@ +# include/pto/gpu/sm75/microkernels/ + +Planned home for dedicated PTO GPU microkernels targeting **sm75**. + +The intent is to store instruction-family-specific kernels here, for example: + +- memory movement microkernels +- elementwise/vector microkernels +- reduction microkernels +- transpose / reshape helpers +- tensor-core / matmul kernels + +Recommended future organization: + +- `memory/` +- `elementwise/` +- `reduction/` +- `matrix/` +- `transform/` +- `sync/` + +Populate this directory only after the common GPU backend contract is defined. diff --git a/include/pto/gpu/sm80/README.md b/include/pto/gpu/sm80/README.md new file mode 100644 index 00000000..40375e0a --- /dev/null +++ b/include/pto/gpu/sm80/README.md @@ -0,0 +1,24 @@ +# include/pto/gpu/sm80/ + +Architecture-specific PTO GPU specializations for **Ampere datacenter**. + +## Role + +This directory is intended to hold per-SM overrides for instructions whose optimal implementation depends on: + +- MMA / tensor-core generation +- shared-memory banking and swizzle rules +- async copy behavior +- warpgroup scheduling details +- register pressure / occupancy trade-offs +- architecture-specific instructions or PTX variants + +## Expected contents later + +- instruction-level specializations (`TLoad`, `TStore`, `TAdd`, `TMatmul`, ...) +- tuning metadata +- tile-shape policy tables +- performance notes +- microkernel registry + +This folder is a scaffold for bring-up, not a live backend yet. 
diff --git a/include/pto/gpu/sm80/microkernels/README.md b/include/pto/gpu/sm80/microkernels/README.md new file mode 100644 index 00000000..952c513e --- /dev/null +++ b/include/pto/gpu/sm80/microkernels/README.md @@ -0,0 +1,22 @@ +# include/pto/gpu/sm80/microkernels/ + +Planned home for dedicated PTO GPU microkernels targeting **sm80**. + +The intent is to store instruction-family-specific kernels here, for example: + +- memory movement microkernels +- elementwise/vector microkernels +- reduction microkernels +- transpose / reshape helpers +- tensor-core / matmul kernels + +Recommended future organization: + +- `memory/` +- `elementwise/` +- `reduction/` +- `matrix/` +- `transform/` +- `sync/` + +Populate this directory only after the common GPU backend contract is defined. diff --git a/include/pto/gpu/sm86/README.md b/include/pto/gpu/sm86/README.md new file mode 100644 index 00000000..058dc736 --- /dev/null +++ b/include/pto/gpu/sm86/README.md @@ -0,0 +1,24 @@ +# include/pto/gpu/sm86/ + +Architecture-specific PTO GPU specializations for **Ampere client/workstation**. + +## Role + +This directory is intended to hold per-SM overrides for instructions whose optimal implementation depends on: + +- MMA / tensor-core generation +- shared-memory banking and swizzle rules +- async copy behavior +- warpgroup scheduling details +- register pressure / occupancy trade-offs +- architecture-specific instructions or PTX variants + +## Expected contents later + +- instruction-level specializations (`TLoad`, `TStore`, `TAdd`, `TMatmul`, ...) +- tuning metadata +- tile-shape policy tables +- performance notes +- microkernel registry + +This folder is a scaffold for bring-up, not a live backend yet. diff --git a/include/pto/gpu/sm86/microkernels/README.md b/include/pto/gpu/sm86/microkernels/README.md new file mode 100644 index 00000000..d2dacd8c --- /dev/null +++ b/include/pto/gpu/sm86/microkernels/README.md @@ -0,0 +1,22 @@ +# include/pto/gpu/sm86/microkernels/ + +Planned home for dedicated PTO GPU microkernels targeting **sm86**. + +The intent is to store instruction-family-specific kernels here, for example: + +- memory movement microkernels +- elementwise/vector microkernels +- reduction microkernels +- transpose / reshape helpers +- tensor-core / matmul kernels + +Recommended future organization: + +- `memory/` +- `elementwise/` +- `reduction/` +- `matrix/` +- `transform/` +- `sync/` + +Populate this directory only after the common GPU backend contract is defined. diff --git a/include/pto/gpu/sm87/README.md b/include/pto/gpu/sm87/README.md new file mode 100644 index 00000000..769a7744 --- /dev/null +++ b/include/pto/gpu/sm87/README.md @@ -0,0 +1,24 @@ +# include/pto/gpu/sm87/ + +Architecture-specific PTO GPU specializations for **Orin**. + +## Role + +This directory is intended to hold per-SM overrides for instructions whose optimal implementation depends on: + +- MMA / tensor-core generation +- shared-memory banking and swizzle rules +- async copy behavior +- warpgroup scheduling details +- register pressure / occupancy trade-offs +- architecture-specific instructions or PTX variants + +## Expected contents later + +- instruction-level specializations (`TLoad`, `TStore`, `TAdd`, `TMatmul`, ...) +- tuning metadata +- tile-shape policy tables +- performance notes +- microkernel registry + +This folder is a scaffold for bring-up, not a live backend yet. 
diff --git a/include/pto/gpu/sm87/microkernels/README.md b/include/pto/gpu/sm87/microkernels/README.md new file mode 100644 index 00000000..0cc59b97 --- /dev/null +++ b/include/pto/gpu/sm87/microkernels/README.md @@ -0,0 +1,22 @@ +# include/pto/gpu/sm87/microkernels/ + +Planned home for dedicated PTO GPU microkernels targeting **sm87**. + +The intent is to store instruction-family-specific kernels here, for example: + +- memory movement microkernels +- elementwise/vector microkernels +- reduction microkernels +- transpose / reshape helpers +- tensor-core / matmul kernels + +Recommended future organization: + +- `memory/` +- `elementwise/` +- `reduction/` +- `matrix/` +- `transform/` +- `sync/` + +Populate this directory only after the common GPU backend contract is defined. diff --git a/include/pto/gpu/sm89/README.md b/include/pto/gpu/sm89/README.md new file mode 100644 index 00000000..d5375e7f --- /dev/null +++ b/include/pto/gpu/sm89/README.md @@ -0,0 +1,24 @@ +# include/pto/gpu/sm89/ + +Architecture-specific PTO GPU specializations for **Ada**. + +## Role + +This directory is intended to hold per-SM overrides for instructions whose optimal implementation depends on: + +- MMA / tensor-core generation +- shared-memory banking and swizzle rules +- async copy behavior +- warpgroup scheduling details +- register pressure / occupancy trade-offs +- architecture-specific instructions or PTX variants + +## Expected contents later + +- instruction-level specializations (`TLoad`, `TStore`, `TAdd`, `TMatmul`, ...) +- tuning metadata +- tile-shape policy tables +- performance notes +- microkernel registry + +This folder is a scaffold for bring-up, not a live backend yet. diff --git a/include/pto/gpu/sm89/microkernels/README.md b/include/pto/gpu/sm89/microkernels/README.md new file mode 100644 index 00000000..6cc6e809 --- /dev/null +++ b/include/pto/gpu/sm89/microkernels/README.md @@ -0,0 +1,22 @@ +# include/pto/gpu/sm89/microkernels/ + +Planned home for dedicated PTO GPU microkernels targeting **sm89**. + +The intent is to store instruction-family-specific kernels here, for example: + +- memory movement microkernels +- elementwise/vector microkernels +- reduction microkernels +- transpose / reshape helpers +- tensor-core / matmul kernels + +Recommended future organization: + +- `memory/` +- `elementwise/` +- `reduction/` +- `matrix/` +- `transform/` +- `sync/` + +Populate this directory only after the common GPU backend contract is defined. diff --git a/include/pto/gpu/sm90/README.md b/include/pto/gpu/sm90/README.md new file mode 100644 index 00000000..2cc79f3a --- /dev/null +++ b/include/pto/gpu/sm90/README.md @@ -0,0 +1,24 @@ +# include/pto/gpu/sm90/ + +Architecture-specific PTO GPU specializations for **Hopper**. + +## Role + +This directory is intended to hold per-SM overrides for instructions whose optimal implementation depends on: + +- MMA / tensor-core generation +- shared-memory banking and swizzle rules +- async copy behavior +- warpgroup scheduling details +- register pressure / occupancy trade-offs +- architecture-specific instructions or PTX variants + +## Expected contents later + +- instruction-level specializations (`TLoad`, `TStore`, `TAdd`, `TMatmul`, ...) +- tuning metadata +- tile-shape policy tables +- performance notes +- microkernel registry + +This folder is a scaffold for bring-up, not a live backend yet. 
diff --git a/include/pto/gpu/sm90/microkernels/README.md b/include/pto/gpu/sm90/microkernels/README.md new file mode 100644 index 00000000..56456ea7 --- /dev/null +++ b/include/pto/gpu/sm90/microkernels/README.md @@ -0,0 +1,22 @@ +# include/pto/gpu/sm90/microkernels/ + +Planned home for dedicated PTO GPU microkernels targeting **sm90**. + +The intent is to store instruction-family-specific kernels here, for example: + +- memory movement microkernels +- elementwise/vector microkernels +- reduction microkernels +- transpose / reshape helpers +- tensor-core / matmul kernels + +Recommended future organization: + +- `memory/` +- `elementwise/` +- `reduction/` +- `matrix/` +- `transform/` +- `sync/` + +Populate this directory only after the common GPU backend contract is defined. diff --git a/include/pto/gpu/sm90a/README.md b/include/pto/gpu/sm90a/README.md new file mode 100644 index 00000000..575d25ac --- /dev/null +++ b/include/pto/gpu/sm90a/README.md @@ -0,0 +1,24 @@ +# include/pto/gpu/sm90a/ + +Architecture-specific PTO GPU specializations for **Hopper architecture-conditional specialization**. + +## Role + +This directory is intended to hold per-SM overrides for instructions whose optimal implementation depends on: + +- MMA / tensor-core generation +- shared-memory banking and swizzle rules +- async copy behavior +- warpgroup scheduling details +- register pressure / occupancy trade-offs +- architecture-specific instructions or PTX variants + +## Expected contents later + +- instruction-level specializations (`TLoad`, `TStore`, `TAdd`, `TMatmul`, ...) +- tuning metadata +- tile-shape policy tables +- performance notes +- microkernel registry + +This folder is a scaffold for bring-up, not a live backend yet. diff --git a/include/pto/gpu/sm90a/microkernels/README.md b/include/pto/gpu/sm90a/microkernels/README.md new file mode 100644 index 00000000..4bd8f70f --- /dev/null +++ b/include/pto/gpu/sm90a/microkernels/README.md @@ -0,0 +1,22 @@ +# include/pto/gpu/sm90a/microkernels/ + +Planned home for dedicated PTO GPU microkernels targeting **sm90a**. + +The intent is to store instruction-family-specific kernels here, for example: + +- memory movement microkernels +- elementwise/vector microkernels +- reduction microkernels +- transpose / reshape helpers +- tensor-core / matmul kernels + +Recommended future organization: + +- `memory/` +- `elementwise/` +- `reduction/` +- `matrix/` +- `transform/` +- `sync/` + +Populate this directory only after the common GPU backend contract is defined. diff --git a/include/pto/pto-inst.hpp b/include/pto/pto-inst.hpp index 7eac6058..dd58777e 100644 --- a/include/pto/pto-inst.hpp +++ b/include/pto/pto-inst.hpp @@ -11,13 +11,17 @@ See LICENSE in the root of the software repository for the full text of the Lice #ifndef PTO_INST_HPP #define PTO_INST_HPP +#if defined(__CUDACC__) +#include "pto/common/gpu_stub.hpp" +#endif + #include #if defined(__CPU_SIM) || defined(__COSTMODEL) #include "pto/common/cpu_stub.hpp" #endif #include -#if defined(__CPU_SIM) || defined(__CCE_AICORE__) || defined(__COSTMODEL) +#if defined(__CPU_SIM) || defined(__CCE_AICORE__) || defined(__COSTMODEL) || defined(PTO_GPU_BACKEND) #include #include #include "pto/common/pto_instr.hpp" diff --git a/kernels/manual/gpu/README.md b/kernels/manual/gpu/README.md new file mode 100644 index 00000000..b535f2b6 --- /dev/null +++ b/kernels/manual/gpu/README.md @@ -0,0 +1,17 @@ +# kernels/manual/gpu/ + +Reserved for hand-tuned NVIDIA GPU kernel examples built on top of the future PTO GPU backend. 
+ +Intended use: + +- end-to-end operator examples +- architecture-specific tuning case studies +- performance reference kernels +- launch configuration experiments + +Suggested first examples once the backend exists: + +- GEMM +- Flash Attention +- elementwise fusion +- reduction / softmax diff --git a/tests/README.md b/tests/README.md index 119dffca..1872b32b 100644 --- a/tests/README.md +++ b/tests/README.md @@ -11,6 +11,8 @@ Tests and examples for PTO Tile Lib, covering both CPU simulation and NPU (inclu - `README.md`: Script usage - `cpu/`: CPU-side ST tests (gtest + CMake) - `cpu/st/`: CPU ST projects and testcase data generation scripts +- `gpu/`: NVIDIA GPU backend tests + - `gpu/st/`: standalone CUDA smoke / correctness tests for the GPU backend - `npu/`: NPU-side ST tests split by SoC - `npu/a2a3/src/st/`: A2/A3 compute ST - `npu/a2a3/comm/st/`: A2/A3 communication ST diff --git a/tests/gpu/README.md b/tests/gpu/README.md new file mode 100644 index 00000000..47465ba0 --- /dev/null +++ b/tests/gpu/README.md @@ -0,0 +1,18 @@ +# tests/gpu/ + +Reserved for future PTO NVIDIA GPU backend tests. + +Planned test layers: + +- compile-only smoke tests +- instruction-level correctness tests +- CPU-vs-GPU oracle tests +- architecture-specific regression tests +- microbenchmarks / performance guards + +Suggested future split: + +- `common/` +- `sm80/` +- `sm90/` +- `sm121/` diff --git a/tests/gpu/st/CMakeLists.txt b/tests/gpu/st/CMakeLists.txt new file mode 100644 index 00000000..26bd1157 --- /dev/null +++ b/tests/gpu/st/CMakeLists.txt @@ -0,0 +1,14 @@ +cmake_minimum_required(VERSION 3.24) +project(pto_gpu_st LANGUAGES CXX CUDA) + +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CUDA_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CUDA_STANDARD_REQUIRED ON) +set(CMAKE_CUDA_ARCHITECTURES 121) + +enable_testing() +find_package(Threads REQUIRED) +find_package(CUDAToolkit REQUIRED) + +add_subdirectory(testcase) diff --git a/tests/gpu/st/README.md b/tests/gpu/st/README.md new file mode 100644 index 00000000..9d428e8e --- /dev/null +++ b/tests/gpu/st/README.md @@ -0,0 +1,60 @@ +# tests/gpu/st/ + +Standalone CUDA smoke / correctness tests for the PTO NVIDIA GPU backend. 
+ +## What it covers today + +Current executables: + +- `pto_gpu_core` — correctness / smoke checks +- `pto_gpu_perf` — lightweight GB10 matmul microbench + +Current checks: + +- `TLOAD` ND row-major path +- `TLOAD` DN col-major path +- `TSTORE` ND row-major path +- `TSTORE` DN col-major path +- `TADD` correctness against a host reference +- GPU swizzle physical-layout smoke test +- GPU swizzle `TLOAD`/`TSTORE` round-trip smoke test +- GPU swizzle `TADD` round-trip smoke test +- `sm121` float `TMATMUL` inline-PTX FMA fallback smoke test +- `sm121` half `TMATMUL` tensor-core MMA tiled-path smoke test +- `sm121` bfloat16 `TMATMUL` tensor-core MMA tiled-path smoke test +- `sm121` half `TMATMUL` larger 64x64x64 tensor-core correctness test +- `sm121` bfloat16 `TMATMUL` larger 64x64x64 tensor-core correctness test +- `sm121` half `TMATMUL_ACC` tensor-core fast-path smoke test +- `sm121` bfloat16 `TMATMUL_BIAS` tensor-core fast-path smoke test +- `sm121` half `TMATMUL_MX` API-path smoke test +- `sm121` half `TGEMV_MX` API-path smoke test + +## Build + +```bash +cmake -S tests/gpu/st -B build/tests/gpu-st -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DCMAKE_CUDA_ARCHITECTURES=121 +cmake --build build/tests/gpu-st -j +``` + +## Run + +Correctness lane: + +```bash +cd build/tests/gpu-st +ctest --output-on-failure +``` + +Perf microbench: + +```bash +./build/tests/gpu-st/testcase/pto_gpu_perf/pto_gpu_perf +``` + +## Notes + +- This lane is intentionally lightweight and self-contained. +- It uses CTest directly instead of the repo's CPU/NPU test harnesses. +- `sm121` now has a real warp-level tensor-core MMA path for half/bfloat16 using 16x16x16 MMA tiles, composed into larger 16-aligned matrix tiles in software. +- `TMATMUL_ACC` and `TMATMUL_BIAS` now reuse the same tensor-core tiled path on supported `sm121` shapes. +- float matmul currently uses the lighter inline-PTX FMA fallback path. 
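+
+To iterate on just the correctness lane, the registered CTest name can be selected directly (a usage sketch; `pto_gpu_core` is the test name registered by `pto_gpu_st(pto_gpu_core)` in the testcase CMake below):
+
+```bash
+cd build/tests/gpu-st
+ctest -R pto_gpu_core --output-on-failure
+```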
diff --git a/tests/gpu/st/testcase/CMakeLists.txt b/tests/gpu/st/testcase/CMakeLists.txt new file mode 100644 index 00000000..4e343194 --- /dev/null +++ b/tests/gpu/st/testcase/CMakeLists.txt @@ -0,0 +1,30 @@ + +function(pto_gpu_tool NAME) + add_executable(${NAME} ${NAME}.cu) + target_include_directories(${NAME} PRIVATE + ${PROJECT_SOURCE_DIR}/../../../include + ${PROJECT_SOURCE_DIR}/../../common + ) + target_compile_options(${NAME} PRIVATE + $<$:-O2> + $<$:--expt-relaxed-constexpr -O2> + ) + target_link_libraries(${NAME} PRIVATE Threads::Threads CUDA::cudart) +endfunction() + +function(pto_gpu_st NAME) + add_executable(${NAME} ${NAME}.cu) + target_include_directories(${NAME} PRIVATE + ${PROJECT_SOURCE_DIR}/../../../include + ${PROJECT_SOURCE_DIR}/../../common + ) + target_compile_options(${NAME} PRIVATE + $<$:-O2> + $<$:--expt-relaxed-constexpr -O2> + ) + target_link_libraries(${NAME} PRIVATE Threads::Threads CUDA::cudart) + add_test(NAME ${NAME} COMMAND ${NAME}) +endfunction() + +add_subdirectory(pto_gpu_core) +add_subdirectory(pto_gpu_perf) diff --git a/tests/gpu/st/testcase/pto_gpu_core/CMakeLists.txt b/tests/gpu/st/testcase/pto_gpu_core/CMakeLists.txt new file mode 100644 index 00000000..f4931635 --- /dev/null +++ b/tests/gpu/st/testcase/pto_gpu_core/CMakeLists.txt @@ -0,0 +1 @@ +pto_gpu_st(pto_gpu_core) diff --git a/tests/gpu/st/testcase/pto_gpu_core/pto_gpu_core.cu b/tests/gpu/st/testcase/pto_gpu_core/pto_gpu_core.cu new file mode 100644 index 00000000..7391b2d8 --- /dev/null +++ b/tests/gpu/st/testcase/pto_gpu_core/pto_gpu_core.cu @@ -0,0 +1,1108 @@ +#include +#include +#include +#include +#include + +#include + +namespace { + +template +struct SimpleGlobal { + using DType = T; + static constexpr pto::Layout layout = LayoutV; + + T *ptr; + int64_t shape[pto::GlobalTensorDim::TOTAL_DIM]; + int64_t stride[pto::GlobalTensorDim::TOTAL_DIM]; + + __host__ __device__ SimpleGlobal(T *ptr_, int64_t s0, int64_t s1, int64_t s2, int64_t s3, int64_t s4, + int64_t st0, int64_t st1, int64_t st2, int64_t st3, int64_t st4) + : ptr(ptr_), shape{s0, s1, s2, s3, s4}, stride{st0, st1, st2, st3, st4} + { + } + + __host__ __device__ T *data() + { + return ptr; + } + + __host__ __device__ const T *data() const + { + return ptr; + } + + __host__ __device__ int64_t GetShape(int dim) const + { + return shape[dim]; + } + + __host__ __device__ int64_t GetStride(int dim) const + { + return stride[dim]; + } +}; + +bool CheckCuda(cudaError_t status, const char *what) +{ + if (status == cudaSuccess) { + return true; + } + std::cerr << "[CUDA] " << what << ": " << cudaGetErrorString(status) << std::endl; + return false; +} + +template +std::size_t RowMajorOffset(std::size_t rows, std::size_t cols, std::size_t r, std::size_t c) +{ + (void)rows; + return r * cols + c; +} + +template +std::size_t ColMajorOffset(std::size_t rows, std::size_t cols, std::size_t r, std::size_t c) +{ + (void)cols; + return c * rows + r; +} + +template +std::vector RefTLoadNdRowMajor(const std::vector &src, int g0, int g1, int g2, int g3, int g4, int tileRows, + int tileCols, T pad) +{ + std::vector out(tileRows * tileCols, pad); + int rowBase = 0; + for (int i = 0; i < g0; ++i) { + for (int j = 0; j < g1; ++j) { + for (int k = 0; k < g2; ++k) { + for (int r = 0; r < g3; ++r) { + const int tileRow = rowBase + r; + const int srcBase = i * (g1 * g2 * g3 * g4) + j * (g2 * g3 * g4) + k * (g3 * g4) + r * g4; + for (int c = 0; c < g4; ++c) { + out[RowMajorOffset(tileRows, tileCols, tileRow, c)] = src[srcBase + c]; + } + } + rowBase += g3; + } + } 
+ } + return out; +} + +template +std::vector RefTLoadDnColMajor(const std::vector &src, int g0, int g1, int g2, int g3, int g4, int tileRows, + int tileCols, T pad) +{ + std::vector out(tileRows * tileCols, pad); + int colBase = 0; + for (int i = 0; i < g0; ++i) { + for (int j = 0; j < g1; ++j) { + for (int k = 0; k < g2; ++k) { + for (int c = 0; c < g4; ++c) { + const int tileCol = colBase + c; + const int srcBase = i * (g1 * g2 * g3 * g4) + j * (g2 * g3 * g4) + k * (g3 * g4) + c * g3; + for (int r = 0; r < g3; ++r) { + out[ColMajorOffset(tileRows, tileCols, r, tileCol)] = src[srcBase + r]; + } + } + colBase += g4; + } + } + } + return out; +} + +template +std::vector RefTStoreNdRowMajor(const std::vector &tile, int g0, int g1, int g2, int g3, int g4, int tileRows, + int tileCols) +{ + std::vector out(g0 * g1 * g2 * g3 * g4, T{}); + int rowBase = 0; + for (int i = 0; i < g0; ++i) { + for (int j = 0; j < g1; ++j) { + for (int k = 0; k < g2; ++k) { + for (int r = 0; r < g3; ++r) { + const int tileRow = rowBase + r; + const int dstBase = i * (g1 * g2 * g3 * g4) + j * (g2 * g3 * g4) + k * (g3 * g4) + r * g4; + for (int c = 0; c < g4; ++c) { + out[dstBase + c] = tile[RowMajorOffset(tileRows, tileCols, tileRow, c)]; + } + } + rowBase += g3; + } + } + } + return out; +} + +template +std::vector RefTStoreDnColMajor(const std::vector &tile, int g0, int g1, int g2, int g3, int g4, int tileRows, + int tileCols) +{ + std::vector out(g0 * g1 * g2 * g3 * g4, T{}); + int colBase = 0; + for (int i = 0; i < g0; ++i) { + for (int j = 0; j < g1; ++j) { + for (int k = 0; k < g2; ++k) { + for (int c = 0; c < g4; ++c) { + const int tileCol = colBase + c; + const int dstBase = i * (g1 * g2 * g3 * g4) + j * (g2 * g3 * g4) + k * (g3 * g4) + c * g3; + for (int r = 0; r < g3; ++r) { + out[dstBase + r] = tile[ColMajorOffset(tileRows, tileCols, r, tileCol)]; + } + } + colBase += g4; + } + } + } + return out; +} + +template +std::vector RefTAddRowMajor(const std::vector &a, const std::vector &b, int tileRows, int tileCols, + int validRows, int validCols, T sentinel) +{ + std::vector out(tileRows * tileCols, sentinel); + for (int r = 0; r < validRows; ++r) { + for (int c = 0; c < validCols; ++c) { + out[RowMajorOffset(tileRows, tileCols, r, c)] = + a[RowMajorOffset(tileRows, tileCols, r, c)] + b[RowMajorOffset(tileRows, tileCols, r, c)]; + } + } + return out; +} + +template +std::vector RefGpuSwizzle128BRowMajor(const std::vector &logical, int rows, int cols) +{ + constexpr int swizzleRows = 8; + const int swizzleCols = 128 / (swizzleRows * sizeof(T)); + std::vector out(logical.size(), T{}); + const int chunksPerRow = cols / swizzleCols; + auto isPow2 = [](int v) { return v > 0 && ((v & (v - 1)) == 0); }; + for (int r = 0; r < rows; ++r) { + const int rowBlock = r / swizzleRows; + const int rowInBlock = r % swizzleRows; + for (int c = 0; c < cols; ++c) { + const int chunk = c / swizzleCols; + const int colInChunk = c % swizzleCols; + const int permutedChunk = isPow2(chunksPerRow) ? 
((chunk ^ (rowInBlock % chunksPerRow)) & (chunksPerRow - 1)) + : ((chunk + rowInBlock) % chunksPerRow); + const int physical = rowBlock * swizzleRows * cols + rowInBlock * cols + permutedChunk * swizzleCols + colInChunk; + out[physical] = logical[r * cols + c]; + } + } + return out; +} + +template +std::vector RefMatmulF32(const std::vector &a, const std::vector &b, int m, int k, int n) +{ + std::vector out(m * n, T{}); + for (int row = 0; row < m; ++row) { + for (int col = 0; col < n; ++col) { + T acc = T{}; + for (int kk = 0; kk < k; ++kk) { + acc += a[row * k + kk] * b[kk * n + col]; + } + out[row * n + col] = acc; + } + } + return out; +} + +template +float ToHostFloat(T value) +{ + return static_cast(value); +} + +template <> +float ToHostFloat(half value) +{ + return __half2float(value); +} + +template <> +float ToHostFloat(bfloat16_t value) +{ + return __bfloat162float(value); +} + +template +std::vector RefMatmulToFloat(const std::vector &a, const std::vector &b, int m, int k, int n) +{ + std::vector out(m * n, 0.0f); + for (int row = 0; row < m; ++row) { + for (int col = 0; col < n; ++col) { + float acc = 0.0f; + for (int kk = 0; kk < k; ++kk) { + acc += ToHostFloat(a[row * k + kk]) * ToHostFloat(b[kk * n + col]); + } + out[row * n + col] = acc; + } + } + return out; +} + +template +std::vector RefMatmulAccToFloat(const std::vector &a, const std::vector &b, const std::vector &acc, + int m, int k, int n) +{ + auto out = RefMatmulToFloat(a, b, m, k, n); + for (std::size_t i = 0; i < out.size(); ++i) { + out[i] += acc[i]; + } + return out; +} + +template +std::vector RefMatmulBiasToFloat(const std::vector &a, const std::vector &b, + const std::vector &bias, int m, int k, int n) +{ + auto out = RefMatmulToFloat(a, b, m, k, n); + for (int row = 0; row < m; ++row) { + for (int col = 0; col < n; ++col) { + out[row * n + col] += bias[col]; + } + } + return out; +} + +template +bool ExpectVecEq(const std::vector &expected, const std::vector &actual, const char *label) +{ + if (expected.size() != actual.size()) { + std::cerr << label << ": size mismatch: expected " << expected.size() << ", got " << actual.size() + << std::endl; + return false; + } + for (std::size_t i = 0; i < expected.size(); ++i) { + if (expected[i] != actual[i]) { + std::cerr << label << ": mismatch at index " << i << std::endl; + return false; + } + } + return true; +} + +template <> +bool ExpectVecEq(const std::vector &expected, const std::vector &actual, const char *label) +{ + if (expected.size() != actual.size()) { + std::cerr << label << ": size mismatch: expected " << expected.size() << ", got " << actual.size() + << std::endl; + return false; + } + for (std::size_t i = 0; i < expected.size(); ++i) { + if (std::abs(expected[i] - actual[i]) > 1e-5f) { + std::cerr << label << ": mismatch at index " << i << ", expected " << expected[i] << ", got " + << actual[i] << std::endl; + return false; + } + } + return true; +} + +bool ExpectVecNear(const std::vector &expected, const std::vector &actual, + const char *label, float tol) +{ + if (expected.size() != actual.size()) { + std::cerr << label << ": size mismatch: expected " << expected.size() << ", got " << actual.size() + << std::endl; + return false; + } + for (std::size_t i = 0; i < expected.size(); ++i) { + if (std::abs(expected[i] - actual[i]) > tol) { + std::cerr << label << ": mismatch at index " << i << ", expected " << expected[i] << ", got " + << actual[i] << ", tol " << tol << std::endl; + return false; + } + } + return true; +} + +template +__global__ void 
KernelTLoadNd(T *out, T *src) +{ + if (threadIdx.x != 0 || blockIdx.x != 0) { + return; + } + using GlobalData = SimpleGlobal; + using TileData = pto::Tile; + T storage[TRows * TCols]; + TileData tile(G0 * G1 * G2 * G3, G4); + tile.data() = storage; + GlobalData global(src, G0, G1, G2, G3, G4, G1 * G2 * G3 * G4, G2 * G3 * G4, G3 * G4, G4, 1); + pto::TLOAD(tile, global); + for (int i = 0; i < TRows * TCols; ++i) { + out[i] = storage[i]; + } +} + +template +__global__ void KernelTLoadDn(T *out, T *src) +{ + if (threadIdx.x != 0 || blockIdx.x != 0) { + return; + } + using GlobalData = SimpleGlobal; + using TileData = pto::Tile; + T storage[TRows * TCols]; + TileData tile(G3, G0 * G1 * G2 * G4); + tile.data() = storage; + GlobalData global(src, G0, G1, G2, G3, G4, G1 * G2 * G3 * G4, G2 * G3 * G4, G3 * G4, 1, G3); + pto::TLOAD(tile, global); + for (int i = 0; i < TRows * TCols; ++i) { + out[i] = storage[i]; + } +} + +template +__global__ void KernelTStoreNd(T *out, T *tileRaw) +{ + if (threadIdx.x != 0 || blockIdx.x != 0) { + return; + } + using GlobalData = SimpleGlobal; + using TileData = pto::Tile; + TileData tile(G0 * G1 * G2 * G3, G4); + tile.data() = tileRaw; + GlobalData global(out, G0, G1, G2, G3, G4, G1 * G2 * G3 * G4, G2 * G3 * G4, G3 * G4, G4, 1); + pto::TSTORE(global, tile); +} + +template +__global__ void KernelTStoreDn(T *out, T *tileRaw) +{ + if (threadIdx.x != 0 || blockIdx.x != 0) { + return; + } + using GlobalData = SimpleGlobal; + using TileData = pto::Tile; + TileData tile(G3, G0 * G1 * G2 * G4); + tile.data() = tileRaw; + GlobalData global(out, G0, G1, G2, G3, G4, G1 * G2 * G3 * G4, G2 * G3 * G4, G3 * G4, 1, G3); + pto::TSTORE(global, tile); +} + +template +__global__ void KernelTAdd(T *out, T *a, T *b, T sentinel) +{ + if (threadIdx.x != 0 || blockIdx.x != 0) { + return; + } + using TileData = pto::Tile; + T aStorage[TRows * TCols]; + T bStorage[TRows * TCols]; + T cStorage[TRows * TCols]; + for (int i = 0; i < TRows * TCols; ++i) { + aStorage[i] = a[i]; + bStorage[i] = b[i]; + cStorage[i] = sentinel; + } + TileData aTile(ValidRows, ValidCols); + TileData bTile(ValidRows, ValidCols); + TileData cTile(ValidRows, ValidCols); + aTile.data() = aStorage; + bTile.data() = bStorage; + cTile.data() = cStorage; + pto::TADD(cTile, aTile, bTile); + for (int i = 0; i < TRows * TCols; ++i) { + out[i] = cStorage[i]; + } +} + +template +__global__ void KernelTLoadGpuSwizzleRaw(T *out, T *src) +{ + if (threadIdx.x != 0 || blockIdx.x != 0) { + return; + } + using Shape5 = pto::Shape<-1, -1, -1, -1, -1>; + using GlobalData = SimpleGlobal; + using TileData = pto::TileVecGpuSwizzle; + T storage[Rows * Cols]; + TileData tile; + tile.data() = storage; + GlobalData global(src, 1, 1, 1, Rows, Cols, Rows * Cols, Rows * Cols, Rows * Cols, Cols, 1); + pto::TLOAD(tile, global); + for (int i = 0; i < Rows * Cols; ++i) { + out[i] = storage[i]; + } +} + +template +__global__ void KernelTLoadStoreGpuSwizzle(T *out, T *src) +{ + if (threadIdx.x != 0 || blockIdx.x != 0) { + return; + } + using GlobalData = SimpleGlobal; + using TileData = pto::TileVecGpuSwizzle; + T storage[Rows * Cols]; + TileData tile; + tile.data() = storage; + GlobalData srcGlobal(src, 1, 1, 1, Rows, Cols, Rows * Cols, Rows * Cols, Rows * Cols, Cols, 1); + GlobalData dstGlobal(out, 1, 1, 1, Rows, Cols, Rows * Cols, Rows * Cols, Rows * Cols, Cols, 1); + pto::TLOAD(tile, srcGlobal); + pto::TSTORE(dstGlobal, tile); +} + +template +__global__ void KernelTAddGpuSwizzleRoundTrip(T *out, T *a, T *b) +{ + if (threadIdx.x != 0 || blockIdx.x != 
0) { + return; + } + using GlobalData = SimpleGlobal; + using TileData = pto::TileVecGpuSwizzle; + T aStorage[Rows * Cols]; + T bStorage[Rows * Cols]; + T cStorage[Rows * Cols]; + TileData aTile; + TileData bTile; + TileData cTile; + aTile.data() = aStorage; + bTile.data() = bStorage; + cTile.data() = cStorage; + GlobalData aGlobal(a, 1, 1, 1, Rows, Cols, Rows * Cols, Rows * Cols, Rows * Cols, Cols, 1); + GlobalData bGlobal(b, 1, 1, 1, Rows, Cols, Rows * Cols, Rows * Cols, Rows * Cols, Cols, 1); + GlobalData outGlobal(out, 1, 1, 1, Rows, Cols, Rows * Cols, Rows * Cols, Rows * Cols, Cols, 1); + pto::TLOAD(aTile, aGlobal); + pto::TLOAD(bTile, bGlobal); + pto::TADD(cTile, aTile, bTile); + pto::TSTORE(outGlobal, cTile); +} + +template +__global__ void KernelTMATMUL(float *out, InputT *a, InputT *b) +{ + using TileA = pto::Tile; + using TileB = pto::Tile; + using TileC = pto::Tile; + TileA aTile(M, K); + TileB bTile(K, N); + TileC cTile(M, N); + aTile.data() = a; + bTile.data() = b; + cTile.data() = out; + pto::TMATMUL(cTile, aTile, bTile); +} + +template +__global__ void KernelTMATMUL_ACC(float *out, float *in, InputT *a, InputT *b) +{ + using TileA = pto::Tile; + using TileB = pto::Tile; + using TileC = pto::Tile; + TileA aTile(M, K); + TileB bTile(K, N); + TileC cOutTile(M, N); + TileC cInTile(M, N); + aTile.data() = a; + bTile.data() = b; + cOutTile.data() = out; + cInTile.data() = in; + pto::TMATMUL_ACC(cOutTile, cInTile, aTile, bTile); +} + +template +__global__ void KernelTMATMUL_BIAS(float *out, InputT *a, InputT *b, float *bias) +{ + using TileA = pto::Tile; + using TileB = pto::Tile; + using TileC = pto::Tile; + using TileBias = pto::Tile; + TileA aTile(M, K); + TileB bTile(K, N); + TileC cTile(M, N); + TileBias biasTile(1, N); + aTile.data() = a; + bTile.data() = b; + cTile.data() = out; + biasTile.data() = bias; + pto::TMATMUL_BIAS(cTile, aTile, bTile, biasTile); +} + +template +__global__ void KernelTMATMUL_MX(float *out, InputT *a, float *aScale, InputT *b, float *bScale) +{ + using TileA = pto::Tile; + using TileB = pto::Tile; + using TileC = pto::Tile; + using TileScaleA = pto::Tile; + using TileScaleB = pto::Tile; + TileA aTile(M, K); + TileB bTile(K, N); + TileC cTile(M, N); + TileScaleA aScaleTile(1, K); + TileScaleB bScaleTile(1, N); + aTile.data() = a; + bTile.data() = b; + cTile.data() = out; + aScaleTile.data() = aScale; + bScaleTile.data() = bScale; + pto::TMATMUL_MX(cTile, aTile, aScaleTile, bTile, bScaleTile); +} + +template +__global__ void KernelTGEMV_MX(float *out, InputT *a, float *aScale, InputT *b, float *bScale) +{ + using TileA = pto::Tile; + using TileB = pto::Tile; + using TileC = pto::Tile; + using TileScaleA = pto::Tile; + using TileScaleB = pto::Tile; + TileA aTile(M, K); + TileB bTile(K, N); + TileC cTile(M, N); + TileScaleA aScaleTile(1, K); + TileScaleB bScaleTile(1, N); + aTile.data() = a; + bTile.data() = b; + cTile.data() = out; + aScaleTile.data() = aScale; + bScaleTile.data() = bScale; + pto::TGEMV_MX(cTile, aTile, aScaleTile, bTile, bScaleTile); +} + +bool TestTLoadNdRowMajorMatchesReference() +{ + constexpr int G0 = 1, G1 = 1, G2 = 1, G3 = 4, G4 = 7, TRows = 4, TCols = 8; + std::vector src(G0 * G1 * G2 * G3 * G4); + for (std::size_t i = 0; i < src.size(); ++i) { + src[i] = static_cast(i + 1); + } + auto expected = RefTLoadNdRowMajor(src, G0, G1, G2, G3, G4, TRows, TCols, 0.0f); + + float *dSrc = nullptr; + float *dOut = nullptr; + if (!CheckCuda(cudaMalloc(&dSrc, src.size() * sizeof(float)), "cudaMalloc dSrc")) return false; + if 
(!CheckCuda(cudaMalloc(&dOut, expected.size() * sizeof(float)), "cudaMalloc dOut")) return false; + if (!CheckCuda(cudaMemcpy(dSrc, src.data(), src.size() * sizeof(float), cudaMemcpyHostToDevice), "copy src")) return false; + KernelTLoadNd<<<1, 1>>>(dOut, dSrc); + if (!CheckCuda(cudaGetLastError(), "launch tload nd")) return false; + if (!CheckCuda(cudaDeviceSynchronize(), "sync tload nd")) return false; + + std::vector actual(expected.size()); + if (!CheckCuda(cudaMemcpy(actual.data(), dOut, actual.size() * sizeof(float), cudaMemcpyDeviceToHost), "copy out")) return false; + cudaFree(dSrc); + cudaFree(dOut); + return ExpectVecEq(expected, actual, "tload_nd"); +} + +bool TestTLoadDnColMajorMatchesReference() +{ + constexpr int G0 = 1, G1 = 1, G2 = 1, G3 = 3, G4 = 5, TRows = 8, TCols = 8; + std::vector src(G0 * G1 * G2 * G3 * G4); + for (std::size_t i = 0; i < src.size(); ++i) { + src[i] = static_cast(100 + i); + } + auto expected = RefTLoadDnColMajor(src, G0, G1, G2, G3, G4, TRows, TCols, 0.0f); + + float *dSrc = nullptr; + float *dOut = nullptr; + if (!CheckCuda(cudaMalloc(&dSrc, src.size() * sizeof(float)), "cudaMalloc dSrc")) return false; + if (!CheckCuda(cudaMalloc(&dOut, expected.size() * sizeof(float)), "cudaMalloc dOut")) return false; + if (!CheckCuda(cudaMemcpy(dSrc, src.data(), src.size() * sizeof(float), cudaMemcpyHostToDevice), "copy src")) return false; + KernelTLoadDn<<<1, 1>>>(dOut, dSrc); + if (!CheckCuda(cudaGetLastError(), "launch tload dn")) return false; + if (!CheckCuda(cudaDeviceSynchronize(), "sync tload dn")) return false; + + std::vector actual(expected.size()); + if (!CheckCuda(cudaMemcpy(actual.data(), dOut, actual.size() * sizeof(float), cudaMemcpyDeviceToHost), "copy out")) return false; + cudaFree(dSrc); + cudaFree(dOut); + return ExpectVecEq(expected, actual, "tload_dn"); +} + +bool TestTStoreNdAndDnMatchReference() +{ + { + constexpr int G0 = 1, G1 = 1, G2 = 1, G3 = 4, G4 = 7, TRows = 4, TCols = 8; + std::vector tile(TRows * TCols, -1.0f); + for (int r = 0; r < G3; ++r) { + for (int c = 0; c < G4; ++c) { + tile[r * TCols + c] = static_cast(r * 10 + c); + } + } + auto expected = RefTStoreNdRowMajor(tile, G0, G1, G2, G3, G4, TRows, TCols); + float *dTile = nullptr; + float *dOut = nullptr; + if (!CheckCuda(cudaMalloc(&dTile, tile.size() * sizeof(float)), "cudaMalloc dTile")) return false; + if (!CheckCuda(cudaMalloc(&dOut, expected.size() * sizeof(float)), "cudaMalloc dOut")) return false; + if (!CheckCuda(cudaMemset(dOut, 0, expected.size() * sizeof(float)), "memset dOut")) return false; + if (!CheckCuda(cudaMemcpy(dTile, tile.data(), tile.size() * sizeof(float), cudaMemcpyHostToDevice), "copy tile")) return false; + KernelTStoreNd<<<1, 1>>>(dOut, dTile); + if (!CheckCuda(cudaGetLastError(), "launch tstore nd")) return false; + if (!CheckCuda(cudaDeviceSynchronize(), "sync tstore nd")) return false; + std::vector actual(expected.size()); + if (!CheckCuda(cudaMemcpy(actual.data(), dOut, actual.size() * sizeof(float), cudaMemcpyDeviceToHost), "copy out")) return false; + cudaFree(dTile); + cudaFree(dOut); + if (!ExpectVecEq(expected, actual, "tstore_nd")) return false; + } + { + constexpr int G0 = 1, G1 = 1, G2 = 1, G3 = 3, G4 = 6, TRows = 16, TCols = 8; + std::vector tile(TRows * TCols, static_cast(-7)); + for (int c = 0; c < G4; ++c) { + for (int r = 0; r < G3; ++r) { + tile[c * TRows + r] = static_cast(c * 10 + r); + } + } + auto expected = RefTStoreDnColMajor(tile, G0, G1, G2, G3, G4, TRows, TCols); + int16_t *dTile = nullptr; + int16_t *dOut = nullptr; + 
if (!CheckCuda(cudaMalloc(&dTile, tile.size() * sizeof(int16_t)), "cudaMalloc dTile")) return false; + if (!CheckCuda(cudaMalloc(&dOut, expected.size() * sizeof(int16_t)), "cudaMalloc dOut")) return false; + if (!CheckCuda(cudaMemset(dOut, 0, expected.size() * sizeof(int16_t)), "memset dOut")) return false; + if (!CheckCuda(cudaMemcpy(dTile, tile.data(), tile.size() * sizeof(int16_t), cudaMemcpyHostToDevice), "copy tile")) return false; + KernelTStoreDn<<<1, 1>>>(dOut, dTile); + if (!CheckCuda(cudaGetLastError(), "launch tstore dn")) return false; + if (!CheckCuda(cudaDeviceSynchronize(), "sync tstore dn")) return false; + std::vector actual(expected.size()); + if (!CheckCuda(cudaMemcpy(actual.data(), dOut, actual.size() * sizeof(int16_t), cudaMemcpyDeviceToHost), "copy out")) return false; + cudaFree(dTile); + cudaFree(dOut); + if (!ExpectVecEq(expected, actual, "tstore_dn")) return false; + } + return true; +} + +bool TestTAddMatchesReference() +{ + constexpr int TRows = 8, TCols = 8, ValidRows = 5, ValidCols = 7; + std::vector a(TRows * TCols); + std::vector b(TRows * TCols); + for (int i = 0; i < TRows * TCols; ++i) { + a[i] = static_cast(i); + b[i] = static_cast(100 - i); + } + constexpr float sentinel = -777.0f; + auto expected = RefTAddRowMajor(a, b, TRows, TCols, ValidRows, ValidCols, sentinel); + float *dA = nullptr; + float *dB = nullptr; + float *dOut = nullptr; + if (!CheckCuda(cudaMalloc(&dA, a.size() * sizeof(float)), "cudaMalloc dA")) return false; + if (!CheckCuda(cudaMalloc(&dB, b.size() * sizeof(float)), "cudaMalloc dB")) return false; + if (!CheckCuda(cudaMalloc(&dOut, expected.size() * sizeof(float)), "cudaMalloc dOut")) return false; + if (!CheckCuda(cudaMemcpy(dA, a.data(), a.size() * sizeof(float), cudaMemcpyHostToDevice), "copy a")) return false; + if (!CheckCuda(cudaMemcpy(dB, b.data(), b.size() * sizeof(float), cudaMemcpyHostToDevice), "copy b")) return false; + KernelTAdd<<<1, 1>>>(dOut, dA, dB, sentinel); + if (!CheckCuda(cudaGetLastError(), "launch tadd")) return false; + if (!CheckCuda(cudaDeviceSynchronize(), "sync tadd")) return false; + std::vector actual(expected.size()); + if (!CheckCuda(cudaMemcpy(actual.data(), dOut, actual.size() * sizeof(float), cudaMemcpyDeviceToHost), "copy out")) return false; + cudaFree(dA); + cudaFree(dB); + cudaFree(dOut); + return ExpectVecEq(expected, actual, "tadd"); +} + +bool TestGpuSwizzlePhysicalLayoutMatchesReference() +{ + constexpr int Rows = 16, Cols = 16; + std::vector src(Rows * Cols); + for (int i = 0; i < Rows * Cols; ++i) { + src[i] = __float2half(static_cast(i + 1)); + } + auto expected = RefGpuSwizzle128BRowMajor(src, Rows, Cols); + half *dSrc = nullptr; + half *dOut = nullptr; + if (!CheckCuda(cudaMalloc(&dSrc, src.size() * sizeof(half)), "cudaMalloc dSrc swizzle")) return false; + if (!CheckCuda(cudaMalloc(&dOut, expected.size() * sizeof(half)), "cudaMalloc dOut swizzle")) return false; + if (!CheckCuda(cudaMemcpy(dSrc, src.data(), src.size() * sizeof(half), cudaMemcpyHostToDevice), "copy src swizzle")) return false; + KernelTLoadGpuSwizzleRaw<<<1, 1>>>(dOut, dSrc); + if (!CheckCuda(cudaGetLastError(), "launch tload swizzle raw")) return false; + if (!CheckCuda(cudaDeviceSynchronize(), "sync tload swizzle raw")) return false; + std::vector actual(expected.size()); + if (!CheckCuda(cudaMemcpy(actual.data(), dOut, actual.size() * sizeof(half), cudaMemcpyDeviceToHost), "copy out swizzle")) return false; + cudaFree(dSrc); + cudaFree(dOut); + return ExpectVecEq(expected, actual, "gpu_swizzle_physical"); +} + +bool 
TestGpuSwizzleRoundTripMatchesReference() +{ + constexpr int Rows = 16, Cols = 16; + std::vector<half> src(Rows * Cols); + for (int i = 0; i < Rows * Cols; ++i) { + src[i] = __float2half(static_cast<float>((i % 23) - 11) * 0.25f); + } + half *dSrc = nullptr; + half *dOut = nullptr; + if (!CheckCuda(cudaMalloc(&dSrc, src.size() * sizeof(half)), "cudaMalloc dSrc swizzle rt")) return false; + if (!CheckCuda(cudaMalloc(&dOut, src.size() * sizeof(half)), "cudaMalloc dOut swizzle rt")) return false; + if (!CheckCuda(cudaMemcpy(dSrc, src.data(), src.size() * sizeof(half), cudaMemcpyHostToDevice), "copy src swizzle rt")) return false; + KernelTLoadStoreGpuSwizzle<<<1, 1>>>(dOut, dSrc); + if (!CheckCuda(cudaGetLastError(), "launch tloadstore swizzle")) return false; + if (!CheckCuda(cudaDeviceSynchronize(), "sync tloadstore swizzle")) return false; + std::vector<half> actual(src.size()); + if (!CheckCuda(cudaMemcpy(actual.data(), dOut, actual.size() * sizeof(half), cudaMemcpyDeviceToHost), "copy out swizzle rt")) return false; + cudaFree(dSrc); + cudaFree(dOut); + return ExpectVecEq(src, actual, "gpu_swizzle_roundtrip"); +} + +bool TestGpuSwizzleTAddRoundTripMatchesReference() +{ + constexpr int Rows = 16, Cols = 16; + std::vector<half> a(Rows * Cols), b(Rows * Cols); + std::vector<half> expected(Rows * Cols); + for (int i = 0; i < Rows * Cols; ++i) { + a[i] = __float2half(static_cast<float>((i % 13) - 6) * 0.5f); + b[i] = __float2half(static_cast<float>((i % 7) - 3) * 0.25f); + expected[i] = __hadd(a[i], b[i]); + } + half *dA = nullptr; + half *dB = nullptr; + half *dOut = nullptr; + if (!CheckCuda(cudaMalloc(&dA, a.size() * sizeof(half)), "cudaMalloc dA swizzle add")) return false; + if (!CheckCuda(cudaMalloc(&dB, b.size() * sizeof(half)), "cudaMalloc dB swizzle add")) return false; + if (!CheckCuda(cudaMalloc(&dOut, expected.size() * sizeof(half)), "cudaMalloc dOut swizzle add")) return false; + if (!CheckCuda(cudaMemcpy(dA, a.data(), a.size() * sizeof(half), cudaMemcpyHostToDevice), "copy a swizzle add")) return false; + if (!CheckCuda(cudaMemcpy(dB, b.data(), b.size() * sizeof(half), cudaMemcpyHostToDevice), "copy b swizzle add")) return false; + KernelTAddGpuSwizzleRoundTrip<<<1, 1>>>(dOut, dA, dB); + if (!CheckCuda(cudaGetLastError(), "launch tadd swizzle")) return false; + if (!CheckCuda(cudaDeviceSynchronize(), "sync tadd swizzle")) return false; + std::vector<half> actual(expected.size()); + if (!CheckCuda(cudaMemcpy(actual.data(), dOut, actual.size() * sizeof(half), cudaMemcpyDeviceToHost), "copy out swizzle add")) return false; + cudaFree(dA); + cudaFree(dB); + cudaFree(dOut); + return ExpectVecEq(expected, actual, "gpu_swizzle_tadd"); +} + +bool TestSm121FloatInlinePtxMatmulMatchesReference() +{ + constexpr int M = 8, K = 8, N = 8; + std::vector<float> a(M * K); + std::vector<float> b(K * N); + for (int i = 0; i < M * K; ++i) { + a[i] = static_cast<float>((i % 9) - 4); + } + for (int i = 0; i < K * N; ++i) { + b[i] = static_cast<float>((i % 7) - 3); + } + auto expected = RefMatmulF32(a, b, M, K, N); + float *dA = nullptr; + float *dB = nullptr; + float *dOut = nullptr; + if (!CheckCuda(cudaMalloc(&dA, a.size() * sizeof(float)), "cudaMalloc dA")) return false; + if (!CheckCuda(cudaMalloc(&dB, b.size() * sizeof(float)), "cudaMalloc dB")) return false; + if (!CheckCuda(cudaMalloc(&dOut, expected.size() * sizeof(float)), "cudaMalloc dOut")) return false; + if (!CheckCuda(cudaMemcpy(dA, a.data(), a.size() * sizeof(float), cudaMemcpyHostToDevice), "copy a")) return false; + if (!CheckCuda(cudaMemcpy(dB, b.data(), b.size() * sizeof(float), cudaMemcpyHostToDevice), "copy
b")) return false; + if (!CheckCuda(cudaMemset(dOut, 0, expected.size() * sizeof(float)), "memset out")) return false; + KernelTMATMUL<<<1, 1>>>(dOut, dA, dB); + if (!CheckCuda(cudaGetLastError(), "launch tmatmul float")) return false; + if (!CheckCuda(cudaDeviceSynchronize(), "sync tmatmul float")) return false; + std::vector<float> actual(expected.size()); + if (!CheckCuda(cudaMemcpy(actual.data(), dOut, actual.size() * sizeof(float), cudaMemcpyDeviceToHost), "copy out")) return false; + cudaFree(dA); + cudaFree(dB); + cudaFree(dOut); + return ExpectVecEq(expected, actual, "tmatmul_sm121_float"); +} + +bool TestSm121HalfTensorCoreMatmulExtendedMatchesReference() +{ + constexpr int M = 32, K = 16, N = 32; + std::vector<half> a(M * K); + std::vector<half> b(K * N); + for (int i = 0; i < M * K; ++i) { + a[i] = __float2half(static_cast<float>((i % 13) - 6) * 0.25f); + } + for (int i = 0; i < K * N; ++i) { + b[i] = __float2half(static_cast<float>((i % 11) - 5) * 0.5f); + } + auto expected = RefMatmulToFloat(a, b, M, K, N); + half *dA = nullptr; + half *dB = nullptr; + float *dOut = nullptr; + if (!CheckCuda(cudaMalloc(&dA, a.size() * sizeof(half)), "cudaMalloc dA half")) return false; + if (!CheckCuda(cudaMalloc(&dB, b.size() * sizeof(half)), "cudaMalloc dB half")) return false; + if (!CheckCuda(cudaMalloc(&dOut, expected.size() * sizeof(float)), "cudaMalloc dOut half")) return false; + if (!CheckCuda(cudaMemcpy(dA, a.data(), a.size() * sizeof(half), cudaMemcpyHostToDevice), "copy a half")) return false; + if (!CheckCuda(cudaMemcpy(dB, b.data(), b.size() * sizeof(half), cudaMemcpyHostToDevice), "copy b half")) return false; + if (!CheckCuda(cudaMemset(dOut, 0, expected.size() * sizeof(float)), "memset out half")) return false; + KernelTMATMUL<<<1, 32>>>(dOut, dA, dB); + if (!CheckCuda(cudaGetLastError(), "launch tmatmul half")) return false; + if (!CheckCuda(cudaDeviceSynchronize(), "sync tmatmul half")) return false; + std::vector<float> actual(expected.size()); + if (!CheckCuda(cudaMemcpy(actual.data(), dOut, actual.size() * sizeof(float), cudaMemcpyDeviceToHost), "copy out half")) return false; + cudaFree(dA); + cudaFree(dB); + cudaFree(dOut); + return ExpectVecNear(expected, actual, "tmatmul_sm121_half_extended", 1e-3f); +} + +bool TestSm121Bfloat16TensorCoreMatmulExtendedMatchesReference() +{ + constexpr int M = 16, K = 32, N = 16; + std::vector<bfloat16_t> a(M * K); + std::vector<bfloat16_t> b(K * N); + for (int i = 0; i < M * K; ++i) { + a[i] = __float2bfloat16(static_cast<float>((i % 9) - 4) * 0.75f); + } + for (int i = 0; i < K * N; ++i) { + b[i] = __float2bfloat16(static_cast<float>((i % 7) - 3) * 0.5f); + } + auto expected = RefMatmulToFloat(a, b, M, K, N); + bfloat16_t *dA = nullptr; + bfloat16_t *dB = nullptr; + float *dOut = nullptr; + if (!CheckCuda(cudaMalloc(&dA, a.size() * sizeof(bfloat16_t)), "cudaMalloc dA bf16")) return false; + if (!CheckCuda(cudaMalloc(&dB, b.size() * sizeof(bfloat16_t)), "cudaMalloc dB bf16")) return false; + if (!CheckCuda(cudaMalloc(&dOut, expected.size() * sizeof(float)), "cudaMalloc dOut bf16")) return false; + if (!CheckCuda(cudaMemcpy(dA, a.data(), a.size() * sizeof(bfloat16_t), cudaMemcpyHostToDevice), "copy a bf16")) return false; + if (!CheckCuda(cudaMemcpy(dB, b.data(), b.size() * sizeof(bfloat16_t), cudaMemcpyHostToDevice), "copy b bf16")) return false; + if (!CheckCuda(cudaMemset(dOut, 0, expected.size() * sizeof(float)), "memset out bf16")) return false; + KernelTMATMUL<<<1, 32>>>(dOut, dA, dB); + if (!CheckCuda(cudaGetLastError(), "launch tmatmul bf16")) return false; + if (!CheckCuda(cudaDeviceSynchronize(), "sync
tmatmul bf16")) return false; + std::vector<float> actual(expected.size()); + if (!CheckCuda(cudaMemcpy(actual.data(), dOut, actual.size() * sizeof(float), cudaMemcpyDeviceToHost), "copy out bf16")) return false; + cudaFree(dA); + cudaFree(dB); + cudaFree(dOut); + return ExpectVecNear(expected, actual, "tmatmul_sm121_bf16_extended", 2e-2f); +} + +bool TestSm121HalfTensorCoreMatmulLarge64MatchesReference() +{ + constexpr int M = 64, K = 64, N = 64; + std::vector<half> a(M * K); + std::vector<half> b(K * N); + for (int i = 0; i < M * K; ++i) { + a[i] = __float2half(static_cast<float>((i % 19) - 9) * 0.125f); + } + for (int i = 0; i < K * N; ++i) { + b[i] = __float2half(static_cast<float>((i % 17) - 8) * 0.1875f); + } + auto expected = RefMatmulToFloat(a, b, M, K, N); + half *dA = nullptr; + half *dB = nullptr; + float *dOut = nullptr; + if (!CheckCuda(cudaMalloc(&dA, a.size() * sizeof(half)), "cudaMalloc dA half large")) return false; + if (!CheckCuda(cudaMalloc(&dB, b.size() * sizeof(half)), "cudaMalloc dB half large")) return false; + if (!CheckCuda(cudaMalloc(&dOut, expected.size() * sizeof(float)), "cudaMalloc dOut half large")) return false; + if (!CheckCuda(cudaMemcpy(dA, a.data(), a.size() * sizeof(half), cudaMemcpyHostToDevice), "copy a half large")) return false; + if (!CheckCuda(cudaMemcpy(dB, b.data(), b.size() * sizeof(half), cudaMemcpyHostToDevice), "copy b half large")) return false; + if (!CheckCuda(cudaMemset(dOut, 0, expected.size() * sizeof(float)), "memset out half large")) return false; + KernelTMATMUL<<<1, 32>>>(dOut, dA, dB); + if (!CheckCuda(cudaGetLastError(), "launch tmatmul half large")) return false; + if (!CheckCuda(cudaDeviceSynchronize(), "sync tmatmul half large")) return false; + std::vector<float> actual(expected.size()); + if (!CheckCuda(cudaMemcpy(actual.data(), dOut, actual.size() * sizeof(float), cudaMemcpyDeviceToHost), "copy out half large")) return false; + cudaFree(dA); + cudaFree(dB); + cudaFree(dOut); + return ExpectVecNear(expected, actual, "tmatmul_sm121_half_large64", 2e-3f); +} + +bool TestSm121Bfloat16TensorCoreMatmulLarge64MatchesReference() +{ + constexpr int M = 64, K = 64, N = 64; + std::vector<bfloat16_t> a(M * K); + std::vector<bfloat16_t> b(K * N); + for (int i = 0; i < M * K; ++i) { + a[i] = __float2bfloat16(static_cast<float>((i % 15) - 7) * 0.25f); + } + for (int i = 0; i < K * N; ++i) { + b[i] = __float2bfloat16(static_cast<float>((i % 13) - 6) * 0.3125f); + } + auto expected = RefMatmulToFloat(a, b, M, K, N); + bfloat16_t *dA = nullptr; + bfloat16_t *dB = nullptr; + float *dOut = nullptr; + if (!CheckCuda(cudaMalloc(&dA, a.size() * sizeof(bfloat16_t)), "cudaMalloc dA bf16 large")) return false; + if (!CheckCuda(cudaMalloc(&dB, b.size() * sizeof(bfloat16_t)), "cudaMalloc dB bf16 large")) return false; + if (!CheckCuda(cudaMalloc(&dOut, expected.size() * sizeof(float)), "cudaMalloc dOut bf16 large")) return false; + if (!CheckCuda(cudaMemcpy(dA, a.data(), a.size() * sizeof(bfloat16_t), cudaMemcpyHostToDevice), "copy a bf16 large")) return false; + if (!CheckCuda(cudaMemcpy(dB, b.data(), b.size() * sizeof(bfloat16_t), cudaMemcpyHostToDevice), "copy b bf16 large")) return false; + if (!CheckCuda(cudaMemset(dOut, 0, expected.size() * sizeof(float)), "memset out bf16 large")) return false; + KernelTMATMUL<<<1, 32>>>(dOut, dA, dB); + if (!CheckCuda(cudaGetLastError(), "launch tmatmul bf16 large")) return false; + if (!CheckCuda(cudaDeviceSynchronize(), "sync tmatmul bf16 large")) return false; + std::vector<float> actual(expected.size()); + if (!CheckCuda(cudaMemcpy(actual.data(), dOut, actual.size() * sizeof(float),
cudaMemcpyDeviceToHost), "copy out bf16 large")) return false; + cudaFree(dA); + cudaFree(dB); + cudaFree(dOut); + return ExpectVecNear(expected, actual, "tmatmul_sm121_bf16_large64", 3e-2f); +} + +bool TestSm121HalfTensorCoreMatmulAccMatchesReference() +{ + constexpr int M = 16, K = 16, N = 16; + std::vector<half> a(M * K); + std::vector<half> b(K * N); + std::vector<float> acc(M * N); + for (int i = 0; i < M * K; ++i) { + a[i] = __float2half(static_cast<float>((i % 15) - 7) * 0.125f); + } + for (int i = 0; i < K * N; ++i) { + b[i] = __float2half(static_cast<float>((i % 10) - 4) * 0.375f); + } + for (int i = 0; i < M * N; ++i) { + acc[i] = static_cast<float>((i % 8) - 4) * 0.5f; + } + auto expected = RefMatmulAccToFloat(a, b, acc, M, K, N); + half *dA = nullptr; + half *dB = nullptr; + float *dAcc = nullptr; + float *dOut = nullptr; + if (!CheckCuda(cudaMalloc(&dA, a.size() * sizeof(half)), "cudaMalloc dA acc")) return false; + if (!CheckCuda(cudaMalloc(&dB, b.size() * sizeof(half)), "cudaMalloc dB acc")) return false; + if (!CheckCuda(cudaMalloc(&dAcc, acc.size() * sizeof(float)), "cudaMalloc dAcc acc")) return false; + if (!CheckCuda(cudaMalloc(&dOut, expected.size() * sizeof(float)), "cudaMalloc dOut acc")) return false; + if (!CheckCuda(cudaMemcpy(dA, a.data(), a.size() * sizeof(half), cudaMemcpyHostToDevice), "copy a acc")) return false; + if (!CheckCuda(cudaMemcpy(dB, b.data(), b.size() * sizeof(half), cudaMemcpyHostToDevice), "copy b acc")) return false; + if (!CheckCuda(cudaMemcpy(dAcc, acc.data(), acc.size() * sizeof(float), cudaMemcpyHostToDevice), "copy acc in")) return false; + KernelTMATMUL_ACC<<<1, 32>>>(dOut, dAcc, dA, dB); + if (!CheckCuda(cudaGetLastError(), "launch tmatmul acc")) return false; + if (!CheckCuda(cudaDeviceSynchronize(), "sync tmatmul acc")) return false; + std::vector<float> actual(expected.size()); + if (!CheckCuda(cudaMemcpy(actual.data(), dOut, actual.size() * sizeof(float), cudaMemcpyDeviceToHost), "copy out acc")) return false; + cudaFree(dA); + cudaFree(dB); + cudaFree(dAcc); + cudaFree(dOut); + return ExpectVecNear(expected, actual, "tmatmul_sm121_acc", 1e-3f); +} + +bool TestSm121Bfloat16TensorCoreMatmulBiasMatchesReference() +{ + constexpr int M = 16, K = 16, N = 16; + std::vector<bfloat16_t> a(M * K); + std::vector<bfloat16_t> b(K * N); + std::vector<float> bias(N); + for (int i = 0; i < M * K; ++i) { + a[i] = __float2bfloat16(static_cast<float>((i % 9) - 4) * 0.25f); + } + for (int i = 0; i < K * N; ++i) { + b[i] = __float2bfloat16(static_cast<float>((i % 7) - 3) * 0.75f); + } + for (int i = 0; i < N; ++i) { + bias[i] = static_cast<float>(i - 8) * 0.125f; + } + auto expected = RefMatmulBiasToFloat(a, b, bias, M, K, N); + bfloat16_t *dA = nullptr; + bfloat16_t *dB = nullptr; + float *dBias = nullptr; + float *dOut = nullptr; + if (!CheckCuda(cudaMalloc(&dA, a.size() * sizeof(bfloat16_t)), "cudaMalloc dA bias")) return false; + if (!CheckCuda(cudaMalloc(&dB, b.size() * sizeof(bfloat16_t)), "cudaMalloc dB bias")) return false; + if (!CheckCuda(cudaMalloc(&dBias, bias.size() * sizeof(float)), "cudaMalloc dBias")) return false; + if (!CheckCuda(cudaMalloc(&dOut, expected.size() * sizeof(float)), "cudaMalloc dOut bias")) return false; + if (!CheckCuda(cudaMemcpy(dA, a.data(), a.size() * sizeof(bfloat16_t), cudaMemcpyHostToDevice), "copy a bias")) return false; + if (!CheckCuda(cudaMemcpy(dB, b.data(), b.size() * sizeof(bfloat16_t), cudaMemcpyHostToDevice), "copy b bias")) return false; + if (!CheckCuda(cudaMemcpy(dBias, bias.data(), bias.size() * sizeof(float), cudaMemcpyHostToDevice), "copy bias")) return false; + if (!CheckCuda(cudaMemset(dOut,
0, expected.size() * sizeof(float)), "memset out bias")) return false; + KernelTMATMUL_BIAS<<<1, 32>>>(dOut, dA, dB, dBias); + if (!CheckCuda(cudaGetLastError(), "launch tmatmul bias")) return false; + if (!CheckCuda(cudaDeviceSynchronize(), "sync tmatmul bias")) return false; + std::vector<float> actual(expected.size()); + if (!CheckCuda(cudaMemcpy(actual.data(), dOut, actual.size() * sizeof(float), cudaMemcpyDeviceToHost), "copy out bias")) return false; + cudaFree(dA); + cudaFree(dB); + cudaFree(dBias); + cudaFree(dOut); + return ExpectVecNear(expected, actual, "tmatmul_sm121_bias", 2e-2f); +} + +bool TestSm121HalfTMATMUL_MXMatchesReference() +{ + constexpr int M = 16, K = 16, N = 16; + std::vector<half> a(M * K); + std::vector<half> b(K * N); + std::vector<float> aScale(K, 1.0f), bScale(N, 1.0f); + for (int i = 0; i < M * K; ++i) a[i] = __float2half(static_cast<float>((i % 13) - 6) * 0.25f); + for (int i = 0; i < K * N; ++i) b[i] = __float2half(static_cast<float>((i % 11) - 5) * 0.5f); + auto expected = RefMatmulToFloat(a, b, M, K, N); + half *dA = nullptr; half *dB = nullptr; float *dAS = nullptr; float *dBS = nullptr; float *dOut = nullptr; + if (!CheckCuda(cudaMalloc(&dA, a.size() * sizeof(half)), "cudaMalloc dA mx")) return false; + if (!CheckCuda(cudaMalloc(&dB, b.size() * sizeof(half)), "cudaMalloc dB mx")) return false; + if (!CheckCuda(cudaMalloc(&dAS, aScale.size() * sizeof(float)), "cudaMalloc dAS mx")) return false; + if (!CheckCuda(cudaMalloc(&dBS, bScale.size() * sizeof(float)), "cudaMalloc dBS mx")) return false; + if (!CheckCuda(cudaMalloc(&dOut, expected.size() * sizeof(float)), "cudaMalloc dOut mx")) return false; + if (!CheckCuda(cudaMemcpy(dA, a.data(), a.size() * sizeof(half), cudaMemcpyHostToDevice), "copy a mx")) return false; + if (!CheckCuda(cudaMemcpy(dB, b.data(), b.size() * sizeof(half), cudaMemcpyHostToDevice), "copy b mx")) return false; + if (!CheckCuda(cudaMemcpy(dAS, aScale.data(), aScale.size() * sizeof(float), cudaMemcpyHostToDevice), "copy as mx")) return false; + if (!CheckCuda(cudaMemcpy(dBS, bScale.data(), bScale.size() * sizeof(float), cudaMemcpyHostToDevice), "copy bs mx")) return false; + if (!CheckCuda(cudaMemset(dOut, 0, expected.size() * sizeof(float)), "memset out mx")) return false; + KernelTMATMUL_MX<<<1, 32>>>(dOut, dA, dAS, dB, dBS); + if (!CheckCuda(cudaGetLastError(), "launch tmatmul_mx")) return false; + if (!CheckCuda(cudaDeviceSynchronize(), "sync tmatmul_mx")) return false; + std::vector<float> actual(expected.size()); + if (!CheckCuda(cudaMemcpy(actual.data(), dOut, actual.size() * sizeof(float), cudaMemcpyDeviceToHost), "copy out mx")) return false; + cudaFree(dA); cudaFree(dB); cudaFree(dAS); cudaFree(dBS); cudaFree(dOut); + return ExpectVecNear(expected, actual, "tmatmul_mx", 1e-3f); +} + +bool TestSm121HalfTGEMV_MXMatchesReference() +{ + constexpr int M = 16, K = 16, N = 16; + std::vector<half> a(M * K); + std::vector<half> b(K * N); + std::vector<float> aScale(K, 1.0f), bScale(N, 1.0f); + for (int i = 0; i < M * K; ++i) a[i] = __float2half(static_cast<float>((i % 9) - 4) * 0.375f); + for (int i = 0; i < K * N; ++i) b[i] = __float2half(static_cast<float>((i % 7) - 3) * 0.625f); + auto expected = RefMatmulToFloat(a, b, M, K, N); + half *dA = nullptr; half *dB = nullptr; float *dAS = nullptr; float *dBS = nullptr; float *dOut = nullptr; + if (!CheckCuda(cudaMalloc(&dA, a.size() * sizeof(half)), "cudaMalloc dA gemv_mx")) return false; + if (!CheckCuda(cudaMalloc(&dB, b.size() * sizeof(half)), "cudaMalloc dB gemv_mx")) return false; + if (!CheckCuda(cudaMalloc(&dAS, aScale.size() * sizeof(float)), "cudaMalloc
dAS gemv_mx")) return false; + if (!CheckCuda(cudaMalloc(&dBS, bScale.size() * sizeof(float)), "cudaMalloc dBS gemv_mx")) return false; + if (!CheckCuda(cudaMalloc(&dOut, expected.size() * sizeof(float)), "cudaMalloc dOut gemv_mx")) return false; + if (!CheckCuda(cudaMemcpy(dA, a.data(), a.size() * sizeof(half), cudaMemcpyHostToDevice), "copy a gemv_mx")) return false; + if (!CheckCuda(cudaMemcpy(dB, b.data(), b.size() * sizeof(half), cudaMemcpyHostToDevice), "copy b gemv_mx")) return false; + if (!CheckCuda(cudaMemcpy(dAS, aScale.data(), aScale.size() * sizeof(float), cudaMemcpyHostToDevice), "copy as gemv_mx")) return false; + if (!CheckCuda(cudaMemcpy(dBS, bScale.data(), bScale.size() * sizeof(float), cudaMemcpyHostToDevice), "copy bs gemv_mx")) return false; + if (!CheckCuda(cudaMemset(dOut, 0, expected.size() * sizeof(float)), "memset out gemv_mx")) return false; + KernelTGEMV_MX<<<1, 32>>>(dOut, dA, dAS, dB, dBS); + if (!CheckCuda(cudaGetLastError(), "launch tgemv_mx")) return false; + if (!CheckCuda(cudaDeviceSynchronize(), "sync tgemv_mx")) return false; + std::vector<float> actual(expected.size()); + if (!CheckCuda(cudaMemcpy(actual.data(), dOut, actual.size() * sizeof(float), cudaMemcpyDeviceToHost), "copy out gemv_mx")) return false; + cudaFree(dA); cudaFree(dB); cudaFree(dAS); cudaFree(dBS); cudaFree(dOut); + return ExpectVecNear(expected, actual, "tgemv_mx", 1e-3f); +} + +} // namespace + +int main() +{ + int failed = 0; + auto run = [&](const char *name, bool (*fn)()) { + const bool ok = fn(); + std::cout << (ok ? "[PASS] " : "[FAIL] ") << name << std::endl; + if (!ok) { + ++failed; + } + }; + + run("TLoadNdRowMajorMatchesReference", &TestTLoadNdRowMajorMatchesReference); + run("TLoadDnColMajorMatchesReference", &TestTLoadDnColMajorMatchesReference); + run("TStoreNdAndDnMatchReference", &TestTStoreNdAndDnMatchReference); + run("TAddMatchesReference", &TestTAddMatchesReference); + run("GpuSwizzlePhysicalLayoutMatchesReference", &TestGpuSwizzlePhysicalLayoutMatchesReference); + run("GpuSwizzleRoundTripMatchesReference", &TestGpuSwizzleRoundTripMatchesReference); + run("GpuSwizzleTAddRoundTripMatchesReference", &TestGpuSwizzleTAddRoundTripMatchesReference); + run("Sm121FloatInlinePtxMatmulMatchesReference", &TestSm121FloatInlinePtxMatmulMatchesReference); + run("Sm121HalfTensorCoreMatmulExtendedMatchesReference", &TestSm121HalfTensorCoreMatmulExtendedMatchesReference); + run("Sm121Bfloat16TensorCoreMatmulExtendedMatchesReference", &TestSm121Bfloat16TensorCoreMatmulExtendedMatchesReference); + run("Sm121HalfTensorCoreMatmulLarge64MatchesReference", &TestSm121HalfTensorCoreMatmulLarge64MatchesReference); + run("Sm121Bfloat16TensorCoreMatmulLarge64MatchesReference", &TestSm121Bfloat16TensorCoreMatmulLarge64MatchesReference); + run("Sm121HalfTensorCoreMatmulAccMatchesReference", &TestSm121HalfTensorCoreMatmulAccMatchesReference); + run("Sm121Bfloat16TensorCoreMatmulBiasMatchesReference", &TestSm121Bfloat16TensorCoreMatmulBiasMatchesReference); + run("Sm121HalfTMATMUL_MXMatchesReference", &TestSm121HalfTMATMUL_MXMatchesReference); + run("Sm121HalfTGEMV_MXMatchesReference", &TestSm121HalfTGEMV_MXMatchesReference); + + return failed == 0 ?
0 : 1; +} diff --git a/tests/gpu/st/testcase/pto_gpu_perf/CMakeLists.txt b/tests/gpu/st/testcase/pto_gpu_perf/CMakeLists.txt new file mode 100644 index 00000000..067b8645 --- /dev/null +++ b/tests/gpu/st/testcase/pto_gpu_perf/CMakeLists.txt @@ -0,0 +1 @@ +pto_gpu_tool(pto_gpu_perf) diff --git a/tests/gpu/st/testcase/pto_gpu_perf/pto_gpu_perf.cu b/tests/gpu/st/testcase/pto_gpu_perf/pto_gpu_perf.cu new file mode 100644 index 00000000..f0d798d2 --- /dev/null +++ b/tests/gpu/st/testcase/pto_gpu_perf/pto_gpu_perf.cu @@ -0,0 +1,119 @@ +#include <cstddef> +#include <iomanip> +#include <iostream> +#include <string> +#include <vector> + +#include <pto/pto-inst.hpp> + +namespace { + +template <typename T> +__global__ void KernelTMATMULBench(float *out, T *a, T *b) +{ + using TileA = pto::Tile; + using TileB = pto::Tile; + using TileC = pto::Tile; + TileA aTile(64, 64); + TileB bTile(64, 64); + TileC cTile(64, 64); + aTile.data() = a; + bTile.data() = b; + cTile.data() = out; + pto::TMATMUL(cTile, aTile, bTile); +} + +bool Check(cudaError_t st, const char *what) +{ + if (st == cudaSuccess) return true; + std::cerr << what << ": " << cudaGetErrorString(st) << std::endl; + return false; +} + +template <typename T> +void InitInputs(std::vector<T> &a, std::vector<T> &b) +{ + for (std::size_t i = 0; i < a.size(); ++i) a[i] = static_cast<T>((int(i % 17) - 8) * 0.1f); + for (std::size_t i = 0; i < b.size(); ++i) b[i] = static_cast<T>((int(i % 13) - 6) * 0.2f); +} + +template <> +void InitInputs(std::vector<half> &a, std::vector<half> &b) +{ + for (std::size_t i = 0; i < a.size(); ++i) a[i] = __float2half((int(i % 17) - 8) * 0.1f); + for (std::size_t i = 0; i < b.size(); ++i) b[i] = __float2half((int(i % 13) - 6) * 0.2f); +} + +template <> +void InitInputs(std::vector<bfloat16_t> &a, std::vector<bfloat16_t> &b) +{ + for (std::size_t i = 0; i < a.size(); ++i) a[i] = __float2bfloat16((int(i % 17) - 8) * 0.1f); + for (std::size_t i = 0; i < b.size(); ++i) b[i] = __float2bfloat16((int(i % 13) - 6) * 0.2f); +} + +template <typename T> +bool RunBench(const std::string &name, dim3 block, int iters) +{ + constexpr int M = 64, K = 64, N = 64; + constexpr std::size_t aCount = M * K; + constexpr std::size_t bCount = K * N; + constexpr std::size_t cCount = M * N; + + std::vector<T> hA(aCount), hB(bCount); + InitInputs(hA, hB); + + T *dA = nullptr; + T *dB = nullptr; + float *dC = nullptr; + if (!Check(cudaMalloc(&dA, aCount * sizeof(T)), "cudaMalloc dA")) return false; + if (!Check(cudaMalloc(&dB, bCount * sizeof(T)), "cudaMalloc dB")) return false; + if (!Check(cudaMalloc(&dC, cCount * sizeof(float)), "cudaMalloc dC")) return false; + if (!Check(cudaMemcpy(dA, hA.data(), aCount * sizeof(T), cudaMemcpyHostToDevice), "copy A")) return false; + if (!Check(cudaMemcpy(dB, hB.data(), bCount * sizeof(T), cudaMemcpyHostToDevice), "copy B")) return false; + if (!Check(cudaMemset(dC, 0, cCount * sizeof(float)), "memset C")) return false; + + for (int i = 0; i < 10; ++i) { + KernelTMATMULBench<<<1, block>>>(dC, dA, dB); + } + if (!Check(cudaDeviceSynchronize(), "warmup sync")) return false; + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); + for (int i = 0; i < iters; ++i) { + KernelTMATMULBench<<<1, block>>>(dC, dA, dB); + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float ms = 0.0f; + cudaEventElapsedTime(&ms, start, stop); + cudaEventDestroy(start); + cudaEventDestroy(stop); + + const double avgMs = ms / iters; + const double flops = 2.0 * M * N * K; + const double gflops = flops / (avgMs * 1.0e6); + + std::cout << std::left << std::setw(8) << name + << " avg_ms=" << std::fixed << std::setprecision(4) << avgMs
+ << " gflops=" << std::fixed << std::setprecision(2) << gflops + << " block=" << block.x << std::endl; + + cudaFree(dA); + cudaFree(dB); + cudaFree(dC); + return true; +} + +} // namespace + +int main() +{ + bool ok = true; + std::cout << "PTO GPU matmul microbench on GB10 (64x64x64, 1 block)" << std::endl; + ok &= RunBench<float>("float", dim3(1), 200); + ok &= RunBench<half>("half", dim3(32), 500); + ok &= RunBench<bfloat16_t>("bf16", dim3(32), 500); + return ok ? 0 : 1; +}
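
The tensor-core tests in this patch compare device output against host helpers (`RefMatmulToFloat`, `RefMatmulAccToFloat`, `RefMatmulBiasToFloat`) defined earlier in the same test file and not shown in this excerpt. For orientation, below is a minimal sketch of what a float-accumulating reference of this kind typically looks like; the name `RefMatmulToFloatSketch`, the row-major A (M×K) / B (K×N) layout, and the body are assumptions for illustration, not the patch's actual helper.

```cpp
// Illustrative sketch only (assumed layout and naming): a row-major,
// FP32-accumulating host reference of the kind the tensor-core matmul
// tests compare against. The patch's real RefMatmulToFloat may differ.
#include <cstddef>
#include <vector>

template <typename T>
std::vector<float> RefMatmulToFloatSketch(const std::vector<T> &a, const std::vector<T> &b,
                                          int m, int k, int n)
{
    std::vector<float> out(static_cast<std::size_t>(m) * n, 0.0f);
    for (int row = 0; row < m; ++row) {
        for (int col = 0; col < n; ++col) {
            float acc = 0.0f;
            for (int kk = 0; kk < k; ++kk) {
                // Widen each operand to float before multiplying, mirroring
                // half/bf16 tensor-core math with FP32 accumulation.
                acc += static_cast<float>(a[row * k + kk]) * static_cast<float>(b[kk * n + col]);
            }
            out[row * n + col] = acc;
        }
    }
    return out;
}
```

With `half` inputs (which convert to `float` via cuda_fp16.h), a call such as `RefMatmulToFloatSketch(a, b, 32, 16, 32)` matches the shape used by the extended half test above; the `ExpectVecNear` tolerances in the patch (1e-3f for half, 2e-2f for bf16) then absorb the rounding gap between an FP32 reference like this and the tensor-core results.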