diff --git a/docs/gpu_backend_plan.md b/docs/gpu_backend_plan.md new file mode 100644 index 00000000..a43dbf80 --- /dev/null +++ b/docs/gpu_backend_plan.md @@ -0,0 +1,381 @@ +# GPU Backend Plan for PTO Tile Lib + +## Selected Decisions (2026-03-31) + +- Git identity: use the already configured global Git identity +- Backend substrate: **CUDA C++ + inline PTX for hot kernels** +- Architecture scope: **keep all scaffolded SM dirs, implement `sm121` first** +- First implementation slice: **mixed** (`TLOAD`, `TSTORE`, `TADD`, `TMATMUL`) +- Event semantics in v1: **defer advanced semantics until compute kernels work** + +## Current Repo Analysis + +### 1. Existing backend split + +The public entry point is `include/pto/pto-inst.hpp`, which pulls in: + +- shared tile/type/instruction declarations from `include/pto/common/` +- CPU simulator support through `__CPU_SIM` +- NPU implementations through `include/pto/common/arch_macro.hpp` and `include/pto/common/pto_instr_impl.hpp` + +Today the repo has three real backend families: + +- **CPU simulator**: header-only reference/simulation path +- **NPU A2/A3**: `include/pto/npu/a2a3/` +- **NPU A5**: `include/pto/npu/a5/` + +There is **no NVIDIA GPU backend yet**: + +- no `include/pto/gpu/` +- no GPU dispatch macro in `arch_macro.hpp` +- no GPU include block in `pto_instr_impl.hpp` +- no `tests/gpu/` +- no `kernels/manual/gpu/` + +### 2. What the existing backends tell us + +#### CPU backend + +The CPU path is the functional/spec reference layer: + +- broad instruction surface area +- flexible tile/layout checks +- useful for correctness and fallback semantics +- `TSYNC` is effectively a no-op in CPU simulation + +This is the best source of **semantic truth** for a future GPU backend. + +#### NPU A2/A3 backend + +The A2/A3 path is the first real hardware mapping layer: + +- instruction-specific files such as `TAdd.hpp`, `TLoad.hpp`, `TMatmul.hpp` +- explicit pipeline/event handling +- memory-move kernels mapped to hardware copy instructions +- vector and cube paths separated by tile/layout constraints + +This is the best source of **how PTO instructions get lowered to hardware micro-ops**. + +#### NPU A5 backend + +The A5 path extends the same architecture with: + +- richer data type support +- more advanced matmul variants (FP8 / MX / FP4-related handling) +- evolved sync/event behavior +- more specialized utility helpers and hardware-specific tuning + +This is the best source of **how backend specialization grows with hardware generations**. + +### 3. Architectural implications for a GPU backend + +A CUDA/NVIDIA backend should not start as one monolithic implementation. It should follow the repo's existing pattern: + +- shared PTO API remains in `include/pto/common/` +- GPU-specific lowering lives under `include/pto/gpu/` +- arch-specific overrides live in per-SM subdirectories +- common CUDA helpers, tile iterators, layout adapters, and launch helpers live in `include/pto/gpu/common/` + +### 4. Backend bring-up priorities + +To make a GPU backend useful early, the instruction bring-up order should be: + +1. **Dispatch and toolchain plumbing** +2. **Memory movement**: `TLOAD`, `TSTORE`, `TPREFETCH` +3. **Elementwise vector ops**: `TADD`, `TSUB`, `TMUL`, `TMAX`, `TMIN`, `TEXPANDS`, comparisons/selects +4. **Reductions / reshapes / transforms** +5. **Matrix kernels**: `TMATMUL`, `TMATMUL_ACC`, `TMATMUL_BIAS`, `TMATMUL_MX` +6. **Synchronization / event semantics** +7. 
**Collective / comm semantics**, if GPU multi-device becomes in scope + +--- + +## Proposed Directory Layout + +```text +include/pto/gpu/ + README.md + common/ + README.md + sm70/ + README.md + microkernels/ + README.md + sm75/ + README.md + microkernels/ + README.md + sm80/ + README.md + microkernels/ + README.md + sm86/ + README.md + microkernels/ + README.md + sm87/ + README.md + microkernels/ + README.md + sm89/ + README.md + microkernels/ + README.md + sm90/ + README.md + microkernels/ + README.md + sm90a/ + README.md + microkernels/ + README.md + sm100/ + README.md + microkernels/ + README.md + sm100a/ + README.md + microkernels/ + README.md + sm103/ + README.md + microkernels/ + README.md + sm110/ + README.md + microkernels/ + README.md + sm120/ + README.md + microkernels/ + README.md + sm121/ + README.md + microkernels/ + README.md + +tests/gpu/ + README.md + +kernels/manual/gpu/ + README.md +``` + +### Why these SM targets + +These folders cover the modern NVIDIA targets that matter for PTO-style tile programming and current deployment: + +- `sm70`: Volta +- `sm75`: Turing +- `sm80`: Ampere datacenter +- `sm86`: Ampere client/workstation +- `sm87`: Orin +- `sm89`: Ada +- `sm90`: Hopper +- `sm90a`: Hopper architecture-conditional specialization +- `sm100`: Blackwell +- `sm100a`: Blackwell architecture-conditional specialization +- `sm103`: GB300 / B300 generation +- `sm110`: Thor / Jetson T-series generation +- `sm120`: Blackwell workstation / GeForce generation +- `sm121`: GB10 (DGX Spark) + +This machine is **GB10 / compute capability 12.1**, so `sm121` should be the first optimization target. + +--- + +## Phase Plan + +### Phase 0 — Scope freeze and backend contract + +**Goal:** Decide what “GPU backend” means in this repo before writing kernels. + +#### Checklist + +- [ ] Decide whether the first GPU backend is: + - [ ] correctness-first + - [ ] performance-first + - [ ] hybrid (correctness-first, optimize hot instructions first) +- [ ] Decide the implementation substrate: + - [ ] CUDA C++ + - [ ] CUDA C++ + inline PTX + - [ ] CUDA C++ + CUTLASS/CuTe + - [ ] CUDA C++ + generated microkernels +- [ ] Decide the minimum supported toolkit/driver baseline +- [ ] Decide the minimum supported GPU baseline (`sm80+` only vs broad support) +- [ ] Decide whether GPU backend must preserve exact PTO event semantics or allow a GPU-specific semantic subset first +- [ ] Decide whether multi-GPU comm is in phase 1 or deferred + +### Phase 1 — Dispatch, macros, and build plumbing + +**Goal:** Make GPU a first-class backend selection path. + +#### Checklist + +- [ ] Add GPU arch detection / selection macros +- [ ] Define a backend macro family such as: + - [ ] `PTO_GPU_BACKEND` + - [ ] `PTO_GPU_SM80` + - [ ] `PTO_GPU_SM90` + - [ ] `PTO_GPU_SM121` +- [ ] Add GPU include blocks to `include/pto/common/pto_instr_impl.hpp` +- [ ] Decide whether `include/pto/pto-inst.hpp` should expose GPU path under: + - [ ] `__CUDACC__` + - [ ] a project-defined macro + - [ ] both +- [ ] Introduce initial CMake path for GPU tests/builds +- [ ] Add one minimal compile-only smoke target for GPU backend headers + +### Phase 2 — Memory model and tile mapping + +**Goal:** Define how PTO tiles live on NVIDIA GPUs. 
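Before the checklist, a minimal illustrative sketch of the kind of mapping this phase is about, assuming a plain global-to-shared staging step; `StageTileToShared` and its template parameters are placeholders, not part of the current PTO API:

```cpp
// Illustrative only: one way a PTO tile could be staged from global memory into a
// block's shared-memory buffer before compute. Names here are placeholders.
template <typename T, int Rows, int Cols>
__device__ void StageTileToShared(const T* __restrict__ globalSrc, T* __restrict__ sharedDst)
{
    const int tid = threadIdx.y * blockDim.x + threadIdx.x;
    const int threads = blockDim.x * blockDim.y;
    // Stride over the tile elements by the block's thread count so any block shape works.
    for (int idx = tid; idx < Rows * Cols; idx += threads) {
        sharedDst[idx] = globalSrc[idx];  // plain row-major copy, no swizzle yet
    }
    __syncthreads();  // tile is now visible to every thread in the block
}
```

The checklist below tracks the decisions needed to turn that kind of ad-hoc copy into the backend's actual tile ownership and layout rules.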
+ +#### Checklist + +- [ ] Map PTO tile memory spaces onto CUDA memory spaces: + - [ ] global memory + - [ ] shared memory + - [ ] registers + - [ ] tensor memory / async staging abstractions where needed +- [ ] Define row-major / col-major / NZ-like tile layout adapters on GPU +- [ ] Decide whether PTO fractal layouts are represented as: + - [ ] compile-time layout descriptors + - [ ] iterator objects + - [ ] shared-memory swizzles +- [ ] Implement common helper layer in `include/pto/gpu/common/` +- [ ] Document tile-to-warp / tile-to-warpgroup ownership rules + +### Phase 3 — Memory instructions first + +**Goal:** Make data movement work before math. + +#### Checklist + +- [ ] Implement `TLOAD` +- [ ] Implement `TSTORE` +- [ ] Implement `TPREFETCH` +- [ ] Validate padding, stride, and layout conversions +- [ ] Add tests for ND / DN / NZ-like cases +- [ ] Add baseline performance checks for bandwidth-sensitive paths + +### Phase 4 — Core elementwise backend + +**Goal:** Bring up the wide surface-area instructions with reusable microkernel templates. + +#### Checklist + +- [ ] Implement binary elementwise template family +- [ ] Implement unary elementwise template family +- [ ] Prioritize: + - [ ] `TADD` + - [ ] `TSUB` + - [ ] `TMUL` + - [ ] `TDIV` + - [ ] `TMIN` + - [ ] `TMAX` + - [ ] `TCMP` / `TCMPS` + - [ ] `TSEL` / `TSELS` + - [ ] `TEXP` / `TLOG` / `TRSQRT` / `TSQRT` +- [ ] Split generic implementations from per-SM overrides +- [ ] Add correctness tests against CPU simulator outputs + +### Phase 5 — Reduction, reshape, and transform ops + +**Goal:** Cover the shape-changing and reduction-heavy instructions. + +#### Checklist + +- [ ] `TRow*` family +- [ ] `TCol*` family +- [ ] `TRESHAPE` +- [ ] `TTRANS` +- [ ] `TEXTRACT` +- [ ] `TFILLPAD` +- [ ] `TGATHER` / `TSCATTER` variants where practical +- [ ] Decide warp-level vs block-level reduction strategies per SM + +### Phase 6 — Matrix and tensor-core path + +**Goal:** Make PTO meaningful for high-performance GPU kernels. + +#### Checklist + +- [ ] Implement a generic `TMATMUL` path first +- [ ] Add accumulator forms: + - [ ] `TMATMUL_ACC` + - [ ] `TMATMUL_BIAS` + - [ ] `TGEMV` +- [ ] Decide whether `TMATMUL_MX` is phase-1 GPU scope or deferred +- [ ] Create per-SM tensor-core microkernel registry +- [ ] Define tile shape families by SM +- [ ] Add tuning metadata per instruction / dtype / tile shape / SM +- [ ] Prioritize `sm121`, `sm120`, `sm90`, `sm89`, `sm80` + +### Phase 7 — Sync/event semantics on GPU + +**Goal:** Reconcile PTO pipeline/event model with CUDA execution semantics. + +#### Checklist + +- [ ] Define what `TSYNC` means on GPU +- [ ] Decide whether `Event` is: + - [ ] fully modeled + - [ ] partially modeled + - [ ] translated into CUDA barriers / named barriers / cooperative groups +- [ ] Document unsupported or approximated event cases +- [ ] Add tests for ordering-sensitive instruction sequences + +### Phase 8 — Tests, perf harness, and examples + +**Goal:** Make the backend maintainable. + +#### Checklist + +- [ ] Add `tests/gpu/` structure +- [ ] Add CPU-vs-GPU oracle tests +- [ ] Add instruction-level microbenchmarks +- [ ] Add kernel-level examples under `kernels/manual/gpu/` +- [ ] Add CI matrix for at least compile coverage, even if hardware CI is unavailable +- [ ] Add documentation for supported instructions per SM + +--- + +## Recommended first implementation slice + +If the goal is fast progress with good signal, the first real slice should be: + +1. `sm121` + `sm120` + `sm90` shared bring-up path +2. 
`TLOAD` +3. `TSTORE` +4. `TADD` +5. `TMUL` +6. `TMAX` +7. `TMATMUL` +8. basic correctness tests vs CPU simulator + +That gets the backend from “directory scaffold” to “real execution path” with a minimal but meaningful kernel set. + +--- + +## Open design questions for the next step + +1. **Backend substrate** + - Option A: plain CUDA C++ first + - Option B: CUDA + inline PTX for hot kernels + - Option C: CUDA + CUTLASS/CuTe for matmul, custom kernels for the rest + +2. **Initial architecture scope** + - Option A: `sm121` only + - Option B: `sm121` + `sm120` + `sm90` + - Option C: all scaffolded SMs, but only `sm121` gets real kernels first + +3. **Bring-up priority** + - Option A: memory ops first + - Option B: matmul first + - Option C: elementwise first + - Option D: mixed (`TLOAD/TSTORE/TADD/TMATMUL`) + +4. **How strict should PTO event semantics be in v1?** + - Option A: exact where possible, fail-fast otherwise + - Option B: approximate with documented caveats + - Option C: defer advanced event semantics until after compute kernels land diff --git a/include/pto/common/arch_macro.hpp b/include/pto/common/arch_macro.hpp index b1eda679..73dd31ac 100644 --- a/include/pto/common/arch_macro.hpp +++ b/include/pto/common/arch_macro.hpp @@ -11,6 +11,38 @@ See LICENSE in the root of the software repository for the full text of the Lice #ifndef ARCH_MACRO_HPP #define ARCH_MACRO_HPP +#if defined(PTO_GPU_BACKEND) +#define PTO_COMM_NOT_SUPPORTED + +#if defined(__CUDA_ARCH__) +#if __CUDA_ARCH__ >= 1210 +#define PTO_GPU_SM121 +#elif __CUDA_ARCH__ >= 1200 +#define PTO_GPU_SM120 +#elif __CUDA_ARCH__ >= 1100 +#define PTO_GPU_SM110 +#elif __CUDA_ARCH__ >= 1030 +#define PTO_GPU_SM103 +#elif __CUDA_ARCH__ >= 1000 +#define PTO_GPU_SM100 +#elif __CUDA_ARCH__ >= 900 +#define PTO_GPU_SM90 +#elif __CUDA_ARCH__ >= 890 +#define PTO_GPU_SM89 +#elif __CUDA_ARCH__ >= 870 +#define PTO_GPU_SM87 +#elif __CUDA_ARCH__ >= 860 +#define PTO_GPU_SM86 +#elif __CUDA_ARCH__ >= 800 +#define PTO_GPU_SM80 +#elif __CUDA_ARCH__ >= 750 +#define PTO_GPU_SM75 +#elif __CUDA_ARCH__ >= 700 +#define PTO_GPU_SM70 +#endif +#endif +#endif + #if __NPU_ARCH__ == 2201 #define PTO_NPU_ARCH_A2A3 #elif (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 3510) diff --git a/include/pto/common/buffer_limits.hpp b/include/pto/common/buffer_limits.hpp index 8802d8c1..f0af220a 100644 --- a/include/pto/common/buffer_limits.hpp +++ b/include/pto/common/buffer_limits.hpp @@ -35,6 +35,8 @@ See LICENSE in the root of the software repository for the full text of the Lice #define PTO_UBUF_SIZE_BYTES (192u * 1024u) #elif defined(PTO_NPU_ARCH_KIRIN9030) || defined(PTO_NPU_ARCH_KIRINX90) #define PTO_UBUF_SIZE_BYTES (128u * 1024u) +#elif defined(PTO_GPU_BACKEND) +#define PTO_UBUF_SIZE_BYTES (256u * 1024u) #else #error \ "PTO_UBUF_SIZE_BYTES: unknown NPU architecture. Define one of PTO_NPU_ARCH_{A2A3,A5,KIRIN9030,KIRINX90} or set PTO_UBUF_SIZE_BYTES manually." @@ -51,6 +53,8 @@ See LICENSE in the root of the software repository for the full text of the Lice #define PTO_CBUF_SIZE_BYTES (1024u * 1024u) #elif defined(PTO_NPU_ARCH_A2A3) || defined(PTO_NPU_ARCH_A5) || defined(PTO_NPU_ARCH_KIRIN9030) #define PTO_CBUF_SIZE_BYTES (512u * 1024u) +#elif defined(PTO_GPU_BACKEND) +#define PTO_CBUF_SIZE_BYTES (256u * 1024u) #else #error \ "PTO_CBUF_SIZE_BYTES: unknown NPU architecture. Define one of PTO_NPU_ARCH_{A2A3,A5,KIRIN9030,KIRINX90} or set PTO_CBUF_SIZE_BYTES manually." 
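A brief aside before the remaining buffer-limit hunks: the `PTO_GPU_SM*` macros added in the `arch_macro.hpp` hunk above are meant to be consumed inside device code roughly as in the hedged sketch below (the kernel name and values are illustrative only, not part of this patch).

```cpp
// Illustrative only: device code branching on the PTO_GPU_SM* dispatch macros
// defined in arch_macro.hpp above. The kernel and its outputs are placeholders.
__global__ void ReportPtoGpuArch(int* out)
{
#if defined(PTO_GPU_SM121)
    *out = 121;  // GB10 / DGX Spark fast paths
#elif defined(PTO_GPU_SM90)
    *out = 90;   // Hopper-specific paths
#elif defined(PTO_GPU_SM80)
    *out = 80;   // Ampere datacenter baseline
#else
    *out = 0;    // generic CUDA fallback
#endif
}
```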
@@ -67,6 +71,8 @@ See LICENSE in the root of the software repository for the full text of the Lice #define PTO_L0A_SIZE_BYTES (32u * 1024u) #elif defined(PTO_NPU_ARCH_A2A3) || defined(PTO_NPU_ARCH_A5) || defined(PTO_NPU_ARCH_KIRINX90) #define PTO_L0A_SIZE_BYTES (64u * 1024u) +#elif defined(PTO_GPU_BACKEND) +#define PTO_L0A_SIZE_BYTES (128u * 1024u) #else #error \ "PTO_L0A_SIZE_BYTES: unknown NPU architecture. Define one of PTO_NPU_ARCH_{A2A3,A5,KIRIN9030,KIRINX90} or set PTO_L0A_SIZE_BYTES manually." @@ -83,6 +89,8 @@ See LICENSE in the root of the software repository for the full text of the Lice #define PTO_L0B_SIZE_BYTES (32u * 1024u) #elif defined(PTO_NPU_ARCH_A2A3) || defined(PTO_NPU_ARCH_A5) || defined(PTO_NPU_ARCH_KIRINX90) #define PTO_L0B_SIZE_BYTES (64u * 1024u) +#elif defined(PTO_GPU_BACKEND) +#define PTO_L0B_SIZE_BYTES (128u * 1024u) #else #error \ "PTO_L0B_SIZE_BYTES: unknown NPU architecture. Define one of PTO_NPU_ARCH_{A2A3,A5,KIRIN9030,KIRINX90} or set PTO_L0B_SIZE_BYTES manually." @@ -101,6 +109,8 @@ See LICENSE in the root of the software repository for the full text of the Lice #define PTO_L0C_SIZE_BYTES (128u * 1024u) #elif defined(PTO_NPU_ARCH_KIRIN9030) #define PTO_L0C_SIZE_BYTES (64u * 1024u) +#elif defined(PTO_GPU_BACKEND) +#define PTO_L0C_SIZE_BYTES (256u * 1024u) #else #error \ "PTO_L0C_SIZE_BYTES: unknown NPU architecture. Define one of PTO_NPU_ARCH_{A2A3,A5,KIRIN9030,KIRINX90} or set PTO_L0C_SIZE_BYTES manually." @@ -117,6 +127,8 @@ See LICENSE in the root of the software repository for the full text of the Lice #define PTO_BIAS_SIZE_BYTES (4u * 1024u) #elif defined(PTO_NPU_ARCH_A2A3) || defined(PTO_NPU_ARCH_KIRIN9030) || defined(PTO_NPU_ARCH_KIRINX90) #define PTO_BIAS_SIZE_BYTES (1u * 1024u) +#elif defined(PTO_GPU_BACKEND) +#define PTO_BIAS_SIZE_BYTES (8u * 1024u) #else #error \ "PTO_BIAS_SIZE_BYTES: unknown NPU architecture. Define one of PTO_NPU_ARCH_{A2A3,A5,KIRIN9030,KIRINX90} or set PTO_BIAS_SIZE_BYTES manually." @@ -137,6 +149,8 @@ See LICENSE in the root of the software repository for the full text of the Lice #define PTO_FBUF_SIZE_BYTES (4u * 1024u) #elif defined(PTO_NPU_ARCH_A2A3) #define PTO_FBUF_SIZE_BYTES (2u * 1024u) +#elif defined(PTO_GPU_BACKEND) +#define PTO_FBUF_SIZE_BYTES (8u * 1024u) #else #error \ "PTO_FBUF_SIZE_BYTES: unknown NPU architecture. Define one of PTO_NPU_ARCH_{A2A3,A5,KIRIN9030,KIRINX90} or set PTO_FBUF_SIZE_BYTES manually." @@ -149,6 +163,8 @@ See LICENSE in the root of the software repository for the full text of the Lice #define PTO_SCALELEFT_SIZE_BYTES (4u * 1024u) #elif defined(PTO_NPU_ARCH_A2A3) || defined(PTO_NPU_ARCH_KIRIN9030) || defined(PTO_NPU_ARCH_KIRINX90) #define PTO_SCALELEFT_SIZE_BYTES 0u +#elif defined(PTO_GPU_BACKEND) +#define PTO_SCALELEFT_SIZE_BYTES (8u * 1024u) #else #error \ "PTO_SCALELEFT_SIZE_BYTES: unknown NPU architecture. Define one of PTO_NPU_ARCH_{A2A3,A5,KIRIN9030,KIRINX90} or set PTO_SCALELEFT_SIZE_BYTES manually." @@ -161,6 +177,8 @@ See LICENSE in the root of the software repository for the full text of the Lice #define PTO_SCALERIGHT_SIZE_BYTES (4u * 1024u) #elif defined(PTO_NPU_ARCH_A2A3) || defined(PTO_NPU_ARCH_KIRIN9030) || defined(PTO_NPU_ARCH_KIRINX90) #define PTO_SCALERIGHT_SIZE_BYTES 0u +#elif defined(PTO_GPU_BACKEND) +#define PTO_SCALERIGHT_SIZE_BYTES (8u * 1024u) #else #error \ "PTO_SCALERIGHT_SIZE_BYTES: unknown NPU architecture. Define one of PTO_NPU_ARCH_{A2A3,A5,KIRIN9030,KIRINX90} or set PTO_SCALERIGHT_SIZE_BYTES manually." 
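The GPU values chosen above (for example the 256 KiB `PTO_UBUF_SIZE_BYTES`) are placeholders and can exceed what a device actually grants per block, so a launch helper will likely need to clamp them. A minimal sketch under that assumption follows; `ConfigureTileSharedMemory` is a hypothetical helper, while the CUDA runtime calls are standard.

```cpp
// Minimal sketch, assuming PTO_UBUF_SIZE_BYTES doubles as the per-block
// shared-memory staging budget on the GPU backend. ConfigureTileSharedMemory is a
// hypothetical helper, not part of this patch.
#include <algorithm>
#include <cuda_runtime.h>

inline cudaError_t ConfigureTileSharedMemory(const void* kernel, int device, int* grantedBytes)
{
    int maxOptIn = 0;
    cudaError_t err =
        cudaDeviceGetAttribute(&maxOptIn, cudaDevAttrMaxSharedMemoryPerBlockOptin, device);
    if (err != cudaSuccess) {
        return err;
    }
    // Clamp the configured budget to what this device can actually grant.
    *grantedBytes = std::min(static_cast<int>(PTO_UBUF_SIZE_BYTES), maxOptIn);
    return cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
                                *grantedBytes);
}
```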
diff --git a/include/pto/common/constants.hpp b/include/pto/common/constants.hpp index 1480a5c3..bb489980 100644 --- a/include/pto/common/constants.hpp +++ b/include/pto/common/constants.hpp @@ -386,6 +386,9 @@ PTO_INTERNAL constexpr TileLayoutCustom GetTileLayoutCustom() } else if constexpr (TileData::isRowMajor && (TileData::SFractal == SLayout::RowMajor) && TileData::SFractalSize == 512) { return TileLayoutCustom::ZZ; + } else if constexpr (TileData::isRowMajor && (TileData::SFractal == SLayout::GpuSwizzle128B) && + TileData::SFractalSize == 128) { + return TileLayoutCustom::GPU_SWIZZLE_128B; } else { return TileLayoutCustom::NONE; } diff --git a/include/pto/common/gpu_stub.hpp b/include/pto/common/gpu_stub.hpp new file mode 100644 index 00000000..5edc138b --- /dev/null +++ b/include/pto/common/gpu_stub.hpp @@ -0,0 +1,66 @@ +/** +Copyright (c) 2026 Huawei Technologies Co., Ltd. +This program is free software, you can redistribute it and/or modify it under the terms and conditions of +CANN Open Software License Agreement Version 2.0 (the "License"). +Please refer to the License for details. You may not use this file except in compliance with the License. +THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +See LICENSE in the root of the software repository for the full text of the License. +*/ + +#ifndef PTO_GPUSTUB_HPP +#define PTO_GPUSTUB_HPP + +#if defined(__CUDACC__) + +#include + +#define PTO_GPU_BACKEND +#define PTO_GPU_INLINE_PTX 1 + +#define __aicore__ __device__ +#define AICORE __device__ +#define PTO_INLINE __forceinline__ +#define PTO_INST AICORE PTO_INLINE +#define PTO_INTERNAL AICORE PTO_INLINE + +#define __gm__ +#define __out__ +#define __in__ +#define __ubuf__ +#define __cbuf__ +#define __ca__ +#define __cb__ +#define __cc__ +#define __fbuf__ +#define __tf__ + +#ifndef __cce_get_tile_ptr +#define __cce_get_tile_ptr(x) (x) +#endif + +typedef int pipe_t; +constexpr pipe_t PIPE_S = 0; +constexpr pipe_t PIPE_V = 1; +constexpr pipe_t PIPE_MTE1 = 2; +constexpr pipe_t PIPE_MTE2 = 3; +constexpr pipe_t PIPE_MTE3 = 4; +constexpr pipe_t PIPE_M = 5; +constexpr pipe_t PIPE_ALL = 6; +constexpr pipe_t PIPE_FIX = 7; + +using event_t = int; +constexpr event_t EVENT_ID0 = 0; + +PTO_INTERNAL void pipe_barrier(pipe_t) +{} + +PTO_INTERNAL void set_flag(pipe_t, pipe_t, event_t) +{} + +PTO_INTERNAL void wait_flag(pipe_t, pipe_t, event_t) +{} + +#endif + +#endif diff --git a/include/pto/common/memory.hpp b/include/pto/common/memory.hpp index cf5877a9..2e011ed1 100644 --- a/include/pto/common/memory.hpp +++ b/include/pto/common/memory.hpp @@ -43,6 +43,7 @@ enum class SLayout NoneBox = 0, RowMajor = 1, ColMajor = 2, + GpuSwizzle128B = 3, }; // returns the memory qualifier for a given TileType and data type. @@ -150,6 +151,8 @@ PTO_INTERNAL constexpr const __gm__ char *GetLayoutName(BLayout bType, SLayout s return (bType == BLayout::RowMajor) ? "Zz" : "Nz"; case SLayout::ColMajor: return (bType == BLayout::RowMajor) ? "Zn" : "Nn"; + case SLayout::GpuSwizzle128B: + return (bType == BLayout::RowMajor) ? 
"GS128" : "GS128C"; default: return "Unknown"; } diff --git a/include/pto/common/pto_instr.hpp b/include/pto/common/pto_instr.hpp index ad505d7e..e1237d6e 100644 --- a/include/pto/common/pto_instr.hpp +++ b/include/pto/common/pto_instr.hpp @@ -385,7 +385,7 @@ PTO_INST RecordEvent TSUBC(TileData &dst, TileData &src0, TileData &src1, TileDa return {}; } -#if defined(PTO_NPU_ARCH_A5) || defined(__CPU_SIM) +#if defined(PTO_NPU_ARCH_A5) || defined(__CPU_SIM) || defined(PTO_GPU_BACKEND) template PTO_INST RecordEvent TGEMV_MX(TileRes &cMatrix, TileLeft &aMatrix, TileLeftScale &aScaleMatrix, TileRight &bMatrix, diff --git a/include/pto/common/pto_instr_impl.hpp b/include/pto/common/pto_instr_impl.hpp index d751072a..f662f78b 100644 --- a/include/pto/common/pto_instr_impl.hpp +++ b/include/pto/common/pto_instr_impl.hpp @@ -227,6 +227,19 @@ See LICENSE in the root of the software repository for the full text of the Lice #include "pto/npu/kirin9030/header.hpp" #endif +#ifdef PTO_GPU_BACKEND +#include "pto/gpu/common/TSync.hpp" +#include "pto/gpu/common/TAdd.hpp" +#include "pto/gpu/common/TMrgSort.hpp" +#include "pto/gpu/common/TLoad.hpp" +#include "pto/gpu/common/TStore.hpp" +#ifdef PTO_GPU_SM121 +#include "pto/gpu/sm121/arch.hpp" +#include "pto/gpu/sm121/TMatmul.hpp" +#endif +#include "pto/gpu/common/TMatmul.hpp" +#endif + #ifdef __CPU_SIM #include "pto/cpu/ElementTileOp.h" #include "pto/cpu/ElementTileScalarOp.h" diff --git a/include/pto/common/pto_tile.hpp b/include/pto/common/pto_tile.hpp index 4b8cd6d0..5db5055b 100644 --- a/include/pto/common/pto_tile.hpp +++ b/include/pto/common/pto_tile.hpp @@ -812,6 +812,8 @@ static constexpr int fixedMxColSize = 2; static constexpr int fractalABSize = 512; static constexpr int fractalCSize = 1024; static constexpr int fractalMxSize = 32; +static constexpr int gpuSwizzle128BSize = 128; +static constexpr int gpuSwizzleRows = 8; } // namespace TileConfig namespace ConvTileDetail { @@ -1253,7 +1255,9 @@ struct Tile { static constexpr int getInnerRow() { - if constexpr (SFractalSize_ == TileConfig::fractalCSize) { + if constexpr (SFractal_ == SLayout::GpuSwizzle128B) { + return TileConfig::gpuSwizzleRows; + } else if constexpr (SFractalSize_ == TileConfig::fractalCSize) { return TileConfig::fixedRowSize; } else if constexpr (SFractalSize_ == TileConfig::fractalMxSize) { return TileConfig::fixedMxRowSize; @@ -1266,7 +1270,9 @@ struct Tile { static constexpr int getInnerCol() { - if constexpr (SFractalSize_ == TileConfig::fractalCSize) { + if constexpr (SFractal_ == SLayout::GpuSwizzle128B) { + return TileConfig::gpuSwizzle128BSize / (TileConfig::gpuSwizzleRows * sizeof(DType)); + } else if constexpr (SFractalSize_ == TileConfig::fractalCSize) { return TileConfig::fixedColSize; } else if constexpr (SFractalSize_ == TileConfig::fractalMxSize) { return TileConfig::fixedMxColSize; @@ -1373,6 +1379,7 @@ struct Tile { static constexpr bool isBoxedLayout = (SFractal != SLayout::NoneBox); static constexpr bool isInnerRowMajor = (SFractal == SLayout::RowMajor); static constexpr bool isInnerColMajor = (SFractal == SLayout::ColMajor); + static constexpr bool isGpuSwizzled = (SFractal == SLayout::GpuSwizzle128B); static constexpr int InnerRows = getInnerRow(); static constexpr int InnerCols = getInnerCol(); @@ -1380,6 +1387,8 @@ struct Tile { static constexpr int InnerNumel = InnerRows * InnerCols; static_assert(InnerRows != 0 && InnerCols != 0, "rows or cols of fractal size is 0."); + static_assert((SFractal_ != SLayout::GpuSwizzle128B) || (BFractal_ == BLayout::RowMajor), 
+ "GPU swizzle layout currently supports row-major tiles only."); static_assert((Loc == TileType::Vec) || (SFractalSize_ == TileConfig::fractalMxSize) || (Rows_ == 1) || (Rows % InnerRows == 0), "Layout rows must be divisible by inner box rows"); @@ -1398,7 +1407,7 @@ struct Tile { SFractal_ in not NoneBox: Rows/Cols must be integer multiple of InnerRows/InnerCols."); static_assert(SFractalSize_ == TileConfig::fractalABSize || SFractalSize_ == TileConfig::fractalCSize || - SFractalSize_ == TileConfig::fractalMxSize, + SFractalSize_ == TileConfig::fractalMxSize || SFractalSize_ == TileConfig::gpuSwizzle128BSize, "SFractalSize_ illegal"); #if defined(__CPU_SIM) || defined(__COSTMODEL) @@ -1605,6 +1614,23 @@ template ; +// GPU-only swizzled layouts. These are intentionally not tied to the NPU fractal conventions. +template +using TileVecGpuSwizzle = Tile; + +template +using TileLeftGpuSwizzle = Tile; + +template +using TileRightGpuSwizzle = Tile; + +template +using TileAccGpuSwizzle = Tile; + template struct is_global : std::false_type { }; diff --git a/include/pto/common/type.hpp b/include/pto/common/type.hpp index e1980f8f..16802f42 100644 --- a/include/pto/common/type.hpp +++ b/include/pto/common/type.hpp @@ -10,17 +10,27 @@ See LICENSE in the root of the software repository for the full text of the Lice #ifndef _PTO_INCLUDE_NPU_TYPE_H_ #define _PTO_INCLUDE_NPU_TYPE_H_ +#include +#ifndef AICORE #ifndef __CPU_SIM #define AICORE [aicore] #else #define AICORE #endif +#endif + +#ifndef PTO_INLINE #define PTO_INLINE inline __attribute__((always_inline)) +#endif // for pto instruction declaration +#ifndef PTO_INST #define PTO_INST AICORE PTO_INLINE __attribute__((visibility("default"))) +#endif // for pto internal implementation +#ifndef PTO_INTERNAL #define PTO_INTERNAL AICORE PTO_INLINE +#endif #define OP_NAME(Name) __attribute__((vf_name(#Name))) #define OP_TYPE(TypeName) __attribute__((vf_kind(#TypeName))) @@ -246,6 +256,7 @@ enum class TileLayoutCustom : uint8_t NZ, ZN, ZZ, + GPU_SWIZZLE_128B, NONE, }; @@ -264,7 +275,13 @@ using TRandomKey = uint32_t[PTO_RANDOM_KEY_SIZE]; using TRandomCounter = uint32_t[PTO_RANDOM_COUNTER_SIZE]; } // namespace pto -#if defined(__CPU_SIM) || defined(__COSTMODEL) +#if defined(PTO_GPU_BACKEND) && defined(__CUDACC__) +#include +#include +typedef __half half; +typedef __half aclFloat16; +typedef __nv_bfloat16 bfloat16_t; +#elif defined(__CPU_SIM) || defined(__COSTMODEL) typedef _Float16 half; typedef _Float16 aclFloat16; // Note: clang version should be >=15 and gcc version should be >=14 diff --git a/include/pto/cpu/tile_offsets.hpp b/include/pto/cpu/tile_offsets.hpp index c8fa1fff..87753e1e 100644 --- a/include/pto/cpu/tile_offsets.hpp +++ b/include/pto/cpu/tile_offsets.hpp @@ -13,6 +13,30 @@ See LICENSE in the root of the software repository for the full text of the Lice #include namespace pto { + +PTO_INTERNAL constexpr bool IsPowerOfTwo(size_t value) +{ + return value != 0 && ((value & (value - 1)) == 0); +} + +PTO_INTERNAL constexpr size_t PermuteSwizzleChunk(size_t chunk, size_t lane, size_t chunkCount) +{ + return IsPowerOfTwo(chunkCount) ? 
((chunk ^ lane) & (chunkCount - 1)) : ((chunk + lane) % chunkCount); +} + +template +size_t GetTileElementOffsetGpuSwizzle(size_t r, size_t c) +{ + constexpr size_t swizzleRows = TileData::InnerRows; + constexpr size_t swizzleCols = TileData::InnerCols; + const size_t rowBlock = r / swizzleRows; + const size_t rowInBlock = r % swizzleRows; + const size_t chunk = c / swizzleCols; + const size_t colInChunk = c % swizzleCols; + const size_t chunksPerRow = TileData::Cols / swizzleCols; + const size_t permutedChunk = PermuteSwizzleChunk(chunk, rowInBlock % chunksPerRow, chunksPerRow); + return rowBlock * swizzleRows * TileData::Cols + rowInBlock * TileData::Cols + permutedChunk * swizzleCols + colInChunk; +} template using TypeSum = std::conditional_t, float, typename TileData::DType>; @@ -49,11 +73,11 @@ size_t GetTileElementOffsetPlain(size_t r, size_t c) template size_t GetTileElementOffset(size_t r, size_t c) { - if constexpr (TileData::SFractal == SLayout::NoneBox) + if constexpr (TileData::SFractal == SLayout::NoneBox) { return GetTileElementOffsetPlain(r, c); - else { - size_t subTileR = r / TileData::InnerRows; - size_t innerR = r % TileData::InnerRows; + } else if constexpr (TileData::SFractal == SLayout::GpuSwizzle128B) { + return GetTileElementOffsetGpuSwizzle(r, c); + } else { return GetTileElementOffsetSubfractals(r / TileData::InnerRows, r % TileData::InnerRows, c / TileData::InnerCols, c % TileData::InnerCols); } diff --git a/include/pto/gpu/README.md b/include/pto/gpu/README.md new file mode 100644 index 00000000..e93f78ed --- /dev/null +++ b/include/pto/gpu/README.md @@ -0,0 +1,44 @@ +# include/pto/gpu/ + +Scaffold for a future NVIDIA GPU backend for PTO Tile Lib. + +## Purpose + +This directory mirrors the existing backend split used by the CPU simulator and Ascend NPU implementations: + +- `common/`: shared GPU helpers, tile adapters, launch abstractions, layout helpers +- `smXX/`: architecture-specific specializations and microkernel registries + +## Design intent + +The long-term goal is to provide: + +- a unified PTO API at `#include ` +- a GPU lowering path that keeps PTO semantics stable +- per-SM specializations for performance-critical PTO instructions +- dedicated microkernels for hot PTO ISA operations on NVIDIA GPUs + +## Initial status + +This backend is now past pure scaffold stage. + +Implemented groundwork includes: + +- backend dispatch macros +- inclusion from `pto_instr_impl.hpp` +- initial CUDA build/test plumbing +- early `sm121` matmul fast paths +- a GPU-specific swizzle layout family for row-major tiles + +Current GPU swizzle support: + +- layout enum: `SLayout::GpuSwizzle128B` +- aliases in `pto_tile.hpp`: + - `TileVecGpuSwizzle` + - `TileLeftGpuSwizzle` + - `TileRightGpuSwizzle` + - `TileAccGpuSwizzle` +- element offset mapping is GPU-specific and intentionally not tied to the NPU boxed layouts +- current swizzle implementation is row-major only and is intended as groundwork for future shared-memory / tensor-core friendly paths + +See `docs/gpu_backend_plan.md` for the phase plan and backend bring-up checklist. diff --git a/include/pto/gpu/common/README.md b/include/pto/gpu/common/README.md new file mode 100644 index 00000000..a15584e8 --- /dev/null +++ b/include/pto/gpu/common/README.md @@ -0,0 +1,15 @@ +# include/pto/gpu/common/ + +Shared building blocks for the future PTO NVIDIA GPU backend. 
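As one concrete example of the layout adapters that will live here, the 128-byte swizzle mapping introduced by this patch can be restated as a small standalone function. This is an illustrative host-side sketch mirroring `GetTileElementOffsetGpuSwizzle`, not an additional API; `SwizzledOffset` and `IsPow2` are placeholder names.

```cpp
// Illustrative restatement of the GpuSwizzle128B element-offset mapping added in this
// patch: inside each 8-row block, the column chunk is XOR-permuted by the row index
// (when the chunk count is a power of two), spreading rows across different chunks.
#include <cstddef>

constexpr bool IsPow2(std::size_t v) { return v != 0 && (v & (v - 1)) == 0; }

constexpr std::size_t SwizzledOffset(std::size_t r, std::size_t c,
                                     std::size_t innerRows,   // 8 for GpuSwizzle128B
                                     std::size_t innerCols,   // 128 / (8 * sizeof(T))
                                     std::size_t tileCols)
{
    const std::size_t rowBlock   = r / innerRows;
    const std::size_t rowInBlock = r % innerRows;
    const std::size_t chunk      = c / innerCols;
    const std::size_t colInChunk = c % innerCols;
    const std::size_t chunks     = tileCols / innerCols;
    const std::size_t lane       = rowInBlock % chunks;
    const std::size_t permuted   = IsPow2(chunks) ? ((chunk ^ lane) & (chunks - 1))
                                                  : ((chunk + lane) % chunks);
    return rowBlock * innerRows * tileCols + rowInBlock * tileCols
         + permuted * innerCols + colInChunk;
}

// Example: a 16x32 half tile has innerCols = 128 / (8 * 2) = 8, so each row of 32
// halfs splits into 4 chunks that are permuted by the row index within its 8-row block.
```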
+ +Expected contents over time: + +- tile layout adapters +- register/shared/global memory helpers +- warp / warpgroup scheduling helpers +- async copy abstractions +- tensor-core / MMA wrapper traits +- launch-time architecture dispatch helpers +- common validation and debug utilities + +This directory is intentionally created before implementation so the backend can evolve with the same structure as existing PTO backends. diff --git a/include/pto/gpu/common/TAdd.hpp b/include/pto/gpu/common/TAdd.hpp new file mode 100644 index 00000000..face8101 --- /dev/null +++ b/include/pto/gpu/common/TAdd.hpp @@ -0,0 +1,45 @@ +/** +Copyright (c) 2026 Huawei Technologies Co., Ltd. +This program is free software, you can redistribute it and/or modify it under the terms and conditions of +CANN Open Software License Agreement Version 2.0 (the "License"). +Please refer to the License for details. You may not use this file except in compliance with the License. +THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +See LICENSE in the root of the software repository for the full text of the License. +*/ + +#ifndef PTO_GPU_COMMON_TADD_HPP +#define PTO_GPU_COMMON_TADD_HPP + +#include +#include "pto/gpu/common/tile_offsets.hpp" + +namespace pto { + +template +PTO_INTERNAL void TADD_IMPL(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1) +{ + using T = typename TileDataDst::DType; + static_assert(std::is_same_v && std::is_same_v, + "Fix: TADD input/output dtypes must match for the GPU backend."); + + const unsigned validRows = dst.GetValidRow(); + const unsigned validCols = dst.GetValidCol(); + PTO_ASSERT(src0.GetValidRow() == validRows && src0.GetValidCol() == validCols, + "Fix: TADD src0 valid shape mismatch with dst in GPU backend."); + PTO_ASSERT(src1.GetValidRow() == validRows && src1.GetValidCol() == validCols, + "Fix: TADD src1 valid shape mismatch with dst in GPU backend."); + + for (unsigned r = 0; r < validRows; ++r) { + for (unsigned c = 0; c < validCols; ++c) { + const std::size_t idx = gpu::GetTileElementOffset(r, c); + const std::size_t src0Idx = gpu::GetTileElementOffset(r, c); + const std::size_t src1Idx = gpu::GetTileElementOffset(r, c); + dst.data()[idx] = src0.data()[src0Idx] + src1.data()[src1Idx]; + } + } +} + +} // namespace pto + +#endif diff --git a/include/pto/gpu/common/TLoad.hpp b/include/pto/gpu/common/TLoad.hpp new file mode 100644 index 00000000..d44c42c3 --- /dev/null +++ b/include/pto/gpu/common/TLoad.hpp @@ -0,0 +1,100 @@ +/** +Copyright (c) 2026 Huawei Technologies Co., Ltd. +This program is free software, you can redistribute it and/or modify it under the terms and conditions of +CANN Open Software License Agreement Version 2.0 (the "License"). +Please refer to the License for details. You may not use this file except in compliance with the License. +THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +See LICENSE in the root of the software repository for the full text of the License. 
+*/ + +#ifndef PTO_GPU_COMMON_TLOAD_HPP +#define PTO_GPU_COMMON_TLOAD_HPP + +#include +#include +#include "pto/gpu/common/tile_offsets.hpp" + +namespace pto { + +template +PTO_INTERNAL typename TileData::DType GpuPadValue() +{ + return GetPadValue(); +} + +template +PTO_INTERNAL void TLOAD_IMPL(TileData &dst, GlobalData &src) +{ + static_assert(GlobalData::layout == pto::Layout::ND || GlobalData::layout == pto::Layout::DN, + "Only ND and DN GlobalTensor layouts are supported by the v1 GPU backend."); + + const int gShape0 = src.GetShape(pto::GlobalTensorDim::DIM_0); + const int gShape1 = src.GetShape(pto::GlobalTensorDim::DIM_1); + const int gShape2 = src.GetShape(pto::GlobalTensorDim::DIM_2); + const int gShape3 = src.GetShape(pto::GlobalTensorDim::DIM_3); + const int gShape4 = src.GetShape(pto::GlobalTensorDim::DIM_4); + + const int gStride0 = src.GetStride(pto::GlobalTensorDim::DIM_0); + const int gStride1 = src.GetStride(pto::GlobalTensorDim::DIM_1); + const int gStride2 = src.GetStride(pto::GlobalTensorDim::DIM_2); + const int gStride3 = src.GetStride(pto::GlobalTensorDim::DIM_3); + const int gStride4 = src.GetStride(pto::GlobalTensorDim::DIM_4); + + const int validRow = dst.GetValidRow(); + const int validCol = dst.GetValidCol(); + + for (int r = 0; r < TileData::Rows; ++r) { + for (int c = 0; c < TileData::Cols; ++c) { + dst.data()[gpu::GetTileElementOffset(r, c)] = GpuPadValue(); + } + } + + if constexpr (GlobalData::layout == pto::Layout::ND) { + PTO_ASSERT(TileData::isRowMajor, "Fix: ND loads currently require row-major tiles in the GPU backend."); + PTO_ASSERT(gShape0 * gShape1 * gShape2 * gShape3 == validRow && gShape4 == validCol, + "Fix: ND TLOAD valid shape mismatch in GPU backend."); + + int rowBase = 0; + for (int i = 0; i < gShape0; ++i) { + for (int j = 0; j < gShape1; ++j) { + for (int k = 0; k < gShape2; ++k) { + for (int r = 0; r < gShape3; ++r) { + const int tileRow = rowBase + r; + const int srcBase = i * gStride0 + j * gStride1 + k * gStride2 + r * gStride3; + for (int c = 0; c < gShape4; ++c) { + const std::size_t dstIdx = gpu::GetTileElementOffset(tileRow, c); + dst.data()[dstIdx] = src.data()[srcBase + c * gStride4]; + } + } + rowBase += gShape3; + } + } + } + } else { + PTO_ASSERT(!TileData::isRowMajor, "Fix: DN loads currently require col-major tiles in the GPU backend."); + PTO_ASSERT(gShape0 * gShape1 * gShape2 * gShape4 == validCol && gShape3 == validRow, + "Fix: DN TLOAD valid shape mismatch in GPU backend."); + + int colBase = 0; + for (int i = 0; i < gShape0; ++i) { + for (int j = 0; j < gShape1; ++j) { + for (int k = 0; k < gShape2; ++k) { + for (int c = 0; c < gShape4; ++c) { + const int tileCol = colBase + c; + const int srcBase = i * gStride0 + j * gStride1 + k * gStride2 + c * gStride4; + for (int r = 0; r < gShape3; ++r) { + const std::size_t dstIdx = gpu::GetTileElementOffset(r, tileCol); + dst.data()[dstIdx] = src.data()[srcBase + r * gStride3]; + } + } + colBase += gShape4; + } + } + } + } +} + +} // namespace pto + +#endif diff --git a/include/pto/gpu/common/TMatmul.hpp b/include/pto/gpu/common/TMatmul.hpp new file mode 100644 index 00000000..cc509ee4 --- /dev/null +++ b/include/pto/gpu/common/TMatmul.hpp @@ -0,0 +1,217 @@ +/** +Copyright (c) 2026 Huawei Technologies Co., Ltd. +This program is free software, you can redistribute it and/or modify it under the terms and conditions of +CANN Open Software License Agreement Version 2.0 (the "License"). +Please refer to the License for details. 
You may not use this file except in compliance with the License. +THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +See LICENSE in the root of the software repository for the full text of the License. +*/ + +#ifndef PTO_GPU_COMMON_TMATMUL_HPP +#define PTO_GPU_COMMON_TMATMUL_HPP + +#include +#include "pto/gpu/common/tile_offsets.hpp" + +namespace pto { + +template +PTO_INTERNAL void CheckGpuMatmulValid() +{ + using AType = typename TileLeft::DType; + using BType = typename TileRight::DType; + using CType = typename TileAcc::DType; + static_assert( + (std::is_same_v && std::is_same_v && std::is_same_v) || + (std::is_same_v && std::is_same_v && std::is_same_v) || + (std::is_same_v && std::is_same_v && std::is_same_v) || + (std::is_same_v && std::is_same_v && std::is_same_v), + "Fix: GPU TMATMUL currently supports int8->int32, half->float, bfloat16->float, and float->float."); +} + +template +PTO_INTERNAL void TMATMUL_IMPL(TileAcc &cMatrix, TileLeft &aMatrix, TileRight &bMatrix) +{ + CheckGpuMatmulValid(); + +#ifdef PTO_GPU_SM121 + if (gpu::sm121::TrySm121TMATMUL(cMatrix, aMatrix, bMatrix)) { + return; + } +#endif + + const uint16_t m = aMatrix.GetValidRow(); + const uint16_t k = aMatrix.GetValidCol(); + const uint16_t n = bMatrix.GetValidCol(); + + for (uint16_t row = 0; row < m; ++row) { + for (uint16_t col = 0; col < n; ++col) { + typename TileAcc::DType acc = typename TileAcc::DType{}; + for (uint16_t kk = 0; kk < k; ++kk) { + const std::size_t aIdx = gpu::GetTileElementOffset(row, kk); + const std::size_t bIdx = gpu::GetTileElementOffset(kk, col); + acc += static_cast(aMatrix.data()[aIdx]) * + static_cast(bMatrix.data()[bIdx]); + } + cMatrix.data()[gpu::GetTileElementOffset(row, col)] = acc; + } + } +} + +template +PTO_INTERNAL void TMATMUL_ACC_IMPL(TileAcc &cOutMatrix, TileAcc &cInMatrix, TileLeft &aMatrix, TileRight &bMatrix) +{ + CheckGpuMatmulValid(); + +#ifdef PTO_GPU_SM121 + if (gpu::sm121::TrySm121TMATMULAcc(cOutMatrix, cInMatrix, aMatrix, bMatrix)) { + return; + } +#endif + + const uint16_t m = aMatrix.GetValidRow(); + const uint16_t k = aMatrix.GetValidCol(); + const uint16_t n = bMatrix.GetValidCol(); + + for (uint16_t row = 0; row < m; ++row) { + for (uint16_t col = 0; col < n; ++col) { + const std::size_t cIdx = gpu::GetTileElementOffset(row, col); + typename TileAcc::DType acc = cInMatrix.data()[cIdx]; + for (uint16_t kk = 0; kk < k; ++kk) { + const std::size_t aIdx = gpu::GetTileElementOffset(row, kk); + const std::size_t bIdx = gpu::GetTileElementOffset(kk, col); + acc += static_cast(aMatrix.data()[aIdx]) * + static_cast(bMatrix.data()[bIdx]); + } + cOutMatrix.data()[cIdx] = acc; + } + } +} + +template +PTO_INTERNAL void TMATMUL_ACC_IMPL(TileAcc &cMatrix, TileLeft &aMatrix, TileRight &bMatrix) +{ + CheckGpuMatmulValid(); + +#ifdef PTO_GPU_SM121 + if (gpu::sm121::TrySm121TMATMULAcc(cMatrix, aMatrix, bMatrix)) { + return; + } +#endif + + TMATMUL_ACC_IMPL(cMatrix, cMatrix, aMatrix, bMatrix); +} + +template +PTO_INTERNAL void TMATMUL_BIAS_IMPL(TileAcc &cMatrix, TileLeft &aMatrix, TileRight &bMatrix, TileBias &biasMatrix) +{ +#ifdef PTO_GPU_SM121 + if (gpu::sm121::TrySm121TMATMULBias(cMatrix, aMatrix, bMatrix, biasMatrix)) { + return; + } +#endif + + TMATMUL_IMPL(cMatrix, aMatrix, bMatrix); + const uint16_t m = aMatrix.GetValidRow(); + const uint16_t n = bMatrix.GetValidCol(); + for (uint16_t row = 0; row < m; ++row) { + 
for (uint16_t col = 0; col < n; ++col) { + const std::size_t cIdx = gpu::GetTileElementOffset(row, col); + const std::size_t bIdx = gpu::GetTileElementOffset(0, col); + cMatrix.data()[cIdx] += biasMatrix.data()[bIdx]; + } + } +} + +template +PTO_INTERNAL void TGEMV_IMPL(TileRes &cMatrix, TileLeft &aMatrix, TileRight &bMatrix) +{ + (void)Phase; + TMATMUL_IMPL(cMatrix, aMatrix, bMatrix); +} + +template +PTO_INTERNAL void TGEMV_ACC_IMPL(TileRes &cOutMatrix, TileRes &cInMatrix, TileLeft &aMatrix, TileRight &bMatrix) +{ + (void)Phase; + TMATMUL_ACC_IMPL(cOutMatrix, cInMatrix, aMatrix, bMatrix); +} + +template +PTO_INTERNAL void TGEMV_BIAS_IMPL(TileRes &cMatrix, TileLeft &aMatrix, TileRight &bMatrix, TileBias &biasData) +{ + (void)Phase; + TMATMUL_BIAS_IMPL(cMatrix, aMatrix, bMatrix, biasData); +} + +template +PTO_INTERNAL void TMATMUL_MX_IMPL(TileRes &cMatrix, TileLeft &aMatrix, TileLeftScale &aScaleMatrix, TileRight &bMatrix, + TileRightScale &bScaleMatrix) +{ + (void)Phase; + (void)aScaleMatrix; + (void)bScaleMatrix; + TMATMUL_IMPL(cMatrix, aMatrix, bMatrix); +} + +template +PTO_INTERNAL void TMATMUL_MX_IMPL(TileRes &cOutMatrix, TileRes &cInMatrix, TileLeft &aMatrix, + TileLeftScale &aScaleMatrix, TileRight &bMatrix, TileRightScale &bScaleMatrix) +{ + (void)Phase; + (void)aScaleMatrix; + (void)bScaleMatrix; + TMATMUL_ACC_IMPL(cOutMatrix, cInMatrix, aMatrix, bMatrix); +} + +template +PTO_INTERNAL void TMATMUL_MX_IMPL(TileRes &cMatrix, TileLeft &aMatrix, TileLeftScale &aScaleMatrix, TileRight &bMatrix, + TileRightScale &bScaleMatrix, TileBias &biasData) +{ + (void)Phase; + (void)aScaleMatrix; + (void)bScaleMatrix; + TMATMUL_BIAS_IMPL(cMatrix, aMatrix, bMatrix, biasData); +} + +template +PTO_INTERNAL void TGEMV_MX_IMPL(TileRes &cMatrix, TileLeft &aMatrix, TileLeftScale &aScaleMatrix, TileRight &bMatrix, + TileRightScale &bScaleMatrix) +{ + (void)Phase; + (void)aScaleMatrix; + (void)bScaleMatrix; + TGEMV_IMPL(cMatrix, aMatrix, bMatrix); +} + +template +PTO_INTERNAL void TGEMV_MX_IMPL(TileRes &cOutMatrix, TileRes &cInMatrix, TileLeft &aMatrix, TileLeftScale &aScaleMatrix, + TileRight &bMatrix, TileRightScale &bScaleMatrix) +{ + (void)Phase; + (void)aScaleMatrix; + (void)bScaleMatrix; + TGEMV_ACC_IMPL(cOutMatrix, cInMatrix, aMatrix, bMatrix); +} + +template +PTO_INTERNAL void TGEMV_MX_IMPL(TileRes &cMatrix, TileLeft &aMatrix, TileLeftScale &aScaleMatrix, TileRight &bMatrix, + TileRightScale &bScaleMatrix, TileBias &biasData) +{ + (void)Phase; + (void)aScaleMatrix; + (void)bScaleMatrix; + TGEMV_BIAS_IMPL(cMatrix, aMatrix, bMatrix, biasData); +} + +} // namespace pto + +#endif diff --git a/include/pto/gpu/common/TMrgSort.hpp b/include/pto/gpu/common/TMrgSort.hpp new file mode 100644 index 00000000..9ea80f4d --- /dev/null +++ b/include/pto/gpu/common/TMrgSort.hpp @@ -0,0 +1,42 @@ +/** +Copyright (c) 2026 Huawei Technologies Co., Ltd. +This program is free software, you can redistribute it and/or modify it under the terms and conditions of +CANN Open Software License Agreement Version 2.0 (the "License"). +Please refer to the License for details. You may not use this file except in compliance with the License. +THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +See LICENSE in the root of the software repository for the full text of the License. 
+*/ + +#ifndef PTO_GPU_COMMON_TMRGSORT_HPP +#define PTO_GPU_COMMON_TMRGSORT_HPP + +namespace pto { + +struct MrgSortExecutedNumList { +}; + +template +PTO_INTERNAL void TMRGSORT_IMPL(DstTileData &, MrgSortExecutedNumList &, TmpTileData &, Src0TileData &, Src1TileData &) +{ +} + +template +PTO_INTERNAL void TMRGSORT_IMPL(DstTileData &, MrgSortExecutedNumList &, TmpTileData &, Src0TileData &, Src1TileData &, + Src2TileData &, Src3TileData &) +{ +} + +template +PTO_INTERNAL void TMRGSORT_IMPL(DstTileData &, MrgSortExecutedNumList &, TmpTileData &, Src0TileData &, Src1TileData &, + Src2TileData &, Src3TileData &, Src4TileData &, Src5TileData &, Src6TileData &, + Src7TileData &) +{ +} + +} // namespace pto + +#endif diff --git a/include/pto/gpu/common/TStore.hpp b/include/pto/gpu/common/TStore.hpp new file mode 100644 index 00000000..bdac8df5 --- /dev/null +++ b/include/pto/gpu/common/TStore.hpp @@ -0,0 +1,105 @@ +/** +Copyright (c) 2026 Huawei Technologies Co., Ltd. +This program is free software, you can redistribute it and/or modify it under the terms and conditions of +CANN Open Software License Agreement Version 2.0 (the "License"). +Please refer to the License for details. You may not use this file except in compliance with the License. +THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +See LICENSE in the root of the software repository for the full text of the License. +*/ + +#ifndef PTO_GPU_COMMON_TSTORE_HPP +#define PTO_GPU_COMMON_TSTORE_HPP + +#include +#include "pto/gpu/common/tile_offsets.hpp" + +namespace pto { + +template +PTO_INTERNAL void TSTORE_IMPL(GlobalData &dst, TileData &src) +{ + static_assert(atomicType == AtomicType::AtomicNone, + "Atomic store modes are not implemented yet in the v1 GPU backend."); + static_assert(GlobalData::layout == pto::Layout::ND || GlobalData::layout == pto::Layout::DN, + "Only ND and DN GlobalTensor layouts are supported by the v1 GPU backend."); + + const int gShape0 = dst.GetShape(pto::GlobalTensorDim::DIM_0); + const int gShape1 = dst.GetShape(pto::GlobalTensorDim::DIM_1); + const int gShape2 = dst.GetShape(pto::GlobalTensorDim::DIM_2); + const int gShape3 = dst.GetShape(pto::GlobalTensorDim::DIM_3); + const int gShape4 = dst.GetShape(pto::GlobalTensorDim::DIM_4); + + const int gStride0 = dst.GetStride(pto::GlobalTensorDim::DIM_0); + const int gStride1 = dst.GetStride(pto::GlobalTensorDim::DIM_1); + const int gStride2 = dst.GetStride(pto::GlobalTensorDim::DIM_2); + const int gStride3 = dst.GetStride(pto::GlobalTensorDim::DIM_3); + const int gStride4 = dst.GetStride(pto::GlobalTensorDim::DIM_4); + + const int validRow = src.GetValidRow(); + const int validCol = src.GetValidCol(); + + if constexpr (GlobalData::layout == pto::Layout::ND) { + PTO_ASSERT(TileData::isRowMajor, "Fix: ND stores currently require row-major tiles in the GPU backend."); + PTO_ASSERT(gShape0 * gShape1 * gShape2 * gShape3 == validRow && gShape4 == validCol, + "Fix: ND TSTORE valid shape mismatch in GPU backend."); + + int rowBase = 0; + for (int i = 0; i < gShape0; ++i) { + for (int j = 0; j < gShape1; ++j) { + for (int k = 0; k < gShape2; ++k) { + for (int r = 0; r < gShape3; ++r) { + const int tileRow = rowBase + r; + const int dstBase = i * gStride0 + j * gStride1 + k * gStride2 + r * gStride3; + for (int c = 0; c < gShape4; ++c) { + const std::size_t srcIdx = gpu::GetTileElementOffset(tileRow, c); + 
dst.data()[dstBase + c * gStride4] = src.data()[srcIdx]; + } + } + rowBase += gShape3; + } + } + } + } else { + PTO_ASSERT(!TileData::isRowMajor, "Fix: DN stores currently require col-major tiles in the GPU backend."); + PTO_ASSERT(gShape0 * gShape1 * gShape2 * gShape4 == validCol && gShape3 == validRow, + "Fix: DN TSTORE valid shape mismatch in GPU backend."); + + int colBase = 0; + for (int i = 0; i < gShape0; ++i) { + for (int j = 0; j < gShape1; ++j) { + for (int k = 0; k < gShape2; ++k) { + for (int c = 0; c < gShape4; ++c) { + const int tileCol = colBase + c; + const int dstBase = i * gStride0 + j * gStride1 + k * gStride2 + c * gStride4; + for (int r = 0; r < gShape3; ++r) { + const std::size_t srcIdx = gpu::GetTileElementOffset(r, tileCol); + dst.data()[dstBase + r * gStride3] = src.data()[srcIdx]; + } + } + colBase += gShape4; + } + } + } + } +} + +template +PTO_INTERNAL void TSTORE_IMPL(GlobalData &dst, TileData &src, uint64_t preQuantScalar) +{ + (void)preQuantScalar; + TSTORE_IMPL(dst, src); +} + +template +PTO_INTERNAL void TSTORE_IMPL(GlobalData &dst, TileData &src, FpTileData &fp) +{ + (void)fp; + (void)reluPreMode; + TSTORE_IMPL(dst, src); +} + +} // namespace pto + +#endif diff --git a/include/pto/gpu/common/TSync.hpp b/include/pto/gpu/common/TSync.hpp new file mode 100644 index 00000000..f8816804 --- /dev/null +++ b/include/pto/gpu/common/TSync.hpp @@ -0,0 +1,28 @@ +/** +Copyright (c) 2026 Huawei Technologies Co., Ltd. +This program is free software, you can redistribute it and/or modify it under the terms and conditions of +CANN Open Software License Agreement Version 2.0 (the "License"). +Please refer to the License for details. You may not use this file except in compliance with the License. +THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +See LICENSE in the root of the software repository for the full text of the License. +*/ + +#ifndef PTO_GPU_COMMON_TSYNC_HPP +#define PTO_GPU_COMMON_TSYNC_HPP + +#include +#include + +namespace pto { + +template +PTO_INTERNAL void TSYNC_IMPL() +{ + (void)sizeof(OpCode); + // v1 GPU backend keeps advanced PTO event / pipeline semantics deferred. +} + +} // namespace pto + +#endif diff --git a/include/pto/gpu/common/tile_offsets.hpp b/include/pto/gpu/common/tile_offsets.hpp new file mode 100644 index 00000000..90747c69 --- /dev/null +++ b/include/pto/gpu/common/tile_offsets.hpp @@ -0,0 +1,81 @@ +/** +Copyright (c) 2026 Huawei Technologies Co., Ltd. +This program is free software, you can redistribute it and/or modify it under the terms and conditions of +CANN Open Software License Agreement Version 2.0 (the "License"). +Please refer to the License for details. You may not use this file except in compliance with the License. +THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +See LICENSE in the root of the software repository for the full text of the License. 
+*/ + +#ifndef PTO_GPU_COMMON_TILE_OFFSETS_HPP +#define PTO_GPU_COMMON_TILE_OFFSETS_HPP + +#include + +namespace pto::gpu { + +PTO_INTERNAL constexpr bool IsPowerOfTwo(std::size_t value) +{ + return value != 0 && ((value & (value - 1)) == 0); +} + +PTO_INTERNAL constexpr std::size_t PermuteSwizzleChunk(std::size_t chunk, std::size_t lane, std::size_t chunkCount) +{ + return IsPowerOfTwo(chunkCount) ? ((chunk ^ lane) & (chunkCount - 1)) : ((chunk + lane) % chunkCount); +} + +template +PTO_INTERNAL std::size_t GetTileElementOffsetGpuSwizzle(std::size_t r, std::size_t c) +{ + constexpr std::size_t swizzleRows = TileData::InnerRows; + constexpr std::size_t swizzleCols = TileData::InnerCols; + const std::size_t rowBlock = r / swizzleRows; + const std::size_t rowInBlock = r % swizzleRows; + const std::size_t chunk = c / swizzleCols; + const std::size_t colInChunk = c % swizzleCols; + const std::size_t chunksPerRow = TileData::Cols / swizzleCols; + const std::size_t permutedChunk = PermuteSwizzleChunk(chunk, rowInBlock % chunksPerRow, chunksPerRow); + return rowBlock * swizzleRows * TileData::Cols + rowInBlock * TileData::Cols + permutedChunk * swizzleCols + colInChunk; +} + +template +PTO_INTERNAL std::size_t GetTileElementOffsetSubfractals(std::size_t subTileR, std::size_t innerR, + std::size_t subTileC, std::size_t innerC) +{ + if constexpr (!TileData::isRowMajor && (TileData::SFractal == SLayout::RowMajor)) { + return subTileC * TileData::Rows * TileData::InnerCols + subTileR * TileData::InnerNumel + + innerR * TileData::InnerCols + innerC; + } else if constexpr (TileData::isRowMajor && (TileData::SFractal == SLayout::ColMajor)) { + return subTileR * TileData::Cols * TileData::InnerRows + subTileC * TileData::InnerNumel + + innerC * TileData::InnerRows + innerR; + } else { + return subTileR * TileData::Cols * TileData::InnerRows + subTileC * TileData::InnerNumel + + innerR * TileData::InnerCols + innerC; + } +} + +template +PTO_INTERNAL std::size_t GetTileElementOffsetPlain(std::size_t r, std::size_t c) +{ + if constexpr (TileData::isRowMajor) { + return r * TileData::Cols + c; + } + return c * TileData::Rows + r; +} + +template +PTO_INTERNAL std::size_t GetTileElementOffset(std::size_t r, std::size_t c) +{ + if constexpr (TileData::SFractal == SLayout::NoneBox) { + return GetTileElementOffsetPlain(r, c); + } else if constexpr (TileData::SFractal == SLayout::GpuSwizzle128B) { + return GetTileElementOffsetGpuSwizzle(r, c); + } + return GetTileElementOffsetSubfractals(r / TileData::InnerRows, r % TileData::InnerRows, + c / TileData::InnerCols, c % TileData::InnerCols); +} + +} // namespace pto::gpu + +#endif diff --git a/include/pto/gpu/sm100/README.md b/include/pto/gpu/sm100/README.md new file mode 100644 index 00000000..aaf26423 --- /dev/null +++ b/include/pto/gpu/sm100/README.md @@ -0,0 +1,24 @@ +# include/pto/gpu/sm100/ + +Architecture-specific PTO GPU specializations for **Blackwell**. + +## Role + +This directory is intended to hold per-SM overrides for instructions whose optimal implementation depends on: + +- MMA / tensor-core generation +- shared-memory banking and swizzle rules +- async copy behavior +- warpgroup scheduling details +- register pressure / occupancy trade-offs +- architecture-specific instructions or PTX variants + +## Expected contents later + +- instruction-level specializations (`TLoad`, `TStore`, `TAdd`, `TMatmul`, ...) 
+- tuning metadata +- tile-shape policy tables +- performance notes +- microkernel registry + +This folder is a scaffold for bring-up, not a live backend yet. diff --git a/include/pto/gpu/sm100/microkernels/README.md b/include/pto/gpu/sm100/microkernels/README.md new file mode 100644 index 00000000..3e0e1e1f --- /dev/null +++ b/include/pto/gpu/sm100/microkernels/README.md @@ -0,0 +1,22 @@ +# include/pto/gpu/sm100/microkernels/ + +Planned home for dedicated PTO GPU microkernels targeting **sm100**. + +The intent is to store instruction-family-specific kernels here, for example: + +- memory movement microkernels +- elementwise/vector microkernels +- reduction microkernels +- transpose / reshape helpers +- tensor-core / matmul kernels + +Recommended future organization: + +- `memory/` +- `elementwise/` +- `reduction/` +- `matrix/` +- `transform/` +- `sync/` + +Populate this directory only after the common GPU backend contract is defined. diff --git a/include/pto/gpu/sm100a/README.md b/include/pto/gpu/sm100a/README.md new file mode 100644 index 00000000..7200336e --- /dev/null +++ b/include/pto/gpu/sm100a/README.md @@ -0,0 +1,24 @@ +# include/pto/gpu/sm100a/ + +Architecture-specific PTO GPU specializations for **Blackwell architecture-conditional specialization**. + +## Role + +This directory is intended to hold per-SM overrides for instructions whose optimal implementation depends on: + +- MMA / tensor-core generation +- shared-memory banking and swizzle rules +- async copy behavior +- warpgroup scheduling details +- register pressure / occupancy trade-offs +- architecture-specific instructions or PTX variants + +## Expected contents later + +- instruction-level specializations (`TLoad`, `TStore`, `TAdd`, `TMatmul`, ...) +- tuning metadata +- tile-shape policy tables +- performance notes +- microkernel registry + +This folder is a scaffold for bring-up, not a live backend yet. diff --git a/include/pto/gpu/sm100a/microkernels/README.md b/include/pto/gpu/sm100a/microkernels/README.md new file mode 100644 index 00000000..ab2a6cb2 --- /dev/null +++ b/include/pto/gpu/sm100a/microkernels/README.md @@ -0,0 +1,22 @@ +# include/pto/gpu/sm100a/microkernels/ + +Planned home for dedicated PTO GPU microkernels targeting **sm100a**. + +The intent is to store instruction-family-specific kernels here, for example: + +- memory movement microkernels +- elementwise/vector microkernels +- reduction microkernels +- transpose / reshape helpers +- tensor-core / matmul kernels + +Recommended future organization: + +- `memory/` +- `elementwise/` +- `reduction/` +- `matrix/` +- `transform/` +- `sync/` + +Populate this directory only after the common GPU backend contract is defined. diff --git a/include/pto/gpu/sm103/README.md b/include/pto/gpu/sm103/README.md new file mode 100644 index 00000000..eb2fa400 --- /dev/null +++ b/include/pto/gpu/sm103/README.md @@ -0,0 +1,24 @@ +# include/pto/gpu/sm103/ + +Architecture-specific PTO GPU specializations for **GB300 / B300 generation**. + +## Role + +This directory is intended to hold per-SM overrides for instructions whose optimal implementation depends on: + +- MMA / tensor-core generation +- shared-memory banking and swizzle rules +- async copy behavior +- warpgroup scheduling details +- register pressure / occupancy trade-offs +- architecture-specific instructions or PTX variants + +## Expected contents later + +- instruction-level specializations (`TLoad`, `TStore`, `TAdd`, `TMatmul`, ...) 
+- tuning metadata +- tile-shape policy tables +- performance notes +- microkernel registry + +This folder is a scaffold for bring-up, not a live backend yet. diff --git a/include/pto/gpu/sm103/microkernels/README.md b/include/pto/gpu/sm103/microkernels/README.md new file mode 100644 index 00000000..ef686743 --- /dev/null +++ b/include/pto/gpu/sm103/microkernels/README.md @@ -0,0 +1,22 @@ +# include/pto/gpu/sm103/microkernels/ + +Planned home for dedicated PTO GPU microkernels targeting **sm103**. + +The intent is to store instruction-family-specific kernels here, for example: + +- memory movement microkernels +- elementwise/vector microkernels +- reduction microkernels +- transpose / reshape helpers +- tensor-core / matmul kernels + +Recommended future organization: + +- `memory/` +- `elementwise/` +- `reduction/` +- `matrix/` +- `transform/` +- `sync/` + +Populate this directory only after the common GPU backend contract is defined. diff --git a/include/pto/gpu/sm110/README.md b/include/pto/gpu/sm110/README.md new file mode 100644 index 00000000..7408a2bf --- /dev/null +++ b/include/pto/gpu/sm110/README.md @@ -0,0 +1,24 @@ +# include/pto/gpu/sm110/ + +Architecture-specific PTO GPU specializations for **Thor / Jetson T-series generation**. + +## Role + +This directory is intended to hold per-SM overrides for instructions whose optimal implementation depends on: + +- MMA / tensor-core generation +- shared-memory banking and swizzle rules +- async copy behavior +- warpgroup scheduling details +- register pressure / occupancy trade-offs +- architecture-specific instructions or PTX variants + +## Expected contents later + +- instruction-level specializations (`TLoad`, `TStore`, `TAdd`, `TMatmul`, ...) +- tuning metadata +- tile-shape policy tables +- performance notes +- microkernel registry + +This folder is a scaffold for bring-up, not a live backend yet. diff --git a/include/pto/gpu/sm110/microkernels/README.md b/include/pto/gpu/sm110/microkernels/README.md new file mode 100644 index 00000000..db61fa0f --- /dev/null +++ b/include/pto/gpu/sm110/microkernels/README.md @@ -0,0 +1,22 @@ +# include/pto/gpu/sm110/microkernels/ + +Planned home for dedicated PTO GPU microkernels targeting **sm110**. + +The intent is to store instruction-family-specific kernels here, for example: + +- memory movement microkernels +- elementwise/vector microkernels +- reduction microkernels +- transpose / reshape helpers +- tensor-core / matmul kernels + +Recommended future organization: + +- `memory/` +- `elementwise/` +- `reduction/` +- `matrix/` +- `transform/` +- `sync/` + +Populate this directory only after the common GPU backend contract is defined. diff --git a/include/pto/gpu/sm120/README.md b/include/pto/gpu/sm120/README.md new file mode 100644 index 00000000..9d33c453 --- /dev/null +++ b/include/pto/gpu/sm120/README.md @@ -0,0 +1,24 @@ +# include/pto/gpu/sm120/ + +Architecture-specific PTO GPU specializations for **Blackwell workstation / GeForce generation**. + +## Role + +This directory is intended to hold per-SM overrides for instructions whose optimal implementation depends on: + +- MMA / tensor-core generation +- shared-memory banking and swizzle rules +- async copy behavior +- warpgroup scheduling details +- register pressure / occupancy trade-offs +- architecture-specific instructions or PTX variants + +## Expected contents later + +- instruction-level specializations (`TLoad`, `TStore`, `TAdd`, `TMatmul`, ...) 
+- tuning metadata +- tile-shape policy tables +- performance notes +- microkernel registry + +This folder is a scaffold for bring-up, not a live backend yet. diff --git a/include/pto/gpu/sm120/microkernels/README.md b/include/pto/gpu/sm120/microkernels/README.md new file mode 100644 index 00000000..9c8e0f35 --- /dev/null +++ b/include/pto/gpu/sm120/microkernels/README.md @@ -0,0 +1,22 @@ +# include/pto/gpu/sm120/microkernels/ + +Planned home for dedicated PTO GPU microkernels targeting **sm120**. + +The intent is to store instruction-family-specific kernels here, for example: + +- memory movement microkernels +- elementwise/vector microkernels +- reduction microkernels +- transpose / reshape helpers +- tensor-core / matmul kernels + +Recommended future organization: + +- `memory/` +- `elementwise/` +- `reduction/` +- `matrix/` +- `transform/` +- `sync/` + +Populate this directory only after the common GPU backend contract is defined. diff --git a/include/pto/gpu/sm121/README.md b/include/pto/gpu/sm121/README.md new file mode 100644 index 00000000..e46c5840 --- /dev/null +++ b/include/pto/gpu/sm121/README.md @@ -0,0 +1,24 @@ +# include/pto/gpu/sm121/ + +Architecture-specific PTO GPU specializations for **GB10 / DGX Spark**. + +## Role + +This directory is intended to hold per-SM overrides for instructions whose optimal implementation depends on: + +- MMA / tensor-core generation +- shared-memory banking and swizzle rules +- async copy behavior +- warpgroup scheduling details +- register pressure / occupancy trade-offs +- architecture-specific instructions or PTX variants + +## Expected contents later + +- instruction-level specializations (`TLoad`, `TStore`, `TAdd`, `TMatmul`, ...) +- tuning metadata +- tile-shape policy tables +- performance notes +- microkernel registry + +This folder is a scaffold for bring-up, not a live backend yet. diff --git a/include/pto/gpu/sm121/TMatmul.hpp b/include/pto/gpu/sm121/TMatmul.hpp new file mode 100644 index 00000000..7e1ba10c --- /dev/null +++ b/include/pto/gpu/sm121/TMatmul.hpp @@ -0,0 +1,295 @@ +/** +Copyright (c) 2026 Huawei Technologies Co., Ltd. +This program is free software, you can redistribute it and/or modify it under the terms and conditions of +CANN Open Software License Agreement Version 2.0 (the "License"). +Please refer to the License for details. You may not use this file except in compliance with the License. +THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +See LICENSE in the root of the software repository for the full text of the License. 
+*/ + +#ifndef PTO_GPU_SM121_TMATMUL_HPP +#define PTO_GPU_SM121_TMATMUL_HPP + +#include +#include +#include +#include "pto/gpu/common/tile_offsets.hpp" + +namespace pto::gpu::sm121 { + +namespace wmma = nvcuda::wmma; + +constexpr int kMmaM = 16; +constexpr int kMmaN = 16; +constexpr int kMmaK = 16; + +template +PTO_INTERNAL float ToAccumFloat(T value) +{ + return static_cast(value); +} + +template <> +PTO_INTERNAL float ToAccumFloat(half value) +{ + return __half2float(value); +} + +template <> +PTO_INTERNAL float ToAccumFloat(bfloat16_t value) +{ + return __bfloat162float(value); +} + +PTO_INTERNAL float InlinePtxFma(float a, float b, float c) +{ + float out; + asm volatile("fma.rn.f32 %0, %1, %2, %3;" : "=f"(out) : "f"(a), "f"(b), "f"(c)); + return out; +} + +PTO_INTERNAL unsigned LinearThreadId() +{ + return threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z); +} + +PTO_INTERNAL unsigned ThreadsPerBlock() +{ + return blockDim.x * blockDim.y * blockDim.z; +} + +template +PTO_INTERNAL bool CanUseSm121TensorCoreCore(TileAccOut &cOutMatrix, TileLeft &aMatrix, TileRight &bMatrix) +{ + using CType = typename TileAccOut::DType; + using AType = typename TileLeft::DType; + using BType = typename TileRight::DType; + + constexpr bool supportedTypes = + std::is_same_v && + ((std::is_same_v && std::is_same_v) || + (std::is_same_v && std::is_same_v)); + + if constexpr (!supportedTypes) { + (void)cOutMatrix; + (void)aMatrix; + (void)bMatrix; + return false; + } else { + if constexpr (!TileLeft::isRowMajor || !TileRight::isRowMajor || !TileAccOut::isRowMajor) { + return false; + } + + const uint16_t m = aMatrix.GetValidRow(); + const uint16_t k = aMatrix.GetValidCol(); + const uint16_t n = bMatrix.GetValidCol(); + if (k != bMatrix.GetValidRow()) { + return false; + } + if ((m % kMmaM) != 0 || (n % kMmaN) != 0 || (k % kMmaK) != 0) { + return false; + } + return ThreadsPerBlock() >= warpSize; + } +} + +template +PTO_INTERNAL bool TryTensorCoreTMATMULCore(TileAccOut &cOutMatrix, TileLeft &aMatrix, TileRight &bMatrix, + TileAccIn *cInMatrix = nullptr, TileBias *biasMatrix = nullptr) +{ + using CType = typename TileAccOut::DType; + using AType = typename TileLeft::DType; + using BType = typename TileRight::DType; + + constexpr bool supportedTypes = + std::is_same_v && + ((std::is_same_v && std::is_same_v) || + (std::is_same_v && std::is_same_v)); + + if constexpr (!supportedTypes) { + (void)cOutMatrix; + (void)aMatrix; + (void)bMatrix; + (void)cInMatrix; + (void)biasMatrix; + return false; + } else { + if (!CanUseSm121TensorCoreCore(cOutMatrix, aMatrix, bMatrix)) { + return false; + } + if constexpr (UseAcc) { + if constexpr (!std::is_same_v || !TileAccIn::isRowMajor) { + return false; + } + } + if constexpr (UseBias) { + if constexpr (!std::is_same_v || (TileBias::Rows != 1) || + !TileBias::isRowMajor) { + return false; + } + } + + const unsigned linearTid = LinearThreadId(); + const unsigned warpId = linearTid / warpSize; + if (warpId != 0) { + return true; + } + + const uint16_t m = aMatrix.GetValidRow(); + const uint16_t k = aMatrix.GetValidCol(); + const uint16_t n = bMatrix.GetValidCol(); + const int lda = TileLeft::Cols; + const int ldb = TileRight::Cols; + const int ldc = TileAccOut::Cols; + + using FragA = wmma::fragment; + using FragB = wmma::fragment; + using FragC = wmma::fragment; + + for (uint16_t mBase = 0; mBase < m; mBase += kMmaM) { + for (uint16_t nBase = 0; nBase < n; nBase += kMmaN) { + FragC cFrag; + if constexpr (UseAcc) { + float *cInPtr = cInMatrix->data() + 
gpu::GetTileElementOffset(mBase, nBase); + wmma::load_matrix_sync(cFrag, cInPtr, ldc, wmma::mem_row_major); + } else { + wmma::fill_fragment(cFrag, 0.0f); + } + + for (uint16_t kBase = 0; kBase < k; kBase += kMmaK) { + FragA aFrag; + FragB bFrag; + AType *aPtr = aMatrix.data() + gpu::GetTileElementOffset(mBase, kBase); + BType *bPtr = bMatrix.data() + gpu::GetTileElementOffset(kBase, nBase); + wmma::load_matrix_sync(aFrag, aPtr, lda); + wmma::load_matrix_sync(bFrag, bPtr, ldb); + wmma::mma_sync(cFrag, aFrag, bFrag, cFrag); + } + + float *cOutPtr = cOutMatrix.data() + gpu::GetTileElementOffset(mBase, nBase); + wmma::store_matrix_sync(cOutPtr, cFrag, ldc, wmma::mem_row_major); + + if constexpr (UseBias) { + __syncwarp(); + if ((linearTid & (warpSize - 1)) == 0) { + for (uint16_t row = 0; row < kMmaM; ++row) { + for (uint16_t col = 0; col < kMmaN; ++col) { + const std::size_t outIdx = row * ldc + col; + const std::size_t biasIdx = gpu::GetTileElementOffset(0, nBase + col); + cOutPtr[outIdx] += biasMatrix->data()[biasIdx]; + } + } + } + __syncwarp(); + } + } + } + return true; + } +} + +template +PTO_INTERNAL bool TryTensorCoreTMATMUL(TileAcc &cMatrix, TileLeft &aMatrix, TileRight &bMatrix) +{ + return TryTensorCoreTMATMULCore(cMatrix, aMatrix, bMatrix); +} + +template +PTO_INTERNAL bool TryTensorCoreTMATMULAcc(TileAcc &cOutMatrix, TileAcc &cInMatrix, TileLeft &aMatrix, TileRight &bMatrix) +{ + return TryTensorCoreTMATMULCore( + cOutMatrix, aMatrix, bMatrix, &cInMatrix, static_cast(nullptr)); +} + +template +PTO_INTERNAL bool TryTensorCoreTMATMULAcc(TileAcc &cMatrix, TileLeft &aMatrix, TileRight &bMatrix) +{ + return TryTensorCoreTMATMULCore( + cMatrix, aMatrix, bMatrix, &cMatrix, static_cast(nullptr)); +} + +template +PTO_INTERNAL bool TryTensorCoreTMATMULBias(TileAcc &cMatrix, TileLeft &aMatrix, TileRight &bMatrix, TileBias &biasMatrix) +{ + return TryTensorCoreTMATMULCore( + cMatrix, aMatrix, bMatrix, static_cast(nullptr), &biasMatrix); +} + +template +PTO_INTERNAL bool TryInlinePtxF32TMATMUL(TileAcc &cMatrix, TileLeft &aMatrix, TileRight &bMatrix) +{ + using CType = typename TileAcc::DType; + using AType = typename TileLeft::DType; + using BType = typename TileRight::DType; + + constexpr bool supportedTypes = std::is_same_v && + ((std::is_same_v && std::is_same_v) || + (std::is_same_v && std::is_same_v) || + (std::is_same_v && std::is_same_v)); + if constexpr (!supportedTypes) { + (void)cMatrix; + (void)aMatrix; + (void)bMatrix; + return false; + } else { + const unsigned linearTid = LinearThreadId(); + if (linearTid != 0) { + return ThreadsPerBlock() >= 1; + } + + const uint16_t m = aMatrix.GetValidRow(); + const uint16_t k = aMatrix.GetValidCol(); + const uint16_t n = bMatrix.GetValidCol(); + if (k != bMatrix.GetValidRow()) { + return false; + } + +#pragma unroll 1 + for (uint16_t row = 0; row < m; ++row) { + for (uint16_t col = 0; col < n; ++col) { + float acc = 0.0f; +#pragma unroll 4 + for (uint16_t kk = 0; kk < k; ++kk) { + const std::size_t aIdx = gpu::GetTileElementOffset(row, kk); + const std::size_t bIdx = gpu::GetTileElementOffset(kk, col); + acc = InlinePtxFma(ToAccumFloat(aMatrix.data()[aIdx]), ToAccumFloat(bMatrix.data()[bIdx]), acc); + } + cMatrix.data()[gpu::GetTileElementOffset(row, col)] = acc; + } + } + return true; + } +} + +template +PTO_INTERNAL bool TrySm121TMATMUL(TileAcc &cMatrix, TileLeft &aMatrix, TileRight &bMatrix) +{ + if (TryTensorCoreTMATMUL(cMatrix, aMatrix, bMatrix)) { + return true; + } + return TryInlinePtxF32TMATMUL(cMatrix, aMatrix, bMatrix); +} + 
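+// Note on dispatch order: TrySm121TMATMUL above prefers the warp-level
+// tensor-core (WMMA) path and only falls back to the single-thread inline-PTX
+// FMA loop when the tile types, layouts, or shapes are not supported. The
+// accumulate / bias wrappers below attempt only the tensor-core path and
+// return false on unsupported inputs so the caller can fall back to the
+// generic implementation.
+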
+template +PTO_INTERNAL bool TrySm121TMATMULAcc(TileAcc &cOutMatrix, TileAcc &cInMatrix, TileLeft &aMatrix, TileRight &bMatrix) +{ + return TryTensorCoreTMATMULAcc(cOutMatrix, cInMatrix, aMatrix, bMatrix); +} + +template +PTO_INTERNAL bool TrySm121TMATMULAcc(TileAcc &cMatrix, TileLeft &aMatrix, TileRight &bMatrix) +{ + return TryTensorCoreTMATMULAcc(cMatrix, aMatrix, bMatrix); +} + +template +PTO_INTERNAL bool TrySm121TMATMULBias(TileAcc &cMatrix, TileLeft &aMatrix, TileRight &bMatrix, TileBias &biasMatrix) +{ + return TryTensorCoreTMATMULBias(cMatrix, aMatrix, bMatrix, biasMatrix); +} + +} // namespace pto::gpu::sm121 + +#endif diff --git a/include/pto/gpu/sm121/arch.hpp b/include/pto/gpu/sm121/arch.hpp new file mode 100644 index 00000000..0167fb01 --- /dev/null +++ b/include/pto/gpu/sm121/arch.hpp @@ -0,0 +1,24 @@ +/** +Copyright (c) 2026 Huawei Technologies Co., Ltd. +This program is free software, you can redistribute it and/or modify it under the terms and conditions of +CANN Open Software License Agreement Version 2.0 (the "License"). +Please refer to the License for details. You may not use this file except in compliance with the License. +THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +See LICENSE in the root of the software repository for the full text of the License. +*/ + +#ifndef PTO_GPU_SM121_ARCH_HPP +#define PTO_GPU_SM121_ARCH_HPP + +namespace pto::gpu::sm121 { + +constexpr int kComputeCapability = 121; +constexpr bool kPreferInlinePtx = true; +constexpr unsigned kDefaultWarpTileM = 16; +constexpr unsigned kDefaultWarpTileN = 16; +constexpr unsigned kDefaultWarpTileK = 16; + +} // namespace pto::gpu::sm121 + +#endif diff --git a/include/pto/gpu/sm121/microkernels/README.md b/include/pto/gpu/sm121/microkernels/README.md new file mode 100644 index 00000000..3b89e772 --- /dev/null +++ b/include/pto/gpu/sm121/microkernels/README.md @@ -0,0 +1,22 @@ +# include/pto/gpu/sm121/microkernels/ + +Planned home for dedicated PTO GPU microkernels targeting **sm121**. + +The intent is to store instruction-family-specific kernels here, for example: + +- memory movement microkernels +- elementwise/vector microkernels +- reduction microkernels +- transpose / reshape helpers +- tensor-core / matmul kernels + +Recommended future organization: + +- `memory/` +- `elementwise/` +- `reduction/` +- `matrix/` +- `transform/` +- `sync/` + +Populate this directory only after the common GPU backend contract is defined. diff --git a/include/pto/gpu/sm70/README.md b/include/pto/gpu/sm70/README.md new file mode 100644 index 00000000..1f51ded9 --- /dev/null +++ b/include/pto/gpu/sm70/README.md @@ -0,0 +1,24 @@ +# include/pto/gpu/sm70/ + +Architecture-specific PTO GPU specializations for **Volta**. + +## Role + +This directory is intended to hold per-SM overrides for instructions whose optimal implementation depends on: + +- MMA / tensor-core generation +- shared-memory banking and swizzle rules +- async copy behavior +- warpgroup scheduling details +- register pressure / occupancy trade-offs +- architecture-specific instructions or PTX variants + +## Expected contents later + +- instruction-level specializations (`TLoad`, `TStore`, `TAdd`, `TMatmul`, ...) +- tuning metadata +- tile-shape policy tables +- performance notes +- microkernel registry + +This folder is a scaffold for bring-up, not a live backend yet. 
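+
+As a rough illustration of how a per-SM override header can stay safe to include on every target, the sketch below gates a Volta-specific capability flag behind a compute-capability check. This is an assumption about the eventual structure, not existing code: the `pto::gpu::sm70` namespace and `kHasWmmaHalf16x16x16` constant are hypothetical and simply mirror the constant style used in `sm121/arch.hpp`; `__CUDA_ARCH__` is the standard NVCC device-compilation macro.
+
+```cpp
+// Illustrative guard only (hypothetical names, not existing code).
+namespace pto::gpu::sm70 {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700)
+constexpr bool kHasWmmaHalf16x16x16 = true;   // Volta+ device pass: WMMA half tiles available
+#else
+constexpr bool kHasWmmaHalf16x16x16 = false;  // host pass or older targets: use the common path
+#endif
+} // namespace pto::gpu::sm70
+```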
diff --git a/include/pto/gpu/sm70/microkernels/README.md b/include/pto/gpu/sm70/microkernels/README.md new file mode 100644 index 00000000..a902c0fd --- /dev/null +++ b/include/pto/gpu/sm70/microkernels/README.md @@ -0,0 +1,22 @@ +# include/pto/gpu/sm70/microkernels/ + +Planned home for dedicated PTO GPU microkernels targeting **sm70**. + +The intent is to store instruction-family-specific kernels here, for example: + +- memory movement microkernels +- elementwise/vector microkernels +- reduction microkernels +- transpose / reshape helpers +- tensor-core / matmul kernels + +Recommended future organization: + +- `memory/` +- `elementwise/` +- `reduction/` +- `matrix/` +- `transform/` +- `sync/` + +Populate this directory only after the common GPU backend contract is defined. diff --git a/include/pto/gpu/sm75/README.md b/include/pto/gpu/sm75/README.md new file mode 100644 index 00000000..2bc42975 --- /dev/null +++ b/include/pto/gpu/sm75/README.md @@ -0,0 +1,24 @@ +# include/pto/gpu/sm75/ + +Architecture-specific PTO GPU specializations for **Turing**. + +## Role + +This directory is intended to hold per-SM overrides for instructions whose optimal implementation depends on: + +- MMA / tensor-core generation +- shared-memory banking and swizzle rules +- async copy behavior +- warpgroup scheduling details +- register pressure / occupancy trade-offs +- architecture-specific instructions or PTX variants + +## Expected contents later + +- instruction-level specializations (`TLoad`, `TStore`, `TAdd`, `TMatmul`, ...) +- tuning metadata +- tile-shape policy tables +- performance notes +- microkernel registry + +This folder is a scaffold for bring-up, not a live backend yet. diff --git a/include/pto/gpu/sm75/microkernels/README.md b/include/pto/gpu/sm75/microkernels/README.md new file mode 100644 index 00000000..f37638b2 --- /dev/null +++ b/include/pto/gpu/sm75/microkernels/README.md @@ -0,0 +1,22 @@ +# include/pto/gpu/sm75/microkernels/ + +Planned home for dedicated PTO GPU microkernels targeting **sm75**. + +The intent is to store instruction-family-specific kernels here, for example: + +- memory movement microkernels +- elementwise/vector microkernels +- reduction microkernels +- transpose / reshape helpers +- tensor-core / matmul kernels + +Recommended future organization: + +- `memory/` +- `elementwise/` +- `reduction/` +- `matrix/` +- `transform/` +- `sync/` + +Populate this directory only after the common GPU backend contract is defined. diff --git a/include/pto/gpu/sm80/README.md b/include/pto/gpu/sm80/README.md new file mode 100644 index 00000000..40375e0a --- /dev/null +++ b/include/pto/gpu/sm80/README.md @@ -0,0 +1,24 @@ +# include/pto/gpu/sm80/ + +Architecture-specific PTO GPU specializations for **Ampere datacenter**. + +## Role + +This directory is intended to hold per-SM overrides for instructions whose optimal implementation depends on: + +- MMA / tensor-core generation +- shared-memory banking and swizzle rules +- async copy behavior +- warpgroup scheduling details +- register pressure / occupancy trade-offs +- architecture-specific instructions or PTX variants + +## Expected contents later + +- instruction-level specializations (`TLoad`, `TStore`, `TAdd`, `TMatmul`, ...) +- tuning metadata +- tile-shape policy tables +- performance notes +- microkernel registry + +This folder is a scaffold for bring-up, not a live backend yet. 
diff --git a/include/pto/gpu/sm80/microkernels/README.md b/include/pto/gpu/sm80/microkernels/README.md new file mode 100644 index 00000000..952c513e --- /dev/null +++ b/include/pto/gpu/sm80/microkernels/README.md @@ -0,0 +1,22 @@ +# include/pto/gpu/sm80/microkernels/ + +Planned home for dedicated PTO GPU microkernels targeting **sm80**. + +The intent is to store instruction-family-specific kernels here, for example: + +- memory movement microkernels +- elementwise/vector microkernels +- reduction microkernels +- transpose / reshape helpers +- tensor-core / matmul kernels + +Recommended future organization: + +- `memory/` +- `elementwise/` +- `reduction/` +- `matrix/` +- `transform/` +- `sync/` + +Populate this directory only after the common GPU backend contract is defined. diff --git a/include/pto/gpu/sm86/README.md b/include/pto/gpu/sm86/README.md new file mode 100644 index 00000000..058dc736 --- /dev/null +++ b/include/pto/gpu/sm86/README.md @@ -0,0 +1,24 @@ +# include/pto/gpu/sm86/ + +Architecture-specific PTO GPU specializations for **Ampere client/workstation**. + +## Role + +This directory is intended to hold per-SM overrides for instructions whose optimal implementation depends on: + +- MMA / tensor-core generation +- shared-memory banking and swizzle rules +- async copy behavior +- warpgroup scheduling details +- register pressure / occupancy trade-offs +- architecture-specific instructions or PTX variants + +## Expected contents later + +- instruction-level specializations (`TLoad`, `TStore`, `TAdd`, `TMatmul`, ...) +- tuning metadata +- tile-shape policy tables +- performance notes +- microkernel registry + +This folder is a scaffold for bring-up, not a live backend yet. diff --git a/include/pto/gpu/sm86/microkernels/README.md b/include/pto/gpu/sm86/microkernels/README.md new file mode 100644 index 00000000..d2dacd8c --- /dev/null +++ b/include/pto/gpu/sm86/microkernels/README.md @@ -0,0 +1,22 @@ +# include/pto/gpu/sm86/microkernels/ + +Planned home for dedicated PTO GPU microkernels targeting **sm86**. + +The intent is to store instruction-family-specific kernels here, for example: + +- memory movement microkernels +- elementwise/vector microkernels +- reduction microkernels +- transpose / reshape helpers +- tensor-core / matmul kernels + +Recommended future organization: + +- `memory/` +- `elementwise/` +- `reduction/` +- `matrix/` +- `transform/` +- `sync/` + +Populate this directory only after the common GPU backend contract is defined. diff --git a/include/pto/gpu/sm87/README.md b/include/pto/gpu/sm87/README.md new file mode 100644 index 00000000..769a7744 --- /dev/null +++ b/include/pto/gpu/sm87/README.md @@ -0,0 +1,24 @@ +# include/pto/gpu/sm87/ + +Architecture-specific PTO GPU specializations for **Orin**. + +## Role + +This directory is intended to hold per-SM overrides for instructions whose optimal implementation depends on: + +- MMA / tensor-core generation +- shared-memory banking and swizzle rules +- async copy behavior +- warpgroup scheduling details +- register pressure / occupancy trade-offs +- architecture-specific instructions or PTX variants + +## Expected contents later + +- instruction-level specializations (`TLoad`, `TStore`, `TAdd`, `TMatmul`, ...) +- tuning metadata +- tile-shape policy tables +- performance notes +- microkernel registry + +This folder is a scaffold for bring-up, not a live backend yet. 
diff --git a/include/pto/gpu/sm87/microkernels/README.md b/include/pto/gpu/sm87/microkernels/README.md new file mode 100644 index 00000000..0cc59b97 --- /dev/null +++ b/include/pto/gpu/sm87/microkernels/README.md @@ -0,0 +1,22 @@ +# include/pto/gpu/sm87/microkernels/ + +Planned home for dedicated PTO GPU microkernels targeting **sm87**. + +The intent is to store instruction-family-specific kernels here, for example: + +- memory movement microkernels +- elementwise/vector microkernels +- reduction microkernels +- transpose / reshape helpers +- tensor-core / matmul kernels + +Recommended future organization: + +- `memory/` +- `elementwise/` +- `reduction/` +- `matrix/` +- `transform/` +- `sync/` + +Populate this directory only after the common GPU backend contract is defined. diff --git a/include/pto/gpu/sm89/README.md b/include/pto/gpu/sm89/README.md new file mode 100644 index 00000000..d5375e7f --- /dev/null +++ b/include/pto/gpu/sm89/README.md @@ -0,0 +1,24 @@ +# include/pto/gpu/sm89/ + +Architecture-specific PTO GPU specializations for **Ada**. + +## Role + +This directory is intended to hold per-SM overrides for instructions whose optimal implementation depends on: + +- MMA / tensor-core generation +- shared-memory banking and swizzle rules +- async copy behavior +- warpgroup scheduling details +- register pressure / occupancy trade-offs +- architecture-specific instructions or PTX variants + +## Expected contents later + +- instruction-level specializations (`TLoad`, `TStore`, `TAdd`, `TMatmul`, ...) +- tuning metadata +- tile-shape policy tables +- performance notes +- microkernel registry + +This folder is a scaffold for bring-up, not a live backend yet. diff --git a/include/pto/gpu/sm89/microkernels/README.md b/include/pto/gpu/sm89/microkernels/README.md new file mode 100644 index 00000000..6cc6e809 --- /dev/null +++ b/include/pto/gpu/sm89/microkernels/README.md @@ -0,0 +1,22 @@ +# include/pto/gpu/sm89/microkernels/ + +Planned home for dedicated PTO GPU microkernels targeting **sm89**. + +The intent is to store instruction-family-specific kernels here, for example: + +- memory movement microkernels +- elementwise/vector microkernels +- reduction microkernels +- transpose / reshape helpers +- tensor-core / matmul kernels + +Recommended future organization: + +- `memory/` +- `elementwise/` +- `reduction/` +- `matrix/` +- `transform/` +- `sync/` + +Populate this directory only after the common GPU backend contract is defined. diff --git a/include/pto/gpu/sm90/README.md b/include/pto/gpu/sm90/README.md new file mode 100644 index 00000000..2cc79f3a --- /dev/null +++ b/include/pto/gpu/sm90/README.md @@ -0,0 +1,24 @@ +# include/pto/gpu/sm90/ + +Architecture-specific PTO GPU specializations for **Hopper**. + +## Role + +This directory is intended to hold per-SM overrides for instructions whose optimal implementation depends on: + +- MMA / tensor-core generation +- shared-memory banking and swizzle rules +- async copy behavior +- warpgroup scheduling details +- register pressure / occupancy trade-offs +- architecture-specific instructions or PTX variants + +## Expected contents later + +- instruction-level specializations (`TLoad`, `TStore`, `TAdd`, `TMatmul`, ...) +- tuning metadata +- tile-shape policy tables +- performance notes +- microkernel registry + +This folder is a scaffold for bring-up, not a live backend yet. 
diff --git a/include/pto/gpu/sm90/microkernels/README.md b/include/pto/gpu/sm90/microkernels/README.md new file mode 100644 index 00000000..56456ea7 --- /dev/null +++ b/include/pto/gpu/sm90/microkernels/README.md @@ -0,0 +1,22 @@ +# include/pto/gpu/sm90/microkernels/ + +Planned home for dedicated PTO GPU microkernels targeting **sm90**. + +The intent is to store instruction-family-specific kernels here, for example: + +- memory movement microkernels +- elementwise/vector microkernels +- reduction microkernels +- transpose / reshape helpers +- tensor-core / matmul kernels + +Recommended future organization: + +- `memory/` +- `elementwise/` +- `reduction/` +- `matrix/` +- `transform/` +- `sync/` + +Populate this directory only after the common GPU backend contract is defined. diff --git a/include/pto/gpu/sm90a/README.md b/include/pto/gpu/sm90a/README.md new file mode 100644 index 00000000..575d25ac --- /dev/null +++ b/include/pto/gpu/sm90a/README.md @@ -0,0 +1,24 @@ +# include/pto/gpu/sm90a/ + +Architecture-specific PTO GPU specializations for **Hopper architecture-conditional specialization**. + +## Role + +This directory is intended to hold per-SM overrides for instructions whose optimal implementation depends on: + +- MMA / tensor-core generation +- shared-memory banking and swizzle rules +- async copy behavior +- warpgroup scheduling details +- register pressure / occupancy trade-offs +- architecture-specific instructions or PTX variants + +## Expected contents later + +- instruction-level specializations (`TLoad`, `TStore`, `TAdd`, `TMatmul`, ...) +- tuning metadata +- tile-shape policy tables +- performance notes +- microkernel registry + +This folder is a scaffold for bring-up, not a live backend yet. diff --git a/include/pto/gpu/sm90a/microkernels/README.md b/include/pto/gpu/sm90a/microkernels/README.md new file mode 100644 index 00000000..4bd8f70f --- /dev/null +++ b/include/pto/gpu/sm90a/microkernels/README.md @@ -0,0 +1,22 @@ +# include/pto/gpu/sm90a/microkernels/ + +Planned home for dedicated PTO GPU microkernels targeting **sm90a**. + +The intent is to store instruction-family-specific kernels here, for example: + +- memory movement microkernels +- elementwise/vector microkernels +- reduction microkernels +- transpose / reshape helpers +- tensor-core / matmul kernels + +Recommended future organization: + +- `memory/` +- `elementwise/` +- `reduction/` +- `matrix/` +- `transform/` +- `sync/` + +Populate this directory only after the common GPU backend contract is defined. diff --git a/include/pto/pto-inst.hpp b/include/pto/pto-inst.hpp index 7eac6058..dd58777e 100644 --- a/include/pto/pto-inst.hpp +++ b/include/pto/pto-inst.hpp @@ -11,13 +11,17 @@ See LICENSE in the root of the software repository for the full text of the Lice #ifndef PTO_INST_HPP #define PTO_INST_HPP +#if defined(__CUDACC__) +#include "pto/common/gpu_stub.hpp" +#endif + #include #if defined(__CPU_SIM) || defined(__COSTMODEL) #include "pto/common/cpu_stub.hpp" #endif #include -#if defined(__CPU_SIM) || defined(__CCE_AICORE__) || defined(__COSTMODEL) +#if defined(__CPU_SIM) || defined(__CCE_AICORE__) || defined(__COSTMODEL) || defined(PTO_GPU_BACKEND) #include #include #include "pto/common/pto_instr.hpp" diff --git a/kernels/manual/gpu/README.md b/kernels/manual/gpu/README.md new file mode 100644 index 00000000..b535f2b6 --- /dev/null +++ b/kernels/manual/gpu/README.md @@ -0,0 +1,17 @@ +# kernels/manual/gpu/ + +Reserved for hand-tuned NVIDIA GPU kernel examples built on top of the future PTO GPU backend. 
+ +Intended use: + +- end-to-end operator examples +- architecture-specific tuning case studies +- performance reference kernels +- launch configuration experiments + +Suggested first examples once the backend exists: + +- GEMM +- Flash Attention +- elementwise fusion +- reduction / softmax diff --git a/tests/README.md b/tests/README.md index 119dffca..1872b32b 100644 --- a/tests/README.md +++ b/tests/README.md @@ -11,6 +11,8 @@ Tests and examples for PTO Tile Lib, covering both CPU simulation and NPU (inclu - `README.md`: Script usage - `cpu/`: CPU-side ST tests (gtest + CMake) - `cpu/st/`: CPU ST projects and testcase data generation scripts +- `gpu/`: NVIDIA GPU backend tests + - `gpu/st/`: standalone CUDA smoke / correctness tests for the GPU backend - `npu/`: NPU-side ST tests split by SoC - `npu/a2a3/src/st/`: A2/A3 compute ST - `npu/a2a3/comm/st/`: A2/A3 communication ST diff --git a/tests/gpu/README.md b/tests/gpu/README.md new file mode 100644 index 00000000..47465ba0 --- /dev/null +++ b/tests/gpu/README.md @@ -0,0 +1,18 @@ +# tests/gpu/ + +Reserved for future PTO NVIDIA GPU backend tests. + +Planned test layers: + +- compile-only smoke tests +- instruction-level correctness tests +- CPU-vs-GPU oracle tests +- architecture-specific regression tests +- microbenchmarks / performance guards + +Suggested future split: + +- `common/` +- `sm80/` +- `sm90/` +- `sm121/` diff --git a/tests/gpu/st/CMakeLists.txt b/tests/gpu/st/CMakeLists.txt new file mode 100644 index 00000000..26bd1157 --- /dev/null +++ b/tests/gpu/st/CMakeLists.txt @@ -0,0 +1,14 @@ +cmake_minimum_required(VERSION 3.24) +project(pto_gpu_st LANGUAGES CXX CUDA) + +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CUDA_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CUDA_STANDARD_REQUIRED ON) +set(CMAKE_CUDA_ARCHITECTURES 121) + +enable_testing() +find_package(Threads REQUIRED) +find_package(CUDAToolkit REQUIRED) + +add_subdirectory(testcase) diff --git a/tests/gpu/st/README.md b/tests/gpu/st/README.md new file mode 100644 index 00000000..9d428e8e --- /dev/null +++ b/tests/gpu/st/README.md @@ -0,0 +1,60 @@ +# tests/gpu/st/ + +Standalone CUDA smoke / correctness tests for the PTO NVIDIA GPU backend. 
+ +## What it covers today + +Current executables: + +- `pto_gpu_core` — correctness / smoke checks +- `pto_gpu_perf` — lightweight GB10 matmul microbench + +Current checks: + +- `TLOAD` ND row-major path +- `TLOAD` DN col-major path +- `TSTORE` ND row-major path +- `TSTORE` DN col-major path +- `TADD` correctness against a host reference +- GPU swizzle physical-layout smoke test +- GPU swizzle `TLOAD`/`TSTORE` round-trip smoke test +- GPU swizzle `TADD` round-trip smoke test +- `sm121` float `TMATMUL` inline-PTX FMA fallback smoke test +- `sm121` half `TMATMUL` tensor-core MMA tiled-path smoke test +- `sm121` bfloat16 `TMATMUL` tensor-core MMA tiled-path smoke test +- `sm121` half `TMATMUL` larger 64x64x64 tensor-core correctness test +- `sm121` bfloat16 `TMATMUL` larger 64x64x64 tensor-core correctness test +- `sm121` half `TMATMUL_ACC` tensor-core fast-path smoke test +- `sm121` bfloat16 `TMATMUL_BIAS` tensor-core fast-path smoke test +- `sm121` half `TMATMUL_MX` API-path smoke test +- `sm121` half `TGEMV_MX` API-path smoke test + +## Build + +```bash +cmake -S tests/gpu/st -B build/tests/gpu-st -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DCMAKE_CUDA_ARCHITECTURES=121 +cmake --build build/tests/gpu-st -j +``` + +## Run + +Correctness lane: + +```bash +cd build/tests/gpu-st +ctest --output-on-failure +``` + +Perf microbench: + +```bash +./build/tests/gpu-st/testcase/pto_gpu_perf/pto_gpu_perf +``` + +## Notes + +- This lane is intentionally lightweight and self-contained. +- It uses CTest directly instead of the repo's CPU/NPU test harnesses. +- `sm121` now has a real warp-level tensor-core MMA path for half/bfloat16 using 16x16x16 MMA tiles, composed into larger 16-aligned matrix tiles in software. +- `TMATMUL_ACC` and `TMATMUL_BIAS` now reuse the same tensor-core tiled path on supported `sm121` shapes. +- float matmul currently uses the lighter inline-PTX FMA fallback path. 
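+
+To iterate on just the correctness lane, the registered CTest name can be selected directly (a usage sketch; `pto_gpu_core` is the test name registered by `pto_gpu_st(pto_gpu_core)` in the testcase CMake below):
+
+```bash
+cd build/tests/gpu-st
+ctest -R pto_gpu_core --output-on-failure
+```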
diff --git a/tests/gpu/st/testcase/CMakeLists.txt b/tests/gpu/st/testcase/CMakeLists.txt new file mode 100644 index 00000000..4e343194 --- /dev/null +++ b/tests/gpu/st/testcase/CMakeLists.txt @@ -0,0 +1,30 @@ + +function(pto_gpu_tool NAME) + add_executable(${NAME} ${NAME}.cu) + target_include_directories(${NAME} PRIVATE + ${PROJECT_SOURCE_DIR}/../../../include + ${PROJECT_SOURCE_DIR}/../../common + ) + target_compile_options(${NAME} PRIVATE + $<$:-O2> + $<$:--expt-relaxed-constexpr -O2> + ) + target_link_libraries(${NAME} PRIVATE Threads::Threads CUDA::cudart) +endfunction() + +function(pto_gpu_st NAME) + add_executable(${NAME} ${NAME}.cu) + target_include_directories(${NAME} PRIVATE + ${PROJECT_SOURCE_DIR}/../../../include + ${PROJECT_SOURCE_DIR}/../../common + ) + target_compile_options(${NAME} PRIVATE + $<$:-O2> + $<$:--expt-relaxed-constexpr -O2> + ) + target_link_libraries(${NAME} PRIVATE Threads::Threads CUDA::cudart) + add_test(NAME ${NAME} COMMAND ${NAME}) +endfunction() + +add_subdirectory(pto_gpu_core) +add_subdirectory(pto_gpu_perf) diff --git a/tests/gpu/st/testcase/pto_gpu_core/CMakeLists.txt b/tests/gpu/st/testcase/pto_gpu_core/CMakeLists.txt new file mode 100644 index 00000000..f4931635 --- /dev/null +++ b/tests/gpu/st/testcase/pto_gpu_core/CMakeLists.txt @@ -0,0 +1 @@ +pto_gpu_st(pto_gpu_core) diff --git a/tests/gpu/st/testcase/pto_gpu_core/pto_gpu_core.cu b/tests/gpu/st/testcase/pto_gpu_core/pto_gpu_core.cu new file mode 100644 index 00000000..7391b2d8 --- /dev/null +++ b/tests/gpu/st/testcase/pto_gpu_core/pto_gpu_core.cu @@ -0,0 +1,1108 @@ +#include +#include +#include +#include +#include + +#include + +namespace { + +template +struct SimpleGlobal { + using DType = T; + static constexpr pto::Layout layout = LayoutV; + + T *ptr; + int64_t shape[pto::GlobalTensorDim::TOTAL_DIM]; + int64_t stride[pto::GlobalTensorDim::TOTAL_DIM]; + + __host__ __device__ SimpleGlobal(T *ptr_, int64_t s0, int64_t s1, int64_t s2, int64_t s3, int64_t s4, + int64_t st0, int64_t st1, int64_t st2, int64_t st3, int64_t st4) + : ptr(ptr_), shape{s0, s1, s2, s3, s4}, stride{st0, st1, st2, st3, st4} + { + } + + __host__ __device__ T *data() + { + return ptr; + } + + __host__ __device__ const T *data() const + { + return ptr; + } + + __host__ __device__ int64_t GetShape(int dim) const + { + return shape[dim]; + } + + __host__ __device__ int64_t GetStride(int dim) const + { + return stride[dim]; + } +}; + +bool CheckCuda(cudaError_t status, const char *what) +{ + if (status == cudaSuccess) { + return true; + } + std::cerr << "[CUDA] " << what << ": " << cudaGetErrorString(status) << std::endl; + return false; +} + +template +std::size_t RowMajorOffset(std::size_t rows, std::size_t cols, std::size_t r, std::size_t c) +{ + (void)rows; + return r * cols + c; +} + +template +std::size_t ColMajorOffset(std::size_t rows, std::size_t cols, std::size_t r, std::size_t c) +{ + (void)cols; + return c * rows + r; +} + +template +std::vector RefTLoadNdRowMajor(const std::vector &src, int g0, int g1, int g2, int g3, int g4, int tileRows, + int tileCols, T pad) +{ + std::vector out(tileRows * tileCols, pad); + int rowBase = 0; + for (int i = 0; i < g0; ++i) { + for (int j = 0; j < g1; ++j) { + for (int k = 0; k < g2; ++k) { + for (int r = 0; r < g3; ++r) { + const int tileRow = rowBase + r; + const int srcBase = i * (g1 * g2 * g3 * g4) + j * (g2 * g3 * g4) + k * (g3 * g4) + r * g4; + for (int c = 0; c < g4; ++c) { + out[RowMajorOffset(tileRows, tileCols, tileRow, c)] = src[srcBase + c]; + } + } + rowBase += g3; + } + } 
+ } + return out; +} + +template +std::vector RefTLoadDnColMajor(const std::vector &src, int g0, int g1, int g2, int g3, int g4, int tileRows, + int tileCols, T pad) +{ + std::vector out(tileRows * tileCols, pad); + int colBase = 0; + for (int i = 0; i < g0; ++i) { + for (int j = 0; j < g1; ++j) { + for (int k = 0; k < g2; ++k) { + for (int c = 0; c < g4; ++c) { + const int tileCol = colBase + c; + const int srcBase = i * (g1 * g2 * g3 * g4) + j * (g2 * g3 * g4) + k * (g3 * g4) + c * g3; + for (int r = 0; r < g3; ++r) { + out[ColMajorOffset(tileRows, tileCols, r, tileCol)] = src[srcBase + r]; + } + } + colBase += g4; + } + } + } + return out; +} + +template +std::vector RefTStoreNdRowMajor(const std::vector &tile, int g0, int g1, int g2, int g3, int g4, int tileRows, + int tileCols) +{ + std::vector out(g0 * g1 * g2 * g3 * g4, T{}); + int rowBase = 0; + for (int i = 0; i < g0; ++i) { + for (int j = 0; j < g1; ++j) { + for (int k = 0; k < g2; ++k) { + for (int r = 0; r < g3; ++r) { + const int tileRow = rowBase + r; + const int dstBase = i * (g1 * g2 * g3 * g4) + j * (g2 * g3 * g4) + k * (g3 * g4) + r * g4; + for (int c = 0; c < g4; ++c) { + out[dstBase + c] = tile[RowMajorOffset(tileRows, tileCols, tileRow, c)]; + } + } + rowBase += g3; + } + } + } + return out; +} + +template +std::vector RefTStoreDnColMajor(const std::vector &tile, int g0, int g1, int g2, int g3, int g4, int tileRows, + int tileCols) +{ + std::vector out(g0 * g1 * g2 * g3 * g4, T{}); + int colBase = 0; + for (int i = 0; i < g0; ++i) { + for (int j = 0; j < g1; ++j) { + for (int k = 0; k < g2; ++k) { + for (int c = 0; c < g4; ++c) { + const int tileCol = colBase + c; + const int dstBase = i * (g1 * g2 * g3 * g4) + j * (g2 * g3 * g4) + k * (g3 * g4) + c * g3; + for (int r = 0; r < g3; ++r) { + out[dstBase + r] = tile[ColMajorOffset(tileRows, tileCols, r, tileCol)]; + } + } + colBase += g4; + } + } + } + return out; +} + +template +std::vector RefTAddRowMajor(const std::vector &a, const std::vector &b, int tileRows, int tileCols, + int validRows, int validCols, T sentinel) +{ + std::vector out(tileRows * tileCols, sentinel); + for (int r = 0; r < validRows; ++r) { + for (int c = 0; c < validCols; ++c) { + out[RowMajorOffset(tileRows, tileCols, r, c)] = + a[RowMajorOffset(tileRows, tileCols, r, c)] + b[RowMajorOffset(tileRows, tileCols, r, c)]; + } + } + return out; +} + +template +std::vector RefGpuSwizzle128BRowMajor(const std::vector &logical, int rows, int cols) +{ + constexpr int swizzleRows = 8; + const int swizzleCols = 128 / (swizzleRows * sizeof(T)); + std::vector out(logical.size(), T{}); + const int chunksPerRow = cols / swizzleCols; + auto isPow2 = [](int v) { return v > 0 && ((v & (v - 1)) == 0); }; + for (int r = 0; r < rows; ++r) { + const int rowBlock = r / swizzleRows; + const int rowInBlock = r % swizzleRows; + for (int c = 0; c < cols; ++c) { + const int chunk = c / swizzleCols; + const int colInChunk = c % swizzleCols; + const int permutedChunk = isPow2(chunksPerRow) ? 
((chunk ^ (rowInBlock % chunksPerRow)) & (chunksPerRow - 1)) + : ((chunk + rowInBlock) % chunksPerRow); + const int physical = rowBlock * swizzleRows * cols + rowInBlock * cols + permutedChunk * swizzleCols + colInChunk; + out[physical] = logical[r * cols + c]; + } + } + return out; +} + +template +std::vector RefMatmulF32(const std::vector &a, const std::vector &b, int m, int k, int n) +{ + std::vector out(m * n, T{}); + for (int row = 0; row < m; ++row) { + for (int col = 0; col < n; ++col) { + T acc = T{}; + for (int kk = 0; kk < k; ++kk) { + acc += a[row * k + kk] * b[kk * n + col]; + } + out[row * n + col] = acc; + } + } + return out; +} + +template +float ToHostFloat(T value) +{ + return static_cast(value); +} + +template <> +float ToHostFloat(half value) +{ + return __half2float(value); +} + +template <> +float ToHostFloat(bfloat16_t value) +{ + return __bfloat162float(value); +} + +template +std::vector RefMatmulToFloat(const std::vector &a, const std::vector &b, int m, int k, int n) +{ + std::vector out(m * n, 0.0f); + for (int row = 0; row < m; ++row) { + for (int col = 0; col < n; ++col) { + float acc = 0.0f; + for (int kk = 0; kk < k; ++kk) { + acc += ToHostFloat(a[row * k + kk]) * ToHostFloat(b[kk * n + col]); + } + out[row * n + col] = acc; + } + } + return out; +} + +template +std::vector RefMatmulAccToFloat(const std::vector &a, const std::vector &b, const std::vector &acc, + int m, int k, int n) +{ + auto out = RefMatmulToFloat(a, b, m, k, n); + for (std::size_t i = 0; i < out.size(); ++i) { + out[i] += acc[i]; + } + return out; +} + +template +std::vector RefMatmulBiasToFloat(const std::vector &a, const std::vector &b, + const std::vector &bias, int m, int k, int n) +{ + auto out = RefMatmulToFloat(a, b, m, k, n); + for (int row = 0; row < m; ++row) { + for (int col = 0; col < n; ++col) { + out[row * n + col] += bias[col]; + } + } + return out; +} + +template +bool ExpectVecEq(const std::vector &expected, const std::vector &actual, const char *label) +{ + if (expected.size() != actual.size()) { + std::cerr << label << ": size mismatch: expected " << expected.size() << ", got " << actual.size() + << std::endl; + return false; + } + for (std::size_t i = 0; i < expected.size(); ++i) { + if (expected[i] != actual[i]) { + std::cerr << label << ": mismatch at index " << i << std::endl; + return false; + } + } + return true; +} + +template <> +bool ExpectVecEq(const std::vector &expected, const std::vector &actual, const char *label) +{ + if (expected.size() != actual.size()) { + std::cerr << label << ": size mismatch: expected " << expected.size() << ", got " << actual.size() + << std::endl; + return false; + } + for (std::size_t i = 0; i < expected.size(); ++i) { + if (std::abs(expected[i] - actual[i]) > 1e-5f) { + std::cerr << label << ": mismatch at index " << i << ", expected " << expected[i] << ", got " + << actual[i] << std::endl; + return false; + } + } + return true; +} + +bool ExpectVecNear(const std::vector &expected, const std::vector &actual, + const char *label, float tol) +{ + if (expected.size() != actual.size()) { + std::cerr << label << ": size mismatch: expected " << expected.size() << ", got " << actual.size() + << std::endl; + return false; + } + for (std::size_t i = 0; i < expected.size(); ++i) { + if (std::abs(expected[i] - actual[i]) > tol) { + std::cerr << label << ": mismatch at index " << i << ", expected " << expected[i] << ", got " + << actual[i] << ", tol " << tol << std::endl; + return false; + } + } + return true; +} + +template +__global__ void 
KernelTLoadNd(T *out, T *src) +{ + if (threadIdx.x != 0 || blockIdx.x != 0) { + return; + } + using GlobalData = SimpleGlobal; + using TileData = pto::Tile; + T storage[TRows * TCols]; + TileData tile(G0 * G1 * G2 * G3, G4); + tile.data() = storage; + GlobalData global(src, G0, G1, G2, G3, G4, G1 * G2 * G3 * G4, G2 * G3 * G4, G3 * G4, G4, 1); + pto::TLOAD(tile, global); + for (int i = 0; i < TRows * TCols; ++i) { + out[i] = storage[i]; + } +} + +template +__global__ void KernelTLoadDn(T *out, T *src) +{ + if (threadIdx.x != 0 || blockIdx.x != 0) { + return; + } + using GlobalData = SimpleGlobal; + using TileData = pto::Tile; + T storage[TRows * TCols]; + TileData tile(G3, G0 * G1 * G2 * G4); + tile.data() = storage; + GlobalData global(src, G0, G1, G2, G3, G4, G1 * G2 * G3 * G4, G2 * G3 * G4, G3 * G4, 1, G3); + pto::TLOAD(tile, global); + for (int i = 0; i < TRows * TCols; ++i) { + out[i] = storage[i]; + } +} + +template +__global__ void KernelTStoreNd(T *out, T *tileRaw) +{ + if (threadIdx.x != 0 || blockIdx.x != 0) { + return; + } + using GlobalData = SimpleGlobal; + using TileData = pto::Tile; + TileData tile(G0 * G1 * G2 * G3, G4); + tile.data() = tileRaw; + GlobalData global(out, G0, G1, G2, G3, G4, G1 * G2 * G3 * G4, G2 * G3 * G4, G3 * G4, G4, 1); + pto::TSTORE(global, tile); +} + +template +__global__ void KernelTStoreDn(T *out, T *tileRaw) +{ + if (threadIdx.x != 0 || blockIdx.x != 0) { + return; + } + using GlobalData = SimpleGlobal; + using TileData = pto::Tile; + TileData tile(G3, G0 * G1 * G2 * G4); + tile.data() = tileRaw; + GlobalData global(out, G0, G1, G2, G3, G4, G1 * G2 * G3 * G4, G2 * G3 * G4, G3 * G4, 1, G3); + pto::TSTORE(global, tile); +} + +template +__global__ void KernelTAdd(T *out, T *a, T *b, T sentinel) +{ + if (threadIdx.x != 0 || blockIdx.x != 0) { + return; + } + using TileData = pto::Tile; + T aStorage[TRows * TCols]; + T bStorage[TRows * TCols]; + T cStorage[TRows * TCols]; + for (int i = 0; i < TRows * TCols; ++i) { + aStorage[i] = a[i]; + bStorage[i] = b[i]; + cStorage[i] = sentinel; + } + TileData aTile(ValidRows, ValidCols); + TileData bTile(ValidRows, ValidCols); + TileData cTile(ValidRows, ValidCols); + aTile.data() = aStorage; + bTile.data() = bStorage; + cTile.data() = cStorage; + pto::TADD(cTile, aTile, bTile); + for (int i = 0; i < TRows * TCols; ++i) { + out[i] = cStorage[i]; + } +} + +template +__global__ void KernelTLoadGpuSwizzleRaw(T *out, T *src) +{ + if (threadIdx.x != 0 || blockIdx.x != 0) { + return; + } + using Shape5 = pto::Shape<-1, -1, -1, -1, -1>; + using GlobalData = SimpleGlobal; + using TileData = pto::TileVecGpuSwizzle; + T storage[Rows * Cols]; + TileData tile; + tile.data() = storage; + GlobalData global(src, 1, 1, 1, Rows, Cols, Rows * Cols, Rows * Cols, Rows * Cols, Cols, 1); + pto::TLOAD(tile, global); + for (int i = 0; i < Rows * Cols; ++i) { + out[i] = storage[i]; + } +} + +template +__global__ void KernelTLoadStoreGpuSwizzle(T *out, T *src) +{ + if (threadIdx.x != 0 || blockIdx.x != 0) { + return; + } + using GlobalData = SimpleGlobal; + using TileData = pto::TileVecGpuSwizzle; + T storage[Rows * Cols]; + TileData tile; + tile.data() = storage; + GlobalData srcGlobal(src, 1, 1, 1, Rows, Cols, Rows * Cols, Rows * Cols, Rows * Cols, Cols, 1); + GlobalData dstGlobal(out, 1, 1, 1, Rows, Cols, Rows * Cols, Rows * Cols, Rows * Cols, Cols, 1); + pto::TLOAD(tile, srcGlobal); + pto::TSTORE(dstGlobal, tile); +} + +template +__global__ void KernelTAddGpuSwizzleRoundTrip(T *out, T *a, T *b) +{ + if (threadIdx.x != 0 || blockIdx.x != 
0) { + return; + } + using GlobalData = SimpleGlobal; + using TileData = pto::TileVecGpuSwizzle; + T aStorage[Rows * Cols]; + T bStorage[Rows * Cols]; + T cStorage[Rows * Cols]; + TileData aTile; + TileData bTile; + TileData cTile; + aTile.data() = aStorage; + bTile.data() = bStorage; + cTile.data() = cStorage; + GlobalData aGlobal(a, 1, 1, 1, Rows, Cols, Rows * Cols, Rows * Cols, Rows * Cols, Cols, 1); + GlobalData bGlobal(b, 1, 1, 1, Rows, Cols, Rows * Cols, Rows * Cols, Rows * Cols, Cols, 1); + GlobalData outGlobal(out, 1, 1, 1, Rows, Cols, Rows * Cols, Rows * Cols, Rows * Cols, Cols, 1); + pto::TLOAD(aTile, aGlobal); + pto::TLOAD(bTile, bGlobal); + pto::TADD(cTile, aTile, bTile); + pto::TSTORE(outGlobal, cTile); +} + +template +__global__ void KernelTMATMUL(float *out, InputT *a, InputT *b) +{ + using TileA = pto::Tile; + using TileB = pto::Tile; + using TileC = pto::Tile; + TileA aTile(M, K); + TileB bTile(K, N); + TileC cTile(M, N); + aTile.data() = a; + bTile.data() = b; + cTile.data() = out; + pto::TMATMUL(cTile, aTile, bTile); +} + +template +__global__ void KernelTMATMUL_ACC(float *out, float *in, InputT *a, InputT *b) +{ + using TileA = pto::Tile; + using TileB = pto::Tile; + using TileC = pto::Tile; + TileA aTile(M, K); + TileB bTile(K, N); + TileC cOutTile(M, N); + TileC cInTile(M, N); + aTile.data() = a; + bTile.data() = b; + cOutTile.data() = out; + cInTile.data() = in; + pto::TMATMUL_ACC(cOutTile, cInTile, aTile, bTile); +} + +template +__global__ void KernelTMATMUL_BIAS(float *out, InputT *a, InputT *b, float *bias) +{ + using TileA = pto::Tile; + using TileB = pto::Tile; + using TileC = pto::Tile; + using TileBias = pto::Tile; + TileA aTile(M, K); + TileB bTile(K, N); + TileC cTile(M, N); + TileBias biasTile(1, N); + aTile.data() = a; + bTile.data() = b; + cTile.data() = out; + biasTile.data() = bias; + pto::TMATMUL_BIAS(cTile, aTile, bTile, biasTile); +} + +template +__global__ void KernelTMATMUL_MX(float *out, InputT *a, float *aScale, InputT *b, float *bScale) +{ + using TileA = pto::Tile; + using TileB = pto::Tile; + using TileC = pto::Tile; + using TileScaleA = pto::Tile; + using TileScaleB = pto::Tile; + TileA aTile(M, K); + TileB bTile(K, N); + TileC cTile(M, N); + TileScaleA aScaleTile(1, K); + TileScaleB bScaleTile(1, N); + aTile.data() = a; + bTile.data() = b; + cTile.data() = out; + aScaleTile.data() = aScale; + bScaleTile.data() = bScale; + pto::TMATMUL_MX(cTile, aTile, aScaleTile, bTile, bScaleTile); +} + +template +__global__ void KernelTGEMV_MX(float *out, InputT *a, float *aScale, InputT *b, float *bScale) +{ + using TileA = pto::Tile; + using TileB = pto::Tile; + using TileC = pto::Tile; + using TileScaleA = pto::Tile; + using TileScaleB = pto::Tile; + TileA aTile(M, K); + TileB bTile(K, N); + TileC cTile(M, N); + TileScaleA aScaleTile(1, K); + TileScaleB bScaleTile(1, N); + aTile.data() = a; + bTile.data() = b; + cTile.data() = out; + aScaleTile.data() = aScale; + bScaleTile.data() = bScale; + pto::TGEMV_MX(cTile, aTile, aScaleTile, bTile, bScaleTile); +} + +bool TestTLoadNdRowMajorMatchesReference() +{ + constexpr int G0 = 1, G1 = 1, G2 = 1, G3 = 4, G4 = 7, TRows = 4, TCols = 8; + std::vector src(G0 * G1 * G2 * G3 * G4); + for (std::size_t i = 0; i < src.size(); ++i) { + src[i] = static_cast(i + 1); + } + auto expected = RefTLoadNdRowMajor(src, G0, G1, G2, G3, G4, TRows, TCols, 0.0f); + + float *dSrc = nullptr; + float *dOut = nullptr; + if (!CheckCuda(cudaMalloc(&dSrc, src.size() * sizeof(float)), "cudaMalloc dSrc")) return false; + if 
(!CheckCuda(cudaMalloc(&dOut, expected.size() * sizeof(float)), "cudaMalloc dOut")) return false; + if (!CheckCuda(cudaMemcpy(dSrc, src.data(), src.size() * sizeof(float), cudaMemcpyHostToDevice), "copy src")) return false; + KernelTLoadNd<<<1, 1>>>(dOut, dSrc); + if (!CheckCuda(cudaGetLastError(), "launch tload nd")) return false; + if (!CheckCuda(cudaDeviceSynchronize(), "sync tload nd")) return false; + + std::vector actual(expected.size()); + if (!CheckCuda(cudaMemcpy(actual.data(), dOut, actual.size() * sizeof(float), cudaMemcpyDeviceToHost), "copy out")) return false; + cudaFree(dSrc); + cudaFree(dOut); + return ExpectVecEq(expected, actual, "tload_nd"); +} + +bool TestTLoadDnColMajorMatchesReference() +{ + constexpr int G0 = 1, G1 = 1, G2 = 1, G3 = 3, G4 = 5, TRows = 8, TCols = 8; + std::vector src(G0 * G1 * G2 * G3 * G4); + for (std::size_t i = 0; i < src.size(); ++i) { + src[i] = static_cast(100 + i); + } + auto expected = RefTLoadDnColMajor(src, G0, G1, G2, G3, G4, TRows, TCols, 0.0f); + + float *dSrc = nullptr; + float *dOut = nullptr; + if (!CheckCuda(cudaMalloc(&dSrc, src.size() * sizeof(float)), "cudaMalloc dSrc")) return false; + if (!CheckCuda(cudaMalloc(&dOut, expected.size() * sizeof(float)), "cudaMalloc dOut")) return false; + if (!CheckCuda(cudaMemcpy(dSrc, src.data(), src.size() * sizeof(float), cudaMemcpyHostToDevice), "copy src")) return false; + KernelTLoadDn<<<1, 1>>>(dOut, dSrc); + if (!CheckCuda(cudaGetLastError(), "launch tload dn")) return false; + if (!CheckCuda(cudaDeviceSynchronize(), "sync tload dn")) return false; + + std::vector actual(expected.size()); + if (!CheckCuda(cudaMemcpy(actual.data(), dOut, actual.size() * sizeof(float), cudaMemcpyDeviceToHost), "copy out")) return false; + cudaFree(dSrc); + cudaFree(dOut); + return ExpectVecEq(expected, actual, "tload_dn"); +} + +bool TestTStoreNdAndDnMatchReference() +{ + { + constexpr int G0 = 1, G1 = 1, G2 = 1, G3 = 4, G4 = 7, TRows = 4, TCols = 8; + std::vector tile(TRows * TCols, -1.0f); + for (int r = 0; r < G3; ++r) { + for (int c = 0; c < G4; ++c) { + tile[r * TCols + c] = static_cast(r * 10 + c); + } + } + auto expected = RefTStoreNdRowMajor(tile, G0, G1, G2, G3, G4, TRows, TCols); + float *dTile = nullptr; + float *dOut = nullptr; + if (!CheckCuda(cudaMalloc(&dTile, tile.size() * sizeof(float)), "cudaMalloc dTile")) return false; + if (!CheckCuda(cudaMalloc(&dOut, expected.size() * sizeof(float)), "cudaMalloc dOut")) return false; + if (!CheckCuda(cudaMemset(dOut, 0, expected.size() * sizeof(float)), "memset dOut")) return false; + if (!CheckCuda(cudaMemcpy(dTile, tile.data(), tile.size() * sizeof(float), cudaMemcpyHostToDevice), "copy tile")) return false; + KernelTStoreNd<<<1, 1>>>(dOut, dTile); + if (!CheckCuda(cudaGetLastError(), "launch tstore nd")) return false; + if (!CheckCuda(cudaDeviceSynchronize(), "sync tstore nd")) return false; + std::vector actual(expected.size()); + if (!CheckCuda(cudaMemcpy(actual.data(), dOut, actual.size() * sizeof(float), cudaMemcpyDeviceToHost), "copy out")) return false; + cudaFree(dTile); + cudaFree(dOut); + if (!ExpectVecEq(expected, actual, "tstore_nd")) return false; + } + { + constexpr int G0 = 1, G1 = 1, G2 = 1, G3 = 3, G4 = 6, TRows = 16, TCols = 8; + std::vector tile(TRows * TCols, static_cast(-7)); + for (int c = 0; c < G4; ++c) { + for (int r = 0; r < G3; ++r) { + tile[c * TRows + r] = static_cast(c * 10 + r); + } + } + auto expected = RefTStoreDnColMajor(tile, G0, G1, G2, G3, G4, TRows, TCols); + int16_t *dTile = nullptr; + int16_t *dOut = nullptr; + 
if (!CheckCuda(cudaMalloc(&dTile, tile.size() * sizeof(int16_t)), "cudaMalloc dTile")) return false; + if (!CheckCuda(cudaMalloc(&dOut, expected.size() * sizeof(int16_t)), "cudaMalloc dOut")) return false; + if (!CheckCuda(cudaMemset(dOut, 0, expected.size() * sizeof(int16_t)), "memset dOut")) return false; + if (!CheckCuda(cudaMemcpy(dTile, tile.data(), tile.size() * sizeof(int16_t), cudaMemcpyHostToDevice), "copy tile")) return false; + KernelTStoreDn<<<1, 1>>>(dOut, dTile); + if (!CheckCuda(cudaGetLastError(), "launch tstore dn")) return false; + if (!CheckCuda(cudaDeviceSynchronize(), "sync tstore dn")) return false; + std::vector actual(expected.size()); + if (!CheckCuda(cudaMemcpy(actual.data(), dOut, actual.size() * sizeof(int16_t), cudaMemcpyDeviceToHost), "copy out")) return false; + cudaFree(dTile); + cudaFree(dOut); + if (!ExpectVecEq(expected, actual, "tstore_dn")) return false; + } + return true; +} + +bool TestTAddMatchesReference() +{ + constexpr int TRows = 8, TCols = 8, ValidRows = 5, ValidCols = 7; + std::vector a(TRows * TCols); + std::vector b(TRows * TCols); + for (int i = 0; i < TRows * TCols; ++i) { + a[i] = static_cast(i); + b[i] = static_cast(100 - i); + } + constexpr float sentinel = -777.0f; + auto expected = RefTAddRowMajor(a, b, TRows, TCols, ValidRows, ValidCols, sentinel); + float *dA = nullptr; + float *dB = nullptr; + float *dOut = nullptr; + if (!CheckCuda(cudaMalloc(&dA, a.size() * sizeof(float)), "cudaMalloc dA")) return false; + if (!CheckCuda(cudaMalloc(&dB, b.size() * sizeof(float)), "cudaMalloc dB")) return false; + if (!CheckCuda(cudaMalloc(&dOut, expected.size() * sizeof(float)), "cudaMalloc dOut")) return false; + if (!CheckCuda(cudaMemcpy(dA, a.data(), a.size() * sizeof(float), cudaMemcpyHostToDevice), "copy a")) return false; + if (!CheckCuda(cudaMemcpy(dB, b.data(), b.size() * sizeof(float), cudaMemcpyHostToDevice), "copy b")) return false; + KernelTAdd<<<1, 1>>>(dOut, dA, dB, sentinel); + if (!CheckCuda(cudaGetLastError(), "launch tadd")) return false; + if (!CheckCuda(cudaDeviceSynchronize(), "sync tadd")) return false; + std::vector actual(expected.size()); + if (!CheckCuda(cudaMemcpy(actual.data(), dOut, actual.size() * sizeof(float), cudaMemcpyDeviceToHost), "copy out")) return false; + cudaFree(dA); + cudaFree(dB); + cudaFree(dOut); + return ExpectVecEq(expected, actual, "tadd"); +} + +bool TestGpuSwizzlePhysicalLayoutMatchesReference() +{ + constexpr int Rows = 16, Cols = 16; + std::vector src(Rows * Cols); + for (int i = 0; i < Rows * Cols; ++i) { + src[i] = __float2half(static_cast(i + 1)); + } + auto expected = RefGpuSwizzle128BRowMajor(src, Rows, Cols); + half *dSrc = nullptr; + half *dOut = nullptr; + if (!CheckCuda(cudaMalloc(&dSrc, src.size() * sizeof(half)), "cudaMalloc dSrc swizzle")) return false; + if (!CheckCuda(cudaMalloc(&dOut, expected.size() * sizeof(half)), "cudaMalloc dOut swizzle")) return false; + if (!CheckCuda(cudaMemcpy(dSrc, src.data(), src.size() * sizeof(half), cudaMemcpyHostToDevice), "copy src swizzle")) return false; + KernelTLoadGpuSwizzleRaw<<<1, 1>>>(dOut, dSrc); + if (!CheckCuda(cudaGetLastError(), "launch tload swizzle raw")) return false; + if (!CheckCuda(cudaDeviceSynchronize(), "sync tload swizzle raw")) return false; + std::vector actual(expected.size()); + if (!CheckCuda(cudaMemcpy(actual.data(), dOut, actual.size() * sizeof(half), cudaMemcpyDeviceToHost), "copy out swizzle")) return false; + cudaFree(dSrc); + cudaFree(dOut); + return ExpectVecEq(expected, actual, "gpu_swizzle_physical"); +} + +bool 
TestGpuSwizzleRoundTripMatchesReference() +{ + constexpr int Rows = 16, Cols = 16; + std::vector<half> src(Rows * Cols); + for (int i = 0; i < Rows * Cols; ++i) { + src[i] = __float2half(static_cast<float>((i % 23) - 11) * 0.25f); + } + half *dSrc = nullptr; + half *dOut = nullptr; + if (!CheckCuda(cudaMalloc(&dSrc, src.size() * sizeof(half)), "cudaMalloc dSrc swizzle rt")) return false; + if (!CheckCuda(cudaMalloc(&dOut, src.size() * sizeof(half)), "cudaMalloc dOut swizzle rt")) return false; + if (!CheckCuda(cudaMemcpy(dSrc, src.data(), src.size() * sizeof(half), cudaMemcpyHostToDevice), "copy src swizzle rt")) return false; + KernelTLoadStoreGpuSwizzle<<<1, 1>>>(dOut, dSrc); + if (!CheckCuda(cudaGetLastError(), "launch tloadstore swizzle")) return false; + if (!CheckCuda(cudaDeviceSynchronize(), "sync tloadstore swizzle")) return false; + std::vector<half> actual(src.size()); + if (!CheckCuda(cudaMemcpy(actual.data(), dOut, actual.size() * sizeof(half), cudaMemcpyDeviceToHost), "copy out swizzle rt")) return false; + cudaFree(dSrc); + cudaFree(dOut); + return ExpectVecEq(src, actual, "gpu_swizzle_roundtrip"); +} + +bool TestGpuSwizzleTAddRoundTripMatchesReference() +{ + constexpr int Rows = 16, Cols = 16; + std::vector<half> a(Rows * Cols), b(Rows * Cols); + std::vector<half> expected(Rows * Cols); + for (int i = 0; i < Rows * Cols; ++i) { + a[i] = __float2half(static_cast<float>((i % 13) - 6) * 0.5f); + b[i] = __float2half(static_cast<float>((i % 7) - 3) * 0.25f); + expected[i] = __hadd(a[i], b[i]); + } + half *dA = nullptr; + half *dB = nullptr; + half *dOut = nullptr; + if (!CheckCuda(cudaMalloc(&dA, a.size() * sizeof(half)), "cudaMalloc dA swizzle add")) return false; + if (!CheckCuda(cudaMalloc(&dB, b.size() * sizeof(half)), "cudaMalloc dB swizzle add")) return false; + if (!CheckCuda(cudaMalloc(&dOut, expected.size() * sizeof(half)), "cudaMalloc dOut swizzle add")) return false; + if (!CheckCuda(cudaMemcpy(dA, a.data(), a.size() * sizeof(half), cudaMemcpyHostToDevice), "copy a swizzle add")) return false; + if (!CheckCuda(cudaMemcpy(dB, b.data(), b.size() * sizeof(half), cudaMemcpyHostToDevice), "copy b swizzle add")) return false; + KernelTAddGpuSwizzleRoundTrip<<<1, 1>>>(dOut, dA, dB); + if (!CheckCuda(cudaGetLastError(), "launch tadd swizzle")) return false; + if (!CheckCuda(cudaDeviceSynchronize(), "sync tadd swizzle")) return false; + std::vector<half> actual(expected.size()); + if (!CheckCuda(cudaMemcpy(actual.data(), dOut, actual.size() * sizeof(half), cudaMemcpyDeviceToHost), "copy out swizzle add")) return false; + cudaFree(dA); + cudaFree(dB); + cudaFree(dOut); + return ExpectVecEq(expected, actual, "gpu_swizzle_tadd"); +} + +bool TestSm121FloatInlinePtxMatmulMatchesReference() +{ + constexpr int M = 8, K = 8, N = 8; + std::vector<float> a(M * K); + std::vector<float> b(K * N); + for (int i = 0; i < M * K; ++i) { + a[i] = static_cast<float>((i % 9) - 4); + } + for (int i = 0; i < K * N; ++i) { + b[i] = static_cast<float>((i % 7) - 3); + } + auto expected = RefMatmulF32(a, b, M, K, N); + float *dA = nullptr; + float *dB = nullptr; + float *dOut = nullptr; + if (!CheckCuda(cudaMalloc(&dA, a.size() * sizeof(float)), "cudaMalloc dA")) return false; + if (!CheckCuda(cudaMalloc(&dB, b.size() * sizeof(float)), "cudaMalloc dB")) return false; + if (!CheckCuda(cudaMalloc(&dOut, expected.size() * sizeof(float)), "cudaMalloc dOut")) return false; + if (!CheckCuda(cudaMemcpy(dA, a.data(), a.size() * sizeof(float), cudaMemcpyHostToDevice), "copy a")) return false; + if (!CheckCuda(cudaMemcpy(dB, b.data(), b.size() * sizeof(float), cudaMemcpyHostToDevice), "copy
b")) return false; + if (!CheckCuda(cudaMemset(dOut, 0, expected.size() * sizeof(float)), "memset out")) return false; + KernelTMATMUL<<<1, 1>>>(dOut, dA, dB); + if (!CheckCuda(cudaGetLastError(), "launch tmatmul float")) return false; + if (!CheckCuda(cudaDeviceSynchronize(), "sync tmatmul float")) return false; + std::vector<float> actual(expected.size()); + if (!CheckCuda(cudaMemcpy(actual.data(), dOut, actual.size() * sizeof(float), cudaMemcpyDeviceToHost), "copy out")) return false; + cudaFree(dA); + cudaFree(dB); + cudaFree(dOut); + return ExpectVecEq(expected, actual, "tmatmul_sm121_float"); +} + +bool TestSm121HalfTensorCoreMatmulExtendedMatchesReference() +{ + constexpr int M = 32, K = 16, N = 32; + std::vector<half> a(M * K); + std::vector<half> b(K * N); + for (int i = 0; i < M * K; ++i) { + a[i] = __float2half(static_cast<float>((i % 13) - 6) * 0.25f); + } + for (int i = 0; i < K * N; ++i) { + b[i] = __float2half(static_cast<float>((i % 11) - 5) * 0.5f); + } + auto expected = RefMatmulToFloat(a, b, M, K, N); + half *dA = nullptr; + half *dB = nullptr; + float *dOut = nullptr; + if (!CheckCuda(cudaMalloc(&dA, a.size() * sizeof(half)), "cudaMalloc dA half")) return false; + if (!CheckCuda(cudaMalloc(&dB, b.size() * sizeof(half)), "cudaMalloc dB half")) return false; + if (!CheckCuda(cudaMalloc(&dOut, expected.size() * sizeof(float)), "cudaMalloc dOut half")) return false; + if (!CheckCuda(cudaMemcpy(dA, a.data(), a.size() * sizeof(half), cudaMemcpyHostToDevice), "copy a half")) return false; + if (!CheckCuda(cudaMemcpy(dB, b.data(), b.size() * sizeof(half), cudaMemcpyHostToDevice), "copy b half")) return false; + if (!CheckCuda(cudaMemset(dOut, 0, expected.size() * sizeof(float)), "memset out half")) return false; + KernelTMATMUL<<<1, 32>>>(dOut, dA, dB); + if (!CheckCuda(cudaGetLastError(), "launch tmatmul half")) return false; + if (!CheckCuda(cudaDeviceSynchronize(), "sync tmatmul half")) return false; + std::vector<float> actual(expected.size()); + if (!CheckCuda(cudaMemcpy(actual.data(), dOut, actual.size() * sizeof(float), cudaMemcpyDeviceToHost), "copy out half")) return false; + cudaFree(dA); + cudaFree(dB); + cudaFree(dOut); + return ExpectVecNear(expected, actual, "tmatmul_sm121_half_extended", 1e-3f); +} + +bool TestSm121Bfloat16TensorCoreMatmulExtendedMatchesReference() +{ + constexpr int M = 16, K = 32, N = 16; + std::vector<bfloat16_t> a(M * K); + std::vector<bfloat16_t> b(K * N); + for (int i = 0; i < M * K; ++i) { + a[i] = __float2bfloat16(static_cast<float>((i % 9) - 4) * 0.75f); + } + for (int i = 0; i < K * N; ++i) { + b[i] = __float2bfloat16(static_cast<float>((i % 7) - 3) * 0.5f); + } + auto expected = RefMatmulToFloat(a, b, M, K, N); + bfloat16_t *dA = nullptr; + bfloat16_t *dB = nullptr; + float *dOut = nullptr; + if (!CheckCuda(cudaMalloc(&dA, a.size() * sizeof(bfloat16_t)), "cudaMalloc dA bf16")) return false; + if (!CheckCuda(cudaMalloc(&dB, b.size() * sizeof(bfloat16_t)), "cudaMalloc dB bf16")) return false; + if (!CheckCuda(cudaMalloc(&dOut, expected.size() * sizeof(float)), "cudaMalloc dOut bf16")) return false; + if (!CheckCuda(cudaMemcpy(dA, a.data(), a.size() * sizeof(bfloat16_t), cudaMemcpyHostToDevice), "copy a bf16")) return false; + if (!CheckCuda(cudaMemcpy(dB, b.data(), b.size() * sizeof(bfloat16_t), cudaMemcpyHostToDevice), "copy b bf16")) return false; + if (!CheckCuda(cudaMemset(dOut, 0, expected.size() * sizeof(float)), "memset out bf16")) return false; + KernelTMATMUL<<<1, 32>>>(dOut, dA, dB); + if (!CheckCuda(cudaGetLastError(), "launch tmatmul bf16")) return false; + if (!CheckCuda(cudaDeviceSynchronize(), "sync
tmatmul bf16")) return false; + std::vector<float> actual(expected.size()); + if (!CheckCuda(cudaMemcpy(actual.data(), dOut, actual.size() * sizeof(float), cudaMemcpyDeviceToHost), "copy out bf16")) return false; + cudaFree(dA); + cudaFree(dB); + cudaFree(dOut); + return ExpectVecNear(expected, actual, "tmatmul_sm121_bf16_extended", 2e-2f); +} + +bool TestSm121HalfTensorCoreMatmulLarge64MatchesReference() +{ + constexpr int M = 64, K = 64, N = 64; + std::vector<half> a(M * K); + std::vector<half> b(K * N); + for (int i = 0; i < M * K; ++i) { + a[i] = __float2half(static_cast<float>((i % 19) - 9) * 0.125f); + } + for (int i = 0; i < K * N; ++i) { + b[i] = __float2half(static_cast<float>((i % 17) - 8) * 0.1875f); + } + auto expected = RefMatmulToFloat(a, b, M, K, N); + half *dA = nullptr; + half *dB = nullptr; + float *dOut = nullptr; + if (!CheckCuda(cudaMalloc(&dA, a.size() * sizeof(half)), "cudaMalloc dA half large")) return false; + if (!CheckCuda(cudaMalloc(&dB, b.size() * sizeof(half)), "cudaMalloc dB half large")) return false; + if (!CheckCuda(cudaMalloc(&dOut, expected.size() * sizeof(float)), "cudaMalloc dOut half large")) return false; + if (!CheckCuda(cudaMemcpy(dA, a.data(), a.size() * sizeof(half), cudaMemcpyHostToDevice), "copy a half large")) return false; + if (!CheckCuda(cudaMemcpy(dB, b.data(), b.size() * sizeof(half), cudaMemcpyHostToDevice), "copy b half large")) return false; + if (!CheckCuda(cudaMemset(dOut, 0, expected.size() * sizeof(float)), "memset out half large")) return false; + KernelTMATMUL<<<1, 32>>>(dOut, dA, dB); + if (!CheckCuda(cudaGetLastError(), "launch tmatmul half large")) return false; + if (!CheckCuda(cudaDeviceSynchronize(), "sync tmatmul half large")) return false; + std::vector<float> actual(expected.size()); + if (!CheckCuda(cudaMemcpy(actual.data(), dOut, actual.size() * sizeof(float), cudaMemcpyDeviceToHost), "copy out half large")) return false; + cudaFree(dA); + cudaFree(dB); + cudaFree(dOut); + return ExpectVecNear(expected, actual, "tmatmul_sm121_half_large64", 2e-3f); +} + +bool TestSm121Bfloat16TensorCoreMatmulLarge64MatchesReference() +{ + constexpr int M = 64, K = 64, N = 64; + std::vector<bfloat16_t> a(M * K); + std::vector<bfloat16_t> b(K * N); + for (int i = 0; i < M * K; ++i) { + a[i] = __float2bfloat16(static_cast<float>((i % 15) - 7) * 0.25f); + } + for (int i = 0; i < K * N; ++i) { + b[i] = __float2bfloat16(static_cast<float>((i % 13) - 6) * 0.3125f); + } + auto expected = RefMatmulToFloat(a, b, M, K, N); + bfloat16_t *dA = nullptr; + bfloat16_t *dB = nullptr; + float *dOut = nullptr; + if (!CheckCuda(cudaMalloc(&dA, a.size() * sizeof(bfloat16_t)), "cudaMalloc dA bf16 large")) return false; + if (!CheckCuda(cudaMalloc(&dB, b.size() * sizeof(bfloat16_t)), "cudaMalloc dB bf16 large")) return false; + if (!CheckCuda(cudaMalloc(&dOut, expected.size() * sizeof(float)), "cudaMalloc dOut bf16 large")) return false; + if (!CheckCuda(cudaMemcpy(dA, a.data(), a.size() * sizeof(bfloat16_t), cudaMemcpyHostToDevice), "copy a bf16 large")) return false; + if (!CheckCuda(cudaMemcpy(dB, b.data(), b.size() * sizeof(bfloat16_t), cudaMemcpyHostToDevice), "copy b bf16 large")) return false; + if (!CheckCuda(cudaMemset(dOut, 0, expected.size() * sizeof(float)), "memset out bf16 large")) return false; + KernelTMATMUL<<<1, 32>>>(dOut, dA, dB); + if (!CheckCuda(cudaGetLastError(), "launch tmatmul bf16 large")) return false; + if (!CheckCuda(cudaDeviceSynchronize(), "sync tmatmul bf16 large")) return false; + std::vector<float> actual(expected.size()); + if (!CheckCuda(cudaMemcpy(actual.data(), dOut, actual.size() * sizeof(float),
cudaMemcpyDeviceToHost), "copy out bf16 large")) return false; + cudaFree(dA); + cudaFree(dB); + cudaFree(dOut); + return ExpectVecNear(expected, actual, "tmatmul_sm121_bf16_large64", 3e-2f); +} + +bool TestSm121HalfTensorCoreMatmulAccMatchesReference() +{ + constexpr int M = 16, K = 16, N = 16; + std::vector<half> a(M * K); + std::vector<half> b(K * N); + std::vector<float> acc(M * N); + for (int i = 0; i < M * K; ++i) { + a[i] = __float2half(static_cast<float>((i % 15) - 7) * 0.125f); + } + for (int i = 0; i < K * N; ++i) { + b[i] = __float2half(static_cast<float>((i % 10) - 4) * 0.375f); + } + for (int i = 0; i < M * N; ++i) { + acc[i] = static_cast<float>((i % 8) - 4) * 0.5f; + } + auto expected = RefMatmulAccToFloat(a, b, acc, M, K, N); + half *dA = nullptr; + half *dB = nullptr; + float *dAcc = nullptr; + float *dOut = nullptr; + if (!CheckCuda(cudaMalloc(&dA, a.size() * sizeof(half)), "cudaMalloc dA acc")) return false; + if (!CheckCuda(cudaMalloc(&dB, b.size() * sizeof(half)), "cudaMalloc dB acc")) return false; + if (!CheckCuda(cudaMalloc(&dAcc, acc.size() * sizeof(float)), "cudaMalloc dAcc acc")) return false; + if (!CheckCuda(cudaMalloc(&dOut, expected.size() * sizeof(float)), "cudaMalloc dOut acc")) return false; + if (!CheckCuda(cudaMemcpy(dA, a.data(), a.size() * sizeof(half), cudaMemcpyHostToDevice), "copy a acc")) return false; + if (!CheckCuda(cudaMemcpy(dB, b.data(), b.size() * sizeof(half), cudaMemcpyHostToDevice), "copy b acc")) return false; + if (!CheckCuda(cudaMemcpy(dAcc, acc.data(), acc.size() * sizeof(float), cudaMemcpyHostToDevice), "copy acc in")) return false; + KernelTMATMUL_ACC<<<1, 32>>>(dOut, dAcc, dA, dB); + if (!CheckCuda(cudaGetLastError(), "launch tmatmul acc")) return false; + if (!CheckCuda(cudaDeviceSynchronize(), "sync tmatmul acc")) return false; + std::vector<float> actual(expected.size()); + if (!CheckCuda(cudaMemcpy(actual.data(), dOut, actual.size() * sizeof(float), cudaMemcpyDeviceToHost), "copy out acc")) return false; + cudaFree(dA); + cudaFree(dB); + cudaFree(dAcc); + cudaFree(dOut); + return ExpectVecNear(expected, actual, "tmatmul_sm121_acc", 1e-3f); +} + +bool TestSm121Bfloat16TensorCoreMatmulBiasMatchesReference() +{ + constexpr int M = 16, K = 16, N = 16; + std::vector<bfloat16_t> a(M * K); + std::vector<bfloat16_t> b(K * N); + std::vector<float> bias(N); + for (int i = 0; i < M * K; ++i) { + a[i] = __float2bfloat16(static_cast<float>((i % 9) - 4) * 0.25f); + } + for (int i = 0; i < K * N; ++i) { + b[i] = __float2bfloat16(static_cast<float>((i % 7) - 3) * 0.75f); + } + for (int i = 0; i < N; ++i) { + bias[i] = static_cast<float>(i - 8) * 0.125f; + } + auto expected = RefMatmulBiasToFloat(a, b, bias, M, K, N); + bfloat16_t *dA = nullptr; + bfloat16_t *dB = nullptr; + float *dBias = nullptr; + float *dOut = nullptr; + if (!CheckCuda(cudaMalloc(&dA, a.size() * sizeof(bfloat16_t)), "cudaMalloc dA bias")) return false; + if (!CheckCuda(cudaMalloc(&dB, b.size() * sizeof(bfloat16_t)), "cudaMalloc dB bias")) return false; + if (!CheckCuda(cudaMalloc(&dBias, bias.size() * sizeof(float)), "cudaMalloc dBias")) return false; + if (!CheckCuda(cudaMalloc(&dOut, expected.size() * sizeof(float)), "cudaMalloc dOut bias")) return false; + if (!CheckCuda(cudaMemcpy(dA, a.data(), a.size() * sizeof(bfloat16_t), cudaMemcpyHostToDevice), "copy a bias")) return false; + if (!CheckCuda(cudaMemcpy(dB, b.data(), b.size() * sizeof(bfloat16_t), cudaMemcpyHostToDevice), "copy b bias")) return false; + if (!CheckCuda(cudaMemcpy(dBias, bias.data(), bias.size() * sizeof(float), cudaMemcpyHostToDevice), "copy bias")) return false; + if (!CheckCuda(cudaMemset(dOut,
0, expected.size() * sizeof(float)), "memset out bias")) return false; + KernelTMATMUL_BIAS<<<1, 32>>>(dOut, dA, dB, dBias); + if (!CheckCuda(cudaGetLastError(), "launch tmatmul bias")) return false; + if (!CheckCuda(cudaDeviceSynchronize(), "sync tmatmul bias")) return false; + std::vector<float> actual(expected.size()); + if (!CheckCuda(cudaMemcpy(actual.data(), dOut, actual.size() * sizeof(float), cudaMemcpyDeviceToHost), "copy out bias")) return false; + cudaFree(dA); + cudaFree(dB); + cudaFree(dBias); + cudaFree(dOut); + return ExpectVecNear(expected, actual, "tmatmul_sm121_bias", 2e-2f); +} + +bool TestSm121HalfTMATMUL_MXMatchesReference() +{ + constexpr int M = 16, K = 16, N = 16; + std::vector<half> a(M * K); + std::vector<half> b(K * N); + std::vector<float> aScale(K, 1.0f), bScale(N, 1.0f); + for (int i = 0; i < M * K; ++i) a[i] = __float2half(static_cast<float>((i % 13) - 6) * 0.25f); + for (int i = 0; i < K * N; ++i) b[i] = __float2half(static_cast<float>((i % 11) - 5) * 0.5f); + auto expected = RefMatmulToFloat(a, b, M, K, N); + half *dA = nullptr; half *dB = nullptr; float *dAS = nullptr; float *dBS = nullptr; float *dOut = nullptr; + if (!CheckCuda(cudaMalloc(&dA, a.size() * sizeof(half)), "cudaMalloc dA mx")) return false; + if (!CheckCuda(cudaMalloc(&dB, b.size() * sizeof(half)), "cudaMalloc dB mx")) return false; + if (!CheckCuda(cudaMalloc(&dAS, aScale.size() * sizeof(float)), "cudaMalloc dAS mx")) return false; + if (!CheckCuda(cudaMalloc(&dBS, bScale.size() * sizeof(float)), "cudaMalloc dBS mx")) return false; + if (!CheckCuda(cudaMalloc(&dOut, expected.size() * sizeof(float)), "cudaMalloc dOut mx")) return false; + if (!CheckCuda(cudaMemcpy(dA, a.data(), a.size() * sizeof(half), cudaMemcpyHostToDevice), "copy a mx")) return false; + if (!CheckCuda(cudaMemcpy(dB, b.data(), b.size() * sizeof(half), cudaMemcpyHostToDevice), "copy b mx")) return false; + if (!CheckCuda(cudaMemcpy(dAS, aScale.data(), aScale.size() * sizeof(float), cudaMemcpyHostToDevice), "copy as mx")) return false; + if (!CheckCuda(cudaMemcpy(dBS, bScale.data(), bScale.size() * sizeof(float), cudaMemcpyHostToDevice), "copy bs mx")) return false; + if (!CheckCuda(cudaMemset(dOut, 0, expected.size() * sizeof(float)), "memset out mx")) return false; + KernelTMATMUL_MX<<<1, 32>>>(dOut, dA, dAS, dB, dBS); + if (!CheckCuda(cudaGetLastError(), "launch tmatmul_mx")) return false; + if (!CheckCuda(cudaDeviceSynchronize(), "sync tmatmul_mx")) return false; + std::vector<float> actual(expected.size()); + if (!CheckCuda(cudaMemcpy(actual.data(), dOut, actual.size() * sizeof(float), cudaMemcpyDeviceToHost), "copy out mx")) return false; + cudaFree(dA); cudaFree(dB); cudaFree(dAS); cudaFree(dBS); cudaFree(dOut); + return ExpectVecNear(expected, actual, "tmatmul_mx", 1e-3f); +} + +bool TestSm121HalfTGEMV_MXMatchesReference() +{ + constexpr int M = 16, K = 16, N = 16; + std::vector<half> a(M * K); + std::vector<half> b(K * N); + std::vector<float> aScale(K, 1.0f), bScale(N, 1.0f); + for (int i = 0; i < M * K; ++i) a[i] = __float2half(static_cast<float>((i % 9) - 4) * 0.375f); + for (int i = 0; i < K * N; ++i) b[i] = __float2half(static_cast<float>((i % 7) - 3) * 0.625f); + auto expected = RefMatmulToFloat(a, b, M, K, N); + half *dA = nullptr; half *dB = nullptr; float *dAS = nullptr; float *dBS = nullptr; float *dOut = nullptr; + if (!CheckCuda(cudaMalloc(&dA, a.size() * sizeof(half)), "cudaMalloc dA gemv_mx")) return false; + if (!CheckCuda(cudaMalloc(&dB, b.size() * sizeof(half)), "cudaMalloc dB gemv_mx")) return false; + if (!CheckCuda(cudaMalloc(&dAS, aScale.size() * sizeof(float)), "cudaMalloc
dAS gemv_mx")) return false; + if (!CheckCuda(cudaMalloc(&dBS, bScale.size() * sizeof(float)), "cudaMalloc dBS gemv_mx")) return false; + if (!CheckCuda(cudaMalloc(&dOut, expected.size() * sizeof(float)), "cudaMalloc dOut gemv_mx")) return false; + if (!CheckCuda(cudaMemcpy(dA, a.data(), a.size() * sizeof(half), cudaMemcpyHostToDevice), "copy a gemv_mx")) return false; + if (!CheckCuda(cudaMemcpy(dB, b.data(), b.size() * sizeof(half), cudaMemcpyHostToDevice), "copy b gemv_mx")) return false; + if (!CheckCuda(cudaMemcpy(dAS, aScale.data(), aScale.size() * sizeof(float), cudaMemcpyHostToDevice), "copy as gemv_mx")) return false; + if (!CheckCuda(cudaMemcpy(dBS, bScale.data(), bScale.size() * sizeof(float), cudaMemcpyHostToDevice), "copy bs gemv_mx")) return false; + if (!CheckCuda(cudaMemset(dOut, 0, expected.size() * sizeof(float)), "memset out gemv_mx")) return false; + KernelTGEMV_MX<<<1, 32>>>(dOut, dA, dAS, dB, dBS); + if (!CheckCuda(cudaGetLastError(), "launch tgemv_mx")) return false; + if (!CheckCuda(cudaDeviceSynchronize(), "sync tgemv_mx")) return false; + std::vector<float> actual(expected.size()); + if (!CheckCuda(cudaMemcpy(actual.data(), dOut, actual.size() * sizeof(float), cudaMemcpyDeviceToHost), "copy out gemv_mx")) return false; + cudaFree(dA); cudaFree(dB); cudaFree(dAS); cudaFree(dBS); cudaFree(dOut); + return ExpectVecNear(expected, actual, "tgemv_mx", 1e-3f); +} + +} // namespace + +int main() +{ + int failed = 0; + auto run = [&](const char *name, bool (*fn)()) { + const bool ok = fn(); + std::cout << (ok ? "[PASS] " : "[FAIL] ") << name << std::endl; + if (!ok) { + ++failed; + } + }; + + run("TLoadNdRowMajorMatchesReference", &TestTLoadNdRowMajorMatchesReference); + run("TLoadDnColMajorMatchesReference", &TestTLoadDnColMajorMatchesReference); + run("TStoreNdAndDnMatchReference", &TestTStoreNdAndDnMatchReference); + run("TAddMatchesReference", &TestTAddMatchesReference); + run("GpuSwizzlePhysicalLayoutMatchesReference", &TestGpuSwizzlePhysicalLayoutMatchesReference); + run("GpuSwizzleRoundTripMatchesReference", &TestGpuSwizzleRoundTripMatchesReference); + run("GpuSwizzleTAddRoundTripMatchesReference", &TestGpuSwizzleTAddRoundTripMatchesReference); + run("Sm121FloatInlinePtxMatmulMatchesReference", &TestSm121FloatInlinePtxMatmulMatchesReference); + run("Sm121HalfTensorCoreMatmulExtendedMatchesReference", &TestSm121HalfTensorCoreMatmulExtendedMatchesReference); + run("Sm121Bfloat16TensorCoreMatmulExtendedMatchesReference", &TestSm121Bfloat16TensorCoreMatmulExtendedMatchesReference); + run("Sm121HalfTensorCoreMatmulLarge64MatchesReference", &TestSm121HalfTensorCoreMatmulLarge64MatchesReference); + run("Sm121Bfloat16TensorCoreMatmulLarge64MatchesReference", &TestSm121Bfloat16TensorCoreMatmulLarge64MatchesReference); + run("Sm121HalfTensorCoreMatmulAccMatchesReference", &TestSm121HalfTensorCoreMatmulAccMatchesReference); + run("Sm121Bfloat16TensorCoreMatmulBiasMatchesReference", &TestSm121Bfloat16TensorCoreMatmulBiasMatchesReference); + run("Sm121HalfTMATMUL_MXMatchesReference", &TestSm121HalfTMATMUL_MXMatchesReference); + run("Sm121HalfTGEMV_MXMatchesReference", &TestSm121HalfTGEMV_MXMatchesReference); + + return failed == 0 ?
0 : 1; +} diff --git a/tests/gpu/st/testcase/pto_gpu_perf/CMakeLists.txt b/tests/gpu/st/testcase/pto_gpu_perf/CMakeLists.txt new file mode 100644 index 00000000..067b8645 --- /dev/null +++ b/tests/gpu/st/testcase/pto_gpu_perf/CMakeLists.txt @@ -0,0 +1 @@ +pto_gpu_tool(pto_gpu_perf) diff --git a/tests/gpu/st/testcase/pto_gpu_perf/pto_gpu_perf.cu b/tests/gpu/st/testcase/pto_gpu_perf/pto_gpu_perf.cu new file mode 100644 index 00000000..f0d798d2 --- /dev/null +++ b/tests/gpu/st/testcase/pto_gpu_perf/pto_gpu_perf.cu @@ -0,0 +1,119 @@ +#include <cstddef> +#include <iomanip> +#include <iostream> +#include <string> +#include <vector> + +#include <pto/pto-inst.hpp> + +namespace { + +template <typename T> +__global__ void KernelTMATMULBench(float *out, T *a, T *b) +{ + using TileA = pto::Tile; + using TileB = pto::Tile; + using TileC = pto::Tile; + TileA aTile(64, 64); + TileB bTile(64, 64); + TileC cTile(64, 64); + aTile.data() = a; + bTile.data() = b; + cTile.data() = out; + pto::TMATMUL(cTile, aTile, bTile); +} + +bool Check(cudaError_t st, const char *what) +{ + if (st == cudaSuccess) return true; + std::cerr << what << ": " << cudaGetErrorString(st) << std::endl; + return false; +} + +template <typename T> +void InitInputs(std::vector<T> &a, std::vector<T> &b) +{ + for (std::size_t i = 0; i < a.size(); ++i) a[i] = static_cast<T>((int(i % 17) - 8) * 0.1f); + for (std::size_t i = 0; i < b.size(); ++i) b[i] = static_cast<T>((int(i % 13) - 6) * 0.2f); +} + +template <> +void InitInputs(std::vector<half> &a, std::vector<half> &b) +{ + for (std::size_t i = 0; i < a.size(); ++i) a[i] = __float2half((int(i % 17) - 8) * 0.1f); + for (std::size_t i = 0; i < b.size(); ++i) b[i] = __float2half((int(i % 13) - 6) * 0.2f); +} + +template <> +void InitInputs(std::vector<bfloat16_t> &a, std::vector<bfloat16_t> &b) +{ + for (std::size_t i = 0; i < a.size(); ++i) a[i] = __float2bfloat16((int(i % 17) - 8) * 0.1f); + for (std::size_t i = 0; i < b.size(); ++i) b[i] = __float2bfloat16((int(i % 13) - 6) * 0.2f); +} + +template <typename T> +bool RunBench(const std::string &name, dim3 block, int iters) +{ + constexpr int M = 64, K = 64, N = 64; + constexpr std::size_t aCount = M * K; + constexpr std::size_t bCount = K * N; + constexpr std::size_t cCount = M * N; + + std::vector<T> hA(aCount), hB(bCount); + InitInputs(hA, hB); + + T *dA = nullptr; + T *dB = nullptr; + float *dC = nullptr; + if (!Check(cudaMalloc(&dA, aCount * sizeof(T)), "cudaMalloc dA")) return false; + if (!Check(cudaMalloc(&dB, bCount * sizeof(T)), "cudaMalloc dB")) return false; + if (!Check(cudaMalloc(&dC, cCount * sizeof(float)), "cudaMalloc dC")) return false; + if (!Check(cudaMemcpy(dA, hA.data(), aCount * sizeof(T), cudaMemcpyHostToDevice), "copy A")) return false; + if (!Check(cudaMemcpy(dB, hB.data(), bCount * sizeof(T), cudaMemcpyHostToDevice), "copy B")) return false; + if (!Check(cudaMemset(dC, 0, cCount * sizeof(float)), "memset C")) return false; + + for (int i = 0; i < 10; ++i) { + KernelTMATMULBench<<<1, block>>>(dC, dA, dB); + } + if (!Check(cudaDeviceSynchronize(), "warmup sync")) return false; + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); + for (int i = 0; i < iters; ++i) { + KernelTMATMULBench<<<1, block>>>(dC, dA, dB); + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float ms = 0.0f; + cudaEventElapsedTime(&ms, start, stop); + cudaEventDestroy(start); + cudaEventDestroy(stop); + + const double avgMs = ms / iters; + const double flops = 2.0 * M * N * K; + const double gflops = flops / (avgMs * 1.0e6); + + std::cout << std::left << std::setw(8) << name + << " avg_ms=" << std::fixed << std::setprecision(4) << avgMs
+ << " gflops=" << std::fixed << std::setprecision(2) << gflops + << " block=" << block.x << std::endl; + + cudaFree(dA); + cudaFree(dB); + cudaFree(dC); + return true; +} + +} // namespace + +int main() +{ + bool ok = true; + std::cout << "PTO GPU matmul microbench on GB10 (64x64x64, 1 block)" << std::endl; + ok &= RunBench<float>("float", dim3(1), 200); + ok &= RunBench<half>("half", dim3(32), 500); + ok &= RunBench<bfloat16_t>("bf16", dim3(32), 500); + return ok ? 0 : 1; +}
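
The tensor-core tests in this patch compare device output against host helpers (`RefMatmulToFloat`, `RefMatmulAccToFloat`, `RefMatmulBiasToFloat`) defined earlier in the same test file and not shown in this excerpt. For orientation, below is a minimal sketch of what a float-accumulating reference of this kind typically looks like; the name `RefMatmulToFloatSketch`, the row-major A (M×K) / B (K×N) layout, and the body are assumptions for illustration, not the patch's actual helper.

```cpp
// Illustrative sketch only (assumed layout and naming): a row-major,
// FP32-accumulating host reference of the kind the tensor-core matmul
// tests compare against. The patch's real RefMatmulToFloat may differ.
#include <cstddef>
#include <vector>

template <typename T>
std::vector<float> RefMatmulToFloatSketch(const std::vector<T> &a, const std::vector<T> &b,
                                          int m, int k, int n)
{
    std::vector<float> out(static_cast<std::size_t>(m) * n, 0.0f);
    for (int row = 0; row < m; ++row) {
        for (int col = 0; col < n; ++col) {
            float acc = 0.0f;
            for (int kk = 0; kk < k; ++kk) {
                // Widen each operand to float before multiplying, mirroring
                // half/bf16 tensor-core math with FP32 accumulation.
                acc += static_cast<float>(a[row * k + kk]) * static_cast<float>(b[kk * n + col]);
            }
            out[row * n + col] = acc;
        }
    }
    return out;
}
```

With `half` inputs (which convert to `float` via cuda_fp16.h), a call such as `RefMatmulToFloatSketch(a, b, 32, 16, 32)` matches the shape used by the extended half test above; the `ExpectVecNear` tolerances in the patch (1e-3f for half, 2e-2f for bf16) then absorb the rounding gap between an FP32 reference like this and the tensor-core results.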