diff --git a/Docs/Notes/StencilForPipelinedTMA.md b/Docs/Notes/StencilForPipelinedTMA.md new file mode 100644 index 00000000000..ee9b4feef66 --- /dev/null +++ b/Docs/Notes/StencilForPipelinedTMA.md @@ -0,0 +1,257 @@ +# Pipelined 2D-Slice TMA Backend for `StencilFor` + +## Overview + +This note describes a possible phase-2 CUDA backend for +`amrex::StencilFor` that uses true Tensor Memory Accelerator (TMA) +transfers on Hopper-class and newer NVIDIA GPUs. + +The recommendation is deliberately narrow: + +- do not replace the current row-wise `cub::BlockLoadToShared` path +- do not load the full 3-D staged cube in one bulk transfer +- do add a CUDA-specialized backend that pipelines 2-D slice transfers in + the slow `z` direction + +This follows the guidance from NVIDIA's GTC 2026 memory-bandwidth talk: +their 3-D stencil example reported a regression when loading the whole +3-D cube at once, and the improvement came from pipelining 2-D slices +through shared memory instead. + +## Why A Separate Design Is Needed + +The current CUDA implementation in `Src/Base/AMReX_StencilForG.H` stages +the full grown tile into shared memory and only starts compute after the +copy is complete. + +That structure fits: + +- the generic manual cooperative load path +- the current row-wise `BlockLoadToShared` path + +It does not match the execution model that made TMA effective in the +stencil case shown in the NVIDIA slides: + +- keep multiple slices in flight +- overlap later loads with current compute +- march through the tile in `z` +- recycle shared-memory slices as soon as the corresponding compute is done + +For that reason, a true TMA backend should be treated as a new internal +execution strategy, not as a drop-in replacement for the existing +row-copy staging loop. + +## Recommended Scope + +The first pipelined TMA backend should stay narrower than the generic +`StencilFor` abstraction and fall back aggressively when the requested +case falls outside the tuned path. + +Recommended initial scope: + +- CUDA only +- `sm_90` and newer +- one staged component per launch +- cell-centered or otherwise regular packed `Array4` source layout +- uniform tiles in all dimensions +- full grown tile available in the source box +- `halo.z == 1` for the first implementation +- `TensorCopyPolicy::Auto` or `Always` + +Recommended initial exclusions: + +- partial tiles +- `halo.z > 1` +- multi-component staging +- user-controlled shared-padding modes that change the slice layout +- HIP, SYCL, or CPU specialization + +The generic fallback and current row-wise async-copy path should remain +the default for all excluded cases. + +## Why Not Full-Cube 3-D TMA + +The full-cube approach has several drawbacks for this API: + +- it preserves the current load-then-compute schedule, so it gives up the + overlap that makes TMA attractive +- it increases the shared-memory footprint to the whole grown tile +- it makes occupancy and launch-shape tuning harder +- it does not match the best-performing pattern shown in the stencil + slides + +Inference from the NVIDIA material: if AMReX adds true TMA here, the +first target should be a pipelined slice backend, not a monolithic 3-D +cube transfer. + +## Proposed Execution Model + +The block still owns one logical 3-D compute tile, but it no longer +stages the full tile at once. + +Instead: + +1. Build one interior compute tile as today. +2. Grow it only in `x` and `y` for each staged slice. +3. Keep a ring buffer of shared-memory slices. +4. Prime the ring buffer with enough `z` planes to cover the current + stencil window plus a small prefetch distance. +5. March through the tile in `z`, computing one or a few planes at a + time. +6. As each oldest slice becomes dead, recycle that slot and issue the + next TMA slice copy. + +For a radius-1 stencil, a reasonable first shape is: + +- live stencil window: `3` slices +- extra prefetch depth: `1` or `2` slices +- total ring size: `4` or `5` slices + +This is much smaller than staging the full grown 3-D tile and creates a +natural place to overlap transfer and compute. + +## Shared-Memory Layout + +Each stage in the ring buffer is one 2-D `x-y` slice including `x` and +`y` halos. + +Recommended layout rules: + +- align each stage base to at least `128` bytes for ND TMA +- keep one shared-memory barrier per live stage slot +- reserve any bank-conflict padding as an internal backend detail +- avoid exposing stage padding or swizzle controls in the public API + +The current `TileView` can still be used to expose the staged data to the +user lambda, but the backend will need an internal helper that remaps the +logical `z` coordinate onto the ring-buffer slot. + +## Descriptor and Launch Model + +This backend would need true CUDA TMA descriptors rather than the current +row-copy helper abstraction. + +Recommended host-side responsibilities: + +- cache `CUtensorMap` descriptors for the source layout +- key the cache on source pointer identity, source extents, source + strides, datatype, tile shape, and halo shape +- pass descriptors to the kernel as immutable launch data + +Recommended kernel-side responsibilities: + +- elect one issuing thread or one issuing warp +- initialize the shared-memory barriers for the live stage slots +- issue TMA loads for the initial slices +- wait only on the stage needed for the next compute step +- recycle stage slots after all threads are done consuming them + +The first implementation should use a single elected issuing thread. +Warp-specialized issuing can be revisited only if profiling shows that +instruction issue for TMA becomes a measurable bottleneck. + +## Gating and Fallback Rules + +The pipelined TMA backend should only activate when all of the following +are true: + +- device capability is `sm_90+` +- tile shape is uniform +- `MT` is valid for the chosen compute mapping +- source base pointer is TMA-aligned +- source stride in bytes satisfies the TMA alignment rules +- staged slice byte count is a multiple of `16` +- shared-memory stage bases satisfy the stronger TMA alignment +- the ring-buffer footprint fits within the shared-memory budget +- the source box fully contains the grown tile region + +If any one of these checks fails, the implementation should fall back in +this order: + +1. row-wise `BlockLoadToShared` path when supported +2. manual cooperative shared-memory load +3. direct alias fallback + +`TensorCopyPolicy::Always` should only mean "must use the selected native +backend". If the pipelined TMA path is not eligible, the call should +assert rather than silently dropping to another strategy. + +## API Impact + +The recommended design does not require a public API change. + +The existing `StencilInfo` and `StencilFor` entry points can stay +unchanged if the TMA backend is treated as an internal specialization +selected from `TensorCopyPolicy::Auto`. + +Possible internal additions: + +- a CUDA-only pipeline state helper +- a CUDA-only slice descriptor cache +- a CUDA-only staged-tile accessor that maps logical `z` into ring slots + +No public descriptor types should leak into the user-facing API. + +## Testing Plan + +Correctness coverage should compare the pipelined TMA backend against the +existing baseline stencil path and the current row-wise async-copy path. + +Minimum correctness cases: + +- 3-D 7-point Laplacian +- 3-D 27-point stencil with `halo = IntVect(1)` +- at least one tile shape with small `tile.z` +- at least one tile shape with larger `tile.z` +- forced fallback on unsupported shapes +- `TensorCopyPolicy::Always` failure on unsupported shapes + +Performance validation should measure: + +- runtime +- achieved bandwidth +- shared-memory footprint +- active blocks per SM +- sensitivity to pipeline depth +- sensitivity to `tile.z` + +The comparison target should be the current row-wise async-copy backend, +not only the manual fallback. + +## Open Questions + +1. Should the first TMA slice backend support only `halo.z == 1`, or is + there a small-extension path for `halo.z == 2` that is still worth the + added shared-memory pressure? +2. Is one issuing thread sufficient, or does one issuing warp matter on + Blackwell? +3. Do we need a swizzled shared-memory layout to avoid bank conflicts for + wider `x-y` slices? +4. Where should descriptor caches live so their lifetime and invalidation + rules remain obvious? +5. Should the backend require `require_full_tile = true` in the first + implementation, even if other `StencilFor` backends allow partial-tile + fallback? + +## Recommendation + +AMReX should not switch the current `StencilFor` CUDA path directly to +true 3-D TMA copies. + +If AMReX pursues a second CUDA tensor-copy backend, it should be a +pipelined 2-D-slice design that: + +- keeps the current CUB row-copy backend as the broadly applicable path +- targets only the cases where slice-pipelined TMA is likely to win +- uses strict eligibility checks +- proves value with dedicated Hopper and Blackwell benchmarks before the + scope is widened + +## References + +- NVIDIA CUDA Programming Guide, asynchronous data copies: + +- NVIDIA GTC 2026 slide deck: + +- Existing AMReX stencil prototype note: + `Docs/Notes/TensorCopyStencilPrototype.md` diff --git a/Docs/Notes/TensorCopyStencilPrototype.md b/Docs/Notes/TensorCopyStencilPrototype.md new file mode 100644 index 00000000000..337d02e3e0d --- /dev/null +++ b/Docs/Notes/TensorCopyStencilPrototype.md @@ -0,0 +1,760 @@ +# Tensor Copy Stencil Prototype Plan + +## Overview + +This document describes a prototype implementation plan for adding a +C++ lambda abstraction to AMReX for block-tiled stencil kernels that can +stage multidimensional tiles from global memory into shared memory or LDS +using next-generation tensor copy instructions where available. + +The motivating discussion is AMReX issue +[`#5279`](https://github.com/AMReX-Codes/amrex/issues/5279), which calls +out: + +- NVIDIA Tensor Memory Accelerator (TMA) on Hopper (`sm_90`) and newer +- AMD Tensor Data Mover (TDM) on `gfx1250` and newer +- the need for an AMReX abstraction suitable for multidimensional stencil + kernels + +The proposed prototype is intentionally narrow: + +- add one new opt-in block-stencil launch path +- keep existing `ParallelFor` and `launch` unchanged +- provide a generic fallback path on all backends +- provide a first native tensor-copy implementation on CUDA/Hopper-class + hardware +- defer the first native AMD implementation until the API shape is proven +- explicitly support wide-halo stencils with radius greater than one, + including practical cases with halo widths of 4 or 5 cells + +## Motivation + +AMReX already has efficient pointwise kernel abstractions built around +`ParallelFor`, and more general block launches via `amrex::launch`. +Those are a good fit for many kernels, but they do not directly capture +the execution model of tensor copy instructions: + +- one thread block owns one compute tile +- the block cooperatively stages a grown tile into shared memory or LDS +- the block waits for the staged copy to complete +- the block computes stencil values over the interior tile + +That execution model is fundamentally different from AMReX's default +flattened pointwise `ParallelFor` path. + +The prototype should therefore introduce a dedicated block-stencil +abstraction instead of overloading pointwise `ParallelFor` further. + +## Current AMReX Constraints + +This plan is based on the current AMReX implementation. + +### 1. `ParallelFor` is pointwise + +`ParallelFor` dispatches pointwise lambdas over a flattened cell space. +The lambda may optionally accept `Gpu::Handler`, but the launch model is +still one logical cell per iteration. + +Relevant files: + +- `Src/Base/AMReX_GpuLaunchFunctsG.H` +- `Src/Base/AMReX_GpuLaunchFunctsC.H` +- `Docs/sphinx_documentation/source/GPU.rst` + +### 2. `KernelInfo` is too small for this feature + +`Gpu::KernelInfo` currently only carries a reduction flag. It is not the +right place to encode tile shape, halo size, copy policy, padding, or +backend capability selection. + +Relevant file: + +- `Src/Base/AMReX_GpuKernelInfo.H` + +### 3. `amrex::launch` is the better starting point + +AMReX already uses `amrex::launch(..., shared_mem_bytes, ..., lambda)` for +block-cooperative shared-memory kernels. This is the natural substrate +for a new stencil abstraction. + +Relevant files: + +- `Src/Base/AMReX_BaseFabUtility.H` +- `Src/Base/AMReX_GpuContainers.H` +- `Docs/sphinx_documentation/source/GPU.rst` + +### 4. `ExecutionConfig(Box)` assumes 1-D decomposition + +`Gpu::ExecutionConfig(Box)` currently uses a 1-D decomposition of the cell +space, and the source comment notes that this assumption matters for +existing code. A tensor-copy stencil path should not reuse that mapping +for the compute phase. It should launch one block per tile instead. + +Relevant file: + +- `Src/Base/AMReX_GpuLaunch.H` + +### 5. `Array4` is packed, not explicitly strided + +`Array4` carries the source pointer, bounds, and packed stride +information. That is useful for source metadata, but the current +constructor path computes packed strides from the extents. It does not +directly represent a padded shared-memory tile layout. + +That means the prototype should not assume that a padded tile can be +represented as a plain `Array4`. + +Relevant file: + +- `Src/Base/AMReX_Array4.H` + +### 6. Backend specialization already exists elsewhere + +AMReX already wraps backend-specific block collectives through +backend-specialized implementations using CUB, rocPRIM, and SYCL +facilities. The tensor-copy abstraction should follow the same pattern. + +Relevant files: + +- `Src/Base/AMReX_GpuReduce.H` +- `Src/Base/AMReX_Scan.H` + +### 7. Alignment must be checked explicitly + +AMReX arena allocation currently documents a default arena alignment of +16 bytes. Tensor-copy paths will have stricter backend- and datatype- +dependent alignment requirements, so the prototype must validate those +requirements and fall back automatically when they are not met. In +particular, CUDA async-copy helpers such as `cub::BlockLoadToShared` +require not only an aligned base shared-memory pointer but also aligned +per-copy destinations, so row strides derived from `shared_padding` +must preserve the required shared-memory buffer alignment. + +Relevant file: + +- `Src/Base/AMReX_Arena.H` + +## Prototype Goals + +The prototype should: + +- preserve existing AMReX launch APIs +- introduce one new opt-in abstraction for stencil kernels +- hide all backend-specific descriptor and instruction details from user code +- support a generic fallback path on CPU, HIP, SYCL, and older CUDA GPUs +- support a native CUDA tensor-copy path on Hopper-class GPUs +- keep the user kernel in ordinary C++ lambda form +- work with existing `Array4` global-memory inputs +- support halo widths from 1 up to at least 5 cells in each dimension +- expose enough tile metadata to support performance tuning +- make fallback behavior explicit and predictable + +## Non-Goals + +This prototype does not try to: + +- redesign `ParallelFor` +- generalize to every block-structured kernel shape immediately +- stage multiple source arrays in the first implementation +- support tensor-store instructions in the first implementation +- expose vendor descriptor types in the public API +- land a full AMD native tensor-copy path in the first patch series +- change the storage layout of `FArrayBox` or `MultiFab` + +## Proposed Public API + +The prototype should add a new launch family rather than adding more flags +to `ParallelFor`. + +### High-level launch object + +```cpp +namespace amrex::Gpu { + +enum class TensorCopyPolicy { + Auto, + Always, + Never +}; + +struct StencilInfo { + IntVect tile = IntVect(AMREX_D_DECL(32, 8, 4)); + IntVect halo = IntVect(1); + IntVect shared_padding = IntVect(0); + TensorCopyPolicy tensor_copy = TensorCopyPolicy::Auto; + bool require_full_tile = false; + bool stage_one_component = true; + + StencilInfo& setTile (IntVect const& v) noexcept; + StencilInfo& setHalo (IntVect const& v) noexcept; + StencilInfo& setSharedPadding (IntVect const& v) noexcept; + StencilInfo& setTensorCopyPolicy (TensorCopyPolicy v) noexcept; + StencilInfo& setRequireFullTile (bool v) noexcept; +}; + +} +``` + +### High-level launch functions + +Prototype entry points: + +```cpp +template +void StencilFor (Box const& bx, + Gpu::StencilInfo const& info, + Array4 const& src, + F&& f) noexcept; + +template +void StencilFor (Box const& bx, + int ncomp, + Gpu::StencilInfo const& info, + Array4 const& src, + F&& f) noexcept; +``` + +The staged source field is explicit. Destination arrays and any other +captured state stay in the user lambda. + +Example target usage: + +```cpp +auto info = amrex::Gpu::StencilInfo{} + .setTile(IntVect(AMREX_D_DECL(32, 8, 4))) + .setHalo(IntVect(1)) + .setTensorCopyPolicy(amrex::Gpu::TensorCopyPolicy::Auto); + +amrex::StencilFor<128>(bx, info, phi_old, +[=] AMREX_GPU_DEVICE (auto const& tile, int i, int j, int k) noexcept +{ + phi_new(i,j,k) = c0 * tile(i,j,k) + + c1 * (tile(i-1,j,k) + tile(i+1,j,k) + + tile(i,j-1,k) + tile(i,j+1,k) + + tile(i,j,k-1) + tile(i,j,k+1)); +}); +``` + +This keeps the user-visible stencil expression close to existing +`Array4`-based kernels while giving AMReX ownership of the tile staging +step. + +## Accessor Design + +The staged tile accessor should be a new lightweight view type rather than +plain `Array4`. + +Recommended prototype type: + +```cpp +template +class TileView { +public: + AMREX_GPU_DEVICE T& operator() (int i, int j, int k) const noexcept; + AMREX_GPU_DEVICE T& operator() (int i, int j, int k, int n) const noexcept; + + AMREX_GPU_DEVICE bool contains (int i, int j, int k) const noexcept; + AMREX_GPU_DEVICE Box tileBox () const noexcept; + AMREX_GPU_DEVICE Box validBox () const noexcept; +}; +``` + +`halo` must not be treated as a special-case radius-1 parameter. The +prototype should support arbitrary positive halo widths, with validation +and testing covering at least the range `[1,5]`. + +Design choices: + +- index with absolute cell coordinates, not relative offsets +- carry explicit strides so padded shared-memory layouts are representable +- keep the staged region's grown box available for debug checks +- optionally expose component indexing for a later multi-component path + +This avoids changing `Array4` for the first prototype while still allowing +padding and backend-specific shared-memory layout rules. + +## Internal Execution Model + +### 1. Tile decomposition on the host + +For each `Box bx` passed by the user: + +- decompose `bx` into compute tiles using `StencilInfo::tile` +- assign one thread block per compute tile +- define the staged tile as `compute_tile.grow(info.halo)` +- intersect that staged tile with the available source region +- if the staged tile cannot be satisfied and `require_full_tile == true`, + abort in debug or fall back in optimized builds + +This launch should not reuse `ParallelFor(Box)` because the block-to-tile +ownership is the key abstraction. + +Wide halos directly affect tile feasibility. A tile size that is sensible +for `halo = 1` may become impossible for `halo = 4` or `5`, so the launch +path must validate the staged tile footprint before selecting the native +copy path. + +### 2. Shared-memory layout construction + +For each compute tile: + +- compute interior extents = `tile` +- compute staged extents = `tile + 2 * halo` +- apply optional shared-memory padding +- allocate one dynamic shared-memory buffer per block +- construct `TileView` over that buffer with explicit strides + +The first prototype should support only one staged component per launch. +That keeps descriptor construction, shared-memory sizing, and fallback +logic simple. + +For wide halos, shared-memory use grows quickly. In 3-D, a radius-5 halo +adds 10 cells in every dimension, so staged extents can become much larger +than the interior tile. The prototype should therefore: + +- compute shared-memory requirements on the host before launch +- reject native tensor-copy staging if the staged tile does not fit +- fall back automatically to a smaller-tile or generic path when needed + +### 3. Copy phase + +The block enters one of three paths: + +1. native tensor-copy path +2. optimized cooperative block-load path +3. generic manual fallback path + +#### Native path + +Used only when: + +- the backend reports tensor-copy support +- the device architecture is new enough +- the pointer, bounds, and strides satisfy alignment and size constraints +- each staged row start in shared memory remains aligned, not just the + base pointer +- the staged tile shape fits within shared-memory limits +- the halo width is supported by the selected descriptor construction path + +#### Cooperative load path + +Used when the backend has a block-load helper but native tensor-copy is +not available or not selected. + +#### Generic fallback path + +Each thread block cooperatively loads the staged tile from global memory +using ordinary global loads and shared-memory stores. + +This fallback path is especially important for wide-halo stencils because +some halo widths or tile shapes may be correct but not profitable, or may +exceed native tensor-copy constraints on a given backend. + +### 4. Synchronization + +After the load phase: + +- wait for the copy operation to complete +- synchronize the block +- enter the compute phase + +### 5. Compute phase + +Threads iterate over the interior tile only. Each thread: + +- maps its local thread id to one or more cells in the compute tile +- calls the user lambda with `TileView` and absolute cell indices +- writes results directly to globally captured destination arrays + +The first prototype should not attempt a tensor-store path. A direct +global store is simpler and sufficient to validate the staging model. + +## Backend Strategy + +### CPU + +The CPU path should preserve ordinary correctness and developer +ergonomics: + +- allocate a small stack or heap tile buffer if convenient, or bypass tile + staging entirely +- present the same lambda shape to the user +- prioritize clarity over aggressive optimization + +The goal is API portability, not a CPU performance feature. + +### CUDA + +The first native implementation should be CUDA-first. + +Why: + +- TMA support is already documented in CCCL/libcudacxx +- the issue already references NVIDIA's `cub::BlockLoadToShared` +- AMReX already uses CUB in other backend-specialized code +- CUDA capability checks are already available through + `Gpu::Device::devicePropMajor()` and `devicePropMinor()` + +Recommended CUDA rollout: + +#### Phase 1 CUDA native path + +Use the highest-level supported CUDA abstraction that still maps to TMA. +Prefer: + +- `cub::BlockLoadToShared` if it covers the needed layout cases + +Fallback within CUDA: + +- direct TMA descriptor construction using CCCL/libcudacxx if the higher- + level helper does not fit AMReX's layout or padding needs + +The prototype should be guarded on: + +- `AMREX_USE_CUDA` +- toolkit and CCCL availability +- `Gpu::Device::devicePropMajor() >= 9` + +#### Descriptor translation + +If direct TMA descriptors are needed, AMReX should build them internally +from `Array4` metadata. The dimension ordering and stride conventions must +be translated carefully because AMReX uses x-fastest storage while the +CCCL TMA APIs are documented in DLTensor terms. + +This translation must remain an internal implementation detail. + +#### Descriptor caching + +The prototype should start simple: + +- create one descriptor per source array and launch configuration on the host +- pass it into the kernel launch + +If the prototype is promising, a later optimization can cache descriptors +by a key such as: + +- source pointer +- source extents and strides +- datatype +- component +- tile shape +- halo + +Wide halos make descriptor caching more valuable because the descriptor +setup cost becomes a smaller fraction of total work as the staged region +gets larger. + +### HIP / AMD + +The first patch series should not block on a native AMD implementation. + +Recommended HIP plan: + +- Phase 1: generic fallback path only +- Phase 2: add `supportsTensorCopy()` detection for `gfx1250+` +- Phase 3: add a native TDM-backed implementation once the relevant LLVM or + HIP APIs are stable enough for AMReX to wrap cleanly + +The public API should be designed now so that the later AMD path can slot +in without changing user code. + +### SYCL + +The prototype should support SYCL only through the generic fallback path. + +There is no reason to block the API on a SYCL-native tensor-copy story. + +## Capability and Fallback Interface + +AMReX should expose one internal capability layer: + +```cpp +namespace amrex::Gpu::detail { + +struct TensorCopyCaps { + bool available = false; + bool requires_alignment_check = true; + bool supports_multidim = false; + bool supports_shared_padding = false; +}; + +TensorCopyCaps queryTensorCopyCaps () noexcept; + +bool canUseTensorCopy (void const* p, + Box const& src_box, + IntVect const& tile, + IntVect const& halo, + std::size_t element_size) noexcept; +} +``` + +And optionally one public coarse-grained query: + +```cpp +namespace amrex::Gpu { +bool supportsTensorCopy () noexcept; +} +``` + +This keeps the public surface small while still giving the implementation +room to make backend-specific decisions. + +## File Layout + +The prototype should be introduced in a way that matches AMReX's existing +header organization. + +Recommended files: + +- `Src/Base/AMReX_StencilFor.H` + User-facing declarations and small wrappers. +- `Src/Base/AMReX_StencilForC.H` + CPU implementation. +- `Src/Base/AMReX_StencilForG.H` + GPU implementation and backend dispatch. +- `Src/Base/AMReX_GpuTensorCopy.H` + Internal capability checks, descriptor wrappers, and backend hooks. +- `Src/Base/AMReX_Gpu.H` + Add the new include once the API is ready. +- `Src/Base/CMakeLists.txt` + Add headers to the build. +- `Src/Base/Make.package` + Add headers to the GNUmake package list. + +If `TileView` proves broadly useful, it can later move into its own header. +For the prototype, it should stay scoped to the stencil-launch feature. + +## Recommended Implementation Phases + +### Phase 0: scaffolding + +- add the new headers +- add `StencilInfo` +- add a CPU fallback path +- add a GPU fallback path with manual cooperative loads +- add one focused GPU test + +Deliverable: + +- new API compiles and produces correct answers everywhere + +### Phase 1: CUDA native path + +- add CUDA capability detection +- add alignment checks +- integrate `cub::BlockLoadToShared` or direct TMA descriptors +- keep the manual fallback path active for unsupported cases + +Follow-on note: + +- `Docs/Notes/StencilForPipelinedTMA.md` describes a possible later CUDA + backend based on pipelined 2-D slice TMA rather than row-wise block + loads or full-cube 3-D TMA. + +Deliverable: + +- CUDA/Hopper can use the native path automatically + +### Phase 2: performance validation + +- benchmark 7-point and 27-point stencil kernels +- tune default tile sizes +- measure shared-memory footprint and occupancy tradeoffs +- validate fallback behavior on misaligned and unsupported inputs + +Deliverable: + +- evidence that the abstraction is worthwhile + +### Phase 3: API hardening + +- decide whether `TileView` should remain local or become a more general + explicit-stride view type +- decide whether the 4-D staged-component overload should be enabled +- decide whether descriptor caching is needed + +Deliverable: + +- stable prototype API ready for broader use + +### Phase 4: AMD native path + +- add architecture detection for `gfx1250+` +- wrap the best available TDM interface +- validate against the same tests and benchmarks + +Deliverable: + +- native AMD tensor-copy implementation behind the same API + +## Testing Plan + +The prototype should include both correctness and performance checks. + +### Correctness tests + +Add a focused test such as: + +- `Tests/GPU/TensorCopyStencil/main.cpp` + +Test matrix: + +- 2-D and 3-D +- halo widths `1`, `2`, `4`, and `5` +- both isotropic halos like `IntVect(4)` and at least one anisotropic case + such as `IntVect(AMREX_D_DECL(5,2,1))` +- single component staged source +- fallback path forced on +- tensor-copy path forced on when supported +- edge tiles and partial tiles +- ghost-cell-dependent stencil values + +The test kernels should include both: + +- radius-1 stencils, to compare against the current common case +- wider-radius stencils that consume halo values at distances 2 through 5 + +This avoids accidentally validating only the descriptor plumbing while +failing to exercise the wider-halo indexing paths. + +The result should be compared against a baseline `ParallelFor` stencil +implementation similar to the heat equation example in +`Tests/HeatEquation/main.cpp`. + +### Performance tests + +Measure: + +- kernel time +- achieved bandwidth +- occupancy or active blocks per SM where available +- sensitivity to tile shape and block size + +Initial benchmark kernels: + +- 7-point Laplacian +- 27-point stencil +- one wide-halo stencil in 2-D +- one wide-halo stencil in 3-D + +Suggested wide-halo benchmark shapes: + +- 2-D radius-4 or radius-5 box stencil +- 3-D radius-2 or radius-4 structured stencil, depending on shared-memory + footprint + +### Negative tests + +Explicitly test fallback behavior for: + +- unsupported architectures +- misaligned pointers +- staged tile too large for shared memory +- tile-plus-halo region not fully available +- halo widths that force fallback because the staged region is too large for + the selected native copy path + +## Risks and Open Questions + +### 1. Shared-tile accessor shape + +The biggest immediate design question is whether to: + +- introduce a narrow `TileView` for this feature, or +- generalize `Array4` to support explicit strides + +The prototype should choose `TileView` first to avoid broad churn. + +### 2. CUDA helper coverage + +It is not yet guaranteed that `cub::BlockLoadToShared` will cover all +AMReX layout and padding cases needed for a production path. If it does +not, the CUDA implementation may need to drop to a lower-level TMA +descriptor interface sooner. + +### 3. Multi-component staging + +Staging multiple components in one shot is desirable, but it increases: + +- descriptor complexity +- shared-memory footprint +- padding complexity +- fallback-path complexity + +The prototype should stage one component per launch first. + +This is even more important when the halo radius is 4 or 5, because wide +halos already put strong pressure on shared-memory capacity. + +### 4. Tile-size defaults for wide halos + +Reasonable default interior tile sizes will likely depend on halo width. +For example, a tile shape that is appropriate for radius-1 stencils may be +too large once the staged region includes a radius-5 halo. + +The prototype should therefore avoid assuming one universal default tile. +At minimum, the implementation should: + +- validate the user-requested tile against the shared-memory budget +- document that wide-halo stencils may need smaller interior tiles +- consider choosing backend defaults as a function of halo width in a later + tuning pass + +### 4. Descriptor cache lifetime + +If AMReX caches native descriptors, the invalidation rules must be clear. +Source pointer identity alone may not be enough if the logical extents or +component slice change. + +### 5. AMD implementation surface + +The public AMD documentation currently points more clearly to compiler or +IR-level constructs than to a stable user-level C++ runtime API. That is a +strong reason to keep the AMD-native path out of the first patch series. + +## Recommended Initial Patch Series + +1. Add `StencilInfo`, `TileView`, `StencilFor`, and the generic fallback. +2. Add one correctness test and one benchmark-style test. +3. Add CUDA native staging behind `TensorCopyPolicy::Auto`. +4. Tune tile defaults and document fallback behavior. +5. Revisit AMD-native support after the CUDA path is validated. + +This keeps the first series small enough to review while still proving the +core idea. + +## References + +### AMReX issue + +- AMReX issue `#5279`: + +### AMReX source references + +- `Src/Base/AMReX_GpuKernelInfo.H` +- `Src/Base/AMReX_GpuLaunch.H` +- `Src/Base/AMReX_GpuLaunchFunctsG.H` +- `Src/Base/AMReX_GpuLaunchFunctsC.H` +- `Src/Base/AMReX_BaseFabUtility.H` +- `Src/Base/AMReX_GpuContainers.H` +- `Src/Base/AMReX_Array4.H` +- `Src/Base/AMReX_GpuReduce.H` +- `Src/Base/AMReX_Scan.H` +- `Src/Base/AMReX_Arena.H` +- `Docs/sphinx_documentation/source/GPU.rst` +- `Tests/HeatEquation/main.cpp` + +### NVIDIA references + +- CCCL Tensor Memory Accelerator overview: + +- CCCL `make_tma_descriptor`: + +- CUB `BlockLoadToShared` reference from the issue discussion: + + +### AMD references + +- MLIR AMDGPU dialect: + diff --git a/Src/Base/AMReX_Gpu.H b/Src/Base/AMReX_Gpu.H index 12bd6aac85c..ecd29764abd 100644 --- a/Src/Base/AMReX_Gpu.H +++ b/Src/Base/AMReX_Gpu.H @@ -31,6 +31,7 @@ namespace amrex::Cuda {} #include #include #include +#include namespace amrex::Gpu { #ifdef AMREX_USE_HIP diff --git a/Src/Base/AMReX_GpuTensorCopy.H b/Src/Base/AMReX_GpuTensorCopy.H new file mode 100644 index 00000000000..4ee495432c2 --- /dev/null +++ b/Src/Base/AMReX_GpuTensorCopy.H @@ -0,0 +1,96 @@ +#ifndef AMREX_GPU_TENSOR_COPY_H_ +#define AMREX_GPU_TENSOR_COPY_H_ +#include + +#include +#include + +#include +#include + +#if defined(AMREX_USE_CUDA) && defined(__CUDACC__) && defined(__has_include) +# if __has_include() && __has_include() +# define AMREX_USE_CUDA_BLOCK_LOAD_TO_SHARED 1 +# endif +#endif + +#ifndef AMREX_USE_CUDA_BLOCK_LOAD_TO_SHARED +# define AMREX_USE_CUDA_BLOCK_LOAD_TO_SHARED 0 +#endif + +namespace amrex::Gpu::detail { + +struct TensorCopyCaps { + bool available = false; + bool requires_alignment_check = true; + bool supports_multidim = false; + bool supports_shared_padding = false; +}; + +[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +constexpr std::size_t tensorCopyAlignment () noexcept +{ + return 16u; +} + +[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +bool isTensorCopyAligned (void const* p, std::size_t alignment = tensorCopyAlignment()) noexcept +{ + return (reinterpret_cast(p) % alignment) == 0u; +} + +[[nodiscard]] inline TensorCopyCaps queryTensorCopyCaps () noexcept +{ + TensorCopyCaps caps; +#if AMREX_USE_CUDA_BLOCK_LOAD_TO_SHARED + caps.available = (Gpu::Device::devicePropMajor() >= 9); + caps.supports_multidim = true; + caps.supports_shared_padding = true; +#endif + return caps; +} + +[[nodiscard]] inline bool canUseTensorCopy (void const* p, + Box const& src_box, + IntVect const& tile, + IntVect const& halo, + std::size_t element_size) noexcept +{ + amrex::ignore_unused(src_box); + + TensorCopyCaps const caps = queryTensorCopyCaps(); + if (!caps.available || (element_size == 0u)) { + return false; + } + + IntVect const staged = tile + halo + halo; + if (staged.min() <= 0) { + return false; + } + + Dim3 const staged_d = staged.dim3(1); + auto const nelems = static_cast(staged_d.x) + * static_cast(staged_d.y) + * static_cast(staged_d.z); + +#ifdef AMREX_USE_GPU + return isTensorCopyAligned(p) + && nelems * element_size <= Gpu::Device::sharedMemPerBlock(); +#else + amrex::ignore_unused(nelems); + return false; +#endif +} + +} // namespace amrex::Gpu::detail + +namespace amrex::Gpu { + +[[nodiscard]] inline bool supportsTensorCopy () noexcept +{ + return detail::queryTensorCopyCaps().available; +} + +} // namespace amrex::Gpu + +#endif diff --git a/Src/Base/AMReX_StencilFor.H b/Src/Base/AMReX_StencilFor.H new file mode 100644 index 00000000000..1c41d81d4d5 --- /dev/null +++ b/Src/Base/AMReX_StencilFor.H @@ -0,0 +1,358 @@ +#ifndef AMREX_STENCIL_FOR_H_ +#define AMREX_STENCIL_FOR_H_ +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace amrex::Gpu { + +enum class TensorCopyPolicy { + Auto, + Always, + Never +}; + +struct StencilInfo { + IntVect tile = IntVect(AMREX_D_DECL(32, 8, 4)); + IntVect halo = IntVect(1); + IntVect shared_padding = IntVect(0); + TensorCopyPolicy tensor_copy = TensorCopyPolicy::Auto; + bool require_full_tile = false; + bool stage_one_component = true; + + StencilInfo& setTile (IntVect const& v) noexcept { + tile = v; + return *this; + } + + StencilInfo& setHalo (IntVect const& v) noexcept { + halo = v; + return *this; + } + + StencilInfo& setSharedPadding (IntVect const& v) noexcept { + shared_padding = v; + return *this; + } + + StencilInfo& setTensorCopyPolicy (TensorCopyPolicy v) noexcept { + tensor_copy = v; + return *this; + } + + StencilInfo& setRequireFullTile (bool v) noexcept { + require_full_tile = v; + return *this; + } +}; + +template +class TileView +{ +public: + AMREX_GPU_HOST_DEVICE + constexpr TileView () noexcept = default; + + template , int> = 0> + AMREX_GPU_HOST_DEVICE + constexpr TileView (TileView> const& rhs) noexcept + : m_p(rhs.m_p), + m_stride_y(rhs.m_stride_y), + m_stride_z(rhs.m_stride_z), + m_stride_n(rhs.m_stride_n), + m_ncomp(rhs.m_ncomp), + m_tile_box(rhs.m_tile_box), + m_valid_box(rhs.m_valid_box) + {} + + AMREX_GPU_HOST_DEVICE + constexpr TileView (T* p, + Long stride_y, + Long stride_z, + Long stride_n, + int ncomp, + Box const& tile_box, + Box const& valid_box) noexcept + : m_p(p), + m_stride_y(stride_y), + m_stride_z(stride_z), + m_stride_n(stride_n), + m_ncomp(ncomp), + m_tile_box(tile_box), + m_valid_box(valid_box) + {} + + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + T& operator() (int i, int j, int k) const noexcept { + return this->operator()(i, j, k, 0); + } + + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + T& operator() (int i, int j, int k, int n) const noexcept { + AMREX_ASSERT(contains(i, j, k)); + AMREX_ASSERT(n >= 0 && n < m_ncomp); + auto const lo = m_valid_box.smallEnd().dim3(); + return m_p[(i - lo.x) + + (j - lo.y) * m_stride_y + + (k - lo.z) * m_stride_z + + static_cast(n) * m_stride_n]; + } + + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + bool contains (int i, int j, int k) const noexcept { + return m_valid_box.contains(i, j, k); + } + + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + Box tileBox () const noexcept { + return m_tile_box; + } + + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + Box validBox () const noexcept { + return m_valid_box; + } + + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + int nComp () const noexcept { + return m_ncomp; + } + +private: + template friend class TileView; + + T* m_p = nullptr; + Long m_stride_y = 1; + Long m_stride_z = 1; + Long m_stride_n = 1; + int m_ncomp = 1; + Box m_tile_box; + Box m_valid_box; +}; + +} // namespace amrex::Gpu + +namespace amrex::detail { + +template +[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Box array4_box_like (Array4 const& a, Box const& like) noexcept +{ + IntVect const lo(AMREX_D_DECL(a.begin.vect[0], a.begin.vect[1], a.begin.vect[2])); + IntVect const hi(AMREX_D_DECL(a.end.vect[0]-1, a.end.vect[1]-1, a.end.vect[2]-1)); + return Box(lo, hi, like.ixType()); +} + +[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +IntVect sanitize_stencil_tile (Box const& bx, Gpu::StencilInfo const& info) noexcept +{ + IntVect r = info.tile; + for (int d = 0; d < AMREX_SPACEDIM; ++d) { + r[d] = amrex::max(1, amrex::min(r[d], bx.length(d))); + } + return r; +} + +[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +GpuArray tile_counts (Box const& bx, IntVect const& tile) noexcept +{ + auto const len = bx.length3d(); + Dim3 const t = tile.dim3(1); + return {{(len[0] + t.x - 1) / t.x, + (len[1] + t.y - 1) / t.y, + (len[2] + t.z - 1) / t.z}}; +} + +[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +bool tiles_are_uniform (Box const& bx, IntVect const& tile) noexcept +{ + auto const len = bx.length3d(); + Dim3 const t = tile.dim3(1); + return (len[0] % t.x == 0) && (len[1] % t.y == 0) && (len[2] % t.z == 0); +} + +[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Box make_stencil_tile_box (Box const& bx, + IntVect const& tile, + int tx, + int ty, + int tz) noexcept +{ + IntVect small = bx.smallEnd() + IntVect(AMREX_D_DECL(tx*tile[0], ty*tile[1], tz*tile[2])); + IntVect big = amrex::min(small + tile - IntVect(1), bx.bigEnd()); + return Box(small, big, bx.ixType()); +} + +[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Box make_stencil_tile_box (Box const& bx, + IntVect const& tile, + int tile_id, + GpuArray const& counts) noexcept +{ + int const tx = tile_id % counts[0]; + int const ty = (tile_id / counts[0]) % counts[1]; + int const tz = tile_id / (counts[0] * counts[1]); + return make_stencil_tile_box(bx, tile, tx, ty, tz); +} + +[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Long num_tile_elements (IntVect const& extents) noexcept +{ + Dim3 const e = extents.dim3(1); + return static_cast(e.x) * static_cast(e.y) * static_cast(e.z); +} + +[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Long num_tile_elements (Box const& bx) noexcept +{ + return num_tile_elements(bx.length()); +} + +[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +GpuArray tile_view_strides (Box const& valid_box, + IntVect const& padding, + int ncomp = 1) noexcept +{ + amrex::ignore_unused(ncomp); + Dim3 const ext = valid_box.length().dim3(1); + Dim3 const pad = padding.dim3(0); + Long const stride_y = static_cast(ext.x + pad.x); + Long const stride_z = stride_y * static_cast(ext.y + pad.y); + Long const stride_n = stride_z * static_cast(ext.z + pad.z); + return {{stride_y, stride_z, stride_n}}; +} + +template +[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Gpu::TileView make_tile_view (T* p, + Box const& tile_box, + Box const& valid_box, + IntVect const& padding, + int ncomp = 1) noexcept +{ + auto const strides = tile_view_strides(valid_box, padding, ncomp); + return Gpu::TileView(p, strides[0], strides[1], strides[2], ncomp, tile_box, valid_box); +} + +template +[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Gpu::TileView make_tile_view (T* p, + Box const& tile_box, + Box const& valid_box, + Long stride_y, + Long stride_z, + Long stride_n, + int ncomp = 1) noexcept +{ + return Gpu::TileView(p, stride_y, stride_z, stride_n, ncomp, tile_box, valid_box); +} + +template +[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Gpu::TileView make_global_alias_tile_view (Array4 const& src, + Box const& tile_box, + Box const& valid_box) noexcept +{ + auto const lo = valid_box.smallEnd().dim3(); + return make_tile_view(src.ptr(lo.x, lo.y, lo.z), tile_box, valid_box, + src.stride.a[0], src.stride.a[1], src.stride.a[2], 1); +} + +template +[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +std::size_t shared_tile_bytes (IntVect const& valid_extents, + IntVect const& padding, + int ncomp = 1) noexcept +{ + Dim3 const ext = valid_extents.dim3(1); + Dim3 const pad = padding.dim3(0); + auto const nelems = static_cast(ext.x + pad.x) + * static_cast(ext.y + pad.y) + * static_cast(ext.z + pad.z) + * static_cast(ncomp); + return nelems * sizeof(T); +} + +template +[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +std::size_t shared_tile_bytes (Box const& valid_box, + IntVect const& padding, + int ncomp = 1) noexcept +{ + return shared_tile_bytes(valid_box.length(), padding, ncomp); +} + +[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +bool boxes_equal_bounds (Box const& lhs, Box const& rhs) noexcept +{ + return lhs.smallEnd() == rhs.smallEnd() && lhs.bigEnd() == rhs.bigEnd(); +} + +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void call_stencil_body (F const& f, Tile const& tile, int i, int j, int k) noexcept +{ + f(tile, i, j, k); +} + +template +void StencilFor_doit (Box const& bx, + Gpu::StencilInfo const& info, + Array4 const& src, + F&& f) noexcept; + +} // namespace amrex::detail + +#ifdef AMREX_USE_GPU +#include +#else +#include +#endif + +namespace amrex { + +template +void StencilFor (Box const& bx, + Gpu::StencilInfo const& info, + Array4 const& src, + F&& f) noexcept +{ + detail::StencilFor_doit(bx, info, src, std::forward(f)); +} + +template +void StencilFor (Box const& bx, + int ncomp, + Gpu::StencilInfo const& info, + Array4 const& src, + F&& f) noexcept +{ + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(ncomp >= 0, "StencilFor: ncomp must be non-negative"); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(src.nComp() >= ncomp, + "StencilFor: staged source does not have enough components"); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(info.stage_one_component, + "StencilFor: multi-component staging is not implemented in this prototype"); + + for (int n = 0; n < ncomp; ++n) { + auto const src_n = Array4(src, n, 1); + StencilFor(bx, info, src_n, + [=] AMREX_GPU_DEVICE (auto const& tile, int i, int j, int k) noexcept + { + f(tile, i, j, k, n); + }); + } +} + +} // namespace amrex + +#endif diff --git a/Src/Base/AMReX_StencilForC.H b/Src/Base/AMReX_StencilForC.H new file mode 100644 index 00000000000..332cff5a510 --- /dev/null +++ b/Src/Base/AMReX_StencilForC.H @@ -0,0 +1,63 @@ +#ifndef AMREX_STENCIL_FOR_C_H_ +#define AMREX_STENCIL_FOR_C_H_ +#include + +#ifndef AMREX_USE_GPU + +#include +#include + +namespace amrex::detail { + +template +void StencilFor_doit (Box const& bx, + Gpu::StencilInfo const& info, + Array4 const& src, + F&& f) noexcept +{ + amrex::ignore_unused(MT); + + if (bx.isEmpty()) { + return; + } + + IntVect const tile = sanitize_stencil_tile(bx, info); + Box const src_box = array4_box_like(src, bx); + + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(src_box.contains(bx), + "StencilFor: source Array4 must contain the compute box"); + + auto const counts = tile_counts(bx, tile); + + for (int tz = 0; tz < counts[2]; ++tz) { + for (int ty = 0; ty < counts[1]; ++ty) { + for (int tx = 0; tx < counts[0]; ++tx) { + Box const tile_box = make_stencil_tile_box(bx, tile, tx, ty, tz); + Box const staged_box = amrex::grow(tile_box, info.halo); + Box const valid_box = staged_box & src_box; + + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(!info.require_full_tile || boxes_equal_bounds(valid_box, staged_box), + "StencilFor: full staged tile is not available"); + + Vector tile_storage(static_cast(shared_tile_bytes(valid_box, info.shared_padding)) + / sizeof(T)); + auto stage_view = make_tile_view(tile_storage.data(), tile_box, valid_box, info.shared_padding); + + amrex::LoopOnCpu(valid_box, [&] (int i, int j, int k) noexcept + { + stage_view(i, j, k) = src(i, j, k, 0); + }); + + amrex::LoopOnCpu(tile_box, [&] (int i, int j, int k) noexcept + { + call_stencil_body(f, stage_view, i, j, k); + }); + } + } + } +} + +} // namespace amrex::detail + +#endif +#endif diff --git a/Src/Base/AMReX_StencilForG.H b/Src/Base/AMReX_StencilForG.H new file mode 100644 index 00000000000..1e46636a852 --- /dev/null +++ b/Src/Base/AMReX_StencilForG.H @@ -0,0 +1,304 @@ +#ifndef AMREX_STENCIL_FOR_G_H_ +#define AMREX_STENCIL_FOR_G_H_ +#include + +#ifdef AMREX_USE_GPU + +#include +#include +#include +#include +#include + +#if AMREX_USE_CUDA_BLOCK_LOAD_TO_SHARED +#include +#include +#endif + +#include +#include + +namespace amrex::detail { + +template +void StencilFor_direct_alias_fallback (Box const& bx, + IntVect const& tile, + Gpu::StencilInfo const& info, + Array4 const& src, + F const& f) noexcept +{ + Box const src_box = array4_box_like(src, bx); + auto const counts = tile_counts(bx, tile); + + for (int tz = 0; tz < counts[2]; ++tz) { + for (int ty = 0; ty < counts[1]; ++ty) { + for (int tx = 0; tx < counts[0]; ++tx) { + Box const tile_box = make_stencil_tile_box(bx, tile, tx, ty, tz); + Box const staged_box = amrex::grow(tile_box, info.halo); + Box const valid_box = staged_box & src_box; + + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(!info.require_full_tile || boxes_equal_bounds(valid_box, staged_box), + "StencilFor: full staged tile is not available"); + + auto const tile_view = make_global_alias_tile_view(src, tile_box, valid_box); + amrex::ParallelFor(tile_box, + [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + call_stencil_body(f, tile_view, i, j, k); + }); + } + } + } +} + +#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) + +template +[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +constexpr std::size_t shared_copy_alignment () noexcept +{ + return (alignof(T) > Gpu::detail::tensorCopyAlignment()) + ? alignof(T) + : Gpu::detail::tensorCopyAlignment(); +} + +template +[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +char* align_shared_ptr (char* p) noexcept +{ + auto const a = static_cast(shared_copy_alignment()); + auto const v = reinterpret_cast(p); + return reinterpret_cast((v + (a - 1u)) & ~(a - 1u)); +} + +template +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void stencil_copy_manual_to_shared (Array4 const& src, + Box const& valid_box, + T* shared_ptr, + Long stride_y, + Long stride_z) noexcept +{ + auto const ext = valid_box.length3d(); + Long const npts = static_cast(ext[0]) * static_cast(ext[1]) * static_cast(ext[2]); + Dim3 const vlo = valid_box.smallEnd().dim3(); + + for (Long idx = static_cast(threadIdx.x); idx < npts; idx += static_cast(blockDim.x)) { + Long tmp = idx; + int const ii = static_cast(tmp % ext[0]); + tmp /= ext[0]; + int const jj = static_cast(tmp % ext[1]); + int const kk = static_cast(tmp / ext[1]); + + int const i = vlo.x + ii; + int const j = vlo.y + jj; + int const k = vlo.z + kk; + shared_ptr[static_cast(ii) + static_cast(jj) * stride_y + static_cast(kk) * stride_z] + = src(i, j, k, 0); + } +} + +#if AMREX_USE_CUDA_BLOCK_LOAD_TO_SHARED +template +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void stencil_copy_tensor_to_shared (Array4 const& src, + Box const& valid_box, + T* shared_ptr, + Long stride_y, + Long stride_z) noexcept +{ + using Loader = cub::detail::BlockLoadToShared; + __shared__ typename Loader::TempStorage load_temp_storage; + + Loader loader(load_temp_storage); + auto const ext = valid_box.length3d(); + Dim3 const vlo = valid_box.smallEnd().dim3(); + + for (int kk = 0; kk < ext[2]; ++kk) { + for (int jj = 0; jj < ext[1]; ++jj) { + T* const dst = shared_ptr + static_cast(jj) * stride_y + static_cast(kk) * stride_z; + T const* const src_row = src.ptr(vlo.x, vlo.y + jj, vlo.z + kk); + auto const dst_span = cuda::std::span( + reinterpret_cast(dst), + static_cast( + cub::detail::LoadToSharedBufferSizeBytes(ext[0]))); + auto const src_span = cuda::std::span(src_row, static_cast(ext[0])); + auto const staged_span = + loader.template CopyAsync(dst_span, src_span); + amrex::ignore_unused(staged_span); + } + } + + auto token = loader.Commit(); + loader.Wait(static_cast(token)); +} +#endif + +template +void StencilFor_shared_launch (Box const& bx, + IntVect const& tile, + Gpu::StencilInfo const& info, + Array4 const& src, + F const& f, + std::size_t shared_mem_bytes) noexcept +{ + auto const counts = tile_counts(bx, tile); + int const ntile_blocks = counts[0] * counts[1] * counts[2]; + + amrex::launch_global<<>>( + [=] AMREX_GPU_DEVICE () noexcept + { + int const tile_id = static_cast(blockIdx.x); + if (tile_id >= ntile_blocks) { + return; + } + + Box const src_box = array4_box_like(src, bx); + Box const tile_box = make_stencil_tile_box(bx, tile, tile_id, counts); + Box const valid_box = amrex::grow(tile_box, info.halo) & src_box; + auto const strides = tile_view_strides(valid_box, info.shared_padding); + + Gpu::SharedMemory smem; + char* const raw_shared = smem.dataPtr(); + T* const shared_ptr = reinterpret_cast(align_shared_ptr(raw_shared)); + + if constexpr (UseTensorCopy) { +#if AMREX_USE_CUDA_BLOCK_LOAD_TO_SHARED + stencil_copy_tensor_to_shared(src, valid_box, shared_ptr, strides[0], strides[1]); +#else + stencil_copy_manual_to_shared(src, valid_box, shared_ptr, strides[0], strides[1]); + __syncthreads(); +#endif + } else { + stencil_copy_manual_to_shared(src, valid_box, shared_ptr, strides[0], strides[1]); + __syncthreads(); + } + + auto const tile_view = make_tile_view(shared_ptr, tile_box, valid_box, strides[0], strides[1], strides[2], 1); + auto const ext = tile_box.length3d(); + Long const npts = static_cast(ext[0]) * static_cast(ext[1]) * static_cast(ext[2]); + Dim3 const tlo = tile_box.smallEnd().dim3(); + + for (Long idx = static_cast(threadIdx.x); idx < npts; idx += static_cast(blockDim.x)) { + Long tmp = idx; + int const ii = static_cast(tmp % ext[0]); + tmp /= ext[0]; + int const jj = static_cast(tmp % ext[1]); + int const kk = static_cast(tmp / ext[1]); + call_stencil_body(f, tile_view, tlo.x + ii, tlo.y + jj, tlo.z + kk); + } + }); + + AMREX_GPU_ERROR_CHECK(); +} + +template +[[nodiscard]] inline bool can_use_native_tensor_copy (Box const& bx, + IntVect const& tile, + Gpu::StencilInfo const& info, + Array4 const& src, + std::size_t shared_mem_bytes) noexcept +{ + if (info.tensor_copy == Gpu::TensorCopyPolicy::Never) { + return false; + } + +#if AMREX_USE_CUDA_BLOCK_LOAD_TO_SHARED + if constexpr (MT % Gpu::Device::warp_size != 0) { + return false; + } + + if (!Gpu::detail::queryTensorCopyCaps().available) { + return false; + } + + if (!tiles_are_uniform(bx, tile)) { + return false; + } + + Box const src_box = array4_box_like(src, bx); + if (!src_box.contains(amrex::grow(bx, info.halo))) { + return false; + } + + if (shared_mem_bytes > Gpu::Device::sharedMemPerBlock()) { + return false; + } + + Box const full_stage = amrex::grow(make_stencil_tile_box(bx, tile, 0, 0, 0), info.halo); + auto const stage_lo = full_stage.smallEnd().dim3(); + T const* const stage_ptr = src.ptr(stage_lo.x, stage_lo.y, stage_lo.z); + Long const row_bytes = static_cast(full_stage.length(0)) * sizeof(T); + auto const shared_strides = tile_view_strides(full_stage, info.shared_padding); + Long const shared_stride_y_bytes = shared_strides[0] * static_cast(sizeof(T)); + + return Gpu::detail::canUseTensorCopy(stage_ptr, src_box, tile, info.halo, sizeof(T)) + && (row_bytes % static_cast(Gpu::detail::tensorCopyAlignment()) == 0) + && Gpu::detail::isTensorCopyAligned(stage_ptr) + && (shared_stride_y_bytes % static_cast(cub::detail::LoadToSharedBufferAlignBytes()) == 0) + && ((src.stride.a[0] * sizeof(T)) % static_cast(Gpu::detail::tensorCopyAlignment()) == 0) + && ((src.stride.a[1] * sizeof(T)) % static_cast(Gpu::detail::tensorCopyAlignment()) == 0); +#else + amrex::ignore_unused(bx, tile, info, src, shared_mem_bytes); + return false; +#endif +} + +#endif + +template +void StencilFor_doit (Box const& bx, + Gpu::StencilInfo const& info, + Array4 const& src, + F&& f) noexcept +{ + if (bx.isEmpty()) { + return; + } + + IntVect const tile = sanitize_stencil_tile(bx, info); + Box const src_box = array4_box_like(src, bx); + + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(src_box.contains(bx), + "StencilFor: source Array4 must contain the compute box"); + +#if defined(AMREX_USE_SYCL) + StencilFor_direct_alias_fallback(bx, tile, info, src, f); +#elif defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(MT <= Gpu::Device::maxThreadsPerBlock(), + "StencilFor: MT exceeds the device thread-block limit"); + + IntVect const shared_extents = tile + info.halo + info.halo; + std::size_t const shared_mem_bytes = shared_tile_bytes(shared_extents, info.shared_padding) + + shared_copy_alignment(); + + bool const use_native_tensor_copy = can_use_native_tensor_copy(bx, tile, info, src, shared_mem_bytes); + + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(info.tensor_copy != Gpu::TensorCopyPolicy::Always || use_native_tensor_copy, + "StencilFor: TensorCopyPolicy::Always requires the native tensor-copy path"); + + if (shared_mem_bytes > Gpu::Device::sharedMemPerBlock()) { + StencilFor_direct_alias_fallback(bx, tile, info, src, f); + return; + } + +#if AMREX_USE_CUDA_BLOCK_LOAD_TO_SHARED + if constexpr (MT % Gpu::Device::warp_size == 0) { + if (use_native_tensor_copy) { + StencilFor_shared_launch(bx, tile, info, src, f, shared_mem_bytes); + return; + } + } +#endif + { + StencilFor_shared_launch(bx, tile, info, src, f, shared_mem_bytes); + } +#else + StencilFor_direct_alias_fallback(bx, tile, info, src, f); +#endif +} + +} // namespace amrex::detail + +#endif +#endif diff --git a/Src/Base/CMakeLists.txt b/Src/Base/CMakeLists.txt index 04915b21611..76552a7b9b5 100644 --- a/Src/Base/CMakeLists.txt +++ b/Src/Base/CMakeLists.txt @@ -239,8 +239,12 @@ foreach(D IN LISTS AMReX_SPACEDIM) AMReX_GpuMemory.H AMReX_GpuRange.H AMReX_GpuReduce.H + AMReX_GpuTensorCopy.H AMReX_GpuAllocators.H AMReX_GpuContainers.H + AMReX_StencilFor.H + AMReX_StencilForC.H + AMReX_StencilForG.H AMReX_MFParallelFor.H AMReX_MFParallelForC.H AMReX_MFParallelForG.H diff --git a/Src/Base/Make.package b/Src/Base/Make.package index d22a6cc824d..71f40ed9a94 100644 --- a/Src/Base/Make.package +++ b/Src/Base/Make.package @@ -106,10 +106,12 @@ C$(AMREX_BASE)_headers += AMReX_GpuElixir.H C$(AMREX_BASE)_sources += AMReX_GpuElixir.cpp C$(AMREX_BASE)_headers += AMReX_GpuReduce.H +C$(AMREX_BASE)_headers += AMReX_GpuTensorCopy.H C$(AMREX_BASE)_headers += AMReX_CudaGraph.H AMReX_GpuContainers.H C$(AMREX_BASE)_headers += AMReX_GpuAllocators.H +C$(AMREX_BASE)_headers += AMReX_StencilFor.H AMReX_StencilForC.H AMReX_StencilForG.H C$(AMREX_BASE)_headers += AMReX_MFParallelFor.H C$(AMREX_BASE)_headers += AMReX_MFParallelForC.H diff --git a/Tests/CMakeLists.txt b/Tests/CMakeLists.txt index 975b575c951..684536c1514 100644 --- a/Tests/CMakeLists.txt +++ b/Tests/CMakeLists.txt @@ -160,6 +160,7 @@ else() if (AMReX_GPU_BACKEND STREQUAL NONE) list(APPEND AMREX_TESTS_SUBDIRS OpenMP) + list(APPEND AMREX_TESTS_SUBDIRS GPU/TensorCopyStencil) endif () list(TRANSFORM AMREX_TESTS_SUBDIRS PREPEND "${CMAKE_CURRENT_LIST_DIR}/") diff --git a/Tests/GPU/TensorCopyStencil/CMakeLists.txt b/Tests/GPU/TensorCopyStencil/CMakeLists.txt new file mode 100644 index 00000000000..f47afd7c7ad --- /dev/null +++ b/Tests/GPU/TensorCopyStencil/CMakeLists.txt @@ -0,0 +1,9 @@ +foreach(D IN LISTS AMReX_SPACEDIM) + set(_sources main.cpp) + set(_input_files) + + setup_test(${D} _sources _input_files) + + unset(_sources) + unset(_input_files) +endforeach() diff --git a/Tests/GPU/TensorCopyStencil/GNUmakefile b/Tests/GPU/TensorCopyStencil/GNUmakefile new file mode 100644 index 00000000000..7fa41462ea2 --- /dev/null +++ b/Tests/GPU/TensorCopyStencil/GNUmakefile @@ -0,0 +1,23 @@ +AMREX_HOME = ../../../ + +DEBUG = FALSE + +DIM = 3 + +COMP = gcc + +USE_CUDA = FALSE +USE_ACC = FALSE +USE_OMP_OFFLOAD = FALSE + +USE_MPI = FALSE +USE_OMP = FALSE + +TINY_PROFILE = FALSE + +include $(AMREX_HOME)/Tools/GNUMake/Make.defs + +include ./Make.package +include $(AMREX_HOME)/Src/Base/Make.package + +include $(AMREX_HOME)/Tools/GNUMake/Make.rules diff --git a/Tests/GPU/TensorCopyStencil/Make.package b/Tests/GPU/TensorCopyStencil/Make.package new file mode 100644 index 00000000000..6b4b865e8fc --- /dev/null +++ b/Tests/GPU/TensorCopyStencil/Make.package @@ -0,0 +1 @@ +CEXE_sources += main.cpp diff --git a/Tests/GPU/TensorCopyStencil/main.cpp b/Tests/GPU/TensorCopyStencil/main.cpp new file mode 100644 index 00000000000..5cb91a117a8 --- /dev/null +++ b/Tests/GPU/TensorCopyStencil/main.cpp @@ -0,0 +1,173 @@ +#include +#include +#include +#include +#include +#include + +#include + +using namespace amrex; + +namespace { + +[[nodiscard]] Box make_domain (int nx, int ny = 1, int nz = 1) noexcept +{ + return Box(IntVect(0), + IntVect(AMREX_D_DECL(nx-1, ny-1, nz-1))); +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Real source_value (int i, int j, int k) noexcept +{ + return Real(0.125)*(i*i + 3*j + 5*k*k) + + Real(0.5)*(2*i - j + 4*k) + + Real(1.0); +} + +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Real stencil_sum (Arr const& arr, IntVect const& halo, int i, int j, int k) noexcept +{ + Dim3 const h = halo.dim3(0); + Real sum = Real(0.0); + + for (int kk = -h.z; kk <= h.z; ++kk) { + for (int jj = -h.y; jj <= h.y; ++jj) { + for (int ii = -h.x; ii <= h.x; ++ii) { + int const ai = (ii < 0) ? -ii : ii; + int const aj = (jj < 0) ? -jj : jj; + int const ak = (kk < 0) ? -kk : kk; + Real const weight = Real(1.0) + / (Real(1.0) + Real(ai) + Real(2*aj) + Real(3*ak)); + sum += weight * arr(i+ii, j+jj, k+kk); + } + } + } + + return sum; +} + +template +void check_case (Box const& bx, + IntVect const& tile, + IntVect const& halo, + IntVect const& shared_padding, + Gpu::TensorCopyPolicy policy, + char const* label) +{ + Box const src_box = amrex::grow(bx, halo); + FArrayBox src(src_box, 1); + FArrayBox baseline(bx, 1); + FArrayBox result(bx, 1); + + auto const src_arr = src.array(); + ParallelFor(src_box, + [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + src_arr(i, j, k) = source_value(i, j, k); + }); + + auto const src_const = src.const_array(); + auto const baseline_arr = baseline.array(); + ParallelFor(bx, + [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + baseline_arr(i, j, k) = stencil_sum(src_const, halo, i, j, k); + }); + + auto info = Gpu::StencilInfo{} + .setTile(tile) + .setHalo(halo) + .setSharedPadding(shared_padding) + .setTensorCopyPolicy(policy); + + auto const result_arr = result.array(); + StencilFor(bx, info, src.const_array(), + [=] AMREX_GPU_DEVICE (auto const& tile_view, int i, int j, int k) noexcept + { + result_arr(i, j, k) = stencil_sum(tile_view, halo, i, j, k); + }); + + Gpu::streamSynchronize(); + + Long const npts = bx.numPts(); + Vector h_baseline(npts); + Vector h_result(npts); + Gpu::copyAsync(Gpu::deviceToHost, baseline.dataPtr(), baseline.dataPtr()+npts, h_baseline.begin()); + Gpu::copyAsync(Gpu::deviceToHost, result.dataPtr(), result.dataPtr()+npts, h_result.begin()); + Gpu::streamSynchronize(); + + for (Long i = 0; i < npts; ++i) { + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(std::abs(h_baseline[i] - h_result[i]) < Real(1.e-12), + label); + } + + amrex::Print() << "TensorCopyStencil: " << label << " passed\n"; +} + +void run_tests () +{ + check_case(make_domain(AMREX_D_DECL(24, 10, 8)), + IntVect(AMREX_D_DECL(8, 5, 4)), + IntVect(1), + IntVect(0), + Gpu::TensorCopyPolicy::Auto, + "uniform-radius1-auto"); + + if (Gpu::supportsTensorCopy()) { + check_case(make_domain(AMREX_D_DECL(24, 10, 8)), + IntVect(AMREX_D_DECL(8, 5, 4)), + IntVect(1), + IntVect(0), + Gpu::TensorCopyPolicy::Always, + "uniform-radius1-always"); + } + + check_case(make_domain(AMREX_D_DECL(24, 10, 8)), + IntVect(AMREX_D_DECL(8, 5, 4)), + IntVect(2), + IntVect(0), + Gpu::TensorCopyPolicy::Never, + "uniform-radius2-never"); + + check_case(make_domain(AMREX_D_DECL(29, 19, 13)), + IntVect(AMREX_D_DECL(7, 6, 4)), + IntVect(4), + IntVect(0), + Gpu::TensorCopyPolicy::Auto, + "partial-radius4-auto"); + + check_case<100>(make_domain(AMREX_D_DECL(24, 10, 8)), + IntVect(AMREX_D_DECL(8, 5, 4)), + IntVect(1), + IntVect(0), + Gpu::TensorCopyPolicy::Auto, + "uniform-radius1-auto-mt100"); + + check_case(make_domain(AMREX_D_DECL(24, 10, 8)), + IntVect(AMREX_D_DECL(8, 5, 4)), + IntVect(1), + IntVect(AMREX_D_DECL(1, 0, 0)), + Gpu::TensorCopyPolicy::Auto, + "uniform-radius1-auto-paddingx1"); + + check_case(make_domain(AMREX_D_DECL(29, 19, 13)), + IntVect(AMREX_D_DECL(7, 6, 4)), + IntVect(AMREX_D_DECL(5, 2, 1)), + IntVect(0), + Gpu::TensorCopyPolicy::Never, + "partial-anisotropic-never"); +} + +} // namespace + +int main (int argc, char* argv[]) +{ + amrex::Initialize(argc, argv); + { + run_tests(); + } + amrex::Finalize(); + return 0; +}