Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
56 commits
Select commit Hold shift + click to select a range
406c6c4
Add amrex::Launch
AlexanderSinn Jan 13, 2026
2ac0e24
add CUDA and CPU versions
AlexanderSinn Jan 15, 2026
12be033
add includes
AlexanderSinn Jan 15, 2026
9912b9b
Fix circular include
AlexanderSinn Jan 19, 2026
69b214a
Merge branch 'AMReX-Codes:development' into Add_amrex__Launch
AlexanderSinn Jan 19, 2026
288428f
fix index calculation
AlexanderSinn Jan 19, 2026
494a26f
Merge branch 'AMReX-Codes:development' into Add_amrex__Launch
AlexanderSinn Jan 22, 2026
418ba21
add function to get the runtime value of blockDim.x
AlexanderSinn Jan 28, 2026
1897d9c
Merge branch 'AMReX-Codes:development' into Add_amrex__Launch
AlexanderSinn Feb 4, 2026
87d2dcf
fix
AlexanderSinn Feb 4, 2026
2636e19
add template for SYCL
AlexanderSinn Feb 4, 2026
536707f
remove blockDim1Drt
AlexanderSinn Feb 9, 2026
f827c80
Merge branch 'AMReX-Codes:development' into Add_amrex__Launch
AlexanderSinn Feb 9, 2026
e1facb8
convert some functions to use LaunchRaw
AlexanderSinn Feb 9, 2026
1a0897b
Use LaunchRaw in more functions
AlexanderSinn Feb 10, 2026
4956fde
fix
AlexanderSinn Feb 10, 2026
19c909a
fix sycl
AlexanderSinn Feb 10, 2026
35f3ea2
Merge branch 'development' into Add_amrex__Launch
AlexanderSinn Mar 10, 2026
c0a70a9
Add AMREX_IF_SYCL and nodiscard
AlexanderSinn Mar 10, 2026
232988f
fix nodiscard
AlexanderSinn Mar 10, 2026
c392c70
fix nodiscard 2
AlexanderSinn Mar 10, 2026
62565bb
try fix void return issue
AlexanderSinn Mar 10, 2026
5c59a30
remove nodiscard
AlexanderSinn Mar 10, 2026
6161d90
try fix cuda first capture issue
AlexanderSinn Mar 10, 2026
ec26b0b
add documentation
AlexanderSinn Mar 10, 2026
cf44395
start to add test
AlexanderSinn Mar 10, 2026
09d75ee
update test and see if it is compiled
AlexanderSinn Mar 11, 2026
76a8401
move test dir
AlexanderSinn Mar 11, 2026
9df5358
fix test
AlexanderSinn Mar 11, 2026
8a78b4f
add static cast
AlexanderSinn Mar 11, 2026
f29aef1
add 1d and 2d tests
AlexanderSinn Mar 11, 2026
28b3949
fix constructor
AlexanderSinn Mar 11, 2026
aa9bbea
fix constructor 2
AlexanderSinn Mar 11, 2026
4bb24ae
test
AlexanderSinn Mar 11, 2026
8bcefc0
test2
AlexanderSinn Mar 11, 2026
cb9a3fd
fix
AlexanderSinn Mar 11, 2026
d4566ec
remove static
AlexanderSinn Mar 11, 2026
13b8a42
fix27
AlexanderSinn Mar 11, 2026
528d8dc
try fix MSVC
AlexanderSinn Mar 12, 2026
e13206d
try fix MSVC 2
AlexanderSinn Mar 12, 2026
72e5d88
try fix clang
AlexanderSinn Mar 12, 2026
0c60372
relax dim constraint for threadIdxND
AlexanderSinn Mar 12, 2026
56bfc6c
Merge branch 'AMReX-Codes:development' into Add_amrex__Launch
AlexanderSinn Mar 14, 2026
a9f0cfc
Fix 3D threadIdxND, nd_item* and try guess fix for reqd_work_group_size
AlexanderSinn Mar 14, 2026
b690384
Add some suggestions from review
AlexanderSinn Mar 14, 2026
1f33724
typo
AlexanderSinn Mar 14, 2026
155dd6f
Add include and use BoxIndexer for SYCL
AlexanderSinn Mar 14, 2026
a3dfac3
fix
AlexanderSinn Mar 14, 2026
d5e6840
Merge branch 'AMReX-Codes:development' into Add_amrex__Launch
AlexanderSinn Mar 16, 2026
3f2a3f8
Add missing include to Arena.H
AlexanderSinn Mar 16, 2026
cbad36a
move definition of gpuStream_t
AlexanderSinn Mar 16, 2026
93dd7b2
fix
AlexanderSinn Mar 16, 2026
b25c2a1
Update AMREX_HOME
AlexanderSinn Mar 17, 2026
67688d3
Merge branch 'AMReX-Codes:development' into Add_amrex__Launch
AlexanderSinn Mar 20, 2026
b1d3184
Merge branch 'AMReX-Codes:development' into Add_amrex__Launch
AlexanderSinn Apr 2, 2026
5184333
Merge branch 'development' into Add_amrex__Launch
AlexanderSinn Apr 14, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Docs/Doxygen/groups.dox
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,7 @@
* - \ref amrex::ParallelFor
* - \ref amrex::ParallelForOMP
* - \ref amrex::ParallelForRNG
* - \ref amrex::LaunchRaw
*/

/**
Expand Down
84 changes: 20 additions & 64 deletions Src/AmrCore/AMReX_TagBox.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -446,46 +446,27 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
int* ntags = dv_ntags.data() + blockoffset[li];
const int ncells = fai.fabbox().numPts();
const char* tags = (*this)[fai].dataPtr();
#ifdef AMREX_USE_SYCL
amrex::launch<block_size>(nblocks[li], sizeof(int)*Gpu::Device::warp_size,
Gpu::Device::gpuStream(),
[=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept
{
int bid = h.item->get_group_linear_id();
int tid = h.item->get_local_id(0);
int icell = h.item->get_global_id(0);

int t = 0;
if (icell < ncells && tags[icell] != TagBox::CLEAR) {
t = 1;
}

t = Gpu::blockReduce<Gpu::Device::warp_size>
(t, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, h);
if (tid == 0) {
ntags[bid] = t;
}
});
#else
amrex::launch<block_size>(nblocks[li], Gpu::Device::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept
amrex::LaunchRaw<block_size, int>(amrex::IntVectND<1>{nblocks[li]},
AMREX_IF_SYCL(Gpu::Device::warp_size) AMREX_IF_NOT_SYCL(0),
[=] AMREX_GPU_DEVICE (auto lh) noexcept
{
int bid = blockIdx.x;
int tid = threadIdx.x;
int icell = block_size*blockIdx.x+threadIdx.x;
int bid = lh.blockIdx1D();
int tid = lh.threadIdx1D();
int icell = lh.globalIdx1D();

int t = 0;
if (icell < ncells && tags[icell] != TagBox::CLEAR) {
t = 1;
}

t = Gpu::blockReduce<Gpu::Device::warp_size>
(t, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
(t, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0
AMREX_IF_SYCL(, lh.handler()));
if (tid == 0) {
ntags[bid] = t;
}
});
#endif
}

Gpu::PinnedVector<int> hv_ntags(ntotblocks);
Expand Down Expand Up @@ -524,51 +505,27 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
const auto lenx = len.x;
const int ncells = bx.numPts();
const char* tags = (*this)[fai].dataPtr();
#ifdef AMREX_USE_SYCL
amrex::launch<block_size>(nblocks[li], sizeof(unsigned int), Gpu::Device::gpuStream(),
[=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept
amrex::LaunchRaw<block_size, unsigned int>(amrex::IntVectND<1>{nblocks[li]}, 1,
[=] AMREX_GPU_DEVICE (auto lh) noexcept
{
int bid = h.item->get_group(0);
int tid = h.item->get_local_id(0);
int icell = h.item->get_global_id(0);
int bid = lh.blockIdx1D();
int tid = lh.threadIdx1D();
int icell = lh.globalIdx1D();

unsigned int* shared_counter = (unsigned int*)h.local;
unsigned int * shared_counter = lh.shared_memory();
if (tid == 0) {
*shared_counter = 0;
}
h.item->barrier(sycl::access::fence_space::local_space);
lh.syncthreads();

if (icell < ncells && tags[icell] != TagBox::CLEAR) {
unsigned int itag = Gpu::Atomic::Add<unsigned int,
sycl::access::address_space::local_space>
(shared_counter, 1u);
IntVect* p = dp_tags + dp_tags_offset[iblock_begin+bid];
int k = icell / lenxy;
int j = (icell - k*lenxy) / lenx;
int i = (icell - k*lenxy) - j*lenx;
i += lo.x;
j += lo.y;
k += lo.z;
p[itag] = IntVect(AMREX_D_DECL(i,j,k));
}
});
#else
amrex::launch<block_size>(nblocks[li], sizeof(unsigned int), Gpu::Device::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept
{
int bid = blockIdx.x;
int tid = threadIdx.x;
int icell = block_size*blockIdx.x+threadIdx.x;

Gpu::SharedMemory<unsigned int> gsm;
unsigned int * shared_counter = gsm.dataPtr();
if (tid == 0) {
*shared_counter = 0;
}
__syncthreads();
unsigned int itag = Gpu::Atomic::Add
#ifdef AMREX_USE_SYCL
<unsigned int, sycl::access::address_space::local_space>
#endif
(shared_counter, 1u);

if (icell < ncells && tags[icell] != TagBox::CLEAR) {
unsigned int itag = Gpu::Atomic::Add(shared_counter, 1u);
IntVect* p = dp_tags + dp_tags_offset[iblock_begin+bid];
int k = icell / lenxy;
int j = (icell - k*lenxy) / lenx;
Expand All @@ -579,7 +536,6 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
p[itag] = IntVect(AMREX_D_DECL(i,j,k));
}
});
#endif
}
}

Expand Down
2 changes: 1 addition & 1 deletion Src/Base/AMReX_Arena.H
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#include <AMReX_BLassert.H>
#include <AMReX_INT.H>
#ifdef AMREX_USE_GPU
#include <AMReX_GpuControl.H>
#include <AMReX_GpuTypes.H>
#endif

#ifdef AMREX_TINY_PROFILING
Expand Down
48 changes: 13 additions & 35 deletions Src/Base/AMReX_BaseFabUtility.H
Original file line number Diff line number Diff line change
Expand Up @@ -42,49 +42,27 @@ void fill (BaseFab<STRUCT>& aos_fab, F const& f)
std::uint64_t nblocks_long = (ntotcells+nthreads_per_block-1)/nthreads_per_block;
AMREX_ASSERT(nblocks_long <= std::uint64_t(std::numeric_limits<int>::max()));
auto nblocks = int(nblocks_long);
std::size_t shared_mem_bytes = nthreads_per_block * sizeof(STRUCT);
std::size_t shared_mem_elem = nthreads_per_block * STRUCTSIZE;
T* p = (T*)aos_fab.dataPtr();
#ifdef AMREX_USE_SYCL
amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept
amrex::LaunchRaw<nthreads_per_block, T>(amrex::IntVectND<1>{nblocks}, shared_mem_elem,
[=] AMREX_GPU_DEVICE (auto lh) noexcept
{
auto const icell = std::uint64_t(handler.globalIdx());
std::uint64_t const blockDimx = handler.blockDim();
std::uint64_t const threadIdxx = handler.threadIdx();
std::uint64_t const blockIdxx = handler.blockIdx();
auto const shared = (T*)handler.sharedMemory();
std::uint64_t const icell =
std::uint64_t(lh.blockDim1D())*lh.blockIdx1D()+lh.threadIdx1D();
T* const shared = lh.shared_memory();
if (icell < indexer.numPts()) {
auto ga = new(shared+threadIdxx*STRUCTSIZE) STRUCT;
auto ga = new(shared+std::uint64_t(lh.threadIdx1D())*STRUCTSIZE) STRUCT;
auto [i, j, k] = indexer(icell);
f(*ga, i, j, k);
}
handler.sharedBarrier();
for (std::uint64_t m = threadIdxx,
mend = amrex::min<std::uint64_t>(blockDimx, indexer.numPts()-blockDimx*blockIdxx) * STRUCTSIZE;
m < mend; m += blockDimx) {
p[blockDimx*blockIdxx*STRUCTSIZE+m] = shared[m];
lh.syncthreads();
for (std::uint64_t m = lh.threadIdx1D(),
mend = amrex::min<std::uint64_t>(lh.blockDim1D(),
indexer.numPts()-std::uint64_t(lh.blockDim1D())*lh.blockIdx1D()) * STRUCTSIZE;
m < mend; m += lh.blockDim1D()) {
p[std::uint64_t(lh.blockDim1D())*lh.blockIdx1D()*STRUCTSIZE+m] = shared[m];
}
});
#else
amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept
{
std::uint64_t const icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x;
Gpu::SharedMemory<T> gsm;
T* const shared = gsm.dataPtr();
if (icell < indexer.numPts()) {
auto ga = new(shared+std::uint64_t(threadIdx.x)*STRUCTSIZE) STRUCT;
auto [i, j, k] = indexer(icell);
f(*ga, i, j, k);
}
__syncthreads();
for (std::uint64_t m = threadIdx.x,
mend = amrex::min<std::uint64_t>(blockDim.x, indexer.numPts()-std::uint64_t(blockDim.x)*blockIdx.x) * STRUCTSIZE;
m < mend; m += blockDim.x) {
p[std::uint64_t(blockDim.x)*blockIdx.x*STRUCTSIZE+m] = shared[m];
}
});
#endif
} else
#endif
{
Expand Down
27 changes: 7 additions & 20 deletions Src/Base/AMReX_FBI.H
Original file line number Diff line number Diff line change
Expand Up @@ -295,25 +295,16 @@ void deterministic_fab_to_fab (Vector<Array4CopyTag<T0,T1>> const& a_tags, int s
auto const* pntags = d_ntags.data();
auto const nblocks = int(h_ntags.size()-1);
constexpr auto nthreads = 256;
amrex::launch<nthreads>(nblocks, Gpu::gpuStream(),
#ifdef AMREX_USE_SYCL
[=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept
[[sycl::reqd_work_group_size(nthreads)]]
#else
[=] AMREX_GPU_DEVICE () noexcept
#endif
{
#ifdef AMREX_USE_SYCL
Dim1 blockIdx{item.get_group_linear_id()};
Dim1 threadIdx{item.get_local_linear_id()};
#endif

for (unsigned int itag = pntags[blockIdx.x]; itag < pntags[blockIdx.x+1]; ++itag) {
amrex::LaunchRaw<nthreads>(amrex::IntVectND<1>{nblocks},
[=] AMREX_GPU_DEVICE (auto lh) noexcept
{
for (unsigned int itag = pntags[lh.blockIdx1D()]; itag < pntags[lh.blockIdx1D()+1]; ++itag) {
auto const tag = ptag[itag];
auto ncells = int(tag.dbox.numPts());
const auto len = amrex::length(tag.dbox);
const auto lo = amrex::lbound(tag.dbox);
for (int icell = int(threadIdx.x); icell < ncells; icell += nthreads) {
for (int icell = int(lh.threadIdx1D()); icell < ncells; icell += nthreads) {
int k = icell / (len.x*len.y);
int j = (icell - k*(len.x*len.y)) / len.x;
int i = (icell - k*(len.x*len.y)) - j*len.x;
Expand All @@ -328,12 +319,8 @@ void deterministic_fab_to_fab (Vector<Array4CopyTag<T0,T1>> const& a_tags, int s
}
}

if (itag+1 < pntags[blockIdx.x+1]) {
#ifdef AMREX_USE_SYCL
sycl::group_barrier(item.get_group());
#else
__syncthreads();
#endif
if (itag+1 < pntags[lh.blockIdx1D()+1]) {
lh.syncthreads();
}
}
});
Expand Down
40 changes: 8 additions & 32 deletions Src/Base/AMReX_GpuContainers.H
Original file line number Diff line number Diff line change
Expand Up @@ -448,51 +448,27 @@ namespace amrex::Gpu {
auto pu = reinterpret_cast<U*>(p);
constexpr int nthreads_per_block = (sizeof(T) <= 64) ? 256 : 128;
int nblocks = static_cast<int>((N+nthreads_per_block-1)/nthreads_per_block);
std::size_t shared_mem_bytes = nthreads_per_block * sizeof(T);
#ifdef AMREX_USE_SYCL
amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept
std::size_t shared_mem_Uelem = nthreads_per_block * nU;
amrex::LaunchRaw<nthreads_per_block, U>(amrex::IntVectND<1>{nblocks}, shared_mem_Uelem,
[=] AMREX_GPU_DEVICE (auto lh) noexcept
{
Long i = handler.globalIdx();
Long blockDimx = handler.blockDim();
Long threadIdxx = handler.threadIdx();
Long blockIdxx = handler.blockIdx();
auto const shared_U = (U*)handler.sharedMemory();
auto const shared_T = (T*)shared_U;
if (i < N) {
auto ga = new(shared_T+threadIdxx) T;
f(*ga, i);
}
handler.sharedBarrier();
for (Long m = threadIdxx,
mend = nU * amrex::min(blockDimx, N-blockDimx*blockIdxx);
m < mend; m += blockDimx) {
pu[blockDimx*blockIdxx*nU+m] = shared_U[m];
}
});
#else
amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept
{
Long blockDimx = blockDim.x;
Long threadIdxx = threadIdx.x;
Long blockIdxx = blockIdx.x;
Long blockDimx = lh.blockDim1D();
Long threadIdxx = lh.threadIdx1D();
Long blockIdxx = lh.blockIdx1D();
Long i = blockDimx*blockIdxx + threadIdxx;
Gpu::SharedMemory<U> gsm;
auto const shared_U = gsm.dataPtr();
auto const shared_U = lh.shared_memory();
auto const shared_T = (T*)shared_U;
if (i < N) {
auto ga = new(shared_T+threadIdxx) T;
f(*ga, i);
}
__syncthreads();
lh.syncthreads();
for (Long m = threadIdxx,
mend = nU * amrex::min(blockDimx, N-blockDimx*blockIdxx);
m < mend; m += blockDimx) {
pu[blockDimx*blockIdxx*nU+m] = shared_U[m];
}
});
#endif
}
#endif
}
Expand Down
28 changes: 17 additions & 11 deletions Src/Base/AMReX_GpuControl.H
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
#include <AMReX_Config.H>

#include <AMReX_GpuQualifiers.H>
#include <AMReX_GpuTypes.H>

#include <utility>

Expand Down Expand Up @@ -33,12 +32,28 @@
#define AMREX_HIP_OR_CUDA_OR_SYCL(a,b,c) ((void)0);
#endif

#if defined(AMREX_USE_HIP) || defined(AMREX_USE_CUDA)
#define AMREX_HIP_CUDA_OR_SYCL_OR_CPU(a,b,c) a
#elif defined(AMREX_USE_SYCL)
#define AMREX_HIP_CUDA_OR_SYCL_OR_CPU(a,b,c) b
#else
#define AMREX_HIP_CUDA_OR_SYCL_OR_CPU(a,b,c) c
#endif

#ifdef AMREX_USE_GPU
#define AMREX_GPU_OR_CPU(a,b) a
#else
#define AMREX_GPU_OR_CPU(a,b) b
#endif

#ifdef AMREX_USE_SYCL
#define AMREX_IF_SYCL(...) __VA_ARGS__
#define AMREX_IF_NOT_SYCL(...)
#else
#define AMREX_IF_SYCL(...)
#define AMREX_IF_NOT_SYCL(...) __VA_ARGS__
#endif

#ifdef AMREX_USE_SYCL
#define AMREX_SYCL_ONLY(a) a
#else
Expand Down Expand Up @@ -75,15 +90,7 @@ namespace amrex {
#define AMREX_DEFAULT_RUNON =amrex::RunOn::Host // by default run on Host when compiling for Cpu
#endif

namespace amrex { // NOLINT(modernize-concat-nested-namespaces)

#ifdef AMREX_USE_HIP
using gpuStream_t = hipStream_t;
#elif defined(AMREX_USE_CUDA)
using gpuStream_t = cudaStream_t;
#endif

namespace Gpu {
namespace amrex::Gpu {

#if defined(AMREX_USE_GPU)

Expand Down Expand Up @@ -225,7 +232,6 @@ namespace Gpu {

#endif

}
}

#endif
Loading
Loading