diff --git a/Docs/Doxygen/groups.dox b/Docs/Doxygen/groups.dox index c2d946159f..1265c9a505 100644 --- a/Docs/Doxygen/groups.dox +++ b/Docs/Doxygen/groups.dox @@ -187,6 +187,7 @@ * - \ref amrex::ParallelFor * - \ref amrex::ParallelForOMP * - \ref amrex::ParallelForRNG + * - \ref amrex::LaunchRaw */ /** diff --git a/Src/AmrCore/AMReX_TagBox.cpp b/Src/AmrCore/AMReX_TagBox.cpp index 3e8f4eb560..62e6d05149 100644 --- a/Src/AmrCore/AMReX_TagBox.cpp +++ b/Src/AmrCore/AMReX_TagBox.cpp @@ -446,33 +446,14 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector& v) const int* ntags = dv_ntags.data() + blockoffset[li]; const int ncells = fai.fabbox().numPts(); const char* tags = (*this)[fai].dataPtr(); -#ifdef AMREX_USE_SYCL - amrex::launch(nblocks[li], sizeof(int)*Gpu::Device::warp_size, - Gpu::Device::gpuStream(), - [=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept - { - int bid = h.item->get_group_linear_id(); - int tid = h.item->get_local_id(0); - int icell = h.item->get_global_id(0); - int t = 0; - if (icell < ncells && tags[icell] != TagBox::CLEAR) { - t = 1; - } - - t = Gpu::blockReduce - (t, Gpu::warpReduce >(), 0, h); - if (tid == 0) { - ntags[bid] = t; - } - }); -#else - amrex::launch(nblocks[li], Gpu::Device::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept + amrex::LaunchRaw(amrex::IntVectND<1>{nblocks[li]}, + AMREX_IF_SYCL(Gpu::Device::warp_size) AMREX_IF_NOT_SYCL(0), + [=] AMREX_GPU_DEVICE (auto lh) noexcept { - int bid = blockIdx.x; - int tid = threadIdx.x; - int icell = block_size*blockIdx.x+threadIdx.x; + int bid = lh.blockIdx1D(); + int tid = lh.threadIdx1D(); + int icell = lh.globalIdx1D(); int t = 0; if (icell < ncells && tags[icell] != TagBox::CLEAR) { @@ -480,12 +461,12 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector& v) const } t = Gpu::blockReduce - (t, Gpu::warpReduce >(), 0); + (t, Gpu::warpReduce >(), 0 + AMREX_IF_SYCL(, lh.handler())); if (tid == 0) { ntags[bid] = t; } }); -#endif } Gpu::PinnedVector hv_ntags(ntotblocks); @@ -524,51 +505,27 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector& v) const const auto lenx = len.x; const int ncells = bx.numPts(); const char* tags = (*this)[fai].dataPtr(); -#ifdef AMREX_USE_SYCL - amrex::launch(nblocks[li], sizeof(unsigned int), Gpu::Device::gpuStream(), - [=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept + amrex::LaunchRaw(amrex::IntVectND<1>{nblocks[li]}, 1, + [=] AMREX_GPU_DEVICE (auto lh) noexcept { - int bid = h.item->get_group(0); - int tid = h.item->get_local_id(0); - int icell = h.item->get_global_id(0); + int bid = lh.blockIdx1D(); + int tid = lh.threadIdx1D(); + int icell = lh.globalIdx1D(); - unsigned int* shared_counter = (unsigned int*)h.local; + unsigned int * shared_counter = lh.shared_memory(); if (tid == 0) { *shared_counter = 0; } - h.item->barrier(sycl::access::fence_space::local_space); + lh.syncthreads(); if (icell < ncells && tags[icell] != TagBox::CLEAR) { - unsigned int itag = Gpu::Atomic::Add - (shared_counter, 1u); - IntVect* p = dp_tags + dp_tags_offset[iblock_begin+bid]; - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; - p[itag] = IntVect(AMREX_D_DECL(i,j,k)); - } - }); -#else - amrex::launch(nblocks[li], sizeof(unsigned int), Gpu::Device::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept - { - int bid = blockIdx.x; - int tid = threadIdx.x; - int icell = block_size*blockIdx.x+threadIdx.x; - Gpu::SharedMemory gsm; - unsigned int * shared_counter = gsm.dataPtr(); - if (tid == 0) { - *shared_counter = 0; - } - __syncthreads(); + unsigned int itag = Gpu::Atomic::Add +#ifdef AMREX_USE_SYCL + +#endif + (shared_counter, 1u); - if (icell < ncells && tags[icell] != TagBox::CLEAR) { - unsigned int itag = Gpu::Atomic::Add(shared_counter, 1u); IntVect* p = dp_tags + dp_tags_offset[iblock_begin+bid]; int k = icell / lenxy; int j = (icell - k*lenxy) / lenx; @@ -579,7 +536,6 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector& v) const p[itag] = IntVect(AMREX_D_DECL(i,j,k)); } }); -#endif } } diff --git a/Src/Base/AMReX_Arena.H b/Src/Base/AMReX_Arena.H index c8674aa004..41328f4982 100644 --- a/Src/Base/AMReX_Arena.H +++ b/Src/Base/AMReX_Arena.H @@ -5,7 +5,7 @@ #include #include #ifdef AMREX_USE_GPU -#include +#include #endif #ifdef AMREX_TINY_PROFILING diff --git a/Src/Base/AMReX_BaseFabUtility.H b/Src/Base/AMReX_BaseFabUtility.H index d0c1e78847..c6c125db5b 100644 --- a/Src/Base/AMReX_BaseFabUtility.H +++ b/Src/Base/AMReX_BaseFabUtility.H @@ -42,49 +42,27 @@ void fill (BaseFab& aos_fab, F const& f) std::uint64_t nblocks_long = (ntotcells+nthreads_per_block-1)/nthreads_per_block; AMREX_ASSERT(nblocks_long <= std::uint64_t(std::numeric_limits::max())); auto nblocks = int(nblocks_long); - std::size_t shared_mem_bytes = nthreads_per_block * sizeof(STRUCT); + std::size_t shared_mem_elem = nthreads_per_block * STRUCTSIZE; T* p = (T*)aos_fab.dataPtr(); -#ifdef AMREX_USE_SYCL - amrex::launch(nblocks, shared_mem_bytes, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept + amrex::LaunchRaw(amrex::IntVectND<1>{nblocks}, shared_mem_elem, + [=] AMREX_GPU_DEVICE (auto lh) noexcept { - auto const icell = std::uint64_t(handler.globalIdx()); - std::uint64_t const blockDimx = handler.blockDim(); - std::uint64_t const threadIdxx = handler.threadIdx(); - std::uint64_t const blockIdxx = handler.blockIdx(); - auto const shared = (T*)handler.sharedMemory(); + std::uint64_t const icell = + std::uint64_t(lh.blockDim1D())*lh.blockIdx1D()+lh.threadIdx1D(); + T* const shared = lh.shared_memory(); if (icell < indexer.numPts()) { - auto ga = new(shared+threadIdxx*STRUCTSIZE) STRUCT; + auto ga = new(shared+std::uint64_t(lh.threadIdx1D())*STRUCTSIZE) STRUCT; auto [i, j, k] = indexer(icell); f(*ga, i, j, k); } - handler.sharedBarrier(); - for (std::uint64_t m = threadIdxx, - mend = amrex::min(blockDimx, indexer.numPts()-blockDimx*blockIdxx) * STRUCTSIZE; - m < mend; m += blockDimx) { - p[blockDimx*blockIdxx*STRUCTSIZE+m] = shared[m]; + lh.syncthreads(); + for (std::uint64_t m = lh.threadIdx1D(), + mend = amrex::min(lh.blockDim1D(), + indexer.numPts()-std::uint64_t(lh.blockDim1D())*lh.blockIdx1D()) * STRUCTSIZE; + m < mend; m += lh.blockDim1D()) { + p[std::uint64_t(lh.blockDim1D())*lh.blockIdx1D()*STRUCTSIZE+m] = shared[m]; } }); -#else - amrex::launch(nblocks, shared_mem_bytes, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept - { - std::uint64_t const icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x; - Gpu::SharedMemory gsm; - T* const shared = gsm.dataPtr(); - if (icell < indexer.numPts()) { - auto ga = new(shared+std::uint64_t(threadIdx.x)*STRUCTSIZE) STRUCT; - auto [i, j, k] = indexer(icell); - f(*ga, i, j, k); - } - __syncthreads(); - for (std::uint64_t m = threadIdx.x, - mend = amrex::min(blockDim.x, indexer.numPts()-std::uint64_t(blockDim.x)*blockIdx.x) * STRUCTSIZE; - m < mend; m += blockDim.x) { - p[std::uint64_t(blockDim.x)*blockIdx.x*STRUCTSIZE+m] = shared[m]; - } - }); -#endif } else #endif { diff --git a/Src/Base/AMReX_FBI.H b/Src/Base/AMReX_FBI.H index 1b3f8cc13f..ee9584a4c0 100644 --- a/Src/Base/AMReX_FBI.H +++ b/Src/Base/AMReX_FBI.H @@ -295,25 +295,16 @@ void deterministic_fab_to_fab (Vector> const& a_tags, int s auto const* pntags = d_ntags.data(); auto const nblocks = int(h_ntags.size()-1); constexpr auto nthreads = 256; - amrex::launch(nblocks, Gpu::gpuStream(), -#ifdef AMREX_USE_SYCL - [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept - [[sycl::reqd_work_group_size(nthreads)]] -#else - [=] AMREX_GPU_DEVICE () noexcept -#endif - { -#ifdef AMREX_USE_SYCL - Dim1 blockIdx{item.get_group_linear_id()}; - Dim1 threadIdx{item.get_local_linear_id()}; -#endif - for (unsigned int itag = pntags[blockIdx.x]; itag < pntags[blockIdx.x+1]; ++itag) { + amrex::LaunchRaw(amrex::IntVectND<1>{nblocks}, + [=] AMREX_GPU_DEVICE (auto lh) noexcept + { + for (unsigned int itag = pntags[lh.blockIdx1D()]; itag < pntags[lh.blockIdx1D()+1]; ++itag) { auto const tag = ptag[itag]; auto ncells = int(tag.dbox.numPts()); const auto len = amrex::length(tag.dbox); const auto lo = amrex::lbound(tag.dbox); - for (int icell = int(threadIdx.x); icell < ncells; icell += nthreads) { + for (int icell = int(lh.threadIdx1D()); icell < ncells; icell += nthreads) { int k = icell / (len.x*len.y); int j = (icell - k*(len.x*len.y)) / len.x; int i = (icell - k*(len.x*len.y)) - j*len.x; @@ -328,12 +319,8 @@ void deterministic_fab_to_fab (Vector> const& a_tags, int s } } - if (itag+1 < pntags[blockIdx.x+1]) { -#ifdef AMREX_USE_SYCL - sycl::group_barrier(item.get_group()); -#else - __syncthreads(); -#endif + if (itag+1 < pntags[lh.blockIdx1D()+1]) { + lh.syncthreads(); } } }); diff --git a/Src/Base/AMReX_GpuContainers.H b/Src/Base/AMReX_GpuContainers.H index d404e8cfd1..981898f56c 100644 --- a/Src/Base/AMReX_GpuContainers.H +++ b/Src/Base/AMReX_GpuContainers.H @@ -448,51 +448,27 @@ namespace amrex::Gpu { auto pu = reinterpret_cast(p); constexpr int nthreads_per_block = (sizeof(T) <= 64) ? 256 : 128; int nblocks = static_cast((N+nthreads_per_block-1)/nthreads_per_block); - std::size_t shared_mem_bytes = nthreads_per_block * sizeof(T); -#ifdef AMREX_USE_SYCL - amrex::launch(nblocks, shared_mem_bytes, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept + std::size_t shared_mem_Uelem = nthreads_per_block * nU; + amrex::LaunchRaw(amrex::IntVectND<1>{nblocks}, shared_mem_Uelem, + [=] AMREX_GPU_DEVICE (auto lh) noexcept { - Long i = handler.globalIdx(); - Long blockDimx = handler.blockDim(); - Long threadIdxx = handler.threadIdx(); - Long blockIdxx = handler.blockIdx(); - auto const shared_U = (U*)handler.sharedMemory(); - auto const shared_T = (T*)shared_U; - if (i < N) { - auto ga = new(shared_T+threadIdxx) T; - f(*ga, i); - } - handler.sharedBarrier(); - for (Long m = threadIdxx, - mend = nU * amrex::min(blockDimx, N-blockDimx*blockIdxx); - m < mend; m += blockDimx) { - pu[blockDimx*blockIdxx*nU+m] = shared_U[m]; - } - }); -#else - amrex::launch(nblocks, shared_mem_bytes, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept - { - Long blockDimx = blockDim.x; - Long threadIdxx = threadIdx.x; - Long blockIdxx = blockIdx.x; + Long blockDimx = lh.blockDim1D(); + Long threadIdxx = lh.threadIdx1D(); + Long blockIdxx = lh.blockIdx1D(); Long i = blockDimx*blockIdxx + threadIdxx; - Gpu::SharedMemory gsm; - auto const shared_U = gsm.dataPtr(); + auto const shared_U = lh.shared_memory(); auto const shared_T = (T*)shared_U; if (i < N) { auto ga = new(shared_T+threadIdxx) T; f(*ga, i); } - __syncthreads(); + lh.syncthreads(); for (Long m = threadIdxx, mend = nU * amrex::min(blockDimx, N-blockDimx*blockIdxx); m < mend; m += blockDimx) { pu[blockDimx*blockIdxx*nU+m] = shared_U[m]; } }); -#endif } #endif } diff --git a/Src/Base/AMReX_GpuControl.H b/Src/Base/AMReX_GpuControl.H index e03951a724..9310f9ffae 100644 --- a/Src/Base/AMReX_GpuControl.H +++ b/Src/Base/AMReX_GpuControl.H @@ -3,7 +3,6 @@ #include #include -#include #include @@ -33,12 +32,28 @@ #define AMREX_HIP_OR_CUDA_OR_SYCL(a,b,c) ((void)0); #endif +#if defined(AMREX_USE_HIP) || defined(AMREX_USE_CUDA) +#define AMREX_HIP_CUDA_OR_SYCL_OR_CPU(a,b,c) a +#elif defined(AMREX_USE_SYCL) +#define AMREX_HIP_CUDA_OR_SYCL_OR_CPU(a,b,c) b +#else +#define AMREX_HIP_CUDA_OR_SYCL_OR_CPU(a,b,c) c +#endif + #ifdef AMREX_USE_GPU #define AMREX_GPU_OR_CPU(a,b) a #else #define AMREX_GPU_OR_CPU(a,b) b #endif +#ifdef AMREX_USE_SYCL +#define AMREX_IF_SYCL(...) __VA_ARGS__ +#define AMREX_IF_NOT_SYCL(...) +#else +#define AMREX_IF_SYCL(...) +#define AMREX_IF_NOT_SYCL(...) __VA_ARGS__ +#endif + #ifdef AMREX_USE_SYCL #define AMREX_SYCL_ONLY(a) a #else @@ -75,15 +90,7 @@ namespace amrex { #define AMREX_DEFAULT_RUNON =amrex::RunOn::Host // by default run on Host when compiling for Cpu #endif -namespace amrex { // NOLINT(modernize-concat-nested-namespaces) - -#ifdef AMREX_USE_HIP -using gpuStream_t = hipStream_t; -#elif defined(AMREX_USE_CUDA) -using gpuStream_t = cudaStream_t; -#endif - -namespace Gpu { +namespace amrex::Gpu { #if defined(AMREX_USE_GPU) @@ -225,7 +232,6 @@ namespace Gpu { #endif -} } #endif diff --git a/Src/Base/AMReX_GpuLaunchFunctsC.H b/Src/Base/AMReX_GpuLaunchFunctsC.H index 1ffa48dbbb..a5a4ec4cdb 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsC.H +++ b/Src/Base/AMReX_GpuLaunchFunctsC.H @@ -2,6 +2,8 @@ #define AMREX_GPU_LAUNCH_FUNCTS_C_H_ #include +#include + namespace amrex { /// \cond DOXYGEN_IGNORE @@ -131,6 +133,65 @@ void launch (T const& n, L&& f) noexcept std::forward(f)(n); } +template +void LaunchRaw (IntVectND nblocks, L const& f) +{ + static_assert(MT == 1, "LaunchRaw with CPU backend only works with one thread per block! " + "Otherwise the syncthreads function would not work"); + if constexpr(dim == 1) { + for (int bx=0; bx < nblocks[0]; ++bx) { + f(Gpu::LaunchHandler{ + IntVectND<1>{bx}, nblocks, nullptr}); + } + } else if constexpr(dim == 2) { + for (int by=0; by < nblocks[1]; ++by) { + for (int bx=0; bx < nblocks[0]; ++bx) { + f(Gpu::LaunchHandler{ + IntVectND<2>{bx, by}, nblocks, nullptr}); + } + } + } else { + for (int bz=0; bz < nblocks[2]; ++bz) { + for (int by=0; by < nblocks[1]; ++by) { + for (int bx=0; bx < nblocks[0]; ++bx) { + f(Gpu::LaunchHandler{ + IntVectND<3>{bx, by, bz}, nblocks, nullptr}); + } + } + } + } +} + +template +void LaunchRaw (IntVectND nblocks, std::size_t shared_mem_elements, L const& f) +{ + static_assert(MT == 1, "LaunchRaw with CPU backend only works with one thread per block! " + "Otherwise the syncthreads function would not work"); + std::vector smem(shared_mem_elements); + if constexpr(dim == 1) { + for (int bx=0; bx < nblocks[0]; ++bx) { + f(Gpu::LaunchHandler{ + IntVectND<1>{bx}, nblocks, smem.data()}); + } + } else if constexpr(dim == 2) { + for (int by=0; by < nblocks[1]; ++by) { + for (int bx=0; bx < nblocks[0]; ++bx) { + f(Gpu::LaunchHandler{ + IntVectND<2>{bx, by}, nblocks, smem.data()}); + } + } + } else { + for (int bz=0; bz < nblocks[2]; ++bz) { + for (int by=0; by < nblocks[1]; ++by) { + for (int bx=0; bx < nblocks[0]; ++bx) { + f(Gpu::LaunchHandler{ + IntVectND<3>{bx, by, bz}, nblocks, smem.data()}); + } + } + } + } +} + template > > AMREX_ATTRIBUTE_FLATTEN_FOR void For (T n, L const& f) noexcept diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index e4d377fda8..8140b95015 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -896,6 +896,110 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, } } +template +void LaunchRaw (IntVectND nblocks, L const& f) +{ + detail::SyclKernelDevPtr skdp(f, Gpu::gpuStream()); + L const* pf = skdp.template get<0>(); + amrex::ignore_unused(pf); + + auto& q = Gpu::Device::streamQueue(); + + sycl::range<1> threads_per_block{MT}; + sycl::range<1> threads_total{MT}; + + for (int i=0; i(nblocks[i]); + } + + // With SYCL it is difficult to combine a 1D blockDim with an ND gridDim, + // so we use a 1D sycl range and split the blockID with BoxIndexerND. Note that + // BoxIndexerND is a bit inefficient since it is adding the smallEnd which is always zero here. + BoxIndexerND bxi(BoxND{IntVectND(0), nblocks - 1}); + + try { + q.submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(threads_total, threads_per_block), + [=] (sycl::nd_item<1> item) + [[sycl::reqd_work_group_size(MT)]] + [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] + { + if constexpr (detail::is_big_kernel()) { + (*pf)(Gpu::LaunchHandler{ + &item, + bxi.intVect(item.get_group(0)), + nullptr + }); + } else { + f(Gpu::LaunchHandler{ + &item, + bxi.intVect(item.get_group(0)), + nullptr + }); + } + }); + }); + } catch (sycl::exception const& ex) { + amrex::Abort(std::string("LaunchRaw: ")+ex.what()+"!!!!!"); + } +} + +template +void LaunchRaw (IntVectND nblocks, std::size_t shared_mem_elements, L const& f) +{ + detail::SyclKernelDevPtr skdp(f, Gpu::gpuStream()); + L const* pf = skdp.template get<0>(); + amrex::ignore_unused(pf); + + auto& q = Gpu::Device::streamQueue(); + // use double4 for shared memory as it has the largest alignment of types that might be used + using ST = sycl::double4; + static_assert(alignof(ST) >= alignof(T)); + const std::size_t shared_mem_num = (shared_mem_elements*sizeof(T)+sizeof(ST)-1) / sizeof(ST); + + sycl::range<1> threads_per_block{MT}; + sycl::range<1> threads_total{MT}; + + for (int i=0; i(nblocks[i]); + } + + // With SYCL it is difficult to combine a 1D blockDim with an ND gridDim, + // so we use a 1D sycl range and split the blockID with BoxIndexerND. Note that + // BoxIndexerND is a bit inefficient since it is adding the smallEnd which is always zero here. + BoxIndexerND bxi(BoxND{IntVectND(0), nblocks - 1}); + + try { + q.submit([&] (sycl::handler& h) { + sycl::local_accessor shared_data(sycl::range<1>(shared_mem_num), h); + h.parallel_for(sycl::nd_range<1>(threads_total, threads_per_block), + [=] (sycl::nd_item<1> item) + [[sycl::reqd_work_group_size(MT)]] + [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] + { + T* shared_mem = reinterpret_cast( + shared_data.template get_multi_ptr().get()); + if constexpr (detail::is_big_kernel()) { + (*pf)(Gpu::LaunchHandler{ + &item, + bxi.intVect(item.get_group(0)), + shared_mem + }); + } else { + f(Gpu::LaunchHandler{ + &item, + bxi.intVect(item.get_group(0)), + shared_mem + }); + } + }); + }); + } catch (sycl::exception const& ex) { + amrex::Abort(std::string("LaunchRaw: ")+ex.what()+"!!!!!"); + } +} + + #else // CUDA or HIP @@ -1284,6 +1388,87 @@ ParallelFor (Gpu::KernelInfo const&, AMREX_GPU_ERROR_CHECK(); } +/** + * \ingroup amrex_execution + * \brief Performance-portable kernel launch function + * that provides low-level access to GPU thread blocks through \ref amrex::Gpu::LaunchHandler. + * + * The number of threads per block is a compile-time-known one-dimensional value that usually + * should be one of 128, 256, 512 or 1024. + * + * The number of total blocks can be a 1D, 2D or 3D IntVectND. Internally this uses the native + * way to split the index for CUDA and HIP, using blockIdx.x, blockIdx.y and blockIdx.z. + * Note that this uses types int and unsigned int which might overflow if many blocks or + * total threads are needed. In case this is an issue, it is necessary to explicitly add + * a 64-bit grid-strided loop or to call LaunchRaw multiple times with fewer blocks. + * + * \tparam MT number of threads per GPU block. + * \param nblocks number of GPU blocks to launch. + * \param f a callable object that takes amrex::Gpu::LaunchHandler as input. + */ +template +void LaunchRaw (IntVectND nblocks, L const& f) +{ + dim3 num_blocks; + num_blocks.x = nblocks[0]; + if constexpr (dim >= 2) { + num_blocks.y = nblocks[1]; + } + if constexpr (dim == 3) { + num_blocks.z = nblocks[2]; + } + + AMREX_LAUNCH_KERNEL(MT, num_blocks, MT, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + f(Gpu::LaunchHandler{}); + }); + AMREX_GPU_ERROR_CHECK(); +} + +/** + * \ingroup amrex_execution + * \brief Performance-portable kernel launch function + * that provides low-level access to GPU thread blocks and shared memory + * through \ref amrex::Gpu::LaunchHandler. + * + * The number of threads per block is a compile-time-known one-dimensional value that usually + * should be one of 128, 256, 512 or 1024. + * + * The number of total blocks can be a 1D, 2D or 3D IntVectND. Internally this uses the native + * way to split the index for CUDA and HIP, using blockIdx.x, blockIdx.y and blockIdx.z. + * Note that this uses types int and unsigned int which might overflow if many blocks or + * total threads are needed. In case this is an issue, it is necessary to explicitly add + * a 64-bit grid-strided loop or to call LaunchRaw multiple times with fewer blocks. + * + * This version of LaunchRaw supports the use of dynamic shared memory inside the thread block. + * Shared memory is a fast cache local to thread blocks. It has different names in + * different GPU backends (CUDA: shared memory, HIP: local data share, SYCL: local memory). + * + * \tparam MT number of threads per GPU block. + * \tparam T data type of shared memory elements. + * \param nblocks number of GPU blocks to launch. + * \param shared_mem_elements number of shared memory elements per block to allocate. + * \param f a callable object that takes amrex::Gpu::LaunchHandler as input. + */ +template +void LaunchRaw (IntVectND nblocks, std::size_t shared_mem_elements, L const& f) +{ + dim3 num_blocks; + num_blocks.x = nblocks[0]; + if constexpr (dim >= 2) { + num_blocks.y = nblocks[1]; + } + if constexpr (dim == 3) { + num_blocks.z = nblocks[2]; + } + + AMREX_LAUNCH_KERNEL(MT, num_blocks, MT, shared_mem_elements * sizeof(T), Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + f(Gpu::LaunchHandler{}); + }); + AMREX_GPU_ERROR_CHECK(); +} + #endif template diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H index d4ed18c48e..3c96b04685 100644 --- a/Src/Base/AMReX_GpuTypes.H +++ b/Src/Base/AMReX_GpuTypes.H @@ -3,6 +3,10 @@ #include #include +#include +#include +#include +#include #ifdef AMREX_USE_GPU @@ -32,6 +36,14 @@ struct gpuStream_t { bool operator!= (gpuStream_t const& rhs) const noexcept { return queue != rhs.queue; } }; +#elif defined(AMREX_USE_HIP) + +using gpuStream_t = hipStream_t; + +#elif defined(AMREX_USE_CUDA) + +using gpuStream_t = cudaStream_t; + #endif } @@ -102,6 +114,223 @@ struct Handler {}; #endif +template +struct LaunchHandler +{ + static_assert(dim >= 1 && dim <= 3); + +#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) + LaunchHandler() = default; +#elif defined(AMREX_USE_SYCL) + LaunchHandler(sycl::nd_item<1> const* a_item, IntVectND a_blockid, T * a_shared_mem) + : m_item{a_item}, m_blockid{a_blockid}, m_shared_mem{a_shared_mem} {} +#else + LaunchHandler(IntVectND a_blockid, IntVectND a_griddim, T * a_shared_mem) + : m_blockid{a_blockid}, m_griddim{a_griddim}, m_shared_mem{a_shared_mem} {} +#endif + + /** + * \brief Returns the thread ID in the local block. + */ + [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE + unsigned int threadIdx1D () const { + return static_cast(AMREX_HIP_CUDA_OR_SYCL_OR_CPU( + threadIdx.x, + m_item->get_local_linear_id(), + 0 + )); + } + + /** + * \brief Splits the local thread ID into N dimensions. + * The sizes of the dimensions are supplied as template arguments. + */ + template + [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE + IntVectND threadIdxND () const { + static_assert(sizeof...(nd_block_size) <= 3 && + (1 * ... * nd_block_size) == threads_per_block); + constexpr IntVectND iv_block_size = + blockDimND(); + IntVectND ret(0); + unsigned int idx = threadIdx1D(); + if constexpr (sizeof...(nd_block_size) == 3) { + ret[2] = idx / (iv_block_size[0] * iv_block_size[1]); + idx = idx - ret[2] * (iv_block_size[0] * iv_block_size[1]); + } + if constexpr (sizeof...(nd_block_size) >= 2) { + ret[1] = idx / iv_block_size[0]; + idx = idx - ret[1] * iv_block_size[0]; + } + ret[0] = idx; + return ret; + } + + /** + * \brief Returns the block ID flattened to 1D. + */ + [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE + unsigned int blockIdx1D () const { + if constexpr (dim == 1) { + return static_cast(AMREX_HIP_CUDA_OR_SYCL_OR_CPU( + blockIdx.x, + m_item->get_group(0), + m_blockid[0] + )); + } else if constexpr (dim == 2) { + return static_cast(AMREX_HIP_CUDA_OR_SYCL_OR_CPU( + blockIdx.x + gridDim.x * blockIdx.y, + m_item->get_group(0), + m_blockid[0] + m_griddim[0] * m_blockid[1] + )); + } else { + return static_cast(AMREX_HIP_CUDA_OR_SYCL_OR_CPU( + blockIdx.x + gridDim.x * blockIdx.y + gridDim.x * gridDim.y * blockIdx.z, + m_item->get_group(0), + m_blockid[0] + m_griddim[0] * m_blockid[1] + + m_griddim[0] * m_griddim[1] * m_blockid[2] + )); + } + } + + /** + * \brief Returns the 1D/2D/3D block ID. + */ + [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE + IntVectND blockIdxND () const { + if constexpr (dim == 1) { + return AMREX_HIP_CUDA_OR_SYCL_OR_CPU( + IntVectND(static_cast(blockIdx.x)), + m_blockid, + m_blockid + ); + } else if constexpr (dim == 2) { + return AMREX_HIP_CUDA_OR_SYCL_OR_CPU( + IntVectND(static_cast(blockIdx.x), + static_cast(blockIdx.y)), + m_blockid, + m_blockid + ); + } else { + return AMREX_HIP_CUDA_OR_SYCL_OR_CPU( + IntVectND(static_cast(blockIdx.x), + static_cast(blockIdx.y), + static_cast(blockIdx.z)), + m_blockid, + m_blockid + ); + } + } + + /** + * \brief Returns the number of threads inside a block. + */ + [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE + static constexpr unsigned int blockDim1D () { + return threads_per_block; + } + + /** + * \brief Analogous to threadIdxND but returns the ND block size. + * This effectively just returns the template arguments as an IntVectND. + */ + template + [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE + static constexpr IntVectND blockDimND () { + static_assert((1 * ... * nd_block_size) == threads_per_block); + return IntVectND{nd_block_size...}; + } + + /** + * \brief Returns the global thread index flattened to 1D. + */ + [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE + unsigned int globalIdx1D () const { + return blockIdx1D() * threads_per_block + threadIdx1D(); + } + + /** + * \brief Returns the global 1D/2D/3D thread index. + */ + template + [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE + IntVectND globalIdxND () const { + static_assert(sizeof...(nd_block_size) == dim && + (1 * ... * nd_block_size) == threads_per_block); + return blockIdxND() * blockDimND() + threadIdxND(); + } + + /** + * \brief Synchronizes all threads within a block. This is needed before + * accessing (shared) memory that was previously written by another thread in the block. + */ + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + void syncthreads () const { + AMREX_HIP_CUDA_OR_SYCL_OR_CPU( + __syncthreads(), + m_item->barrier(sycl::access::fence_space::global_and_local), + (void)0 + ); + } + + /** + * \brief Returns a pointer to block-local shared memory. If multiple shared memory + * allocations are needed in a block, then the allocation must be manually split by adding + * offsets to it. The memory is aligned to 32 bytes. + */ + [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE + T* shared_memory () const { +#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) + // 32 bytes is sufficient for double4_32a + static_assert(32 >= alignof(T)); + alignas(32) extern __shared__ char smem[]; + return reinterpret_cast(smem); +#else + return m_shared_mem; +#endif + } + + /** + * \brief Returns an amrex::Gpu::Handler object, which is sometimes needed for reductions. + */ + [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE + Gpu::Handler handler () const { +#if defined(AMREX_USE_SYCL) + return Gpu::Handler(m_item, reinterpret_cast(m_shared_mem), threads_per_block); +#elif defined(AMREX_USE_GPU) + return Gpu::Handler(threads_per_block); +#else + return Gpu::Handler{}; +#endif + } + + /** + * \brief Returns the internal sycl::nd_item<1>. + */ + [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE +#if defined(AMREX_USE_SYCL) + const sycl::nd_item<1>* sycl_item () const { + return m_item; + } +#else + int sycl_item () const { + return 0; + } +#endif + +private: + +#if defined(AMREX_USE_SYCL) + sycl::nd_item<1> const* m_item; + IntVectND m_blockid; + T * m_shared_mem; +#elif !defined(AMREX_USE_GPU) + IntVectND m_blockid; + IntVectND m_griddim; + T * m_shared_mem; +#endif +}; + } #endif diff --git a/Src/Base/AMReX_MultiFabUtil.H b/Src/Base/AMReX_MultiFabUtil.H index 4f6cdf68c9..16240fadc6 100644 --- a/Src/Base/AMReX_MultiFabUtil.H +++ b/Src/Base/AMReX_MultiFabUtil.H @@ -1257,20 +1257,14 @@ void reduce_to_plane (Array4 const& ar, int direction, Box const& bx, int box const auto len = amrex::length(bx); constexpr int nthreads = 128; auto nblocks = static_cast(b2d.numPts()); -#ifdef AMREX_USE_SYCL - constexpr std::size_t shared_mem_bytes = sizeof(T)*Gpu::Device::warp_size; - amrex::launch(nblocks, shared_mem_bytes, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE (Gpu::Handler const& h) - { - int bid = h.blockIdx(); - int tid = h.threadIdx(); -#else - amrex::launch(nblocks, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () + + amrex::LaunchRaw(amrex::IntVectND<1>{nblocks}, + AMREX_IF_SYCL(Gpu::Device::warp_size) AMREX_IF_NOT_SYCL(0), + [=] AMREX_GPU_DEVICE (auto lh) { - int bid = blockIdx.x; - int tid = threadIdx.x; -#endif + int bid = lh.blockIdx1D(); + int tid = lh.threadIdx1D(); + T tmp; Op().init(tmp); T* p; @@ -1303,7 +1297,7 @@ void reduce_to_plane (Array4 const& ar, int direction, Box const& bx, int box p = ar.ptr(i,j,0); } #ifdef AMREX_USE_SYCL - Op().template parallel_update(*p, tmp, h); + Op().template parallel_update(*p, tmp, lh.handler()); #else Op().template parallel_update(*p, tmp); #endif diff --git a/Src/Base/AMReX_MultiFabUtil.cpp b/Src/Base/AMReX_MultiFabUtil.cpp index 2f99faade7..3870394bd8 100644 --- a/Src/Base/AMReX_MultiFabUtil.cpp +++ b/Src/Base/AMReX_MultiFabUtil.cpp @@ -870,22 +870,14 @@ namespace amrex } int n2dblocks = (n2d+AMREX_GPU_MAX_THREADS-1)/AMREX_GPU_MAX_THREADS; int nblocks = n2dblocks * b.length(direction); -#ifdef AMREX_USE_SYCL - std::size_t shared_mem_byte = sizeof(Real)*Gpu::Device::warp_size; - amrex::launch(nblocks, shared_mem_byte, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept -#else - amrex::launch(nblocks, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept -#endif + + amrex::LaunchRaw(amrex::IntVectND<1>{nblocks}, + AMREX_IF_SYCL(Gpu::Device::warp_size) AMREX_IF_NOT_SYCL(0), + [=] AMREX_GPU_DEVICE (auto lh) noexcept { -#ifdef AMREX_USE_SYCL - int i1d = h.blockIdx() / n2dblocks; - int i2d = h.threadIdx() + AMREX_GPU_MAX_THREADS*(h.blockIdx()-i1d*n2dblocks); -#else - int i1d = blockIdx.x / n2dblocks; - int i2d = threadIdx.x + AMREX_GPU_MAX_THREADS*(blockIdx.x-i1d*n2dblocks); -#endif + int i1d = lh.blockIdx1D() / n2dblocks; + int i2d = lh.threadIdx1D() + + AMREX_GPU_MAX_THREADS*(lh.blockIdx1D()-i1d*n2dblocks); int i2dy = i2d / n2dx; int i2dx = i2d - i2dy*n2dx; int i, j, k, idir; @@ -907,11 +899,8 @@ namespace amrex } for (int n = 0; n < ncomp; ++n) { Real r = (i2d < n2d) ? fab(i,j,k,n+icomp) : Real(0.0); -#ifdef AMREX_USE_SYCL - Gpu::deviceReduceSum_full(p+n+ncomp*idir, r, h); -#else - Gpu::deviceReduceSum_full(p+n+ncomp*idir, r); -#endif + Gpu::deviceReduceSum_full(p+n+ncomp*idir, r + AMREX_IF_SYCL(, lh.handler())); } }); } diff --git a/Src/Base/AMReX_Reduce.H b/Src/Base/AMReX_Reduce.H index 2883e05674..a58263e2a2 100644 --- a/Src/Base/AMReX_Reduce.H +++ b/Src/Base/AMReX_Reduce.H @@ -634,29 +634,22 @@ public: reduce_data.nBlocks(stream) = nblocks_ec; reduce_data.updateMaxStreamIndex(stream); -#ifdef AMREX_USE_SYCL // device reduce needs local(i.e., shared) memory - constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size; - amrex::launch(nblocks_ec, shared_mem_bytes, stream, - [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept - { - Dim1 blockIdx {gh.blockIdx()}; - Dim1 threadIdx{gh.threadIdx()}; -#else - amrex::launch_global - <<>> - ([=] AMREX_GPU_DEVICE () noexcept + amrex::LaunchRaw( + amrex::IntVectND<1>{nblocks_ec}, + AMREX_IF_SYCL(Gpu::Device::warp_size) AMREX_IF_NOT_SYCL(0), + [=] AMREX_GPU_DEVICE (auto lh) noexcept { -#endif ReduceTuple r; Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r); - ReduceTuple& dst = pdst[blockIdx.x]; - if (threadIdx.x == 0) { + ReduceTuple& dst = pdst[lh.blockIdx1D()]; + if (lh.threadIdx1D() == 0) { dst = r; } - for (int iblock = blockIdx.x; iblock < nblocks; iblock += nblocks_ec) { + for (int iblock = lh.blockIdx1D(); iblock < nblocks; iblock += nblocks_ec) { int ibox = iblock / nblocks_per_box; - auto icell = std::uint64_t(iblock-ibox*nblocks_per_box)*AMREX_GPU_MAX_THREADS + threadIdx.x; + auto icell = std::uint64_t(iblock-ibox*nblocks_per_box)*AMREX_GPU_MAX_THREADS + + lh.threadIdx1D(); BoxIndexer const& indexer = dp_boxes[ibox]; if (icell < indexer.numPts()) { @@ -665,11 +658,8 @@ public: (f, ibox, i, j, k, ncomp, r); } } -#ifdef AMREX_USE_SYCL - Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh); -#else - Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r); -#endif + Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r + AMREX_IF_SYCL(, lh.handler())); }); } } @@ -689,48 +679,42 @@ public: / (nitems_per_thread*AMREX_GPU_MAX_THREADS); nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks()); reduce_data.updateMaxStreamIndex(stream); -#ifdef AMREX_USE_SYCL - // device reduce needs local(i.e., shared) memory - constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size; - amrex::launch(nblocks_ec, shared_mem_bytes, stream, - [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept + amrex::LaunchRaw( + amrex::IntVectND<1>{static_cast(nblocks_ec)}, + AMREX_IF_SYCL(Gpu::Device::warp_size) AMREX_IF_NOT_SYCL(0), + [=] AMREX_GPU_DEVICE ( + Gpu::LaunchHandler lh + ) noexcept { - Dim1 blockIdx {gh.blockIdx()}; - Dim1 threadIdx{gh.threadIdx()}; - Dim1 gridDim {gh.gridDim()}; -#else - amrex::launch(nblocks_ec, 0, stream, - [=] AMREX_GPU_DEVICE () noexcept - { -#endif ReduceTuple r; Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r); - ReduceTuple& dst = *(dp+blockIdx.x); - if (threadIdx.x == 0 && static_cast(blockIdx.x) >= nblocks) { + ReduceTuple& dst = *(dp+lh.blockIdx1D()); + if (lh.threadIdx1D() == 0 && static_cast(lh.blockIdx1D()) >= nblocks) { dst = r; } - for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x, - stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x; + for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*lh.blockIdx1D() + + lh.threadIdx1D(), + stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*nblocks_ec; icell < indexer.numPts(); icell += stride) { auto iv = indexer.intVect(icell); - amrex::ignore_unused(f,ncomp,ixtype); // work around first-capture + // work around first-capture + auto f2 = f; + auto ncomp2 = ncomp; + auto ixtype2 = ixtype; if constexpr (std::is_same_v) { - auto pr = Reduce::detail::call_f_intvect_box(f, iv, ixtype); + auto pr = Reduce::detail::call_f_intvect_box(f2, iv, ixtype2); Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr); } else { - for (int n = 0; n < ncomp; ++n) { - auto pr = Reduce::detail::call_f_intvect_n(f, iv, n); + for (int n = 0; n < ncomp2; ++n) { + auto pr = Reduce::detail::call_f_intvect_n(f2, iv, n); Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr); } } } -#ifdef AMREX_USE_SYCL - Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh); -#else - Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r); -#endif + Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r + AMREX_IF_SYCL(, lh.handler())); }); nblocks = std::max(nblocks, static_cast(nblocks_ec)); } @@ -822,39 +806,28 @@ public: / (nitems_per_thread*AMREX_GPU_MAX_THREADS); nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks()); reduce_data.updateMaxStreamIndex(stream); -#ifdef AMREX_USE_SYCL // device reduce needs local(i.e., shared) memory - constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size; - amrex::launch(nblocks_ec, shared_mem_bytes, stream, - [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept - { - Dim1 blockIdx {gh.blockIdx()}; - Dim1 threadIdx{gh.threadIdx()}; - Dim1 gridDim {gh.gridDim()}; -#else - amrex::launch(nblocks_ec, 0, stream, - [=] AMREX_GPU_DEVICE () noexcept + amrex::LaunchRaw( + amrex::IntVectND<1>{nblocks_ec}, + AMREX_IF_SYCL(Gpu::Device::warp_size) AMREX_IF_NOT_SYCL(0), + [=] AMREX_GPU_DEVICE (auto lh) noexcept { -#endif ReduceTuple r; Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r); - ReduceTuple& dst = *(dp+blockIdx.x); - if (threadIdx.x == 0 && static_cast(blockIdx.x) >= nblocks) { + ReduceTuple& dst = *(dp+lh.blockIdx1D()); + if (lh.threadIdx1D() == 0 && static_cast(lh.blockIdx1D()) >= nblocks) { dst = r; } - for (N i = N(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x, - stride = N(AMREX_GPU_MAX_THREADS)*gridDim.x; + for (N i = N(AMREX_GPU_MAX_THREADS)*lh.blockIdx1D()+lh.threadIdx1D(), + stride = N(AMREX_GPU_MAX_THREADS)*nblocks_ec; i < n; i += stride) { auto pr = f(i); Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r,pr); } -#ifdef AMREX_USE_SYCL - Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh); -#else - Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r); -#endif + Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r + AMREX_IF_SYCL(, lh.handler())); }); nblocks = amrex::max(nblocks, nblocks_ec); } @@ -870,7 +843,6 @@ public: } using ReduceTuple = typename D::Type; - auto const& stream = Gpu::gpuStream(); auto dp = reduce_data.devicePtr(); auto const& nblocks = reduce_data.nBlocks(); #if defined(AMREX_USE_SYCL) @@ -891,52 +863,34 @@ public: #endif { int maxblocks = reduce_data.maxBlocks(); -#ifdef AMREX_USE_SYCL - // device reduce needs local(i.e., shared) memory - constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size; -#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND +#if !defined(AMREX_NO_SYCL_REDUCE_WORKAROUND) && defined(AMREX_USE_SYCL) // xxxxx SYCL todo: reduce bug workaround Gpu::DeviceVector dtmp(1); auto presult = dtmp.data(); #else auto presult = hp; #endif - amrex::launch(1, shared_mem_bytes, stream, - [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept + amrex::LaunchRaw( + amrex::IntVectND<1>{1}, + AMREX_IF_SYCL(Gpu::Device::warp_size) AMREX_IF_NOT_SYCL(0), + [=] AMREX_GPU_DEVICE (auto lh) noexcept { ReduceTuple r; Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r); ReduceTuple dst = r; for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) { auto dp_stream = dp+istream*maxblocks; - for (int i = gh.item->get_global_id(0), stride = gh.item->get_global_range(0); + for (int i = lh.globalIdx1D(), stride = AMREX_GPU_MAX_THREADS; i < nblocks[istream]; i += stride) { Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]); } } - Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh); - if (gh.threadIdx() == 0) { *presult = dst; } + Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r + AMREX_IF_SYCL(, lh.handler())); + if (lh.threadIdx1D() == 0) { *presult = dst; } }); -#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND +#if !defined(AMREX_NO_SYCL_REDUCE_WORKAROUND) && defined(AMREX_USE_SYCL) Gpu::dtoh_memcpy_async(hp, dtmp.data(), sizeof(ReduceTuple)); -#endif -#else - amrex::launch(1, 0, stream, - [=] AMREX_GPU_DEVICE () noexcept - { - ReduceTuple r; - Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r); - ReduceTuple dst = r; - for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) { - auto dp_stream = dp+istream*maxblocks; - for (int i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x; - i < nblocks[istream]; i += stride) { - Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]); - } - } - Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r); - if (threadIdx.x == 0) { *hp = dst; } - }); #endif Gpu::streamSynchronize(); } @@ -1068,51 +1022,34 @@ bool AnyOf (N n, T const* v, P const& pred) int* dp = ds.dataPtr(); auto ec = Gpu::ExecutionConfig(n); ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()); - -#ifdef AMREX_USE_SYCL - const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1; - const std::size_t shared_mem_bytes = num_ints*sizeof(int); - amrex::launch(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept { - int* has_any = &(static_cast(gh.sharedMemory())[num_ints-1]); - if (gh.threadIdx() == 0) { *has_any = *dp; } - gh.sharedBarrier(); - - if (!(*has_any)) - { - int r = false; - for (N i = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim(); - i < n && !r; i += stride) - { - r = pred(v[i]) ? 1 : 0; - } - - r = Gpu::blockReduce - (r, Gpu::warpReduce >(), 0, gh); - if (gh.threadIdx() == 0 && r) { *dp = 1; } - } - }); -#else - amrex::launch(ec.numBlocks.x, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - __shared__ int has_any; - if (threadIdx.x == 0) { has_any = *dp; } - __syncthreads(); + const int nblocks_ec = ec.numBlocks.x; + [[maybe_unused]] const int num_ints = + std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size); + + amrex::LaunchRaw( + amrex::IntVectND<1>{nblocks_ec}, + AMREX_IF_SYCL(num_ints + 1) AMREX_IF_NOT_SYCL(1), + [=] AMREX_GPU_DEVICE (auto lh) noexcept + { + int& has_any = *(lh.shared_memory() AMREX_IF_SYCL(+ num_ints)); + if (lh.threadIdx1D() == 0) { has_any = *dp; } + lh.syncthreads(); if (!has_any) { int r = false; - for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x; - i < n && !r; i += stride) + for (N i = AMREX_GPU_MAX_THREADS*lh.blockIdx1D()+lh.threadIdx1D(), + stride = AMREX_GPU_MAX_THREADS*nblocks_ec; + i < n && !r; i += stride) { r = pred(v[i]) ? 1 : 0; } r = Gpu::blockReduce - (r, Gpu::warpReduce >(), 0); - if (threadIdx.x == 0 && r) *dp = 1; + (r, Gpu::warpReduce >(), 0 + AMREX_IF_SYCL(, lh.handler())); + if (lh.threadIdx1D() == 0 && r) *dp = 1; } }); -#endif return ds.dataValue(); } @@ -1125,45 +1062,25 @@ bool AnyOf (BoxND const& box, P const& pred) const BoxIndexerND indexer(box); auto ec = Gpu::ExecutionConfig(box.numPts()); ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()); - -#ifdef AMREX_USE_SYCL - const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1; - const std::size_t shared_mem_bytes = num_ints*sizeof(int); - amrex::launch(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept { - int* has_any = &(static_cast(gh.sharedMemory())[num_ints-1]); - if (gh.threadIdx() == 0) { *has_any = *dp; } - gh.sharedBarrier(); - - if (!(*has_any)) - { - int r = false; - for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*gh.blockIdx()+gh.threadIdx(), - stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gh.gridDim(); - icell < indexer.numPts() && !r; - icell += stride) - { - auto iv = indexer.intVect(icell); - r = amrex::detail::call_f_intvect(pred, iv) ? 1 : 0; - } - r = Gpu::blockReduce - (r, Gpu::warpReduce >(), 0, gh); - if (gh.threadIdx() == 0 && r) { *dp = 1; } - } - }); -#else - AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, ec.numBlocks, ec.numThreads, 0, - Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - __shared__ int has_any; - if (threadIdx.x == 0) { has_any = *dp; } - __syncthreads(); + const int nblocks_ec = ec.numBlocks.x; + [[maybe_unused]] const int num_ints = + std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size); + + amrex::LaunchRaw( + amrex::IntVectND<1>{nblocks_ec}, + AMREX_IF_SYCL(num_ints + 1) AMREX_IF_NOT_SYCL(1), + [=] AMREX_GPU_DEVICE (auto lh) noexcept + { + int& has_any = *(lh.shared_memory() AMREX_IF_SYCL(+ num_ints)); + if (lh.threadIdx1D() == 0) { has_any = *dp; } + lh.syncthreads(); if (!has_any) { int r = false; - for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x, - stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x; + for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)* + lh.blockIdx1D()+lh.threadIdx1D(), + stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*nblocks_ec; icell < indexer.numPts() && !r; icell += stride) { @@ -1171,11 +1088,11 @@ bool AnyOf (BoxND const& box, P const& pred) r = amrex::detail::call_f_intvect(pred, iv) ? 1 : 0; } r = Gpu::blockReduce - (r, Gpu::warpReduce >(), 0); - if (threadIdx.x == 0 && r) *dp = 1; + (r, Gpu::warpReduce >(), 0 + AMREX_IF_SYCL(, lh.handler())); + if (lh.threadIdx1D() == 0 && r) *dp = 1; } }); -#endif return ds.dataValue(); } diff --git a/Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp_bc.cpp b/Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp_bc.cpp index 5e1fb62fc2..d9a29036d7 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp_bc.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp_bc.cpp @@ -85,20 +85,12 @@ MLEBTensorOp::applyBCTensor (int amrlev, int mglev, MultiFab& vel, #ifdef AMREX_USE_GPU if (Gpu::inLaunchRegion()) { - amrex::launch<64>(12, Gpu::gpuStream(), -#ifdef AMREX_USE_SYCL - [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) + amrex::LaunchRaw<64>(amrex::IntVectND<1>{12}, + [=] AMREX_GPU_DEVICE (auto lh) { - int bid = item.get_group_linear_id(); - int tid = item.get_local_linear_id(); - int bdim = item.get_local_range(0); -#else - [=] AMREX_GPU_DEVICE () - { - int bid = blockIdx.x; - int tid = threadIdx.x; - int bdim = blockDim.x; -#endif + int bid = lh.blockIdx1D(); + int tid = lh.threadIdx1D(); + int bdim = lh.blockDim1D(); mltensor_fill_edges(bid, tid, bdim, vbx, velfab, mxlo, mylo, mzlo, mxhi, myhi, mzhi, bvxlo, bvylo, bvzlo, bvxhi, bvyhi, bvzhi, diff --git a/Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp b/Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp index 8d72107d54..372fd1ae09 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp @@ -408,20 +408,12 @@ MLTensorOp::applyBCTensor (int amrlev, int mglev, MultiFab& vel, // NOLINT(reada // only edge vals used in 3D stencil #ifdef AMREX_USE_GPU if (Gpu::inLaunchRegion()) { - amrex::launch<64>(12, Gpu::gpuStream(), -#ifdef AMREX_USE_SYCL - [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) + amrex::LaunchRaw<64>(amrex::IntVectND<1>{12}, + [=] AMREX_GPU_DEVICE (auto lh) { - int bid = item.get_group_linear_id(); - int tid = item.get_local_linear_id(); - int bdim = item.get_local_range(0); -#else - [=] AMREX_GPU_DEVICE () - { - int bid = blockIdx.x; - int tid = threadIdx.x; - int bdim = blockDim.x; -#endif + int bid = lh.blockIdx1D(); + int tid = lh.threadIdx1D(); + int bdim = lh.blockDim1D(); mltensor_fill_edges(bid, tid, bdim, vbx, velfab, mxlo, mylo, mzlo, mxhi, myhi, mzhi, bvxlo, bvylo, bvzlo, bvxhi, bvyhi, bvzhi, diff --git a/Src/Particle/AMReX_ParticleUtil.H b/Src/Particle/AMReX_ParticleUtil.H index 9b8e055d03..c6e7cabd2a 100644 --- a/Src/Particle/AMReX_ParticleUtil.H +++ b/Src/Particle/AMReX_ParticleUtil.H @@ -795,12 +795,13 @@ void PermutationForDeposition (Gpu::DeviceVector& perm, index_type n pllist_next[i] = Gpu::Atomic::Exch(pllist_start + f(i), i); }); -#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) - amrex::launch(nbins / gpu_block_size, Gpu::gpuStream(), - [pllist_start,pllist_next,pperm,pglobal_idx] AMREX_GPU_DEVICE () { - __shared__ index_type sdata[gpu_block_size]; - __shared__ index_type global_idx_start; - __shared__ index_type idx_start; +#if defined(AMREX_USE_GPU) + amrex::LaunchRaw(amrex::IntVectND<1>{nbins / gpu_block_size}, + gpu_block_size + 2, + [pllist_start,pllist_next,pperm,pglobal_idx] AMREX_GPU_DEVICE (auto lh) { + index_type* sdata = lh.shared_memory(); + index_type& global_idx_start = *(sdata + gpu_block_size); + index_type& idx_start = *(sdata + gpu_block_size + 1); index_type current_idx = 0; @@ -808,7 +809,7 @@ void PermutationForDeposition (Gpu::DeviceVector& perm, index_type n // Compressed layout: subsequent sweeps of up to gpu_block_size contiguous particles // are put right next to each other, while without the compressed layout, // there can be other particle sweeps from different locations between them. - current_idx = pllist_start[threadIdx.x + gpu_block_size * blockIdx.x]; + current_idx = pllist_start[lh.threadIdx1D() + gpu_block_size * lh.blockIdx1D()]; index_type num_particles_thread = 0; while (current_idx != llist_guard) { @@ -816,36 +817,37 @@ void PermutationForDeposition (Gpu::DeviceVector& perm, index_type n current_idx = pllist_next[current_idx]; } - index_type num_particles_block = - Gpu::blockReduceSum(num_particles_thread); + index_type num_particles_block = Gpu::blockReduceSum + AMREX_IF_NOT_SYCL() + (num_particles_thread AMREX_IF_SYCL(, lh.handler())); - if (threadIdx.x == 0) { + if (lh.threadIdx1D() == 0) { global_idx_start = Gpu::Atomic::Add(pglobal_idx, num_particles_block); } } - current_idx = pllist_start[threadIdx.x + gpu_block_size * blockIdx.x]; + current_idx = pllist_start[lh.threadIdx1D() + gpu_block_size * lh.blockIdx1D()]; while (true) { - sdata[threadIdx.x] = index_type(current_idx != llist_guard); + sdata[lh.threadIdx1D()] = index_type(current_idx != llist_guard); index_type x = 0; // simple block wide prefix sum for (index_type i = 1; i= i) { - x = sdata[threadIdx.x - i]; + lh.syncthreads(); + if (lh.threadIdx1D() >= i) { + x = sdata[lh.threadIdx1D() - i]; } - __syncthreads(); - if (threadIdx.x >= i) { - sdata[threadIdx.x] += x; + lh.syncthreads(); + if (lh.threadIdx1D() >= i) { + sdata[lh.threadIdx1D()] += x; } } - __syncthreads(); + lh.syncthreads(); if (sdata[gpu_block_size_m1] == 0) { break; } - if (threadIdx.x == gpu_block_size_m1) { + if (lh.threadIdx1D() == gpu_block_size_m1) { if constexpr (compressed_layout) { idx_start = global_idx_start; global_idx_start += sdata[gpu_block_size_m1]; @@ -853,17 +855,17 @@ void PermutationForDeposition (Gpu::DeviceVector& perm, index_type n idx_start = Gpu::Atomic::Add(pglobal_idx, sdata[gpu_block_size_m1]); } } - __syncthreads(); - sdata[threadIdx.x] += idx_start; + lh.syncthreads(); + sdata[lh.threadIdx1D()] += idx_start; if (current_idx != llist_guard) { - pperm[sdata[threadIdx.x] - 1] = current_idx; + pperm[sdata[lh.threadIdx1D()] - 1] = current_idx; current_idx = pllist_next[current_idx]; } } }); #else amrex::ignore_unused(pperm, pglobal_idx, compressed_layout); - Abort("PermutationForDeposition only implemented for CUDA and HIP"); + Abort("PermutationForDeposition only implemented for GPU"); #endif Gpu::Device::streamSynchronize(); diff --git a/Tests/CMakeLists.txt b/Tests/CMakeLists.txt index c92e115f63..d7e47bc9a0 100644 --- a/Tests/CMakeLists.txt +++ b/Tests/CMakeLists.txt @@ -128,7 +128,7 @@ else() # List of subdirectories to search for CMakeLists. # set( AMREX_TESTS_SUBDIRS Amr ArrayND AsyncOut Base CallNoinline CLZ CommType CTOParFor DeviceGlobal - Enum HeatEquation MultiBlock MultiPeriod ParmParse Parser Parser2 + Enum HeatEquation LaunchRaw MultiBlock MultiPeriod ParmParse Parser Parser2 ParserUserFn Reducer ReduceToPlanePatchy Reinit RoundoffDomain SIMD SmallMatrix SumBoundary TOML) diff --git a/Tests/LaunchRaw/CMakeLists.txt b/Tests/LaunchRaw/CMakeLists.txt new file mode 100644 index 0000000000..224c4563c8 --- /dev/null +++ b/Tests/LaunchRaw/CMakeLists.txt @@ -0,0 +1,9 @@ +foreach(D IN LISTS AMReX_SPACEDIM) + set(_sources main.cpp) + set(_input_files) + + setup_test(${D} _sources _input_files) + + unset(_sources) + unset(_input_files) +endforeach() diff --git a/Tests/LaunchRaw/GNUmakefile b/Tests/LaunchRaw/GNUmakefile new file mode 100644 index 0000000000..05ed32d344 --- /dev/null +++ b/Tests/LaunchRaw/GNUmakefile @@ -0,0 +1,15 @@ +AMREX_HOME = ../../ + +DEBUG = FALSE + +USE_MPI = FALSE +USE_OMP = FALSE + +TINY_PROFILE = FALSE + +include $(AMREX_HOME)/Tools/GNUMake/Make.defs + +include ./Make.package +include $(AMREX_HOME)/Src/Base/Make.package + +include $(AMREX_HOME)/Tools/GNUMake/Make.rules diff --git a/Tests/LaunchRaw/Make.package b/Tests/LaunchRaw/Make.package new file mode 100644 index 0000000000..7f43e5e87c --- /dev/null +++ b/Tests/LaunchRaw/Make.package @@ -0,0 +1,2 @@ +CEXE_sources += main.cpp + diff --git a/Tests/LaunchRaw/inputs b/Tests/LaunchRaw/inputs new file mode 100644 index 0000000000..bba7f4f305 --- /dev/null +++ b/Tests/LaunchRaw/inputs @@ -0,0 +1 @@ +amrex.the_arena_is_managed = 0 diff --git a/Tests/LaunchRaw/main.cpp b/Tests/LaunchRaw/main.cpp new file mode 100644 index 0000000000..10c4c94838 --- /dev/null +++ b/Tests/LaunchRaw/main.cpp @@ -0,0 +1,198 @@ +#include +#include +#include +#include +#include +#include +#include + +using namespace amrex; + +void test1d () { + + const IntVectND<1> num_blocks {31}; +#ifdef AMREX_USE_GPU + static constexpr int blockdim[1] {256}; +#else + static constexpr int blockdim[1] {1}; +#endif + static constexpr int num_threads = blockdim[0]; + + Gpu::DeviceVector vect(static_cast(num_threads) * num_blocks[0], -999); + + auto * data = vect.dataPtr(); + + LaunchRaw(num_blocks, + [=] AMREX_GPU_DEVICE (auto lh) { + data[lh.globalIdx1D()] = lh.blockIdx1D(); + }); + + LaunchRaw(num_blocks, + [=] AMREX_GPU_DEVICE (auto lh) { + data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] += lh.threadIdx1D(); + }); + + LaunchRaw(num_blocks, + [=] AMREX_GPU_DEVICE (auto lh) { + auto block = lh.blockIdxND(); + auto thread = lh.template threadIdxND(); + auto tmp = data[ + block[0] * num_threads + thread[0] + ]; + lh.syncthreads(); + data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] = tmp; + }); + + LaunchRaw(num_blocks, num_threads, + [=] AMREX_GPU_DEVICE (auto lh) { + auto smem = lh.shared_memory(); + auto thread = lh.template threadIdxND(); + auto locid = thread[0]; + smem[lh.threadIdx1D()] = data[lh.blockIdx1D() * lh.blockDim1D() + locid]; + lh.syncthreads(); + data[lh.blockIdx1D() * lh.blockDim1D() + locid] = smem[locid]; + }); + + LaunchRaw(num_blocks, + [=] AMREX_GPU_DEVICE (auto lh) { + data[lh.globalIdx1D()] = + data[lh.globalIdx1D()] == static_cast(lh.blockIdx1D() + lh.threadIdx1D()); + }); + + AMREX_ALWAYS_ASSERT(Reduce::Sum(vect.size(), data, 0) == static_cast(vect.size())); +} + +void test2d () { + + const IntVectND<2> num_blocks {31, 23}; +#ifdef AMREX_USE_GPU + static constexpr int blockdim[2] {8, 32}; +#else + static constexpr int blockdim[2] {1, 1}; +#endif + static constexpr int num_threads = blockdim[0] * blockdim[1]; + + Gpu::DeviceVector vect(static_cast(num_threads) + * num_blocks[0] * num_blocks[1], -999); + + auto * data = vect.dataPtr(); + + LaunchRaw(num_blocks, + [=] AMREX_GPU_DEVICE (auto lh) { + data[lh.globalIdx1D()] = lh.blockIdx1D(); + }); + + LaunchRaw(num_blocks, + [=] AMREX_GPU_DEVICE (auto lh) { + data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] += lh.threadIdx1D(); + }); + + LaunchRaw(num_blocks, + [=] AMREX_GPU_DEVICE (auto lh) { + auto block = lh.blockIdxND(); + auto thread = lh.template threadIdxND(); + auto tmp = data[ + (block[0] + block[1] * num_blocks[0]) * num_threads + + thread[1] + thread[0] * blockdim[1] + ]; + lh.syncthreads(); + data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] = tmp; + }); + + LaunchRaw(num_blocks, num_threads, + [=] AMREX_GPU_DEVICE (auto lh) { + auto smem = lh.shared_memory(); + auto thread1 = lh.template threadIdxND(); + auto locid1 = thread1[1] + thread1[0] * blockdim[0]; + auto thread2 = lh.template threadIdxND(); + auto locid2 = thread2[1] + thread2[0] * blockdim[1]; + smem[lh.threadIdx1D()] = data[lh.blockIdx1D() * lh.blockDim1D() + locid1]; + lh.syncthreads(); + data[lh.blockIdx1D() * lh.blockDim1D() + locid2] = smem[locid2]; + }); + + LaunchRaw(num_blocks, + [=] AMREX_GPU_DEVICE (auto lh) { + data[lh.globalIdx1D()] = + data[lh.globalIdx1D()] == static_cast(lh.blockIdx1D() + lh.threadIdx1D()); + }); + + AMREX_ALWAYS_ASSERT(Reduce::Sum(vect.size(), data, 0) == static_cast(vect.size())); +} + +void test3d () { + + const IntVectND<3> num_blocks {31, 23, 11}; +#ifdef AMREX_USE_GPU + static constexpr int blockdim[3] {2, 8, 16}; +#else + static constexpr int blockdim[3] {1, 1, 1}; +#endif + static constexpr int num_threads = blockdim[0] * blockdim[1] * blockdim[2]; + + Gpu::DeviceVector vect(static_cast(num_threads) + * num_blocks[0] * num_blocks[1] * num_blocks[2], -999); + + auto * data = vect.dataPtr(); + + LaunchRaw(num_blocks, + [=] AMREX_GPU_DEVICE (auto lh) { + data[lh.globalIdx1D()] = lh.blockIdx1D(); + }); + + LaunchRaw(num_blocks, + [=] AMREX_GPU_DEVICE (auto lh) { + data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] += lh.threadIdx1D(); + }); + + LaunchRaw(num_blocks, + [=] AMREX_GPU_DEVICE (auto lh) { + auto block = lh.blockIdxND(); + auto thread = lh.template threadIdxND(); + auto tmp = data[ + (block[0] + block[1] * num_blocks[0] + + block[2] * num_blocks[0] * num_blocks[1]) * num_threads + + thread[2] + thread[1] * blockdim[2] + + thread[0] * blockdim[2] * blockdim[1] + ]; + lh.syncthreads(); + data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] = tmp; + }); + + LaunchRaw(num_blocks, num_threads, + [=] AMREX_GPU_DEVICE (auto lh) { + auto smem = lh.shared_memory(); + auto thread1 = lh.template threadIdxND(); + auto locid1 = thread1[2] + thread1[1] * blockdim[0] + + thread1[0] * blockdim[0] * blockdim[1]; + auto thread2 = lh.template threadIdxND(); + auto locid2 = thread2[0] + thread2[2] * blockdim[2] + + thread2[1] * blockdim[2] * blockdim[1]; + smem[lh.threadIdx1D()] = data[lh.blockIdx1D() * lh.blockDim1D() + locid1]; + lh.syncthreads(); + data[lh.blockIdx1D() * lh.blockDim1D() + locid2] = smem[locid2]; + }); + + LaunchRaw(num_blocks, + [=] AMREX_GPU_DEVICE (auto lh) { + data[lh.globalIdx1D()] = + data[lh.globalIdx1D()] == static_cast(lh.blockIdx1D() + lh.threadIdx1D()); + }); + + AMREX_ALWAYS_ASSERT(Reduce::Sum(vect.size(), data, 0) == static_cast(vect.size())); +} + +int main (int argc, char* argv[]) +{ + amrex::Initialize(argc, argv); + { + test1d(); + + test2d(); + + test3d(); + + amrex::Print() << "Passed! \n"; + } + amrex::Finalize(); +}