AMReX-Codes · AlexanderSinn · Jan 13, 2026 · Jan 15, 2026 · Jan 15, 2026 · Jan 19, 2026
diff --git a/Docs/Doxygen/groups.dox b/Docs/Doxygen/groups.dox
@@ -187,6 +187,7 @@
  * - \ref amrex::ParallelFor
  * - \ref amrex::ParallelForOMP
  * - \ref amrex::ParallelForRNG
+ * - \ref amrex::LaunchRaw
  */
 
 /**

diff --git a/Src/AmrCore/AMReX_TagBox.cpp b/Src/AmrCore/AMReX_TagBox.cpp
@@ -446,46 +446,27 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
         int* ntags = dv_ntags.data() + blockoffset[li];
         const int ncells = fai.fabbox().numPts();
         const char* tags = (*this)[fai].dataPtr();
-#ifdef AMREX_USE_SYCL
-        amrex::launch<block_size>(nblocks[li], sizeof(int)*Gpu::Device::warp_size,
-                                  Gpu::Device::gpuStream(),
-        [=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept
-        {
-            int bid = h.item->get_group_linear_id();
-            int tid = h.item->get_local_id(0);
-            int icell = h.item->get_global_id(0);
 
-            int t = 0;
-            if (icell < ncells && tags[icell] != TagBox::CLEAR) {
-                t = 1;
-            }
-
-            t = Gpu::blockReduce<Gpu::Device::warp_size>
-                (t, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, h);
-            if (tid == 0) {
-                ntags[bid] = t;
-            }
-        });
-#else
-        amrex::launch<block_size>(nblocks[li], Gpu::Device::gpuStream(),
-        [=] AMREX_GPU_DEVICE () noexcept
+        amrex::LaunchRaw<block_size, int>(amrex::IntVectND<1>{nblocks[li]}, // 1D launch over nblocks[li] blocks; one ntags entry is written per block
+        AMREX_IF_SYCL(Gpu::Device::warp_size) AMREX_IF_NOT_SYCL(0), // SYCL: warp_size (presumably a count of int elements; the removed code passed sizeof(int)*warp_size bytes); otherwise 0
+        [=] AMREX_GPU_DEVICE (auto lh) noexcept
         {
-            int bid = blockIdx.x;
-            int tid = threadIdx.x;
-            int icell = block_size*blockIdx.x+threadIdx.x;
+            int bid = lh.blockIdx1D(); // stands in for blockIdx.x of the removed non-SYCL path
+            int tid = lh.threadIdx1D(); // stands in for threadIdx.x of the removed non-SYCL path
+            int icell = lh.globalIdx1D(); // stands in for block_size*blockIdx.x+threadIdx.x
 
             int t = 0;
             if (icell < ncells && tags[icell] != TagBox::CLEAR) {
                 t = 1;
             }
 
             t = Gpu::blockReduce<Gpu::Device::warp_size>
-                (t, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
+                (t, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0 // argument list is closed on the next line
+                AMREX_IF_SYCL(, lh.handler())); // SYCL only: appends ", lh.handler()"; the last ')' closes the blockReduce call in every build
             if (tid == 0) {
                 ntags[bid] = t;
             }
         });
-#endif
     }
 
     Gpu::PinnedVector<int> hv_ntags(ntotblocks);
@@ -524,51 +505,27 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
             const auto lenx  = len.x;
             const int ncells = bx.numPts();
             const char* tags = (*this)[fai].dataPtr();
-#ifdef AMREX_USE_SYCL
-            amrex::launch<block_size>(nblocks[li], sizeof(unsigned int), Gpu::Device::gpuStream(),
-            [=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept
+            amrex::LaunchRaw<block_size, unsigned int>(amrex::IntVectND<1>{nblocks[li]}, 1,
+            [=] AMREX_GPU_DEVICE (auto lh) noexcept
             {
-                int bid = h.item->get_group(0);
-                int tid = h.item->get_local_id(0);
-                int icell = h.item->get_global_id(0);
+                int bid = lh.blockIdx1D();
+                int tid = lh.threadIdx1D();
+                int icell = lh.globalIdx1D();
 
-                unsigned int* shared_counter = (unsigned int*)h.local;
+                unsigned int * shared_counter = lh.shared_memory();
                 if (tid == 0) {
                     *shared_counter = 0;
                 }
-                h.item->barrier(sycl::access::fence_space::local_space);
+                lh.syncthreads();
 
                 if (icell < ncells && tags[icell] != TagBox::CLEAR) {
-                    unsigned int itag = Gpu::Atomic::Add<unsigned int,
-                                                         sycl::access::address_space::local_space>
-                        (shared_counter, 1u);
-                    IntVect* p = dp_tags + dp_tags_offset[iblock_begin+bid];
-                    int k =  icell /   lenxy;
-                    int j = (icell - k*lenxy) /   lenx;
-                    int i = (icell - k*lenxy) - j*lenx;
-                    i += lo.x;
-                    j += lo.y;
-                    k += lo.z;
-                    p[itag] = IntVect(AMREX_D_DECL(i,j,k));
-                }
-            });
-#else
-            amrex::launch<block_size>(nblocks[li], sizeof(unsigned int), Gpu::Device::gpuStream(),
-            [=] AMREX_GPU_DEVICE () noexcept
-            {
-                int bid = blockIdx.x;
-                int tid = threadIdx.x;
-                int icell = block_size*blockIdx.x+threadIdx.x;
 
-                Gpu::SharedMemory<unsigned int> gsm;
-                unsigned int * shared_counter = gsm.dataPtr();
-                if (tid == 0) {
-                    *shared_counter = 0;
-                }
-                __syncthreads();
+                    unsigned int itag = Gpu::Atomic::Add
+#ifdef AMREX_USE_SYCL
+                        <unsigned int, sycl::access::address_space::local_space>
+#endif
+                        (shared_counter, 1u);
 
-                if (icell < ncells && tags[icell] != TagBox::CLEAR) {
-                    unsigned int itag = Gpu::Atomic::Add(shared_counter, 1u);
                     IntVect* p = dp_tags + dp_tags_offset[iblock_begin+bid];
                     int k =  icell /   lenxy;
                     int j = (icell - k*lenxy) /   lenx;
@@ -579,7 +536,6 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
                     p[itag] = IntVect(AMREX_D_DECL(i,j,k));
                 }
             });
-#endif
         }
     }
 

diff --git a/Src/Base/AMReX_Arena.H b/Src/Base/AMReX_Arena.H
@@ -5,7 +5,7 @@
 #include <AMReX_BLassert.H>
 #include <AMReX_INT.H>
 #ifdef AMREX_USE_GPU
-#include <AMReX_GpuControl.H>
+#include <AMReX_GpuTypes.H>
 #endif
 
 #ifdef AMREX_TINY_PROFILING

diff --git a/Src/Base/AMReX_BaseFabUtility.H b/Src/Base/AMReX_BaseFabUtility.H
@@ -42,49 +42,27 @@ void fill (BaseFab<STRUCT>& aos_fab, F const& f)
         std::uint64_t nblocks_long = (ntotcells+nthreads_per_block-1)/nthreads_per_block;
         AMREX_ASSERT(nblocks_long <= std::uint64_t(std::numeric_limits<int>::max()));
         auto nblocks = int(nblocks_long);
-        std::size_t shared_mem_bytes = nthreads_per_block * sizeof(STRUCT);
+        std::size_t shared_mem_elem = nthreads_per_block * STRUCTSIZE; // counted in T elements, not bytes (removed code passed nthreads_per_block * sizeof(STRUCT)); assumes STRUCTSIZE is the number of T per STRUCT - TODO confirm
         T* p = (T*)aos_fab.dataPtr();
-#ifdef AMREX_USE_SYCL
-        amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
-        [=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept
+        amrex::LaunchRaw<nthreads_per_block, T>(amrex::IntVectND<1>{nblocks}, shared_mem_elem, // single code path replacing the separate SYCL and non-SYCL launches
+        [=] AMREX_GPU_DEVICE (auto lh) noexcept
         {
-            auto const icell = std::uint64_t(handler.globalIdx());
-            std::uint64_t const blockDimx = handler.blockDim();
-            std::uint64_t const threadIdxx = handler.threadIdx();
-            std::uint64_t const blockIdxx = handler.blockIdx();
-            auto const shared = (T*)handler.sharedMemory();
+            std::uint64_t const icell =
+                std::uint64_t(lh.blockDim1D())*lh.blockIdx1D()+lh.threadIdx1D(); // flat cell index; blockDim is widened to 64 bits before the multiply
+            T* const shared = lh.shared_memory(); // per-block staging buffer viewed as T
             if (icell < indexer.numPts()) {
-                auto ga = new(shared+threadIdxx*STRUCTSIZE) STRUCT;
+                auto ga = new(shared+std::uint64_t(lh.threadIdx1D())*STRUCTSIZE) STRUCT; // placement-new this thread's STRUCT at offset threadIdx*STRUCTSIZE in the staging buffer
                 auto [i, j, k] = indexer(icell);
                 f(*ga, i, j, k);
             }
-            handler.sharedBarrier();
-            for (std::uint64_t m = threadIdxx,
-                     mend = amrex::min<std::uint64_t>(blockDimx, indexer.numPts()-blockDimx*blockIdxx) * STRUCTSIZE;
-                 m < mend; m += blockDimx) {
-                p[blockDimx*blockIdxx*STRUCTSIZE+m] = shared[m];
+            lh.syncthreads(); // stands in for __syncthreads() / handler.sharedBarrier(); the loop below reads elements written by other threads
+            for (std::uint64_t m = lh.threadIdx1D(),
+                    mend = amrex::min<std::uint64_t>(lh.blockDim1D(), // the last block may cover fewer than blockDim cells
+                    indexer.numPts()-std::uint64_t(lh.blockDim1D())*lh.blockIdx1D()) * STRUCTSIZE;
+                 m < mend; m += lh.blockDim1D()) { // each thread copies every blockDim-th T element
+                p[std::uint64_t(lh.blockDim1D())*lh.blockIdx1D()*STRUCTSIZE+m] = shared[m]; // write this block's staged elements to their slice of p
             }
         });
-#else
-        amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
-        [=] AMREX_GPU_DEVICE () noexcept
-        {
-            std::uint64_t const icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x;
-            Gpu::SharedMemory<T> gsm;
-            T* const shared = gsm.dataPtr();
-            if (icell < indexer.numPts()) {
-                auto ga = new(shared+std::uint64_t(threadIdx.x)*STRUCTSIZE) STRUCT;
-                auto [i, j, k] = indexer(icell);
-                f(*ga, i, j, k);
-            }
-            __syncthreads();
-            for (std::uint64_t m = threadIdx.x,
-                     mend = amrex::min<std::uint64_t>(blockDim.x, indexer.numPts()-std::uint64_t(blockDim.x)*blockIdx.x) * STRUCTSIZE;
-                 m < mend; m += blockDim.x) {
-                p[std::uint64_t(blockDim.x)*blockIdx.x*STRUCTSIZE+m] = shared[m];
-            }
-        });
-#endif
     } else
 #endif
     {

diff --git a/Src/Base/AMReX_FBI.H b/Src/Base/AMReX_FBI.H
@@ -295,25 +295,16 @@ void deterministic_fab_to_fab (Vector<Array4CopyTag<T0,T1>> const& a_tags, int s
     auto const* pntags = d_ntags.data();
     auto const nblocks = int(h_ntags.size()-1);
     constexpr auto nthreads = 256;
-    amrex::launch<nthreads>(nblocks, Gpu::gpuStream(),
-#ifdef AMREX_USE_SYCL
-        [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept
-        [[sycl::reqd_work_group_size(nthreads)]]
-#else
-        [=] AMREX_GPU_DEVICE () noexcept
-#endif
-    {
-#ifdef AMREX_USE_SYCL
-        Dim1 blockIdx{item.get_group_linear_id()};
-        Dim1 threadIdx{item.get_local_linear_id()};
-#endif
 
-        for (unsigned int itag = pntags[blockIdx.x]; itag < pntags[blockIdx.x+1]; ++itag) {
+    amrex::LaunchRaw<nthreads>(amrex::IntVectND<1>{nblocks},
+    [=] AMREX_GPU_DEVICE (auto lh) noexcept
+    {
+        for (unsigned int itag = pntags[lh.blockIdx1D()]; itag < pntags[lh.blockIdx1D()+1]; ++itag) {
             auto const tag = ptag[itag];
             auto ncells = int(tag.dbox.numPts());
             const auto len = amrex::length(tag.dbox);
             const auto lo  = amrex::lbound(tag.dbox);
-            for (int icell = int(threadIdx.x); icell < ncells; icell += nthreads) {
+            for (int icell = int(lh.threadIdx1D()); icell < ncells; icell += nthreads) {
                 int k =  icell /   (len.x*len.y);
                 int j = (icell - k*(len.x*len.y)) /   len.x;
                 int i = (icell - k*(len.x*len.y)) - j*len.x;
@@ -328,12 +319,8 @@ void deterministic_fab_to_fab (Vector<Array4CopyTag<T0,T1>> const& a_tags, int s
                 }
             }
 
-            if (itag+1 < pntags[blockIdx.x+1]) {
-#ifdef AMREX_USE_SYCL
-                sycl::group_barrier(item.get_group());
-#else
-                __syncthreads();
-#endif
+            if (itag+1 < pntags[lh.blockIdx1D()+1]) {
+                lh.syncthreads();
             }
         }
     });

diff --git a/Src/Base/AMReX_GpuContainers.H b/Src/Base/AMReX_GpuContainers.H
@@ -448,51 +448,27 @@ namespace amrex::Gpu {
             auto pu = reinterpret_cast<U*>(p);
             constexpr int nthreads_per_block = (sizeof(T) <= 64) ? 256 : 128;
             int nblocks = static_cast<int>((N+nthreads_per_block-1)/nthreads_per_block);
-            std::size_t shared_mem_bytes = nthreads_per_block * sizeof(T);
-#ifdef AMREX_USE_SYCL
-            amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
-            [=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept
+            std::size_t shared_mem_Uelem = nthreads_per_block * nU; // counted in U elements, not bytes (removed code passed nthreads_per_block * sizeof(T)); assumes nU is the number of U per T - TODO confirm
+            amrex::LaunchRaw<nthreads_per_block, U>(amrex::IntVectND<1>{nblocks}, shared_mem_Uelem, // single code path replacing the separate SYCL and non-SYCL launches
+            [=] AMREX_GPU_DEVICE (auto lh) noexcept
             {
-                Long i = handler.globalIdx();
-                Long blockDimx = handler.blockDim();
-                Long threadIdxx = handler.threadIdx();
-                Long blockIdxx = handler.blockIdx();
-                auto const shared_U = (U*)handler.sharedMemory();
-                auto const shared_T = (T*)shared_U;
-                if (i < N) {
-                    auto ga = new(shared_T+threadIdxx) T;
-                    f(*ga, i);
-                }
-                handler.sharedBarrier();
-                for (Long m = threadIdxx,
-                         mend = nU * amrex::min(blockDimx, N-blockDimx*blockIdxx);
-                     m < mend; m += blockDimx) {
-                    pu[blockDimx*blockIdxx*nU+m] = shared_U[m];
-                }
-            });
-#else
-            amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
-                          [=] AMREX_GPU_DEVICE () noexcept
-            {
-                Long blockDimx = blockDim.x;
-                Long threadIdxx = threadIdx.x;
-                Long blockIdxx = blockIdx.x;
+                Long blockDimx = lh.blockDim1D(); // stored as Long so the index arithmetic below is done in Long
+                Long threadIdxx = lh.threadIdx1D(); // stands in for threadIdx.x of the removed non-SYCL path
+                Long blockIdxx = lh.blockIdx1D(); // stands in for blockIdx.x of the removed non-SYCL path
                 Long i = blockDimx*blockIdxx + threadIdxx;
-                Gpu::SharedMemory<U> gsm;
-                auto const shared_U = gsm.dataPtr();
+                auto const shared_U = lh.shared_memory(); // per-block staging buffer; reinterpreted as T* on the next line
                 auto const shared_T = (T*)shared_U;
                 if (i < N) {
                     auto ga = new(shared_T+threadIdxx) T;
                     f(*ga, i);
                 }
-                __syncthreads();
+                lh.syncthreads(); // stands in for __syncthreads() / handler.sharedBarrier(); the loop below reads elements written by other threads
                 for (Long m = threadIdxx,
                          mend = nU * amrex::min(blockDimx, N-blockDimx*blockIdxx);
                      m < mend; m += blockDimx) {
                     pu[blockDimx*blockIdxx*nU+m] = shared_U[m];
                 }
             });
-#endif
         }
 #endif
     }

diff --git a/Src/Base/AMReX_GpuControl.H b/Src/Base/AMReX_GpuControl.H
@@ -3,7 +3,6 @@
 #include <AMReX_Config.H>
 
 #include <AMReX_GpuQualifiers.H>
-#include <AMReX_GpuTypes.H>
 
 #include <utility>
 
@@ -33,12 +32,28 @@
 #define AMREX_HIP_OR_CUDA_OR_SYCL(a,b,c) ((void)0);
 #endif
 
+#if defined(AMREX_USE_HIP) || defined(AMREX_USE_CUDA)
+#define AMREX_HIP_CUDA_OR_SYCL_OR_CPU(a,b,c) a // HIP or CUDA build: expands to the first argument
+#elif defined(AMREX_USE_SYCL)
+#define AMREX_HIP_CUDA_OR_SYCL_OR_CPU(a,b,c) b // SYCL build: expands to the second argument
+#else
+#define AMREX_HIP_CUDA_OR_SYCL_OR_CPU(a,b,c) c // none of HIP/CUDA/SYCL defined (the CPU case, per the macro name): expands to the third argument
+#endif
+
 #ifdef AMREX_USE_GPU
 #define AMREX_GPU_OR_CPU(a,b) a
 #else
 #define AMREX_GPU_OR_CPU(a,b) b
 #endif
 
+#ifdef AMREX_USE_SYCL
+#define AMREX_IF_SYCL(...) __VA_ARGS__ // SYCL build: expands to its arguments verbatim (commas included)
+#define AMREX_IF_NOT_SYCL(...) // SYCL build: expands to nothing
+#else
+#define AMREX_IF_SYCL(...) // non-SYCL build: expands to nothing
+#define AMREX_IF_NOT_SYCL(...) __VA_ARGS__ // non-SYCL build: expands to its arguments verbatim (commas included)
+#endif
+
 #ifdef AMREX_USE_SYCL
 #define AMREX_SYCL_ONLY(a) a
 #else
@@ -75,15 +90,7 @@ namespace amrex {
 #define AMREX_DEFAULT_RUNON =amrex::RunOn::Host // by default run on Host when compiling for Cpu
 #endif
 
-namespace amrex { // NOLINT(modernize-concat-nested-namespaces)
-
-#ifdef AMREX_USE_HIP
-using gpuStream_t = hipStream_t;
-#elif defined(AMREX_USE_CUDA)
-using gpuStream_t = cudaStream_t;
-#endif
-
-namespace Gpu {
+namespace amrex::Gpu {
 
 #if defined(AMREX_USE_GPU)
 
@@ -225,7 +232,6 @@ namespace Gpu {
 
 #endif
 
-}
 }
 
 #endif