diff --git a/Docs/Doxygen/groups.dox b/Docs/Doxygen/groups.dox
index c2d946159f..1265c9a505 100644
--- a/Docs/Doxygen/groups.dox
+++ b/Docs/Doxygen/groups.dox
@@ -187,6 +187,7 @@
  * - \ref amrex::ParallelFor
  * - \ref amrex::ParallelForOMP
  * - \ref amrex::ParallelForRNG
+ * - \ref amrex::LaunchRaw
  */
 
 /**
diff --git a/Src/AmrCore/AMReX_TagBox.cpp b/Src/AmrCore/AMReX_TagBox.cpp
index 3e8f4eb560..62e6d05149 100644
--- a/Src/AmrCore/AMReX_TagBox.cpp
+++ b/Src/AmrCore/AMReX_TagBox.cpp
@@ -446,33 +446,14 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
         int* ntags = dv_ntags.data() + blockoffset[li];
         const int ncells = fai.fabbox().numPts();
         const char* tags = (*this)[fai].dataPtr();
-#ifdef AMREX_USE_SYCL
-        amrex::launch<block_size>(nblocks[li], sizeof(int)*Gpu::Device::warp_size,
-                                  Gpu::Device::gpuStream(),
-        [=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept
-        {
-            int bid = h.item->get_group_linear_id();
-            int tid = h.item->get_local_id(0);
-            int icell = h.item->get_global_id(0);
 
-            int t = 0;
-            if (icell < ncells && tags[icell] != TagBox::CLEAR) {
-                t = 1;
-            }
-
-            t = Gpu::blockReduce<Gpu::Device::warp_size>
-                (t, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, h);
-            if (tid == 0) {
-                ntags[bid] = t;
-            }
-        });
-#else
-        amrex::launch<block_size>(nblocks[li], Gpu::Device::gpuStream(),
-        [=] AMREX_GPU_DEVICE () noexcept
+        amrex::LaunchRaw<block_size, int>(amrex::IntVectND<1>{nblocks[li]},
+        AMREX_IF_SYCL(Gpu::Device::warp_size) AMREX_IF_NOT_SYCL(0),
+        [=] AMREX_GPU_DEVICE (auto lh) noexcept
         {
-            int bid = blockIdx.x;
-            int tid = threadIdx.x;
-            int icell = block_size*blockIdx.x+threadIdx.x;
+            int bid = lh.blockIdx1D();
+            int tid = lh.threadIdx1D();
+            int icell = lh.globalIdx1D();
 
             int t = 0;
             if (icell < ncells && tags[icell] != TagBox::CLEAR) {
@@ -480,12 +461,12 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
             }
 
             t = Gpu::blockReduce<Gpu::Device::warp_size>
-                (t, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
+                (t, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0
+                AMREX_IF_SYCL(, lh.handler()));
             if (tid == 0) {
                 ntags[bid] = t;
             }
         });
-#endif
     }
 
     Gpu::PinnedVector<int> hv_ntags(ntotblocks);
@@ -524,51 +505,27 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
             const auto lenx  = len.x;
             const int ncells = bx.numPts();
             const char* tags = (*this)[fai].dataPtr();
-#ifdef AMREX_USE_SYCL
-            amrex::launch<block_size>(nblocks[li], sizeof(unsigned int), Gpu::Device::gpuStream(),
-            [=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept
+            amrex::LaunchRaw<block_size, unsigned int>(amrex::IntVectND<1>{nblocks[li]}, 1,
+            [=] AMREX_GPU_DEVICE (auto lh) noexcept
             {
-                int bid = h.item->get_group(0);
-                int tid = h.item->get_local_id(0);
-                int icell = h.item->get_global_id(0);
+                int bid = lh.blockIdx1D();
+                int tid = lh.threadIdx1D();
+                int icell = lh.globalIdx1D();
 
-                unsigned int* shared_counter = (unsigned int*)h.local;
+                unsigned int * shared_counter = lh.shared_memory();
                 if (tid == 0) {
                     *shared_counter = 0;
                 }
-                h.item->barrier(sycl::access::fence_space::local_space);
+                lh.syncthreads();
 
                 if (icell < ncells && tags[icell] != TagBox::CLEAR) {
-                    unsigned int itag = Gpu::Atomic::Add<unsigned int,
-                                                         sycl::access::address_space::local_space>
-                        (shared_counter, 1u);
-                    IntVect* p = dp_tags + dp_tags_offset[iblock_begin+bid];
-                    int k =  icell /   lenxy;
-                    int j = (icell - k*lenxy) /   lenx;
-                    int i = (icell - k*lenxy) - j*lenx;
-                    i += lo.x;
-                    j += lo.y;
-                    k += lo.z;
-                    p[itag] = IntVect(AMREX_D_DECL(i,j,k));
-                }
-            });
-#else
-            amrex::launch<block_size>(nblocks[li], sizeof(unsigned int), Gpu::Device::gpuStream(),
-            [=] AMREX_GPU_DEVICE () noexcept
-            {
-                int bid = blockIdx.x;
-                int tid = threadIdx.x;
-                int icell = block_size*blockIdx.x+threadIdx.x;
 
-                Gpu::SharedMemory<unsigned int> gsm;
-                unsigned int * shared_counter = gsm.dataPtr();
-                if (tid == 0) {
-                    *shared_counter = 0;
-                }
-                __syncthreads();
+                    unsigned int itag = Gpu::Atomic::Add
+#ifdef AMREX_USE_SYCL
+                        <unsigned int, sycl::access::address_space::local_space>
+#endif
+                        (shared_counter, 1u);
 
-                if (icell < ncells && tags[icell] != TagBox::CLEAR) {
-                    unsigned int itag = Gpu::Atomic::Add(shared_counter, 1u);
                     IntVect* p = dp_tags + dp_tags_offset[iblock_begin+bid];
                     int k =  icell /   lenxy;
                     int j = (icell - k*lenxy) /   lenx;
@@ -579,7 +536,6 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
                     p[itag] = IntVect(AMREX_D_DECL(i,j,k));
                 }
             });
-#endif
         }
     }
 
diff --git a/Src/Base/AMReX_Arena.H b/Src/Base/AMReX_Arena.H
index c8674aa004..41328f4982 100644
--- a/Src/Base/AMReX_Arena.H
+++ b/Src/Base/AMReX_Arena.H
@@ -5,7 +5,7 @@
 #include <AMReX_BLassert.H>
 #include <AMReX_INT.H>
 #ifdef AMREX_USE_GPU
-#include <AMReX_GpuControl.H>
+#include <AMReX_GpuTypes.H>
 #endif
 
 #ifdef AMREX_TINY_PROFILING
diff --git a/Src/Base/AMReX_BaseFabUtility.H b/Src/Base/AMReX_BaseFabUtility.H
index d0c1e78847..c6c125db5b 100644
--- a/Src/Base/AMReX_BaseFabUtility.H
+++ b/Src/Base/AMReX_BaseFabUtility.H
@@ -42,49 +42,27 @@ void fill (BaseFab<STRUCT>& aos_fab, F const& f)
         std::uint64_t nblocks_long = (ntotcells+nthreads_per_block-1)/nthreads_per_block;
         AMREX_ASSERT(nblocks_long <= std::uint64_t(std::numeric_limits<int>::max()));
         auto nblocks = int(nblocks_long);
-        std::size_t shared_mem_bytes = nthreads_per_block * sizeof(STRUCT);
+        std::size_t shared_mem_elem = nthreads_per_block * STRUCTSIZE;
         T* p = (T*)aos_fab.dataPtr();
-#ifdef AMREX_USE_SYCL
-        amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
-        [=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept
+        amrex::LaunchRaw<nthreads_per_block, T>(amrex::IntVectND<1>{nblocks}, shared_mem_elem,
+        [=] AMREX_GPU_DEVICE (auto lh) noexcept
         {
-            auto const icell = std::uint64_t(handler.globalIdx());
-            std::uint64_t const blockDimx = handler.blockDim();
-            std::uint64_t const threadIdxx = handler.threadIdx();
-            std::uint64_t const blockIdxx = handler.blockIdx();
-            auto const shared = (T*)handler.sharedMemory();
+            std::uint64_t const icell =
+                std::uint64_t(lh.blockDim1D())*lh.blockIdx1D()+lh.threadIdx1D();
+            T* const shared = lh.shared_memory();
             if (icell < indexer.numPts()) {
-                auto ga = new(shared+threadIdxx*STRUCTSIZE) STRUCT;
+                auto ga = new(shared+std::uint64_t(lh.threadIdx1D())*STRUCTSIZE) STRUCT;
                 auto [i, j, k] = indexer(icell);
                 f(*ga, i, j, k);
             }
-            handler.sharedBarrier();
-            for (std::uint64_t m = threadIdxx,
-                     mend = amrex::min<std::uint64_t>(blockDimx, indexer.numPts()-blockDimx*blockIdxx) * STRUCTSIZE;
-                 m < mend; m += blockDimx) {
-                p[blockDimx*blockIdxx*STRUCTSIZE+m] = shared[m];
+            lh.syncthreads();
+            for (std::uint64_t m = lh.threadIdx1D(),
+                    mend = amrex::min<std::uint64_t>(lh.blockDim1D(),
+                    indexer.numPts()-std::uint64_t(lh.blockDim1D())*lh.blockIdx1D()) * STRUCTSIZE;
+                 m < mend; m += lh.blockDim1D()) {
+                p[std::uint64_t(lh.blockDim1D())*lh.blockIdx1D()*STRUCTSIZE+m] = shared[m];
             }
         });
-#else
-        amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
-        [=] AMREX_GPU_DEVICE () noexcept
-        {
-            std::uint64_t const icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x;
-            Gpu::SharedMemory<T> gsm;
-            T* const shared = gsm.dataPtr();
-            if (icell < indexer.numPts()) {
-                auto ga = new(shared+std::uint64_t(threadIdx.x)*STRUCTSIZE) STRUCT;
-                auto [i, j, k] = indexer(icell);
-                f(*ga, i, j, k);
-            }
-            __syncthreads();
-            for (std::uint64_t m = threadIdx.x,
-                     mend = amrex::min<std::uint64_t>(blockDim.x, indexer.numPts()-std::uint64_t(blockDim.x)*blockIdx.x) * STRUCTSIZE;
-                 m < mend; m += blockDim.x) {
-                p[std::uint64_t(blockDim.x)*blockIdx.x*STRUCTSIZE+m] = shared[m];
-            }
-        });
-#endif
     } else
 #endif
     {
diff --git a/Src/Base/AMReX_FBI.H b/Src/Base/AMReX_FBI.H
index 1b3f8cc13f..ee9584a4c0 100644
--- a/Src/Base/AMReX_FBI.H
+++ b/Src/Base/AMReX_FBI.H
@@ -295,25 +295,16 @@ void deterministic_fab_to_fab (Vector<Array4CopyTag<T0,T1>> const& a_tags, int s
     auto const* pntags = d_ntags.data();
     auto const nblocks = int(h_ntags.size()-1);
     constexpr auto nthreads = 256;
-    amrex::launch<nthreads>(nblocks, Gpu::gpuStream(),
-#ifdef AMREX_USE_SYCL
-        [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept
-        [[sycl::reqd_work_group_size(nthreads)]]
-#else
-        [=] AMREX_GPU_DEVICE () noexcept
-#endif
-    {
-#ifdef AMREX_USE_SYCL
-        Dim1 blockIdx{item.get_group_linear_id()};
-        Dim1 threadIdx{item.get_local_linear_id()};
-#endif
 
-        for (unsigned int itag = pntags[blockIdx.x]; itag < pntags[blockIdx.x+1]; ++itag) {
+    amrex::LaunchRaw<nthreads>(amrex::IntVectND<1>{nblocks},
+    [=] AMREX_GPU_DEVICE (auto lh) noexcept
+    {
+        for (unsigned int itag = pntags[lh.blockIdx1D()]; itag < pntags[lh.blockIdx1D()+1]; ++itag) {
             auto const tag = ptag[itag];
             auto ncells = int(tag.dbox.numPts());
             const auto len = amrex::length(tag.dbox);
             const auto lo  = amrex::lbound(tag.dbox);
-            for (int icell = int(threadIdx.x); icell < ncells; icell += nthreads) {
+            for (int icell = int(lh.threadIdx1D()); icell < ncells; icell += nthreads) {
                 int k =  icell /   (len.x*len.y);
                 int j = (icell - k*(len.x*len.y)) /   len.x;
                 int i = (icell - k*(len.x*len.y)) - j*len.x;
@@ -328,12 +319,8 @@ void deterministic_fab_to_fab (Vector<Array4CopyTag<T0,T1>> const& a_tags, int s
                 }
             }
 
-            if (itag+1 < pntags[blockIdx.x+1]) {
-#ifdef AMREX_USE_SYCL
-                sycl::group_barrier(item.get_group());
-#else
-                __syncthreads();
-#endif
+            if (itag+1 < pntags[lh.blockIdx1D()+1]) {
+                lh.syncthreads();
             }
         }
     });
diff --git a/Src/Base/AMReX_GpuContainers.H b/Src/Base/AMReX_GpuContainers.H
index d404e8cfd1..981898f56c 100644
--- a/Src/Base/AMReX_GpuContainers.H
+++ b/Src/Base/AMReX_GpuContainers.H
@@ -448,51 +448,27 @@ namespace amrex::Gpu {
             auto pu = reinterpret_cast<U*>(p);
             constexpr int nthreads_per_block = (sizeof(T) <= 64) ? 256 : 128;
             int nblocks = static_cast<int>((N+nthreads_per_block-1)/nthreads_per_block);
-            std::size_t shared_mem_bytes = nthreads_per_block * sizeof(T);
-#ifdef AMREX_USE_SYCL
-            amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
-            [=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept
+            std::size_t shared_mem_Uelem = nthreads_per_block * nU;
+            amrex::LaunchRaw<nthreads_per_block, U>(amrex::IntVectND<1>{nblocks}, shared_mem_Uelem,
+            [=] AMREX_GPU_DEVICE (auto lh) noexcept
             {
-                Long i = handler.globalIdx();
-                Long blockDimx = handler.blockDim();
-                Long threadIdxx = handler.threadIdx();
-                Long blockIdxx = handler.blockIdx();
-                auto const shared_U = (U*)handler.sharedMemory();
-                auto const shared_T = (T*)shared_U;
-                if (i < N) {
-                    auto ga = new(shared_T+threadIdxx) T;
-                    f(*ga, i);
-                }
-                handler.sharedBarrier();
-                for (Long m = threadIdxx,
-                         mend = nU * amrex::min(blockDimx, N-blockDimx*blockIdxx);
-                     m < mend; m += blockDimx) {
-                    pu[blockDimx*blockIdxx*nU+m] = shared_U[m];
-                }
-            });
-#else
-            amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
-                          [=] AMREX_GPU_DEVICE () noexcept
-            {
-                Long blockDimx = blockDim.x;
-                Long threadIdxx = threadIdx.x;
-                Long blockIdxx = blockIdx.x;
+                Long blockDimx = lh.blockDim1D();
+                Long threadIdxx = lh.threadIdx1D();
+                Long blockIdxx = lh.blockIdx1D();
                 Long i = blockDimx*blockIdxx + threadIdxx;
-                Gpu::SharedMemory<U> gsm;
-                auto const shared_U = gsm.dataPtr();
+                auto const shared_U = lh.shared_memory();
                 auto const shared_T = (T*)shared_U;
                 if (i < N) {
                     auto ga = new(shared_T+threadIdxx) T;
                     f(*ga, i);
                 }
-                __syncthreads();
+                lh.syncthreads();
                 for (Long m = threadIdxx,
                          mend = nU * amrex::min(blockDimx, N-blockDimx*blockIdxx);
                      m < mend; m += blockDimx) {
                     pu[blockDimx*blockIdxx*nU+m] = shared_U[m];
                 }
             });
-#endif
         }
 #endif
     }
diff --git a/Src/Base/AMReX_GpuControl.H b/Src/Base/AMReX_GpuControl.H
index e03951a724..9310f9ffae 100644
--- a/Src/Base/AMReX_GpuControl.H
+++ b/Src/Base/AMReX_GpuControl.H
@@ -3,7 +3,6 @@
 #include <AMReX_Config.H>
 
 #include <AMReX_GpuQualifiers.H>
-#include <AMReX_GpuTypes.H>
 
 #include <utility>
 
@@ -33,12 +32,28 @@
 #define AMREX_HIP_OR_CUDA_OR_SYCL(a,b,c) ((void)0);
 #endif
 
+#if defined(AMREX_USE_HIP) || defined(AMREX_USE_CUDA)
+#define AMREX_HIP_CUDA_OR_SYCL_OR_CPU(a,b,c) a
+#elif defined(AMREX_USE_SYCL)
+#define AMREX_HIP_CUDA_OR_SYCL_OR_CPU(a,b,c) b
+#else
+#define AMREX_HIP_CUDA_OR_SYCL_OR_CPU(a,b,c) c
+#endif
+
 #ifdef AMREX_USE_GPU
 #define AMREX_GPU_OR_CPU(a,b) a
 #else
 #define AMREX_GPU_OR_CPU(a,b) b
 #endif
 
+#ifdef AMREX_USE_SYCL
+#define AMREX_IF_SYCL(...) __VA_ARGS__
+#define AMREX_IF_NOT_SYCL(...)
+#else
+#define AMREX_IF_SYCL(...)
+#define AMREX_IF_NOT_SYCL(...) __VA_ARGS__
+#endif
+
 #ifdef AMREX_USE_SYCL
 #define AMREX_SYCL_ONLY(a) a
 #else
@@ -75,15 +90,7 @@ namespace amrex {
 #define AMREX_DEFAULT_RUNON =amrex::RunOn::Host // by default run on Host when compiling for Cpu
 #endif
 
-namespace amrex { // NOLINT(modernize-concat-nested-namespaces)
-
-#ifdef AMREX_USE_HIP
-using gpuStream_t = hipStream_t;
-#elif defined(AMREX_USE_CUDA)
-using gpuStream_t = cudaStream_t;
-#endif
-
-namespace Gpu {
+namespace amrex::Gpu {
 
 #if defined(AMREX_USE_GPU)
 
@@ -225,7 +232,6 @@ namespace Gpu {
 
 #endif
 
-}
 }
 
 #endif
diff --git a/Src/Base/AMReX_GpuLaunchFunctsC.H b/Src/Base/AMReX_GpuLaunchFunctsC.H
index 1ffa48dbbb..a5a4ec4cdb 100644
--- a/Src/Base/AMReX_GpuLaunchFunctsC.H
+++ b/Src/Base/AMReX_GpuLaunchFunctsC.H
@@ -2,6 +2,8 @@
 #define AMREX_GPU_LAUNCH_FUNCTS_C_H_
 #include <AMReX_Config.H>
 
+#include <vector>
+
 namespace amrex {
 
 /// \cond DOXYGEN_IGNORE
@@ -131,6 +133,65 @@ void launch (T const& n, L&& f) noexcept
     std::forward<L>(f)(n);
 }
 
+template<int MT, int dim, typename L>
+void LaunchRaw (IntVectND<dim> nblocks, L const& f)
+{
+    static_assert(MT == 1, "LaunchRaw with CPU backend only works with one thread per block! "
+        "Otherwise the syncthreads function would not work");
+    if constexpr(dim == 1) {
+        for (int bx=0; bx < nblocks[0]; ++bx) {
+            f(Gpu::LaunchHandler<MT, void, dim>{
+                IntVectND<1>{bx}, nblocks, nullptr});
+        }
+    } else if constexpr(dim == 2) {
+        for (int by=0; by < nblocks[1]; ++by) {
+            for (int bx=0; bx < nblocks[0]; ++bx) {
+                f(Gpu::LaunchHandler<MT, void, dim>{
+                    IntVectND<2>{bx, by}, nblocks, nullptr});
+            }
+        }
+    } else {
+        for (int bz=0; bz < nblocks[2]; ++bz) {
+            for (int by=0; by < nblocks[1]; ++by) {
+                for (int bx=0; bx < nblocks[0]; ++bx) {
+                    f(Gpu::LaunchHandler<MT, void, dim>{
+                        IntVectND<3>{bx, by, bz}, nblocks, nullptr});
+                }
+            }
+        }
+    }
+}
+
+template<int MT, typename T, int dim, typename L>
+void LaunchRaw (IntVectND<dim> nblocks, std::size_t shared_mem_elements, L const& f)
+{
+    static_assert(MT == 1, "LaunchRaw with CPU backend only works with one thread per block! "
+        "Otherwise the syncthreads function would not work");
+    std::vector<T> smem(shared_mem_elements);
+    if constexpr(dim == 1) {
+        for (int bx=0; bx < nblocks[0]; ++bx) {
+            f(Gpu::LaunchHandler<MT, T, dim>{
+                IntVectND<1>{bx}, nblocks, smem.data()});
+        }
+    } else if constexpr(dim == 2) {
+        for (int by=0; by < nblocks[1]; ++by) {
+            for (int bx=0; bx < nblocks[0]; ++bx) {
+                f(Gpu::LaunchHandler<MT, T, dim>{
+                    IntVectND<2>{bx, by}, nblocks, smem.data()});
+            }
+        }
+    } else {
+        for (int bz=0; bz < nblocks[2]; ++bz) {
+            for (int by=0; by < nblocks[1]; ++by) {
+                for (int bx=0; bx < nblocks[0]; ++bx) {
+                    f(Gpu::LaunchHandler<MT, T, dim>{
+                        IntVectND<3>{bx, by, bz}, nblocks, smem.data()});
+                }
+            }
+        }
+    }
+}
+
 template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
 AMREX_ATTRIBUTE_FLATTEN_FOR
 void For (T n, L const& f) noexcept
diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H
index e4d377fda8..8140b95015 100644
--- a/Src/Base/AMReX_GpuLaunchFunctsG.H
+++ b/Src/Base/AMReX_GpuLaunchFunctsG.H
@@ -896,6 +896,110 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/,
     }
 }
 
+template<int MT, int dim, typename L>
+void LaunchRaw (IntVectND<dim> nblocks, L const& f)
+{
+    detail::SyclKernelDevPtr<L> skdp(f, Gpu::gpuStream());
+    L const* pf = skdp.template get<0>();
+    amrex::ignore_unused(pf);
+
+    auto& q = Gpu::Device::streamQueue();
+
+    sycl::range<1> threads_per_block{MT};
+    sycl::range<1> threads_total{MT};
+
+    for (int i=0; i<dim; ++i) {
+        threads_total[0] *= static_cast<std::size_t>(nblocks[i]);
+    }
+
+    // With SYCL it is difficult to combine a 1D blockDim with an ND gridDim,
+    // so we use a 1D sycl range and split the blockID with BoxIndexerND. Note that
+    // BoxIndexerND is a bit inefficient since it is adding the smallEnd which is always zero here.
+    BoxIndexerND<dim> bxi(BoxND<dim>{IntVectND<dim>(0), nblocks - 1});
+
+    try {
+        q.submit([&] (sycl::handler& h) {
+            h.parallel_for(sycl::nd_range<1>(threads_total, threads_per_block),
+                [=] (sycl::nd_item<1> item)
+                [[sycl::reqd_work_group_size(MT)]]
+                [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
+                {
+                    if constexpr (detail::is_big_kernel<L>()) {
+                        (*pf)(Gpu::LaunchHandler<MT, void, dim>{
+                            &item,
+                            bxi.intVect(item.get_group(0)),
+                            nullptr
+                        });
+                    } else {
+                        f(Gpu::LaunchHandler<MT, void, dim>{
+                            &item,
+                            bxi.intVect(item.get_group(0)),
+                            nullptr
+                        });
+                    }
+                });
+        });
+    } catch (sycl::exception const& ex) {
+        amrex::Abort(std::string("LaunchRaw: ")+ex.what()+"!!!!!");
+    }
+}
+
+template<int MT, typename T, int dim, typename L>
+void LaunchRaw (IntVectND<dim> nblocks, std::size_t shared_mem_elements, L const& f)
+{
+    detail::SyclKernelDevPtr<L> skdp(f, Gpu::gpuStream());
+    L const* pf = skdp.template get<0>();
+    amrex::ignore_unused(pf);
+
+    auto& q = Gpu::Device::streamQueue();
+    // use double4 for shared memory as it has the largest alignment of types that might be used
+    using ST = sycl::double4;
+    static_assert(alignof(ST) >= alignof(T));
+    const std::size_t shared_mem_num = (shared_mem_elements*sizeof(T)+sizeof(ST)-1) / sizeof(ST);
+
+    sycl::range<1> threads_per_block{MT};
+    sycl::range<1> threads_total{MT};
+
+    for (int i=0; i<dim; ++i) {
+        threads_total[0] *= static_cast<std::size_t>(nblocks[i]);
+    }
+
+    // With SYCL it is difficult to combine a 1D blockDim with an ND gridDim,
+    // so we use a 1D sycl range and split the blockID with BoxIndexerND. Note that
+    // BoxIndexerND is a bit inefficient since it is adding the smallEnd which is always zero here.
+    BoxIndexerND<dim> bxi(BoxND<dim>{IntVectND<dim>(0), nblocks - 1});
+
+    try {
+        q.submit([&] (sycl::handler& h) {
+            sycl::local_accessor<ST> shared_data(sycl::range<1>(shared_mem_num), h);
+            h.parallel_for(sycl::nd_range<1>(threads_total, threads_per_block),
+                [=] (sycl::nd_item<1> item)
+                [[sycl::reqd_work_group_size(MT)]]
+                [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
+                {
+                    T* shared_mem = reinterpret_cast<T*>(
+                        shared_data.template get_multi_ptr<sycl::access::decorated::yes>().get());
+                    if constexpr (detail::is_big_kernel<L>()) {
+                        (*pf)(Gpu::LaunchHandler<MT, T, dim>{
+                            &item,
+                            bxi.intVect(item.get_group(0)),
+                            shared_mem
+                        });
+                    } else {
+                        f(Gpu::LaunchHandler<MT, T, dim>{
+                            &item,
+                            bxi.intVect(item.get_group(0)),
+                            shared_mem
+                        });
+                    }
+                });
+        });
+    } catch (sycl::exception const& ex) {
+        amrex::Abort(std::string("LaunchRaw: ")+ex.what()+"!!!!!");
+    }
+}
+
+
 #else
 // CUDA or HIP
 
@@ -1284,6 +1388,87 @@ ParallelFor (Gpu::KernelInfo const&,
     AMREX_GPU_ERROR_CHECK();
 }
 
+/**
+ * \ingroup amrex_execution
+ * \brief Performance-portable kernel launch function
+ * that provides low-level access to GPU thread blocks through \ref amrex::Gpu::LaunchHandler.
+ *
+ * The number of threads per block is a compile-time-known one-dimensional value that usually
+ * should be one of 128, 256, 512 or 1024.
+ *
+ * The number of total blocks can be a 1D, 2D or 3D IntVectND. Internally this uses the native
+ * way to split the index for CUDA and HIP, using blockIdx.x, blockIdx.y and blockIdx.z.
+ * Note that this uses types int and unsigned int which might overflow if many blocks or
+ * total threads are needed. In case this is an issue, it is necessary to explicitly add
+ * a 64-bit grid-strided loop or to call LaunchRaw multiple times with fewer blocks.
+ *
+ * \tparam MT number of threads per GPU block.
+ * \param nblocks number of GPU blocks to launch.
+ * \param f a callable object that takes amrex::Gpu::LaunchHandler as input.
+ */
+template<int MT, int dim, typename L>
+void LaunchRaw (IntVectND<dim> nblocks, L const& f)
+{
+    dim3 num_blocks;
+    num_blocks.x = nblocks[0];
+    if constexpr (dim >= 2) {
+        num_blocks.y = nblocks[1];
+    }
+    if constexpr (dim == 3) {
+        num_blocks.z = nblocks[2];
+    }
+
+    AMREX_LAUNCH_KERNEL(MT, num_blocks, MT, 0, Gpu::gpuStream(),
+        [=] AMREX_GPU_DEVICE () noexcept {
+            f(Gpu::LaunchHandler<MT, void, dim>{});
+        });
+    AMREX_GPU_ERROR_CHECK();
+}
+
+/**
+ * \ingroup amrex_execution
+ * \brief Performance-portable kernel launch function
+ * that provides low-level access to GPU thread blocks and shared memory
+ * through \ref amrex::Gpu::LaunchHandler.
+ *
+ * The number of threads per block is a compile-time-known one-dimensional value that usually
+ * should be one of 128, 256, 512 or 1024.
+ *
+ * The number of total blocks can be a 1D, 2D or 3D IntVectND. Internally this uses the native
+ * way to split the index for CUDA and HIP, using blockIdx.x, blockIdx.y and blockIdx.z.
+ * Note that this uses types int and unsigned int which might overflow if many blocks or
+ * total threads are needed. In case this is an issue, it is necessary to explicitly add
+ * a 64-bit grid-strided loop or to call LaunchRaw multiple times with fewer blocks.
+ *
+ * This version of LaunchRaw supports the use of dynamic shared memory inside the thread block.
+ * Shared memory is a fast cache local to thread blocks. It has different names in
+ * different GPU backends (CUDA: shared memory, HIP: local data share, SYCL: local memory).
+ *
+ * \tparam MT number of threads per GPU block.
+ * \tparam T data type of shared memory elements.
+ * \param nblocks number of GPU blocks to launch.
+ * \param shared_mem_elements number of shared memory elements per block to allocate.
+ * \param f a callable object that takes amrex::Gpu::LaunchHandler as input.
+ */
+template<int MT, typename T, int dim, typename L>
+void LaunchRaw (IntVectND<dim> nblocks, std::size_t shared_mem_elements, L const& f)
+{
+    dim3 num_blocks;
+    num_blocks.x = nblocks[0];
+    if constexpr (dim >= 2) {
+        num_blocks.y = nblocks[1];
+    }
+    if constexpr (dim == 3) {
+        num_blocks.z = nblocks[2];
+    }
+
+    AMREX_LAUNCH_KERNEL(MT, num_blocks, MT, shared_mem_elements * sizeof(T), Gpu::gpuStream(),
+        [=] AMREX_GPU_DEVICE () noexcept {
+            f(Gpu::LaunchHandler<MT, T, dim>{});
+        });
+    AMREX_GPU_ERROR_CHECK();
+}
+
 #endif
 
 template <typename L>
diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H
index d4ed18c48e..3c96b04685 100644
--- a/Src/Base/AMReX_GpuTypes.H
+++ b/Src/Base/AMReX_GpuTypes.H
@@ -3,6 +3,10 @@
 #include <AMReX_Config.H>
 
 #include <AMReX_Extension.H>
+#include <AMReX_BaseFwd.H>
+#include <AMReX_GpuControl.H>
+#include <AMReX_GpuQualifiers.H>
+#include <AMReX_IntVect.H>
 
 #ifdef AMREX_USE_GPU
 
@@ -32,6 +36,14 @@ struct gpuStream_t {
     bool operator!= (gpuStream_t const& rhs) const noexcept { return queue != rhs.queue; }
 };
 
+#elif defined(AMREX_USE_HIP)
+
+using gpuStream_t = hipStream_t;
+
+#elif defined(AMREX_USE_CUDA)
+
+using gpuStream_t = cudaStream_t;
+
 #endif
 
 }
@@ -102,6 +114,223 @@ struct Handler {};
 
 #endif
 
+template<unsigned int threads_per_block, typename T, int dim>
+struct LaunchHandler
+{
+    static_assert(dim >= 1 && dim <= 3);
+
+#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
+    LaunchHandler() = default;
+#elif defined(AMREX_USE_SYCL)
+    LaunchHandler(sycl::nd_item<1> const* a_item, IntVectND<dim> a_blockid, T * a_shared_mem)
+        : m_item{a_item}, m_blockid{a_blockid}, m_shared_mem{a_shared_mem} {}
+#else
+    LaunchHandler(IntVectND<dim> a_blockid, IntVectND<dim> a_griddim, T * a_shared_mem)
+        : m_blockid{a_blockid}, m_griddim{a_griddim}, m_shared_mem{a_shared_mem} {}
+#endif
+
+    /**
+     * \brief Returns the thread ID in the local block.
+     */
+    [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    unsigned int threadIdx1D () const {
+        return static_cast<unsigned int>(AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
+            threadIdx.x,
+            m_item->get_local_linear_id(),
+            0
+        ));
+    }
+
+    /**
+     * \brief Splits the local thread ID into N dimensions.
+     * The sizes of the dimensions are supplied as template arguments.
+     */
+    template<int ...nd_block_size>
+    [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    IntVectND<sizeof...(nd_block_size)> threadIdxND () const {
+        static_assert(sizeof...(nd_block_size) <= 3 &&
+                      (1 * ... * nd_block_size) == threads_per_block);
+        constexpr IntVectND<sizeof...(nd_block_size)> iv_block_size =
+            blockDimND<nd_block_size...>();
+        IntVectND<sizeof...(nd_block_size)> ret(0);
+        unsigned int idx = threadIdx1D();
+        if constexpr (sizeof...(nd_block_size) == 3) {
+            ret[2] = idx / (iv_block_size[0] * iv_block_size[1]);
+            idx = idx - ret[2] * (iv_block_size[0] * iv_block_size[1]);
+        }
+        if constexpr (sizeof...(nd_block_size) >= 2) {
+            ret[1] = idx / iv_block_size[0];
+            idx = idx - ret[1] * iv_block_size[0];
+        }
+        ret[0] = idx;
+        return ret;
+    }
+
+    /**
+     * \brief Returns the block ID flattened to 1D.
+     */
+    [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    unsigned int blockIdx1D () const {
+        if constexpr (dim == 1) {
+            return static_cast<unsigned int>(AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
+                blockIdx.x,
+                m_item->get_group(0),
+                m_blockid[0]
+            ));
+        } else if constexpr (dim == 2) {
+            return static_cast<unsigned int>(AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
+                blockIdx.x + gridDim.x * blockIdx.y,
+                m_item->get_group(0),
+                m_blockid[0] + m_griddim[0] * m_blockid[1]
+            ));
+        } else {
+            return static_cast<unsigned int>(AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
+                blockIdx.x + gridDim.x * blockIdx.y + gridDim.x * gridDim.y * blockIdx.z,
+                m_item->get_group(0),
+                m_blockid[0] + m_griddim[0] * m_blockid[1] +
+                m_griddim[0] * m_griddim[1] * m_blockid[2]
+            ));
+        }
+    }
+
+    /**
+     * \brief Returns the 1D/2D/3D block ID.
+     */
+    [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    IntVectND<dim> blockIdxND () const {
+        if constexpr (dim == 1) {
+            return AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
+                IntVectND<dim>(static_cast<int>(blockIdx.x)),
+                m_blockid,
+                m_blockid
+            );
+        } else if constexpr (dim == 2) {
+            return AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
+                IntVectND<dim>(static_cast<int>(blockIdx.x),
+                               static_cast<int>(blockIdx.y)),
+                m_blockid,
+                m_blockid
+            );
+        } else {
+            return AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
+                IntVectND<dim>(static_cast<int>(blockIdx.x),
+                               static_cast<int>(blockIdx.y),
+                               static_cast<int>(blockIdx.z)),
+                m_blockid,
+                m_blockid
+            );
+        }
+    }
+
+    /**
+     * \brief Returns the number of threads inside a block.
+     */
+    [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    static constexpr unsigned int blockDim1D () {
+        return threads_per_block;
+    }
+
+    /**
+     * \brief Analogous to threadIdxND but returns the ND block size.
+     * This effectively just returns the template arguments as an IntVectND.
+     */
+    template<int ...nd_block_size>
+    [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    static constexpr IntVectND<sizeof...(nd_block_size)> blockDimND () {
+        static_assert((1 * ... * nd_block_size) == threads_per_block);
+        return IntVectND<sizeof...(nd_block_size)>{nd_block_size...};
+    }
+
+    /**
+     * \brief Returns the global thread index flattened to 1D.
+     */
+    [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    unsigned int globalIdx1D () const {
+        return blockIdx1D() * threads_per_block + threadIdx1D();
+    }
+
+    /**
+     * \brief Returns the global 1D/2D/3D thread index.
+     */
+    template<int ...nd_block_size>
+    [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    IntVectND<dim> globalIdxND () const {
+        static_assert(sizeof...(nd_block_size) == dim &&
+                      (1 * ... * nd_block_size) == threads_per_block);
+        return blockIdxND() * blockDimND<nd_block_size...>() + threadIdxND<nd_block_size...>();
+    }
+
+    /**
+     * \brief Synchronizes all threads within a block. This is needed before
+     * accessing (shared) memory that was previously written by another thread in the block.
+     */
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    void syncthreads () const {
+        AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
+            __syncthreads(),
+            m_item->barrier(sycl::access::fence_space::global_and_local),
+            (void)0
+        );
+    }
+
+    /**
+     * \brief Returns a pointer to block-local shared memory. If multiple shared memory
+     * allocations are needed in a block, then the allocation must be manually split by adding
+     * offsets to it. The memory is aligned to 32 bytes.
+     */
+    [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    T* shared_memory () const {
+#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
+        // 32 bytes is sufficient for double4_32a
+        static_assert(32 >= alignof(T));
+        alignas(32) extern __shared__ char smem[];
+        return reinterpret_cast<T*>(smem);
+#else
+        return m_shared_mem;
+#endif
+    }
+
+    /**
+     * \brief Returns an amrex::Gpu::Handler object, which is sometimes needed for reductions.
+     */
+    [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    Gpu::Handler handler () const {
+#if defined(AMREX_USE_SYCL)
+        return Gpu::Handler(m_item, reinterpret_cast<void*>(m_shared_mem), threads_per_block);
+#elif defined(AMREX_USE_GPU)
+        return Gpu::Handler(threads_per_block);
+#else
+        return Gpu::Handler{};
+#endif
+    }
+
+    /**
+     * \brief Returns the internal sycl::nd_item<1>.
+     */
+    [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+#if defined(AMREX_USE_SYCL)
+    const sycl::nd_item<1>* sycl_item () const {
+        return m_item;
+    }
+#else
+    int sycl_item () const {
+        return 0;
+    }
+#endif
+
+private:
+
+#if defined(AMREX_USE_SYCL)
+    sycl::nd_item<1> const* m_item;
+    IntVectND<dim> m_blockid;
+    T * m_shared_mem;
+#elif !defined(AMREX_USE_GPU)
+    IntVectND<dim> m_blockid;
+    IntVectND<dim> m_griddim;
+    T * m_shared_mem;
+#endif
+};
+
 }
 
 #endif
diff --git a/Src/Base/AMReX_MultiFabUtil.H b/Src/Base/AMReX_MultiFabUtil.H
index 4f6cdf68c9..16240fadc6 100644
--- a/Src/Base/AMReX_MultiFabUtil.H
+++ b/Src/Base/AMReX_MultiFabUtil.H
@@ -1257,20 +1257,14 @@ void reduce_to_plane (Array4<T> const& ar, int direction, Box const& bx, int box
             const auto len = amrex::length(bx);
             constexpr int nthreads = 128;
             auto nblocks = static_cast<int>(b2d.numPts());
-#ifdef AMREX_USE_SYCL
-            constexpr std::size_t shared_mem_bytes = sizeof(T)*Gpu::Device::warp_size;
-            amrex::launch<nthreads>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
-                                    [=] AMREX_GPU_DEVICE (Gpu::Handler const& h)
-            {
-                int bid = h.blockIdx();
-                int tid = h.threadIdx();
-#else
-            amrex::launch<nthreads>(nblocks, Gpu::gpuStream(),
-                                    [=] AMREX_GPU_DEVICE ()
+
+            amrex::LaunchRaw<nthreads, T>(amrex::IntVectND<1>{nblocks},
+                AMREX_IF_SYCL(Gpu::Device::warp_size) AMREX_IF_NOT_SYCL(0),
+                [=] AMREX_GPU_DEVICE (auto lh)
             {
-                int bid = blockIdx.x;
-                int tid = threadIdx.x;
-#endif
+                int bid = lh.blockIdx1D();
+                int tid = lh.threadIdx1D();
+
                 T tmp;
                 Op().init(tmp);
                 T* p;
@@ -1303,7 +1297,7 @@ void reduce_to_plane (Array4<T> const& ar, int direction, Box const& bx, int box
                     p = ar.ptr(i,j,0);
                 }
 #ifdef AMREX_USE_SYCL
-                Op().template parallel_update<T>(*p, tmp, h);
+                Op().template parallel_update<T>(*p, tmp, lh.handler());
 #else
                 Op().template parallel_update<T,nthreads>(*p, tmp);
 #endif
diff --git a/Src/Base/AMReX_MultiFabUtil.cpp b/Src/Base/AMReX_MultiFabUtil.cpp
index 2f99faade7..3870394bd8 100644
--- a/Src/Base/AMReX_MultiFabUtil.cpp
+++ b/Src/Base/AMReX_MultiFabUtil.cpp
@@ -870,22 +870,14 @@ namespace amrex
                 }
                 int n2dblocks = (n2d+AMREX_GPU_MAX_THREADS-1)/AMREX_GPU_MAX_THREADS;
                 int nblocks = n2dblocks * b.length(direction);
-#ifdef AMREX_USE_SYCL
-                std::size_t shared_mem_byte = sizeof(Real)*Gpu::Device::warp_size;
-                amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks, shared_mem_byte, Gpu::gpuStream(),
-                              [=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept
-#else
-                amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks, Gpu::gpuStream(),
-                              [=] AMREX_GPU_DEVICE () noexcept
-#endif
+
+                amrex::LaunchRaw<AMREX_GPU_MAX_THREADS, amrex::Real>(amrex::IntVectND<1>{nblocks},
+                    AMREX_IF_SYCL(Gpu::Device::warp_size) AMREX_IF_NOT_SYCL(0),
+                    [=] AMREX_GPU_DEVICE (auto lh) noexcept
                 {
-#ifdef AMREX_USE_SYCL
-                    int i1d = h.blockIdx() / n2dblocks;
-                    int i2d = h.threadIdx() + AMREX_GPU_MAX_THREADS*(h.blockIdx()-i1d*n2dblocks);
-#else
-                    int i1d = blockIdx.x / n2dblocks;
-                    int i2d = threadIdx.x + AMREX_GPU_MAX_THREADS*(blockIdx.x-i1d*n2dblocks);
-#endif
+                    int i1d = lh.blockIdx1D() / n2dblocks;
+                    int i2d = lh.threadIdx1D() +
+                        AMREX_GPU_MAX_THREADS*(lh.blockIdx1D()-i1d*n2dblocks);
                     int i2dy = i2d / n2dx;
                     int i2dx = i2d - i2dy*n2dx;
                     int i, j, k, idir;
@@ -907,11 +899,8 @@ namespace amrex
                     }
                     for (int n = 0; n < ncomp; ++n) {
                         Real r = (i2d < n2d) ? fab(i,j,k,n+icomp) : Real(0.0);
-#ifdef AMREX_USE_SYCL
-                        Gpu::deviceReduceSum_full(p+n+ncomp*idir, r, h);
-#else
-                        Gpu::deviceReduceSum_full(p+n+ncomp*idir, r);
-#endif
+                        Gpu::deviceReduceSum_full(p+n+ncomp*idir, r
+                            AMREX_IF_SYCL(, lh.handler()));
                     }
                 });
             }
diff --git a/Src/Base/AMReX_Reduce.H b/Src/Base/AMReX_Reduce.H
index 2883e05674..a58263e2a2 100644
--- a/Src/Base/AMReX_Reduce.H
+++ b/Src/Base/AMReX_Reduce.H
@@ -634,29 +634,22 @@ public:
             reduce_data.nBlocks(stream) = nblocks_ec;
             reduce_data.updateMaxStreamIndex(stream);
 
-#ifdef AMREX_USE_SYCL
             // device reduce needs local(i.e., shared) memory
-            constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
-            amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
-                          [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
-            {
-                Dim1 blockIdx {gh.blockIdx()};
-                Dim1 threadIdx{gh.threadIdx()};
-#else
-            amrex::launch_global<AMREX_GPU_MAX_THREADS>
-                <<<nblocks_ec, AMREX_GPU_MAX_THREADS, 0, stream>>>
-                ([=] AMREX_GPU_DEVICE () noexcept
+            amrex::LaunchRaw<AMREX_GPU_MAX_THREADS, unsigned long long>(
+                amrex::IntVectND<1>{nblocks_ec},
+                AMREX_IF_SYCL(Gpu::Device::warp_size) AMREX_IF_NOT_SYCL(0),
+                [=] AMREX_GPU_DEVICE (auto lh) noexcept
             {
-#endif
                 ReduceTuple r;
                 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
-                ReduceTuple& dst = pdst[blockIdx.x];
-                if (threadIdx.x == 0) {
+                ReduceTuple& dst = pdst[lh.blockIdx1D()];
+                if (lh.threadIdx1D() == 0) {
                     dst = r;
                 }
-                for (int iblock = blockIdx.x; iblock < nblocks; iblock += nblocks_ec) {
+                for (int iblock = lh.blockIdx1D(); iblock < nblocks; iblock += nblocks_ec) {
                     int ibox = iblock / nblocks_per_box;
-                    auto icell = std::uint64_t(iblock-ibox*nblocks_per_box)*AMREX_GPU_MAX_THREADS + threadIdx.x;
+                    auto icell = std::uint64_t(iblock-ibox*nblocks_per_box)*AMREX_GPU_MAX_THREADS
+                        + lh.threadIdx1D();
 
                     BoxIndexer const& indexer = dp_boxes[ibox];
                     if (icell < indexer.numPts()) {
@@ -665,11 +658,8 @@ public:
                             (f, ibox, i, j, k, ncomp, r);
                     }
                 }
-#ifdef AMREX_USE_SYCL
-                Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
-#else
-                Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
-#endif
+                Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r
+                    AMREX_IF_SYCL(, lh.handler()));
             });
         }
     }
@@ -689,48 +679,42 @@ public:
             / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
         nblocks_ec = std::min<Long>(nblocks_ec, reduce_data.maxBlocks());
         reduce_data.updateMaxStreamIndex(stream);
-#ifdef AMREX_USE_SYCL
-        // device reduce needs local(i.e., shared) memory
-        constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
-        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
-        [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
+        amrex::LaunchRaw<AMREX_GPU_MAX_THREADS, unsigned long long>(
+            amrex::IntVectND<1>{static_cast<int>(nblocks_ec)},
+            AMREX_IF_SYCL(Gpu::Device::warp_size) AMREX_IF_NOT_SYCL(0),
+            [=] AMREX_GPU_DEVICE (
+                Gpu::LaunchHandler<AMREX_GPU_MAX_THREADS, unsigned long long, 1> lh
+            ) noexcept
         {
-            Dim1 blockIdx {gh.blockIdx()};
-            Dim1 threadIdx{gh.threadIdx()};
-            Dim1 gridDim  {gh.gridDim()};
-#else
-        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
-        [=] AMREX_GPU_DEVICE () noexcept
-        {
-#endif
             ReduceTuple r;
             Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
-            ReduceTuple& dst = *(dp+blockIdx.x);
-            if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
+            ReduceTuple& dst = *(dp+lh.blockIdx1D());
+            if (lh.threadIdx1D() == 0 && static_cast<int>(lh.blockIdx1D()) >= nblocks) {
                 dst = r;
             }
-            for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x,
-                 stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x;
+            for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*lh.blockIdx1D()
+                + lh.threadIdx1D(),
+                 stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*nblocks_ec;
                  icell < indexer.numPts();
                  icell += stride)
             {
                 auto iv = indexer.intVect(icell);
-                amrex::ignore_unused(f,ncomp,ixtype); // work around first-capture
+                // work around first-capture
+                auto f2 = f;
+                auto ncomp2 = ncomp;
+                auto ixtype2 = ixtype;
                 if constexpr (std::is_same_v<Reduce::detail::iterate_box,I>) {
-                    auto pr = Reduce::detail::call_f_intvect_box(f, iv, ixtype);
+                    auto pr = Reduce::detail::call_f_intvect_box(f2, iv, ixtype2);
                     Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
                 } else {
-                    for (int n = 0; n < ncomp; ++n) {
-                        auto pr = Reduce::detail::call_f_intvect_n(f, iv, n);
+                    for (int n = 0; n < ncomp2; ++n) {
+                        auto pr = Reduce::detail::call_f_intvect_n(f2, iv, n);
                         Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
                     }
                 }
             }
-#ifdef AMREX_USE_SYCL
-            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
-#else
-            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
-#endif
+            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r
+                AMREX_IF_SYCL(, lh.handler()));
         });
         nblocks = std::max(nblocks, static_cast<int>(nblocks_ec));
     }
@@ -822,39 +806,28 @@ public:
             / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
         nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
         reduce_data.updateMaxStreamIndex(stream);
-#ifdef AMREX_USE_SYCL
         // device reduce needs local(i.e., shared) memory
-        constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
-        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
-        [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
-        {
-            Dim1 blockIdx {gh.blockIdx()};
-            Dim1 threadIdx{gh.threadIdx()};
-            Dim1 gridDim  {gh.gridDim()};
-#else
-        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
-        [=] AMREX_GPU_DEVICE () noexcept
+        amrex::LaunchRaw<AMREX_GPU_MAX_THREADS, unsigned long long>(
+            amrex::IntVectND<1>{nblocks_ec},
+            AMREX_IF_SYCL(Gpu::Device::warp_size) AMREX_IF_NOT_SYCL(0),
+            [=] AMREX_GPU_DEVICE (auto lh) noexcept
         {
-#endif
             ReduceTuple r;
             Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
-            ReduceTuple& dst = *(dp+blockIdx.x);
-            if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
+            ReduceTuple& dst = *(dp+lh.blockIdx1D());
+            if (lh.threadIdx1D() == 0 && static_cast<int>(lh.blockIdx1D()) >= nblocks) {
                 dst = r;
             }
-            for (N i = N(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x,
-                 stride = N(AMREX_GPU_MAX_THREADS)*gridDim.x;
+            for (N i = N(AMREX_GPU_MAX_THREADS)*lh.blockIdx1D()+lh.threadIdx1D(),
+                 stride = N(AMREX_GPU_MAX_THREADS)*nblocks_ec;
                  i < n;
                  i += stride)
             {
                 auto pr = f(i);
                 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r,pr);
             }
-#ifdef AMREX_USE_SYCL
-            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
-#else
-            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
-#endif
+            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r
+                AMREX_IF_SYCL(, lh.handler()));
         });
         nblocks = amrex::max(nblocks, nblocks_ec);
     }
@@ -870,7 +843,6 @@ public:
         }
 
         using ReduceTuple = typename D::Type;
-        auto const& stream = Gpu::gpuStream();
         auto dp = reduce_data.devicePtr();
         auto const& nblocks = reduce_data.nBlocks();
 #if defined(AMREX_USE_SYCL)
@@ -891,52 +863,34 @@ public:
 #endif
         {
             int maxblocks = reduce_data.maxBlocks();
-#ifdef AMREX_USE_SYCL
-            // device reduce needs local(i.e., shared) memory
-            constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
-#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
+#if !defined(AMREX_NO_SYCL_REDUCE_WORKAROUND) && defined(AMREX_USE_SYCL)
             // xxxxx SYCL todo: reduce bug workaround
             Gpu::DeviceVector<ReduceTuple> dtmp(1);
             auto presult = dtmp.data();
 #else
             auto presult = hp;
 #endif
-            amrex::launch<AMREX_GPU_MAX_THREADS>(1, shared_mem_bytes, stream,
-            [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
+            amrex::LaunchRaw<AMREX_GPU_MAX_THREADS, unsigned long long>(
+                amrex::IntVectND<1>{1},
+                AMREX_IF_SYCL(Gpu::Device::warp_size) AMREX_IF_NOT_SYCL(0),
+                [=] AMREX_GPU_DEVICE (auto lh) noexcept
             {
                 ReduceTuple r;
                 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
                 ReduceTuple dst = r;
                 for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
                     auto dp_stream = dp+istream*maxblocks;
-                    for (int i = gh.item->get_global_id(0), stride = gh.item->get_global_range(0);
+                    for (int i = lh.globalIdx1D(), stride = AMREX_GPU_MAX_THREADS;
                          i < nblocks[istream]; i += stride) {
                         Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
                     }
                 }
-                Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
-                if (gh.threadIdx() == 0) { *presult = dst; }
+                Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r
+                    AMREX_IF_SYCL(, lh.handler()));
+                if (lh.threadIdx1D() == 0) { *presult = dst; }
             });
-#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
+#if !defined(AMREX_NO_SYCL_REDUCE_WORKAROUND) && defined(AMREX_USE_SYCL)
             Gpu::dtoh_memcpy_async(hp, dtmp.data(), sizeof(ReduceTuple));
-#endif
-#else
-            amrex::launch<AMREX_GPU_MAX_THREADS>(1, 0, stream,
-            [=] AMREX_GPU_DEVICE () noexcept
-            {
-                ReduceTuple r;
-                Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
-                ReduceTuple dst = r;
-                for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
-                    auto dp_stream = dp+istream*maxblocks;
-                    for (int i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
-                         i < nblocks[istream]; i += stride) {
-                        Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
-                    }
-                }
-                Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
-                if (threadIdx.x == 0) { *hp = dst; }
-            });
 #endif
             Gpu::streamSynchronize();
         }
@@ -1068,51 +1022,34 @@ bool AnyOf (N n, T const* v, P const& pred)
     int* dp = ds.dataPtr();
     auto ec = Gpu::ExecutionConfig(n);
     ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
-
-#ifdef AMREX_USE_SYCL
-    const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
-    const std::size_t shared_mem_bytes = num_ints*sizeof(int);
-    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
-    [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
-        int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
-        if (gh.threadIdx() == 0) { *has_any = *dp; }
-        gh.sharedBarrier();
-
-        if (!(*has_any))
-        {
-            int r = false;
-            for (N i = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
-                 i < n && !r; i += stride)
-            {
-                r = pred(v[i]) ? 1 : 0;
-            }
-
-            r = Gpu::blockReduce<Gpu::Device::warp_size>
-                (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, gh);
-            if (gh.threadIdx() == 0 && r) { *dp = 1; }
-        }
-    });
-#else
-    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, 0, Gpu::gpuStream(),
-    [=] AMREX_GPU_DEVICE () noexcept {
-        __shared__ int has_any;
-        if (threadIdx.x == 0) { has_any = *dp; }
-        __syncthreads();
+    const int nblocks_ec = ec.numBlocks.x;
+    [[maybe_unused]] const int num_ints =
+        std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size);
+
+    amrex::LaunchRaw<AMREX_GPU_MAX_THREADS, int>(
+        amrex::IntVectND<1>{nblocks_ec},
+        AMREX_IF_SYCL(num_ints + 1) AMREX_IF_NOT_SYCL(1),
+        [=] AMREX_GPU_DEVICE (auto lh) noexcept
+    {
+        int& has_any = *(lh.shared_memory() AMREX_IF_SYCL(+ num_ints));
+        if (lh.threadIdx1D() == 0) { has_any = *dp; }
+        lh.syncthreads();
 
         if (!has_any)
         {
             int r = false;
-            for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
-                 i < n && !r; i += stride)
+            for (N i = AMREX_GPU_MAX_THREADS*lh.blockIdx1D()+lh.threadIdx1D(),
+                stride = AMREX_GPU_MAX_THREADS*nblocks_ec;
+                i < n && !r; i += stride)
             {
                 r = pred(v[i]) ? 1 : 0;
             }
             r = Gpu::blockReduce<Gpu::Device::warp_size>
-                (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
-            if (threadIdx.x == 0 && r) *dp = 1;
+                (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0
+                AMREX_IF_SYCL(, lh.handler()));
+            if (lh.threadIdx1D() == 0 && r) *dp = 1;
         }
     });
-#endif
     return ds.dataValue();
 }
 
@@ -1125,45 +1062,25 @@ bool AnyOf (BoxND<dim> const& box, P const& pred)
     const BoxIndexerND<dim> indexer(box);
     auto ec = Gpu::ExecutionConfig(box.numPts());
     ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
-
-#ifdef AMREX_USE_SYCL
-    const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
-    const std::size_t shared_mem_bytes = num_ints*sizeof(int);
-    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
-    [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
-        int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
-        if (gh.threadIdx() == 0) { *has_any = *dp; }
-        gh.sharedBarrier();
-
-        if (!(*has_any))
-        {
-            int r = false;
-            for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*gh.blockIdx()+gh.threadIdx(),
-                 stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gh.gridDim();
-                 icell < indexer.numPts() && !r;
-                 icell += stride)
-            {
-                auto iv = indexer.intVect(icell);
-                r = amrex::detail::call_f_intvect(pred, iv) ? 1 : 0;
-            }
-            r = Gpu::blockReduce<Gpu::Device::warp_size>
-                (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, gh);
-            if (gh.threadIdx() == 0 && r) { *dp = 1; }
-        }
-    });
-#else
-    AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, ec.numBlocks, ec.numThreads, 0,
-                        Gpu::gpuStream(),
-    [=] AMREX_GPU_DEVICE () noexcept {
-        __shared__ int has_any;
-        if (threadIdx.x == 0) { has_any = *dp; }
-        __syncthreads();
+    const int nblocks_ec = ec.numBlocks.x;
+    [[maybe_unused]] const int num_ints =
+        std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size);
+
+    amrex::LaunchRaw<AMREX_GPU_MAX_THREADS, int>(
+        amrex::IntVectND<1>{nblocks_ec},
+        AMREX_IF_SYCL(num_ints + 1) AMREX_IF_NOT_SYCL(1),
+        [=] AMREX_GPU_DEVICE (auto lh) noexcept
+    {
+        int& has_any = *(lh.shared_memory() AMREX_IF_SYCL(+ num_ints));
+        if (lh.threadIdx1D() == 0) { has_any = *dp; }
+        lh.syncthreads();
 
         if (!has_any)
         {
             int r = false;
-            for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x,
-                 stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x;
+            for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*
+                    lh.blockIdx1D()+lh.threadIdx1D(),
+                 stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*nblocks_ec;
                  icell < indexer.numPts() && !r;
                  icell += stride)
             {
@@ -1171,11 +1088,11 @@ bool AnyOf (BoxND<dim> const& box, P const& pred)
                 r = amrex::detail::call_f_intvect(pred, iv) ? 1 : 0;
             }
             r = Gpu::blockReduce<Gpu::Device::warp_size>
-                (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
-            if (threadIdx.x == 0 && r) *dp = 1;
+                (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0
+                AMREX_IF_SYCL(, lh.handler()));
+            if (lh.threadIdx1D() == 0 && r) *dp = 1;
         }
     });
-#endif
     return ds.dataValue();
 }
 
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp_bc.cpp b/Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp_bc.cpp
index 5e1fb62fc2..d9a29036d7 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp_bc.cpp
+++ b/Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp_bc.cpp
@@ -85,20 +85,12 @@ MLEBTensorOp::applyBCTensor (int amrlev, int mglev, MultiFab& vel,
 
 #ifdef AMREX_USE_GPU
             if (Gpu::inLaunchRegion()) {
-                amrex::launch<64>(12, Gpu::gpuStream(),
-#ifdef AMREX_USE_SYCL
-                [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item)
+                amrex::LaunchRaw<64>(amrex::IntVectND<1>{12},
+                [=] AMREX_GPU_DEVICE (auto lh)
                 {
-                    int bid = item.get_group_linear_id();
-                    int tid = item.get_local_linear_id();
-                    int bdim = item.get_local_range(0);
-#else
-                [=] AMREX_GPU_DEVICE ()
-                {
-                    int bid = blockIdx.x;
-                    int tid = threadIdx.x;
-                    int bdim = blockDim.x;
-#endif
+                    int bid = lh.blockIdx1D();
+                    int tid = lh.threadIdx1D();
+                    int bdim = lh.blockDim1D();
                     mltensor_fill_edges(bid, tid, bdim, vbx, velfab,
                                         mxlo, mylo, mzlo, mxhi, myhi, mzhi,
                                         bvxlo, bvylo, bvzlo, bvxhi, bvyhi, bvzhi,
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp b/Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp
index 8d72107d54..372fd1ae09 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp
+++ b/Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp
@@ -408,20 +408,12 @@ MLTensorOp::applyBCTensor (int amrlev, int mglev, MultiFab& vel, // NOLINT(reada
         // only edge vals used in 3D stencil
 #ifdef AMREX_USE_GPU
         if (Gpu::inLaunchRegion()) {
-            amrex::launch<64>(12, Gpu::gpuStream(),
-#ifdef AMREX_USE_SYCL
-            [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item)
+            amrex::LaunchRaw<64>(amrex::IntVectND<1>{12},
+            [=] AMREX_GPU_DEVICE (auto lh)
             {
-                int bid = item.get_group_linear_id();
-                int tid = item.get_local_linear_id();
-                int bdim = item.get_local_range(0);
-#else
-            [=] AMREX_GPU_DEVICE ()
-            {
-                int bid = blockIdx.x;
-                int tid = threadIdx.x;
-                int bdim = blockDim.x;
-#endif
+                int bid = lh.blockIdx1D();
+                int tid = lh.threadIdx1D();
+                int bdim = lh.blockDim1D();
                 mltensor_fill_edges(bid, tid, bdim, vbx, velfab,
                                     mxlo, mylo, mzlo, mxhi, myhi, mzhi,
                                     bvxlo, bvylo, bvzlo, bvxhi, bvyhi, bvzhi,
diff --git a/Src/Particle/AMReX_ParticleUtil.H b/Src/Particle/AMReX_ParticleUtil.H
index 9b8e055d03..c6e7cabd2a 100644
--- a/Src/Particle/AMReX_ParticleUtil.H
+++ b/Src/Particle/AMReX_ParticleUtil.H
@@ -795,12 +795,13 @@ void PermutationForDeposition (Gpu::DeviceVector<index_type>& perm, index_type n
         pllist_next[i] = Gpu::Atomic::Exch(pllist_start + f(i), i);
     });
 
-#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
-    amrex::launch<gpu_block_size>(nbins / gpu_block_size, Gpu::gpuStream(),
-        [pllist_start,pllist_next,pperm,pglobal_idx] AMREX_GPU_DEVICE () {
-            __shared__ index_type sdata[gpu_block_size];
-            __shared__ index_type global_idx_start;
-            __shared__ index_type idx_start;
+#if defined(AMREX_USE_GPU)
+    amrex::LaunchRaw<gpu_block_size, index_type>(amrex::IntVectND<1>{nbins / gpu_block_size},
+        gpu_block_size + 2,
+        [pllist_start,pllist_next,pperm,pglobal_idx] AMREX_GPU_DEVICE (auto lh) {
+            index_type* sdata = lh.shared_memory();
+            index_type& global_idx_start = *(sdata + gpu_block_size);
+            index_type& idx_start = *(sdata + gpu_block_size + 1);
 
             index_type current_idx = 0;
 
@@ -808,7 +809,7 @@ void PermutationForDeposition (Gpu::DeviceVector<index_type>& perm, index_type n
                 // Compressed layout: subsequent sweeps of up to gpu_block_size contiguous particles
                 // are put right next to each other, while without the compressed layout,
                 // there can be other particle sweeps from different locations between them.
-                current_idx = pllist_start[threadIdx.x + gpu_block_size * blockIdx.x];
+                current_idx = pllist_start[lh.threadIdx1D() + gpu_block_size * lh.blockIdx1D()];
 
                 index_type num_particles_thread = 0;
                 while (current_idx != llist_guard) {
@@ -816,36 +817,37 @@ void PermutationForDeposition (Gpu::DeviceVector<index_type>& perm, index_type n
                     current_idx = pllist_next[current_idx];
                 }
 
-                index_type num_particles_block =
-                    Gpu::blockReduceSum<gpu_block_size>(num_particles_thread);
+                index_type num_particles_block = Gpu::blockReduceSum
+                    AMREX_IF_NOT_SYCL(<gpu_block_size>)
+                    (num_particles_thread AMREX_IF_SYCL(, lh.handler()));
 
-                if (threadIdx.x == 0) {
+                if (lh.threadIdx1D() == 0) {
                     global_idx_start = Gpu::Atomic::Add(pglobal_idx, num_particles_block);
                 }
             }
 
-            current_idx = pllist_start[threadIdx.x + gpu_block_size * blockIdx.x];
+            current_idx = pllist_start[lh.threadIdx1D() + gpu_block_size * lh.blockIdx1D()];
 
             while (true) {
-                sdata[threadIdx.x] = index_type(current_idx != llist_guard);
+                sdata[lh.threadIdx1D()] = index_type(current_idx != llist_guard);
                 index_type x = 0;
 
                 // simple block wide prefix sum
                 for (index_type i = 1; i<gpu_block_size; i*=2) {
-                    __syncthreads();
-                    if (threadIdx.x >= i) {
-                        x = sdata[threadIdx.x - i];
+                    lh.syncthreads();
+                    if (lh.threadIdx1D() >= i) {
+                        x = sdata[lh.threadIdx1D() - i];
                     }
-                    __syncthreads();
-                    if (threadIdx.x >= i) {
-                        sdata[threadIdx.x] += x;
+                    lh.syncthreads();
+                    if (lh.threadIdx1D() >= i) {
+                        sdata[lh.threadIdx1D()] += x;
                     }
                 }
-                __syncthreads();
+                lh.syncthreads();
                 if (sdata[gpu_block_size_m1] == 0) {
                     break;
                 }
-                if (threadIdx.x == gpu_block_size_m1) {
+                if (lh.threadIdx1D() == gpu_block_size_m1) {
                     if constexpr (compressed_layout) {
                         idx_start = global_idx_start;
                         global_idx_start += sdata[gpu_block_size_m1];
@@ -853,17 +855,17 @@ void PermutationForDeposition (Gpu::DeviceVector<index_type>& perm, index_type n
                         idx_start = Gpu::Atomic::Add(pglobal_idx, sdata[gpu_block_size_m1]);
                     }
                 }
-                __syncthreads();
-                sdata[threadIdx.x] += idx_start;
+                lh.syncthreads();
+                sdata[lh.threadIdx1D()] += idx_start;
                 if (current_idx != llist_guard) {
-                    pperm[sdata[threadIdx.x] - 1] = current_idx;
+                    pperm[sdata[lh.threadIdx1D()] - 1] = current_idx;
                     current_idx = pllist_next[current_idx];
                 }
             }
         });
 #else
     amrex::ignore_unused(pperm, pglobal_idx, compressed_layout);
-    Abort("PermutationForDeposition only implemented for CUDA and HIP");
+    Abort("PermutationForDeposition only implemented for GPU");
 #endif
 
     Gpu::Device::streamSynchronize();
diff --git a/Tests/CMakeLists.txt b/Tests/CMakeLists.txt
index c92e115f63..d7e47bc9a0 100644
--- a/Tests/CMakeLists.txt
+++ b/Tests/CMakeLists.txt
@@ -128,7 +128,7 @@ else()
    # List of subdirectories to search for CMakeLists.
    #
    set( AMREX_TESTS_SUBDIRS Amr ArrayND AsyncOut Base CallNoinline CLZ CommType CTOParFor DeviceGlobal
-                            Enum HeatEquation MultiBlock MultiPeriod ParmParse Parser Parser2
+                            Enum HeatEquation LaunchRaw MultiBlock MultiPeriod ParmParse Parser Parser2
                             ParserUserFn Reducer ReduceToPlanePatchy Reinit RoundoffDomain SIMD
                             SmallMatrix SumBoundary TOML)
 
diff --git a/Tests/LaunchRaw/CMakeLists.txt b/Tests/LaunchRaw/CMakeLists.txt
new file mode 100644
index 0000000000..224c4563c8
--- /dev/null
+++ b/Tests/LaunchRaw/CMakeLists.txt
@@ -0,0 +1,9 @@
+foreach(D IN LISTS AMReX_SPACEDIM)
+    set(_sources     main.cpp)
+    set(_input_files)
+
+    setup_test(${D} _sources _input_files)
+
+    unset(_sources)
+    unset(_input_files)
+endforeach()
diff --git a/Tests/LaunchRaw/GNUmakefile b/Tests/LaunchRaw/GNUmakefile
new file mode 100644
index 0000000000..05ed32d344
--- /dev/null
+++ b/Tests/LaunchRaw/GNUmakefile
@@ -0,0 +1,15 @@
+AMREX_HOME = ../../
+
+DEBUG	= FALSE
+
+USE_MPI   = FALSE
+USE_OMP   = FALSE
+
+TINY_PROFILE = FALSE
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.defs
+
+include ./Make.package
+include $(AMREX_HOME)/Src/Base/Make.package
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.rules
diff --git a/Tests/LaunchRaw/Make.package b/Tests/LaunchRaw/Make.package
new file mode 100644
index 0000000000..7f43e5e87c
--- /dev/null
+++ b/Tests/LaunchRaw/Make.package
@@ -0,0 +1,2 @@
+CEXE_sources += main.cpp
+
diff --git a/Tests/LaunchRaw/inputs b/Tests/LaunchRaw/inputs
new file mode 100644
index 0000000000..bba7f4f305
--- /dev/null
+++ b/Tests/LaunchRaw/inputs
@@ -0,0 +1 @@
+amrex.the_arena_is_managed = 0
diff --git a/Tests/LaunchRaw/main.cpp b/Tests/LaunchRaw/main.cpp
new file mode 100644
index 0000000000..10c4c94838
--- /dev/null
+++ b/Tests/LaunchRaw/main.cpp
@@ -0,0 +1,198 @@
+#include <AMReX_GpuTypes.H>
+#include <AMReX.H>
+#include <AMReX_Gpu.H>
+#include <AMReX_GpuContainers.H>
+#include <AMReX_ParmParse.H>
+#include <AMReX_GpuPrint.H>
+#include <AMReX_Reduce.H>
+
+using namespace amrex;
+
+void test1d () {
+
+    const IntVectND<1> num_blocks {31};
+#ifdef AMREX_USE_GPU
+    static constexpr int blockdim[1] {256};
+#else
+    static constexpr int blockdim[1] {1};
+#endif
+    static constexpr int num_threads = blockdim[0];
+
+    Gpu::DeviceVector<int> vect(static_cast<std::size_t>(num_threads) * num_blocks[0], -999);
+
+    auto * data = vect.dataPtr();
+
+    LaunchRaw<num_threads>(num_blocks,
+        [=] AMREX_GPU_DEVICE (auto lh) {
+            data[lh.globalIdx1D()] = lh.blockIdx1D();
+        });
+
+    LaunchRaw<num_threads>(num_blocks,
+        [=] AMREX_GPU_DEVICE (auto lh) {
+            data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] += lh.threadIdx1D();
+        });
+
+    LaunchRaw<num_threads>(num_blocks,
+        [=] AMREX_GPU_DEVICE (auto lh) {
+            auto block = lh.blockIdxND();
+            auto thread = lh.template threadIdxND<blockdim[0]>();
+            auto tmp = data[
+                block[0] * num_threads + thread[0]
+            ];
+            lh.syncthreads();
+            data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] = tmp;
+        });
+
+    LaunchRaw<num_threads, int>(num_blocks, num_threads,
+        [=] AMREX_GPU_DEVICE (auto lh) {
+            auto smem = lh.shared_memory();
+            auto thread = lh.template threadIdxND<blockdim[0]>();
+            auto locid = thread[0];
+            smem[lh.threadIdx1D()] = data[lh.blockIdx1D() * lh.blockDim1D() + locid];
+            lh.syncthreads();
+            data[lh.blockIdx1D() * lh.blockDim1D() + locid] = smem[locid];
+        });
+
+    LaunchRaw<num_threads>(num_blocks,
+        [=] AMREX_GPU_DEVICE (auto lh) {
+            data[lh.globalIdx1D()] =
+                data[lh.globalIdx1D()] == static_cast<int>(lh.blockIdx1D() + lh.threadIdx1D());
+        });
+
+    AMREX_ALWAYS_ASSERT(Reduce::Sum(vect.size(), data, 0) == static_cast<int>(vect.size()));
+}
+
+void test2d () {
+
+    const IntVectND<2> num_blocks {31, 23};
+#ifdef AMREX_USE_GPU
+    static constexpr int blockdim[2] {8, 32};
+#else
+    static constexpr int blockdim[2] {1, 1};
+#endif
+    static constexpr int num_threads = blockdim[0] * blockdim[1];
+
+    Gpu::DeviceVector<int> vect(static_cast<std::size_t>(num_threads)
+        * num_blocks[0] * num_blocks[1], -999);
+
+    auto * data = vect.dataPtr();
+
+    LaunchRaw<num_threads>(num_blocks,
+        [=] AMREX_GPU_DEVICE (auto lh) {
+            data[lh.globalIdx1D()] = lh.blockIdx1D();
+        });
+
+    LaunchRaw<num_threads>(num_blocks,
+        [=] AMREX_GPU_DEVICE (auto lh) {
+            data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] += lh.threadIdx1D();
+        });
+
+    LaunchRaw<num_threads>(num_blocks,
+        [=] AMREX_GPU_DEVICE (auto lh) {
+            auto block = lh.blockIdxND();
+            auto thread = lh.template threadIdxND<blockdim[0], blockdim[1]>();
+            auto tmp = data[
+                (block[0] + block[1] * num_blocks[0]) * num_threads +
+                thread[1] + thread[0] * blockdim[1]
+            ];
+            lh.syncthreads();
+            data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] = tmp;
+        });
+
+    LaunchRaw<num_threads, int>(num_blocks, num_threads,
+        [=] AMREX_GPU_DEVICE (auto lh) {
+            auto smem = lh.shared_memory();
+            auto thread1 = lh.template threadIdxND<blockdim[1], blockdim[0]>();
+            auto locid1 = thread1[1] + thread1[0] * blockdim[0];
+            auto thread2 = lh.template threadIdxND<blockdim[0], blockdim[1]>();
+            auto locid2 = thread2[1] + thread2[0] * blockdim[1];
+            smem[lh.threadIdx1D()] = data[lh.blockIdx1D() * lh.blockDim1D() + locid1];
+            lh.syncthreads();
+            data[lh.blockIdx1D() * lh.blockDim1D() + locid2] = smem[locid2];
+        });
+
+    LaunchRaw<num_threads>(num_blocks,
+        [=] AMREX_GPU_DEVICE (auto lh) {
+            data[lh.globalIdx1D()] =
+                data[lh.globalIdx1D()] == static_cast<int>(lh.blockIdx1D() + lh.threadIdx1D());
+        });
+
+    AMREX_ALWAYS_ASSERT(Reduce::Sum(vect.size(), data, 0) == static_cast<int>(vect.size()));
+}
+
+void test3d () {
+
+    const IntVectND<3> num_blocks {31, 23, 11};
+#ifdef AMREX_USE_GPU
+    static constexpr int blockdim[3] {2, 8, 16};
+#else
+    static constexpr int blockdim[3] {1, 1, 1};
+#endif
+    static constexpr int num_threads = blockdim[0] * blockdim[1] * blockdim[2];
+
+    Gpu::DeviceVector<int> vect(static_cast<std::size_t>(num_threads)
+         * num_blocks[0] * num_blocks[1] * num_blocks[2], -999);
+
+    auto * data = vect.dataPtr();
+
+    LaunchRaw<num_threads>(num_blocks,
+        [=] AMREX_GPU_DEVICE (auto lh) {
+            data[lh.globalIdx1D()] = lh.blockIdx1D();
+        });
+
+    LaunchRaw<num_threads>(num_blocks,
+        [=] AMREX_GPU_DEVICE (auto lh) {
+            data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] += lh.threadIdx1D();
+        });
+
+    LaunchRaw<num_threads>(num_blocks,
+        [=] AMREX_GPU_DEVICE (auto lh) {
+            auto block = lh.blockIdxND();
+            auto thread = lh.template threadIdxND<blockdim[0], blockdim[1], blockdim[2]>();
+            auto tmp = data[
+                (block[0] + block[1] * num_blocks[0] +
+                block[2] * num_blocks[0] * num_blocks[1]) * num_threads +
+                thread[2] + thread[1] * blockdim[2] +
+                thread[0] * blockdim[2] * blockdim[1]
+            ];
+            lh.syncthreads();
+            data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] = tmp;
+        });
+
+    LaunchRaw<num_threads, int>(num_blocks, num_threads,
+        [=] AMREX_GPU_DEVICE (auto lh) {
+            auto smem = lh.shared_memory();
+            auto thread1 = lh.template threadIdxND<blockdim[2], blockdim[1], blockdim[0]>();
+            auto locid1 = thread1[2] + thread1[1] * blockdim[0] +
+                          thread1[0] * blockdim[0] * blockdim[1];
+            auto thread2 = lh.template threadIdxND<blockdim[2], blockdim[0], blockdim[1]>();
+            auto locid2 = thread2[0] + thread2[2] * blockdim[2] +
+                          thread2[1] * blockdim[2] * blockdim[1];
+            smem[lh.threadIdx1D()] = data[lh.blockIdx1D() * lh.blockDim1D() + locid1];
+            lh.syncthreads();
+            data[lh.blockIdx1D() * lh.blockDim1D() + locid2] = smem[locid2];
+        });
+
+    LaunchRaw<num_threads>(num_blocks,
+        [=] AMREX_GPU_DEVICE (auto lh) {
+            data[lh.globalIdx1D()] =
+                data[lh.globalIdx1D()] == static_cast<int>(lh.blockIdx1D() + lh.threadIdx1D());
+        });
+
+    AMREX_ALWAYS_ASSERT(Reduce::Sum(vect.size(), data, 0) == static_cast<int>(vect.size()));
+}
+
+int main (int argc, char* argv[])
+{
+    amrex::Initialize(argc, argv);
+    {
+        test1d();
+
+        test2d();
+
+        test3d();
+
+        amrex::Print() << "Passed! \n";
+    }
+    amrex::Finalize();
+}