diff --git a/HeterogeneousCore/AlpakaInterface/interface/warpsize.h b/HeterogeneousCore/AlpakaInterface/interface/warpsize.h index 09fcd281a0b17..cb0e14842a72a 100644 --- a/HeterogeneousCore/AlpakaInterface/interface/warpsize.h +++ b/HeterogeneousCore/AlpakaInterface/interface/warpsize.h @@ -12,8 +12,20 @@ namespace cms::alpakatools { // CUDA always has a warp size of 32 inline constexpr int warpSize = 32; #elif defined(__HIP_DEVICE_COMPILE__) - // HIP/ROCm defines warpSize as a constant expression in device code, with value 32 or 64 depending on the target device - inline constexpr int warpSize = ::warpSize; + // HIP/ROCm may have a warp size of 32 or 64 depending on the target device +#if defined(__gfx900__) or defined(__gfx902__) or defined(__gfx903__) or defined(__gfx906__) or defined(__gfx908__) or \ + defined(__gfx909__) or defined(__gfx90a__) or defined(__gfx90c__) or defined(__gfx942__) or defined(__gfx950__) + inline constexpr int warpSize = 64; +#elif defined(__gfx1010__) or defined(__gfx1011__) or defined(__gfx1012__) or defined(__gfx1013__) or \ + defined(__gfx1030__) or defined(__gfx1031__) or defined(__gfx1032__) or defined(__gfx1033__) or \ + defined(__gfx1034__) or defined(__gfx1035__) or defined(__gfx1036__) or defined(__gfx1100__) or \ + defined(__gfx1101__) or defined(__gfx1102__) or defined(__gfx1103__) or defined(__gfx1150__) or \ + defined(__gfx1151__) or defined(__gfx1152__) or defined(__gfx1153__) or defined(__gfx1200__) or \ + defined(__gfx1201__) or defined(__gfx1250__) or defined(__gfx1251__) + inline constexpr int warpSize = 32; +#else +#error "Unknown AMDGCN architecture" +#endif #else // CPU back-ends always have a warp size of 1 inline constexpr int warpSize = 1; diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testPrefixScan.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testPrefixScan.dev.cc index 34d3cc2c87db3..8413b026e957d 100644 --- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testPrefixScan.dev.cc +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testPrefixScan.dev.cc @@ -12,6 +12,7 @@ #include "HeterogeneousCore/AlpakaInterface/interface/memory.h" #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" #include "HeterogeneousCore/AlpakaInterface/interface/prefixScan.h" +#include "HeterogeneousCore/AlpakaInterface/interface/warpsize.h" using namespace cms::alpakatools; using namespace ALPAKA_ACCELERATOR_NAMESPACE; @@ -33,17 +34,8 @@ struct format_traits { template struct testPrefixScan { ALPAKA_FN_ACC void operator()(Acc1D const& acc, unsigned int size) const { - // alpaka::warp::getSize(acc) is runtime, but we need a compile-time or constexpr value -#if defined(__CUDA_ARCH__) - // CUDA always has a warp size of 32 - auto& ws = alpaka::declareSharedVar(acc); -#elif defined(__HIP_DEVICE_COMPILE__) - // HIP/ROCm defines warpSize as a constant expression with value 32 or 64 depending on the target device - auto& ws = alpaka::declareSharedVar(acc); -#else - // CPU back-ends always have a warp size of 1 - auto& ws = alpaka::declareSharedVar(acc); -#endif + // alpaka::warp::getSize(acc) is runtime, but we need a compile-time or constexpr value, so we use cms::alpakatools::warpSize + auto& ws = alpaka::declareSharedVar(acc); auto& c = alpaka::declareSharedVar(acc); auto& co = alpaka::declareSharedVar(acc);