Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions HeterogeneousCore/AlpakaInterface/interface/warpsize.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,20 @@ namespace cms::alpakatools {
// CUDA always has a warp size of 32
inline constexpr int warpSize = 32;
#elif defined(__HIP_DEVICE_COMPILE__)
// HIP/ROCm defines warpSize as a constant expression in device code, with value 32 or 64 depending on the target device
inline constexpr int warpSize = ::warpSize;
// HIP/ROCm may have a warp size of 32 or 64 depending on the target device; select it from the __gfx*__ architecture macros
#if defined(__gfx900__) or defined(__gfx902__) or defined(__gfx903__) or defined(__gfx906__) or defined(__gfx908__) or \
defined(__gfx909__) or defined(__gfx90a__) or defined(__gfx90c__) or defined(__gfx942__) or defined(__gfx950__)
inline constexpr int warpSize = 64;
#elif defined(__gfx1010__) or defined(__gfx1011__) or defined(__gfx1012__) or defined(__gfx1013__) or \
defined(__gfx1030__) or defined(__gfx1031__) or defined(__gfx1032__) or defined(__gfx1033__) or \
defined(__gfx1034__) or defined(__gfx1035__) or defined(__gfx1036__) or defined(__gfx1100__) or \
defined(__gfx1101__) or defined(__gfx1102__) or defined(__gfx1103__) or defined(__gfx1150__) or \
defined(__gfx1151__) or defined(__gfx1152__) or defined(__gfx1153__) or defined(__gfx1200__) or \
defined(__gfx1201__) or defined(__gfx1250__) or defined(__gfx1251__)
inline constexpr int warpSize = 32;
#else
#error "Unknown AMDGCN architecture"
#endif
Comment thread
fwyzard marked this conversation as resolved.
#else
// CPU back-ends always have a warp size of 1
inline constexpr int warpSize = 1;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
#include "HeterogeneousCore/AlpakaInterface/interface/prefixScan.h"
#include "HeterogeneousCore/AlpakaInterface/interface/warpsize.h"

using namespace cms::alpakatools;
using namespace ALPAKA_ACCELERATOR_NAMESPACE;
Expand All @@ -33,17 +34,8 @@ struct format_traits<float> {
template <typename T>
struct testPrefixScan {
ALPAKA_FN_ACC void operator()(Acc1D const& acc, unsigned int size) const {
// alpaka::warp::getSize(acc) is runtime, but we need a compile-time or constexpr value
#if defined(__CUDA_ARCH__)
// CUDA always has a warp size of 32
auto& ws = alpaka::declareSharedVar<T[32], __COUNTER__>(acc);
#elif defined(__HIP_DEVICE_COMPILE__)
// HIP/ROCm defines warpSize as a constant expression with value 32 or 64 depending on the target device
auto& ws = alpaka::declareSharedVar<T[warpSize], __COUNTER__>(acc);
#else
// CPU back-ends always have a warp size of 1
auto& ws = alpaka::declareSharedVar<T[1], __COUNTER__>(acc);
#endif
// alpaka::warp::getSize(acc) is only known at runtime, but the shared array extent must be a compile-time constant, so we use cms::alpakatools::warpSize
auto& ws = alpaka::declareSharedVar<T[cms::alpakatools::warpSize], __COUNTER__>(acc);
auto& c = alpaka::declareSharedVar<T[1024], __COUNTER__>(acc);
auto& co = alpaka::declareSharedVar<T[1024], __COUNTER__>(acc);

Expand Down