Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
197 changes: 109 additions & 88 deletions RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,34 @@

//#define GPU_DEBUG

// TODO move to HeterogeneousCore/AlpakaInterface or upstream to alpaka
//
// Read a value from block-shared memory with whatever synchronisation the active
// backend requires, so concurrent writers (e.g. via atomicCas) are observed.
// Note: the atomic paths use T rather than a hardcoded uint32_t, so the template
// works for any type supported by the backend's atomic loads.
template <typename TAcc, typename T>
ALPAKA_FN_ACC inline T atomicLoadFromShared(TAcc const& acc [[maybe_unused]], T* arg) {
#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) or defined(ALPAKA_ACC_GPU_HIP_ENABLED)
  // GPU backend, use a volatile read to force a non-cached access
  return *reinterpret_cast<volatile T*>(arg);
#elif defined(ALPAKA_ACC_SYCL_ENABLED)
  // SYCL backend, use an atomic load scoped to the work-group's local memory
  sycl::atomic_ref<T,
                   sycl::memory_order::relaxed,
                   sycl::memory_scope::work_group,
                   sycl::access::address_space::local_space>
      ref{*arg};
  return ref.load();
#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED) or defined(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED)
  // CPU backend with multiple threads per block, use an atomic load
  std::atomic_ref<T> ref{*arg};
  return ref.load();
#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) or defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) or \
    defined(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED)
  // CPU backend with one thread per block, use a standard read
  return *arg;
#else
#error "Unsupported alpaka backend"
  return T{};
#endif
}

namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering {

#ifdef GPU_DEBUG
Expand All @@ -27,25 +55,26 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering {

namespace pixelStatus {
// Phase-1 pixel modules
constexpr uint32_t pixelSizeX = pixelTopology::Phase1::numRowsInModule;
constexpr uint32_t pixelSizeY = pixelTopology::Phase1::numColsInModule;
constexpr uint32_t pixelSizeX = pixelTopology::Phase1::numRowsInModule; // 2 x 80 = 160
constexpr uint32_t pixelSizeY = pixelTopology::Phase1::numColsInModule; // 8 x 52 = 416

// Use 0x00, 0x01, 0x03 so each can be OR'ed on top of the previous ones
enum Status : uint32_t { kEmpty = 0x00, kFound = 0x01, kDuplicate = 0x03 };

// 2-bit per pixel Status packed in 32-bit words
constexpr uint32_t bits = 2;
constexpr uint32_t mask = (0x01 << bits) - 1;
constexpr uint32_t valuesPerWord = sizeof(uint32_t) * 8 / bits;
constexpr uint32_t size = pixelSizeX * pixelSizeY / valuesPerWord;
constexpr uint32_t valuesPerWord = sizeof(uint32_t) * 8 / bits; // 16 values per 32-bit word
constexpr uint32_t size = pixelSizeX * pixelSizeY / valuesPerWord; // 160 x 416 / 16 = 4160 32-bit words

ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr uint32_t getIndex(uint16_t x, uint16_t y) {
return (pixelSizeX * y + x) / valuesPerWord;
}

ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr uint32_t getShift(uint16_t x, uint16_t y) {
return (x % valuesPerWord) * 2;
return (x % valuesPerWord) * bits;
}

// Return the current status of a pixel based on its coordinates.
ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr Status getStatus(uint32_t const* __restrict__ status,
uint16_t x,
uint16_t y) {
Expand All @@ -54,38 +83,35 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering {
return Status{(status[index] >> shift) & mask};
}

// Check whether a pixel at the given coordinates has been marked as duplicate.
ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr bool isDuplicate(uint32_t const* __restrict__ status,
uint16_t x,
uint16_t y) {
return getStatus(status, x, y) == kDuplicate;
}

/* FIXME
* In the more general case (e.g. a multithreaded CPU backend) there is a potential race condition
* between the read of status[index] at line NNN and the atomicCas at line NNN.
* We should investigate:
* - if `status` should be read through a `volatile` pointer (CUDA/ROCm)
* - if `status` should be read with an atomic load (CPU)
*/
ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr void promote(Acc1D const& acc,
uint32_t* __restrict__ status,
const uint16_t x,
const uint16_t y) {
// Record a pixel at the given coordinates and return the updated status.
ALPAKA_FN_ACC ALPAKA_FN_INLINE Status promote(Acc1D const& acc,
uint32_t* status,
const uint16_t x,
const uint16_t y) {
uint32_t index = getIndex(x, y);
uint32_t shift = getShift(x, y);
uint32_t old_word = status[index];
uint32_t expected = old_word;
uint32_t old_word = atomicLoadFromShared(acc, status + index);
uint32_t expected;
Status new_status;
do {
expected = old_word;
Status old_status{(old_word >> shift) & mask};
if (kDuplicate == old_status) {
// nothing to do
return;
// this pixel has already been marked as duplicate
return kDuplicate;
}
Status new_status = (kEmpty == old_status) ? kFound : kDuplicate;
new_status = (kEmpty == old_status) ? kFound : kDuplicate;
uint32_t new_word = old_word | (static_cast<uint32_t>(new_status) << shift);
old_word = alpaka::atomicCas(acc, &status[index], expected, new_word, alpaka::hierarchy::Blocks{});
old_word = alpaka::atomicCas(acc, &status[index], expected, new_word, alpaka::hierarchy::Threads{});
} while (expected != old_word);
return new_status;
}

} // namespace pixelStatus
Expand All @@ -105,6 +131,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering {
}
#endif
for (int32_t i : cms::alpakatools::uniform_elements(acc, numElements)) {
// Initialise each pixel with a cluster id equal to its index
digi_view[i].clus() = i;
if (::pixelClustering::invalidModuleId == digi_view[i].moduleId())
continue;
Expand Down Expand Up @@ -144,6 +171,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering {
static_assert(TrackerTraits::numberOfModules < ::pixelClustering::maxNumModules);

auto& lastPixel = alpaka::declareSharedVar<unsigned int, __COUNTER__>(acc);
#ifdef GPU_DEBUG
auto& goodPixels = alpaka::declareSharedVar<uint32_t, __COUNTER__>(acc);
#endif

const uint32_t lastModule = clus_view[0].moduleStart();
for (uint32_t module : cms::alpakatools::independent_groups(acc, lastModule)) {
Expand All @@ -152,18 +182,21 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering {
ALPAKA_ASSERT_ACC(thisModuleId < TrackerTraits::numberOfModules);

#ifdef GPU_DEBUG
auto block = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u];
if (thisModuleId % 100 == 1)
if (cms::alpakatools::once_per_block(acc))
printf("start clusterizer for module %4d in block %4d\n",
thisModuleId,
alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
printf("start clusterizer for module %4d in block %4d\n", thisModuleId, block);
#endif

// find the index of the first pixel not belonging to this module (or invalid)
// Find the index of the first pixel not belonging to this module (or invalid).
// Note: modules are not consecutive in clus_view, so we cannot use something like
// lastPixel = (module + 1 == lastModule) ? numElements : clus_view[2 + module].moduleStart();
lastPixel = numElements;
#ifdef GPU_DEBUG
goodPixels = 0;
#endif
alpaka::syncBlockThreads(acc);

// skip threads not associated to an existing pixel
for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, numElements)) {
auto id = digi_view[i].moduleId();
// skip invalid pixels
Expand Down Expand Up @@ -191,10 +224,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering {

ALPAKA_ASSERT_ACC((lastPixel == numElements) or
((lastPixel < numElements) and (digi_view[lastPixel].moduleId() != thisModuleId)));
// limit to maxPixInModule (FIXME if recurrent (and not limited to simulation with low threshold) one will need to implement something cleverer)
// Limit the number of pixels to maxPixInModule.
// FIXME if this happens frequently (and not only in simulation with low threshold) one will need to implement something cleverer.
if (cms::alpakatools::once_per_block(acc)) {
if (lastPixel - firstPixel > TrackerTraits::maxPixInModule) {
printf("too many pixels in module %u: %u > %u\n",
printf("Too many pixels in module %u: %u > %u\n",
thisModuleId,
lastPixel - firstPixel,
TrackerTraits::maxPixInModule);
Expand All @@ -204,71 +238,74 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering {
alpaka::syncBlockThreads(acc);
ALPAKA_ASSERT_ACC(lastPixel - firstPixel <= TrackerTraits::maxPixInModule);

#ifdef GPU_DEBUG
auto& totGood = alpaka::declareSharedVar<uint32_t, __COUNTER__>(acc);
totGood = 0;
alpaka::syncBlockThreads(acc);
#endif

// remove duplicate pixels
constexpr bool isPhase2 = std::is_base_of<pixelTopology::Phase2, TrackerTraits>::value;
if constexpr (not isPhase2) {
// packed words array used to store the pixelStatus of each pixel
auto& status = alpaka::declareSharedVar<uint32_t[pixelStatus::size], __COUNTER__>(acc);
auto& image = alpaka::declareSharedVar<uint32_t[pixelStatus::size], __COUNTER__>(acc);

if (lastPixel > 1) {
for (uint32_t i : cms::alpakatools::independent_group_elements(acc, pixelStatus::size)) {
status[i] = 0;
image[i] = 0;
}
alpaka::syncBlockThreads(acc);

for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel - 1)) {
for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) {
// skip invalid pixels
if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId)
continue;
pixelStatus::promote(acc, status, digi_view[i].xx(), digi_view[i].yy());
pixelStatus::promote(acc, image, digi_view[i].xx(), digi_view[i].yy());
}
alpaka::syncBlockThreads(acc);

for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel - 1)) {
for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) {
// skip invalid pixels
if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId)
continue;
if (pixelStatus::isDuplicate(status, digi_view[i].xx(), digi_view[i].yy())) {
auto status = pixelStatus::getStatus(image, digi_view[i].xx(), digi_view[i].yy());
if (pixelStatus::kDuplicate == status) {
// Mark all duplicate pixels as invalid.
// Note: the alternative approach to keep a single one of the duplicates would probably make more sense.
// According to Danek (16 March 2022): "The best would be to ignore such hits, most likely they are just
// noise. Accepting just one is also OK, any of them."
digi_view[i].moduleId() = ::pixelClustering::invalidModuleId;
digi_view[i].rawIdArr() = 0;
}
}
alpaka::syncBlockThreads(acc);
}
}
} // if (lastPixel > 1)
} // if constexpr (not isPhase2)

// fill histo
for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) {
// skip invalid pixels
if (digi_view[i].moduleId() != ::pixelClustering::invalidModuleId) {
hist.count(acc, digi_view[i].yy());
#ifdef GPU_DEBUG
alpaka::atomicAdd(acc, &totGood, 1u, alpaka::hierarchy::Blocks{});
alpaka::atomicAdd(acc, &goodPixels, 1u, alpaka::hierarchy::Threads{});
#endif
}
}
alpaka::syncBlockThreads(acc); // FIXME this can be removed
for (uint32_t i : cms::alpakatools::independent_group_elements(acc, warpSize)) {
ws[i] = 0; // used by prefix scan...
}
alpaka::syncBlockThreads(acc);
hist.finalize(acc, ws);
alpaka::syncBlockThreads(acc);

#ifdef GPU_DEBUG
ALPAKA_ASSERT_ACC(hist.size() == totGood);
if (thisModuleId % 100 == 1)
if (cms::alpakatools::once_per_block(acc))
ALPAKA_ASSERT_ACC(hist.size() == goodPixels);
if (thisModuleId % 100 == 1) {
if (cms::alpakatools::once_per_block(acc)) {
printf("histo size %d\n", hist.size());
}
}
#endif
for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) {
// skip invalid pixels
if (digi_view[i].moduleId() != ::pixelClustering::invalidModuleId) {
// For valid pixels, the id used in the histogram is `i - firstPixel`, ranging from `0` to
// `lastPixel - firstPixel` (excluded), so in the `[0, TrackerTraits::maxPixInModule)` range.
hist.fill(acc, digi_view[i].yy(), i - firstPixel);
}
}
Expand All @@ -284,9 +321,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering {
alpaka::syncBlockThreads(acc);
for (uint32_t j : cms::alpakatools::independent_group_elements(acc, Hist::nbins())) {
if (hist.size(j) > 60)
alpaka::atomicAdd(acc, &n60, 1u, alpaka::hierarchy::Blocks{});
alpaka::atomicAdd(acc, &n60, 1u, alpaka::hierarchy::Threads{});
if (hist.size(j) > 40)
alpaka::atomicAdd(acc, &n40, 1u, alpaka::hierarchy::Blocks{});
alpaka::atomicAdd(acc, &n40, 1u, alpaka::hierarchy::Threads{});
}
alpaka::syncBlockThreads(acc);
if (cms::alpakatools::once_per_block(acc)) {
Expand All @@ -310,8 +347,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering {
constexpr unsigned int maxIter = maxIterGPU * maxElements;

// nearest neighbours (nn)
// allocate space for duplicate pixels: a pixel can appear more than once with different charge in the same event
constexpr int maxNeighbours = 10;
constexpr int maxNeighbours = 8;
uint16_t nn[maxIter][maxNeighbours];
uint8_t nnn[maxIter]; // number of nn
for (uint32_t k = 0; k < maxIter; ++k) {
Expand Down Expand Up @@ -346,20 +382,13 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering {
++k;
}

// for each pixel, look at all the pixels until the end of the module;
// For each pixel, look at all the pixels until the end of the module;
// when two valid pixels within +/- 1 in x or y are found, set their id to the minimum;
// after the loop, all the pixels in each cluster should have an id equal to the lowest
// pixel in the cluster ( clus[i] == i ).
bool more = true;
/*
int nloops = 0;
*/
while (alpaka::syncBlockThreadsPredicate<alpaka::BlockOr>(acc, more)) {
/*
if (nloops % 2 == 0) {
// even iterations of the outer loop
*/
more = false;
bool done = false;
while (alpaka::syncBlockThreadsPredicate<alpaka::BlockOr>(acc, not done)) {
done = true;
uint32_t k = 0;
for (uint32_t j : cms::alpakatools::independent_group_elements(acc, hist.size())) {
ALPAKA_ASSERT_ACC(k < maxIter);
Expand All @@ -369,22 +398,18 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering {
auto l = nn[k][kk];
auto m = l + firstPixel;
ALPAKA_ASSERT_ACC(m != i);
// FIXME ::Threads ?
auto old = alpaka::atomicMin(acc, &digi_view[m].clus(), digi_view[i].clus(), alpaka::hierarchy::Blocks{});
// The algorithm processes one module per block, so the atomic operation's scope
// is "Threads" (all threads in the current block)
auto old =
alpaka::atomicMin(acc, &digi_view[m].clus(), digi_view[i].clus(), alpaka::hierarchy::Threads{});
if (old != digi_view[i].clus()) {
// end the loop only if no changes were applied
more = true;
done = false;
}
// FIXME ::Threads ?
alpaka::atomicMin(acc, &digi_view[i].clus(), old, alpaka::hierarchy::Blocks{});
alpaka::atomicMin(acc, &digi_view[i].clus(), old, alpaka::hierarchy::Threads{});
} // neighbours loop
++k;
} // pixel loop
/*
// use the outer loop to force a synchronisation
} else {
// odd iterations of the outer loop
*/
alpaka::syncBlockThreads(acc);
for (uint32_t j : cms::alpakatools::independent_group_elements(acc, hist.size())) {
auto p = hist.begin() + j;
Expand All @@ -394,23 +419,19 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering {
m = digi_view[m].clus();
digi_view[i].clus() = m;
}
/*
}
++nloops;
*/
} // end while

/*
// check that all threads in the block have executed the same number of iterations
auto& n0 = alpaka::declareSharedVar<int, __COUNTER__>(acc);
if (cms::alpakatools::once_per_block(acc))
n0 = nloops;
alpaka::syncBlockThreads(acc);
ALPAKA_ASSERT_ACC(alpaka::syncBlockThreadsPredicate<alpaka::BlockAnd>(acc, nloops == n0));
if (thisModuleId % 100 == 1)
if (cms::alpakatools::once_per_block(acc))
printf("# loops %d\n", nloops);
*/
// check that all threads in the block have executed the same number of iterations
auto& n0 = alpaka::declareSharedVar<int, __COUNTER__>(acc);
if (cms::alpakatools::once_per_block(acc))
n0 = nloops;
alpaka::syncBlockThreads(acc);
ALPAKA_ASSERT_ACC(alpaka::syncBlockThreadsPredicate<alpaka::BlockAnd>(acc, nloops == n0));
if (thisModuleId % 100 == 1)
if (cms::alpakatools::once_per_block(acc))
printf("# loops %d\n", nloops);
*/

auto& foundClusters = alpaka::declareSharedVar<unsigned int, __COUNTER__>(acc);
foundClusters = 0;
Expand Down
Loading