diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h index a8c89f7484722..1169eded95d8e 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h @@ -19,6 +19,34 @@ //#define GPU_DEBUG +// TODO move to HeterogeneousCore/AlpakaInterface or upstream to alpaka +template +ALPAKA_FN_ACC inline T atomicLoadFromShared(TAcc const& acc [[maybe_unused]], T* arg) { +#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) or defined(ALPAKA_ACC_GPU_HIP_ENABLED) + // GPU backend, use a volatile read to force a non-cached access + return *reinterpret_cast(arg); +#elif defined(ALPAKA_ACC_SYCL_ENABLED) + // SYCL backend, use an atomic load + sycl::atomic_ref + ref{*arg}; + return ref.load(); +#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED) or defined(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED) + // CPU backend with multiple threads per block, use an atomic load + std::atomic_ref ref{*arg}; + return ref.load(); +#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) or defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) or \ + defined(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED) + // CPU backend with one thread per block, use a standard read + return *arg; +#else +#error "Unsupported alpaka backend" + return T{}; +#endif +} + namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { #ifdef GPU_DEBUG @@ -27,25 +55,26 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { namespace pixelStatus { // Phase-1 pixel modules - constexpr uint32_t pixelSizeX = pixelTopology::Phase1::numRowsInModule; - constexpr uint32_t pixelSizeY = pixelTopology::Phase1::numColsInModule; + constexpr uint32_t pixelSizeX = pixelTopology::Phase1::numRowsInModule; // 2 x 80 = 160 + constexpr uint32_t pixelSizeY = pixelTopology::Phase1::numColsInModule; // 8 x 52 = 416 - // Use 0x00, 0x01, 0x03 so each can be OR'ed on top of the previous ones enum 
Status : uint32_t { kEmpty = 0x00, kFound = 0x01, kDuplicate = 0x03 }; + // 2-bit per pixel Status packed in 32-bit words constexpr uint32_t bits = 2; constexpr uint32_t mask = (0x01 << bits) - 1; - constexpr uint32_t valuesPerWord = sizeof(uint32_t) * 8 / bits; - constexpr uint32_t size = pixelSizeX * pixelSizeY / valuesPerWord; + constexpr uint32_t valuesPerWord = sizeof(uint32_t) * 8 / bits; // 16 values per 32-bit word + constexpr uint32_t size = pixelSizeX * pixelSizeY / valuesPerWord; // 160 x 416 / 16 = 4160 32-bit words ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr uint32_t getIndex(uint16_t x, uint16_t y) { return (pixelSizeX * y + x) / valuesPerWord; } ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr uint32_t getShift(uint16_t x, uint16_t y) { - return (x % valuesPerWord) * 2; + return (x % valuesPerWord) * bits; } + // Return the current status of a pixel based on its coordinates. ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr Status getStatus(uint32_t const* __restrict__ status, uint16_t x, uint16_t y) { @@ -54,38 +83,35 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { return Status{(status[index] >> shift) & mask}; } + // Check whether a pixel at the given coordinates has been marked as duplicate. ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr bool isDuplicate(uint32_t const* __restrict__ status, uint16_t x, uint16_t y) { return getStatus(status, x, y) == kDuplicate; } - /* FIXME - * In the more general case (e.g. a multithreaded CPU backend) there is a potential race condition - * between the read of status[index] at line NNN and the atomicCas at line NNN. - * We should investigate: - * - if `status` should be read through a `volatile` pointer (CUDA/ROCm) - * - if `status` should be read with an atomic load (CPU) - */ - ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr void promote(Acc1D const& acc, - uint32_t* __restrict__ status, - const uint16_t x, - const uint16_t y) { + // Record a pixel at the given coordinates and return the updated status. 
+ ALPAKA_FN_ACC ALPAKA_FN_INLINE Status promote(Acc1D const& acc, + uint32_t* status, + const uint16_t x, + const uint16_t y) { uint32_t index = getIndex(x, y); uint32_t shift = getShift(x, y); - uint32_t old_word = status[index]; - uint32_t expected = old_word; + uint32_t old_word = atomicLoadFromShared(acc, status + index); + uint32_t expected; + Status new_status; do { expected = old_word; Status old_status{(old_word >> shift) & mask}; if (kDuplicate == old_status) { - // nothing to do - return; + // this pixel has already been marked as duplicate + return kDuplicate; } - Status new_status = (kEmpty == old_status) ? kFound : kDuplicate; + new_status = (kEmpty == old_status) ? kFound : kDuplicate; uint32_t new_word = old_word | (static_cast(new_status) << shift); - old_word = alpaka::atomicCas(acc, &status[index], expected, new_word, alpaka::hierarchy::Blocks{}); + old_word = alpaka::atomicCas(acc, &status[index], expected, new_word, alpaka::hierarchy::Threads{}); } while (expected != old_word); + return new_status; } } // namespace pixelStatus @@ -105,6 +131,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { } #endif for (int32_t i : cms::alpakatools::uniform_elements(acc, numElements)) { + // Initialise each pixel with a cluster id equal to its index digi_view[i].clus() = i; if (::pixelClustering::invalidModuleId == digi_view[i].moduleId()) continue; @@ -144,6 +171,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { static_assert(TrackerTraits::numberOfModules < ::pixelClustering::maxNumModules); auto& lastPixel = alpaka::declareSharedVar(acc); +#ifdef GPU_DEBUG + auto& goodPixels = alpaka::declareSharedVar(acc); +#endif const uint32_t lastModule = clus_view[0].moduleStart(); for (uint32_t module : cms::alpakatools::independent_groups(acc, lastModule)) { @@ -152,18 +182,21 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { ALPAKA_ASSERT_ACC(thisModuleId < TrackerTraits::numberOfModules); #ifdef GPU_DEBUG + auto block = 
alpaka::getIdx(acc)[0u]; if (thisModuleId % 100 == 1) if (cms::alpakatools::once_per_block(acc)) - printf("start clusterizer for module %4d in block %4d\n", - thisModuleId, - alpaka::getIdx(acc)[0u]); + printf("start clusterizer for module %4d in block %4d\n", thisModuleId, block); #endif - // find the index of the first pixel not belonging to this module (or invalid) + // Find the index of the first pixel not belonging to this module (or invalid). + // Note: modules are not consecutive in clus_view, so we cannot use something like + // lastPixel = (module + 1 == lastModule) ? numElements : clus_view[2 + module].moduleStart(); lastPixel = numElements; +#ifdef GPU_DEBUG + goodPixels = 0; +#endif alpaka::syncBlockThreads(acc); - // skip threads not associated to an existing pixel for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, numElements)) { auto id = digi_view[i].moduleId(); // skip invalid pixels @@ -191,10 +224,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { ALPAKA_ASSERT_ACC((lastPixel == numElements) or ((lastPixel < numElements) and (digi_view[lastPixel].moduleId() != thisModuleId))); - // limit to maxPixInModule (FIXME if recurrent (and not limited to simulation with low threshold) one will need to implement something cleverer) + // Limit the number of pixels to maxPixInModule. + // FIXME if this happens frequently (and not only in simulation with low threshold) one will need to implement something cleverer. 
if (cms::alpakatools::once_per_block(acc)) { if (lastPixel - firstPixel > TrackerTraits::maxPixInModule) { - printf("too many pixels in module %u: %u > %u\n", + printf("Too many pixels in module %u: %u > %u\n", thisModuleId, lastPixel - firstPixel, TrackerTraits::maxPixInModule); @@ -204,44 +238,43 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { alpaka::syncBlockThreads(acc); ALPAKA_ASSERT_ACC(lastPixel - firstPixel <= TrackerTraits::maxPixInModule); -#ifdef GPU_DEBUG - auto& totGood = alpaka::declareSharedVar(acc); - totGood = 0; - alpaka::syncBlockThreads(acc); -#endif - // remove duplicate pixels constexpr bool isPhase2 = std::is_base_of::value; if constexpr (not isPhase2) { // packed words array used to store the pixelStatus of each pixel - auto& status = alpaka::declareSharedVar(acc); + auto& image = alpaka::declareSharedVar(acc); if (lastPixel > 1) { for (uint32_t i : cms::alpakatools::independent_group_elements(acc, pixelStatus::size)) { - status[i] = 0; + image[i] = 0; } alpaka::syncBlockThreads(acc); - for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel - 1)) { + for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) { // skip invalid pixels if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId) continue; - pixelStatus::promote(acc, status, digi_view[i].xx(), digi_view[i].yy()); + pixelStatus::promote(acc, image, digi_view[i].xx(), digi_view[i].yy()); } alpaka::syncBlockThreads(acc); - for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel - 1)) { + for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) { // skip invalid pixels if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId) continue; - if (pixelStatus::isDuplicate(status, digi_view[i].xx(), digi_view[i].yy())) { + auto status = pixelStatus::getStatus(image, digi_view[i].xx(), digi_view[i].yy()); + if 
(pixelStatus::kDuplicate == status) { + // Mark all duplicate pixels as invalid. + // Note: the alternative approach to keep a single one of the duplicates would probably make more sense. + // According to Danek (16 March 2022): "The best would be to ignore such hits, most likely they are just + // noise. Accepting just one is also OK, any of them." digi_view[i].moduleId() = ::pixelClustering::invalidModuleId; digi_view[i].rawIdArr() = 0; } } alpaka::syncBlockThreads(acc); - } - } + } // if (lastPixel > 1) + } // if constexpr (not isPhase2) // fill histo for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) { @@ -249,26 +282,30 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { if (digi_view[i].moduleId() != ::pixelClustering::invalidModuleId) { hist.count(acc, digi_view[i].yy()); #ifdef GPU_DEBUG - alpaka::atomicAdd(acc, &totGood, 1u, alpaka::hierarchy::Blocks{}); + alpaka::atomicAdd(acc, &goodPixels, 1u, alpaka::hierarchy::Threads{}); #endif } } - alpaka::syncBlockThreads(acc); // FIXME this can be removed for (uint32_t i : cms::alpakatools::independent_group_elements(acc, warpSize)) { ws[i] = 0; // used by prefix scan... } alpaka::syncBlockThreads(acc); hist.finalize(acc, ws); alpaka::syncBlockThreads(acc); + #ifdef GPU_DEBUG - ALPAKA_ASSERT_ACC(hist.size() == totGood); - if (thisModuleId % 100 == 1) - if (cms::alpakatools::once_per_block(acc)) + ALPAKA_ASSERT_ACC(hist.size() == goodPixels); + if (thisModuleId % 100 == 1) { + if (cms::alpakatools::once_per_block(acc)) { printf("histo size %d\n", hist.size()); + } + } #endif for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) { // skip invalid pixels if (digi_view[i].moduleId() != ::pixelClustering::invalidModuleId) { + // For valid pixels, the id used in the histogram is `i - firstPixel`, ranging from `0` to + // `lastPixel - firstPixel` (excluded), so in the `[0, TrackerTraits::maxPixInModule)` range. 
hist.fill(acc, digi_view[i].yy(), i - firstPixel); } } @@ -284,9 +321,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { alpaka::syncBlockThreads(acc); for (uint32_t j : cms::alpakatools::independent_group_elements(acc, Hist::nbins())) { if (hist.size(j) > 60) - alpaka::atomicAdd(acc, &n60, 1u, alpaka::hierarchy::Blocks{}); + alpaka::atomicAdd(acc, &n60, 1u, alpaka::hierarchy::Threads{}); if (hist.size(j) > 40) - alpaka::atomicAdd(acc, &n40, 1u, alpaka::hierarchy::Blocks{}); + alpaka::atomicAdd(acc, &n40, 1u, alpaka::hierarchy::Threads{}); } alpaka::syncBlockThreads(acc); if (cms::alpakatools::once_per_block(acc)) { @@ -310,8 +347,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { constexpr unsigned int maxIter = maxIterGPU * maxElements; // nearest neighbours (nn) - // allocate space for duplicate pixels: a pixel can appear more than once with different charge in the same event - constexpr int maxNeighbours = 10; + constexpr int maxNeighbours = 8; uint16_t nn[maxIter][maxNeighbours]; uint8_t nnn[maxIter]; // number of nn for (uint32_t k = 0; k < maxIter; ++k) { @@ -346,20 +382,13 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { ++k; } - // for each pixel, look at all the pixels until the end of the module; + // For each pixel, look at all the pixels until the end of the module; // when two valid pixels within +/- 1 in x or y are found, set their id to the minimum; // after the loop, all the pixel in each cluster should have the id equeal to the lowest // pixel in the cluster ( clus[i] == i ). 
- bool more = true; - /* - int nloops = 0; - */ - while (alpaka::syncBlockThreadsPredicate(acc, more)) { - /* - if (nloops % 2 == 0) { - // even iterations of the outer loop - */ - more = false; + bool done = false; + while (alpaka::syncBlockThreadsPredicate(acc, not done)) { + done = true; uint32_t k = 0; for (uint32_t j : cms::alpakatools::independent_group_elements(acc, hist.size())) { ALPAKA_ASSERT_ACC(k < maxIter); @@ -369,22 +398,18 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { auto l = nn[k][kk]; auto m = l + firstPixel; ALPAKA_ASSERT_ACC(m != i); - // FIXME ::Threads ? - auto old = alpaka::atomicMin(acc, &digi_view[m].clus(), digi_view[i].clus(), alpaka::hierarchy::Blocks{}); + // The algorithm processes one module per block, so the atomic operation's scope + // is "Threads" (all threads in the current block) + auto old = + alpaka::atomicMin(acc, &digi_view[m].clus(), digi_view[i].clus(), alpaka::hierarchy::Threads{}); if (old != digi_view[i].clus()) { // end the loop only if no changes were applied - more = true; + done = false; } - // FIXME ::Threads ? 
- alpaka::atomicMin(acc, &digi_view[i].clus(), old, alpaka::hierarchy::Blocks{}); + alpaka::atomicMin(acc, &digi_view[i].clus(), old, alpaka::hierarchy::Threads{}); } // neighbours loop ++k; } // pixel loop - /* - // use the outer loop to force a synchronisation - } else { - // odd iterations of the outer loop - */ alpaka::syncBlockThreads(acc); for (uint32_t j : cms::alpakatools::independent_group_elements(acc, hist.size())) { auto p = hist.begin() + j; @@ -394,23 +419,19 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { m = digi_view[m].clus(); digi_view[i].clus() = m; } - /* - } - ++nloops; - */ } // end while /* - // check that all threads in the block have executed the same number of iterations - auto& n0 = alpaka::declareSharedVar(acc); - if (cms::alpakatools::once_per_block(acc)) - n0 = nloops; - alpaka::syncBlockThreads(acc); - ALPAKA_ASSERT_ACC(alpaka::syncBlockThreadsPredicate(acc, nloops == n0)); - if (thisModuleId % 100 == 1) - if (cms::alpakatools::once_per_block(acc)) - printf("# loops %d\n", nloops); - */ + // check that all threads in the block have executed the same number of iterations + auto& n0 = alpaka::declareSharedVar(acc); + if (cms::alpakatools::once_per_block(acc)) + n0 = nloops; + alpaka::syncBlockThreads(acc); + ALPAKA_ASSERT_ACC(alpaka::syncBlockThreadsPredicate(acc, nloops == n0)); + if (thisModuleId % 100 == 1) + if (cms::alpakatools::once_per_block(acc)) + printf("# loops %d\n", nloops); + */ auto& foundClusters = alpaka::declareSharedVar(acc); foundClusters = 0; diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc index bb358df8a52f9..e03881559fca8 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc @@ -621,18 +621,21 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { 
auto moduleStartFirstElement = cms::alpakatools::make_device_view(queue, clusters_d->view().moduleStart(), 1u); alpaka::memcpy(queue, nModules_Clusters_h, moduleStartFirstElement); - const auto elementsPerBlockFindClus = FindClus::maxElementsPerBlock; - const auto workDivMaxNumModules = - cms::alpakatools::make_workdiv(numberOfModules, elementsPerBlockFindClus); + { + const int blocks = 64; + const auto elementsPerBlockFindClus = FindClus::maxElementsPerBlock; + const auto workDivMaxNumModules = cms::alpakatools::make_workdiv(blocks, elementsPerBlockFindClus); + #ifdef GPU_DEBUG - std::cout << " FindClus kernel launch with " << numberOfModules << " blocks of " << elementsPerBlockFindClus - << " threadsPerBlockOrElementsPerThread\n"; + std::cout << " FindClus kernel launch with " << numberOfModules << " blocks of " << elementsPerBlockFindClus + << " threadsPerBlockOrElementsPerThread\n"; #endif - alpaka::exec( - queue, workDivMaxNumModules, FindClus{}, digis_d->view(), clusters_d->view(), wordCounter); + alpaka::exec( + queue, workDivMaxNumModules, FindClus{}, digis_d->view(), clusters_d->view(), wordCounter); #ifdef GPU_DEBUG - alpaka::wait(queue); + alpaka::wait(queue); #endif + } constexpr auto threadsPerBlockChargeCut = 256; const auto workDivChargeCut = cms::alpakatools::make_workdiv(numberOfModules, threadsPerBlockChargeCut);