diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h index a8c89f7484722..1169eded95d8e 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h @@ -19,6 +19,34 @@ //#define GPU_DEBUG +// TODO move to HeterogeneousCore/AlpakaInterface or upstream to alpaka +template +ALPAKA_FN_ACC inline T atomicLoadFromShared(TAcc const& acc [[maybe_unused]], T* arg) { +#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) or defined(ALPAKA_ACC_GPU_HIP_ENABLED) + // GPU backend, use a volatile read to force a non-cached access + return *reinterpret_cast(arg); +#elif defined(ALPAKA_ACC_SYCL_ENABLED) + // SYCL backend, use an atomic load + sycl::atomic_ref + ref{*arg}; + return ref.load(); +#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED) or defined(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED) + // CPU backend with multiple threads per block, use an atomic load + std::atomic_ref ref{*arg}; + return ref.load(); +#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) or defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) or \ + defined(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED) + // CPU backend with one thread per block, use a standard read + return *arg; +#else +#error "Unsupported alpaka backend" + return T{}; +#endif +} + namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { #ifdef GPU_DEBUG @@ -27,25 +55,26 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { namespace pixelStatus { // Phase-1 pixel modules - constexpr uint32_t pixelSizeX = pixelTopology::Phase1::numRowsInModule; - constexpr uint32_t pixelSizeY = pixelTopology::Phase1::numColsInModule; + constexpr uint32_t pixelSizeX = pixelTopology::Phase1::numRowsInModule; // 2 x 80 = 160 + constexpr uint32_t pixelSizeY = pixelTopology::Phase1::numColsInModule; // 8 x 52 = 416 - // Use 0x00, 0x01, 0x03 so each can be OR'ed on top of the previous ones enum 
Status : uint32_t { kEmpty = 0x00, kFound = 0x01, kDuplicate = 0x03 }; + // 2-bit per pixel Status packed in 32-bit words constexpr uint32_t bits = 2; constexpr uint32_t mask = (0x01 << bits) - 1; - constexpr uint32_t valuesPerWord = sizeof(uint32_t) * 8 / bits; - constexpr uint32_t size = pixelSizeX * pixelSizeY / valuesPerWord; + constexpr uint32_t valuesPerWord = sizeof(uint32_t) * 8 / bits; // 16 values per 32-bit word + constexpr uint32_t size = pixelSizeX * pixelSizeY / valuesPerWord; // 160 x 416 / 16 = 4160 32-bit words ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr uint32_t getIndex(uint16_t x, uint16_t y) { return (pixelSizeX * y + x) / valuesPerWord; } ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr uint32_t getShift(uint16_t x, uint16_t y) { - return (x % valuesPerWord) * 2; + return (x % valuesPerWord) * bits; } + // Return the current status of a pixel based on its coordinates. ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr Status getStatus(uint32_t const* __restrict__ status, uint16_t x, uint16_t y) { @@ -54,38 +83,35 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { return Status{(status[index] >> shift) & mask}; } + // Check whether a pixel at the given coordinates has been marked as duplicate. ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr bool isDuplicate(uint32_t const* __restrict__ status, uint16_t x, uint16_t y) { return getStatus(status, x, y) == kDuplicate; } - /* FIXME - * In the more general case (e.g. a multithreaded CPU backend) there is a potential race condition - * between the read of status[index] at line NNN and the atomicCas at line NNN. - * We should investigate: - * - if `status` should be read through a `volatile` pointer (CUDA/ROCm) - * - if `status` should be read with an atomic load (CPU) - */ - ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr void promote(Acc1D const& acc, - uint32_t* __restrict__ status, - const uint16_t x, - const uint16_t y) { + // Record a pixel at the given coordinates and return the updated status. 
+ ALPAKA_FN_ACC ALPAKA_FN_INLINE Status promote(Acc1D const& acc, + uint32_t* status, + const uint16_t x, + const uint16_t y) { uint32_t index = getIndex(x, y); uint32_t shift = getShift(x, y); - uint32_t old_word = status[index]; - uint32_t expected = old_word; + uint32_t old_word = atomicLoadFromShared(acc, status + index); + uint32_t expected; + Status new_status; do { expected = old_word; Status old_status{(old_word >> shift) & mask}; if (kDuplicate == old_status) { - // nothing to do - return; + // this pixel has already been marked as duplicate + return kDuplicate; } - Status new_status = (kEmpty == old_status) ? kFound : kDuplicate; + new_status = (kEmpty == old_status) ? kFound : kDuplicate; uint32_t new_word = old_word | (static_cast(new_status) << shift); - old_word = alpaka::atomicCas(acc, &status[index], expected, new_word, alpaka::hierarchy::Blocks{}); + old_word = alpaka::atomicCas(acc, &status[index], expected, new_word, alpaka::hierarchy::Threads{}); } while (expected != old_word); + return new_status; } } // namespace pixelStatus @@ -105,6 +131,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { } #endif for (int32_t i : cms::alpakatools::uniform_elements(acc, numElements)) { + // Initialise each pixel with a cluster id equal to its index digi_view[i].clus() = i; if (::pixelClustering::invalidModuleId == digi_view[i].moduleId()) continue; @@ -144,6 +171,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { static_assert(TrackerTraits::numberOfModules < ::pixelClustering::maxNumModules); auto& lastPixel = alpaka::declareSharedVar(acc); +#ifdef GPU_DEBUG + auto& goodPixels = alpaka::declareSharedVar(acc); +#endif const uint32_t lastModule = clus_view[0].moduleStart(); for (uint32_t module : cms::alpakatools::independent_groups(acc, lastModule)) { @@ -152,18 +182,21 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { ALPAKA_ASSERT_ACC(thisModuleId < TrackerTraits::numberOfModules); #ifdef GPU_DEBUG + auto block = 
alpaka::getIdx(acc)[0u]; if (thisModuleId % 100 == 1) if (cms::alpakatools::once_per_block(acc)) - printf("start clusterizer for module %4d in block %4d\n", - thisModuleId, - alpaka::getIdx(acc)[0u]); + printf("start clusterizer for module %4d in block %4d\n", thisModuleId, block); #endif - // find the index of the first pixel not belonging to this module (or invalid) + // Find the index of the first pixel not belonging to this module (or invalid). + // Note: modules are not consecutive in clus_view, so we cannot use something like + // lastPixel = (module + 1 == lastModule) ? numElements : clus_view[2 + module].moduleStart(); lastPixel = numElements; +#ifdef GPU_DEBUG + goodPixels = 0; +#endif alpaka::syncBlockThreads(acc); - // skip threads not associated to an existing pixel for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, numElements)) { auto id = digi_view[i].moduleId(); // skip invalid pixels @@ -191,10 +224,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { ALPAKA_ASSERT_ACC((lastPixel == numElements) or ((lastPixel < numElements) and (digi_view[lastPixel].moduleId() != thisModuleId))); - // limit to maxPixInModule (FIXME if recurrent (and not limited to simulation with low threshold) one will need to implement something cleverer) + // Limit the number of pixels to maxPixInModule. + // FIXME if this happens frequently (and not only in simulation with low threshold) one will need to implement something cleverer. 
if (cms::alpakatools::once_per_block(acc)) { if (lastPixel - firstPixel > TrackerTraits::maxPixInModule) { - printf("too many pixels in module %u: %u > %u\n", + printf("Too many pixels in module %u: %u > %u\n", thisModuleId, lastPixel - firstPixel, TrackerTraits::maxPixInModule); @@ -204,44 +238,43 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { alpaka::syncBlockThreads(acc); ALPAKA_ASSERT_ACC(lastPixel - firstPixel <= TrackerTraits::maxPixInModule); -#ifdef GPU_DEBUG - auto& totGood = alpaka::declareSharedVar(acc); - totGood = 0; - alpaka::syncBlockThreads(acc); -#endif - // remove duplicate pixels constexpr bool isPhase2 = std::is_base_of::value; if constexpr (not isPhase2) { // packed words array used to store the pixelStatus of each pixel - auto& status = alpaka::declareSharedVar(acc); + auto& image = alpaka::declareSharedVar(acc); if (lastPixel > 1) { for (uint32_t i : cms::alpakatools::independent_group_elements(acc, pixelStatus::size)) { - status[i] = 0; + image[i] = 0; } alpaka::syncBlockThreads(acc); - for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel - 1)) { + for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) { // skip invalid pixels if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId) continue; - pixelStatus::promote(acc, status, digi_view[i].xx(), digi_view[i].yy()); + pixelStatus::promote(acc, image, digi_view[i].xx(), digi_view[i].yy()); } alpaka::syncBlockThreads(acc); - for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel - 1)) { + for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) { // skip invalid pixels if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId) continue; - if (pixelStatus::isDuplicate(status, digi_view[i].xx(), digi_view[i].yy())) { + auto status = pixelStatus::getStatus(image, digi_view[i].xx(), digi_view[i].yy()); + if 
(pixelStatus::kDuplicate == status) { + // Mark all duplicate pixels as invalid. + // Note: the alternative approach to keep a single one of the duplicates would probably make more sense. + // According to Danek (16 March 2022): "The best would be to ignore such hits, most likely they are just + // noise. Accepting just one is also OK, any of them." digi_view[i].moduleId() = ::pixelClustering::invalidModuleId; digi_view[i].rawIdArr() = 0; } } alpaka::syncBlockThreads(acc); - } - } + } // if (lastPixel > 1) + } // if constexpr (not isPhase2) // fill histo for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) { @@ -249,26 +282,30 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { if (digi_view[i].moduleId() != ::pixelClustering::invalidModuleId) { hist.count(acc, digi_view[i].yy()); #ifdef GPU_DEBUG - alpaka::atomicAdd(acc, &totGood, 1u, alpaka::hierarchy::Blocks{}); + alpaka::atomicAdd(acc, &goodPixels, 1u, alpaka::hierarchy::Threads{}); #endif } } - alpaka::syncBlockThreads(acc); // FIXME this can be removed for (uint32_t i : cms::alpakatools::independent_group_elements(acc, warpSize)) { ws[i] = 0; // used by prefix scan... } alpaka::syncBlockThreads(acc); hist.finalize(acc, ws); alpaka::syncBlockThreads(acc); + #ifdef GPU_DEBUG - ALPAKA_ASSERT_ACC(hist.size() == totGood); - if (thisModuleId % 100 == 1) - if (cms::alpakatools::once_per_block(acc)) + ALPAKA_ASSERT_ACC(hist.size() == goodPixels); + if (thisModuleId % 100 == 1) { + if (cms::alpakatools::once_per_block(acc)) { printf("histo size %d\n", hist.size()); + } + } #endif for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) { // skip invalid pixels if (digi_view[i].moduleId() != ::pixelClustering::invalidModuleId) { + // For valid pixels, the id used in the histogram is `i - firstPixel`, ranging from `0` to + // `lastPixel - firstPixel` (excluded), so in the `[0, TrackerTraits::maxPixInModule)` range. 
hist.fill(acc, digi_view[i].yy(), i - firstPixel); } } @@ -284,9 +321,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { alpaka::syncBlockThreads(acc); for (uint32_t j : cms::alpakatools::independent_group_elements(acc, Hist::nbins())) { if (hist.size(j) > 60) - alpaka::atomicAdd(acc, &n60, 1u, alpaka::hierarchy::Blocks{}); + alpaka::atomicAdd(acc, &n60, 1u, alpaka::hierarchy::Threads{}); if (hist.size(j) > 40) - alpaka::atomicAdd(acc, &n40, 1u, alpaka::hierarchy::Blocks{}); + alpaka::atomicAdd(acc, &n40, 1u, alpaka::hierarchy::Threads{}); } alpaka::syncBlockThreads(acc); if (cms::alpakatools::once_per_block(acc)) { @@ -310,8 +347,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { constexpr unsigned int maxIter = maxIterGPU * maxElements; // nearest neighbours (nn) - // allocate space for duplicate pixels: a pixel can appear more than once with different charge in the same event - constexpr int maxNeighbours = 10; + constexpr int maxNeighbours = 8; uint16_t nn[maxIter][maxNeighbours]; uint8_t nnn[maxIter]; // number of nn for (uint32_t k = 0; k < maxIter; ++k) { @@ -346,20 +382,13 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { ++k; } - // for each pixel, look at all the pixels until the end of the module; + // For each pixel, look at all the pixels until the end of the module; // when two valid pixels within +/- 1 in x or y are found, set their id to the minimum; // after the loop, all the pixel in each cluster should have the id equeal to the lowest // pixel in the cluster ( clus[i] == i ). 
- bool more = true; - /* - int nloops = 0; - */ - while (alpaka::syncBlockThreadsPredicate(acc, more)) { - /* - if (nloops % 2 == 0) { - // even iterations of the outer loop - */ - more = false; + bool done = false; + while (alpaka::syncBlockThreadsPredicate(acc, not done)) { + done = true; uint32_t k = 0; for (uint32_t j : cms::alpakatools::independent_group_elements(acc, hist.size())) { ALPAKA_ASSERT_ACC(k < maxIter); @@ -369,22 +398,18 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { auto l = nn[k][kk]; auto m = l + firstPixel; ALPAKA_ASSERT_ACC(m != i); - // FIXME ::Threads ? - auto old = alpaka::atomicMin(acc, &digi_view[m].clus(), digi_view[i].clus(), alpaka::hierarchy::Blocks{}); + // The algorithm processes one module per block, so the atomic operation's scope + // is "Threads" (all threads in the current block) + auto old = + alpaka::atomicMin(acc, &digi_view[m].clus(), digi_view[i].clus(), alpaka::hierarchy::Threads{}); if (old != digi_view[i].clus()) { // end the loop only if no changes were applied - more = true; + done = false; } - // FIXME ::Threads ? 
- alpaka::atomicMin(acc, &digi_view[i].clus(), old, alpaka::hierarchy::Blocks{}); + alpaka::atomicMin(acc, &digi_view[i].clus(), old, alpaka::hierarchy::Threads{}); } // neighbours loop ++k; } // pixel loop - /* - // use the outer loop to force a synchronisation - } else { - // odd iterations of the outer loop - */ alpaka::syncBlockThreads(acc); for (uint32_t j : cms::alpakatools::independent_group_elements(acc, hist.size())) { auto p = hist.begin() + j; @@ -394,23 +419,19 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering { m = digi_view[m].clus(); digi_view[i].clus() = m; } - /* - } - ++nloops; - */ } // end while /* - // check that all threads in the block have executed the same number of iterations - auto& n0 = alpaka::declareSharedVar(acc); - if (cms::alpakatools::once_per_block(acc)) - n0 = nloops; - alpaka::syncBlockThreads(acc); - ALPAKA_ASSERT_ACC(alpaka::syncBlockThreadsPredicate(acc, nloops == n0)); - if (thisModuleId % 100 == 1) - if (cms::alpakatools::once_per_block(acc)) - printf("# loops %d\n", nloops); - */ + // check that all threads in the block have executed the same number of iterations + auto& n0 = alpaka::declareSharedVar(acc); + if (cms::alpakatools::once_per_block(acc)) + n0 = nloops; + alpaka::syncBlockThreads(acc); + ALPAKA_ASSERT_ACC(alpaka::syncBlockThreadsPredicate(acc, nloops == n0)); + if (thisModuleId % 100 == 1) + if (cms::alpakatools::once_per_block(acc)) + printf("# loops %d\n", nloops); + */ auto& foundClusters = alpaka::declareSharedVar(acc); foundClusters = 0; diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc index bb358df8a52f9..e03881559fca8 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc @@ -621,18 +621,21 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { 
auto moduleStartFirstElement = cms::alpakatools::make_device_view(queue, clusters_d->view().moduleStart(), 1u); alpaka::memcpy(queue, nModules_Clusters_h, moduleStartFirstElement); - const auto elementsPerBlockFindClus = FindClus::maxElementsPerBlock; - const auto workDivMaxNumModules = - cms::alpakatools::make_workdiv(numberOfModules, elementsPerBlockFindClus); + { + const int blocks = 64; + const auto elementsPerBlockFindClus = FindClus::maxElementsPerBlock; + const auto workDivMaxNumModules = cms::alpakatools::make_workdiv(blocks, elementsPerBlockFindClus); + #ifdef GPU_DEBUG - std::cout << " FindClus kernel launch with " << numberOfModules << " blocks of " << elementsPerBlockFindClus - << " threadsPerBlockOrElementsPerThread\n"; + std::cout << " FindClus kernel launch with " << numberOfModules << " blocks of " << elementsPerBlockFindClus + << " threadsPerBlockOrElementsPerThread\n"; #endif - alpaka::exec( - queue, workDivMaxNumModules, FindClus{}, digis_d->view(), clusters_d->view(), wordCounter); + alpaka::exec( + queue, workDivMaxNumModules, FindClus{}, digis_d->view(), clusters_d->view(), wordCounter); #ifdef GPU_DEBUG - alpaka::wait(queue); + alpaka::wait(queue); #endif + } constexpr auto threadsPerBlockChargeCut = 256; const auto workDivChargeCut = cms::alpakatools::make_workdiv(numberOfModules, threadsPerBlockChargeCut);