CHIP-SPV · pvelesko · Mar 19, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -248,7 +248,7 @@ option(CHIP_SET_RPATH "Add CMAKE_INSTALL_PREFIX/lib to the RPATH for chipStar ex
 option(CHIP_ENABLE_UNCOMPILABLE_TESTS "Enable tests which are known to not compile" OFF)
 option(CHIP_BUILD_TESTS "Enable build_tests target" ON)
 option(CHIP_BUILD_SAMPLES "Build samples" ON)
-option(CHIP_USE_EXTERNAL_HIP_TESTS "Use Catch2 tests from the hip-tests submodule" OFF)
+option(CHIP_USE_EXTERNAL_HIP_TESTS "Use Catch2 tests from the hip-tests submodule" ON)
 option(CHIP_USE_INTEL_USM "When enabled, cl_intel_unified_shared_memory extension, when available, will be used for HIP allocations in the OpenCL backend" ON)
 option(CATCH2_DISCOVER_TESTS_COMPILE_TIME "Discover the tests at compile time" ON)
 option(CHIP_SKIP_TESTS_WITH_DOUBLES "Skip tests where kernels use doubles." OFF)
@@ -262,9 +262,6 @@ option(CHIP_L0_MAKE_MEM_RESIDENT "First-touch workaround for Level Zero." OFF)
 option(CHIP_L0_KERNEL_TIMESTAMPS "Enable kernel timestamps for Level Zero events." ON)
 set(CHIP_DEFAULT_JIT_FLAGS "-cl-kernel-arg-info -cl-std=CL3.0")
 
-# This mitigation might be necessary for PVC on Aurora (https://github.com/argonne-lcf/AuroraBugTracking/issues/124)
-option(CHIP_LZ_API_QUERY_QUEUE_EMPTY "This uses zeCommandListHostSynchronize(CmdList, 0) to query if CmdList is empty instead of using a boolean." OFF)
-
 option(CHIP_PROFILE_TESTS "Attach iprof to each test under ctest" OFF)
 if (CHIP_PROFILE_TESTS)
   set(HIP_PROFILE_TESTS_COMMAND "iprof" "-m" "full" "--")

diff --git a/chipStarConfig.hh.in b/chipStarConfig.hh.in
@@ -55,8 +55,6 @@
 
 #cmakedefine CHIP_L0_KERNEL_TIMESTAMPS
 
-#cmakedefine CHIP_LZ_API_QUERY_QUEUE_EMPTY
-
 #cmakedefine CHIP_ERROR_IF_NOT_IMPLEMENTED
 
 #cmakedefine LLVM_TOOLS_BINARY_DIR "@LLVM_TOOLS_BINARY_DIR@"

diff --git a/deleteme b/deleteme
diff --git a/hip-tests b/hip-tests
diff --git a/include/hip/spirv_hip.hh b/include/hip/spirv_hip.hh
@@ -170,15 +170,24 @@ _wassert(const wchar_t *_msg, const wchar_t *_file, unsigned _line)
   // FIXME: Need `wchar_t` support to generate assertion message.
   abort();
 }
-#else  // defined(_WIN32) || defined(_WIN64)
+#elif defined(__APPLE__)
+// On macOS, assert() expands to __assert_rtn instead of __assert_fail.
+__device__ __attribute__((noinline)) __attribute__((weak)) void
+__assert_rtn(const char *function, const char *file, int line,
+             const char *assertion) {
+  printf("%s:%d: %s: Device-side assertion `%s' failed.\n", file, line,
+         function, assertion);
+  abort();
+}
+#else  // defined(_WIN32) || defined(_WIN64) || defined(__APPLE__)
 __device__ __attribute__((noinline)) __attribute__((weak)) void
 __assert_fail(const char *assertion, const char *file, unsigned int line,
               const char *function) {
   printf("%s:%u: %s: Device-side assertion `%s' failed.\n", file, line,
          function, assertion);
   abort();
 }
-#endif // defined(_WIN32) || defined(_WIN64)
+#endif // defined(_WIN32) || defined(_WIN64) || defined(__APPLE__)
 } // extern "C"
 #endif // defined(__clang__) && defined(__HIP__)
 

diff --git a/include/hip/spirv_hip_cooperative_groups.h b/include/hip/spirv_hip_cooperative_groups.h
@@ -37,7 +37,12 @@ THE SOFTWARE.
 #include <hip/spirv_hip_cooperative_groups_helper.h>
 #endif
 
+#ifdef __APPLE__
+// On macOS, assert() expands to __assert_rtn which is not available in device code.
+#define __hip_assert(cond) do { if (!(cond)) __builtin_trap(); } while(0)
+#else
 #define __hip_assert assert
+#endif
 
 namespace cooperative_groups {
 

diff --git a/src/CHIPBackend.cc b/src/CHIPBackend.cc
@@ -1,3 +1,3 @@
 /*
 * Copyright (c) 2021-23 chipStar developers
 *
@@ -898,22 +898,160 @@
     return Prop.pageableMemoryAccessUsesHostPageTables;
     break;
   case hipDeviceAttributeCanUseStreamWaitValue:
-    // hipStreamWaitValue64() and hipStreamWaitValue32() support
-    // return g_devices[device]->devices()[0]->info().aqlBarrierValue_;
-    CHIPERR_LOG_AND_THROW(
-        "Device::getAttr(hipDeviceAttributeCanUseStreamWaitValue path "
-        "unimplemented",
-        hipErrorTbd);
+    return 0; // Not supported
     break;
   case hipDeviceAttributeUnifiedAddressing:
     return Prop.unifiedAddressing;
     break;
   case hipDeviceAttributeMemoryPoolsSupported:
     return Prop.memoryPoolsSupported;
     break;
+  case hipDeviceAttributeAccessPolicyMaxWindowSize:
+    return Prop.accessPolicyMaxWindowSize;
+    break;
+  case hipDeviceAttributeAsyncEngineCount:
+    return Prop.asyncEngineCount;
+    break;
+  case hipDeviceAttributeCanUseHostPointerForRegisteredMem:
+    return Prop.canUseHostPointerForRegisteredMem;
+    break;
+  case hipDeviceAttributeComputePreemptionSupported:
+    return Prop.computePreemptionSupported;
+    break;
+  case hipDeviceAttributeDeviceOverlap:
+    return Prop.deviceOverlap;
+    break;
+  case hipDeviceAttributeGlobalL1CacheSupported:
+    return Prop.globalL1CacheSupported;
+    break;
+  case hipDeviceAttributeHostNativeAtomicSupported:
+    return Prop.hostNativeAtomicSupported;
+    break;
+  case hipDeviceAttributeLocalL1CacheSupported:
+    return Prop.localL1CacheSupported;
+    break;
+  case hipDeviceAttributeMaxBlocksPerMultiProcessor:
+    return Prop.maxBlocksPerMultiProcessor;
+    break;
+  case hipDeviceAttributeMaxRegistersPerMultiprocessor:
+    return Prop.regsPerMultiprocessor;
+    break;
+  case hipDeviceAttributeMaxTexture1DLayered:
+    return Prop.maxTexture1DLayered[0];
+    break;
+  case hipDeviceAttributeMaxTexture1DMipmap:
+    return Prop.maxTexture1DMipmap;
+    break;
+  case hipDeviceAttributeMaxTexture2DGather:
+    return Prop.maxTexture2DGather[0];
+    break;
+  case hipDeviceAttributeMaxTexture2DLayered:
+    return Prop.maxTexture2DLayered[0];
+    break;
+  case hipDeviceAttributeMaxTexture2DMipmap:
+    return Prop.maxTexture2DMipmap[0];
+    break;
+  case hipDeviceAttributeMaxTexture3DAlt:
+    return Prop.maxTexture3DAlt[0];
+    break;
+  case hipDeviceAttributeMaxTextureCubemap:
+    return Prop.maxTextureCubemap;
+    break;
+  case hipDeviceAttributeMaxTextureCubemapLayered:
+    return Prop.maxTextureCubemapLayered[0];
+    break;
+  case hipDeviceAttributeMaxSurface1D:
+    return Prop.maxSurface1D;
+    break;
+  case hipDeviceAttributeMaxSurface1DLayered:
+    return Prop.maxSurface1DLayered[0];
+    break;
+  case hipDeviceAttributeMaxSurface2D:
+    return Prop.maxSurface2D[0];
+    break;
+  case hipDeviceAttributeMaxSurface2DLayered:
+    return Prop.maxSurface2DLayered[0];
+    break;
+  case hipDeviceAttributeMaxSurface3D:
+    return Prop.maxSurface3D[0];
+    break;
+  case hipDeviceAttributeMaxSurfaceCubemap:
+    return Prop.maxSurfaceCubemap;
+    break;
+  case hipDeviceAttributeMaxSurfaceCubemapLayered:
+    return Prop.maxSurfaceCubemapLayered[0];
+    break;
+  case hipDeviceAttributeMultiGpuBoardGroupID:
+    return Prop.multiGpuBoardGroupID;
+    break;
+  case hipDeviceAttributePciDomainId:
+    return Prop.pciDomainID;
+    break;
+  case hipDeviceAttributePersistingL2CacheMaxSize:
+    return Prop.persistingL2CacheMaxSize;
+    break;
+  case hipDeviceAttributeReservedSharedMemPerBlock:
+    return static_cast<int>(Prop.reservedSharedMemPerBlock);
+    break;
+  case hipDeviceAttributeSharedMemPerBlockOptin:
+    return static_cast<int>(Prop.sharedMemPerBlockOptin);
+    break;
+  case hipDeviceAttributeSharedMemPerMultiprocessor:
+    return static_cast<int>(Prop.sharedMemPerMultiprocessor);
+    break;
+  case hipDeviceAttributeSingleToDoublePrecisionPerfRatio:
+    return Prop.singleToDoublePrecisionPerfRatio;
+    break;
+  case hipDeviceAttributeStreamPrioritiesSupported:
+    return Prop.streamPrioritiesSupported;
+    break;
+  case hipDeviceAttributeSurfaceAlignment:
+    return static_cast<int>(Prop.surfaceAlignment);
+    break;
+  case hipDeviceAttributeTccDriver:
+    return Prop.tccDriver;
+    break;
+  case hipDeviceAttributeTotalGlobalMem:
+    return static_cast<int>(Prop.totalGlobalMem / (1024 * 1024)); // MB
+    break;
+  case hipDeviceAttributeVirtualMemoryManagementSupported:
+    return 0; // Not supported
+    break;
+  case hipDeviceAttributeHostRegisterSupported:
+    return Prop.hostRegisterSupported;
+    break;
+  case hipDeviceAttributeMemoryPoolSupportedHandleTypes:
+    return static_cast<int>(Prop.memoryPoolSupportedHandleTypes);
+    break;
+  case hipDeviceAttributePhysicalMultiProcessorCount:
+    return Prop.multiProcessorCount;
+    break;
+  case hipDeviceAttributeClockInstructionRate:
+    return Prop.clockRate; // Approximate with clock rate
+    break;
+  case hipDeviceAttributeImageSupport:
+    return 0; // Not supported in chipStar
+    break;
+  case hipDeviceAttributeMaxThreadsDim:
+    return Prop.maxThreadsDim[0];
+    break;
+  case hipDeviceAttributeHostNumaId:
+    return -1; // NUMA not supported
+    break;
+  case hipDeviceAttributeIsLargeBar:
+    return 0;
+    break;
+  case hipDeviceAttributeFineGrainSupport:
+    return 0;
+    break;
+  case hipDeviceAttributeWallClockRate:
+    return Prop.clockRate;
+    break;
   default:
-    CHIPERR_LOG_AND_THROW("Device::getAttr asked for an unkown attribute",
-                          hipErrorInvalidValue);
+    CHIPERR_LOG_AND_THROW(
+        "Device::getAttr asked for an unknown attribute: " +
+            std::to_string(static_cast<int>(Attr)),
+        hipErrorInvalidValue);
   }
   return -1;
 }
@@ -2003,16 +2141,31 @@
     logDebug("{}", InfoStr.str());
   }
 
-  auto TotalThreadsPerBlock =
-      ExItem->getBlock().x * ExItem->getBlock().y * ExItem->getBlock().z;
   auto DeviceProps = getDevice()->getDeviceProps();
   auto MaxTotalThreadsPerBlock = DeviceProps.maxThreadsPerBlock;
 
+  // Detect negative block dimensions passed as large unsigned values (sign wrap).
+  // Use INT32_MAX as threshold since no legitimate block dimension approaches it.
+  const auto kMaxReasonableBlockDim =
+      static_cast<uint32_t>(std::numeric_limits<int32_t>::max());
+  if (ExItem->getBlock().x > kMaxReasonableBlockDim ||
+      ExItem->getBlock().y > kMaxReasonableBlockDim ||
+      ExItem->getBlock().z > kMaxReasonableBlockDim) {
+    logCritical("Negative block dimension ({}, {}, {})",
+                ExItem->getBlock().x, ExItem->getBlock().y,
+                ExItem->getBlock().z);
+    CHIPERR_LOG_AND_THROW("Negative block dimension",
+                          hipErrorInvalidConfiguration);
+  }
+
+  auto TotalThreadsPerBlock =
+      ExItem->getBlock().x * ExItem->getBlock().y * ExItem->getBlock().z;
+
   if (TotalThreadsPerBlock > MaxTotalThreadsPerBlock) {
     logCritical("Requested total local size {} exceeds HW limit {}",
                 TotalThreadsPerBlock, MaxTotalThreadsPerBlock);
     CHIPERR_LOG_AND_THROW("Requested local size exceeds HW max",
-                          hipErrorLaunchFailure);
+                          hipErrorInvalidValue);
   }
 
   if (ExItem->getBlock().x > DeviceProps.maxThreadsDim[0] ||
@@ -2024,7 +2177,7 @@
         DeviceProps.maxThreadsDim[0], DeviceProps.maxThreadsDim[1],
         DeviceProps.maxThreadsDim[2]);
     CHIPERR_LOG_AND_THROW("Requested local size exceeds HW max",
-                          hipErrorLaunchFailure);
+                          hipErrorInvalidValue);
   }
 
   std::shared_ptr<chipstar::Event> RegisteredVarInEvent =

diff --git a/src/CHIPBackend.hh b/src/CHIPBackend.hh
@@ -2141,7 +2141,6 @@ public:
   virtual void recordEvent(chipstar::Event *Event) = 0;
   bool isDefaultLegacyQueue() { return isDefaultLegacyQueue_; }
   bool isDefaultPerThreadQueue() { return isPerThreadDefaultQueue_; }
-  virtual bool isEmptyQueue() {return false;} ;
   void setDefaultLegacyQueue(bool Status) { isDefaultLegacyQueue_ = Status; }
   void setDefaultPerThreadQueue(bool Status) {
     isPerThreadDefaultQueue_ = Status;
@@ -2166,74 +2165,55 @@ protected:
     EventLocks.push_back(
         std::make_unique<std::unique_lock<std::mutex>>(ChipDevice_->QueueAddRemoveMtx));
 
-    // Collect marker events to keep alive on this queue. These are
-    // cross-queue sync markers whose underlying ze_events are referenced
-    // as GPU-level wait dependencies. They must not be recycled by
-    // checkEvents() until this queue finishes.
-    std::vector<std::shared_ptr<chipstar::Event>> MarkerEvents;
-
     // If this is a default stream (legacy or per-thread), create markers for all blocking queues
     if (this->isDefaultLegacyQueue() || this->isDefaultPerThreadQueue()) {
       // Create markers for all blocking queues
       for (auto &q : ChipDevice_->getQueuesNoLock()) {
-	// only do this if it's blocking and there's anything in the other queue.
-        if (q->getQueueFlags().isBlocking() && !q->isEmptyQueue()) {
+        if (q->getQueueFlags().isBlocking()) {
           // Create a marker event in the other queue
           std::shared_ptr<chipstar::Event> ChipMarkerEvent;
           EventHandle handle = CreateMarkerInQueue(q, ChipMarkerEvent);
-
+          
           // Add the marker to the list of events to wait on
           EventHandles.push_back(handle);
 
           // Track the marker event so it gets properly managed by the event monitor
           BackendPtr->trackEvent(ChipMarkerEvent);
-          MarkerEvents.push_back(ChipMarkerEvent);
 
-	}
+          // Add dependency to the target event
+          TargetEvent->addDependency(ChipMarkerEvent);
+        }
       }
     } else if (this->getQueueFlags().isBlocking()) {
       // This is a blocking queue, sync with default streams
       // Create marker for legacy default stream
       auto LegacyDefaultQueue = ChipDevice_->getLegacyDefaultQueue();
-      if (LegacyDefaultQueue && !LegacyDefaultQueue->isEmptyQueue()) {
-
+      if (LegacyDefaultQueue) {
         std::shared_ptr<chipstar::Event> ChipMarkerEvent;
         EventHandle handle = CreateMarkerInQueue(LegacyDefaultQueue, ChipMarkerEvent);
-
+        
         EventHandles.push_back(handle);
         BackendPtr->trackEvent(ChipMarkerEvent);
-        MarkerEvents.push_back(ChipMarkerEvent);
+        TargetEvent->addDependency(ChipMarkerEvent);
       }
 
       // Create marker for per-thread default stream if used
       if (ChipDevice_->isPerThreadStreamUsedNoLock()) {
         auto PerThreadDefaultQueue = ChipDevice_->getPerThreadDefaultQueueNoLock();
-        if (PerThreadDefaultQueue && !PerThreadDefaultQueue->isEmptyQueue() ) {
+        if (PerThreadDefaultQueue) {
           std::shared_ptr<chipstar::Event> ChipMarkerEvent;
           EventHandle handle = CreateMarkerInQueue(PerThreadDefaultQueue, ChipMarkerEvent);
-
+          
           EventHandles.push_back(handle);
           BackendPtr->trackEvent(ChipMarkerEvent);
-          MarkerEvents.push_back(ChipMarkerEvent);
+          TargetEvent->addDependency(ChipMarkerEvent);
         }
       }
     }
 
-    // Store marker events on this queue to prevent premature recycling.
-    // checkEvents() may see that marker ze_events have been signaled and
-    // recycle them, but GPU operations on THIS queue still reference those
-    // ze_events as wait dependencies. Holding shared_ptrs here prevents
-    // the event pool from resetting the ze_events until this queue finishes.
-    if (!MarkerEvents.empty()) {
-      storeCrossQueueDeps(std::move(MarkerEvents));
-    }
-
     return {EventHandles, std::move(EventLocks)};
   }
 
-  virtual void storeCrossQueueDeps(
-      std::vector<std::shared_ptr<chipstar::Event>> Markers) {}
-
 public:
   enum MEM_MAP_TYPE { HOST_READ, HOST_WRITE, HOST_READ_WRITE };
   virtual void MemMap(const chipstar::AllocationInfo *AllocInfo,