Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,7 @@ option(CHIP_SET_RPATH "Add CMAKE_INSTALL_PREFIX/lib to the RPATH for chipStar ex
option(CHIP_ENABLE_UNCOMPILABLE_TESTS "Enable tests which are known to not compile" OFF)
option(CHIP_BUILD_TESTS "Enable build_tests target" ON)
option(CHIP_BUILD_SAMPLES "Build samples" ON)
option(CHIP_USE_EXTERNAL_HIP_TESTS "Use Catch2 tests from the hip-tests submodule" OFF)
option(CHIP_USE_EXTERNAL_HIP_TESTS "Use Catch2 tests from the hip-tests submodule" ON)
option(CHIP_USE_INTEL_USM "When enabled, cl_intel_unified_shared_memory extension, when available, will be used for HIP allocations in the OpenCL backend" ON)
option(CATCH2_DISCOVER_TESTS_COMPILE_TIME "Discover the tests at compile time" ON)
option(CHIP_SKIP_TESTS_WITH_DOUBLES "Skip tests where kernels use doubles." OFF)
Expand All @@ -262,9 +262,6 @@ option(CHIP_L0_MAKE_MEM_RESIDENT "First-touch workaround for Level Zero." OFF)
option(CHIP_L0_KERNEL_TIMESTAMPS "Enable kernel timestamps for Level Zero events." ON)
set(CHIP_DEFAULT_JIT_FLAGS "-cl-kernel-arg-info -cl-std=CL3.0")

# This mitigation might be necessary for PVC on Aurora (https://github.com/argonne-lcf/AuroraBugTracking/issues/124)
option(CHIP_LZ_API_QUERY_QUEUE_EMPTY "This uses zeCommandListHostSynchronize(CmdList, 0) to query if CmdList is empty instead of using a boolean." OFF)

option(CHIP_PROFILE_TESTS "Attach iprof to each test under ctest" OFF)
if (CHIP_PROFILE_TESTS)
set(HIP_PROFILE_TESTS_COMMAND "iprof" "-m" "full" "--")
Expand Down
2 changes: 0 additions & 2 deletions chipStarConfig.hh.in
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,6 @@

#cmakedefine CHIP_L0_KERNEL_TIMESTAMPS

#cmakedefine CHIP_LZ_API_QUERY_QUEUE_EMPTY

#cmakedefine CHIP_ERROR_IF_NOT_IMPLEMENTED

#cmakedefine LLVM_TOOLS_BINARY_DIR "@LLVM_TOOLS_BINARY_DIR@"
Expand Down
Empty file removed deleteme
Empty file.
2 changes: 1 addition & 1 deletion hip-tests
Submodule hip-tests updated 1310 files
13 changes: 11 additions & 2 deletions include/hip/spirv_hip.hh
Original file line number Diff line number Diff line change
Expand Up @@ -170,15 +170,24 @@ _wassert(const wchar_t *_msg, const wchar_t *_file, unsigned _line)
// FIXME: Need `wchar_t` support to generate assertion message.
abort();
}
#else // defined(_WIN32) || defined(_WIN64)
#elif defined(__APPLE__)
// On macOS, assert() expands to __assert_rtn instead of __assert_fail.
__device__ __attribute__((noinline)) __attribute__((weak)) void
__assert_rtn(const char *function, const char *file, int line,
const char *assertion) {
printf("%s:%d: %s: Device-side assertion `%s' failed.\n", file, line,
function, assertion);
abort();
}
#else // defined(_WIN32) || defined(_WIN64) || defined(__APPLE__)
__device__ __attribute__((noinline)) __attribute__((weak)) void
__assert_fail(const char *assertion, const char *file, unsigned int line,
const char *function) {
printf("%s:%u: %s: Device-side assertion `%s' failed.\n", file, line,
function, assertion);
abort();
}
#endif // defined(_WIN32) || defined(_WIN64)
#endif // defined(_WIN32) || defined(_WIN64) || defined(__APPLE__)
} // extern "C"
#endif // defined(__clang__) && defined(__HIP__)

Expand Down
5 changes: 5 additions & 0 deletions include/hip/spirv_hip_cooperative_groups.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,12 @@ THE SOFTWARE.
#include <hip/spirv_hip_cooperative_groups_helper.h>
#endif

#ifdef __APPLE__
// On macOS, assert() expands to __assert_rtn which is not available in device code.
#define __hip_assert(cond) do { if (!(cond)) __builtin_trap(); } while(0)
#else
#define __hip_assert assert
#endif

namespace cooperative_groups {

Expand Down
177 changes: 165 additions & 12 deletions src/CHIPBackend.cc
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
/*
* Copyright (c) 2021-23 chipStar developers
*
Expand Down Expand Up @@ -898,22 +898,160 @@
return Prop.pageableMemoryAccessUsesHostPageTables;
break;
case hipDeviceAttributeCanUseStreamWaitValue:
// hipStreamWaitValue64() and hipStreamWaitValue32() support
// return g_devices[device]->devices()[0]->info().aqlBarrierValue_;
CHIPERR_LOG_AND_THROW(
"Device::getAttr(hipDeviceAttributeCanUseStreamWaitValue path "
"unimplemented",
hipErrorTbd);
return 0; // Not supported
break;
case hipDeviceAttributeUnifiedAddressing:
return Prop.unifiedAddressing;
break;
case hipDeviceAttributeMemoryPoolsSupported:
return Prop.memoryPoolsSupported;
break;
case hipDeviceAttributeAccessPolicyMaxWindowSize:
return Prop.accessPolicyMaxWindowSize;
break;
case hipDeviceAttributeAsyncEngineCount:
return Prop.asyncEngineCount;
break;
case hipDeviceAttributeCanUseHostPointerForRegisteredMem:
return Prop.canUseHostPointerForRegisteredMem;
break;
case hipDeviceAttributeComputePreemptionSupported:
return Prop.computePreemptionSupported;
break;
case hipDeviceAttributeDeviceOverlap:
return Prop.deviceOverlap;
break;
case hipDeviceAttributeGlobalL1CacheSupported:
return Prop.globalL1CacheSupported;
break;
case hipDeviceAttributeHostNativeAtomicSupported:
return Prop.hostNativeAtomicSupported;
break;
case hipDeviceAttributeLocalL1CacheSupported:
return Prop.localL1CacheSupported;
break;
case hipDeviceAttributeMaxBlocksPerMultiProcessor:
return Prop.maxBlocksPerMultiProcessor;
break;
case hipDeviceAttributeMaxRegistersPerMultiprocessor:
return Prop.regsPerMultiprocessor;
break;
case hipDeviceAttributeMaxTexture1DLayered:
return Prop.maxTexture1DLayered[0];
break;
case hipDeviceAttributeMaxTexture1DMipmap:
return Prop.maxTexture1DMipmap;
break;
case hipDeviceAttributeMaxTexture2DGather:
return Prop.maxTexture2DGather[0];
break;
case hipDeviceAttributeMaxTexture2DLayered:
return Prop.maxTexture2DLayered[0];
break;
case hipDeviceAttributeMaxTexture2DMipmap:
return Prop.maxTexture2DMipmap[0];
break;
case hipDeviceAttributeMaxTexture3DAlt:
return Prop.maxTexture3DAlt[0];
break;
case hipDeviceAttributeMaxTextureCubemap:
return Prop.maxTextureCubemap;
break;
case hipDeviceAttributeMaxTextureCubemapLayered:
return Prop.maxTextureCubemapLayered[0];
break;
case hipDeviceAttributeMaxSurface1D:
return Prop.maxSurface1D;
break;
case hipDeviceAttributeMaxSurface1DLayered:
return Prop.maxSurface1DLayered[0];
break;
case hipDeviceAttributeMaxSurface2D:
return Prop.maxSurface2D[0];
break;
case hipDeviceAttributeMaxSurface2DLayered:
return Prop.maxSurface2DLayered[0];
break;
case hipDeviceAttributeMaxSurface3D:
return Prop.maxSurface3D[0];
break;
case hipDeviceAttributeMaxSurfaceCubemap:
return Prop.maxSurfaceCubemap;
break;
case hipDeviceAttributeMaxSurfaceCubemapLayered:
return Prop.maxSurfaceCubemapLayered[0];
break;
case hipDeviceAttributeMultiGpuBoardGroupID:
return Prop.multiGpuBoardGroupID;
break;
case hipDeviceAttributePciDomainId:
return Prop.pciDomainID;
break;
case hipDeviceAttributePersistingL2CacheMaxSize:
return Prop.persistingL2CacheMaxSize;
break;
case hipDeviceAttributeReservedSharedMemPerBlock:
return static_cast<int>(Prop.reservedSharedMemPerBlock);
break;
case hipDeviceAttributeSharedMemPerBlockOptin:
return static_cast<int>(Prop.sharedMemPerBlockOptin);
break;
case hipDeviceAttributeSharedMemPerMultiprocessor:
return static_cast<int>(Prop.sharedMemPerMultiprocessor);
break;
case hipDeviceAttributeSingleToDoublePrecisionPerfRatio:
return Prop.singleToDoublePrecisionPerfRatio;
break;
case hipDeviceAttributeStreamPrioritiesSupported:
return Prop.streamPrioritiesSupported;
break;
case hipDeviceAttributeSurfaceAlignment:
return static_cast<int>(Prop.surfaceAlignment);
break;
case hipDeviceAttributeTccDriver:
return Prop.tccDriver;
break;
case hipDeviceAttributeTotalGlobalMem:
return static_cast<int>(Prop.totalGlobalMem / (1024 * 1024)); // MB

Check warning on line 1015 in src/CHIPBackend.cc

View workflow job for this annotation

GitHub Actions / cpp-linter

src/CHIPBackend.cc:1015:59 [readability-magic-numbers]

1024 is a magic number; consider replacing it with a named constant

Check warning on line 1015 in src/CHIPBackend.cc

View workflow job for this annotation

GitHub Actions / cpp-linter

src/CHIPBackend.cc:1015:52 [readability-magic-numbers]

1024 is a magic number; consider replacing it with a named constant
break;
case hipDeviceAttributeVirtualMemoryManagementSupported:
return 0; // Not supported
break;
case hipDeviceAttributeHostRegisterSupported:
return Prop.hostRegisterSupported;
break;
case hipDeviceAttributeMemoryPoolSupportedHandleTypes:
return static_cast<int>(Prop.memoryPoolSupportedHandleTypes);
break;
case hipDeviceAttributePhysicalMultiProcessorCount:
return Prop.multiProcessorCount;
break;
case hipDeviceAttributeClockInstructionRate:
return Prop.clockRate; // Approximate with clock rate
break;
case hipDeviceAttributeImageSupport:
return 0; // Not supported in chipStar
break;
case hipDeviceAttributeMaxThreadsDim:
return Prop.maxThreadsDim[0];
break;
case hipDeviceAttributeHostNumaId:
return -1; // NUMA not supported
break;
case hipDeviceAttributeIsLargeBar:
return 0;
break;
case hipDeviceAttributeFineGrainSupport:
return 0;
break;
case hipDeviceAttributeWallClockRate:
return Prop.clockRate;
break;
default:
CHIPERR_LOG_AND_THROW("Device::getAttr asked for an unkown attribute",
hipErrorInvalidValue);
CHIPERR_LOG_AND_THROW(
"Device::getAttr asked for an unknown attribute: " +
std::to_string(static_cast<int>(Attr)),
hipErrorInvalidValue);
}
return -1;
}
Expand Down Expand Up @@ -2003,16 +2141,31 @@
logDebug("{}", InfoStr.str());
}

auto TotalThreadsPerBlock =
ExItem->getBlock().x * ExItem->getBlock().y * ExItem->getBlock().z;
auto DeviceProps = getDevice()->getDeviceProps();
auto MaxTotalThreadsPerBlock = DeviceProps.maxThreadsPerBlock;

// Detect negative block dimensions passed as large unsigned values (sign wrap).
// Use INT32_MAX as threshold since no legitimate block dimension approaches it.
const auto kMaxReasonableBlockDim =

Check warning on line 2149 in src/CHIPBackend.cc

View workflow job for this annotation

GitHub Actions / cpp-linter

src/CHIPBackend.cc:2149:14 [readability-identifier-naming]

invalid case style for local variable 'kMaxReasonableBlockDim'
static_cast<uint32_t>(std::numeric_limits<int32_t>::max());
if (ExItem->getBlock().x > kMaxReasonableBlockDim ||
ExItem->getBlock().y > kMaxReasonableBlockDim ||
ExItem->getBlock().z > kMaxReasonableBlockDim) {
logCritical("Negative block dimension ({}, {}, {})",
ExItem->getBlock().x, ExItem->getBlock().y,
ExItem->getBlock().z);
CHIPERR_LOG_AND_THROW("Negative block dimension",
hipErrorInvalidConfiguration);
}

auto TotalThreadsPerBlock =
ExItem->getBlock().x * ExItem->getBlock().y * ExItem->getBlock().z;

if (TotalThreadsPerBlock > MaxTotalThreadsPerBlock) {
logCritical("Requested total local size {} exceeds HW limit {}",
TotalThreadsPerBlock, MaxTotalThreadsPerBlock);
CHIPERR_LOG_AND_THROW("Requested local size exceeds HW max",
hipErrorLaunchFailure);
hipErrorInvalidValue);
}

if (ExItem->getBlock().x > DeviceProps.maxThreadsDim[0] ||
Expand All @@ -2024,7 +2177,7 @@
DeviceProps.maxThreadsDim[0], DeviceProps.maxThreadsDim[1],
DeviceProps.maxThreadsDim[2]);
CHIPERR_LOG_AND_THROW("Requested local size exceeds HW max",
hipErrorLaunchFailure);
hipErrorInvalidValue);
}

std::shared_ptr<chipstar::Event> RegisteredVarInEvent =
Expand Down
42 changes: 11 additions & 31 deletions src/CHIPBackend.hh
Original file line number Diff line number Diff line change
Expand Up @@ -2141,7 +2141,6 @@ public:
virtual void recordEvent(chipstar::Event *Event) = 0;
bool isDefaultLegacyQueue() { return isDefaultLegacyQueue_; }
bool isDefaultPerThreadQueue() { return isPerThreadDefaultQueue_; }
virtual bool isEmptyQueue() {return false;} ;
void setDefaultLegacyQueue(bool Status) { isDefaultLegacyQueue_ = Status; }
void setDefaultPerThreadQueue(bool Status) {
isPerThreadDefaultQueue_ = Status;
Expand All @@ -2166,74 +2165,55 @@ protected:
EventLocks.push_back(
std::make_unique<std::unique_lock<std::mutex>>(ChipDevice_->QueueAddRemoveMtx));

// Collect marker events to keep alive on this queue. These are
// cross-queue sync markers whose underlying ze_events are referenced
// as GPU-level wait dependencies. They must not be recycled by
// checkEvents() until this queue finishes.
std::vector<std::shared_ptr<chipstar::Event>> MarkerEvents;

// If this is a default stream (legacy or per-thread), create markers for all blocking queues
if (this->isDefaultLegacyQueue() || this->isDefaultPerThreadQueue()) {
// Create markers for all blocking queues
for (auto &q : ChipDevice_->getQueuesNoLock()) {
// only do this if it's blocking and there's anything in the other queue.
if (q->getQueueFlags().isBlocking() && !q->isEmptyQueue()) {
if (q->getQueueFlags().isBlocking()) {
// Create a marker event in the other queue
std::shared_ptr<chipstar::Event> ChipMarkerEvent;
EventHandle handle = CreateMarkerInQueue(q, ChipMarkerEvent);

// Add the marker to the list of events to wait on
EventHandles.push_back(handle);

// Track the marker event so it gets properly managed by the event monitor
BackendPtr->trackEvent(ChipMarkerEvent);
MarkerEvents.push_back(ChipMarkerEvent);

}
// Add dependency to the target event
TargetEvent->addDependency(ChipMarkerEvent);
}
}
} else if (this->getQueueFlags().isBlocking()) {
// This is a blocking queue, sync with default streams
// Create marker for legacy default stream
auto LegacyDefaultQueue = ChipDevice_->getLegacyDefaultQueue();
if (LegacyDefaultQueue && !LegacyDefaultQueue->isEmptyQueue()) {

if (LegacyDefaultQueue) {
std::shared_ptr<chipstar::Event> ChipMarkerEvent;
EventHandle handle = CreateMarkerInQueue(LegacyDefaultQueue, ChipMarkerEvent);

EventHandles.push_back(handle);
BackendPtr->trackEvent(ChipMarkerEvent);
MarkerEvents.push_back(ChipMarkerEvent);
TargetEvent->addDependency(ChipMarkerEvent);
}

// Create marker for per-thread default stream if used
if (ChipDevice_->isPerThreadStreamUsedNoLock()) {
auto PerThreadDefaultQueue = ChipDevice_->getPerThreadDefaultQueueNoLock();
if (PerThreadDefaultQueue && !PerThreadDefaultQueue->isEmptyQueue() ) {
if (PerThreadDefaultQueue) {
std::shared_ptr<chipstar::Event> ChipMarkerEvent;
EventHandle handle = CreateMarkerInQueue(PerThreadDefaultQueue, ChipMarkerEvent);

EventHandles.push_back(handle);
BackendPtr->trackEvent(ChipMarkerEvent);
MarkerEvents.push_back(ChipMarkerEvent);
TargetEvent->addDependency(ChipMarkerEvent);
}
}
}

// Store marker events on this queue to prevent premature recycling.
// checkEvents() may see that marker ze_events have been signaled and
// recycle them, but GPU operations on THIS queue still reference those
// ze_events as wait dependencies. Holding shared_ptrs here prevents
// the event pool from resetting the ze_events until this queue finishes.
if (!MarkerEvents.empty()) {
storeCrossQueueDeps(std::move(MarkerEvents));
}

return {EventHandles, std::move(EventLocks)};
}

virtual void storeCrossQueueDeps(
std::vector<std::shared_ptr<chipstar::Event>> Markers) {}

public:
enum MEM_MAP_TYPE { HOST_READ, HOST_WRITE, HOST_READ_WRITE };
virtual void MemMap(const chipstar::AllocationInfo *AllocInfo,
Expand Down
Loading
Loading