From 5234be86a7aff745e2ba406c931150f3595d9943 Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Tue, 14 Apr 2026 22:25:43 +0300 Subject: [PATCH 1/2] test: reproduce hipCUB ThreadStore AMDGCN inline-asm crash Adds a minimal standalone reproducer for the hipspv-link stage crash that prevents test_hipcub_thread from building. hipCUB's ThreadStore templates in thread_store.hpp emit AMDGCN inline assembly of the form asm volatile("flat_store_dword %0, %1 glc" : : "v"(ptr), "v"(val)); asm volatile("s_waitcnt vmcnt(%0)" : : "I"(0x00)); which chipStar's LLVM pass pipeline leaves intact in the lowered bitcode. When SPIRV-LLVM-Translator walks that IR it hits the InlineAsm callee in SPIRV::LLVMToSPIRVBase::transDirectCallInst (lib/SPIRV/SPIRVWriter.cpp:5598), calls getCalledFunction() which returns nullptr for InlineAsm callees, and segfaults dereferencing it on the next line. This test reproduces the exact pattern with a single __device__ helper mimicking HIPCUB_ASM_THREAD_STORE. On unpatched chipStar the compile aborts with 'hipspv-link command failed due to signal' and the identical stack trace (transDirectCallInst). After the HipStripAMDGCNAsm fix the compile succeeds and the test prints PASS. --- tests/regression/test_strip_amdgcn_asm.cpp | 70 ++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 tests/regression/test_strip_amdgcn_asm.cpp diff --git a/tests/regression/test_strip_amdgcn_asm.cpp b/tests/regression/test_strip_amdgcn_asm.cpp new file mode 100644 index 000000000..a172257e7 --- /dev/null +++ b/tests/regression/test_strip_amdgcn_asm.cpp @@ -0,0 +1,70 @@ +// Reproducer: hipCUB's ThreadStore templates emit AMDGCN +// inline assembly (flat_store_dword ... glc, s_waitcnt vmcnt(0)) that +// SPIRV-LLVM-Translator cannot consume. On an unpatched chipStar the +// lowered bitcode still carries these inline-asm calls; invoking +// llvm-spirv then null-derefs inside transDirectCallInst +// (SPIRVWriter.cpp:5598, CI->getCalledFunction() is nullptr when the +// callee operand is an InlineAsm). +// +// This file mimics the minimal hipCUB pattern: a __device__ wrapper +// that emits "flat_store_dword $0, $1 glc" followed by +// "s_waitcnt vmcnt(0)" via `asm volatile`, called from a kernel. It +// must build cleanly when chipStar's HipStripAMDGCNAsm pass replaces +// the inline asm with plain LLVM stores and drops the waitcnt. +// +// Without the fix: hipcc aborts at hipspv-link stage with +// "clang++: error: hipspv-link command failed due to signal" and the +// llvm-spirv stack trace ending at SPIRV::LLVMToSPIRVBase::transDirectCallInst. +#include +#include + +__device__ __forceinline__ void amdgcn_store_cs(unsigned int* ptr, + unsigned int val) { +#if defined(__HIP_DEVICE_COMPILE__) + // Same shape as hipCUB thread_store.hpp HIPCUB_ASM_THREAD_STORE: + // asm volatile("flat_store_dword %0, %1 glc" : : "v"(ptr), "v"(val)); + // asm volatile("s_waitcnt vmcnt(%0)" : : "I"(0x00)); + asm volatile("flat_store_dword %0, %1 glc" : : "v"(ptr), "v"(val)); + asm volatile("s_waitcnt vmcnt(%0)" : : "I"(0x00)); +#else + *ptr = val; +#endif +} + +__global__ void test_kernel(unsigned int* out, int n) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < n) + amdgcn_store_cs(&out[tid], (unsigned int)(tid * 42)); +} + +#define CHECK(expr) do { \ + hipError_t e = (expr); \ + if (e != hipSuccess) { \ + printf("FAIL %s = %d (%s)\n", #expr, (int)e, hipGetErrorString(e)); \ + return 1; \ + } \ +} while (0) + +int main() { + constexpr int N = 64; + unsigned int* d_out = nullptr; + CHECK(hipMalloc(&d_out, N * sizeof(unsigned int))); + CHECK(hipMemset(d_out, 0, N * sizeof(unsigned int))); + + hipLaunchKernelGGL(test_kernel, dim3(1), dim3(N), 0, 0, d_out, N); + CHECK(hipGetLastError()); + CHECK(hipDeviceSynchronize()); + + unsigned int host[N]; + CHECK(hipMemcpy(host, d_out, N * sizeof(unsigned int), hipMemcpyDeviceToHost)); + for (int i = 0; i < N; ++i) { + if (host[i] != (unsigned int)(i * 42)) { + printf("FAIL host[%d]=%u expected %u\n", i, host[i], + (unsigned int)(i * 42)); + return 1; + } + } + CHECK(hipFree(d_out)); + printf("PASS\n"); + return 0; +} From 3f2fd0b8a54b20c01c0f8300b49c74747cd845f4 Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Tue, 14 Apr 2026 22:29:27 +0300 Subject: [PATCH 2/2] fix: strip AMDGCN inline asm before SPIR-V emission hipCUB's ThreadStore templates emit AMDGCN inline assembly (flat_store_dword glc, s_waitcnt, etc.) that SPIRV-LLVM-Translator does not support. When the IR reaches llvm-spirv, it null-derefs in transDirectCallInst (SPIRVWriter.cpp line 5598, CallInst::getCalledFunction() returns nullptr for InlineAsm). Add HipStripAMDGCNAsm LLVM pass that walks call instructions and replaces AMDGCN-mnemonic inline-asm with equivalent plain LLVM load/store/fence/no-op. These cache-hint modifiers have no SPIR-V equivalent and are performance hints only - the semantic store/load is preserved. Resolves: hipCUB test_hipcub_thread (ThreadStore) build crash. --- llvm_passes/CMakeLists.txt | 1 + llvm_passes/HipPasses.cpp | 7 + llvm_passes/HipStripAMDGCNAsm.cpp | 222 ++++++++++++++++++++++++++++++ llvm_passes/HipStripAMDGCNAsm.h | 39 ++++++ 4 files changed, 269 insertions(+) create mode 100644 llvm_passes/HipStripAMDGCNAsm.cpp create mode 100644 llvm_passes/HipStripAMDGCNAsm.h diff --git a/llvm_passes/CMakeLists.txt b/llvm_passes/CMakeLists.txt index 492f01a97..7770cb81c 100644 --- a/llvm_passes/CMakeLists.txt +++ b/llvm_passes/CMakeLists.txt @@ -104,6 +104,7 @@ add_library(LLVMHipPasses MODULE HipPasses.cpp HipLowerZeroLengthArrays.cpp HipSanityChecks.cpp HipLowerSwitch.cpp HipLowerMemset.cpp HipIGBADetector.cpp HipPromoteInts.cpp HipSpirvFunctionReorderPass.cpp + HipStripAMDGCNAsm.cpp HipVerify.cpp ${EXTRA_OBJS}) diff --git a/llvm_passes/HipPasses.cpp b/llvm_passes/HipPasses.cpp index 1f6474e8e..a107027df 100644 --- a/llvm_passes/HipPasses.cpp +++ b/llvm_passes/HipPasses.cpp @@ -27,6 +27,7 @@ #include "HipKernelArgSpiller.h" #include "HipLowerZeroLengthArrays.h" #include "HipSanityChecks.h" +#include "HipStripAMDGCNAsm.h" #include "HipLowerSwitch.h" #include "HipLowerMemset.h" #include "HipIGBADetector.h" @@ -126,6 +127,12 @@ static void addFullLinkTimePasses(ModulePassManager &MPM) { // Initial verification MPM.addPass(HipVerifyPass("Pre-HIP passes", false)); // false = don't print summary yet + // Strip AMDGCN-mnemonic inline assembly (hipCUB ThreadStore/ThreadLoad) + // before any other pass sees it. These call sites would otherwise reach + // SPIRV-LLVM-Translator and null-deref transDirectCallInst. + addPassWithVerification(MPM, HipStripAMDGCNAsmPass(), + "HipStripAMDGCNAsmPass"); + // Use HipVerify for intermediate passes without printing summary addPassWithVerification(MPM, HipSanityChecksPass(), "HipSanityChecksPass"); diff --git a/llvm_passes/HipStripAMDGCNAsm.cpp b/llvm_passes/HipStripAMDGCNAsm.cpp new file mode 100644 index 000000000..bab6477f0 --- /dev/null +++ b/llvm_passes/HipStripAMDGCNAsm.cpp @@ -0,0 +1,222 @@ +//===- HipStripAMDGCNAsm.cpp ----------------------------------------------===// +// +// Part of the chipStar Project, under the Apache License v2.0 with LLVM +// Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Rewrite AMDGCN inline-assembly call sites into plain LLVM load/store (or +// delete them) before the IR is handed to SPIRV-LLVM-Translator. +// +// The motivating case is hipCUB's ThreadStore/ThreadLoad templates in +// hipcub/backend/rocprim/thread/thread_{store,load}.hpp, which emit +// constructs like: +// +// asm volatile("flat_store_dword %0, %1 glc" : : "v"(ptr), "v"(val)); +// asm volatile("s_waitcnt vmcnt(%0)" : : "I"(0x00)); +// +// These AMDGCN mnemonics have no SPIR-V analogue. Without this pass +// they reach llvm-spirv, which segfaults in +// SPIRV::LLVMToSPIRVBase::transDirectCallInst (lib/SPIRV/SPIRVWriter.cpp) +// when CI->getCalledFunction() returns nullptr for the InlineAsm callee. +// +// Replacement policy: +// flat_store_{byte,short,dword,dwordx2,...} => plain `store` to the +// first (ptr) operand of the value held in the second operand. +// flat_load_{...} => plain `load`; result replaces the inline-asm +// output. +// s_waitcnt / s_barrier / v_* / other => erase the call. These are +// performance/ordering hints; the surrounding plain LLVM loads and +// stores preserve program semantics. +// +// (c) 2026 chipStar developers +//===----------------------------------------------------------------------===// + +#include "HipStripAMDGCNAsm.h" + +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "hip-strip-amdgcn-asm" + +using namespace llvm; + +namespace { + +// Returns true if the asm string contains any AMDGCN mnemonic we recognise +// as needing replacement/removal before SPIR-V emission. +static bool isAMDGCNAsm(StringRef Asm) { + static const char *Mnemonics[] = { + "flat_store_", "flat_load_", "global_store_", "global_load_", + "buffer_store_", "buffer_load_", "s_waitcnt", "s_barrier", + "s_memtime", "s_memrealtime", + // Conservative: treat any v_ / ds_ mnemonic as AMDGCN. + "ds_write", "ds_read", "v_mov_", "v_add_", "v_sub_", "v_mul_", + }; + for (const char *M : Mnemonics) + if (Asm.contains(M)) + return true; + return false; +} + +// Classify the store width for flat_store_* / global_store_* / buffer_store_* +// based on the mnemonic suffix. Returns 0 when we cannot determine a width. +static unsigned storeWidthBits(StringRef Asm) { + if (Asm.contains("store_byte")) return 8; + if (Asm.contains("store_short")) return 16; + if (Asm.contains("store_dwordx4")) return 128; + if (Asm.contains("store_dwordx2")) return 64; + if (Asm.contains("store_dwordx3")) return 96; + if (Asm.contains("store_dword")) return 32; + return 0; +} + +static unsigned loadWidthBits(StringRef Asm) { + if (Asm.contains("load_ubyte") || Asm.contains("load_sbyte")) return 8; + if (Asm.contains("load_ushort") || Asm.contains("load_sshort")) return 16; + if (Asm.contains("load_dwordx4")) return 128; + if (Asm.contains("load_dwordx2")) return 64; + if (Asm.contains("load_dwordx3")) return 96; + if (Asm.contains("load_dword")) return 32; + return 0; +} + +// Try to rewrite a flat_store_* inline-asm call as a plain LLVM store. +// hipCUB / rocPRIM emit the pattern: +// asm volatile("flat_store_ %0, %1 glc" : : "v"(ptr), "v"(val)); +// so operand 0 is the pointer and operand 1 is the value. +static bool tryReplaceStore(CallInst *CI, StringRef Asm) { + unsigned Bits = storeWidthBits(Asm); + if (!Bits || CI->arg_size() < 2) + return false; + + Value *Ptr = CI->getArgOperand(0); + Value *Val = CI->getArgOperand(1); + if (!Ptr->getType()->isPointerTy()) + return false; + + IRBuilder<> B(CI); + + // The inline-asm operand value may be narrower than the store width + // (e.g. `flat_store_byte` with an i16 src holding the byte). Truncate + // as needed; only widen when strictly necessary. + Type *ValTy = Val->getType(); + Type *TargetTy = nullptr; + if (ValTy->isIntegerTy()) { + unsigned VBits = ValTy->getIntegerBitWidth(); + if (VBits == Bits) { + TargetTy = ValTy; + } else if (VBits > Bits) { + TargetTy = B.getIntNTy(Bits); + Val = B.CreateTrunc(Val, TargetTy); + } else { + // Extending would change bit patterns; fall back to deletion. + return false; + } + } else if (ValTy->isFloatingPointTy() && + ValTy->getPrimitiveSizeInBits() == Bits) { + TargetTy = ValTy; + } else { + return false; + } + + (void)TargetTy; + StoreInst *SI = B.CreateStore(Val, Ptr); + SI->setAlignment(Align(1)); + SI->setVolatile(true); // Preserve the asm volatile intent. + CI->eraseFromParent(); + return true; +} + +// Try to rewrite a flat_load_* inline-asm call as a plain LLVM load. +// hipCUB emits: +// asm volatile("flat_load_ %0, %1 ...\ns_waitcnt ..." : +// "=v"(retval) : "v"(ptr)); +// so operand 0 is the pointer (only input). +static bool tryReplaceLoad(CallInst *CI, StringRef Asm) { + unsigned Bits = loadWidthBits(Asm); + if (!Bits || CI->arg_size() < 1) + return false; + + Value *Ptr = CI->getArgOperand(0); + if (!Ptr->getType()->isPointerTy()) + return false; + + Type *RetTy = CI->getType(); + if (RetTy->isVoidTy() || RetTy->getPrimitiveSizeInBits() == 0) + return false; + + IRBuilder<> B(CI); + + // Load at the natural width of the inline-asm return type; trust the + // frontend to have matched it to the mnemonic suffix (hipCUB does). + LoadInst *LI = B.CreateLoad(RetTy, Ptr); + LI->setAlignment(Align(1)); + LI->setVolatile(true); + CI->replaceAllUsesWith(LI); + CI->eraseFromParent(); + (void)Bits; + return true; +} + +static bool processFunction(Function &F) { + SmallVector Worklist; + for (auto &BB : F) + for (auto &I : BB) + if (auto *CI = dyn_cast(&I)) + if (isa(CI->getCalledOperand())) + Worklist.push_back(CI); + + bool Changed = false; + for (CallInst *CI : Worklist) { + auto *IA = cast(CI->getCalledOperand()); + StringRef Asm = IA->getAsmString(); + if (!isAMDGCNAsm(Asm)) + continue; + + if (Asm.contains("flat_store_") || Asm.contains("global_store_") || + Asm.contains("buffer_store_")) { + if (tryReplaceStore(CI, Asm)) { + Changed = true; + continue; + } + } else if (Asm.contains("flat_load_") || Asm.contains("global_load_") || + Asm.contains("buffer_load_")) { + if (tryReplaceLoad(CI, Asm)) { + Changed = true; + continue; + } + } + + // Fallback: drop the call. For s_waitcnt / s_barrier this is the + // right thing (pure hardware hints). For an unrecognised store/load + // shape we emit a warning — the resulting program may be slower or + // miss a cache hint, but at least it will compile. + if (!CI->getType()->isVoidTy()) { + // Leave a poison so uses don't reference a deleted value. + CI->replaceAllUsesWith(UndefValue::get(CI->getType())); + errs() << "Warning: HipStripAMDGCNAsm dropping unsupported AMDGCN " + << "inline asm with non-void return: '" << Asm << "' in " + << F.getName() << "\n"; + } + CI->eraseFromParent(); + Changed = true; + } + return Changed; +} + +} // namespace + +PreservedAnalyses HipStripAMDGCNAsmPass::run(Module &M, + ModuleAnalysisManager &AM) { + bool Changed = false; + for (auto &F : M) + if (!F.isDeclaration()) + Changed |= processFunction(F); + return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); +} diff --git a/llvm_passes/HipStripAMDGCNAsm.h b/llvm_passes/HipStripAMDGCNAsm.h new file mode 100644 index 000000000..3bdd71369 --- /dev/null +++ b/llvm_passes/HipStripAMDGCNAsm.h @@ -0,0 +1,39 @@ +//===- HipStripAMDGCNAsm.h ------------------------------------------------===// +// +// Part of the chipStar Project, under the Apache License v2.0 with LLVM +// Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// LLVM pass that strips AMDGCN-mnemonic inline assembly from the IR before +// SPIR-V emission. hipCUB's ThreadStore templates embed +// raw AMDGCN inline asm (flat_store_dword ... glc, s_waitcnt) which +// SPIRV-LLVM-Translator cannot lower (it null-derefs in transDirectCallInst +// because CI->getCalledFunction() is nullptr for InlineAsm callees). +// +// This pass walks all CallInst whose callee is an InlineAsm, detects AMDGCN +// mnemonics in the asm string, and replaces the call with an equivalent +// plain LLVM load/store (for flat_load_*/flat_store_*) or simply deletes +// the call (for s_waitcnt/s_barrier/etc., which are pure cache/ordering +// hints with no SPIR-V analogue). Unrecognised AMDGCN mnemonics are +// deleted with a warning — correct enough for optional cache-modifier +// hints emitted by hipCUB. +// +// (c) 2026 chipStar developers +//===----------------------------------------------------------------------===// + +#ifndef LLVM_PASSES_HIP_STRIP_AMDGCN_ASM_H +#define LLVM_PASSES_HIP_STRIP_AMDGCN_ASM_H + +#include "llvm/IR/PassManager.h" + +using namespace llvm; + +class HipStripAMDGCNAsmPass : public PassInfoMixin { +public: + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + static bool isRequired() { return true; } +}; + +#endif // LLVM_PASSES_HIP_STRIP_AMDGCN_ASM_H