Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions llvm_passes/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ add_library(LLVMHipPasses MODULE HipPasses.cpp
HipLowerZeroLengthArrays.cpp HipSanityChecks.cpp HipLowerSwitch.cpp
HipLowerMemset.cpp HipIGBADetector.cpp HipPromoteInts.cpp
HipSpirvFunctionReorderPass.cpp
HipStripAMDGCNAsm.cpp
HipVerify.cpp
${EXTRA_OBJS})

Expand Down
7 changes: 7 additions & 0 deletions llvm_passes/HipPasses.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include "HipKernelArgSpiller.h"
#include "HipLowerZeroLengthArrays.h"
#include "HipSanityChecks.h"
#include "HipStripAMDGCNAsm.h"
#include "HipLowerSwitch.h"
#include "HipLowerMemset.h"
#include "HipIGBADetector.h"
Expand Down Expand Up @@ -126,6 +127,12 @@ static void addFullLinkTimePasses(ModulePassManager &MPM) {
// Initial verification
MPM.addPass(HipVerifyPass("Pre-HIP passes", false)); // false = don't print summary yet

// Strip AMDGCN-mnemonic inline assembly (hipCUB ThreadStore/ThreadLoad)
// before any other pass sees it. These call sites would otherwise reach
// SPIRV-LLVM-Translator and null-deref transDirectCallInst.
addPassWithVerification(MPM, HipStripAMDGCNAsmPass(),
"HipStripAMDGCNAsmPass");

// Use HipVerify for intermediate passes without printing summary
addPassWithVerification(MPM, HipSanityChecksPass(), "HipSanityChecksPass");

Expand Down
222 changes: 222 additions & 0 deletions llvm_passes/HipStripAMDGCNAsm.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
//===- HipStripAMDGCNAsm.cpp ----------------------------------------------===//
//
// Part of the chipStar Project, under the Apache License v2.0 with LLVM
// Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Rewrite AMDGCN inline-assembly call sites into plain LLVM load/store (or
// delete them) before the IR is handed to SPIRV-LLVM-Translator.
//
// The motivating case is hipCUB's ThreadStore/ThreadLoad templates in
// hipcub/backend/rocprim/thread/thread_{store,load}.hpp, which emit
// constructs like:
//
// asm volatile("flat_store_dword %0, %1 glc" : : "v"(ptr), "v"(val));
// asm volatile("s_waitcnt vmcnt(%0)" : : "I"(0x00));
//
// These AMDGCN mnemonics have no SPIR-V analogue. Without this pass
// they reach llvm-spirv, which segfaults in
// SPIRV::LLVMToSPIRVBase::transDirectCallInst (lib/SPIRV/SPIRVWriter.cpp)
// when CI->getCalledFunction() returns nullptr for the InlineAsm callee.
//
// Replacement policy:
// flat_store_{byte,short,dword,dwordx2,...} => plain `store` to the
// first (ptr) operand of the value held in the second operand.
// flat_load_{...} => plain `load`; result replaces the inline-asm
// output.
// s_waitcnt / s_barrier / v_* / other => erase the call. These are
// performance/ordering hints; the surrounding plain LLVM loads and
// stores preserve program semantics.
//
// (c) 2026 chipStar developers
//===----------------------------------------------------------------------===//

#include "HipStripAMDGCNAsm.h"

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "hip-strip-amdgcn-asm"

using namespace llvm;

namespace {

// Returns true if the asm string contains any AMDGCN mnemonic we recognise
// as needing replacement/removal before SPIR-V emission.
// Returns true when the asm string mentions any AMDGCN mnemonic that must
// be replaced or removed before SPIR-V emission. Matching is substring
// based, mirroring the loose patterns hipCUB/rocPRIM emit.
static bool isAMDGCNAsm(StringRef Asm) {
  for (StringRef Mnemonic :
       {"flat_store_", "flat_load_", "global_store_", "global_load_",
        "buffer_store_", "buffer_load_", "s_waitcnt", "s_barrier",
        "s_memtime", "s_memrealtime",
        // Conservative: treat any v_ / ds_ mnemonic as AMDGCN.
        "ds_write", "ds_read", "v_mov_", "v_add_", "v_sub_", "v_mul_"}) {
    if (Asm.contains(Mnemonic))
      return true;
  }
  return false;
}

// Classify the store width for flat_store_* / global_store_* / buffer_store_*
// based on the mnemonic suffix. Returns 0 when we cannot determine a width.
// Maps the flat_store_* / global_store_* / buffer_store_* mnemonic suffix
// to a store width in bits. Returns 0 when no width can be determined.
// The multi-dword suffixes are tested before plain "store_dword" because
// the latter is a prefix of all of them.
static unsigned storeWidthBits(StringRef Asm) {
  struct Entry {
    const char *Suffix;
    unsigned Bits;
  };
  static const Entry Table[] = {
      {"store_byte", 8},      {"store_short", 16},  {"store_dwordx4", 128},
      {"store_dwordx2", 64},  {"store_dwordx3", 96}, {"store_dword", 32},
  };
  for (const Entry &E : Table)
    if (Asm.contains(E.Suffix))
      return E.Bits;
  return 0;
}

// Maps the *_load_* mnemonic suffix to a load width in bits; 0 when
// unknown. Signed and unsigned sub-word variants share a width, and the
// multi-dword suffixes are checked before plain "load_dword" (a prefix of
// all of them).
static unsigned loadWidthBits(StringRef Asm) {
  struct Entry {
    const char *Suffix;
    unsigned Bits;
  };
  static const Entry Table[] = {
      {"load_ubyte", 8},     {"load_sbyte", 8},    {"load_ushort", 16},
      {"load_sshort", 16},   {"load_dwordx4", 128}, {"load_dwordx2", 64},
      {"load_dwordx3", 96},  {"load_dword", 32},
  };
  for (const Entry &E : Table)
    if (Asm.contains(E.Suffix))
      return E.Bits;
  return 0;
}

// Try to rewrite a flat_store_* inline-asm call as a plain LLVM store.
// hipCUB / rocPRIM emit the pattern:
// asm volatile("flat_store_<w> %0, %1 glc" : : "v"(ptr), "v"(val));
// so operand 0 is the pointer and operand 1 is the value.
// Try to rewrite a flat_store_* inline-asm call as a plain LLVM store.
// hipCUB / rocPRIM emit the pattern:
//   asm volatile("flat_store_<w> %0, %1 glc" : : "v"(ptr), "v"(val));
// so operand 0 is the pointer and operand 1 is the value.
//
// Returns true when the call was replaced and erased; false when the
// operand shapes don't match (caller decides what to do then).
static bool tryReplaceStore(CallInst *CI, StringRef Asm) {
  unsigned Bits = storeWidthBits(Asm);
  if (!Bits || CI->arg_size() < 2)
    return false;

  Value *Ptr = CI->getArgOperand(0);
  Value *Val = CI->getArgOperand(1);
  if (!Ptr->getType()->isPointerTy())
    return false;

  IRBuilder<> B(CI);

  // The inline-asm operand value may be wider than the store width
  // (e.g. `flat_store_byte` with an i16 src holding the byte). Truncate
  // as needed; never widen — extending would store bytes the asm never
  // wrote, so bail out and let the caller handle it.
  Type *ValTy = Val->getType();
  if (ValTy->isIntegerTy()) {
    unsigned VBits = ValTy->getIntegerBitWidth();
    if (VBits > Bits)
      Val = B.CreateTrunc(Val, B.getIntNTy(Bits));
    else if (VBits < Bits)
      return false;
  } else if (!ValTy->isFloatingPointTy() ||
             ValTy->getPrimitiveSizeInBits() != Bits) {
    // Only same-width scalar FP is handled; vectors/pointers fall back.
    return false;
  }

  // Alignment 1 is conservative; the original asm carried no alignment
  // info. Volatile preserves the `asm volatile` intent.
  StoreInst *SI = B.CreateStore(Val, Ptr);
  SI->setAlignment(Align(1));
  SI->setVolatile(true);
  CI->eraseFromParent();
  return true;
}

// Try to rewrite a flat_load_* inline-asm call as a plain LLVM load.
// hipCUB emits:
// asm volatile("flat_load_<w> %0, %1 ...\ns_waitcnt ..." :
// "=v"(retval) : "v"(ptr));
// so operand 0 is the pointer (only input).
// Try to rewrite a flat_load_* inline-asm call as a plain LLVM load.
// hipCUB emits:
//   asm volatile("flat_load_<w> %0, %1 ...\ns_waitcnt ..." :
//                "=v"(retval) : "v"(ptr));
// so operand 0 is the pointer (the only input).
//
// Returns true when the call was replaced and erased; false when the
// shape doesn't match (no width, no pointer input, void/aggregate return).
static bool tryReplaceLoad(CallInst *CI, StringRef Asm) {
  // A recognised load width gates the rewrite; the load itself is typed by
  // the asm's return type (see below), so the width value isn't used past
  // this point.
  unsigned Bits = loadWidthBits(Asm);
  if (!Bits || CI->arg_size() < 1)
    return false;

  Value *Ptr = CI->getArgOperand(0);
  if (!Ptr->getType()->isPointerTy())
    return false;

  // Multi-output asm returns an aggregate (primitive size 0) — not handled.
  Type *RetTy = CI->getType();
  if (RetTy->isVoidTy() || RetTy->getPrimitiveSizeInBits() == 0)
    return false;

  IRBuilder<> B(CI);

  // Load at the natural width of the inline-asm return type; trust the
  // frontend to have matched it to the mnemonic suffix (hipCUB does).
  // Alignment 1 is conservative; volatile preserves the asm's intent.
  LoadInst *LI = B.CreateLoad(RetTy, Ptr);
  LI->setAlignment(Align(1));
  LI->setVolatile(true);
  CI->replaceAllUsesWith(LI);
  CI->eraseFromParent();
  return true;
}

// Rewrites or removes every AMDGCN inline-asm call site in F.
// Returns true if the function was modified.
static bool processFunction(Function &F) {
  // Collect candidates first: rewriting erases instructions, which would
  // invalidate in-place iteration over the basic blocks.
  SmallVector<CallInst *, 16> Worklist;
  for (auto &BB : F)
    for (auto &I : BB)
      if (auto *CI = dyn_cast<CallInst>(&I))
        if (isa<InlineAsm>(CI->getCalledOperand()))
          Worklist.push_back(CI);

  bool Changed = false;
  for (CallInst *CI : Worklist) {
    auto *IA = cast<InlineAsm>(CI->getCalledOperand());
    StringRef Asm = IA->getAsmString();
    if (!isAMDGCNAsm(Asm))
      continue;

    // Classify once; stores take precedence when an asm string somehow
    // mentions both (matches the original if/else-if behaviour).
    const bool IsStore = Asm.contains("flat_store_") ||
                         Asm.contains("global_store_") ||
                         Asm.contains("buffer_store_");
    const bool IsLoad = !IsStore && (Asm.contains("flat_load_") ||
                                     Asm.contains("global_load_") ||
                                     Asm.contains("buffer_load_"));

    if (IsStore && tryReplaceStore(CI, Asm)) {
      Changed = true;
      continue;
    }
    if (IsLoad && tryReplaceLoad(CI, Asm)) {
      Changed = true;
      continue;
    }

    // Fallback: drop the call. For s_waitcnt / s_barrier this is the
    // right thing (pure hardware hints). Dropping an unrecognised load or
    // store shape, however, loses a real memory access, so warn even when
    // the call returns void (store asm always does).
    if (IsStore || IsLoad || !CI->getType()->isVoidTy())
      errs() << "Warning: HipStripAMDGCNAsm dropping unsupported AMDGCN "
             << "inline asm: '" << Asm << "' in " << F.getName() << "\n";
    if (!CI->getType()->isVoidTy())
      // Leave poison so remaining uses don't reference a deleted value.
      CI->replaceAllUsesWith(PoisonValue::get(CI->getType()));
    CI->eraseFromParent();
    Changed = true;
  }
  return Changed;
}

} // namespace

// Module entry point: strip AMDGCN inline asm from every function with a
// body. Reports no preserved analyses when anything changed.
PreservedAnalyses HipStripAMDGCNAsmPass::run(Module &M,
                                             ModuleAnalysisManager &AM) {
  bool AnyChange = false;
  for (Function &F : M) {
    if (F.isDeclaration())
      continue;
    AnyChange |= processFunction(F);
  }
  return AnyChange ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
39 changes: 39 additions & 0 deletions llvm_passes/HipStripAMDGCNAsm.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
//===- HipStripAMDGCNAsm.h ------------------------------------------------===//
//
// Part of the chipStar Project, under the Apache License v2.0 with LLVM
// Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// LLVM pass that strips AMDGCN-mnemonic inline assembly from the IR before
// SPIR-V emission. hipCUB's ThreadStore<STORE_CS/CG/...> templates embed
// raw AMDGCN inline asm (flat_store_dword ... glc, s_waitcnt) which
// SPIRV-LLVM-Translator cannot lower (it null-derefs in transDirectCallInst
// because CI->getCalledFunction() is nullptr for InlineAsm callees).
//
// This pass walks all CallInst whose callee is an InlineAsm, detects AMDGCN
// mnemonics in the asm string, and replaces the call with an equivalent
// plain LLVM load/store (for flat_load_*/flat_store_*) or simply deletes
// the call (for s_waitcnt/s_barrier/etc., which are pure cache/ordering
// hints with no SPIR-V analogue). Unrecognised AMDGCN mnemonics are
// deleted with a warning — correct enough for optional cache-modifier
// hints emitted by hipCUB.
//
// (c) 2026 chipStar developers
//===----------------------------------------------------------------------===//

#ifndef LLVM_PASSES_HIP_STRIP_AMDGCN_ASM_H
#define LLVM_PASSES_HIP_STRIP_AMDGCN_ASM_H

#include "llvm/IR/PassManager.h"

// Module pass that rewrites or deletes AMDGCN-mnemonic inline-assembly
// call sites before SPIR-V emission (see HipStripAMDGCNAsm.cpp for the
// replacement policy). isRequired() keeps the pass from being skipped at
// optnone / -O0.
//
// Note: names are fully qualified instead of `using namespace llvm;` —
// a using-directive in a header would leak into every translation unit
// that includes it.
class HipStripAMDGCNAsmPass : public llvm::PassInfoMixin<HipStripAMDGCNAsmPass> {
public:
  llvm::PreservedAnalyses run(llvm::Module &M,
                              llvm::ModuleAnalysisManager &AM);
  static bool isRequired() { return true; }
};

#endif // LLVM_PASSES_HIP_STRIP_AMDGCN_ASM_H
70 changes: 70 additions & 0 deletions tests/regression/test_strip_amdgcn_asm.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
// Reproducer: hipCUB's ThreadStore<STORE_CS/CG/WB> templates emit AMDGCN
// inline assembly (flat_store_dword ... glc, s_waitcnt vmcnt(0)) that
// SPIRV-LLVM-Translator cannot consume. On an unpatched chipStar the
// lowered bitcode still carries these inline-asm calls; invoking
// llvm-spirv then null-derefs inside transDirectCallInst
// (SPIRVWriter.cpp:5598, CI->getCalledFunction() is nullptr when the
// callee operand is an InlineAsm).
//
// This file mimics the minimal hipCUB pattern: a __device__ wrapper
// that emits "flat_store_dword $0, $1 glc" followed by
// "s_waitcnt vmcnt(0)" via `asm volatile`, called from a kernel. It
// must build cleanly when chipStar's HipStripAMDGCNAsm pass replaces
// the inline asm with plain LLVM stores and drops the waitcnt.
//
// Without the fix: hipcc aborts at hipspv-link stage with
// "clang++: error: hipspv-link command failed due to signal" and the
// llvm-spirv stack trace ending at SPIRV::LLVMToSPIRVBase::transDirectCallInst.
#include <hip/hip_runtime.h>
#include <cstdio>

// Device-side replica of hipCUB's ThreadStore<STORE_CS> code path: stores
// `val` to `ptr` via raw AMDGCN inline asm so the lowered IR carries an
// InlineAsm callee that the HipStripAMDGCNAsm pass must rewrite. The asm
// strings are byte-exact copies of the hipCUB pattern — do not reformat.
__device__ __forceinline__ void amdgcn_store_cs(unsigned int* ptr,
                                                unsigned int val) {
#if defined(__HIP_DEVICE_COMPILE__)
  // Same shape as hipCUB thread_store.hpp HIPCUB_ASM_THREAD_STORE:
  //   asm volatile("flat_store_dword %0, %1 glc" : : "v"(ptr), "v"(val));
  //   asm volatile("s_waitcnt vmcnt(%0)" : : "I"(0x00));
  asm volatile("flat_store_dword %0, %1 glc" : : "v"(ptr), "v"(val));
  asm volatile("s_waitcnt vmcnt(%0)" : : "I"(0x00));
#else
  // Host-side compilation: the mnemonics are meaningless; do a plain store.
  *ptr = val;
#endif
}

// Each thread writes idx * 42 into out[idx] through the inline-asm store
// wrapper. Launched as a 1-D grid; threads past n do nothing.
__global__ void test_kernel(unsigned int* out, int n) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx >= n)
    return;
  amdgcn_store_cs(&out[idx], (unsigned int)(idx * 42));
}

// Evaluates a HIP runtime call; on any error prints the failing expression,
// the numeric code, and its string form, then returns 1 from the enclosing
// function (intended for use inside main only).
#define CHECK(expr) do { \
  hipError_t e = (expr); \
  if (e != hipSuccess) { \
    printf("FAIL %s = %d (%s)\n", #expr, (int)e, hipGetErrorString(e)); \
    return 1; \
  } \
} while (0)

// Allocates N device words, launches test_kernel, and verifies each slot
// received idx * 42. Prints PASS and returns 0 on success; prints a FAIL
// diagnostic and returns 1 on any HIP error or data mismatch.
int main() {
  constexpr int N = 64;
  unsigned int* d_out = nullptr;
  CHECK(hipMalloc(&d_out, N * sizeof(unsigned int)));
  CHECK(hipMemset(d_out, 0, N * sizeof(unsigned int)));

  hipLaunchKernelGGL(test_kernel, dim3(1), dim3(N), 0, 0, d_out, N);
  CHECK(hipGetLastError());        // catches launch-configuration errors
  CHECK(hipDeviceSynchronize());   // catches async execution errors

  unsigned int host[N];
  CHECK(hipMemcpy(host, d_out, N * sizeof(unsigned int),
                  hipMemcpyDeviceToHost));
  for (int i = 0; i < N; ++i) {
    if (host[i] != (unsigned int)(i * 42)) {
      printf("FAIL host[%d]=%u expected %u\n", i, host[i],
             (unsigned int)(i * 42));
      (void)hipFree(d_out);  // don't leak the buffer on the failure path
      return 1;
    }
  }
  CHECK(hipFree(d_out));
  printf("PASS\n");
  return 0;
}
Loading