diff --git a/.github/workflows/ubuntu24_04_cuda13_1.yaml b/.github/workflows/ubuntu24_04_cuda13_1.yaml new file mode 100644 index 00000000..4ca61c68 --- /dev/null +++ b/.github/workflows/ubuntu24_04_cuda13_1.yaml @@ -0,0 +1,57 @@ +name: Build and Test perftest on Ubuntu 24.04 with CUDA 13.1 + +on: + push: + branches: + - master + pull_request: + branches: + - master + +jobs: + build: + runs-on: ubuntu-24.04 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install CUDA repository + run: | + # The keyring must come from the repo that matches the runner OS (ubuntu2404). + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb + sudo dpkg -i cuda-keyring_1.1-1_all.deb + + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install -y \ + build-essential \ + autoconf \ + automake \ + libtool \ + pkg-config \ + libibverbs-dev \ + librdmacm-dev \ + libibumad-dev \ + libpci-dev \ + cuda-toolkit-13-1 \ + cuda-drivers + + - name: Set up CUDA environment + run: | + # GITHUB_ENV takes plain NAME=value lines (no "export"); PATH entries are added through GITHUB_PATH. + echo "/usr/local/cuda-13.1/bin" >> "$GITHUB_PATH" + echo "LD_LIBRARY_PATH=/usr/local/cuda-13.1/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" >> "$GITHUB_ENV" + + - name: Run autogen.sh + run: ./autogen.sh + + - name: Configure the build + run: ./configure CUDA_H_PATH=/usr/local/cuda-13.1/include/cuda.h + + - name: Build perftest + run: make + + - name: Install perftest + run: sudo make install diff --git a/configure.ac b/configure.ac index d8c353f8..dc5108f6 100755 --- a/configure.ac +++ b/configure.ac @@ -369,7 +369,6 @@ if test "$cuda_found" = "yes"; then AC_DEFINE_UNQUOTED([CUDA_PATH], "$cuda_h_path" , [Enable CUDA feature]) AC_CHECK_LIB([cuda], [cuMemGetHandleForAddressRange], [HAVE_CUDA_CUMEMGETHANDLEFORADDRESSRANGE=yes], [HAVE_CUDA_CUMEMGETHANDLEFORADDRESSRANGE=no]) cuda_toolkit_version=`grep "define CUDA_VERSION" $cuda_h_path | cut -d' ' -f3` - AC_DEFINE_UNQUOTED([CUDA_VER], [$cuda_toolkit_version], [Define CUDA_VER]) AC_TRY_LINK([ #include <$cuda_h_path>], [int x = 
CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD|CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED;], diff --git a/src/cuda_loader.c b/src/cuda_loader.c index 32a550ec..c5c85212 100755 --- a/src/cuda_loader.c +++ b/src/cuda_loader.c @@ -10,7 +10,7 @@ CUresult (*p_cuDeviceGetCount)(int *) = NULL; CUresult (*p_cuDeviceGet)(CUdevice *, int) = NULL; CUresult (*p_cuDeviceGetAttribute)(int *, CUdevice_attribute, CUdevice) = NULL; CUresult (*p_cuDeviceGetName)(char *, int, CUdevice) = NULL; -CUresult (*p_cuCtxCreate)(CUcontext *, unsigned int, CUdevice) = NULL; +CUresult (*p_cuCtxCreate_v2)(CUcontext *, unsigned int, CUdevice) = NULL; CUresult (*p_cuDevicePrimaryCtxRetain)(CUcontext *, CUdevice) = NULL; CUresult (*p_cuCtxSetCurrent)(CUcontext) = NULL; CUresult (*p_cuCtxDestroy)(CUcontext) = NULL; @@ -25,7 +25,7 @@ CUresult (*p_cuMemcpyDtoD)(CUdeviceptr, CUdeviceptr, size_t) = NULL; CUresult (*p_cuMemGetHandleForAddressRange)(void *, void *, size_t, CUmemRangeHandleType, unsigned int) = NULL; #endif CUresult (*p_cuDriverGetVersion)(int* driverVersion) = NULL; -#if CUDA_VER >= 12000 +#if CUDA_VERSION >= 12000 CUresult (*p_cuGetProcAddress)(const char* symbol, void** pfn, int cudaVersion, uint64_t flags, CUdriverProcAddressQueryResult* symbolStatus) = NULL; #else CUresult (*p_cuGetProcAddress)(const char* symbol, void** pfn, int cudaVersion, uint64_t flags) = NULL; @@ -34,7 +34,7 @@ CUresult (*p_cuMemAllocManaged)(CUdeviceptr* dptr, size_t bytesize, unsigned int CUresult (*p_cuCtxSynchronize) (void) = NULL; int load_cuda_function(void **func_ptr, const char *func_name, int version) { - #if CUDA_VER >= 12000 + #if CUDA_VERSION >= 12000 CUresult res = p_cuGetProcAddress(func_name, func_ptr, version, 0, NULL); #else CUresult res = p_cuGetProcAddress(func_name, func_ptr, version, 0); @@ -69,7 +69,8 @@ int load_cuda_library(void) { { (void**)&p_cuDeviceGet, "cuDeviceGet", CUDA_VER_2_0 }, { (void**)&p_cuDeviceGetAttribute, "cuDeviceGetAttribute", CUDA_VER_2_0 }, { (void**)&p_cuDeviceGetName, 
"cuDeviceGetName", CUDA_VER_2_0 }, - { (void**)&p_cuCtxCreate, "cuCtxCreate", CUDA_VER_3_2 }, + /* CUDA_VER_3_2 selects the cuCtxCreate_v2 ABI across CUDA 11-13. */ + { (void**)&p_cuCtxCreate_v2, "cuCtxCreate", CUDA_VER_3_2 }, { (void**)&p_cuDevicePrimaryCtxRetain, "cuDevicePrimaryCtxRetain", CUDA_VER_7_0 }, { (void**)&p_cuCtxSetCurrent, "cuCtxSetCurrent", CUDA_VER_4_0 }, { (void**)&p_cuCtxDestroy, "cuCtxDestroy", CUDA_VER_4_0 }, diff --git a/src/cuda_loader.h b/src/cuda_loader.h index d0835ac1..e63642c8 100755 --- a/src/cuda_loader.h +++ b/src/cuda_loader.h @@ -30,7 +30,7 @@ extern CUresult (*p_cuDeviceGetCount)(int *); extern CUresult (*p_cuDeviceGet)(CUdevice *, int); extern CUresult (*p_cuDeviceGetAttribute)(int *, CUdevice_attribute, CUdevice); extern CUresult (*p_cuDeviceGetName)(char *, int, CUdevice); -extern CUresult (*p_cuCtxCreate)(CUcontext *, unsigned int, CUdevice); +extern CUresult (*p_cuCtxCreate_v2)(CUcontext *, unsigned int, CUdevice); extern CUresult (*p_cuDevicePrimaryCtxRetain)(CUcontext *, CUdevice); extern CUresult (*p_cuCtxSetCurrent)(CUcontext); extern CUresult (*p_cuCtxDestroy)(CUcontext); @@ -47,7 +47,7 @@ extern CUresult (*p_cuMemGetHandleForAddressRange)(void *, void *, size_t, CUmem extern CUresult (*p_cuDriverGetVersion)(int* driverVersion); extern CUresult (*p_cuCtxSynchronize) (void); extern CUresult (*p_cuMemAllocManaged)(CUdeviceptr* dptr, size_t bytesize, unsigned int flags); -#if CUDA_VER >= 12000 +#if CUDA_VERSION >= 12000 extern CUresult (*p_cuGetProcAddress)(const char* symbol, void** pfn, int cudaVersion, uint64_t flags, CUdriverProcAddressQueryResult* symbolStatus); #else extern CUresult (*p_cuGetProcAddress)(const char* symbol, void** pfn, int cudaVersion, uint64_t flags); diff --git a/src/cuda_memory.c b/src/cuda_memory.c index 92948c21..e24bff4f 100644 --- a/src/cuda_memory.c +++ b/src/cuda_memory.c @@ -94,9 +94,11 @@ static int init_gpu(struct cuda_memory_ctx *ctx) printf("[pid = %d, dev = %d] device name = [%s]\n", 
getpid(), ctx->cuDevice, name); printf("creating CUDA Ctx\n"); - error = p_cuCtxCreate(&ctx->cuContext, CU_CTX_MAP_HOST, ctx->cuDevice); + /* Create context */ + error = p_cuCtxCreate_v2(&ctx->cuContext, CU_CTX_MAP_HOST, ctx->cuDevice); + if (error != CUDA_SUCCESS) { - printf("cuCtxCreate() error=%d\n", error); + printf("cuCtxCreate_v2() error=%d\n", error); return FAILURE; }