diff --git a/.github/workflows/ubuntu24_04_cuda13_1.yaml b/.github/workflows/ubuntu24_04_cuda13_1.yaml
new file mode 100644
index 00000000..4ca61c68
--- /dev/null
+++ b/.github/workflows/ubuntu24_04_cuda13_1.yaml
@@ -0,0 +1,60 @@
+# CI workflow: builds and installs perftest against the CUDA 13.1 toolkit on an
+# Ubuntu 24.04 GitHub-hosted runner, for pushes and pull requests targeting master.
+name: Build and Test perftest on Ubuntu 24.04 with CUDA 13.1
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  build:
+    runs-on: ubuntu-24.04
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Install CUDA repository
+        run: |
+          # The keyring must come from the ubuntu2404 repo so it matches the runner image.
+          wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb
+          sudo dpkg -i cuda-keyring_1.1-1_all.deb
+
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y \
+            build-essential \
+            autoconf \
+            automake \
+            libtool \
+            pkg-config \
+            libibverbs-dev \
+            librdmacm-dev \
+            libibumad-dev \
+            libpci-dev \
+            cuda-toolkit-13-1 \
+            cuda-drivers
+
+      - name: Set up CUDA environment
+        run: |
+          # GITHUB_PATH takes one directory per line; GITHUB_ENV takes NAME=value lines
+          # (no "export"). Both only take effect in the steps that follow this one.
+          echo "/usr/local/cuda-13.1/bin" >> "$GITHUB_PATH"
+          echo "LD_LIBRARY_PATH=/usr/local/cuda-13.1/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" >> "$GITHUB_ENV"
+
+      - name: Run autogen.sh
+        run: ./autogen.sh
+
+      - name: Configure the build
+        run: ./configure CUDA_H_PATH=/usr/local/cuda-13.1/include/cuda.h
+
+      - name: Build perftest
+        run: make
+
+      - name: Install perftest
+        run: sudo make install
diff --git a/configure.ac b/configure.ac
index d8c353f8..dc5108f6 100755
--- a/configure.ac
+++ b/configure.ac
@@ -369,7 +369,6 @@ if test "$cuda_found" = "yes"; then
 	AC_DEFINE_UNQUOTED([CUDA_PATH], "$cuda_h_path" , [Enable CUDA feature])
 	AC_CHECK_LIB([cuda], [cuMemGetHandleForAddressRange], [HAVE_CUDA_CUMEMGETHANDLEFORADDRESSRANGE=yes], [HAVE_CUDA_CUMEMGETHANDLEFORADDRESSRANGE=no])
 	cuda_toolkit_version=`grep "define CUDA_VERSION" $cuda_h_path | cut -d' ' -f3`
-	AC_DEFINE_UNQUOTED([CUDA_VER], [$cuda_toolkit_version], [Define CUDA_VER])
 	AC_TRY_LINK([
 	#include <$cuda_h_path>],
 	[int x = CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD|CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED;],
diff --git a/src/cuda_loader.c b/src/cuda_loader.c
index 32a550ec..c5c85212 100755
--- a/src/cuda_loader.c
+++ b/src/cuda_loader.c
@@ -10,7 +10,7 @@ CUresult (*p_cuDeviceGetCount)(int *) = NULL;
 CUresult (*p_cuDeviceGet)(CUdevice *, int) = NULL;
 CUresult (*p_cuDeviceGetAttribute)(int *, CUdevice_attribute, CUdevice) = NULL;
 CUresult (*p_cuDeviceGetName)(char *, int, CUdevice) = NULL;
-CUresult (*p_cuCtxCreate)(CUcontext *, unsigned int, CUdevice) = NULL;
+CUresult (*p_cuCtxCreate_v2)(CUcontext *, unsigned int, CUdevice) = NULL; /* bound in load_cuda_library()'s table to symbol "cuCtxCreate" at CUDA_VER_3_2 */
 CUresult (*p_cuDevicePrimaryCtxRetain)(CUcontext *, CUdevice) = NULL;
 CUresult (*p_cuCtxSetCurrent)(CUcontext) = NULL;
 CUresult (*p_cuCtxDestroy)(CUcontext) = NULL;
@@ -25,7 +25,7 @@ CUresult (*p_cuMemcpyDtoD)(CUdeviceptr, CUdeviceptr, size_t) = NULL;
 CUresult (*p_cuMemGetHandleForAddressRange)(void *, void *, size_t, CUmemRangeHandleType, unsigned int) = NULL;
 #endif
 CUresult (*p_cuDriverGetVersion)(int* driverVersion) = NULL;
-#if CUDA_VER >= 12000
+#if CUDA_VERSION >= 12000
 CUresult (*p_cuGetProcAddress)(const char* symbol, void** pfn, int  cudaVersion, uint64_t flags, CUdriverProcAddressQueryResult* symbolStatus) = NULL;
 #else
 CUresult (*p_cuGetProcAddress)(const char* symbol, void** pfn, int  cudaVersion, uint64_t flags) = NULL;
@@ -34,7 +34,7 @@ CUresult (*p_cuMemAllocManaged)(CUdeviceptr* dptr, size_t bytesize, unsigned int
 CUresult (*p_cuCtxSynchronize) (void) = NULL;
 
 int load_cuda_function(void **func_ptr, const char *func_name, int version) {
-    #if CUDA_VER >= 12000
+    #if CUDA_VERSION >= 12000
     CUresult res = p_cuGetProcAddress(func_name, func_ptr, version, 0, NULL);
     #else
     CUresult res = p_cuGetProcAddress(func_name, func_ptr, version, 0);
@@ -69,7 +69,8 @@ int load_cuda_library(void) {
         { (void**)&p_cuDeviceGet,                 "cuDeviceGet",                  CUDA_VER_2_0  },
         { (void**)&p_cuDeviceGetAttribute,        "cuDeviceGetAttribute",         CUDA_VER_2_0  },
         { (void**)&p_cuDeviceGetName,             "cuDeviceGetName",              CUDA_VER_2_0  },
-        { (void**)&p_cuCtxCreate,                 "cuCtxCreate",                  CUDA_VER_3_2  },
+        /* CUDA_VER_3_2 selects the cuCtxCreate_v2 ABI across CUDA 11-13. */
+        { (void**)&p_cuCtxCreate_v2,              "cuCtxCreate",                  CUDA_VER_3_2  },
         { (void**)&p_cuDevicePrimaryCtxRetain,    "cuDevicePrimaryCtxRetain",     CUDA_VER_7_0  },
         { (void**)&p_cuCtxSetCurrent,             "cuCtxSetCurrent",              CUDA_VER_4_0  },
         { (void**)&p_cuCtxDestroy,                "cuCtxDestroy",                 CUDA_VER_4_0  },
diff --git a/src/cuda_loader.h b/src/cuda_loader.h
index d0835ac1..e63642c8 100755
--- a/src/cuda_loader.h
+++ b/src/cuda_loader.h
@@ -30,7 +30,7 @@ extern CUresult (*p_cuDeviceGetCount)(int *);
 extern CUresult (*p_cuDeviceGet)(CUdevice *, int);
 extern CUresult (*p_cuDeviceGetAttribute)(int *, CUdevice_attribute, CUdevice);
 extern CUresult (*p_cuDeviceGetName)(char *, int, CUdevice);
-extern CUresult (*p_cuCtxCreate)(CUcontext *, unsigned int, CUdevice);
+extern CUresult (*p_cuCtxCreate_v2)(CUcontext *, unsigned int, CUdevice); /* defined in cuda_loader.c; called by init_gpu() in cuda_memory.c */
 extern CUresult (*p_cuDevicePrimaryCtxRetain)(CUcontext *, CUdevice);
 extern CUresult (*p_cuCtxSetCurrent)(CUcontext);
 extern CUresult (*p_cuCtxDestroy)(CUcontext);
@@ -47,7 +47,7 @@ extern CUresult (*p_cuMemGetHandleForAddressRange)(void *, void *, size_t, CUmem
 extern CUresult (*p_cuDriverGetVersion)(int* driverVersion);
 extern CUresult (*p_cuCtxSynchronize) (void);
 extern CUresult (*p_cuMemAllocManaged)(CUdeviceptr* dptr, size_t bytesize, unsigned int  flags);
-#if CUDA_VER >= 12000
+#if CUDA_VERSION >= 12000
 extern CUresult (*p_cuGetProcAddress)(const char* symbol, void** pfn, int  cudaVersion, uint64_t flags, CUdriverProcAddressQueryResult* symbolStatus);
 #else
 extern CUresult (*p_cuGetProcAddress)(const char* symbol, void** pfn, int  cudaVersion, uint64_t flags);
diff --git a/src/cuda_memory.c b/src/cuda_memory.c
index 92948c21..e24bff4f 100644
--- a/src/cuda_memory.c
+++ b/src/cuda_memory.c
@@ -94,9 +94,11 @@ static int init_gpu(struct cuda_memory_ctx *ctx)
 	printf("[pid = %d, dev = %d] device name = [%s]\n", getpid(), ctx->cuDevice, name);
 	printf("creating CUDA Ctx\n");
 
-	error = p_cuCtxCreate(&ctx->cuContext, CU_CTX_MAP_HOST, ctx->cuDevice);
+	/* Create the CUDA context on ctx->cuDevice with CU_CTX_MAP_HOST via the loaded cuCtxCreate_v2 pointer */
+	error = p_cuCtxCreate_v2(&ctx->cuContext, CU_CTX_MAP_HOST, ctx->cuDevice);
+
 	if (error != CUDA_SUCCESS) {
-		printf("cuCtxCreate() error=%d\n", error);
+		printf("cuCtxCreate_v2() error=%d\n", error);
 		return FAILURE;
 	}