Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion ggml/src/ggml-cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,10 @@ if (CUDAToolkit_FOUND)
#
# The default behavior for a non-native is to build virtual architectures as needed to cover all features needed
# for best performance and to also build real architectures for the most commonly used GPUs.
if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6" AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
if (CUDAToolkit_VERSION VERSION_LESS "11.0")
# CUDA 10.2: only compute 6.2 (Jetson TX2 / Pascal)
set(CMAKE_CUDA_ARCHITECTURES "62")
elseif (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6" AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
set(CMAKE_CUDA_ARCHITECTURES "native")
else()
if (CUDAToolkit_VERSION VERSION_LESS "13")
Expand Down Expand Up @@ -57,6 +60,12 @@ if (CUDAToolkit_FOUND)

enable_language(CUDA)

# CUDA 10.2 compat: force C++14 standard and enable if constexpr support
if (CUDAToolkit_VERSION VERSION_LESS "11.0")
set(CMAKE_CUDA_STANDARD 14)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
endif()

# TODO: Remove once CCCL 3.2 has been released and bundled with CUDA Toolkit
if (GGML_CUDA_CUB_3DOT2)
include(FetchContent)
Expand Down Expand Up @@ -195,6 +204,10 @@ if (CUDAToolkit_FOUND)

set(CUDA_FLAGS -use_fast_math -extended-lambda)

if (CUDAToolkit_VERSION VERSION_LESS "11.0")
list(APPEND CUDA_FLAGS --expt-relaxed-constexpr)
endif()

if (GGML_CUDA_DEBUG)
list(APPEND CUDA_FLAGS -lineinfo)
add_compile_definitions(GGML_CUDA_DEBUG)
Expand Down
6 changes: 4 additions & 2 deletions ggml/src/ggml-cuda/binbcast.cu
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,8 @@ static __global__ void k_bin_bcast(const src0_t * src0,

float result = src0_row ? (float) src0_row[i0*s00] : 0.0f;
if constexpr (sizeof...(src1_ptrs) > 0) {
result = (..., (result = bin_op(result, (float)src1s[i_src1 + i10*s10])));
int _dummy[] = { (result = bin_op(result, (float)src1s[i_src1 + i10*s10]), 0)... };
(void)_dummy;
} else {
result = bin_op(result, (float)src1[i_src1 + i10*s10]);
}
Expand Down Expand Up @@ -143,7 +144,8 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0,

float result = src0_row ? (float) src0_row[i0*s00] : 0.0f;
if constexpr (sizeof...(src1_ptrs) > 0) {
result = (..., (result = bin_op(result, (float)src1s[i_src1 + i10*s10])));
int _dummy[] = { (result = bin_op(result, (float)src1s[i_src1 + i10*s10]), 0)... };
(void)_dummy;
} else {
result = bin_op(result, (float)src1[i_src1 + i10*s10]);
}
Expand Down
43 changes: 32 additions & 11 deletions ggml/src/ggml-cuda/common.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -546,7 +546,20 @@ template<block_reduce_method method_t, typename T>
struct block_reduce_policy;

// True iff T is the same type as at least one of Ts... .
// Implemented with classic recursive partial specialization (no C++17 fold
// expressions) so the header also builds as C++14 for CUDA 10.2.
template <typename T, typename... Ts>
struct is_any_impl;

// Base case: empty pack — T matches nothing.
template <typename T>
struct is_any_impl<T> {
    static constexpr bool value = false;
};

// Recursive case: T matches First, or some type in Rest... .
template <typename T, typename First, typename... Rest>
struct is_any_impl<T, First, Rest...> {
    static constexpr bool value = std::is_same<T, First>::value || is_any_impl<T, Rest...>::value;
};

// Plain constexpr rather than "inline constexpr": inline variables are a C++17
// feature and this header must compile as C++14 for CUDA 10.2. A variable
// template does not need `inline` — each instantiation already has
// multiple-definition-tolerant (template) linkage.
template <typename T, typename... Ts>
constexpr bool is_any = is_any_impl<T, Ts...>::value;

// Always-false helper for static_assert in discarded if-constexpr branches
// (a dependent false avoids the assert firing eagerly at parse time).
// Plain constexpr rather than "inline constexpr": inline variables are C++17,
// and this header must also build as C++14 for CUDA 10.2; a variable template
// needs no `inline` for header use.
template<typename...>
constexpr bool ggml_cuda_dependent_false_v = false;
Expand All @@ -561,13 +574,13 @@ template <typename T> struct block_reduce_policy<block_reduce_method::SUM, T> {
}

static __device__ T sentinel() {
if constexpr (std::is_same_v<T, float>) {
if constexpr (std::is_same<T, float>::value) {
return 0.0f;
} else if constexpr (std::is_same_v<T, float2>) {
} else if constexpr (std::is_same<T, float2>::value) {
return make_float2(0.0f, 0.0f);
} else if constexpr (std::is_same_v<T, half2>) {
} else if constexpr (std::is_same<T, half2>::value) {
return make_half2(0.0f, 0.0f);
} else if constexpr (std::is_same_v<T, int>) {
} else if constexpr (std::is_same<T, int>::value) {
return 0;
} else {
static_assert(ggml_cuda_dependent_false_v<T>, "Unsupported type for block reduce sum");
Expand All @@ -585,9 +598,9 @@ template <typename T> struct block_reduce_policy<block_reduce_method::MAX, T> {
}

static __device__ T sentinel() {
if constexpr (std::is_same_v<T, float>) {
if constexpr (std::is_same<T, float>::value) {
return -INFINITY;
} else if constexpr (std::is_same_v<T, half2>) {
} else if constexpr (std::is_same<T, half2>::value) {
return make_half2(-INFINITY, -INFINITY);
} else {
static_assert(ggml_cuda_dependent_false_v<T>, "Unsupported type for block reduce max");
Expand Down Expand Up @@ -1252,7 +1265,9 @@ struct ggml_cuda_concurrent_event {
const int64_t join_start = (int64_t) join_t->data;
const int64_t join_end = join_start + ggml_nbytes(join_t);

for (const auto & [tensor, stream] : stream_mapping) {
for (const auto & _kv : stream_mapping) {
const auto & tensor = _kv.first;
const auto & stream = _kv.second;
const ggml_tensor * t = tensor->view_src ? tensor->view_src : tensor;
const int64_t t_start = (int64_t) t->data;
const int64_t t_end = t_start + ggml_nbytes(t);
Expand All @@ -1273,7 +1288,9 @@ struct ggml_cuda_concurrent_event {

bool writes_overlap = false;
bool dependent_srcs = false;
for (const auto & [tensor, stream] : stream_mapping) {
for (const auto & _kv : stream_mapping) {
const auto & tensor = _kv.first;
const auto & stream = _kv.second;
const ggml_tensor * t = tensor->view_src ? tensor->view_src : tensor;
const int64_t t_start = (int64_t) t->data;
const int64_t t_end = t_start + ggml_nbytes(t);
Expand Down Expand Up @@ -1379,7 +1396,9 @@ struct ggml_backend_cuda_context {
// Check if any CUDA graph is enabled for this context (used by kernels that need to know
// if graphs are in use without having access to the specific graph key)
bool any_cuda_graph_enabled() const {
for (const auto & [key, graph] : cuda_graphs) {
for (const auto & _kv : cuda_graphs) {
const auto & key = _kv.first;
const auto & graph = _kv.second;
if (graph && graph->is_enabled()) {
return true;
}
Expand All @@ -1389,7 +1408,9 @@ struct ggml_backend_cuda_context {

// Check if any CUDA graph has an instance for this context
bool any_cuda_graph_has_instance() const {
for (const auto & [key, graph] : cuda_graphs) {
for (const auto & _kv : cuda_graphs) {
const auto & key = _kv.first;
const auto & graph = _kv.second;
if (graph && graph->instance != nullptr) {
return true;
}
Expand Down
57 changes: 57 additions & 0 deletions ggml/src/ggml-cuda/compat-cuda10.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#pragma once

// Compatibility polyfills for CUDA 10.2 (Jetson TX2)
// bf16 types are mapped to fp16 since compute 6.2 has no hardware bf16 support.

#include <cuda_fp16.h>

// bf16 type polyfills (mapped to fp16 for compute 6.2)
// NOTE(review): this typedef makes nv_bfloat16 the *same type* as __half, so any
// overload set or template dispatch that distinguishes "half vs bf16" collapses
// to a single case when this header is in use — confirm callers tolerate that.
typedef __half nv_bfloat16;

// Two-element bf16 vector mirroring CUDA's nv_bfloat162 field layout
// (x = low element, y = high element; see __low2bfloat16/__high2bfloat16 below).
// Deliberately a plain aggregate — no constructors — so brace initialization
// such as `return {a, b};` in callers keeps working under C++14.
struct nv_bfloat162 {
    nv_bfloat16 x;
    nv_bfloat16 y;
};

// Convert float -> bf16. On this target bf16 is backed by fp16 (see header
// note), so this is simply the fp16 conversion.
static __host__ __device__ __forceinline__ nv_bfloat16 __float2bfloat16(float f) {
    const nv_bfloat16 h = __float2half(f);
    return h;
}

// Convert bf16 -> float. Since bf16 is fp16-backed here, delegate to the
// fp16 widening conversion.
static __host__ __device__ __forceinline__ float __bfloat162float(nv_bfloat16 h) {
    const float f = __half2float(h);
    return f;
}

// Pack two bf16 scalars into a pair (a -> x/low, b -> y/high).
static __host__ __device__ __forceinline__ nv_bfloat162 make_bfloat162(nv_bfloat16 a, nv_bfloat16 b) {
    nv_bfloat162 packed = { a, b };
    return packed;
}

// Convert a float2 to a bf16 pair, converting each lane independently.
static __host__ __device__ __forceinline__ nv_bfloat162 __float22bfloat162_rn(float2 f) {
    const nv_bfloat16 lo = __float2bfloat16(f.x);
    const nv_bfloat16 hi = __float2bfloat16(f.y);
    return make_bfloat162(lo, hi);
}

// Widen a bf16 pair to a float2, converting each lane independently.
static __host__ __device__ __forceinline__ float2 __bfloat1622float2(nv_bfloat162 h) {
    const float lo = __bfloat162float(h.x);
    const float hi = __bfloat162float(h.y);
    return make_float2(lo, hi);
}

// Extract the low (x) element of a bf16 pair.
static __host__ __device__ __forceinline__ nv_bfloat16 __low2bfloat16(nv_bfloat162 h) {
    return h.x;
}

// Extract the high (y) element of a bf16 pair.
static __host__ __device__ __forceinline__ nv_bfloat16 __high2bfloat16(nv_bfloat162 h) {
    return h.y;
}

// Pack two bf16 scalars into a pair (a -> x/low, b -> y/high).
// Delegates to make_bfloat162 instead of duplicating its field-by-field copy,
// so the packing logic lives in exactly one place.
static __host__ __device__ __forceinline__ nv_bfloat162 __halves2bfloat162(nv_bfloat16 a, nv_bfloat16 b) {
    return make_bfloat162(a, b);
}

// CUDA_R_16BF cublas data type (not defined in CUDA 10.2)
// This fp16 mapping is consistent only because nv_bfloat16 itself is typedef'd
// to __half above — buffers tagged CUDA_R_16BF genuinely contain fp16 data here.
// NOTE(review): if the bf16 polyfill ever becomes a real bf16 representation,
// this define must change with it or cuBLAS will misread the buffers.
#ifndef CUDA_R_16BF
#define CUDA_R_16BF CUDA_R_16F
#endif
14 changes: 7 additions & 7 deletions ggml/src/ggml-cuda/convert.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,15 @@ to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type);

template<typename dst_t, typename src_t>
__host__ __device__ inline dst_t ggml_cuda_cast(src_t x) {
if constexpr (std::is_same_v<dst_t, src_t>) {
if constexpr (std::is_same<dst_t, src_t>::value) {
return x;
} else if constexpr(std::is_same_v<dst_t, nv_bfloat16>) {
} else if constexpr(std::is_same<dst_t, nv_bfloat16>::value) {
return __float2bfloat16(float(x));
} else if constexpr(std::is_same_v<src_t, nv_bfloat16>) {
} else if constexpr(std::is_same<src_t, nv_bfloat16>::value) {
return __bfloat162float(x);
} else if constexpr(std::is_same_v<src_t, float2> && std::is_same_v<dst_t, half2>) {
} else if constexpr(std::is_same<src_t, float2>::value && std::is_same<dst_t, half2>::value) {
return __float22half2_rn(x);
} else if constexpr(std::is_same_v<src_t, nv_bfloat162> && std::is_same_v<dst_t, float2>) {
} else if constexpr(std::is_same<src_t, nv_bfloat162>::value && std::is_same<dst_t, float2>::value) {
#ifdef GGML_USE_HIP
return make_float2(__bfloat162float(__low2bfloat16(x)), __bfloat162float(__high2bfloat16(x)));
#else
Expand All @@ -51,14 +51,14 @@ template<typename dst_t, typename src_t>
return make_float2(__bfloat162float(x.x), __bfloat162float(x.y));
#endif // __CUDA_ARCH__ >= 800
#endif // GGML_USE_HIP
} else if constexpr(std::is_same_v<src_t, float2> && std::is_same_v<dst_t, nv_bfloat162>) {
} else if constexpr(std::is_same<src_t, float2>::value && std::is_same<dst_t, nv_bfloat162>::value) {
// bypass compile error on cuda 12.0.1
#ifdef GGML_USE_HIP
return __float22bfloat162_rn(x);
#else
return {x.x, x.y};
#endif // GGML_USE_HIP
} else if constexpr(std::is_same_v<dst_t, int32_t>) {
} else if constexpr(std::is_same<dst_t, int32_t>::value) {
return int32_t(x);
} else {
return float(x);
Expand Down
Loading