Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion ggml/src/ggml-cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,10 @@ if (CUDAToolkit_FOUND)
#
# The default behavior for a non-native is to build virtual architectures as needed to cover all features needed
# for best performance and to also build real architectures for the most commonly used GPUs.
if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6" AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
if (CUDAToolkit_VERSION VERSION_LESS "11.0")
# CUDA 10.2: only compute 6.2 (Jetson TX2 / Pascal)
set(CMAKE_CUDA_ARCHITECTURES "62")
elseif (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6" AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
set(CMAKE_CUDA_ARCHITECTURES "native")
else()
if (CUDAToolkit_VERSION VERSION_LESS "13")
Expand Down Expand Up @@ -57,6 +60,12 @@ if (CUDAToolkit_FOUND)

enable_language(CUDA)

# CUDA 10.2 compat: force C++14 standard and enable if constexpr support
if (CUDAToolkit_VERSION VERSION_LESS "11.0")
set(CMAKE_CUDA_STANDARD 14)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
endif()

# TODO: Remove once CCCL 3.2 has been released and bundled with CUDA Toolkit
if (GGML_CUDA_CUB_3DOT2)
include(FetchContent)
Expand Down Expand Up @@ -195,6 +204,10 @@ if (CUDAToolkit_FOUND)

set(CUDA_FLAGS -use_fast_math -extended-lambda)

if (CUDAToolkit_VERSION VERSION_LESS "11.0")
list(APPEND CUDA_FLAGS --expt-relaxed-constexpr)
endif()

if (GGML_CUDA_DEBUG)
list(APPEND CUDA_FLAGS -lineinfo)
add_compile_definitions(GGML_CUDA_DEBUG)
Expand Down
6 changes: 4 additions & 2 deletions ggml/src/ggml-cuda/binbcast.cu
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,8 @@ static __global__ void k_bin_bcast(const src0_t * src0,

float result = src0_row ? (float) src0_row[i0*s00] : 0.0f;
if constexpr (sizeof...(src1_ptrs) > 0) {
result = (..., (result = bin_op(result, (float)src1s[i_src1 + i10*s10])));
int _dummy[] = { (result = bin_op(result, (float)src1s[i_src1 + i10*s10]), 0)... };
(void)_dummy;
} else {
result = bin_op(result, (float)src1[i_src1 + i10*s10]);
}
Expand Down Expand Up @@ -143,7 +144,8 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0,

float result = src0_row ? (float) src0_row[i0*s00] : 0.0f;
if constexpr (sizeof...(src1_ptrs) > 0) {
result = (..., (result = bin_op(result, (float)src1s[i_src1 + i10*s10])));
int _dummy[] = { (result = bin_op(result, (float)src1s[i_src1 + i10*s10]), 0)... };
(void)_dummy;
} else {
result = bin_op(result, (float)src1[i_src1 + i10*s10]);
}
Expand Down
43 changes: 32 additions & 11 deletions ggml/src/ggml-cuda/common.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -546,7 +546,20 @@ template<block_reduce_method method_t, typename T>
struct block_reduce_policy;

// True iff T is the same type as at least one of Ts... .
// Implemented with classic recursive partial specialization (no C++17 fold
// expressions) so the header also builds as C++14 for CUDA 10.2.
template <typename T, typename... Ts>
struct is_any_impl;

// Base case: empty pack — T matches nothing.
template <typename T>
struct is_any_impl<T> {
    static constexpr bool value = false;
};

// Recursive case: T matches First, or some type in Rest... .
template <typename T, typename First, typename... Rest>
struct is_any_impl<T, First, Rest...> {
    static constexpr bool value = std::is_same<T, First>::value || is_any_impl<T, Rest...>::value;
};

// Plain constexpr rather than "inline constexpr": inline variables are a C++17
// feature and this header must compile as C++14 for CUDA 10.2. A variable
// template does not need `inline` — each instantiation already has
// multiple-definition-tolerant (template) linkage.
template <typename T, typename... Ts>
constexpr bool is_any = is_any_impl<T, Ts...>::value;

// Always-false helper for static_assert in discarded if-constexpr branches
// (a dependent false avoids the assert firing eagerly at parse time).
// Plain constexpr rather than "inline constexpr": inline variables are C++17,
// and this header must also build as C++14 for CUDA 10.2; a variable template
// needs no `inline` for header use.
template<typename...>
constexpr bool ggml_cuda_dependent_false_v = false;
Expand All @@ -561,13 +574,13 @@ template <typename T> struct block_reduce_policy<block_reduce_method::SUM, T> {
}

static __device__ T sentinel() {
if constexpr (std::is_same_v<T, float>) {
if constexpr (std::is_same<T, float>::value) {
return 0.0f;
} else if constexpr (std::is_same_v<T, float2>) {
} else if constexpr (std::is_same<T, float2>::value) {
return make_float2(0.0f, 0.0f);
} else if constexpr (std::is_same_v<T, half2>) {
} else if constexpr (std::is_same<T, half2>::value) {
return make_half2(0.0f, 0.0f);
} else if constexpr (std::is_same_v<T, int>) {
} else if constexpr (std::is_same<T, int>::value) {
return 0;
} else {
static_assert(ggml_cuda_dependent_false_v<T>, "Unsupported type for block reduce sum");
Expand All @@ -585,9 +598,9 @@ template <typename T> struct block_reduce_policy<block_reduce_method::MAX, T> {
}

static __device__ T sentinel() {
if constexpr (std::is_same_v<T, float>) {
if constexpr (std::is_same<T, float>::value) {
return -INFINITY;
} else if constexpr (std::is_same_v<T, half2>) {
} else if constexpr (std::is_same<T, half2>::value) {
return make_half2(-INFINITY, -INFINITY);
} else {
static_assert(ggml_cuda_dependent_false_v<T>, "Unsupported type for block reduce max");
Expand Down Expand Up @@ -1252,7 +1265,9 @@ struct ggml_cuda_concurrent_event {
const int64_t join_start = (int64_t) join_t->data;
const int64_t join_end = join_start + ggml_nbytes(join_t);

for (const auto & [tensor, stream] : stream_mapping) {
for (const auto & _kv : stream_mapping) {
const auto & tensor = _kv.first;
const auto & stream = _kv.second;
const ggml_tensor * t = tensor->view_src ? tensor->view_src : tensor;
const int64_t t_start = (int64_t) t->data;
const int64_t t_end = t_start + ggml_nbytes(t);
Expand All @@ -1273,7 +1288,9 @@ struct ggml_cuda_concurrent_event {

bool writes_overlap = false;
bool dependent_srcs = false;
for (const auto & [tensor, stream] : stream_mapping) {
for (const auto & _kv : stream_mapping) {
const auto & tensor = _kv.first;
const auto & stream = _kv.second;
const ggml_tensor * t = tensor->view_src ? tensor->view_src : tensor;
const int64_t t_start = (int64_t) t->data;
const int64_t t_end = t_start + ggml_nbytes(t);
Expand Down Expand Up @@ -1379,7 +1396,9 @@ struct ggml_backend_cuda_context {
// Check if any CUDA graph is enabled for this context (used by kernels that need to know
// if graphs are in use without having access to the specific graph key)
bool any_cuda_graph_enabled() const {
for (const auto & [key, graph] : cuda_graphs) {
for (const auto & _kv : cuda_graphs) {
const auto & key = _kv.first;
const auto & graph = _kv.second;
if (graph && graph->is_enabled()) {
return true;
}
Expand All @@ -1389,7 +1408,9 @@ struct ggml_backend_cuda_context {

// Check if any CUDA graph has an instance for this context
bool any_cuda_graph_has_instance() const {
for (const auto & [key, graph] : cuda_graphs) {
for (const auto & _kv : cuda_graphs) {
const auto & key = _kv.first;
const auto & graph = _kv.second;
if (graph && graph->instance != nullptr) {
return true;
}
Expand Down
57 changes: 57 additions & 0 deletions ggml/src/ggml-cuda/compat-cuda10.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#pragma once

// Compatibility polyfills for CUDA 10.2 (Jetson TX2)
// bf16 types are mapped to fp16 since compute 6.2 has no hardware bf16 support.

#include <cuda_fp16.h>

// bf16 type polyfills (mapped to fp16 for compute 6.2)
// NOTE(review): this typedef makes nv_bfloat16 the *same type* as __half, so any
// overload set or template dispatch that distinguishes "half vs bf16" collapses
// to a single case when this header is in use — confirm callers tolerate that.
typedef __half nv_bfloat16;

// Two-element bf16 vector mirroring CUDA's nv_bfloat162 field layout
// (x = low element, y = high element; see __low2bfloat16/__high2bfloat16 below).
// Deliberately a plain aggregate — no constructors — so brace initialization
// such as `return {a, b};` in callers keeps working under C++14.
struct nv_bfloat162 {
    nv_bfloat16 x;
    nv_bfloat16 y;
};

// Convert float -> bf16. On this target bf16 is backed by fp16 (see header
// note), so this is simply the fp16 conversion.
static __host__ __device__ __forceinline__ nv_bfloat16 __float2bfloat16(float f) {
    const nv_bfloat16 h = __float2half(f);
    return h;
}

// Convert bf16 -> float. Since bf16 is fp16-backed here, delegate to the
// fp16 widening conversion.
static __host__ __device__ __forceinline__ float __bfloat162float(nv_bfloat16 h) {
    const float f = __half2float(h);
    return f;
}

// Pack two bf16 scalars into a pair (a -> x/low, b -> y/high).
static __host__ __device__ __forceinline__ nv_bfloat162 make_bfloat162(nv_bfloat16 a, nv_bfloat16 b) {
    nv_bfloat162 packed = { a, b };
    return packed;
}

// Convert a float2 to a bf16 pair, converting each lane independently.
static __host__ __device__ __forceinline__ nv_bfloat162 __float22bfloat162_rn(float2 f) {
    const nv_bfloat16 lo = __float2bfloat16(f.x);
    const nv_bfloat16 hi = __float2bfloat16(f.y);
    return make_bfloat162(lo, hi);
}

// Widen a bf16 pair to a float2, converting each lane independently.
static __host__ __device__ __forceinline__ float2 __bfloat1622float2(nv_bfloat162 h) {
    const float lo = __bfloat162float(h.x);
    const float hi = __bfloat162float(h.y);
    return make_float2(lo, hi);
}

// Extract the low (x) element of a bf16 pair.
static __host__ __device__ __forceinline__ nv_bfloat16 __low2bfloat16(nv_bfloat162 h) {
    return h.x;
}

// Extract the high (y) element of a bf16 pair.
static __host__ __device__ __forceinline__ nv_bfloat16 __high2bfloat16(nv_bfloat162 h) {
    return h.y;
}

// Pack two bf16 scalars into a pair (a -> x/low, b -> y/high).
// Delegates to make_bfloat162 instead of duplicating its field-by-field copy,
// so the packing logic lives in exactly one place.
static __host__ __device__ __forceinline__ nv_bfloat162 __halves2bfloat162(nv_bfloat16 a, nv_bfloat16 b) {
    return make_bfloat162(a, b);
}

// CUDA_R_16BF cublas data type (not defined in CUDA 10.2)
// This fp16 mapping is consistent only because nv_bfloat16 itself is typedef'd
// to __half above — buffers tagged CUDA_R_16BF genuinely contain fp16 data here.
// NOTE(review): if the bf16 polyfill ever becomes a real bf16 representation,
// this define must change with it or cuBLAS will misread the buffers.
#ifndef CUDA_R_16BF
#define CUDA_R_16BF CUDA_R_16F
#endif
14 changes: 7 additions & 7 deletions ggml/src/ggml-cuda/convert.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,15 @@ to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type);

template<typename dst_t, typename src_t>
__host__ __device__ inline dst_t ggml_cuda_cast(src_t x) {
if constexpr (std::is_same_v<dst_t, src_t>) {
if constexpr (std::is_same<dst_t, src_t>::value) {
return x;
} else if constexpr(std::is_same_v<dst_t, nv_bfloat16>) {
} else if constexpr(std::is_same<dst_t, nv_bfloat16>::value) {
return __float2bfloat16(float(x));
} else if constexpr(std::is_same_v<src_t, nv_bfloat16>) {
} else if constexpr(std::is_same<src_t, nv_bfloat16>::value) {
return __bfloat162float(x);
} else if constexpr(std::is_same_v<src_t, float2> && std::is_same_v<dst_t, half2>) {
} else if constexpr(std::is_same<src_t, float2>::value && std::is_same<dst_t, half2>::value) {
return __float22half2_rn(x);
} else if constexpr(std::is_same_v<src_t, nv_bfloat162> && std::is_same_v<dst_t, float2>) {
} else if constexpr(std::is_same<src_t, nv_bfloat162>::value && std::is_same<dst_t, float2>::value) {
#ifdef GGML_USE_HIP
return make_float2(__bfloat162float(__low2bfloat16(x)), __bfloat162float(__high2bfloat16(x)));
#else
Expand All @@ -51,14 +51,14 @@ template<typename dst_t, typename src_t>
return make_float2(__bfloat162float(x.x), __bfloat162float(x.y));
#endif // __CUDA_ARCH__ >= 800
#endif // GGML_USE_HIP
} else if constexpr(std::is_same_v<src_t, float2> && std::is_same_v<dst_t, nv_bfloat162>) {
} else if constexpr(std::is_same<src_t, float2>::value && std::is_same<dst_t, nv_bfloat162>::value) {
// bypass compile error on cuda 12.0.1
#ifdef GGML_USE_HIP
return __float22bfloat162_rn(x);
#else
return {x.x, x.y};
#endif // GGML_USE_HIP
} else if constexpr(std::is_same_v<dst_t, int32_t>) {
} else if constexpr(std::is_same<dst_t, int32_t>::value) {
return int32_t(x);
} else {
return float(x);
Expand Down
Loading