Skip to content
Open
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
7f23c75
remove fixed gpu id & numa id assignment
WenqingLan1 Dec 18, 2025
d63fe8c
use 128bit alignment, add float support, cleanup
WenqingLan1 Dec 18, 2025
242714e
add data_type arg
WenqingLan1 Dec 19, 2025
e8d0282
fix lint
WenqingLan1 Dec 19, 2025
5a18946
fix clang lint
WenqingLan1 Dec 19, 2025
fddf56e
update doc
WenqingLan1 Dec 20, 2025
3c359a3
Merge branch 'main' into wenqinglan/refine-gpu-stream
WenqingLan1 Dec 22, 2025
e445363
Merge branch 'microsoft:main' into wenqinglan/refine-gpu-stream
WenqingLan1 Feb 3, 2026
60b130c
Merge branch 'microsoft:main' into wenqinglan/refine-gpu-stream
WenqingLan1 Feb 6, 2026
f31933f
fix alloc count & comment
WenqingLan1 Feb 6, 2026
d8a91ab
fix: reset gpu-burn submodule to correct commit
WenqingLan1 Feb 6, 2026
2dfa122
Merge branch 'microsoft:main' into wenqinglan/refine-gpu-stream
WenqingLan1 Apr 8, 2026
6dfdaa6
resolve comments
WenqingLan1 Apr 9, 2026
e3232f5
fix lint
WenqingLan1 Apr 9, 2026
58fead3
resolve comment
WenqingLan1 Apr 9, 2026
620a9fa
Merge remote-tracking branch 'origin/main' into wenqinglan/refine-gpu…
WenqingLan1 Apr 22, 2026
5cec42c
Merge branch 'microsoft:main' into wenqinglan/refine-gpu-stream
WenqingLan1 May 13, 2026
8c51d2f
Merge branch 'microsoft:main' into wenqinglan/refine-gpu-stream
WenqingLan1 May 20, 2026
01c7454
resolve comments
WenqingLan1 May 20, 2026
450a28d
refine doc
WenqingLan1 May 20, 2026
9cffad8
fix lint
WenqingLan1 May 21, 2026
fe00d1a
resolve comment
WenqingLan1 May 21, 2026
ea3fd8e
fix cuda11.1 build
WenqingLan1 May 21, 2026
2b6ea7e
fix doc
WenqingLan1 May 21, 2026
0fd405c
resolve comments
WenqingLan1 May 21, 2026
4173759
fix syntax
WenqingLan1 May 21, 2026
ee52086
fix lint
WenqingLan1 May 21, 2026
80d5f0a
resolve comment
WenqingLan1 May 21, 2026
ff9e254
Merge branch 'main' into wenqinglan/refine-gpu-stream
polarG May 22, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions docs/user-tutorial/benchmarks/micro-benchmarks.md
Original file line number Diff line number Diff line change
Expand Up @@ -273,14 +273,14 @@ Measure the memory bandwidth of GPU using the STREAM benchmark. The benchmark te

| Metric Name | Unit | Description |
|------------------------------------------------------------|------------------|-----------------------------------------------------------------------------------------------------------------------------------------|
| STREAM\_COPY\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The fp64 memory bandwidth of the GPU for the copy operation with specified buffer size and block size. |
| STREAM\_SCALE\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The fp64 memory bandwidth of the GPU for the scale operation with specified buffer size and block size. |
| STREAM\_ADD\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The fp64 memory bandwidth of the GPU for the add operation with specified buffer size and block size. |
| STREAM\_TRIAD\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The fp64 memory bandwidth of the GPU for the triad operation with specified buffer size and block size. |
| STREAM\_COPY\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The fp64 memory bandwidth efficiency of the GPU for the copy operation with specified buffer size and block size. |
| STREAM\_SCALE\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The fp64 memory bandwidth efficiency of the GPU for the scale operation with specified buffer size and block size. |
| STREAM\_ADD\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The fp64 memory bandwidth efficiency of the GPU for the add operation with specified buffer size and block size. |
| STREAM\_TRIAD\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The fp64 memory bandwidth efficiency of the GPU for the triad operation with specified buffer size and block size. |
| STREAM\_COPY\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The fp64 or fp32 memory bandwidth of the GPU for the copy operation with the specified buffer size and block size. |
| STREAM\_SCALE\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The fp64 or fp32 memory bandwidth of the GPU for the scale operation with the specified buffer size and block size. |
| STREAM\_ADD\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The fp64 or fp32 memory bandwidth of the GPU for the add operation with the specified buffer size and block size. |
| STREAM\_TRIAD\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The fp64 or fp32 memory bandwidth of the GPU for the triad operation with the specified buffer size and block size. |
| STREAM\_COPY\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The fp64 or fp32 memory bandwidth efficiency of the GPU for the copy operation with the specified buffer size and block size. |
| STREAM\_SCALE\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The fp64 or fp32 memory bandwidth efficiency of the GPU for the scale operation with the specified buffer size and block size. |
| STREAM\_ADD\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The fp64 or fp32 memory bandwidth efficiency of the GPU for the add operation with the specified buffer size and block size. |
| STREAM\_TRIAD\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The fp64 or fp32 memory bandwidth efficiency of the GPU for the triad operation with the specified buffer size and block size. |
Comment thread
WenqingLan1 marked this conversation as resolved.
Outdated

### `ib-loopback`

Expand Down
2 changes: 1 addition & 1 deletion examples/benchmarks/gpu_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

if __name__ == '__main__':
context = BenchmarkRegistry.create_benchmark_context(
'gpu-stream', platform=Platform.CUDA, parameters='--num_warm_up 1 --num_loops 10'
'gpu-stream', platform=Platform.CUDA, parameters='--num_warm_up 1 --num_loops 10 --data_type double'
)
# For ROCm environment, please specify the benchmark name and the platform as the following.
# context = BenchmarkRegistry.create_benchmark_context(
Expand Down
13 changes: 11 additions & 2 deletions superbench/benchmarks/micro_benchmarks/gpu_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,15 @@ def add_parser_arguments(self):
help='Number of data buffer copies performed.',
)

self._parser.add_argument(
'--data_type',
type=str,
default='double',
choices=['float', 'double'],
required=False,
help='Data type of the buffer elements.',
)

self._parser.add_argument(
'--check_data',
action='store_true',
Expand All @@ -68,8 +77,8 @@ def _preprocess(self):

self.__bin_path = os.path.join(self._args.bin_dir, self._bin_name)

args = '--size %d --num_warm_up %d --num_loops %d ' % (
self._args.size, self._args.num_warm_up, self._args.num_loops
args = '--size %d --num_warm_up %d --num_loops %d --data_type %s' % (
self._args.size, self._args.num_warm_up, self._args.num_loops, self._args.data_type
)

Comment thread
WenqingLan1 marked this conversation as resolved.
if self._args.check_data:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ find_package(CUDAToolkit QUIET)

# Source files
set(SOURCES
gpu_stream_test.cpp
gpu_stream_main.cpp
gpu_stream_utils.cpp
gpu_stream.cu
gpu_stream_kernels.cu
Expand Down
89 changes: 45 additions & 44 deletions superbench/benchmarks/micro_benchmarks/gpu_stream/gpu_stream.cu
Original file line number Diff line number Diff line change
Expand Up @@ -235,15 +235,15 @@ template <typename T> int GpuStream::PrepareBufAndStream(std::unique_ptr<BenchAr
cudaError_t cuda_err = cudaSuccess;

if (args->check_data) {
Comment thread
WenqingLan1 marked this conversation as resolved.
// Generate data to copy
args->sub.data_buf = static_cast<T *>(numa_alloc_onnode(args->size * sizeof(T), args->numa_id));
// Generate data to copy - use local NUMA node for best CPU access
args->sub.data_buf = static_cast<T *>(numa_alloc_local(args->size * sizeof(T)));

for (int j = 0; j < args->size / sizeof(T); j++) {
Comment thread
WenqingLan1 marked this conversation as resolved.
Outdated
args->sub.data_buf[j] = static_cast<T>(j % kUInt8Mod);
}

// Allocate check buffer
args->sub.check_buf = static_cast<T *>(numa_alloc_onnode(args->size * sizeof(T), args->numa_id));
// Allocate check buffer on local NUMA node
args->sub.check_buf = static_cast<T *>(numa_alloc_local(args->size * sizeof(T)));
Comment thread
WenqingLan1 marked this conversation as resolved.
Outdated
}

// Allocate buffers
Expand Down Expand Up @@ -420,8 +420,10 @@ int GpuStream::RunStreamKernel(std::unique_ptr<BenchArgs<T>> &args, Kernel kerne
int size_factor = 2;

// Validate data size
uint64_t num_elements_in_thread_block = kNumLoopUnroll * num_threads_per_block;
uint64_t num_bytes_in_thread_block = num_elements_in_thread_block * sizeof(T);
// Each thread processes 128 bits (16 bytes) for optimal memory bandwidth.
// For double: uses double2 (16 bytes). For float: would use float4 (16 bytes).
constexpr uint64_t kBytesPerThread = 16; // 128-bit aligned access
Comment thread
WenqingLan1 marked this conversation as resolved.
Outdated
uint64_t num_bytes_in_thread_block = num_threads_per_block * kBytesPerThread;
Comment thread
WenqingLan1 marked this conversation as resolved.
if (args->size % num_bytes_in_thread_block) {
std::cerr << "RunCopy: Data size should be multiple of " << num_bytes_in_thread_block << std::endl;
Comment thread
WenqingLan1 marked this conversation as resolved.
Outdated
return -1;
Expand All @@ -448,30 +450,30 @@ int GpuStream::RunStreamKernel(std::unique_ptr<BenchArgs<T>> &args, Kernel kerne

switch (kernel) {
case Kernel::kCopy:
CopyKernel<<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[2].get()),
reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[0].get()));
CopyKernel<T><<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
reinterpret_cast<VecT<T> *>(args->sub.gpu_buf_ptrs[2].get()),
reinterpret_cast<const VecT<T> *>(args->sub.gpu_buf_ptrs[0].get()));
args->sub.kernel_name = "COPY";
break;
case Kernel::kScale:
ScaleKernel<<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[2].get()),
reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[0].get()), static_cast<T>(scalar));
ScaleKernel<T><<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
reinterpret_cast<VecT<T> *>(args->sub.gpu_buf_ptrs[2].get()),
reinterpret_cast<const VecT<T> *>(args->sub.gpu_buf_ptrs[0].get()), static_cast<T>(scalar));
args->sub.kernel_name = "SCALE";
break;
case Kernel::kAdd:
AddKernel<<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[2].get()),
reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[0].get()),
reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[1].get()));
AddKernel<T><<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
reinterpret_cast<VecT<T> *>(args->sub.gpu_buf_ptrs[2].get()),
reinterpret_cast<const VecT<T> *>(args->sub.gpu_buf_ptrs[0].get()),
reinterpret_cast<const VecT<T> *>(args->sub.gpu_buf_ptrs[1].get()));
size_factor = 3;
args->sub.kernel_name = "ADD";
break;
case Kernel::kTriad:
TriadKernel<<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[2].get()),
reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[0].get()),
reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[1].get()), static_cast<T>(scalar));
TriadKernel<T><<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
reinterpret_cast<VecT<T> *>(args->sub.gpu_buf_ptrs[2].get()),
reinterpret_cast<const VecT<T> *>(args->sub.gpu_buf_ptrs[0].get()),
reinterpret_cast<const VecT<T> *>(args->sub.gpu_buf_ptrs[1].get()), static_cast<T>(scalar));
size_factor = 3;
args->sub.kernel_name = "TRIAD";
break;
Expand Down Expand Up @@ -583,10 +585,9 @@ int GpuStream::RunStream(std::unique_ptr<BenchArgs<T>> &args, const std::string

// output formatted results to stdout
// Tags are of format:
// STREAM_<Kernelname>_datatype_gpu_<gpu_id>_buffer_<buffer_size>_block_<block_size>
// STREAM_<Kernelname>_<datatype>_buffer_<buffer_size>_block_<block_size>
for (int i = 0; i < args->sub.times_in_ms.size(); i++) {
std::string tag = "STREAM_" + KernelToString(i) + "_" + data_type + "_gpu_" + std::to_string(args->gpu_id) +
"_buffer_" + std::to_string(args->size);
std::string tag = "STREAM_" + KernelToString(i) + "_" + data_type + "_buffer_" + std::to_string(args->size);
Comment thread
WenqingLan1 marked this conversation as resolved.
for (int j = 0; j < args->sub.times_in_ms[i].size(); j++) {
// Calculate and display bandwidth
double bw = args->size * args->num_loops / args->sub.times_in_ms[i][j] / 1e6;
Expand All @@ -608,9 +609,9 @@ int GpuStream::RunStream(std::unique_ptr<BenchArgs<T>> &args, const std::string
/**
* @brief Runs the Stream benchmark.
*
* @details This function processes the input args, validates and composes the BenchArgs structure for the
available
* GPUs, and runs the benchmark.
* @details This function processes the input args, validates and composes the BenchArgs structure for
* the first visible GPU (CUDA device 0). When running under Superbench's default_local_mode,
* CUDA_VISIBLE_DEVICES is set per process, so device 0 maps to the assigned physical GPU.
*
* @return int The status code indicating success or failure of the benchmark execution.
* */
Expand All @@ -631,21 +632,29 @@ int GpuStream::Run() {
return ret;
}

// find all GPUs and compose the Benchmarking data structure
for (int j = 0; j < gpu_count; j++) {
auto args = std::make_unique<BenchArgs<double>>();
args->numa_id = 0;
args->gpu_id = j;
cudaGetDeviceProperties(&args->gpu_device_prop, j);
if (gpu_count < 1) {
std::cerr << "Run::No GPU available" << std::endl;
return -1;
}
Comment thread
WenqingLan1 marked this conversation as resolved.

// Run on CUDA device 0 (the visible GPU assigned by CUDA_VISIBLE_DEVICES).
if (opts_.data_type == "float") {
auto args = std::make_unique<BenchArgs<float>>();
args->gpu_id = 0;
cudaGetDeviceProperties(&args->gpu_device_prop, 0);
args->num_warm_up = opts_.num_warm_up;
args->num_loops = opts_.num_loops;
args->size = opts_.size;
args->check_data = opts_.check_data;
bench_args_.emplace_back(std::move(args));
Comment thread
WenqingLan1 marked this conversation as resolved.
Outdated
} else {
auto args = std::make_unique<BenchArgs<double>>();
args->gpu_id = 0;
cudaGetDeviceProperties(&args->gpu_device_prop, 0);
args->num_warm_up = opts_.num_warm_up;
args->num_loops = opts_.num_loops;
args->size = opts_.size;
args->check_data = opts_.check_data;
args->numa_id = 0;
args->gpu_id = j;

// add data to vector
bench_args_.emplace_back(std::move(args));
}

Expand All @@ -668,14 +677,6 @@ int GpuStream::Run() {
// Print device info with both the memory clock and peak bandwidth
PrintCudaDeviceInfo(curr_args->gpu_id, curr_args->gpu_device_prop, memory_clock_mhz, peak_bw);

// Set the NUMA node
ret = numa_run_on_node(curr_args->numa_id);
if (ret != 0) {
Comment thread
WenqingLan1 marked this conversation as resolved.
std::cerr << "Run::numa_run_on_node error: " << errno << std::endl;
has_error = true;
return;
}

// Run the stream benchmark for the configured data, passing the peak bandwidth
if constexpr (std::is_same_v<std::decay_t<decltype(*curr_args)>, BenchArgs<float>>) {
ret = RunStream<float>(curr_args, "float", peak_bw);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ class GpuStream {
int Run();

private:
Comment thread
WenqingLan1 marked this conversation as resolved.
using BenchArgsVariant = std::variant<std::unique_ptr<BenchArgs<double>>>;
using BenchArgsVariant = std::variant<std::unique_ptr<BenchArgs<float>>, std::unique_ptr<BenchArgs<double>>>;
std::vector<BenchArgsVariant> bench_args_;
Opts opts_;

Expand Down
Loading
Loading