microsoft · WenqingLan1 · Dec 18, 2025 · Dec 18, 2025 · Dec 19, 2025 · Dec 19, 2025
@@ -273,14 +273,14 @@ Measure the memory bandwidth of GPU using the STREAM benchmark. The benchmark te
 
 | Metric Name                                                | Unit             | Description                                                                                                                             |
 |------------------------------------------------------------|------------------|-----------------------------------------------------------------------------------------------------------------------------------------|
-| STREAM\_COPY\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_bw  | bandwidth (GB/s) | The fp64 memory bandwidth of the GPU for the copy operation with specified buffer size and block size.                         |
-| STREAM\_SCALE\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The fp64 memory bandwidth of the GPU for the scale operation with specified buffer size and block size.                         |
-| STREAM\_ADD\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_bw   | bandwidth (GB/s) | The fp64 memory bandwidth of the GPU for the add operation with specified buffer size and block size.                         |
-| STREAM\_TRIAD\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The fp64 memory bandwidth of the GPU for the triad operation with specified buffer size and block size.                         |
-| STREAM\_COPY\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_ratio  | Efficiency (%) | The fp64 memory bandwidth efficiency of the GPU for the copy operation with specified buffer size and block size.                         |
-| STREAM\_SCALE\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The fp64 memory bandwidth efficiency of the GPU for the scale operation with specified buffer size and block size.                         |
-| STREAM\_ADD\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_ratio   | Efficiency (%) | The fp64 memory bandwidth efficiency of the GPU for the add operation with specified buffer size and block size.                         |
-| STREAM\_TRIAD\_double\_gpu\_[0-9]\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The fp64 memory bandwidth efficiency of the GPU for the triad operation with specified buffer size and block size.                         |
+| STREAM\_COPY\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_bw  | bandwidth (GB/s) | The fp64 or fp32 memory bandwidth of the GPU for the copy operation with the specified buffer size and block size.             |
+| STREAM\_SCALE\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The fp64 or fp32 memory bandwidth of the GPU for the scale operation with the specified buffer size and block size.            |
+| STREAM\_ADD\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_bw   | bandwidth (GB/s) | The fp64 or fp32 memory bandwidth of the GPU for the add operation with the specified buffer size and block size.              |
+| STREAM\_TRIAD\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_bw | bandwidth (GB/s) | The fp64 or fp32 memory bandwidth of the GPU for the triad operation with the specified buffer size and block size.            |
+| STREAM\_COPY\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_ratio  | Efficiency (%) | The fp64 or fp32 memory bandwidth efficiency of the GPU for the copy operation with the specified buffer size and block size.  |
+| STREAM\_SCALE\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The fp64 or fp32 memory bandwidth efficiency of the GPU for the scale operation with the specified buffer size and block size. |
+| STREAM\_ADD\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_ratio   | Efficiency (%) | The fp64 or fp32 memory bandwidth efficiency of the GPU for the add operation with the specified buffer size and block size.   |
+| STREAM\_TRIAD\_(double\|float)\_buffer\_[0-9]+\_block\_[0-9]+\_ratio | Efficiency (%) | The fp64 or fp32 memory bandwidth efficiency of the GPU for the triad operation with the specified buffer size and block size. |
 
 ### `ib-loopback`
 

@@ -12,7 +12,7 @@
 
 if __name__ == '__main__':
     context = BenchmarkRegistry.create_benchmark_context(
-        'gpu-stream', platform=Platform.CUDA, parameters='--num_warm_up 1 --num_loops 10'
+        'gpu-stream', platform=Platform.CUDA, parameters='--num_warm_up 1 --num_loops 10 --data_type double'
     )
     # For ROCm environment, please specify the benchmark name and the platform as the following.
     # context = BenchmarkRegistry.create_benchmark_context(

@@ -51,6 +51,15 @@ def add_parser_arguments(self):
             help='Number of data buffer copies performed.',
         )
 
+        self._parser.add_argument(
+            '--data_type',
+            type=str,
+            default='double',
+            choices=['float', 'double'],
+            required=False,
+            help='Data type of the buffer elements.',
+        )
+
         self._parser.add_argument(
             '--check_data',
             action='store_true',
@@ -68,8 +77,8 @@ def _preprocess(self):
 
         self.__bin_path = os.path.join(self._args.bin_dir, self._bin_name)
 
-        args = '--size %d --num_warm_up %d --num_loops %d ' % (
-            self._args.size, self._args.num_warm_up, self._args.num_loops
+        args = '--size %d --num_warm_up %d --num_loops %d --data_type %s' % (
+            self._args.size, self._args.num_warm_up, self._args.num_loops, self._args.data_type
         )
 
         if self._args.check_data:

@@ -15,7 +15,7 @@ find_package(CUDAToolkit QUIET)
 
 # Source files
 set(SOURCES
-    gpu_stream_test.cpp
+    gpu_stream_main.cpp
     gpu_stream_utils.cpp
     gpu_stream.cu
     gpu_stream_kernels.cu

@@ -235,15 +235,15 @@ template <typename T> int GpuStream::PrepareBufAndStream(std::unique_ptr<BenchAr
     cudaError_t cuda_err = cudaSuccess;
 
     if (args->check_data) {
-        // Generate data to copy
-        args->sub.data_buf = static_cast<T *>(numa_alloc_onnode(args->size * sizeof(T), args->numa_id));
+        // Generate data to copy - use local NUMA node for best CPU access
+        args->sub.data_buf = static_cast<T *>(numa_alloc_local(args->size * sizeof(T)));
 
         for (int j = 0; j < args->size / sizeof(T); j++) {
             args->sub.data_buf[j] = static_cast<T>(j % kUInt8Mod);
         }
 
-        // Allocate check buffer
-        args->sub.check_buf = static_cast<T *>(numa_alloc_onnode(args->size * sizeof(T), args->numa_id));
+        // Allocate check buffer on local NUMA node
+        args->sub.check_buf = static_cast<T *>(numa_alloc_local(args->size * sizeof(T)));
     }
 
     // Allocate buffers
@@ -420,8 +420,10 @@ int GpuStream::RunStreamKernel(std::unique_ptr<BenchArgs<T>> &args, Kernel kerne
     int size_factor = 2;
 
     // Validate data size
-    uint64_t num_elements_in_thread_block = kNumLoopUnroll * num_threads_per_block;
-    uint64_t num_bytes_in_thread_block = num_elements_in_thread_block * sizeof(T);
+    // Each thread processes 128 bits (16 bytes) for optimal memory bandwidth.
+    // For double: uses double2 (16 bytes). For float: would use float4 (16 bytes).
+    constexpr uint64_t kBytesPerThread = 16; // 128-bit aligned access
+    uint64_t num_bytes_in_thread_block = num_threads_per_block * kBytesPerThread;
     if (args->size % num_bytes_in_thread_block) {
         std::cerr << "RunCopy: Data size should be multiple of " << num_bytes_in_thread_block << std::endl;
         return -1;
@@ -448,30 +450,30 @@ int GpuStream::RunStreamKernel(std::unique_ptr<BenchArgs<T>> &args, Kernel kerne
 
         switch (kernel) {
         case Kernel::kCopy:
-            CopyKernel<<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
-                reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[2].get()),
-                reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[0].get()));
+            CopyKernel<T><<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
+                reinterpret_cast<VecT<T> *>(args->sub.gpu_buf_ptrs[2].get()),
+                reinterpret_cast<const VecT<T> *>(args->sub.gpu_buf_ptrs[0].get()));
             args->sub.kernel_name = "COPY";
             break;
         case Kernel::kScale:
-            ScaleKernel<<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
-                reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[2].get()),
-                reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[0].get()), static_cast<T>(scalar));
+            ScaleKernel<T><<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
+                reinterpret_cast<VecT<T> *>(args->sub.gpu_buf_ptrs[2].get()),
+                reinterpret_cast<const VecT<T> *>(args->sub.gpu_buf_ptrs[0].get()), static_cast<T>(scalar));
             args->sub.kernel_name = "SCALE";
             break;
         case Kernel::kAdd:
-            AddKernel<<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
-                reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[2].get()),
-                reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[0].get()),
-                reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[1].get()));
+            AddKernel<T><<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
+                reinterpret_cast<VecT<T> *>(args->sub.gpu_buf_ptrs[2].get()),
+                reinterpret_cast<const VecT<T> *>(args->sub.gpu_buf_ptrs[0].get()),
+                reinterpret_cast<const VecT<T> *>(args->sub.gpu_buf_ptrs[1].get()));
             size_factor = 3;
             args->sub.kernel_name = "ADD";
             break;
         case Kernel::kTriad:
-            TriadKernel<<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
-                reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[2].get()),
-                reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[0].get()),
-                reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[1].get()), static_cast<T>(scalar));
+            TriadKernel<T><<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
+                reinterpret_cast<VecT<T> *>(args->sub.gpu_buf_ptrs[2].get()),
+                reinterpret_cast<const VecT<T> *>(args->sub.gpu_buf_ptrs[0].get()),
+                reinterpret_cast<const VecT<T> *>(args->sub.gpu_buf_ptrs[1].get()), static_cast<T>(scalar));
             size_factor = 3;
             args->sub.kernel_name = "TRIAD";
             break;
@@ -583,10 +585,9 @@ int GpuStream::RunStream(std::unique_ptr<BenchArgs<T>> &args, const std::string
 
     // output formatted results to stdout
     // Tags are of format:
-    // STREAM_<Kernelname>_datatype_gpu_<gpu_id>_buffer_<buffer_size>_block_<block_size>
+    // STREAM_<Kernelname>_<datatype>_buffer_<buffer_size>_block_<block_size>
     for (int i = 0; i < args->sub.times_in_ms.size(); i++) {
-        std::string tag = "STREAM_" + KernelToString(i) + "_" + data_type + "_gpu_" + std::to_string(args->gpu_id) +
-                          "_buffer_" + std::to_string(args->size);
+        std::string tag = "STREAM_" + KernelToString(i) + "_" + data_type + "_buffer_" + std::to_string(args->size);
         for (int j = 0; j < args->sub.times_in_ms[i].size(); j++) {
             // Calculate and display bandwidth
             double bw = args->size * args->num_loops / args->sub.times_in_ms[i][j] / 1e6;
@@ -608,9 +609,9 @@ int GpuStream::RunStream(std::unique_ptr<BenchArgs<T>> &args, const std::string
 /**
  * @brief Runs the Stream benchmark.
  *
- * @details This function processes the input args, validates and composes the BenchArgs structure for the
- availavble
- * GPUs, and runs the benchmark.
+ * @details This function processes the input args, validates and composes the BenchArgs structure for
+ * the first visible GPU (CUDA device 0). When running under Superbench's default_local_mode,
+ * CUDA_VISIBLE_DEVICES is set per process, so device 0 maps to the assigned physical GPU.
  *
  * @return int The status code indicating success or failure of the benchmark execution.
  * */
@@ -631,21 +632,29 @@ int GpuStream::Run() {
         return ret;
     }
 
-    // find all GPUs and compose the Benchmarking data structure
-    for (int j = 0; j < gpu_count; j++) {
-        auto args = std::make_unique<BenchArgs<double>>();
-        args->numa_id = 0;
-        args->gpu_id = j;
-        cudaGetDeviceProperties(&args->gpu_device_prop, j);
+    if (gpu_count < 1) {
+        std::cerr << "Run::No GPU available" << std::endl;
+        return -1;
+    }
 
+    // Run on CUDA device 0 (the visible GPU assigned by CUDA_VISIBLE_DEVICES).
+    if (opts_.data_type == "float") {
+        auto args = std::make_unique<BenchArgs<float>>();
+        args->gpu_id = 0;
+        cudaGetDeviceProperties(&args->gpu_device_prop, 0);
+        args->num_warm_up = opts_.num_warm_up;
+        args->num_loops = opts_.num_loops;
+        args->size = opts_.size;
+        args->check_data = opts_.check_data;
+        bench_args_.emplace_back(std::move(args));
+    } else {
+        auto args = std::make_unique<BenchArgs<double>>();
+        args->gpu_id = 0;
+        cudaGetDeviceProperties(&args->gpu_device_prop, 0);
         args->num_warm_up = opts_.num_warm_up;
         args->num_loops = opts_.num_loops;
         args->size = opts_.size;
         args->check_data = opts_.check_data;
-        args->numa_id = 0;
-        args->gpu_id = j;
-
-        // add data to vector
         bench_args_.emplace_back(std::move(args));
     }
 
@@ -668,14 +677,6 @@ int GpuStream::Run() {
                 // Print device info with both the memory clock and peak bandwidth
                 PrintCudaDeviceInfo(curr_args->gpu_id, curr_args->gpu_device_prop, memory_clock_mhz, peak_bw);
 
-                // Set the NUMA node
-                ret = numa_run_on_node(curr_args->numa_id);
-                if (ret != 0) {
-                    std::cerr << "Run::numa_run_on_node error: " << errno << std::endl;
-                    has_error = true;
-                    return;
-                }
-
                 // Run the stream benchmark for the configured data, passing the peak bandwidth
                 if constexpr (std::is_same_v<std::decay_t<decltype(*curr_args)>, BenchArgs<float>>) {
                     ret = RunStream<float>(curr_args, "float", peak_bw);

@@ -34,7 +34,7 @@ class GpuStream {
     int Run();
 
   private:
-    using BenchArgsVariant = std::variant<std::unique_ptr<BenchArgs<double>>>;
+    using BenchArgsVariant = std::variant<std::unique_ptr<BenchArgs<float>>, std::unique_ptr<BenchArgs<double>>>;
     std::vector<BenchArgsVariant> bench_args_;
     Opts opts_;