From 741ee988004f0f7062965e4d31f9553be0692f86 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Tue, 22 Jul 2025 16:03:28 -0700
Subject: [PATCH 01/37] add nvbench kernel launch

---
 .gitignore                                    |   3 +
 .gitmodules                                   |   3 +
 examples/benchmarks/nvbench_kernel_launch.py  |  25 +++
 .../benchmarks/micro_benchmarks/__init__.py   |   2 +
 .../micro_benchmarks/nvbench/CMakeLists.txt   |  15 ++
 .../micro_benchmarks/nvbench/launch_bench.cu  |  11 ++
 .../micro_benchmarks/nvbench_kernel_launch.py | 173 ++++++++++++++++++
 .../test_nvbench_kernel_launch.py             | 105 +++++++++++
 tests/data/nvbench_kernel_launch.log          |  34 ++++
 third_party/Makefile                          |  13 ++
 10 files changed, 384 insertions(+)
 create mode 100644 examples/benchmarks/nvbench_kernel_launch.py
 create mode 100644 superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
 create mode 100644 superbench/benchmarks/micro_benchmarks/nvbench/launch_bench.cu
 create mode 100644 superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
 create mode 100644 tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
 create mode 100644 tests/data/nvbench_kernel_launch.log

diff --git a/.gitignore b/.gitignore
index 5888455a8..97b51dcd0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -151,6 +151,9 @@ cython_debug/
 *.userosscache
 *.sln.docstates
 
+# Build temporary files
+compile_commands.json
+
 # Build results
 [Dd]ebug/
 [Dd]ebugPublic/
diff --git a/.gitmodules b/.gitmodules
index 9be41b59b..f82383ae9 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -33,3 +33,6 @@
 [submodule "third_party/nvbandwidth"]
 	path = third_party/nvbandwidth
 	url = https://github.com/NVIDIA/nvbandwidth.git
+[submodule "third_party/nvbench"]
+	path = third_party/nvbench
+	url = https://github.com/NVIDIA/nvbench.git
diff --git a/examples/benchmarks/nvbench_kernel_launch.py b/examples/benchmarks/nvbench_kernel_launch.py
new file mode 100644
index 000000000..73377084b
--- /dev/null
+++ b/examples/benchmarks/nvbench_kernel_launch.py
@@ -0,0 +1,25 @@
+from superbench.benchmarks import BenchmarkRegistry, Platform
+from superbench.common.utils import logger
+
+if __name__ == '__main__':
+    # Run the NVBench kernel-launch micro-benchmark on the CUDA platform with explicit stopping criteria.
+    context = BenchmarkRegistry.create_benchmark_context(
+        'nvbench-kernel-launch',
+        platform=Platform.CUDA,
+        parameters=(
+            '--timeout 30 --min-samples 10 '
+            '--min-time 1.0 --max-noise 0.1 '
+            '--stopping-criterion stdrel '
+            '--throttle-threshold 80 --throttle-recovery-delay 1.0'
+        )
+    )
+
+    benchmark = BenchmarkRegistry.launch_benchmark(context)
+    if benchmark:
+        logger.info(
+            'benchmark: {}, return code: {}, result: {}'.format(
+                benchmark.name, benchmark.return_code, benchmark.result
+            )
+        )
+    else:
+        logger.error('benchmark: nvbench-kernel-launch launch failed.')
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/__init__.py b/superbench/benchmarks/micro_benchmarks/__init__.py
index 978c2d385..9ae738444 100644
--- a/superbench/benchmarks/micro_benchmarks/__init__.py
+++ b/superbench/benchmarks/micro_benchmarks/__init__.py
@@ -39,6 +39,7 @@
 from superbench.benchmarks.micro_benchmarks.directx_mem_bw_performance import DirectXGPUMemBw
 from superbench.benchmarks.micro_benchmarks.directx_gemm_flops_performance import DirectXGPUCoreFlops
 from superbench.benchmarks.micro_benchmarks.nvbandwidth import NvBandwidthBenchmark
+from superbench.benchmarks.micro_benchmarks.nvbench_kernel_launch import NvbenchKernelLaunch
 
 __all__ = [
     'BlasLtBaseBenchmark',
@@ -77,4 +78,5 @@
     'DirectXGPUMemBw',
     'DirectXGPUCoreFlops',
     'NvBandwidthBenchmark',
+    'NvbenchKernelLaunch'
 ]
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
new file mode 100644
index 000000000..a23d722a3
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
@@ -0,0 +1,15 @@
+cmake_minimum_required(VERSION 3.20)
+project(nvbench_benchmarks LANGUAGES CUDA)
+
+find_package(CUDAToolkit QUIET)
+if(CUDAToolkit_FOUND)
+  include(../cuda_common.cmake)
+
+  find_package(nvbench CONFIG REQUIRED)
+  add_executable(nvbench_kernel_launch launch_bench.cu)
+  target_compile_features(nvbench_kernel_launch PUBLIC cuda_std_17)
+  target_link_libraries(nvbench_kernel_launch
+    PRIVATE nvbench::nvbench nvbench::main
+  )
+  install(TARGETS nvbench_kernel_launch RUNTIME DESTINATION bin)
+endif()
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/launch_bench.cu b/superbench/benchmarks/micro_benchmarks/nvbench/launch_bench.cu
new file mode 100644
index 000000000..f3e1bc7f0
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/launch_bench.cu
@@ -0,0 +1,11 @@
+#include <nvbench/nvbench.cuh>
+
+__global__ void empty_kernel() {}
+
+void launch_bench(nvbench::state &state) {
+  state.exec([](nvbench::launch &launch) {
+    empty_kernel<<<1, 1, 0, launch.get_stream()>>>();
+  });
+}
+
+NVBENCH_BENCH(launch_bench);
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py b/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
new file mode 100644
index 000000000..74cf94383
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
@@ -0,0 +1,173 @@
+import os
+import re
+from superbench.common.utils import logger
+from superbench.benchmarks import BenchmarkRegistry, ReturnCode, Platform
+from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
+
+def parse_time_to_us(raw: str) -> float:
+    """Parse an NVBench value such as '1.2 s', '0.12 ms', '123.45 us' or '678.9 ns' → float µs ('12.3%' → 12.3)."""
+    raw = raw.strip()
+    if raw.endswith('%'):
+        return float(raw[:-1])
+    m = re.match(r'([\d.]+)\s*([mun]?s)?', raw)  # "value unit" or "valueunit"; a missing unit means µs
+    if not m:
+        return float(raw)
+    val, unit = float(m.group(1)), (m.group(2) or 'us')
+    if unit == 's':  return val * 1e6  # whole seconds were previously returned unscaled
+    if unit == 'ns':  return val / 1e3
+    if unit == 'ms':  return val * 1e3
+    return val
+
+class NvbenchKernelLaunch(MicroBenchmarkWithInvoke):
+    """Nvbench benchmark wrapper for SuperBench."""
+    def __init__(self, name, parameters=None):
+        """Initialize the benchmark."""
+        super().__init__(name, parameters)
+        self._bin_name = "nvbench_kernel_launch"
+
+    def add_parser_arguments(self):
+        """
+        Add NVBench CLI options (excluding Queries, Device modification, Output, Benchmark / Axis Specification):
+          - Benchmark Properties (Omit --run-once, --disable-blocking-kernel, --profile)
+          - Stopping Criteria
+        """
+        super().add_parser_arguments()
+
+        # Benchmark Properties
+        self._parser.add_argument(
+            '--devices', '--device', '-d', type=str, default=None,
+            help='Limit execution to one or more device IDs (comma-separated or "all").'
+        )
+        self._parser.add_argument(
+            '--skip-time', type=float, default=-1.0,
+            help='Skip a measurement when a warmup run executes in less than this (seconds).'
+        )
+        # With threshold > 0 and recovery delay > 0, NVBench pauses after throttling until the GPU clock is stable again.
+        self._parser.add_argument(
+            '--throttle-threshold', type=float, default=75.0,
+            help="GPU throttle threshold as percent of default clock rate. Disabled when nvbench::exec_tag::sync is used."
+        )
+        self._parser.add_argument(
+            '--throttle-recovery-delay', type=float, default=0.05,
+            help='Seconds to wait after throttle before resuming. Disabled when nvbench::exec_tag::sync is used.'
+        )
+
+        # Stopping Criteria
+        self._parser.add_argument(
+            '--timeout', type=int, default=15,
+            help='Walltime timeout in seconds for each measurement.'
+        )
+        self._parser.add_argument(
+            '--min-samples', type=int, default=10,
+            help='Minimum number of samples per measurement before checking other criteria.'
+        )
+        self._parser.add_argument(
+            '--stopping-criterion', type=str, default='stdrel',
+            choices=['stdrel', 'entropy'],
+            help='Stopping criterion to use after --min-samples is satisfied: "stdrel" or "entropy".'
+        )
+        # stdrel-specific
+        self._parser.add_argument(
+            '--min-time', type=float, default=0.5,
+            help='(stdrel) Minimum execution time accumulated per measurement (seconds).'
+        )
+        self._parser.add_argument(
+            '--max-noise', type=float, default=0.5,
+            help='(stdrel) Maximum relative standard deviation (%%) before stopping.'  # '%%': argparse %-formats help
+        )
+        # entropy-specific
+        self._parser.add_argument(
+            '--max-angle', type=float, default=0.048,
+            help='(entropy) Maximum linear regression angle of cumulative entropy.'
+        )
+        self._parser.add_argument(
+            '--min-r2', type=float, default=0.36,
+            help='(entropy) Minimum coefficient of determination (R²) for linear regression of cumulative entropy.'
+        )
+
+    def _preprocess(self):
+        """Preprocess/preparation operations before the benchmarking.
+        Return:
+            True if _preprocess() succeed.
+        """
+        if not super()._preprocess():
+            return False
+
+        parts = [os.path.join(self._args.bin_dir, self._bin_name)]
+
+        # Benchmark Properties (all optional)
+        if self._args.devices:
+            parts.extend(['--devices', self._args.devices])
+        if self._args.skip_time >= 0:
+            parts.extend(['--skip-time', str(self._args.skip_time)])
+        if self._args.throttle_threshold > 0:
+            parts.extend(['--throttle-threshold', str(self._args.throttle_threshold)])
+        if self._args.throttle_recovery_delay > 0:
+            parts.extend(['--throttle-recovery-delay', str(self._args.throttle_recovery_delay)])
+
+        # Stopping Criteria (all optional)
+        if self._args.timeout is not None:
+            parts.extend(['--timeout', str(self._args.timeout)])
+        if self._args.min_samples is not None:
+            parts.extend(['--min-samples', str(self._args.min_samples)])
+        if self._args.stopping_criterion:
+            parts.extend(['--stopping-criterion', self._args.stopping_criterion])
+            if self._args.stopping_criterion == 'stdrel':
+                if self._args.min_time is not None:
+                    parts.extend(['--min-time', str(self._args.min_time)])
+                if self._args.max_noise is not None:
+                    parts.extend(['--max-noise', str(self._args.max_noise)])
+            elif self._args.stopping_criterion == 'entropy':
+                if self._args.max_angle is not None:
+                    parts.extend(['--max-angle', str(self._args.max_angle)])
+                if self._args.min_r2 is not None:
+                    parts.extend(['--min-r2', str(self._args.min_r2)])
+
+        # finalize command
+        self._commands = [' '.join(parts)]
+        return True
+
+    def _process_raw_result(self, cmd_idx, raw_output):
+        """Parse the NVBench markdown summary table in raw_output and record the metrics per GPU section.
+
+        Return:
+            True if at least one table row was parsed, otherwise False with a parsing-failure return code set.
+        """
+        self._result.add_raw_data(f'raw_output_{cmd_idx}', raw_output, self._args.log_raw_data)
+        try:
+            gpu_section = r"### \[(\d+)\] NVIDIA"
+            row_pat = (
+                r"\| (\d+)x \| ([\d.]+ ?[mun]?s) \| ([\d.]+%) \| "
+                r"([\d.]+ ?[mun]?s) \| ([\d.]+%) \| (\d+)x \| *([\d.]+ ?[mun]?s) \|"
+            )
+            current = None
+            parsed_any = False  # Track if any valid rows are parsed
+            for line in raw_output.splitlines():
+                line = line.strip()
+                g = re.match(gpu_section, line)
+                if g:
+                    current = f"gpu_{g.group(1)}"
+                    continue
+                r = re.match(row_pat, line)
+                if r and current:
+                    self._result.add_result(f"{current}_samples", int(r.group(1)))
+                    self._result.add_result(f"{current}_cpu_time", parse_time_to_us(r.group(2)))
+                    self._result.add_result(f"{current}_cpu_noise", float(r.group(3)[:-1]))
+                    self._result.add_result(f"{current}_gpu_time", parse_time_to_us(r.group(4)))
+                    self._result.add_result(f"{current}_gpu_noise", float(r.group(5)[:-1]))
+                    self._result.add_result(f"{current}_batch_samples", int(r.group(6)))
+                    self._result.add_result(f"{current}_batch_gpu_time", parse_time_to_us(r.group(7)))
+                    parsed_any = True
+            if not parsed_any:
+                logger.error("No valid rows parsed from the raw output.")
+                raise RuntimeError("No valid rows parsed")
+        except Exception as e:
+            self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
+            logger.error(
+                f"Invalid result format - round:{self._curr_run_index}, bench:{self._name}, msg:{e}\n{raw_output}"
+            )
+            return False
+        return True
+
+# Expose this class through the registry as 'nvbench-kernel-launch' for the CUDA platform.
+BenchmarkRegistry.register_benchmark('nvbench-kernel-launch', NvbenchKernelLaunch, platform=Platform.CUDA)
\ No newline at end of file
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py b/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
new file mode 100644
index 000000000..7c131a6c8
--- /dev/null
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
@@ -0,0 +1,105 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Tests for nvbench kernel launch benchmark."""
+
+import unittest
+
+from tests.helper import decorator
+from tests.helper.testcase import BenchmarkTestCase
+from superbench.benchmarks import BenchmarkRegistry, ReturnCode, Platform
+
+
+class TestNvbenchKernelLaunchBenchmark(BenchmarkTestCase, unittest.TestCase):
+    """Unit tests for the NVBench kernel-launch benchmark: command construction and result parsing."""
+
+    @classmethod
+    def setUpClass(cls):
+        """Create the mock environment and a placeholder bin/nvbench_kernel_launch file for all tests."""
+        super().setUpClass()
+        cls.createMockEnvs(cls)
+        cls.createMockFiles(cls, ['bin/nvbench_kernel_launch'])
+
+    def test_nvbench_kernel_launch_preprocess(self):
+        """Check that _preprocess() succeeds and that the given options appear in the generated command."""
+        benchmark_name = 'nvbench-kernel-launch'
+        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        # Test preprocess with default parameters
+        benchmark = benchmark_class(benchmark_name, parameters='')
+        assert benchmark._preprocess()
+        assert benchmark.return_code == ReturnCode.SUCCESS
+
+        # Test preprocess with specified parameters
+        parameters = (
+            '--device 0 '
+            '--timeout 20 '
+            '--min-samples 300 '
+            '--stopping-criterion stdrel '
+            '--min-time 2.0 '
+            '--max-noise 0.5 '
+            '--throttle-threshold 80.0 '
+            '--throttle-recovery-delay 1.0'
+        )
+        benchmark = benchmark_class(benchmark_name, parameters=parameters)
+        assert benchmark._preprocess()
+        assert benchmark.return_code == ReturnCode.SUCCESS
+
+        # Check command (the input used '--device 0'; the command is expected to carry '--devices 0')
+        assert (1 == len(benchmark._commands))
+        assert ('--devices 0' in benchmark._commands[0])
+        assert ('--timeout 20' in benchmark._commands[0])
+        assert ('--min-samples 300' in benchmark._commands[0])
+        assert ('--stopping-criterion stdrel' in benchmark._commands[0])
+        assert ('--min-time 2.0' in benchmark._commands[0])
+        assert ('--max-noise 0.5' in benchmark._commands[0])
+        assert ('--throttle-threshold 80.0' in benchmark._commands[0])
+        assert ('--throttle-recovery-delay 1.0' in benchmark._commands[0])
+
+    @decorator.load_data('tests/data/nvbench_kernel_launch.log')
+    def test_nvbench_kernel_launch_result_parsing_real_output(self, results):
+        """Test result parsing against the recorded output in tests/data/nvbench_kernel_launch.log."""
+        benchmark_name = 'nvbench-kernel-launch'
+        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        benchmark = benchmark_class(benchmark_name, parameters='')
+
+        # Preprocess with default parameters; it must succeed before parsing
+        assert benchmark._preprocess()
+
+        # Parse the provided raw output
+        assert benchmark._process_raw_result(0, results)
+        assert benchmark.return_code == ReturnCode.SUCCESS
+
+        # Validate parsed results against the single table row of the log (GPU section [0])
+        assert benchmark.result['gpu_0_samples'][0] == 120000
+        assert benchmark.result['gpu_0_cpu_time'][0] == 24.222
+        assert benchmark.result['gpu_0_cpu_noise'][0] == 30.44
+        assert benchmark.result['gpu_0_gpu_time'][0] == 7.808
+        assert benchmark.result['gpu_0_gpu_noise'][0] == 14.42
+        assert benchmark.result['gpu_0_batch_samples'][0] == 300000
+        assert benchmark.result['gpu_0_batch_gpu_time'][0] == 6.024
+
+    def test_nvbench_kernel_launch_process_raw_result_invalid_output(self):
+        """Test that output without any result table row is rejected with a parsing-failure return code."""
+        benchmark_name = 'nvbench-kernel-launch'
+        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        benchmark = benchmark_class(benchmark_name, parameters='')
+
+        # Preprocess with default parameters; it must succeed before parsing
+        assert benchmark._preprocess()
+
+        # Mock raw output with invalid format
+        raw_output = "Invalid output format"
+
+        # Parsing must fail and flag the failure through the return code
+        assert not benchmark._process_raw_result(0, raw_output)
+        assert benchmark.return_code == ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE
+
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file
diff --git a/tests/data/nvbench_kernel_launch.log b/tests/data/nvbench_kernel_launch.log
new file mode 100644
index 000000000..95ccbc065
--- /dev/null
+++ b/tests/data/nvbench_kernel_launch.log
@@ -0,0 +1,34 @@
+# Devices
+
+## [0] `NVIDIA GPU`
+* SM Version: 900 (PTX Version: 900)
+* Number of SMs: 100
+* SM Default Clock Rate: 800 MHz
+* Global Memory: 100000 MiB Free / 100000 MiB Total
+* Global Memory Bus Peak: 3000 GB/sec (3000-bit DDR @4000MHz)
+* Max Shared Memory: 100 KiB/SM, 20 KiB/Block
+* L2 Cache Size: 1000 KiB
+* Maximum Active Blocks: 10/SM
+* Maximum Active Threads: 500/SM, 300/Block
+* Available Registers: 500/SM, 500/Block
+* ECC Enabled: Yes
+
+# Log
+
+```
+Run:  [1/1] launch_bench [Device=0]
+Warn: Current measurement timed out (20.00s) while over noise threshold (14.42% > 0.50%)
+Warn: Current measurement timed out (20.00s) before accumulating min_time (0.94s < 2.00s)
+Pass: Cold: 0.005878ms GPU, 0.022181ms CPU, 0.94s total GPU, 20.00s total wall, 120000x 
+Pass: Batch: 0.004024ms GPU, 2.00s total GPU, 2.00s total wall, 300000x
+```
+
+# Benchmark Results
+
+## launch_bench
+
+### [0] NVIDIA GPU
+
+| Samples | CPU Time  | Noise  | GPU Time | Noise  | Samples | Batch GPU |
+|---------|-----------|--------|----------|--------|---------|-----------|
+| 120000x | 24.222 us | 30.44% | 7.808 us | 14.42% | 300000x |  6.024 us |
\ No newline at end of file
diff --git a/third_party/Makefile b/third_party/Makefile
index 667a46a47..465b5d6e4 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -25,6 +25,7 @@ cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcne
 rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm rocm_megatron_lm
 cpu: common cpu_perftest
 common: fio cpu_stream
+nvbench_only: nvbench
 
 # non aarch64 specific targets
 ifneq ($(shell uname -m), aarch64)
@@ -301,3 +302,15 @@ endif
 nvbandwidth: sb_micro_path
 	cd ./nvbandwidth && cmake . && make && cd ..
 	cp -v ./nvbandwidth/nvbandwidth $(SB_MICRO_PATH)/bin
+
+# Build nvbench
+nvbench: sb_micro_path
+	cd ./nvbench && mkdir -p build && cd build && \
+	cmake                                         \
+	-DCMAKE_INSTALL_PREFIX=$(SB_MICRO_PATH)       \
+	-DCMAKE_CUDA_ARCHITECTURES=100                \
+	-DNVBench_ENABLE_CUPTI=ON                     \
+	-DCMAKE_BUILD_TYPE=Release                    \
+	.. &&                                         \
+	make -j $(NUM_MAKE_JOBS) &&                   \
+	make install

From 0ae7864c52d914ac95c7852e0093ae48489f3e46 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Tue, 22 Jul 2025 23:44:24 +0000
Subject: [PATCH 02/37] submodule update

---
 third_party/nvbench | 1 +
 1 file changed, 1 insertion(+)
 create mode 160000 third_party/nvbench

diff --git a/third_party/nvbench b/third_party/nvbench
new file mode 160000
index 000000000..3de9dc95d
--- /dev/null
+++ b/third_party/nvbench
@@ -0,0 +1 @@
+Subproject commit 3de9dc95da3fc859900a799918a0c1d9091920ff

From 35bfb61c118dab3436577797977f806da7861382 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Wed, 30 Jul 2025 05:55:02 +0000
Subject: [PATCH 03/37] init sleep kernel

---
 examples/benchmarks/nvbench_sleep_kernel.py   |  28 ++
 .../benchmarks/micro_benchmarks/__init__.py   |   4 +-
 .../micro_benchmarks/nvbench/CMakeLists.txt   |  29 +-
 .../{launch_bench.cu => kernel_launch.cu}     |   4 +-
 .../micro_benchmarks/nvbench/sleep_kernel.cu  |  23 ++
 .../micro_benchmarks/nvbench_sleep_kernel.py  | 262 ++++++++++++++++++
 .../test_nvbench_sleep_kernel.py              | 110 ++++++++
 tests/data/nvbench_sleep_kernel.log           |  43 +++
 third_party/Makefile                          |  25 +-
 9 files changed, 509 insertions(+), 19 deletions(-)
 create mode 100644 examples/benchmarks/nvbench_sleep_kernel.py
 rename superbench/benchmarks/micro_benchmarks/nvbench/{launch_bench.cu => kernel_launch.cu} (70%)
 create mode 100644 superbench/benchmarks/micro_benchmarks/nvbench/sleep_kernel.cu
 create mode 100644 superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
 create mode 100644 tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
 create mode 100644 tests/data/nvbench_sleep_kernel.log

diff --git a/examples/benchmarks/nvbench_sleep_kernel.py b/examples/benchmarks/nvbench_sleep_kernel.py
new file mode 100644
index 000000000..083bd0a7c
--- /dev/null
+++ b/examples/benchmarks/nvbench_sleep_kernel.py
@@ -0,0 +1,28 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Example of NVBench Sleep Kernel benchmark."""
+
+from superbench.benchmarks import BenchmarkRegistry, Platform
+from superbench.common.utils import logger
+
+
+def main():
+    """Launch the nvbench sleep kernel benchmark on CUDA and log the outcome."""
+    context = BenchmarkRegistry.create_benchmark_context(
+        'nvbench-sleep-kernel', platform=Platform.CUDA, parameters='--duration_us "[25,50,75]" --timeout 10'
+    )
+    benchmark = BenchmarkRegistry.launch_benchmark(context)
+
+    if not benchmark:
+        logger.error('benchmark: nvbench-sleep-kernel launch failed.')
+        return
+
+    summary = 'benchmark: {}, return code: {}, result: {}'.format(
+        benchmark.name, benchmark.return_code, benchmark.result
+    )
+    logger.info(summary)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/superbench/benchmarks/micro_benchmarks/__init__.py b/superbench/benchmarks/micro_benchmarks/__init__.py
index 9ae738444..ac50ce252 100644
--- a/superbench/benchmarks/micro_benchmarks/__init__.py
+++ b/superbench/benchmarks/micro_benchmarks/__init__.py
@@ -40,6 +40,7 @@
 from superbench.benchmarks.micro_benchmarks.directx_gemm_flops_performance import DirectXGPUCoreFlops
 from superbench.benchmarks.micro_benchmarks.nvbandwidth import NvBandwidthBenchmark
 from superbench.benchmarks.micro_benchmarks.nvbench_kernel_launch import NvbenchKernelLaunch
+from superbench.benchmarks.micro_benchmarks.nvbench_sleep_kernel import NvbenchSleepKernel
 
 __all__ = [
     'BlasLtBaseBenchmark',
@@ -78,5 +79,6 @@
     'DirectXGPUMemBw',
     'DirectXGPUCoreFlops',
     'NvBandwidthBenchmark',
-    'NvbenchKernelLaunch'
+    'NvbenchKernelLaunch',
+    'NvbenchSleepKernel'
 ]
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
index a23d722a3..448e83800 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
@@ -2,14 +2,29 @@ cmake_minimum_required(VERSION 3.20)
 project(nvbench_benchmarks LANGUAGES CUDA)
 
 find_package(CUDAToolkit QUIET)
-if(CUDAToolkit_FOUND)
+if (CUDAToolkit_FOUND)
   include(../cuda_common.cmake)
-
   find_package(nvbench CONFIG REQUIRED)
-  add_executable(nvbench_kernel_launch launch_bench.cu)
-  target_compile_features(nvbench_kernel_launch PUBLIC cuda_std_17)
-  target_link_libraries(nvbench_kernel_launch
-    PRIVATE nvbench::nvbench nvbench::main
+
+  # list all your CUDA benchmark source files here
+  set(NVBENCH_SOURCES
+    kernel_launch.cu
+    sleep_kernel.cu
+    # add more *.cu as needed
   )
-  install(TARGETS nvbench_kernel_launch RUNTIME DESTINATION bin)
+
+  foreach(src ${NVBENCH_SOURCES})
+    # strip ".cu" → NAME_WE
+    get_filename_component(basename ${src} NAME_WE)
+    set(target nvbench_${basename})
+
+    add_executable(${target} ${src})
+    target_compile_features(${target} PUBLIC cuda_std_17)
+    target_link_libraries(${target}
+      PRIVATE nvbench::nvbench nvbench::main
+    )
+    install(TARGETS ${target} RUNTIME DESTINATION bin)
+  endforeach()
+else()
+  message(STATUS "CUDA not found, skipping nvbench benchmarks.")
 endif()
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/launch_bench.cu b/superbench/benchmarks/micro_benchmarks/nvbench/kernel_launch.cu
similarity index 70%
rename from superbench/benchmarks/micro_benchmarks/nvbench/launch_bench.cu
rename to superbench/benchmarks/micro_benchmarks/nvbench/kernel_launch.cu
index f3e1bc7f0..f703c587a 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench/launch_bench.cu
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/kernel_launch.cu
@@ -2,10 +2,10 @@
 
 __global__ void empty_kernel() {}
 
-void launch_bench(nvbench::state &state) {
+void kernel_launch(nvbench::state &state) {
   state.exec([](nvbench::launch &launch) {
     empty_kernel<<<1, 1, 0, launch.get_stream()>>>();
   });
 }
 
-NVBENCH_BENCH(launch_bench);
\ No newline at end of file
+NVBENCH_BENCH(kernel_launch);
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/sleep_kernel.cu b/superbench/benchmarks/micro_benchmarks/nvbench/sleep_kernel.cu
new file mode 100644
index 000000000..d486d8ee7
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/sleep_kernel.cu
@@ -0,0 +1,23 @@
+#include <nvbench/nvbench.cuh>
+#include <cuda/std/chrono>
+#include <cuda_runtime.h>
+
+__global__ void sleep_kernel(nvbench::int64_t microseconds) {
+  // Spin until `microseconds` have elapsed, measured with cuda::std::chrono's high_resolution_clock.
+  using hr_clock = cuda::std::chrono::high_resolution_clock;
+  const auto deadline = hr_clock::now() + cuda::std::chrono::microseconds(microseconds);
+
+  while (hr_clock::now() < deadline) {
+    // busy wait
+  }
+}
+
+void sleep_benchmark(nvbench::state &state) {  // benchmark body registered with NVBENCH_BENCH below
+  const auto duration_us = state.get_int64("Duration (us)");  // value of the "Duration (us)" axis for this state
+  state.exec([&duration_us](nvbench::launch &launch) {
+    sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(duration_us);  // 1 block x 1 thread on the stream from `launch`
+  });
+}
+NVBENCH_BENCH(sleep_benchmark)
+    .add_int64_axis("Duration (us)", nvbench::range(0, 100, 5))  // declares the axis read via get_int64 above
+    .set_timeout(1); // Limit to one second per measurement.
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
new file mode 100644
index 000000000..2171a8b6b
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
@@ -0,0 +1,262 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Module of the NVBench Sleep Kernel benchmark."""
+
+import re
+
+from superbench.common.utils import logger
+from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
+from superbench.benchmarks.micro_benchmarks.micro_base import MicroBenchmarkWithInvoke
+
+
+class NvbenchSleepKernel(MicroBenchmarkWithInvoke):
+    """The NVBench Sleep Kernel benchmark class."""
+
+    def __init__(self, name, parameters=''):
+        """Constructor.
+
+        Args:
+            name (str): benchmark name.
+            parameters (str): benchmark parameters.
+        """
+        super().__init__(name, parameters)
+
+        self._bin_name = 'nvbench_sleep_kernel'
+
+    def add_parser_arguments(self):
+        """Add the specified arguments."""
+        super().add_parser_arguments()
+
+        self._parser.add_argument(
+            '--devices',
+            type=str,
+            default=None,
+            help='Device list to run the benchmark, e.g., "0,1,2,3" or "all".',
+        )
+        self._parser.add_argument(
+            '--duration_us',
+            type=str,
+            default='[0,25,50,75,100]',
+            help='Duration axis values in microseconds, e.g., "[0,25,50,75,100]".',
+        )
+        self._parser.add_argument(
+            '--skip_time',
+            type=float,
+            default=-1.0,
+            help='Skip time in seconds.',
+        )
+        self._parser.add_argument(
+            '--throttle_threshold',
+            type=float,
+            default=75.0,
+            help='Throttle threshold percentage.',
+        )
+        self._parser.add_argument(
+            '--throttle_recovery_delay',
+            type=float,
+            default=0.05,
+            help='Throttle recovery delay in seconds.',
+        )
+        self._parser.add_argument(
+            '--run_once',
+            action='store_true',
+            help='Run once flag.',
+        )
+        self._parser.add_argument(
+            '--disable_blocking_kernel',
+            action='store_true',
+            help='Disable blocking kernel flag.',
+        )
+        self._parser.add_argument(
+            '--profile',
+            action='store_true',
+            help='Enable profiling flag.',
+        )
+        self._parser.add_argument(
+            '--timeout',
+            type=int,
+            default=15,
+            help='Timeout in seconds.',
+        )
+        self._parser.add_argument(
+            '--min_samples',
+            type=int,
+            default=None,
+            help='Minimum number of samples.',
+        )
+        self._parser.add_argument(
+            '--stopping_criterion',
+            type=str,
+            default='stdrel',
+            choices=['stdrel', 'entropy'],
+            help='Stopping criterion.',
+        )
+        self._parser.add_argument(
+            '--min_time',
+            type=float,
+            default=None,
+            help='Minimum time for stdrel stopping criterion.',
+        )
+        self._parser.add_argument(
+            '--max_noise',
+            type=float,
+            default=None,
+            help='Maximum noise for stdrel stopping criterion.',
+        )
+        self._parser.add_argument(
+            '--max_angle',
+            type=float,
+            default=None,
+            help='Maximum angle for entropy stopping criterion.',
+        )
+        self._parser.add_argument(
+            '--min_r2',
+            type=float,
+            default=None,
+            help='Minimum R-squared for entropy stopping criterion.',
+        )
+
+    def _preprocess(self):
+        """Preprocess/preparation operations before the benchmarking.
+
+        Return:
+            True if _preprocess() succeed.
+        """
+        if not super()._preprocess():
+            return False
+
+        command = str(self._args.bin_dir / self._bin_name)
+        parts = [command]
+
+        # Basic configuration
+        if self._args.devices is not None:
+            if self._args.devices == 'all':
+                parts.extend(['--devices', 'all'])
+            else:
+                parts.extend(['--devices', self._args.devices])
+
+        # Duration axis
+        parts.extend(['--axis', f'"Duration (us)={self._args.duration_us}"'])
+
+        # Performance configuration
+        if self._args.skip_time >= 0:
+            parts.extend(['--skip-time', str(self._args.skip_time)])
+        parts.extend(['--throttle-threshold', str(self._args.throttle_threshold)])
+        parts.extend(['--throttle-recovery-delay', str(self._args.throttle_recovery_delay)])
+        if self._args.run_once:
+            parts.append('--run-once')
+        if self._args.disable_blocking_kernel:
+            parts.append('--disable-blocking-kernel')
+        if self._args.profile:
+            parts.append('--profile')
+
+        # Stopping criteria
+        if self._args.timeout is not None:
+            parts.extend(['--timeout', str(self._args.timeout)])
+        if self._args.min_samples is not None:
+            parts.extend(['--min-samples', str(self._args.min_samples)])
+        if self._args.stopping_criterion:
+            parts.extend(['--stopping-criterion', self._args.stopping_criterion])
+            if self._args.stopping_criterion == 'stdrel':
+                if self._args.min_time is not None:
+                    parts.extend(['--min-time', str(self._args.min_time)])
+                if self._args.max_noise is not None:
+                    parts.extend(['--max-noise', str(self._args.max_noise)])
+            elif self._args.stopping_criterion == 'entropy':
+                if self._args.max_angle is not None:
+                    parts.extend(['--max-angle', str(self._args.max_angle)])
+                if self._args.min_r2 is not None:
+                    parts.extend(['--min-r2', str(self._args.min_r2)])
+
+        # finalize command
+        self._commands = [' '.join(parts)]
+        return True
+
+    def _process_raw_result(self, cmd_idx, raw_output):
+        """Function to parse raw results and save the summarized results.
+
+        self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
+
+        Args:
+            cmd_idx (int): the index of command corresponding with the raw_output.
+            raw_output (str): raw output string of the micro-benchmark.
+
+        Return:
+            True if the raw output string is valid and result can be extracted.
+        """
+        logger.debug(f"Processing raw result for command index {cmd_idx}.")
+        logger.debug(f"Raw output:\n{raw_output}")
+
+        self._result.add_raw_data(f'raw_output_{cmd_idx}', raw_output, self._args.log_raw_data)
+        try:
+            gpu_section = r"### \[(\d+)\] NVIDIA"
+            row_pat = (
+                r"\|\s*(\d+)\s*\|\s*(\d+)x\s*\|\s*([\d.]+ ?[mun]?s)\s*\|\s*([\d.]+%)\s*\|\s*"
+                r"([\d.]+ ?[mun]?s)\s*\|\s*([\d.]+%)\s*\|\s*(\d+)x\s*\|\s*([\d.]+ ?[mun]?s)\s*\|"
+            )
+            current = None
+            parsed_any = False
+            for line in raw_output.splitlines():
+                line = line.strip()
+                logger.debug(f"Processing line: {line}")
+                g = re.match(gpu_section, line)
+                if g:
+                    current = f"gpu_{g.group(1)}"
+                    logger.debug(f"Found GPU section: {current}")
+                    continue
+                r = re.match(row_pat, line)
+                if r and current:
+                    logger.debug(f"Matched row: {r.groups()}")
+                    duration_us, samples, cpu_time, cpu_noise, gpu_time, gpu_noise, batch_samples, batch_gpu = r.groups()
+                    self._result.add_result(f'{current}_duration_us_{duration_us}_samples', int(samples))
+                    self._result.add_result(f'{current}_duration_us_{duration_us}_cpu_time', self._parse_time_value(cpu_time))
+                    self._result.add_result(f'{current}_duration_us_{duration_us}_cpu_noise', self._parse_percentage(cpu_noise))
+                    self._result.add_result(f'{current}_duration_us_{duration_us}_gpu_time', self._parse_time_value(gpu_time))
+                    self._result.add_result(f'{current}_duration_us_{duration_us}_gpu_noise', self._parse_percentage(gpu_noise))
+                    self._result.add_result(f'{current}_duration_us_{duration_us}_batch_samples', int(batch_samples))
+                    self._result.add_result(f'{current}_duration_us_{duration_us}_batch_gpu', self._parse_time_value(batch_gpu))
+                    parsed_any = True
+            if not parsed_any:
+                raise RuntimeError("No valid rows parsed")
+        except Exception as e:
+            logger.error(f"Error processing raw result: {e}")
+            self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
+            return False
+        return True
+
+    def _parse_time_value(self, time_str):
+        """Convert an NVBench time string into a microsecond value.
+
+        Args:
+            time_str (str): Time string like "25.321 us", "1.234 ms", etc.
+
+        Returns:
+            float: Time value in microseconds.
+        """
+        text = time_str.strip()
+        # Two-letter suffixes are tested before the bare 's' so "us"/"ms"/"ns" are not read as seconds.
+        if text.endswith('us'):
+            return float(text[:-2].strip())
+        if text.endswith('ms'):
+            return float(text[:-2].strip()) * 1000
+        if text.endswith('ns'):
+            return float(text[:-2].strip()) / 1000
+        if text.endswith('s'):
+            return float(text[:-1].strip()) * 1000000
+        # No recognised unit suffix: the number is taken to be microseconds already.
+        return float(text)
+
+    def _parse_percentage(self, percent_str):
+        """Parse a percentage string to float, tolerating padding and a missing '%' sign.
+
+        Args:
+            percent_str (str): Percentage string like "0.93%".
+
+        Returns:
+            float: Percentage value, e.g. 0.93 for "0.93%".
+        """
+        return float(percent_str.strip().rstrip('%').strip())
+
+
+BenchmarkRegistry.register_benchmark('nvbench-sleep-kernel', NvbenchSleepKernel, platform=Platform.CUDA)
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py b/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
new file mode 100644
index 000000000..2784a007c
--- /dev/null
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
@@ -0,0 +1,110 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Tests for nvbench sleep kernel benchmark."""
+
+import unittest
+
+from tests.helper import decorator
+from tests.helper.testcase import BenchmarkTestCase
+from superbench.benchmarks import BenchmarkRegistry, ReturnCode, Platform
+
+
+class TestNvbenchSleepKernelBenchmark(BenchmarkTestCase, unittest.TestCase):
+    """Test class for NVBench Sleep Kernel benchmark."""
+
+    @classmethod
+    def setUpClass(cls):
+        """Hook method for setting up class fixture before running tests in the class."""
+        super().setUpClass()
+        cls.createMockEnvs(cls)
+        cls.createMockFiles(cls, ['bin/nvbench_sleep_kernel'])
+
+    def test_nvbench_sleep_kernel_preprocess(self):
+        """Test NVBench Sleep Kernel benchmark preprocess."""
+        benchmark_name = 'nvbench-sleep-kernel'
+        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        # Test preprocess with default parameters
+        benchmark = benchmark_class(benchmark_name, parameters='')
+        assert benchmark._preprocess()
+        assert benchmark.return_code == ReturnCode.SUCCESS
+
+        # Test preprocess with specified parameters
+        parameters = (
+            '--devices 0 '
+            '--duration_us "[10,25,50,75]" '
+            '--timeout 20 '
+            '--min_samples 300 '
+            '--stopping_criterion stdrel '
+            '--min_time 2.0 '
+            '--max_noise 0.5 '
+            '--throttle_threshold 80.0 '
+            '--throttle_recovery_delay 1.0'
+        )
+        benchmark = benchmark_class(benchmark_name, parameters=parameters)
+        assert benchmark._preprocess()
+        assert benchmark.return_code == ReturnCode.SUCCESS
+
+        # Check command
+        assert (1 == len(benchmark._commands))
+        assert ('--devices 0' in benchmark._commands[0])
+        assert ('--axis "Duration (us)=[10,25,50,75]"' in benchmark._commands[0])
+        assert ('--timeout 20' in benchmark._commands[0])
+        assert ('--min-samples 300' in benchmark._commands[0])
+        assert ('--stopping-criterion stdrel' in benchmark._commands[0])
+        assert ('--min-time 2.0' in benchmark._commands[0])
+        assert ('--max-noise 0.5' in benchmark._commands[0])
+        assert ('--throttle-threshold 80.0' in benchmark._commands[0])
+        assert ('--throttle-recovery-delay 1.0' in benchmark._commands[0])
+
+    @decorator.load_data('tests/data/nvbench_sleep_kernel.log')
+    def test_nvbench_sleep_kernel_result_parsing_real_output(self, results):
+        """Test NVBench Sleep Kernel benchmark result parsing."""
+        benchmark_name = 'nvbench-sleep-kernel'
+        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        benchmark = benchmark_class(benchmark_name, parameters='')
+
+        # Preprocess and validate command
+        assert benchmark._preprocess()
+
+        # Parse the provided raw output
+        assert benchmark._process_raw_result(0, results)
+        assert benchmark.return_code == ReturnCode.SUCCESS
+
+        # Validate parsed results
+        assert benchmark.result['gpu_0_duration_us_25_samples'][0] == 10175
+        assert benchmark.result['gpu_0_duration_us_25_cpu_time'][0] == 42.123
+        assert benchmark.result['gpu_0_duration_us_25_cpu_noise'][0] == 69.78
+        assert benchmark.result['gpu_0_duration_us_25_gpu_time'][0] == 25.321
+        assert benchmark.result['gpu_0_duration_us_25_gpu_noise'][0] == 0.93
+        assert benchmark.result['gpu_0_duration_us_25_batch_samples'][0] == 17448
+        assert benchmark.result['gpu_0_duration_us_25_batch_gpu'][0] == 23.456
+
+        assert benchmark.result['gpu_0_duration_us_50_samples'][0] == 8187
+        assert benchmark.result['gpu_0_duration_us_75_samples'][0] == 6279
+
+    def test_nvbench_sleep_kernel_process_raw_result_invalid_output(self):
+        """Test NVBench Sleep Kernel benchmark result parsing with invalid output."""
+        benchmark_name = 'nvbench-sleep-kernel'
+        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        benchmark = benchmark_class(benchmark_name, parameters='')
+
+        # Preprocess and validate command
+        assert benchmark._preprocess()
+
+        # Mock raw output with invalid format
+        raw_output = "Invalid output format"
+
+        # Parse the provided raw output
+        assert not benchmark._process_raw_result(0, raw_output)
+        assert benchmark.return_code == ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/data/nvbench_sleep_kernel.log b/tests/data/nvbench_sleep_kernel.log
new file mode 100644
index 000000000..b2c4037da
--- /dev/null
+++ b/tests/data/nvbench_sleep_kernel.log
@@ -0,0 +1,43 @@
+# Devices
+
+## [0] `NVIDIA GPU`
+* SM Version: 900 (PTX Version: 900)
+* Number of SMs: 100
+* SM Default Clock Rate: 800 MHz
+* Global Memory: 100000 MiB Free / 100000 MiB Total
+* Global Memory Bus Peak: 3000 GB/sec (3000-bit DDR @4000MHz)
+* Max Shared Memory: 100 KiB/SM, 20 KiB/Block
+* L2 Cache Size: 1000 KiB
+* Maximum Active Blocks: 10/SM
+* Maximum Active Threads: 500/SM, 300/Block
+* Available Registers: 500/SM, 500/Block
+* ECC Enabled: Yes
+
+# Log
+
+```
+Run:  [1/3] sleep_benchmark [Device=0 Duration (us)=25]
+Warn: Current measurement timed out (1.00s) while over noise threshold (0.93% > 0.50%)
+Warn: Current measurement timed out (1.00s) before accumulating min_time (0.31s < 0.50s)
+Pass: Cold: 0.030374ms GPU, 0.047379ms CPU, 0.31s total GPU, 1.00s total wall, 10175x 
+Pass: Batch: 0.028658ms GPU, 0.50s total GPU, 0.50s total wall, 17448x
+Run:  [2/3] sleep_benchmark [Device=0 Duration (us)=50]
+Warn: Current measurement timed out (1.00s) before accumulating min_time (0.45s < 0.50s)
+Pass: Cold: 0.055036ms GPU, 0.072054ms CPU, 0.45s total GPU, 1.00s total wall, 8187x 
+Pass: Batch: 0.053246ms GPU, 0.50s total GPU, 0.50s total wall, 9403x
+Run:  [3/3] sleep_benchmark [Device=0 Duration (us)=75]
+Pass: Cold: 0.079643ms GPU, 0.096788ms CPU, 0.50s total GPU, 0.92s total wall, 6279x 
+Pass: Batch: 0.077862ms GPU, 0.51s total GPU, 0.51s total wall, 6547x
+```
+
+# Benchmark Results
+
+## sleep_benchmark
+
+### [0] NVIDIA GPU
+
+| Duration (us) | Samples | CPU Time  | Noise  | GPU Time  | Noise | Samples | Batch GPU |
+|---------------|---------|-----------|--------|-----------|-------|---------|-----------|
+|            25 |  10175x | 42.123 us | 69.78% | 25.321 us | 0.93% |  17448x | 23.456 us |
+|            50 |   8187x | 68.456 us |  2.34% | 50.654 us | 0.45% |   9403x | 49.321 us |
+|            75 |   6279x | 90.789 us |  1.85% | 75.987 us | 0.33% |   6547x | 77.862 us |
\ No newline at end of file
diff --git a/third_party/Makefile b/third_party/Makefile
index 465b5d6e4..5f16bbce2 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -305,12 +305,19 @@ nvbandwidth: sb_micro_path
 
 # Build nvbench
 nvbench: sb_micro_path
-	cd ./nvbench && mkdir -p build && cd build && \
-	cmake                                         \
-	-DCMAKE_INSTALL_PREFIX=$(SB_MICRO_PATH)       \
-	-DCMAKE_CUDA_ARCHITECTURES=100                \
-	-DNVBench_ENABLE_CUPTI=ON                     \
-	-DCMAKE_BUILD_TYPE=Release                    \
-	.. &&                                         \
-	make -j $(NUM_MAKE_JOBS) &&                   \
-	make install
+ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
+    $(eval ARCHS := "90;100")
+else ifeq ($(shell echo $(CUDA_VER)">=11.8" | bc -l), 1)
+    $(eval ARCHS := "70;75;80;86;89;90")
+else
+    $(eval ARCHS := "70;75;80;86")
+endif
+    cd ./nvbench && mkdir -p build && cd build && \
+    cmake                                         \
+    -DCMAKE_INSTALL_PREFIX=$(SB_MICRO_PATH)       \
+    -DCMAKE_CUDA_ARCHITECTURES=$(ARCHS)           \
+    -DNVBench_ENABLE_CUPTI=ON                     \
+    -DCMAKE_BUILD_TYPE=Release                    \
+    .. &&                                         \
+    make -j $(NUM_MAKE_JOBS) &&                   \
+    make install

From bd87f50d96b9accd6ed1b5005d9f3ab8a6771d6e Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Wed, 8 Oct 2025 02:49:48 +0000
Subject: [PATCH 04/37] test sleep kernel

---
 .../micro_benchmarks/nvbench_sleep_kernel.py  |  3 +-
 third_party/Makefile                          | 42 +++++++++----------
 2 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
index 2171a8b6b..a1a1d55ad 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
@@ -4,6 +4,7 @@
 """Module of the NVBench Sleep Kernel benchmark."""
 
 import re
+import os
 
 from superbench.common.utils import logger
 from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
@@ -126,7 +127,7 @@ def _preprocess(self):
         if not super()._preprocess():
             return False
 
-        command = str(self._args.bin_dir / self._bin_name)
+        command = os.path.join(self._args.bin_dir, self._bin_name)
         parts = [command]
 
         # Basic configuration
diff --git a/third_party/Makefile b/third_party/Makefile
index c73ff7f73..df077eb45 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -192,7 +192,7 @@ endif
 cpu_hpl: sb_micro_path
 ifneq (,$(wildcard hpl-tests/Makefile))
 	cd ./hpl-tests && \
-    wget https://netlib.org/benchmark/hpl/hpl-2.3.tar.gz && \
+	wget https://netlib.org/benchmark/hpl/hpl-2.3.tar.gz && \
 	tar xzf hpl-2.3.tar.gz && \
 	cp Make.Linux_zen3 hpl-2.3 && \
 	cp Make.Linux_zen4 hpl-2.3 && \
@@ -208,7 +208,7 @@ endif
 cpu_stream: sb_micro_path
 ifneq (,$(wildcard stream-tests/Makefile))
 	cd ./stream-tests && \
-    wget https://www.cs.virginia.edu/stream/FTP/Code/stream.c && \
+	wget https://www.cs.virginia.edu/stream/FTP/Code/stream.c && \
 	make all
 	cp -v ./stream-tests/stream* $(SB_MICRO_PATH)/bin/
 endif
@@ -243,10 +243,10 @@ rocm_megatron_lm:
 	fi
 	cp Megatron/rocm/Megatron-LM/examples/deepseek_v2/pretrain_deepseek.py Megatron/rocm/Megatron-LM/
 	git clone https://github.com/caaatch22/grouped_gemm.git &&\
-    	cd grouped_gemm &&\
-    	git checkout 8a9b438 &&\
-    	git submodule update --init --recursive &&\
-    	pip install .
+		cd grouped_gemm &&\
+		git checkout 8a9b438 &&\
+		git submodule update --init --recursive &&\
+		pip install .
 
 # Instal apex of ROCm due to dependency of Megatron
 apex_rocm:
@@ -264,7 +264,7 @@ apex_rocm:
 	elif [ "$$(expr $(TORCH_MAJOR_VERSION) == 2)" -eq 1 ] && [ "$$(expr $(TORCH_MINOR_VERSION) == 0)" -eq 1 ]; then \
 		git checkout release/1.0.0 ; \
 	elif [ "$$(expr $(TORCH_MAJOR_VERSION) == 1)" -eq 1 ]; then \
-	    git checkout release/1.0.0 ; \
+		git checkout release/1.0.0 ; \
 	fi
 	pip install -v --disable-pip-version-check --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./apex
 
@@ -276,11 +276,11 @@ ifeq ($(shell echo $(CUDA_VER)">=12.9" | bc -l), 1)
 	git clone --single-branch --branch main https://github.com/Azure/msccl.git \
 	&& git -C msccl checkout 87048bd && git -C msccl submodule update --recursive --init
 else ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
-    # Get commit 87048bd from msscl to support updated nccl and sm_100
+	# Get commit 87048bd from msscl to support updated nccl and sm_100
 	$(eval ARCHS := 75 80 86 89 90 100)
 	if [ -d msccl ]; then rm -rf msccl; fi; \
 	git clone --single-branch --branch main https://github.com/Azure/msccl.git \
-    && git -C msccl checkout 87048bd && git -C msccl submodule update --recursive --init
+	&& git -C msccl checkout 87048bd && git -C msccl submodule update --recursive --init
 else ifeq ($(shell echo $(CUDA_VER)">=11.8" | bc -l), 1)
 	$(eval ARCHS := 70 75 80 86 89 90)
 else
@@ -316,18 +316,18 @@ nvbandwidth: sb_micro_path
 # Build nvbench
 nvbench: sb_micro_path
 ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
-    $(eval ARCHS := "90;100")
+	$(eval ARCHS := "90;100")
 else ifeq ($(shell echo $(CUDA_VER)">=11.8" | bc -l), 1)
-    $(eval ARCHS := "70;75;80;86;89;90")
+	$(eval ARCHS := "70;75;80;86;89;90")
 else
-    $(eval ARCHS := "70;75;80;86")
+	$(eval ARCHS := "70;75;80;86")
 endif
-    cd ./nvbench && mkdir -p build && cd build && \
-    cmake                                         \
-    -DCMAKE_INSTALL_PREFIX=$(SB_MICRO_PATH)       \
-    -DCMAKE_CUDA_ARCHITECTURES=$(ARCHS)           \
-    -DNVBench_ENABLE_CUPTI=ON                     \
-    -DCMAKE_BUILD_TYPE=Release                    \
-    .. &&                                         \
-    make -j $(NUM_MAKE_JOBS) &&                   \
-    make install
+	cd ./nvbench && mkdir -p build && cd build && \
+	cmake										 \
+	-DCMAKE_INSTALL_PREFIX=$(SB_MICRO_PATH)	   \
+	-DCMAKE_CUDA_ARCHITECTURES=$(ARCHS)		   \
+	-DNVBench_ENABLE_CUPTI=ON					 \
+	-DCMAKE_BUILD_TYPE=Release					\
+	.. &&										 \
+	make -j $(NUM_MAKE_JOBS) &&				   \
+	make install

From a663db6edecf4340a5ba5d8fcbbff750e81dffe9 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Wed, 8 Oct 2025 02:52:39 +0000
Subject: [PATCH 05/37] add sm 103

---
 third_party/Makefile | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/third_party/Makefile b/third_party/Makefile
index df077eb45..508faa88d 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -315,7 +315,9 @@ nvbandwidth: sb_micro_path
 
 # Build nvbench
 nvbench: sb_micro_path
-ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
+ifeq ($(shell echo $(CUDA_VER)">=12.9" | bc -l), 1)
+	$(eval ARCHS := "100;103")
+else ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
 	$(eval ARCHS := "90;100")
 else ifeq ($(shell echo $(CUDA_VER)">=11.8" | bc -l), 1)
 	$(eval ARCHS := "70;75;80;86;89;90")

From 32fe19702041794c675e2374f04752143edcc234 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Wed, 8 Oct 2025 23:54:24 +0000
Subject: [PATCH 06/37] add arg parsing logic

---
 .../micro_benchmarks/nvbench_sleep_kernel.py  | 32 +++++++++++++++++--
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
index a1a1d55ad..2f4fa4222 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
@@ -39,7 +39,8 @@ def add_parser_arguments(self):
             '--duration_us',
             type=str,
             default='[0,25,50,75,100]',
-            help='Duration axis values in microseconds, e.g., "[0,25,50,75,100]".',
+            help='Duration axis values in microseconds. Supports multiple formats: '
+                 '"50" (single value), "[25,50,75]" (list), "[0:10]" (range), "[0:50:10]" (range with step).',
         )
         self._parser.add_argument(
             '--skip_time',
@@ -118,6 +119,30 @@ def add_parser_arguments(self):
             help='Minimum R-squared for entropy stopping criterion.',
         )
 
+    def _parse_duration_format(self, duration_str):
+        """Normalize the duration specification for the NVBench ``--axis`` option.
+
+        Surrounding whitespace and one pair of matching outer quotes are removed; the
+        value itself ("50", "[25,50,75]", "[0:10]", "[0:50:10]") is passed through unvalidated.
+
+        Args:
+            duration_str (str): Duration specification; non-string values are returned as str(value).
+
+        Returns:
+            str: Duration string to embed in the --axis parameter.
+        """
+        if not isinstance(duration_str, str):
+            return str(duration_str)
+
+        duration_str = duration_str.strip()
+
+        # Remove one pair of matching outer quotes (either kind); mismatched quotes are left untouched.
+        if duration_str[:1] in ('"', "'") and duration_str.endswith(duration_str[:1]):
+            # Strip again so padding inside the quotes (e.g. '" 50 "') does not leak into the axis value.
+            duration_str = duration_str[1:-1].strip()
+
+        return duration_str
+
     def _preprocess(self):
         """Preprocess/preparation operations before the benchmarking.
 
@@ -137,8 +162,9 @@ def _preprocess(self):
             else:
                 parts.extend(['--devices', self._args.devices])
 
-        # Duration axis
-        parts.extend(['--axis', f'"Duration (us)={self._args.duration_us}"'])
+        # Duration axis - parse the format properly
+        duration_formatted = self._parse_duration_format(self._args.duration_us)
+        parts.extend(['--axis', f'"Duration (us)={duration_formatted}"'])
 
         # Performance configuration
         if self._args.skip_time >= 0:

From 3eb55253ee2f4194242ab419ddb1336f216635b0 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Thu, 9 Oct 2025 00:02:01 +0000
Subject: [PATCH 07/37] add arg parsing tests

---
 .../test_nvbench_sleep_kernel.py              | 72 +++++++++++++++++++
 1 file changed, 72 insertions(+)

diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py b/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
index 2784a007c..83f4702d5 100644
--- a/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
@@ -87,6 +87,78 @@ def test_nvbench_sleep_kernel_result_parsing_real_output(self, results):
         assert benchmark.result['gpu_0_duration_us_50_samples'][0] == 8187
         assert benchmark.result['gpu_0_duration_us_75_samples'][0] == 6279
 
+    def test_nvbench_sleep_kernel_parse_duration_formats(self):
+        """Test NVBench Sleep Kernel duration format parsing."""
+        benchmark_name = 'nvbench-sleep-kernel'
+        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        benchmark = benchmark_class(benchmark_name, parameters='')
+
+        # Test single value formats
+        assert benchmark._parse_duration_format("50") == "50"
+        assert benchmark._parse_duration_format("100") == "100"
+        
+        # Test quoted single values
+        assert benchmark._parse_duration_format('"50"') == "50"
+        assert benchmark._parse_duration_format("'100'") == "100"
+        
+        # Test list formats
+        assert benchmark._parse_duration_format("[25,50,75]") == "[25,50,75]"
+        assert benchmark._parse_duration_format("[10,20,30,40]") == "[10,20,30,40]"
+        
+        # Test quoted list formats
+        assert benchmark._parse_duration_format('"[25,50,75]"') == "[25,50,75]"
+        assert benchmark._parse_duration_format("'[10,20,30]'") == "[10,20,30]"
+        
+        # Test range formats
+        assert benchmark._parse_duration_format("[25:75]") == "[25:75]"
+        assert benchmark._parse_duration_format("[0:100]") == "[0:100]"
+        
+        # Test range with step formats
+        assert benchmark._parse_duration_format("[0:50:10]") == "[0:50:10]"
+        assert benchmark._parse_duration_format("[10:100:20]") == "[10:100:20]"
+        
+        # Test quoted range formats
+        assert benchmark._parse_duration_format('"[25:75]"') == "[25:75]"
+        assert benchmark._parse_duration_format("'[0:50:10]'") == "[0:50:10]"
+        
+        # Test with whitespace
+        assert benchmark._parse_duration_format(" 50 ") == "50"
+        assert benchmark._parse_duration_format(" [25,50,75] ") == "[25,50,75]"
+        assert benchmark._parse_duration_format(" [25:75] ") == "[25:75]"
+
+    def test_nvbench_sleep_kernel_preprocess_duration_formats(self):
+        """Test NVBench Sleep Kernel preprocess with different duration formats."""
+        benchmark_name = 'nvbench-sleep-kernel'
+        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        # Test single value
+        benchmark = benchmark_class(benchmark_name, parameters='--duration_us "50"')
+        assert benchmark._preprocess()
+        assert '--axis "Duration (us)=50"' in benchmark._commands[0]
+        
+        # Test list format
+        benchmark = benchmark_class(benchmark_name, parameters='--duration_us "[25,50,75]"')
+        assert benchmark._preprocess()
+        assert '--axis "Duration (us)=[25,50,75]"' in benchmark._commands[0]
+        
+        # Test range format
+        benchmark = benchmark_class(benchmark_name, parameters='--duration_us "[25:75]"')
+        assert benchmark._preprocess()
+        assert '--axis "Duration (us)=[25:75]"' in benchmark._commands[0]
+        
+        # Test range with step format
+        benchmark = benchmark_class(benchmark_name, parameters='--duration_us "[0:50:10]"')
+        assert benchmark._preprocess()
+        assert '--axis "Duration (us)=[0:50:10]"' in benchmark._commands[0]
+        
+        # Test default format
+        benchmark = benchmark_class(benchmark_name, parameters='')
+        assert benchmark._preprocess()
+        assert '--axis "Duration (us)=[0,25,50,75,100]"' in benchmark._commands[0]
+
     def test_nvbench_sleep_kernel_process_raw_result_invalid_output(self):
         """Test NVBench Sleep Kernel benchmark result parsing with invalid output."""
         benchmark_name = 'nvbench-sleep-kernel'

From 4785fe6b6c6c91bb38fef0973c8e0ea3544a5527 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Thu, 9 Oct 2025 17:49:29 +0000
Subject: [PATCH 08/37] refactor

---
 .../micro_benchmarks/nvbench_base.py          | 241 ++++++++++++++++++
 .../micro_benchmarks/nvbench_kernel_launch.py | 176 +++----------
 .../micro_benchmarks/nvbench_sleep_kernel.py  | 179 ++-----------
 .../test_nvbench_kernel_launch.py             |   8 +-
 .../test_nvbench_sleep_kernel.py              |  14 +-
 5 files changed, 306 insertions(+), 312 deletions(-)
 create mode 100644 superbench/benchmarks/micro_benchmarks/nvbench_base.py

diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_base.py b/superbench/benchmarks/micro_benchmarks/nvbench_base.py
new file mode 100644
index 000000000..3dba511b4
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_base.py
@@ -0,0 +1,241 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Base class for NVBench benchmarks."""
+
+import os
+import re
+from superbench.common.utils import logger
+from superbench.benchmarks import ReturnCode
+from superbench.benchmarks.micro_benchmarks.micro_base import MicroBenchmarkWithInvoke
+
+
+def parse_time_to_us(raw: str) -> float:
+    """Helper: parse '123.45 us', '678.9 ns', '0.12 ms' → float µs."""
+    raw = raw.strip()
+    if raw.endswith('%'):
+        return float(raw[:-1])
+    # split "value unit" or "valueunit"
+    m = re.match(r'([\d.]+)\s*([mun]?s)?', raw)
+    if not m:
+        return float(raw)
+    val, unit = float(m.group(1)), (m.group(2) or 'us')
+    if unit == 'ns':
+        return val / 1e3
+    if unit == 'ms':
+        return val * 1e3
+    return val
+
+
+class NvbenchBase(MicroBenchmarkWithInvoke):
+    """Base class for NVBench benchmarks with common functionality."""
+
+    def __init__(self, name, parameters=''):
+        """Constructor.
+
+        Args:
+            name (str): benchmark name.
+            parameters (str): benchmark parameters.
+        """
+        super().__init__(name, parameters)
+        # Subclasses should set this
+        self._bin_name = None
+
+    def add_parser_arguments(self):
+        """Add common NVBench arguments."""
+        super().add_parser_arguments()
+
+        # Device configuration
+        self._parser.add_argument(
+            '--devices',
+            type=str,
+            default=None,
+            help='Device list to run the benchmark, e.g., "0,1,2,3" or "all".',
+        )
+
+        # Benchmark Properties
+        self._parser.add_argument(
+            '--skip-time',
+            type=float,
+            default=-1.0,
+            help='Skip time in seconds.',
+        )
+        self._parser.add_argument(
+            '--throttle-threshold',
+            type=float,
+            default=75.0,
+            help='Throttle threshold percentage.',
+        )
+        self._parser.add_argument(
+            '--throttle-recovery-delay',
+            type=float,
+            default=0.05,
+            help='Throttle recovery delay in seconds.',
+        )
+        self._parser.add_argument(
+            '--run-once',
+            action='store_true',
+            help='Run once flag.',
+        )
+        self._parser.add_argument(
+            '--disable-blocking-kernel',
+            action='store_true',
+            help='Disable blocking kernel flag.',
+        )
+        self._parser.add_argument(
+            '--profile',
+            action='store_true',
+            help='Enable profiling flag.',
+        )
+
+        # Stopping Criteria
+        self._parser.add_argument(
+            '--timeout',
+            type=int,
+            default=15,
+            help='Timeout in seconds.',
+        )
+        self._parser.add_argument(
+            '--min-samples',
+            type=int,
+            default=10,
+            help='Minimum number of samples.',
+        )
+        self._parser.add_argument(
+            '--stopping-criterion',
+            type=str,
+            default='stdrel',
+            choices=['stdrel', 'entropy'],
+            help='Stopping criterion.',
+        )
+        # stdrel-specific
+        self._parser.add_argument(
+            '--min-time',
+            type=float,
+            default=0.5,
+            help='Minimum time for stdrel stopping criterion.',
+        )
+        self._parser.add_argument(
+            '--max-noise',
+            type=float,
+            default=0.5,
+            help='Maximum noise for stdrel stopping criterion.',
+        )
+        # entropy-specific
+        self._parser.add_argument(
+            '--max-angle',
+            type=float,
+            default=0.048,
+            help='Maximum angle for entropy stopping criterion.',
+        )
+        self._parser.add_argument(
+            '--min-r2',
+            type=float,
+            default=0.36,
+            help='Minimum R-squared for entropy stopping criterion.',
+        )
+
+    def _build_base_command(self):
+        """Build the base nvbench command with common arguments.
+        
+        Returns:
+            list: Command parts that can be extended by subclasses.
+        """
+        if not self._bin_name:
+            raise ValueError("Subclass must set _bin_name")
+            
+        command = os.path.join(self._args.bin_dir, self._bin_name)
+        parts = [command]
+
+        # Device configuration
+        if self._args.devices is not None:
+            if self._args.devices == 'all':
+                parts.extend(['--devices', 'all'])
+            else:
+                parts.extend(['--devices', self._args.devices])
+
+        # Benchmark Properties
+        if hasattr(self._args, 'skip_time') and self._args.skip_time >= 0:
+            parts.extend(['--skip-time', str(self._args.skip_time)])
+        if hasattr(self._args, 'throttle_threshold') and self._args.throttle_threshold > 0:
+            parts.extend(['--throttle-threshold', str(self._args.throttle_threshold)])
+        if hasattr(self._args, 'throttle_recovery_delay') and self._args.throttle_recovery_delay > 0:
+            parts.extend(['--throttle-recovery-delay', str(self._args.throttle_recovery_delay)])
+        if hasattr(self._args, 'run_once') and self._args.run_once:
+            parts.append('--run-once')
+        if hasattr(self._args, 'disable_blocking_kernel') and self._args.disable_blocking_kernel:
+            parts.append('--disable-blocking-kernel')
+        if hasattr(self._args, 'profile') and self._args.profile:
+            parts.append('--profile')
+
+        # Stopping criteria
+        if hasattr(self._args, 'timeout') and self._args.timeout is not None:
+            parts.extend(['--timeout', str(self._args.timeout)])
+        if hasattr(self._args, 'min_samples') and self._args.min_samples is not None:
+            parts.extend(['--min-samples', str(self._args.min_samples)])
+        if hasattr(self._args, 'stopping_criterion') and self._args.stopping_criterion:
+            parts.extend(['--stopping-criterion', self._args.stopping_criterion])
+            if self._args.stopping_criterion == 'stdrel':
+                if hasattr(self._args, 'min_time') and self._args.min_time is not None:
+                    parts.extend(['--min-time', str(self._args.min_time)])
+                if hasattr(self._args, 'max_noise') and self._args.max_noise is not None:
+                    parts.extend(['--max-noise', str(self._args.max_noise)])
+            elif self._args.stopping_criterion == 'entropy':
+                if hasattr(self._args, 'max_angle') and self._args.max_angle is not None:
+                    parts.extend(['--max-angle', str(self._args.max_angle)])
+                if hasattr(self._args, 'min_r2') and self._args.min_r2 is not None:
+                    parts.extend(['--min-r2', str(self._args.min_r2)])
+
+        return parts
+
+    def _preprocess(self):
+        """Default preprocess implementation. Can be overridden by subclasses.
+        
+        Returns:
+            True if _preprocess() succeed.
+        """
+        if not super()._preprocess():
+            return False
+
+        # Build base command - subclasses can override this method to add specific arguments
+        parts = self._build_base_command()
+        
+        # Finalize command
+        self._commands = [' '.join(parts)]
+        return True
+
+    def _parse_time_value(self, time_str):
+        """Parse time string to microseconds.
+        
+        Args:
+            time_str (str): Time string like "123.45 us", "678.9 ns", etc.
+            
+        Returns:
+            float: Time in microseconds.
+        """
+        return parse_time_to_us(time_str)
+
+    def _parse_percentage(self, percent_str):
+        """Parse percentage string to float.
+        
+        Args:
+            percent_str (str): Percentage string like "12.34%"
+            
+        Returns:
+            float: Percentage value as float.
+        """
+        if isinstance(percent_str, str) and percent_str.endswith('%'):
+            return float(percent_str[:-1])
+        return float(percent_str)
+
+    def _handle_parsing_error(self, error_msg, raw_output):
+        """Handle parsing errors consistently.
+        
+        Args:
+            error_msg (str): Error message to log.
+            raw_output (str): Raw output that failed to parse.
+        """
+        self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
+        logger.error(
+            f"Invalid result format - round:{self._curr_run_index}, bench:{self._name}, msg:{error_msg}\n{raw_output}"
+        )
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py b/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
index 74cf94383..3ff8fdd07 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
@@ -1,139 +1,39 @@
-import os
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Module of the NVBench Kernel Launch benchmark."""
+
 import re
 from superbench.common.utils import logger
 from superbench.benchmarks import BenchmarkRegistry, ReturnCode, Platform
-from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
+from superbench.benchmarks.micro_benchmarks.nvbench_base import NvbenchBase
 
-def parse_time_to_us(raw: str) -> float:
-    """Helper: parse '123.45 us', '678.9 ns', '0.12 ms' → float µs."""
-    raw = raw.strip()
-    if raw.endswith('%'):
-        return float(raw[:-1])
-    # split “value unit” or “valueunit”
-    m = re.match(r'([\d.]+)\s*([mun]?s)?', raw)
-    if not m:
-        return float(raw)
-    val, unit = float(m.group(1)), (m.group(2) or 'us')
-    if unit == 'ns':  return val / 1e3
-    if unit == 'ms':  return val * 1e3
-    return val
 
-class NvbenchKernelLaunch(MicroBenchmarkWithInvoke):
-    """Nvbench benchmark wrapper for SuperBench."""
-    def __init__(self, name, parameters=None):
-        """Initialize the benchmark."""
-        super().__init__(name, parameters)
-        self._bin_name = "nvbench_kernel_launch"
-    
-    def add_parser_arguments(self):
-        """
-        Add NVBench CLI options (excluding Queries, Device modification, Output, Benchmark / Axis Specification):
-          - Benchmark Properties (Omit --run-once, --disable-blocking-kernel, --profile)
-          - Stopping Criteria
-        """
-        super().add_parser_arguments()
+class NvbenchKernelLaunch(NvbenchBase):
+    """The NVBench Kernel Launch benchmark class."""
 
-        # Benchmark Properties
-        self._parser.add_argument(
-            '--devices', '--device', '-d', type=str, default=None,
-            help='Limit execution to one or more device IDs (comma-separated or "all").'
-        )
-        self._parser.add_argument(
-            '--skip-time', type=float, default=-1.0,
-            help='Skip a measurement when a warmup run executes in less than this (seconds).'
-        )
-        # With a threshold >0 and a recovery delay >0, NVBench will automatically pause 
-        # and “wait for the card to warm up” back to a stable high‐clock state, giving you 
-        # more consistent (and lower) timings that reflect the GPU’s true peak performance.
-        self._parser.add_argument(
-            '--throttle-threshold', type=float, default=75.0,
-            help="GPU throttle threshold as percent of default clock rate. Disabled when nvbench::exec_tag::sync is used."
-        )
-        self._parser.add_argument(
-            '--throttle-recovery-delay', type=float, default=0.05,
-            help='Seconds to wait after throttle before resuming. '
-            'Disabled when nvbench::exec_tag::sync is used.'
-        )
+    def __init__(self, name, parameters=''):
+        """Constructor.
 
-        # Stopping Criteria
-        self._parser.add_argument(
-            '--timeout', type=int, default=15,
-            help='Walltime timeout in seconds for each measurement.'
-        )
-        self._parser.add_argument(
-            '--min-samples', type=int, default=10,
-            help='Minimum number of samples per measurement before checking other criteria.'
-        )
-        self._parser.add_argument(
-            '--stopping-criterion', type=str, default='stdrel',
-            choices=['stdrel', 'entropy'],
-            help='Stopping criterion to use after --min-samples is satisfied: '
-            '"stdrel" or "entropy".'
-        )
-        # stdrel-specific
-        self._parser.add_argument(
-            '--min-time', type=float, default=0.5, 
-            help='(stdrel) Minimum execution time accumulated per measurement (seconds).'
-        )
-        self._parser.add_argument(
-            '--max-noise', type=float, default=0.5,
-            help='(stdrel) Maximum relative standard deviation (%) before stopping.'
-        )
-        # entropy-specific
-        self._parser.add_argument(
-            '--max-angle', type=float, default=0.048,
-            help='(entropy) Maximum linear regression angle of cumulative entropy.'
-        )
-        self._parser.add_argument(
-            '--min-r2', type=float, default=0.36,
-            help='(entropy) Minimum coefficient of determination (R²) for linear regression of cumulative entropy.'
-        )
-
-    def _preprocess(self):
-        """Preprocess/preparation operations before the benchmarking.
-        Return:
-            True if _preprocess() succeed.
+        Args:
+            name (str): benchmark name.
+            parameters (str): benchmark parameters.
         """
-        if not super()._preprocess():
-            return False
-
-        bin_path = os.path.join(self._args.bin_dir, self._bin_name)
-        parts = [bin_path]
+        super().__init__(name, parameters)
+        self._bin_name = 'nvbench_kernel_launch'
 
-        # Benchmark Properties (all optional)
-        if self._args.devices:
-            parts.extend(['--devices', self._args.devices])
-        if self._args.skip_time >= 0:
-            parts.extend(['--skip-time', str(self._args.skip_time)])
-        if self._args.throttle_threshold > 0:
-            parts.extend(['--throttle-threshold', str(self._args.throttle_threshold)])
-        if self._args.throttle_recovery_delay > 0:
-            parts.extend(['--throttle-recovery-delay', str(self._args.throttle_recovery_delay)])
+    def _process_raw_result(self, cmd_idx, raw_output):
+        """Function to parse raw results and save the summarized results.
 
-        # Stopping Criteria (all optional)
-        if self._args.timeout is not None:
-            parts.extend(['--timeout', str(self._args.timeout)])
-        if self._args.min_samples is not None:
-            parts.extend(['--min-samples', str(self._args.min_samples)])
-        if self._args.stopping_criterion:
-            parts.extend(['--stopping-criterion', self._args.stopping_criterion])
-            if self._args.stopping_criterion == 'stdrel':
-                if self._args.min_time is not None:
-                    parts.extend(['--min-time', str(self._args.min_time)])
-                if self._args.max_noise is not None:
-                    parts.extend(['--max-noise', str(self._args.max_noise)])
-            elif self._args.stopping_criterion == 'entropy':
-                if self._args.max_angle is not None:
-                    parts.extend(['--max-angle', str(self._args.max_angle)])
-                if self._args.min_r2 is not None:
-                    parts.extend(['--min-r2', str(self._args.min_r2)])
+        Args:
+            cmd_idx (int): the index of command corresponding with the raw_output.
+            raw_output (str): raw output string of the micro-benchmark.
 
-        # finalize command
-        self._commands = [' '.join(parts)]
-        return True
-
-    def _process_raw_result(self, cmd_idx, raw_output):
+        Return:
+            True if the raw output string is valid and result can be extracted.
+        """
         self._result.add_raw_data(f'raw_output_{cmd_idx}', raw_output, self._args.log_raw_data)
+        
         try:
             gpu_section = r"### \[(\d+)\] NVIDIA"
             row_pat = (
@@ -142,32 +42,34 @@ def _process_raw_result(self, cmd_idx, raw_output):
             )
             current = None
             parsed_any = False  # Track if any valid rows are parsed
+            
             for line in raw_output.splitlines():
                 line = line.strip()
                 g = re.match(gpu_section, line)
                 if g:
                     current = f"gpu_{g.group(1)}"
                     continue
+                    
                 r = re.match(row_pat, line)
                 if r and current:
-                    self._result.add_result(f"{current}_samples", int(r.group(1)))
-                    self._result.add_result(f"{current}_cpu_time", parse_time_to_us(r.group(2)))
-                    self._result.add_result(f"{current}_cpu_noise", float(r.group(3)[:-1]))
-                    self._result.add_result(f"{current}_gpu_time", parse_time_to_us(r.group(4)))
-                    self._result.add_result(f"{current}_gpu_noise", float(r.group(5)[:-1]))
-                    self._result.add_result(f"{current}_batch_samples", int(r.group(6)))
-                    self._result.add_result(f"{current}_batch_gpu_time", parse_time_to_us(r.group(7)))
+                    # self._result.add_result(f"{current}_samples", int(r.group(1)))
+                    self._result.add_result(f"{current}_cpu_time", self._parse_time_value(r.group(2)))
+                    # self._result.add_result(f"{current}_cpu_noise", float(r.group(3)[:-1]))
+                    self._result.add_result(f"{current}_gpu_time", self._parse_time_value(r.group(4)))
+                    # self._result.add_result(f"{current}_gpu_noise", float(r.group(5)[:-1]))
+                    # self._result.add_result(f"{current}_batch_samples", int(r.group(6)))
+                    self._result.add_result(f"{current}_batch_gpu_time", self._parse_time_value(r.group(7)))
                     parsed_any = True
+                    
             if not parsed_any:
                 logger.error("No valid rows parsed from the raw output.")
                 raise RuntimeError("No valid rows parsed")
+                
         except Exception as e:
-            self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
-            logger.error(
-                f"Invalid result format - round:{self._curr_run_index}, bench:{self._name}, msg:{e}\n{raw_output}"
-            )
+            self._handle_parsing_error(str(e), raw_output)
             return False
+            
         return True
 
-# Register the benchmark
-BenchmarkRegistry.register_benchmark("nvbench-kernel-launch", NvbenchKernelLaunch, platform=Platform.CUDA)
\ No newline at end of file
+
+BenchmarkRegistry.register_benchmark('nvbench-kernel-launch', NvbenchKernelLaunch, platform=Platform.CUDA)
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
index 2f4fa4222..bc154d8b1 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
@@ -8,10 +8,10 @@
 
 from superbench.common.utils import logger
 from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
-from superbench.benchmarks.micro_benchmarks.micro_base import MicroBenchmarkWithInvoke
+from superbench.benchmarks.micro_benchmarks.nvbench_base import NvbenchBase
 
 
-class NvbenchSleepKernel(MicroBenchmarkWithInvoke):
+class NvbenchSleepKernel(NvbenchBase):
     """The NVBench Sleep Kernel benchmark class."""
 
     def __init__(self, name, parameters=''):
@@ -26,97 +26,16 @@ def __init__(self, name, parameters=''):
         self._bin_name = 'nvbench_sleep_kernel'
 
     def add_parser_arguments(self):
-        """Add the specified arguments."""
+        """Add sleep-kernel specific arguments."""
         super().add_parser_arguments()
 
-        self._parser.add_argument(
-            '--devices',
-            type=str,
-            default=None,
-            help='Device list to run the benchmark, e.g., "0,1,2,3" or "all".',
-        )
+        # Sleep-kernel specific argument
         self._parser.add_argument(
             '--duration_us',
             type=str,
             default='[0,25,50,75,100]',
             help='Duration axis values in microseconds. Supports multiple formats: '
-                 '"50" (single value), "[25,50,75]" (list), "[0:10]" (range), "[0:50:10]" (range with step).',
-        )
-        self._parser.add_argument(
-            '--skip_time',
-            type=float,
-            default=-1.0,
-            help='Skip time in seconds.',
-        )
-        self._parser.add_argument(
-            '--throttle_threshold',
-            type=float,
-            default=75.0,
-            help='Throttle threshold percentage.',
-        )
-        self._parser.add_argument(
-            '--throttle_recovery_delay',
-            type=float,
-            default=0.05,
-            help='Throttle recovery delay in seconds.',
-        )
-        self._parser.add_argument(
-            '--run_once',
-            action='store_true',
-            help='Run once flag.',
-        )
-        self._parser.add_argument(
-            '--disable_blocking_kernel',
-            action='store_true',
-            help='Disable blocking kernel flag.',
-        )
-        self._parser.add_argument(
-            '--profile',
-            action='store_true',
-            help='Enable profiling flag.',
-        )
-        self._parser.add_argument(
-            '--timeout',
-            type=int,
-            default=15,
-            help='Timeout in seconds.',
-        )
-        self._parser.add_argument(
-            '--min_samples',
-            type=int,
-            default=None,
-            help='Minimum number of samples.',
-        )
-        self._parser.add_argument(
-            '--stopping_criterion',
-            type=str,
-            default='stdrel',
-            choices=['stdrel', 'entropy'],
-            help='Stopping criterion.',
-        )
-        self._parser.add_argument(
-            '--min_time',
-            type=float,
-            default=None,
-            help='Minimum time for stdrel stopping criterion.',
-        )
-        self._parser.add_argument(
-            '--max_noise',
-            type=float,
-            default=None,
-            help='Maximum noise for stdrel stopping criterion.',
-        )
-        self._parser.add_argument(
-            '--max_angle',
-            type=float,
-            default=None,
-            help='Maximum angle for entropy stopping criterion.',
-        )
-        self._parser.add_argument(
-            '--min_r2',
-            type=float,
-            default=None,
-            help='Minimum R-squared for entropy stopping criterion.',
+                 '"50" (single value), "[25,50,75]" (list), "[25:75]" (range), "[0:50:10]" (range with step).',
         )
 
     def _parse_duration_format(self, duration_str):
@@ -152,51 +71,14 @@ def _preprocess(self):
         if not super()._preprocess():
             return False
 
-        command = os.path.join(self._args.bin_dir, self._bin_name)
-        parts = [command]
-
-        # Basic configuration
-        if self._args.devices is not None:
-            if self._args.devices == 'all':
-                parts.extend(['--devices', 'all'])
-            else:
-                parts.extend(['--devices', self._args.devices])
-
-        # Duration axis - parse the format properly
+        # Build base command with common nvbench arguments
+        parts = self._build_base_command()
+        
+        # Add sleep-kernel specific arguments
         duration_formatted = self._parse_duration_format(self._args.duration_us)
         parts.extend(['--axis', f'"Duration (us)={duration_formatted}"'])
 
-        # Performance configuration
-        if self._args.skip_time >= 0:
-            parts.extend(['--skip-time', str(self._args.skip_time)])
-        parts.extend(['--throttle-threshold', str(self._args.throttle_threshold)])
-        parts.extend(['--throttle-recovery-delay', str(self._args.throttle_recovery_delay)])
-        if self._args.run_once:
-            parts.append('--run-once')
-        if self._args.disable_blocking_kernel:
-            parts.append('--disable-blocking-kernel')
-        if self._args.profile:
-            parts.append('--profile')
-
-        # Stopping criteria
-        if self._args.timeout is not None:
-            parts.extend(['--timeout', str(self._args.timeout)])
-        if self._args.min_samples is not None:
-            parts.extend(['--min-samples', str(self._args.min_samples)])
-        if self._args.stopping_criterion:
-            parts.extend(['--stopping-criterion', self._args.stopping_criterion])
-            if self._args.stopping_criterion == 'stdrel':
-                if self._args.min_time is not None:
-                    parts.extend(['--min-time', str(self._args.min_time)])
-                if self._args.max_noise is not None:
-                    parts.extend(['--max-noise', str(self._args.max_noise)])
-            elif self._args.stopping_criterion == 'entropy':
-                if self._args.max_angle is not None:
-                    parts.extend(['--max-angle', str(self._args.max_angle)])
-                if self._args.min_r2 is not None:
-                    parts.extend(['--min-r2', str(self._args.min_r2)])
-
-        # finalize command
+        # Finalize command
         self._commands = [' '.join(parts)]
         return True
 
@@ -236,13 +118,13 @@ def _process_raw_result(self, cmd_idx, raw_output):
                 if r and current:
                     logger.debug(f"Matched row: {r.groups()}")
                     duration_us, samples, cpu_time, cpu_noise, gpu_time, gpu_noise, batch_samples, batch_gpu = r.groups()
-                    self._result.add_result(f'{current}_duration_us_{duration_us}_samples', int(samples))
+                    # self._result.add_result(f'{current}_duration_us_{duration_us}_samples', int(samples))
                     self._result.add_result(f'{current}_duration_us_{duration_us}_cpu_time', self._parse_time_value(cpu_time))
-                    self._result.add_result(f'{current}_duration_us_{duration_us}_cpu_noise', self._parse_percentage(cpu_noise))
+                    # self._result.add_result(f'{current}_duration_us_{duration_us}_cpu_noise', self._parse_percentage(cpu_noise))
                     self._result.add_result(f'{current}_duration_us_{duration_us}_gpu_time', self._parse_time_value(gpu_time))
-                    self._result.add_result(f'{current}_duration_us_{duration_us}_gpu_noise', self._parse_percentage(gpu_noise))
-                    self._result.add_result(f'{current}_duration_us_{duration_us}_batch_samples', int(batch_samples))
-                    self._result.add_result(f'{current}_duration_us_{duration_us}_batch_gpu', self._parse_time_value(batch_gpu))
+                    # self._result.add_result(f'{current}_duration_us_{duration_us}_gpu_noise', self._parse_percentage(gpu_noise))
+                    # self._result.add_result(f'{current}_duration_us_{duration_us}_batch_samples', int(batch_samples))
+                    self._result.add_result(f'{current}_duration_us_{duration_us}_batch_gpu_time', self._parse_time_value(batch_gpu))
                     parsed_any = True
             if not parsed_any:
                 raise RuntimeError("No valid rows parsed")
@@ -252,38 +134,7 @@ def _process_raw_result(self, cmd_idx, raw_output):
             return False
         return True
 
-    def _parse_time_value(self, time_str):
-        """Parse time string to microseconds.
-
-        Args:
-            time_str (str): Time string like "25.321 us", "1.234 ms", etc.
-
-        Returns:
-            float: Time value in microseconds.
-        """
-        time_str = time_str.strip()
-        if time_str.endswith('us'):
-            return float(time_str[:-2].strip())
-        elif time_str.endswith('ms'):
-            return float(time_str[:-2].strip()) * 1000
-        elif time_str.endswith('ns'):
-            return float(time_str[:-2].strip()) / 1000
-        elif time_str.endswith('s'):
-            return float(time_str[:-1].strip()) * 1000000
-        else:
-            # Assume microseconds if no unit
-            return float(time_str)
 
-    def _parse_percentage(self, percent_str):
-        """Parse percentage string to float.
-
-        Args:
-            percent_str (str): Percentage string like "0.93%".
-
-        Returns:
-            float: Percentage value as float.
-        """
-        return float(percent_str[:-1].strip())
 
 
 BenchmarkRegistry.register_benchmark('nvbench-sleep-kernel', NvbenchSleepKernel, platform=Platform.CUDA)
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py b/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
index 7c131a6c8..63f2738d0 100644
--- a/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
@@ -74,12 +74,12 @@ def test_nvbench_kernel_launch_result_parsing_real_output(self, results):
         assert benchmark.return_code == ReturnCode.SUCCESS
 
         # Validate parsed results
-        assert benchmark.result['gpu_0_samples'][0] == 120000
+        # assert benchmark.result['gpu_0_samples'][0] == 120000
         assert benchmark.result['gpu_0_cpu_time'][0] == 24.222
-        assert benchmark.result['gpu_0_cpu_noise'][0] == 30.44
+        # assert benchmark.result['gpu_0_cpu_noise'][0] == 30.44
         assert benchmark.result['gpu_0_gpu_time'][0] == 7.808
-        assert benchmark.result['gpu_0_gpu_noise'][0] == 14.42
-        assert benchmark.result['gpu_0_batch_samples'][0] == 300000
+        # assert benchmark.result['gpu_0_gpu_noise'][0] == 14.42
+        # assert benchmark.result['gpu_0_batch_samples'][0] == 300000
         assert benchmark.result['gpu_0_batch_gpu_time'][0] == 6.024
 
     def test_nvbench_kernel_launch_process_raw_result_invalid_output(self):
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py b/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
index 83f4702d5..4eeb366b3 100644
--- a/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
@@ -76,16 +76,16 @@ def test_nvbench_sleep_kernel_result_parsing_real_output(self, results):
         assert benchmark.return_code == ReturnCode.SUCCESS
 
         # Validate parsed results
-        assert benchmark.result['gpu_0_duration_us_25_samples'][0] == 10175
+        # assert benchmark.result['gpu_0_duration_us_25_samples'][0] == 10175
         assert benchmark.result['gpu_0_duration_us_25_cpu_time'][0] == 42.123
-        assert benchmark.result['gpu_0_duration_us_25_cpu_noise'][0] == 69.78
+        # assert benchmark.result['gpu_0_duration_us_25_cpu_noise'][0] == 69.78
         assert benchmark.result['gpu_0_duration_us_25_gpu_time'][0] == 25.321
-        assert benchmark.result['gpu_0_duration_us_25_gpu_noise'][0] == 0.93
-        assert benchmark.result['gpu_0_duration_us_25_batch_samples'][0] == 17448
-        assert benchmark.result['gpu_0_duration_us_25_batch_gpu'][0] == 23.456
+        # assert benchmark.result['gpu_0_duration_us_25_gpu_noise'][0] == 0.93
+        # assert benchmark.result['gpu_0_duration_us_25_batch_samples'][0] == 17448
+        assert benchmark.result['gpu_0_duration_us_25_batch_gpu_time'][0] == 23.456
 
-        assert benchmark.result['gpu_0_duration_us_50_samples'][0] == 8187
-        assert benchmark.result['gpu_0_duration_us_75_samples'][0] == 6279
+        # assert benchmark.result['gpu_0_duration_us_50_samples'][0] == 8187
+        # assert benchmark.result['gpu_0_duration_us_75_samples'][0] == 6279
 
     def test_nvbench_sleep_kernel_parse_duration_formats(self):
         """Test NVBench Sleep Kernel duration format parsing."""

From 1fb7c05c6744d428c998e095deaccc03907f5903 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Thu, 9 Oct 2025 21:25:21 +0000
Subject: [PATCH 09/37] refine logic - remove gpu_id

---
 .../micro_benchmarks/nvbench_base.py          |  5 +-
 .../micro_benchmarks/nvbench_kernel_launch.py | 15 ++--
 .../micro_benchmarks/nvbench_sleep_kernel.py  | 46 +++---------
 .../test_nvbench_kernel_launch.py             | 16 ++--
 .../test_nvbench_sleep_kernel.py              | 73 ++++---------------
 5 files changed, 43 insertions(+), 112 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_base.py b/superbench/benchmarks/micro_benchmarks/nvbench_base.py
index 3dba511b4..916408304 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_base.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_base.py
@@ -147,8 +147,9 @@ def _build_base_command(self):
         command = os.path.join(self._args.bin_dir, self._bin_name)
         parts = [command]
 
-        # Device configuration
-        if self._args.devices is not None:
+        # Device configuration - in distributed mode, let SuperBench handle device assignment
+        # Only add --devices if explicitly specified
+        if hasattr(self._args, 'devices') and self._args.devices is not None:
             if self._args.devices == 'all':
                 parts.extend(['--devices', 'all'])
             else:
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py b/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
index 3ff8fdd07..1544586fa 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
@@ -52,13 +52,13 @@ def _process_raw_result(self, cmd_idx, raw_output):
                     
                 r = re.match(row_pat, line)
                 if r and current:
-                    # self._result.add_result(f"{current}_samples", int(r.group(1)))
-                    self._result.add_result(f"{current}_cpu_time", self._parse_time_value(r.group(2)))
-                    # self._result.add_result(f"{current}_cpu_noise", float(r.group(3)[:-1]))
-                    self._result.add_result(f"{current}_gpu_time", self._parse_time_value(r.group(4)))
-                    # self._result.add_result(f"{current}_gpu_noise", float(r.group(5)[:-1]))
-                    # self._result.add_result(f"{current}_batch_samples", int(r.group(6)))
-                    self._result.add_result(f"{current}_batch_gpu_time", self._parse_time_value(r.group(7)))
+                    # self._result.add_result("samples", int(r.group(1)))
+                    self._result.add_result("cpu_time", self._parse_time_value(r.group(2)))
+                    # self._result.add_result("cpu_noise", self._parse_percentage(r.group(3)))
+                    self._result.add_result("gpu_time", self._parse_time_value(r.group(4)))
+                    # self._result.add_result("gpu_noise", self._parse_percentage(r.group(5)))
+                    # self._result.add_result("batch_samples", int(r.group(6)))
+                    self._result.add_result("batch_gpu_time", self._parse_time_value(r.group(7)))
                     parsed_any = True
                     
             if not parsed_any:
@@ -71,5 +71,4 @@ def _process_raw_result(self, cmd_idx, raw_output):
             
         return True
 
-
 BenchmarkRegistry.register_benchmark('nvbench-kernel-launch', NvbenchKernelLaunch, platform=Platform.CUDA)
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
index bc154d8b1..75b54ab66 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
@@ -38,30 +38,6 @@ def add_parser_arguments(self):
                  '"50" (single value), "[25,50,75]" (list), "[25:75]" (range), "[0:50:10]" (range with step).',
         )
 
-    def _parse_duration_format(self, duration_str):
-        """Parse duration parameter to proper axis format.
-        
-        Args:
-            duration_str (str): Duration specification as string
-            
-        Returns:
-            str: Properly formatted duration string for --axis parameter
-        """
-        if not isinstance(duration_str, str):
-            return str(duration_str)
-            
-        # String format - could be various formats
-        duration_str = duration_str.strip()
-        
-        # Remove outer quotes if present
-        if duration_str.startswith('"') and duration_str.endswith('"'):
-            duration_str = duration_str[1:-1]
-        elif duration_str.startswith("'") and duration_str.endswith("'"):
-            duration_str = duration_str[1:-1]
-        
-        # Return as-is - should already be in correct format
-        return duration_str
-
     def _preprocess(self):
         """Preprocess/preparation operations before the benchmarking.
 
@@ -75,8 +51,7 @@ def _preprocess(self):
         parts = self._build_base_command()
         
         # Add sleep-kernel specific arguments
-        duration_formatted = self._parse_duration_format(self._args.duration_us)
-        parts.extend(['--axis', f'"Duration (us)={duration_formatted}"'])
+        parts.extend(['--axis', f'"Duration (us)={self._args.duration_us.strip()}"'])
 
         # Finalize command
         self._commands = [' '.join(parts)]
@@ -118,13 +93,13 @@ def _process_raw_result(self, cmd_idx, raw_output):
                 if r and current:
                     logger.debug(f"Matched row: {r.groups()}")
                     duration_us, samples, cpu_time, cpu_noise, gpu_time, gpu_noise, batch_samples, batch_gpu = r.groups()
-                    # self._result.add_result(f'{current}_duration_us_{duration_us}_samples', int(samples))
-                    self._result.add_result(f'{current}_duration_us_{duration_us}_cpu_time', self._parse_time_value(cpu_time))
-                    # self._result.add_result(f'{current}_duration_us_{duration_us}_cpu_noise', self._parse_percentage(cpu_noise))
-                    self._result.add_result(f'{current}_duration_us_{duration_us}_gpu_time', self._parse_time_value(gpu_time))
-                    # self._result.add_result(f'{current}_duration_us_{duration_us}_gpu_noise', self._parse_percentage(gpu_noise))
-                    # self._result.add_result(f'{current}_duration_us_{duration_us}_batch_samples', int(batch_samples))
-                    self._result.add_result(f'{current}_duration_us_{duration_us}_batch_gpu_time', self._parse_time_value(batch_gpu))
+                    # self._result.add_result(f'duration_us_{duration_us}_samples', int(samples))
+                    self._result.add_result(f'duration_us_{duration_us}_cpu_time', self._parse_time_value(cpu_time))
+                    # self._result.add_result(f'duration_us_{duration_us}_cpu_noise', self._parse_percentage(cpu_noise))
+                    self._result.add_result(f'duration_us_{duration_us}_gpu_time', self._parse_time_value(gpu_time))
+                    # self._result.add_result(f'duration_us_{duration_us}_gpu_noise', self._parse_percentage(gpu_noise))
+                    # self._result.add_result(f'duration_us_{duration_us}_batch_samples', int(batch_samples))
+                    self._result.add_result(f'duration_us_{duration_us}_batch_gpu_time', self._parse_time_value(batch_gpu))
                     parsed_any = True
             if not parsed_any:
                 raise RuntimeError("No valid rows parsed")
@@ -134,7 +109,4 @@ def _process_raw_result(self, cmd_idx, raw_output):
             return False
         return True
 
-
-
-
-BenchmarkRegistry.register_benchmark('nvbench-sleep-kernel', NvbenchSleepKernel, platform=Platform.CUDA)
+BenchmarkRegistry.register_benchmark('nvbench-sleep-kernel', NvbenchSleepKernel, platform=Platform.CUDA)
\ No newline at end of file
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py b/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
index 63f2738d0..2923c7d3b 100644
--- a/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
@@ -33,7 +33,7 @@ def test_nvbench_kernel_launch_preprocess(self):
 
         # Test preprocess with specified parameters
         parameters = (
-            '--device 0 '
+            '--devices 0 '
             '--timeout 20 '
             '--min-samples 300 '
             '--stopping-criterion stdrel '
@@ -74,13 +74,13 @@ def test_nvbench_kernel_launch_result_parsing_real_output(self, results):
         assert benchmark.return_code == ReturnCode.SUCCESS
 
         # Validate parsed results
-        # assert benchmark.result['gpu_0_samples'][0] == 120000
-        assert benchmark.result['gpu_0_cpu_time'][0] == 24.222
-        # assert benchmark.result['gpu_0_cpu_noise'][0] == 30.44
-        assert benchmark.result['gpu_0_gpu_time'][0] == 7.808
-        # assert benchmark.result['gpu_0_gpu_noise'][0] == 14.42
-        # assert benchmark.result['gpu_0_batch_samples'][0] == 300000
-        assert benchmark.result['gpu_0_batch_gpu_time'][0] == 6.024
+        # assert benchmark.result['samples'][0] == 120000
+        assert benchmark.result['cpu_time'][0] == 24.222
+        # assert benchmark.result['cpu_noise'][0] == 30.44
+        assert benchmark.result['gpu_time'][0] == 7.808
+        # assert benchmark.result['gpu_noise'][0] == 14.42
+        # assert benchmark.result['batch_samples'][0] == 300000
+        assert benchmark.result['batch_gpu_time'][0] == 6.024
 
     def test_nvbench_kernel_launch_process_raw_result_invalid_output(self):
         """Test NVBench Kernel Launch benchmark result parsing with invalid output."""
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py b/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
index 4eeb366b3..6d020814a 100644
--- a/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
@@ -36,12 +36,12 @@ def test_nvbench_sleep_kernel_preprocess(self):
             '--devices 0 '
             '--duration_us "[10,25,50,75]" '
             '--timeout 20 '
-            '--min_samples 300 '
-            '--stopping_criterion stdrel '
-            '--min_time 2.0 '
-            '--max_noise 0.5 '
-            '--throttle_threshold 80.0 '
-            '--throttle_recovery_delay 1.0'
+            '--min-samples 300 '
+            '--stopping-criterion stdrel '
+            '--min-time 2.0 '
+            '--max-noise 0.5 '
+            '--throttle-threshold 80.0 '
+            '--throttle-recovery-delay 1.0'
         )
         benchmark = benchmark_class(benchmark_name, parameters=parameters)
         assert benchmark._preprocess()
@@ -76,57 +76,16 @@ def test_nvbench_sleep_kernel_result_parsing_real_output(self, results):
         assert benchmark.return_code == ReturnCode.SUCCESS
 
         # Validate parsed results
-        # assert benchmark.result['gpu_0_duration_us_25_samples'][0] == 10175
-        assert benchmark.result['gpu_0_duration_us_25_cpu_time'][0] == 42.123
-        # assert benchmark.result['gpu_0_duration_us_25_cpu_noise'][0] == 69.78
-        assert benchmark.result['gpu_0_duration_us_25_gpu_time'][0] == 25.321
-        # assert benchmark.result['gpu_0_duration_us_25_gpu_noise'][0] == 0.93
-        # assert benchmark.result['gpu_0_duration_us_25_batch_samples'][0] == 17448
-        assert benchmark.result['gpu_0_duration_us_25_batch_gpu_time'][0] == 23.456
-
-        # assert benchmark.result['gpu_0_duration_us_50_samples'][0] == 8187
-        # assert benchmark.result['gpu_0_duration_us_75_samples'][0] == 6279
-
-    def test_nvbench_sleep_kernel_parse_duration_formats(self):
-        """Test NVBench Sleep Kernel duration format parsing."""
-        benchmark_name = 'nvbench-sleep-kernel'
-        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
-        assert (benchmark_class)
-
-        benchmark = benchmark_class(benchmark_name, parameters='')
-
-        # Test single value formats
-        assert benchmark._parse_duration_format("50") == "50"
-        assert benchmark._parse_duration_format("100") == "100"
-        
-        # Test quoted single values
-        assert benchmark._parse_duration_format('"50"') == "50"
-        assert benchmark._parse_duration_format("'100'") == "100"
-        
-        # Test list formats
-        assert benchmark._parse_duration_format("[25,50,75]") == "[25,50,75]"
-        assert benchmark._parse_duration_format("[10,20,30,40]") == "[10,20,30,40]"
-        
-        # Test quoted list formats
-        assert benchmark._parse_duration_format('"[25,50,75]"') == "[25,50,75]"
-        assert benchmark._parse_duration_format("'[10,20,30]'") == "[10,20,30]"
-        
-        # Test range formats
-        assert benchmark._parse_duration_format("[25:75]") == "[25:75]"
-        assert benchmark._parse_duration_format("[0:100]") == "[0:100]"
-        
-        # Test range with step formats
-        assert benchmark._parse_duration_format("[0:50:10]") == "[0:50:10]"
-        assert benchmark._parse_duration_format("[10:100:20]") == "[10:100:20]"
-        
-        # Test quoted range formats
-        assert benchmark._parse_duration_format('"[25:75]"') == "[25:75]"
-        assert benchmark._parse_duration_format("'[0:50:10]'") == "[0:50:10]"
-        
-        # Test with whitespace
-        assert benchmark._parse_duration_format(" 50 ") == "50"
-        assert benchmark._parse_duration_format(" [25,50,75] ") == "[25,50,75]"
-        assert benchmark._parse_duration_format(" [25:75] ") == "[25:75]"
+        # assert benchmark.result['duration_us_25_samples'][0] == 10175
+        assert benchmark.result['duration_us_25_cpu_time'][0] == 42.123
+        # assert benchmark.result['duration_us_25_cpu_noise'][0] == 69.78
+        assert benchmark.result['duration_us_25_gpu_time'][0] == 25.321
+        # assert benchmark.result['duration_us_25_gpu_noise'][0] == 0.93
+        # assert benchmark.result['duration_us_25_batch_samples'][0] == 17448
+        assert benchmark.result['duration_us_25_batch_gpu_time'][0] == 23.456
+
+        # assert benchmark.result['duration_us_50_samples'][0] == 8187
+        # assert benchmark.result['duration_us_75_samples'][0] == 6279
 
     def test_nvbench_sleep_kernel_preprocess_duration_formats(self):
         """Test NVBench Sleep Kernel preprocess with different duration formats."""

From 83c442c7f77151558cf54198a93ee40369cee283 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Thu, 9 Oct 2025 21:32:57 +0000
Subject: [PATCH 10/37] add doc

---
 .../benchmarks/micro-benchmarks.md            | 45 +++++++++++++++++++
 third_party/Makefile                          |  3 +-
 2 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/docs/user-tutorial/benchmarks/micro-benchmarks.md b/docs/user-tutorial/benchmarks/micro-benchmarks.md
index aa3aa965b..a5bc2fa5c 100644
--- a/docs/user-tutorial/benchmarks/micro-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md
@@ -172,6 +172,51 @@ Supports the use of double unit types and the use of tensor cores.
 | gpu-burn/gpu_[0-9]_pass | yes/no   | The result of the gpu-burn test for each GPU (1: yes, 0: no).                      |
 | gpu-burn/abort          | yes/no   | Whether or not GPU-burn test aborted before returning GPU results (1: yes, 0: no). |
 
+### `nvbench-sleep-kernel`
+
+#### Introduction
+
+Measure GPU kernel execution time using NVBench's sleep kernel benchmark. This benchmark creates CUDA kernels that sleep for specified durations (in microseconds) and measures the actual execution time, providing insights into GPU scheduling overhead and timing accuracy.
+
+The benchmark supports multiple duration specification formats:
+- Single value: `"50"` - Test single duration of 50μs
+- List format: `"[25,50,75]"` - Test multiple specific durations
+- Range format: `"[25:75]"` - Test all values from 25μs to 75μs
+- Range with step: `"[0:50:10]"` - Test from 0μs to 50μs in steps of 10μs
+
+Performed by [NVBench](https://github.com/NVIDIA/nvbench) sleep kernel benchmark.
+
+#### Metrics
+
+| Name                                                | Unit      | Description                                               |
+|-----------------------------------------------------|-----------|-----------------------------------------------------------|
+| nvbench-sleep-kernel/duration_us_{X}_cpu_time       | time (μs) | CPU-measured time for a sleep duration of X microseconds. |
+| nvbench-sleep-kernel/duration_us_{X}_gpu_time       | time (μs) | GPU-measured time for a sleep duration of X microseconds. |
+| nvbench-sleep-kernel/duration_us_{X}_batch_gpu_time | time (μs) | Batch GPU time for a sleep duration of X microseconds.    |
+
+Where `{X}` is the sleep duration in microseconds (e.g., 25, 50, 75).
+
+### `nvbench-kernel-launch`
+
+#### Introduction
+
+Measure GPU kernel launch overhead and execution time using NVBench's kernel launch benchmark. This benchmark evaluates the time required to launch kernels on the GPU and measures both CPU-side and GPU-side timing for kernel execution.
+
+The benchmark provides insights into:
+- Kernel launch latency
+- CPU/GPU synchronization overhead
+- Batch execution performance
+
+Performed by [NVBench](https://github.com/NVIDIA/nvbench) kernel launch benchmark.
+
+#### Metrics
+
+| Name                                | Unit      | Description                                    |
+|-------------------------------------|-----------|------------------------------------------------|
+| nvbench-kernel-launch/cpu_time      | time (μs) | CPU-measured kernel execution time.            |
+| nvbench-kernel-launch/gpu_time      | time (μs) | GPU-measured kernel execution time.            |
+| nvbench-kernel-launch/batch_gpu_time | time (μs) | GPU batch execution time.                     |
+
 ### `cpu-hpl`
 
 #### Introduction
diff --git a/third_party/Makefile b/third_party/Makefile
index 508faa88d..2300cd2ac 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -22,11 +22,10 @@ NUM_MAKE_JOBS ?= $(shell nproc --ignore=2)
 all: cuda rocm
 # msccl: api change in cudaStreamUpdateCaptureDependencies
 cuda_with_msccl: cuda cuda_msccl
-cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed nvbandwidth
+cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed nvbandwidth nvbench
 rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm rocm_megatron_lm
 cpu: common cpu_perftest
 common: fio cpu_stream
-nvbench_only: nvbench
 
 # non aarch64 specific targets
 ifneq ($(shell uname -m), aarch64)

From 4b274c417c5b00c7c384c1ca9e8d659c94e0e233 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Thu, 9 Oct 2025 23:02:02 +0000
Subject: [PATCH 11/37] refine regex & update nvbench submodule

---
 .../micro_benchmarks/nvbench_kernel_launch.py | 25 ++++++++++++-------
 .../micro_benchmarks/nvbench_sleep_kernel.py  | 13 +++++++---
 third_party/nvbench                           |  2 +-
 3 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py b/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
index 1544586fa..1a424589f 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
@@ -36,9 +36,15 @@ def _process_raw_result(self, cmd_idx, raw_output):
         
         try:
             gpu_section = r"### \[(\d+)\] NVIDIA"
+            # Regex pattern to handle different time units and flexible spacing
             row_pat = (
-                r"\| (\d+)x \| ([\d.]+ ?[mun]?s) \| ([\d.]+%) \| "
-                r"([\d.]+ ?[mun]?s) \| ([\d.]+%) \| (\d+)x \| *([\d.]+ ?[mun]?s) \|"
+                r"\|\s*([0-9]+)x\s*\|\s*"                            # Samples
+                r"([\d.]+\s*[μmun]?s)\s*\|\s*"                       # CPU Time (μs, ns, ms, us, s)
+                r"([\d.]+%)\s*\|\s*"                                 # CPU Noise percentage
+                r"([\d.]+\s*[μmun]?s)\s*\|\s*"                       # GPU Time
+                r"([\d.]+%)\s*\|\s*"                                 # GPU Noise percentage
+                r"([0-9]+)x\s*\|\s*"                                 # Batch Samples
+                r"([\d.]+\s*[μmun]?s)\s*\|"                          # Batch GPU Time
             )
             current = None
             parsed_any = False  # Track if any valid rows are parsed
@@ -52,13 +58,14 @@ def _process_raw_result(self, cmd_idx, raw_output):
                     
                 r = re.match(row_pat, line)
                 if r and current:
-                    # self._result.add_result("samples", int(r.group(1)))
-                    self._result.add_result("cpu_time", self._parse_time_value(r.group(2)))
-                    # self._result.add_result("cpu_noise", self._parse_percentage(r.group(3)))
-                    self._result.add_result("gpu_time", self._parse_time_value(r.group(4)))
-                    # self._result.add_result("gpu_noise", self._parse_percentage(r.group(5)))
-                    # self._result.add_result("batch_samples", int(r.group(6)))
-                    self._result.add_result("batch_gpu_time", self._parse_time_value(r.group(7)))
+                    samples, cpu_time, cpu_noise, gpu_time, gpu_noise, batch_samples, batch_gpu = r.groups()
+                    # self._result.add_result("samples", int(samples.replace('x', '')))
+                    self._result.add_result("cpu_time", self._parse_time_value(cpu_time))
+                    # self._result.add_result("cpu_noise", self._parse_percentage(cpu_noise))
+                    self._result.add_result("gpu_time", self._parse_time_value(gpu_time))
+                    # self._result.add_result("gpu_noise", self._parse_percentage(gpu_noise))
+                    # self._result.add_result("batch_samples", int(batch_samples.replace('x', '')))
+                    self._result.add_result("batch_gpu_time", self._parse_time_value(batch_gpu))
                     parsed_any = True
                     
             if not parsed_any:
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
index 75b54ab66..03801a666 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
@@ -75,9 +75,16 @@ def _process_raw_result(self, cmd_idx, raw_output):
         self._result.add_raw_data(f'raw_output_{cmd_idx}', raw_output, self._args.log_raw_data)
         try:
             gpu_section = r"### \[(\d+)\] NVIDIA"
+            # Regex pattern to handle different time units and flexible spacing
             row_pat = (
-                r"\|\s*(\d+)\s*\|\s*(\d+)x\s*\|\s*([\d.]+ ?[mun]?s)\s*\|\s*([\d.]+%)\s*\|\s*"
-                r"([\d.]+ ?[mun]?s)\s*\|\s*([\d.]+%)\s*\|\s*(\d+)x\s*\|\s*([\d.]+ ?[mun]?s)\s*\|"
+                r"\|\s*([0-9]+)\s*\|\s*"                             # Duration (us)
+                r"([0-9]+)x\s*\|\s*"                                 # Samples
+                r"([\d.]+\s*[μmun]?s)\s*\|\s*"                       # CPU Time (μs, ns, ms, us, s)
+                r"([\d.]+%)\s*\|\s*"                                 # CPU Noise percentage
+                r"([\d.]+\s*[μmun]?s)\s*\|\s*"                       # GPU Time
+                r"([\d.]+%)\s*\|\s*"                                 # GPU Noise percentage
+                r"([0-9]+)x\s*\|\s*"                                 # Batch Samples
+                r"([\d.]+\s*[μmun]?s)\s*\|"                          # Batch GPU Time
             )
             current = None
             parsed_any = False
@@ -98,7 +105,7 @@ def _process_raw_result(self, cmd_idx, raw_output):
                     # self._result.add_result(f'duration_us_{duration_us}_cpu_noise', self._parse_percentage(cpu_noise))
                     self._result.add_result(f'duration_us_{duration_us}_gpu_time', self._parse_time_value(gpu_time))
                     # self._result.add_result(f'duration_us_{duration_us}_gpu_noise', self._parse_percentage(gpu_noise))
-                    # self._result.add_result(f'duration_us_{duration_us}_batch_samples', int(batch_samples))
+                    # self._result.add_result(f'duration_us_{duration_us}_batch_samples', int(batch_samples.replace('x', '')))
                     self._result.add_result(f'duration_us_{duration_us}_batch_gpu_time', self._parse_time_value(batch_gpu))
                     parsed_any = True
             if not parsed_any:
diff --git a/third_party/nvbench b/third_party/nvbench
index 3de9dc95d..7feda2cf3 160000
--- a/third_party/nvbench
+++ b/third_party/nvbench
@@ -1 +1 @@
-Subproject commit 3de9dc95da3fc859900a799918a0c1d9091920ff
+Subproject commit 7feda2cf3ade88b3e73a0e0414ba543a4fbfbc43

From 0cf48bb57df6beff272f8e14fe40243770136de9 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Fri, 10 Oct 2025 16:48:58 +0000
Subject: [PATCH 12/37] update cmake

---
 dockerfile/cuda12.8.dockerfile | 9 ++++++++-
 dockerfile/cuda12.9.dockerfile | 9 ++++++++-
 dockerfile/cuda13.0.dockerfile | 9 ++++++++-
 third_party/Makefile           | 4 ++--
 4 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/dockerfile/cuda12.8.dockerfile b/dockerfile/cuda12.8.dockerfile
index 47a35feba..b3bea186f 100644
--- a/dockerfile/cuda12.8.dockerfile
+++ b/dockerfile/cuda12.8.dockerfile
@@ -61,6 +61,13 @@ RUN apt-get update && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/* /tmp/*
 
+# Install CMake 3.30.4 for nvbench compatibility
+RUN cd /tmp && \
+    wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-x86_64.tar.gz && \
+    tar -xzf cmake-3.30.4-linux-x86_64.tar.gz && \
+    cp -r cmake-3.30.4-linux-x86_64/* /usr/local/ && \
+    rm -rf cmake-3.30.4-linux-x86_64*
+
 ARG NUM_MAKE_JOBS=
 ARG TARGETPLATFORM
 ARG TARGETARCH
@@ -161,7 +168,7 @@ ADD dockerfile/etc /opt/microsoft/
 WORKDIR ${SB_HOME}
 
 ADD third_party third_party
-RUN make -C third_party cuda_with_msccl
+RUN make -C third_party cuda_with_msccl cuda_nvbench
 
 ADD . .
 RUN python3 -m pip install --upgrade setuptools==70.3.0 && \
diff --git a/dockerfile/cuda12.9.dockerfile b/dockerfile/cuda12.9.dockerfile
index d823b1746..34d7a9900 100644
--- a/dockerfile/cuda12.9.dockerfile
+++ b/dockerfile/cuda12.9.dockerfile
@@ -62,6 +62,13 @@ RUN apt-get update && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/* /tmp/*
 
+# Install CMake 3.30.4 for nvbench compatibility
+RUN cd /tmp && \
+    wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-x86_64.tar.gz && \
+    tar -xzf cmake-3.30.4-linux-x86_64.tar.gz && \
+    cp -r cmake-3.30.4-linux-x86_64/* /usr/local/ && \
+    rm -rf cmake-3.30.4-linux-x86_64*
+
 ARG NUM_MAKE_JOBS=
 ARG TARGETPLATFORM
 ARG TARGETARCH
@@ -162,7 +169,7 @@ ADD dockerfile/etc /opt/microsoft/
 WORKDIR ${SB_HOME}
 
 ADD third_party third_party
-RUN make -C third_party cuda_with_msccl
+RUN make -C third_party cuda_with_msccl cuda_nvbench
 
 ADD . .
 RUN python3 -m pip install --upgrade setuptools==78.1.0 && \
diff --git a/dockerfile/cuda13.0.dockerfile b/dockerfile/cuda13.0.dockerfile
index 5ce97c1c9..d9b54d081 100644
--- a/dockerfile/cuda13.0.dockerfile
+++ b/dockerfile/cuda13.0.dockerfile
@@ -62,6 +62,13 @@ RUN apt-get update && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/* /tmp/*
 
+# Install CMake 3.30.4 for nvbench compatibility
+RUN cd /tmp && \
+    wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-x86_64.tar.gz && \
+    tar -xzf cmake-3.30.4-linux-x86_64.tar.gz && \
+    cp -r cmake-3.30.4-linux-x86_64/* /usr/local/ && \
+    rm -rf cmake-3.30.4-linux-x86_64*
+
 ARG NUM_MAKE_JOBS=
 ARG TARGETPLATFORM
 ARG TARGETARCH
@@ -151,7 +158,7 @@ ADD dockerfile/etc /opt/microsoft/
 WORKDIR ${SB_HOME}
 
 ADD third_party third_party
-RUN make -C third_party cuda
+RUN make -C third_party cuda_with_msccl cuda_nvbench
 
 ADD . .
 RUN python3 -m pip install --upgrade setuptools==78.1.0 && \
diff --git a/third_party/Makefile b/third_party/Makefile
index 2300cd2ac..b25fca042 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -22,7 +22,7 @@ NUM_MAKE_JOBS ?= $(shell nproc --ignore=2)
 all: cuda rocm
 # msccl: api change in cudaStreamUpdateCaptureDependencies
 cuda_with_msccl: cuda cuda_msccl
-cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed nvbandwidth nvbench
+cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed nvbandwidth
 rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm rocm_megatron_lm
 cpu: common cpu_perftest
 common: fio cpu_stream
@@ -313,7 +313,7 @@ nvbandwidth: sb_micro_path
 	cp -v ./nvbandwidth/nvbandwidth $(SB_MICRO_PATH)/bin
 
 # Build nvbench
-nvbench: sb_micro_path
+cuda_nvbench: sb_micro_path
 ifeq ($(shell echo $(CUDA_VER)">=12.9" | bc -l), 1)
 	$(eval ARCHS := "100;103")
 else ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)

From 5905647c49d1482da3e3a62c474ab5a7398e07e5 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Fri, 10 Oct 2025 16:58:01 +0000
Subject: [PATCH 13/37] fix lint

---
 .../benchmarks/micro_benchmarks/__init__.py   | 46 ++++---------------
 .../micro_benchmarks/nvbench/CMakeLists.txt   | 40 ++++++----------
 .../micro_benchmarks/nvbench/kernel_launch.cu |  4 +-
 .../micro_benchmarks/nvbench/sleep_kernel.cu  | 21 ++++-----
 .../micro_benchmarks/nvbench_base.py          |  7 ++-
 .../micro_benchmarks/nvbench_kernel_launch.py | 32 ++++++-------
 .../micro_benchmarks/nvbench_sleep_kernel.py  | 31 +++++++------
 .../test_nvbench_kernel_launch.py             |  3 +-
 .../test_nvbench_sleep_kernel.py              |  9 ++--
 9 files changed, 74 insertions(+), 119 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/__init__.py b/superbench/benchmarks/micro_benchmarks/__init__.py
index ac50ce252..47a786f6d 100644
--- a/superbench/benchmarks/micro_benchmarks/__init__.py
+++ b/superbench/benchmarks/micro_benchmarks/__init__.py
@@ -43,42 +43,12 @@
 from superbench.benchmarks.micro_benchmarks.nvbench_sleep_kernel import NvbenchSleepKernel
 
 __all__ = [
-    'BlasLtBaseBenchmark',
-    'ComputationCommunicationOverlap',
-    'CpuMemBwLatencyBenchmark',
-    'CpuHplBenchmark',
-    'CpuStreamBenchmark',
-    'CublasBenchmark',
-    'CublasLtBenchmark',
-    'CudaGemmFlopsBenchmark',
-    'CudaMemBwBenchmark',
-    'CudaNcclBwBenchmark',
-    'CudnnBenchmark',
-    'DiskBenchmark',
-    'DistInference',
-    'HipBlasLtBenchmark',
-    'GPCNetBenchmark',
-    'GemmFlopsBenchmark',
-    'GpuBurnBenchmark',
-    'GpuCopyBwBenchmark',
-    'GpuStreamBenchmark',
-    'IBBenchmark',
-    'IBLoopbackBenchmark',
-    'KernelLaunch',
-    'MemBwBenchmark',
-    'MicroBenchmark',
-    'MicroBenchmarkWithInvoke',
-    'ORTInferenceBenchmark',
-    'RocmGemmFlopsBenchmark',
-    'RocmMemBwBenchmark',
-    'ShardingMatmul',
-    'TCPConnectivityBenchmark',
-    'TensorRTInferenceBenchmark',
-    'DirectXGPUEncodingLatency',
-    'DirectXGPUCopyBw',
-    'DirectXGPUMemBw',
-    'DirectXGPUCoreFlops',
-    'NvBandwidthBenchmark',
-    'NvbenchKernelLaunch',
-    'NvbenchSleepKernel'
+    'BlasLtBaseBenchmark', 'ComputationCommunicationOverlap', 'CpuMemBwLatencyBenchmark', 'CpuHplBenchmark',
+    'CpuStreamBenchmark', 'CublasBenchmark', 'CublasLtBenchmark', 'CudaGemmFlopsBenchmark', 'CudaMemBwBenchmark',
+    'CudaNcclBwBenchmark', 'CudnnBenchmark', 'DiskBenchmark', 'DistInference', 'HipBlasLtBenchmark', 'GPCNetBenchmark',
+    'GemmFlopsBenchmark', 'GpuBurnBenchmark', 'GpuCopyBwBenchmark', 'GpuStreamBenchmark', 'IBBenchmark',
+    'IBLoopbackBenchmark', 'KernelLaunch', 'MemBwBenchmark', 'MicroBenchmark', 'MicroBenchmarkWithInvoke',
+    'ORTInferenceBenchmark', 'RocmGemmFlopsBenchmark', 'RocmMemBwBenchmark', 'ShardingMatmul',
+    'TCPConnectivityBenchmark', 'TensorRTInferenceBenchmark', 'DirectXGPUEncodingLatency', 'DirectXGPUCopyBw',
+    'DirectXGPUMemBw', 'DirectXGPUCoreFlops', 'NvBandwidthBenchmark', 'NvbenchKernelLaunch', 'NvbenchSleepKernel'
 ]
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
index 448e83800..f6c8508d4 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
@@ -1,30 +1,18 @@
-cmake_minimum_required(VERSION 3.20)
-project(nvbench_benchmarks LANGUAGES CUDA)
+cmake_minimum_required(VERSION 3.20) project(nvbench_benchmarks LANGUAGES CUDA)
 
-find_package(CUDAToolkit QUIET)
-if (CUDAToolkit_FOUND)
-  include(../cuda_common.cmake)
-  find_package(nvbench CONFIG REQUIRED)
+    find_package(CUDAToolkit QUIET) if (CUDAToolkit_FOUND) include(../ cuda_common.cmake)
+        find_package(nvbench CONFIG REQUIRED)
 
-  # list all your CUDA benchmark source files here
-  set(NVBENCH_SOURCES
-    kernel_launch.cu
-    sleep_kernel.cu
-    # add more *.cu as needed
-  )
+#list all your CUDA benchmark source files here
+            set(NVBENCH_SOURCES kernel_launch.cu sleep_kernel.cu
+#add more *.cu as needed
+                )
 
-  foreach(src ${NVBENCH_SOURCES})
-    # strip ".cu" → NAME_WE
-    get_filename_component(basename ${src} NAME_WE)
-    set(target nvbench_${basename})
+                foreach (src ${NVBENCH_SOURCES})
+#strip ".cu" → NAME_WE
+                    get_filename_component(basename ${src} NAME_WE) set(target nvbench_${basename})
 
-    add_executable(${target} ${src})
-    target_compile_features(${target} PUBLIC cuda_std_17)
-    target_link_libraries(${target}
-      PRIVATE nvbench::nvbench nvbench::main
-    )
-    install(TARGETS ${target} RUNTIME DESTINATION bin)
-  endforeach()
-else()
-  message(STATUS "CUDA not found, skipping nvbench benchmarks.")
-endif()
\ No newline at end of file
+                        add_executable(${target} ${src}) target_compile_features(${target} PUBLIC cuda_std_17)
+                            target_link_libraries(${target} PRIVATE nvbench::nvbench nvbench::main)
+                                install(TARGETS ${target} RUNTIME DESTINATION bin) endforeach() else()
+                                    message(STATUS "CUDA not found, skipping nvbench benchmarks.") endif()
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/kernel_launch.cu b/superbench/benchmarks/micro_benchmarks/nvbench/kernel_launch.cu
index f703c587a..08dc40294 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench/kernel_launch.cu
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/kernel_launch.cu
@@ -3,9 +3,7 @@
 __global__ void empty_kernel() {}
 
 void kernel_launch(nvbench::state &state) {
-  state.exec([](nvbench::launch &launch) {
-    empty_kernel<<<1, 1, 0, launch.get_stream()>>>();
-  });
+    state.exec([](nvbench::launch &launch) { empty_kernel<<<1, 1, 0, launch.get_stream()>>>(); });
 }
 
 NVBENCH_BENCH(kernel_launch);
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/sleep_kernel.cu b/superbench/benchmarks/micro_benchmarks/nvbench/sleep_kernel.cu
index d486d8ee7..b4789377e 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench/sleep_kernel.cu
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/sleep_kernel.cu
@@ -1,22 +1,21 @@
-#include <nvbench/nvbench.cuh>
 #include <cuda/std/chrono>
 #include <cuda_runtime.h>
+#include <nvbench/nvbench.cuh>
 
 __global__ void sleep_kernel(nvbench::int64_t microseconds) {
-  const auto start = cuda::std::chrono::high_resolution_clock::now();
-  const auto target_duration = cuda::std::chrono::microseconds(microseconds);
-  const auto finish = start + target_duration;
+    const auto start = cuda::std::chrono::high_resolution_clock::now();
+    const auto target_duration = cuda::std::chrono::microseconds(microseconds);
+    const auto finish = start + target_duration;
 
-  while (cuda::std::chrono::high_resolution_clock::now() < finish) {
-    // busy wait
-  }
+    while (cuda::std::chrono::high_resolution_clock::now() < finish) {
+        // busy wait
+    }
 }
 
 void sleep_benchmark(nvbench::state &state) {
-  const auto duration_us = state.get_int64("Duration (us)");
-  state.exec([&duration_us](nvbench::launch &launch) {
-    sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(duration_us);
-  });
+    const auto duration_us = state.get_int64("Duration (us)");
+    state.exec(
+        [&duration_us](nvbench::launch &launch) { sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(duration_us); });
 }
 NVBENCH_BENCH(sleep_benchmark)
     .add_int64_axis("Duration (us)", nvbench::range(0, 100, 5))
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_base.py b/superbench/benchmarks/micro_benchmarks/nvbench_base.py
index 916408304..8f472444f 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_base.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_base.py
@@ -29,7 +29,6 @@ def parse_time_to_us(raw: str) -> float:
 
 class NvbenchBase(MicroBenchmarkWithInvoke):
     """Base class for NVBench benchmarks with common functionality."""
-
     def __init__(self, name, parameters=''):
         """Constructor.
 
@@ -143,7 +142,7 @@ def _build_base_command(self):
         """
         if not self._bin_name:
             raise ValueError("Subclass must set _bin_name")
-            
+
         command = os.path.join(self._args.bin_dir, self._bin_name)
         parts = [command]
 
@@ -200,7 +199,7 @@ def _preprocess(self):
 
         # Build base command - subclasses can override this method to add specific arguments
         parts = self._build_base_command()
-        
+
         # Finalize command
         self._commands = [' '.join(parts)]
         return True
@@ -239,4 +238,4 @@ def _handle_parsing_error(self, error_msg, raw_output):
         self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
         logger.error(
             f"Invalid result format - round:{self._curr_run_index}, bench:{self._name}, msg:{error_msg}\n{raw_output}"
-        )
\ No newline at end of file
+        )
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py b/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
index 1a424589f..80cfd9378 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
@@ -11,7 +11,6 @@
 
 class NvbenchKernelLaunch(NvbenchBase):
     """The NVBench Kernel Launch benchmark class."""
-
     def __init__(self, name, parameters=''):
         """Constructor.
 
@@ -33,29 +32,29 @@ def _process_raw_result(self, cmd_idx, raw_output):
             True if the raw output string is valid and result can be extracted.
         """
         self._result.add_raw_data(f'raw_output_{cmd_idx}', raw_output, self._args.log_raw_data)
-        
+
         try:
             gpu_section = r"### \[(\d+)\] NVIDIA"
             # Regex pattern to handle different time units and flexible spacing
             row_pat = (
-                r"\|\s*([0-9]+)x\s*\|\s*"                            # Samples
-                r"([\d.]+\s*[μmun]?s)\s*\|\s*"                       # CPU Time (μs, ns, ms, us, s)
-                r"([\d.]+%)\s*\|\s*"                                 # CPU Noise percentage
-                r"([\d.]+\s*[μmun]?s)\s*\|\s*"                       # GPU Time
-                r"([\d.]+%)\s*\|\s*"                                 # GPU Noise percentage
-                r"([0-9]+)x\s*\|\s*"                                 # Batch Samples
-                r"([\d.]+\s*[μmun]?s)\s*\|"                          # Batch GPU Time
+                r"\|\s*([0-9]+)x\s*\|\s*"    # Samples
+                r"([\d.]+\s*[μmun]?s)\s*\|\s*"    # CPU Time (μs, ns, ms, us, s)
+                r"([\d.]+%)\s*\|\s*"    # CPU Noise percentage
+                r"([\d.]+\s*[μmun]?s)\s*\|\s*"    # GPU Time
+                r"([\d.]+%)\s*\|\s*"    # GPU Noise percentage
+                r"([0-9]+)x\s*\|\s*"    # Batch Samples
+                r"([\d.]+\s*[μmun]?s)\s*\|"    # Batch GPU Time
             )
             current = None
-            parsed_any = False  # Track if any valid rows are parsed
-            
+            parsed_any = False    # Track if any valid rows are parsed
+
             for line in raw_output.splitlines():
                 line = line.strip()
                 g = re.match(gpu_section, line)
                 if g:
                     current = f"gpu_{g.group(1)}"
                     continue
-                    
+
                 r = re.match(row_pat, line)
                 if r and current:
                     samples, cpu_time, cpu_noise, gpu_time, gpu_noise, batch_samples, batch_gpu = r.groups()
@@ -67,15 +66,16 @@ def _process_raw_result(self, cmd_idx, raw_output):
                     # self._result.add_result("batch_samples", int(batch_samples.replace('x', '')))
                     self._result.add_result("batch_gpu_time", self._parse_time_value(batch_gpu))
                     parsed_any = True
-                    
+
             if not parsed_any:
                 logger.error("No valid rows parsed from the raw output.")
                 raise RuntimeError("No valid rows parsed")
-                
+
         except Exception as e:
             self._handle_parsing_error(str(e), raw_output)
             return False
-            
+
         return True
 
-BenchmarkRegistry.register_benchmark('nvbench-kernel-launch', NvbenchKernelLaunch, platform=Platform.CUDA)
\ No newline at end of file
+
+BenchmarkRegistry.register_benchmark('nvbench-kernel-launch', NvbenchKernelLaunch, platform=Platform.CUDA)
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
index 03801a666..f505280a2 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
@@ -13,7 +13,6 @@
 
 class NvbenchSleepKernel(NvbenchBase):
     """The NVBench Sleep Kernel benchmark class."""
-
     def __init__(self, name, parameters=''):
         """Constructor.
 
@@ -35,7 +34,7 @@ def add_parser_arguments(self):
             type=str,
             default='[0,25,50,75,100]',
             help='Duration axis values in microseconds. Supports multiple formats: '
-                 '"50" (single value), "[25,50,75]" (list), "[25:75]" (range), "[0:50:10]" (range with step).',
+            '"50" (single value), "[25,50,75]" (list), "[25:75]" (range), "[0:50:10]" (range with step).',
         )
 
     def _preprocess(self):
@@ -49,7 +48,7 @@ def _preprocess(self):
 
         # Build base command with common nvbench arguments
         parts = self._build_base_command()
-        
+
         # Add sleep-kernel specific arguments
         parts.extend(['--axis', f'"Duration (us)={self._args.duration_us.strip()}"'])
 
@@ -77,14 +76,14 @@ def _process_raw_result(self, cmd_idx, raw_output):
             gpu_section = r"### \[(\d+)\] NVIDIA"
             # Regex pattern to handle different time units and flexible spacing
             row_pat = (
-                r"\|\s*([0-9]+)\s*\|\s*"                             # Duration (us)
-                r"([0-9]+)x\s*\|\s*"                                 # Samples
-                r"([\d.]+\s*[μmun]?s)\s*\|\s*"                       # CPU Time (μs, ns, ms, us, s)
-                r"([\d.]+%)\s*\|\s*"                                 # CPU Noise percentage
-                r"([\d.]+\s*[μmun]?s)\s*\|\s*"                       # GPU Time
-                r"([\d.]+%)\s*\|\s*"                                 # GPU Noise percentage
-                r"([0-9]+)x\s*\|\s*"                                 # Batch Samples
-                r"([\d.]+\s*[μmun]?s)\s*\|"                          # Batch GPU Time
+                r"\|\s*([0-9]+)\s*\|\s*"    # Duration (us)
+                r"([0-9]+)x\s*\|\s*"    # Samples
+                r"([\d.]+\s*[μmun]?s)\s*\|\s*"    # CPU Time (μs, ns, ms, us, s)
+                r"([\d.]+%)\s*\|\s*"    # CPU Noise percentage
+                r"([\d.]+\s*[μmun]?s)\s*\|\s*"    # GPU Time
+                r"([\d.]+%)\s*\|\s*"    # GPU Noise percentage
+                r"([0-9]+)x\s*\|\s*"    # Batch Samples
+                r"([\d.]+\s*[μmun]?s)\s*\|"    # Batch GPU Time
             )
             current = None
             parsed_any = False
@@ -99,14 +98,17 @@ def _process_raw_result(self, cmd_idx, raw_output):
                 r = re.match(row_pat, line)
                 if r and current:
                     logger.debug(f"Matched row: {r.groups()}")
-                    duration_us, samples, cpu_time, cpu_noise, gpu_time, gpu_noise, batch_samples, batch_gpu = r.groups()
+                    duration_us, samples, cpu_time, cpu_noise, gpu_time, gpu_noise, batch_samples, batch_gpu = r.groups(
+                    )
                     # self._result.add_result(f'duration_us_{duration_us}_samples', int(samples))
                     self._result.add_result(f'duration_us_{duration_us}_cpu_time', self._parse_time_value(cpu_time))
                     # self._result.add_result(f'duration_us_{duration_us}_cpu_noise', self._parse_percentage(cpu_noise))
                     self._result.add_result(f'duration_us_{duration_us}_gpu_time', self._parse_time_value(gpu_time))
                     # self._result.add_result(f'duration_us_{duration_us}_gpu_noise', self._parse_percentage(gpu_noise))
                     # self._result.add_result(f'duration_us_{duration_us}_batch_samples', int(batch_samples.replace('x', '')))
-                    self._result.add_result(f'duration_us_{duration_us}_batch_gpu_time', self._parse_time_value(batch_gpu))
+                    self._result.add_result(
+                        f'duration_us_{duration_us}_batch_gpu_time', self._parse_time_value(batch_gpu)
+                    )
                     parsed_any = True
             if not parsed_any:
                 raise RuntimeError("No valid rows parsed")
@@ -116,4 +118,5 @@ def _process_raw_result(self, cmd_idx, raw_output):
             return False
         return True
 
-BenchmarkRegistry.register_benchmark('nvbench-sleep-kernel', NvbenchSleepKernel, platform=Platform.CUDA)
\ No newline at end of file
+
+BenchmarkRegistry.register_benchmark('nvbench-sleep-kernel', NvbenchSleepKernel, platform=Platform.CUDA)
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py b/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
index 2923c7d3b..80072231c 100644
--- a/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
@@ -12,7 +12,6 @@
 
 class TestNvbenchKernelLaunchBenchmark(BenchmarkTestCase, unittest.TestCase):
     """Test class for NVBench Kernel Launch benchmark."""
-
     @classmethod
     def setUpClass(cls):
         """Hook method for setting up class fixture before running tests in the class."""
@@ -102,4 +101,4 @@ def test_nvbench_kernel_launch_process_raw_result_invalid_output(self):
 
 
 if __name__ == '__main__':
-    unittest.main()
\ No newline at end of file
+    unittest.main()
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py b/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
index 6d020814a..434c842e7 100644
--- a/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
@@ -12,7 +12,6 @@
 
 class TestNvbenchSleepKernelBenchmark(BenchmarkTestCase, unittest.TestCase):
     """Test class for NVBench Sleep Kernel benchmark."""
-
     @classmethod
     def setUpClass(cls):
         """Hook method for setting up class fixture before running tests in the class."""
@@ -97,22 +96,22 @@ def test_nvbench_sleep_kernel_preprocess_duration_formats(self):
         benchmark = benchmark_class(benchmark_name, parameters='--duration_us "50"')
         assert benchmark._preprocess()
         assert '--axis "Duration (us)=50"' in benchmark._commands[0]
-        
+
         # Test list format
         benchmark = benchmark_class(benchmark_name, parameters='--duration_us "[25,50,75]"')
         assert benchmark._preprocess()
         assert '--axis "Duration (us)=[25,50,75]"' in benchmark._commands[0]
-        
+
         # Test range format
         benchmark = benchmark_class(benchmark_name, parameters='--duration_us "[25:75]"')
         assert benchmark._preprocess()
         assert '--axis "Duration (us)=[25:75]"' in benchmark._commands[0]
-        
+
         # Test range with step format
         benchmark = benchmark_class(benchmark_name, parameters='--duration_us "[0:50:10]"')
         assert benchmark._preprocess()
         assert '--axis "Duration (us)=[0:50:10]"' in benchmark._commands[0]
-        
+
         # Test default format
         benchmark = benchmark_class(benchmark_name, parameters='')
         assert benchmark._preprocess()

From baa57c9ef72678513bf140ce9219b39b7c61fe1e Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Fri, 10 Oct 2025 17:35:57 +0000
Subject: [PATCH 14/37] fix lint

---
 examples/benchmarks/nvbench_kernel_launch.py  |  9 ++++-
 .../micro_benchmarks/nvbench/CMakeLists.txt   | 40 ++++++++++++-------
 .../micro_benchmarks/nvbench_base.py          | 22 +++++-----
 .../micro_benchmarks/nvbench_kernel_launch.py | 22 +++++-----
 .../micro_benchmarks/nvbench_sleep_kernel.py  |  2 +-
 .../test_nvbench_kernel_launch.py             |  2 +-
 .../test_nvbench_sleep_kernel.py              |  2 +-
 7 files changed, 58 insertions(+), 41 deletions(-)

diff --git a/examples/benchmarks/nvbench_kernel_launch.py b/examples/benchmarks/nvbench_kernel_launch.py
index 73377084b..eac2db602 100644
--- a/examples/benchmarks/nvbench_kernel_launch.py
+++ b/examples/benchmarks/nvbench_kernel_launch.py
@@ -1,5 +1,10 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Example of NVBench Kernel Launch benchmark."""
+
 from superbench.benchmarks import BenchmarkRegistry, Platform
-from superbench.common.utils import logger
+from superbench.common.utils importSlogger
 
 if __name__ == '__main__':
     context = BenchmarkRegistry.create_benchmark_context(
@@ -22,4 +27,4 @@
             'benchmark: {}, return code: {}, result: {}'.format(
                 benchmark.name, benchmark.return_code, benchmark.result
             )
-        )
\ No newline at end of file
+        )
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
index f6c8508d4..448e83800 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
@@ -1,18 +1,30 @@
-cmake_minimum_required(VERSION 3.20) project(nvbench_benchmarks LANGUAGES CUDA)
+cmake_minimum_required(VERSION 3.20)
+project(nvbench_benchmarks LANGUAGES CUDA)
 
-    find_package(CUDAToolkit QUIET) if (CUDAToolkit_FOUND) include(../ cuda_common.cmake)
-        find_package(nvbench CONFIG REQUIRED)
+find_package(CUDAToolkit QUIET)
+if (CUDAToolkit_FOUND)
+  include(../cuda_common.cmake)
+  find_package(nvbench CONFIG REQUIRED)
 
-#list all your CUDA benchmark source files here
-            set(NVBENCH_SOURCES kernel_launch.cu sleep_kernel.cu
-#add more *.cu as needed
-                )
+  # list all your CUDA benchmark source files here
+  set(NVBENCH_SOURCES
+    kernel_launch.cu
+    sleep_kernel.cu
+    # add more *.cu as needed
+  )
 
-                foreach (src ${NVBENCH_SOURCES})
-#strip ".cu" → NAME_WE
-                    get_filename_component(basename ${src} NAME_WE) set(target nvbench_${basename})
+  foreach(src ${NVBENCH_SOURCES})
+    # strip ".cu" → NAME_WE
+    get_filename_component(basename ${src} NAME_WE)
+    set(target nvbench_${basename})
 
-                        add_executable(${target} ${src}) target_compile_features(${target} PUBLIC cuda_std_17)
-                            target_link_libraries(${target} PRIVATE nvbench::nvbench nvbench::main)
-                                install(TARGETS ${target} RUNTIME DESTINATION bin) endforeach() else()
-                                    message(STATUS "CUDA not found, skipping nvbench benchmarks.") endif()
\ No newline at end of file
+    add_executable(${target} ${src})
+    target_compile_features(${target} PUBLIC cuda_std_17)
+    target_link_libraries(${target}
+      PRIVATE nvbench::nvbench nvbench::main
+    )
+    install(TARGETS ${target} RUNTIME DESTINATION bin)
+  endforeach()
+else()
+  message(STATUS "CUDA not found, skipping nvbench benchmarks.")
+endif()
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_base.py b/superbench/benchmarks/micro_benchmarks/nvbench_base.py
index 8f472444f..0be282cb4 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_base.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_base.py
@@ -136,12 +136,12 @@ def add_parser_arguments(self):
 
     def _build_base_command(self):
         """Build the base nvbench command with common arguments.
-        
+
         Returns:
             list: Command parts that can be extended by subclasses.
         """
         if not self._bin_name:
-            raise ValueError("Subclass must set _bin_name")
+            raise ValueError('Subclass must set _bin_name')
 
         command = os.path.join(self._args.bin_dir, self._bin_name)
         parts = [command]
@@ -190,7 +190,7 @@ def _build_base_command(self):
 
     def _preprocess(self):
         """Default preprocess implementation. Can be overridden by subclasses.
-        
+
         Returns:
             True if _preprocess() succeed.
         """
@@ -206,10 +206,10 @@ def _preprocess(self):
 
     def _parse_time_value(self, time_str):
         """Parse time string to microseconds.
-        
+
         Args:
-            time_str (str): Time string like "123.45 us", "678.9 ns", etc.
-            
+            time_str (str): Time string like '123.45 us', '678.9 ns', etc.
+
         Returns:
             float: Time in microseconds.
         """
@@ -217,10 +217,10 @@ def _parse_time_value(self, time_str):
 
     def _parse_percentage(self, percent_str):
         """Parse percentage string to float.
-        
+
         Args:
-            percent_str (str): Percentage string like "12.34%"
-            
+            percent_str (str): Percentage string like '12.34%'
+
         Returns:
             float: Percentage value as float.
         """
@@ -230,12 +230,12 @@ def _parse_percentage(self, percent_str):
 
     def _handle_parsing_error(self, error_msg, raw_output):
         """Handle parsing errors consistently.
-        
+
         Args:
             error_msg (str): Error message to log.
             raw_output (str): Raw output that failed to parse.
         """
         self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
         logger.error(
-            f"Invalid result format - round:{self._curr_run_index}, bench:{self._name}, msg:{error_msg}\n{raw_output}"
+            f'Invalid result format - round:{self._curr_run_index}, bench:{self._name}, msg:{error_msg}\n{raw_output}'
         )
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py b/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
index 80cfd9378..83233644f 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
@@ -5,7 +5,7 @@
 
 import re
 from superbench.common.utils import logger
-from superbench.benchmarks import BenchmarkRegistry, ReturnCode, Platform
+from superbench.benchmarks import BenchmarkRegistry, Platform
 from superbench.benchmarks.micro_benchmarks.nvbench_base import NvbenchBase
 
 
@@ -34,16 +34,16 @@ def _process_raw_result(self, cmd_idx, raw_output):
         self._result.add_raw_data(f'raw_output_{cmd_idx}', raw_output, self._args.log_raw_data)
 
         try:
-            gpu_section = r"### \[(\d+)\] NVIDIA"
+            gpu_section = r'### \[(\d+)\] NVIDIA'
             # Regex pattern to handle different time units and flexible spacing
             row_pat = (
-                r"\|\s*([0-9]+)x\s*\|\s*"    # Samples
-                r"([\d.]+\s*[μmun]?s)\s*\|\s*"    # CPU Time (μs, ns, ms, us, s)
-                r"([\d.]+%)\s*\|\s*"    # CPU Noise percentage
-                r"([\d.]+\s*[μmun]?s)\s*\|\s*"    # GPU Time
-                r"([\d.]+%)\s*\|\s*"    # GPU Noise percentage
-                r"([0-9]+)x\s*\|\s*"    # Batch Samples
-                r"([\d.]+\s*[μmun]?s)\s*\|"    # Batch GPU Time
+                r'\|\s*([0-9]+)x\s*\|\s*'    # Samples
+                r'([\d.]+\s*[μmun]?s)\s*\|\s*'    # CPU Time (μs, ns, ms, us, s)
+                r'([\d.]+%)\s*\|\s*'    # CPU Noise percentage
+                r'([\d.]+\s*[μmun]?s)\s*\|\s*'    # GPU Time
+                r'([\d.]+%)\s*\|\s*'    # GPU Noise percentage
+                r'([0-9]+)x\s*\|\s*'    # Batch Samples
+                r'([\d.]+\s*[μmun]?s)\s*\|'    # Batch GPU Time
             )
             current = None
             parsed_any = False    # Track if any valid rows are parsed
@@ -68,8 +68,8 @@ def _process_raw_result(self, cmd_idx, raw_output):
                     parsed_any = True
 
             if not parsed_any:
-                logger.error("No valid rows parsed from the raw output.")
-                raise RuntimeError("No valid rows parsed")
+                logger.error('No valid rows parsed from the raw output.')
+                raise RuntimeError('No valid rows parsed')
 
         except Exception as e:
             self._handle_parsing_error(str(e), raw_output)
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
index f505280a2..64612bb45 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
@@ -111,7 +111,7 @@ def _process_raw_result(self, cmd_idx, raw_output):
                     )
                     parsed_any = True
             if not parsed_any:
-                raise RuntimeError("No valid rows parsed")
+                raise RuntimeError('No valid rows parsed')
         except Exception as e:
             logger.error(f"Error processing raw result: {e}")
             self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py b/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
index 80072231c..02908e5eb 100644
--- a/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
@@ -93,7 +93,7 @@ def test_nvbench_kernel_launch_process_raw_result_invalid_output(self):
         assert benchmark._preprocess()
 
         # Mock raw output with invalid format
-        raw_output = "Invalid output format"
+        raw_output = 'Invalid output format'
 
         # Parse the provided raw output
         assert not benchmark._process_raw_result(0, raw_output)
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py b/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
index 434c842e7..4606768ff 100644
--- a/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
@@ -129,7 +129,7 @@ def test_nvbench_sleep_kernel_process_raw_result_invalid_output(self):
         assert benchmark._preprocess()
 
         # Mock raw output with invalid format
-        raw_output = "Invalid output format"
+        raw_output = 'Invalid output format'
 
         # Parse the provided raw output
         assert not benchmark._process_raw_result(0, raw_output)

From ecce2d9fed5ef2452fa14518077a8b4c0a4bd6fb Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Fri, 10 Oct 2025 17:38:06 +0000
Subject: [PATCH 15/37] fix import

---
 superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
index 64612bb45..b6163f86d 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
@@ -4,8 +4,6 @@
 """Module of the NVBench Sleep Kernel benchmark."""
 
 import re
-import os
-
 from superbench.common.utils import logger
 from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
 from superbench.benchmarks.micro_benchmarks.nvbench_base import NvbenchBase

From 3a58ead25ba7b91c65dd4f5e5f59fbd52753769f Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Fri, 10 Oct 2025 19:54:38 +0000
Subject: [PATCH 16/37] fix

---
 examples/benchmarks/nvbench_kernel_launch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/benchmarks/nvbench_kernel_launch.py b/examples/benchmarks/nvbench_kernel_launch.py
index eac2db602..c0f74f55a 100644
--- a/examples/benchmarks/nvbench_kernel_launch.py
+++ b/examples/benchmarks/nvbench_kernel_launch.py
@@ -4,7 +4,7 @@
 """Example of NVBench Kernel Launch benchmark."""
 
 from superbench.benchmarks import BenchmarkRegistry, Platform
-from superbench.common.utils importSlogger
+from superbench.common.utils import logger
 
 if __name__ == '__main__':
     context = BenchmarkRegistry.create_benchmark_context(

From d0d8773e5148c203e8a74eb75d21c8778df01046 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Fri, 10 Oct 2025 20:14:12 +0000
Subject: [PATCH 17/37] fix

---
 .../micro_benchmarks/nvbench_base.py          | 62 ++++++++++++-------
 .../micro_benchmarks/nvbench_kernel_launch.py | 16 ++---
 .../micro_benchmarks/nvbench_sleep_kernel.py  | 22 +++----
 3 files changed, 57 insertions(+), 43 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_base.py b/superbench/benchmarks/micro_benchmarks/nvbench_base.py
index 0be282cb4..98e705b46 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_base.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_base.py
@@ -134,27 +134,16 @@ def add_parser_arguments(self):
             help='Minimum R-squared for entropy stopping criterion.',
         )
 
-    def _build_base_command(self):
-        """Build the base nvbench command with common arguments.
-
-        Returns:
-            list: Command parts that can be extended by subclasses.
-        """
-        if not self._bin_name:
-            raise ValueError('Subclass must set _bin_name')
-
-        command = os.path.join(self._args.bin_dir, self._bin_name)
-        parts = [command]
-
-        # Device configuration - in distributed mode, let SuperBench handle device assignment
-        # Only add --devices if explicitly specified
+    def _add_device_args(self, parts):
+        """Add device configuration arguments to command parts."""
         if hasattr(self._args, 'devices') and self._args.devices is not None:
             if self._args.devices == 'all':
                 parts.extend(['--devices', 'all'])
             else:
                 parts.extend(['--devices', self._args.devices])
 
-        # Benchmark Properties
+    def _add_benchmark_property_args(self, parts):
+        """Add benchmark property arguments to command parts."""
         if hasattr(self._args, 'skip_time') and self._args.skip_time >= 0:
             parts.extend(['--skip-time', str(self._args.skip_time)])
         if hasattr(self._args, 'throttle_threshold') and self._args.throttle_threshold > 0:
@@ -168,7 +157,8 @@ def _build_base_command(self):
         if hasattr(self._args, 'profile') and self._args.profile:
             parts.append('--profile')
 
-        # Stopping criteria
+    def _add_stopping_criteria_args(self, parts):
+        """Add stopping criteria arguments to command parts."""
         if hasattr(self._args, 'timeout') and self._args.timeout is not None:
             parts.extend(['--timeout', str(self._args.timeout)])
         if hasattr(self._args, 'min_samples') and self._args.min_samples is not None:
@@ -176,15 +166,39 @@ def _build_base_command(self):
         if hasattr(self._args, 'stopping_criterion') and self._args.stopping_criterion:
             parts.extend(['--stopping-criterion', self._args.stopping_criterion])
             if self._args.stopping_criterion == 'stdrel':
-                if hasattr(self._args, 'min_time') and self._args.min_time is not None:
-                    parts.extend(['--min-time', str(self._args.min_time)])
-                if hasattr(self._args, 'max_noise') and self._args.max_noise is not None:
-                    parts.extend(['--max-noise', str(self._args.max_noise)])
+                self._add_stdrel_args(parts)
             elif self._args.stopping_criterion == 'entropy':
-                if hasattr(self._args, 'max_angle') and self._args.max_angle is not None:
-                    parts.extend(['--max-angle', str(self._args.max_angle)])
-                if hasattr(self._args, 'min_r2') and self._args.min_r2 is not None:
-                    parts.extend(['--min-r2', str(self._args.min_r2)])
+                self._add_entropy_args(parts)
+
+    def _add_stdrel_args(self, parts):
+        """Add stdrel-specific stopping criterion arguments."""
+        if hasattr(self._args, 'min_time') and self._args.min_time is not None:
+            parts.extend(['--min-time', str(self._args.min_time)])
+        if hasattr(self._args, 'max_noise') and self._args.max_noise is not None:
+            parts.extend(['--max-noise', str(self._args.max_noise)])
+
+    def _add_entropy_args(self, parts):
+        """Add entropy-specific stopping criterion arguments."""
+        if hasattr(self._args, 'max_angle') and self._args.max_angle is not None:
+            parts.extend(['--max-angle', str(self._args.max_angle)])
+        if hasattr(self._args, 'min_r2') and self._args.min_r2 is not None:
+            parts.extend(['--min-r2', str(self._args.min_r2)])
+
+    def _build_base_command(self):
+        """Build the base nvbench command with common arguments.
+
+        Returns:
+            list: Command parts that can be extended by subclasses.
+        """
+        if not self._bin_name:
+            raise ValueError('Subclass must set _bin_name')
+
+        command = os.path.join(self._args.bin_dir, self._bin_name)
+        parts = [command]
+
+        self._add_device_args(parts)
+        self._add_benchmark_property_args(parts)
+        self._add_stopping_criteria_args(parts)
 
         return parts
 
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py b/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
index 83233644f..5120f1b51 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
@@ -52,19 +52,19 @@ def _process_raw_result(self, cmd_idx, raw_output):
                 line = line.strip()
                 g = re.match(gpu_section, line)
                 if g:
-                    current = f"gpu_{g.group(1)}"
+                    current = f'gpu_{g.group(1)}'
                     continue
 
                 r = re.match(row_pat, line)
                 if r and current:
                     samples, cpu_time, cpu_noise, gpu_time, gpu_noise, batch_samples, batch_gpu = r.groups()
-                    # self._result.add_result("samples", int(samples.replace('x', '')))
-                    self._result.add_result("cpu_time", self._parse_time_value(cpu_time))
-                    # self._result.add_result("cpu_noise", self._parse_percentage(cpu_noise))
-                    self._result.add_result("gpu_time", self._parse_time_value(gpu_time))
-                    # self._result.add_result("gpu_noise", self._parse_percentage(gpu_noise))
-                    # self._result.add_result("batch_samples", int(batch_samples.replace('x', '')))
-                    self._result.add_result("batch_gpu_time", self._parse_time_value(batch_gpu))
+                    # self._result.add_result('samples', int(samples.replace('x', '')))
+                    self._result.add_result('cpu_time', self._parse_time_value(cpu_time))
+                    # self._result.add_result('cpu_noise', self._parse_percentage(cpu_noise))
+                    self._result.add_result('gpu_time', self._parse_time_value(gpu_time))
+                    # self._result.add_result('gpu_noise', self._parse_percentage(gpu_noise))
+                    # self._result.add_result('batch_samples', int(batch_samples.replace('x', '')))
+                    self._result.add_result('batch_gpu_time', self._parse_time_value(batch_gpu))
                     parsed_any = True
 
             if not parsed_any:
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
index b6163f86d..93465a0e8 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
@@ -66,22 +66,22 @@ def _process_raw_result(self, cmd_idx, raw_output):
         Return:
             True if the raw output string is valid and result can be extracted.
         """
-        logger.debug(f"Processing raw result for command index {cmd_idx}.")
-        logger.debug(f"Raw output:\n{raw_output}")
+        logger.debug(f'Processing raw result for command index {cmd_idx}.')
+        logger.debug(f'Raw output:\n{raw_output}')
 
         self._result.add_raw_data(f'raw_output_{cmd_idx}', raw_output, self._args.log_raw_data)
         try:
-            gpu_section = r"### \[(\d+)\] NVIDIA"
+            gpu_section = r'### \[(\d+)\] NVIDIA'
             # Regex pattern to handle different time units and flexible spacing
             row_pat = (
-                r"\|\s*([0-9]+)\s*\|\s*"    # Duration (us)
-                r"([0-9]+)x\s*\|\s*"    # Samples
-                r"([\d.]+\s*[μmun]?s)\s*\|\s*"    # CPU Time (μs, ns, ms, us, s)
-                r"([\d.]+%)\s*\|\s*"    # CPU Noise percentage
-                r"([\d.]+\s*[μmun]?s)\s*\|\s*"    # GPU Time
-                r"([\d.]+%)\s*\|\s*"    # GPU Noise percentage
-                r"([0-9]+)x\s*\|\s*"    # Batch Samples
-                r"([\d.]+\s*[μmun]?s)\s*\|"    # Batch GPU Time
+                r'\|\s*([0-9]+)\s*\|\s*'    # Duration (us)
+                r'([0-9]+)x\s*\|\s*'    # Samples
+                r'([\d.]+\s*[μmun]?s)\s*\|\s*'    # CPU Time (μs, ns, ms, us, s)
+                r'([\d.]+%)\s*\|\s*'    # CPU Noise percentage
+                r'([\d.]+\s*[μmun]?s)\s*\|\s*'    # GPU Time
+                r'([\d.]+%)\s*\|\s*'    # GPU Noise percentage
+                r'([0-9]+)x\s*\|\s*'    # Batch Samples
+                r'([\d.]+\s*[μmun]?s)\s*\|'    # Batch GPU Time
             )
             current = None
             parsed_any = False

From fbb5969388397ab5cd673534b2c4562ab013ad84 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Fri, 10 Oct 2025 20:24:30 +0000
Subject: [PATCH 18/37] use single quotes and wrap long comment in nvbench_sleep_kernel

---
 .../micro_benchmarks/nvbench_sleep_kernel.py        | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
index 93465a0e8..3a2ccd9a7 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
@@ -87,15 +87,15 @@ def _process_raw_result(self, cmd_idx, raw_output):
             parsed_any = False
             for line in raw_output.splitlines():
                 line = line.strip()
-                logger.debug(f"Processing line: {line}")
+                logger.debug(f'Processing line: {line}')
                 g = re.match(gpu_section, line)
                 if g:
-                    current = f"gpu_{g.group(1)}"
-                    logger.debug(f"Found GPU section: {current}")
+                    current = f'gpu_{g.group(1)}'
+                    logger.debug(f'Found GPU section: {current}')
                     continue
                 r = re.match(row_pat, line)
                 if r and current:
-                    logger.debug(f"Matched row: {r.groups()}")
+                    logger.debug(f'Matched row: {r.groups()}')
                     duration_us, samples, cpu_time, cpu_noise, gpu_time, gpu_noise, batch_samples, batch_gpu = r.groups(
                     )
                     # self._result.add_result(f'duration_us_{duration_us}_samples', int(samples))
@@ -103,7 +103,8 @@ def _process_raw_result(self, cmd_idx, raw_output):
                     # self._result.add_result(f'duration_us_{duration_us}_cpu_noise', self._parse_percentage(cpu_noise))
                     self._result.add_result(f'duration_us_{duration_us}_gpu_time', self._parse_time_value(gpu_time))
                     # self._result.add_result(f'duration_us_{duration_us}_gpu_noise', self._parse_percentage(gpu_noise))
-                    # self._result.add_result(f'duration_us_{duration_us}_batch_samples', int(batch_samples.replace('x', '')))
+                    # self._result.add_result(f'duration_us_{duration_us}_batch_samples', 
+                    # int(batch_samples.replace('x', '')))
                     self._result.add_result(
                         f'duration_us_{duration_us}_batch_gpu_time', self._parse_time_value(batch_gpu)
                     )
@@ -111,7 +112,7 @@ def _process_raw_result(self, cmd_idx, raw_output):
             if not parsed_any:
                 raise RuntimeError('No valid rows parsed')
         except Exception as e:
-            logger.error(f"Error processing raw result: {e}")
+            logger.error(f'Error processing raw result: {e}')
             self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
             return False
         return True

From f007745ec4d9e63094042360d44d7f8c2e8c7b02 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Fri, 10 Oct 2025 20:34:54 +0000
Subject: [PATCH 19/37] remove trailing whitespace in nvbench_sleep_kernel comment

---
 superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
index 3a2ccd9a7..e7bcb4322 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
@@ -103,7 +103,7 @@ def _process_raw_result(self, cmd_idx, raw_output):
                     # self._result.add_result(f'duration_us_{duration_us}_cpu_noise', self._parse_percentage(cpu_noise))
                     self._result.add_result(f'duration_us_{duration_us}_gpu_time', self._parse_time_value(gpu_time))
                     # self._result.add_result(f'duration_us_{duration_us}_gpu_noise', self._parse_percentage(gpu_noise))
-                    # self._result.add_result(f'duration_us_{duration_us}_batch_samples', 
+                    # self._result.add_result(f'duration_us_{duration_us}_batch_samples',
                     # int(batch_samples.replace('x', '')))
                     self._result.add_result(
                         f'duration_us_{duration_us}_batch_gpu_time', self._parse_time_value(batch_gpu)

From b6b6082f0016d497d77d98794c6faaf2a7bb1375 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Fri, 10 Oct 2025 21:23:25 +0000
Subject: [PATCH 20/37] make nvbench optional in CMake; add CMake step and bump CodeQL actions to v3

---
 .github/workflows/codeql-analysis.yml         | 18 +++++--
 .../micro_benchmarks/nvbench/CMakeLists.txt   | 47 +++++++++++--------
 2 files changed, 41 insertions(+), 24 deletions(-)

diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
index e53acebf6..4c9d43e56 100644
--- a/.github/workflows/codeql-analysis.yml
+++ b/.github/workflows/codeql-analysis.yml
@@ -29,14 +29,22 @@ jobs:
     steps:
       - name: Checkout
         uses: actions/checkout@v3
+      - name: Install CMake >= 3.20 and pthreads
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential
+          # Install newer CMake
+          sudo apt-get remove --purge --auto-remove cmake
+          wget https://github.com/Kitware/CMake/releases/download/v3.21.0/cmake-3.21.0-linux-x86_64.sh
+          sudo sh cmake-3.21.0-linux-x86_64.sh --skip-license --prefix=/usr/local
       - name: Initialize CodeQL
-        uses: github/codeql-action/init@v2
+        uses: github/codeql-action/init@v3
         with:
           languages: ${{ matrix.language }}
       - name: Autobuild
-        uses: github/codeql-action/autobuild@v2
+        uses: github/codeql-action/autobuild@v3
       - name: Perform CodeQL Analysis
-        uses: github/codeql-action/analyze@v2
+        uses: github/codeql-action/analyze@v3
   analyze-cpp:
     name: CodeQL analyze cpp
     runs-on: ubuntu-latest
@@ -54,10 +62,10 @@ jobs:
           DEBIAN_FRONTEND=noninteractive apt-get update
           DEBIAN_FRONTEND=noninteractive apt-get install -y ffmpeg libavcodec-dev libavformat-dev libavutil-dev libswresample-dev sudo
       - name: Initialize CodeQL
-        uses: github/codeql-action/init@v2
+        uses: github/codeql-action/init@v3
         with:
           languages: cpp
       - name: Build
         run: make cppbuild -j
       - name: Perform CodeQL Analysis
-        uses: github/codeql-action/analyze@v2
+        uses: github/codeql-action/analyze@v3
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
index 448e83800..527176774 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
@@ -4,27 +4,36 @@ project(nvbench_benchmarks LANGUAGES CUDA)
 find_package(CUDAToolkit QUIET)
 if (CUDAToolkit_FOUND)
   include(../cuda_common.cmake)
-  find_package(nvbench CONFIG REQUIRED)
-
-  # list all your CUDA benchmark source files here
-  set(NVBENCH_SOURCES
-    kernel_launch.cu
-    sleep_kernel.cu
-    # add more *.cu as needed
-  )
+  
+  # Try to find nvbench, but don't require it
+  find_package(nvbench CONFIG QUIET)
+  
+  if (nvbench_FOUND)
+    message(STATUS "Found nvbench, building nvbench benchmarks")
+    
+    # list all your CUDA benchmark source files here
+    set(NVBENCH_SOURCES
+      kernel_launch.cu
+      sleep_kernel.cu
+      # add more *.cu as needed
+    )
 
-  foreach(src ${NVBENCH_SOURCES})
-    # strip ".cu" → NAME_WE
-    get_filename_component(basename ${src} NAME_WE)
-    set(target nvbench_${basename})
+    foreach(src ${NVBENCH_SOURCES})
+      # strip ".cu" → NAME_WE
+      get_filename_component(basename ${src} NAME_WE)
+      set(target nvbench_${basename})
 
-    add_executable(${target} ${src})
-    target_compile_features(${target} PUBLIC cuda_std_17)
-    target_link_libraries(${target}
-      PRIVATE nvbench::nvbench nvbench::main
-    )
-    install(TARGETS ${target} RUNTIME DESTINATION bin)
-  endforeach()
+      add_executable(${target} ${src})
+      target_compile_features(${target} PUBLIC cuda_std_17)
+      target_link_libraries(${target}
+        PRIVATE nvbench::nvbench nvbench::main
+      )
+      install(TARGETS ${target} RUNTIME DESTINATION bin)
+    endforeach()
+  else()
+    message(STATUS "nvbench not found, skipping nvbench benchmarks.")
+    message(STATUS "To build nvbench benchmarks, first build the submodule in third_party/nvbench)
+  endif()
 else()
   message(STATUS "CUDA not found, skipping nvbench benchmarks.")
 endif()
\ No newline at end of file

From 0f2c838cd1174046121c0ae061ce9146ca54e717 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Fri, 10 Oct 2025 21:40:50 +0000
Subject: [PATCH 21/37] remove apt cmake in CUDA dockerfiles; close quote in nvbench CMake message

---
 dockerfile/cuda12.8.dockerfile                           | 9 +++++++--
 dockerfile/cuda12.9.dockerfile                           | 9 +++++++--
 dockerfile/cuda13.0.dockerfile                           | 9 +++++++--
 .../benchmarks/micro_benchmarks/nvbench/CMakeLists.txt   | 2 +-
 4 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/dockerfile/cuda12.8.dockerfile b/dockerfile/cuda12.8.dockerfile
index b3bea186f..03534f245 100644
--- a/dockerfile/cuda12.8.dockerfile
+++ b/dockerfile/cuda12.8.dockerfile
@@ -62,11 +62,16 @@ RUN apt-get update && \
     rm -rf /var/lib/apt/lists/* /tmp/*
 
 # Install CMake 3.30.4 for nvbench compatibility
-RUN cd /tmp && \
+RUN apt-get update && \
+    apt-get remove -y cmake cmake-data && \
+    apt-get autoremove -y && \
+    cd /tmp && \
     wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-x86_64.tar.gz && \
     tar -xzf cmake-3.30.4-linux-x86_64.tar.gz && \
     cp -r cmake-3.30.4-linux-x86_64/* /usr/local/ && \
-    rm -rf cmake-3.30.4-linux-x86_64*
+    rm -rf cmake-3.30.4-linux-x86_64* && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
 
 ARG NUM_MAKE_JOBS=
 ARG TARGETPLATFORM
diff --git a/dockerfile/cuda12.9.dockerfile b/dockerfile/cuda12.9.dockerfile
index 34d7a9900..eb47d252e 100644
--- a/dockerfile/cuda12.9.dockerfile
+++ b/dockerfile/cuda12.9.dockerfile
@@ -63,11 +63,16 @@ RUN apt-get update && \
     rm -rf /var/lib/apt/lists/* /tmp/*
 
 # Install CMake 3.30.4 for nvbench compatibility
-RUN cd /tmp && \
+RUN apt-get update && \
+    apt-get remove -y cmake cmake-data && \
+    apt-get autoremove -y && \
+    cd /tmp && \
     wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-x86_64.tar.gz && \
     tar -xzf cmake-3.30.4-linux-x86_64.tar.gz && \
     cp -r cmake-3.30.4-linux-x86_64/* /usr/local/ && \
-    rm -rf cmake-3.30.4-linux-x86_64*
+    rm -rf cmake-3.30.4-linux-x86_64* && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
 
 ARG NUM_MAKE_JOBS=
 ARG TARGETPLATFORM
diff --git a/dockerfile/cuda13.0.dockerfile b/dockerfile/cuda13.0.dockerfile
index d9b54d081..a33a39181 100644
--- a/dockerfile/cuda13.0.dockerfile
+++ b/dockerfile/cuda13.0.dockerfile
@@ -63,11 +63,16 @@ RUN apt-get update && \
     rm -rf /var/lib/apt/lists/* /tmp/*
 
 # Install CMake 3.30.4 for nvbench compatibility
-RUN cd /tmp && \
+RUN apt-get update && \
+    apt-get remove -y cmake cmake-data && \
+    apt-get autoremove -y && \
+    cd /tmp && \
     wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-x86_64.tar.gz && \
     tar -xzf cmake-3.30.4-linux-x86_64.tar.gz && \
     cp -r cmake-3.30.4-linux-x86_64/* /usr/local/ && \
-    rm -rf cmake-3.30.4-linux-x86_64*
+    rm -rf cmake-3.30.4-linux-x86_64* && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
 
 ARG NUM_MAKE_JOBS=
 ARG TARGETPLATFORM
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
index 527176774..ceac53e56 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
@@ -32,7 +32,7 @@ if (CUDAToolkit_FOUND)
     endforeach()
   else()
     message(STATUS "nvbench not found, skipping nvbench benchmarks.")
-    message(STATUS "To build nvbench benchmarks, first build the submodule in third_party/nvbench)
+    message(STATUS "To build nvbench benchmarks, first build the submodule in third_party/nvbench")
   endif()
 else()
   message(STATUS "CUDA not found, skipping nvbench benchmarks.")

From 5bd20f6e599dbfaa7b19946fad915af0e04cdb3f Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Fri, 10 Oct 2025 22:02:25 +0000
Subject: [PATCH 22/37] move CMake install to CodeQL cpp job; install CMake under /opt/cmake

---
 .github/workflows/codeql-analysis.yml | 16 ++++++++--------
 dockerfile/cuda12.8.dockerfile        |  3 ++-
 dockerfile/cuda12.9.dockerfile        |  3 ++-
 dockerfile/cuda13.0.dockerfile        |  3 ++-
 4 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
index 4c9d43e56..2960da08b 100644
--- a/.github/workflows/codeql-analysis.yml
+++ b/.github/workflows/codeql-analysis.yml
@@ -29,14 +29,6 @@ jobs:
     steps:
       - name: Checkout
         uses: actions/checkout@v3
-      - name: Install CMake >= 3.20 and pthreads
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y build-essential
-          # Install newer CMake
-          sudo apt-get remove --purge --auto-remove cmake
-          wget https://github.com/Kitware/CMake/releases/download/v3.21.0/cmake-3.21.0-linux-x86_64.sh
-          sudo sh cmake-3.21.0-linux-x86_64.sh --skip-license --prefix=/usr/local
       - name: Initialize CodeQL
         uses: github/codeql-action/init@v3
         with:
@@ -61,6 +53,14 @@ jobs:
         run: |
           DEBIAN_FRONTEND=noninteractive apt-get update
           DEBIAN_FRONTEND=noninteractive apt-get install -y ffmpeg libavcodec-dev libavformat-dev libavutil-dev libswresample-dev sudo
+      - name: Install CMake >= 3.20 and pthreads
+        run: |
+          apt-get update
+          apt-get install -y build-essential
+          apt-get remove --purge --auto-remove cmake
+          wget https://github.com/Kitware/CMake/releases/download/v3.21.0/cmake-3.21.0-linux-x86_64.sh
+          sh cmake-3.21.0-linux-x86_64.sh --skip-license --prefix=/usr/local
+          cmake --version
       - name: Initialize CodeQL
         uses: github/codeql-action/init@v3
         with:
diff --git a/dockerfile/cuda12.8.dockerfile b/dockerfile/cuda12.8.dockerfile
index 03534f245..59cc15408 100644
--- a/dockerfile/cuda12.8.dockerfile
+++ b/dockerfile/cuda12.8.dockerfile
@@ -68,7 +68,8 @@ RUN apt-get update && \
     cd /tmp && \
     wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-x86_64.tar.gz && \
     tar -xzf cmake-3.30.4-linux-x86_64.tar.gz && \
-    cp -r cmake-3.30.4-linux-x86_64/* /usr/local/ && \
+    mv cmake-3.30.4-linux-x86_64 /opt/cmake && \
+    ln -sf /opt/cmake/bin/* /usr/local/bin/ && \
     rm -rf cmake-3.30.4-linux-x86_64* && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
diff --git a/dockerfile/cuda12.9.dockerfile b/dockerfile/cuda12.9.dockerfile
index eb47d252e..6d1fe537f 100644
--- a/dockerfile/cuda12.9.dockerfile
+++ b/dockerfile/cuda12.9.dockerfile
@@ -69,7 +69,8 @@ RUN apt-get update && \
     cd /tmp && \
     wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-x86_64.tar.gz && \
     tar -xzf cmake-3.30.4-linux-x86_64.tar.gz && \
-    cp -r cmake-3.30.4-linux-x86_64/* /usr/local/ && \
+    mv cmake-3.30.4-linux-x86_64 /opt/cmake && \
+    ln -sf /opt/cmake/bin/* /usr/local/bin/ && \
     rm -rf cmake-3.30.4-linux-x86_64* && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
diff --git a/dockerfile/cuda13.0.dockerfile b/dockerfile/cuda13.0.dockerfile
index a33a39181..649f236ad 100644
--- a/dockerfile/cuda13.0.dockerfile
+++ b/dockerfile/cuda13.0.dockerfile
@@ -69,7 +69,8 @@ RUN apt-get update && \
     cd /tmp && \
     wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-x86_64.tar.gz && \
     tar -xzf cmake-3.30.4-linux-x86_64.tar.gz && \
-    cp -r cmake-3.30.4-linux-x86_64/* /usr/local/ && \
+    mv cmake-3.30.4-linux-x86_64 /opt/cmake && \
+    ln -sf /opt/cmake/bin/* /usr/local/bin/ && \
     rm -rf cmake-3.30.4-linux-x86_64* && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*

From ab88d254861d0f7e0214bc60695a93b807700739 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Fri, 10 Oct 2025 22:30:16 +0000
Subject: [PATCH 23/37] fix pipeline: use get-cmake action; pick CMake arch from TARGETPLATFORM

---
 .github/workflows/codeql-analysis.yml | 14 +++++---------
 dockerfile/cuda12.8.dockerfile        | 13 +++++++++----
 dockerfile/cuda12.9.dockerfile        | 13 +++++++++----
 dockerfile/cuda13.0.dockerfile        | 13 +++++++++----
 4 files changed, 32 insertions(+), 21 deletions(-)

diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
index 2960da08b..e8d1ce862 100644
--- a/.github/workflows/codeql-analysis.yml
+++ b/.github/workflows/codeql-analysis.yml
@@ -52,15 +52,11 @@ jobs:
       - name: Install Dependency
         run: |
           DEBIAN_FRONTEND=noninteractive apt-get update
-          DEBIAN_FRONTEND=noninteractive apt-get install -y ffmpeg libavcodec-dev libavformat-dev libavutil-dev libswresample-dev sudo
-      - name: Install CMake >= 3.20 and pthreads
-        run: |
-          apt-get update
-          apt-get install -y build-essential
-          apt-get remove --purge --auto-remove cmake
-          wget https://github.com/Kitware/CMake/releases/download/v3.21.0/cmake-3.21.0-linux-x86_64.sh
-          sh cmake-3.21.0-linux-x86_64.sh --skip-license --prefix=/usr/local
-          cmake --version
+          DEBIAN_FRONTEND=noninteractive apt-get install -y ffmpeg libavcodec-dev libavformat-dev libavutil-dev libswresample-dev sudo build-essential
+      - name: Setup CMake
+        uses: lukka/get-cmake@latest
+        with:
+          cmakeVersion: '3.20.0'
       - name: Initialize CodeQL
         uses: github/codeql-action/init@v3
         with:
diff --git a/dockerfile/cuda12.8.dockerfile b/dockerfile/cuda12.8.dockerfile
index 59cc15408..d2eebd2b2 100644
--- a/dockerfile/cuda12.8.dockerfile
+++ b/dockerfile/cuda12.8.dockerfile
@@ -66,11 +66,16 @@ RUN apt-get update && \
     apt-get remove -y cmake cmake-data && \
     apt-get autoremove -y && \
     cd /tmp && \
-    wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-x86_64.tar.gz && \
-    tar -xzf cmake-3.30.4-linux-x86_64.tar.gz && \
-    mv cmake-3.30.4-linux-x86_64 /opt/cmake && \
+    case ${TARGETPLATFORM} in \
+        "linux/arm64") CMAKE_ARCH="aarch64" ;; \
+        "linux/amd64") CMAKE_ARCH="x86_64" ;; \
+        *) CMAKE_ARCH="x86_64" ;; \
+    esac && \
+    wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
+    tar -xzf cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
+    mv cmake-3.30.4-linux-${CMAKE_ARCH} /opt/cmake && \
     ln -sf /opt/cmake/bin/* /usr/local/bin/ && \
-    rm -rf cmake-3.30.4-linux-x86_64* && \
+    rm -rf cmake-3.30.4-linux-${CMAKE_ARCH}* && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
diff --git a/dockerfile/cuda12.9.dockerfile b/dockerfile/cuda12.9.dockerfile
index 6d1fe537f..f776f35c8 100644
--- a/dockerfile/cuda12.9.dockerfile
+++ b/dockerfile/cuda12.9.dockerfile
@@ -67,11 +67,16 @@ RUN apt-get update && \
     apt-get remove -y cmake cmake-data && \
     apt-get autoremove -y && \
     cd /tmp && \
-    wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-x86_64.tar.gz && \
-    tar -xzf cmake-3.30.4-linux-x86_64.tar.gz && \
-    mv cmake-3.30.4-linux-x86_64 /opt/cmake && \
+    case ${TARGETPLATFORM} in \
+        "linux/arm64") CMAKE_ARCH="aarch64" ;; \
+        "linux/amd64") CMAKE_ARCH="x86_64" ;; \
+        *) CMAKE_ARCH="x86_64" ;; \
+    esac && \
+    wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
+    tar -xzf cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
+    mv cmake-3.30.4-linux-${CMAKE_ARCH} /opt/cmake && \
     ln -sf /opt/cmake/bin/* /usr/local/bin/ && \
-    rm -rf cmake-3.30.4-linux-x86_64* && \
+    rm -rf cmake-3.30.4-linux-${CMAKE_ARCH}* && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
diff --git a/dockerfile/cuda13.0.dockerfile b/dockerfile/cuda13.0.dockerfile
index 649f236ad..858043176 100644
--- a/dockerfile/cuda13.0.dockerfile
+++ b/dockerfile/cuda13.0.dockerfile
@@ -67,11 +67,16 @@ RUN apt-get update && \
     apt-get remove -y cmake cmake-data && \
     apt-get autoremove -y && \
     cd /tmp && \
-    wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-x86_64.tar.gz && \
-    tar -xzf cmake-3.30.4-linux-x86_64.tar.gz && \
-    mv cmake-3.30.4-linux-x86_64 /opt/cmake && \
+    case ${TARGETPLATFORM} in \
+        "linux/arm64") CMAKE_ARCH="aarch64" ;; \
+        "linux/amd64") CMAKE_ARCH="x86_64" ;; \
+        *) CMAKE_ARCH="x86_64" ;; \
+    esac && \
+    wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
+    tar -xzf cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
+    mv cmake-3.30.4-linux-${CMAKE_ARCH} /opt/cmake && \
     ln -sf /opt/cmake/bin/* /usr/local/bin/ && \
-    rm -rf cmake-3.30.4-linux-x86_64* && \
+    rm -rf cmake-3.30.4-linux-${CMAKE_ARCH}* && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 

From 3faaf60bae2ade90fff3d732a70270118e5dab9e Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Mon, 13 Oct 2025 23:51:45 +0000
Subject: [PATCH 24/37] fix cmake: detect download arch with uname -m in CUDA dockerfiles

---
 dockerfile/cuda12.8.dockerfile |  9 ++++++---
 dockerfile/cuda12.9.dockerfile | 10 +++++++---
 dockerfile/cuda13.0.dockerfile |  9 ++++++---
 3 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/dockerfile/cuda12.8.dockerfile b/dockerfile/cuda12.8.dockerfile
index d2eebd2b2..e3fd4d797 100644
--- a/dockerfile/cuda12.8.dockerfile
+++ b/dockerfile/cuda12.8.dockerfile
@@ -66,11 +66,14 @@ RUN apt-get update && \
     apt-get remove -y cmake cmake-data && \
     apt-get autoremove -y && \
     cd /tmp && \
-    case ${TARGETPLATFORM} in \
-        "linux/arm64") CMAKE_ARCH="aarch64" ;; \
-        "linux/amd64") CMAKE_ARCH="x86_64" ;; \
+    ARCH=$(uname -m) && \
+    case ${ARCH} in \
+        "aarch64") CMAKE_ARCH="aarch64" ;; \
+        "x86_64") CMAKE_ARCH="x86_64" ;; \
+        "arm64") CMAKE_ARCH="aarch64" ;; \
         *) CMAKE_ARCH="x86_64" ;; \
     esac && \
+    echo "Detected architecture: ${ARCH}, using CMAKE_ARCH: ${CMAKE_ARCH}" && \
     wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
     tar -xzf cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
     mv cmake-3.30.4-linux-${CMAKE_ARCH} /opt/cmake && \
diff --git a/dockerfile/cuda12.9.dockerfile b/dockerfile/cuda12.9.dockerfile
index f776f35c8..cac7a662d 100644
--- a/dockerfile/cuda12.9.dockerfile
+++ b/dockerfile/cuda12.9.dockerfile
@@ -67,11 +67,14 @@ RUN apt-get update && \
     apt-get remove -y cmake cmake-data && \
     apt-get autoremove -y && \
     cd /tmp && \
-    case ${TARGETPLATFORM} in \
-        "linux/arm64") CMAKE_ARCH="aarch64" ;; \
-        "linux/amd64") CMAKE_ARCH="x86_64" ;; \
+    ARCH=$(uname -m) && \
+    case ${ARCH} in \
+        "aarch64") CMAKE_ARCH="aarch64" ;; \
+        "x86_64") CMAKE_ARCH="x86_64" ;; \
+        "arm64") CMAKE_ARCH="aarch64" ;; \
         *) CMAKE_ARCH="x86_64" ;; \
     esac && \
+    echo "Detected architecture: ${ARCH}, using CMAKE_ARCH: ${CMAKE_ARCH}" && \
     wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
     tar -xzf cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
     mv cmake-3.30.4-linux-${CMAKE_ARCH} /opt/cmake && \
@@ -80,6 +83,7 @@ RUN apt-get update && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
+
 ARG NUM_MAKE_JOBS=
 ARG TARGETPLATFORM
 ARG TARGETARCH
diff --git a/dockerfile/cuda13.0.dockerfile b/dockerfile/cuda13.0.dockerfile
index 858043176..63e5eeeb8 100644
--- a/dockerfile/cuda13.0.dockerfile
+++ b/dockerfile/cuda13.0.dockerfile
@@ -67,11 +67,14 @@ RUN apt-get update && \
     apt-get remove -y cmake cmake-data && \
     apt-get autoremove -y && \
     cd /tmp && \
-    case ${TARGETPLATFORM} in \
-        "linux/arm64") CMAKE_ARCH="aarch64" ;; \
-        "linux/amd64") CMAKE_ARCH="x86_64" ;; \
+    ARCH=$(uname -m) && \
+    case ${ARCH} in \
+        "aarch64") CMAKE_ARCH="aarch64" ;; \
+        "x86_64") CMAKE_ARCH="x86_64" ;; \
+        "arm64") CMAKE_ARCH="aarch64" ;; \
         *) CMAKE_ARCH="x86_64" ;; \
     esac && \
+    echo "Detected architecture: ${ARCH}, using CMAKE_ARCH: ${CMAKE_ARCH}" && \
     wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
     tar -xzf cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
     mv cmake-3.30.4-linux-${CMAKE_ARCH} /opt/cmake && \

From 896a46a61d259fc5c500de024f77a9b110625667 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Tue, 14 Oct 2025 17:02:55 +0000
Subject: [PATCH 25/37] fix pipeline

---
 .../benchmarks/micro_benchmarks/nvbench/CMakeLists.txt      | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
index ceac53e56..0aeb37730 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
@@ -1,3 +1,9 @@
+# skip the build for < cuda12.8
+if(CMAKE_VERSION VERSION_LESS "3.20")
+  message(STATUS "CMake version ${CMAKE_VERSION} is less than 3.20, skipping nvbench benchmarks")
+  return()
+endif()
+
 cmake_minimum_required(VERSION 3.20)
 project(nvbench_benchmarks LANGUAGES CUDA)
 

From 5d4986b67b62cf569f6bb3dc023ca427208e9646 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Tue, 14 Oct 2025 19:46:45 +0000
Subject: [PATCH 26/37] fix pipeline

---
 dockerfile/cuda13.0.dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dockerfile/cuda13.0.dockerfile b/dockerfile/cuda13.0.dockerfile
index 63e5eeeb8..55ed300df 100644
--- a/dockerfile/cuda13.0.dockerfile
+++ b/dockerfile/cuda13.0.dockerfile
@@ -172,7 +172,7 @@ ADD dockerfile/etc /opt/microsoft/
 WORKDIR ${SB_HOME}
 
 ADD third_party third_party
-RUN make -C third_party cuda_with_msccl cuda_nvbench
+RUN make -C third_party cuda cuda_nvbench
 
 ADD . .
 RUN python3 -m pip install --upgrade setuptools==78.1.0 && \

From b246522a10992a35266feaac6d860d78564ea7c8 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Tue, 14 Oct 2025 23:42:18 +0000
Subject: [PATCH 27/37] fix pipeline & mlc version

---
 dockerfile/cuda11.1.1.dockerfile                     |  2 +-
 dockerfile/cuda12.2.dockerfile                       |  2 +-
 dockerfile/cuda12.4.dockerfile                       |  2 +-
 dockerfile/cuda12.8.dockerfile                       |  2 +-
 dockerfile/cuda12.9.dockerfile                       |  2 +-
 dockerfile/cuda13.0.dockerfile                       |  2 +-
 dockerfile/rocm5.0.x.dockerfile                      |  2 +-
 dockerfile/rocm5.1.x.dockerfile                      |  2 +-
 dockerfile/rocm5.7.x.dockerfile                      |  2 +-
 dockerfile/rocm6.0.x.dockerfile                      |  2 +-
 .../micro_benchmarks/nvbench/CMakeLists.txt          | 12 ++++++------
 11 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/dockerfile/cuda11.1.1.dockerfile b/dockerfile/cuda11.1.1.dockerfile
index ece2f3a6c..16fd447a4 100644
--- a/dockerfile/cuda11.1.1.dockerfile
+++ b/dockerfile/cuda11.1.1.dockerfile
@@ -114,7 +114,7 @@ RUN cd /tmp && \
 
 # Install Intel MLC
 RUN cd /tmp && \
-    wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \
+    wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \
     tar xzf mlc.tgz Linux/mlc && \
     cp ./Linux/mlc /usr/local/bin/ && \
     rm -rf ./Linux mlc.tgz
diff --git a/dockerfile/cuda12.2.dockerfile b/dockerfile/cuda12.2.dockerfile
index b53fe1c7c..d5f8a9873 100644
--- a/dockerfile/cuda12.2.dockerfile
+++ b/dockerfile/cuda12.2.dockerfile
@@ -95,7 +95,7 @@ RUN cd /opt && \
 
 # Install Intel MLC
 RUN cd /tmp && \
-    wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \
+    wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \
     tar xzf mlc.tgz Linux/mlc && \
     cp ./Linux/mlc /usr/local/bin/ && \
     rm -rf ./Linux mlc.tgz
diff --git a/dockerfile/cuda12.4.dockerfile b/dockerfile/cuda12.4.dockerfile
index 560f0908a..db9841b45 100644
--- a/dockerfile/cuda12.4.dockerfile
+++ b/dockerfile/cuda12.4.dockerfile
@@ -104,7 +104,7 @@ RUN TARGETARCH_HW=$(uname -m) && \
 RUN if [ "$TARGETARCH" = "amd64" ]; then \
     # Install Intel MLC
     cd /tmp && \
-    wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \
+    wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \
     tar xzf mlc.tgz Linux/mlc && \
     cp ./Linux/mlc /usr/local/bin/ && \
     rm -rf ./Linux mlc.tgz && \
diff --git a/dockerfile/cuda12.8.dockerfile b/dockerfile/cuda12.8.dockerfile
index e3fd4d797..f2bbdf183 100644
--- a/dockerfile/cuda12.8.dockerfile
+++ b/dockerfile/cuda12.8.dockerfile
@@ -126,7 +126,7 @@ RUN TARGETARCH_HW=$(uname -m) && \
 RUN if [ "$TARGETARCH" = "amd64" ]; then \
     # Install Intel MLC
     cd /tmp && \
-    wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \
+    wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \
     tar xzf mlc.tgz Linux/mlc && \
     cp ./Linux/mlc /usr/local/bin/ && \
     rm -rf ./Linux mlc.tgz && \
diff --git a/dockerfile/cuda12.9.dockerfile b/dockerfile/cuda12.9.dockerfile
index cac7a662d..c66e27f9d 100644
--- a/dockerfile/cuda12.9.dockerfile
+++ b/dockerfile/cuda12.9.dockerfile
@@ -128,7 +128,7 @@ RUN TARGETARCH_HW=$(uname -m) && \
 RUN if [ "$TARGETARCH" = "amd64" ]; then \
     # Install Intel MLC
     cd /tmp && \
-    wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \
+    wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \
     tar xzf mlc.tgz Linux/mlc && \
     cp ./Linux/mlc /usr/local/bin/ && \
     rm -rf ./Linux mlc.tgz && \
diff --git a/dockerfile/cuda13.0.dockerfile b/dockerfile/cuda13.0.dockerfile
index 55ed300df..0baa8cc69 100644
--- a/dockerfile/cuda13.0.dockerfile
+++ b/dockerfile/cuda13.0.dockerfile
@@ -127,7 +127,7 @@ RUN TARGETARCH_HW=$(uname -m) && \
 RUN if [ "$TARGETARCH" = "amd64" ]; then \
     # Install Intel MLC
     cd /tmp && \
-    wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \
+    wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \
     tar xzf mlc.tgz Linux/mlc && \
     cp ./Linux/mlc /usr/local/bin/ && \
     rm -rf ./Linux mlc.tgz && \
diff --git a/dockerfile/rocm5.0.x.dockerfile b/dockerfile/rocm5.0.x.dockerfile
index 8bb718794..321590e8c 100644
--- a/dockerfile/rocm5.0.x.dockerfile
+++ b/dockerfile/rocm5.0.x.dockerfile
@@ -98,7 +98,7 @@ RUN cd /tmp && \
 
 # Install Intel MLC
 RUN cd /tmp && \
-    wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \
+    wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \
     tar xzf mlc.tgz Linux/mlc && \
     cp ./Linux/mlc /usr/local/bin/ && \
     rm -rf ./Linux mlc.tgz
diff --git a/dockerfile/rocm5.1.x.dockerfile b/dockerfile/rocm5.1.x.dockerfile
index e56c35c27..8cfaa4f52 100644
--- a/dockerfile/rocm5.1.x.dockerfile
+++ b/dockerfile/rocm5.1.x.dockerfile
@@ -110,7 +110,7 @@ RUN cd /tmp && \
 
 # Install Intel MLC
 RUN cd /tmp && \
-    wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \
+    wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \
     tar xzf mlc.tgz Linux/mlc && \
     cp ./Linux/mlc /usr/local/bin/ && \
     rm -rf ./Linux mlc.tgz
diff --git a/dockerfile/rocm5.7.x.dockerfile b/dockerfile/rocm5.7.x.dockerfile
index 85ba1919e..acff9f9e3 100644
--- a/dockerfile/rocm5.7.x.dockerfile
+++ b/dockerfile/rocm5.7.x.dockerfile
@@ -126,7 +126,7 @@ RUN cd /tmp && \
 
 # Install Intel MLC
 RUN cd /tmp && \
-    wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \
+    wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \
     tar xzf mlc.tgz Linux/mlc && \
     cp ./Linux/mlc /usr/local/bin/ && \
     rm -rf ./Linux mlc.tgz
diff --git a/dockerfile/rocm6.0.x.dockerfile b/dockerfile/rocm6.0.x.dockerfile
index ce5736e29..436864c19 100644
--- a/dockerfile/rocm6.0.x.dockerfile
+++ b/dockerfile/rocm6.0.x.dockerfile
@@ -132,7 +132,7 @@ RUN cd /tmp && \
 
 # Install Intel MLC
 RUN cd /tmp && \
-    wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \
+    wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \
     tar xzf mlc.tgz Linux/mlc && \
     cp ./Linux/mlc /usr/local/bin/ && \
     rm -rf ./Linux mlc.tgz
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
index 0aeb37730..8415b10b4 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
@@ -1,12 +1,12 @@
-# skip the build for < cuda12.8
-if(CMAKE_VERSION VERSION_LESS "3.20")
-  message(STATUS "CMake version ${CMAKE_VERSION} is less than 3.20, skipping nvbench benchmarks")
+cmake_minimum_required(VERSION 3.18)
+project(nvbench_benchmarks LANGUAGES CUDA)
+
+# Check if we have a recent enough CMake for nvbench (which requires 3.30.4)
+if(CMAKE_VERSION VERSION_LESS "3.30.4")
+  message(STATUS "CMake version ${CMAKE_VERSION} is less than 3.30.4 (required by nvbench), skipping nvbench benchmarks")
   return()
 endif()
 
-cmake_minimum_required(VERSION 3.20)
-project(nvbench_benchmarks LANGUAGES CUDA)
-
 find_package(CUDAToolkit QUIET)
 if (CUDAToolkit_FOUND)
   include(../cuda_common.cmake)

From 0804c12ccb262c2d09cc63be14be8570c7bda20e Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Fri, 6 Feb 2026 11:03:59 -0800
Subject: [PATCH 28/37] fix comments

---
 examples/benchmarks/nvbench_kernel_launch.py  |   8 +-
 examples/benchmarks/nvbench_sleep_kernel.py   |  18 +-
 .../micro_benchmarks/nvbench_base.py          |   8 +-
 .../micro_benchmarks/test_nvbench_base.py     | 271 ++++++++++++++++++
 .../test_nvbench_kernel_launch.py             |  10 +-
 .../test_nvbench_sleep_kernel.py              |  10 +-
 6 files changed, 299 insertions(+), 26 deletions(-)
 create mode 100644 tests/benchmarks/micro_benchmarks/test_nvbench_base.py

diff --git a/examples/benchmarks/nvbench_kernel_launch.py b/examples/benchmarks/nvbench_kernel_launch.py
index c0f74f55a..5efb14078 100644
--- a/examples/benchmarks/nvbench_kernel_launch.py
+++ b/examples/benchmarks/nvbench_kernel_launch.py
@@ -1,7 +1,11 @@
 # Copyright (c) Microsoft Corporation.
-# Licensed under the MIT License.
+# Licensed under the MIT license.
 
-"""Example of NVBench Kernel Launch benchmark."""
+"""Micro benchmark example for NVBench Kernel Launch.
+
+Commands to run:
+  python3 examples/benchmarks/nvbench_kernel_launch.py
+"""
 
 from superbench.benchmarks import BenchmarkRegistry, Platform
 from superbench.common.utils import logger
diff --git a/examples/benchmarks/nvbench_sleep_kernel.py b/examples/benchmarks/nvbench_sleep_kernel.py
index 083bd0a7c..a6adf7a9c 100644
--- a/examples/benchmarks/nvbench_sleep_kernel.py
+++ b/examples/benchmarks/nvbench_sleep_kernel.py
@@ -1,14 +1,16 @@
 # Copyright (c) Microsoft Corporation.
-# Licensed under the MIT License.
+# Licensed under the MIT license.
 
-"""Example of NVBench Sleep Kernel benchmark."""
+"""Micro benchmark example for NVBench Sleep Kernel.
+
+Commands to run:
+  python3 examples/benchmarks/nvbench_sleep_kernel.py
+"""
 
 from superbench.benchmarks import BenchmarkRegistry, Platform
 from superbench.common.utils import logger
 
-
-def main():
-    """Main method to run the nvbench sleep kernel benchmark."""
+if __name__ == '__main__':
     context = BenchmarkRegistry.create_benchmark_context(
         'nvbench-sleep-kernel', platform=Platform.CUDA, parameters='--duration_us "[25,50,75]" --timeout 10'
     )
@@ -20,9 +22,3 @@ def main():
                 benchmark.name, benchmark.return_code, benchmark.result
             )
         )
-    else:
-        logger.error('benchmark: nvbench-sleep-kernel launch failed.')
-
-
-if __name__ == '__main__':
-    main()
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_base.py b/superbench/benchmarks/micro_benchmarks/nvbench_base.py
index 98e705b46..d6d8ed466 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_base.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_base.py
@@ -11,7 +11,7 @@
 
 
 def parse_time_to_us(raw: str) -> float:
-    """Helper: parse '123.45 us', '678.9 ns', '0.12 ms' → float µs."""
+    """Helper: parse '123.45 us', '678.9 ns', '0.12 ms', '1.5 s' → float µs."""
     raw = raw.strip()
     if raw.endswith('%'):
         return float(raw[:-1])
@@ -20,9 +20,11 @@ def parse_time_to_us(raw: str) -> float:
     if not m:
         return float(raw)
     val, unit = float(m.group(1)), (m.group(2) or 'us')
-    if unit == 'ns':
+    if unit == 's':
+        return val * 1e6
+    elif unit == 'ns':
         return val / 1e3
-    if unit == 'ms':
+    elif unit == 'ms':
         return val * 1e3
     return val
 
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_base.py b/tests/benchmarks/micro_benchmarks/test_nvbench_base.py
new file mode 100644
index 000000000..76d465709
--- /dev/null
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_base.py
@@ -0,0 +1,271 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Tests for nvbench_base module."""
+
+import unittest
+from argparse import Namespace
+
+from tests.helper.testcase import BenchmarkTestCase
+from superbench.benchmarks import ReturnCode
+from superbench.benchmarks.micro_benchmarks.nvbench_base import parse_time_to_us, NvbenchBase
+
+
+class TestParseTimeToUs(unittest.TestCase):
+    """Test class for parse_time_to_us function."""
+
+    def test_parse_microseconds(self):
+        """Test parsing microseconds values."""
+        self.assertAlmostEqual(parse_time_to_us('123.45 us'), 123.45)
+        self.assertAlmostEqual(parse_time_to_us('123.45us'), 123.45)
+        self.assertAlmostEqual(parse_time_to_us('0.5 us'), 0.5)
+
+    def test_parse_nanoseconds(self):
+        """Test parsing nanoseconds values (converted to us)."""
+        self.assertAlmostEqual(parse_time_to_us('1000 ns'), 1.0)
+        self.assertAlmostEqual(parse_time_to_us('1000ns'), 1.0)
+        self.assertAlmostEqual(parse_time_to_us('500 ns'), 0.5)
+        self.assertAlmostEqual(parse_time_to_us('123.456 ns'), 0.123456)
+
+    def test_parse_milliseconds(self):
+        """Test parsing milliseconds values (converted to us)."""
+        self.assertAlmostEqual(parse_time_to_us('1 ms'), 1000.0)
+        self.assertAlmostEqual(parse_time_to_us('1ms'), 1000.0)
+        self.assertAlmostEqual(parse_time_to_us('0.5 ms'), 500.0)
+        self.assertAlmostEqual(parse_time_to_us('0.001 ms'), 1.0)
+
+    def test_parse_percentage(self):
+        """Test parsing percentage values."""
+        self.assertAlmostEqual(parse_time_to_us('50.5%'), 50.5)
+        self.assertAlmostEqual(parse_time_to_us('0.1%'), 0.1)
+        self.assertAlmostEqual(parse_time_to_us('100%'), 100.0)
+
+    def test_parse_plain_number(self):
+        """Test parsing plain numbers without unit (defaults to us)."""
+        self.assertAlmostEqual(parse_time_to_us('123.45'), 123.45)
+        self.assertAlmostEqual(parse_time_to_us('0'), 0.0)
+
+    def test_parse_with_whitespace(self):
+        """Test parsing values with leading/trailing whitespace."""
+        self.assertAlmostEqual(parse_time_to_us('  123.45 us  '), 123.45)
+        self.assertAlmostEqual(parse_time_to_us('\t500 ns\n'), 0.5)
+
+    def test_parse_seconds(self):
+        """Test parsing seconds values (converted to us)."""
+        self.assertAlmostEqual(parse_time_to_us('1 s'), 1000000.0)
+        self.assertAlmostEqual(parse_time_to_us('1s'), 1000000.0)
+        self.assertAlmostEqual(parse_time_to_us('0.5 s'), 500000.0)
+        self.assertAlmostEqual(parse_time_to_us('0.001 s'), 1000.0)
+
+
+class ConcreteNvbenchBase(NvbenchBase):
+    """Concrete implementation of NvbenchBase for testing."""
+
+    def __init__(self, name, parameters=''):
+        """Constructor."""
+        super().__init__(name, parameters)
+        self._bin_name = 'test_nvbench_binary'
+
+
+class TestNvbenchBase(BenchmarkTestCase, unittest.TestCase):
+    """Test class for NvbenchBase class."""
+
+    @classmethod
+    def setUpClass(cls):
+        """Hook method for setting up class fixture before running tests in the class."""
+        super().setUpClass()
+        cls.createMockEnvs(cls)
+        cls.createMockFiles(cls, ['bin/test_nvbench_binary'])
+
+    def test_nvbench_base_init(self):
+        """Test NvbenchBase initialization."""
+        benchmark = ConcreteNvbenchBase('test-benchmark', parameters='')
+        assert benchmark._bin_name == 'test_nvbench_binary'
+        assert benchmark.name == 'test-benchmark'
+
+    def test_nvbench_base_add_parser_arguments(self):
+        """Test NvbenchBase add_parser_arguments."""
+        benchmark = ConcreteNvbenchBase('test-benchmark', parameters='')
+        benchmark._preprocess()
+
+        # Check default values
+        assert benchmark._args.devices is None
+        self.assertAlmostEqual(benchmark._args.skip_time, -1.0)
+        self.assertAlmostEqual(benchmark._args.throttle_threshold, 75.0)
+        self.assertAlmostEqual(benchmark._args.throttle_recovery_delay, 0.05)
+        assert benchmark._args.run_once is False
+        assert benchmark._args.disable_blocking_kernel is False
+        assert benchmark._args.profile is False
+        assert benchmark._args.timeout == 15
+        assert benchmark._args.min_samples == 10
+        assert benchmark._args.stopping_criterion == 'stdrel'
+        self.assertAlmostEqual(benchmark._args.min_time, 0.5)
+        self.assertAlmostEqual(benchmark._args.max_noise, 0.5)
+        self.assertAlmostEqual(benchmark._args.max_angle, 0.048)
+        self.assertAlmostEqual(benchmark._args.min_r2, 0.36)
+
+    def test_nvbench_base_preprocess_default(self):
+        """Test NvbenchBase preprocess with default parameters."""
+        benchmark = ConcreteNvbenchBase('test-benchmark', parameters='')
+        assert benchmark._preprocess()
+        assert benchmark.return_code == ReturnCode.SUCCESS
+        assert len(benchmark._commands) == 1
+        # Check default stopping criterion args are included
+        assert '--timeout 15' in benchmark._commands[0]
+        assert '--min-samples 10' in benchmark._commands[0]
+        assert '--stopping-criterion stdrel' in benchmark._commands[0]
+        assert '--min-time 0.5' in benchmark._commands[0]
+        assert '--max-noise 0.5' in benchmark._commands[0]
+
+    def test_nvbench_base_preprocess_with_devices(self):
+        """Test NvbenchBase preprocess with device configuration."""
+        # Test with specific device
+        benchmark = ConcreteNvbenchBase('test-benchmark', parameters='--devices 0')
+        assert benchmark._preprocess()
+        assert '--devices 0' in benchmark._commands[0]
+
+        # Test with 'all' devices
+        benchmark = ConcreteNvbenchBase('test-benchmark', parameters='--devices all')
+        assert benchmark._preprocess()
+        assert '--devices all' in benchmark._commands[0]
+
+        # Test with multiple devices
+        benchmark = ConcreteNvbenchBase('test-benchmark', parameters='--devices 0,1,2')
+        assert benchmark._preprocess()
+        assert '--devices 0,1,2' in benchmark._commands[0]
+
+    def test_nvbench_base_preprocess_with_benchmark_properties(self):
+        """Test NvbenchBase preprocess with benchmark properties."""
+        parameters = (
+            '--skip-time 1.0 '
+            '--throttle-threshold 80.0 '
+            '--throttle-recovery-delay 0.1 '
+            '--run-once '
+            '--disable-blocking-kernel '
+            '--profile'
+        )
+        benchmark = ConcreteNvbenchBase('test-benchmark', parameters=parameters)
+        assert benchmark._preprocess()
+        assert benchmark.return_code == ReturnCode.SUCCESS
+
+        assert '--skip-time 1.0' in benchmark._commands[0]
+        assert '--throttle-threshold 80.0' in benchmark._commands[0]
+        assert '--throttle-recovery-delay 0.1' in benchmark._commands[0]
+        assert '--run-once' in benchmark._commands[0]
+        assert '--disable-blocking-kernel' in benchmark._commands[0]
+        assert '--profile' in benchmark._commands[0]
+
+    def test_nvbench_base_preprocess_with_stdrel_stopping_criterion(self):
+        """Test NvbenchBase preprocess with stdrel stopping criterion."""
+        parameters = (
+            '--stopping-criterion stdrel '
+            '--min-time 2.0 '
+            '--max-noise 0.3 '
+            '--timeout 30 '
+            '--min-samples 100'
+        )
+        benchmark = ConcreteNvbenchBase('test-benchmark', parameters=parameters)
+        assert benchmark._preprocess()
+        assert benchmark.return_code == ReturnCode.SUCCESS
+
+        assert '--stopping-criterion stdrel' in benchmark._commands[0]
+        assert '--min-time 2.0' in benchmark._commands[0]
+        assert '--max-noise 0.3' in benchmark._commands[0]
+        assert '--timeout 30' in benchmark._commands[0]
+        assert '--min-samples 100' in benchmark._commands[0]
+
+    def test_nvbench_base_preprocess_with_entropy_stopping_criterion(self):
+        """Test NvbenchBase preprocess with entropy stopping criterion."""
+        parameters = (
+            '--stopping-criterion entropy '
+            '--max-angle 0.1 '
+            '--min-r2 0.5 '
+            '--timeout 20 '
+            '--min-samples 50'
+        )
+        benchmark = ConcreteNvbenchBase('test-benchmark', parameters=parameters)
+        assert benchmark._preprocess()
+        assert benchmark.return_code == ReturnCode.SUCCESS
+
+        assert '--stopping-criterion entropy' in benchmark._commands[0]
+        assert '--max-angle 0.1' in benchmark._commands[0]
+        assert '--min-r2 0.5' in benchmark._commands[0]
+        assert '--timeout 20' in benchmark._commands[0]
+        assert '--min-samples 50' in benchmark._commands[0]
+        # stdrel args should not be in entropy mode
+        assert '--min-time' not in benchmark._commands[0]
+        assert '--max-noise' not in benchmark._commands[0]
+
+    def test_nvbench_base_parse_time_value(self):
+        """Test NvbenchBase _parse_time_value method."""
+        benchmark = ConcreteNvbenchBase('test-benchmark', parameters='')
+        benchmark._preprocess()
+
+        self.assertAlmostEqual(benchmark._parse_time_value('100 us'), 100.0)
+        self.assertAlmostEqual(benchmark._parse_time_value('1000 ns'), 1.0)
+        self.assertAlmostEqual(benchmark._parse_time_value('1 ms'), 1000.0)
+
+    def test_nvbench_base_parse_percentage(self):
+        """Test NvbenchBase _parse_percentage method."""
+        benchmark = ConcreteNvbenchBase('test-benchmark', parameters='')
+        benchmark._preprocess()
+
+        self.assertAlmostEqual(benchmark._parse_percentage('50.5%'), 50.5)
+        self.assertAlmostEqual(benchmark._parse_percentage('100%'), 100.0)
+        self.assertAlmostEqual(benchmark._parse_percentage('0.1%'), 0.1)
+        self.assertAlmostEqual(benchmark._parse_percentage(25.0), 25.0)
+
+    def test_nvbench_base_handle_parsing_error(self):
+        """Test NvbenchBase _handle_parsing_error method."""
+        benchmark = ConcreteNvbenchBase('test-benchmark', parameters='')
+        benchmark._preprocess()
+
+        benchmark._handle_parsing_error('Test error message', 'raw output data')
+        assert benchmark.return_code == ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE
+
+    def test_nvbench_base_build_base_command_without_bin_name(self):
+        """Test NvbenchBase _build_base_command raises error without bin_name."""
+        benchmark = NvbenchBase('test-benchmark', parameters='')
+        # Manually call add_parser_arguments and parse to set up _args
+        benchmark.add_parser_arguments()
+        benchmark._args = benchmark._parser.parse_args([])
+        benchmark._args.bin_dir = '/mock/bin'
+
+        with self.assertRaises(ValueError) as context:
+            benchmark._build_base_command()
+        assert 'Subclass must set _bin_name' in str(context.exception)
+
+    def test_nvbench_base_full_command_line(self):
+        """Test NvbenchBase generates complete command line with all options."""
+        parameters = (
+            '--devices 0,1 '
+            '--skip-time 0.5 '
+            '--throttle-threshold 85.0 '
+            '--throttle-recovery-delay 0.02 '
+            '--run-once '
+            '--timeout 60 '
+            '--min-samples 200 '
+            '--stopping-criterion stdrel '
+            '--min-time 1.5 '
+            '--max-noise 0.25'
+        )
+        benchmark = ConcreteNvbenchBase('test-benchmark', parameters=parameters)
+        assert benchmark._preprocess()
+        assert benchmark.return_code == ReturnCode.SUCCESS
+
+        cmd = benchmark._commands[0]
+        assert 'test_nvbench_binary' in cmd
+        assert '--devices 0,1' in cmd
+        assert '--skip-time 0.5' in cmd
+        assert '--throttle-threshold 85.0' in cmd
+        assert '--throttle-recovery-delay 0.02' in cmd
+        assert '--run-once' in cmd
+        assert '--timeout 60' in cmd
+        assert '--min-samples 200' in cmd
+        assert '--stopping-criterion stdrel' in cmd
+        assert '--min-time 1.5' in cmd
+        assert '--max-noise 0.25' in cmd
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py b/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
index 02908e5eb..694d0796c 100644
--- a/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
@@ -74,12 +74,12 @@ def test_nvbench_kernel_launch_result_parsing_real_output(self, results):
 
         # Validate parsed results
         # assert benchmark.result['samples'][0] == 120000
-        assert benchmark.result['cpu_time'][0] == 24.222
-        # assert benchmark.result['cpu_noise'][0] == 30.44
-        assert benchmark.result['gpu_time'][0] == 7.808
-        # assert benchmark.result['gpu_noise'][0] == 14.42
+        self.assertAlmostEqual(benchmark.result['cpu_time'][0], 24.222)
+        # self.assertAlmostEqual(benchmark.result['cpu_noise'][0], 30.44)
+        self.assertAlmostEqual(benchmark.result['gpu_time'][0], 7.808)
+        # self.assertAlmostEqual(benchmark.result['gpu_noise'][0], 14.42)
         # assert benchmark.result['batch_samples'][0] == 300000
-        assert benchmark.result['batch_gpu_time'][0] == 6.024
+        self.assertAlmostEqual(benchmark.result['batch_gpu_time'][0], 6.024)
 
     def test_nvbench_kernel_launch_process_raw_result_invalid_output(self):
         """Test NVBench Kernel Launch benchmark result parsing with invalid output."""
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py b/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
index 4606768ff..5b423182a 100644
--- a/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
@@ -76,12 +76,12 @@ def test_nvbench_sleep_kernel_result_parsing_real_output(self, results):
 
         # Validate parsed results
         # assert benchmark.result['duration_us_25_samples'][0] == 10175
-        assert benchmark.result['duration_us_25_cpu_time'][0] == 42.123
-        # assert benchmark.result['duration_us_25_cpu_noise'][0] == 69.78
-        assert benchmark.result['duration_us_25_gpu_time'][0] == 25.321
-        # assert benchmark.result['duration_us_25_gpu_noise'][0] == 0.93
+        self.assertAlmostEqual(benchmark.result['duration_us_25_cpu_time'][0], 42.123)
+        # self.assertAlmostEqual(benchmark.result['duration_us_25_cpu_noise'][0], 69.78)
+        self.assertAlmostEqual(benchmark.result['duration_us_25_gpu_time'][0], 25.321)
+        # self.assertAlmostEqual(benchmark.result['duration_us_25_gpu_noise'][0], 0.93)
         # assert benchmark.result['duration_us_25_batch_samples'][0] == 17448
-        assert benchmark.result['duration_us_25_batch_gpu_time'][0] == 23.456
+        self.assertAlmostEqual(benchmark.result['duration_us_25_batch_gpu_time'][0], 23.456)
 
         # assert benchmark.result['duration_us_50_samples'][0] == 8187
         # assert benchmark.result['duration_us_75_samples'][0] == 6279

From c1d1e4358581484ed71beb411d8ef46c223c95f3 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Wed, 18 Feb 2026 14:43:20 -0800
Subject: [PATCH 29/37] add auto throughput benchmark

---
 .../benchmarks/nvbench_auto_throughput.py     |  26 +++
 .../benchmarks/micro_benchmarks/__init__.py   |   4 +-
 .../micro_benchmarks/nvbench/CMakeLists.txt   |   1 +
 .../nvbench/NVBENCH_BENCHMARK_GUIDE.md        |  96 ++++++++++
 .../nvbench/auto_throughput.cu                |  65 +++++++
 .../nvbench_auto_throughput.py                | 161 +++++++++++++++++
 .../micro_benchmarks/nvbench_base.py          |  11 --
 .../micro_benchmarks/nvbench_kernel_launch.py |   8 +-
 .../micro_benchmarks/nvbench_sleep_kernel.py  |   8 +-
 .../test_nvbench_auto_throughput.py           | 167 ++++++++++++++++++
 tests/data/nvbench_auto_throughput.log        |  48 +++++
 11 files changed, 575 insertions(+), 20 deletions(-)
 create mode 100644 examples/benchmarks/nvbench_auto_throughput.py
 create mode 100644 superbench/benchmarks/micro_benchmarks/nvbench/NVBENCH_BENCHMARK_GUIDE.md
 create mode 100644 superbench/benchmarks/micro_benchmarks/nvbench/auto_throughput.cu
 create mode 100644 superbench/benchmarks/micro_benchmarks/nvbench_auto_throughput.py
 create mode 100644 tests/benchmarks/micro_benchmarks/test_nvbench_auto_throughput.py
 create mode 100644 tests/data/nvbench_auto_throughput.log

diff --git a/examples/benchmarks/nvbench_auto_throughput.py b/examples/benchmarks/nvbench_auto_throughput.py
new file mode 100644
index 000000000..f99d92fe7
--- /dev/null
+++ b/examples/benchmarks/nvbench_auto_throughput.py
@@ -0,0 +1,26 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Micro benchmark example for NVBench Auto Throughput.
+
+Commands to run:
+  python3 examples/benchmarks/nvbench_auto_throughput.py
+"""
+
+from superbench.benchmarks import BenchmarkRegistry, Platform
+from superbench.common.utils import logger
+
+if __name__ == '__main__':
+    context = BenchmarkRegistry.create_benchmark_context(
+        'nvbench-auto-throughput',
+        platform=Platform.CUDA,
+        parameters='--stride "[1,2,4,8]" --block_size "[256,512]" --timeout 30'
+    )
+
+    benchmark = BenchmarkRegistry.launch_benchmark(context)
+    if benchmark:
+        logger.info(
+            'benchmark: {}, return code: {}, result: {}'.format(
+                benchmark.name, benchmark.return_code, benchmark.result
+            )
+        )
diff --git a/superbench/benchmarks/micro_benchmarks/__init__.py b/superbench/benchmarks/micro_benchmarks/__init__.py
index 47a786f6d..cf0bc0ba8 100644
--- a/superbench/benchmarks/micro_benchmarks/__init__.py
+++ b/superbench/benchmarks/micro_benchmarks/__init__.py
@@ -41,6 +41,7 @@
 from superbench.benchmarks.micro_benchmarks.nvbandwidth import NvBandwidthBenchmark
 from superbench.benchmarks.micro_benchmarks.nvbench_kernel_launch import NvbenchKernelLaunch
 from superbench.benchmarks.micro_benchmarks.nvbench_sleep_kernel import NvbenchSleepKernel
+from superbench.benchmarks.micro_benchmarks.nvbench_auto_throughput import NvbenchAutoThroughput
 
 __all__ = [
     'BlasLtBaseBenchmark', 'ComputationCommunicationOverlap', 'CpuMemBwLatencyBenchmark', 'CpuHplBenchmark',
@@ -50,5 +51,6 @@
     'IBLoopbackBenchmark', 'KernelLaunch', 'MemBwBenchmark', 'MicroBenchmark', 'MicroBenchmarkWithInvoke',
     'ORTInferenceBenchmark', 'RocmGemmFlopsBenchmark', 'RocmMemBwBenchmark', 'ShardingMatmul',
     'TCPConnectivityBenchmark', 'TensorRTInferenceBenchmark', 'DirectXGPUEncodingLatency', 'DirectXGPUCopyBw',
-    'DirectXGPUMemBw', 'DirectXGPUCoreFlops', 'NvBandwidthBenchmark', 'NvbenchKernelLaunch', 'NvbenchSleepKernel'
+    'DirectXGPUMemBw', 'DirectXGPUCoreFlops', 'NvBandwidthBenchmark', 'NvbenchKernelLaunch', 'NvbenchSleepKernel',
+    'NvbenchAutoThroughput'
 ]
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
index 8415b10b4..c00043aaf 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
@@ -21,6 +21,7 @@ if (CUDAToolkit_FOUND)
     set(NVBENCH_SOURCES
       kernel_launch.cu
       sleep_kernel.cu
+      auto_throughput.cu
       # add more *.cu as needed
     )
 
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/NVBENCH_BENCHMARK_GUIDE.md b/superbench/benchmarks/micro_benchmarks/nvbench/NVBENCH_BENCHMARK_GUIDE.md
new file mode 100644
index 000000000..70f884541
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/NVBENCH_BENCHMARK_GUIDE.md
@@ -0,0 +1,96 @@
+# NVBench Benchmark Development Guide
+
+Guide for GitHub Copilot to create new NVBench-based benchmarks in SuperBench.
+
+## Architecture Overview
+
+NVBench benchmarks follow a modular structure. To add a new benchmark `<name>`:
+
+```
+Files to Create/Modify:
+├── superbench/benchmarks/micro_benchmarks/nvbench/
+│   ├── <name>.cu                          # CUDA benchmark (CREATE)
+│   └── CMakeLists.txt                     # Add .cu to NVBENCH_SOURCES (MODIFY)
+├── superbench/benchmarks/micro_benchmarks/
+│   ├── nvbench_<name>.py                  # Python wrapper (CREATE)
+│   └── __init__.py                        # Add import (MODIFY)
+├── tests/benchmarks/micro_benchmarks/
+│   └── test_nvbench_<name>.py             # Test file (CREATE)
+├── tests/data/
+│   └── nvbench_<name>.log                 # Sample output for tests (CREATE)
+├── examples/benchmarks/
+│   └── nvbench_<name>.py                  # Example script (CREATE)
+└── docs/user-tutorial/benchmarks/
+    └── micro-benchmarks.md                # Add metrics documentation (MODIFY)
+```
+
+## Reference Files
+
+When creating a new benchmark, examine these existing implementations:
+
+| Component | Simple (no params) | Parameterized |
+|-----------|-------------------|---------------|
+| CUDA benchmark | `nvbench/kernel_launch.cu` | `nvbench/sleep_kernel.cu` |
+| Python wrapper | `nvbench_kernel_launch.py` | `nvbench_sleep_kernel.py` |
+| Test file | `test_nvbench_kernel_launch.py` | `test_nvbench_sleep_kernel.py` |
+| Test data | `tests/data/nvbench_kernel_launch.log` | `tests/data/nvbench_sleep_kernel.log` |
+| Example | `examples/benchmarks/nvbench_kernel_launch.py` | `examples/benchmarks/nvbench_sleep_kernel.py` |
+
+## Key Components
+
+### Base Class: `nvbench_base.py`
+- `NvbenchBase` - Inherit from this class
+- `_build_base_command()` - Builds command with common NVBench CLI args
+- `parse_time_to_us(str)` - Module-level function (not a method; import it from `nvbench_base`), parses "123.45 us", "678.9 ns", "0.12 ms", "1.5 s" → float µs
+- `_parse_percentage(str)` - Parses "12.34%" → float
+- `_handle_parsing_error()` - Consistent error handling
+
+### CMakeLists.txt
+Add new `.cu` file to `NVBENCH_SOURCES` list.
+
+### Python Wrapper Pattern
+1. Set `self._bin_name = 'nvbench_<name>'` (must match CMake target)
+2. Override `add_parser_arguments()` if benchmark has custom parameters
+3. Override `_preprocess()` if custom command building needed
+4. Implement `_process_raw_result()` to parse NVBench output
+
+### Registration
+- Python: `BenchmarkRegistry.register_benchmark('nvbench-<name>', Nvbench<Name>, platform=Platform.CUDA)`
+- Import in `__init__.py`
+
+### Documentation (`docs/user-tutorial/benchmarks/micro-benchmarks.md`)
+Add a section under "## Computation Benchmarks" with:
+1. `### \`nvbench-<name>\`` - Benchmark name header
+2. `#### Introduction` - Brief description of what the benchmark measures
+3. `#### Metrics` - Table with columns: Name, Unit, Description
+
+Example format (see `nvbench-sleep-kernel` or `nvbench-kernel-launch` sections):
+```markdown
+### `nvbench-<name>`
+
+#### Introduction
+Description of what the benchmark measures and any configuration options.
+
+#### Metrics
+| Name                              | Unit      | Description                    |
+|-----------------------------------|-----------|--------------------------------|
+| nvbench-<name>/${param}_cpu_time  | time (μs) | CPU-measured execution time.   |
+| nvbench-<name>/${param}_gpu_time  | time (μs) | GPU-measured execution time.   |
+```
+
+## NVBench Reference
+
+For advanced NVBench features (axes, types, throughput calculations):
+- Source: `third_party/nvbench/`
+- Examples: `third_party/nvbench/examples/`
+
+## Checklist
+
+- [ ] `<name>.cu` - CUDA benchmark with `NVBENCH_BENCH` macro
+- [ ] `CMakeLists.txt` - Add to `NVBENCH_SOURCES`
+- [ ] `nvbench_<name>.py` - Python wrapper extending `NvbenchBase`
+- [ ] `__init__.py` - Add import
+- [ ] `test_nvbench_<name>.py` - Test file (use `self.assertAlmostEqual` for floats)
+- [ ] `nvbench_<name>.log` - Test data
+- [ ] `micro-benchmarks.md` - Add Introduction and Metrics documentation
+- [ ] `examples/benchmarks/nvbench_<name>.py` - Example script (follow format of other examples)
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/auto_throughput.cu b/superbench/benchmarks/micro_benchmarks/nvbench/auto_throughput.cu
new file mode 100644
index 000000000..7c7bb910c
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/auto_throughput.cu
@@ -0,0 +1,65 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+// Based on nvbench example: auto_throughput.cu
+// This benchmark measures memory throughput and cache hit rates.
+
+#include <nvbench/nvbench.cuh>
+
+// Thrust vectors simplify memory management:
+#include <thrust/device_vector.h>
+
+template <int ItemsPerThread>
+__global__ void throughput_kernel(std::size_t stride, std::size_t elements, const nvbench::int32_t *__restrict__ in,
+                                  nvbench::int32_t *__restrict__ out) {
+    const std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
+    const std::size_t step = gridDim.x * blockDim.x;
+
+    for (std::size_t i = stride * tid; i < stride * elements; i += stride * step) {
+        for (int j = 0; j < ItemsPerThread; j++) {
+            const auto read_id = (ItemsPerThread * i + j) % elements;
+            const auto write_id = tid + j * elements;
+            out[write_id] = in[read_id];
+        }
+    }
+}
+
+// `throughput_bench` copies a 128 MiB buffer of int32_t, and reports throughput
+// and cache hit rates.
+//
+// Calling state.collect_*() enables particular metric collection if nvbench
+// was built with CUPTI support (CMake option: -DNVBench_ENABLE_CUPTI=ON).
+template <int ItemsPerThread>
+void throughput_bench(nvbench::state &state, nvbench::type_list<nvbench::enum_type<ItemsPerThread>>) {
+    // Read the axis parameters and allocate the input/output buffers:
+    const std::size_t stride = static_cast<std::size_t>(state.get_int64("Stride"));
+    const auto threads_in_block = static_cast<int>(state.get_int64("BlockSize"));
+    const std::size_t elements = 128 * 1024 * 1024 / sizeof(nvbench::int32_t);
+    thrust::device_vector<nvbench::int32_t> input(elements);
+    thrust::device_vector<nvbench::int32_t> output(elements * ItemsPerThread);
+
+    // Provide throughput information:
+    state.add_element_count(elements, "Elements");
+
+    // CUPTI metrics - these require nvbench built with -DNVBench_ENABLE_CUPTI=ON
+    // If CUPTI is not available, these calls are no-ops
+    state.collect_dram_throughput();
+    state.collect_l1_hit_rates();
+    state.collect_l2_hit_rates();
+    state.collect_loads_efficiency();
+    state.collect_stores_efficiency();
+
+    const auto blocks_in_grid = static_cast<int>((elements + threads_in_block - 1) / threads_in_block);
+
+    state.exec([&](nvbench::launch &launch) {
+        throughput_kernel<ItemsPerThread><<<blocks_in_grid, threads_in_block, 0, launch.get_stream()>>>(
+            stride, elements, thrust::raw_pointer_cast(input.data()), thrust::raw_pointer_cast(output.data()));
+    });
+}
+
+using items_per_thread = nvbench::enum_type_list<1, 2>;
+
+NVBENCH_BENCH_TYPES(throughput_bench, NVBENCH_TYPE_AXES(items_per_thread))
+    .add_int64_axis("Stride", nvbench::range(1, 4, 3))
+    .add_int64_axis("BlockSize", {128, 256, 512, 1024})
+    .set_timeout(1);
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_auto_throughput.py b/superbench/benchmarks/micro_benchmarks/nvbench_auto_throughput.py
new file mode 100644
index 000000000..cc7e78906
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_auto_throughput.py
@@ -0,0 +1,161 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Module of the NVBench Auto Throughput benchmark."""
+
+import re
+from superbench.common.utils import logger
+from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
+from superbench.benchmarks.micro_benchmarks.nvbench_base import NvbenchBase, parse_time_to_us
+
+
+class NvbenchAutoThroughput(NvbenchBase):
+    """The NVBench Auto Throughput benchmark class.
+
+    This benchmark measures memory throughput and cache hit rates using CUPTI.
+    It copies a 128 MiB buffer with configurable stride and block size, for 1 and 2 items per thread.
+    """
+
+    def __init__(self, name, parameters=''):
+        """Constructor.
+
+        Args:
+            name (str): benchmark name.
+            parameters (str): benchmark parameters.
+        """
+        super().__init__(name, parameters)
+        self._bin_name = 'nvbench_auto_throughput'
+
+    def add_parser_arguments(self):
+        """Add benchmark-specific arguments."""
+        super().add_parser_arguments()
+
+        self._parser.add_argument(
+            '--stride',
+            type=str,
+            default='[1:4]',
+            help='Stride axis values. Supports: "2" (single), "[1,2,4]" (list), "[1:4]" (range), "[1:8:2]" (step).',
+        )
+
+        self._parser.add_argument(
+            '--block_size',
+            type=str,
+            default='[128,256,512,1024]',
+            help='Block size (threads per block). Supports: "256" (single), "[128,256,512,1024]" (list).',
+        )
+
+    def _preprocess(self):
+        """Preprocess/preparation operations before the benchmarking.
+
+        Return:
+            True if _preprocess() succeed.
+        """
+        if not super()._preprocess():
+            return False
+
+        # Build base command with common nvbench arguments
+        parts = self._build_base_command()
+
+        # Add stride axis argument
+        parts.extend(['--axis', f'"Stride={self._args.stride.strip()}"'])
+
+        # Add block size axis argument
+        parts.extend(['--axis', f'"BlockSize={self._args.block_size.strip()}"'])
+
+        # Finalize command
+        self._commands = [' '.join(parts)]
+        return True
+
+    def _process_raw_result(self, cmd_idx, raw_output):
+        """Parse raw results and save the summarized results.
+
+        Args:
+            cmd_idx (int): the index of command corresponding with the raw_output.
+            raw_output (str): raw output string of the micro-benchmark.
+
+        Return:
+            True if the raw output string is valid and result can be extracted.
+        """
+        self._result.add_raw_data(f'raw_output_{cmd_idx}', raw_output, self._args.log_raw_data)
+
+        try:
+            gpu_section = r'### \[(\d+)\] NVIDIA'
+
+            # Pattern for throughput benchmark table output with CUPTI metrics
+            # Table format:
+            # | T | Stride | BlockSize | Elements | HBWPeak | LoadEff | StoreEff | L1HitRate | L2HitRate |
+            # | Samples | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | Samples | Batch GPU |
+            row_pat = (
+                r'\|\s*(\d+)\s*\|'               # T (ItemsPerThread)
+                r'\s*(\d+)\s*\|'                 # Stride
+                r'\s*(\d+)\s*\|'                 # BlockSize
+                r'\s*\d+\s*\|'                   # Elements (skip)
+                r'\s*([\d.]+)%\s*\|'             # HBWPeak
+                r'\s*([\d.]+)%\s*\|'             # LoadEff
+                r'\s*([\d.]+)%\s*\|'             # StoreEff
+                r'\s*([\d.]+)%\s*\|'             # L1HitRate
+                r'\s*([\d.]+)%\s*\|'             # L2HitRate
+                r'\s*\d+x\s*\|'                  # Samples CUPTI (skip)
+                r'\s*\d+x\s*\|'                  # Samples Cold (skip)
+                r'\s*([\d.]+\s*[μmun]?s)\s*\|'   # CPU Time
+                r'\s*[\d.]+%\s*\|'               # CPU Noise (skip)
+                r'\s*([\d.]+\s*[μmun]?s)\s*\|'   # GPU Time
+                r'\s*[\d.]+%\s*\|'               # GPU Noise (skip)
+                r'\s*([\d.]+)([TGMK]?)\s*\|'     # Elem/s (value and unit prefix)
+                r'\s*\d+x\s*\|'                  # Samples Batch (skip)
+                r'\s*([\d.]+\s*[μmun]?s)\s*\|'   # Batch GPU Time
+            )
+
+            current_gpu = None
+            parsed_any = False
+
+            for line in raw_output.splitlines():
+                line = line.strip()
+
+                g = re.match(gpu_section, line)
+                if g:
+                    current_gpu = f'gpu_{g.group(1)}'
+                    continue
+
+                if not current_gpu:
+                    continue
+
+                r = re.match(row_pat, line)
+                if r:
+                    (items_per_thread, stride, block_size,
+                     hbw_peak, load_eff, store_eff, l1_hit, l2_hit,
+                     cpu_time, gpu_time, elem_rate, elem_unit, batch_gpu) = r.groups()
+
+                    prefix = f'ipt_{items_per_thread}_stride_{stride}_blk_{block_size}'
+
+                    # Timing metrics (in microseconds)
+                    self._result.add_result(f'{prefix}_cpu_time', parse_time_to_us(cpu_time))
+                    self._result.add_result(f'{prefix}_gpu_time', parse_time_to_us(gpu_time))
+                    self._result.add_result(f'{prefix}_batch_gpu_time', parse_time_to_us(batch_gpu))
+
+                    # CUPTI metrics (percentages)
+                    self._result.add_result(f'{prefix}_hbw_peak', float(hbw_peak))
+                    self._result.add_result(f'{prefix}_load_eff', float(load_eff))
+                    self._result.add_result(f'{prefix}_store_eff', float(store_eff))
+                    self._result.add_result(f'{prefix}_l1_hit_rate', float(l1_hit))
+                    self._result.add_result(f'{prefix}_l2_hit_rate', float(l2_hit))
+
+                    # Throughput (Elem/s column, normalized to giga-elements per second)
+                    elem_val = float(elem_rate)
+                    unit_multipliers = {'T': 1e3, 'G': 1.0, 'M': 1e-3, 'K': 1e-6, '': 1e-9}
+                    elem_giga = elem_val * unit_multipliers.get(elem_unit, 1e-9)
+                    self._result.add_result(f'{prefix}_throughput', elem_giga)
+
+                    parsed_any = True
+
+            if not parsed_any:
+                raise RuntimeError('No valid rows parsed')
+
+        except Exception as e:
+            self._handle_parsing_error(str(e), raw_output)
+            return False
+
+        return True
+
+
+BenchmarkRegistry.register_benchmark('nvbench-auto-throughput', NvbenchAutoThroughput, platform=Platform.CUDA)
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_base.py b/superbench/benchmarks/micro_benchmarks/nvbench_base.py
index d6d8ed466..04c116332 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_base.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_base.py
@@ -220,17 +220,6 @@ def _preprocess(self):
         self._commands = [' '.join(parts)]
         return True
 
-    def _parse_time_value(self, time_str):
-        """Parse time string to microseconds.
-
-        Args:
-            time_str (str): Time string like '123.45 us', '678.9 ns', etc.
-
-        Returns:
-            float: Time in microseconds.
-        """
-        return parse_time_to_us(time_str)
-
     def _parse_percentage(self, percent_str):
         """Parse percentage string to float.
 
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py b/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
index 5120f1b51..dd4156552 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
@@ -6,7 +6,7 @@
 import re
 from superbench.common.utils import logger
 from superbench.benchmarks import BenchmarkRegistry, Platform
-from superbench.benchmarks.micro_benchmarks.nvbench_base import NvbenchBase
+from superbench.benchmarks.micro_benchmarks.nvbench_base import NvbenchBase, parse_time_to_us
 
 
 class NvbenchKernelLaunch(NvbenchBase):
@@ -59,12 +59,12 @@ def _process_raw_result(self, cmd_idx, raw_output):
                 if r and current:
                     samples, cpu_time, cpu_noise, gpu_time, gpu_noise, batch_samples, batch_gpu = r.groups()
                     # self._result.add_result('samples', int(samples.replace('x', '')))
-                    self._result.add_result('cpu_time', self._parse_time_value(cpu_time))
+                    self._result.add_result('cpu_time', parse_time_to_us(cpu_time))
                     # self._result.add_result('cpu_noise', self._parse_percentage(cpu_noise))
-                    self._result.add_result('gpu_time', self._parse_time_value(gpu_time))
+                    self._result.add_result('gpu_time', parse_time_to_us(gpu_time))
                     # self._result.add_result('gpu_noise', self._parse_percentage(gpu_noise))
                     # self._result.add_result('batch_samples', int(batch_samples.replace('x', '')))
-                    self._result.add_result('batch_gpu_time', self._parse_time_value(batch_gpu))
+                    self._result.add_result('batch_gpu_time', parse_time_to_us(batch_gpu))
                     parsed_any = True
 
             if not parsed_any:
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
index e7bcb4322..dca625b21 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
@@ -6,7 +6,7 @@
 import re
 from superbench.common.utils import logger
 from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
-from superbench.benchmarks.micro_benchmarks.nvbench_base import NvbenchBase
+from superbench.benchmarks.micro_benchmarks.nvbench_base import NvbenchBase, parse_time_to_us
 
 
 class NvbenchSleepKernel(NvbenchBase):
@@ -99,14 +99,14 @@ def _process_raw_result(self, cmd_idx, raw_output):
                     duration_us, samples, cpu_time, cpu_noise, gpu_time, gpu_noise, batch_samples, batch_gpu = r.groups(
                     )
                     # self._result.add_result(f'duration_us_{duration_us}_samples', int(samples))
-                    self._result.add_result(f'duration_us_{duration_us}_cpu_time', self._parse_time_value(cpu_time))
+                    self._result.add_result(f'duration_us_{duration_us}_cpu_time', parse_time_to_us(cpu_time))
                     # self._result.add_result(f'duration_us_{duration_us}_cpu_noise', self._parse_percentage(cpu_noise))
-                    self._result.add_result(f'duration_us_{duration_us}_gpu_time', self._parse_time_value(gpu_time))
+                    self._result.add_result(f'duration_us_{duration_us}_gpu_time', parse_time_to_us(gpu_time))
                     # self._result.add_result(f'duration_us_{duration_us}_gpu_noise', self._parse_percentage(gpu_noise))
                     # self._result.add_result(f'duration_us_{duration_us}_batch_samples',
                     # int(batch_samples.replace('x', '')))
                     self._result.add_result(
-                        f'duration_us_{duration_us}_batch_gpu_time', self._parse_time_value(batch_gpu)
+                        f'duration_us_{duration_us}_batch_gpu_time', parse_time_to_us(batch_gpu)
                     )
                     parsed_any = True
             if not parsed_any:
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_auto_throughput.py b/tests/benchmarks/micro_benchmarks/test_nvbench_auto_throughput.py
new file mode 100644
index 000000000..9ceb6206b
--- /dev/null
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_auto_throughput.py
@@ -0,0 +1,167 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Tests for nvbench auto throughput benchmark."""
+
+import unittest
+
+from tests.helper import decorator
+from tests.helper.testcase import BenchmarkTestCase
+from superbench.benchmarks import BenchmarkRegistry, ReturnCode, Platform
+
+
+class TestNvbenchAutoThroughputBenchmark(BenchmarkTestCase, unittest.TestCase):
+    """Test class for NVBench Auto Throughput benchmark."""
+
+    @classmethod
+    def setUpClass(cls):
+        """Hook method for setting up class fixture before running tests in the class."""
+        super().setUpClass()
+        cls.createMockEnvs(cls)
+        cls.createMockFiles(cls, ['bin/nvbench_auto_throughput'])
+
+    def test_nvbench_auto_throughput_preprocess(self):
+        """Test NVBench Auto Throughput benchmark preprocess."""
+        benchmark_name = 'nvbench-auto-throughput'
+        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        # Test preprocess with default parameters
+        benchmark = benchmark_class(benchmark_name, parameters='')
+        assert benchmark._preprocess()
+        assert benchmark.return_code == ReturnCode.SUCCESS
+
+        # Test preprocess with specified parameters
+        parameters = (
+            '--devices 0 '
+            '--stride "[1,2,4,8]" '
+            '--timeout 20 '
+            '--min-samples 100'
+        )
+        benchmark = benchmark_class(benchmark_name, parameters=parameters)
+        assert benchmark._preprocess()
+        assert benchmark.return_code == ReturnCode.SUCCESS
+
+        # Check command
+        assert (1 == len(benchmark._commands))
+        assert ('--devices 0' in benchmark._commands[0])
+        assert ('--axis "Stride=[1,2,4,8]"' in benchmark._commands[0])
+        assert ('--timeout 20' in benchmark._commands[0])
+        assert ('--min-samples 100' in benchmark._commands[0])
+
+    def test_nvbench_auto_throughput_stride_formats(self):
+        """Test NVBench Auto Throughput preprocess with different stride formats."""
+        benchmark_name = 'nvbench-auto-throughput'
+        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        # Test single value
+        benchmark = benchmark_class(benchmark_name, parameters='--stride "2"')
+        assert benchmark._preprocess()
+        assert '--axis "Stride=2"' in benchmark._commands[0]
+
+        # Test list format
+        benchmark = benchmark_class(benchmark_name, parameters='--stride "[1,2,4]"')
+        assert benchmark._preprocess()
+        assert '--axis "Stride=[1,2,4]"' in benchmark._commands[0]
+
+        # Test range format
+        benchmark = benchmark_class(benchmark_name, parameters='--stride "[1:8]"')
+        assert benchmark._preprocess()
+        assert '--axis "Stride=[1:8]"' in benchmark._commands[0]
+
+        # Test range with step format
+        benchmark = benchmark_class(benchmark_name, parameters='--stride "[1:8:2]"')
+        assert benchmark._preprocess()
+        assert '--axis "Stride=[1:8:2]"' in benchmark._commands[0]
+
+        # Test default format
+        benchmark = benchmark_class(benchmark_name, parameters='')
+        assert benchmark._preprocess()
+        assert '--axis "Stride=[1:4]"' in benchmark._commands[0]
+        assert '--axis "BlockSize=[128,256,512,1024]"' in benchmark._commands[0]
+
+    def test_nvbench_auto_throughput_block_size_formats(self):
+        """Test NVBench Auto Throughput preprocess with different block_size formats."""
+        benchmark_name = 'nvbench-auto-throughput'
+        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        # Test single value
+        benchmark = benchmark_class(benchmark_name, parameters='--block_size "256"')
+        assert benchmark._preprocess()
+        assert '--axis "BlockSize=256"' in benchmark._commands[0]
+
+        # Test list format
+        benchmark = benchmark_class(benchmark_name, parameters='--block_size "[128,256,512]"')
+        assert benchmark._preprocess()
+        assert '--axis "BlockSize=[128,256,512]"' in benchmark._commands[0]
+
+        # Test default format
+        benchmark = benchmark_class(benchmark_name, parameters='')
+        assert benchmark._preprocess()
+        assert '--axis "BlockSize=[128,256,512,1024]"' in benchmark._commands[0]
+
+    @decorator.load_data('tests/data/nvbench_auto_throughput.log')
+    def test_nvbench_auto_throughput_result_parsing(self, results):
+        """Test NVBench Auto Throughput benchmark result parsing."""
+        benchmark_name = 'nvbench-auto-throughput'
+        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        benchmark = benchmark_class(benchmark_name, parameters='')
+        assert benchmark._preprocess()
+
+        # Parse the provided raw output
+        assert benchmark._process_raw_result(0, results)
+        assert benchmark.return_code == ReturnCode.SUCCESS
+
+        # Validate timing metrics for ItemsPerThread=1, Stride=1, BlockSize=128
+        self.assertAlmostEqual(benchmark.result['ipt_1_stride_1_blk_128_cpu_time'][0], 120.0)
+        self.assertAlmostEqual(benchmark.result['ipt_1_stride_1_blk_128_gpu_time'][0], 100.0)
+        self.assertAlmostEqual(benchmark.result['ipt_1_stride_1_blk_128_batch_gpu_time'][0], 95.0)
+
+        # Validate CUPTI metrics for ItemsPerThread=1, Stride=1, BlockSize=128
+        self.assertAlmostEqual(benchmark.result['ipt_1_stride_1_blk_128_hbw_peak'][0], 20.0)
+        self.assertAlmostEqual(benchmark.result['ipt_1_stride_1_blk_128_load_eff'][0], 100.0)
+        self.assertAlmostEqual(benchmark.result['ipt_1_stride_1_blk_128_store_eff'][0], 100.0)
+        self.assertAlmostEqual(benchmark.result['ipt_1_stride_1_blk_128_l1_hit_rate'][0], 0.0)
+        self.assertAlmostEqual(benchmark.result['ipt_1_stride_1_blk_128_l2_hit_rate'][0], 0.05)
+
+        # Validate throughput for ItemsPerThread=1, Stride=1, BlockSize=128 (100G elem/s)
+        self.assertAlmostEqual(benchmark.result['ipt_1_stride_1_blk_128_throughput'][0], 100.0)
+
+        # Validate timing metrics for ItemsPerThread=2, Stride=4, BlockSize=256
+        self.assertAlmostEqual(benchmark.result['ipt_2_stride_4_blk_256_cpu_time'][0], 220.0)
+        self.assertAlmostEqual(benchmark.result['ipt_2_stride_4_blk_256_gpu_time'][0], 200.0)
+        self.assertAlmostEqual(benchmark.result['ipt_2_stride_4_blk_256_batch_gpu_time'][0], 195.0)
+
+        # Validate CUPTI metrics for ItemsPerThread=2, Stride=4, BlockSize=256
+        self.assertAlmostEqual(benchmark.result['ipt_2_stride_4_blk_256_hbw_peak'][0], 80.0)
+        self.assertAlmostEqual(benchmark.result['ipt_2_stride_4_blk_256_load_eff'][0], 12.5)
+        self.assertAlmostEqual(benchmark.result['ipt_2_stride_4_blk_256_store_eff'][0], 100.0)
+        self.assertAlmostEqual(benchmark.result['ipt_2_stride_4_blk_256_l1_hit_rate'][0], 30.0)
+        self.assertAlmostEqual(benchmark.result['ipt_2_stride_4_blk_256_l2_hit_rate'][0], 10.0)
+
+        # Validate throughput for ItemsPerThread=2, Stride=4, BlockSize=256 (200G elem/s)
+        self.assertAlmostEqual(benchmark.result['ipt_2_stride_4_blk_256_throughput'][0], 200.0)
+
+    def test_nvbench_auto_throughput_invalid_output(self):
+        """Test NVBench Auto Throughput benchmark result parsing with invalid output."""
+        benchmark_name = 'nvbench-auto-throughput'
+        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        benchmark = benchmark_class(benchmark_name, parameters='')
+        assert benchmark._preprocess()
+
+        # Mock raw output with invalid format
+        raw_output = 'Invalid output format'
+
+        # Parse the provided raw output
+        assert not benchmark._process_raw_result(0, raw_output)
+        assert benchmark.return_code == ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/data/nvbench_auto_throughput.log b/tests/data/nvbench_auto_throughput.log
new file mode 100644
index 000000000..1b87d2311
--- /dev/null
+++ b/tests/data/nvbench_auto_throughput.log
@@ -0,0 +1,48 @@
+# Devices
+
+## [0] `NVIDIA Test GPU`
+* SM Version: 100 (PTX Version: 100)
+* Number of SMs: 100
+* SM Default Clock Rate: 1000 MHz
+* Global Memory: 10000 MiB Free / 10000 MiB Total
+* Global Memory Bus Peak: 1000 GB/sec (1024-bit DDR @1000MHz)
+* Max Shared Memory: 64 KiB/SM, 48 KiB/Block
+* L2 Cache Size: 10000 KiB
+* Maximum Active Blocks: 32/SM
+* Maximum Active Threads: 2048/SM, 1024/Block
+* Available Registers: 65536/SM, 65536/Block
+* ECC Enabled: Yes
+
+# Log
+
+```
+Run:  [1/4] throughput_bench [Device=0 T=1 Stride=1 BlockSize=128]
+Pass: CUPTI: 0.10s total wall, 3x
+Pass: Cold: 0.100000ms GPU, 0.120000ms CPU, 0.50s total GPU, 0.70s total wall, 1000x
+Pass: Batch: 0.095000ms GPU, 0.50s total GPU, 0.50s total wall, 1000x
+Run:  [2/4] throughput_bench [Device=0 T=1 Stride=4 BlockSize=256]
+Pass: CUPTI: 0.10s total wall, 3x
+Pass: Cold: 0.110000ms GPU, 0.130000ms CPU, 0.50s total GPU, 0.70s total wall, 1000x
+Pass: Batch: 0.105000ms GPU, 0.50s total GPU, 0.50s total wall, 1000x
+Run:  [3/4] throughput_bench [Device=0 T=2 Stride=1 BlockSize=128]
+Pass: CUPTI: 0.10s total wall, 3x
+Pass: Cold: 0.150000ms GPU, 0.170000ms CPU, 0.50s total GPU, 0.70s total wall, 1000x
+Pass: Batch: 0.145000ms GPU, 0.50s total GPU, 0.50s total wall, 1000x
+Run:  [4/4] throughput_bench [Device=0 T=2 Stride=4 BlockSize=256]
+Pass: CUPTI: 0.10s total wall, 3x
+Pass: Cold: 0.200000ms GPU, 0.220000ms CPU, 0.50s total GPU, 0.70s total wall, 1000x
+Pass: Batch: 0.195000ms GPU, 0.50s total GPU, 0.50s total wall, 1000x
+```
+
+# Benchmark Results
+
+## throughput_bench
+
+### [0] NVIDIA Test GPU
+
+| T | Stride | BlockSize | Elements | HBWPeak | LoadEff | StoreEff | L1HitRate | L2HitRate | Samples | Samples |  CPU Time  | Noise |  GPU Time  | Noise |  Elem/s  | Samples | Batch GPU  |
+|---|--------|-----------|----------|---------|---------|----------|-----------|-----------|---------|---------|------------|-------|------------|-------|----------|---------|------------|
+| 1 |      1 |       128 | 33554432 |  20.00% | 100.00% |  100.00% |     0.00% |     0.05% |      3x |   1000x | 120.000 us | 1.00% | 100.000 us | 0.50% | 100.000G |   1000x |  95.000 us |
+| 1 |      4 |       256 | 33554432 |  25.00% |  25.00% |  100.00% |     0.00% |     0.30% |      3x |   1000x | 130.000 us | 1.00% | 110.000 us | 0.50% | 110.000G |   1000x | 105.000 us |
+| 2 |      1 |       128 | 33554432 |  40.00% |  50.00% |  100.00% |    30.00% |     1.50% |      3x |   1000x | 170.000 us | 1.00% | 150.000 us | 0.50% | 150.000G |   1000x | 145.000 us |
+| 2 |      4 |       256 | 33554432 |  80.00% |  12.50% |  100.00% |    30.00% |    10.00% |      3x |   1000x | 220.000 us | 1.00% | 200.000 us | 0.50% | 200.000G |   1000x | 195.000 us |

From c34591d7c90cb9548a93d2bc8c3741c2ad7fb419 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Fri, 20 Feb 2026 15:11:44 -0800
Subject: [PATCH 30/37] refined logic & fix bug

---
 .../benchmarks/micro-benchmarks.md            | 38 ++++++++++++++++++
 .../benchmarks/nvbench_auto_throughput.py     |  4 +-
 .../nvbench/NVBENCH_BENCHMARK_GUIDE.md        | 39 +++++++++++++++++--
 .../nvbench_auto_throughput.py                | 30 +++++---------
 .../micro_benchmarks/nvbench_base.py          | 22 +++--------
 .../micro_benchmarks/nvbench_kernel_launch.py | 21 ++--------
 .../micro_benchmarks/nvbench_sleep_kernel.py  | 28 ++++---------
 .../test_nvbench_auto_throughput.py           |  8 ++--
 8 files changed, 107 insertions(+), 83 deletions(-)

diff --git a/docs/user-tutorial/benchmarks/micro-benchmarks.md b/docs/user-tutorial/benchmarks/micro-benchmarks.md
index a5bc2fa5c..cb800a66d 100644
--- a/docs/user-tutorial/benchmarks/micro-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md
@@ -217,6 +217,44 @@ Performed by [NVBench](https://github.com/NVIDIA/nvbench) kernel launch benchmar
 | nvbench-kernel-launch/gpu_time      | time (μs) | GPU-measured kernel execution time.            |
 | nvbench-kernel-launch/batch_gpu_time | time (μs) | GPU batch execution time.                     |
 
+### `nvbench-auto-throughput`
+
+#### Introduction
+
+Measure GPU memory throughput and efficiency metrics using NVBench's auto throughput benchmark. This benchmark copies a 128 MiB buffer of int32 values with configurable stride and block size parameters, measuring memory bandwidth efficiency and CUPTI-based performance counters.
+
+#### Parameters
+
+- **Stride**: Controls the memory access pattern by specifying the gap between consecutive memory accesses. A stride of 1 means contiguous (coalesced) memory access. Larger stride values (2, 4, 8, etc.) create non-contiguous access patterns, which is useful for stress-testing how the memory subsystem behaves when accesses are not coalesced.
+- **BlockSize**: The number of threads per CUDA block (e.g., 128, 256, 512, 1024). Different block sizes affect occupancy and scheduling efficiency.
+- **ItemsPerThread**: The number of elements each thread processes (1 or 2). Higher values increase work per thread.
+
+The benchmark supports multiple parameter specification formats:
+- Single value: `"2"` - Test single value
+- List format: `"[1,2,4,8]"` - Test multiple specific values
+- Range format: `"[1:4]"` - Test all values from 1 to 4
+- Range with step: `"[1:8:2]"` - Test from 1 to 8 in steps of 2
+
+Performed by [NVBench](https://github.com/NVIDIA/nvbench) auto throughput benchmark with CUPTI metrics collection.
+
+#### Metrics
+
+| Name                                                                    | Unit         | Description                                                                           |
+|-------------------------------------------------------------------------|--------------|--------------------------------------------------------------------------------------|
+| nvbench-auto-throughput/ipt\_{T}\_stride\_{S}\_blk\_{B}\_cpu\_time       | time (μs)    | CPU-measured execution time.             |
+| nvbench-auto-throughput/ipt\_{T}\_stride\_{S}\_blk\_{B}\_gpu\_time       | time (μs)    | GPU-measured execution time.             |
+| nvbench-auto-throughput/ipt\_{T}\_stride\_{S}\_blk\_{B}\_batch\_gpu\_time | time (μs)    | GPU batch execution time.                |
+| nvbench-auto-throughput/ipt\_{T}\_stride\_{S}\_blk\_{B}\_hbw\_peak       | percent (%)  | Global (device) memory throughput as a percentage of hardware peak.                  |
+| nvbench-auto-throughput/ipt\_{T}\_stride\_{S}\_blk\_{B}\_load\_eff       | percent (%)  | Global memory load efficiency percentage.                                             |
+| nvbench-auto-throughput/ipt\_{T}\_stride\_{S}\_blk\_{B}\_store\_eff      | percent (%)  | Global memory store efficiency percentage.                                            |
+| nvbench-auto-throughput/ipt\_{T}\_stride\_{S}\_blk\_{B}\_l1\_hit\_rate   | percent (%)  | L1 cache hit rate percentage (informational only, excluded from pass/fail criteria). |
+| nvbench-auto-throughput/ipt\_{T}\_stride\_{S}\_blk\_{B}\_l2\_hit\_rate   | percent (%)  | L2 cache hit rate percentage (informational only, excluded from pass/fail criteria). |
+| nvbench-auto-throughput/ipt\_{T}\_stride\_{S}\_blk\_{B}\_throughput       | GB/s         | Memory throughput calculated from element rate (elements/s × 4 bytes for int32).     |
+
+Where `{T}` is ItemsPerThread (1 or 2), `{S}` is Stride value, and `{B}` is BlockSize (e.g., 128, 256, 512, 1024).
+
+> **Note:** L1 and L2 cache hit rates are collected for informational purposes only and should not be used for performance validation pass/fail criteria, as cache behavior can vary significantly based on system state and workload characteristics.
+
 ### `cpu-hpl`
 
 #### Introduction
diff --git a/examples/benchmarks/nvbench_auto_throughput.py b/examples/benchmarks/nvbench_auto_throughput.py
index f99d92fe7..63e61062f 100644
--- a/examples/benchmarks/nvbench_auto_throughput.py
+++ b/examples/benchmarks/nvbench_auto_throughput.py
@@ -1,5 +1,5 @@
 # Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
+# Licensed under the MIT License.
 
 """Micro benchmark example for NVBench Auto Throughput.
 
@@ -14,7 +14,7 @@
     context = BenchmarkRegistry.create_benchmark_context(
         'nvbench-auto-throughput',
         platform=Platform.CUDA,
-        parameters='--stride "[1,2,4,8]" --block_size "[256,512]" --timeout 30'
+        parameters='--devices 0 --stride "[1,2,4,8]" --block_size "[256,512]" --timeout 30'
     )
 
     benchmark = BenchmarkRegistry.launch_benchmark(context)
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/NVBENCH_BENCHMARK_GUIDE.md b/superbench/benchmarks/micro_benchmarks/nvbench/NVBENCH_BENCHMARK_GUIDE.md
index 70f884541..85b7f8faa 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench/NVBENCH_BENCHMARK_GUIDE.md
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/NVBENCH_BENCHMARK_GUIDE.md
@@ -41,9 +41,8 @@ When creating a new benchmark, examine these existing implementations:
 ### Base Class: `nvbench_base.py`
 - `NvbenchBase` - Inherit from this class
 - `_build_base_command()` - Builds command with common NVBench CLI args
-- `_parse_time_value(str)` - Parses "123.45 us", "678.9 ns", "0.12 ms", "1.5 s" → float µs
-- `_parse_percentage(str)` - Parses "12.34%" → float
-- `_handle_parsing_error()` - Consistent error handling
+- `parse_time_to_us(str)` - Parses "123.45 us", "678.9 ns", "0.12 ms", "1.5 s" → float µs
+- `_handle_parsing_error()` - Consistent error handling (see Error Handling section below)
 
 ### CMakeLists.txt
 Add new `.cu` file to `NVBENCH_SOURCES` list.
@@ -58,6 +57,40 @@ Add new `.cu` file to `NVBENCH_SOURCES` list.
 - Python: `BenchmarkRegistry.register_benchmark('nvbench-<name>', Nvbench<Name>, platform=Platform.CUDA)`
 - Import in `__init__.py`
 
+## Important Implementation Notes
+
+### Error Handling Pattern
+Always use this consistent error handling pattern in `_process_raw_result()`:
+```python
+def _process_raw_result(self, cmd_idx, raw_output):
+    self._result.add_raw_data(f'raw_output_{cmd_idx}', raw_output, self._args.log_raw_data)
+    try:
+        # ... parsing logic ...
+        if not parsed_any:
+            raise ValueError('No valid result rows parsed')
+    except BaseException as e:
+        self._handle_parsing_error(str(e), raw_output)
+        return False
+    return True
+```
+Key points:
+- Use `BaseException` (not `Exception`) to match codebase convention
+- Use `ValueError` for parsing failures (not `RuntimeError`)
+- Always call `_handle_parsing_error()` from base class - don't implement custom error handling
+
+### GPU ID Handling
+**Do NOT track GPU IDs in result metric names.** SuperBench executes benchmarks with `CUDA_VISIBLE_DEVICES` set per GPU, so results are automatically stored in `metric_name:gpu_id` format by the framework. Simply parse results without GPU prefixes.
+
+### Parsing Percentages
+For percentage values like "12.34%", use simple string stripping:
+```python
+float(percent_str.rstrip('%'))
+```
+Do NOT use `parse_time_to_us()` for percentages - it only handles time values.
+
+### Avoid Debug Logging
+Do not add `logger.debug()` calls in `_process_raw_result()`. The raw output is already stored via `add_raw_data()` for debugging purposes.
+
 ### Documentation (`docs/user-tutorial/benchmarks/micro-benchmarks.md`)
 Add a section under "## Computation Benchmarks" with:
 1. `### \`nvbench-<name>\`` - Benchmark name header
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_auto_throughput.py b/superbench/benchmarks/micro_benchmarks/nvbench_auto_throughput.py
index cc7e78906..4ae7b0958 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_auto_throughput.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_auto_throughput.py
@@ -4,8 +4,7 @@
 """Module of the NVBench Auto Throughput benchmark."""
 
 import re
-from superbench.common.utils import logger
-from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
+from superbench.benchmarks import BenchmarkRegistry, Platform
 from superbench.benchmarks.micro_benchmarks.nvbench_base import NvbenchBase, parse_time_to_us
 
 
@@ -79,8 +78,6 @@ def _process_raw_result(self, cmd_idx, raw_output):
         self._result.add_raw_data(f'raw_output_{cmd_idx}', raw_output, self._args.log_raw_data)
 
         try:
-            gpu_section = r'### \[(\d+)\] NVIDIA'
-
             # Pattern for throughput benchmark table output with CUPTI metrics
             # Table format:
             # | T | Stride | BlockSize | Elements | HBWPeak | LoadEff | StoreEff | L1HitRate | L2HitRate |
@@ -106,20 +103,10 @@ def _process_raw_result(self, cmd_idx, raw_output):
                 r'\s*([\d.]+\s*[μmun]?s)\s*\|'   # Batch GPU Time
             )
 
-            current_gpu = None
             parsed_any = False
 
             for line in raw_output.splitlines():
                 line = line.strip()
-
-                g = re.match(gpu_section, line)
-                if g:
-                    current_gpu = f'gpu_{g.group(1)}'
-                    continue
-
-                if not current_gpu:
-                    continue
-
                 r = re.match(row_pat, line)
                 if r:
                     (items_per_thread, stride, block_size,
@@ -140,18 +127,21 @@ def _process_raw_result(self, cmd_idx, raw_output):
                     self._result.add_result(f'{prefix}_l1_hit_rate', float(l1_hit))
                     self._result.add_result(f'{prefix}_l2_hit_rate', float(l2_hit))
 
-                    # Throughput (elements/s in GB/s)
+                    # Memory throughput in GB/s
+                    # Convert element rate to bandwidth: GB/s = (elements/s) * sizeof(int32) / 1e9
+                    # The benchmark uses int32 (4 bytes per element)
                     elem_val = float(elem_rate)
-                    unit_multipliers = {'T': 1e3, 'G': 1.0, 'M': 1e-3, 'K': 1e-6, '': 1e-9}
-                    elem_giga = elem_val * unit_multipliers.get(elem_unit, 1e-9)
-                    self._result.add_result(f'{prefix}_throughput', elem_giga)
+                    unit_multipliers = {'T': 1e12, 'G': 1e9, 'M': 1e6, 'K': 1e3, '': 1.0}
+                    elements_per_sec = elem_val * unit_multipliers.get(elem_unit, 1.0)
+                    throughput_gbs = (elements_per_sec * 4) / 1e9  # 4 bytes per int32
+                    self._result.add_result(f'{prefix}_throughput', throughput_gbs)
 
                     parsed_any = True
 
             if not parsed_any:
-                raise RuntimeError('No valid rows parsed')
+                raise ValueError('No valid result rows parsed')
 
-        except Exception as e:
+        except BaseException as e:
             self._handle_parsing_error(str(e), raw_output)
             return False
 
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_base.py b/superbench/benchmarks/micro_benchmarks/nvbench_base.py
index 04c116332..55fbfd76e 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_base.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_base.py
@@ -11,10 +11,9 @@
 
 
 def parse_time_to_us(raw: str) -> float:
-    """Helper: parse '123.45 us', '678.9 ns', '0.12 ms', '1.5 s' → float µs."""
+    """Helper: parse '123.45 us', '678.9 ns', '0.12 ms', '1.5 s' → float µs.
+    """
     raw = raw.strip()
-    if raw.endswith('%'):
-        return float(raw[:-1])
     # split "value unit" or "valueunit"
     m = re.match(r'([\d.]+)\s*([mun]?s)?', raw)
     if not m:
@@ -220,19 +219,6 @@ def _preprocess(self):
         self._commands = [' '.join(parts)]
         return True
 
-    def _parse_percentage(self, percent_str):
-        """Parse percentage string to float.
-
-        Args:
-            percent_str (str): Percentage string like '12.34%'
-
-        Returns:
-            float: Percentage value as float.
-        """
-        if isinstance(percent_str, str) and percent_str.endswith('%'):
-            return float(percent_str[:-1])
-        return float(percent_str)
-
     def _handle_parsing_error(self, error_msg, raw_output):
         """Handle parsing errors consistently.
 
@@ -242,5 +228,7 @@ def _handle_parsing_error(self, error_msg, raw_output):
         """
         self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
         logger.error(
-            f'Invalid result format - round:{self._curr_run_index}, bench:{self._name}, msg:{error_msg}\n{raw_output}'
+            'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format(
+                self._curr_run_index, self._name, raw_output, error_msg
+            )
         )
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py b/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
index dd4156552..234248ec8 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
@@ -4,7 +4,6 @@
 """Module of the NVBench Kernel Launch benchmark."""
 
 import re
-from superbench.common.utils import logger
 from superbench.benchmarks import BenchmarkRegistry, Platform
 from superbench.benchmarks.micro_benchmarks.nvbench_base import NvbenchBase, parse_time_to_us
 
@@ -34,7 +33,6 @@ def _process_raw_result(self, cmd_idx, raw_output):
         self._result.add_raw_data(f'raw_output_{cmd_idx}', raw_output, self._args.log_raw_data)
 
         try:
-            gpu_section = r'### \[(\d+)\] NVIDIA'
             # Regex pattern to handle different time units and flexible spacing
             row_pat = (
                 r'\|\s*([0-9]+)x\s*\|\s*'    # Samples
@@ -45,33 +43,22 @@ def _process_raw_result(self, cmd_idx, raw_output):
                 r'([0-9]+)x\s*\|\s*'    # Batch Samples
                 r'([\d.]+\s*[μmun]?s)\s*\|'    # Batch GPU Time
             )
-            current = None
-            parsed_any = False    # Track if any valid rows are parsed
+            parsed_any = False
 
             for line in raw_output.splitlines():
                 line = line.strip()
-                g = re.match(gpu_section, line)
-                if g:
-                    current = f'gpu_{g.group(1)}'
-                    continue
-
                 r = re.match(row_pat, line)
-                if r and current:
+                if r:
                     samples, cpu_time, cpu_noise, gpu_time, gpu_noise, batch_samples, batch_gpu = r.groups()
-                    # self._result.add_result('samples', int(samples.replace('x', '')))
                     self._result.add_result('cpu_time', parse_time_to_us(cpu_time))
-                    # self._result.add_result('cpu_noise', self._parse_percentage(cpu_noise))
                     self._result.add_result('gpu_time', parse_time_to_us(gpu_time))
-                    # self._result.add_result('gpu_noise', self._parse_percentage(gpu_noise))
-                    # self._result.add_result('batch_samples', int(batch_samples.replace('x', '')))
                     self._result.add_result('batch_gpu_time', parse_time_to_us(batch_gpu))
                     parsed_any = True
 
             if not parsed_any:
-                logger.error('No valid rows parsed from the raw output.')
-                raise RuntimeError('No valid rows parsed')
+                raise ValueError('No valid result rows parsed')
 
-        except Exception as e:
+        except BaseException as e:
             self._handle_parsing_error(str(e), raw_output)
             return False
 
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
index dca625b21..3a1aa0163 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
@@ -4,8 +4,7 @@
 """Module of the NVBench Sleep Kernel benchmark."""
 
 import re
-from superbench.common.utils import logger
-from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
+from superbench.benchmarks import BenchmarkRegistry, Platform
 from superbench.benchmarks.micro_benchmarks.nvbench_base import NvbenchBase, parse_time_to_us
 
 
@@ -66,9 +65,6 @@ def _process_raw_result(self, cmd_idx, raw_output):
         Return:
             True if the raw output string is valid and result can be extracted.
         """
-        logger.debug(f'Processing raw result for command index {cmd_idx}.')
-        logger.debug(f'Raw output:\n{raw_output}')
-
         self._result.add_raw_data(f'raw_output_{cmd_idx}', raw_output, self._args.log_raw_data)
         try:
             gpu_section = r'### \[(\d+)\] NVIDIA'
@@ -83,38 +79,30 @@ def _process_raw_result(self, cmd_idx, raw_output):
                 r'([0-9]+)x\s*\|\s*'    # Batch Samples
                 r'([\d.]+\s*[μmun]?s)\s*\|'    # Batch GPU Time
             )
-            current = None
             parsed_any = False
             for line in raw_output.splitlines():
                 line = line.strip()
-                logger.debug(f'Processing line: {line}')
                 g = re.match(gpu_section, line)
                 if g:
-                    current = f'gpu_{g.group(1)}'
-                    logger.debug(f'Found GPU section: {current}')
                     continue
                 r = re.match(row_pat, line)
-                if r and current:
-                    logger.debug(f'Matched row: {r.groups()}')
+                if r:
                     duration_us, samples, cpu_time, cpu_noise, gpu_time, gpu_noise, batch_samples, batch_gpu = r.groups(
                     )
-                    # self._result.add_result(f'duration_us_{duration_us}_samples', int(samples))
                     self._result.add_result(f'duration_us_{duration_us}_cpu_time', parse_time_to_us(cpu_time))
-                    # self._result.add_result(f'duration_us_{duration_us}_cpu_noise', self._parse_percentage(cpu_noise))
                     self._result.add_result(f'duration_us_{duration_us}_gpu_time', parse_time_to_us(gpu_time))
-                    # self._result.add_result(f'duration_us_{duration_us}_gpu_noise', self._parse_percentage(gpu_noise))
-                    # self._result.add_result(f'duration_us_{duration_us}_batch_samples',
-                    # int(batch_samples.replace('x', '')))
                     self._result.add_result(
                         f'duration_us_{duration_us}_batch_gpu_time', parse_time_to_us(batch_gpu)
                     )
                     parsed_any = True
+
             if not parsed_any:
-                raise RuntimeError('No valid rows parsed')
-        except Exception as e:
-            logger.error(f'Error processing raw result: {e}')
-            self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
+                raise ValueError('No valid result rows parsed')
+
+        except BaseException as e:
+            self._handle_parsing_error(str(e), raw_output)
             return False
+
         return True
 
 
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_auto_throughput.py b/tests/benchmarks/micro_benchmarks/test_nvbench_auto_throughput.py
index 9ceb6206b..2101006ea 100644
--- a/tests/benchmarks/micro_benchmarks/test_nvbench_auto_throughput.py
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_auto_throughput.py
@@ -128,8 +128,8 @@ def test_nvbench_auto_throughput_result_parsing(self, results):
         self.assertAlmostEqual(benchmark.result['ipt_1_stride_1_blk_128_l1_hit_rate'][0], 0.0)
         self.assertAlmostEqual(benchmark.result['ipt_1_stride_1_blk_128_l2_hit_rate'][0], 0.05)
 
-        # Validate throughput for ItemsPerThread=1, Stride=1, BlockSize=128 (100G elem/s)
-        self.assertAlmostEqual(benchmark.result['ipt_1_stride_1_blk_128_throughput'][0], 100.0)
+        # Validate throughput for ItemsPerThread=1, Stride=1, BlockSize=128 (100G elem/s * 4 bytes = 400 GB/s)
+        self.assertAlmostEqual(benchmark.result['ipt_1_stride_1_blk_128_throughput'][0], 400.0)
 
         # Validate timing metrics for ItemsPerThread=2, Stride=4, BlockSize=256
         self.assertAlmostEqual(benchmark.result['ipt_2_stride_4_blk_256_cpu_time'][0], 220.0)
@@ -143,8 +143,8 @@ def test_nvbench_auto_throughput_result_parsing(self, results):
         self.assertAlmostEqual(benchmark.result['ipt_2_stride_4_blk_256_l1_hit_rate'][0], 30.0)
         self.assertAlmostEqual(benchmark.result['ipt_2_stride_4_blk_256_l2_hit_rate'][0], 10.0)
 
-        # Validate throughput for ItemsPerThread=2, Stride=4, BlockSize=256 (200G elem/s)
-        self.assertAlmostEqual(benchmark.result['ipt_2_stride_4_blk_256_throughput'][0], 200.0)
+        # Validate throughput for ItemsPerThread=2, Stride=4, BlockSize=256 (200G elem/s * 4 bytes = 800 GB/s)
+        self.assertAlmostEqual(benchmark.result['ipt_2_stride_4_blk_256_throughput'][0], 800.0)
 
     def test_nvbench_auto_throughput_invalid_output(self):
         """Test NVBench Auto Throughput benchmark result parsing with invalid output."""

From 68f5c7d78c37b1648eaf4a9c46d288cd7ee96486 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Thu, 26 Feb 2026 14:04:26 -0800
Subject: [PATCH 31/37] add comment to clarify diff between
 nvbench-kernel-launch and kernel-launch

---
 .../user-tutorial/benchmarks/micro-benchmarks.md | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/docs/user-tutorial/benchmarks/micro-benchmarks.md b/docs/user-tutorial/benchmarks/micro-benchmarks.md
index cb800a66d..ef90838b3 100644
--- a/docs/user-tutorial/benchmarks/micro-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md
@@ -209,6 +209,22 @@ The benchmark provides insights into:
 
 Performed by [NVBench](https://github.com/NVIDIA/nvbench) kernel launch benchmark.
 
+#### Comparison with `kernel-launch`
+
+Both `nvbench-kernel-launch` and `kernel-launch` measure kernel launch latency, but they differ in methodology:
+
+| Aspect | `kernel-launch` | `nvbench-kernel-launch` |
+|--------|-----------------|-------------------------|
+| L2 Cache | Warm (cached) | Cold (flushed before each sample) |
+| Measurement | Warm-cache / steady-state | Cold-cache / first-access |
+| Iterations | Fixed 2M iterations | Adaptive (statistical stopping) |
+
+**Important: Do not cross-compare results between these two benchmarks.** For performance regression detection, either benchmark works well. Always compare against historical data from the same benchmark.
+
+Choose based on what scenario matters for your workload:
+- **`kernel-launch`**: Measures warm-cache performance, reflecting steady-state behavior in long-running applications where caches are typically hot.
+- **`nvbench-kernel-launch`**: Measures cold-cache performance, reflecting first-access scenarios or workloads with poor cache locality.
+
 #### Metrics
 
 | Name                                | Unit      | Description                                    |

From 0bde3320011b87700eeabe65303b1ec689088713 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Tue, 10 Mar 2026 10:54:15 -0700
Subject: [PATCH 32/37] resolve comments

---
 dockerfile/cuda13.0.dockerfile  | 21 ---------------------
 dockerfile/rocm5.0.x.dockerfile |  2 +-
 third_party/Makefile            |  2 +-
 3 files changed, 2 insertions(+), 23 deletions(-)

diff --git a/dockerfile/cuda13.0.dockerfile b/dockerfile/cuda13.0.dockerfile
index 6a6f88f7f..087966619 100644
--- a/dockerfile/cuda13.0.dockerfile
+++ b/dockerfile/cuda13.0.dockerfile
@@ -62,27 +62,6 @@ RUN apt-get update && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/* /tmp/*
 
-# Install CMake 3.30.4 for nvbench compatibility
-RUN apt-get update && \
-    apt-get remove -y cmake cmake-data && \
-    apt-get autoremove -y && \
-    cd /tmp && \
-    ARCH=$(uname -m) && \
-    case ${ARCH} in \
-        "aarch64") CMAKE_ARCH="aarch64" ;; \
-        "x86_64") CMAKE_ARCH="x86_64" ;; \
-        "arm64") CMAKE_ARCH="aarch64" ;; \
-        *) CMAKE_ARCH="x86_64" ;; \
-    esac && \
-    echo "Detected architecture: ${ARCH}, using CMAKE_ARCH: ${CMAKE_ARCH}" && \
-    wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
-    tar -xzf cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
-    mv cmake-3.30.4-linux-${CMAKE_ARCH} /opt/cmake && \
-    ln -sf /opt/cmake/bin/* /usr/local/bin/ && \
-    rm -rf cmake-3.30.4-linux-${CMAKE_ARCH}* && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
 ARG NUM_MAKE_JOBS=
 ARG TARGETPLATFORM
 ARG TARGETARCH
diff --git a/dockerfile/rocm5.0.x.dockerfile b/dockerfile/rocm5.0.x.dockerfile
index e3e89cf3e..9ab35244c 100644
--- a/dockerfile/rocm5.0.x.dockerfile
+++ b/dockerfile/rocm5.0.x.dockerfile
@@ -98,7 +98,7 @@ RUN cd /tmp && \
 
 # Install Intel MLC
 RUN cd /tmp && \
-    wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \
+    wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \
     tar xzf mlc.tgz Linux/mlc && \
     cp ./Linux/mlc /usr/local/bin/ && \
     rm -rf ./Linux mlc.tgz
diff --git a/third_party/Makefile b/third_party/Makefile
index b25fca042..c54a4a0a2 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -275,7 +275,7 @@ ifeq ($(shell echo $(CUDA_VER)">=12.9" | bc -l), 1)
 	git clone --single-branch --branch main https://github.com/Azure/msccl.git \
 	&& git -C msccl checkout 87048bd && git -C msccl submodule update --recursive --init
 else ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
-	# Get commit 87048bd from msscl to support updated nccl and sm_100
+	# Get commit 87048bd from msccl to support updated nccl and sm_100
 	$(eval ARCHS := 75 80 86 89 90 100)
 	if [ -d msccl ]; then rm -rf msccl; fi; \
 	git clone --single-branch --branch main https://github.com/Azure/msccl.git \

From 7c456cff7151b4cfa0cfc23d17c667e550765562 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Tue, 10 Mar 2026 20:55:37 +0000
Subject: [PATCH 33/37] fix lint

---
 .../nvbench_auto_throughput.py                | 46 +++++++++----------
 .../micro_benchmarks/nvbench_sleep_kernel.py  |  4 +-
 .../test_nvbench_auto_throughput.py           |  8 +---
 .../micro_benchmarks/test_nvbench_base.py     |  3 --
 4 files changed, 25 insertions(+), 36 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_auto_throughput.py b/superbench/benchmarks/micro_benchmarks/nvbench_auto_throughput.py
index 4ae7b0958..66c0975a5 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_auto_throughput.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_auto_throughput.py
@@ -14,7 +14,6 @@ class NvbenchAutoThroughput(NvbenchBase):
     This benchmark measures memory throughput and cache hit rates using CUPTI.
     It copies a 128 MiB buffer with configurable stride and items per thread.
     """
-
     def __init__(self, name, parameters=''):
         """Constructor.
 
@@ -83,24 +82,24 @@ def _process_raw_result(self, cmd_idx, raw_output):
             # | T | Stride | BlockSize | Elements | HBWPeak | LoadEff | StoreEff | L1HitRate | L2HitRate |
             # | Samples | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | Samples | Batch GPU |
             row_pat = (
-                r'\|\s*(\d+)\s*\|'               # T (ItemsPerThread)
-                r'\s*(\d+)\s*\|'                 # Stride
-                r'\s*(\d+)\s*\|'                 # BlockSize
-                r'\s*\d+\s*\|'                   # Elements (skip)
-                r'\s*([\d.]+)%\s*\|'             # HBWPeak
-                r'\s*([\d.]+)%\s*\|'             # LoadEff
-                r'\s*([\d.]+)%\s*\|'             # StoreEff
-                r'\s*([\d.]+)%\s*\|'             # L1HitRate
-                r'\s*([\d.]+)%\s*\|'             # L2HitRate
-                r'\s*\d+x\s*\|'                  # Samples CUPTI (skip)
-                r'\s*\d+x\s*\|'                  # Samples Cold (skip)
-                r'\s*([\d.]+\s*[μmun]?s)\s*\|'   # CPU Time
-                r'\s*[\d.]+%\s*\|'               # CPU Noise (skip)
-                r'\s*([\d.]+\s*[μmun]?s)\s*\|'   # GPU Time
-                r'\s*[\d.]+%\s*\|'               # GPU Noise (skip)
-                r'\s*([\d.]+)([TGMK]?)\s*\|'     # Elem/s (value and unit prefix)
-                r'\s*\d+x\s*\|'                  # Samples Batch (skip)
-                r'\s*([\d.]+\s*[μmun]?s)\s*\|'   # Batch GPU Time
+                r'\|\s*(\d+)\s*\|'    # T (ItemsPerThread)
+                r'\s*(\d+)\s*\|'    # Stride
+                r'\s*(\d+)\s*\|'    # BlockSize
+                r'\s*\d+\s*\|'    # Elements (skip)
+                r'\s*([\d.]+)%\s*\|'    # HBWPeak
+                r'\s*([\d.]+)%\s*\|'    # LoadEff
+                r'\s*([\d.]+)%\s*\|'    # StoreEff
+                r'\s*([\d.]+)%\s*\|'    # L1HitRate
+                r'\s*([\d.]+)%\s*\|'    # L2HitRate
+                r'\s*\d+x\s*\|'    # Samples CUPTI (skip)
+                r'\s*\d+x\s*\|'    # Samples Cold (skip)
+                r'\s*([\d.]+\s*[μmun]?s)\s*\|'    # CPU Time
+                r'\s*[\d.]+%\s*\|'    # CPU Noise (skip)
+                r'\s*([\d.]+\s*[μmun]?s)\s*\|'    # GPU Time
+                r'\s*[\d.]+%\s*\|'    # GPU Noise (skip)
+                r'\s*([\d.]+)([TGMK]?)\s*\|'    # Elem/s (value and unit prefix)
+                r'\s*\d+x\s*\|'    # Samples Batch (skip)
+                r'\s*([\d.]+\s*[μmun]?s)\s*\|'    # Batch GPU Time
             )
 
             parsed_any = False
@@ -109,9 +108,10 @@ def _process_raw_result(self, cmd_idx, raw_output):
                 line = line.strip()
                 r = re.match(row_pat, line)
                 if r:
-                    (items_per_thread, stride, block_size,
-                     hbw_peak, load_eff, store_eff, l1_hit, l2_hit,
-                     cpu_time, gpu_time, elem_rate, elem_unit, batch_gpu) = r.groups()
+                    (
+                        items_per_thread, stride, block_size, hbw_peak, load_eff, store_eff, l1_hit, l2_hit, cpu_time,
+                        gpu_time, elem_rate, elem_unit, batch_gpu
+                    ) = r.groups()
 
                     prefix = f'ipt_{items_per_thread}_stride_{stride}_blk_{block_size}'
 
@@ -133,7 +133,7 @@ def _process_raw_result(self, cmd_idx, raw_output):
                     elem_val = float(elem_rate)
                     unit_multipliers = {'T': 1e12, 'G': 1e9, 'M': 1e6, 'K': 1e3, '': 1.0}
                     elements_per_sec = elem_val * unit_multipliers.get(elem_unit, 1.0)
-                    throughput_gbs = (elements_per_sec * 4) / 1e9  # 4 bytes per int32
+                    throughput_gbs = (elements_per_sec * 4) / 1e9    # 4 bytes per int32
                     self._result.add_result(f'{prefix}_throughput', throughput_gbs)
 
                     parsed_any = True
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
index 3a1aa0163..1fcffa2e3 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
@@ -91,9 +91,7 @@ def _process_raw_result(self, cmd_idx, raw_output):
                     )
                     self._result.add_result(f'duration_us_{duration_us}_cpu_time', parse_time_to_us(cpu_time))
                     self._result.add_result(f'duration_us_{duration_us}_gpu_time', parse_time_to_us(gpu_time))
-                    self._result.add_result(
-                        f'duration_us_{duration_us}_batch_gpu_time', parse_time_to_us(batch_gpu)
-                    )
+                    self._result.add_result(f'duration_us_{duration_us}_batch_gpu_time', parse_time_to_us(batch_gpu))
                     parsed_any = True
 
             if not parsed_any:
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_auto_throughput.py b/tests/benchmarks/micro_benchmarks/test_nvbench_auto_throughput.py
index 2101006ea..d25584a09 100644
--- a/tests/benchmarks/micro_benchmarks/test_nvbench_auto_throughput.py
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_auto_throughput.py
@@ -12,7 +12,6 @@
 
 class TestNvbenchAutoThroughputBenchmark(BenchmarkTestCase, unittest.TestCase):
     """Test class for NVBench Auto Throughput benchmark."""
-
     @classmethod
     def setUpClass(cls):
         """Hook method for setting up class fixture before running tests in the class."""
@@ -32,12 +31,7 @@ def test_nvbench_auto_throughput_preprocess(self):
         assert benchmark.return_code == ReturnCode.SUCCESS
 
         # Test preprocess with specified parameters
-        parameters = (
-            '--devices 0 '
-            '--stride "[1,2,4,8]" '
-            '--timeout 20 '
-            '--min-samples 100'
-        )
+        parameters = ('--devices 0 ' '--stride "[1,2,4,8]" ' '--timeout 20 ' '--min-samples 100')
         benchmark = benchmark_class(benchmark_name, parameters=parameters)
         assert benchmark._preprocess()
         assert benchmark.return_code == ReturnCode.SUCCESS
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_base.py b/tests/benchmarks/micro_benchmarks/test_nvbench_base.py
index 76d465709..1738622b3 100644
--- a/tests/benchmarks/micro_benchmarks/test_nvbench_base.py
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_base.py
@@ -13,7 +13,6 @@
 
 class TestParseTimeToUs(unittest.TestCase):
     """Test class for parse_time_to_us function."""
-
     def test_parse_microseconds(self):
         """Test parsing microseconds values."""
         self.assertAlmostEqual(parse_time_to_us('123.45 us'), 123.45)
@@ -60,7 +59,6 @@ def test_parse_seconds(self):
 
 class ConcreteNvbenchBase(NvbenchBase):
     """Concrete implementation of NvbenchBase for testing."""
-
     def __init__(self, name, parameters=''):
         """Constructor."""
         super().__init__(name, parameters)
@@ -69,7 +67,6 @@ def __init__(self, name, parameters=''):
 
 class TestNvbenchBase(BenchmarkTestCase, unittest.TestCase):
     """Test class for NvbenchBase class."""
-
     @classmethod
     def setUpClass(cls):
         """Hook method for setting up class fixture before running tests in the class."""

From 9643150b38f74862037c6a9f33a13b188b75d670 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Tue, 10 Mar 2026 14:18:43 -0700
Subject: [PATCH 34/37] fix pipeline & resolve comments: parse_time_to_us raises ValueError on invalid input

---
 superbench/benchmarks/micro_benchmarks/nvbench_base.py | 8 +++-----
 tests/benchmarks/micro_benchmarks/test_nvbench_base.py | 7 -------
 2 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_base.py b/superbench/benchmarks/micro_benchmarks/nvbench_base.py
index 55fbfd76e..3bc8d1b51 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_base.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_base.py
@@ -11,13 +11,11 @@
 
 
 def parse_time_to_us(raw: str) -> float:
-    """Helper: parse '123.45 us', '678.9 ns', '0.12 ms', '1.5 s' → float µs.
-    """
+    """Parse a time string like '123.45 us' or '1.5 s' to float microseconds."""
     raw = raw.strip()
-    # split "value unit" or "valueunit"
-    m = re.match(r'([\d.]+)\s*([mun]?s)?', raw)
+    m = re.match(r'^([\d.]+)\s*([mun]?s)?$', raw)
     if not m:
-        return float(raw)
+        raise ValueError(f'Invalid time string: {raw!r}')
     val, unit = float(m.group(1)), (m.group(2) or 'us')
     if unit == 's':
         return val * 1e6
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_base.py b/tests/benchmarks/micro_benchmarks/test_nvbench_base.py
index 1738622b3..82cb5d4f1 100644
--- a/tests/benchmarks/micro_benchmarks/test_nvbench_base.py
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_base.py
@@ -4,8 +4,6 @@
 """Tests for nvbench_base module."""
 
 import unittest
-from argparse import Namespace
-
 from tests.helper.testcase import BenchmarkTestCase
 from superbench.benchmarks import ReturnCode
 from superbench.benchmarks.micro_benchmarks.nvbench_base import parse_time_to_us, NvbenchBase
@@ -33,11 +31,6 @@ def test_parse_milliseconds(self):
         self.assertAlmostEqual(parse_time_to_us('0.5 ms'), 500.0)
         self.assertAlmostEqual(parse_time_to_us('0.001 ms'), 1.0)
 
-    def test_parse_percentage(self):
-        """Test parsing percentage values."""
-        self.assertAlmostEqual(parse_time_to_us('50.5%'), 50.5)
-        self.assertAlmostEqual(parse_time_to_us('0.1%'), 0.1)
-        self.assertAlmostEqual(parse_time_to_us('100%'), 100.0)
 
     def test_parse_plain_number(self):
         """Test parsing plain numbers without unit (defaults to us)."""

From f1a3b6d074f4b3046209a77230feeb5aa33ac8bc Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Tue, 10 Mar 2026 14:30:59 -0700
Subject: [PATCH 35/37] fix lint: remove extra blank line in test_nvbench_base.py

---
 tests/benchmarks/micro_benchmarks/test_nvbench_base.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_base.py b/tests/benchmarks/micro_benchmarks/test_nvbench_base.py
index 82cb5d4f1..c00842551 100644
--- a/tests/benchmarks/micro_benchmarks/test_nvbench_base.py
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_base.py
@@ -31,7 +31,6 @@ def test_parse_milliseconds(self):
         self.assertAlmostEqual(parse_time_to_us('0.5 ms'), 500.0)
         self.assertAlmostEqual(parse_time_to_us('0.001 ms'), 1.0)
 
-
     def test_parse_plain_number(self):
         """Test parsing plain numbers without unit (defaults to us)."""
         self.assertAlmostEqual(parse_time_to_us('123.45'), 123.45)

From fe48e35b45fdfcc2e7d891234226815a8229e3db Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Tue, 10 Mar 2026 15:42:09 -0700
Subject: [PATCH 36/37] fix test: update test_nvbench_base.py and drop _parse_time_value/_parse_percentage tests

---
 .../micro_benchmarks/test_nvbench_base.py     | 28 +++++--------------
 1 file changed, 7 insertions(+), 21 deletions(-)

diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_base.py b/tests/benchmarks/micro_benchmarks/test_nvbench_base.py
index c00842551..2952394ef 100644
--- a/tests/benchmarks/micro_benchmarks/test_nvbench_base.py
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_base.py
@@ -56,6 +56,10 @@ def __init__(self, name, parameters=''):
         super().__init__(name, parameters)
         self._bin_name = 'test_nvbench_binary'
 
+    def _process_raw_result(self, cmd_idx, raw_output):
+        """Process raw results (no-op for base class testing)."""
+        return True
+
 
 class TestNvbenchBase(BenchmarkTestCase, unittest.TestCase):
     """Test class for NvbenchBase class."""
@@ -70,7 +74,7 @@ def test_nvbench_base_init(self):
         """Test NvbenchBase initialization."""
         benchmark = ConcreteNvbenchBase('test-benchmark', parameters='')
         assert benchmark._bin_name == 'test_nvbench_binary'
-        assert benchmark.name == 'test-benchmark'
+        assert benchmark._name == 'test-benchmark'
 
     def test_nvbench_base_add_parser_arguments(self):
         """Test NvbenchBase add_parser_arguments."""
@@ -185,25 +189,6 @@ def test_nvbench_base_preprocess_with_entropy_stopping_criterion(self):
         assert '--min-time' not in benchmark._commands[0]
         assert '--max-noise' not in benchmark._commands[0]
 
-    def test_nvbench_base_parse_time_value(self):
-        """Test NvbenchBase _parse_time_value method."""
-        benchmark = ConcreteNvbenchBase('test-benchmark', parameters='')
-        benchmark._preprocess()
-
-        self.assertAlmostEqual(benchmark._parse_time_value('100 us'), 100.0)
-        self.assertAlmostEqual(benchmark._parse_time_value('1000 ns'), 1.0)
-        self.assertAlmostEqual(benchmark._parse_time_value('1 ms'), 1000.0)
-
-    def test_nvbench_base_parse_percentage(self):
-        """Test NvbenchBase _parse_percentage method."""
-        benchmark = ConcreteNvbenchBase('test-benchmark', parameters='')
-        benchmark._preprocess()
-
-        self.assertAlmostEqual(benchmark._parse_percentage('50.5%'), 50.5)
-        self.assertAlmostEqual(benchmark._parse_percentage('100%'), 100.0)
-        self.assertAlmostEqual(benchmark._parse_percentage('0.1%'), 0.1)
-        self.assertAlmostEqual(benchmark._parse_percentage(25.0), 25.0)
-
     def test_nvbench_base_handle_parsing_error(self):
         """Test NvbenchBase _handle_parsing_error method."""
         benchmark = ConcreteNvbenchBase('test-benchmark', parameters='')
@@ -214,7 +199,8 @@ def test_nvbench_base_handle_parsing_error(self):
 
     def test_nvbench_base_build_base_command_without_bin_name(self):
         """Test NvbenchBase _build_base_command raises error without bin_name."""
-        benchmark = NvbenchBase('test-benchmark', parameters='')
+        benchmark = ConcreteNvbenchBase('test-benchmark', parameters='')
+        benchmark._bin_name = None
         # Manually call add_parser_arguments and parse to set up _args
         benchmark.add_parser_arguments()
         benchmark._args = benchmark._parser.parse_args([])

From e253b85b31647b24d632ae547a6294f5037830f4 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Wed, 22 Apr 2026 12:50:53 -0700
Subject: [PATCH 37/37] resolve comments: add license headers to nvbench .cu files, remove commented-out asserts

---
 .../benchmarks/micro_benchmarks/nvbench/kernel_launch.cu   | 3 +++
 .../benchmarks/micro_benchmarks/nvbench/sleep_kernel.cu    | 3 +++
 .../micro_benchmarks/test_nvbench_kernel_launch.py         | 4 ----
 .../micro_benchmarks/test_nvbench_sleep_kernel.py          | 7 -------
 4 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/kernel_launch.cu b/superbench/benchmarks/micro_benchmarks/nvbench/kernel_launch.cu
index 08dc40294..395d585ba 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench/kernel_launch.cu
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/kernel_launch.cu
@@ -1,3 +1,6 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
 #include <nvbench/nvbench.cuh>
 
 __global__ void empty_kernel() {}
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/sleep_kernel.cu b/superbench/benchmarks/micro_benchmarks/nvbench/sleep_kernel.cu
index b4789377e..8181ad5c5 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench/sleep_kernel.cu
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/sleep_kernel.cu
@@ -1,3 +1,6 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
 #include <cuda/std/chrono>
 #include <cuda_runtime.h>
 #include <nvbench/nvbench.cuh>
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py b/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
index 694d0796c..332dddec6 100644
--- a/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
@@ -73,12 +73,8 @@ def test_nvbench_kernel_launch_result_parsing_real_output(self, results):
         assert benchmark.return_code == ReturnCode.SUCCESS
 
         # Validate parsed results
-        # assert benchmark.result['samples'][0] == 120000
         self.assertAlmostEqual(benchmark.result['cpu_time'][0], 24.222)
-        # self.assertAlmostEqual(benchmark.result['cpu_noise'][0], 30.44)
         self.assertAlmostEqual(benchmark.result['gpu_time'][0], 7.808)
-        # self.assertAlmostEqual(benchmark.result['gpu_noise'][0], 14.42)
-        # assert benchmark.result['batch_samples'][0] == 300000
         self.assertAlmostEqual(benchmark.result['batch_gpu_time'][0], 6.024)
 
     def test_nvbench_kernel_launch_process_raw_result_invalid_output(self):
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py b/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
index 5b423182a..6df8100c6 100644
--- a/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
@@ -75,17 +75,10 @@ def test_nvbench_sleep_kernel_result_parsing_real_output(self, results):
         assert benchmark.return_code == ReturnCode.SUCCESS
 
         # Validate parsed results
-        # assert benchmark.result['duration_us_25_samples'][0] == 10175
         self.assertAlmostEqual(benchmark.result['duration_us_25_cpu_time'][0], 42.123)
-        # self.assertAlmostEqual(benchmark.result['duration_us_25_cpu_noise'][0], 69.78)
         self.assertAlmostEqual(benchmark.result['duration_us_25_gpu_time'][0], 25.321)
-        # self.assertAlmostEqual(benchmark.result['duration_us_25_gpu_noise'][0], 0.93)
-        # assert benchmark.result['duration_us_25_batch_samples'][0] == 17448
         self.assertAlmostEqual(benchmark.result['duration_us_25_batch_gpu_time'][0], 23.456)
 
-        # assert benchmark.result['duration_us_50_samples'][0] == 8187
-        # assert benchmark.result['duration_us_75_samples'][0] == 6279
-
     def test_nvbench_sleep_kernel_preprocess_duration_formats(self):
         """Test NVBench Sleep Kernel preprocess with different duration formats."""
         benchmark_name = 'nvbench-sleep-kernel'