diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
index ef903240c..e8d1ce862 100644
--- a/.github/workflows/codeql-analysis.yml
+++ b/.github/workflows/codeql-analysis.yml
@@ -52,7 +52,11 @@ jobs:
       - name: Install Dependency
         run: |
           DEBIAN_FRONTEND=noninteractive apt-get update
-          DEBIAN_FRONTEND=noninteractive apt-get install -y ffmpeg libavcodec-dev libavformat-dev libavutil-dev libswresample-dev sudo
+          DEBIAN_FRONTEND=noninteractive apt-get install -y ffmpeg libavcodec-dev libavformat-dev libavutil-dev libswresample-dev sudo build-essential
+      - name: Setup CMake
+        uses: lukka/get-cmake@latest
+        with:
+          cmakeVersion: '3.20.0'
       - name: Initialize CodeQL
         uses: github/codeql-action/init@v3
         with:
diff --git a/.gitignore b/.gitignore
index 5888455a8..97b51dcd0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -151,6 +151,9 @@ cython_debug/
 *.userosscache
 *.sln.docstates
 
+# Build temporary files
+compile_commands.json
+
 # Build results
 [Dd]ebug/
 [Dd]ebugPublic/
diff --git a/.gitmodules b/.gitmodules
index e7d2af022..03afed789 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -33,3 +33,6 @@
 [submodule "third_party/nvbandwidth"]
 	path = third_party/nvbandwidth
 	url = https://github.com/NVIDIA/nvbandwidth.git
+[submodule "third_party/nvbench"]
+	path = third_party/nvbench
+	url = https://github.com/NVIDIA/nvbench.git
diff --git a/dockerfile/cuda12.8.dockerfile b/dockerfile/cuda12.8.dockerfile
index 595624003..56156a00a 100644
--- a/dockerfile/cuda12.8.dockerfile
+++ b/dockerfile/cuda12.8.dockerfile
@@ -61,6 +61,27 @@ RUN apt-get update && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/* /tmp/*
 
+# Install CMake 3.30.4 for nvbench compatibility
+RUN apt-get update && \
+    apt-get remove -y cmake cmake-data && \
+    apt-get autoremove -y && \
+    cd /tmp && \
+    ARCH=$(uname -m) && \
+    case ${ARCH} in \
+        "aarch64") CMAKE_ARCH="aarch64" ;; \
+        "x86_64") CMAKE_ARCH="x86_64" ;; \
+        "arm64") CMAKE_ARCH="aarch64" ;; \
+        *) CMAKE_ARCH="x86_64" ;; \
+    esac && \
+    echo "Detected architecture: ${ARCH}, using CMAKE_ARCH: ${CMAKE_ARCH}" && \
+    wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
+    tar -xzf cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
+    mv cmake-3.30.4-linux-${CMAKE_ARCH} /opt/cmake && \
+    ln -sf /opt/cmake/bin/* /usr/local/bin/ && \
+    rm -rf cmake-3.30.4-linux-${CMAKE_ARCH}* && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
 ARG NUM_MAKE_JOBS=
 ARG TARGETPLATFORM
 ARG TARGETARCH
@@ -161,7 +182,7 @@ ADD dockerfile/etc /opt/microsoft/
 WORKDIR ${SB_HOME}
 
 ADD third_party third_party
-RUN make -C third_party cuda_with_msccl
+RUN make -C third_party cuda_with_msccl cuda_nvbench
 
 ADD . .
 RUN python3 -m pip install --upgrade setuptools==70.3.0 && \
diff --git a/dockerfile/cuda12.9.dockerfile b/dockerfile/cuda12.9.dockerfile
index 29804506c..1b0352eb1 100644
--- a/dockerfile/cuda12.9.dockerfile
+++ b/dockerfile/cuda12.9.dockerfile
@@ -62,6 +62,28 @@ RUN apt-get update && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/* /tmp/*
 
+# Install CMake 3.30.4 for nvbench compatibility
+RUN apt-get update && \
+    apt-get remove -y cmake cmake-data && \
+    apt-get autoremove -y && \
+    cd /tmp && \
+    ARCH=$(uname -m) && \
+    case ${ARCH} in \
+        "aarch64") CMAKE_ARCH="aarch64" ;; \
+        "x86_64") CMAKE_ARCH="x86_64" ;; \
+        "arm64") CMAKE_ARCH="aarch64" ;; \
+        *) CMAKE_ARCH="x86_64" ;; \
+    esac && \
+    echo "Detected architecture: ${ARCH}, using CMAKE_ARCH: ${CMAKE_ARCH}" && \
+    wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
+    tar -xzf cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
+    mv cmake-3.30.4-linux-${CMAKE_ARCH} /opt/cmake && \
+    ln -sf /opt/cmake/bin/* /usr/local/bin/ && \
+    rm -rf cmake-3.30.4-linux-${CMAKE_ARCH}* && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+
 ARG NUM_MAKE_JOBS=
 ARG TARGETPLATFORM
 ARG TARGETARCH
@@ -162,7 +184,7 @@ ADD dockerfile/etc /opt/microsoft/
 WORKDIR ${SB_HOME}
 
 ADD third_party third_party
-RUN make -C third_party cuda_with_msccl
+RUN make -C third_party cuda_with_msccl cuda_nvbench
 
 ADD . .
 RUN python3 -m pip install --upgrade setuptools==78.1.0 && \
diff --git a/dockerfile/cuda13.0.dockerfile b/dockerfile/cuda13.0.dockerfile
index 25fc7f9ed..087966619 100644
--- a/dockerfile/cuda13.0.dockerfile
+++ b/dockerfile/cuda13.0.dockerfile
@@ -151,7 +151,7 @@ ADD dockerfile/etc /opt/microsoft/
 WORKDIR ${SB_HOME}
 
 ADD third_party third_party
-RUN make -C third_party cuda
+RUN make -C third_party cuda cuda_nvbench
 
 ADD . .
 RUN python3 -m pip install --upgrade setuptools==78.1.0 && \
diff --git a/docs/user-tutorial/benchmarks/micro-benchmarks.md b/docs/user-tutorial/benchmarks/micro-benchmarks.md
index aa3aa965b..ef90838b3 100644
--- a/docs/user-tutorial/benchmarks/micro-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md
@@ -172,6 +172,105 @@ Supports the use of double unit types and the use of tensor cores.
 | gpu-burn/gpu_[0-9]_pass | yes/no   | The result of the gpu-burn test for each GPU (1: yes, 0: no).                      |
 | gpu-burn/abort          | yes/no   | Whether or not GPU-burn test aborted before returning GPU results (1: yes, 0: no). |
 
+### `nvbench-sleep-kernel`
+
+#### Introduction
+
+Measure GPU kernel execution time using NVBench's sleep kernel benchmark. This benchmark creates CUDA kernels that sleep for specified durations (in microseconds) and measures the actual execution time, providing insights into GPU scheduling overhead and timing accuracy.
+
+The benchmark supports multiple duration specification formats:
+- Single value: `"50"` - Test single duration of 50μs
+- List format: `"[25,50,75]"` - Test multiple specific durations
+- Range format: `"[25:75]"` - Test all values from 25μs to 75μs  
+- Range with step: `"[0:50:10]"` - Test from 0μs to 50μs in steps of 10μs
+
+Performed by [NVBench](https://github.com/NVIDIA/nvbench) sleep kernel benchmark.
+
+#### Metrics
+
+| Name                                    | Unit      | Description                                           |
+|-----------------------------------------|-----------|-------------------------------------------------------|
+| nvbench-sleep-kernel/duration_us_{X}_cpu_time     | time (μs) | CPU-measured time for duration X microseconds.       |
+| nvbench-sleep-kernel/duration_us_{X}_gpu_time     | time (μs) | GPU-measured time for duration X microseconds.       |
+| nvbench-sleep-kernel/duration_us_{X}_batch_gpu_time | time (μs) | GPU batch execution time for duration X microseconds. |
+
+Where `{X}` is the sleep duration in microseconds (e.g., 25, 50, 75).
+
+### `nvbench-kernel-launch`
+
+#### Introduction
+
+Measure GPU kernel launch overhead and execution time using NVBench's kernel launch benchmark. This benchmark evaluates the time required to launch kernels on the GPU and measures both CPU-side and GPU-side timing for kernel execution.
+
+The benchmark provides insights into:
+- Kernel launch latency
+- CPU/GPU synchronization overhead  
+- Batch execution performance
+
+Performed by [NVBench](https://github.com/NVIDIA/nvbench) kernel launch benchmark.
+
+#### Comparison with `kernel-launch`
+
+Both `nvbench-kernel-launch` and `kernel-launch` measure kernel launch latency, but they differ in methodology:
+
+| Aspect | `kernel-launch` | `nvbench-kernel-launch` |
+|--------|-----------------|-------------------------|
+| L2 Cache | Warm (cached) | Cold (flushed before each sample) |
+| Measurement | Warm-cache / steady-state | Cold-cache / first-access |
+| Iterations | Fixed 2M iterations | Adaptive (statistical stopping) |
+
+**Important: Do not cross-compare results between these two benchmarks.** For performance regression detection, either benchmark works well. Always compare against historical data from the same benchmark.
+
+Choose based on what scenario matters for your workload:
+- **`kernel-launch`**: Measures warm-cache performance, reflecting steady-state behavior in long-running applications where caches are typically hot
+- **`nvbench-kernel-launch`**: Measures cold-cache performance, reflecting first-access scenarios or workloads with poor cache locality
+
+#### Metrics
+
+| Name                                | Unit      | Description                                    |
+|-------------------------------------|-----------|------------------------------------------------|
+| nvbench-kernel-launch/cpu_time      | time (μs) | CPU-measured kernel execution time.            |
+| nvbench-kernel-launch/gpu_time      | time (μs) | GPU-measured kernel execution time.            |
+| nvbench-kernel-launch/batch_gpu_time | time (μs) | GPU batch execution time.                     |
+
+### `nvbench-auto-throughput`
+
+#### Introduction
+
+Measure GPU memory throughput and efficiency metrics using NVBench's auto throughput benchmark. This benchmark copies a 128 MiB buffer of int32 values with configurable stride and block size parameters, measuring memory bandwidth efficiency and CUPTI-based performance counters.
+
+#### Parameters
+
+- **Stride**: Controls the memory access pattern by specifying the gap between consecutive memory accesses. A stride of 1 means contiguous (coalesced) memory access. Larger stride values (2, 4, 8, etc.) create non-contiguous access patterns, useful for stress-testing memory subsystem behavior under different access patterns.
+- **BlockSize**: The number of threads per CUDA block (e.g., 128, 256, 512, 1024). Different block sizes affect occupancy and scheduling efficiency.
+- **ItemsPerThread**: The number of elements each thread processes (1 or 2). Higher values increase work per thread.
+
+The benchmark supports multiple parameter specification formats:
+- Single value: `"2"` - Test single value
+- List format: `"[1,2,4,8]"` - Test multiple specific values
+- Range format: `"[1:4]"` - Test all values from 1 to 4
+- Range with step: `"[1:8:2]"` - Test from 1 to 8 in steps of 2
+
+Performed by [NVBench](https://github.com/NVIDIA/nvbench) auto throughput benchmark with CUPTI metrics collection.
+
+#### Metrics
+
+| Name                                                                    | Unit         | Description                                                                           |
+|-------------------------------------------------------------------------|--------------|--------------------------------------------------------------------------------------|
+| nvbench-auto-throughput/ipt\_{T}\_stride\_{S}\_blk\_{B}\_cpu\_time       | time (μs)    | CPU-measured execution time.             |
+| nvbench-auto-throughput/ipt\_{T}\_stride\_{S}\_blk\_{B}\_gpu\_time       | time (μs)    | GPU-measured execution time.             |
+| nvbench-auto-throughput/ipt\_{T}\_stride\_{S}\_blk\_{B}\_batch\_gpu\_time | time (μs)    | GPU batch execution time.                |
+| nvbench-auto-throughput/ipt\_{T}\_stride\_{S}\_blk\_{B}\_hbw\_peak       | percent (%)  | HBM peak bandwidth utilization percentage.                                            |
+| nvbench-auto-throughput/ipt\_{T}\_stride\_{S}\_blk\_{B}\_load\_eff       | percent (%)  | Global memory load efficiency percentage.                                             |
+| nvbench-auto-throughput/ipt\_{T}\_stride\_{S}\_blk\_{B}\_store\_eff      | percent (%)  | Global memory store efficiency percentage.                                            |
+| nvbench-auto-throughput/ipt\_{T}\_stride\_{S}\_blk\_{B}\_l1\_hit\_rate   | percent (%)  | L1 cache hit rate percentage (informational only, excluded from pass/fail criteria). |
+| nvbench-auto-throughput/ipt\_{T}\_stride\_{S}\_blk\_{B}\_l2\_hit\_rate   | percent (%)  | L2 cache hit rate percentage (informational only, excluded from pass/fail criteria). |
+| nvbench-auto-throughput/ipt\_{T}\_stride\_{S}\_blk\_{B}\_throughput       | GB/s         | Memory throughput calculated from element rate (elements/s × 4 bytes for int32).     |
+
+Where `{T}` is ItemsPerThread (1 or 2), `{S}` is Stride value, and `{B}` is BlockSize (e.g., 128, 256, 512, 1024).
+
+> **Note:** L1 and L2 cache hit rates are collected for informational purposes only and should not be used for performance validation pass/fail criteria, as cache behavior can vary significantly based on system state and workload characteristics.
+
 ### `cpu-hpl`
 
 #### Introduction
diff --git a/examples/benchmarks/nvbench_auto_throughput.py b/examples/benchmarks/nvbench_auto_throughput.py
new file mode 100644
index 000000000..63e61062f
--- /dev/null
+++ b/examples/benchmarks/nvbench_auto_throughput.py
@@ -0,0 +1,26 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Micro benchmark example for NVBench Auto Throughput.
+
+Commands to run:
+  python3 examples/benchmarks/nvbench_auto_throughput.py
+"""
+
+from superbench.benchmarks import BenchmarkRegistry, Platform
+from superbench.common.utils import logger
+
+if __name__ == '__main__':
+    context = BenchmarkRegistry.create_benchmark_context(
+        'nvbench-auto-throughput',
+        platform=Platform.CUDA,
+        parameters='--devices 0 --stride "[1,2,4,8]" --block_size "[256,512]" --timeout 30'
+    )
+
+    benchmark = BenchmarkRegistry.launch_benchmark(context)
+    if benchmark:
+        logger.info(
+            'benchmark: {}, return code: {}, result: {}'.format(
+                benchmark.name, benchmark.return_code, benchmark.result
+            )
+        )
diff --git a/examples/benchmarks/nvbench_kernel_launch.py b/examples/benchmarks/nvbench_kernel_launch.py
new file mode 100644
index 000000000..5efb14078
--- /dev/null
+++ b/examples/benchmarks/nvbench_kernel_launch.py
@@ -0,0 +1,34 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Micro benchmark example for NVBench Kernel Launch.
+
+Commands to run:
+  python3 examples/benchmarks/nvbench_kernel_launch.py
+"""
+
+from superbench.benchmarks import BenchmarkRegistry, Platform
+from superbench.common.utils import logger
+
+if __name__ == '__main__':
+    context = BenchmarkRegistry.create_benchmark_context(
+        'nvbench-kernel-launch',
+        platform=Platform.CUDA,
+        parameters=(
+            '--timeout 30 '
+            '--min-samples 10 '
+            '--min-time 1.0 '
+            '--max-noise 0.1 '
+            '--stopping-criterion stdrel '
+            '--throttle-threshold 80 '
+            '--throttle-recovery-delay 1.0'
+        )
+    )
+
+    benchmark = BenchmarkRegistry.launch_benchmark(context)
+    if benchmark:
+        logger.info(
+            'benchmark: {}, return code: {}, result: {}'.format(
+                benchmark.name, benchmark.return_code, benchmark.result
+            )
+        )
diff --git a/examples/benchmarks/nvbench_sleep_kernel.py b/examples/benchmarks/nvbench_sleep_kernel.py
new file mode 100644
index 000000000..a6adf7a9c
--- /dev/null
+++ b/examples/benchmarks/nvbench_sleep_kernel.py
@@ -0,0 +1,24 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Micro benchmark example for NVBench Sleep Kernel.
+
+Commands to run:
+  python3 examples/benchmarks/nvbench_sleep_kernel.py
+"""
+
+from superbench.benchmarks import BenchmarkRegistry, Platform
+from superbench.common.utils import logger
+
+if __name__ == '__main__':
+    context = BenchmarkRegistry.create_benchmark_context(
+        'nvbench-sleep-kernel', platform=Platform.CUDA, parameters='--duration_us "[25,50,75]" --timeout 10'
+    )
+
+    benchmark = BenchmarkRegistry.launch_benchmark(context)
+    if benchmark:
+        logger.info(
+            'benchmark: {}, return code: {}, result: {}'.format(
+                benchmark.name, benchmark.return_code, benchmark.result
+            )
+        )
diff --git a/superbench/benchmarks/micro_benchmarks/__init__.py b/superbench/benchmarks/micro_benchmarks/__init__.py
index 978c2d385..cf0bc0ba8 100644
--- a/superbench/benchmarks/micro_benchmarks/__init__.py
+++ b/superbench/benchmarks/micro_benchmarks/__init__.py
@@ -39,42 +39,18 @@
 from superbench.benchmarks.micro_benchmarks.directx_mem_bw_performance import DirectXGPUMemBw
 from superbench.benchmarks.micro_benchmarks.directx_gemm_flops_performance import DirectXGPUCoreFlops
 from superbench.benchmarks.micro_benchmarks.nvbandwidth import NvBandwidthBenchmark
+from superbench.benchmarks.micro_benchmarks.nvbench_kernel_launch import NvbenchKernelLaunch
+from superbench.benchmarks.micro_benchmarks.nvbench_sleep_kernel import NvbenchSleepKernel
+from superbench.benchmarks.micro_benchmarks.nvbench_auto_throughput import NvbenchAutoThroughput
 
 __all__ = [
-    'BlasLtBaseBenchmark',
-    'ComputationCommunicationOverlap',
-    'CpuMemBwLatencyBenchmark',
-    'CpuHplBenchmark',
-    'CpuStreamBenchmark',
-    'CublasBenchmark',
-    'CublasLtBenchmark',
-    'CudaGemmFlopsBenchmark',
-    'CudaMemBwBenchmark',
-    'CudaNcclBwBenchmark',
-    'CudnnBenchmark',
-    'DiskBenchmark',
-    'DistInference',
-    'HipBlasLtBenchmark',
-    'GPCNetBenchmark',
-    'GemmFlopsBenchmark',
-    'GpuBurnBenchmark',
-    'GpuCopyBwBenchmark',
-    'GpuStreamBenchmark',
-    'IBBenchmark',
-    'IBLoopbackBenchmark',
-    'KernelLaunch',
-    'MemBwBenchmark',
-    'MicroBenchmark',
-    'MicroBenchmarkWithInvoke',
-    'ORTInferenceBenchmark',
-    'RocmGemmFlopsBenchmark',
-    'RocmMemBwBenchmark',
-    'ShardingMatmul',
-    'TCPConnectivityBenchmark',
-    'TensorRTInferenceBenchmark',
-    'DirectXGPUEncodingLatency',
-    'DirectXGPUCopyBw',
-    'DirectXGPUMemBw',
-    'DirectXGPUCoreFlops',
-    'NvBandwidthBenchmark',
+    'BlasLtBaseBenchmark', 'ComputationCommunicationOverlap', 'CpuMemBwLatencyBenchmark', 'CpuHplBenchmark',
+    'CpuStreamBenchmark', 'CublasBenchmark', 'CublasLtBenchmark', 'CudaGemmFlopsBenchmark', 'CudaMemBwBenchmark',
+    'CudaNcclBwBenchmark', 'CudnnBenchmark', 'DiskBenchmark', 'DistInference', 'HipBlasLtBenchmark', 'GPCNetBenchmark',
+    'GemmFlopsBenchmark', 'GpuBurnBenchmark', 'GpuCopyBwBenchmark', 'GpuStreamBenchmark', 'IBBenchmark',
+    'IBLoopbackBenchmark', 'KernelLaunch', 'MemBwBenchmark', 'MicroBenchmark', 'MicroBenchmarkWithInvoke',
+    'ORTInferenceBenchmark', 'RocmGemmFlopsBenchmark', 'RocmMemBwBenchmark', 'ShardingMatmul',
+    'TCPConnectivityBenchmark', 'TensorRTInferenceBenchmark', 'DirectXGPUEncodingLatency', 'DirectXGPUCopyBw',
+    'DirectXGPUMemBw', 'DirectXGPUCoreFlops', 'NvBandwidthBenchmark', 'NvbenchKernelLaunch', 'NvbenchSleepKernel',
+    'NvbenchAutoThroughput'
 ]
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
new file mode 100644
index 000000000..c00043aaf
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
@@ -0,0 +1,46 @@
+cmake_minimum_required(VERSION 3.18)
+project(nvbench_benchmarks LANGUAGES CUDA)
+
+# Check if we have a recent enough CMake for nvbench (which requires 3.30.4)
+if(CMAKE_VERSION VERSION_LESS "3.30.4")
+  message(STATUS "CMake version ${CMAKE_VERSION} is less than 3.30.4 (required by nvbench), skipping nvbench benchmarks")
+  return()
+endif()
+
+find_package(CUDAToolkit QUIET)
+if (CUDAToolkit_FOUND)
+  include(../cuda_common.cmake)
+  
+  # Try to find nvbench, but don't require it
+  find_package(nvbench CONFIG QUIET)
+  
+  if (nvbench_FOUND)
+    message(STATUS "Found nvbench, building nvbench benchmarks")
+    
+    # list all your CUDA benchmark source files here
+    set(NVBENCH_SOURCES
+      kernel_launch.cu
+      sleep_kernel.cu
+      auto_throughput.cu
+      # add more *.cu as needed
+    )
+
+    foreach(src ${NVBENCH_SOURCES})
+      # strip ".cu" → NAME_WE
+      get_filename_component(basename ${src} NAME_WE)
+      set(target nvbench_${basename})
+
+      add_executable(${target} ${src})
+      target_compile_features(${target} PUBLIC cuda_std_17)
+      target_link_libraries(${target}
+        PRIVATE nvbench::nvbench nvbench::main
+      )
+      install(TARGETS ${target} RUNTIME DESTINATION bin)
+    endforeach()
+  else()
+    message(STATUS "nvbench not found, skipping nvbench benchmarks.")
+    message(STATUS "To build nvbench benchmarks, first build the submodule in third_party/nvbench")
+  endif()
+else()
+  message(STATUS "CUDA not found, skipping nvbench benchmarks.")
+endif()
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/NVBENCH_BENCHMARK_GUIDE.md b/superbench/benchmarks/micro_benchmarks/nvbench/NVBENCH_BENCHMARK_GUIDE.md
new file mode 100644
index 000000000..85b7f8faa
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/NVBENCH_BENCHMARK_GUIDE.md
@@ -0,0 +1,129 @@
+# NVBench Benchmark Development Guide
+
+Guide for GitHub Copilot to create new NVBench-based benchmarks in SuperBench.
+
+## Architecture Overview
+
+NVBench benchmarks follow a modular structure. To add a new benchmark `<name>`:
+
+```
+Files to Create/Modify:
+├── superbench/benchmarks/micro_benchmarks/nvbench/
+│   ├── <name>.cu                          # CUDA benchmark (CREATE)
+│   └── CMakeLists.txt                     # Add .cu to NVBENCH_SOURCES (MODIFY)
+├── superbench/benchmarks/micro_benchmarks/
+│   ├── nvbench_<name>.py                  # Python wrapper (CREATE)
+│   └── __init__.py                        # Add import (MODIFY)
+├── tests/benchmarks/micro_benchmarks/
+│   └── test_nvbench_<name>.py             # Test file (CREATE)
+├── tests/data/
+│   └── nvbench_<name>.log                 # Sample output for tests (CREATE)
+├── examples/benchmarks/
+│   └── nvbench_<name>.py                  # Example script (CREATE)
+└── docs/user-tutorial/benchmarks/
+    └── micro-benchmarks.md                # Add metrics documentation (MODIFY)
+```
+
+## Reference Files
+
+When creating a new benchmark, examine these existing implementations:
+
+| Component | Simple (no params) | Parameterized |
+|-----------|-------------------|---------------|
+| CUDA benchmark | `nvbench/kernel_launch.cu` | `nvbench/sleep_kernel.cu` |
+| Python wrapper | `nvbench_kernel_launch.py` | `nvbench_sleep_kernel.py` |
+| Test file | `test_nvbench_kernel_launch.py` | `test_nvbench_sleep_kernel.py` |
+| Test data | `tests/data/nvbench_kernel_launch.log` | `tests/data/nvbench_sleep_kernel.log` |
+| Example | `examples/benchmarks/nvbench_kernel_launch.py` | `examples/benchmarks/nvbench_sleep_kernel.py` |
+
+## Key Components
+
+### Base Class: `nvbench_base.py`
+- `NvbenchBase` - Inherit from this class
+- `_build_base_command()` - Builds command with common NVBench CLI args
+- `parse_time_to_us(str)` - Parses "123.45 us", "678.9 ns", "0.12 ms", "1.5 s" → float µs
+- `_handle_parsing_error()` - Consistent error handling (see Error Handling section below)
+
+### CMakeLists.txt
+Add new `.cu` file to `NVBENCH_SOURCES` list.
+
+### Python Wrapper Pattern
+1. Set `self._bin_name = 'nvbench_<name>'` (must match CMake target)
+2. Override `add_parser_arguments()` if benchmark has custom parameters
+3. Override `_preprocess()` if custom command building needed
+4. Implement `_process_raw_result()` to parse NVBench output
+
+### Registration
+- Python: `BenchmarkRegistry.register_benchmark('nvbench-<name>', Nvbench<Name>, platform=Platform.CUDA)`
+- Import in `__init__.py`
+
+## Important Implementation Notes
+
+### Error Handling Pattern
+Always use this consistent error handling pattern in `_process_raw_result()`:
+```python
+def _process_raw_result(self, cmd_idx, raw_output):
+    self._result.add_raw_data(f'raw_output_{cmd_idx}', raw_output, self._args.log_raw_data)
+    try:
+        # ... parsing logic ...
+        if not parsed_any:
+            raise ValueError('No valid result rows parsed')
+    except BaseException as e:
+        self._handle_parsing_error(str(e), raw_output)
+        return False
+    return True
+```
+Key points:
+- Use `BaseException` (not `Exception`) to match codebase convention
+- Use `ValueError` for parsing failures (not `RuntimeError`)
+- Always call `_handle_parsing_error()` from base class - don't implement custom error handling
+
+### GPU ID Handling
+**Do NOT track GPU IDs in result metric names.** SuperBench executes benchmarks with `CUDA_VISIBLE_DEVICES` set per GPU, so results are automatically stored in `metric_name:gpu_id` format by the framework. Simply parse results without GPU prefixes.
+
+### Parsing Percentages
+For percentage values like "12.34%", use simple string stripping:
+```python
+float(percent_str.rstrip('%'))
+```
+Do NOT use `parse_time_to_us()` for percentages - it only handles time values.
+
+### Avoid Debug Logging
+Do not add `logger.debug()` calls in `_process_raw_result()`. The raw output is already stored via `add_raw_data()` for debugging purposes.
+
+### Documentation (`docs/user-tutorial/benchmarks/micro-benchmarks.md`)
+Add a section under "## Computation Benchmarks" with:
+1. `### \`nvbench-<name>\`` - Benchmark name header
+2. `#### Introduction` - Brief description of what the benchmark measures
+3. `#### Metrics` - Table with columns: Name, Unit, Description
+
+Example format (see `nvbench-sleep-kernel` or `nvbench-kernel-launch` sections):
+```markdown
+### `nvbench-<name>`
+
+#### Introduction
+Description of what the benchmark measures and any configuration options.
+
+#### Metrics
+| Name                              | Unit      | Description                    |
+|-----------------------------------|-----------|--------------------------------|
+| nvbench-<name>/${param}_cpu_time  | time (μs) | CPU-measured execution time.   |
+| nvbench-<name>/${param}_gpu_time  | time (μs) | GPU-measured execution time.   |
+```
+
+## NVBench Reference
+
+For advanced NVBench features (axes, types, throughput calculations):
+- Source: `third_party/nvbench/`
+- Examples: `third_party/nvbench/examples/`
+
+## Checklist
+
+- [ ] `<name>.cu` - CUDA benchmark with `NVBENCH_BENCH` macro
+- [ ] `CMakeLists.txt` - Add to `NVBENCH_SOURCES`
+- [ ] `nvbench_<name>.py` - Python wrapper extending `NvbenchBase`
+- [ ] `__init__.py` - Add import
+- [ ] `test_nvbench_<name>.py` - Test file (use `self.assertAlmostEqual` for floats)
+- [ ] `nvbench_<name>.log` - Test data
+- [ ] `micro-benchmarks.md` - Add Introduction and Metrics documentation
+- [ ] `nvbench_<name>.py` - Example script (follow format of other examples)
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/auto_throughput.cu b/superbench/benchmarks/micro_benchmarks/nvbench/auto_throughput.cu
new file mode 100644
index 000000000..7c7bb910c
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/auto_throughput.cu
@@ -0,0 +1,65 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+// Based on nvbench example: auto_throughput.cu
+// This benchmark measures memory throughput and cache hit rates.
+
+#include <nvbench/nvbench.cuh>
+
+// Thrust vectors simplify memory management:
+#include <thrust/device_vector.h>
+
+template <int ItemsPerThread>
+__global__ void throughput_kernel(std::size_t stride, std::size_t elements, const nvbench::int32_t *__restrict__ in,
+                                  nvbench::int32_t *__restrict__ out) {
+    const std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
+    const std::size_t step = gridDim.x * blockDim.x;
+
+    for (std::size_t i = stride * tid; i < stride * elements; i += stride * step) {
+        for (int j = 0; j < ItemsPerThread; j++) {
+            const auto read_id = (ItemsPerThread * i + j) % elements;
+            const auto write_id = tid + j * elements;
+            out[write_id] = in[read_id];
+        }
+    }
+}
+
+// `throughput_bench` copies a 128 MiB buffer of int32_t, and reports throughput
+// and cache hit rates.
+//
+// Calling state.collect_*() enables particular metric collection if nvbench
+// was built with CUPTI support (CMake option: -DNVBench_ENABLE_CUPTI=ON).
+template <int ItemsPerThread>
+void throughput_bench(nvbench::state &state, nvbench::type_list<nvbench::enum_type<ItemsPerThread>>) {
+    // Allocate input data:
+    const std::size_t stride = static_cast<std::size_t>(state.get_int64("Stride"));
+    const auto threads_in_block = static_cast<int>(state.get_int64("BlockSize"));
+    const std::size_t elements = 128 * 1024 * 1024 / sizeof(nvbench::int32_t);
+    thrust::device_vector<nvbench::int32_t> input(elements);
+    thrust::device_vector<nvbench::int32_t> output(elements * ItemsPerThread);
+
+    // Provide throughput information:
+    state.add_element_count(elements, "Elements");
+
+    // CUPTI metrics - these require nvbench built with -DNVBench_ENABLE_CUPTI=ON
+    // If CUPTI is not available, these calls are no-ops
+    state.collect_dram_throughput();
+    state.collect_l1_hit_rates();
+    state.collect_l2_hit_rates();
+    state.collect_loads_efficiency();
+    state.collect_stores_efficiency();
+
+    const auto blocks_in_grid = static_cast<int>((elements + threads_in_block - 1) / threads_in_block);
+
+    state.exec([&](nvbench::launch &launch) {
+        throughput_kernel<ItemsPerThread><<<blocks_in_grid, threads_in_block, 0, launch.get_stream()>>>(
+            stride, elements, thrust::raw_pointer_cast(input.data()), thrust::raw_pointer_cast(output.data()));
+    });
+}
+
+using items_per_thread = nvbench::enum_type_list<1, 2>;
+
+NVBENCH_BENCH_TYPES(throughput_bench, NVBENCH_TYPE_AXES(items_per_thread))
+    .add_int64_axis("Stride", nvbench::range(1, 4, 3))
+    .add_int64_axis("BlockSize", {128, 256, 512, 1024})
+    .set_timeout(1);
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/kernel_launch.cu b/superbench/benchmarks/micro_benchmarks/nvbench/kernel_launch.cu
new file mode 100644
index 000000000..395d585ba
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/kernel_launch.cu
@@ -0,0 +1,12 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#include <nvbench/nvbench.cuh>
+
+__global__ void empty_kernel() {}
+
+void kernel_launch(nvbench::state &state) {
+    state.exec([](nvbench::launch &launch) { empty_kernel<<<1, 1, 0, launch.get_stream()>>>(); });
+}
+
+NVBENCH_BENCH(kernel_launch);
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/sleep_kernel.cu b/superbench/benchmarks/micro_benchmarks/nvbench/sleep_kernel.cu
new file mode 100644
index 000000000..8181ad5c5
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/sleep_kernel.cu
@@ -0,0 +1,25 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#include <cuda/std/chrono>
+#include <cuda_runtime.h>
+#include <nvbench/nvbench.cuh>
+
+__global__ void sleep_kernel(nvbench::int64_t microseconds) {
+    const auto start = cuda::std::chrono::high_resolution_clock::now();
+    const auto target_duration = cuda::std::chrono::microseconds(microseconds);
+    const auto finish = start + target_duration;
+
+    while (cuda::std::chrono::high_resolution_clock::now() < finish) {
+        // busy wait
+    }
+}
+
+void sleep_benchmark(nvbench::state &state) {
+    const auto duration_us = state.get_int64("Duration (us)");
+    state.exec(
+        [&duration_us](nvbench::launch &launch) { sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(duration_us); });
+}
+NVBENCH_BENCH(sleep_benchmark)
+    .add_int64_axis("Duration (us)", nvbench::range(0, 100, 5))
+    .set_timeout(1); // Limit to one second per measurement.
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_auto_throughput.py b/superbench/benchmarks/micro_benchmarks/nvbench_auto_throughput.py
new file mode 100644
index 000000000..66c0975a5
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_auto_throughput.py
@@ -0,0 +1,151 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Module of the NVBench Auto Throughput benchmark."""
+
+import re
+from superbench.benchmarks import BenchmarkRegistry, Platform
+from superbench.benchmarks.micro_benchmarks.nvbench_base import NvbenchBase, parse_time_to_us
+
+
+class NvbenchAutoThroughput(NvbenchBase):
+    """The NVBench Auto Throughput benchmark class.
+
+    This benchmark measures memory throughput and cache hit rates using CUPTI.
+    It copies a 128 MiB buffer with configurable stride and items per thread.
+    """
+    def __init__(self, name, parameters=''):
+        """Constructor.
+
+        Args:
+            name (str): benchmark name.
+            parameters (str): benchmark parameters.
+        """
+        super().__init__(name, parameters)
+        self._bin_name = 'nvbench_auto_throughput'
+
+    def add_parser_arguments(self):
+        """Add benchmark-specific arguments."""
+        super().add_parser_arguments()
+
+        self._parser.add_argument(
+            '--stride',
+            type=str,
+            default='[1:4]',
+            help='Stride axis values. Supports: "2" (single), "[1,2,4]" (list), "[1:4]" (range), "[1:8:2]" (step).',
+        )
+
+        self._parser.add_argument(
+            '--block_size',
+            type=str,
+            default='[128,256,512,1024]',
+            help='Block size (threads per block). Supports: "256" (single), "[128,256,512,1024]" (list).',
+        )
+
+    def _preprocess(self):
+        """Preprocess/preparation operations before the benchmarking.
+
+        Return:
+            True if _preprocess() succeed.
+        """
+        if not super()._preprocess():
+            return False
+
+        # Build base command with common nvbench arguments
+        parts = self._build_base_command()
+
+        # Add stride axis argument
+        parts.extend(['--axis', f'"Stride={self._args.stride.strip()}"'])
+
+        # Add block size axis argument
+        parts.extend(['--axis', f'"BlockSize={self._args.block_size.strip()}"'])
+
+        # Finalize command
+        self._commands = [' '.join(parts)]
+        return True
+
+    def _process_raw_result(self, cmd_idx, raw_output):
+        """Parse raw results and save the summarized results.
+
+        Args:
+            cmd_idx (int): the index of command corresponding with the raw_output.
+            raw_output (str): raw output string of the micro-benchmark.
+
+        Return:
+            True if the raw output string is valid and result can be extracted.
+        """
+        self._result.add_raw_data(f'raw_output_{cmd_idx}', raw_output, self._args.log_raw_data)
+
+        try:
+            # Pattern for throughput benchmark table output with CUPTI metrics
+            # Table format:
+            # | T | Stride | BlockSize | Elements | HBWPeak | LoadEff | StoreEff | L1HitRate | L2HitRate |
+            # | Samples | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | Samples | Batch GPU |
+            row_pat = (
+                r'\|\s*(\d+)\s*\|'    # T (ItemsPerThread)
+                r'\s*(\d+)\s*\|'    # Stride
+                r'\s*(\d+)\s*\|'    # BlockSize
+                r'\s*\d+\s*\|'    # Elements (skip)
+                r'\s*([\d.]+)%\s*\|'    # HBWPeak
+                r'\s*([\d.]+)%\s*\|'    # LoadEff
+                r'\s*([\d.]+)%\s*\|'    # StoreEff
+                r'\s*([\d.]+)%\s*\|'    # L1HitRate
+                r'\s*([\d.]+)%\s*\|'    # L2HitRate
+                r'\s*\d+x\s*\|'    # Samples CUPTI (skip)
+                r'\s*\d+x\s*\|'    # Samples Cold (skip)
+                r'\s*([\d.]+\s*[μmun]?s)\s*\|'    # CPU Time
+                r'\s*[\d.]+%\s*\|'    # CPU Noise (skip)
+                r'\s*([\d.]+\s*[μmun]?s)\s*\|'    # GPU Time
+                r'\s*[\d.]+%\s*\|'    # GPU Noise (skip)
+                r'\s*([\d.]+)([TGMK]?)\s*\|'    # Elem/s (value and unit prefix)
+                r'\s*\d+x\s*\|'    # Samples Batch (skip)
+                r'\s*([\d.]+\s*[μmun]?s)\s*\|'    # Batch GPU Time
+            )
+
+            parsed_any = False
+
+            for line in raw_output.splitlines():
+                line = line.strip()
+                r = re.match(row_pat, line)
+                if r:
+                    (
+                        items_per_thread, stride, block_size, hbw_peak, load_eff, store_eff, l1_hit, l2_hit, cpu_time,
+                        gpu_time, elem_rate, elem_unit, batch_gpu
+                    ) = r.groups()
+
+                    prefix = f'ipt_{items_per_thread}_stride_{stride}_blk_{block_size}'
+
+                    # Timing metrics (in microseconds)
+                    self._result.add_result(f'{prefix}_cpu_time', parse_time_to_us(cpu_time))
+                    self._result.add_result(f'{prefix}_gpu_time', parse_time_to_us(gpu_time))
+                    self._result.add_result(f'{prefix}_batch_gpu_time', parse_time_to_us(batch_gpu))
+
+                    # CUPTI metrics (percentages)
+                    self._result.add_result(f'{prefix}_hbw_peak', float(hbw_peak))
+                    self._result.add_result(f'{prefix}_load_eff', float(load_eff))
+                    self._result.add_result(f'{prefix}_store_eff', float(store_eff))
+                    self._result.add_result(f'{prefix}_l1_hit_rate', float(l1_hit))
+                    self._result.add_result(f'{prefix}_l2_hit_rate', float(l2_hit))
+
+                    # Memory throughput in GB/s
+                    # Convert element rate to bandwidth: GB/s = (elements/s) * sizeof(int32) / 1e9
+                    # The benchmark uses int32 (4 bytes per element)
+                    elem_val = float(elem_rate)
+                    unit_multipliers = {'T': 1e12, 'G': 1e9, 'M': 1e6, 'K': 1e3, '': 1.0}
+                    elements_per_sec = elem_val * unit_multipliers.get(elem_unit, 1.0)
+                    throughput_gbs = (elements_per_sec * 4) / 1e9    # 4 bytes per int32
+                    self._result.add_result(f'{prefix}_throughput', throughput_gbs)
+
+                    parsed_any = True
+
+            if not parsed_any:
+                raise ValueError('No valid result rows parsed')
+
+        except BaseException as e:
+            self._handle_parsing_error(str(e), raw_output)
+            return False
+
+        return True
+
+
+BenchmarkRegistry.register_benchmark('nvbench-auto-throughput', NvbenchAutoThroughput, platform=Platform.CUDA)
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_base.py b/superbench/benchmarks/micro_benchmarks/nvbench_base.py
new file mode 100644
index 000000000..3bc8d1b51
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_base.py
@@ -0,0 +1,232 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Base class for NVBench benchmarks."""
+
+import os
+import re
+from superbench.common.utils import logger
+from superbench.benchmarks import ReturnCode
+from superbench.benchmarks.micro_benchmarks.micro_base import MicroBenchmarkWithInvoke
+
+
+def parse_time_to_us(raw: str) -> float:
+    """Parse a time string like '123.45 us' or '1.5 s' to float microseconds."""
+    raw = raw.strip()
+    m = re.match(r'^([\d.]+)\s*([mun]?s)?$', raw)
+    if not m:
+        raise ValueError(f'Invalid time string: {raw!r}')
+    val, unit = float(m.group(1)), (m.group(2) or 'us')
+    if unit == 's':
+        return val * 1e6
+    elif unit == 'ns':
+        return val / 1e3
+    elif unit == 'ms':
+        return val * 1e3
+    return val
+
+
+class NvbenchBase(MicroBenchmarkWithInvoke):
+    """Base class for NVBench benchmarks with common functionality."""
+    def __init__(self, name, parameters=''):
+        """Constructor.
+
+        Args:
+            name (str): benchmark name.
+            parameters (str): benchmark parameters.
+        """
+        super().__init__(name, parameters)
+        # Subclasses should set this
+        self._bin_name = None
+
+    def add_parser_arguments(self):
+        """Add common NVBench arguments."""
+        super().add_parser_arguments()
+
+        # Device configuration
+        self._parser.add_argument(
+            '--devices',
+            type=str,
+            default=None,
+            help='Device list to run the benchmark, e.g., "0,1,2,3" or "all".',
+        )
+
+        # Benchmark Properties
+        self._parser.add_argument(
+            '--skip-time',
+            type=float,
+            default=-1.0,
+            help='Skip time in seconds.',
+        )
+        self._parser.add_argument(
+            '--throttle-threshold',
+            type=float,
+            default=75.0,
+            help='Throttle threshold percentage.',
+        )
+        self._parser.add_argument(
+            '--throttle-recovery-delay',
+            type=float,
+            default=0.05,
+            help='Throttle recovery delay in seconds.',
+        )
+        self._parser.add_argument(
+            '--run-once',
+            action='store_true',
+            help='Run once flag.',
+        )
+        self._parser.add_argument(
+            '--disable-blocking-kernel',
+            action='store_true',
+            help='Disable blocking kernel flag.',
+        )
+        self._parser.add_argument(
+            '--profile',
+            action='store_true',
+            help='Enable profiling flag.',
+        )
+
+        # Stopping Criteria
+        self._parser.add_argument(
+            '--timeout',
+            type=int,
+            default=15,
+            help='Timeout in seconds.',
+        )
+        self._parser.add_argument(
+            '--min-samples',
+            type=int,
+            default=10,
+            help='Minimum number of samples.',
+        )
+        self._parser.add_argument(
+            '--stopping-criterion',
+            type=str,
+            default='stdrel',
+            choices=['stdrel', 'entropy'],
+            help='Stopping criterion.',
+        )
+        # stdrel-specific
+        self._parser.add_argument(
+            '--min-time',
+            type=float,
+            default=0.5,
+            help='Minimum time for stdrel stopping criterion.',
+        )
+        self._parser.add_argument(
+            '--max-noise',
+            type=float,
+            default=0.5,
+            help='Maximum noise for stdrel stopping criterion.',
+        )
+        # entropy-specific
+        self._parser.add_argument(
+            '--max-angle',
+            type=float,
+            default=0.048,
+            help='Maximum angle for entropy stopping criterion.',
+        )
+        self._parser.add_argument(
+            '--min-r2',
+            type=float,
+            default=0.36,
+            help='Minimum R-squared for entropy stopping criterion.',
+        )
+
+    def _add_device_args(self, parts):
+        """Add device configuration arguments to command parts."""
+        if hasattr(self._args, 'devices') and self._args.devices is not None:
+            if self._args.devices == 'all':
+                parts.extend(['--devices', 'all'])
+            else:
+                parts.extend(['--devices', self._args.devices])
+
+    def _add_benchmark_property_args(self, parts):
+        """Add benchmark property arguments to command parts."""
+        if hasattr(self._args, 'skip_time') and self._args.skip_time >= 0:
+            parts.extend(['--skip-time', str(self._args.skip_time)])
+        if hasattr(self._args, 'throttle_threshold') and self._args.throttle_threshold > 0:
+            parts.extend(['--throttle-threshold', str(self._args.throttle_threshold)])
+        if hasattr(self._args, 'throttle_recovery_delay') and self._args.throttle_recovery_delay > 0:
+            parts.extend(['--throttle-recovery-delay', str(self._args.throttle_recovery_delay)])
+        if hasattr(self._args, 'run_once') and self._args.run_once:
+            parts.append('--run-once')
+        if hasattr(self._args, 'disable_blocking_kernel') and self._args.disable_blocking_kernel:
+            parts.append('--disable-blocking-kernel')
+        if hasattr(self._args, 'profile') and self._args.profile:
+            parts.append('--profile')
+
+    def _add_stopping_criteria_args(self, parts):
+        """Add stopping criteria arguments to command parts."""
+        if hasattr(self._args, 'timeout') and self._args.timeout is not None:
+            parts.extend(['--timeout', str(self._args.timeout)])
+        if hasattr(self._args, 'min_samples') and self._args.min_samples is not None:
+            parts.extend(['--min-samples', str(self._args.min_samples)])
+        if hasattr(self._args, 'stopping_criterion') and self._args.stopping_criterion:
+            parts.extend(['--stopping-criterion', self._args.stopping_criterion])
+            if self._args.stopping_criterion == 'stdrel':
+                self._add_stdrel_args(parts)
+            elif self._args.stopping_criterion == 'entropy':
+                self._add_entropy_args(parts)
+
+    def _add_stdrel_args(self, parts):
+        """Add stdrel-specific stopping criterion arguments."""
+        if hasattr(self._args, 'min_time') and self._args.min_time is not None:
+            parts.extend(['--min-time', str(self._args.min_time)])
+        if hasattr(self._args, 'max_noise') and self._args.max_noise is not None:
+            parts.extend(['--max-noise', str(self._args.max_noise)])
+
+    def _add_entropy_args(self, parts):
+        """Add entropy-specific stopping criterion arguments."""
+        if hasattr(self._args, 'max_angle') and self._args.max_angle is not None:
+            parts.extend(['--max-angle', str(self._args.max_angle)])
+        if hasattr(self._args, 'min_r2') and self._args.min_r2 is not None:
+            parts.extend(['--min-r2', str(self._args.min_r2)])
+
+    def _build_base_command(self):
+        """Build the base nvbench command with common arguments.
+
+        Returns:
+            list: Command parts that can be extended by subclasses.
+        """
+        if not self._bin_name:
+            raise ValueError('Subclass must set _bin_name')
+
+        command = os.path.join(self._args.bin_dir, self._bin_name)
+        parts = [command]
+
+        self._add_device_args(parts)
+        self._add_benchmark_property_args(parts)
+        self._add_stopping_criteria_args(parts)
+
+        return parts
+
+    def _preprocess(self):
+        """Default preprocess implementation. Can be overridden by subclasses.
+
+        Returns:
+            True if _preprocess() succeed.
+        """
+        if not super()._preprocess():
+            return False
+
+        # Build base command - subclasses can override this method to add specific arguments
+        parts = self._build_base_command()
+
+        # Finalize command
+        self._commands = [' '.join(parts)]
+        return True
+
+    def _handle_parsing_error(self, error_msg, raw_output):
+        """Handle parsing errors consistently.
+
+        Args:
+            error_msg (str): Error message to log.
+            raw_output (str): Raw output that failed to parse.
+        """
+        self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
+        logger.error(
+            'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format(
+                self._curr_run_index, self._name, raw_output, error_msg
+            )
+        )
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py b/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
new file mode 100644
index 000000000..234248ec8
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
@@ -0,0 +1,68 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Module of the NVBench Kernel Launch benchmark."""
+
+import re
+from superbench.benchmarks import BenchmarkRegistry, Platform
+from superbench.benchmarks.micro_benchmarks.nvbench_base import NvbenchBase, parse_time_to_us
+
+
+class NvbenchKernelLaunch(NvbenchBase):
+    """The NVBench Kernel Launch benchmark class."""
+    def __init__(self, name, parameters=''):
+        """Constructor.
+
+        Args:
+            name (str): benchmark name.
+            parameters (str): benchmark parameters.
+        """
+        super().__init__(name, parameters)
+        self._bin_name = 'nvbench_kernel_launch'
+
+    def _process_raw_result(self, cmd_idx, raw_output):
+        """Function to parse raw results and save the summarized results.
+
+        Args:
+            cmd_idx (int): the index of command corresponding with the raw_output.
+            raw_output (str): raw output string of the micro-benchmark.
+
+        Return:
+            True if the raw output string is valid and result can be extracted.
+        """
+        self._result.add_raw_data(f'raw_output_{cmd_idx}', raw_output, self._args.log_raw_data)
+
+        try:
+            # Regex pattern to handle different time units and flexible spacing
+            row_pat = (
+                r'\|\s*([0-9]+)x\s*\|\s*'    # Samples
+                r'([\d.]+\s*[μmun]?s)\s*\|\s*'    # CPU Time (μs, ns, ms, us, s)
+                r'([\d.]+%)\s*\|\s*'    # CPU Noise percentage
+                r'([\d.]+\s*[μmun]?s)\s*\|\s*'    # GPU Time
+                r'([\d.]+%)\s*\|\s*'    # GPU Noise percentage
+                r'([0-9]+)x\s*\|\s*'    # Batch Samples
+                r'([\d.]+\s*[μmun]?s)\s*\|'    # Batch GPU Time
+            )
+            parsed_any = False
+
+            for line in raw_output.splitlines():
+                line = line.strip()
+                r = re.match(row_pat, line)
+                if r:
+                    samples, cpu_time, cpu_noise, gpu_time, gpu_noise, batch_samples, batch_gpu = r.groups()
+                    self._result.add_result('cpu_time', parse_time_to_us(cpu_time))
+                    self._result.add_result('gpu_time', parse_time_to_us(gpu_time))
+                    self._result.add_result('batch_gpu_time', parse_time_to_us(batch_gpu))
+                    parsed_any = True
+
+            if not parsed_any:
+                raise ValueError('No valid result rows parsed')
+
+        except BaseException as e:
+            self._handle_parsing_error(str(e), raw_output)
+            return False
+
+        return True
+
+
+BenchmarkRegistry.register_benchmark('nvbench-kernel-launch', NvbenchKernelLaunch, platform=Platform.CUDA)
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
new file mode 100644
index 000000000..1fcffa2e3
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
@@ -0,0 +1,107 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Module of the NVBench Sleep Kernel benchmark."""
+
+import re
+from superbench.benchmarks import BenchmarkRegistry, Platform
+from superbench.benchmarks.micro_benchmarks.nvbench_base import NvbenchBase, parse_time_to_us
+
+
+class NvbenchSleepKernel(NvbenchBase):
+    """The NVBench Sleep Kernel benchmark class."""
+    def __init__(self, name, parameters=''):
+        """Constructor.
+
+        Args:
+            name (str): benchmark name.
+            parameters (str): benchmark parameters.
+        """
+        super().__init__(name, parameters)
+
+        self._bin_name = 'nvbench_sleep_kernel'
+
+    def add_parser_arguments(self):
+        """Add sleep-kernel specific arguments."""
+        super().add_parser_arguments()
+
+        # Sleep-kernel specific argument
+        self._parser.add_argument(
+            '--duration_us',
+            type=str,
+            default='[0,25,50,75,100]',
+            help='Duration axis values in microseconds. Supports multiple formats: '
+            '"50" (single value), "[25,50,75]" (list), "[25:75]" (range), "[0:50:10]" (range with step).',
+        )
+
+    def _preprocess(self):
+        """Preprocess/preparation operations before the benchmarking.
+
+        Return:
+            True if _preprocess() succeed.
+        """
+        if not super()._preprocess():
+            return False
+
+        # Build base command with common nvbench arguments
+        parts = self._build_base_command()
+
+        # Add sleep-kernel specific arguments
+        parts.extend(['--axis', f'"Duration (us)={self._args.duration_us.strip()}"'])
+
+        # Finalize command
+        self._commands = [' '.join(parts)]
+        return True
+
+    def _process_raw_result(self, cmd_idx, raw_output):
+        """Function to parse raw results and save the summarized results.
+
+        self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
+
+        Args:
+            cmd_idx (int): the index of command corresponding with the raw_output.
+            raw_output (str): raw output string of the micro-benchmark.
+
+        Return:
+            True if the raw output string is valid and result can be extracted.
+        """
+        self._result.add_raw_data(f'raw_output_{cmd_idx}', raw_output, self._args.log_raw_data)
+        try:
+            gpu_section = r'### \[(\d+)\] NVIDIA'
+            # Regex pattern to handle different time units and flexible spacing
+            row_pat = (
+                r'\|\s*([0-9]+)\s*\|\s*'    # Duration (us)
+                r'([0-9]+)x\s*\|\s*'    # Samples
+                r'([\d.]+\s*[μmun]?s)\s*\|\s*'    # CPU Time (μs, ns, ms, us, s)
+                r'([\d.]+%)\s*\|\s*'    # CPU Noise percentage
+                r'([\d.]+\s*[μmun]?s)\s*\|\s*'    # GPU Time
+                r'([\d.]+%)\s*\|\s*'    # GPU Noise percentage
+                r'([0-9]+)x\s*\|\s*'    # Batch Samples
+                r'([\d.]+\s*[μmun]?s)\s*\|'    # Batch GPU Time
+            )
+            parsed_any = False
+            for line in raw_output.splitlines():
+                line = line.strip()
+                g = re.match(gpu_section, line)
+                if g:
+                    continue
+                r = re.match(row_pat, line)
+                if r:
+                    duration_us, samples, cpu_time, cpu_noise, gpu_time, gpu_noise, batch_samples, batch_gpu = r.groups(
+                    )
+                    self._result.add_result(f'duration_us_{duration_us}_cpu_time', parse_time_to_us(cpu_time))
+                    self._result.add_result(f'duration_us_{duration_us}_gpu_time', parse_time_to_us(gpu_time))
+                    self._result.add_result(f'duration_us_{duration_us}_batch_gpu_time', parse_time_to_us(batch_gpu))
+                    parsed_any = True
+
+            if not parsed_any:
+                raise ValueError('No valid result rows parsed')
+
+        except BaseException as e:
+            self._handle_parsing_error(str(e), raw_output)
+            return False
+
+        return True
+
+
+BenchmarkRegistry.register_benchmark('nvbench-sleep-kernel', NvbenchSleepKernel, platform=Platform.CUDA)
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_auto_throughput.py b/tests/benchmarks/micro_benchmarks/test_nvbench_auto_throughput.py
new file mode 100644
index 000000000..d25584a09
--- /dev/null
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_auto_throughput.py
@@ -0,0 +1,161 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Tests for nvbench auto throughput benchmark."""
+
+import unittest
+
+from tests.helper import decorator
+from tests.helper.testcase import BenchmarkTestCase
+from superbench.benchmarks import BenchmarkRegistry, ReturnCode, Platform
+
+
+class TestNvbenchAutoThroughputBenchmark(BenchmarkTestCase, unittest.TestCase):
+    """Test class for NVBench Auto Throughput benchmark."""
+    @classmethod
+    def setUpClass(cls):
+        """Hook method for setting up class fixture before running tests in the class."""
+        super().setUpClass()
+        cls.createMockEnvs(cls)
+        cls.createMockFiles(cls, ['bin/nvbench_auto_throughput'])
+
+    def test_nvbench_auto_throughput_preprocess(self):
+        """Test NVBench Auto Throughput benchmark preprocess."""
+        benchmark_name = 'nvbench-auto-throughput'
+        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        # Test preprocess with default parameters
+        benchmark = benchmark_class(benchmark_name, parameters='')
+        assert benchmark._preprocess()
+        assert benchmark.return_code == ReturnCode.SUCCESS
+
+        # Test preprocess with specified parameters
+        parameters = ('--devices 0 ' '--stride "[1,2,4,8]" ' '--timeout 20 ' '--min-samples 100')
+        benchmark = benchmark_class(benchmark_name, parameters=parameters)
+        assert benchmark._preprocess()
+        assert benchmark.return_code == ReturnCode.SUCCESS
+
+        # Check command
+        assert (1 == len(benchmark._commands))
+        assert ('--devices 0' in benchmark._commands[0])
+        assert ('--axis "Stride=[1,2,4,8]"' in benchmark._commands[0])
+        assert ('--timeout 20' in benchmark._commands[0])
+        assert ('--min-samples 100' in benchmark._commands[0])
+
+    def test_nvbench_auto_throughput_stride_formats(self):
+        """Test NVBench Auto Throughput preprocess with different stride formats."""
+        benchmark_name = 'nvbench-auto-throughput'
+        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        # Test single value
+        benchmark = benchmark_class(benchmark_name, parameters='--stride "2"')
+        assert benchmark._preprocess()
+        assert '--axis "Stride=2"' in benchmark._commands[0]
+
+        # Test list format
+        benchmark = benchmark_class(benchmark_name, parameters='--stride "[1,2,4]"')
+        assert benchmark._preprocess()
+        assert '--axis "Stride=[1,2,4]"' in benchmark._commands[0]
+
+        # Test range format
+        benchmark = benchmark_class(benchmark_name, parameters='--stride "[1:8]"')
+        assert benchmark._preprocess()
+        assert '--axis "Stride=[1:8]"' in benchmark._commands[0]
+
+        # Test range with step format
+        benchmark = benchmark_class(benchmark_name, parameters='--stride "[1:8:2]"')
+        assert benchmark._preprocess()
+        assert '--axis "Stride=[1:8:2]"' in benchmark._commands[0]
+
+        # Test default format
+        benchmark = benchmark_class(benchmark_name, parameters='')
+        assert benchmark._preprocess()
+        assert '--axis "Stride=[1:4]"' in benchmark._commands[0]
+        assert '--axis "BlockSize=[128,256,512,1024]"' in benchmark._commands[0]
+
+    def test_nvbench_auto_throughput_block_size_formats(self):
+        """Test NVBench Auto Throughput preprocess with different block_size formats."""
+        benchmark_name = 'nvbench-auto-throughput'
+        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        # Test single value
+        benchmark = benchmark_class(benchmark_name, parameters='--block_size "256"')
+        assert benchmark._preprocess()
+        assert '--axis "BlockSize=256"' in benchmark._commands[0]
+
+        # Test list format
+        benchmark = benchmark_class(benchmark_name, parameters='--block_size "[128,256,512]"')
+        assert benchmark._preprocess()
+        assert '--axis "BlockSize=[128,256,512]"' in benchmark._commands[0]
+
+        # Test default format
+        benchmark = benchmark_class(benchmark_name, parameters='')
+        assert benchmark._preprocess()
+        assert '--axis "BlockSize=[128,256,512,1024]"' in benchmark._commands[0]
+
+    @decorator.load_data('tests/data/nvbench_auto_throughput.log')
+    def test_nvbench_auto_throughput_result_parsing(self, results):
+        """Test NVBench Auto Throughput benchmark result parsing."""
+        benchmark_name = 'nvbench-auto-throughput'
+        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        benchmark = benchmark_class(benchmark_name, parameters='')
+        assert benchmark._preprocess()
+
+        # Parse the provided raw output
+        assert benchmark._process_raw_result(0, results)
+        assert benchmark.return_code == ReturnCode.SUCCESS
+
+        # Validate timing metrics for ItemsPerThread=1, Stride=1, BlockSize=128
+        self.assertAlmostEqual(benchmark.result['ipt_1_stride_1_blk_128_cpu_time'][0], 120.0)
+        self.assertAlmostEqual(benchmark.result['ipt_1_stride_1_blk_128_gpu_time'][0], 100.0)
+        self.assertAlmostEqual(benchmark.result['ipt_1_stride_1_blk_128_batch_gpu_time'][0], 95.0)
+
+        # Validate CUPTI metrics for ItemsPerThread=1, Stride=1, BlockSize=128
+        self.assertAlmostEqual(benchmark.result['ipt_1_stride_1_blk_128_hbw_peak'][0], 20.0)
+        self.assertAlmostEqual(benchmark.result['ipt_1_stride_1_blk_128_load_eff'][0], 100.0)
+        self.assertAlmostEqual(benchmark.result['ipt_1_stride_1_blk_128_store_eff'][0], 100.0)
+        self.assertAlmostEqual(benchmark.result['ipt_1_stride_1_blk_128_l1_hit_rate'][0], 0.0)
+        self.assertAlmostEqual(benchmark.result['ipt_1_stride_1_blk_128_l2_hit_rate'][0], 0.05)
+
+        # Validate throughput for ItemsPerThread=1, Stride=1, BlockSize=128 (100G elem/s * 4 bytes = 400 GB/s)
+        self.assertAlmostEqual(benchmark.result['ipt_1_stride_1_blk_128_throughput'][0], 400.0)
+
+        # Validate timing metrics for ItemsPerThread=2, Stride=4, BlockSize=256
+        self.assertAlmostEqual(benchmark.result['ipt_2_stride_4_blk_256_cpu_time'][0], 220.0)
+        self.assertAlmostEqual(benchmark.result['ipt_2_stride_4_blk_256_gpu_time'][0], 200.0)
+        self.assertAlmostEqual(benchmark.result['ipt_2_stride_4_blk_256_batch_gpu_time'][0], 195.0)
+
+        # Validate CUPTI metrics for ItemsPerThread=2, Stride=4, BlockSize=256
+        self.assertAlmostEqual(benchmark.result['ipt_2_stride_4_blk_256_hbw_peak'][0], 80.0)
+        self.assertAlmostEqual(benchmark.result['ipt_2_stride_4_blk_256_load_eff'][0], 12.5)
+        self.assertAlmostEqual(benchmark.result['ipt_2_stride_4_blk_256_store_eff'][0], 100.0)
+        self.assertAlmostEqual(benchmark.result['ipt_2_stride_4_blk_256_l1_hit_rate'][0], 30.0)
+        self.assertAlmostEqual(benchmark.result['ipt_2_stride_4_blk_256_l2_hit_rate'][0], 10.0)
+
+        # Validate throughput for ItemsPerThread=2, Stride=4, BlockSize=256 (200G elem/s * 4 bytes = 800 GB/s)
+        self.assertAlmostEqual(benchmark.result['ipt_2_stride_4_blk_256_throughput'][0], 800.0)
+
+    def test_nvbench_auto_throughput_invalid_output(self):
+        """Test NVBench Auto Throughput benchmark result parsing with invalid output."""
+        benchmark_name = 'nvbench-auto-throughput'
+        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        benchmark = benchmark_class(benchmark_name, parameters='')
+        assert benchmark._preprocess()
+
+        # Mock raw output with invalid format
+        raw_output = 'Invalid output format'
+
+        # Parse the provided raw output
+        assert not benchmark._process_raw_result(0, raw_output)
+        assert benchmark.return_code == ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_base.py b/tests/benchmarks/micro_benchmarks/test_nvbench_base.py
new file mode 100644
index 000000000..2952394ef
--- /dev/null
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_base.py
@@ -0,0 +1,246 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Tests for nvbench_base module."""
+
+import unittest
+from tests.helper.testcase import BenchmarkTestCase
+from superbench.benchmarks import ReturnCode
+from superbench.benchmarks.micro_benchmarks.nvbench_base import parse_time_to_us, NvbenchBase
+
+
+class TestParseTimeToUs(unittest.TestCase):
+    """Test class for parse_time_to_us function."""
+    def test_parse_microseconds(self):
+        """Test parsing microseconds values."""
+        self.assertAlmostEqual(parse_time_to_us('123.45 us'), 123.45)
+        self.assertAlmostEqual(parse_time_to_us('123.45us'), 123.45)
+        self.assertAlmostEqual(parse_time_to_us('0.5 us'), 0.5)
+
+    def test_parse_nanoseconds(self):
+        """Test parsing nanoseconds values (converted to us)."""
+        self.assertAlmostEqual(parse_time_to_us('1000 ns'), 1.0)
+        self.assertAlmostEqual(parse_time_to_us('1000ns'), 1.0)
+        self.assertAlmostEqual(parse_time_to_us('500 ns'), 0.5)
+        self.assertAlmostEqual(parse_time_to_us('123.456 ns'), 0.123456)
+
+    def test_parse_milliseconds(self):
+        """Test parsing milliseconds values (converted to us)."""
+        self.assertAlmostEqual(parse_time_to_us('1 ms'), 1000.0)
+        self.assertAlmostEqual(parse_time_to_us('1ms'), 1000.0)
+        self.assertAlmostEqual(parse_time_to_us('0.5 ms'), 500.0)
+        self.assertAlmostEqual(parse_time_to_us('0.001 ms'), 1.0)
+
+    def test_parse_plain_number(self):
+        """Test parsing plain numbers without unit (defaults to us)."""
+        self.assertAlmostEqual(parse_time_to_us('123.45'), 123.45)
+        self.assertAlmostEqual(parse_time_to_us('0'), 0.0)
+
+    def test_parse_with_whitespace(self):
+        """Test parsing values with leading/trailing whitespace."""
+        self.assertAlmostEqual(parse_time_to_us('  123.45 us  '), 123.45)
+        self.assertAlmostEqual(parse_time_to_us('\t500 ns\n'), 0.5)
+
+    def test_parse_seconds(self):
+        """Test parsing seconds values (converted to us)."""
+        self.assertAlmostEqual(parse_time_to_us('1 s'), 1000000.0)
+        self.assertAlmostEqual(parse_time_to_us('1s'), 1000000.0)
+        self.assertAlmostEqual(parse_time_to_us('0.5 s'), 500000.0)
+        self.assertAlmostEqual(parse_time_to_us('0.001 s'), 1000.0)
+
+
+class ConcreteNvbenchBase(NvbenchBase):
+    """Concrete implementation of NvbenchBase for testing."""
+    def __init__(self, name, parameters=''):
+        """Constructor."""
+        super().__init__(name, parameters)
+        self._bin_name = 'test_nvbench_binary'
+
+    def _process_raw_result(self, cmd_idx, raw_output):
+        """Process raw results (no-op for base class testing)."""
+        return True
+
+
+class TestNvbenchBase(BenchmarkTestCase, unittest.TestCase):
+    """Test class for NvbenchBase class."""
+    @classmethod
+    def setUpClass(cls):
+        """Hook method for setting up class fixture before running tests in the class."""
+        super().setUpClass()
+        cls.createMockEnvs(cls)
+        cls.createMockFiles(cls, ['bin/test_nvbench_binary'])
+
+    def test_nvbench_base_init(self):
+        """Test NvbenchBase initialization."""
+        benchmark = ConcreteNvbenchBase('test-benchmark', parameters='')
+        assert benchmark._bin_name == 'test_nvbench_binary'
+        assert benchmark._name == 'test-benchmark'
+
+    def test_nvbench_base_add_parser_arguments(self):
+        """Test NvbenchBase add_parser_arguments."""
+        benchmark = ConcreteNvbenchBase('test-benchmark', parameters='')
+        benchmark._preprocess()
+
+        # Check default values
+        assert benchmark._args.devices is None
+        self.assertAlmostEqual(benchmark._args.skip_time, -1.0)
+        self.assertAlmostEqual(benchmark._args.throttle_threshold, 75.0)
+        self.assertAlmostEqual(benchmark._args.throttle_recovery_delay, 0.05)
+        assert benchmark._args.run_once is False
+        assert benchmark._args.disable_blocking_kernel is False
+        assert benchmark._args.profile is False
+        assert benchmark._args.timeout == 15
+        assert benchmark._args.min_samples == 10
+        assert benchmark._args.stopping_criterion == 'stdrel'
+        self.assertAlmostEqual(benchmark._args.min_time, 0.5)
+        self.assertAlmostEqual(benchmark._args.max_noise, 0.5)
+        self.assertAlmostEqual(benchmark._args.max_angle, 0.048)
+        self.assertAlmostEqual(benchmark._args.min_r2, 0.36)
+
+    def test_nvbench_base_preprocess_default(self):
+        """Test NvbenchBase preprocess with default parameters."""
+        benchmark = ConcreteNvbenchBase('test-benchmark', parameters='')
+        assert benchmark._preprocess()
+        assert benchmark.return_code == ReturnCode.SUCCESS
+        assert len(benchmark._commands) == 1
+        # Check default stopping criterion args are included
+        assert '--timeout 15' in benchmark._commands[0]
+        assert '--min-samples 10' in benchmark._commands[0]
+        assert '--stopping-criterion stdrel' in benchmark._commands[0]
+        assert '--min-time 0.5' in benchmark._commands[0]
+        assert '--max-noise 0.5' in benchmark._commands[0]
+
+    def test_nvbench_base_preprocess_with_devices(self):
+        """Test NvbenchBase preprocess with device configuration."""
+        # Test with specific device
+        benchmark = ConcreteNvbenchBase('test-benchmark', parameters='--devices 0')
+        assert benchmark._preprocess()
+        assert '--devices 0' in benchmark._commands[0]
+
+        # Test with 'all' devices
+        benchmark = ConcreteNvbenchBase('test-benchmark', parameters='--devices all')
+        assert benchmark._preprocess()
+        assert '--devices all' in benchmark._commands[0]
+
+        # Test with multiple devices
+        benchmark = ConcreteNvbenchBase('test-benchmark', parameters='--devices 0,1,2')
+        assert benchmark._preprocess()
+        assert '--devices 0,1,2' in benchmark._commands[0]
+
+    def test_nvbench_base_preprocess_with_benchmark_properties(self):
+        """Test NvbenchBase preprocess with benchmark properties."""
+        parameters = (
+            '--skip-time 1.0 '
+            '--throttle-threshold 80.0 '
+            '--throttle-recovery-delay 0.1 '
+            '--run-once '
+            '--disable-blocking-kernel '
+            '--profile'
+        )
+        benchmark = ConcreteNvbenchBase('test-benchmark', parameters=parameters)
+        assert benchmark._preprocess()
+        assert benchmark.return_code == ReturnCode.SUCCESS
+
+        assert '--skip-time 1.0' in benchmark._commands[0]
+        assert '--throttle-threshold 80.0' in benchmark._commands[0]
+        assert '--throttle-recovery-delay 0.1' in benchmark._commands[0]
+        assert '--run-once' in benchmark._commands[0]
+        assert '--disable-blocking-kernel' in benchmark._commands[0]
+        assert '--profile' in benchmark._commands[0]
+
+    def test_nvbench_base_preprocess_with_stdrel_stopping_criterion(self):
+        """Test NvbenchBase preprocess with stdrel stopping criterion."""
+        parameters = (
+            '--stopping-criterion stdrel '
+            '--min-time 2.0 '
+            '--max-noise 0.3 '
+            '--timeout 30 '
+            '--min-samples 100'
+        )
+        benchmark = ConcreteNvbenchBase('test-benchmark', parameters=parameters)
+        assert benchmark._preprocess()
+        assert benchmark.return_code == ReturnCode.SUCCESS
+
+        assert '--stopping-criterion stdrel' in benchmark._commands[0]
+        assert '--min-time 2.0' in benchmark._commands[0]
+        assert '--max-noise 0.3' in benchmark._commands[0]
+        assert '--timeout 30' in benchmark._commands[0]
+        assert '--min-samples 100' in benchmark._commands[0]
+
+    def test_nvbench_base_preprocess_with_entropy_stopping_criterion(self):
+        """Test NvbenchBase preprocess with entropy stopping criterion."""
+        parameters = (
+            '--stopping-criterion entropy '
+            '--max-angle 0.1 '
+            '--min-r2 0.5 '
+            '--timeout 20 '
+            '--min-samples 50'
+        )
+        benchmark = ConcreteNvbenchBase('test-benchmark', parameters=parameters)
+        assert benchmark._preprocess()
+        assert benchmark.return_code == ReturnCode.SUCCESS
+
+        assert '--stopping-criterion entropy' in benchmark._commands[0]
+        assert '--max-angle 0.1' in benchmark._commands[0]
+        assert '--min-r2 0.5' in benchmark._commands[0]
+        assert '--timeout 20' in benchmark._commands[0]
+        assert '--min-samples 50' in benchmark._commands[0]
+        # stdrel args should not be in entropy mode
+        assert '--min-time' not in benchmark._commands[0]
+        assert '--max-noise' not in benchmark._commands[0]
+
+    def test_nvbench_base_handle_parsing_error(self):
+        """Test NvbenchBase _handle_parsing_error method."""
+        benchmark = ConcreteNvbenchBase('test-benchmark', parameters='')
+        benchmark._preprocess()
+
+        benchmark._handle_parsing_error('Test error message', 'raw output data')
+        assert benchmark.return_code == ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE
+
+    def test_nvbench_base_build_base_command_without_bin_name(self):
+        """Test NvbenchBase _build_base_command raises error without bin_name."""
+        benchmark = ConcreteNvbenchBase('test-benchmark', parameters='')
+        benchmark._bin_name = None
+        # Manually call add_parser_arguments and parse to set up _args
+        benchmark.add_parser_arguments()
+        benchmark._args = benchmark._parser.parse_args([])
+        benchmark._args.bin_dir = '/mock/bin'
+
+        with self.assertRaises(ValueError) as context:
+            benchmark._build_base_command()
+        assert 'Subclass must set _bin_name' in str(context.exception)
+
+    def test_nvbench_base_full_command_line(self):
+        """Test NvbenchBase generates complete command line with all options."""
+        parameters = (
+            '--devices 0,1 '
+            '--skip-time 0.5 '
+            '--throttle-threshold 85.0 '
+            '--throttle-recovery-delay 0.02 '
+            '--run-once '
+            '--timeout 60 '
+            '--min-samples 200 '
+            '--stopping-criterion stdrel '
+            '--min-time 1.5 '
+            '--max-noise 0.25'
+        )
+        benchmark = ConcreteNvbenchBase('test-benchmark', parameters=parameters)
+        assert benchmark._preprocess()
+        assert benchmark.return_code == ReturnCode.SUCCESS
+
+        cmd = benchmark._commands[0]
+        assert 'test_nvbench_binary' in cmd
+        assert '--devices 0,1' in cmd
+        assert '--skip-time 0.5' in cmd
+        assert '--throttle-threshold 85.0' in cmd
+        assert '--throttle-recovery-delay 0.02' in cmd
+        assert '--run-once' in cmd
+        assert '--timeout 60' in cmd
+        assert '--min-samples 200' in cmd
+        assert '--stopping-criterion stdrel' in cmd
+        assert '--min-time 1.5' in cmd
+        assert '--max-noise 0.25' in cmd
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py b/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
new file mode 100644
index 000000000..332dddec6
--- /dev/null
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
@@ -0,0 +1,100 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Tests for nvbench kernel launch benchmark."""
+
+import unittest
+
+from tests.helper import decorator
+from tests.helper.testcase import BenchmarkTestCase
+from superbench.benchmarks import BenchmarkRegistry, ReturnCode, Platform
+
+
+class TestNvbenchKernelLaunchBenchmark(BenchmarkTestCase, unittest.TestCase):
+    """Test class for NVBench Kernel Launch benchmark."""
+    @classmethod
+    def setUpClass(cls):
+        """Hook method for setting up class fixture before running tests in the class."""
+        super().setUpClass()
+        cls.createMockEnvs(cls)
+        cls.createMockFiles(cls, ['bin/nvbench_kernel_launch'])
+
+    def test_nvbench_kernel_launch_preprocess(self):
+        """Test NVBench Kernel Launch benchmark preprocess."""
+        benchmark_name = 'nvbench-kernel-launch'
+        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        # Test preprocess with default parameters
+        benchmark = benchmark_class(benchmark_name, parameters='')
+        assert benchmark._preprocess()
+        assert benchmark.return_code == ReturnCode.SUCCESS
+
+        # Test preprocess with specified parameters
+        parameters = (
+            '--devices 0 '
+            '--timeout 20 '
+            '--min-samples 300 '
+            '--stopping-criterion stdrel '
+            '--min-time 2.0 '
+            '--max-noise 0.5 '
+            '--throttle-threshold 80.0 '
+            '--throttle-recovery-delay 1.0'
+        )
+        benchmark = benchmark_class(benchmark_name, parameters=parameters)
+        assert benchmark._preprocess()
+        assert benchmark.return_code == ReturnCode.SUCCESS
+
+        # Check command
+        assert (1 == len(benchmark._commands))
+        assert ('--devices 0' in benchmark._commands[0])
+        assert ('--timeout 20' in benchmark._commands[0])
+        assert ('--min-samples 300' in benchmark._commands[0])
+        assert ('--stopping-criterion stdrel' in benchmark._commands[0])
+        assert ('--min-time 2.0' in benchmark._commands[0])
+        assert ('--max-noise 0.5' in benchmark._commands[0])
+        assert ('--throttle-threshold 80.0' in benchmark._commands[0])
+        assert ('--throttle-recovery-delay 1.0' in benchmark._commands[0])
+
+    @decorator.load_data('tests/data/nvbench_kernel_launch.log')
+    def test_nvbench_kernel_launch_result_parsing_real_output(self, results):
+        """Test NVBench Kernel Launch benchmark result parsing."""
+        benchmark_name = 'nvbench-kernel-launch'
+        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        benchmark = benchmark_class(benchmark_name, parameters='')
+
+        # Preprocess and validate command
+        assert benchmark._preprocess()
+
+        # Parse the provided raw output
+        assert benchmark._process_raw_result(0, results)
+        assert benchmark.return_code == ReturnCode.SUCCESS
+
+        # Validate parsed results
+        self.assertAlmostEqual(benchmark.result['cpu_time'][0], 24.222)
+        self.assertAlmostEqual(benchmark.result['gpu_time'][0], 7.808)
+        self.assertAlmostEqual(benchmark.result['batch_gpu_time'][0], 6.024)
+
+    def test_nvbench_kernel_launch_process_raw_result_invalid_output(self):
+        """Test NVBench Kernel Launch benchmark result parsing with invalid output."""
+        benchmark_name = 'nvbench-kernel-launch'
+        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        benchmark = benchmark_class(benchmark_name, parameters='')
+
+        # Preprocess and validate command
+        assert benchmark._preprocess()
+
+        # Mock raw output with invalid format
+        raw_output = 'Invalid output format'
+
+        # Parse the provided raw output
+        assert not benchmark._process_raw_result(0, raw_output)
+        assert benchmark.return_code == ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py b/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
new file mode 100644
index 000000000..6df8100c6
--- /dev/null
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
@@ -0,0 +1,133 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Tests for nvbench sleep kernel benchmark."""
+
+import unittest
+
+from tests.helper import decorator
+from tests.helper.testcase import BenchmarkTestCase
+from superbench.benchmarks import BenchmarkRegistry, ReturnCode, Platform
+
+
+class TestNvbenchSleepKernelBenchmark(BenchmarkTestCase, unittest.TestCase):
+    """Test class for NVBench Sleep Kernel benchmark."""
+    @classmethod
+    def setUpClass(cls):
+        """Hook method for setting up class fixture before running tests in the class."""
+        super().setUpClass()
+        cls.createMockEnvs(cls)
+        cls.createMockFiles(cls, ['bin/nvbench_sleep_kernel'])
+
+    def test_nvbench_sleep_kernel_preprocess(self):
+        """Test NVBench Sleep Kernel benchmark preprocess."""
+        benchmark_name = 'nvbench-sleep-kernel'
+        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        # Test preprocess with default parameters
+        benchmark = benchmark_class(benchmark_name, parameters='')
+        assert benchmark._preprocess()
+        assert benchmark.return_code == ReturnCode.SUCCESS
+
+        # Test preprocess with specified parameters
+        parameters = (
+            '--devices 0 '
+            '--duration_us "[10,25,50,75]" '
+            '--timeout 20 '
+            '--min-samples 300 '
+            '--stopping-criterion stdrel '
+            '--min-time 2.0 '
+            '--max-noise 0.5 '
+            '--throttle-threshold 80.0 '
+            '--throttle-recovery-delay 1.0'
+        )
+        benchmark = benchmark_class(benchmark_name, parameters=parameters)
+        assert benchmark._preprocess()
+        assert benchmark.return_code == ReturnCode.SUCCESS
+
+        # Check command
+        assert (1 == len(benchmark._commands))
+        assert ('--devices 0' in benchmark._commands[0])
+        assert ('--axis "Duration (us)=[10,25,50,75]"' in benchmark._commands[0])
+        assert ('--timeout 20' in benchmark._commands[0])
+        assert ('--min-samples 300' in benchmark._commands[0])
+        assert ('--stopping-criterion stdrel' in benchmark._commands[0])
+        assert ('--min-time 2.0' in benchmark._commands[0])
+        assert ('--max-noise 0.5' in benchmark._commands[0])
+        assert ('--throttle-threshold 80.0' in benchmark._commands[0])
+        assert ('--throttle-recovery-delay 1.0' in benchmark._commands[0])
+
+    @decorator.load_data('tests/data/nvbench_sleep_kernel.log')
+    def test_nvbench_sleep_kernel_result_parsing_real_output(self, results):
+        """Test NVBench Sleep Kernel benchmark result parsing."""
+        benchmark_name = 'nvbench-sleep-kernel'
+        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        benchmark = benchmark_class(benchmark_name, parameters='')
+
+        # Preprocess and validate command
+        assert benchmark._preprocess()
+
+        # Parse the provided raw output
+        assert benchmark._process_raw_result(0, results)
+        assert benchmark.return_code == ReturnCode.SUCCESS
+
+        # Validate parsed results
+        self.assertAlmostEqual(benchmark.result['duration_us_25_cpu_time'][0], 42.123)
+        self.assertAlmostEqual(benchmark.result['duration_us_25_gpu_time'][0], 25.321)
+        self.assertAlmostEqual(benchmark.result['duration_us_25_batch_gpu_time'][0], 23.456)
+
+    def test_nvbench_sleep_kernel_preprocess_duration_formats(self):
+        """Test NVBench Sleep Kernel preprocess with different duration formats."""
+        benchmark_name = 'nvbench-sleep-kernel'
+        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        # Test single value
+        benchmark = benchmark_class(benchmark_name, parameters='--duration_us "50"')
+        assert benchmark._preprocess()
+        assert '--axis "Duration (us)=50"' in benchmark._commands[0]
+
+        # Test list format
+        benchmark = benchmark_class(benchmark_name, parameters='--duration_us "[25,50,75]"')
+        assert benchmark._preprocess()
+        assert '--axis "Duration (us)=[25,50,75]"' in benchmark._commands[0]
+
+        # Test range format
+        benchmark = benchmark_class(benchmark_name, parameters='--duration_us "[25:75]"')
+        assert benchmark._preprocess()
+        assert '--axis "Duration (us)=[25:75]"' in benchmark._commands[0]
+
+        # Test range with step format
+        benchmark = benchmark_class(benchmark_name, parameters='--duration_us "[0:50:10]"')
+        assert benchmark._preprocess()
+        assert '--axis "Duration (us)=[0:50:10]"' in benchmark._commands[0]
+
+        # Test default format
+        benchmark = benchmark_class(benchmark_name, parameters='')
+        assert benchmark._preprocess()
+        assert '--axis "Duration (us)=[0,25,50,75,100]"' in benchmark._commands[0]
+
+    def test_nvbench_sleep_kernel_process_raw_result_invalid_output(self):
+        """Test NVBench Sleep Kernel benchmark result parsing with invalid output."""
+        benchmark_name = 'nvbench-sleep-kernel'
+        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        benchmark = benchmark_class(benchmark_name, parameters='')
+
+        # Preprocess and validate command
+        assert benchmark._preprocess()
+
+        # Mock raw output with invalid format
+        raw_output = 'Invalid output format'
+
+        # Parse the provided raw output
+        assert not benchmark._process_raw_result(0, raw_output)
+        assert benchmark.return_code == ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/data/nvbench_auto_throughput.log b/tests/data/nvbench_auto_throughput.log
new file mode 100644
index 000000000..1b87d2311
--- /dev/null
+++ b/tests/data/nvbench_auto_throughput.log
@@ -0,0 +1,48 @@
+# Devices
+
+## [0] `NVIDIA Test GPU`
+* SM Version: 100 (PTX Version: 100)
+* Number of SMs: 100
+* SM Default Clock Rate: 1000 MHz
+* Global Memory: 10000 MiB Free / 10000 MiB Total
+* Global Memory Bus Peak: 1000 GB/sec (1024-bit DDR @1000MHz)
+* Max Shared Memory: 64 KiB/SM, 48 KiB/Block
+* L2 Cache Size: 10000 KiB
+* Maximum Active Blocks: 32/SM
+* Maximum Active Threads: 2048/SM, 1024/Block
+* Available Registers: 65536/SM, 65536/Block
+* ECC Enabled: Yes
+
+# Log
+
+```
+Run:  [1/4] throughput_bench [Device=0 T=1 Stride=1 BlockSize=128]
+Pass: CUPTI: 0.10s total wall, 3x
+Pass: Cold: 0.100000ms GPU, 0.120000ms CPU, 0.50s total GPU, 0.70s total wall, 1000x
+Pass: Batch: 0.095000ms GPU, 0.50s total GPU, 0.50s total wall, 1000x
+Run:  [2/4] throughput_bench [Device=0 T=1 Stride=4 BlockSize=256]
+Pass: CUPTI: 0.10s total wall, 3x
+Pass: Cold: 0.110000ms GPU, 0.130000ms CPU, 0.50s total GPU, 0.70s total wall, 1000x
+Pass: Batch: 0.105000ms GPU, 0.50s total GPU, 0.50s total wall, 1000x
+Run:  [3/4] throughput_bench [Device=0 T=2 Stride=1 BlockSize=128]
+Pass: CUPTI: 0.10s total wall, 3x
+Pass: Cold: 0.150000ms GPU, 0.170000ms CPU, 0.50s total GPU, 0.70s total wall, 1000x
+Pass: Batch: 0.145000ms GPU, 0.50s total GPU, 0.50s total wall, 1000x
+Run:  [4/4] throughput_bench [Device=0 T=2 Stride=4 BlockSize=256]
+Pass: CUPTI: 0.10s total wall, 3x
+Pass: Cold: 0.200000ms GPU, 0.220000ms CPU, 0.50s total GPU, 0.70s total wall, 1000x
+Pass: Batch: 0.195000ms GPU, 0.50s total GPU, 0.50s total wall, 1000x
+```
+
+# Benchmark Results
+
+## throughput_bench
+
+### [0] NVIDIA Test GPU
+
+| T | Stride | BlockSize | Elements | HBWPeak | LoadEff | StoreEff | L1HitRate | L2HitRate | Samples | Samples |  CPU Time  | Noise |  GPU Time  | Noise |  Elem/s  | Samples | Batch GPU  |
+|---|--------|-----------|----------|---------|---------|----------|-----------|-----------|---------|---------|------------|-------|------------|-------|----------|---------|------------|
+| 1 |      1 |       128 | 33554432 |  20.00% | 100.00% |  100.00% |     0.00% |     0.05% |      3x |   1000x | 120.000 us | 1.00% | 100.000 us | 0.50% | 100.000G |   1000x |  95.000 us |
+| 1 |      4 |       256 | 33554432 |  25.00% |  25.00% |  100.00% |     0.00% |     0.30% |      3x |   1000x | 130.000 us | 1.00% | 110.000 us | 0.50% | 110.000G |   1000x | 105.000 us |
+| 2 |      1 |       128 | 33554432 |  40.00% |  50.00% |  100.00% |    30.00% |     1.50% |      3x |   1000x | 170.000 us | 1.00% | 150.000 us | 0.50% | 150.000G |   1000x | 145.000 us |
+| 2 |      4 |       256 | 33554432 |  80.00% |  12.50% |  100.00% |    30.00% |    10.00% |      3x |   1000x | 220.000 us | 1.00% | 200.000 us | 0.50% | 200.000G |   1000x | 195.000 us |
diff --git a/tests/data/nvbench_kernel_launch.log b/tests/data/nvbench_kernel_launch.log
new file mode 100644
index 000000000..95ccbc065
--- /dev/null
+++ b/tests/data/nvbench_kernel_launch.log
@@ -0,0 +1,34 @@
+# Devices
+
+## [0] `NVIDIA GPU`
+* SM Version: 900 (PTX Version: 900)
+* Number of SMs: 100
+* SM Default Clock Rate: 800 MHz
+* Global Memory: 100000 MiB Free / 100000 MiB Total
+* Global Memory Bus Peak: 3000 GB/sec (3000-bit DDR @4000MHz)
+* Max Shared Memory: 100 KiB/SM, 20 KiB/Block
+* L2 Cache Size: 1000 KiB
+* Maximum Active Blocks: 10/SM
+* Maximum Active Threads: 500/SM, 300/Block
+* Available Registers: 500/SM, 500/Block
+* ECC Enabled: Yes
+
+# Log
+
+```
+Run:  [1/1] launch_bench [Device=0]
+Warn: Current measurement timed out (20.00s) while over noise threshold (14.42% > 0.50%)
+Warn: Current measurement timed out (20.00s) before accumulating min_time (0.94s < 2.00s)
+Pass: Cold: 0.005878ms GPU, 0.022181ms CPU, 0.94s total GPU, 20.00s total wall, 120000x 
+Pass: Batch: 0.004024ms GPU, 2.00s total GPU, 2.00s total wall, 300000x
+```
+
+# Benchmark Results
+
+## launch_bench
+
+### [0] NVIDIA GPU
+
+| Samples | CPU Time  | Noise  | GPU Time | Noise  | Samples | Batch GPU |
+|---------|-----------|--------|----------|--------|---------|-----------|
+| 120000x | 24.222 us | 30.44% | 7.808 us | 14.42% | 300000x |  6.024 us |
\ No newline at end of file
diff --git a/tests/data/nvbench_sleep_kernel.log b/tests/data/nvbench_sleep_kernel.log
new file mode 100644
index 000000000..b2c4037da
--- /dev/null
+++ b/tests/data/nvbench_sleep_kernel.log
@@ -0,0 +1,43 @@
+# Devices
+
+## [0] `NVIDIA GPU`
+* SM Version: 900 (PTX Version: 900)
+* Number of SMs: 100
+* SM Default Clock Rate: 800 MHz
+* Global Memory: 100000 MiB Free / 100000 MiB Total
+* Global Memory Bus Peak: 3000 GB/sec (3000-bit DDR @4000MHz)
+* Max Shared Memory: 100 KiB/SM, 20 KiB/Block
+* L2 Cache Size: 1000 KiB
+* Maximum Active Blocks: 10/SM
+* Maximum Active Threads: 500/SM, 300/Block
+* Available Registers: 500/SM, 500/Block
+* ECC Enabled: Yes
+
+# Log
+
+```
+Run:  [1/3] sleep_benchmark [Device=0 Duration (us)=25]
+Warn: Current measurement timed out (1.00s) while over noise threshold (0.93% > 0.50%)
+Warn: Current measurement timed out (1.00s) before accumulating min_time (0.31s < 0.50s)
+Pass: Cold: 0.030374ms GPU, 0.047379ms CPU, 0.31s total GPU, 1.00s total wall, 10175x 
+Pass: Batch: 0.028658ms GPU, 0.50s total GPU, 0.50s total wall, 17448x
+Run:  [2/3] sleep_benchmark [Device=0 Duration (us)=50]
+Warn: Current measurement timed out (1.00s) before accumulating min_time (0.45s < 0.50s)
+Pass: Cold: 0.055036ms GPU, 0.072054ms CPU, 0.45s total GPU, 1.00s total wall, 8187x 
+Pass: Batch: 0.053246ms GPU, 0.50s total GPU, 0.50s total wall, 9403x
+Run:  [3/3] sleep_benchmark [Device=0 Duration (us)=75]
+Pass: Cold: 0.079643ms GPU, 0.096788ms CPU, 0.50s total GPU, 0.92s total wall, 6279x 
+Pass: Batch: 0.077862ms GPU, 0.51s total GPU, 0.51s total wall, 6547x
+```
+
+# Benchmark Results
+
+## sleep_benchmark
+
+### [0] NVIDIA GPU
+
+| Duration (us) | Samples | CPU Time  | Noise  | GPU Time  | Noise | Samples | Batch GPU |
+|---------------|---------|-----------|--------|-----------|-------|---------|-----------|
+|            25 |  10175x | 42.123 us | 69.78% | 25.321 us | 0.93% |  17448x | 23.456 us |
+|            50 |   8187x | 68.456 us |  2.34% | 50.654 us | 0.45% |   9403x | 49.321 us |
+|            75 |   6279x | 90.789 us |  1.85% | 75.987 us | 0.33% |   6547x | 77.862 us |
\ No newline at end of file
diff --git a/third_party/Makefile b/third_party/Makefile
index 0449b6e99..517078b59 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -200,7 +200,7 @@ endif
 cpu_hpl: sb_micro_path
 ifneq (,$(wildcard hpl-tests/Makefile))
 	cd ./hpl-tests && \
-    wget https://netlib.org/benchmark/hpl/hpl-2.3.tar.gz && \
+	wget https://netlib.org/benchmark/hpl/hpl-2.3.tar.gz && \
 	tar xzf hpl-2.3.tar.gz && \
 	cp Make.Linux_zen3 hpl-2.3 && \
 	cp Make.Linux_zen4 hpl-2.3 && \
@@ -216,7 +216,7 @@ endif
 cpu_stream: sb_micro_path
 ifneq (,$(wildcard stream-tests/Makefile))
 	cd ./stream-tests && \
-    wget https://www.cs.virginia.edu/stream/FTP/Code/stream.c && \
+	wget https://www.cs.virginia.edu/stream/FTP/Code/stream.c && \
 	make all
 	cp -v ./stream-tests/stream* $(SB_MICRO_PATH)/bin/
 endif
@@ -251,10 +251,10 @@ rocm_megatron_lm:
 	fi
 	cp Megatron/rocm/Megatron-LM/examples/deepseek_v2/pretrain_deepseek.py Megatron/rocm/Megatron-LM/
 	git clone https://github.com/caaatch22/grouped_gemm.git &&\
-    	cd grouped_gemm &&\
-    	git checkout 8a9b438 &&\
-    	git submodule update --init --recursive &&\
-    	pip install .
+		cd grouped_gemm &&\
+		git checkout 8a9b438 &&\
+		git submodule update --init --recursive &&\
+		pip install .
 
 # Instal apex of ROCm due to dependency of Megatron
 apex_rocm:
@@ -272,7 +272,7 @@ apex_rocm:
 	elif [ "$$(expr $(TORCH_MAJOR_VERSION) == 2)" -eq 1 ] && [ "$$(expr $(TORCH_MINOR_VERSION) == 0)" -eq 1 ]; then \
 		git checkout release/1.0.0 ; \
 	elif [ "$$(expr $(TORCH_MAJOR_VERSION) == 1)" -eq 1 ]; then \
-	    git checkout release/1.0.0 ; \
+		git checkout release/1.0.0 ; \
 	fi
 	pip install -v --disable-pip-version-check --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./apex
 
@@ -284,11 +284,11 @@ ifeq ($(shell echo $(CUDA_VER)">=12.9" | bc -l), 1)
 	git clone --single-branch --branch main https://github.com/Azure/msccl.git \
 	&& git -C msccl checkout 87048bd && git -C msccl submodule update --recursive --init
 else ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
-    # Get commit 87048bd from msscl to support updated nccl and sm_100
+	# Get commit 87048bd from msccl to support updated nccl and sm_100
 	$(eval ARCHS := 75 80 86 89 90 100)
 	if [ -d msccl ]; then rm -rf msccl; fi; \
 	git clone --single-branch --branch main https://github.com/Azure/msccl.git \
-    && git -C msccl checkout 87048bd && git -C msccl submodule update --recursive --init
+	&& git -C msccl checkout 87048bd && git -C msccl submodule update --recursive --init
 else ifeq ($(shell echo $(CUDA_VER)">=11.8" | bc -l), 1)
 	$(eval ARCHS := 70 75 80 86 89 90)
 else
@@ -320,3 +320,24 @@ endif
 nvbandwidth: sb_micro_path
 	cd ./nvbandwidth && git apply ../nvbandwidth.patch && cp ../nvbandwidth_testcases_patched.h ./testcases_patched.h && cmake . && make && cd ..
 	cp -v ./nvbandwidth/nvbandwidth $(SB_MICRO_PATH)/bin
+
+# Build nvbench
+cuda_nvbench: sb_micro_path
+ifeq ($(shell echo $(CUDA_VER)">=12.9" | bc -l), 1)
+	$(eval ARCHS := "100;103")
+else ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
+	$(eval ARCHS := "90;100")
+else ifeq ($(shell echo $(CUDA_VER)">=11.8" | bc -l), 1)
+	$(eval ARCHS := "70;75;80;86;89;90")
+else
+	$(eval ARCHS := "70;75;80;86")
+endif
+	cd ./nvbench && mkdir -p build && cd build && \
+	cmake										 \
+	-DCMAKE_INSTALL_PREFIX=$(SB_MICRO_PATH)	   \
+	-DCMAKE_CUDA_ARCHITECTURES=$(ARCHS)		   \
+	-DNVBench_ENABLE_CUPTI=ON					 \
+	-DCMAKE_BUILD_TYPE=Release					\
+	.. &&										 \
+	make -j $(NUM_MAKE_JOBS) &&				   \
+	make install
diff --git a/third_party/nvbench b/third_party/nvbench
new file mode 160000
index 000000000..7feda2cf3
--- /dev/null
+++ b/third_party/nvbench
@@ -0,0 +1 @@
+Subproject commit 7feda2cf3ade88b3e73a0e0414ba543a4fbfbc43