diff --git a/superbench/benchmarks/model_benchmarks/pytorch_base.py b/superbench/benchmarks/model_benchmarks/pytorch_base.py index de06b35d0..f2428f2c6 100644 --- a/superbench/benchmarks/model_benchmarks/pytorch_base.py +++ b/superbench/benchmarks/model_benchmarks/pytorch_base.py @@ -591,8 +591,8 @@ def _benchmark(self): Run the benchmark then handle post-run model log save/compare. Set SB_ENABLE_PYTORCH_PROFILER='1' to enable profiling. """ - # Check if this is a Nvidia GPU - if not (torch.cuda.is_available() and torch.version.cuda is not None): + # Check if this is a Nvidia or AMD GPU + if not (torch.cuda.is_available() and (torch.version.cuda is not None or torch.version.hip is not None)): ok = super()._benchmark() self._post_run_model_log() return ok diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py index a5ac13cbb..c0b2345e5 100644 --- a/superbench/runner/runner.py +++ b/superbench/runner/runner.py @@ -7,6 +7,7 @@ import sys import json import random +import shlex import signal from pathlib import Path from pprint import pformat @@ -135,12 +136,25 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None): enable_nsys = os.environ.get('SB_ENABLE_NSYS', '') == '1' trace_dir = os.environ.get('SB_NSYS_TRACE_DIR', self._sb_output_dir) + # Enable rocprofv2 profiling based on environment variable + enable_rocprof = os.environ.get('SB_ENABLE_ROCPROF', '') == '1' + rocprof_trace_dir = os.environ.get('SB_ROCPROF_TRACE_DIR', self._sb_output_dir) + mode_command = exec_command if mode.name == 'local': - trace_command = ( - f'nsys profile --output {trace_dir}/{benchmark_name}_{mode.proc_rank}_traces ' - f'--backtrace none --sample none --force-overwrite true --cpuctxsw none --trace cuda,nvtx ' - ) if enable_nsys and mode.proc_rank == 0 else '' + trace_command = '' + if enable_nsys and mode.proc_rank == 0: + trace_output = shlex.quote(f'{trace_dir}/{benchmark_name}_{mode.proc_rank}_traces') + trace_command = ( + f'nsys profile --output {trace_output} ' + f'--backtrace none --sample none --force-overwrite true --cpuctxsw none --trace cuda,nvtx ' + ) + elif enable_rocprof and mode.proc_rank == 0: + trace_output = shlex.quote(f'{rocprof_trace_dir}/{benchmark_name}_{mode.proc_rank}_traces') + trace_command = ( + f'rocprofv2 --hip-trace --kernel-trace --plugin json ' + f'-d {trace_output} ' + ) # Build the command parts, only including trace if it's not empty command_parts = [] prefix = mode.prefix.format(proc_rank=mode.proc_rank, proc_num=mode.proc_num) @@ -159,23 +173,41 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None): '--nnodes=$NNODES --node_rank=$NODE_RANK --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT ' ) - nsys_prefix = ( - f'nsys profile --output {trace_dir}/{benchmark_name}_traces ' - f'--backtrace none --sample none --force-overwrite true --cpuctxsw none --trace cuda,nvtx ' - ) if enable_nsys else '' + trace_prefix = '' + if enable_nsys: + trace_output = shlex.quote(f'{trace_dir}/{benchmark_name}_traces') + trace_prefix = ( + f'nsys profile --output {trace_output} ' + f'--backtrace none --sample none --force-overwrite true --cpuctxsw none --trace cuda,nvtx ' + ) + elif enable_rocprof: + trace_output = shlex.quote(f'{rocprof_trace_dir}/{benchmark_name}_traces') + trace_prefix = ( + f'rocprofv2 --hip-trace --kernel-trace --plugin json ' + f'-d {trace_output} ' + ) mode_command = ( - f'{nsys_prefix}' + f'{trace_prefix}' f'torchrun' f' --no_python --nproc_per_node={mode.proc_num} {torch_dist_params}{exec_command}' f' superbench.benchmarks.{benchmark_name}.parameters.distributed_impl=ddp' f' superbench.benchmarks.{benchmark_name}.parameters.distributed_backend=nccl' ) elif mode.name == 'mpi': - trace_command = ( - f'nsys profile --output {trace_dir}/{benchmark_name}_{mode.proc_rank}_traces ' - f'--backtrace none --sample none --force-overwrite true --cpuctxsw none --trace cuda,nvtx ' - ) if enable_nsys else '' + trace_command = '' + if enable_nsys: + trace_output = shlex.quote(f'{trace_dir}/{benchmark_name}_{mode.proc_rank}_traces') + trace_command = ( + f'nsys profile --output {trace_output} ' + f'--backtrace none --sample none --force-overwrite true --cpuctxsw none --trace cuda,nvtx ' + ) + elif enable_rocprof: + trace_output = shlex.quote(f'{rocprof_trace_dir}/{benchmark_name}_{mode.proc_rank}_traces') + trace_command = ( + f'rocprofv2 --hip-trace --kernel-trace --plugin json ' + f'-d {trace_output} ' + ) mode_command = ( '{trace} ' 'mpirun ' # use default OpenMPI in image