Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions superbench/benchmarks/model_benchmarks/pytorch_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -591,8 +591,8 @@ def _benchmark(self):
Run the benchmark then handle post-run model log save/compare.
Set SB_ENABLE_PYTORCH_PROFILER='1' to enable profiling.
"""
# Check if this is a Nvidia GPU
if not (torch.cuda.is_available() and torch.version.cuda is not None):
# Check if this is a Nvidia or AMD GPU
if not (torch.cuda.is_available() and (torch.version.cuda is not None or torch.version.hip is not None)):
Comment thread
shcho marked this conversation as resolved.
ok = super()._benchmark()
self._post_run_model_log()
return ok
Expand Down
49 changes: 37 additions & 12 deletions superbench/runner/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,12 +135,23 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None):
enable_nsys = os.environ.get('SB_ENABLE_NSYS', '') == '1'
trace_dir = os.environ.get('SB_NSYS_TRACE_DIR', self._sb_output_dir)

# Enable rocprofv2 profiling based on environment variable
enable_rocprof = os.environ.get('SB_ENABLE_ROCPROF', '') == '1'
rocprof_trace_dir = os.environ.get('SB_ROCPROF_TRACE_DIR', self._sb_output_dir)
Comment thread
shcho marked this conversation as resolved.

mode_command = exec_command
if mode.name == 'local':
trace_command = (
f'nsys profile --output {trace_dir}/{benchmark_name}_{mode.proc_rank}_traces '
f'--backtrace none --sample none --force-overwrite true --cpuctxsw none --trace cuda,nvtx '
) if enable_nsys and mode.proc_rank == 0 else ''
trace_command = ''
if enable_nsys and mode.proc_rank == 0:
trace_command = (
f'nsys profile --output {trace_dir}/{benchmark_name}_{mode.proc_rank}_traces '
f'--backtrace none --sample none --force-overwrite true --cpuctxsw none --trace cuda,nvtx '
)
elif enable_rocprof and mode.proc_rank == 0:
trace_command = (
f'rocprofv2 --hip-trace --kernel-trace --plugin json '
f'-d {rocprof_trace_dir}/{benchmark_name}_{mode.proc_rank}_traces '
)
Comment thread
shcho marked this conversation as resolved.
Comment on lines +152 to +157
Comment on lines +145 to +157
# Build the command parts, only including trace if it's not empty
command_parts = []
prefix = mode.prefix.format(proc_rank=mode.proc_rank, proc_num=mode.proc_num)
Expand All @@ -159,10 +170,17 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None):
'--nnodes=$NNODES --node_rank=$NODE_RANK --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
)

nsys_prefix = (
f'nsys profile --output {trace_dir}/{benchmark_name}_traces '
f'--backtrace none --sample none --force-overwrite true --cpuctxsw none --trace cuda,nvtx '
) if enable_nsys else ''
nsys_prefix = ''
if enable_nsys:
nsys_prefix = (
f'nsys profile --output {trace_dir}/{benchmark_name}_traces '
f'--backtrace none --sample none --force-overwrite true --cpuctxsw none --trace cuda,nvtx '
)
elif enable_rocprof:
nsys_prefix = (
f'rocprofv2 --hip-trace --kernel-trace --plugin json '
f'-d {rocprof_trace_dir}/{benchmark_name}_traces '
Comment thread
shcho marked this conversation as resolved.
Outdated
)
Comment on lines +176 to +188

mode_command = (
f'{nsys_prefix}'
Comment thread
shcho marked this conversation as resolved.
Outdated
Expand All @@ -172,10 +190,17 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None):
f' superbench.benchmarks.{benchmark_name}.parameters.distributed_backend=nccl'
)
elif mode.name == 'mpi':
trace_command = (
f'nsys profile --output {trace_dir}/{benchmark_name}_{mode.proc_rank}_traces '
f'--backtrace none --sample none --force-overwrite true --cpuctxsw none --trace cuda,nvtx '
) if enable_nsys else ''
trace_command = ''
if enable_nsys:
trace_command = (
f'nsys profile --output {trace_dir}/{benchmark_name}_{mode.proc_rank}_traces '
f'--backtrace none --sample none --force-overwrite true --cpuctxsw none --trace cuda,nvtx '
)
elif enable_rocprof:
trace_command = (
f'rocprofv2 --hip-trace --kernel-trace --plugin json '
f'-d {rocprof_trace_dir}/{benchmark_name}_{mode.proc_rank}_traces '
)
mode_command = (
'{trace} '
'mpirun ' # use default OpenMPI in image
Expand Down
Loading