Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .github/workflows/build-image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,18 @@ jobs:
platforms: linux/amd64
runner: [self-hosted, linux/amd64]
build_args: "NUM_MAKE_JOBS=16"
- name: cuda12.9-arm64
dockerfile: cuda12.9
tags: superbench/main:cuda12.9-arm64
platforms: linux/arm64
runner: [self-hosted, linux/arm64]
build_args: "NUM_MAKE_JOBS=16"
- name: cuda12.9-amd64
dockerfile: cuda12.9
tags: superbench/main:cuda12.9-amd64
platforms: linux/amd64
runner: [self-hosted, linux/amd64]
build_args: "NUM_MAKE_JOBS=16"
- name: cuda12.4
dockerfile: cuda12.4
tags: superbench/main:cuda12.4
Expand Down
171 changes: 171 additions & 0 deletions dockerfile/cuda12.9.dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
FROM nvcr.io/nvidia/pytorch:25.04-py3

# Component versions for this image. Entries for software installed by this
# file mirror the versions pinned below; the remaining entries describe the
# base image.
# OS:
# - Ubuntu: 24.04
# - OpenMPI: 4.1.7+
# - Docker Client: 20.10.8
# NVIDIA:
# - CUDA: 12.9.0
# - cuDNN: 9.9.0.52
# - cuBLAS: 12.9.0.2
# - NCCL: v2.26.3
# - TransformerEngine 2.2.0
# Mellanox:
# - MLNX_OFED: 24.10-1.1.4.0 (user-space only)
# - HPC-X: v2.21 (CUDA 12 build)
# - UCX: 1.18.0 (built from source with multi-threading support)
# Intel:
# - mlc: v3.11 (amd64 builds only)

LABEL maintainer="SuperBench"

# Keep apt/dpkg from prompting during the build. Because this is ENV (not ARG),
# the setting also persists into containers started from the image.
ENV DEBIAN_FRONTEND=noninteractive

# Install OS-level build tools, libraries and utilities.
# - Packages are listed alphabetically, one per line, to keep diffs small.
# - `autoremove -y` is required: without -y apt-get asks for confirmation and
#   aborts the (non-interactive) build whenever there is something to remove.
# - Package lists and temp files are removed in the same layer so they do not
#   end up in the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        autoconf \
        automake \
        bc \
        build-essential \
        curl \
        dmidecode \
        ffmpeg \
        git \
        iproute2 \
        jq \
        libaio-dev \
        libavcodec-dev \
        libavformat-dev \
        libavutil-dev \
        libboost-program-options-dev \
        libcap2 \
        libcurl4-openssl-dev \
        libncurses-dev \
        libnuma-dev \
        libpci-dev \
        libswresample-dev \
        libtool \
        lshw \
        net-tools \
        nlohmann-json3-dev \
        openssh-client \
        openssh-server \
        pciutils \
        python3-mpi4py \
        rsync \
        sudo \
        util-linux \
        vim \
        wget \
    && \
    apt-get autoremove -y && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* /tmp/*

# Job count handed to `make -j` in the NCCL and UCX builds below. Empty by
# default; override with --build-arg NUM_MAKE_JOBS=<n>.
ARG NUM_MAKE_JOBS=
# Target platform values made visible to this stage. TARGETARCH gates the
# amd64-only install step further down; TARGETPLATFORM is declared but not
# referenced elsewhere in this file.
ARG TARGETPLATFORM
ARG TARGETARCH

# Install Docker: fetch the static release tarball for the machine
# architecture reported by `uname -m` and unpack its contents straight into
# /usr/local/bin (the archive's top-level directory is stripped).
ENV DOCKER_VERSION=20.10.8
RUN docker_arch="$(uname -m)" && \
    wget -q -O docker.tgz "https://download.docker.com/linux/static/stable/${docker_arch}/docker-${DOCKER_VERSION}.tgz" && \
    tar -x -f docker.tgz --strip-components=1 -C /usr/local/bin/ && \
    rm docker.tgz

# Update system config:
# - prepare root's SSH directory and the sshd runtime directory;
# - allow root login and user environment files over SSH, listen on port 22;
# - raise the open-file limits for all users and for root.
# The Port pattern is anchored at line start so that only the `Port` directive
# is rewritten (an unanchored match also hits lines such as `#GatewayPorts no`).
# printf is used instead of `echo "...\n..."` because whether echo expands
# "\n" depends on which shell /bin/sh is; printf always emits one entry per line.
RUN mkdir -p /root/.ssh /var/run/sshd && \
    touch /root/.ssh/authorized_keys && \
    sed -i \
        -e "s/[# ]*PermitRootLogin prohibit-password/PermitRootLogin yes/" \
        -e "s/[# ]*PermitUserEnvironment no/PermitUserEnvironment yes/" \
        -e "s/^[# ]*Port[[:space:]].*/Port 22/" \
        /etc/ssh/sshd_config && \
    printf '%s\n' \
        "* soft nofile 1048576" \
        "* hard nofile 1048576" \
        "root soft nofile 1048576" \
        "root hard nofile 1048576" \
        >> /etc/security/limits.conf

# Install OFED: download the MLNX_OFED bundle matching this machine
# architecture, run its installer for user-space components only (no firmware
# update, no UCX-CUDA), then delete the downloaded bundle from /tmp.
ENV OFED_VERSION=24.10-1.1.4.0
RUN ofed_pkg="MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu24.04-$(uname -m)" && \
    cd /tmp && \
    wget -q "https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/${ofed_pkg}.tgz" && \
    tar xzf "${ofed_pkg}.tgz" && \
    "${ofed_pkg}/mlnxofedinstall" --user-space-only --without-fw-update --without-ucx-cuda --force --all && \
    rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}*

# Install HPC-X: replace any existing /opt/hpcx with the CUDA 12 / Ubuntu 24.04
# distribution for this machine architecture, unpacked under the fixed name
# /opt/hpcx so later steps can reference it without the version string.
ENV HPCX_VERSION=v2.21
RUN hpcx_dist="hpcx-${HPCX_VERSION}-gcc-doca_ofed-ubuntu24.04-cuda12-$(uname -m)" && \
    cd /opt && \
    rm -rf hpcx && \
    wget -O hpcx.tbz "https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/${hpcx_dist}.tbz" && \
    tar xf hpcx.tbz && \
    mv "${hpcx_dist}" hpcx && \
    rm hpcx.tbz

# Installs specific to amd64 platform: Intel MLC, the AMD AOCC compiler and
# AMD BLIS. The step runs only when TARGETARCH is exactly "amd64"; for any
# other value (including an unset TARGETARCH) it just prints a notice.
# apt-get is used rather than apt because apt's command-line interface is not
# guaranteed to be stable for scripted use.
RUN if [ "$TARGETARCH" = "amd64" ]; then \
    # Install Intel MLC
    cd /tmp && \
    wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \
    tar xzf mlc.tgz Linux/mlc && \
    cp ./Linux/mlc /usr/local/bin/ && \
    rm -rf ./Linux mlc.tgz && \
    # Install AOCC compiler
    wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \
    apt-get install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
    rm -rf aocc-compiler-4.0.0_1_amd64.deb && \
    # Install AMD BLIS
    wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \
    tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \
    mv amd-blis /opt/AMD && \
    rm -rf aocl-blis-linux-aocc-4.0.tar.gz; \
    else \
    echo "Skipping Intel MLC, AOCC and AMD Bliss installations for non-amd64 architecture: $TARGETARCH"; \
    fi

# Install NCCL 2.26.3: build tag v2.26.3-1 from source, generating code for the
# sm_100, sm_90 and sm_80 GPU architectures only, install it, and remove the
# checkout afterwards.
RUN git clone --branch v2.26.3-1 https://github.com/NVIDIA/nccl.git /tmp/nccl && \
    cd /tmp/nccl && \
    make -j ${NUM_MAKE_JOBS} src.build \
    NVCC_GENCODE="-gencode=arch=compute_100,code=sm_100 \
    -gencode=arch=compute_90,code=sm_90 \
    -gencode=arch=compute_80,code=sm_80" && \
    make install && \
    rm -rf /tmp/nccl

# Install UCX with multi-threading support: build the release tarball with the
# multi-threaded release configuration and install it under /usr/local.
# The source tree and tarball are deleted in this same layer so the build
# leftovers do not stay in the image.
# NOTE(review): the tarball is downloaded from the v${UCX_VERSION}-rc1 release
# tag — confirm that the release candidate (not the final release) is intended.
ENV UCX_VERSION=1.18.0
RUN cd /tmp && \
    wget https://github.com/openucx/ucx/releases/download/v${UCX_VERSION}-rc1/ucx-${UCX_VERSION}.tar.gz && \
    tar xzf ucx-${UCX_VERSION}.tar.gz && \
    cd ucx-${UCX_VERSION} && \
    ./contrib/configure-release-mt --prefix=/usr/local && \
    make -j ${NUM_MAKE_JOBS} && \
    make install && \
    cd /tmp && \
    rm -rf /tmp/ucx-${UCX_VERSION} /tmp/ucx-${UCX_VERSION}.tar.gz

# Runtime environment.
# Search paths: PATH is carried over unchanged; /usr/local/lib and the MPI
# library directory are prepended to the library search path.
ENV PATH="${PATH}" \
    LD_LIBRARY_PATH="/usr/local/lib:/usr/local/mpi/lib:${LD_LIBRARY_PATH}"
# SuperBench install root and micro-benchmark location.
ENV SB_HOME=/opt/superbench \
    SB_MICRO_PATH=/opt/superbench
# Ansible settings.
ENV ANSIBLE_DEPRECATION_WARNINGS=FALSE \
    ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections

# Write the build-time PATH, LD_LIBRARY_PATH and SB_MICRO_PATH values to
# /etc/environment (replacing its previous content), and append the HPC-X
# init/load command to both /etc/bash.bashrc and /etc/profile.d/10-hpcx.sh.
RUN { \
        echo PATH="$PATH"; \
        echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH"; \
        echo SB_MICRO_PATH="$SB_MICRO_PATH"; \
    } > /etc/environment && \
    echo "source /opt/hpcx/hpcx-init.sh && hpcx_load" | tee -a /etc/bash.bashrc >> /etc/profile.d/10-hpcx.sh

# Add config files.
# COPY is used instead of ADD throughout this section: every source is a plain
# path from the build context, so ADD's extra behaviours (remote URLs, archive
# auto-extraction) are not needed.
COPY dockerfile/etc /opt/microsoft/

WORKDIR ${SB_HOME}

# Build third-party benchmarks first, from a copy of third_party only, so this
# expensive layer is reused from cache when only the rest of the tree changes.
COPY third_party third_party
RUN make -C third_party cuda_with_msccl

# Copy the full source tree and install SuperBench with the nvworker extras.
# --no-cache-dir on both pip calls keeps pip's download cache out of the layer.
# Note: `rm -rf .git` tidies the working tree, but the data was already stored
# by the COPY layer above, so it does not shrink the image.
COPY . .
RUN python3 -m pip install --no-cache-dir --upgrade setuptools==78.1.0 && \
    python3 -m pip install --no-cache-dir .[nvworker] && \
    make cppbuild && \
    make postinstall && \
    rm -rf .git
1 change: 1 addition & 0 deletions docs/user-tutorial/benchmarks/micro-benchmarks.md
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,7 @@ performed by [Intel MLC Tool](https://www.intel.com/content/www/us/en/developer/
Measure the memory copy bandwidth across PCI-e and memory copy bandwidth between GPUs,
performed by [NVIDIA](https://github.com/NVIDIA/cuda-samples/tree/master/Samples/1_Utilities/bandwidthTest)
or [AMD](https://github.com/ROCm-Developer-Tools/HIP/tree/master/samples/1_Utils/hipBusBandwidth) bandwidth test tool.
The bandwidthTest sample was out-of-date and has been removed as of the CUDA Samples 12.9 release. For up-to-date bandwidth measurements, refer instead to the nvbandwidth benchmark.

#### Metrics

Expand Down
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ def run(self):
'openpyxl>=3.0.7',
'packaging>=21.0',
'pandas>=1.1.5',
'protobuf<=3.20.3',
'protobuf',
'pssh @ git+https://github.com/lilydjwg/pssh.git@v2.3.4',
'pyyaml>=5.3',
'requests>=2.27.1',
Expand Down Expand Up @@ -224,7 +224,8 @@ def run(self):
'yapf==0.31.0',
],
'torch': [
'safetensors==0.4.5',
'safetensors==0.4.5; python_version<"3.12"',
'safetensors==0.5.3; python_version>="3.12"',
'tokenizers<=0.20.3',
'torch>=1.7.0a0',
'torchvision>=0.8.0a0',
Expand Down
5 changes: 4 additions & 1 deletion third_party/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,13 @@ endif
# cuda-samples is released together with CUDA, they have the exact same version. Like v10.0, v11.1 and so on.
# The version we use is the released tag of cuda-samples which is consistent with the cuda version in the environment or docker.
# The Makefile of bandwidthTest does not have 'install' target, so need to copy bin to $(SB_MICRO_PATH)/bin/ and create $(SB_MICRO_PATH)/bin/ if not existing.
# The bandwidthTest sample was out-of-date and has been removed as of the CUDA Samples 12.9 release (see the change log). For up-to-date bandwidth measurements, refer instead to the NVBandwidth utility.
cuda_bandwidthTest: sb_micro_path
if [ -d cuda-samples ]; then rm -rf cuda-samples; fi
git clone --depth 1 -b v$(CUDA_VER) https://github.com/NVIDIA/cuda-samples.git
ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
ifeq ($(shell echo $(CUDA_VER)">=12.9" | bc -l), 1)
@echo "Skip cuda-samples build for CUDA>=12.9"
else ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
$(eval TEST_PATH := "./cuda-samples/Samples/1_Utilities/bandwidthTest")
cd ./$(TEST_PATH) && mkdir build && cd build && cmake .. && make
cp -v ./$(TEST_PATH)/build/bandwidthTest $(SB_MICRO_PATH)/bin/
Expand Down
6 changes: 4 additions & 2 deletions third_party/Megatron/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@ pybind11
regex
six
# versions from HF transformers
black==21.4b0
black==21.4b0; python_version < '3.12'
black==25.1.0; python_version >= '3.12'
isort>=5.5.4
tqdm
sentencepiece
wandb
einops
typing_extensions==4.9.0
typing_extensions==4.9.0; python_version < '3.12'
typing_extensions==4.12.2; python_version >= '3.12'
apex
mpi4py
Loading