diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml index 5dde15e5e..d03d1a28e 100644 --- a/.github/workflows/build-image.yml +++ b/.github/workflows/build-image.yml @@ -38,6 +38,18 @@ jobs: platforms: linux/amd64 runner: [self-hosted, linux/amd64] build_args: "NUM_MAKE_JOBS=16" + - name: cuda12.9-arm64 + dockerfile: cuda12.9 + tags: superbench/main:cuda12.9-arm64 + platforms: linux/arm64 + runner: [self-hosted, linux/arm64] + build_args: "NUM_MAKE_JOBS=16" + - name: cuda12.9-amd64 + dockerfile: cuda12.9 + tags: superbench/main:cuda12.9-amd64 + platforms: linux/amd64 + runner: [self-hosted, linux/amd64] + build_args: "NUM_MAKE_JOBS=16" - name: cuda12.4 dockerfile: cuda12.4 tags: superbench/main:cuda12.4 diff --git a/dockerfile/cuda12.9.dockerfile b/dockerfile/cuda12.9.dockerfile new file mode 100644 index 000000000..d823b1746 --- /dev/null +++ b/dockerfile/cuda12.9.dockerfile @@ -0,0 +1,172 @@ +FROM nvcr.io/nvidia/pytorch:25.05-py3 + +# OS: +# - Ubuntu: 24.04 +# - OpenMPI: 4.1.7+ +# - Docker Client: 20.10.8 +# NVIDIA: +# - CUDA: 12.9.0.43 +# - cuDNN: 9.10.1.3 +# - cuBLAS: 12.9.0.13 +# - NCCL: v2.26.5 +# - TransformerEngine: v2.3 +# - torch: 2.8.0a0+5228986c39 +# Mellanox: +# - MOFED_VERSION: 24.10-1.1.4.0 +# - HPC-X: v2.23 +# Intel: +# - mlc: v3.11 + +LABEL maintainer="SuperBench" + +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + autoconf \ + automake \ + bc \ + build-essential \ + curl \ + dmidecode \ + ffmpeg \ + git \ + iproute2 \ + jq \ + libaio-dev \ + libavcodec-dev \ + libavformat-dev \ + libavutil-dev \ + libboost-program-options-dev \ + libcap2 \ + libcurl4-openssl-dev \ + libnuma-dev \ + libpci-dev \ + libswresample-dev \ + libncurses-dev \ + libtool \ + lshw \ + python3-mpi4py \ + net-tools \ + nlohmann-json3-dev \ + openssh-client \ + openssh-server \ + pciutils \ + sudo \ + util-linux \ + vim \ + wget \ + rsync \ + && \ + apt-get autoremove -y && \ + apt-get 
clean && \ + rm -rf /var/lib/apt/lists/* /tmp/* + +ARG NUM_MAKE_JOBS= +ARG TARGETPLATFORM +ARG TARGETARCH + +# Install Docker static binaries into /usr/local/bin (architecture taken from uname -m) +ENV DOCKER_VERSION=20.10.8 +RUN TARGETARCH_HW=$(uname -m) && \ + wget -q https://download.docker.com/linux/static/stable/${TARGETARCH_HW}/docker-${DOCKER_VERSION}.tgz -O docker.tgz && \ + tar --extract --file docker.tgz --strip-components 1 --directory /usr/local/bin/ && \ + rm docker.tgz + +# Update system config: allow root SSH login on port 22 and raise the nofile limits +RUN mkdir -p /root/.ssh && \ + touch /root/.ssh/authorized_keys && \ + mkdir -p /var/run/sshd && \ + sed -i "s/[# ]*PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \ + sed -i "s/[# ]*PermitUserEnvironment no/PermitUserEnvironment yes/" /etc/ssh/sshd_config && \ + sed -i "s/[# ]*Port.*/Port 22/" /etc/ssh/sshd_config && \ + echo "* soft nofile 1048576\n* hard nofile 1048576" >> /etc/security/limits.conf && \ + echo "root soft nofile 1048576\nroot hard nofile 1048576" >> /etc/security/limits.conf + +# Install OFED (user-space only, without firmware update), then remove the installer from /tmp +ENV OFED_VERSION=24.10-1.1.4.0 +RUN TARGETARCH_HW=$(uname -m) && \ + cd /tmp && \ + wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu24.04-${TARGETARCH_HW}.tgz && \ + tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu24.04-${TARGETARCH_HW}.tgz && \ + MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu24.04-${TARGETARCH_HW}/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \ + rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}* + +# Install HPC-X into /opt/hpcx, replacing any existing /opt/hpcx +ENV HPCX_VERSION=v2.23 +RUN TARGETARCH_HW=$(uname -m) && \ + cd /opt && \ + rm -rf hpcx && \ + wget https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-doca_ofed-ubuntu24.04-cuda12-${TARGETARCH_HW}.tbz -O hpcx.tbz && \ + tar xf hpcx.tbz && \ + mv hpcx-${HPCX_VERSION}-gcc-doca_ofed-ubuntu24.04-cuda12-${TARGETARCH_HW} hpcx && \ + rm hpcx.tbz + +# Installs specific to amd64 platform +RUN if [ "$TARGETARCH" = "amd64" ]; then \ + # Install Intel MLC + cd 
/tmp && \ + wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \ + tar xzf mlc.tgz Linux/mlc && \ + cp ./Linux/mlc /usr/local/bin/ && \ + rm -rf ./Linux mlc.tgz && \ + # Install AOCC compiler + wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \ + apt-get install -y ./aocc-compiler-4.0.0_1_amd64.deb && \ + rm -rf aocc-compiler-4.0.0_1_amd64.deb && \ + # Install AMD BLIS + wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \ + tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \ + mv amd-blis /opt/AMD && \ + rm -rf aocl-blis-linux-aocc-4.0.tar.gz; \ + else \ + echo "Skipping Intel MLC, AOCC and AMD Bliss installations for non-amd64 architecture: $TARGETARCH"; \ + fi + +# Install NCCL 2.26.3 (built from source for sm_80, sm_90 and sm_100) +RUN cd /tmp && \ + git clone -b v2.26.3-1 https://github.com/NVIDIA/nccl.git && \ + cd nccl && \ + make -j ${NUM_MAKE_JOBS} src.build \ + NVCC_GENCODE="-gencode=arch=compute_100,code=sm_100 \ + -gencode=arch=compute_90,code=sm_90 \ + -gencode=arch=compute_80,code=sm_80" && \ + make install && \ + rm -rf /tmp/nccl + +# Install UCX with multi-threading support; source tree and tarball are removed in the same layer +ENV UCX_VERSION=1.18.0 +RUN cd /tmp && \ + wget https://github.com/openucx/ucx/releases/download/v${UCX_VERSION}-rc1/ucx-${UCX_VERSION}.tar.gz && \ + tar xzf ucx-${UCX_VERSION}.tar.gz && \ + cd ucx-${UCX_VERSION} && \ + ./contrib/configure-release-mt --prefix=/usr/local && \ + make -j ${NUM_MAKE_JOBS} && \ + make install && rm -rf /tmp/ucx-${UCX_VERSION}* + +ENV PATH="${PATH}" \ + LD_LIBRARY_PATH="/usr/local/lib:/usr/local/mpi/lib:${LD_LIBRARY_PATH}" \ + SB_HOME=/opt/superbench \ + SB_MICRO_PATH=/opt/superbench \ + ANSIBLE_DEPRECATION_WARNINGS=FALSE \ + ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections + +RUN echo PATH="$PATH" > /etc/environment && \ + echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \ + echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment && \ + echo "source /opt/hpcx/hpcx-init.sh && hpcx_load" | tee -a /etc/bash.bashrc 
>> /etc/profile.d/10-hpcx.sh + +# Copy config files +COPY dockerfile/etc /opt/microsoft/ + +WORKDIR ${SB_HOME} + +COPY third_party third_party +RUN make -C third_party cuda_with_msccl + +COPY . . +RUN python3 -m pip install --no-cache-dir --upgrade setuptools==78.1.0 && \ + python3 -m pip install --no-cache-dir .[nvworker] && \ + make cppbuild && \ + make postinstall && \ + rm -rf .git diff --git a/docs/user-tutorial/benchmarks/micro-benchmarks.md b/docs/user-tutorial/benchmarks/micro-benchmarks.md index d9d4eaf9f..aa3aa965b 100644 --- a/docs/user-tutorial/benchmarks/micro-benchmarks.md +++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md @@ -232,6 +232,7 @@ performed by [Intel MLC Tool](https://www.intel.com/content/www/us/en/developer/ Measure the memory copy bandwidth across PCI-e and memory copy bandwidth between GPUs, performed by [NVIDIA](https://github.com/NVIDIA/cuda-samples/tree/master/Samples/1_Utilities/bandwidthTest) or [AMD](https://github.com/ROCm-Developer-Tools/HIP/tree/master/samples/1_Utils/hipBusBandwidth) bandwidth test tool. +The bandwidthTest sample was out-of-date and has been removed as of the CUDA Samples 12.9 release. For up-to-date bandwidth measurements, refer instead to the nvbandwidth benchmark. 
#### Metrics diff --git a/setup.py b/setup.py index 29688a5a5..c30db8db6 100644 --- a/setup.py +++ b/setup.py @@ -183,7 +183,7 @@ def run(self): 'openpyxl>=3.0.7', 'packaging>=21.0', 'pandas>=1.1.5', - 'protobuf<=3.20.3', + 'protobuf', 'pssh @ git+https://github.com/lilydjwg/pssh.git@v2.3.4', 'pyyaml>=5.3', 'requests>=2.27.1', @@ -224,11 +224,14 @@ def run(self): 'yapf==0.31.0', ], 'torch': [ - 'safetensors==0.4.5', - 'tokenizers<=0.20.3', + 'safetensors==0.4.5; python_version<"3.12"', + 'safetensors>=0.5.3; python_version>="3.12"', + 'tokenizers<=0.20.3; python_version<"3.12"', + 'tokenizers<0.22; python_version>="3.12"', 'torch>=1.7.0a0', 'torchvision>=0.8.0a0', - 'transformers>=4.28.0', + 'transformers>=4.28.0; python_version<"3.12"', + 'transformers==4.52.4; python_version>="3.12"', ], 'ort': [ 'onnx>=1.10.2', diff --git a/superbench/benchmarks/model_benchmarks/pytorch_base.py b/superbench/benchmarks/model_benchmarks/pytorch_base.py index f0cb52319..6bc3420ca 100644 --- a/superbench/benchmarks/model_benchmarks/pytorch_base.py +++ b/superbench/benchmarks/model_benchmarks/pytorch_base.py @@ -207,7 +207,10 @@ def _create_optimizer(self): elif self._optimizer_type == Optimizer.ADAM: self._optimizer = torch.optim.Adam(self._model.parameters(), lr=1e-5, betas=(0.9, 0.999), eps=1e-08) elif self._optimizer_type == Optimizer.ADAMW: - self._optimizer = transformers.AdamW(self._model.parameters(), lr=1e-5, betas=(0.9, 0.999), eps=1e-08) + if hasattr(torch.optim, 'AdamW'): + self._optimizer = torch.optim.AdamW(self._model.parameters(), lr=1e-5, betas=(0.9, 0.999), eps=1e-08) + else: + self._optimizer = transformers.AdamW(self._model.parameters(), lr=1e-5, betas=(0.9, 0.999), eps=1e-08) else: self._optimizer = None diff --git a/tests/benchmarks/model_benchmarks/test_pytorch_base.py b/tests/benchmarks/model_benchmarks/test_pytorch_base.py index 96e1718a0..b397f7c6b 100644 --- a/tests/benchmarks/model_benchmarks/test_pytorch_base.py +++ 
b/tests/benchmarks/model_benchmarks/test_pytorch_base.py @@ -222,7 +222,10 @@ def test_pytorch_base(): assert (benchmark._init_dataloader() is False) # Test _create_optimizer(). - assert (isinstance(benchmark._optimizer, transformers.AdamW)) + if hasattr(torch.optim, 'AdamW'): + assert (isinstance(benchmark._optimizer, torch.optim.AdamW)) + else: + assert (isinstance(benchmark._optimizer, transformers.AdamW)) benchmark._optimizer_type = Optimizer.ADAM assert (benchmark._create_optimizer() is True) assert (isinstance(benchmark._optimizer, torch.optim.Adam)) diff --git a/third_party/Makefile b/third_party/Makefile index 68b6174b6..a8360bb85 100755 --- a/third_party/Makefile +++ b/third_party/Makefile @@ -38,12 +38,12 @@ sb_micro_path: mkdir -p $(SB_MICRO_PATH)/lib # Build cutlass. -# for cuda 12.8 and later Build from commit 389e493 (3.8 release commit) for blackwell support +# for cuda 12.8 and later, build from the v3.9.2 release tag for blackwell support cuda_cutlass: ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1) - $(eval ARCHS := "75;80;86;89;90a;100;100a") + $(eval ARCHS := "90;100") if [ -d cutlass ]; then rm -rf cutlass; fi - git clone --single-branch --branch main https://github.com/NVIDIA/cutlass.git && cd cutlass && git checkout 389e493 + git clone --branch v3.9.2 --depth 1 https://github.com/NVIDIA/cutlass.git && cd cutlass else ifeq ($(shell echo $(CUDA_VER)">=11.8" | bc -l), 1) $(eval ARCHS := "70;75;80;86;89;90") else @@ -51,19 +51,39 @@ else endif ifneq (,$(wildcard cutlass/CMakeLists.txt)) - cmake -DCMAKE_INSTALL_BINDIR=$(SB_MICRO_PATH)/bin -DCMAKE_INSTALL_LIBDIR=$(SB_MICRO_PATH)/lib -DCMAKE_BUILD_TYPE=Release \ - -DCUTLASS_NVCC_ARCHS=$(ARCHS) -DCUTLASS_ENABLE_EXAMPLES=OFF -DCUTLASS_ENABLE_TESTS=OFF -S ./cutlass -B ./cutlass/build + cmake -DCMAKE_INSTALL_BINDIR=$(SB_MICRO_PATH)/bin \ + -DCMAKE_INSTALL_LIBDIR=$(SB_MICRO_PATH)/lib \ + -DCMAKE_BUILD_TYPE=Release \ + -DCUTLASS_NVCC_ARCHS=$(ARCHS) \ + -DCUTLASS_ENABLE_EXAMPLES=OFF \ + 
-DCUTLASS_ENABLE_TESTS=OFF \ + -S ./cutlass \ + -B ./cutlass/build \ + -DCUTLASS_LIBRARY_KERNELS="cutlass_simt_dgemm_128x128_8x2_*,\ +cutlass_simt_sgemm_128x128_8x2_*,\ +cutlass_simt_hgemm_256x128_8x2_*,\ +cutlass_tensorop_h884gemm_256x128_32x2_*,\ +cutlass_tensorop_d884gemm_128x128_16x3_*,\ +cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3_*,\ +cutlass_tensorop_bf16_s16816gemm_bf16_256x128_32x3_*,\ +cutlass_tensorop_h16816gemm_256x128_32x3_*,\ +cutlass_tensorop_s8_i16832gemm_s8_256x128_64x3_*,\ +cutlass_tensorop_s4_i16864gemm_s4_256x128_128x3_*" cmake --build ./cutlass/build -j $(shell nproc --ignore=2) --target install + rm -rf ./cutlass/build endif # Build cuda-samples/Samples/bandwidthTest. # cuda-samples is released together with CUDA, they have the exact same version. Like v10.0, v11.1 and so on. # The version we use is the released tag of cuda-samples which is consistent with the cuda version in the environment or docker. # The Makefile of bandwidthTest does not have 'install' target, so need to copy bin to $(SB_MICRO_PATH)/bin/ and create $(SB_MICRO_PATH)/bin/ if not existing. +# The bandwidthTest sample was out-of-date and has been removed as of the CUDA Samples 12.9 release (see the change log). For up-to-date bandwidth measurements, refer instead to the nvbandwidth utility. cuda_bandwidthTest: sb_micro_path if [ -d cuda-samples ]; then rm -rf cuda-samples; fi git clone --depth 1 -b v$(CUDA_VER) https://github.com/NVIDIA/cuda-samples.git -ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1) +ifeq ($(shell echo $(CUDA_VER)">=12.9" | bc -l), 1) + @echo "Skip cuda-samples build for CUDA>=12.9" +else ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1) $(eval TEST_PATH := "./cuda-samples/Samples/1_Utilities/bandwidthTest") cd ./$(TEST_PATH) && mkdir build && cd build && cmake .. 
&& make cp -v ./$(TEST_PATH)/build/bandwidthTest $(SB_MICRO_PATH)/bin/ diff --git a/third_party/Megatron/requirements.txt b/third_party/Megatron/requirements.txt index a64133a00..a41efd476 100644 --- a/third_party/Megatron/requirements.txt +++ b/third_party/Megatron/requirements.txt @@ -4,12 +4,14 @@ pybind11 regex six # versions from HF transformers -black==21.4b0 +black==21.4b0; python_version < '3.12' +black==25.1.0; python_version >= '3.12' isort>=5.5.4 tqdm sentencepiece wandb einops -typing_extensions==4.9.0 +typing_extensions==4.9.0; python_version < '3.12' +typing_extensions==4.12.2; python_version >= '3.12' apex mpi4py