From 0a1c422bdf0b8e7c2206d84d869f02cd8a5ce088 Mon Sep 17 00:00:00 2001 From: Guoshuai Zhao Date: Tue, 27 May 2025 16:49:39 +0000 Subject: [PATCH 01/19] Add cuda12.9 docker image --- .github/workflows/build-image.yml | 12 +++ dockerfile/cuda12.9.dockerfile | 171 ++++++++++++++++++++++++++++++ 2 files changed, 183 insertions(+) create mode 100644 dockerfile/cuda12.9.dockerfile diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml index 5dde15e5e..d03d1a28e 100644 --- a/.github/workflows/build-image.yml +++ b/.github/workflows/build-image.yml @@ -38,6 +38,18 @@ jobs: platforms: linux/amd64 runner: [self-hosted, linux/amd64] build_args: "NUM_MAKE_JOBS=16" + - name: cuda12.9-arm64 + dockerfile: cuda12.9 + tags: superbench/main:cuda12.9-arm64 + platforms: linux/arm64 + runner: [self-hosted, linux/arm64] + build_args: "NUM_MAKE_JOBS=16" + - name: cuda12.9-amd64 + dockerfile: cuda12.9 + tags: superbench/main:cuda12.9-amd64 + platforms: linux/amd64 + runner: [self-hosted, linux/amd64] + build_args: "NUM_MAKE_JOBS=16" - name: cuda12.4 dockerfile: cuda12.4 tags: superbench/main:cuda12.4 diff --git a/dockerfile/cuda12.9.dockerfile b/dockerfile/cuda12.9.dockerfile new file mode 100644 index 000000000..8904264ee --- /dev/null +++ b/dockerfile/cuda12.9.dockerfile @@ -0,0 +1,171 @@ +FROM nvcr.io/nvidia/pytorch:25.04-py3 + +# OS: +# - Ubuntu: 24.04 +# - OpenMPI: 4.1.7+ +# - Docker Client: 20.10.8 +# NVIDIA: +# - CUDA: 12.9.0 +# - cuDNN: 9.9.0.52 +# - cuBLAS: 12.9.0.2 +# - NCCL: v2.26.3 +# - TransformerEngine 2.0 +# Mellanox: +# - MOFED_VERSION; 5.4-rdmacore39.0 +# - HPC-X: v2.21.0-CUDA12.x +# Intel: +# - mlc: v3.11 + +LABEL maintainer="SuperBench" + +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + autoconf \ + automake \ + bc \ + build-essential \ + curl \ + dmidecode \ + ffmpeg \ + git \ + iproute2 \ + jq \ + libaio-dev \ + libavcodec-dev \ + libavformat-dev \ + libavutil-dev \ + 
libboost-program-options-dev \ + libcap2 \ + libcurl4-openssl-dev \ + libnuma-dev \ + libpci-dev \ + libswresample-dev \ + libncurses-dev \ + libtool \ + lshw \ + python3-mpi4py \ + net-tools \ + nlohmann-json3-dev \ + openssh-client \ + openssh-server \ + pciutils \ + sudo \ + util-linux \ + vim \ + wget \ + rsync \ + && \ + apt-get autoremove && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* /tmp/* + +ARG NUM_MAKE_JOBS= +ARG TARGETPLATFORM +ARG TARGETARCH + +# Install Docker +ENV DOCKER_VERSION=20.10.8 +RUN TARGETARCH_HW=$(uname -m) && \ + wget -q https://download.docker.com/linux/static/stable/${TARGETARCH_HW}/docker-${DOCKER_VERSION}.tgz -O docker.tgz && \ + tar --extract --file docker.tgz --strip-components 1 --directory /usr/local/bin/ && \ + rm docker.tgz + +# Update system config +RUN mkdir -p /root/.ssh && \ + touch /root/.ssh/authorized_keys && \ + mkdir -p /var/run/sshd && \ + sed -i "s/[# ]*PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \ + sed -i "s/[# ]*PermitUserEnvironment no/PermitUserEnvironment yes/" /etc/ssh/sshd_config && \ + sed -i "s/[# ]*Port.*/Port 22/" /etc/ssh/sshd_config && \ + echo "* soft nofile 1048576\n* hard nofile 1048576" >> /etc/security/limits.conf && \ + echo "root soft nofile 1048576\nroot hard nofile 1048576" >> /etc/security/limits.conf + +# Install OFED +ENV OFED_VERSION=24.10-1.1.4.0 +RUN TARGETARCH_HW=$(uname -m) && \ + cd /tmp && \ + wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu24.04-${TARGETARCH_HW}.tgz && \ + tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu24.04-${TARGETARCH_HW}.tgz && \ + MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu24.04-${TARGETARCH_HW}/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \ + rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}* + +# Install HPC-X +ENV HPCX_VERSION=v2.21 +RUN TARGETARCH_HW=$(uname -m) && \ + cd /opt && \ + rm -rf hpcx && \ + wget 
https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-doca_ofed-ubuntu24.04-cuda12-${TARGETARCH_HW}.tbz -O hpcx.tbz && \ + tar xf hpcx.tbz && \ + mv hpcx-${HPCX_VERSION}-gcc-doca_ofed-ubuntu24.04-cuda12-${TARGETARCH_HW} hpcx && \ + rm hpcx.tbz + +# Installs specific to amd64 platform +RUN if [ "$TARGETARCH" = "amd64" ]; then \ + # Install Intel MLC + cd /tmp && \ + wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \ + tar xzf mlc.tgz Linux/mlc && \ + cp ./Linux/mlc /usr/local/bin/ && \ + rm -rf ./Linux mlc.tgz && \ + # Install AOCC compiler + wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \ + apt install -y ./aocc-compiler-4.0.0_1_amd64.deb && \ + rm -rf aocc-compiler-4.0.0_1_amd64.deb && \ + # Install AMD BLIS + wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \ + tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \ + mv amd-blis /opt/AMD && \ + rm -rf aocl-blis-linux-aocc-4.0.tar.gz; \ + else \ + echo "Skipping Intel MLC, AOCC and AMD Bliss installations for non-amd64 architecture: $TARGETARCH"; \ + fi + +# Install NCCL 2.26.3 +RUN cd /tmp && \ + git clone -b v2.26.3-1 https://github.com/NVIDIA/nccl.git && \ + cd nccl && \ + make -j ${NUM_MAKE_JOBS} src.build \ + NVCC_GENCODE="-gencode=arch=compute_100,code=sm_100 \ + -gencode=arch=compute_90,code=sm_90 \ + -gencode=arch=compute_80,code=sm_80" && \ + make install && \ + rm -rf /tmp/nccl + +# Install UCX with multi-threading support +ENV UCX_VERSION=1.18.0 +RUN cd /tmp && \ + wget https://github.com/openucx/ucx/releases/download/v${UCX_VERSION}-rc1/ucx-${UCX_VERSION}.tar.gz && \ + tar xzf ucx-${UCX_VERSION}.tar.gz && \ + cd ucx-${UCX_VERSION} && \ + ./contrib/configure-release-mt --prefix=/usr/local && \ + make -j ${NUM_MAKE_JOBS} && \ + make install + +ENV PATH="${PATH}" \ + LD_LIBRARY_PATH="/usr/local/lib:/usr/local/mpi/lib:${LD_LIBRARY_PATH}" \ + SB_HOME=/opt/superbench \ + 
SB_MICRO_PATH=/opt/superbench \ + ANSIBLE_DEPRECATION_WARNINGS=FALSE \ + ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections + +RUN echo PATH="$PATH" > /etc/environment && \ + echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \ + echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment && \ + echo "source /opt/hpcx/hpcx-init.sh && hpcx_load" | tee -a /etc/bash.bashrc >> /etc/profile.d/10-hpcx.sh + +# Add config files +ADD dockerfile/etc /opt/microsoft/ + +WORKDIR ${SB_HOME} + +ADD third_party third_party +RUN make -C third_party cuda_with_msccl + +ADD . . +RUN python3 -m pip install --upgrade setuptools==70.3.0 && \ + python3 -m pip install --no-cache-dir .[nvworker] && \ + make cppbuild && \ + make postinstall && \ + rm -rf .git From 2b9d586026e04c22a4bde1ed61d36d7e28d5f931 Mon Sep 17 00:00:00 2001 From: Guoshuai Zhao Date: Wed, 28 May 2025 04:11:37 +0000 Subject: [PATCH 02/19] skip bandwidthTest for cuda>=12.9 --- docs/user-tutorial/benchmarks/micro-benchmarks.md | 1 + third_party/Makefile | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/user-tutorial/benchmarks/micro-benchmarks.md b/docs/user-tutorial/benchmarks/micro-benchmarks.md index 77f5de85b..c57ab3ea8 100644 --- a/docs/user-tutorial/benchmarks/micro-benchmarks.md +++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md @@ -232,6 +232,7 @@ performed by [Intel MLC Tool](https://www.intel.com/content/www/us/en/developer/ Measure the memory copy bandwidth across PCI-e and memory copy bandwidth between GPUs, performed by [NVIDIA](https://github.com/NVIDIA/cuda-samples/tree/master/Samples/1_Utilities/bandwidthTest) or [AMD](https://github.com/ROCm-Developer-Tools/HIP/tree/master/samples/1_Utils/hipBusBandwidth) bandwidth test tool. +The bandwidthTest sample was out-of-date and has been removed as of the CUDA Samples 12.9 release. For up-to-date bandwidth measurements, refer instead to the nvbandwidth(#nvbandwidth) benchmark. 
#### Metrics diff --git a/third_party/Makefile b/third_party/Makefile index e8149afbe..fe9336c75 100755 --- a/third_party/Makefile +++ b/third_party/Makefile @@ -60,10 +60,13 @@ endif # cuda-samples is released together with CUDA, they have the exact same version. Like v10.0, v11.1 and so on. # The version we use is the released tag of cuda-samples which is consistent with the cuda version in the environment or docker. # The Makefile of bandwidthTest does not have 'install' target, so need to copy bin to $(SB_MICRO_PATH)/bin/ and create $(SB_MICRO_PATH)/bin/ if not existing. +# The bandwidthTest sample was out-of-date and has been removed as of the CUDA Samples 12.9 release (see the change log). For up-to-date bandwidth measurements, refer instead to the NVBandwidth utility. cuda_bandwidthTest: sb_micro_path if [ -d cuda-samples ]; then rm -rf cuda-samples; fi git clone --depth 1 -b v$(CUDA_VER) https://github.com/NVIDIA/cuda-samples.git -ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1) +ifeq ($(shell echo $(CUDA_VER)">=12.9" | bc -l), 1) + @echo "Skip cuda-samples build for CUDA>=12.9" +else ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1) $(eval TEST_PATH := "./cuda-samples/Samples/1_Utilities/bandwidthTest") cd ./$(TEST_PATH) && mkdir build && cd build && cmake .. 
&& make cp -v ./$(TEST_PATH)/build/bandwidthTest $(SB_MICRO_PATH)/bin/ From 8b98e6c804e1feebdb104aa7322196caed661196 Mon Sep 17 00:00:00 2001 From: Guoshuai Zhao Date: Wed, 28 May 2025 04:16:19 +0000 Subject: [PATCH 03/19] skip bandwidthTest for cuda>=12.9 --- docs/user-tutorial/benchmarks/micro-benchmarks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user-tutorial/benchmarks/micro-benchmarks.md b/docs/user-tutorial/benchmarks/micro-benchmarks.md index c57ab3ea8..dcc810daa 100644 --- a/docs/user-tutorial/benchmarks/micro-benchmarks.md +++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md @@ -232,7 +232,7 @@ performed by [Intel MLC Tool](https://www.intel.com/content/www/us/en/developer/ Measure the memory copy bandwidth across PCI-e and memory copy bandwidth between GPUs, performed by [NVIDIA](https://github.com/NVIDIA/cuda-samples/tree/master/Samples/1_Utilities/bandwidthTest) or [AMD](https://github.com/ROCm-Developer-Tools/HIP/tree/master/samples/1_Utils/hipBusBandwidth) bandwidth test tool. -The bandwidthTest sample was out-of-date and has been removed as of the CUDA Samples 12.9 release. For up-to-date bandwidth measurements, refer instead to the nvbandwidth(#nvbandwidth) benchmark. +The bandwidthTest sample was out-of-date and has been removed as of the CUDA Samples 12.9 release. For up-to-date bandwidth measurements, refer instead to the nvbandwidth benchmark. 
#### Metrics From 4eb58ae6ce1e679f0b46d2b150c4dcd046131a99 Mon Sep 17 00:00:00 2001 From: Guoshuai Zhao Date: Wed, 28 May 2025 04:49:27 +0000 Subject: [PATCH 04/19] format --- third_party/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/Makefile b/third_party/Makefile index fe9336c75..029d5bb44 100755 --- a/third_party/Makefile +++ b/third_party/Makefile @@ -65,7 +65,7 @@ cuda_bandwidthTest: sb_micro_path if [ -d cuda-samples ]; then rm -rf cuda-samples; fi git clone --depth 1 -b v$(CUDA_VER) https://github.com/NVIDIA/cuda-samples.git ifeq ($(shell echo $(CUDA_VER)">=12.9" | bc -l), 1) - @echo "Skip cuda-samples build for CUDA>=12.9" + @echo "Skip cuda-samples build for CUDA>=12.9" else ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1) $(eval TEST_PATH := "./cuda-samples/Samples/1_Utilities/bandwidthTest") cd ./$(TEST_PATH) && mkdir build && cd build && cmake .. && make From a4b6e8fc2f7d641f05cd43135ec41eca7cb49f95 Mon Sep 17 00:00:00 2001 From: Guoshuai Zhao Date: Thu, 29 May 2025 00:11:53 +0000 Subject: [PATCH 05/19] fix package incompatibility issue. --- dockerfile/cuda12.9.dockerfile | 2 +- setup.py | 4 ++-- third_party/Megatron/requirements.txt | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dockerfile/cuda12.9.dockerfile b/dockerfile/cuda12.9.dockerfile index 8904264ee..466ea4acd 100644 --- a/dockerfile/cuda12.9.dockerfile +++ b/dockerfile/cuda12.9.dockerfile @@ -164,7 +164,7 @@ ADD third_party third_party RUN make -C third_party cuda_with_msccl ADD . . 
-RUN python3 -m pip install --upgrade setuptools==70.3.0 && \ +RUN python3 -m pip install --upgrade setuptools==78.1.0 && \ python3 -m pip install --no-cache-dir .[nvworker] && \ make cppbuild && \ make postinstall && \ diff --git a/setup.py b/setup.py index 29688a5a5..79daaee64 100644 --- a/setup.py +++ b/setup.py @@ -183,7 +183,7 @@ def run(self): 'openpyxl>=3.0.7', 'packaging>=21.0', 'pandas>=1.1.5', - 'protobuf<=3.20.3', + 'protobuf', 'pssh @ git+https://github.com/lilydjwg/pssh.git@v2.3.4', 'pyyaml>=5.3', 'requests>=2.27.1', @@ -224,7 +224,7 @@ def run(self): 'yapf==0.31.0', ], 'torch': [ - 'safetensors==0.4.5', + 'safetensors==0.5.3', 'tokenizers<=0.20.3', 'torch>=1.7.0a0', 'torchvision>=0.8.0a0', diff --git a/third_party/Megatron/requirements.txt b/third_party/Megatron/requirements.txt index a64133a00..f9a14bcf5 100644 --- a/third_party/Megatron/requirements.txt +++ b/third_party/Megatron/requirements.txt @@ -4,12 +4,12 @@ pybind11 regex six # versions from HF transformers -black==21.4b0 +black isort>=5.5.4 tqdm sentencepiece wandb einops -typing_extensions==4.9.0 +typing_extensions apex mpi4py From d655533296418b1061ae84e93167db30d8b13f87 Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Thu, 29 May 2025 18:31:13 +0000 Subject: [PATCH 06/19] Fix lib dependencies. 
--- setup.py | 3 ++- third_party/Megatron/requirements.txt | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 79daaee64..05c82862a 100644 --- a/setup.py +++ b/setup.py @@ -224,7 +224,8 @@ def run(self): 'yapf==0.31.0', ], 'torch': [ - 'safetensors==0.5.3', + 'safetensors==0.4.5; python_version<"3.12"', + 'safetensors==0.5.3; python_version>="3.12"', 'tokenizers<=0.20.3', 'torch>=1.7.0a0', 'torchvision>=0.8.0a0', diff --git a/third_party/Megatron/requirements.txt b/third_party/Megatron/requirements.txt index f9a14bcf5..21366eafd 100644 --- a/third_party/Megatron/requirements.txt +++ b/third_party/Megatron/requirements.txt @@ -4,12 +4,13 @@ pybind11 regex six # versions from HF transformers -black +black==25.1.0 isort>=5.5.4 tqdm sentencepiece wandb einops -typing_extensions +typing_extensions==4.9.0; python_version < '3.12' +typing_extensions==4.12.2; python_version >= '3.12' apex mpi4py From 376e3ae8250fb800f23b417e78c9f5ec996e29c3 Mon Sep 17 00:00:00 2001 From: Guoshuai Zhao Date: Thu, 29 May 2025 19:48:22 +0000 Subject: [PATCH 07/19] Fix lib dependencies. 
--- third_party/Megatron/requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/third_party/Megatron/requirements.txt b/third_party/Megatron/requirements.txt index 21366eafd..a41efd476 100644 --- a/third_party/Megatron/requirements.txt +++ b/third_party/Megatron/requirements.txt @@ -4,7 +4,8 @@ pybind11 regex six # versions from HF transformers -black==25.1.0 +black==21.4b0; python_version < '3.12' +black==25.1.0; python_version >= '3.12' isort>=5.5.4 tqdm sentencepiece From 3c41dbdd1312ab4f53a4a2d70ee495752d026ad9 Mon Sep 17 00:00:00 2001 From: Guoshuai Zhao Date: Thu, 29 May 2025 20:11:49 +0000 Subject: [PATCH 08/19] update transformer_engine version in comment --- dockerfile/cuda12.9.dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dockerfile/cuda12.9.dockerfile b/dockerfile/cuda12.9.dockerfile index 466ea4acd..d40fc745e 100644 --- a/dockerfile/cuda12.9.dockerfile +++ b/dockerfile/cuda12.9.dockerfile @@ -9,7 +9,7 @@ FROM nvcr.io/nvidia/pytorch:25.04-py3 # - cuDNN: 9.9.0.52 # - cuBLAS: 12.9.0.2 # - NCCL: v2.26.3 -# - TransformerEngine 2.0 +# - TransformerEngine 2.2.0 # Mellanox: # - MOFED_VERSION; 5.4-rdmacore39.0 # - HPC-X: v2.21.0-CUDA12.x From 2e0643028e8946011b15c9d474bbe8120f539ce9 Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Thu, 5 Jun 2025 22:14:21 +0000 Subject: [PATCH 09/19] Update cuda12.9 image dependencies. 
--- dockerfile/cuda12.9.dockerfile | 4 ++-- third_party/Makefile | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dockerfile/cuda12.9.dockerfile b/dockerfile/cuda12.9.dockerfile index d40fc745e..e94cffd42 100644 --- a/dockerfile/cuda12.9.dockerfile +++ b/dockerfile/cuda12.9.dockerfile @@ -1,4 +1,4 @@ -FROM nvcr.io/nvidia/pytorch:25.04-py3 +FROM nvcr.io/nvidia/pytorch:25.05-py3 # OS: # - Ubuntu: 24.04 @@ -92,7 +92,7 @@ RUN TARGETARCH_HW=$(uname -m) && \ rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}* # Install HPC-X -ENV HPCX_VERSION=v2.21 +ENV HPCX_VERSION=v2.23 RUN TARGETARCH_HW=$(uname -m) && \ cd /opt && \ rm -rf hpcx && \ diff --git a/third_party/Makefile b/third_party/Makefile index 029d5bb44..6575881c1 100755 --- a/third_party/Makefile +++ b/third_party/Makefile @@ -43,7 +43,7 @@ cuda_cutlass: ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1) $(eval ARCHS := "75;80;86;89;90a;100;100a") if [ -d cutlass ]; then rm -rf cutlass; fi - git clone --single-branch --branch main https://github.com/NVIDIA/cutlass.git && cd cutlass && git checkout 389e493 + git clone --branch v3.9.2 --depth 1 https://github.com/NVIDIA/cutlass.git && cd cutlass else ifeq ($(shell echo $(CUDA_VER)">=11.8" | bc -l), 1) $(eval ARCHS := "70;75;80;86;89;90") else From 67b07157460c9b8a3b68f41c3ff5c5cb11cdb55d Mon Sep 17 00:00:00 2001 From: Guoshuai Zhao Date: Thu, 5 Jun 2025 22:24:19 +0000 Subject: [PATCH 10/19] Update version information. 
--- dockerfile/cuda12.9.dockerfile | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/dockerfile/cuda12.9.dockerfile b/dockerfile/cuda12.9.dockerfile index e94cffd42..d823b1746 100644 --- a/dockerfile/cuda12.9.dockerfile +++ b/dockerfile/cuda12.9.dockerfile @@ -5,14 +5,15 @@ FROM nvcr.io/nvidia/pytorch:25.05-py3 # - OpenMPI: 4.1.7+ # - Docker Client: 20.10.8 # NVIDIA: -# - CUDA: 12.9.0 -# - cuDNN: 9.9.0.52 -# - cuBLAS: 12.9.0.2 -# - NCCL: v2.26.3 -# - TransformerEngine 2.2.0 +# - CUDA: 12.9.0.43 +# - cuDNN: 9.10.1.3 +# - cuBLAS: 12.9.0.13 +# - NCCL: v2.26.5 +# - TransformerEngine: v2.3 +# - torch: 2.8.0a0+5228986c39 # Mellanox: # - MOFED_VERSION; 5.4-rdmacore39.0 -# - HPC-X: v2.21.0-CUDA12.x +# - HPC-X: v2.23 # Intel: # - mlc: v3.11 From 97f74154340451da657c48ead6e7eddf2e9cc580 Mon Sep 17 00:00:00 2001 From: Guoshuai Zhao Date: Sun, 8 Jun 2025 23:04:24 +0000 Subject: [PATCH 11/19] clean arch for cutlass building. --- third_party/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/Makefile b/third_party/Makefile index 6575881c1..15356110d 100755 --- a/third_party/Makefile +++ b/third_party/Makefile @@ -41,7 +41,7 @@ sb_micro_path: # for cuda 12.8 and later Build from commit 389e493 (3.8 release commit) for blackwell support cuda_cutlass: ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1) - $(eval ARCHS := "75;80;86;89;90a;100;100a") + $(eval ARCHS := "80;90;90a;100;100a") if [ -d cutlass ]; then rm -rf cutlass; fi git clone --branch v3.9.2 --depth 1 https://github.com/NVIDIA/cutlass.git && cd cutlass else ifeq ($(shell echo $(CUDA_VER)">=11.8" | bc -l), 1) From a374fc336bd615b1ac33143d8d9d9a97fd171b59 Mon Sep 17 00:00:00 2001 From: Guoshuai Zhao Date: Sun, 8 Jun 2025 23:36:58 +0000 Subject: [PATCH 12/19] revise the comment. 
--- third_party/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/Makefile b/third_party/Makefile index 15356110d..9f7658a29 100755 --- a/third_party/Makefile +++ b/third_party/Makefile @@ -38,7 +38,7 @@ sb_micro_path: mkdir -p $(SB_MICRO_PATH)/lib # Build cutlass. -# for cuda 12.8 and later Build from commit 389e493 (3.8 release commit) for blackwell support +# for cuda 12.9 and later Build from commit v3.9 (3.9 release commit) for blackwell support cuda_cutlass: ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1) $(eval ARCHS := "80;90;90a;100;100a") From d3b4c29ac3addb406ea38eb0326fa9ee5cc5b950 Mon Sep 17 00:00:00 2001 From: Guoshuai Zhao Date: Mon, 9 Jun 2025 23:59:47 +0000 Subject: [PATCH 13/19] apply optimization from transformers --- setup.py | 3 ++- superbench/benchmarks/model_benchmarks/pytorch_base.py | 5 ++++- third_party/Makefile | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 05c82862a..bfbb05349 100644 --- a/setup.py +++ b/setup.py @@ -229,7 +229,8 @@ def run(self): 'tokenizers<=0.20.3', 'torch>=1.7.0a0', 'torchvision>=0.8.0a0', - 'transformers>=4.28.0', + 'transformers>=4.28.0; python_version<"3.12"', + 'transformers==4.52.4; python_version>="3.12"', ], 'ort': [ 'onnx>=1.10.2', diff --git a/superbench/benchmarks/model_benchmarks/pytorch_base.py b/superbench/benchmarks/model_benchmarks/pytorch_base.py index f0cb52319..a72545745 100644 --- a/superbench/benchmarks/model_benchmarks/pytorch_base.py +++ b/superbench/benchmarks/model_benchmarks/pytorch_base.py @@ -207,7 +207,10 @@ def _create_optimizer(self): elif self._optimizer_type == Optimizer.ADAM: self._optimizer = torch.optim.Adam(self._model.parameters(), lr=1e-5, betas=(0.9, 0.999), eps=1e-08) elif self._optimizer_type == Optimizer.ADAMW: - self._optimizer = transformers.AdamW(self._model.parameters(), lr=1e-5, betas=(0.9, 0.999), eps=1e-08) + if hasattr(torch.optim, "AdamW"): + self._optimizer = 
torch.optim.AdamW(self._model.parameters(), lr=1e-5, betas=(0.9, 0.999), eps=1e-08) + else: + self._optimizer = transformers.AdamW(self._model.parameters(), lr=1e-5, betas=(0.9, 0.999), eps=1e-08) else: self._optimizer = None diff --git a/third_party/Makefile b/third_party/Makefile index 9f7658a29..79f99acba 100755 --- a/third_party/Makefile +++ b/third_party/Makefile @@ -41,7 +41,7 @@ sb_micro_path: # for cuda 12.9 and later Build from commit v3.9 (3.9 release commit) for blackwell support cuda_cutlass: ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1) - $(eval ARCHS := "80;90;90a;100;100a") + $(eval ARCHS := "90;100") if [ -d cutlass ]; then rm -rf cutlass; fi git clone --branch v3.9.2 --depth 1 https://github.com/NVIDIA/cutlass.git && cd cutlass else ifeq ($(shell echo $(CUDA_VER)">=11.8" | bc -l), 1) From dce2057b8f70a46665ca1bcf5cd41265250bc345 Mon Sep 17 00:00:00 2001 From: Guoshuai Zhao Date: Tue, 10 Jun 2025 04:16:24 +0000 Subject: [PATCH 14/19] format --- setup.py | 3 ++- superbench/benchmarks/model_benchmarks/pytorch_base.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index bfbb05349..c1a060494 100644 --- a/setup.py +++ b/setup.py @@ -226,7 +226,8 @@ def run(self): 'torch': [ 'safetensors==0.4.5; python_version<"3.12"', 'safetensors==0.5.3; python_version>="3.12"', - 'tokenizers<=0.20.3', + 'tokenizers<=0.20.3; python_version<"3.12"', + 'tokenizers<0.22; python_version>="3.12"', 'torch>=1.7.0a0', 'torchvision>=0.8.0a0', 'transformers>=4.28.0; python_version<"3.12"', diff --git a/superbench/benchmarks/model_benchmarks/pytorch_base.py b/superbench/benchmarks/model_benchmarks/pytorch_base.py index a72545745..6bc3420ca 100644 --- a/superbench/benchmarks/model_benchmarks/pytorch_base.py +++ b/superbench/benchmarks/model_benchmarks/pytorch_base.py @@ -207,7 +207,7 @@ def _create_optimizer(self): elif self._optimizer_type == Optimizer.ADAM: self._optimizer = torch.optim.Adam(self._model.parameters(), lr=1e-5, 
betas=(0.9, 0.999), eps=1e-08) elif self._optimizer_type == Optimizer.ADAMW: - if hasattr(torch.optim, "AdamW"): + if hasattr(torch.optim, 'AdamW'): self._optimizer = torch.optim.AdamW(self._model.parameters(), lr=1e-5, betas=(0.9, 0.999), eps=1e-08) else: self._optimizer = transformers.AdamW(self._model.parameters(), lr=1e-5, betas=(0.9, 0.999), eps=1e-08) From 895a20cd4650587a6c1e4c3a1432d76637aa9032 Mon Sep 17 00:00:00 2001 From: Guoshuai Zhao Date: Tue, 10 Jun 2025 04:53:57 +0000 Subject: [PATCH 15/19] fix version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index c1a060494..c30db8db6 100644 --- a/setup.py +++ b/setup.py @@ -225,7 +225,7 @@ def run(self): ], 'torch': [ 'safetensors==0.4.5; python_version<"3.12"', - 'safetensors==0.5.3; python_version>="3.12"', + 'safetensors>=0.5.3; python_version>="3.12"', 'tokenizers<=0.20.3; python_version<"3.12"', 'tokenizers<0.22; python_version>="3.12"', 'torch>=1.7.0a0', From 9018840ff53102e60dda3a707191655de71a3e6a Mon Sep 17 00:00:00 2001 From: Guoshuai Zhao Date: Tue, 10 Jun 2025 16:31:10 +0000 Subject: [PATCH 16/19] fix ut --- tests/benchmarks/model_benchmarks/test_pytorch_base.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/benchmarks/model_benchmarks/test_pytorch_base.py b/tests/benchmarks/model_benchmarks/test_pytorch_base.py index 96e1718a0..f8322796c 100644 --- a/tests/benchmarks/model_benchmarks/test_pytorch_base.py +++ b/tests/benchmarks/model_benchmarks/test_pytorch_base.py @@ -222,7 +222,10 @@ def test_pytorch_base(): assert (benchmark._init_dataloader() is False) # Test _create_optimizer(). 
- assert (isinstance(benchmark._optimizer, transformers.AdamW)) + if hasattr(torch.optim, 'AdamW'): + assert (isinstance(benchmark._optimizer, torch.optim.AdamW)) + else + assert (isinstance(benchmark._optimizer, transformers.AdamW)) benchmark._optimizer_type = Optimizer.ADAM assert (benchmark._create_optimizer() is True) assert (isinstance(benchmark._optimizer, torch.optim.Adam)) From b21b3951a042cd4ad41a85470a4508812fe155d4 Mon Sep 17 00:00:00 2001 From: Guoshuai Zhao Date: Tue, 10 Jun 2025 16:40:02 +0000 Subject: [PATCH 17/19] fix ut --- tests/benchmarks/model_benchmarks/test_pytorch_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/benchmarks/model_benchmarks/test_pytorch_base.py b/tests/benchmarks/model_benchmarks/test_pytorch_base.py index f8322796c..b397f7c6b 100644 --- a/tests/benchmarks/model_benchmarks/test_pytorch_base.py +++ b/tests/benchmarks/model_benchmarks/test_pytorch_base.py @@ -224,7 +224,7 @@ def test_pytorch_base(): # Test _create_optimizer(). if hasattr(torch.optim, 'AdamW'): assert (isinstance(benchmark._optimizer, torch.optim.AdamW)) - else + else: assert (isinstance(benchmark._optimizer, transformers.AdamW)) benchmark._optimizer_type = Optimizer.ADAM assert (benchmark._create_optimizer() is True) From e1facefd73fadd9b73ee413a70b14b5627895760 Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Tue, 24 Jun 2025 22:21:37 +0000 Subject: [PATCH 18/19] Add kernel filter for building cutlass. 
--- third_party/Makefile | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/third_party/Makefile b/third_party/Makefile index 79f99acba..8490793ee 100755 --- a/third_party/Makefile +++ b/third_party/Makefile @@ -51,8 +51,24 @@ else endif ifneq (,$(wildcard cutlass/CMakeLists.txt)) - cmake -DCMAKE_INSTALL_BINDIR=$(SB_MICRO_PATH)/bin -DCMAKE_INSTALL_LIBDIR=$(SB_MICRO_PATH)/lib -DCMAKE_BUILD_TYPE=Release \ - -DCUTLASS_NVCC_ARCHS=$(ARCHS) -DCUTLASS_ENABLE_EXAMPLES=OFF -DCUTLASS_ENABLE_TESTS=OFF -S ./cutlass -B ./cutlass/build + cmake -DCMAKE_INSTALL_BINDIR=$(SB_MICRO_PATH)/bin \ + -DCMAKE_INSTALL_LIBDIR=$(SB_MICRO_PATH)/lib \ + -DCMAKE_BUILD_TYPE=Release \ + -DCUTLASS_NVCC_ARCHS=$(ARCHS) \ + -DCUTLASS_ENABLE_EXAMPLES=OFF \ + -DCUTLASS_ENABLE_TESTS=OFF \ + -S ./cutlass \ + -B ./cutlass/build \ + -DCUTLASS_LIBRARY_KERNELS="cutlass_simt_dgemm_128x128_8x2_*,\ +cutlass_simt_sgemm_128x128_8x2_*,\ +cutlass_simt_hgemm_256x128_8x2_*,\ +cutlass_tensorop_h884gemm_256x128_32x2_*,\ +cutlass_tensorop_d884gemm_128x128_16x3_*,\ +cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3_*,\ +cutlass_tensorop_bf16_s16816gemm_bf16_256x128_32x3_*,\ +cutlass_tensorop_h16816gemm_256x128_32x3_*,\ +cutlass_tensorop_s8_i16832gemm_s8_256x128_64x3_*,\ +cutlass_tensorop_s4_i16864gemm_s4_256x128_128x3_*" cmake --build ./cutlass/build -j $(shell nproc --ignore=2) --target install endif From fcbd6e0d40f9734f3c6e230b172c512c31fa4dbc Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Wed, 25 Jun 2025 03:43:45 +0000 Subject: [PATCH 19/19] Remove build dir. 
--- third_party/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/third_party/Makefile b/third_party/Makefile index 8490793ee..f3b2eeb82 100755 --- a/third_party/Makefile +++ b/third_party/Makefile @@ -70,6 +70,7 @@ cutlass_tensorop_h16816gemm_256x128_32x3_*,\ cutlass_tensorop_s8_i16832gemm_s8_256x128_64x3_*,\ cutlass_tensorop_s4_i16864gemm_s4_256x128_128x3_*" cmake --build ./cutlass/build -j $(shell nproc --ignore=2) --target install + rm -rf ./cutlass/build endif # Build cuda-samples/Samples/bandwidthTest.