From 0a1c422bdf0b8e7c2206d84d869f02cd8a5ce088 Mon Sep 17 00:00:00 2001
From: Guoshuai Zhao <microsoft@microsoft.com>
Date: Tue, 27 May 2025 16:49:39 +0000
Subject: [PATCH 01/19] Add cuda12.9 docker image

---
 .github/workflows/build-image.yml |  12 +++
 dockerfile/cuda12.9.dockerfile    | 171 ++++++++++++++++++++++++++++++
 2 files changed, 183 insertions(+)
 create mode 100644 dockerfile/cuda12.9.dockerfile

diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml
index 5dde15e5e..d03d1a28e 100644
--- a/.github/workflows/build-image.yml
+++ b/.github/workflows/build-image.yml
@@ -38,6 +38,18 @@ jobs:
           platforms: linux/amd64
           runner: [self-hosted, linux/amd64]
           build_args: "NUM_MAKE_JOBS=16"
+        - name: cuda12.9-arm64
+          dockerfile: cuda12.9
+          tags: superbench/main:cuda12.9-arm64
+          platforms: linux/arm64
+          runner: [self-hosted, linux/arm64]
+          build_args: "NUM_MAKE_JOBS=16"
+        - name: cuda12.9-amd64
+          dockerfile: cuda12.9
+          tags: superbench/main:cuda12.9-amd64
+          platforms: linux/amd64
+          runner: [self-hosted, linux/amd64]
+          build_args: "NUM_MAKE_JOBS=16"
         - name: cuda12.4
           dockerfile: cuda12.4
           tags: superbench/main:cuda12.4
diff --git a/dockerfile/cuda12.9.dockerfile b/dockerfile/cuda12.9.dockerfile
new file mode 100644
index 000000000..8904264ee
--- /dev/null
+++ b/dockerfile/cuda12.9.dockerfile
@@ -0,0 +1,171 @@
+FROM nvcr.io/nvidia/pytorch:25.04-py3
+
+# OS:
+#   - Ubuntu: 24.04
+#   - OpenMPI: 4.1.7+
+#   - Docker Client: 20.10.8
+# NVIDIA:
+#   - CUDA: 12.9.0
+#   - cuDNN: 9.9.0.52
+#   - cuBLAS: 12.9.0.2
+#   - NCCL: v2.26.3
+#   - TransformerEngine 2.0
+# Mellanox:
+#   - MOFED_VERSION; 5.4-rdmacore39.0
+#   - HPC-X: v2.21.0-CUDA12.x
+# Intel:
+#   - mlc: v3.11
+
+LABEL maintainer="SuperBench"
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install OS packages; apt metadata is purged in this same layer to keep the image small
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    autoconf \
+    automake \
+    bc \
+    build-essential \
+    curl \
+    dmidecode \
+    ffmpeg \
+    git \
+    iproute2 \
+    jq \
+    libaio-dev \
+    libavcodec-dev \
+    libavformat-dev \
+    libavutil-dev \
+    libboost-program-options-dev \
+    libcap2 \
+    libcurl4-openssl-dev \
+    libnuma-dev \
+    libpci-dev \
+    libswresample-dev \
+    libncurses-dev \
+    libtool \
+    lshw \
+    python3-mpi4py \
+    net-tools \
+    nlohmann-json3-dev \
+    openssh-client \
+    openssh-server \
+    pciutils \
+    sudo \
+    util-linux \
+    vim \
+    wget \
+    rsync && \
+    apt-get autoremove -y && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* /tmp/*
+
+ARG NUM_MAKE_JOBS=
+ARG TARGETPLATFORM
+ARG TARGETARCH
+
+# Install Docker
+ENV DOCKER_VERSION=20.10.8
+RUN TARGETARCH_HW=$(uname -m) && \
+    wget -q https://download.docker.com/linux/static/stable/${TARGETARCH_HW}/docker-${DOCKER_VERSION}.tgz -O docker.tgz && \
+    tar --extract --file docker.tgz --strip-components 1 --directory /usr/local/bin/ && \
+    rm docker.tgz
+
+# Update system config
+RUN mkdir -p /root/.ssh && \
+    touch /root/.ssh/authorized_keys && \
+    mkdir -p /var/run/sshd && \
+    sed -i "s/[# ]*PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \
+    sed -i "s/[# ]*PermitUserEnvironment no/PermitUserEnvironment yes/" /etc/ssh/sshd_config && \
+    sed -i "s/[# ]*Port.*/Port 22/" /etc/ssh/sshd_config && \
+    echo "* soft nofile 1048576\n* hard nofile 1048576" >> /etc/security/limits.conf && \
+    echo "root soft nofile 1048576\nroot hard nofile 1048576" >> /etc/security/limits.conf
+
+# Install OFED
+ENV OFED_VERSION=24.10-1.1.4.0
+RUN TARGETARCH_HW=$(uname -m) && \
+    cd /tmp && \
+    wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu24.04-${TARGETARCH_HW}.tgz && \
+    tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu24.04-${TARGETARCH_HW}.tgz && \
+    MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu24.04-${TARGETARCH_HW}/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \
+    rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}*
+
+# Install HPC-X
+ENV HPCX_VERSION=v2.21
+RUN TARGETARCH_HW=$(uname -m) && \
+    cd /opt && \
+    rm -rf hpcx && \
+    wget https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-doca_ofed-ubuntu24.04-cuda12-${TARGETARCH_HW}.tbz -O hpcx.tbz && \
+    tar xf hpcx.tbz && \
+    mv hpcx-${HPCX_VERSION}-gcc-doca_ofed-ubuntu24.04-cuda12-${TARGETARCH_HW} hpcx && \
+    rm hpcx.tbz
+
+# Installs specific to amd64 platform
+RUN if [ "$TARGETARCH" = "amd64" ]; then \
+    # Install Intel MLC
+    cd /tmp && \
+    wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \
+    tar xzf mlc.tgz Linux/mlc && \
+    cp ./Linux/mlc /usr/local/bin/ && \
+    rm -rf ./Linux mlc.tgz && \
+    # Install AOCC compiler
+    wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \
+    apt-get install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
+    rm -rf aocc-compiler-4.0.0_1_amd64.deb && \
+    # Install AMD BLIS
+    wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \
+    tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \
+    mv amd-blis /opt/AMD && \
+    rm -rf aocl-blis-linux-aocc-4.0.tar.gz; \
+    else \
+    echo "Skipping Intel MLC, AOCC and AMD Bliss installations for non-amd64 architecture: $TARGETARCH"; \
+    fi
+
+# Install NCCL 2.26.3
+RUN cd /tmp && \
+    git clone -b v2.26.3-1 https://github.com/NVIDIA/nccl.git && \
+    cd nccl && \
+    make -j ${NUM_MAKE_JOBS} src.build \
+    NVCC_GENCODE="-gencode=arch=compute_100,code=sm_100 \
+    -gencode=arch=compute_90,code=sm_90 \
+    -gencode=arch=compute_80,code=sm_80" && \
+    make install && \
+    rm -rf /tmp/nccl
+
+# Install UCX with multi-threading support (build tree removed in the same layer)
+ENV UCX_VERSION=1.18.0
+RUN cd /tmp && \
+    wget https://github.com/openucx/ucx/releases/download/v${UCX_VERSION}-rc1/ucx-${UCX_VERSION}.tar.gz && \
+    tar xzf ucx-${UCX_VERSION}.tar.gz && \
+    cd ucx-${UCX_VERSION} && \
+    ./contrib/configure-release-mt --prefix=/usr/local && \
+    make -j ${NUM_MAKE_JOBS} && make install && \
+    cd /tmp && rm -rf ucx-${UCX_VERSION} ucx-${UCX_VERSION}.tar.gz
+
+ENV PATH="${PATH}" \
+    LD_LIBRARY_PATH="/usr/local/lib:/usr/local/mpi/lib:${LD_LIBRARY_PATH}" \
+    SB_HOME=/opt/superbench \
+    SB_MICRO_PATH=/opt/superbench \
+    ANSIBLE_DEPRECATION_WARNINGS=FALSE \
+    ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections
+
+RUN echo PATH="$PATH" > /etc/environment && \
+    echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \
+    echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment && \
+    echo "source /opt/hpcx/hpcx-init.sh && hpcx_load" | tee -a /etc/bash.bashrc >> /etc/profile.d/10-hpcx.sh
+
+# Add config files
+ADD dockerfile/etc /opt/microsoft/
+
+WORKDIR ${SB_HOME}
+
+ADD third_party third_party
+RUN make -C third_party cuda_with_msccl
+
+ADD . .
+RUN python3 -m pip install --upgrade setuptools==70.3.0 && \
+    python3 -m pip install --no-cache-dir .[nvworker] && \
+    make cppbuild && \
+    make postinstall && \
+    rm -rf .git

From 2b9d586026e04c22a4bde1ed61d36d7e28d5f931 Mon Sep 17 00:00:00 2001
From: Guoshuai Zhao <microsoft@microsoft.com>
Date: Wed, 28 May 2025 04:11:37 +0000
Subject: [PATCH 02/19] skip bandwidthTest for cuda>=12.9

---
 docs/user-tutorial/benchmarks/micro-benchmarks.md | 1 +
 third_party/Makefile                              | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/docs/user-tutorial/benchmarks/micro-benchmarks.md b/docs/user-tutorial/benchmarks/micro-benchmarks.md
index 77f5de85b..c57ab3ea8 100644
--- a/docs/user-tutorial/benchmarks/micro-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md
@@ -232,6 +232,7 @@ performed by [Intel MLC Tool](https://www.intel.com/content/www/us/en/developer/
 Measure the memory copy bandwidth across PCI-e and memory copy bandwidth between GPUs,
 performed by [NVIDIA](https://github.com/NVIDIA/cuda-samples/tree/master/Samples/1_Utilities/bandwidthTest)
 or [AMD](https://github.com/ROCm-Developer-Tools/HIP/tree/master/samples/1_Utils/hipBusBandwidth) bandwidth test tool.
+The bandwidthTest sample was out-of-date and has been removed as of the CUDA Samples 12.9 release. For up-to-date bandwidth measurements, refer instead to the nvbandwidth(#nvbandwidth) benchmark.
 
 #### Metrics
 
diff --git a/third_party/Makefile b/third_party/Makefile
index e8149afbe..fe9336c75 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -60,10 +60,13 @@ endif
 # cuda-samples is released together with CUDA, they have the exact same version. Like v10.0, v11.1 and so on.
 # The version we use is the released tag of cuda-samples which is consistent with the cuda version in the environment or docker.
 # The Makefile of bandwidthTest does not have 'install' target, so need to copy bin to $(SB_MICRO_PATH)/bin/ and create $(SB_MICRO_PATH)/bin/ if not existing.
+# The bandwidthTest sample was out-of-date and has been removed as of the CUDA Samples 12.9 release (see the change log). For up-to-date bandwidth measurements, refer instead to the nvbandwidth utility.
 cuda_bandwidthTest: sb_micro_path
 	if [ -d cuda-samples ]; then rm -rf cuda-samples; fi
 	git clone --depth 1 -b v$(CUDA_VER) https://github.com/NVIDIA/cuda-samples.git
-ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
+ifeq ($(shell echo $(CUDA_VER)">=12.9" | bc -l), 1)
+        @echo "Skip cuda-samples build for CUDA>=12.9"
+else ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
 	$(eval TEST_PATH := "./cuda-samples/Samples/1_Utilities/bandwidthTest")
 	cd ./$(TEST_PATH) && mkdir build && cd build && cmake .. && make
 	cp -v ./$(TEST_PATH)/build/bandwidthTest $(SB_MICRO_PATH)/bin/

From 8b98e6c804e1feebdb104aa7322196caed661196 Mon Sep 17 00:00:00 2001
From: Guoshuai Zhao <microsoft@microsoft.com>
Date: Wed, 28 May 2025 04:16:19 +0000
Subject: [PATCH 03/19] skip bandwidthTest for cuda>=12.9

---
 docs/user-tutorial/benchmarks/micro-benchmarks.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/user-tutorial/benchmarks/micro-benchmarks.md b/docs/user-tutorial/benchmarks/micro-benchmarks.md
index c57ab3ea8..dcc810daa 100644
--- a/docs/user-tutorial/benchmarks/micro-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md
@@ -232,7 +232,7 @@ performed by [Intel MLC Tool](https://www.intel.com/content/www/us/en/developer/
 Measure the memory copy bandwidth across PCI-e and memory copy bandwidth between GPUs,
 performed by [NVIDIA](https://github.com/NVIDIA/cuda-samples/tree/master/Samples/1_Utilities/bandwidthTest)
 or [AMD](https://github.com/ROCm-Developer-Tools/HIP/tree/master/samples/1_Utils/hipBusBandwidth) bandwidth test tool.
-The bandwidthTest sample was out-of-date and has been removed as of the CUDA Samples 12.9 release. For up-to-date bandwidth measurements, refer instead to the nvbandwidth(#nvbandwidth) benchmark.
+The bandwidthTest sample was out-of-date and has been removed as of the CUDA Samples 12.9 release. For up-to-date bandwidth measurements, refer instead to the nvbandwidth benchmark.
 
 #### Metrics
 

From 4eb58ae6ce1e679f0b46d2b150c4dcd046131a99 Mon Sep 17 00:00:00 2001
From: Guoshuai Zhao <microsoft@microsoft.com>
Date: Wed, 28 May 2025 04:49:27 +0000
Subject: [PATCH 04/19] format

---
 third_party/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/Makefile b/third_party/Makefile
index fe9336c75..029d5bb44 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -65,7 +65,7 @@ cuda_bandwidthTest: sb_micro_path
 	if [ -d cuda-samples ]; then rm -rf cuda-samples; fi
 	git clone --depth 1 -b v$(CUDA_VER) https://github.com/NVIDIA/cuda-samples.git
 ifeq ($(shell echo $(CUDA_VER)">=12.9" | bc -l), 1)
-        @echo "Skip cuda-samples build for CUDA>=12.9"
+	@echo "Skip cuda-samples build for CUDA>=12.9"
 else ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
 	$(eval TEST_PATH := "./cuda-samples/Samples/1_Utilities/bandwidthTest")
 	cd ./$(TEST_PATH) && mkdir build && cd build && cmake .. && make

From a4b6e8fc2f7d641f05cd43135ec41eca7cb49f95 Mon Sep 17 00:00:00 2001
From: Guoshuai Zhao <microsoft@microsoft.com>
Date: Thu, 29 May 2025 00:11:53 +0000
Subject: [PATCH 05/19] fix package incompatibility issue.

---
 dockerfile/cuda12.9.dockerfile        | 2 +-
 setup.py                              | 4 ++--
 third_party/Megatron/requirements.txt | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/dockerfile/cuda12.9.dockerfile b/dockerfile/cuda12.9.dockerfile
index 8904264ee..466ea4acd 100644
--- a/dockerfile/cuda12.9.dockerfile
+++ b/dockerfile/cuda12.9.dockerfile
@@ -164,7 +164,7 @@ ADD third_party third_party
 RUN make -C third_party cuda_with_msccl
 
 ADD . .
-RUN python3 -m pip install --upgrade setuptools==70.3.0 && \
+RUN python3 -m pip install --upgrade setuptools==78.1.0 && \
     python3 -m pip install --no-cache-dir .[nvworker] && \
     make cppbuild && \
     make postinstall && \
diff --git a/setup.py b/setup.py
index 29688a5a5..79daaee64 100644
--- a/setup.py
+++ b/setup.py
@@ -183,7 +183,7 @@ def run(self):
         'openpyxl>=3.0.7',
         'packaging>=21.0',
         'pandas>=1.1.5',
-        'protobuf<=3.20.3',
+        'protobuf',
         'pssh @ git+https://github.com/lilydjwg/pssh.git@v2.3.4',
         'pyyaml>=5.3',
         'requests>=2.27.1',
@@ -224,7 +224,7 @@ def run(self):
                 'yapf==0.31.0',
             ],
             'torch': [
-                'safetensors==0.4.5',
+                'safetensors==0.5.3',
                 'tokenizers<=0.20.3',
                 'torch>=1.7.0a0',
                 'torchvision>=0.8.0a0',
diff --git a/third_party/Megatron/requirements.txt b/third_party/Megatron/requirements.txt
index a64133a00..f9a14bcf5 100644
--- a/third_party/Megatron/requirements.txt
+++ b/third_party/Megatron/requirements.txt
@@ -4,12 +4,12 @@ pybind11
 regex
 six
 # versions from HF transformers
-black==21.4b0
+black
 isort>=5.5.4
 tqdm
 sentencepiece
 wandb
 einops
-typing_extensions==4.9.0
+typing_extensions
 apex
 mpi4py

From d655533296418b1061ae84e93167db30d8b13f87 Mon Sep 17 00:00:00 2001
From: Hongtao Zhang <hongtaozhang@microsoft.com>
Date: Thu, 29 May 2025 18:31:13 +0000
Subject: [PATCH 06/19] Fix lib dependencies.

---
 setup.py                              | 3 ++-
 third_party/Megatron/requirements.txt | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index 79daaee64..05c82862a 100644
--- a/setup.py
+++ b/setup.py
@@ -224,7 +224,8 @@ def run(self):
                 'yapf==0.31.0',
             ],
             'torch': [
-                'safetensors==0.5.3',
+                'safetensors==0.4.5; python_version<"3.12"',
+                'safetensors==0.5.3; python_version>="3.12"',
                 'tokenizers<=0.20.3',
                 'torch>=1.7.0a0',
                 'torchvision>=0.8.0a0',
diff --git a/third_party/Megatron/requirements.txt b/third_party/Megatron/requirements.txt
index f9a14bcf5..21366eafd 100644
--- a/third_party/Megatron/requirements.txt
+++ b/third_party/Megatron/requirements.txt
@@ -4,12 +4,13 @@ pybind11
 regex
 six
 # versions from HF transformers
-black
+black==25.1.0
 isort>=5.5.4
 tqdm
 sentencepiece
 wandb
 einops
-typing_extensions
+typing_extensions==4.9.0; python_version < '3.12'
+typing_extensions==4.12.2; python_version >= '3.12'
 apex
 mpi4py

From 376e3ae8250fb800f23b417e78c9f5ec996e29c3 Mon Sep 17 00:00:00 2001
From: Guoshuai Zhao <microsoft@microsoft.com>
Date: Thu, 29 May 2025 19:48:22 +0000
Subject: [PATCH 07/19] Fix lib dependencies.

---
 third_party/Megatron/requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/third_party/Megatron/requirements.txt b/third_party/Megatron/requirements.txt
index 21366eafd..a41efd476 100644
--- a/third_party/Megatron/requirements.txt
+++ b/third_party/Megatron/requirements.txt
@@ -4,7 +4,8 @@ pybind11
 regex
 six
 # versions from HF transformers
-black==25.1.0
+black==21.4b0; python_version < '3.12'
+black==25.1.0; python_version >= '3.12'
 isort>=5.5.4
 tqdm
 sentencepiece

From 3c41dbdd1312ab4f53a4a2d70ee495752d026ad9 Mon Sep 17 00:00:00 2001
From: Guoshuai Zhao <microsoft@microsoft.com>
Date: Thu, 29 May 2025 20:11:49 +0000
Subject: [PATCH 08/19] update transformer_engine version in comment

---
 dockerfile/cuda12.9.dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dockerfile/cuda12.9.dockerfile b/dockerfile/cuda12.9.dockerfile
index 466ea4acd..d40fc745e 100644
--- a/dockerfile/cuda12.9.dockerfile
+++ b/dockerfile/cuda12.9.dockerfile
@@ -9,7 +9,7 @@ FROM nvcr.io/nvidia/pytorch:25.04-py3
 #   - cuDNN: 9.9.0.52
 #   - cuBLAS: 12.9.0.2
 #   - NCCL: v2.26.3
-#   - TransformerEngine 2.0
+#   - TransformerEngine 2.2.0
 # Mellanox:
 #   - MOFED_VERSION; 5.4-rdmacore39.0
 #   - HPC-X: v2.21.0-CUDA12.x

From 2e0643028e8946011b15c9d474bbe8120f539ce9 Mon Sep 17 00:00:00 2001
From: Hongtao Zhang <hongtaozhang@microsoft.com>
Date: Thu, 5 Jun 2025 22:14:21 +0000
Subject: [PATCH 09/19] Update cuda12.9 image dependencies.

---
 dockerfile/cuda12.9.dockerfile | 4 ++--
 third_party/Makefile           | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/dockerfile/cuda12.9.dockerfile b/dockerfile/cuda12.9.dockerfile
index d40fc745e..e94cffd42 100644
--- a/dockerfile/cuda12.9.dockerfile
+++ b/dockerfile/cuda12.9.dockerfile
@@ -1,4 +1,4 @@
-FROM nvcr.io/nvidia/pytorch:25.04-py3
+FROM nvcr.io/nvidia/pytorch:25.05-py3
 
 # OS:
 #   - Ubuntu: 24.04
@@ -92,7 +92,7 @@ RUN TARGETARCH_HW=$(uname -m) && \
     rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}*
 
 # Install HPC-X
-ENV HPCX_VERSION=v2.21
+ENV HPCX_VERSION=v2.23
 RUN TARGETARCH_HW=$(uname -m) && \
     cd /opt && \
     rm -rf hpcx && \
diff --git a/third_party/Makefile b/third_party/Makefile
index 029d5bb44..6575881c1 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -43,7 +43,7 @@ cuda_cutlass:
 ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
 	$(eval ARCHS := "75;80;86;89;90a;100;100a")
 	if [ -d cutlass ]; then rm -rf cutlass; fi
-	git clone --single-branch --branch main https://github.com/NVIDIA/cutlass.git && cd cutlass && git checkout 389e493
+	git clone --branch v3.9.2 --depth 1 https://github.com/NVIDIA/cutlass.git && cd cutlass
 else ifeq ($(shell echo $(CUDA_VER)">=11.8" | bc -l), 1)
 	$(eval ARCHS := "70;75;80;86;89;90")
 else

From 67b07157460c9b8a3b68f41c3ff5c5cb11cdb55d Mon Sep 17 00:00:00 2001
From: Guoshuai Zhao <microsoft@microsoft.com>
Date: Thu, 5 Jun 2025 22:24:19 +0000
Subject: [PATCH 10/19] Update version information.

---
 dockerfile/cuda12.9.dockerfile | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/dockerfile/cuda12.9.dockerfile b/dockerfile/cuda12.9.dockerfile
index e94cffd42..d823b1746 100644
--- a/dockerfile/cuda12.9.dockerfile
+++ b/dockerfile/cuda12.9.dockerfile
@@ -5,14 +5,15 @@ FROM nvcr.io/nvidia/pytorch:25.05-py3
 #   - OpenMPI: 4.1.7+
 #   - Docker Client: 20.10.8
 # NVIDIA:
-#   - CUDA: 12.9.0
-#   - cuDNN: 9.9.0.52
-#   - cuBLAS: 12.9.0.2
-#   - NCCL: v2.26.3
-#   - TransformerEngine 2.2.0
+#   - CUDA: 12.9.0.43
+#   - cuDNN: 9.10.1.3
+#   - cuBLAS: 12.9.0.13
+#   - NCCL: v2.26.5
+#   - TransformerEngine: v2.3
+#   - torch: 2.8.0a0+5228986c39
 # Mellanox:
 #   - MOFED_VERSION; 5.4-rdmacore39.0
-#   - HPC-X: v2.21.0-CUDA12.x
+#   - HPC-X: v2.23
 # Intel:
 #   - mlc: v3.11
 

From 97f74154340451da657c48ead6e7eddf2e9cc580 Mon Sep 17 00:00:00 2001
From: Guoshuai Zhao <microsoft@microsoft.com>
Date: Sun, 8 Jun 2025 23:04:24 +0000
Subject: [PATCH 11/19] clean arch for cutlass building.

---
 third_party/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/Makefile b/third_party/Makefile
index 6575881c1..15356110d 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -41,7 +41,7 @@ sb_micro_path:
 # for cuda 12.8 and later Build from commit 389e493 (3.8 release commit) for blackwell support
 cuda_cutlass:
 ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
-	$(eval ARCHS := "75;80;86;89;90a;100;100a")
+	$(eval ARCHS := "80;90;90a;100;100a")
 	if [ -d cutlass ]; then rm -rf cutlass; fi
 	git clone --branch v3.9.2 --depth 1 https://github.com/NVIDIA/cutlass.git && cd cutlass
 else ifeq ($(shell echo $(CUDA_VER)">=11.8" | bc -l), 1)

From a374fc336bd615b1ac33143d8d9d9a97fd171b59 Mon Sep 17 00:00:00 2001
From: Guoshuai Zhao <microsoft@microsoft.com>
Date: Sun, 8 Jun 2025 23:36:58 +0000
Subject: [PATCH 12/19] revise the comment.

---
 third_party/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/Makefile b/third_party/Makefile
index 15356110d..9f7658a29 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -38,7 +38,7 @@ sb_micro_path:
 	mkdir -p $(SB_MICRO_PATH)/lib
 
 # Build cutlass.
-# for cuda 12.8 and later Build from commit 389e493 (3.8 release commit) for blackwell support
+# for cuda 12.9 and later Build from commit v3.9 (3.9 release commit) for blackwell support
 cuda_cutlass:
 ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
 	$(eval ARCHS := "80;90;90a;100;100a")

From d3b4c29ac3addb406ea38eb0326fa9ee5cc5b950 Mon Sep 17 00:00:00 2001
From: Guoshuai Zhao <microsoft@microsoft.com>
Date: Mon, 9 Jun 2025 23:59:47 +0000
Subject: [PATCH 13/19] apply optimization from transformers

---
 setup.py                                               | 3 ++-
 superbench/benchmarks/model_benchmarks/pytorch_base.py | 5 ++++-
 third_party/Makefile                                   | 2 +-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index 05c82862a..bfbb05349 100644
--- a/setup.py
+++ b/setup.py
@@ -229,7 +229,8 @@ def run(self):
                 'tokenizers<=0.20.3',
                 'torch>=1.7.0a0',
                 'torchvision>=0.8.0a0',
-                'transformers>=4.28.0',
+                'transformers>=4.28.0; python_version<"3.12"',
+                'transformers==4.52.4; python_version>="3.12"',
             ],
             'ort': [
                 'onnx>=1.10.2',
diff --git a/superbench/benchmarks/model_benchmarks/pytorch_base.py b/superbench/benchmarks/model_benchmarks/pytorch_base.py
index f0cb52319..a72545745 100644
--- a/superbench/benchmarks/model_benchmarks/pytorch_base.py
+++ b/superbench/benchmarks/model_benchmarks/pytorch_base.py
@@ -207,7 +207,10 @@ def _create_optimizer(self):
         elif self._optimizer_type == Optimizer.ADAM:
             self._optimizer = torch.optim.Adam(self._model.parameters(), lr=1e-5, betas=(0.9, 0.999), eps=1e-08)
         elif self._optimizer_type == Optimizer.ADAMW:
-            self._optimizer = transformers.AdamW(self._model.parameters(), lr=1e-5, betas=(0.9, 0.999), eps=1e-08)
+            if hasattr(torch.optim, "AdamW"):
+                self._optimizer = torch.optim.AdamW(self._model.parameters(), lr=1e-5, betas=(0.9, 0.999), eps=1e-08)
+            else:
+                self._optimizer = transformers.AdamW(self._model.parameters(), lr=1e-5, betas=(0.9, 0.999), eps=1e-08)
         else:
             self._optimizer = None
 
diff --git a/third_party/Makefile b/third_party/Makefile
index 9f7658a29..79f99acba 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -41,7 +41,7 @@ sb_micro_path:
 # for cuda 12.9 and later Build from commit v3.9 (3.9 release commit) for blackwell support
 cuda_cutlass:
 ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
-	$(eval ARCHS := "80;90;90a;100;100a")
+	$(eval ARCHS := "90;100")
 	if [ -d cutlass ]; then rm -rf cutlass; fi
 	git clone --branch v3.9.2 --depth 1 https://github.com/NVIDIA/cutlass.git && cd cutlass
 else ifeq ($(shell echo $(CUDA_VER)">=11.8" | bc -l), 1)

From dce2057b8f70a46665ca1bcf5cd41265250bc345 Mon Sep 17 00:00:00 2001
From: Guoshuai Zhao <microsoft@microsoft.com>
Date: Tue, 10 Jun 2025 04:16:24 +0000
Subject: [PATCH 14/19] format

---
 setup.py                                               | 3 ++-
 superbench/benchmarks/model_benchmarks/pytorch_base.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index bfbb05349..c1a060494 100644
--- a/setup.py
+++ b/setup.py
@@ -226,7 +226,8 @@ def run(self):
             'torch': [
                 'safetensors==0.4.5; python_version<"3.12"',
                 'safetensors==0.5.3; python_version>="3.12"',
-                'tokenizers<=0.20.3',
+                'tokenizers<=0.20.3; python_version<"3.12"',
+                'tokenizers<0.22; python_version>="3.12"',
                 'torch>=1.7.0a0',
                 'torchvision>=0.8.0a0',
                 'transformers>=4.28.0; python_version<"3.12"',
diff --git a/superbench/benchmarks/model_benchmarks/pytorch_base.py b/superbench/benchmarks/model_benchmarks/pytorch_base.py
index a72545745..6bc3420ca 100644
--- a/superbench/benchmarks/model_benchmarks/pytorch_base.py
+++ b/superbench/benchmarks/model_benchmarks/pytorch_base.py
@@ -207,7 +207,7 @@ def _create_optimizer(self):
         elif self._optimizer_type == Optimizer.ADAM:
             self._optimizer = torch.optim.Adam(self._model.parameters(), lr=1e-5, betas=(0.9, 0.999), eps=1e-08)
         elif self._optimizer_type == Optimizer.ADAMW:
-            if hasattr(torch.optim, "AdamW"):
+            if hasattr(torch.optim, 'AdamW'):
                 self._optimizer = torch.optim.AdamW(self._model.parameters(), lr=1e-5, betas=(0.9, 0.999), eps=1e-08)
             else:
                 self._optimizer = transformers.AdamW(self._model.parameters(), lr=1e-5, betas=(0.9, 0.999), eps=1e-08)

From 895a20cd4650587a6c1e4c3a1432d76637aa9032 Mon Sep 17 00:00:00 2001
From: Guoshuai Zhao <microsoft@microsoft.com>
Date: Tue, 10 Jun 2025 04:53:57 +0000
Subject: [PATCH 15/19] fix version

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index c1a060494..c30db8db6 100644
--- a/setup.py
+++ b/setup.py
@@ -225,7 +225,7 @@ def run(self):
             ],
             'torch': [
                 'safetensors==0.4.5; python_version<"3.12"',
-                'safetensors==0.5.3; python_version>="3.12"',
+                'safetensors>=0.5.3; python_version>="3.12"',
                 'tokenizers<=0.20.3; python_version<"3.12"',
                 'tokenizers<0.22; python_version>="3.12"',
                 'torch>=1.7.0a0',

From 9018840ff53102e60dda3a707191655de71a3e6a Mon Sep 17 00:00:00 2001
From: Guoshuai Zhao <microsoft@microsoft.com>
Date: Tue, 10 Jun 2025 16:31:10 +0000
Subject: [PATCH 16/19] fix ut

---
 tests/benchmarks/model_benchmarks/test_pytorch_base.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/benchmarks/model_benchmarks/test_pytorch_base.py b/tests/benchmarks/model_benchmarks/test_pytorch_base.py
index 96e1718a0..f8322796c 100644
--- a/tests/benchmarks/model_benchmarks/test_pytorch_base.py
+++ b/tests/benchmarks/model_benchmarks/test_pytorch_base.py
@@ -222,7 +222,10 @@ def test_pytorch_base():
     assert (benchmark._init_dataloader() is False)
 
     # Test _create_optimizer().
-    assert (isinstance(benchmark._optimizer, transformers.AdamW))
+    if hasattr(torch.optim, 'AdamW'):
+        assert (isinstance(benchmark._optimizer, torch.optim.AdamW))
+    else
+        assert (isinstance(benchmark._optimizer, transformers.AdamW))
     benchmark._optimizer_type = Optimizer.ADAM
     assert (benchmark._create_optimizer() is True)
     assert (isinstance(benchmark._optimizer, torch.optim.Adam))

From b21b3951a042cd4ad41a85470a4508812fe155d4 Mon Sep 17 00:00:00 2001
From: Guoshuai Zhao <microsoft@microsoft.com>
Date: Tue, 10 Jun 2025 16:40:02 +0000
Subject: [PATCH 17/19] fix ut

---
 tests/benchmarks/model_benchmarks/test_pytorch_base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/benchmarks/model_benchmarks/test_pytorch_base.py b/tests/benchmarks/model_benchmarks/test_pytorch_base.py
index f8322796c..b397f7c6b 100644
--- a/tests/benchmarks/model_benchmarks/test_pytorch_base.py
+++ b/tests/benchmarks/model_benchmarks/test_pytorch_base.py
@@ -224,7 +224,7 @@ def test_pytorch_base():
     # Test _create_optimizer().
     if hasattr(torch.optim, 'AdamW'):
         assert (isinstance(benchmark._optimizer, torch.optim.AdamW))
-    else
+    else:
         assert (isinstance(benchmark._optimizer, transformers.AdamW))
     benchmark._optimizer_type = Optimizer.ADAM
     assert (benchmark._create_optimizer() is True)

From e1facefd73fadd9b73ee413a70b14b5627895760 Mon Sep 17 00:00:00 2001
From: Hongtao Zhang <hongtaozhang@microsoft.com>
Date: Tue, 24 Jun 2025 22:21:37 +0000
Subject: [PATCH 18/19] Add kernel filter for building cutlass.

---
 third_party/Makefile | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/third_party/Makefile b/third_party/Makefile
index 79f99acba..8490793ee 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -51,8 +51,24 @@ else
 endif
 
 ifneq (,$(wildcard cutlass/CMakeLists.txt))
-	cmake -DCMAKE_INSTALL_BINDIR=$(SB_MICRO_PATH)/bin -DCMAKE_INSTALL_LIBDIR=$(SB_MICRO_PATH)/lib -DCMAKE_BUILD_TYPE=Release \
-		-DCUTLASS_NVCC_ARCHS=$(ARCHS) -DCUTLASS_ENABLE_EXAMPLES=OFF -DCUTLASS_ENABLE_TESTS=OFF -S ./cutlass -B ./cutlass/build
+	cmake -DCMAKE_INSTALL_BINDIR=$(SB_MICRO_PATH)/bin \
+		-DCMAKE_INSTALL_LIBDIR=$(SB_MICRO_PATH)/lib \
+		-DCMAKE_BUILD_TYPE=Release \
+		-DCUTLASS_NVCC_ARCHS=$(ARCHS) \
+		-DCUTLASS_ENABLE_EXAMPLES=OFF \
+		-DCUTLASS_ENABLE_TESTS=OFF \
+		-S ./cutlass \
+		-B ./cutlass/build \
+		-DCUTLASS_LIBRARY_KERNELS="cutlass_simt_dgemm_128x128_8x2_*,\
+cutlass_simt_sgemm_128x128_8x2_*,\
+cutlass_simt_hgemm_256x128_8x2_*,\
+cutlass_tensorop_h884gemm_256x128_32x2_*,\
+cutlass_tensorop_d884gemm_128x128_16x3_*,\
+cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3_*,\
+cutlass_tensorop_bf16_s16816gemm_bf16_256x128_32x3_*,\
+cutlass_tensorop_h16816gemm_256x128_32x3_*,\
+cutlass_tensorop_s8_i16832gemm_s8_256x128_64x3_*,\
+cutlass_tensorop_s4_i16864gemm_s4_256x128_128x3_*"
 	cmake --build ./cutlass/build -j $(shell nproc --ignore=2) --target install
 endif
 

From fcbd6e0d40f9734f3c6e230b172c512c31fa4dbc Mon Sep 17 00:00:00 2001
From: Hongtao Zhang <hongtaozhang@microsoft.com>
Date: Wed, 25 Jun 2025 03:43:45 +0000
Subject: [PATCH 19/19] Remove build dir.

---
 third_party/Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/third_party/Makefile b/third_party/Makefile
index 8490793ee..f3b2eeb82 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -70,6 +70,7 @@ cutlass_tensorop_h16816gemm_256x128_32x3_*,\
 cutlass_tensorop_s8_i16832gemm_s8_256x128_64x3_*,\
 cutlass_tensorop_s4_i16864gemm_s4_256x128_128x3_*"
 	cmake --build ./cutlass/build -j $(shell nproc --ignore=2) --target install
+	rm -rf ./cutlass/build
 endif
 
 # Build cuda-samples/Samples/bandwidthTest.