diff --git a/docker/Dockerfile b/docker/Dockerfile
index 84391219..bb3b190c 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -36,7 +36,14 @@ RUN if [ "${TRITONSERVER_BUILD}" = "1" ]; then \
     apt autoremove -y && apt clean && rm -rf /var/lib/apt/lists/*
 
 # -- Layer 2: pip dependencies + Megatron-LM ---
-RUN git clone -b core_v0.12.1 https://github.com/NVIDIA/Megatron-LM.git megatron-lm && \
+# Bump Megatron-LM from core_v0.12.1 to core_v0.13.1 to pick up
+# d9608004f "Add an option to skip counting zeros in grad of ChainedOptimizer",
+# which gates ChainedOptimizer.count_zeros on log_num_zeros_in_grad. The 0.12.x
+# line still calls count_zeros_fp32 unconditionally inside ChainedOptimizer.step
+# (~4 ms / step on HSTU bf16 even when the stat is unused). Earliest tag with the
+# fix is core_v0.13.0; 0.13.1 (latest 0.13.x patch) is used for stability.
+# --depth 1: the install needs only the tagged tree; drop it if history is needed.
+RUN git clone --depth 1 -b core_v0.13.1 https://github.com/NVIDIA/Megatron-LM.git megatron-lm && \
     pip install --no-deps -e ./megatron-lm && \
     pip install torchx gin-config torchmetrics==1.0.3 typing-extensions iopath pyvers \
     cloudpickle triton==3.6.0 nvidia-cutlass-dsl==4.3.0 --no-cache pre-commit