AI-Hypercomputer · YixuanWang-99 · Jun 16, 2026 · Jun 17, 2026 · Jun 17, 2026 · Jun 17, 2026
@@ -0,0 +1,78 @@
+#!/bin/bash
+
+# Validates the Qwen3-30B pre-training pipeline using a pre-converted MaxText checkpoint.
+
+# The flow of this script is as follows:
+# 1. Run inference on the pre-converted checkpoint.
+# 2. Run pre-training starting from the pre-converted checkpoint.
+# 3. Run inference on the checkpoint produced by the pre-training run.
+
+# Usage:
+# export HF_TOKEN=<your Hugging Face access token>
+# export RUN_ID=$(date +%Y-%m-%d-%H-%M-%S)
+# bash test_qwen3_to_mt.sh $RUN_ID
+# bash test_qwen3.sh $RUN_ID
+
+
+set -ex
+
+run_id=${1:-$(date +%Y-%m-%d-%H-%M-%S)}
+MODEL_NAME='qwen3-30b-a3b-base'
+
+USE_MULTIMODAL=false
+
+BASE_OUTPUT_DIRECTORY=gs://runner-maxtext-logs/${MODEL_NAME}
+UNSCANNED_CKPT_PATH=${BASE_OUTPUT_DIRECTORY}/to_maxtext/unscanned/${run_id}/0/items
+SCANNED_CKPT_PATH=${BASE_OUTPUT_DIRECTORY}/to_maxtext/scanned/${run_id}/0/items
+export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+
+DATASET_PATH=gs://maxtext-dataset
+
+if [ ${USE_MULTIMODAL} == true ]; then
+    python3 -m maxtext.inference.decode \
+    model_name=${MODEL_NAME} tokenizer_type="huggingface" \
+    load_parameters_path=${UNSCANNED_CKPT_PATH} \
+    per_device_batch_size=1 run_name=${run_id} \
+    max_prefill_predict_length=272 max_target_length=300 steps=1 async_checkpointing=false \
+    scan_layers=false use_multimodal=true \
+    checkpoint_storage_use_zarr3=False checkpoint_storage_use_ocdbt=False \
+    prompt=\'Describe\ image\ \<start_of_image\>\' image_path=\'tests/assets/test_image.jpg\' attention=\'dot_product\'
+else
+    python3 -m maxtext.inference.decode \
+    model_name=${MODEL_NAME} tokenizer_type="huggingface" \
+    load_parameters_path=${UNSCANNED_CKPT_PATH} \
+    per_device_batch_size=1 run_name=${run_id} \
+    max_prefill_predict_length=8 max_target_length=16 steps=1 async_checkpointing=false \
+    checkpoint_storage_use_zarr3=False checkpoint_storage_use_ocdbt=False \
+    scan_layers=false prompt='I love to' attention=\'dot_product\'
+fi
+
+python3 -m maxtext.trainers.pre_train.train \
+    base_output_directory=${BASE_OUTPUT_DIRECTORY}/train \
+    dataset_path=${DATASET_PATH} tokenizer_type="huggingface" \
+    load_parameters_path=${SCANNED_CKPT_PATH} \
+    per_device_batch_size=0.125 run_name=${run_id} \
+    max_target_length=64 steps=5 async_checkpointing=false \
+    checkpoint_storage_use_zarr3=False checkpoint_storage_use_ocdbt=False \
+    model_name=${MODEL_NAME} scan_layers=true use_multimodal=${USE_MULTIMODAL} \
+    remat_policy=full \
+    ici_tensor_parallelism=4 ici_fsdp_parallelism=2 weight_dtype=bfloat16 dtype=bfloat16 opt_type=sgd optimizer_memory_host_offload=true
+
+if [ ${USE_MULTIMODAL} == true ]; then
+    python3 -m maxtext.inference.decode \
+    model_name=${MODEL_NAME} tokenizer_type="huggingface" \
+    load_parameters_path=${BASE_OUTPUT_DIRECTORY}/train/${run_id}/checkpoints/4/items \
+    per_device_batch_size=1 run_name=${run_id} \
+    max_prefill_predict_length=272 max_target_length=300 steps=4 async_checkpointing=false \
+    scan_layers=true use_multimodal=true \
+    checkpoint_storage_use_zarr3=False checkpoint_storage_use_ocdbt=False \
+    prompt=\'Describe\ image\ \<start_of_image\>\' image_path=\'tests/assets/test_image.jpg\' attention=\'dot_product\'
+else
+    python3 -m maxtext.inference.decode \
+    model_name=${MODEL_NAME} tokenizer_type="huggingface" \
+    load_parameters_path=${BASE_OUTPUT_DIRECTORY}/train/${run_id}/checkpoints/4/items \
+    per_device_batch_size=1 run_name=${run_id} \
+    max_prefill_predict_length=8 max_target_length=16 steps=4 async_checkpointing=false \
+    checkpoint_storage_use_zarr3=False checkpoint_storage_use_ocdbt=False \
+    scan_layers=true prompt='I love to' attention=\'dot_product\'
+fi
@@ -0,0 +1,65 @@
+#!/bin/bash
+
+# Validates the Qwen3 RL pipeline using a pre-converted MaxText checkpoint.
+
+# The flow of this script is as follows:
+# 1. Run inference on the pre-converted checkpoint.
+# 2. Run RL starting from the pre-converted checkpoint.
+# 3. Run inference on the checkpoint produced by the RL run.
+
+# Usage:
+# export HF_TOKEN=<your Hugging Face access token>
+# export RUN_ID=$(date +%Y-%m-%d-%H-%M-%S)
+# bash test_qwen3_to_mt.sh $RUN_ID
+# bash test_qwen3_rl.sh $RUN_ID
+#
+# Arguments:
+#   $1 - run id used to locate the converted checkpoints; defaults to the current timestamp.
+#   $2 - value passed as enable_single_controller; defaults to false.
+
+set -ex
+export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+export NEW_MODEL_DESIGN=1
+run_id=${1:-$(date +%Y-%m-%d-%H-%M-%S)}
+use_pathways=${2:-false}
+MODEL_NAME='qwen3-30b-a3b-base'
+
+BASE_OUTPUT_DIRECTORY="gs://runner-maxtext-logs/${MODEL_NAME}"
+UNSCANNED_CKPT_PATH="${BASE_OUTPUT_DIRECTORY}/to_maxtext/unscanned/${run_id}/0/items"
+SCANNED_CKPT_PATH="${BASE_OUTPUT_DIRECTORY}/to_maxtext/scanned/${run_id}/0/items"
+# Checkpoint read back in step 3, after the RL run below.
+RL_CKPT_PATH="${BASE_OUTPUT_DIRECTORY}/rl/${run_id}/checkpoints/actor/5/model_params"
+VLLM_HF_OVERRIDES='{architectures: ["MaxTextForCausalLM"]}'
+
+# Runs maxtext.inference.vllm_decode against one checkpoint.
+# Globals:   MODEL_NAME, VLLM_HF_OVERRIDES, use_pathways (read)
+# Arguments: $1 - checkpoint path, $2 - scan_layers value
+run_vllm_decode() {
+    local ckpt_path=$1 scan_layers=$2
+    python3 -m maxtext.inference.vllm_decode \
+        "model_name=${MODEL_NAME}" \
+        "load_parameters_path=${ckpt_path}" \
+        "vllm_hf_overrides=${VLLM_HF_OVERRIDES}" \
+        hbm_utilization_vllm=0.85 \
+        use_chat_template=True "scan_layers=${scan_layers}" "enable_single_controller=${use_pathways}" \
+        ici_tensor_parallelism=4
+}
+
+# Step 1: inference on the pre-converted checkpoint.
+run_vllm_decode "${UNSCANNED_CKPT_PATH}" false
+
+# Step 2: RL from the pre-converted checkpoint.
+python3 -m maxtext.trainers.post_train.rl.train_rl \
+    "base_output_directory=${BASE_OUTPUT_DIRECTORY}/rl" \
+    "load_parameters_path=${SCANNED_CKPT_PATH}" \
+    "run_name=${run_id}" rl.loss_algo='grpo' scan_layers=true \
+    num_batches=5 batch_size=1 num_test_batches=5 \
+    "model_name=${MODEL_NAME}" "enable_single_controller=${use_pathways}" \
+    checkpoint_storage_use_zarr3=False checkpoint_storage_use_ocdbt=False \
+    rollout_tensor_parallelism=4 \
+    "vllm_hf_overrides=${VLLM_HF_OVERRIDES}" \
+    remat_policy=full \
+    ici_tensor_parallelism=4 ici_fsdp_parallelism=1 ici_expert_parallelism=2 max_target_length=64 weight_dtype=bfloat16 dtype=bfloat16 opt_type=sgd
+
+# Step 3: inference on the checkpoint produced by the RL run.
+run_vllm_decode "${RL_CKPT_PATH}" true
@@ -0,0 +1,63 @@
+#!/bin/bash
+
+# Validates the Qwen3 SFT pipeline using a pre-converted MaxText checkpoint.
+
+# The flow of this script is as follows:
+# 1. Run inference on the pre-converted checkpoint.
+# 2. Run SFT starting from the pre-converted checkpoint.
+# 3. Run inference on the checkpoint produced by the SFT run.
+
+# Usage:
+# export HF_TOKEN=<your Hugging Face access token>
+# export RUN_ID=$(date +%Y-%m-%d-%H-%M-%S)
+# bash test_qwen3_to_mt.sh $RUN_ID
+# bash test_qwen3_sft.sh $RUN_ID
+#
+# Arguments:
+#   $1 - run id used to locate the converted checkpoints; defaults to the current timestamp.
+#   $2 - value passed as enable_single_controller; defaults to false.
+
+set -ex
+export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+export NEW_MODEL_DESIGN=1
+run_id=${1:-$(date +%Y-%m-%d-%H-%M-%S)}
+use_pathways=${2:-false}
+MODEL_NAME='qwen3-30b-a3b-base'
+
+BASE_OUTPUT_DIRECTORY="gs://runner-maxtext-logs/${MODEL_NAME}"
+UNSCANNED_CKPT_PATH="${BASE_OUTPUT_DIRECTORY}/to_maxtext/unscanned/${run_id}/0/items"
+SCANNED_CKPT_PATH="${BASE_OUTPUT_DIRECTORY}/to_maxtext/scanned/${run_id}/0/items"
+# Checkpoint read back in step 3, after the steps=5 SFT run below.
+SFT_CKPT_PATH="${BASE_OUTPUT_DIRECTORY}/sft/${run_id}/checkpoints/5/model_params"
+VLLM_HF_OVERRIDES='{architectures: ["MaxTextForCausalLM"]}'
+
+# Runs maxtext.inference.vllm_decode against one checkpoint.
+# Globals:   MODEL_NAME, VLLM_HF_OVERRIDES, use_pathways (read)
+# Arguments: $1 - checkpoint path, $2 - scan_layers value
+run_vllm_decode() {
+    local ckpt_path=$1 scan_layers=$2
+    python3 -m maxtext.inference.vllm_decode \
+        "model_name=${MODEL_NAME}" \
+        "load_parameters_path=${ckpt_path}" \
+        "vllm_hf_overrides=${VLLM_HF_OVERRIDES}" \
+        hbm_utilization_vllm=0.85 \
+        use_chat_template=True "scan_layers=${scan_layers}" "enable_single_controller=${use_pathways}" \
+        ici_tensor_parallelism=4
+}
+
+# Step 1: inference on the pre-converted checkpoint.
+run_vllm_decode "${UNSCANNED_CKPT_PATH}" false
+
+# Step 2: SFT from the pre-converted checkpoint.
+python3 -m maxtext.trainers.post_train.sft.train_sft_native \
+    "base_output_directory=${BASE_OUTPUT_DIRECTORY}/sft" \
+    "load_parameters_path=${SCANNED_CKPT_PATH}" \
+    per_device_batch_size=0.125 "run_name=${run_id}" \
+    steps=5 scan_layers=true \
+    "model_name=${MODEL_NAME}" "enable_single_controller=${use_pathways}" \
+    checkpoint_storage_use_zarr3=False checkpoint_storage_use_ocdbt=False \
+    remat_policy=full \
+    ici_tensor_parallelism=4 ici_fsdp_parallelism=1 ici_expert_parallelism=2 max_target_length=64 weight_dtype=bfloat16 dtype=bfloat16 opt_type=sgd optimizer_memory_host_offload=true
+
+# Step 3: inference on the checkpoint produced by the SFT run.
+run_vllm_decode "${SFT_CKPT_PATH}" true
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+# Converts a MaxText checkpoint to a Hugging Face model checkpoint.
+
+# Usage:
+# export RUN_ID=$(date +%Y-%m-%d-%H-%M-%S)
+# bash test_qwen3_to_hf.sh $RUN_ID $CHECKPOINT_PATH $USE_MULTIMODAL $SCAN_LAYERS
+
+
+set -ex
+
+run_id=$1
+CKPT_PATH=$2
+USE_MULTIMODAL=${3:-false}
+SCAN_LAYERS=${4:-false}
+
+MODEL_NAME='qwen3-30b-a3b-base'
+BASE_OUTPUT_DIRECTORY="gs://runner-maxtext-logs/${MODEL_NAME}"
+
+if [ "${SCAN_LAYERS,,}" = "true" ]; then
+    scan_status="scanned"
+else
+    scan_status="unscanned"
+fi
+
+python3 -m maxtext.checkpoint_conversion.to_huggingface \
+    model_name=${MODEL_NAME} \
+    tokenizer_type="huggingface" \
+    load_parameters_path=${CKPT_PATH} \
+    base_output_directory=${BASE_OUTPUT_DIRECTORY}/to_huggingface/${scan_status}/${run_id} \
+    use_multimodal=${USE_MULTIMODAL} \
+    scan_layers=$SCAN_LAYERS
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+# Converts Qwen3-30B HuggingFace checkpoint to MaxText format and validates logit correctness.
+
+# The flow of this script is as follows:
+# 1. Install PyTorch (CPU) required for checkpoint conversion.
+# 2. Convert the HuggingFace checkpoint to MaxText format in both unscanned and scanned formats.
+# 3. Run a forward pass logits check to verify the converted checkpoint matches the original HF model.
+
+# Usage:
+# export HF_TOKEN=<your Hugging Face access token>
+# export RUN_ID=$(date +%Y-%m-%d-%H-%M-%S)
+# bash test_qwen3_to_mt.sh $RUN_ID - to convert the checkpoint and run logit check for non-multimodal version
+# bash test_qwen3_to_mt.sh $RUN_ID true - to convert the checkpoint for multimodal version (logit check is skipped)
+
+set -ex
+
+run_id=${1:-$(date +%Y-%m-%d-%H-%M-%S)}
+MODEL_NAME='qwen3-30b-a3b-base'
+HF_GOLDEN_MODEL='Qwen/Qwen3-30B-A3B-Base'
+
+USE_MULTIMODAL=${2:-false}
+
+BASE_OUTPUT_DIRECTORY="gs://runner-maxtext-logs/${MODEL_NAME}/to_maxtext"
+
+# Converts the Hugging Face checkpoint to MaxText format.
+# Globals:   MODEL_NAME, BASE_OUTPUT_DIRECTORY, run_id, USE_MULTIMODAL (read)
+# Arguments: $1 - output sub-directory name, $2 - scan_layers value
+convert_to_maxtext() {
+    local layout=$1 scan_layers=$2
+    python3 -m maxtext.checkpoint_conversion.to_maxtext \
+        "model_name=${MODEL_NAME}" \
+        "base_output_directory=${BASE_OUTPUT_DIRECTORY}/${layout}/${run_id}" \
+        "use_multimodal=${USE_MULTIMODAL}" \
+        "scan_layers=${scan_layers}" \
+        hardware=cpu skip_jax_distributed_system=True \
+        checkpoint_storage_use_zarr3=False checkpoint_storage_use_ocdbt=False \
+        --lazy_load_tensors=False \
+        --eager_load_method='safetensors'
+}
+
+# Step 1: Install torch
+python3 -m pip install torch --index-url https://download.pytorch.org/whl/cpu
+
+# Step 2: Convert the checkpoint from Hugging Face
+convert_to_maxtext unscanned false
+UNSCANNED_CKPT_PATH="${BASE_OUTPUT_DIRECTORY}/unscanned/${run_id}/0/items"
+echo "Unscanned checkpoint path: ${UNSCANNED_CKPT_PATH}"
+
+convert_to_maxtext scanned true
+SCANNED_CKPT_PATH="${BASE_OUTPUT_DIRECTORY}/scanned/${run_id}/0/items"
+echo "Scanned checkpoint path: ${SCANNED_CKPT_PATH}"
+
+# Step 3: logit check of the unscanned checkpoint against the HF model (non-multimodal runs only).
+# Lower-cased comparison, so "False"/"FALSE" do not silently skip the check.
+if [[ "${USE_MULTIMODAL,,}" == "false" ]]; then
+    python3 -m tests.utils.forward_pass_logit_checker \
+        "load_parameters_path=${UNSCANNED_CKPT_PATH}" \
+        "model_name=${MODEL_NAME}" \
+        "use_multimodal=${USE_MULTIMODAL}" \
+        scan_layers=false \
+        "--hf_model_path=${HF_GOLDEN_MODEL}" \
+        --max_kl_div=0.03 \
+        --run_hf_model=true \
+        hardware=cpu skip_jax_distributed_system=True
+fi