Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 78 additions & 0 deletions tests/end_to_end/tpu/qwen3/30b/test_qwen3.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#!/bin/bash

# Validates the Qwen3-30B pre-training pipeline using a pre-converted MaxText checkpoint.

# The flow of this script is as follows:
# 1. Run inference on the pre-converted checkpoint.
# 2. Run pre-training starting from the pre-converted checkpoint.
# 3. Run inference on the checkpoint produced by the pre-training run.

# Usage:
# export HF_TOKEN=<your Hugging Face access token>
# export RUN_ID=$(date +%Y-%m-%d-%H-%M-%S)
# bash test_qwen3_to_mt.sh $RUN_ID
# bash test_qwen3.sh $RUN_ID


# Stop at the first failing command (errexit) and echo every command as it
# is executed (xtrace).
set -o errexit -o xtrace

# Exported so the python3 child processes started by this script inherit it.
PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION

# Run identifier: the first positional argument, or the current timestamp
# when that argument is missing or empty.
run_id="${1:-}"
if [[ -z "${run_id}" ]]; then
  run_id="$(date +%Y-%m-%d-%H-%M-%S)"
fi

MODEL_NAME="qwen3-30b-a3b-base"
USE_MULTIMODAL="false"
DATASET_PATH="gs://maxtext-dataset"

# Everything this script reads or writes lives under a per-model directory.
# The two checkpoint paths are keyed by run_id and differ only in the
# "unscanned" / "scanned" path component.
BASE_OUTPUT_DIRECTORY="gs://runner-maxtext-logs/${MODEL_NAME}"
UNSCANNED_CKPT_PATH="${BASE_OUTPUT_DIRECTORY}/to_maxtext/unscanned/${run_id}/0/items"
SCANNED_CKPT_PATH="${BASE_OUTPUT_DIRECTORY}/to_maxtext/scanned/${run_id}/0/items"

# Step 1: decode once from the pre-converted unscanned checkpoint
# (scan_layers=false) before any training is run.
#
# The arguments are collected in an array so that every expansion is passed
# as exactly one word, even if run_id (taken from the command line) contains
# whitespace or glob characters. Arguments shared by both modes come first,
# in the same order as before.
pre_train_decode_args=(
  "model_name=${MODEL_NAME}"
  tokenizer_type="huggingface"
  "load_parameters_path=${UNSCANNED_CKPT_PATH}"
  per_device_batch_size=1
  "run_name=${run_id}"
)
if [[ "${USE_MULTIMODAL}" == true ]]; then
  # Multimodal mode: text prompt plus an image. The single quotes inside the
  # double-quoted words reach the program literally, as the backslash-escaped
  # quotes did.
  pre_train_decode_args+=(
    max_prefill_predict_length=272 max_target_length=300 steps=1 async_checkpointing=false
    scan_layers=false use_multimodal=true
    checkpoint_storage_use_zarr3=False checkpoint_storage_use_ocdbt=False
    "prompt='Describe image <start_of_image>'"
    "image_path='tests/assets/test_image.jpg'"
    "attention='dot_product'"
  )
else
  # Text-only mode: short prompt, short generation.
  pre_train_decode_args+=(
    max_prefill_predict_length=8 max_target_length=16 steps=1 async_checkpointing=false
    checkpoint_storage_use_zarr3=False checkpoint_storage_use_ocdbt=False
    scan_layers=false
    'prompt=I love to'
    "attention='dot_product'"
  )
fi
python3 -m maxtext.inference.decode "${pre_train_decode_args[@]}"

# Step 2: run a 5-step pre-training job that starts from the scanned
# checkpoint (scan_layers=true) and writes under ${BASE_OUTPUT_DIRECTORY}/train.
# Every expansion is double-quoted so a run_id containing whitespace or glob
# characters is still passed as a single argument.
python3 -m maxtext.trainers.pre_train.train \
  "base_output_directory=${BASE_OUTPUT_DIRECTORY}/train" \
  "dataset_path=${DATASET_PATH}" tokenizer_type="huggingface" \
  "load_parameters_path=${SCANNED_CKPT_PATH}" \
  per_device_batch_size=0.125 "run_name=${run_id}" \
  max_target_length=64 steps=5 async_checkpointing=false \
  checkpoint_storage_use_zarr3=False checkpoint_storage_use_ocdbt=False \
  "model_name=${MODEL_NAME}" scan_layers=true "use_multimodal=${USE_MULTIMODAL}" \
  remat_policy=full \
  ici_tensor_parallelism=4 ici_fsdp_parallelism=2 weight_dtype=bfloat16 dtype=bfloat16 opt_type=sgd optimizer_memory_host_offload=true

# Step 3: decode from the checkpoint stored under the training output
# directory at step 4 (scan_layers=true).
# NOTE(review): step 4 is presumably the last checkpoint of the steps=5
# training run above -- confirm against the trainer's checkpoint numbering.
#
# The arguments are collected in an array so that every expansion is passed
# as exactly one word, even if run_id (taken from the command line) contains
# whitespace or glob characters. Arguments shared by both modes come first,
# in the same order as before.
post_train_decode_args=(
  "model_name=${MODEL_NAME}"
  tokenizer_type="huggingface"
  "load_parameters_path=${BASE_OUTPUT_DIRECTORY}/train/${run_id}/checkpoints/4/items"
  per_device_batch_size=1
  "run_name=${run_id}"
)
if [[ "${USE_MULTIMODAL}" == true ]]; then
  # Multimodal mode: text prompt plus an image. The single quotes inside the
  # double-quoted words reach the program literally, as the backslash-escaped
  # quotes did.
  post_train_decode_args+=(
    max_prefill_predict_length=272 max_target_length=300 steps=4 async_checkpointing=false
    scan_layers=true use_multimodal=true
    checkpoint_storage_use_zarr3=False checkpoint_storage_use_ocdbt=False
    "prompt='Describe image <start_of_image>'"
    "image_path='tests/assets/test_image.jpg'"
    "attention='dot_product'"
  )
else
  # Text-only mode: short prompt, short generation.
  post_train_decode_args+=(
    max_prefill_predict_length=8 max_target_length=16 steps=4 async_checkpointing=false
    checkpoint_storage_use_zarr3=False checkpoint_storage_use_ocdbt=False
    scan_layers=true
    'prompt=I love to'
    "attention='dot_product'"
  )
fi
python3 -m maxtext.inference.decode "${post_train_decode_args[@]}"
53 changes: 53 additions & 0 deletions tests/end_to_end/tpu/qwen3/30b/test_qwen3_rl.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#!/bin/bash

# Validates the Qwen3 RL pipeline using a pre-converted MaxText checkpoint.

# The flow of this script is as follows:
# 1. Run inference on the pre-converted checkpoint.
# 2. Run RL starting from the pre-converted checkpoint.
# 3. Run inference on the checkpoint produced by the RL run.

# Usage:
# export HF_TOKEN=<your Hugging Face access token>
# export RUN_ID=$(date +%Y-%m-%d-%H-%M-%S)
# bash test_qwen3_to_mt.sh $RUN_ID
# bash test_qwen3_rl.sh $RUN_ID

# Stop at the first failing command (errexit) and echo every command as it
# is executed (xtrace).
set -o errexit -o xtrace

# Environment for the python3 child processes started by this script.
PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
NEW_MODEL_DESIGN=1
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION NEW_MODEL_DESIGN

# $1: run identifier; falls back to the current timestamp when missing or empty.
run_id="${1:-}"
if [[ -z "${run_id}" ]]; then
  run_id="$(date +%Y-%m-%d-%H-%M-%S)"
fi

# $2: value forwarded as enable_single_controller; "false" when missing or empty.
use_pathways="false"
if [[ -n "${2:-}" ]]; then
  use_pathways="$2"
fi

MODEL_NAME="qwen3-30b-a3b-base"

# Per-model output root and the two checkpoint locations keyed by run_id.
BASE_OUTPUT_DIRECTORY="gs://runner-maxtext-logs/${MODEL_NAME}"
UNSCANNED_CKPT_PATH="${BASE_OUTPUT_DIRECTORY}/to_maxtext/unscanned/${run_id}/0/items"
SCANNED_CKPT_PATH="${BASE_OUTPUT_DIRECTORY}/to_maxtext/scanned/${run_id}/0/items"

# Step 1: vLLM decode from the pre-converted unscanned checkpoint
# (scan_layers=false), before RL is run.
# Expansions are double-quoted so values derived from the command line
# (run_id inside the checkpoint path, use_pathways) stay single arguments.
python3 -m maxtext.inference.vllm_decode \
  "model_name=${MODEL_NAME}" \
  "load_parameters_path=${UNSCANNED_CKPT_PATH}" \
  vllm_hf_overrides='{architectures: ["MaxTextForCausalLM"]}' \
  hbm_utilization_vllm=0.85 \
  use_chat_template=True scan_layers=false "enable_single_controller=${use_pathways}" \
  ici_tensor_parallelism=4

# Step 2: run RL (rl.loss_algo=grpo, num_batches=5) starting from the scanned
# checkpoint; output goes under ${BASE_OUTPUT_DIRECTORY}/rl.
# Expansions are double-quoted so values derived from the command line
# (run_id, use_pathways) stay single arguments.
python3 -m maxtext.trainers.post_train.rl.train_rl \
  "base_output_directory=${BASE_OUTPUT_DIRECTORY}/rl" \
  "load_parameters_path=${SCANNED_CKPT_PATH}" \
  "run_name=${run_id}" rl.loss_algo='grpo' scan_layers=true \
  num_batches=5 batch_size=1 num_test_batches=5 \
  "model_name=${MODEL_NAME}" "enable_single_controller=${use_pathways}" \
  checkpoint_storage_use_zarr3=False checkpoint_storage_use_ocdbt=False \
  rollout_tensor_parallelism=4 \
  vllm_hf_overrides='{architectures: ["MaxTextForCausalLM"]}' \
  remat_policy=full \
  ici_tensor_parallelism=4 ici_fsdp_parallelism=1 ici_expert_parallelism=2 max_target_length=64 weight_dtype=bfloat16 dtype=bfloat16 opt_type=sgd

# Step 3: vLLM decode from the actor checkpoint stored under the RL output
# directory at step 5 (scan_layers=true).
# NOTE(review): step 5 is presumably the final checkpoint of the
# num_batches=5 RL run -- confirm against the trainer's checkpoint numbering.
# Expansions are double-quoted so values derived from the command line
# (run_id, use_pathways) stay single arguments.
python3 -m maxtext.inference.vllm_decode \
  "model_name=${MODEL_NAME}" \
  "load_parameters_path=${BASE_OUTPUT_DIRECTORY}/rl/${run_id}/checkpoints/actor/5/model_params" \
  vllm_hf_overrides='{architectures: ["MaxTextForCausalLM"]}' \
  hbm_utilization_vllm=0.85 \
  use_chat_template=True scan_layers=true "enable_single_controller=${use_pathways}" \
  ici_tensor_parallelism=4
53 changes: 53 additions & 0 deletions tests/end_to_end/tpu/qwen3/30b/test_qwen3_sft.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#!/bin/bash

# Validates the Qwen3 SFT pipeline using a pre-converted MaxText checkpoint.


# The flow of this script is as follows:
# 1. Run inference on the pre-converted checkpoint.
# 2. Run SFT starting from the pre-converted checkpoint.
# 3. Run inference on the checkpoint produced by the SFT run.

# Usage:
# export HF_TOKEN=<your Hugging Face access token>
# export RUN_ID=$(date +%Y-%m-%d-%H-%M-%S)
# bash test_qwen3_to_mt.sh $RUN_ID
# bash test_qwen3_sft.sh $RUN_ID


# Abort on the first error and trace each command as it runs.
set -o errexit
set -o xtrace

# Positional arguments:
#   $1 - run identifier (default: current timestamp)
#   $2 - value forwarded as enable_single_controller (default: false)
# Both defaults also apply when the argument is given but empty.
run_id="${1:-}"
[[ -n "${run_id}" ]] || run_id="$(date +%Y-%m-%d-%H-%M-%S)"
use_pathways="${2:-}"
[[ -n "${use_pathways}" ]] || use_pathways="false"

# Environment handed to the python3 child processes started by this script.
export NEW_MODEL_DESIGN="1"
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION="python"

MODEL_NAME="qwen3-30b-a3b-base"

# Per-model output root and the two checkpoint locations keyed by run_id.
BASE_OUTPUT_DIRECTORY="gs://runner-maxtext-logs/${MODEL_NAME}"
SCANNED_CKPT_PATH="${BASE_OUTPUT_DIRECTORY}/to_maxtext/scanned/${run_id}/0/items"
UNSCANNED_CKPT_PATH="${BASE_OUTPUT_DIRECTORY}/to_maxtext/unscanned/${run_id}/0/items"

# Step 1: vLLM decode from the pre-converted unscanned checkpoint
# (scan_layers=false), before SFT is run.
# Expansions are double-quoted so values derived from the command line
# (run_id inside the checkpoint path, use_pathways) stay single arguments.
python3 -m maxtext.inference.vllm_decode \
  "model_name=${MODEL_NAME}" \
  "load_parameters_path=${UNSCANNED_CKPT_PATH}" \
  vllm_hf_overrides='{architectures: ["MaxTextForCausalLM"]}' \
  hbm_utilization_vllm=0.85 \
  use_chat_template=True scan_layers=false "enable_single_controller=${use_pathways}" \
  ici_tensor_parallelism=4

# Step 2: run a 5-step SFT job starting from the scanned checkpoint; output
# goes under ${BASE_OUTPUT_DIRECTORY}/sft.
# Expansions are double-quoted so values derived from the command line
# (run_id, use_pathways) stay single arguments.
python3 -m maxtext.trainers.post_train.sft.train_sft_native \
  "base_output_directory=${BASE_OUTPUT_DIRECTORY}/sft" \
  "load_parameters_path=${SCANNED_CKPT_PATH}" \
  per_device_batch_size=0.125 "run_name=${run_id}" \
  steps=5 scan_layers=true \
  "model_name=${MODEL_NAME}" "enable_single_controller=${use_pathways}" \
  checkpoint_storage_use_zarr3=False checkpoint_storage_use_ocdbt=False \
  remat_policy=full \
  ici_tensor_parallelism=4 ici_fsdp_parallelism=1 ici_expert_parallelism=2 max_target_length=64 weight_dtype=bfloat16 dtype=bfloat16 opt_type=sgd optimizer_memory_host_offload=true

# Step 3: vLLM decode from the checkpoint stored under the SFT output
# directory at step 5 (scan_layers=true).
# NOTE(review): step 5 is presumably the final checkpoint of the steps=5 SFT
# run -- confirm against the trainer's checkpoint numbering.
# Expansions are double-quoted so values derived from the command line
# (run_id, use_pathways) stay single arguments.
python3 -m maxtext.inference.vllm_decode \
  "model_name=${MODEL_NAME}" \
  "load_parameters_path=${BASE_OUTPUT_DIRECTORY}/sft/${run_id}/checkpoints/5/model_params" \
  vllm_hf_overrides='{architectures: ["MaxTextForCausalLM"]}' \
  hbm_utilization_vllm=0.85 \
  use_chat_template=True scan_layers=true "enable_single_controller=${use_pathways}" \
  ici_tensor_parallelism=4
32 changes: 32 additions & 0 deletions tests/end_to_end/tpu/qwen3/30b/test_qwen3_to_hf.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/bin/bash

# Converts a MaxText checkpoint to a Hugging Face model checkpoint.

# Usage:
# export RUN_ID=$(date +%Y-%m-%d-%H-%M-%S)
# bash test_qwen3_to_hf.sh $RUN_ID $CHECKPOINT_PATH $USE_MULTIMODAL $SCAN_LAYERS


# Exit on the first failing command and print each command as it runs.
set -o errexit -o xtrace

MODEL_NAME="qwen3-30b-a3b-base"
BASE_OUTPUT_DIRECTORY="gs://runner-maxtext-logs/${MODEL_NAME}"

# Positional arguments:
#   $1 - run identifier
#   $2 - path of the checkpoint to convert
#   $3 - use_multimodal value (default: false)
#   $4 - scan_layers value (default: false)
# The two defaults also apply when the argument is given but empty.
run_id="${1-}"
CKPT_PATH="${2-}"
USE_MULTIMODAL="${3:-false}"
SCAN_LAYERS="${4:-false}"

# Label derived from SCAN_LAYERS; ${var,,} lower-cases the value, so the
# comparison against "true" is case-insensitive.
case "${SCAN_LAYERS,,}" in
  true) scan_status="scanned" ;;
  *) scan_status="unscanned" ;;
esac

# Fail fast when a required positional argument is missing or empty. Without
# this check the converter would be started with an empty load_parameters_path
# and/or an output directory that ends in ".../${scan_status}/".
: "${run_id:?usage: test_qwen3_to_hf.sh RUN_ID CHECKPOINT_PATH [USE_MULTIMODAL] [SCAN_LAYERS]}"
: "${CKPT_PATH:?usage: test_qwen3_to_hf.sh RUN_ID CHECKPOINT_PATH [USE_MULTIMODAL] [SCAN_LAYERS]}"

# Convert the MaxText checkpoint at CKPT_PATH to a Hugging Face checkpoint.
# The output directory is keyed by scan_status and run_id. All expansions are
# double-quoted so user-supplied values are passed as single arguments.
python3 -m maxtext.checkpoint_conversion.to_huggingface \
  "model_name=${MODEL_NAME}" \
  tokenizer_type="huggingface" \
  "load_parameters_path=${CKPT_PATH}" \
  "base_output_directory=${BASE_OUTPUT_DIRECTORY}/to_huggingface/${scan_status}/${run_id}" \
  "use_multimodal=${USE_MULTIMODAL}" \
  "scan_layers=${SCAN_LAYERS}"
66 changes: 66 additions & 0 deletions tests/end_to_end/tpu/qwen3/30b/test_qwen3_to_mt.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#!/bin/bash

# Converts Qwen3-30B HuggingFace checkpoint to MaxText format and validates logit correctness.

# The flow of this script is as follows:
# 1. Install PyTorch (CPU) required for checkpoint conversion.
# 2. Convert the HuggingFace checkpoint to MaxText format in both unscanned and scanned formats.
# 3. Run a forward pass logits check to verify the converted checkpoint matches the original HF model (non-multimodal runs only).

# Usage:
# export HF_TOKEN=<your Hugging Face access token>
# export RUN_ID=$(date +%Y-%m-%d-%H-%M-%S)
# bash test_qwen3_to_mt.sh $RUN_ID - to convert the checkpoint and run logit check for non-multimodal version
# bash test_qwen3_to_mt.sh $RUN_ID true - to convert the checkpoint for multimodal version (the logit check is skipped in this mode)

# Abort on the first error and trace each command as it runs.
set -o errexit -o xtrace

# Model identifiers: the MaxText model name and the Hugging Face model used
# as the reference in the logit check.
MODEL_NAME="qwen3-30b-a3b-base"
HF_GOLDEN_MODEL="Qwen/Qwen3-30B-A3B-Base"

# Positional arguments:
#   $1 - run identifier (default: current timestamp)
#   $2 - use_multimodal value (default: false)
# Both defaults also apply when the argument is given but empty.
run_id="${1:-}"
if [[ -z "${run_id}" ]]; then
  run_id="$(date +%Y-%m-%d-%H-%M-%S)"
fi
USE_MULTIMODAL="${2:-}"
if [[ -z "${USE_MULTIMODAL}" ]]; then
  USE_MULTIMODAL="false"
fi

# Root directory for the converted checkpoints of this model.
BASE_OUTPUT_DIRECTORY="gs://runner-maxtext-logs/${MODEL_NAME}/to_maxtext"

# Step 1: Install torch (from the PyTorch CPU wheel index)
python3 -m pip install torch --index-url https://download.pytorch.org/whl/cpu

# Step 2: Convert the checkpoint from Hugging Face

#######################################
# Runs the Hugging Face -> MaxText checkpoint conversion once.
# Globals:   MODEL_NAME, BASE_OUTPUT_DIRECTORY, run_id, USE_MULTIMODAL (read)
# Arguments: $1 - output sub-directory name ("unscanned" or "scanned")
#            $2 - value passed as scan_layers ("false" or "true")
# Returns:   the exit status of the conversion command
#######################################
convert_to_maxtext() {
  local layout=$1
  local scan_layers=$2
  # Expansions are double-quoted so that user-supplied values (run_id,
  # USE_MULTIMODAL) are always passed as single arguments.
  python3 -m maxtext.checkpoint_conversion.to_maxtext \
    "model_name=${MODEL_NAME}" \
    "base_output_directory=${BASE_OUTPUT_DIRECTORY}/${layout}/${run_id}" \
    "use_multimodal=${USE_MULTIMODAL}" \
    "scan_layers=${scan_layers}" \
    hardware=cpu skip_jax_distributed_system=True \
    checkpoint_storage_use_zarr3=False checkpoint_storage_use_ocdbt=False \
    --lazy_load_tensors=False \
    --eager_load_method='safetensors'
}

# Unscanned layout first; the path is only recorded and printed once the
# conversion has succeeded (errexit stops the script otherwise).
convert_to_maxtext unscanned false
UNSCANNED_CKPT_PATH="${BASE_OUTPUT_DIRECTORY}/unscanned/${run_id}/0/items"
echo "Unscanned checkpoint path: ${UNSCANNED_CKPT_PATH}"

# Then the scanned layout.
convert_to_maxtext scanned true
SCANNED_CKPT_PATH="${BASE_OUTPUT_DIRECTORY}/scanned/${run_id}/0/items"
echo "Scanned checkpoint path: ${SCANNED_CKPT_PATH}"

# Step 3: compare the converted unscanned checkpoint against the Hugging Face
# reference model (HF_GOLDEN_MODEL) with the forward-pass logit checker.
# This only runs when USE_MULTIMODAL is exactly "false".
# Expansions are double-quoted so each value is passed as a single argument.
if [[ "${USE_MULTIMODAL}" == "false" ]]; then
  python3 -m tests.utils.forward_pass_logit_checker \
    "load_parameters_path=${UNSCANNED_CKPT_PATH}" \
    "model_name=${MODEL_NAME}" \
    "use_multimodal=${USE_MULTIMODAL}" \
    scan_layers=false \
    "--hf_model_path=${HF_GOLDEN_MODEL}" \
    --max_kl_div=0.03 \
    --run_hf_model=true \
    hardware=cpu skip_jax_distributed_system=True
fi
Loading