diff --git a/tests/end_to_end/tpu/qwen3/30b/test_qwen3.sh b/tests/end_to_end/tpu/qwen3/30b/test_qwen3.sh
new file mode 100755
index 0000000000..4fea78b206
--- /dev/null
+++ b/tests/end_to_end/tpu/qwen3/30b/test_qwen3.sh
@@ -0,0 +1,87 @@
+#!/bin/bash
+
+# Validates the Qwen3-30B pre-training pipeline using a pre-converted MaxText checkpoint.
+
+# The flow of this script is as follows:
+# 1. Run inference on the pre-converted checkpoint.
+# 2. Run pre-training starting from the pre-converted checkpoint.
+# 3. Run inference on the checkpoint produced by the pre-training run.
+
+# Usage:
+#   export HF_TOKEN=...   (your Hugging Face access token)
+#   export RUN_ID=$(date +%Y-%m-%d-%H-%M-%S)
+#   bash test_qwen3_to_mt.sh $RUN_ID
+#   bash test_qwen3.sh $RUN_ID
+#
+# Arguments:
+#   $1 - run id; the checkpoint paths below are derived from it, so it must match
+#        the id passed to test_qwen3_to_mt.sh. Defaults to the current timestamp.
+
+set -ex
+
+run_id=${1:-$(date +%Y-%m-%d-%H-%M-%S)}
+MODEL_NAME='qwen3-30b-a3b-base'
+
+USE_MULTIMODAL=false
+
+BASE_OUTPUT_DIRECTORY=gs://runner-maxtext-logs/${MODEL_NAME}
+UNSCANNED_CKPT_PATH=${BASE_OUTPUT_DIRECTORY}/to_maxtext/unscanned/${run_id}/0/items
+SCANNED_CKPT_PATH=${BASE_OUTPUT_DIRECTORY}/to_maxtext/scanned/${run_id}/0/items
+export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+
+DATASET_PATH=gs://maxtext-dataset
+
+# Step 1: decode from the unscanned pre-converted checkpoint.
+if [ "${USE_MULTIMODAL}" = "true" ]; then
+  # NOTE(review): no image placeholder token follows the prompt text; confirm
+  # whether this model's multimodal decode path needs one.
+  python3 -m maxtext.inference.decode \
+    model_name="${MODEL_NAME}" tokenizer_type="huggingface" \
+    load_parameters_path="${UNSCANNED_CKPT_PATH}" \
+    per_device_batch_size=1 run_name="${run_id}" \
+    max_prefill_predict_length=272 max_target_length=300 steps=1 async_checkpointing=false \
+    scan_layers=false use_multimodal=true \
+    checkpoint_storage_use_zarr3=False checkpoint_storage_use_ocdbt=False \
+    prompt=\'Describe\ image\' image_path=\'tests/assets/test_image.jpg\' attention=\'dot_product\'
+else
+  python3 -m maxtext.inference.decode \
+    model_name="${MODEL_NAME}" tokenizer_type="huggingface" \
+    load_parameters_path="${UNSCANNED_CKPT_PATH}" \
+    per_device_batch_size=1 run_name="${run_id}" \
+    max_prefill_predict_length=8 max_target_length=16 steps=1 async_checkpointing=false \
+    checkpoint_storage_use_zarr3=False checkpoint_storage_use_ocdbt=False \
+    scan_layers=false prompt='I love to' attention=\'dot_product\'
+fi
+
+# Step 2: pre-train for 5 steps starting from the scanned checkpoint.
+python3 -m maxtext.trainers.pre_train.train \
+  base_output_directory="${BASE_OUTPUT_DIRECTORY}/train" \
+  dataset_path="${DATASET_PATH}" tokenizer_type="huggingface" \
+  load_parameters_path="${SCANNED_CKPT_PATH}" \
+  per_device_batch_size=0.125 run_name="${run_id}" \
+  max_target_length=64 steps=5 async_checkpointing=false \
+  checkpoint_storage_use_zarr3=False checkpoint_storage_use_ocdbt=False \
+  model_name="${MODEL_NAME}" scan_layers=true use_multimodal="${USE_MULTIMODAL}" \
+  remat_policy=full \
+  ici_tensor_parallelism=4 ici_fsdp_parallelism=2 weight_dtype=bfloat16 dtype=bfloat16 opt_type=sgd optimizer_memory_host_offload=true
+
+# Step 3: decode from the checkpoint the training run is expected to leave at step 4.
+if [ "${USE_MULTIMODAL}" = "true" ]; then
+  # NOTE(review): same open question about the image placeholder token as in step 1.
+  python3 -m maxtext.inference.decode \
+    model_name="${MODEL_NAME}" tokenizer_type="huggingface" \
+    load_parameters_path="${BASE_OUTPUT_DIRECTORY}/train/${run_id}/checkpoints/4/items" \
+    per_device_batch_size=1 run_name="${run_id}" \
+    max_prefill_predict_length=272 max_target_length=300 steps=4 async_checkpointing=false \
+    scan_layers=true use_multimodal=true \
+    checkpoint_storage_use_zarr3=False checkpoint_storage_use_ocdbt=False \
+    prompt=\'Describe\ image\' image_path=\'tests/assets/test_image.jpg\' attention=\'dot_product\'
+else
+  python3 -m maxtext.inference.decode \
+    model_name="${MODEL_NAME}" tokenizer_type="huggingface" \
+    load_parameters_path="${BASE_OUTPUT_DIRECTORY}/train/${run_id}/checkpoints/4/items" \
+    per_device_batch_size=1 run_name="${run_id}" \
+    max_prefill_predict_length=8 max_target_length=16 steps=4 async_checkpointing=false \
+    checkpoint_storage_use_zarr3=False checkpoint_storage_use_ocdbt=False \
+    scan_layers=true prompt='I love to' attention=\'dot_product\'
+fi
diff --git a/tests/end_to_end/tpu/qwen3/30b/test_qwen3_rl.sh b/tests/end_to_end/tpu/qwen3/30b/test_qwen3_rl.sh
new file mode 100755
index 0000000000..6f5571927e
--- /dev/null
+++ b/tests/end_to_end/tpu/qwen3/30b/test_qwen3_rl.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+# Validates the Qwen3 RL pipeline using a pre-converted MaxText checkpoint.
+
+# The flow of this script is as follows:
+# 1. Run inference on the pre-converted checkpoint.
+# 2. Run RL starting from the pre-converted checkpoint.
+# 3. Run inference on the checkpoint produced by the RL run.
+
+# Usage:
+#   export HF_TOKEN=...   (your Hugging Face access token)
+#   export RUN_ID=$(date +%Y-%m-%d-%H-%M-%S)
+#   bash test_qwen3_to_mt.sh $RUN_ID
+#   bash test_qwen3_rl.sh $RUN_ID [use_pathways]
+#
+# Arguments:
+#   $1 - run id; must match the id passed to test_qwen3_to_mt.sh (default: current timestamp).
+#   $2 - value forwarded as enable_single_controller to every stage (default: false).
+
+set -ex
+export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+export NEW_MODEL_DESIGN=1
+run_id=${1:-$(date +%Y-%m-%d-%H-%M-%S)}
+use_pathways=${2:-false}
+MODEL_NAME='qwen3-30b-a3b-base'
+
+BASE_OUTPUT_DIRECTORY=gs://runner-maxtext-logs/${MODEL_NAME}
+UNSCANNED_CKPT_PATH=${BASE_OUTPUT_DIRECTORY}/to_maxtext/unscanned/${run_id}/0/items
+SCANNED_CKPT_PATH=${BASE_OUTPUT_DIRECTORY}/to_maxtext/scanned/${run_id}/0/items
+
+# Step 1: vLLM decode from the unscanned pre-converted checkpoint.
+python3 -m maxtext.inference.vllm_decode \
+  model_name="${MODEL_NAME}" \
+  load_parameters_path="${UNSCANNED_CKPT_PATH}" \
+  vllm_hf_overrides='{architectures: ["MaxTextForCausalLM"]}' \
+  hbm_utilization_vllm=0.85 \
+  use_chat_template=True scan_layers=false enable_single_controller="${use_pathways}" \
+  ici_tensor_parallelism=4
+
+# Step 2: GRPO training for 5 batches starting from the scanned checkpoint.
+python3 -m maxtext.trainers.post_train.rl.train_rl \
+  base_output_directory="${BASE_OUTPUT_DIRECTORY}/rl" \
+  load_parameters_path="${SCANNED_CKPT_PATH}" \
+  run_name="${run_id}" rl.loss_algo='grpo' scan_layers=true \
+  num_batches=5 batch_size=1 num_test_batches=5 \
+  model_name="${MODEL_NAME}" enable_single_controller="${use_pathways}" \
+  checkpoint_storage_use_zarr3=False checkpoint_storage_use_ocdbt=False \
+  rollout_tensor_parallelism=4 \
+  vllm_hf_overrides='{architectures: ["MaxTextForCausalLM"]}' \
+  remat_policy=full \
+  ici_tensor_parallelism=4 ici_fsdp_parallelism=1 ici_expert_parallelism=2 max_target_length=64 weight_dtype=bfloat16 dtype=bfloat16 opt_type=sgd
+
+# Step 3: vLLM decode from the actor checkpoint the RL run is expected to leave at step 5.
+python3 -m maxtext.inference.vllm_decode \
+  model_name="${MODEL_NAME}" \
+  load_parameters_path="${BASE_OUTPUT_DIRECTORY}/rl/${run_id}/checkpoints/actor/5/model_params" \
+  vllm_hf_overrides='{architectures: ["MaxTextForCausalLM"]}' \
+  hbm_utilization_vllm=0.85 \
+  use_chat_template=True scan_layers=true enable_single_controller="${use_pathways}" \
+  ici_tensor_parallelism=4
diff --git a/tests/end_to_end/tpu/qwen3/30b/test_qwen3_sft.sh b/tests/end_to_end/tpu/qwen3/30b/test_qwen3_sft.sh
new file mode 100755
index 0000000000..a8b5cbc3c2
--- /dev/null
+++ b/tests/end_to_end/tpu/qwen3/30b/test_qwen3_sft.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+# Validates the Qwen3 SFT pipeline using a pre-converted MaxText checkpoint.
+
+# The flow of this script is as follows:
+# 1. Run inference on the pre-converted checkpoint.
+# 2. Run SFT starting from the pre-converted checkpoint.
+# 3. Run inference on the checkpoint produced by the SFT run.
+
+# Usage:
+#   export HF_TOKEN=...   (your Hugging Face access token)
+#   export RUN_ID=$(date +%Y-%m-%d-%H-%M-%S)
+#   bash test_qwen3_to_mt.sh $RUN_ID
+#   bash test_qwen3_sft.sh $RUN_ID [use_pathways]
+#
+# Arguments:
+#   $1 - run id; must match the id passed to test_qwen3_to_mt.sh (default: current timestamp).
+#   $2 - value forwarded as enable_single_controller to every stage (default: false).
+
+set -ex
+export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+export NEW_MODEL_DESIGN=1
+run_id=${1:-$(date +%Y-%m-%d-%H-%M-%S)}
+use_pathways=${2:-false}
+MODEL_NAME='qwen3-30b-a3b-base'
+
+BASE_OUTPUT_DIRECTORY=gs://runner-maxtext-logs/${MODEL_NAME}
+UNSCANNED_CKPT_PATH=${BASE_OUTPUT_DIRECTORY}/to_maxtext/unscanned/${run_id}/0/items
+SCANNED_CKPT_PATH=${BASE_OUTPUT_DIRECTORY}/to_maxtext/scanned/${run_id}/0/items
+
+# Step 1: vLLM decode from the unscanned pre-converted checkpoint.
+python3 -m maxtext.inference.vllm_decode \
+  model_name="${MODEL_NAME}" \
+  load_parameters_path="${UNSCANNED_CKPT_PATH}" \
+  vllm_hf_overrides='{architectures: ["MaxTextForCausalLM"]}' \
+  hbm_utilization_vllm=0.85 \
+  use_chat_template=True scan_layers=false enable_single_controller="${use_pathways}" \
+  ici_tensor_parallelism=4
+
+# Step 2: SFT for 5 steps starting from the scanned checkpoint.
+python3 -m maxtext.trainers.post_train.sft.train_sft_native \
+  base_output_directory="${BASE_OUTPUT_DIRECTORY}/sft" \
+  load_parameters_path="${SCANNED_CKPT_PATH}" \
+  per_device_batch_size=0.125 run_name="${run_id}" \
+  steps=5 scan_layers=true \
+  model_name="${MODEL_NAME}" enable_single_controller="${use_pathways}" \
+  checkpoint_storage_use_zarr3=False checkpoint_storage_use_ocdbt=False \
+  remat_policy=full \
+  ici_tensor_parallelism=4 ici_fsdp_parallelism=1 ici_expert_parallelism=2 max_target_length=64 weight_dtype=bfloat16 dtype=bfloat16 opt_type=sgd optimizer_memory_host_offload=true
+
+# Step 3: vLLM decode from the checkpoint the SFT run is expected to leave at step 5.
+python3 -m maxtext.inference.vllm_decode \
+  model_name="${MODEL_NAME}" \
+  load_parameters_path="${BASE_OUTPUT_DIRECTORY}/sft/${run_id}/checkpoints/5/model_params" \
+  vllm_hf_overrides='{architectures: ["MaxTextForCausalLM"]}' \
+  hbm_utilization_vllm=0.85 \
+  use_chat_template=True scan_layers=true enable_single_controller="${use_pathways}" \
+  ici_tensor_parallelism=4
diff --git a/tests/end_to_end/tpu/qwen3/30b/test_qwen3_to_hf.sh b/tests/end_to_end/tpu/qwen3/30b/test_qwen3_to_hf.sh
new file mode 100755
index 0000000000..f3d446006b
--- /dev/null
+++ b/tests/end_to_end/tpu/qwen3/30b/test_qwen3_to_hf.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+# Converts a MaxText checkpoint to a Hugging Face model checkpoint.
+
+# Usage:
+#   export RUN_ID=$(date +%Y-%m-%d-%H-%M-%S)
+#   bash test_qwen3_to_hf.sh $RUN_ID $CHECKPOINT_PATH $USE_MULTIMODAL $SCAN_LAYERS
+#
+# Arguments:
+#   $1 - run id, used as the last component of the output directory (required).
+#   $2 - path of the MaxText checkpoint to convert (required).
+#   $3 - value passed as use_multimodal (default: false).
+#   $4 - value passed as scan_layers; "true" (any case) writes under scanned/,
+#        anything else under unscanned/ (default: false).
+
+set -ex
+
+# Abort with a usage message instead of converting from an empty path or writing into a shared directory.
+run_id=${1:?"usage: test_qwen3_to_hf.sh RUN_ID CHECKPOINT_PATH [USE_MULTIMODAL] [SCAN_LAYERS]"}
+CKPT_PATH=${2:?"usage: test_qwen3_to_hf.sh RUN_ID CHECKPOINT_PATH [USE_MULTIMODAL] [SCAN_LAYERS]"}
+USE_MULTIMODAL=${3:-false}
+SCAN_LAYERS=${4:-false}
+
+MODEL_NAME='qwen3-30b-a3b-base'
+BASE_OUTPUT_DIRECTORY="gs://runner-maxtext-logs/${MODEL_NAME}"
+
+# Case-insensitive match so that both "true" and "True" select the scanned output directory.
+if [ "${SCAN_LAYERS,,}" = "true" ]; then
+  scan_status="scanned"
+else
+  scan_status="unscanned"
+fi
+
+python3 -m maxtext.checkpoint_conversion.to_huggingface \
+  model_name="${MODEL_NAME}" \
+  tokenizer_type="huggingface" \
+  load_parameters_path="${CKPT_PATH}" \
+  base_output_directory="${BASE_OUTPUT_DIRECTORY}/to_huggingface/${scan_status}/${run_id}" \
+  use_multimodal="${USE_MULTIMODAL}" \
+  scan_layers="${SCAN_LAYERS}"
diff --git a/tests/end_to_end/tpu/qwen3/30b/test_qwen3_to_mt.sh b/tests/end_to_end/tpu/qwen3/30b/test_qwen3_to_mt.sh
new file mode 100755
index 0000000000..105cca3af1
--- /dev/null
+++ b/tests/end_to_end/tpu/qwen3/30b/test_qwen3_to_mt.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+
+# Converts Qwen3-30B HuggingFace checkpoint to MaxText format and validates logit correctness.
+
+# The flow of this script is as follows:
+# 1. Install PyTorch (CPU) required for checkpoint conversion.
+# 2. Convert the HuggingFace checkpoint to MaxText format in both unscanned and scanned formats.
+# 3. Run a forward pass logits check to verify the converted checkpoint matches the original HF model (skipped for multimodal runs).
+
+# Usage:
+#   export HF_TOKEN=...   (your Hugging Face access token)
+#   export RUN_ID=$(date +%Y-%m-%d-%H-%M-%S)
+#   bash test_qwen3_to_mt.sh $RUN_ID - to convert the checkpoint and run logit check for non-multimodal version
+#   bash test_qwen3_to_mt.sh $RUN_ID true - to convert the checkpoint for multimodal version (the logit check is skipped)
+
+set -ex
+
+run_id=${1:-$(date +%Y-%m-%d-%H-%M-%S)}
+MODEL_NAME='qwen3-30b-a3b-base'
+HF_GOLDEN_MODEL='Qwen/Qwen3-30B-A3B-Base'
+
+USE_MULTIMODAL=${2:-false}
+
+BASE_OUTPUT_DIRECTORY=gs://runner-maxtext-logs/${MODEL_NAME}/to_maxtext
+
+# Step 1: Install torch
+python3 -m pip install torch --index-url https://download.pytorch.org/whl/cpu
+
+# Step 2: Convert the checkpoint from Hugging Face (unscanned layout first)
+python3 -m maxtext.checkpoint_conversion.to_maxtext \
+  model_name="${MODEL_NAME}" \
+  base_output_directory="${BASE_OUTPUT_DIRECTORY}/unscanned/${run_id}" \
+  use_multimodal="${USE_MULTIMODAL}" \
+  scan_layers=false \
+  hardware=cpu skip_jax_distributed_system=True \
+  checkpoint_storage_use_zarr3=False checkpoint_storage_use_ocdbt=False \
+  --lazy_load_tensors=False \
+  --eager_load_method='safetensors'
+
+UNSCANNED_CKPT_PATH=${BASE_OUTPUT_DIRECTORY}/unscanned/${run_id}/0/items
+echo "Unscanned checkpoint path: ${UNSCANNED_CKPT_PATH}"
+
+# Same conversion again with scan_layers=true, written to the scanned/ directory.
+python3 -m maxtext.checkpoint_conversion.to_maxtext \
+  model_name="${MODEL_NAME}" \
+  base_output_directory="${BASE_OUTPUT_DIRECTORY}/scanned/${run_id}" \
+  use_multimodal="${USE_MULTIMODAL}" \
+  scan_layers=true \
+  hardware=cpu skip_jax_distributed_system=True \
+  checkpoint_storage_use_zarr3=False checkpoint_storage_use_ocdbt=False \
+  --lazy_load_tensors=False \
+  --eager_load_method='safetensors'
+
+SCANNED_CKPT_PATH=${BASE_OUTPUT_DIRECTORY}/scanned/${run_id}/0/items
+echo "Scanned checkpoint path: ${SCANNED_CKPT_PATH}"
+
+# Step 3: Compare logits of the unscanned checkpoint against the HF model; runs only when multimodal is "false".
+if [ "${USE_MULTIMODAL}" = "false" ]; then
+  python3 -m tests.utils.forward_pass_logit_checker \
+    load_parameters_path="${UNSCANNED_CKPT_PATH}" \
+    model_name="${MODEL_NAME}" \
+    use_multimodal="${USE_MULTIMODAL}" \
+    scan_layers=false \
+    --hf_model_path="${HF_GOLDEN_MODEL}" \
+    --max_kl_div=0.03 \
+    --run_hf_model=true \
+    hardware=cpu skip_jax_distributed_system=True
+fi