diff --git a/scripts/performance/argument_parser.py b/scripts/performance/argument_parser.py
index 79eb80f45a..2914a9f344 100644
--- a/scripts/performance/argument_parser.py
+++ b/scripts/performance/argument_parser.py
@@ -37,6 +37,10 @@
     "r100": 1,
 }
 
+REQUIRED_DOMAIN_BY_MODEL_FAMILY = {
+    "qwen_vl": "qwen3vl",
+}
+
 
 def list_of_strings(arg):
     """Split a comma-separated string into a list of substrings."""
@@ -90,6 +94,15 @@ def bool_arg(arg):
         raise ValueError(f"Invalid value for boolean argument: {arg}")
 
 
+def resolve_domain(domain: str | None, model_family_name: str) -> str:
+    """Resolve the effective domain for a model family."""
+    resolved_domain = domain or "llm"
+    required_domain = REQUIRED_DOMAIN_BY_MODEL_FAMILY.get(model_family_name)
+    if required_domain is not None:
+        return required_domain
+    return resolved_domain
+
+
 def is_cuda_graph_impl_valid(arg):
     """Validate and normalize the CUDA graph implementation argument."""
     if arg in VALID_CUDA_GRAPH_IMPLS:
diff --git a/scripts/performance/configs/qwen_vl/qwen3_vl_pretrain.py b/scripts/performance/configs/qwen_vl/qwen3_vl_pretrain.py
index 567b8dce42..5a8d03f94d 100644
--- a/scripts/performance/configs/qwen_vl/qwen3_vl_pretrain.py
+++ b/scripts/performance/configs/qwen_vl/qwen3_vl_pretrain.py
@@ -56,6 +56,8 @@
 def set_qwen3_vl_common_configs(cfg: ConfigContainer) -> None:
     """Set common performance configurations for all Qwen3-VL configs."""
     cfg.model.bias_activation_fusion = True
+    # Qwen3-VL uses a custom mRoPE path that does not support fused RoPE kernels.
+    cfg.model.apply_rope_fusion = False
     cfg.model.recompute_granularity = None
     cfg.model.recompute_method = None
     cfg.model.recompute_num_layers = None
diff --git a/scripts/performance/run_script.py b/scripts/performance/run_script.py
index 743d87e4e8..fc16f002f5 100644
--- a/scripts/performance/run_script.py
+++ b/scripts/performance/run_script.py
@@ -18,7 +18,7 @@
 import sys
 
 import torch
-from argument_parser import parse_cli_args
+from argument_parser import parse_cli_args, resolve_domain
 from utils.overrides import set_cli_overrides, set_post_overrides, set_user_overrides
 from utils.utils import get_perf_optimized_recipe
 
@@ -67,6 +67,15 @@ def main():
     # `argparse.parse_known_args()` returns the unknown args as a `list[str]`.
     parser = parse_cli_args()
     args, cli_overrides = parser.parse_known_args()
+    resolved_domain = resolve_domain(args.domain, args.model_family_name)
+    if resolved_domain != args.domain:
+        logger.info(
+            "Using domain '%s' for model family '%s' instead of requested/default '%s'.",
+            resolved_domain,
+            args.model_family_name,
+            args.domain,
+        )
+        args.domain = resolved_domain
 
     if args.dump_env:
         _dump_env_rank0()
diff --git a/src/megatron/bridge/models/qwen_vl/qwen3_vl_provider.py b/src/megatron/bridge/models/qwen_vl/qwen3_vl_provider.py
index a8e7ed90ee..5a036f3a1f 100644
--- a/src/megatron/bridge/models/qwen_vl/qwen3_vl_provider.py
+++ b/src/megatron/bridge/models/qwen_vl/qwen3_vl_provider.py
@@ -30,6 +30,7 @@
 
 from megatron.bridge.models.gpt_provider import GPTModelProvider
 from megatron.bridge.models.qwen_vl.modelling_qwen3_vl.model import Qwen3VLModel
+from megatron.bridge.utils import fusions
 
 
 @dataclass
@@ -107,6 +108,9 @@ class Qwen3VLModelProvider(GPTModelProvider):
 
     def provide(self, pre_process=None, post_process=None, vp_stage=None) -> Qwen3VLModel:
         """Provide a Qwen3 VL model instance with vision and language components."""
+        if not fusions.validate_rope_fusion_compatibility(self):
+            self.apply_rope_fusion = False
+
         language_transformer_config = self
 
         hf_vision_config = self.vision_config
@@ -257,6 +261,9 @@ def finalize(self) -> None:
 
     def provide(self, pre_process=None, post_process=None, vp_stage=None) -> Qwen3VLModel:
         """Provide a Qwen3 VL MoE model instance with vision and language components."""
+        if not fusions.validate_rope_fusion_compatibility(self):
+            self.apply_rope_fusion = False
+
         language_transformer_config = self
 
         hf_vision_config = self.vision_config
diff --git a/src/megatron/bridge/recipes/qwen_vl/qwen3_vl.py b/src/megatron/bridge/recipes/qwen_vl/qwen3_vl.py
index 40ae6dd322..02601b965e 100644
--- a/src/megatron/bridge/recipes/qwen_vl/qwen3_vl.py
+++ b/src/megatron/bridge/recipes/qwen_vl/qwen3_vl.py
@@ -203,6 +203,12 @@
     return cfg
 
 
+def _enable_235b_pipeline_split_accounting(model_cfg) -> None:
+    """Account for embedding and loss stages in 235B pipeline splits."""
+    model_cfg.account_for_embedding_in_pipeline_split = True
+    model_cfg.account_for_loss_in_pipeline_split = True
+
+
 # =============================================================================
 # Qwen3-VL Pretrain Configurations (mock dataset)
 # =============================================================================
@@ -251,7 +257,7 @@ def qwen3_vl_235b_a22b_pretrain_mock_config(**user_kwargs: Unpack[Qwen3VLCommonK
     See `_qwen3_vl_common` for the full list of parameters.
""" recommended_kwargs: Qwen3VLCommonKwargs = { - "hf_path": "Qwen/Qwen3-VL-235B-A22B", + "hf_path": "Qwen/Qwen3-VL-235B-A22B-Instruct", "tensor_model_parallel_size": 4, "pipeline_model_parallel_size": 16, "expert_model_parallel_size": 8, @@ -262,7 +268,9 @@ def qwen3_vl_235b_a22b_pretrain_mock_config(**user_kwargs: Unpack[Qwen3VLCommonK "freeze_vision_projection": False, } combined_kwargs: Qwen3VLCommonKwargs = {**recommended_kwargs, **user_kwargs} - return _qwen3_vl_common(**combined_kwargs) + cfg = _qwen3_vl_common(**combined_kwargs) + _enable_235b_pipeline_split_accounting(cfg.model) + return cfg def _make_energon_dataset( @@ -573,7 +581,7 @@ def qwen3_vl_235b_a22b_sft_config() -> ConfigContainer: cfg = _sft_common_vlm() # Model configuration - hf_path = "Qwen/Qwen3-VL-235B-A22B" + hf_path = "Qwen/Qwen3-VL-235B-A22B-Instruct" cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) cfg.model.seq_length = 4096 @@ -585,6 +593,7 @@ def qwen3_vl_235b_a22b_sft_config() -> ConfigContainer: cfg.model.expert_model_parallel_size = 32 cfg.model.context_parallel_size = 1 cfg.model.sequence_parallel = False + _enable_235b_pipeline_split_accounting(cfg.model) # VLM-specific settings cfg.model.freeze_language_model = False @@ -1007,7 +1016,7 @@ def qwen3_vl_235b_a22b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigCo cfg.peft = peft_scheme # Model configuration - hf_path = "Qwen/Qwen3-VL-235B-A22B" + hf_path = "Qwen/Qwen3-VL-235B-A22B-Instruct" cfg.model = AutoBridge.from_hf_pretrained(hf_path).to_megatron_provider(load_weights=False) cfg.model.seq_length = 4096 @@ -1019,6 +1028,7 @@ def qwen3_vl_235b_a22b_peft_config(peft_scheme: str | PEFT = "lora") -> ConfigCo cfg.model.expert_model_parallel_size = 16 cfg.model.context_parallel_size = 1 cfg.model.sequence_parallel = False + _enable_235b_pipeline_split_accounting(cfg.model) # VLM-specific settings cfg.model.freeze_language_model = False diff --git a/tests/unit_tests/models/qwen_vl/test_qwen3_vl_provider.py b/tests/unit_tests/models/qwen_vl/test_qwen3_vl_provider.py new file mode 100644 index 0000000000..c564a94bb8 --- /dev/null +++ b/tests/unit_tests/models/qwen_vl/test_qwen3_vl_provider.py @@ -0,0 +1,61 @@ +from types import SimpleNamespace + +import pytest + +import megatron.bridge.models.qwen_vl.qwen3_vl_provider as qwen3_vl_provider_module +from megatron.bridge.models.qwen_vl.qwen3_vl_provider import ( + Qwen3VLModelProvider, + Qwen3VLMoEModelProvider, +) + + +class _DummyQwen3VLModel: + """Minimal stand-in for Qwen3VLModel used to inspect provider inputs.""" + + def __init__( + self, + *, + language_transformer_config, + language_transformer_layer_spec, + vision_transformer_config, + pre_process, + post_process, + pg_collection, + ): + self.language_transformer_config = language_transformer_config + self.language_transformer_layer_spec = language_transformer_layer_spec + self.vision_transformer_config = vision_transformer_config + self.pre_process = pre_process + self.post_process = post_process + self.pg_collection = pg_collection + self.freeze_calls = [] + + def freeze(self, **kwargs): + """Record freeze calls without touching parameters.""" + self.freeze_calls.append(kwargs) + + +@pytest.mark.parametrize("provider_cls", [Qwen3VLModelProvider, Qwen3VLMoEModelProvider]) +def test_qwen3_vl_provide_disables_incompatible_rope_fusion( + provider_cls, + monkeypatch: pytest.MonkeyPatch, +): + """Qwen3-VL providers should clear fused RoPE for mRoPE models before model build.""" + monkeypatch.setattr( + 
+        qwen3_vl_provider_module,
+        "get_gpt_layer_with_transformer_engine_spec",
+        lambda **kwargs: SimpleNamespace(**kwargs),
+    )
+    monkeypatch.setattr(qwen3_vl_provider_module, "Qwen3VLModel", _DummyQwen3VLModel)
+
+    provider = provider_cls(
+        num_layers=4,
+        hidden_size=256,
+        num_attention_heads=8,
+        apply_rope_fusion=True,
+    )
+
+    model = provider.provide()
+
+    assert provider.apply_rope_fusion is False
+    assert model.language_transformer_config.apply_rope_fusion is False
diff --git a/tests/unit_tests/recipes/qwen_vl/test_qwen3_vl_recipes.py b/tests/unit_tests/recipes/qwen_vl/test_qwen3_vl_recipes.py
index 8b271447a4..7bf3b1461b 100644
--- a/tests/unit_tests/recipes/qwen_vl/test_qwen3_vl_recipes.py
+++ b/tests/unit_tests/recipes/qwen_vl/test_qwen3_vl_recipes.py
@@ -95,6 +95,18 @@ def to_megatron_provider(self, load_weights: bool = False):
         return _FakeModelCfg()
 
 
+class _TrackingAutoBridge(_FakeAutoBridge):
+    """Fake AutoBridge that records the requested HF repo."""
+
+    last_hf_path: str | None = None
+
+    @staticmethod
+    def from_hf_pretrained(hf_path: str):
+        """Record the HF repo used by the recipe under test."""
+        _TrackingAutoBridge.last_hf_path = hf_path
+        return _TrackingAutoBridge()
+
+
 def _assert_basic_config(cfg):
     """Assert that a config has all required components."""
     from megatron.bridge.training.config import ConfigContainer
@@ -294,6 +306,8 @@ def test_qwen3_vl_235b_sft_defaults(monkeypatch: pytest.MonkeyPatch):
 
     # Check expert_model_parallel_size for MoE model
     assert cfg.model.expert_model_parallel_size == 32
+    assert cfg.model.account_for_embedding_in_pipeline_split is True
+    assert cfg.model.account_for_loss_in_pipeline_split is True
 
 
 def test_qwen3_vl_235b_peft_defaults(monkeypatch: pytest.MonkeyPatch):
@@ -312,6 +326,37 @@ def test_qwen3_vl_235b_peft_defaults(monkeypatch: pytest.MonkeyPatch):
 
     # Check PEFT config
     assert cfg.peft is not None
+    assert cfg.model.account_for_embedding_in_pipeline_split is True
+    assert cfg.model.account_for_loss_in_pipeline_split is True
+
+
+@pytest.mark.parametrize(
+    "recipe_func",
+    [
+        _qwen3_vl_module.qwen3_vl_235b_a22b_pretrain_mock_config,
+        _qwen3_vl_module.qwen3_vl_235b_a22b_sft_config,
+        _qwen3_vl_module.qwen3_vl_235b_a22b_peft_config,
+    ],
+)
+def test_qwen3_vl_235b_uses_instruct_repo(recipe_func: Callable, monkeypatch: pytest.MonkeyPatch):
+    """Test that 235B-A22B recipes point at the published Instruct HF repo."""
+    monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _TrackingAutoBridge)
+    _TrackingAutoBridge.last_hf_path = None
+
+    cfg = recipe_func()
+
+    assert _TrackingAutoBridge.last_hf_path == "Qwen/Qwen3-VL-235B-A22B-Instruct"
+    assert cfg.dataset.hf_processor_path == "Qwen/Qwen3-VL-235B-A22B-Instruct"
+
+
+def test_qwen3_vl_235b_a22b_pretrain_mock_uses_pipeline_split_accounting(monkeypatch: pytest.MonkeyPatch):
+    """Test that 235B-A22B pretrain enables embedding/loss pipeline accounting."""
+    monkeypatch.setattr(_qwen3_vl_module, "AutoBridge", _FakeAutoBridge)
+
+    cfg = _qwen3_vl_module.qwen3_vl_235b_a22b_pretrain_mock_config()
+
+    assert cfg.model.account_for_embedding_in_pipeline_split is True
+    assert cfg.model.account_for_loss_in_pipeline_split is True
 
 
 def test_qwen3_vl_sft_has_hf_dataset_provider(monkeypatch: pytest.MonkeyPatch):
diff --git a/tests/unit_tests/scripts/test_performance_offline_mode.py b/tests/unit_tests/scripts/test_performance_offline_mode.py
index 79a781ca60..9d5fa5d80c 100644
--- a/tests/unit_tests/scripts/test_performance_offline_mode.py
+++ b/tests/unit_tests/scripts/test_performance_offline_mode.py
@@ -115,6 +115,21 @@ def test_argparse_rejects_hf_token_with_offline(import_performance_module):
     )
 
 
+def test_resolve_domain_forces_qwen3vl_for_qwen_vl(import_performance_module):
+    """Qwen3-VL perf launches should always use the dedicated qwen3vl domain."""
+    argument_parser = import_performance_module("scripts.performance.argument_parser")
+
+    assert argument_parser.resolve_domain("llm", "qwen_vl") == "qwen3vl"
+    assert argument_parser.resolve_domain("vlm", "qwen_vl") == "qwen3vl"
+
+
+def test_resolve_domain_keeps_llm_for_llama(import_performance_module):
+    """Non-VLM model families should keep their requested/default domain."""
+    argument_parser = import_performance_module("scripts.performance.argument_parser")
+
+    assert argument_parser.resolve_domain("llm", "llama") == "llm"
+
+
 def test_slurm_executor_sets_offline_env_and_container_writable(import_performance_module):
     """Offline mode should set HF_HUB_OFFLINE and preserve the offline Transformers default."""
     executors = import_performance_module("scripts.performance.utils.executors")
diff --git a/tests/unit_tests/scripts/test_qwen3_vl_performance_config.py b/tests/unit_tests/scripts/test_qwen3_vl_performance_config.py
new file mode 100644
index 0000000000..00bd45e8a1
--- /dev/null
+++ b/tests/unit_tests/scripts/test_qwen3_vl_performance_config.py
@@ -0,0 +1,61 @@
+import importlib
+import sys
+from pathlib import Path
+
+
+SCRIPTS_PERF_PATH = Path(__file__).parents[3] / "scripts" / "performance"
+if str(SCRIPTS_PERF_PATH) not in sys.path:
+    sys.path.insert(0, str(SCRIPTS_PERF_PATH))
+
+
+class _FakeModelCfg:
+    """Minimal fake model provider for Qwen3-VL perf config tests."""
+
+    def __init__(self):
+        self.tensor_model_parallel_size = 1
+        self.pipeline_model_parallel_size = 1
+        self.pipeline_dtype = None
+        self.virtual_pipeline_model_parallel_size = None
+        self.context_parallel_size = 1
+        self.expert_model_parallel_size = 1
+        self.expert_tensor_parallel_size = 1
+        self.sequence_parallel = False
+        self.seq_length = 64
+        self.freeze_language_model = False
+        self.freeze_vision_model = False
+        self.freeze_vision_projection = False
+        self.moe_token_dispatcher_type = None
+        self.moe_flex_dispatcher_backend = None
+        self.moe_hybridep_num_sms = None
+        self.moe_router_fusion = False
+        self.moe_permute_fusion = False
+        self.moe_grouped_gemm = False
+        self.moe_router_padding_for_fp8 = False
+        self.moe_shared_expert_overlap = False
+        self.moe_router_force_load_balancing = False
+        self.apply_rope_fusion = False
+
+    def finalize(self):
+        return None
+
+
+class _FakeAutoBridge:
+    """Fake AutoBridge used to avoid HF/network access in perf config tests."""
+
+    @staticmethod
+    def from_hf_pretrained(_hf_path: str):
+        return _FakeAutoBridge()
+
+    def to_megatron_provider(self, load_weights: bool = False):
+        return _FakeModelCfg()
+
+
+def test_qwen3_vl_235b_perf_config_disables_rope_fusion(monkeypatch):
+    """Qwen3-VL perf configs should not re-enable unsupported fused RoPE."""
+    qwen3_vl_module = importlib.import_module("megatron.bridge.recipes.qwen_vl.qwen3_vl")
+    monkeypatch.setattr(qwen3_vl_module, "AutoBridge", _FakeAutoBridge)
+
+    qwen3_vl_perf_module = importlib.import_module("configs.qwen_vl.qwen3_vl_pretrain")
+    cfg = qwen3_vl_perf_module.qwen3_vl_235b_a22b_pretrain_config_h100(precision="bf16", mock=True)
+
+    assert cfg.model.apply_rope_fusion is False