diff --git a/examples/eagle/convert_checkpoint.py b/examples/eagle/convert_checkpoint.py index 217144e1ae5..130faee4453 100644 --- a/examples/eagle/convert_checkpoint.py +++ b/examples/eagle/convert_checkpoint.py @@ -9,6 +9,7 @@ import tensorrt_llm from tensorrt_llm._deprecation import emit_engine_arch_deprecation +from tensorrt_llm._utils import get_hf_rope_theta from tensorrt_llm.mapping import Mapping from tensorrt_llm.models.eagle.config import EagleConfig from tensorrt_llm.models.eagle.model import EagleForCausalLM @@ -293,7 +294,7 @@ def copy(tensors): args.rms_norm_eps = hf_config.rms_norm_eps args.vocab_size = hf_config.vocab_size args.rotary_scaling = hf_config.rope_scaling - args.rotary_base = hf_config.rope_theta + args.rotary_base = get_hf_rope_theta(hf_config, 10000.0) args.n_positions = hf_config.max_position_embeddings args.dtype = str( hf_config.torch_dtype)[6:] if args.dtype == 'auto' else args.dtype diff --git a/examples/medusa/convert_checkpoint.py b/examples/medusa/convert_checkpoint.py index 48dcc6fd400..09eb55b4610 100644 --- a/examples/medusa/convert_checkpoint.py +++ b/examples/medusa/convert_checkpoint.py @@ -13,7 +13,7 @@ import tensorrt_llm from tensorrt_llm._deprecation import emit_engine_arch_deprecation -from tensorrt_llm._utils import numpy_to_torch +from tensorrt_llm._utils import get_hf_rope_theta, numpy_to_torch from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping from tensorrt_llm.models import (LLaMAForCausalLM, PretrainedConfig, @@ -209,7 +209,7 @@ def main(): args.rms_norm_eps = hf_config.rms_norm_eps args.vocab_size = hf_config.vocab_size args.n_positions = hf_config.max_position_embeddings - args.rotary_base = hf_config.rope_theta + args.rotary_base = get_hf_rope_theta(hf_config, 10000.0) args.rotary_scaling = hf_config.rope_scaling elif args.meta_ckpt_dir is not None: diff --git a/examples/models/contrib/dbrx/convert_checkpoint.py b/examples/models/contrib/dbrx/convert_checkpoint.py index 
ad487a50c76..1ca287f2588 100644 --- a/examples/models/contrib/dbrx/convert_checkpoint.py +++ b/examples/models/contrib/dbrx/convert_checkpoint.py @@ -18,7 +18,7 @@ import tensorrt_llm from tensorrt_llm._deprecation import emit_engine_arch_deprecation -from tensorrt_llm._utils import release_gc +from tensorrt_llm._utils import get_hf_rope_theta, release_gc from tensorrt_llm.layers import MoeConfig from tensorrt_llm.mapping import Mapping from tensorrt_llm.models.convert_utils import (generate_int8, @@ -557,7 +557,7 @@ def execute(workers, func, hf_model): args.moe_top_k = 1 args.clip_qkv = hf_config.attn_config.clip_qkv args.hidden_act = 'swiglu' - args.rotary_base = hf_config.attn_config.rope_theta + args.rotary_base = get_hf_rope_theta(hf_config.attn_config, 10000.0) args.moe_config = MoeConfig( num_experts=args.moe_num_experts, top_k=args.moe_top_k, diff --git a/examples/models/core/internlm2/convert_checkpoint.py b/examples/models/core/internlm2/convert_checkpoint.py index 151a1afe85c..44c80d6d51f 100644 --- a/examples/models/core/internlm2/convert_checkpoint.py +++ b/examples/models/core/internlm2/convert_checkpoint.py @@ -14,7 +14,7 @@ import tensorrt_llm from tensorrt_llm._deprecation import emit_engine_arch_deprecation -from tensorrt_llm._utils import release_gc +from tensorrt_llm._utils import get_hf_rope_theta, release_gc from tensorrt_llm.mapping import Mapping from tensorrt_llm.models.llama import convert @@ -480,7 +480,7 @@ def convert_from_hf(hf_model, 'norm_epsilon': hf_config.rms_norm_eps, 'vocab_size': hf_config.vocab_size, 'position_embedding_type': 'rope_gpt_neox', - 'rotary_base': hf_config.rope_theta, + 'rotary_base': get_hf_rope_theta(hf_config, 10000.0), 'max_position_embeddings': hf_config.max_position_embeddings, 'hidden_act': hf_config.hidden_act, 'use_parallel_embedding': args.use_parallel_embedding, diff --git a/requirements.txt b/requirements.txt index b76e28208bd..bfe364fde97 100644 --- a/requirements.txt +++ b/requirements.txt @@ -30,7 
+30,7 @@ nvidia-modelopt[torch]~=0.37.0 # torch 2.10.0+cu130 depends on nvidia-nccl-cu13==2.28.9 nvidia-nccl-cu13>=2.28.9,<=2.29.2 nvidia-cuda-nvrtc -transformers==4.57.3 +transformers==5.3.0 prometheus_client prometheus_fastapi_instrumentator pydantic>=2.9.1 diff --git a/tensorrt_llm/_torch/attention_backend/interface.py b/tensorrt_llm/_torch/attention_backend/interface.py index 600f655bc51..f86b550482f 100644 --- a/tensorrt_llm/_torch/attention_backend/interface.py +++ b/tensorrt_llm/_torch/attention_backend/interface.py @@ -14,7 +14,7 @@ from ..speculative.interface import SpecMetadata from ..speculative.spec_tree_manager import SpecTreeManager -from tensorrt_llm._utils import maybe_pin_memory +from tensorrt_llm._utils import get_hf_rope_theta, maybe_pin_memory from tensorrt_llm.functional import (PositionEmbeddingType, RopeEmbeddingUtils, RotaryScalingType) from tensorrt_llm.mapping import Mapping @@ -498,7 +498,7 @@ def from_config(config) -> "RopeParams": head_dim = hidden_size // num_attention_heads rope_scaling = getattr(config, 'rope_scaling', None) rope_params.max_positions = config.max_position_embeddings - rope_params.theta = getattr(config, 'rope_theta', 10000.0) + rope_params.theta = get_hf_rope_theta(config, 10000.0) rope_percentage = (getattr(config, 'rotary_pct', None) or getattr(config, 'partial_rotary_factor', None) or 1.0) diff --git a/tensorrt_llm/_torch/auto_deploy/export/library/transformers_sdpa_mask.py b/tensorrt_llm/_torch/auto_deploy/export/library/transformers_sdpa_mask.py index fd21604d1b6..d6272be573d 100644 --- a/tensorrt_llm/_torch/auto_deploy/export/library/transformers_sdpa_mask.py +++ b/tensorrt_llm/_torch/auto_deploy/export/library/transformers_sdpa_mask.py @@ -1,6 +1,7 @@ """Patch for transformers SDPA mask to be export-compatible.""" import importlib.metadata +from functools import partial from packaging import version @@ -29,7 +30,14 @@ def _apply_patch(self): try: # imports only after version check from transformers import 
masking_utils - from transformers.integrations.executorch import sdpa_mask_without_vmap + + # Up to ~4.53+, HF exposed this helper next to ExecuTorch export utilities. + # Transformers 5.x removed it; sdpa_mask now supports use_vmap=False (the default), + # which is export-compatible without vmap. + try: + from transformers.integrations.executorch import sdpa_mask_without_vmap + except ImportError: + sdpa_mask_without_vmap = partial(masking_utils.sdpa_mask, use_vmap=False) # recall original implementation self.original_values["masking_utils.sdpa_mask"] = masking_utils.sdpa_mask diff --git a/tensorrt_llm/_torch/auto_deploy/models/custom/modeling_qwen3_5_moe.py b/tensorrt_llm/_torch/auto_deploy/models/custom/modeling_qwen3_5_moe.py index e227bc7ebec..71c11814379 100644 --- a/tensorrt_llm/_torch/auto_deploy/models/custom/modeling_qwen3_5_moe.py +++ b/tensorrt_llm/_torch/auto_deploy/models/custom/modeling_qwen3_5_moe.py @@ -2854,8 +2854,8 @@ def init_input_processor(self, base): # Registration # ============================================================================= -AutoConfig.register("qwen3_5_moe", Qwen3_5MoeConfig) -AutoConfig.register("qwen3_5_moe_text", Qwen3_5MoeTextConfig) +AutoConfig.register("qwen3_5_moe", Qwen3_5MoeConfig, exist_ok=True) +AutoConfig.register("qwen3_5_moe_text", Qwen3_5MoeTextConfig, exist_ok=True) AutoModelForCausalLMFactory.register_custom_model_cls("Qwen3_5MoeTextConfig", Qwen3_5MoeForCausalLM) Qwen3_5MoeFactory.register_custom_model_cls("Qwen3_5MoeConfig", Qwen3_5MoeForConditionalGeneration) diff --git a/tensorrt_llm/_torch/auto_deploy/models/patches/bamba.py b/tensorrt_llm/_torch/auto_deploy/models/patches/bamba.py index 47d7eacd47a..779622a2a0b 100644 --- a/tensorrt_llm/_torch/auto_deploy/models/patches/bamba.py +++ b/tensorrt_llm/_torch/auto_deploy/models/patches/bamba.py @@ -205,20 +205,25 @@ class BambaModelPatch(BaseExportPatch): def _apply_patch(self): self.original_values["BambaMixer.torch_forward"] = 
BambaMixer.torch_forward self.original_values["BambaModel._update_mamba_mask"] = BambaModel._update_mamba_mask - self.original_values["BambaModel._update_causal_mask"] = BambaModel._update_causal_mask + # Older transformers expose both; newer releases dropped `_update_causal_mask` on `BambaModel` + # (mask handling consolidated under `_update_mamba_mask`). + if hasattr(BambaModel, "_update_causal_mask"): + self.original_values["BambaModel._update_causal_mask"] = BambaModel._update_causal_mask # NOTE: there is `HybridMambaAttentionDynamicCache.__bool__` to save. # self.original_values["BambaPreTrainedModel._init_weights"] = BambaPreTrainedModel._init_weights BambaMixer.torch_forward = _bamba_mixer_torch_forward BambaModel._update_mamba_mask = _bamba_model_update_mamba_mask - BambaModel._update_causal_mask = _bamba_model_update_causal_mask + if hasattr(BambaModel, "_update_causal_mask"): + BambaModel._update_causal_mask = _bamba_model_update_causal_mask HybridMambaAttentionDynamicCache.__bool__ = _cache_bool # BambaPreTrainedModel._init_weights = _bamba_pretrained_model_init_weights def _revert_patch(self): BambaMixer.torch_forward = self.original_values["BambaMixer.torch_forward"] BambaModel._update_mamba_mask = self.original_values["BambaModel._update_mamba_mask"] - BambaModel._update_causal_mask = self.original_values["BambaModel._update_causal_mask"] + if "BambaModel._update_causal_mask" in self.original_values: + BambaModel._update_causal_mask = self.original_values["BambaModel._update_causal_mask"] del HybridMambaAttentionDynamicCache.__bool__ # BambaPreTrainedModel._init_weights = self.original_values[ # "BambaPreTrainedModel._init_weights" diff --git a/tensorrt_llm/_torch/models/hf_parameter_utils.py b/tensorrt_llm/_torch/models/hf_parameter_utils.py new file mode 100644 index 00000000000..8a163c4f917 --- /dev/null +++ b/tensorrt_llm/_torch/models/hf_parameter_utils.py @@ -0,0 +1,35 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & 
AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Compatibility for Hugging Face ``get_parameter_device`` / ``get_parameter_dtype``. + +Transformers v5 no longer exports these from ``transformers.modeling_utils``; they +match ``ModuleUtilsMixin`` behavior for plain ``nn.Module`` stacks. +""" + +from __future__ import annotations + +import torch +import torch.nn as nn + +try: + from transformers.modeling_utils import get_parameter_device, get_parameter_dtype +except ImportError: + + def get_parameter_device(module: nn.Module) -> torch.device: + return next(module.parameters()).device + + def get_parameter_dtype(module: nn.Module) -> torch.dtype: + return next((param.dtype for param in module.parameters() if param.is_floating_point()), next(module.parameters()).dtype) diff --git a/tensorrt_llm/_torch/models/modeling_clip.py b/tensorrt_llm/_torch/models/modeling_clip.py index 1e203eda8b7..9e73dcc2dd3 100644 --- a/tensorrt_llm/_torch/models/modeling_clip.py +++ b/tensorrt_llm/_torch/models/modeling_clip.py @@ -4,8 +4,6 @@ import torch.nn as nn from transformers.activations import ACT2FN from transformers.modeling_outputs import BaseModelOutput -from transformers.modeling_utils import (get_parameter_device, - get_parameter_dtype) from transformers.models.clip.configuration_clip import CLIPVisionConfig from transformers.models.clip.modeling_clip import CLIPVisionEmbeddings @@ -17,6 +15,7 @@ from ..model_config import ModelConfig from
..modules.attention import Attention from ..modules.mlp import MLP +from .hf_parameter_utils import get_parameter_device, get_parameter_dtype from .modeling_utils import _load_weights_impl, register_auto_model diff --git a/tensorrt_llm/_torch/models/modeling_exaone4.py b/tensorrt_llm/_torch/models/modeling_exaone4.py index 07951fc28a4..2b49b9bb475 100644 --- a/tensorrt_llm/_torch/models/modeling_exaone4.py +++ b/tensorrt_llm/_torch/models/modeling_exaone4.py @@ -29,7 +29,7 @@ class Exaone4Config(PretrainedConfig): model_type = "exaone4" - AutoConfig.register(Exaone4Config.model_type, Exaone4Config) + AutoConfig.register(Exaone4Config.model_type, Exaone4Config, exist_ok=True) def check_is_sliding(config: Exaone4Config, layer_idx: int) -> bool: diff --git a/tensorrt_llm/_torch/models/modeling_exaone_moe.py b/tensorrt_llm/_torch/models/modeling_exaone_moe.py index fe420178558..621065ccb47 100644 --- a/tensorrt_llm/_torch/models/modeling_exaone_moe.py +++ b/tensorrt_llm/_torch/models/modeling_exaone_moe.py @@ -53,7 +53,7 @@ class ExaoneMoEConfig(PretrainedConfig): "Register ExaoneMoEConfig to mimic the ExaoneMoE model.", key="EXAONE_MOE_REGISTER_WARNING" ) -AutoConfig.register(ExaoneMoEConfig.model_type, ExaoneMoEConfig) +AutoConfig.register(ExaoneMoEConfig.model_type, ExaoneMoEConfig, exist_ok=True) # End of the config register. 
# fmt: on diff --git a/tensorrt_llm/_torch/models/modeling_gpt_oss.py b/tensorrt_llm/_torch/models/modeling_gpt_oss.py index 4d46611a7fd..7edeb3e73be 100644 --- a/tensorrt_llm/_torch/models/modeling_gpt_oss.py +++ b/tensorrt_llm/_torch/models/modeling_gpt_oss.py @@ -7,7 +7,7 @@ from tqdm import tqdm from transformers import GptOssConfig -from tensorrt_llm._utils import get_sm_version +from tensorrt_llm._utils import get_hf_rope_theta, get_sm_version from tensorrt_llm.functional import PositionEmbeddingType, RotaryScalingType from ..attention_backend import AttentionMetadata @@ -55,7 +55,7 @@ def __init__( type=PositionEmbeddingType.yarn, rope=RopeParams( dim=pretrained_config.head_dim, - theta=pretrained_config.rope_theta, + theta=get_hf_rope_theta(pretrained_config, 10000.0), scale_type=RotaryScalingType.yarn, scale=pretrained_config.rope_scaling['factor'], max_positions=pretrained_config.max_position_embeddings, diff --git a/tensorrt_llm/_torch/models/modeling_llama.py b/tensorrt_llm/_torch/models/modeling_llama.py index b13c2e3de91..8dff496563a 100644 --- a/tensorrt_llm/_torch/models/modeling_llama.py +++ b/tensorrt_llm/_torch/models/modeling_llama.py @@ -7,8 +7,9 @@ from torch import nn from transformers import (AutoProcessor, AutoTokenizer, Llama4Config, Llama4VisionModel, LlamaConfig, PretrainedConfig) -from transformers.modeling_utils import load_sharded_checkpoint from transformers.models.llama4.modeling_llama4 import Llama4MultiModalProjector +from transformers.utils import (SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, + WEIGHTS_INDEX_NAME, WEIGHTS_NAME) from tensorrt_llm._torch.distributed import (AllReduce, AllReduceFusionOp, AllReduceParams, MoEAllReduce) @@ -1118,6 +1119,56 @@ def post_load_weights(self): layer.next_attn = self.model.layers[idx + 1].self_attn +def _load_checkpoint_into_module(module: nn.Module, + folder: str, + strict: bool = True) -> None: + """Load a sharded HuggingFace checkpoint into a module. 
+ + This replaces the removed ``transformers.modeling_utils.load_sharded_checkpoint`` + function. It supports both safetensors and PyTorch checkpoint formats. + """ + folder = str(folder) + + # Determine checkpoint format and collect shard files + index_file = os.path.join(folder, SAFE_WEIGHTS_INDEX_NAME) + if os.path.isfile(index_file): + import json + with open(index_file) as f: + shard_files = sorted(set(json.load(f)["weight_map"].values())) + shard_paths = [os.path.join(folder, s) for s in shard_files] + use_safetensors = True + elif os.path.isfile(os.path.join(folder, WEIGHTS_INDEX_NAME)): + import json + with open(os.path.join(folder, WEIGHTS_INDEX_NAME)) as f: + shard_files = sorted(set(json.load(f)["weight_map"].values())) + shard_paths = [os.path.join(folder, s) for s in shard_files] + use_safetensors = False + elif os.path.isfile(os.path.join(folder, SAFE_WEIGHTS_NAME)): + shard_paths = [os.path.join(folder, SAFE_WEIGHTS_NAME)] + use_safetensors = True + elif os.path.isfile(os.path.join(folder, WEIGHTS_NAME)): + shard_paths = [os.path.join(folder, WEIGHTS_NAME)] + use_safetensors = False + else: + raise FileNotFoundError( + f"No checkpoint found in {folder}. Expected " + f"{SAFE_WEIGHTS_INDEX_NAME}, {WEIGHTS_INDEX_NAME}, " + f"{SAFE_WEIGHTS_NAME}, or {WEIGHTS_NAME}.") + + # Load state dict from all shards and merge + full_state_dict: Dict[str, torch.Tensor] = {} + if use_safetensors: + from safetensors.torch import load_file + for path in shard_paths: + full_state_dict.update(load_file(path)) + else: + for path in shard_paths: + full_state_dict.update( + torch.load(path, map_location="cpu", weights_only=True)) + + module.load_state_dict(full_state_dict, strict=strict) + + class Llama4VisionEncoder(nn.Module): def __init__(self, model_config: ModelConfig[Llama4Config], *args, @@ -1148,9 +1199,9 @@ def load_weights(self, weights: Dict): # Otherwise, load the weights from the checkpoint. 
else: - load_sharded_checkpoint(module_dict, - self.pretrained_config._name_or_path, - strict=False) + _load_checkpoint_into_module(module_dict, + self.pretrained_config._name_or_path, + strict=False) self.vision_model = module_dict["vision_model"].to(self.device) self.mm_projector = module_dict["multi_modal_projector"].to(self.device) diff --git a/tensorrt_llm/_torch/models/modeling_mllama.py b/tensorrt_llm/_torch/models/modeling_mllama.py index 16ec672539a..21a5fc447f4 100644 --- a/tensorrt_llm/_torch/models/modeling_mllama.py +++ b/tensorrt_llm/_torch/models/modeling_mllama.py @@ -274,8 +274,10 @@ def __init__( self.hidden_size = pretrained_config.text_config.hidden_size self.max_num_tiles = pretrained_config.vision_config.max_num_tiles self.vision_output_dim = pretrained_config.vision_config.vision_output_dim - self.pad_token_id = (pretrained_config.pad_token_id if - pretrained_config.pad_token_id is not None else -1) + self.pad_token_id = getattr(pretrained_config, 'pad_token_id', None) + if self.pad_token_id is None: + text_pad = getattr(pretrained_config.text_config, 'pad_token_id', None) + self.pad_token_id = -1 if text_pad is None else text_pad self.image_size = pretrained_config.vision_config.image_size # hack config diff --git a/tensorrt_llm/_torch/models/modeling_nemotron_h.py b/tensorrt_llm/_torch/models/modeling_nemotron_h.py index 623195da94a..446a7383a0a 100644 --- a/tensorrt_llm/_torch/models/modeling_nemotron_h.py +++ b/tensorrt_llm/_torch/models/modeling_nemotron_h.py @@ -1073,4 +1073,4 @@ def forward( return hidden_states -AutoConfig.register(NemotronHConfig.model_type, NemotronHConfig) +AutoConfig.register(NemotronHConfig.model_type, NemotronHConfig, exist_ok=True) diff --git a/tensorrt_llm/_torch/models/modeling_phi4mm.py b/tensorrt_llm/_torch/models/modeling_phi4mm.py index 268ef6ce5f5..08c7303bdbb 100644 --- a/tensorrt_llm/_torch/models/modeling_phi4mm.py +++ b/tensorrt_llm/_torch/models/modeling_phi4mm.py @@ -114,6 +114,12 @@ def _load_phi4mm_classes(local_path): spec =
importlib.util.spec_from_file_location( f"{package_name}.hf_modeling_phi4mm", modeling_phi4mm_path) hf_modeling_phi4mm = importlib.util.module_from_spec(spec) + # Inject compatibility shims for classes removed in transformers 5.x. + # The model's custom modeling_phi4mm.py may import SlidingWindowCache + # which was removed in transformers 5.3.0 (merged into StaticCache). + _cache_utils = importlib.import_module("transformers.cache_utils") + if not hasattr(_cache_utils, "SlidingWindowCache"): + _cache_utils.SlidingWindowCache = _cache_utils.StaticCache spec.loader.exec_module(hf_modeling_phi4mm) Phi4MMAudioEmbedding = hf_modeling_phi4mm.Phi4MMAudioEmbedding Phi4MMImageEmbedding = hf_modeling_phi4mm.Phi4MMImageEmbedding diff --git a/tensorrt_llm/_torch/models/modeling_siglip.py b/tensorrt_llm/_torch/models/modeling_siglip.py index e4ed6d462b8..071ee7f03d4 100644 --- a/tensorrt_llm/_torch/models/modeling_siglip.py +++ b/tensorrt_llm/_torch/models/modeling_siglip.py @@ -2,8 +2,6 @@ import torch import torch.nn as nn -from transformers.modeling_utils import (get_parameter_device, - get_parameter_dtype) from transformers.models.siglip.configuration_siglip import SiglipVisionConfig from transformers.models.siglip.modeling_siglip import (SiglipVisionConfig, SiglipVisionEmbeddings) @@ -13,6 +11,7 @@ from ..attention_backend.interface import AttentionMetadata from ..attention_backend.utils import get_attention_backend from ..model_config import ModelConfig +from .hf_parameter_utils import get_parameter_device, get_parameter_dtype from .modeling_clip import CLIPEncoder from .modeling_utils import _load_weights_impl, register_auto_model diff --git a/tensorrt_llm/_torch/models/modeling_vila.py b/tensorrt_llm/_torch/models/modeling_vila.py index 8b634229237..1e3bcab02b5 100644 --- a/tensorrt_llm/_torch/models/modeling_vila.py +++ b/tensorrt_llm/_torch/models/modeling_vila.py @@ -1252,5 +1252,5 @@ def post_config(self): self.model_config.pretrained_config = self.llm.config 
-AutoConfig.register(VilaConfig.model_type, VilaConfig) +AutoConfig.register(VilaConfig.model_type, VilaConfig, exist_ok=True) AutoModel.register(VilaConfig, VilaModel) diff --git a/tensorrt_llm/_torch/visual_gen/models/wan/transformer_wan.py b/tensorrt_llm/_torch/visual_gen/models/wan/transformer_wan.py index 49e56c4d23d..67d805416cf 100644 --- a/tensorrt_llm/_torch/visual_gen/models/wan/transformer_wan.py +++ b/tensorrt_llm/_torch/visual_gen/models/wan/transformer_wan.py @@ -6,8 +6,8 @@ import torch.nn.functional as F from diffusers.models.embeddings import PixArtAlphaTextProjection, TimestepEmbedding, Timesteps from tqdm import tqdm -from transformers.modeling_utils import get_parameter_device +from tensorrt_llm._torch.models.hf_parameter_utils import get_parameter_device from tensorrt_llm._torch.modules.layer_norm import LayerNorm from tensorrt_llm._torch.modules.linear import Linear from tensorrt_llm._torch.modules.mlp import MLP diff --git a/tensorrt_llm/_utils.py b/tensorrt_llm/_utils.py index 47a6a88499e..c53e7a08504 100644 --- a/tensorrt_llm/_utils.py +++ b/tensorrt_llm/_utils.py @@ -70,6 +70,23 @@ np_float8 = np.dtype('V1', metadata={"dtype": "float8"}) +def get_hf_rope_theta(config: Any, default: float = 10000.0) -> float: + """Return RoPE ``theta`` from a Hugging Face ``PreTrainedConfig``-like object. + + Transformers v5+ nests ``rope_theta`` under ``rope_parameters`` for several + models (e.g. LLaMA); older releases expose ``config.rope_theta`` directly. + """ + theta = getattr(config, "rope_theta", None) + if theta is not None: + return float(theta) + rope_params = getattr(config, "rope_parameters", None) + if isinstance(rope_params, dict): + theta = rope_params.get("rope_theta") + if theta is not None: + return float(theta) + return default + + def torch_to_numpy(x: torch.Tensor): assert isinstance(x, torch.Tensor), \ f'x must be a torch.Tensor object, but got {type(x)}.' 
diff --git a/tensorrt_llm/functional.py b/tensorrt_llm/functional.py index 5dd99755dc6..df80459359a 100755 --- a/tensorrt_llm/functional.py +++ b/tensorrt_llm/functional.py @@ -680,8 +680,16 @@ class RotaryScalingType(IntEnum): @staticmethod def from_string(s): + if isinstance(s, RotaryScalingType): + return s + if s is None: + return RotaryScalingType.none + key = str(s).lower() + # Hugging Face Transformers v5+ uses type "default" for unscaled / standard RoPE. + if key == "default": + return RotaryScalingType.none try: - return RotaryScalingType[s] + return RotaryScalingType[key] except KeyError: raise ValueError(f'Unsupported rotary scaling type: {s}') @@ -722,6 +730,9 @@ def __str__(self): @staticmethod def from_string(s): + # Transformers 5.x uses "default" for standard RoPE (no scaling). + if s == "default": + return PositionEmbeddingType.rope_gpt_neox try: return PositionEmbeddingType[s] except KeyError: diff --git a/tensorrt_llm/models/commandr/config.py b/tensorrt_llm/models/commandr/config.py index a2edca61fb7..511640c2249 100644 --- a/tensorrt_llm/models/commandr/config.py +++ b/tensorrt_llm/models/commandr/config.py @@ -16,6 +16,7 @@ import transformers +from ..._utils import get_hf_rope_theta from ...mapping import Mapping from ..convert_utils import infer_dtype from ..modeling_utils import PretrainedConfig, QuantConfig @@ -79,7 +80,7 @@ def from_hugging_face( hidden_act=hf_config.hidden_act, norm_epsilon=hf_config.layer_norm_eps, output_multiplier_scale=hf_config.logit_scale, - rotary_base=hf_config.rope_theta, + rotary_base=get_hf_rope_theta(hf_config, 10000.0), attn_bias=hf_config.attention_bias, qk_layernorm=hf_config.use_qk_norm, mapping=mapping, diff --git a/tensorrt_llm/models/deepseek_v1/config.py b/tensorrt_llm/models/deepseek_v1/config.py index b47fa91a43d..e7bff0d9aab 100755 --- a/tensorrt_llm/models/deepseek_v1/config.py +++ b/tensorrt_llm/models/deepseek_v1/config.py @@ -15,6 +15,7 @@ from typing import Optional, Union +from ..._utils 
import get_hf_rope_theta from ...layers import MoeConfig from ...mapping import Mapping from ..convert_utils import infer_dtype @@ -70,7 +71,7 @@ def from_hugging_face( num_key_value_heads = getattr(hf_config, "num_key_value_heads", hf_config.num_attention_heads) rotary_scaling = getattr(hf_config, "rope_scaling", None) - rotary_base = getattr(hf_config, "rope_theta", 10000.0) + rotary_base = get_hf_rope_theta(hf_config, 10000.0) dtype = infer_dtype(dtype, getattr(hf_config, 'torch_dtype', None)) moe_config = MoeConfig( num_experts=getattr(hf_config, 'n_routed_experts', 0), diff --git a/tensorrt_llm/models/deepseek_v2/config.py b/tensorrt_llm/models/deepseek_v2/config.py index edaf21f128c..c110df0d53f 100644 --- a/tensorrt_llm/models/deepseek_v2/config.py +++ b/tensorrt_llm/models/deepseek_v2/config.py @@ -17,6 +17,7 @@ from transformers import AutoConfig +from ..._utils import get_hf_rope_theta from ...layers import MoeConfig from ...mapping import Mapping from ..modeling_utils import PretrainedConfig, QuantConfig @@ -129,7 +130,7 @@ def from_hugging_face( max_position_embeddings=hf_config.max_position_embeddings, hidden_act='swiglu', norm_epsilon=hf_config.rms_norm_eps, - rotary_base=hf_config.rope_theta, + rotary_base=get_hf_rope_theta(hf_config, 10000.0), rotary_scaling=rotary_scaling, moe_inter_size=hf_config.moe_intermediate_size, moe=moe_config, diff --git a/tensorrt_llm/models/deepseek_v2/convert.py b/tensorrt_llm/models/deepseek_v2/convert.py index 697040d3b75..5a23130fc52 100755 --- a/tensorrt_llm/models/deepseek_v2/convert.py +++ b/tensorrt_llm/models/deepseek_v2/convert.py @@ -20,7 +20,8 @@ from tensorrt_llm.layers import MoeConfig -from ..._utils import pad_vocab_size, release_gc, str_dtype_to_torch +from ..._utils import (get_hf_rope_theta, pad_vocab_size, release_gc, + str_dtype_to_torch) from ...logger import logger from ...mapping import Mapping from ..convert_utils import get_tllm_linear_weight @@ -52,7 +53,7 @@ def 
create_trt_config_from_hf(model_dir, vocab_size = hf_config.vocab_size n_positions = hf_config.max_position_embeddings hidden_act = 'swiglu' # TRT-LLM request make gated activation explicit for MOE implementation - rotary_base = hf_config.rope_theta + rotary_base = get_hf_rope_theta(hf_config, 10000.0) rms_norm_eps = hf_config.rms_norm_eps rotary_scaling_beta_fast = hf_config.rope_scaling['beta_fast'] rotary_scaling_beta_slow = hf_config.rope_scaling['beta_slow'] diff --git a/tensorrt_llm/models/eagle/config.py b/tensorrt_llm/models/eagle/config.py index f81e43bb03f..e7a559f3469 100644 --- a/tensorrt_llm/models/eagle/config.py +++ b/tensorrt_llm/models/eagle/config.py @@ -18,6 +18,7 @@ from transformers import LlamaConfig +from ..._utils import get_hf_rope_theta from ...mapping import Mapping from ..convert_utils import infer_dtype from ..llama.config import LLaMAConfig @@ -84,7 +85,7 @@ def from_hugging_face( rms_norm_eps = hf_config.rms_norm_eps vocab_size = hf_config.vocab_size rotary_scaling = hf_config.rope_scaling - rotary_base = hf_config.rope_theta + rotary_base = get_hf_rope_theta(hf_config, 10000.0) n_positions = hf_config.max_position_embeddings hidden_act = hf_config.hidden_act dtype = str(hf_config.torch_dtype)[6:] if dtype == 'auto' else dtype diff --git a/tensorrt_llm/models/falcon/config.py b/tensorrt_llm/models/falcon/config.py index c96bd517cc4..1ff2ff0391c 100644 --- a/tensorrt_llm/models/falcon/config.py +++ b/tensorrt_llm/models/falcon/config.py @@ -14,6 +14,7 @@ # limitations under the License. 
from typing import Optional, Union +from ..._utils import get_hf_rope_theta from ...mapping import Mapping from ..convert_utils import infer_dtype from ..modeling_utils import PretrainedConfig, QuantConfig @@ -109,7 +110,7 @@ def from_hugging_face( max_position_embeddings=getattr(hf_config, 'max_position_embeddings', 2048), - rotary_base=getattr(hf_config, 'rope_theta', 10000.0), + rotary_base=get_hf_rope_theta(hf_config, 10000.0), intermediate_size=getattr(hf_config, 'ffn_hidden_size', None), mapping=mapping, diff --git a/tensorrt_llm/models/gemma/config.py b/tensorrt_llm/models/gemma/config.py index 8e176c4ed7e..3b0d8d6218c 100644 --- a/tensorrt_llm/models/gemma/config.py +++ b/tensorrt_llm/models/gemma/config.py @@ -16,6 +16,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Optional, Union +from tensorrt_llm._utils import get_hf_rope_theta from tensorrt_llm.functional import PositionEmbeddingType from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping @@ -186,7 +187,7 @@ def from_hugging_face( norm_epsilon=hf_config.rms_norm_eps, num_key_value_heads=getattr(hf_config, "num_key_value_heads", hf_config.num_attention_heads), - rotary_base=getattr(hf_config, "rope_theta", 10000.0), + rotary_base=get_hf_rope_theta(hf_config, 10000.0), rotary_scaling=getattr(hf_config, "rotary_scaling", None), quantization=quant_config, mapping=mapping, diff --git a/tensorrt_llm/models/gpt/config.py b/tensorrt_llm/models/gpt/config.py index e89dddd5efe..ba09d1f8694 100644 --- a/tensorrt_llm/models/gpt/config.py +++ b/tensorrt_llm/models/gpt/config.py @@ -17,6 +17,7 @@ import torch +from ..._utils import get_hf_rope_theta from ...layers import MoeConfig from ...logger import logger from ...mapping import Mapping @@ -134,7 +135,7 @@ def from_hugging_face( hf_config.layer_norm_epsilon = hf_config.norm_epsilon if gpt_variant == 'starcoder2' else hf_config.layer_norm_eps hf_config.bias = hf_config.use_bias if gpt_variant == 'starcoder2' else 
gpt_variant != 'nemotron' hf_config.position_embedding_type = 'rope_gpt_neox' - hf_config.rotary_base = hf_config.rope_theta + hf_config.rotary_base = get_hf_rope_theta(hf_config, 10000.0) hf_config.rotary_pct = getattr( hf_config, 'partial_rotary_factor', getattr(hf_config, 'rope_percent', 1.0)) diff --git a/tensorrt_llm/models/gpt/convert.py b/tensorrt_llm/models/gpt/convert.py index 1e2bc4b999d..aa7d9a89e0d 100644 --- a/tensorrt_llm/models/gpt/convert.py +++ b/tensorrt_llm/models/gpt/convert.py @@ -29,8 +29,14 @@ import torch.nn as nn import yaml from tqdm import tqdm -from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq, - AutoTokenizer) + +try: + from transformers import AutoModelForVision2Seq +except ImportError: + # Transformers v5+: vision-to-seq auto models use AutoModelForImageTextToText + from transformers import AutoModelForImageTextToText as AutoModelForVision2Seq + +from transformers import AutoModelForCausalLM, AutoTokenizer from transformers.models.gpt2.modeling_gpt2 import GPT2Block from transformers.pytorch_utils import Conv1D diff --git a/tensorrt_llm/models/llama/config.py b/tensorrt_llm/models/llama/config.py index 7e0369a4ba0..54038e32c4f 100644 --- a/tensorrt_llm/models/llama/config.py +++ b/tensorrt_llm/models/llama/config.py @@ -18,6 +18,7 @@ from pathlib import Path from typing import Optional, Union +from ..._utils import get_hf_rope_theta from ...layers import MoeConfig from ...mapping import Mapping from ..convert_utils import infer_dtype @@ -112,7 +113,8 @@ def from_hugging_face( from llava.model import LlavaLlamaConfig # noqa from llava.model import LlavaLlamaModel transformers.AutoConfig.register("llava_llama", - LlavaLlamaConfig) + LlavaLlamaConfig, + exist_ok=True) transformers.AutoModelForCausalLM.register( LlavaLlamaConfig, LlavaLlamaModel) @@ -160,7 +162,7 @@ def from_hugging_face( attn_bias = getattr(hf_config, 'bias', False) or getattr( hf_config, 'attention_bias', False) rotary_scaling = getattr(hf_config, 
"rope_scaling", None) - rotary_base = getattr(hf_config, "rope_theta", 10000.0) + rotary_base = get_hf_rope_theta(hf_config, 10000.0) residual_mlp = getattr(hf_config, "parallel_attn_mlp_res", False) disable_weight_only_quant_plugin = kwargs.pop( 'disable_weight_only_quant_plugin', False) diff --git a/tensorrt_llm/models/mllama/config.py b/tensorrt_llm/models/mllama/config.py index 5fb24f6fac7..cbd7f1b8f38 100644 --- a/tensorrt_llm/models/mllama/config.py +++ b/tensorrt_llm/models/mllama/config.py @@ -16,6 +16,7 @@ from pathlib import Path from typing import List, Optional, Union +from ..._utils import get_hf_rope_theta from ...functional import LayerNormPositionType, LayerNormType, MLPType from ...mapping import Mapping from ..convert_utils import infer_dtype @@ -166,7 +167,7 @@ def from_hugging_face( attn_bias = getattr(hf_text_config, 'bias', False) or getattr( hf_text_config, 'attention_bias', False) rotary_scaling = getattr(hf_text_config, "rope_scaling", None) - rotary_base = getattr(hf_text_config, "rope_theta", 10000.0) + rotary_base = get_hf_rope_theta(hf_text_config, 10000.0) residual_mlp = getattr(hf_text_config, "parallel_attn_mlp_res", False) disable_weight_only_quant_plugin = kwargs.pop( 'disable_weight_only_quant_plugin', False) diff --git a/tensorrt_llm/models/nemotron_nas/config.py b/tensorrt_llm/models/nemotron_nas/config.py index 139b052c7bc..11d02df84b0 100644 --- a/tensorrt_llm/models/nemotron_nas/config.py +++ b/tensorrt_llm/models/nemotron_nas/config.py @@ -15,6 +15,7 @@ from dataclasses import asdict from typing import Any, Dict, List, Optional, Union +from tensorrt_llm._utils import get_hf_rope_theta from tensorrt_llm.functional import PositionEmbeddingType from tensorrt_llm.mapping import Mapping from tensorrt_llm.models.convert_utils import infer_dtype @@ -198,7 +199,7 @@ def from_hugging_face( num_key_value_heads=hf_config.num_key_value_heads, norm_epsilon=hf_config.rms_norm_eps, rotary_scaling=hf_config.rope_scaling, - 
rotary_base=hf_config.rope_theta, + rotary_base=get_hf_rope_theta(hf_config, 10000.0), vocab_size=hf_config.vocab_size, max_position_embeddings=hf_config.max_position_embeddings, mapping=mapping, diff --git a/tensorrt_llm/models/phi/config.py b/tensorrt_llm/models/phi/config.py index 3d38db0fa7b..583de15fadf 100644 --- a/tensorrt_llm/models/phi/config.py +++ b/tensorrt_llm/models/phi/config.py @@ -15,6 +15,7 @@ from typing import Optional, Union +from ..._utils import get_hf_rope_theta from ...mapping import Mapping from ..convert_utils import infer_dtype from ..modeling_utils import PretrainedConfig, QuantConfig @@ -64,7 +65,7 @@ def from_hugging_face( num_key_value_heads = getattr(hf_config, "num_key_value_heads", hf_config.num_attention_heads) rotary_scaling = getattr(hf_config, "rope_scaling", None) - rotary_base = getattr(hf_config, "rope_theta", 10000.0) + rotary_base = get_hf_rope_theta(hf_config, 10000.0) dtype = infer_dtype(dtype, getattr(hf_config, 'torch_dtype', None)) return cls(architecture=hf_config.architectures[0], diff --git a/tensorrt_llm/models/phi/convert.py b/tensorrt_llm/models/phi/convert.py index 0d1ec78bfd7..4bf3406c726 100644 --- a/tensorrt_llm/models/phi/convert.py +++ b/tensorrt_llm/models/phi/convert.py @@ -1,6 +1,6 @@ import torch -from ..._utils import pad_vocab_size, str_dtype_to_torch +from ..._utils import get_hf_rope_theta, pad_vocab_size, str_dtype_to_torch def split(v, tp_size, idx, dim=0): @@ -129,7 +129,7 @@ def convert_hf_config(hf_config, dtype, args): 'num_hidden_layers': hf_config.num_hidden_layers, 'num_attention_heads': hf_config.num_key_value_heads, 'rotary_pct': hf_config.partial_rotary_factor, - 'rope_theta': hf_config.rope_theta, + 'rope_theta': get_hf_rope_theta(hf_config, 10000.0), 'hidden_size': hf_config.hidden_size, 'intermediate_size': hf_config.intermediate_size, 'vocab_size': hf_config.vocab_size, diff --git a/tensorrt_llm/models/phi3/config.py b/tensorrt_llm/models/phi3/config.py index 
c824e921720..42d3954092e 100644 --- a/tensorrt_llm/models/phi3/config.py +++ b/tensorrt_llm/models/phi3/config.py @@ -15,6 +15,7 @@ from typing import Optional, Union +from ..._utils import get_hf_rope_theta from ...layers import MoeConfig from ...mapping import Mapping from ..convert_utils import infer_dtype @@ -96,7 +97,7 @@ def from_hugging_face( hf_config, "dense_attention_every_n_layers", None) kwargs['norm_epsilon'] = hf_config.layer_norm_epsilon else: - kwargs['rotary_base'] = hf_config.rope_theta + kwargs['rotary_base'] = get_hf_rope_theta(hf_config, 10000.0) kwargs['norm_epsilon'] = hf_config.rms_norm_eps moe_variant = hf_config.architectures[0] == "PhiMoEForCausalLM" if moe_variant: diff --git a/tensorrt_llm/models/qwen/config.py b/tensorrt_llm/models/qwen/config.py index e2c22909538..0f1bd34606b 100644 --- a/tensorrt_llm/models/qwen/config.py +++ b/tensorrt_llm/models/qwen/config.py @@ -14,6 +14,7 @@ # limitations under the License. from typing import Optional, Union +from ..._utils import get_hf_rope_theta from ...layers import MoeConfig from ...mapping import Mapping from ..convert_utils import infer_dtype @@ -138,7 +139,7 @@ def from_hugging_face(cls, rotary_base = getattr(hf_config, "rotary_emb_base", 10000.0) else: rms_norm_eps = hf_config.rms_norm_eps - rotary_base = getattr(hf_config, "rope_theta", 100000.0) + rotary_base = get_hf_rope_theta(hf_config, 100000.0) num_labels = 1 if hf_config.architectures[0] == "Qwen2ForSequenceClassification": diff --git a/tensorrt_llm/quantization/quantize_by_modelopt.py b/tensorrt_llm/quantization/quantize_by_modelopt.py index 302eb74533f..8c1aa57efa6 100755 --- a/tensorrt_llm/quantization/quantize_by_modelopt.py +++ b/tensorrt_llm/quantization/quantize_by_modelopt.py @@ -34,7 +34,7 @@ from transformers import (AutoConfig, AutoModelForCausalLM, AutoProcessor, AutoTokenizer) -from .._utils import release_gc, str_dtype_to_torch +from .._utils import get_hf_rope_theta, release_gc, str_dtype_to_torch from ..logger 
import logger from ..mapping import Mapping from .image_processing import MllamaImageProcessor @@ -888,7 +888,8 @@ def quantize_and_export(*, if qwen_config.model_type == "qwen2": tensorrt_llm_config[ "norm_epsilon"] = qwen_config.rms_norm_eps - tensorrt_llm_config["rotary_base"] = qwen_config.rope_theta + tensorrt_llm_config["rotary_base"] = get_hf_rope_theta( + qwen_config, 100000.0) tensorrt_llm_config[ "intermediate_size"] = qwen_config.intermediate_size with open(f"{export_path}/config.json", "w") as f: diff --git a/tensorrt_llm/runtime/multimodal_model_runner.py b/tensorrt_llm/runtime/multimodal_model_runner.py index 5a88cc9dd80..47d9bd66ad9 100644 --- a/tensorrt_llm/runtime/multimodal_model_runner.py +++ b/tensorrt_llm/runtime/multimodal_model_runner.py @@ -28,8 +28,8 @@ from .. import profiler from .._deprecation import emit_engine_arch_deprecation -from .._utils import (maybe_pin_memory, mpi_rank, prefer_pinned, - str_dtype_to_torch, str_dtype_to_trt, +from .._utils import (get_hf_rope_theta, maybe_pin_memory, mpi_rank, + prefer_pinned, str_dtype_to_torch, str_dtype_to_trt, supports_inflight_batching, torch_dtype_to_trt, trt_dtype_to_torch) from ..functional import RopeEmbeddingUtils, RotaryScalingType @@ -415,7 +415,7 @@ def __init__(self, args): self.max_position_embeddings = hf_config.max_position_embeddings self.hidden_size = hf_config.hidden_size self.num_attention_heads = hf_config.num_attention_heads - self.rope_theta = hf_config.rope_theta + self.rope_theta = get_hf_rope_theta(hf_config, 10000.0) if self.model_type == 'llava_onevision': self.num_frames = self.args.video_num_frames if self.num_frames is None: diff --git a/tensorrt_llm/tools/multimodal_builder.py b/tensorrt_llm/tools/multimodal_builder.py index bf948eb2506..c2006f5f2c0 100644 --- a/tensorrt_llm/tools/multimodal_builder.py +++ b/tensorrt_llm/tools/multimodal_builder.py @@ -14,10 +14,15 @@ from tensorrt_llm._utils import torch_dtype_to_str, to_json_file from tensorrt_llm.builder import 
Builder from tensorrt_llm.logger import logger +try: + from transformers import AutoModelForVision2Seq +except ImportError: + # Transformers v5+: vision-to-seq auto models use AutoModelForImageTextToText + from transformers import AutoModelForImageTextToText as AutoModelForVision2Seq + from transformers import (AutoConfig, AutoModel, AutoModelForCausalLM, - AutoModelForVision2Seq, AutoProcessor, - Blip2ForConditionalGeneration, Blip2Processor, - FuyuForCausalLM, FuyuProcessor, + AutoProcessor, Blip2ForConditionalGeneration, + Blip2Processor, FuyuForCausalLM, FuyuProcessor, LlavaForConditionalGeneration, NougatProcessor, Pix2StructForConditionalGeneration, VisionEncoderDecoderModel, CLIPVisionModel) diff --git a/tests/unittest/_torch/helpers.py b/tests/unittest/_torch/helpers.py index 743f3998f26..f7dde64c669 100644 --- a/tests/unittest/_torch/helpers.py +++ b/tests/unittest/_torch/helpers.py @@ -1,4 +1,4 @@ -from typing import Dict, Tuple +from typing import Dict, Optional, Tuple import torch import torch.nn.functional as F @@ -256,3 +256,34 @@ def create_mock_cuda_graph_runner(batch_size: int, use_mrope: bool = False): dist=None, kv_cache_manager_key=ResourceManagerType.KV_CACHE_MANAGER) return CUDAGraphRunner(config) + + +def make_hf_hybrid_cache_for_tests( + config, + *, + max_cache_len: int, + max_batch_size: Optional[int] = None, + device=None, + dtype=None, +): + """Build Hugging Face ``past_key_values`` for hybrid / sliding-window models in tests. + + Transformers v4 exposes ``HybridCache``; v5 removes it in favor of ``StaticCache`` + for fixed-length pre-allocated KV (see HF cache refactor). 
+ """ + try: + from transformers.cache_utils import HybridCache + except ImportError: + from transformers.cache_utils import StaticCache + + return StaticCache(config=config, max_cache_len=max_cache_len) + + kwargs = { + "config": config, + "max_cache_len": max_cache_len, + "device": device, + "dtype": dtype, + } + if max_batch_size is not None: + kwargs["max_batch_size"] = max_batch_size + return HybridCache(**kwargs) diff --git a/tests/unittest/_torch/modeling/test_modeling_cohere2.py b/tests/unittest/_torch/modeling/test_modeling_cohere2.py index 20c2e88fe69..1a839cacd6c 100644 --- a/tests/unittest/_torch/modeling/test_modeling_cohere2.py +++ b/tests/unittest/_torch/modeling/test_modeling_cohere2.py @@ -1,9 +1,9 @@ from copy import deepcopy import torch +from _torch.helpers import make_hf_hybrid_cache_for_tests from transformers import Cohere2Config from transformers import Cohere2ForCausalLM as HFCohere2ForCausalLM -from transformers.cache_utils import HybridCache import tensorrt_llm from tensorrt_llm._torch.attention_backend.utils import get_attention_backend @@ -161,8 +161,8 @@ def test_cohere2_allclose_to_hf(self) -> None: # Initialize the hugging face model hf_cohere2 = HFCohere2ForCausalLM(cohere2_config).to(dtype).to(device).eval() - hf_cache = HybridCache( - config=cohere2_config, + hf_cache = make_hf_hybrid_cache_for_tests( + cohere2_config, max_batch_size=batch_size, max_cache_len=10, device=device, diff --git a/tests/unittest/_torch/modeling/test_modeling_exaone4.py b/tests/unittest/_torch/modeling/test_modeling_exaone4.py index 931828be848..7ea88c93c57 100644 --- a/tests/unittest/_torch/modeling/test_modeling_exaone4.py +++ b/tests/unittest/_torch/modeling/test_modeling_exaone4.py @@ -25,8 +25,8 @@ class Exaone4Config(PretrainedConfig): # TODO: Remove this once we have a proper config for Exaone4 SKIP_EXAONE4_HF_ACCURACY_TEST = True -from _torch.helpers import create_mock_cuda_graph_runner -from transformers.cache_utils import HybridCache +from 
_torch.helpers import (create_mock_cuda_graph_runner, + make_hf_hybrid_cache_for_tests) from utils.util import getSMVersion import tensorrt_llm @@ -248,11 +248,13 @@ def test_exaone4_allclose_to_hf(self, scenario: Scenario) -> None: num_kv_heads = exaone4.config.num_key_value_heads max_seq_len = num_blocks * tokens_per_block batch_size = 1 - hf_cache = HybridCache(config=exaone4_config, - max_batch_size=batch_size, - max_cache_len=max_seq_len, - device=device, - dtype=dtype) + hf_cache = make_hf_hybrid_cache_for_tests( + exaone4_config, + max_batch_size=batch_size, + max_cache_len=max_seq_len, + device=device, + dtype=dtype, + ) if dtype == torch.half: kv_cache_dtype = tensorrt_llm.bindings.DataType.HALF elif dtype == torch.bfloat16: diff --git a/tests/unittest/_torch/modeling/test_modeling_gemma3.py b/tests/unittest/_torch/modeling/test_modeling_gemma3.py index 6b532b9b1c6..f252f4cbd2d 100644 --- a/tests/unittest/_torch/modeling/test_modeling_gemma3.py +++ b/tests/unittest/_torch/modeling/test_modeling_gemma3.py @@ -3,11 +3,11 @@ from dataclasses import dataclass import torch +from _torch.helpers import make_hf_hybrid_cache_for_tests from parameterized import parameterized from transformers import Gemma3Config from transformers import Gemma3ForCausalLM as HFGemma3ForCausalLM from transformers import Gemma3TextConfig -from transformers.cache_utils import HybridCache import tensorrt_llm from tensorrt_llm._torch.attention_backend import (AttentionMetadata, @@ -285,11 +285,13 @@ def test_gemma3_allclose_to_hf(self, scenario: Scenario) -> None: hf_gemma3 = HFGemma3ForCausalLM(gemma3_config).to(dtype).to( device).eval() - hf_cache = HybridCache(config=gemma3_config, - max_batch_size=batch_size, - max_cache_len=10, - device=device, - dtype=dtype) + hf_cache = make_hf_hybrid_cache_for_tests( + gemma3_config, + max_batch_size=batch_size, + max_cache_len=10, + device=device, + dtype=dtype, + ) model_config = ModelConfig(pretrained_config=gemma3_config, attn_backend=backend) 
diff --git a/tests/unittest/trt/attention/hf_dynamic_cache_compat.py b/tests/unittest/trt/attention/hf_dynamic_cache_compat.py new file mode 100644 index 00000000000..6866feb8ba6 --- /dev/null +++ b/tests/unittest/trt/attention/hf_dynamic_cache_compat.py @@ -0,0 +1,56 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""DynamicCache legacy tuple format for tests (removed from Transformers v5+).""" +
+from __future__ import annotations +
+from typing import List, Optional, Sequence, Tuple, Union +
+import torch +from transformers.cache_utils import DynamicCache +
+LegacyLayerKV = Tuple[torch.Tensor, torch.Tensor] +LegacyCache = Tuple[LegacyLayerKV, ...] 
+ + +def dynamic_cache_from_legacy( + past_key_values: Optional[Union[LegacyCache, Sequence[LegacyLayerKV]]], +) -> DynamicCache: + """Match pre-v5 ``DynamicCache.from_legacy_cache`` (see transformers v4.48 ``cache_utils``).""" + if past_key_values is None: + return DynamicCache() + if hasattr(DynamicCache, "from_legacy_cache"): + return DynamicCache.from_legacy_cache(past_key_values) + cache = DynamicCache() + for layer_idx in range(len(past_key_values)): + key_states, value_states = past_key_values[layer_idx] + cache.update(key_states, value_states, layer_idx) + return cache + + +def dynamic_cache_to_legacy(cache: DynamicCache) -> LegacyCache: + """Match pre-v5 ``DynamicCache.to_legacy_cache``.""" + if hasattr(cache, "to_legacy_cache"): + return cache.to_legacy_cache() + layers: List[LegacyLayerKV] = [] + for layer in cache.layers: + if not getattr(layer, "is_initialized", False): + continue + keys = layer.keys + values = layer.values + if keys is None or values is None: + continue + layers.append((keys, values)) + return tuple(layers) diff --git a/tests/unittest/trt/attention/test_gpt_attention.py b/tests/unittest/trt/attention/test_gpt_attention.py index 349bf6b752d..328ce022118 100644 --- a/tests/unittest/trt/attention/test_gpt_attention.py +++ b/tests/unittest/trt/attention/test_gpt_attention.py @@ -48,6 +48,8 @@ MemoryPoolsAllocator from tensorrt_llm.runtime.memory_pools.pools_kv_cache_manager import \ PoolsKVCacheManager +from tests.unittest.trt.attention.hf_dynamic_cache_compat import ( + dynamic_cache_from_legacy, dynamic_cache_to_legacy) class TestFunctional(unittest.TestCase): @@ -630,13 +632,19 @@ def _construct_execution( rope_scale_type = RotaryScalingType.none rope_scale = 1.0 if attention_type == "llama_attention": - rope_base = configuration.rope_theta - if configuration.rope_scaling is not None: + rope_params = getattr(configuration, 'rope_parameters', + None) or {} + rope_base = rope_params.get( + 'rope_theta', + getattr(configuration, 
'rope_theta', 10000.0)) + rope_type = rope_params.get( + 'rope_type', rope_params.get('type', 'default')) + if rope_type not in ('default', None): rope_scale_type = { "linear": RotaryScalingType.linear, "dynamic": RotaryScalingType.dynamic - }[configuration.rope_scaling["type"]] - rope_scale = configuration.rope_scaling["factor"] + }[rope_type] + rope_scale = rope_params.get("factor", 1.0) rotary_inv_freq, embed_positions_for_gpt_attention = RopeEmbeddingUtils.create_sinusoidal_positions_for_attention_plugin( configuration.max_position_embeddings, rotary_embedding_dim, rope_base, rope_scale) @@ -891,8 +899,17 @@ def _construct_execution( if attention_type == 'llama_attention': configuration.num_key_value_heads = num_kv_heads - configuration.rope_theta = rope_base - configuration.rope_scaling = rope_scaling + # In transformers 5.x, rope_theta/rope_scaling are unified into + # rope_parameters. Build the dict so LlamaRotaryEmbedding works. + if rope_scaling is not None: + rope_params = {**rope_scaling, "rope_theta": rope_base} + else: + rope_params = { + "rope_type": "default", + "rope_theta": rope_base, + } + configuration.rope_parameters = rope_params + configuration.rope_scaling = rope_params if rope_scaling is not None: # scaling is typically used for supporting longer seq lens than max_position_embeddings # so we set the max_position_embeddings to be smaller than total seq len @@ -1236,13 +1253,12 @@ def verify_kv_cache(torch_present): attention_packed_mask = None if attention_type == 'gpt2_attention': # gpt2 uses DynamicCache - torch_present = DynamicCache.from_legacy_cache( - torch_present) + torch_present = dynamic_cache_from_legacy(torch_present) torch_output = attention(input_tensor, past_key_value=torch_present, use_cache=True, attention_mask=attention_mask)[0] - torch_present = torch_present.to_legacy_cache() + torch_present = dynamic_cache_to_legacy(torch_present) elif attention_type == 'llama_attention': position_embeddings = rotary_emb(input_tensor, 
position_ids) attention_mask = attention_mask + AttentionMaskConverter._make_causal_mask( @@ -1257,7 +1273,7 @@ def verify_kv_cache(torch_present): position_embeddings=position_embeddings, attention_mask=attention_mask, use_cache=True)[0] - torch_present = torch_present.to_legacy_cache() + torch_present = dynamic_cache_to_legacy(torch_present) elif attention_type == 'gptj_attention': torch_present = DynamicCache() torch_output = attention(input_tensor, @@ -1265,7 +1281,7 @@ def verify_kv_cache(torch_present): position_ids=position_ids, attention_mask=attention_mask, use_cache=True)[0] - torch_present = torch_present.to_legacy_cache() + torch_present = dynamic_cache_to_legacy(torch_present) elif attention_type == 'gpt_bigcode_attention': attention_mask = _prepare_4d_attention_mask( ctx_attention_mask, @@ -1280,7 +1296,7 @@ def verify_kv_cache(torch_present): layer_past=torch_present, attention_mask=attention_mask, use_cache=True)[0] - torch_present = torch_present.to_legacy_cache() + torch_present = dynamic_cache_to_legacy(torch_present) else: raise RuntimeError("attention_type not properly set") @@ -1377,13 +1393,12 @@ def verify_kv_cache(torch_present): # torch execution if attention_type == 'gpt2_attention': # gpt2 uses DynamicCache - torch_present = DynamicCache.from_legacy_cache( - torch_present) + torch_present = dynamic_cache_from_legacy(torch_present) torch_output = attention(input_tensor, past_key_value=torch_present, use_cache=True, attention_mask=attention_mask)[0] - torch_present = torch_present.to_legacy_cache() + torch_present = dynamic_cache_to_legacy(torch_present) elif attention_type == 'llama_attention': position_embeddings = rotary_emb(input_tensor, position_ids) attention_mask = attention_mask + AttentionMaskConverter._make_causal_mask( @@ -1392,37 +1407,34 @@ def verify_kv_cache(torch_present): device='cuda', past_key_values_length=in_len + step - 1) # llama uses DynamicCache - torch_present = DynamicCache.from_legacy_cache( - torch_present) + 
torch_present = dynamic_cache_from_legacy(torch_present) torch_output = attention( input_tensor, past_key_value=torch_present, position_embeddings=position_embeddings, attention_mask=attention_mask, use_cache=True)[0] - torch_present = torch_present.to_legacy_cache() + torch_present = dynamic_cache_to_legacy(torch_present) elif attention_type == 'gptj_attention': - torch_present = DynamicCache.from_legacy_cache( - torch_present) + torch_present = dynamic_cache_from_legacy(torch_present) torch_output = attention(input_tensor, layer_past=torch_present, position_ids=position_ids, attention_mask=attention_mask, use_cache=True)[0] - torch_present = torch_present.to_legacy_cache() + torch_present = dynamic_cache_to_legacy(torch_present) elif attention_type == 'gpt_bigcode_attention': # target shape = (b, h, 1, s_key) key_seqlen = in_len + step # ctx_attention_mask.shape[1] attention_mask = (attention_mask >= 0).expand(batch_size, num_heads, 1, key_seqlen) - torch_present = DynamicCache.from_legacy_cache( - torch_present) + torch_present = dynamic_cache_from_legacy(torch_present) torch_output = attention(input_tensor, layer_past=torch_present, use_cache=True, attention_mask=attention_mask)[0] - torch_present = torch_present.to_legacy_cache() + torch_present = dynamic_cache_to_legacy(torch_present) def tile_beam_width(tensor: torch.Tensor, num_beams: int): if num_beams == 1: diff --git a/tests/unittest/trt/attention/test_gpt_attention_IFB.py b/tests/unittest/trt/attention/test_gpt_attention_IFB.py index cda9025a8b9..cacad8c35aa 100644 --- a/tests/unittest/trt/attention/test_gpt_attention_IFB.py +++ b/tests/unittest/trt/attention/test_gpt_attention_IFB.py @@ -26,7 +26,6 @@ from parameterized import parameterized from transformers import GPT2Config, GPTBigCodeConfig, GPTJConfig, LlamaConfig -from transformers.cache_utils import DynamicCache from transformers.modeling_attn_mask_utils import (AttentionMaskConverter, _prepare_4d_attention_mask) from 
transformers.models.gpt2.modeling_gpt2 import GPT2Attention @@ -51,6 +50,8 @@ MemoryPoolsAllocator from tensorrt_llm.runtime.memory_pools.pools_kv_cache_manager import \ PoolsKVCacheManager +from tests.unittest.trt.attention.hf_dynamic_cache_compat import ( + dynamic_cache_from_legacy, dynamic_cache_to_legacy) class TestFunctional(unittest.TestCase): @@ -381,13 +382,19 @@ def _construct_execution(session, rope_scale_type = RotaryScalingType.none rope_scale = 1.0 if attention_type == "llama_attention": - rope_base = configuration.rope_theta - if configuration.rope_scaling is not None: + rope_params = getattr(configuration, 'rope_parameters', + None) or {} + rope_base = rope_params.get( + 'rope_theta', + getattr(configuration, 'rope_theta', 10000.0)) + rope_type = rope_params.get( + 'rope_type', rope_params.get('type', 'default')) + if rope_type not in ('default', None): rope_scale_type = { "linear": RotaryScalingType.linear, "dynamic": RotaryScalingType.dynamic - }[configuration.rope_scaling["type"]] - rope_scale = configuration.rope_scaling["factor"] + }[rope_type] + rope_scale = rope_params.get("factor", 1.0) rotary_inv_freq, embed_positions_for_gpt_attention = RopeEmbeddingUtils.create_sinusoidal_positions_for_attention_plugin( configuration.max_position_embeddings, rotary_embedding_dim, rope_base, rope_scale) @@ -579,8 +586,17 @@ def _construct_execution(session, attn_implementation='eager') if attention_type == 'llama_attention': configuration.num_key_value_heads = num_kv_heads - configuration.rope_theta = rope_base - configuration.rope_scaling = rope_scaling + # In transformers 5.x, rope_theta/rope_scaling are unified into + # rope_parameters. Build the dict so LlamaRotaryEmbedding works. 
+ if rope_scaling is not None: + rope_params = {**rope_scaling, "rope_theta": rope_base} + else: + rope_params = { + "rope_type": "default", + "rope_theta": rope_base, + } + configuration.rope_parameters = rope_params + configuration.rope_scaling = rope_params if rope_scaling is not None: # scaling is typically used for supporting longer seq lens than max_position_embeddings # so we set the max_position_embeddings to be smaller than total seq len @@ -760,8 +776,7 @@ def torch_exec(step: int, tgt_len=(in_len if step == 0 else 1)) if attention_type == 'gpt2_attention': torch_output = attention(input, - past_key_value=layer_past, - use_cache=True, + past_key_values=layer_past, attention_mask=attention_mask)[0] torch_present = layer_past elif attention_type == 'llama_attention': @@ -774,10 +789,9 @@ def torch_exec(step: int, 1)) torch_output = attention( input, - past_key_value=layer_past, + past_key_values=layer_past, position_embeddings=position_embeddings, - attention_mask=attention_mask, - use_cache=True)[0] + attention_mask=attention_mask)[0] torch_present = layer_past elif attention_type == 'gptj_attention': torch_output, torch_present = attention( @@ -1010,7 +1024,7 @@ def torch_exec(step: int, (local_beam_width, input_length, hidden_size)) # llama/gpt2 uses DynamicCache - past_key_values = DynamicCache.from_legacy_cache( + past_key_values = dynamic_cache_from_legacy( torch_cache_list[req_idx]) torch_out, past_key_values = torch_exec( @@ -1018,7 +1032,8 @@ def torch_exec(step: int, past_key_values) # llama/gpt2 uses DynamicCache - torch_cache_list[req_idx] = past_key_values.to_legacy_cache() + torch_cache_list[req_idx] = dynamic_cache_to_legacy( + past_key_values) past_key_values = torch_cache_list[req_idx][0] if use_fp8_kv_cache or use_int8_kv_cache: diff --git a/tests/unittest/trt/model/test_phi.py b/tests/unittest/trt/model/test_phi.py index 9db18f4e46e..b3cf8d28f2f 100644 --- a/tests/unittest/trt/model/test_phi.py +++ b/tests/unittest/trt/model/test_phi.py 
@@ -24,6 +24,7 @@ import tensorrt_llm from tensorrt_llm import Builder +from tensorrt_llm._utils import get_hf_rope_theta from tensorrt_llm.models.phi.convert import load_weights_from_hf_model from tensorrt_llm.network import net_guard from tensorrt_llm.plugin.plugin import ContextFMHAType @@ -61,7 +62,7 @@ def initialize_network(self, network: tensorrt_llm.Network, hf_model, 'num_attention_heads': hf_config.num_key_value_heads, 'rotary_pct': hf_config.partial_rotary_factor, 'position_embedding_type': 'rope_gpt_neox', - 'rope_theta': hf_config.rope_theta, + 'rope_theta': get_hf_rope_theta(hf_config, 10000.0), 'hidden_size': hf_config.hidden_size, 'intermediate_size': hf_config.intermediate_size, 'vocab_size': hf_config.vocab_size, diff --git a/triton_backend/all_models/multimodal/multimodal_encoders/1/multimodal_utils.py b/triton_backend/all_models/multimodal/multimodal_encoders/1/multimodal_utils.py index 4caff0bbffc..85600193bfe 100644 --- a/triton_backend/all_models/multimodal/multimodal_encoders/1/multimodal_utils.py +++ b/triton_backend/all_models/multimodal/multimodal_encoders/1/multimodal_utils.py @@ -2,6 +2,8 @@ import torch +from tensorrt_llm._utils import get_hf_rope_theta + class LlavaOnevisionUtils: # https://github.com/huggingface/transformers/blob/main/src/transformers/models/llava_onevision/modeling_llava_onevision.py @@ -129,7 +131,7 @@ def __init__(self, config): self.max_position_embeddings = config.max_position_embeddings self.hidden_size = config.hidden_size self.num_attention_heads = config.num_attention_heads - self.rope_theta = config.rope_theta + self.rope_theta = get_hf_rope_theta(config, 10000.0) def get_rope_index( self,