From 34d8e335e8e7914af32d54703cf0e2a35bf645d7 Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Wed, 8 Apr 2026 00:33:56 -0700 Subject: [PATCH 01/22] Upgrade transformers 5.3.0 dependency Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b76e28208bd..bfe364fde97 100644 --- a/requirements.txt +++ b/requirements.txt @@ -30,7 +30,7 @@ nvidia-modelopt[torch]~=0.37.0 # torch 2.10.0+cu130 depends on nvidia-nccl-cu13==2.28.9 nvidia-nccl-cu13>=2.28.9,<=2.29.2 nvidia-cuda-nvrtc -transformers==4.57.3 +transformers==5.3.0 prometheus_client prometheus_fastapi_instrumentator pydantic>=2.9.1 From a16ddeb828b9e1348d948d076f9240543b8ed852 Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Wed, 8 Apr 2026 01:01:24 -0700 Subject: [PATCH 02/22] fix(auto_deploy): fallback SDPA mask patch when executorch helper removed Transformers 5.x removed sdpa_mask_without_vmap from integrations.executorch. Use functools.partial(sdpa_mask, use_vmap=False) when the legacy import fails. 
Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> Made-with: Cursor --- .../export/library/transformers_sdpa_mask.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tensorrt_llm/_torch/auto_deploy/export/library/transformers_sdpa_mask.py b/tensorrt_llm/_torch/auto_deploy/export/library/transformers_sdpa_mask.py index fd21604d1b6..d6272be573d 100644 --- a/tensorrt_llm/_torch/auto_deploy/export/library/transformers_sdpa_mask.py +++ b/tensorrt_llm/_torch/auto_deploy/export/library/transformers_sdpa_mask.py @@ -1,6 +1,7 @@ """Patch for transformers SDPA mask to be export-compatible.""" import importlib.metadata +from functools import partial from packaging import version @@ -29,7 +30,14 @@ def _apply_patch(self): try: # imports only after version check from transformers import masking_utils - from transformers.integrations.executorch import sdpa_mask_without_vmap + + # Up to ~4.53+, HF exposed this helper next to ExecuTorch export utilities. + # Transformers 5.x removed it; sdpa_mask now supports use_vmap=False (the default), + # which is export-compatible without vmap. + try: + from transformers.integrations.executorch import sdpa_mask_without_vmap + except ImportError: + sdpa_mask_without_vmap = partial(masking_utils.sdpa_mask, use_vmap=False) # recall original implementation self.original_values["masking_utils.sdpa_mask"] = masking_utils.sdpa_mask From f274bb6c9a7a96ba75ae9924b85e5c7d226347f0 Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Wed, 8 Apr 2026 03:24:04 -0700 Subject: [PATCH 03/22] fix: import AutoModelForImageTextToText for Transformers v5 AutoModelForVision2Seq was removed from the public API in Transformers v5. Fall back to AutoModelForImageTextToText when the legacy name is unavailable. 
Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> Made-with: Cursor --- tensorrt_llm/models/gpt/convert.py | 9 +++++++-- tensorrt_llm/tools/multimodal_builder.py | 11 ++++++++--- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/tensorrt_llm/models/gpt/convert.py b/tensorrt_llm/models/gpt/convert.py index 1e2bc4b999d..315fe6a17d4 100644 --- a/tensorrt_llm/models/gpt/convert.py +++ b/tensorrt_llm/models/gpt/convert.py @@ -29,8 +29,13 @@ import torch.nn as nn import yaml from tqdm import tqdm -from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq, - AutoTokenizer) +try: + from transformers import AutoModelForVision2Seq +except ImportError: + # Transformers v5+: vision-to-seq auto models use AutoModelForImageTextToText + from transformers import AutoModelForImageTextToText as AutoModelForVision2Seq + +from transformers import AutoModelForCausalLM, AutoTokenizer from transformers.models.gpt2.modeling_gpt2 import GPT2Block from transformers.pytorch_utils import Conv1D diff --git a/tensorrt_llm/tools/multimodal_builder.py b/tensorrt_llm/tools/multimodal_builder.py index bf948eb2506..c2006f5f2c0 100644 --- a/tensorrt_llm/tools/multimodal_builder.py +++ b/tensorrt_llm/tools/multimodal_builder.py @@ -14,10 +14,15 @@ from tensorrt_llm._utils import torch_dtype_to_str, to_json_file from tensorrt_llm.builder import Builder from tensorrt_llm.logger import logger +try: + from transformers import AutoModelForVision2Seq +except ImportError: + # Transformers v5+: vision-to-seq auto models use AutoModelForImageTextToText + from transformers import AutoModelForImageTextToText as AutoModelForVision2Seq + from transformers import (AutoConfig, AutoModel, AutoModelForCausalLM, - AutoModelForVision2Seq, AutoProcessor, - Blip2ForConditionalGeneration, Blip2Processor, - FuyuForCausalLM, FuyuProcessor, + AutoProcessor, Blip2ForConditionalGeneration, + Blip2Processor, FuyuForCausalLM, FuyuProcessor, LlavaForConditionalGeneration, 
NougatProcessor, Pix2StructForConditionalGeneration, VisionEncoderDecoderModel, CLIPVisionModel) From 9989aae9f31d8b4391899c225c30e67d86b4ca75 Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Wed, 8 Apr 2026 08:25:37 -0700 Subject: [PATCH 04/22] fix: shim get_parameter_device/dtype for Transformers v5 Transformers v5 removed these helpers from modeling_utils. Add hf_parameter_utils with ImportError fallback matching ModuleUtilsMixin behavior; update CLIP, SigLIP, and Wan call sites. Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> Made-with: Cursor --- .../_torch/models/hf_parameter_utils.py | 36 +++++++++++++++++++ tensorrt_llm/_torch/models/modeling_clip.py | 3 +- tensorrt_llm/_torch/models/modeling_siglip.py | 3 +- .../visual_gen/models/wan/transformer_wan.py | 2 +- tensorrt_llm/models/gpt/convert.py | 1 + 5 files changed, 40 insertions(+), 5 deletions(-) create mode 100644 tensorrt_llm/_torch/models/hf_parameter_utils.py diff --git a/tensorrt_llm/_torch/models/hf_parameter_utils.py b/tensorrt_llm/_torch/models/hf_parameter_utils.py new file mode 100644 index 00000000000..b5629827625 --- /dev/null +++ b/tensorrt_llm/_torch/models/hf_parameter_utils.py @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Compatibility for Hugging Face ``get_parameter_device`` / ``get_parameter_dtype``. + +Transformers v5 no longer exports these from ``transformers.modeling_utils``; they +match ``ModuleUtilsMixin`` behavior for plain ``nn.Module`` stacks. +""" + +from __future__ import annotations + +import torch +import torch.nn as nn + +try: + from transformers.modeling_utils import (get_parameter_device, + get_parameter_dtype) +except ImportError: + + def get_parameter_device(module: nn.Module) -> torch.device: + return next(module.parameters()).device + + def get_parameter_dtype(module: nn.Module) -> torch.dtype: + return next(param.dtype for param in module.parameters() if param.is_floating_point()) diff --git a/tensorrt_llm/_torch/models/modeling_clip.py b/tensorrt_llm/_torch/models/modeling_clip.py index 1e203eda8b7..9e73dcc2dd3 100644 --- a/tensorrt_llm/_torch/models/modeling_clip.py +++ b/tensorrt_llm/_torch/models/modeling_clip.py @@ -4,8 +4,6 @@ import torch.nn as nn from transformers.activations import ACT2FN from transformers.modeling_outputs import BaseModelOutput -from transformers.modeling_utils import (get_parameter_device, - get_parameter_dtype) from transformers.models.clip.configuration_clip import CLIPVisionConfig from transformers.models.clip.modeling_clip import CLIPVisionEmbeddings @@ -17,6 +15,7 @@ from ..model_config import ModelConfig from ..modules.attention import Attention from ..modules.mlp import MLP +from .hf_parameter_utils import get_parameter_device, get_parameter_dtype from .modeling_utils import _load_weights_impl, register_auto_model diff --git a/tensorrt_llm/_torch/models/modeling_siglip.py b/tensorrt_llm/_torch/models/modeling_siglip.py index e4ed6d462b8..071ee7f03d4 100644 --- a/tensorrt_llm/_torch/models/modeling_siglip.py +++ b/tensorrt_llm/_torch/models/modeling_siglip.py @@ -2,8 +2,6 @@ import torch import torch.nn as nn -from transformers.modeling_utils import (get_parameter_device, - get_parameter_dtype) from 
transformers.models.siglip.configuration_siglip import SiglipVisionConfig from transformers.models.siglip.modeling_siglip import (SiglipVisionConfig, SiglipVisionEmbeddings) @@ -13,6 +11,7 @@ from ..attention_backend.interface import AttentionMetadata from ..attention_backend.utils import get_attention_backend from ..model_config import ModelConfig +from .hf_parameter_utils import get_parameter_device, get_parameter_dtype from .modeling_clip import CLIPEncoder from .modeling_utils import _load_weights_impl, register_auto_model diff --git a/tensorrt_llm/_torch/visual_gen/models/wan/transformer_wan.py b/tensorrt_llm/_torch/visual_gen/models/wan/transformer_wan.py index 49e56c4d23d..67d805416cf 100644 --- a/tensorrt_llm/_torch/visual_gen/models/wan/transformer_wan.py +++ b/tensorrt_llm/_torch/visual_gen/models/wan/transformer_wan.py @@ -6,8 +6,8 @@ import torch.nn.functional as F from diffusers.models.embeddings import PixArtAlphaTextProjection, TimestepEmbedding, Timesteps from tqdm import tqdm -from transformers.modeling_utils import get_parameter_device +from tensorrt_llm._torch.models.hf_parameter_utils import get_parameter_device from tensorrt_llm._torch.modules.layer_norm import LayerNorm from tensorrt_llm._torch.modules.linear import Linear from tensorrt_llm._torch.modules.mlp import MLP diff --git a/tensorrt_llm/models/gpt/convert.py b/tensorrt_llm/models/gpt/convert.py index 315fe6a17d4..aa7d9a89e0d 100644 --- a/tensorrt_llm/models/gpt/convert.py +++ b/tensorrt_llm/models/gpt/convert.py @@ -29,6 +29,7 @@ import torch.nn as nn import yaml from tqdm import tqdm + try: from transformers import AutoModelForVision2Seq except ImportError: From 7de8d791d6e4ee98986e64c7a45761743148b57b Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Wed, 8 Apr 2026 19:12:21 -0700 Subject: [PATCH 05/22] fix: pass exist_ok to AutoConfig.register for Transformers 5.3 Transformers 5.x is stricter about duplicate config registration. 
Allow TRT-LLM re-registration when configs are already registered. Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> Made-with: Cursor --- .../_torch/auto_deploy/models/custom/modeling_qwen3_5_moe.py | 4 ++-- tensorrt_llm/_torch/models/modeling_exaone4.py | 2 +- tensorrt_llm/_torch/models/modeling_exaone_moe.py | 2 +- tensorrt_llm/_torch/models/modeling_nemotron_h.py | 2 +- tensorrt_llm/_torch/models/modeling_vila.py | 2 +- tensorrt_llm/models/llama/config.py | 3 ++- 6 files changed, 8 insertions(+), 7 deletions(-) diff --git a/tensorrt_llm/_torch/auto_deploy/models/custom/modeling_qwen3_5_moe.py b/tensorrt_llm/_torch/auto_deploy/models/custom/modeling_qwen3_5_moe.py index e227bc7ebec..71c11814379 100644 --- a/tensorrt_llm/_torch/auto_deploy/models/custom/modeling_qwen3_5_moe.py +++ b/tensorrt_llm/_torch/auto_deploy/models/custom/modeling_qwen3_5_moe.py @@ -2854,8 +2854,8 @@ def init_input_processor(self, base): # Registration # ============================================================================= -AutoConfig.register("qwen3_5_moe", Qwen3_5MoeConfig) -AutoConfig.register("qwen3_5_moe_text", Qwen3_5MoeTextConfig) +AutoConfig.register("qwen3_5_moe", Qwen3_5MoeConfig, exist_ok=True) +AutoConfig.register("qwen3_5_moe_text", Qwen3_5MoeTextConfig, exist_ok=True) AutoModelForCausalLMFactory.register_custom_model_cls("Qwen3_5MoeTextConfig", Qwen3_5MoeForCausalLM) Qwen3_5MoeFactory.register_custom_model_cls("Qwen3_5MoeConfig", Qwen3_5MoeForConditionalGeneration) diff --git a/tensorrt_llm/_torch/models/modeling_exaone4.py b/tensorrt_llm/_torch/models/modeling_exaone4.py index 07951fc28a4..2b49b9bb475 100644 --- a/tensorrt_llm/_torch/models/modeling_exaone4.py +++ b/tensorrt_llm/_torch/models/modeling_exaone4.py @@ -29,7 +29,7 @@ class Exaone4Config(PretrainedConfig): model_type = "exaone4" - AutoConfig.register(Exaone4Config.model_type, Exaone4Config) + AutoConfig.register(Exaone4Config.model_type, Exaone4Config, exist_ok=True) def 
check_is_sliding(config: Exaone4Config, layer_idx: int) -> bool: diff --git a/tensorrt_llm/_torch/models/modeling_exaone_moe.py b/tensorrt_llm/_torch/models/modeling_exaone_moe.py index fe420178558..621065ccb47 100644 --- a/tensorrt_llm/_torch/models/modeling_exaone_moe.py +++ b/tensorrt_llm/_torch/models/modeling_exaone_moe.py @@ -53,7 +53,7 @@ class ExaoneMoEConfig(PretrainedConfig): "Register ExaoneMoEConfig to mimic the ExaoneMoE model.", key="EXAONE_MOE_REGISTER_WARNING" ) -AutoConfig.register(ExaoneMoEConfig.model_type, ExaoneMoEConfig) +AutoConfig.register(ExaoneMoEConfig.model_type, ExaoneMoEConfig, exist_ok=True) # End of the config register. # fmt: on diff --git a/tensorrt_llm/_torch/models/modeling_nemotron_h.py b/tensorrt_llm/_torch/models/modeling_nemotron_h.py index 623195da94a..446a7383a0a 100644 --- a/tensorrt_llm/_torch/models/modeling_nemotron_h.py +++ b/tensorrt_llm/_torch/models/modeling_nemotron_h.py @@ -1073,4 +1073,4 @@ def forward( return hidden_states -AutoConfig.register(NemotronHConfig.model_type, NemotronHConfig) +AutoConfig.register(NemotronHConfig.model_type, NemotronHConfig, exist_ok=True) diff --git a/tensorrt_llm/_torch/models/modeling_vila.py b/tensorrt_llm/_torch/models/modeling_vila.py index 8b634229237..1e3bcab02b5 100644 --- a/tensorrt_llm/_torch/models/modeling_vila.py +++ b/tensorrt_llm/_torch/models/modeling_vila.py @@ -1252,5 +1252,5 @@ def post_config(self): self.model_config.pretrained_config = self.llm.config -AutoConfig.register(VilaConfig.model_type, VilaConfig) +AutoConfig.register(VilaConfig.model_type, VilaConfig, exist_ok=True) AutoModel.register(VilaConfig, VilaModel) diff --git a/tensorrt_llm/models/llama/config.py b/tensorrt_llm/models/llama/config.py index 7e0369a4ba0..6db265dbd73 100644 --- a/tensorrt_llm/models/llama/config.py +++ b/tensorrt_llm/models/llama/config.py @@ -112,7 +112,8 @@ def from_hugging_face( from llava.model import LlavaLlamaConfig # noqa from llava.model import LlavaLlamaModel 
transformers.AutoConfig.register("llava_llama", - LlavaLlamaConfig) + LlavaLlamaConfig, + exist_ok=True) transformers.AutoModelForCausalLM.register( LlavaLlamaConfig, LlavaLlamaModel) From cba7d8dc146467b43c0b68bc819d9d17e69a247c Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Wed, 8 Apr 2026 19:12:33 -0700 Subject: [PATCH 06/22] fix: replace removed load_sharded_checkpoint for Transformers 5.3 Implement local sharded checkpoint loading for Llama4 vision encoder; load_sharded_checkpoint was removed from transformers.modeling_utils. Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> Made-with: Cursor --- tensorrt_llm/_torch/models/modeling_llama.py | 62 ++++++++++++++++++-- 1 file changed, 58 insertions(+), 4 deletions(-) diff --git a/tensorrt_llm/_torch/models/modeling_llama.py b/tensorrt_llm/_torch/models/modeling_llama.py index b13c2e3de91..e898a344e8a 100644 --- a/tensorrt_llm/_torch/models/modeling_llama.py +++ b/tensorrt_llm/_torch/models/modeling_llama.py @@ -7,7 +7,8 @@ from torch import nn from transformers import (AutoProcessor, AutoTokenizer, Llama4Config, Llama4VisionModel, LlamaConfig, PretrainedConfig) -from transformers.modeling_utils import load_sharded_checkpoint +from transformers.utils import (SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, + WEIGHTS_INDEX_NAME, WEIGHTS_NAME) from transformers.models.llama4.modeling_llama4 import Llama4MultiModalProjector from tensorrt_llm._torch.distributed import (AllReduce, AllReduceFusionOp, @@ -1118,6 +1119,58 @@ def post_load_weights(self): layer.next_attn = self.model.layers[idx + 1].self_attn +def _load_checkpoint_into_module(module: nn.Module, + folder: str, + strict: bool = True) -> None: + """Load a sharded HuggingFace checkpoint into a module. + + This replaces the removed ``transformers.modeling_utils.load_sharded_checkpoint`` + function. It supports both safetensors and PyTorch checkpoint formats. 
+ """ + folder = str(folder) + + # Determine checkpoint format and collect shard files + index_file = os.path.join(folder, SAFE_WEIGHTS_INDEX_NAME) + if os.path.isfile(index_file): + import json + with open(index_file) as f: + shard_files = sorted( + set(json.load(f)["weight_map"].values())) + shard_paths = [os.path.join(folder, s) for s in shard_files] + use_safetensors = True + elif os.path.isfile(os.path.join(folder, WEIGHTS_INDEX_NAME)): + import json + with open(os.path.join(folder, WEIGHTS_INDEX_NAME)) as f: + shard_files = sorted( + set(json.load(f)["weight_map"].values())) + shard_paths = [os.path.join(folder, s) for s in shard_files] + use_safetensors = False + elif os.path.isfile(os.path.join(folder, SAFE_WEIGHTS_NAME)): + shard_paths = [os.path.join(folder, SAFE_WEIGHTS_NAME)] + use_safetensors = True + elif os.path.isfile(os.path.join(folder, WEIGHTS_NAME)): + shard_paths = [os.path.join(folder, WEIGHTS_NAME)] + use_safetensors = False + else: + raise FileNotFoundError( + f"No checkpoint found in {folder}. Expected " + f"{SAFE_WEIGHTS_INDEX_NAME}, {WEIGHTS_INDEX_NAME}, " + f"{SAFE_WEIGHTS_NAME}, or {WEIGHTS_NAME}.") + + # Load state dict from all shards and merge + full_state_dict: Dict[str, torch.Tensor] = {} + if use_safetensors: + from safetensors.torch import load_file + for path in shard_paths: + full_state_dict.update(load_file(path)) + else: + for path in shard_paths: + full_state_dict.update( + torch.load(path, map_location="cpu", weights_only=True)) + + module.load_state_dict(full_state_dict, strict=strict) + + class Llama4VisionEncoder(nn.Module): def __init__(self, model_config: ModelConfig[Llama4Config], *args, @@ -1148,9 +1201,10 @@ def load_weights(self, weights: Dict): # Otherwise, load the weights from the checkpoint. 
else: - load_sharded_checkpoint(module_dict, - self.pretrained_config._name_or_path, - strict=False) + _load_checkpoint_into_module( + module_dict, + self.pretrained_config._name_or_path, + strict=False) self.vision_model = module_dict["vision_model"].to(self.device) self.mm_projector = module_dict["multi_modal_projector"].to(self.device) From 1ca795eb62bcdb04d58768699a1079d19b000e80 Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Wed, 8 Apr 2026 19:26:18 -0700 Subject: [PATCH 07/22] format code Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> --- tensorrt_llm/_torch/models/hf_parameter_utils.py | 3 +-- tensorrt_llm/_torch/models/modeling_llama.py | 15 ++++++--------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/tensorrt_llm/_torch/models/hf_parameter_utils.py b/tensorrt_llm/_torch/models/hf_parameter_utils.py index b5629827625..8a163c4f917 100644 --- a/tensorrt_llm/_torch/models/hf_parameter_utils.py +++ b/tensorrt_llm/_torch/models/hf_parameter_utils.py @@ -25,8 +25,7 @@ import torch.nn as nn try: - from transformers.modeling_utils import (get_parameter_device, - get_parameter_dtype) + from transformers.modeling_utils import get_parameter_device, get_parameter_dtype except ImportError: def get_parameter_device(module: nn.Module) -> torch.device: diff --git a/tensorrt_llm/_torch/models/modeling_llama.py b/tensorrt_llm/_torch/models/modeling_llama.py index e898a344e8a..8dff496563a 100644 --- a/tensorrt_llm/_torch/models/modeling_llama.py +++ b/tensorrt_llm/_torch/models/modeling_llama.py @@ -7,9 +7,9 @@ from torch import nn from transformers import (AutoProcessor, AutoTokenizer, Llama4Config, Llama4VisionModel, LlamaConfig, PretrainedConfig) +from transformers.models.llama4.modeling_llama4 import Llama4MultiModalProjector from transformers.utils import (SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, WEIGHTS_INDEX_NAME, WEIGHTS_NAME) -from transformers.models.llama4.modeling_llama4 
import Llama4MultiModalProjector from tensorrt_llm._torch.distributed import (AllReduce, AllReduceFusionOp, AllReduceParams, MoEAllReduce) @@ -1134,15 +1134,13 @@ def _load_checkpoint_into_module(module: nn.Module, if os.path.isfile(index_file): import json with open(index_file) as f: - shard_files = sorted( - set(json.load(f)["weight_map"].values())) + shard_files = sorted(set(json.load(f)["weight_map"].values())) shard_paths = [os.path.join(folder, s) for s in shard_files] use_safetensors = True elif os.path.isfile(os.path.join(folder, WEIGHTS_INDEX_NAME)): import json with open(os.path.join(folder, WEIGHTS_INDEX_NAME)) as f: - shard_files = sorted( - set(json.load(f)["weight_map"].values())) + shard_files = sorted(set(json.load(f)["weight_map"].values())) shard_paths = [os.path.join(folder, s) for s in shard_files] use_safetensors = False elif os.path.isfile(os.path.join(folder, SAFE_WEIGHTS_NAME)): @@ -1201,10 +1199,9 @@ def load_weights(self, weights: Dict): # Otherwise, load the weights from the checkpoint. else: - _load_checkpoint_into_module( - module_dict, - self.pretrained_config._name_or_path, - strict=False) + _load_checkpoint_into_module(module_dict, + self.pretrained_config._name_or_path, + strict=False) self.vision_model = module_dict["vision_model"].to(self.device) self.mm_projector = module_dict["multi_modal_projector"].to(self.device) From 39efba415c1eaa701cc3dd56fd82288d40b59cad Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Wed, 8 Apr 2026 23:39:22 -0700 Subject: [PATCH 08/22] test: replace HybridCache with StaticCache helper for Transformers v5 HybridCache was removed from transformers.cache_utils in v5. Add make_hf_hybrid_cache_for_tests using StaticCache on ImportError; update Cohere2, Exaone4, and Gemma3 modeling tests. 
Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> Made-with: Cursor --- tests/unittest/_torch/helpers.py | 33 ++++++++++++++++++- .../_torch/modeling/test_modeling_cohere2.py | 6 ++-- .../_torch/modeling/test_modeling_exaone4.py | 16 +++++---- .../_torch/modeling/test_modeling_gemma3.py | 14 ++++---- 4 files changed, 52 insertions(+), 17 deletions(-) diff --git a/tests/unittest/_torch/helpers.py b/tests/unittest/_torch/helpers.py index 743f3998f26..f7dde64c669 100644 --- a/tests/unittest/_torch/helpers.py +++ b/tests/unittest/_torch/helpers.py @@ -1,4 +1,4 @@ -from typing import Dict, Tuple +from typing import Dict, Optional, Tuple import torch import torch.nn.functional as F @@ -256,3 +256,34 @@ def create_mock_cuda_graph_runner(batch_size: int, use_mrope: bool = False): dist=None, kv_cache_manager_key=ResourceManagerType.KV_CACHE_MANAGER) return CUDAGraphRunner(config) + + +def make_hf_hybrid_cache_for_tests( + config, + *, + max_cache_len: int, + max_batch_size: Optional[int] = None, + device=None, + dtype=None, +): + """Build Hugging Face ``past_key_values`` for hybrid / sliding-window models in tests. + + Transformers v4 exposes ``HybridCache``; v5 removes it in favor of ``StaticCache`` + for fixed-length pre-allocated KV (see HF cache refactor). 
+ """ + try: + from transformers.cache_utils import HybridCache + except ImportError: + from transformers.cache_utils import StaticCache + + return StaticCache(config=config, max_cache_len=max_cache_len) + + kwargs = { + "config": config, + "max_cache_len": max_cache_len, + "device": device, + "dtype": dtype, + } + if max_batch_size is not None: + kwargs["max_batch_size"] = max_batch_size + return HybridCache(**kwargs) diff --git a/tests/unittest/_torch/modeling/test_modeling_cohere2.py b/tests/unittest/_torch/modeling/test_modeling_cohere2.py index 20c2e88fe69..783c0939251 100644 --- a/tests/unittest/_torch/modeling/test_modeling_cohere2.py +++ b/tests/unittest/_torch/modeling/test_modeling_cohere2.py @@ -3,9 +3,9 @@ import torch from transformers import Cohere2Config from transformers import Cohere2ForCausalLM as HFCohere2ForCausalLM -from transformers.cache_utils import HybridCache import tensorrt_llm +from _torch.helpers import make_hf_hybrid_cache_for_tests from tensorrt_llm._torch.attention_backend.utils import get_attention_backend from tensorrt_llm._torch.metadata import KVCacheParams from tensorrt_llm._torch.model_config import ModelConfig @@ -161,8 +161,8 @@ def test_cohere2_allclose_to_hf(self) -> None: # Initialize the hugging face model hf_cohere2 = HFCohere2ForCausalLM(cohere2_config).to(dtype).to(device).eval() - hf_cache = HybridCache( - config=cohere2_config, + hf_cache = make_hf_hybrid_cache_for_tests( + cohere2_config, max_batch_size=batch_size, max_cache_len=10, device=device, diff --git a/tests/unittest/_torch/modeling/test_modeling_exaone4.py b/tests/unittest/_torch/modeling/test_modeling_exaone4.py index 931828be848..a9f1517a30f 100644 --- a/tests/unittest/_torch/modeling/test_modeling_exaone4.py +++ b/tests/unittest/_torch/modeling/test_modeling_exaone4.py @@ -25,8 +25,8 @@ class Exaone4Config(PretrainedConfig): # TODO: Remove this once we have a proper config for Exaone4 SKIP_EXAONE4_HF_ACCURACY_TEST = True -from _torch.helpers import 
create_mock_cuda_graph_runner -from transformers.cache_utils import HybridCache +from _torch.helpers import (create_mock_cuda_graph_runner, + make_hf_hybrid_cache_for_tests) from utils.util import getSMVersion import tensorrt_llm @@ -248,11 +248,13 @@ def test_exaone4_allclose_to_hf(self, scenario: Scenario) -> None: num_kv_heads = exaone4.config.num_key_value_heads max_seq_len = num_blocks * tokens_per_block batch_size = 1 - hf_cache = HybridCache(config=exaone4_config, - max_batch_size=batch_size, - max_cache_len=max_seq_len, - device=device, - dtype=dtype) + hf_cache = make_hf_hybrid_cache_for_tests( + exaone4_config, + max_batch_size=batch_size, + max_cache_len=max_seq_len, + device=device, + dtype=dtype, + ) if dtype == torch.half: kv_cache_dtype = tensorrt_llm.bindings.DataType.HALF elif dtype == torch.bfloat16: diff --git a/tests/unittest/_torch/modeling/test_modeling_gemma3.py b/tests/unittest/_torch/modeling/test_modeling_gemma3.py index 6b532b9b1c6..7d164b7a6d2 100644 --- a/tests/unittest/_torch/modeling/test_modeling_gemma3.py +++ b/tests/unittest/_torch/modeling/test_modeling_gemma3.py @@ -7,9 +7,9 @@ from transformers import Gemma3Config from transformers import Gemma3ForCausalLM as HFGemma3ForCausalLM from transformers import Gemma3TextConfig -from transformers.cache_utils import HybridCache import tensorrt_llm +from _torch.helpers import make_hf_hybrid_cache_for_tests from tensorrt_llm._torch.attention_backend import (AttentionMetadata, FlashInferAttentionMetadata) from tensorrt_llm._torch.attention_backend.utils import get_attention_backend @@ -285,11 +285,13 @@ def test_gemma3_allclose_to_hf(self, scenario: Scenario) -> None: hf_gemma3 = HFGemma3ForCausalLM(gemma3_config).to(dtype).to( device).eval() - hf_cache = HybridCache(config=gemma3_config, - max_batch_size=batch_size, - max_cache_len=10, - device=device, - dtype=dtype) + hf_cache = make_hf_hybrid_cache_for_tests( + gemma3_config, + max_batch_size=batch_size, + max_cache_len=10, + 
device=device, + dtype=dtype, + ) model_config = ModelConfig(pretrained_config=gemma3_config, attn_backend=backend) From a0e4198d28a88f32278ee60c97af1549b6fdc5d3 Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Wed, 8 Apr 2026 23:50:36 -0700 Subject: [PATCH 09/22] fix: map Transformers v5 rope_scaling type default to RoPE none Transformers 5.x may set rope_scaling["type"] to "default" for standard RoPE. Teach RotaryScalingType.from_string to treat it as none, accept None, and match enum names case-insensitively. Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> Made-with: Cursor --- tensorrt_llm/functional.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tensorrt_llm/functional.py b/tensorrt_llm/functional.py index 5dd99755dc6..694aab45f2d 100755 --- a/tensorrt_llm/functional.py +++ b/tensorrt_llm/functional.py @@ -680,8 +680,16 @@ class RotaryScalingType(IntEnum): @staticmethod def from_string(s): + if isinstance(s, RotaryScalingType): + return s + if s is None: + return RotaryScalingType.none + key = str(s).lower() + # Hugging Face Transformers v5+ uses type "default" for unscaled / standard RoPE. + if key == "default": + return RotaryScalingType.none try: - return RotaryScalingType[s] + return RotaryScalingType[key] except KeyError: raise ValueError(f'Unsupported rotary scaling type: {s}') From 6d5701f132da531bd58bfbe4473964b8874dc707 Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Thu, 9 Apr 2026 00:04:31 -0700 Subject: [PATCH 10/22] fix: read HF rope_theta from rope_parameters for Transformers v5 Add tensorrt_llm._utils.get_hf_rope_theta() and use it across TRT-LLM configs, converters, RopeParams, and multimodal paths so Llama-style HF configs without top-level rope_theta still resolve the correct base. 
Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> Made-with: Cursor --- examples/eagle/convert_checkpoint.py | 3 ++- examples/medusa/convert_checkpoint.py | 4 ++-- .../models/contrib/dbrx/convert_checkpoint.py | 4 ++-- .../models/core/internlm2/convert_checkpoint.py | 4 ++-- .../_torch/attention_backend/interface.py | 4 ++-- tensorrt_llm/_torch/models/modeling_gpt_oss.py | 4 ++-- tensorrt_llm/_utils.py | 17 +++++++++++++++++ tensorrt_llm/models/commandr/config.py | 3 ++- tensorrt_llm/models/deepseek_v1/config.py | 3 ++- tensorrt_llm/models/deepseek_v2/config.py | 3 ++- tensorrt_llm/models/deepseek_v2/convert.py | 5 +++-- tensorrt_llm/models/eagle/config.py | 3 ++- tensorrt_llm/models/falcon/config.py | 3 ++- tensorrt_llm/models/gemma/config.py | 3 ++- tensorrt_llm/models/gpt/config.py | 3 ++- tensorrt_llm/models/llama/config.py | 3 ++- tensorrt_llm/models/mllama/config.py | 3 ++- tensorrt_llm/models/nemotron_nas/config.py | 3 ++- tensorrt_llm/models/phi/config.py | 3 ++- tensorrt_llm/models/phi/convert.py | 4 ++-- tensorrt_llm/models/phi3/config.py | 3 ++- tensorrt_llm/models/qwen/config.py | 3 ++- .../quantization/quantize_by_modelopt.py | 5 +++-- tensorrt_llm/runtime/multimodal_model_runner.py | 6 +++--- tests/unittest/trt/model/test_phi.py | 3 ++- .../multimodal_encoders/1/multimodal_utils.py | 4 +++- 26 files changed, 71 insertions(+), 35 deletions(-) diff --git a/examples/eagle/convert_checkpoint.py b/examples/eagle/convert_checkpoint.py index 217144e1ae5..130faee4453 100644 --- a/examples/eagle/convert_checkpoint.py +++ b/examples/eagle/convert_checkpoint.py @@ -9,6 +9,7 @@ import tensorrt_llm from tensorrt_llm._deprecation import emit_engine_arch_deprecation +from tensorrt_llm._utils import get_hf_rope_theta from tensorrt_llm.mapping import Mapping from tensorrt_llm.models.eagle.config import EagleConfig from tensorrt_llm.models.eagle.model import EagleForCausalLM @@ -293,7 +294,7 @@ def copy(tensors): args.rms_norm_eps = 
hf_config.rms_norm_eps args.vocab_size = hf_config.vocab_size args.rotary_scaling = hf_config.rope_scaling - args.rotary_base = hf_config.rope_theta + args.rotary_base = get_hf_rope_theta(hf_config, 10000.0) args.n_positions = hf_config.max_position_embeddings args.dtype = str( hf_config.torch_dtype)[6:] if args.dtype == 'auto' else args.dtype diff --git a/examples/medusa/convert_checkpoint.py b/examples/medusa/convert_checkpoint.py index 48dcc6fd400..09eb55b4610 100644 --- a/examples/medusa/convert_checkpoint.py +++ b/examples/medusa/convert_checkpoint.py @@ -13,7 +13,7 @@ import tensorrt_llm from tensorrt_llm._deprecation import emit_engine_arch_deprecation -from tensorrt_llm._utils import numpy_to_torch +from tensorrt_llm._utils import get_hf_rope_theta, numpy_to_torch from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping from tensorrt_llm.models import (LLaMAForCausalLM, PretrainedConfig, @@ -209,7 +209,7 @@ def main(): args.rms_norm_eps = hf_config.rms_norm_eps args.vocab_size = hf_config.vocab_size args.n_positions = hf_config.max_position_embeddings - args.rotary_base = hf_config.rope_theta + args.rotary_base = get_hf_rope_theta(hf_config, 10000.0) args.rotary_scaling = hf_config.rope_scaling elif args.meta_ckpt_dir is not None: diff --git a/examples/models/contrib/dbrx/convert_checkpoint.py b/examples/models/contrib/dbrx/convert_checkpoint.py index ad487a50c76..1ca287f2588 100644 --- a/examples/models/contrib/dbrx/convert_checkpoint.py +++ b/examples/models/contrib/dbrx/convert_checkpoint.py @@ -18,7 +18,7 @@ import tensorrt_llm from tensorrt_llm._deprecation import emit_engine_arch_deprecation -from tensorrt_llm._utils import release_gc +from tensorrt_llm._utils import get_hf_rope_theta, release_gc from tensorrt_llm.layers import MoeConfig from tensorrt_llm.mapping import Mapping from tensorrt_llm.models.convert_utils import (generate_int8, @@ -557,7 +557,7 @@ def execute(workers, func, hf_model): args.moe_top_k = 1 args.clip_qkv 
= hf_config.attn_config.clip_qkv args.hidden_act = 'swiglu' - args.rotary_base = hf_config.attn_config.rope_theta + args.rotary_base = get_hf_rope_theta(hf_config.attn_config, 10000.0) args.moe_config = MoeConfig( num_experts=args.moe_num_experts, top_k=args.moe_top_k, diff --git a/examples/models/core/internlm2/convert_checkpoint.py b/examples/models/core/internlm2/convert_checkpoint.py index 151a1afe85c..44c80d6d51f 100644 --- a/examples/models/core/internlm2/convert_checkpoint.py +++ b/examples/models/core/internlm2/convert_checkpoint.py @@ -14,7 +14,7 @@ import tensorrt_llm from tensorrt_llm._deprecation import emit_engine_arch_deprecation -from tensorrt_llm._utils import release_gc +from tensorrt_llm._utils import get_hf_rope_theta, release_gc from tensorrt_llm.mapping import Mapping from tensorrt_llm.models.llama import convert @@ -480,7 +480,7 @@ def convert_from_hf(hf_model, 'norm_epsilon': hf_config.rms_norm_eps, 'vocab_size': hf_config.vocab_size, 'position_embedding_type': 'rope_gpt_neox', - 'rotary_base': hf_config.rope_theta, + 'rotary_base': get_hf_rope_theta(hf_config, 10000.0), 'max_position_embeddings': hf_config.max_position_embeddings, 'hidden_act': hf_config.hidden_act, 'use_parallel_embedding': args.use_parallel_embedding, diff --git a/tensorrt_llm/_torch/attention_backend/interface.py b/tensorrt_llm/_torch/attention_backend/interface.py index 600f655bc51..f86b550482f 100644 --- a/tensorrt_llm/_torch/attention_backend/interface.py +++ b/tensorrt_llm/_torch/attention_backend/interface.py @@ -14,7 +14,7 @@ from ..speculative.interface import SpecMetadata from ..speculative.spec_tree_manager import SpecTreeManager -from tensorrt_llm._utils import maybe_pin_memory +from tensorrt_llm._utils import get_hf_rope_theta, maybe_pin_memory from tensorrt_llm.functional import (PositionEmbeddingType, RopeEmbeddingUtils, RotaryScalingType) from tensorrt_llm.mapping import Mapping @@ -498,7 +498,7 @@ def from_config(config) -> "RopeParams": head_dim = 
hidden_size // num_attention_heads rope_scaling = getattr(config, 'rope_scaling', None) rope_params.max_positions = config.max_position_embeddings - rope_params.theta = getattr(config, 'rope_theta', 10000.0) + rope_params.theta = get_hf_rope_theta(config, 10000.0) rope_percentage = (getattr(config, 'rotary_pct', None) or getattr(config, 'partial_rotary_factor', None) or 1.0) diff --git a/tensorrt_llm/_torch/models/modeling_gpt_oss.py b/tensorrt_llm/_torch/models/modeling_gpt_oss.py index 4d46611a7fd..7edeb3e73be 100644 --- a/tensorrt_llm/_torch/models/modeling_gpt_oss.py +++ b/tensorrt_llm/_torch/models/modeling_gpt_oss.py @@ -7,7 +7,7 @@ from tqdm import tqdm from transformers import GptOssConfig -from tensorrt_llm._utils import get_sm_version +from tensorrt_llm._utils import get_hf_rope_theta, get_sm_version from tensorrt_llm.functional import PositionEmbeddingType, RotaryScalingType from ..attention_backend import AttentionMetadata @@ -55,7 +55,7 @@ def __init__( type=PositionEmbeddingType.yarn, rope=RopeParams( dim=pretrained_config.head_dim, - theta=pretrained_config.rope_theta, + theta=get_hf_rope_theta(pretrained_config, 10000.0), scale_type=RotaryScalingType.yarn, scale=pretrained_config.rope_scaling['factor'], max_positions=pretrained_config.max_position_embeddings, diff --git a/tensorrt_llm/_utils.py b/tensorrt_llm/_utils.py index 47a6a88499e..c53e7a08504 100644 --- a/tensorrt_llm/_utils.py +++ b/tensorrt_llm/_utils.py @@ -70,6 +70,23 @@ np_float8 = np.dtype('V1', metadata={"dtype": "float8"}) +def get_hf_rope_theta(config: Any, default: float = 10000.0) -> float: + """Return RoPE ``theta`` from a Hugging Face ``PreTrainedConfig``-like object. + + Transformers v5+ nests ``rope_theta`` under ``rope_parameters`` for several + models (e.g. LLaMA); older releases expose ``config.rope_theta`` directly. 
+ """ + theta = getattr(config, "rope_theta", None) + if theta is not None: + return float(theta) + rope_params = getattr(config, "rope_parameters", None) + if isinstance(rope_params, dict): + theta = rope_params.get("rope_theta") + if theta is not None: + return float(theta) + return default + + def torch_to_numpy(x: torch.Tensor): assert isinstance(x, torch.Tensor), \ f'x must be a torch.Tensor object, but got {type(x)}.' diff --git a/tensorrt_llm/models/commandr/config.py b/tensorrt_llm/models/commandr/config.py index a2edca61fb7..511640c2249 100644 --- a/tensorrt_llm/models/commandr/config.py +++ b/tensorrt_llm/models/commandr/config.py @@ -16,6 +16,7 @@ import transformers +from ..._utils import get_hf_rope_theta from ...mapping import Mapping from ..convert_utils import infer_dtype from ..modeling_utils import PretrainedConfig, QuantConfig @@ -79,7 +80,7 @@ def from_hugging_face( hidden_act=hf_config.hidden_act, norm_epsilon=hf_config.layer_norm_eps, output_multiplier_scale=hf_config.logit_scale, - rotary_base=hf_config.rope_theta, + rotary_base=get_hf_rope_theta(hf_config, 10000.0), attn_bias=hf_config.attention_bias, qk_layernorm=hf_config.use_qk_norm, mapping=mapping, diff --git a/tensorrt_llm/models/deepseek_v1/config.py b/tensorrt_llm/models/deepseek_v1/config.py index b47fa91a43d..e7bff0d9aab 100755 --- a/tensorrt_llm/models/deepseek_v1/config.py +++ b/tensorrt_llm/models/deepseek_v1/config.py @@ -15,6 +15,7 @@ from typing import Optional, Union +from ..._utils import get_hf_rope_theta from ...layers import MoeConfig from ...mapping import Mapping from ..convert_utils import infer_dtype @@ -70,7 +71,7 @@ def from_hugging_face( num_key_value_heads = getattr(hf_config, "num_key_value_heads", hf_config.num_attention_heads) rotary_scaling = getattr(hf_config, "rope_scaling", None) - rotary_base = getattr(hf_config, "rope_theta", 10000.0) + rotary_base = get_hf_rope_theta(hf_config, 10000.0) dtype = infer_dtype(dtype, getattr(hf_config, 'torch_dtype', None)) 
moe_config = MoeConfig( num_experts=getattr(hf_config, 'n_routed_experts', 0), diff --git a/tensorrt_llm/models/deepseek_v2/config.py b/tensorrt_llm/models/deepseek_v2/config.py index edaf21f128c..c110df0d53f 100644 --- a/tensorrt_llm/models/deepseek_v2/config.py +++ b/tensorrt_llm/models/deepseek_v2/config.py @@ -17,6 +17,7 @@ from transformers import AutoConfig +from ..._utils import get_hf_rope_theta from ...layers import MoeConfig from ...mapping import Mapping from ..modeling_utils import PretrainedConfig, QuantConfig @@ -129,7 +130,7 @@ def from_hugging_face( max_position_embeddings=hf_config.max_position_embeddings, hidden_act='swiglu', norm_epsilon=hf_config.rms_norm_eps, - rotary_base=hf_config.rope_theta, + rotary_base=get_hf_rope_theta(hf_config, 10000.0), rotary_scaling=rotary_scaling, moe_inter_size=hf_config.moe_intermediate_size, moe=moe_config, diff --git a/tensorrt_llm/models/deepseek_v2/convert.py b/tensorrt_llm/models/deepseek_v2/convert.py index 697040d3b75..5a23130fc52 100755 --- a/tensorrt_llm/models/deepseek_v2/convert.py +++ b/tensorrt_llm/models/deepseek_v2/convert.py @@ -20,7 +20,8 @@ from tensorrt_llm.layers import MoeConfig -from ..._utils import pad_vocab_size, release_gc, str_dtype_to_torch +from ..._utils import (get_hf_rope_theta, pad_vocab_size, release_gc, + str_dtype_to_torch) from ...logger import logger from ...mapping import Mapping from ..convert_utils import get_tllm_linear_weight @@ -52,7 +53,7 @@ def create_trt_config_from_hf(model_dir, vocab_size = hf_config.vocab_size n_positions = hf_config.max_position_embeddings hidden_act = 'swiglu' # TRT-LLM request make gated activation explicit for MOE implementation - rotary_base = hf_config.rope_theta + rotary_base = get_hf_rope_theta(hf_config, 10000.0) rms_norm_eps = hf_config.rms_norm_eps rotary_scaling_beta_fast = hf_config.rope_scaling['beta_fast'] rotary_scaling_beta_slow = hf_config.rope_scaling['beta_slow'] diff --git a/tensorrt_llm/models/eagle/config.py 
b/tensorrt_llm/models/eagle/config.py index f81e43bb03f..e7a559f3469 100644 --- a/tensorrt_llm/models/eagle/config.py +++ b/tensorrt_llm/models/eagle/config.py @@ -18,6 +18,7 @@ from transformers import LlamaConfig +from ..._utils import get_hf_rope_theta from ...mapping import Mapping from ..convert_utils import infer_dtype from ..llama.config import LLaMAConfig @@ -84,7 +85,7 @@ def from_hugging_face( rms_norm_eps = hf_config.rms_norm_eps vocab_size = hf_config.vocab_size rotary_scaling = hf_config.rope_scaling - rotary_base = hf_config.rope_theta + rotary_base = get_hf_rope_theta(hf_config, 10000.0) n_positions = hf_config.max_position_embeddings hidden_act = hf_config.hidden_act dtype = str(hf_config.torch_dtype)[6:] if dtype == 'auto' else dtype diff --git a/tensorrt_llm/models/falcon/config.py b/tensorrt_llm/models/falcon/config.py index c96bd517cc4..1ff2ff0391c 100644 --- a/tensorrt_llm/models/falcon/config.py +++ b/tensorrt_llm/models/falcon/config.py @@ -14,6 +14,7 @@ # limitations under the License. 
from typing import Optional, Union +from ..._utils import get_hf_rope_theta from ...mapping import Mapping from ..convert_utils import infer_dtype from ..modeling_utils import PretrainedConfig, QuantConfig @@ -109,7 +110,7 @@ def from_hugging_face( max_position_embeddings=getattr(hf_config, 'max_position_embeddings', 2048), - rotary_base=getattr(hf_config, 'rope_theta', 10000.0), + rotary_base=get_hf_rope_theta(hf_config, 10000.0), intermediate_size=getattr(hf_config, 'ffn_hidden_size', None), mapping=mapping, diff --git a/tensorrt_llm/models/gemma/config.py b/tensorrt_llm/models/gemma/config.py index 8e176c4ed7e..3b0d8d6218c 100644 --- a/tensorrt_llm/models/gemma/config.py +++ b/tensorrt_llm/models/gemma/config.py @@ -16,6 +16,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Optional, Union +from tensorrt_llm._utils import get_hf_rope_theta from tensorrt_llm.functional import PositionEmbeddingType from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping @@ -186,7 +187,7 @@ def from_hugging_face( norm_epsilon=hf_config.rms_norm_eps, num_key_value_heads=getattr(hf_config, "num_key_value_heads", hf_config.num_attention_heads), - rotary_base=getattr(hf_config, "rope_theta", 10000.0), + rotary_base=get_hf_rope_theta(hf_config, 10000.0), rotary_scaling=getattr(hf_config, "rotary_scaling", None), quantization=quant_config, mapping=mapping, diff --git a/tensorrt_llm/models/gpt/config.py b/tensorrt_llm/models/gpt/config.py index e89dddd5efe..ba09d1f8694 100644 --- a/tensorrt_llm/models/gpt/config.py +++ b/tensorrt_llm/models/gpt/config.py @@ -17,6 +17,7 @@ import torch +from ..._utils import get_hf_rope_theta from ...layers import MoeConfig from ...logger import logger from ...mapping import Mapping @@ -134,7 +135,7 @@ def from_hugging_face( hf_config.layer_norm_epsilon = hf_config.norm_epsilon if gpt_variant == 'starcoder2' else hf_config.layer_norm_eps hf_config.bias = hf_config.use_bias if gpt_variant == 'starcoder2' else 
gpt_variant != 'nemotron' hf_config.position_embedding_type = 'rope_gpt_neox' - hf_config.rotary_base = hf_config.rope_theta + hf_config.rotary_base = get_hf_rope_theta(hf_config, 10000.0) hf_config.rotary_pct = getattr( hf_config, 'partial_rotary_factor', getattr(hf_config, 'rope_percent', 1.0)) diff --git a/tensorrt_llm/models/llama/config.py b/tensorrt_llm/models/llama/config.py index 6db265dbd73..54038e32c4f 100644 --- a/tensorrt_llm/models/llama/config.py +++ b/tensorrt_llm/models/llama/config.py @@ -18,6 +18,7 @@ from pathlib import Path from typing import Optional, Union +from ..._utils import get_hf_rope_theta from ...layers import MoeConfig from ...mapping import Mapping from ..convert_utils import infer_dtype @@ -161,7 +162,7 @@ def from_hugging_face( attn_bias = getattr(hf_config, 'bias', False) or getattr( hf_config, 'attention_bias', False) rotary_scaling = getattr(hf_config, "rope_scaling", None) - rotary_base = getattr(hf_config, "rope_theta", 10000.0) + rotary_base = get_hf_rope_theta(hf_config, 10000.0) residual_mlp = getattr(hf_config, "parallel_attn_mlp_res", False) disable_weight_only_quant_plugin = kwargs.pop( 'disable_weight_only_quant_plugin', False) diff --git a/tensorrt_llm/models/mllama/config.py b/tensorrt_llm/models/mllama/config.py index 5fb24f6fac7..cbd7f1b8f38 100644 --- a/tensorrt_llm/models/mllama/config.py +++ b/tensorrt_llm/models/mllama/config.py @@ -16,6 +16,7 @@ from pathlib import Path from typing import List, Optional, Union +from ..._utils import get_hf_rope_theta from ...functional import LayerNormPositionType, LayerNormType, MLPType from ...mapping import Mapping from ..convert_utils import infer_dtype @@ -166,7 +167,7 @@ def from_hugging_face( attn_bias = getattr(hf_text_config, 'bias', False) or getattr( hf_text_config, 'attention_bias', False) rotary_scaling = getattr(hf_text_config, "rope_scaling", None) - rotary_base = getattr(hf_text_config, "rope_theta", 10000.0) + rotary_base = get_hf_rope_theta(hf_text_config, 
10000.0) residual_mlp = getattr(hf_text_config, "parallel_attn_mlp_res", False) disable_weight_only_quant_plugin = kwargs.pop( 'disable_weight_only_quant_plugin', False) diff --git a/tensorrt_llm/models/nemotron_nas/config.py b/tensorrt_llm/models/nemotron_nas/config.py index 139b052c7bc..11d02df84b0 100644 --- a/tensorrt_llm/models/nemotron_nas/config.py +++ b/tensorrt_llm/models/nemotron_nas/config.py @@ -15,6 +15,7 @@ from dataclasses import asdict from typing import Any, Dict, List, Optional, Union +from tensorrt_llm._utils import get_hf_rope_theta from tensorrt_llm.functional import PositionEmbeddingType from tensorrt_llm.mapping import Mapping from tensorrt_llm.models.convert_utils import infer_dtype @@ -198,7 +199,7 @@ def from_hugging_face( num_key_value_heads=hf_config.num_key_value_heads, norm_epsilon=hf_config.rms_norm_eps, rotary_scaling=hf_config.rope_scaling, - rotary_base=hf_config.rope_theta, + rotary_base=get_hf_rope_theta(hf_config, 10000.0), vocab_size=hf_config.vocab_size, max_position_embeddings=hf_config.max_position_embeddings, mapping=mapping, diff --git a/tensorrt_llm/models/phi/config.py b/tensorrt_llm/models/phi/config.py index 3d38db0fa7b..583de15fadf 100644 --- a/tensorrt_llm/models/phi/config.py +++ b/tensorrt_llm/models/phi/config.py @@ -15,6 +15,7 @@ from typing import Optional, Union +from ..._utils import get_hf_rope_theta from ...mapping import Mapping from ..convert_utils import infer_dtype from ..modeling_utils import PretrainedConfig, QuantConfig @@ -64,7 +65,7 @@ def from_hugging_face( num_key_value_heads = getattr(hf_config, "num_key_value_heads", hf_config.num_attention_heads) rotary_scaling = getattr(hf_config, "rope_scaling", None) - rotary_base = getattr(hf_config, "rope_theta", 10000.0) + rotary_base = get_hf_rope_theta(hf_config, 10000.0) dtype = infer_dtype(dtype, getattr(hf_config, 'torch_dtype', None)) return cls(architecture=hf_config.architectures[0], diff --git a/tensorrt_llm/models/phi/convert.py 
b/tensorrt_llm/models/phi/convert.py index 0d1ec78bfd7..4bf3406c726 100644 --- a/tensorrt_llm/models/phi/convert.py +++ b/tensorrt_llm/models/phi/convert.py @@ -1,6 +1,6 @@ import torch -from ..._utils import pad_vocab_size, str_dtype_to_torch +from ..._utils import get_hf_rope_theta, pad_vocab_size, str_dtype_to_torch def split(v, tp_size, idx, dim=0): @@ -129,7 +129,7 @@ def convert_hf_config(hf_config, dtype, args): 'num_hidden_layers': hf_config.num_hidden_layers, 'num_attention_heads': hf_config.num_key_value_heads, 'rotary_pct': hf_config.partial_rotary_factor, - 'rope_theta': hf_config.rope_theta, + 'rope_theta': get_hf_rope_theta(hf_config, 10000.0), 'hidden_size': hf_config.hidden_size, 'intermediate_size': hf_config.intermediate_size, 'vocab_size': hf_config.vocab_size, diff --git a/tensorrt_llm/models/phi3/config.py b/tensorrt_llm/models/phi3/config.py index c824e921720..42d3954092e 100644 --- a/tensorrt_llm/models/phi3/config.py +++ b/tensorrt_llm/models/phi3/config.py @@ -15,6 +15,7 @@ from typing import Optional, Union +from ..._utils import get_hf_rope_theta from ...layers import MoeConfig from ...mapping import Mapping from ..convert_utils import infer_dtype @@ -96,7 +97,7 @@ def from_hugging_face( hf_config, "dense_attention_every_n_layers", None) kwargs['norm_epsilon'] = hf_config.layer_norm_epsilon else: - kwargs['rotary_base'] = hf_config.rope_theta + kwargs['rotary_base'] = get_hf_rope_theta(hf_config, 10000.0) kwargs['norm_epsilon'] = hf_config.rms_norm_eps moe_variant = hf_config.architectures[0] == "PhiMoEForCausalLM" if moe_variant: diff --git a/tensorrt_llm/models/qwen/config.py b/tensorrt_llm/models/qwen/config.py index e2c22909538..0f1bd34606b 100644 --- a/tensorrt_llm/models/qwen/config.py +++ b/tensorrt_llm/models/qwen/config.py @@ -14,6 +14,7 @@ # limitations under the License. 
from typing import Optional, Union +from ..._utils import get_hf_rope_theta from ...layers import MoeConfig from ...mapping import Mapping from ..convert_utils import infer_dtype @@ -138,7 +139,7 @@ def from_hugging_face(cls, rotary_base = getattr(hf_config, "rotary_emb_base", 10000.0) else: rms_norm_eps = hf_config.rms_norm_eps - rotary_base = getattr(hf_config, "rope_theta", 100000.0) + rotary_base = get_hf_rope_theta(hf_config, 100000.0) num_labels = 1 if hf_config.architectures[0] == "Qwen2ForSequenceClassification": diff --git a/tensorrt_llm/quantization/quantize_by_modelopt.py b/tensorrt_llm/quantization/quantize_by_modelopt.py index 302eb74533f..8c1aa57efa6 100755 --- a/tensorrt_llm/quantization/quantize_by_modelopt.py +++ b/tensorrt_llm/quantization/quantize_by_modelopt.py @@ -34,7 +34,7 @@ from transformers import (AutoConfig, AutoModelForCausalLM, AutoProcessor, AutoTokenizer) -from .._utils import release_gc, str_dtype_to_torch +from .._utils import get_hf_rope_theta, release_gc, str_dtype_to_torch from ..logger import logger from ..mapping import Mapping from .image_processing import MllamaImageProcessor @@ -888,7 +888,8 @@ def quantize_and_export(*, if qwen_config.model_type == "qwen2": tensorrt_llm_config[ "norm_epsilon"] = qwen_config.rms_norm_eps - tensorrt_llm_config["rotary_base"] = qwen_config.rope_theta + tensorrt_llm_config["rotary_base"] = get_hf_rope_theta( + qwen_config, 100000.0) tensorrt_llm_config[ "intermediate_size"] = qwen_config.intermediate_size with open(f"{export_path}/config.json", "w") as f: diff --git a/tensorrt_llm/runtime/multimodal_model_runner.py b/tensorrt_llm/runtime/multimodal_model_runner.py index 5a88cc9dd80..47d9bd66ad9 100644 --- a/tensorrt_llm/runtime/multimodal_model_runner.py +++ b/tensorrt_llm/runtime/multimodal_model_runner.py @@ -28,8 +28,8 @@ from .. 
import profiler from .._deprecation import emit_engine_arch_deprecation -from .._utils import (maybe_pin_memory, mpi_rank, prefer_pinned, - str_dtype_to_torch, str_dtype_to_trt, +from .._utils import (get_hf_rope_theta, maybe_pin_memory, mpi_rank, + prefer_pinned, str_dtype_to_torch, str_dtype_to_trt, supports_inflight_batching, torch_dtype_to_trt, trt_dtype_to_torch) from ..functional import RopeEmbeddingUtils, RotaryScalingType @@ -415,7 +415,7 @@ def __init__(self, args): self.max_position_embeddings = hf_config.max_position_embeddings self.hidden_size = hf_config.hidden_size self.num_attention_heads = hf_config.num_attention_heads - self.rope_theta = hf_config.rope_theta + self.rope_theta = get_hf_rope_theta(hf_config, 10000.0) if self.model_type == 'llava_onevision': self.num_frames = self.args.video_num_frames if self.num_frames is None: diff --git a/tests/unittest/trt/model/test_phi.py b/tests/unittest/trt/model/test_phi.py index 9db18f4e46e..b3cf8d28f2f 100644 --- a/tests/unittest/trt/model/test_phi.py +++ b/tests/unittest/trt/model/test_phi.py @@ -24,6 +24,7 @@ import tensorrt_llm from tensorrt_llm import Builder +from tensorrt_llm._utils import get_hf_rope_theta from tensorrt_llm.models.phi.convert import load_weights_from_hf_model from tensorrt_llm.network import net_guard from tensorrt_llm.plugin.plugin import ContextFMHAType @@ -61,7 +62,7 @@ def initialize_network(self, network: tensorrt_llm.Network, hf_model, 'num_attention_heads': hf_config.num_key_value_heads, 'rotary_pct': hf_config.partial_rotary_factor, 'position_embedding_type': 'rope_gpt_neox', - 'rope_theta': hf_config.rope_theta, + 'rope_theta': get_hf_rope_theta(hf_config, 10000.0), 'hidden_size': hf_config.hidden_size, 'intermediate_size': hf_config.intermediate_size, 'vocab_size': hf_config.vocab_size, diff --git a/triton_backend/all_models/multimodal/multimodal_encoders/1/multimodal_utils.py b/triton_backend/all_models/multimodal/multimodal_encoders/1/multimodal_utils.py index 
4caff0bbffc..85600193bfe 100644 --- a/triton_backend/all_models/multimodal/multimodal_encoders/1/multimodal_utils.py +++ b/triton_backend/all_models/multimodal/multimodal_encoders/1/multimodal_utils.py @@ -2,6 +2,8 @@ import torch +from tensorrt_llm._utils import get_hf_rope_theta + class LlavaOnevisionUtils: # https://github.com/huggingface/transformers/blob/main/src/transformers/models/llava_onevision/modeling_llava_onevision.py @@ -129,7 +131,7 @@ def __init__(self, config): self.max_position_embeddings = config.max_position_embeddings self.hidden_size = config.hidden_size self.num_attention_heads = config.num_attention_heads - self.rope_theta = config.rope_theta + self.rope_theta = get_hf_rope_theta(config, 10000.0) def get_rope_index( self, From 08074a935a3abaeede0bd214acdc021e5933d208 Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Thu, 9 Apr 2026 00:15:03 -0700 Subject: [PATCH 11/22] test: compat helpers for DynamicCache legacy API (Transformers v5) Transformers v5 removed DynamicCache.from_legacy_cache and to_legacy_cache. Add hf_dynamic_cache_compat helpers and switch TRT attention unit tests to use them so behavior matches v4 when the methods are absent. Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> Made-with: Cursor --- .../trt/attention/hf_dynamic_cache_compat.py | 56 +++++++++++++++++++ .../trt/attention/test_gpt_attention.py | 29 +++++----- .../trt/attention/test_gpt_attention_IFB.py | 8 ++- 3 files changed, 78 insertions(+), 15 deletions(-) create mode 100644 tests/unittest/trt/attention/hf_dynamic_cache_compat.py diff --git a/tests/unittest/trt/attention/hf_dynamic_cache_compat.py b/tests/unittest/trt/attention/hf_dynamic_cache_compat.py new file mode 100644 index 00000000000..d2a6005e8c3 --- /dev/null +++ b/tests/unittest/trt/attention/hf_dynamic_cache_compat.py @@ -0,0 +1,56 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""DynamicCache legacy tuple format for tests (removed from Transformers v5+).""" + +from __future__ import annotations + +from typing import List, Optional, Sequence, Tuple, Union + +import torch +from transformers.cache_utils import DynamicCache + +LegacyLayerKV = Tuple[torch.Tensor, torch.Tensor] +LegacyCache = Tuple[LegacyLayerKV, ...] + + +def dynamic_cache_from_legacy( + past_key_values: Optional[Union[LegacyCache, + Sequence[LegacyLayerKV]]]) -> DynamicCache: + """Match pre-v5 ``DynamicCache.from_legacy_cache`` (see transformers v4.48 ``cache_utils``).""" + if past_key_values is None: + return DynamicCache() + if hasattr(DynamicCache, "from_legacy_cache"): + return DynamicCache.from_legacy_cache(past_key_values) + cache = DynamicCache() + for layer_idx in range(len(past_key_values)): + key_states, value_states = past_key_values[layer_idx] + cache.update(key_states, value_states, layer_idx) + return cache + + +def dynamic_cache_to_legacy(cache: DynamicCache) -> LegacyCache: + """Match pre-v5 ``DynamicCache.to_legacy_cache``.""" + if hasattr(cache, "to_legacy_cache"): + return cache.to_legacy_cache() + layers: List[LegacyLayerKV] = [] + for layer in cache.layers: + if not getattr(layer, "is_initialized", False): + continue + keys = layer.keys + values = layer.values + if keys is None or values is None: + continue + layers.append((keys, values)) + return tuple(layers)
diff --git a/tests/unittest/trt/attention/test_gpt_attention.py b/tests/unittest/trt/attention/test_gpt_attention.py index 349bf6b752d..d44411b060c 100644 --- a/tests/unittest/trt/attention/test_gpt_attention.py +++ b/tests/unittest/trt/attention/test_gpt_attention.py @@ -31,6 +31,9 @@ from transformers.models.gptj.modeling_gptj import GPTJAttention from transformers.models.llama.modeling_llama import (LlamaAttention, LlamaRotaryEmbedding) + +from tests.unittest.trt.attention.hf_dynamic_cache_compat import ( + dynamic_cache_from_legacy, dynamic_cache_to_legacy) from utils.util import (getSMVersion, skip_bf16_fp32_accum, skip_blackwell_for_fmha_tests, skip_fp8_pre_ada, unittest_name_func) @@ -1236,13 +1239,13 @@ def verify_kv_cache(torch_present): attention_packed_mask = None if attention_type == 'gpt2_attention': # gpt2 uses DynamicCache - torch_present = DynamicCache.from_legacy_cache( + torch_present = dynamic_cache_from_legacy( torch_present) torch_output = attention(input_tensor, past_key_value=torch_present, use_cache=True, attention_mask=attention_mask)[0] - torch_present = torch_present.to_legacy_cache() + torch_present = dynamic_cache_to_legacy(torch_present) elif attention_type == 'llama_attention': position_embeddings = rotary_emb(input_tensor, position_ids) attention_mask = attention_mask + AttentionMaskConverter._make_causal_mask( @@ -1257,7 +1260,7 @@ def verify_kv_cache(torch_present): position_embeddings=position_embeddings, attention_mask=attention_mask, use_cache=True)[0] - torch_present = torch_present.to_legacy_cache() + torch_present = dynamic_cache_to_legacy(torch_present) elif attention_type == 'gptj_attention': torch_present = DynamicCache() torch_output = attention(input_tensor, @@ -1265,7 +1268,7 @@ def verify_kv_cache(torch_present): position_ids=position_ids, attention_mask=attention_mask, use_cache=True)[0] - torch_present = torch_present.to_legacy_cache() + torch_present = dynamic_cache_to_legacy(torch_present) elif attention_type == 
'gpt_bigcode_attention': attention_mask = _prepare_4d_attention_mask( ctx_attention_mask, @@ -1280,7 +1283,7 @@ def verify_kv_cache(torch_present): layer_past=torch_present, attention_mask=attention_mask, use_cache=True)[0] - torch_present = torch_present.to_legacy_cache() + torch_present = dynamic_cache_to_legacy(torch_present) else: raise RuntimeError("attention_type not properly set") @@ -1377,13 +1380,13 @@ def verify_kv_cache(torch_present): # torch execution if attention_type == 'gpt2_attention': # gpt2 uses DynamicCache - torch_present = DynamicCache.from_legacy_cache( + torch_present = dynamic_cache_from_legacy( torch_present) torch_output = attention(input_tensor, past_key_value=torch_present, use_cache=True, attention_mask=attention_mask)[0] - torch_present = torch_present.to_legacy_cache() + torch_present = dynamic_cache_to_legacy(torch_present) elif attention_type == 'llama_attention': position_embeddings = rotary_emb(input_tensor, position_ids) attention_mask = attention_mask + AttentionMaskConverter._make_causal_mask( @@ -1392,7 +1395,7 @@ def verify_kv_cache(torch_present): device='cuda', past_key_values_length=in_len + step - 1) # llama uses DynamicCache - torch_present = DynamicCache.from_legacy_cache( + torch_present = dynamic_cache_from_legacy( torch_present) torch_output = attention( input_tensor, @@ -1400,29 +1403,29 @@ def verify_kv_cache(torch_present): position_embeddings=position_embeddings, attention_mask=attention_mask, use_cache=True)[0] - torch_present = torch_present.to_legacy_cache() + torch_present = dynamic_cache_to_legacy(torch_present) elif attention_type == 'gptj_attention': - torch_present = DynamicCache.from_legacy_cache( + torch_present = dynamic_cache_from_legacy( torch_present) torch_output = attention(input_tensor, layer_past=torch_present, position_ids=position_ids, attention_mask=attention_mask, use_cache=True)[0] - torch_present = torch_present.to_legacy_cache() + torch_present = dynamic_cache_to_legacy(torch_present) 
elif attention_type == 'gpt_bigcode_attention': # target shape = (b, h, 1, s_key) key_seqlen = in_len + step # ctx_attention_mask.shape[1] attention_mask = (attention_mask >= 0).expand(batch_size, num_heads, 1, key_seqlen) - torch_present = DynamicCache.from_legacy_cache( + torch_present = dynamic_cache_from_legacy( torch_present) torch_output = attention(input_tensor, layer_past=torch_present, use_cache=True, attention_mask=attention_mask)[0] - torch_present = torch_present.to_legacy_cache() + torch_present = dynamic_cache_to_legacy(torch_present) def tile_beam_width(tensor: torch.Tensor, num_beams: int): if num_beams == 1: diff --git a/tests/unittest/trt/attention/test_gpt_attention_IFB.py b/tests/unittest/trt/attention/test_gpt_attention_IFB.py index cda9025a8b9..12a551ee9ea 100644 --- a/tests/unittest/trt/attention/test_gpt_attention_IFB.py +++ b/tests/unittest/trt/attention/test_gpt_attention_IFB.py @@ -35,6 +35,9 @@ from transformers.models.gptj.modeling_gptj import GPTJAttention from transformers.models.llama.modeling_llama import (LlamaAttention, LlamaRotaryEmbedding) + +from tests.unittest.trt.attention.hf_dynamic_cache_compat import ( + dynamic_cache_from_legacy, dynamic_cache_to_legacy) from utils.util import (skip_bf16_fp32_accum, skip_fp8_pre_ada, unittest_name_func) @@ -1010,7 +1013,7 @@ def torch_exec(step: int, (local_beam_width, input_length, hidden_size)) # llama/gpt2 uses DynamicCache - past_key_values = DynamicCache.from_legacy_cache( + past_key_values = dynamic_cache_from_legacy( torch_cache_list[req_idx]) torch_out, past_key_values = torch_exec( @@ -1018,7 +1021,8 @@ def torch_exec(step: int, past_key_values) # llama/gpt2 uses DynamicCache - torch_cache_list[req_idx] = past_key_values.to_legacy_cache() + torch_cache_list[req_idx] = dynamic_cache_to_legacy( + past_key_values) past_key_values = torch_cache_list[req_idx][0] if use_fp8_kv_cache or use_int8_kv_cache: From 44a2d53b157bc15d3f4bece67b507fee6b84731b Mon Sep 17 00:00:00 2001 From: Jonas 
Li <6110159+longlee0622@users.noreply.github.com> Date: Thu, 9 Apr 2026 00:19:09 -0700 Subject: [PATCH 12/22] fix(auto_deploy): patch BambaModel when _update_causal_mask is absent Transformers versions that removed BambaModel._update_causal_mask only expose _update_mamba_mask. Gate the causal-mask patch on hasattr so export patches apply cleanly on newer HF releases. Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> Made-with: Cursor --- .../_torch/auto_deploy/models/patches/bamba.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tensorrt_llm/_torch/auto_deploy/models/patches/bamba.py b/tensorrt_llm/_torch/auto_deploy/models/patches/bamba.py index 47d7eacd47a..61f5c309195 100644 --- a/tensorrt_llm/_torch/auto_deploy/models/patches/bamba.py +++ b/tensorrt_llm/_torch/auto_deploy/models/patches/bamba.py @@ -205,20 +205,27 @@ class BambaModelPatch(BaseExportPatch): def _apply_patch(self): self.original_values["BambaMixer.torch_forward"] = BambaMixer.torch_forward self.original_values["BambaModel._update_mamba_mask"] = BambaModel._update_mamba_mask - self.original_values["BambaModel._update_causal_mask"] = BambaModel._update_causal_mask + # Older transformers expose both; newer releases dropped `_update_causal_mask` on `BambaModel` + # (mask handling consolidated under `_update_mamba_mask`). + if hasattr(BambaModel, "_update_causal_mask"): + self.original_values["BambaModel._update_causal_mask"] = ( + BambaModel._update_causal_mask) # NOTE: there is `HybridMambaAttentionDynamicCache.__bool__` to save. 
# self.original_values["BambaPreTrainedModel._init_weights"] = BambaPreTrainedModel._init_weights BambaMixer.torch_forward = _bamba_mixer_torch_forward BambaModel._update_mamba_mask = _bamba_model_update_mamba_mask - BambaModel._update_causal_mask = _bamba_model_update_causal_mask + if hasattr(BambaModel, "_update_causal_mask"): + BambaModel._update_causal_mask = _bamba_model_update_causal_mask HybridMambaAttentionDynamicCache.__bool__ = _cache_bool # BambaPreTrainedModel._init_weights = _bamba_pretrained_model_init_weights def _revert_patch(self): BambaMixer.torch_forward = self.original_values["BambaMixer.torch_forward"] BambaModel._update_mamba_mask = self.original_values["BambaModel._update_mamba_mask"] - BambaModel._update_causal_mask = self.original_values["BambaModel._update_causal_mask"] + if "BambaModel._update_causal_mask" in self.original_values: + BambaModel._update_causal_mask = self.original_values[ + "BambaModel._update_causal_mask"] del HybridMambaAttentionDynamicCache.__bool__ # BambaPreTrainedModel._init_weights = self.original_values[ # "BambaPreTrainedModel._init_weights" From 5f2643b7b3c163d087ab399c92e5cf5cd238a06e Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Thu, 9 Apr 2026 13:38:14 +0000 Subject: [PATCH 13/22] fix: add SlidingWindowCache compatibility shim for Transformers v5 The Phi-4 multimodal model's custom modeling_phi4mm.py imports SlidingWindowCache from transformers.cache_utils, which was removed in transformers 5.3.0 (its functionality was merged into StaticCache). Inject a compatibility alias before executing the model's custom code so that the import succeeds on both old and new transformers versions. 
Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> --- tensorrt_llm/_torch/models/modeling_phi4mm.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tensorrt_llm/_torch/models/modeling_phi4mm.py b/tensorrt_llm/_torch/models/modeling_phi4mm.py index 268ef6ce5f5..08c7303bdbb 100644 --- a/tensorrt_llm/_torch/models/modeling_phi4mm.py +++ b/tensorrt_llm/_torch/models/modeling_phi4mm.py @@ -114,6 +114,12 @@ def _load_phi4mm_classes(local_path): spec = importlib.util.spec_from_file_location( f"{package_name}.hf_modeling_phi4mm", modeling_phi4mm_path) hf_modeling_phi4mm = importlib.util.module_from_spec(spec) + # Inject compatibility shims for classes removed in transformers 5.x. + # The model's custom modeling_phi4mm.py may import SlidingWindowCache + # which was removed in transformers 5.3.0 (merged into StaticCache). + _cache_utils = importlib.import_module("transformers.cache_utils") + if not hasattr(_cache_utils, "SlidingWindowCache"): + _cache_utils.SlidingWindowCache = _cache_utils.StaticCache spec.loader.exec_module(hf_modeling_phi4mm) Phi4MMAudioEmbedding = hf_modeling_phi4mm.Phi4MMAudioEmbedding Phi4MMImageEmbedding = hf_modeling_phi4mm.Phi4MMImageEmbedding From e95b30204a80c5760c823aa7e417a0de0ed80466 Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Thu, 9 Apr 2026 13:42:19 +0000 Subject: [PATCH 14/22] test: update test_gpt_attention rope config for Transformers v5 In transformers 5.x, rope_theta and rope_scaling are unified into the rope_parameters dict. Setting rope_theta directly on a config object after construction no longer populates rope_parameters, causing LlamaRotaryEmbedding to fail with a NoneType error. Build the rope_parameters dict explicitly and read from it when extracting rope_base/rope_scale_type/rope_scale for the test. 
Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> --- .../trt/attention/test_gpt_attention.py | 47 +++++++++++-------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/tests/unittest/trt/attention/test_gpt_attention.py b/tests/unittest/trt/attention/test_gpt_attention.py index d44411b060c..328ce022118 100644 --- a/tests/unittest/trt/attention/test_gpt_attention.py +++ b/tests/unittest/trt/attention/test_gpt_attention.py @@ -31,9 +31,6 @@ from transformers.models.gptj.modeling_gptj import GPTJAttention from transformers.models.llama.modeling_llama import (LlamaAttention, LlamaRotaryEmbedding) - -from tests.unittest.trt.attention.hf_dynamic_cache_compat import ( - dynamic_cache_from_legacy, dynamic_cache_to_legacy) from utils.util import (getSMVersion, skip_bf16_fp32_accum, skip_blackwell_for_fmha_tests, skip_fp8_pre_ada, unittest_name_func) @@ -51,6 +48,8 @@ MemoryPoolsAllocator from tensorrt_llm.runtime.memory_pools.pools_kv_cache_manager import \ PoolsKVCacheManager +from tests.unittest.trt.attention.hf_dynamic_cache_compat import ( + dynamic_cache_from_legacy, dynamic_cache_to_legacy) class TestFunctional(unittest.TestCase): @@ -633,13 +632,19 @@ def _construct_execution( rope_scale_type = RotaryScalingType.none rope_scale = 1.0 if attention_type == "llama_attention": - rope_base = configuration.rope_theta - if configuration.rope_scaling is not None: + rope_params = getattr(configuration, 'rope_parameters', + None) or {} + rope_base = rope_params.get( + 'rope_theta', + getattr(configuration, 'rope_theta', 10000.0)) + rope_type = rope_params.get( + 'rope_type', rope_params.get('type', 'default')) + if rope_type not in ('default', None): rope_scale_type = { "linear": RotaryScalingType.linear, "dynamic": RotaryScalingType.dynamic - }[configuration.rope_scaling["type"]] - rope_scale = configuration.rope_scaling["factor"] + }[rope_type] + rope_scale = rope_params.get("factor", 1.0) rotary_inv_freq, embed_positions_for_gpt_attention = 
RopeEmbeddingUtils.create_sinusoidal_positions_for_attention_plugin( configuration.max_position_embeddings, rotary_embedding_dim, rope_base, rope_scale) @@ -894,8 +899,17 @@ def _construct_execution( if attention_type == 'llama_attention': configuration.num_key_value_heads = num_kv_heads - configuration.rope_theta = rope_base - configuration.rope_scaling = rope_scaling + # In transformers 5.x, rope_theta/rope_scaling are unified into + # rope_parameters. Build the dict so LlamaRotaryEmbedding works. + if rope_scaling is not None: + rope_params = {**rope_scaling, "rope_theta": rope_base} + else: + rope_params = { + "rope_type": "default", + "rope_theta": rope_base, + } + configuration.rope_parameters = rope_params + configuration.rope_scaling = rope_params if rope_scaling is not None: # scaling is typically used for supporting longer seq lens than max_position_embeddings # so we set the max_position_embeddings to be smaller than total seq len @@ -1239,8 +1253,7 @@ def verify_kv_cache(torch_present): attention_packed_mask = None if attention_type == 'gpt2_attention': # gpt2 uses DynamicCache - torch_present = dynamic_cache_from_legacy( - torch_present) + torch_present = dynamic_cache_from_legacy(torch_present) torch_output = attention(input_tensor, past_key_value=torch_present, use_cache=True, @@ -1380,8 +1393,7 @@ def verify_kv_cache(torch_present): # torch execution if attention_type == 'gpt2_attention': # gpt2 uses DynamicCache - torch_present = dynamic_cache_from_legacy( - torch_present) + torch_present = dynamic_cache_from_legacy(torch_present) torch_output = attention(input_tensor, past_key_value=torch_present, use_cache=True, @@ -1395,8 +1407,7 @@ def verify_kv_cache(torch_present): device='cuda', past_key_values_length=in_len + step - 1) # llama uses DynamicCache - torch_present = dynamic_cache_from_legacy( - torch_present) + torch_present = dynamic_cache_from_legacy(torch_present) torch_output = attention( input_tensor, past_key_value=torch_present, @@ 
-1405,8 +1416,7 @@ def verify_kv_cache(torch_present): use_cache=True)[0] torch_present = dynamic_cache_to_legacy(torch_present) elif attention_type == 'gptj_attention': - torch_present = dynamic_cache_from_legacy( - torch_present) + torch_present = dynamic_cache_from_legacy(torch_present) torch_output = attention(input_tensor, layer_past=torch_present, position_ids=position_ids, @@ -1419,8 +1429,7 @@ def verify_kv_cache(torch_present): attention_mask = (attention_mask >= 0).expand(batch_size, num_heads, 1, key_seqlen) - torch_present = dynamic_cache_from_legacy( - torch_present) + torch_present = dynamic_cache_from_legacy(torch_present) torch_output = attention(input_tensor, layer_past=torch_present, use_cache=True, From dcdbb3465cadabeaffe044eba6666d95bb68b1a5 Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Thu, 9 Apr 2026 13:43:58 +0000 Subject: [PATCH 15/22] fix: map rope_type "default" to rope_gpt_neox in PositionEmbeddingType Transformers 5.x unified rope_theta/rope_scaling into rope_parameters, which always contains a "rope_type" field. Standard RoPE (no scaling) now uses rope_type="default" instead of rope_scaling=None. Since many model files check `rope_scaling is not None` and then pass rope_type to PositionEmbeddingType.from_string(), this centralized mapping avoids updating every model file individually. Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> --- tensorrt_llm/functional.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorrt_llm/functional.py b/tensorrt_llm/functional.py index 694aab45f2d..df80459359a 100755 --- a/tensorrt_llm/functional.py +++ b/tensorrt_llm/functional.py @@ -730,6 +730,9 @@ def __str__(self): @staticmethod def from_string(s): + # Transformers 5.x uses "default" for standard RoPE (no scaling). 
+ if s == "default": + return PositionEmbeddingType.rope_gpt_neox try: return PositionEmbeddingType[s] except KeyError: From 2bc4ca9f84ce55a2f745504073d07e59e47e11fe Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Thu, 9 Apr 2026 13:57:10 +0000 Subject: [PATCH 16/22] test: fix test_gpt_attention_IFB for Transformers v5 Three issues fixed: 1. past_key_value (singular) renamed to past_key_values (plural) for LlamaAttention and GPT2Attention in transformers 5.x. 2. use_cache parameter removed from attention forward calls in transformers 5.x (cache is always updated in-place). 3. rope_theta/rope_scaling config attributes replaced with unified rope_parameters dict (same fix as test_gpt_attention.py). Without fix #1-2, the DynamicCache was never populated because the kwarg was silently ignored, causing an empty tuple from dynamic_cache_to_legacy and an IndexError at line 1026. Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> --- .../trt/attention/test_gpt_attention_IFB.py | 41 ++++++++++++------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/tests/unittest/trt/attention/test_gpt_attention_IFB.py b/tests/unittest/trt/attention/test_gpt_attention_IFB.py index 12a551ee9ea..cacad8c35aa 100644 --- a/tests/unittest/trt/attention/test_gpt_attention_IFB.py +++ b/tests/unittest/trt/attention/test_gpt_attention_IFB.py @@ -26,7 +26,6 @@ from parameterized import parameterized from transformers import GPT2Config, GPTBigCodeConfig, GPTJConfig, LlamaConfig -from transformers.cache_utils import DynamicCache from transformers.modeling_attn_mask_utils import (AttentionMaskConverter, _prepare_4d_attention_mask) from transformers.models.gpt2.modeling_gpt2 import GPT2Attention @@ -35,9 +34,6 @@ from transformers.models.gptj.modeling_gptj import GPTJAttention from transformers.models.llama.modeling_llama import (LlamaAttention, LlamaRotaryEmbedding) - -from tests.unittest.trt.attention.hf_dynamic_cache_compat 
import ( - dynamic_cache_from_legacy, dynamic_cache_to_legacy) from utils.util import (skip_bf16_fp32_accum, skip_fp8_pre_ada, unittest_name_func) @@ -54,6 +50,8 @@ MemoryPoolsAllocator from tensorrt_llm.runtime.memory_pools.pools_kv_cache_manager import \ PoolsKVCacheManager +from tests.unittest.trt.attention.hf_dynamic_cache_compat import ( + dynamic_cache_from_legacy, dynamic_cache_to_legacy) class TestFunctional(unittest.TestCase): @@ -384,13 +382,19 @@ def _construct_execution(session, rope_scale_type = RotaryScalingType.none rope_scale = 1.0 if attention_type == "llama_attention": - rope_base = configuration.rope_theta - if configuration.rope_scaling is not None: + rope_params = getattr(configuration, 'rope_parameters', + None) or {} + rope_base = rope_params.get( + 'rope_theta', + getattr(configuration, 'rope_theta', 10000.0)) + rope_type = rope_params.get( + 'rope_type', rope_params.get('type', 'default')) + if rope_type not in ('default', None): rope_scale_type = { "linear": RotaryScalingType.linear, "dynamic": RotaryScalingType.dynamic - }[configuration.rope_scaling["type"]] - rope_scale = configuration.rope_scaling["factor"] + }[rope_type] + rope_scale = rope_params.get("factor", 1.0) rotary_inv_freq, embed_positions_for_gpt_attention = RopeEmbeddingUtils.create_sinusoidal_positions_for_attention_plugin( configuration.max_position_embeddings, rotary_embedding_dim, rope_base, rope_scale) @@ -582,8 +586,17 @@ def _construct_execution(session, attn_implementation='eager') if attention_type == 'llama_attention': configuration.num_key_value_heads = num_kv_heads - configuration.rope_theta = rope_base - configuration.rope_scaling = rope_scaling + # In transformers 5.x, rope_theta/rope_scaling are unified into + # rope_parameters. Build the dict so LlamaRotaryEmbedding works. 
+ if rope_scaling is not None: + rope_params = {**rope_scaling, "rope_theta": rope_base} + else: + rope_params = { + "rope_type": "default", + "rope_theta": rope_base, + } + configuration.rope_parameters = rope_params + configuration.rope_scaling = rope_params if rope_scaling is not None: # scaling is typically used for supporting longer seq lens than max_position_embeddings # so we set the max_position_embeddings to be smaller than total seq len @@ -763,8 +776,7 @@ def torch_exec(step: int, tgt_len=(in_len if step == 0 else 1)) if attention_type == 'gpt2_attention': torch_output = attention(input, - past_key_value=layer_past, - use_cache=True, + past_key_values=layer_past, attention_mask=attention_mask)[0] torch_present = layer_past elif attention_type == 'llama_attention': @@ -777,10 +789,9 @@ def torch_exec(step: int, 1)) torch_output = attention( input, - past_key_value=layer_past, + past_key_values=layer_past, position_embeddings=position_embeddings, - attention_mask=attention_mask, - use_cache=True)[0] + attention_mask=attention_mask)[0] torch_present = layer_past elif attention_type == 'gptj_attention': torch_output, torch_present = attention( From 5a2a0be75c1464e286c07af8e7c3ccf51173f15e Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Thu, 9 Apr 2026 13:58:28 +0000 Subject: [PATCH 17/22] fix: use getattr for pad_token_id in MllamaConfig for Transformers v5 In transformers 5.x, pad_token_id was removed from the top-level MllamaConfig and moved into text_config. Use getattr with fallback to text_config.pad_token_id to support both old and new versions. 
Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> --- tensorrt_llm/_torch/models/modeling_mllama.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorrt_llm/_torch/models/modeling_mllama.py b/tensorrt_llm/_torch/models/modeling_mllama.py index 16ec672539a..21a5fc447f4 100644 --- a/tensorrt_llm/_torch/models/modeling_mllama.py +++ b/tensorrt_llm/_torch/models/modeling_mllama.py @@ -274,8 +274,10 @@ def __init__( self.hidden_size = pretrained_config.text_config.hidden_size self.max_num_tiles = pretrained_config.vision_config.max_num_tiles self.vision_output_dim = pretrained_config.vision_config.vision_output_dim - self.pad_token_id = (pretrained_config.pad_token_id if - pretrained_config.pad_token_id is not None else -1) + self.pad_token_id = getattr(pretrained_config, 'pad_token_id', None) + if self.pad_token_id is None: + self.pad_token_id = getattr(pretrained_config.text_config, + 'pad_token_id', -1) or -1 self.image_size = pretrained_config.vision_config.image_size # hack config From 50c991bef48d2187b59125e94a4fb4f0386cdf2d Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Thu, 9 Apr 2026 14:06:06 +0000 Subject: [PATCH 18/22] style: apply pre-commit formatting fixes Fix isort import ordering and yapf formatting issues flagged by pre-commit hooks on prior Transformers v5 compatibility commits. 
Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> --- tensorrt_llm/_torch/auto_deploy/models/patches/bamba.py | 6 ++---- tests/unittest/_torch/modeling/test_modeling_cohere2.py | 2 +- tests/unittest/_torch/modeling/test_modeling_exaone4.py | 2 +- tests/unittest/_torch/modeling/test_modeling_gemma3.py | 2 +- tests/unittest/trt/attention/hf_dynamic_cache_compat.py | 4 ++-- 5 files changed, 7 insertions(+), 9 deletions(-) diff --git a/tensorrt_llm/_torch/auto_deploy/models/patches/bamba.py b/tensorrt_llm/_torch/auto_deploy/models/patches/bamba.py index 61f5c309195..779622a2a0b 100644 --- a/tensorrt_llm/_torch/auto_deploy/models/patches/bamba.py +++ b/tensorrt_llm/_torch/auto_deploy/models/patches/bamba.py @@ -208,8 +208,7 @@ def _apply_patch(self): # Older transformers expose both; newer releases dropped `_update_causal_mask` on `BambaModel` # (mask handling consolidated under `_update_mamba_mask`). if hasattr(BambaModel, "_update_causal_mask"): - self.original_values["BambaModel._update_causal_mask"] = ( - BambaModel._update_causal_mask) + self.original_values["BambaModel._update_causal_mask"] = BambaModel._update_causal_mask # NOTE: there is `HybridMambaAttentionDynamicCache.__bool__` to save. 
# self.original_values["BambaPreTrainedModel._init_weights"] = BambaPreTrainedModel._init_weights @@ -224,8 +223,7 @@ def _revert_patch(self): BambaMixer.torch_forward = self.original_values["BambaMixer.torch_forward"] BambaModel._update_mamba_mask = self.original_values["BambaModel._update_mamba_mask"] if "BambaModel._update_causal_mask" in self.original_values: - BambaModel._update_causal_mask = self.original_values[ - "BambaModel._update_causal_mask"] + BambaModel._update_causal_mask = self.original_values["BambaModel._update_causal_mask"] del HybridMambaAttentionDynamicCache.__bool__ # BambaPreTrainedModel._init_weights = self.original_values[ # "BambaPreTrainedModel._init_weights" diff --git a/tests/unittest/_torch/modeling/test_modeling_cohere2.py b/tests/unittest/_torch/modeling/test_modeling_cohere2.py index 783c0939251..1a839cacd6c 100644 --- a/tests/unittest/_torch/modeling/test_modeling_cohere2.py +++ b/tests/unittest/_torch/modeling/test_modeling_cohere2.py @@ -1,11 +1,11 @@ from copy import deepcopy import torch +from _torch.helpers import make_hf_hybrid_cache_for_tests from transformers import Cohere2Config from transformers import Cohere2ForCausalLM as HFCohere2ForCausalLM import tensorrt_llm -from _torch.helpers import make_hf_hybrid_cache_for_tests from tensorrt_llm._torch.attention_backend.utils import get_attention_backend from tensorrt_llm._torch.metadata import KVCacheParams from tensorrt_llm._torch.model_config import ModelConfig diff --git a/tests/unittest/_torch/modeling/test_modeling_exaone4.py b/tests/unittest/_torch/modeling/test_modeling_exaone4.py index a9f1517a30f..7ea88c93c57 100644 --- a/tests/unittest/_torch/modeling/test_modeling_exaone4.py +++ b/tests/unittest/_torch/modeling/test_modeling_exaone4.py @@ -26,7 +26,7 @@ class Exaone4Config(PretrainedConfig): SKIP_EXAONE4_HF_ACCURACY_TEST = True from _torch.helpers import (create_mock_cuda_graph_runner, - make_hf_hybrid_cache_for_tests) + make_hf_hybrid_cache_for_tests) from 
utils.util import getSMVersion import tensorrt_llm diff --git a/tests/unittest/_torch/modeling/test_modeling_gemma3.py b/tests/unittest/_torch/modeling/test_modeling_gemma3.py index 7d164b7a6d2..f252f4cbd2d 100644 --- a/tests/unittest/_torch/modeling/test_modeling_gemma3.py +++ b/tests/unittest/_torch/modeling/test_modeling_gemma3.py @@ -3,13 +3,13 @@ from dataclasses import dataclass import torch +from _torch.helpers import make_hf_hybrid_cache_for_tests from parameterized import parameterized from transformers import Gemma3Config from transformers import Gemma3ForCausalLM as HFGemma3ForCausalLM from transformers import Gemma3TextConfig import tensorrt_llm -from _torch.helpers import make_hf_hybrid_cache_for_tests from tensorrt_llm._torch.attention_backend import (AttentionMetadata, FlashInferAttentionMetadata) from tensorrt_llm._torch.attention_backend.utils import get_attention_backend diff --git a/tests/unittest/trt/attention/hf_dynamic_cache_compat.py b/tests/unittest/trt/attention/hf_dynamic_cache_compat.py index d2a6005e8c3..6866feb8ba6 100644 --- a/tests/unittest/trt/attention/hf_dynamic_cache_compat.py +++ b/tests/unittest/trt/attention/hf_dynamic_cache_compat.py @@ -26,8 +26,8 @@ def dynamic_cache_from_legacy( - past_key_values: Optional[Union[LegacyCache, - Sequence[LegacyLayerKV]]]) -> DynamicCache: + past_key_values: Optional[Union[LegacyCache, Sequence[LegacyLayerKV]]], +) -> DynamicCache: """Match pre-v5 ``DynamicCache.from_legacy_cache`` (see transformers v4.48 ``cache_utils``).""" if past_key_values is None: return DynamicCache() From d0b445ca768f45fe436b8d89b10ececeb06ca2d7 Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Fri, 10 Apr 2026 03:35:52 +0000 Subject: [PATCH 19/22] fix: prevent duplicate 'disable' kwarg in DisabledTqdm Newer huggingface_hub versions pass 'disable' explicitly to tqdm_class.__init__() via snapshot_download. 
Using **kwargs with an additional disable=True keyword caused a TypeError ("multiple values for keyword argument 'disable'"). Set disable in kwargs dict before forwarding to super().__init__() so that any caller-provided value is overridden rather than duplicated. Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> --- tensorrt_llm/llmapi/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorrt_llm/llmapi/utils.py b/tensorrt_llm/llmapi/utils.py index 569a4406bba..6765617a180 100644 --- a/tensorrt_llm/llmapi/utils.py +++ b/tensorrt_llm/llmapi/utils.py @@ -231,7 +231,8 @@ def get_file_lock(model_name: str, class DisabledTqdm(tqdm): def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs, disable=True) + kwargs["disable"] = True + super().__init__(*args, **kwargs) def download_hf_model(model: str, revision: Optional[str] = None) -> Path: From af7fe585d9463ab4c6166c5c51164d3e0cfd8c2a Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Fri, 10 Apr 2026 03:39:15 +0000 Subject: [PATCH 20/22] fix: handle rope_scaling key changes in Qwen models for Transformers v5 In transformers 5.x, config.rope_scaling is always a dict (never None) and uses "rope_type" key instead of "type". The dict also contains rope_type="default" for standard RoPE (no scaling). 
Update QwenAttention and QwenMoeAttention to: - Look up both "type" and "rope_type" keys with fallback - Treat rope_type="default" the same as no scaling (use rope_gpt_neox) - Fix QwenDecoderLayer's yarn detection to use dict.get() instead of getattr() on a dict object Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> --- tensorrt_llm/_torch/models/modeling_qwen.py | 20 +++++++++++++------ .../_torch/models/modeling_qwen_moe.py | 9 ++++++--- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/tensorrt_llm/_torch/models/modeling_qwen.py b/tensorrt_llm/_torch/models/modeling_qwen.py index df6d83e5b75..1ec1323019c 100644 --- a/tensorrt_llm/_torch/models/modeling_qwen.py +++ b/tensorrt_llm/_torch/models/modeling_qwen.py @@ -29,12 +29,17 @@ def __init__( layer_idx: Optional[int] = None, ): config = model_config.pretrained_config - if getattr(config, "rope_scaling", None) is not None: + rope_scaling = getattr(config, "rope_scaling", None) + # In transformers 5.x, rope_scaling is always a dict (never None) + # and uses "rope_type" key instead of "type". 
+ rope_type = None + if rope_scaling is not None: + rope_type = rope_scaling.get("type", rope_scaling.get("rope_type")) + if rope_type is not None and rope_type != "default": pos_embd_params = PositionalEmbeddingParams( - type=PositionEmbeddingType.from_string( - config.rope_scaling["type"]), + type=PositionEmbeddingType.from_string(rope_type), rope=RopeParams.from_config(config), - mrope_section=config.rope_scaling.get('mrope_section', None)) + mrope_section=rope_scaling.get('mrope_section', None)) else: pos_embd_params = PositionalEmbeddingParams( type=PositionEmbeddingType.rope_gpt_neox, @@ -116,8 +121,11 @@ def __init__( self.layer_idx = layer_idx config = model_config.pretrained_config - if getattr(config, "rope_scaling", None) is not None and getattr( - config.rope_scaling, "rope_type", None) == "yarn": + rope_scaling = getattr(config, "rope_scaling", None) + rope_type = rope_scaling.get("rope_type", + rope_scaling.get("type")) \ + if isinstance(rope_scaling, dict) else None + if rope_type == "yarn": self.self_attn = QwenYarnAttention( model_config, layer_idx=layer_idx, diff --git a/tensorrt_llm/_torch/models/modeling_qwen_moe.py b/tensorrt_llm/_torch/models/modeling_qwen_moe.py index d19c2602ce9..dda335a962a 100644 --- a/tensorrt_llm/_torch/models/modeling_qwen_moe.py +++ b/tensorrt_llm/_torch/models/modeling_qwen_moe.py @@ -114,10 +114,13 @@ def __init__( layer_idx: Optional[int] = None, ): config = model_config.pretrained_config - if getattr(config, "rope_scaling", None) is not None: + rope_scaling = getattr(config, "rope_scaling", None) + rope_type = None + if rope_scaling is not None: + rope_type = rope_scaling.get("type", rope_scaling.get("rope_type")) + if rope_type is not None and rope_type != "default": pos_embd_params = PositionalEmbeddingParams( - type=PositionEmbeddingType.from_string( - config.rope_scaling["type"]), + type=PositionEmbeddingType.from_string(rope_type), rope=RopeParams.from_config(config), ) else: From 
2153a7357e7dcfcd43364c0f00e95372ba3955e1 Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Fri, 10 Apr 2026 03:40:59 +0000 Subject: [PATCH 21/22] fix: use getattr for tie_word_embeddings for Transformers v5 In transformers 5.x, tie_word_embeddings is no longer a default attribute on all config classes (e.g. CLIPVisionConfig). Use getattr with a False default in generic code paths that may receive any config type (modeling_utils, weight mappers). Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> --- .../_torch/models/checkpoints/hf/gemma3_weight_mapper.py | 4 ++-- tensorrt_llm/_torch/models/checkpoints/hf/weight_mapper.py | 4 ++-- tensorrt_llm/_torch/models/modeling_utils.py | 5 +++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/tensorrt_llm/_torch/models/checkpoints/hf/gemma3_weight_mapper.py b/tensorrt_llm/_torch/models/checkpoints/hf/gemma3_weight_mapper.py index 8382588dc24..c03b8a7b73c 100644 --- a/tensorrt_llm/_torch/models/checkpoints/hf/gemma3_weight_mapper.py +++ b/tensorrt_llm/_torch/models/checkpoints/hf/gemma3_weight_mapper.py @@ -10,8 +10,8 @@ class Gemma3HfWeightMapper(HfWeightMapper): def should_skip_module(self, module_name: str) -> bool: - if self.model.config.tie_word_embeddings and module_name.startswith( - "lm_head"): + if getattr(self.model.config, 'tie_word_embeddings', + False) and module_name.startswith("lm_head"): return True # Skip loading weights for embedding and lm_head if LoRA is enabled and has custom values diff --git a/tensorrt_llm/_torch/models/checkpoints/hf/weight_mapper.py b/tensorrt_llm/_torch/models/checkpoints/hf/weight_mapper.py index cb1d8671a80..94a1eb986d5 100644 --- a/tensorrt_llm/_torch/models/checkpoints/hf/weight_mapper.py +++ b/tensorrt_llm/_torch/models/checkpoints/hf/weight_mapper.py @@ -37,8 +37,8 @@ def apply_callbacks(self, module: nn.Module, module_name: str, return module_weights def should_skip_module(self, module_name: str) -> 
bool: - if self.model.config.tie_word_embeddings and module_name.startswith( - "lm_head"): + if getattr(self.model.config, 'tie_word_embeddings', + False) and module_name.startswith("lm_head"): return True # Skip loading weights for embedding and lm_head if LoRA is enabled and has custom values diff --git a/tensorrt_llm/_torch/models/modeling_utils.py b/tensorrt_llm/_torch/models/modeling_utils.py index e285c32ec29..5e9e618bf10 100755 --- a/tensorrt_llm/_torch/models/modeling_utils.py +++ b/tensorrt_llm/_torch/models/modeling_utils.py @@ -401,7 +401,7 @@ def __init__(self, model: TModel, *, config: ModelConfig[TConfig], self.lm_head.weight.data.copy_(x) # use embedding weights in lm_head if tie word embedding is enabled - if config.pretrained_config.tie_word_embeddings: + if getattr(config.pretrained_config, 'tie_word_embeddings', False): assert self.lm_head.tp_size == self.model.embed_tokens.tp_size, ( "lm_head and vocab embedding should use the same TP size") assert self.lm_head.tp_mode == self.model.embed_tokens.tp_mode, ( @@ -896,7 +896,8 @@ def load_single_module(name, module): return # skip load weights if tie word embeddings is enabled and layer is lm_head - if model.config.tie_word_embeddings and name.startswith("lm_head"): + if getattr(model.config, 'tie_word_embeddings', + False) and name.startswith("lm_head"): return # Skip loading weights for embedding and lm_head if LoRA is enabled and has custom values From 8ee65631c3e45b2f50610f9d3fd1cf383a8ce6db Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Fri, 10 Apr 2026 03:44:49 +0000 Subject: [PATCH 22/22] fix: support per-layer-type RoPE config (Gemma3) for Transformers v5 In transformers 5.x, Gemma3's rope_parameters is a nested dict keyed by attention layer type (full_attention, sliding_attention) instead of a flat dict. Also, rope_local_base_freq was removed and its value moved into rope_parameters["sliding_attention"]["rope_theta"]. 
Changes: - RopeParams.from_config: flatten per-layer-type rope_parameters by picking "full_attention" as the default, instead of asserting. - Gemma3Attention: fall back to rope_parameters["sliding_attention"] when rope_local_base_freq is absent. Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> --- tensorrt_llm/_torch/attention_backend/interface.py | 14 ++++++++++---- tensorrt_llm/_torch/models/modeling_gemma3.py | 9 ++++++++- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/tensorrt_llm/_torch/attention_backend/interface.py b/tensorrt_llm/_torch/attention_backend/interface.py index f86b550482f..2ea9d4261d1 100644 --- a/tensorrt_llm/_torch/attention_backend/interface.py +++ b/tensorrt_llm/_torch/attention_backend/interface.py @@ -485,10 +485,16 @@ def from_config(config) -> "RopeParams": hf_rope_parameters = getattr(config, 'rope_parameters', None) if hf_rope_parameters is not None: - assert not set(hf_rope_parameters.keys()).issubset( - ALLOWED_ATTENTION_LAYER_TYPES), ( - "Per-layer-type RoPE configuration is not supported yet.") - config.update(hf_rope_parameters) + if set(hf_rope_parameters.keys()).issubset( + ALLOWED_ATTENTION_LAYER_TYPES): + # Per-layer-type RoPE config (e.g. Gemma3 in transformers 5.x). + # Pick "full_attention" as the default; callers override theta + # for sliding-window layers independently. + flat = hf_rope_parameters.get( + "full_attention", next(iter(hf_rope_parameters.values()))) + config.update(flat) + else: + config.update(hf_rope_parameters) # get rotary parameters. 
hidden_size = config.hidden_size diff --git a/tensorrt_llm/_torch/models/modeling_gemma3.py b/tensorrt_llm/_torch/models/modeling_gemma3.py index 24ba665afbf..3612136e7b1 100644 --- a/tensorrt_llm/_torch/models/modeling_gemma3.py +++ b/tensorrt_llm/_torch/models/modeling_gemma3.py @@ -65,7 +65,14 @@ def __init__( rope_params = RopeParams.from_config(config) self.attention_window_size = None if is_sliding: - rope_params.theta = config.rope_local_base_freq + # transformers 5.x moved rope_local_base_freq into + # rope_parameters["sliding_attention"]["rope_theta"] + local_freq = getattr(config, 'rope_local_base_freq', None) + if local_freq is None: + rp = getattr(config, 'rope_parameters', {}) + local_freq = rp.get('sliding_attention', + {}).get('rope_theta', 10000.0) + rope_params.theta = local_freq rope_params.scale_type = RotaryScalingType.none rope_params.scale = 1.0 self.attention_window_size = config.sliding_window