From 34d8e335e8e7914af32d54703cf0e2a35bf645d7 Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Wed, 8 Apr 2026 00:33:56 -0700 Subject: [PATCH 01/22] Upgrade transformers 5.3.0 dependency Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b76e28208bd..bfe364fde97 100644 --- a/requirements.txt +++ b/requirements.txt @@ -30,7 +30,7 @@ nvidia-modelopt[torch]~=0.37.0 # torch 2.10.0+cu130 depends on nvidia-nccl-cu13==2.28.9 nvidia-nccl-cu13>=2.28.9,<=2.29.2 nvidia-cuda-nvrtc -transformers==4.57.3 +transformers==5.3.0 prometheus_client prometheus_fastapi_instrumentator pydantic>=2.9.1 From a16ddeb828b9e1348d948d076f9240543b8ed852 Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Wed, 8 Apr 2026 01:01:24 -0700 Subject: [PATCH 02/22] fix(auto_deploy): fallback SDPA mask patch when executorch helper removed Transformers 5.x removed sdpa_mask_without_vmap from integrations.executorch. Use functools.partial(sdpa_mask, use_vmap=False) when the legacy import fails. 
Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> Made-with: Cursor --- .../export/library/transformers_sdpa_mask.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tensorrt_llm/_torch/auto_deploy/export/library/transformers_sdpa_mask.py b/tensorrt_llm/_torch/auto_deploy/export/library/transformers_sdpa_mask.py index fd21604d1b6..d6272be573d 100644 --- a/tensorrt_llm/_torch/auto_deploy/export/library/transformers_sdpa_mask.py +++ b/tensorrt_llm/_torch/auto_deploy/export/library/transformers_sdpa_mask.py @@ -1,6 +1,7 @@ """Patch for transformers SDPA mask to be export-compatible.""" import importlib.metadata +from functools import partial from packaging import version @@ -29,7 +30,14 @@ def _apply_patch(self): try: # imports only after version check from transformers import masking_utils - from transformers.integrations.executorch import sdpa_mask_without_vmap + + # Up to ~4.53+, HF exposed this helper next to ExecuTorch export utilities. + # Transformers 5.x removed it; sdpa_mask now supports use_vmap=False (the default), + # which is export-compatible without vmap. + try: + from transformers.integrations.executorch import sdpa_mask_without_vmap + except ImportError: + sdpa_mask_without_vmap = partial(masking_utils.sdpa_mask, use_vmap=False) # recall original implementation self.original_values["masking_utils.sdpa_mask"] = masking_utils.sdpa_mask From f274bb6c9a7a96ba75ae9924b85e5c7d226347f0 Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Wed, 8 Apr 2026 03:24:04 -0700 Subject: [PATCH 03/22] fix: import AutoModelForImageTextToText for Transformers v5 AutoModelForVision2Seq was removed from the public API in Transformers v5. Fall back to AutoModelForImageTextToText when the legacy name is unavailable. 
Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> Made-with: Cursor --- tensorrt_llm/models/gpt/convert.py | 9 +++++++-- tensorrt_llm/tools/multimodal_builder.py | 11 ++++++++--- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/tensorrt_llm/models/gpt/convert.py b/tensorrt_llm/models/gpt/convert.py index 1e2bc4b999d..315fe6a17d4 100644 --- a/tensorrt_llm/models/gpt/convert.py +++ b/tensorrt_llm/models/gpt/convert.py @@ -29,8 +29,13 @@ import torch.nn as nn import yaml from tqdm import tqdm -from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq, - AutoTokenizer) +try: + from transformers import AutoModelForVision2Seq +except ImportError: + # Transformers v5+: vision-to-seq auto models use AutoModelForImageTextToText + from transformers import AutoModelForImageTextToText as AutoModelForVision2Seq + +from transformers import AutoModelForCausalLM, AutoTokenizer from transformers.models.gpt2.modeling_gpt2 import GPT2Block from transformers.pytorch_utils import Conv1D diff --git a/tensorrt_llm/tools/multimodal_builder.py b/tensorrt_llm/tools/multimodal_builder.py index bf948eb2506..c2006f5f2c0 100644 --- a/tensorrt_llm/tools/multimodal_builder.py +++ b/tensorrt_llm/tools/multimodal_builder.py @@ -14,10 +14,15 @@ from tensorrt_llm._utils import torch_dtype_to_str, to_json_file from tensorrt_llm.builder import Builder from tensorrt_llm.logger import logger +try: + from transformers import AutoModelForVision2Seq +except ImportError: + # Transformers v5+: vision-to-seq auto models use AutoModelForImageTextToText + from transformers import AutoModelForImageTextToText as AutoModelForVision2Seq + from transformers import (AutoConfig, AutoModel, AutoModelForCausalLM, - AutoModelForVision2Seq, AutoProcessor, - Blip2ForConditionalGeneration, Blip2Processor, - FuyuForCausalLM, FuyuProcessor, + AutoProcessor, Blip2ForConditionalGeneration, + Blip2Processor, FuyuForCausalLM, FuyuProcessor, LlavaForConditionalGeneration, 
NougatProcessor, Pix2StructForConditionalGeneration, VisionEncoderDecoderModel, CLIPVisionModel) From 9989aae9f31d8b4391899c225c30e67d86b4ca75 Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Wed, 8 Apr 2026 08:25:37 -0700 Subject: [PATCH 04/22] fix: shim get_parameter_device/dtype for Transformers v5 Transformers v5 removed these helpers from modeling_utils. Add hf_parameter_utils with ImportError fallback matching ModuleUtilsMixin behavior; update CLIP, SigLIP, and Wan call sites. Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> Made-with: Cursor --- .../_torch/models/hf_parameter_utils.py | 36 +++++++++++++++++++ tensorrt_llm/_torch/models/modeling_clip.py | 3 +- tensorrt_llm/_torch/models/modeling_siglip.py | 3 +- .../visual_gen/models/wan/transformer_wan.py | 2 +- tensorrt_llm/models/gpt/convert.py | 1 + 5 files changed, 40 insertions(+), 5 deletions(-) create mode 100644 tensorrt_llm/_torch/models/hf_parameter_utils.py diff --git a/tensorrt_llm/_torch/models/hf_parameter_utils.py b/tensorrt_llm/_torch/models/hf_parameter_utils.py new file mode 100644 index 00000000000..b5629827625 --- /dev/null +++ b/tensorrt_llm/_torch/models/hf_parameter_utils.py @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Compatibility for Hugging Face ``get_parameter_device`` / ``get_parameter_dtype``. + +Transformers v5 no longer exports these from ``transformers.modeling_utils``; they +match ``ModuleUtilsMixin`` behavior for plain ``nn.Module`` stacks. +""" + +from __future__ import annotations + +import torch +import torch.nn as nn + +try: + from transformers.modeling_utils import (get_parameter_device, + get_parameter_dtype) +except ImportError: + + def get_parameter_device(module: nn.Module) -> torch.device: + return next(module.parameters()).device + + def get_parameter_dtype(module: nn.Module) -> torch.dtype: + return next(param.dtype for param in module.parameters() if param.is_floating_point()) diff --git a/tensorrt_llm/_torch/models/modeling_clip.py b/tensorrt_llm/_torch/models/modeling_clip.py index 1e203eda8b7..9e73dcc2dd3 100644 --- a/tensorrt_llm/_torch/models/modeling_clip.py +++ b/tensorrt_llm/_torch/models/modeling_clip.py @@ -4,8 +4,6 @@ import torch.nn as nn from transformers.activations import ACT2FN from transformers.modeling_outputs import BaseModelOutput -from transformers.modeling_utils import (get_parameter_device, - get_parameter_dtype) from transformers.models.clip.configuration_clip import CLIPVisionConfig from transformers.models.clip.modeling_clip import CLIPVisionEmbeddings @@ -17,6 +15,7 @@ from ..model_config import ModelConfig from ..modules.attention import Attention from ..modules.mlp import MLP +from .hf_parameter_utils import get_parameter_device, get_parameter_dtype from .modeling_utils import _load_weights_impl, register_auto_model diff --git a/tensorrt_llm/_torch/models/modeling_siglip.py b/tensorrt_llm/_torch/models/modeling_siglip.py index e4ed6d462b8..071ee7f03d4 100644 --- a/tensorrt_llm/_torch/models/modeling_siglip.py +++ b/tensorrt_llm/_torch/models/modeling_siglip.py @@ -2,8 +2,6 @@ import torch import torch.nn as nn -from transformers.modeling_utils import (get_parameter_device, - get_parameter_dtype) from 
transformers.models.siglip.configuration_siglip import SiglipVisionConfig from transformers.models.siglip.modeling_siglip import (SiglipVisionConfig, SiglipVisionEmbeddings) @@ -13,6 +11,7 @@ from ..attention_backend.interface import AttentionMetadata from ..attention_backend.utils import get_attention_backend from ..model_config import ModelConfig +from .hf_parameter_utils import get_parameter_device, get_parameter_dtype from .modeling_clip import CLIPEncoder from .modeling_utils import _load_weights_impl, register_auto_model diff --git a/tensorrt_llm/_torch/visual_gen/models/wan/transformer_wan.py b/tensorrt_llm/_torch/visual_gen/models/wan/transformer_wan.py index 49e56c4d23d..67d805416cf 100644 --- a/tensorrt_llm/_torch/visual_gen/models/wan/transformer_wan.py +++ b/tensorrt_llm/_torch/visual_gen/models/wan/transformer_wan.py @@ -6,8 +6,8 @@ import torch.nn.functional as F from diffusers.models.embeddings import PixArtAlphaTextProjection, TimestepEmbedding, Timesteps from tqdm import tqdm -from transformers.modeling_utils import get_parameter_device +from tensorrt_llm._torch.models.hf_parameter_utils import get_parameter_device from tensorrt_llm._torch.modules.layer_norm import LayerNorm from tensorrt_llm._torch.modules.linear import Linear from tensorrt_llm._torch.modules.mlp import MLP diff --git a/tensorrt_llm/models/gpt/convert.py b/tensorrt_llm/models/gpt/convert.py index 315fe6a17d4..aa7d9a89e0d 100644 --- a/tensorrt_llm/models/gpt/convert.py +++ b/tensorrt_llm/models/gpt/convert.py @@ -29,6 +29,7 @@ import torch.nn as nn import yaml from tqdm import tqdm + try: from transformers import AutoModelForVision2Seq except ImportError: From 7de8d791d6e4ee98986e64c7a45761743148b57b Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Wed, 8 Apr 2026 19:12:21 -0700 Subject: [PATCH 05/22] fix: pass exist_ok to AutoConfig.register for Transformers 5.3 Transformers 5.x is stricter about duplicate config registration. 
Allow TRT-LLM re-registration when configs are already registered. Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> Made-with: Cursor --- .../_torch/auto_deploy/models/custom/modeling_qwen3_5_moe.py | 4 ++-- tensorrt_llm/_torch/models/modeling_exaone4.py | 2 +- tensorrt_llm/_torch/models/modeling_exaone_moe.py | 2 +- tensorrt_llm/_torch/models/modeling_nemotron_h.py | 2 +- tensorrt_llm/_torch/models/modeling_vila.py | 2 +- tensorrt_llm/models/llama/config.py | 3 ++- 6 files changed, 8 insertions(+), 7 deletions(-) diff --git a/tensorrt_llm/_torch/auto_deploy/models/custom/modeling_qwen3_5_moe.py b/tensorrt_llm/_torch/auto_deploy/models/custom/modeling_qwen3_5_moe.py index e227bc7ebec..71c11814379 100644 --- a/tensorrt_llm/_torch/auto_deploy/models/custom/modeling_qwen3_5_moe.py +++ b/tensorrt_llm/_torch/auto_deploy/models/custom/modeling_qwen3_5_moe.py @@ -2854,8 +2854,8 @@ def init_input_processor(self, base): # Registration # ============================================================================= -AutoConfig.register("qwen3_5_moe", Qwen3_5MoeConfig) -AutoConfig.register("qwen3_5_moe_text", Qwen3_5MoeTextConfig) +AutoConfig.register("qwen3_5_moe", Qwen3_5MoeConfig, exist_ok=True) +AutoConfig.register("qwen3_5_moe_text", Qwen3_5MoeTextConfig, exist_ok=True) AutoModelForCausalLMFactory.register_custom_model_cls("Qwen3_5MoeTextConfig", Qwen3_5MoeForCausalLM) Qwen3_5MoeFactory.register_custom_model_cls("Qwen3_5MoeConfig", Qwen3_5MoeForConditionalGeneration) diff --git a/tensorrt_llm/_torch/models/modeling_exaone4.py b/tensorrt_llm/_torch/models/modeling_exaone4.py index 07951fc28a4..2b49b9bb475 100644 --- a/tensorrt_llm/_torch/models/modeling_exaone4.py +++ b/tensorrt_llm/_torch/models/modeling_exaone4.py @@ -29,7 +29,7 @@ class Exaone4Config(PretrainedConfig): model_type = "exaone4" - AutoConfig.register(Exaone4Config.model_type, Exaone4Config) + AutoConfig.register(Exaone4Config.model_type, Exaone4Config, exist_ok=True) def 
check_is_sliding(config: Exaone4Config, layer_idx: int) -> bool: diff --git a/tensorrt_llm/_torch/models/modeling_exaone_moe.py b/tensorrt_llm/_torch/models/modeling_exaone_moe.py index fe420178558..621065ccb47 100644 --- a/tensorrt_llm/_torch/models/modeling_exaone_moe.py +++ b/tensorrt_llm/_torch/models/modeling_exaone_moe.py @@ -53,7 +53,7 @@ class ExaoneMoEConfig(PretrainedConfig): "Register ExaoneMoEConfig to mimic the ExaoneMoE model.", key="EXAONE_MOE_REGISTER_WARNING" ) -AutoConfig.register(ExaoneMoEConfig.model_type, ExaoneMoEConfig) +AutoConfig.register(ExaoneMoEConfig.model_type, ExaoneMoEConfig, exist_ok=True) # End of the config register. # fmt: on diff --git a/tensorrt_llm/_torch/models/modeling_nemotron_h.py b/tensorrt_llm/_torch/models/modeling_nemotron_h.py index 623195da94a..446a7383a0a 100644 --- a/tensorrt_llm/_torch/models/modeling_nemotron_h.py +++ b/tensorrt_llm/_torch/models/modeling_nemotron_h.py @@ -1073,4 +1073,4 @@ def forward( return hidden_states -AutoConfig.register(NemotronHConfig.model_type, NemotronHConfig) +AutoConfig.register(NemotronHConfig.model_type, NemotronHConfig, exist_ok=True) diff --git a/tensorrt_llm/_torch/models/modeling_vila.py b/tensorrt_llm/_torch/models/modeling_vila.py index 8b634229237..1e3bcab02b5 100644 --- a/tensorrt_llm/_torch/models/modeling_vila.py +++ b/tensorrt_llm/_torch/models/modeling_vila.py @@ -1252,5 +1252,5 @@ def post_config(self): self.model_config.pretrained_config = self.llm.config -AutoConfig.register(VilaConfig.model_type, VilaConfig) +AutoConfig.register(VilaConfig.model_type, VilaConfig, exist_ok=True) AutoModel.register(VilaConfig, VilaModel) diff --git a/tensorrt_llm/models/llama/config.py b/tensorrt_llm/models/llama/config.py index 7e0369a4ba0..6db265dbd73 100644 --- a/tensorrt_llm/models/llama/config.py +++ b/tensorrt_llm/models/llama/config.py @@ -112,7 +112,8 @@ def from_hugging_face( from llava.model import LlavaLlamaConfig # noqa from llava.model import LlavaLlamaModel 
transformers.AutoConfig.register("llava_llama", - LlavaLlamaConfig) + LlavaLlamaConfig, + exist_ok=True) transformers.AutoModelForCausalLM.register( LlavaLlamaConfig, LlavaLlamaModel) From cba7d8dc146467b43c0b68bc819d9d17e69a247c Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Wed, 8 Apr 2026 19:12:33 -0700 Subject: [PATCH 06/22] fix: replace removed load_sharded_checkpoint for Transformers 5.3 Implement local sharded checkpoint loading for Llama4 vision encoder; load_sharded_checkpoint was removed from transformers.modeling_utils. Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> Made-with: Cursor --- tensorrt_llm/_torch/models/modeling_llama.py | 62 ++++++++++++++++++-- 1 file changed, 58 insertions(+), 4 deletions(-) diff --git a/tensorrt_llm/_torch/models/modeling_llama.py b/tensorrt_llm/_torch/models/modeling_llama.py index b13c2e3de91..e898a344e8a 100644 --- a/tensorrt_llm/_torch/models/modeling_llama.py +++ b/tensorrt_llm/_torch/models/modeling_llama.py @@ -7,7 +7,8 @@ from torch import nn from transformers import (AutoProcessor, AutoTokenizer, Llama4Config, Llama4VisionModel, LlamaConfig, PretrainedConfig) -from transformers.modeling_utils import load_sharded_checkpoint +from transformers.utils import (SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, + WEIGHTS_INDEX_NAME, WEIGHTS_NAME) from transformers.models.llama4.modeling_llama4 import Llama4MultiModalProjector from tensorrt_llm._torch.distributed import (AllReduce, AllReduceFusionOp, @@ -1118,6 +1119,58 @@ def post_load_weights(self): layer.next_attn = self.model.layers[idx + 1].self_attn +def _load_checkpoint_into_module(module: nn.Module, + folder: str, + strict: bool = True) -> None: + """Load a sharded HuggingFace checkpoint into a module. + + This replaces the removed ``transformers.modeling_utils.load_sharded_checkpoint`` + function. It supports both safetensors and PyTorch checkpoint formats. 
+ """ + folder = str(folder) + + # Determine checkpoint format and collect shard files + index_file = os.path.join(folder, SAFE_WEIGHTS_INDEX_NAME) + if os.path.isfile(index_file): + import json + with open(index_file) as f: + shard_files = sorted( + set(json.load(f)["weight_map"].values())) + shard_paths = [os.path.join(folder, s) for s in shard_files] + use_safetensors = True + elif os.path.isfile(os.path.join(folder, WEIGHTS_INDEX_NAME)): + import json + with open(os.path.join(folder, WEIGHTS_INDEX_NAME)) as f: + shard_files = sorted( + set(json.load(f)["weight_map"].values())) + shard_paths = [os.path.join(folder, s) for s in shard_files] + use_safetensors = False + elif os.path.isfile(os.path.join(folder, SAFE_WEIGHTS_NAME)): + shard_paths = [os.path.join(folder, SAFE_WEIGHTS_NAME)] + use_safetensors = True + elif os.path.isfile(os.path.join(folder, WEIGHTS_NAME)): + shard_paths = [os.path.join(folder, WEIGHTS_NAME)] + use_safetensors = False + else: + raise FileNotFoundError( + f"No checkpoint found in {folder}. Expected " + f"{SAFE_WEIGHTS_INDEX_NAME}, {WEIGHTS_INDEX_NAME}, " + f"{SAFE_WEIGHTS_NAME}, or {WEIGHTS_NAME}.") + + # Load state dict from all shards and merge + full_state_dict: Dict[str, torch.Tensor] = {} + if use_safetensors: + from safetensors.torch import load_file + for path in shard_paths: + full_state_dict.update(load_file(path)) + else: + for path in shard_paths: + full_state_dict.update( + torch.load(path, map_location="cpu", weights_only=True)) + + module.load_state_dict(full_state_dict, strict=strict) + + class Llama4VisionEncoder(nn.Module): def __init__(self, model_config: ModelConfig[Llama4Config], *args, @@ -1148,9 +1201,10 @@ def load_weights(self, weights: Dict): # Otherwise, load the weights from the checkpoint. 
else: - load_sharded_checkpoint(module_dict, - self.pretrained_config._name_or_path, - strict=False) + _load_checkpoint_into_module( + module_dict, + self.pretrained_config._name_or_path, + strict=False) self.vision_model = module_dict["vision_model"].to(self.device) self.mm_projector = module_dict["multi_modal_projector"].to(self.device) From 1ca795eb62bcdb04d58768699a1079d19b000e80 Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Wed, 8 Apr 2026 19:26:18 -0700 Subject: [PATCH 07/22] format code Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> --- tensorrt_llm/_torch/models/hf_parameter_utils.py | 3 +-- tensorrt_llm/_torch/models/modeling_llama.py | 15 ++++++--------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/tensorrt_llm/_torch/models/hf_parameter_utils.py b/tensorrt_llm/_torch/models/hf_parameter_utils.py index b5629827625..8a163c4f917 100644 --- a/tensorrt_llm/_torch/models/hf_parameter_utils.py +++ b/tensorrt_llm/_torch/models/hf_parameter_utils.py @@ -25,8 +25,7 @@ import torch.nn as nn try: - from transformers.modeling_utils import (get_parameter_device, - get_parameter_dtype) + from transformers.modeling_utils import get_parameter_device, get_parameter_dtype except ImportError: def get_parameter_device(module: nn.Module) -> torch.device: diff --git a/tensorrt_llm/_torch/models/modeling_llama.py b/tensorrt_llm/_torch/models/modeling_llama.py index e898a344e8a..8dff496563a 100644 --- a/tensorrt_llm/_torch/models/modeling_llama.py +++ b/tensorrt_llm/_torch/models/modeling_llama.py @@ -7,9 +7,9 @@ from torch import nn from transformers import (AutoProcessor, AutoTokenizer, Llama4Config, Llama4VisionModel, LlamaConfig, PretrainedConfig) +from transformers.models.llama4.modeling_llama4 import Llama4MultiModalProjector from transformers.utils import (SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, WEIGHTS_INDEX_NAME, WEIGHTS_NAME) -from transformers.models.llama4.modeling_llama4 
import Llama4MultiModalProjector from tensorrt_llm._torch.distributed import (AllReduce, AllReduceFusionOp, AllReduceParams, MoEAllReduce) @@ -1134,15 +1134,13 @@ def _load_checkpoint_into_module(module: nn.Module, if os.path.isfile(index_file): import json with open(index_file) as f: - shard_files = sorted( - set(json.load(f)["weight_map"].values())) + shard_files = sorted(set(json.load(f)["weight_map"].values())) shard_paths = [os.path.join(folder, s) for s in shard_files] use_safetensors = True elif os.path.isfile(os.path.join(folder, WEIGHTS_INDEX_NAME)): import json with open(os.path.join(folder, WEIGHTS_INDEX_NAME)) as f: - shard_files = sorted( - set(json.load(f)["weight_map"].values())) + shard_files = sorted(set(json.load(f)["weight_map"].values())) shard_paths = [os.path.join(folder, s) for s in shard_files] use_safetensors = False elif os.path.isfile(os.path.join(folder, SAFE_WEIGHTS_NAME)): @@ -1201,10 +1199,9 @@ def load_weights(self, weights: Dict): # Otherwise, load the weights from the checkpoint. else: - _load_checkpoint_into_module( - module_dict, - self.pretrained_config._name_or_path, - strict=False) + _load_checkpoint_into_module(module_dict, + self.pretrained_config._name_or_path, + strict=False) self.vision_model = module_dict["vision_model"].to(self.device) self.mm_projector = module_dict["multi_modal_projector"].to(self.device) From 39efba415c1eaa701cc3dd56fd82288d40b59cad Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Wed, 8 Apr 2026 23:39:22 -0700 Subject: [PATCH 08/22] test: replace HybridCache with StaticCache helper for Transformers v5 HybridCache was removed from transformers.cache_utils in v5. Add make_hf_hybrid_cache_for_tests using StaticCache on ImportError; update Cohere2, Exaone4, and Gemma3 modeling tests. 
Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> Made-with: Cursor --- tests/unittest/_torch/helpers.py | 33 ++++++++++++++++++- .../_torch/modeling/test_modeling_cohere2.py | 6 ++-- .../_torch/modeling/test_modeling_exaone4.py | 16 +++++---- .../_torch/modeling/test_modeling_gemma3.py | 14 ++++---- 4 files changed, 52 insertions(+), 17 deletions(-) diff --git a/tests/unittest/_torch/helpers.py b/tests/unittest/_torch/helpers.py index 743f3998f26..f7dde64c669 100644 --- a/tests/unittest/_torch/helpers.py +++ b/tests/unittest/_torch/helpers.py @@ -1,4 +1,4 @@ -from typing import Dict, Tuple +from typing import Dict, Optional, Tuple import torch import torch.nn.functional as F @@ -256,3 +256,34 @@ def create_mock_cuda_graph_runner(batch_size: int, use_mrope: bool = False): dist=None, kv_cache_manager_key=ResourceManagerType.KV_CACHE_MANAGER) return CUDAGraphRunner(config) + + +def make_hf_hybrid_cache_for_tests( + config, + *, + max_cache_len: int, + max_batch_size: Optional[int] = None, + device=None, + dtype=None, +): + """Build Hugging Face ``past_key_values`` for hybrid / sliding-window models in tests. + + Transformers v4 exposes ``HybridCache``; v5 removes it in favor of ``StaticCache`` + for fixed-length pre-allocated KV (see HF cache refactor). 
+ """ + try: + from transformers.cache_utils import HybridCache + except ImportError: + from transformers.cache_utils import StaticCache + + return StaticCache(config=config, max_cache_len=max_cache_len) + + kwargs = { + "config": config, + "max_cache_len": max_cache_len, + "device": device, + "dtype": dtype, + } + if max_batch_size is not None: + kwargs["max_batch_size"] = max_batch_size + return HybridCache(**kwargs) diff --git a/tests/unittest/_torch/modeling/test_modeling_cohere2.py b/tests/unittest/_torch/modeling/test_modeling_cohere2.py index 20c2e88fe69..783c0939251 100644 --- a/tests/unittest/_torch/modeling/test_modeling_cohere2.py +++ b/tests/unittest/_torch/modeling/test_modeling_cohere2.py @@ -3,9 +3,9 @@ import torch from transformers import Cohere2Config from transformers import Cohere2ForCausalLM as HFCohere2ForCausalLM -from transformers.cache_utils import HybridCache import tensorrt_llm +from _torch.helpers import make_hf_hybrid_cache_for_tests from tensorrt_llm._torch.attention_backend.utils import get_attention_backend from tensorrt_llm._torch.metadata import KVCacheParams from tensorrt_llm._torch.model_config import ModelConfig @@ -161,8 +161,8 @@ def test_cohere2_allclose_to_hf(self) -> None: # Initialize the hugging face model hf_cohere2 = HFCohere2ForCausalLM(cohere2_config).to(dtype).to(device).eval() - hf_cache = HybridCache( - config=cohere2_config, + hf_cache = make_hf_hybrid_cache_for_tests( + cohere2_config, max_batch_size=batch_size, max_cache_len=10, device=device, diff --git a/tests/unittest/_torch/modeling/test_modeling_exaone4.py b/tests/unittest/_torch/modeling/test_modeling_exaone4.py index 931828be848..a9f1517a30f 100644 --- a/tests/unittest/_torch/modeling/test_modeling_exaone4.py +++ b/tests/unittest/_torch/modeling/test_modeling_exaone4.py @@ -25,8 +25,8 @@ class Exaone4Config(PretrainedConfig): # TODO: Remove this once we have a proper config for Exaone4 SKIP_EXAONE4_HF_ACCURACY_TEST = True -from _torch.helpers import 
create_mock_cuda_graph_runner -from transformers.cache_utils import HybridCache +from _torch.helpers import (create_mock_cuda_graph_runner, + make_hf_hybrid_cache_for_tests) from utils.util import getSMVersion import tensorrt_llm @@ -248,11 +248,13 @@ def test_exaone4_allclose_to_hf(self, scenario: Scenario) -> None: num_kv_heads = exaone4.config.num_key_value_heads max_seq_len = num_blocks * tokens_per_block batch_size = 1 - hf_cache = HybridCache(config=exaone4_config, - max_batch_size=batch_size, - max_cache_len=max_seq_len, - device=device, - dtype=dtype) + hf_cache = make_hf_hybrid_cache_for_tests( + exaone4_config, + max_batch_size=batch_size, + max_cache_len=max_seq_len, + device=device, + dtype=dtype, + ) if dtype == torch.half: kv_cache_dtype = tensorrt_llm.bindings.DataType.HALF elif dtype == torch.bfloat16: diff --git a/tests/unittest/_torch/modeling/test_modeling_gemma3.py b/tests/unittest/_torch/modeling/test_modeling_gemma3.py index 6b532b9b1c6..7d164b7a6d2 100644 --- a/tests/unittest/_torch/modeling/test_modeling_gemma3.py +++ b/tests/unittest/_torch/modeling/test_modeling_gemma3.py @@ -7,9 +7,9 @@ from transformers import Gemma3Config from transformers import Gemma3ForCausalLM as HFGemma3ForCausalLM from transformers import Gemma3TextConfig -from transformers.cache_utils import HybridCache import tensorrt_llm +from _torch.helpers import make_hf_hybrid_cache_for_tests from tensorrt_llm._torch.attention_backend import (AttentionMetadata, FlashInferAttentionMetadata) from tensorrt_llm._torch.attention_backend.utils import get_attention_backend @@ -285,11 +285,13 @@ def test_gemma3_allclose_to_hf(self, scenario: Scenario) -> None: hf_gemma3 = HFGemma3ForCausalLM(gemma3_config).to(dtype).to( device).eval() - hf_cache = HybridCache(config=gemma3_config, - max_batch_size=batch_size, - max_cache_len=10, - device=device, - dtype=dtype) + hf_cache = make_hf_hybrid_cache_for_tests( + gemma3_config, + max_batch_size=batch_size, + max_cache_len=10, + 
device=device, + dtype=dtype, + ) model_config = ModelConfig(pretrained_config=gemma3_config, attn_backend=backend) From a0e4198d28a88f32278ee60c97af1549b6fdc5d3 Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Wed, 8 Apr 2026 23:50:36 -0700 Subject: [PATCH 09/22] fix: map Transformers v5 rope_scaling type default to RoPE none Transformers 5.x may set rope_scaling["type"] to "default" for standard RoPE. Teach RotaryScalingType.from_string to treat it as none, accept None, and match enum names case-insensitively. Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> Made-with: Cursor --- tensorrt_llm/functional.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tensorrt_llm/functional.py b/tensorrt_llm/functional.py index 5dd99755dc6..694aab45f2d 100755 --- a/tensorrt_llm/functional.py +++ b/tensorrt_llm/functional.py @@ -680,8 +680,16 @@ class RotaryScalingType(IntEnum): @staticmethod def from_string(s): + if isinstance(s, RotaryScalingType): + return s + if s is None: + return RotaryScalingType.none + key = str(s).lower() + # Hugging Face Transformers v5+ uses type "default" for unscaled / standard RoPE. + if key == "default": + return RotaryScalingType.none try: - return RotaryScalingType[s] + return RotaryScalingType[key] except KeyError: raise ValueError(f'Unsupported rotary scaling type: {s}') From 6d5701f132da531bd58bfbe4473964b8874dc707 Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Thu, 9 Apr 2026 00:04:31 -0700 Subject: [PATCH 10/22] fix: read HF rope_theta from rope_parameters for Transformers v5 Add tensorrt_llm._utils.get_hf_rope_theta() and use it across TRT-LLM configs, converters, RopeParams, and multimodal paths so Llama-style HF configs without top-level rope_theta still resolve the correct base. 
Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> Made-with: Cursor --- examples/eagle/convert_checkpoint.py | 3 ++- examples/medusa/convert_checkpoint.py | 4 ++-- .../models/contrib/dbrx/convert_checkpoint.py | 4 ++-- .../models/core/internlm2/convert_checkpoint.py | 4 ++-- .../_torch/attention_backend/interface.py | 4 ++-- tensorrt_llm/_torch/models/modeling_gpt_oss.py | 4 ++-- tensorrt_llm/_utils.py | 17 +++++++++++++++++ tensorrt_llm/models/commandr/config.py | 3 ++- tensorrt_llm/models/deepseek_v1/config.py | 3 ++- tensorrt_llm/models/deepseek_v2/config.py | 3 ++- tensorrt_llm/models/deepseek_v2/convert.py | 5 +++-- tensorrt_llm/models/eagle/config.py | 3 ++- tensorrt_llm/models/falcon/config.py | 3 ++- tensorrt_llm/models/gemma/config.py | 3 ++- tensorrt_llm/models/gpt/config.py | 3 ++- tensorrt_llm/models/llama/config.py | 3 ++- tensorrt_llm/models/mllama/config.py | 3 ++- tensorrt_llm/models/nemotron_nas/config.py | 3 ++- tensorrt_llm/models/phi/config.py | 3 ++- tensorrt_llm/models/phi/convert.py | 4 ++-- tensorrt_llm/models/phi3/config.py | 3 ++- tensorrt_llm/models/qwen/config.py | 3 ++- .../quantization/quantize_by_modelopt.py | 5 +++-- tensorrt_llm/runtime/multimodal_model_runner.py | 6 +++--- tests/unittest/trt/model/test_phi.py | 3 ++- .../multimodal_encoders/1/multimodal_utils.py | 4 +++- 26 files changed, 71 insertions(+), 35 deletions(-) diff --git a/examples/eagle/convert_checkpoint.py b/examples/eagle/convert_checkpoint.py index 217144e1ae5..130faee4453 100644 --- a/examples/eagle/convert_checkpoint.py +++ b/examples/eagle/convert_checkpoint.py @@ -9,6 +9,7 @@ import tensorrt_llm from tensorrt_llm._deprecation import emit_engine_arch_deprecation +from tensorrt_llm._utils import get_hf_rope_theta from tensorrt_llm.mapping import Mapping from tensorrt_llm.models.eagle.config import EagleConfig from tensorrt_llm.models.eagle.model import EagleForCausalLM @@ -293,7 +294,7 @@ def copy(tensors): args.rms_norm_eps = 
hf_config.rms_norm_eps args.vocab_size = hf_config.vocab_size args.rotary_scaling = hf_config.rope_scaling - args.rotary_base = hf_config.rope_theta + args.rotary_base = get_hf_rope_theta(hf_config, 10000.0) args.n_positions = hf_config.max_position_embeddings args.dtype = str( hf_config.torch_dtype)[6:] if args.dtype == 'auto' else args.dtype diff --git a/examples/medusa/convert_checkpoint.py b/examples/medusa/convert_checkpoint.py index 48dcc6fd400..09eb55b4610 100644 --- a/examples/medusa/convert_checkpoint.py +++ b/examples/medusa/convert_checkpoint.py @@ -13,7 +13,7 @@ import tensorrt_llm from tensorrt_llm._deprecation import emit_engine_arch_deprecation -from tensorrt_llm._utils import numpy_to_torch +from tensorrt_llm._utils import get_hf_rope_theta, numpy_to_torch from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping from tensorrt_llm.models import (LLaMAForCausalLM, PretrainedConfig, @@ -209,7 +209,7 @@ def main(): args.rms_norm_eps = hf_config.rms_norm_eps args.vocab_size = hf_config.vocab_size args.n_positions = hf_config.max_position_embeddings - args.rotary_base = hf_config.rope_theta + args.rotary_base = get_hf_rope_theta(hf_config, 10000.0) args.rotary_scaling = hf_config.rope_scaling elif args.meta_ckpt_dir is not None: diff --git a/examples/models/contrib/dbrx/convert_checkpoint.py b/examples/models/contrib/dbrx/convert_checkpoint.py index ad487a50c76..1ca287f2588 100644 --- a/examples/models/contrib/dbrx/convert_checkpoint.py +++ b/examples/models/contrib/dbrx/convert_checkpoint.py @@ -18,7 +18,7 @@ import tensorrt_llm from tensorrt_llm._deprecation import emit_engine_arch_deprecation -from tensorrt_llm._utils import release_gc +from tensorrt_llm._utils import get_hf_rope_theta, release_gc from tensorrt_llm.layers import MoeConfig from tensorrt_llm.mapping import Mapping from tensorrt_llm.models.convert_utils import (generate_int8, @@ -557,7 +557,7 @@ def execute(workers, func, hf_model): args.moe_top_k = 1 args.clip_qkv 
= hf_config.attn_config.clip_qkv args.hidden_act = 'swiglu' - args.rotary_base = hf_config.attn_config.rope_theta + args.rotary_base = get_hf_rope_theta(hf_config.attn_config, 10000.0) args.moe_config = MoeConfig( num_experts=args.moe_num_experts, top_k=args.moe_top_k, diff --git a/examples/models/core/internlm2/convert_checkpoint.py b/examples/models/core/internlm2/convert_checkpoint.py index 151a1afe85c..44c80d6d51f 100644 --- a/examples/models/core/internlm2/convert_checkpoint.py +++ b/examples/models/core/internlm2/convert_checkpoint.py @@ -14,7 +14,7 @@ import tensorrt_llm from tensorrt_llm._deprecation import emit_engine_arch_deprecation -from tensorrt_llm._utils import release_gc +from tensorrt_llm._utils import get_hf_rope_theta, release_gc from tensorrt_llm.mapping import Mapping from tensorrt_llm.models.llama import convert @@ -480,7 +480,7 @@ def convert_from_hf(hf_model, 'norm_epsilon': hf_config.rms_norm_eps, 'vocab_size': hf_config.vocab_size, 'position_embedding_type': 'rope_gpt_neox', - 'rotary_base': hf_config.rope_theta, + 'rotary_base': get_hf_rope_theta(hf_config, 10000.0), 'max_position_embeddings': hf_config.max_position_embeddings, 'hidden_act': hf_config.hidden_act, 'use_parallel_embedding': args.use_parallel_embedding, diff --git a/tensorrt_llm/_torch/attention_backend/interface.py b/tensorrt_llm/_torch/attention_backend/interface.py index 600f655bc51..f86b550482f 100644 --- a/tensorrt_llm/_torch/attention_backend/interface.py +++ b/tensorrt_llm/_torch/attention_backend/interface.py @@ -14,7 +14,7 @@ from ..speculative.interface import SpecMetadata from ..speculative.spec_tree_manager import SpecTreeManager -from tensorrt_llm._utils import maybe_pin_memory +from tensorrt_llm._utils import get_hf_rope_theta, maybe_pin_memory from tensorrt_llm.functional import (PositionEmbeddingType, RopeEmbeddingUtils, RotaryScalingType) from tensorrt_llm.mapping import Mapping @@ -498,7 +498,7 @@ def from_config(config) -> "RopeParams": head_dim = 
hidden_size // num_attention_heads rope_scaling = getattr(config, 'rope_scaling', None) rope_params.max_positions = config.max_position_embeddings - rope_params.theta = getattr(config, 'rope_theta', 10000.0) + rope_params.theta = get_hf_rope_theta(config, 10000.0) rope_percentage = (getattr(config, 'rotary_pct', None) or getattr(config, 'partial_rotary_factor', None) or 1.0) diff --git a/tensorrt_llm/_torch/models/modeling_gpt_oss.py b/tensorrt_llm/_torch/models/modeling_gpt_oss.py index 4d46611a7fd..7edeb3e73be 100644 --- a/tensorrt_llm/_torch/models/modeling_gpt_oss.py +++ b/tensorrt_llm/_torch/models/modeling_gpt_oss.py @@ -7,7 +7,7 @@ from tqdm import tqdm from transformers import GptOssConfig -from tensorrt_llm._utils import get_sm_version +from tensorrt_llm._utils import get_hf_rope_theta, get_sm_version from tensorrt_llm.functional import PositionEmbeddingType, RotaryScalingType from ..attention_backend import AttentionMetadata @@ -55,7 +55,7 @@ def __init__( type=PositionEmbeddingType.yarn, rope=RopeParams( dim=pretrained_config.head_dim, - theta=pretrained_config.rope_theta, + theta=get_hf_rope_theta(pretrained_config, 10000.0), scale_type=RotaryScalingType.yarn, scale=pretrained_config.rope_scaling['factor'], max_positions=pretrained_config.max_position_embeddings, diff --git a/tensorrt_llm/_utils.py b/tensorrt_llm/_utils.py index 47a6a88499e..c53e7a08504 100644 --- a/tensorrt_llm/_utils.py +++ b/tensorrt_llm/_utils.py @@ -70,6 +70,23 @@ np_float8 = np.dtype('V1', metadata={"dtype": "float8"}) +def get_hf_rope_theta(config: Any, default: float = 10000.0) -> float: + """Return RoPE ``theta`` from a Hugging Face ``PreTrainedConfig``-like object. + + Transformers v5+ nests ``rope_theta`` under ``rope_parameters`` for several + models (e.g. LLaMA); older releases expose ``config.rope_theta`` directly. 
+ """ + theta = getattr(config, "rope_theta", None) + if theta is not None: + return float(theta) + rope_params = getattr(config, "rope_parameters", None) + if isinstance(rope_params, dict): + theta = rope_params.get("rope_theta") + if theta is not None: + return float(theta) + return default + + def torch_to_numpy(x: torch.Tensor): assert isinstance(x, torch.Tensor), \ f'x must be a torch.Tensor object, but got {type(x)}.' diff --git a/tensorrt_llm/models/commandr/config.py b/tensorrt_llm/models/commandr/config.py index a2edca61fb7..511640c2249 100644 --- a/tensorrt_llm/models/commandr/config.py +++ b/tensorrt_llm/models/commandr/config.py @@ -16,6 +16,7 @@ import transformers +from ..._utils import get_hf_rope_theta from ...mapping import Mapping from ..convert_utils import infer_dtype from ..modeling_utils import PretrainedConfig, QuantConfig @@ -79,7 +80,7 @@ def from_hugging_face( hidden_act=hf_config.hidden_act, norm_epsilon=hf_config.layer_norm_eps, output_multiplier_scale=hf_config.logit_scale, - rotary_base=hf_config.rope_theta, + rotary_base=get_hf_rope_theta(hf_config, 10000.0), attn_bias=hf_config.attention_bias, qk_layernorm=hf_config.use_qk_norm, mapping=mapping, diff --git a/tensorrt_llm/models/deepseek_v1/config.py b/tensorrt_llm/models/deepseek_v1/config.py index b47fa91a43d..e7bff0d9aab 100755 --- a/tensorrt_llm/models/deepseek_v1/config.py +++ b/tensorrt_llm/models/deepseek_v1/config.py @@ -15,6 +15,7 @@ from typing import Optional, Union +from ..._utils import get_hf_rope_theta from ...layers import MoeConfig from ...mapping import Mapping from ..convert_utils import infer_dtype @@ -70,7 +71,7 @@ def from_hugging_face( num_key_value_heads = getattr(hf_config, "num_key_value_heads", hf_config.num_attention_heads) rotary_scaling = getattr(hf_config, "rope_scaling", None) - rotary_base = getattr(hf_config, "rope_theta", 10000.0) + rotary_base = get_hf_rope_theta(hf_config, 10000.0) dtype = infer_dtype(dtype, getattr(hf_config, 'torch_dtype', None)) 
moe_config = MoeConfig( num_experts=getattr(hf_config, 'n_routed_experts', 0), diff --git a/tensorrt_llm/models/deepseek_v2/config.py b/tensorrt_llm/models/deepseek_v2/config.py index edaf21f128c..c110df0d53f 100644 --- a/tensorrt_llm/models/deepseek_v2/config.py +++ b/tensorrt_llm/models/deepseek_v2/config.py @@ -17,6 +17,7 @@ from transformers import AutoConfig +from ..._utils import get_hf_rope_theta from ...layers import MoeConfig from ...mapping import Mapping from ..modeling_utils import PretrainedConfig, QuantConfig @@ -129,7 +130,7 @@ def from_hugging_face( max_position_embeddings=hf_config.max_position_embeddings, hidden_act='swiglu', norm_epsilon=hf_config.rms_norm_eps, - rotary_base=hf_config.rope_theta, + rotary_base=get_hf_rope_theta(hf_config, 10000.0), rotary_scaling=rotary_scaling, moe_inter_size=hf_config.moe_intermediate_size, moe=moe_config, diff --git a/tensorrt_llm/models/deepseek_v2/convert.py b/tensorrt_llm/models/deepseek_v2/convert.py index 697040d3b75..5a23130fc52 100755 --- a/tensorrt_llm/models/deepseek_v2/convert.py +++ b/tensorrt_llm/models/deepseek_v2/convert.py @@ -20,7 +20,8 @@ from tensorrt_llm.layers import MoeConfig -from ..._utils import pad_vocab_size, release_gc, str_dtype_to_torch +from ..._utils import (get_hf_rope_theta, pad_vocab_size, release_gc, + str_dtype_to_torch) from ...logger import logger from ...mapping import Mapping from ..convert_utils import get_tllm_linear_weight @@ -52,7 +53,7 @@ def create_trt_config_from_hf(model_dir, vocab_size = hf_config.vocab_size n_positions = hf_config.max_position_embeddings hidden_act = 'swiglu' # TRT-LLM request make gated activation explicit for MOE implementation - rotary_base = hf_config.rope_theta + rotary_base = get_hf_rope_theta(hf_config, 10000.0) rms_norm_eps = hf_config.rms_norm_eps rotary_scaling_beta_fast = hf_config.rope_scaling['beta_fast'] rotary_scaling_beta_slow = hf_config.rope_scaling['beta_slow'] diff --git a/tensorrt_llm/models/eagle/config.py 
b/tensorrt_llm/models/eagle/config.py index f81e43bb03f..e7a559f3469 100644 --- a/tensorrt_llm/models/eagle/config.py +++ b/tensorrt_llm/models/eagle/config.py @@ -18,6 +18,7 @@ from transformers import LlamaConfig +from ..._utils import get_hf_rope_theta from ...mapping import Mapping from ..convert_utils import infer_dtype from ..llama.config import LLaMAConfig @@ -84,7 +85,7 @@ def from_hugging_face( rms_norm_eps = hf_config.rms_norm_eps vocab_size = hf_config.vocab_size rotary_scaling = hf_config.rope_scaling - rotary_base = hf_config.rope_theta + rotary_base = get_hf_rope_theta(hf_config, 10000.0) n_positions = hf_config.max_position_embeddings hidden_act = hf_config.hidden_act dtype = str(hf_config.torch_dtype)[6:] if dtype == 'auto' else dtype diff --git a/tensorrt_llm/models/falcon/config.py b/tensorrt_llm/models/falcon/config.py index c96bd517cc4..1ff2ff0391c 100644 --- a/tensorrt_llm/models/falcon/config.py +++ b/tensorrt_llm/models/falcon/config.py @@ -14,6 +14,7 @@ # limitations under the License. 
from typing import Optional, Union +from ..._utils import get_hf_rope_theta from ...mapping import Mapping from ..convert_utils import infer_dtype from ..modeling_utils import PretrainedConfig, QuantConfig @@ -109,7 +110,7 @@ def from_hugging_face( max_position_embeddings=getattr(hf_config, 'max_position_embeddings', 2048), - rotary_base=getattr(hf_config, 'rope_theta', 10000.0), + rotary_base=get_hf_rope_theta(hf_config, 10000.0), intermediate_size=getattr(hf_config, 'ffn_hidden_size', None), mapping=mapping, diff --git a/tensorrt_llm/models/gemma/config.py b/tensorrt_llm/models/gemma/config.py index 8e176c4ed7e..3b0d8d6218c 100644 --- a/tensorrt_llm/models/gemma/config.py +++ b/tensorrt_llm/models/gemma/config.py @@ -16,6 +16,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Optional, Union +from tensorrt_llm._utils import get_hf_rope_theta from tensorrt_llm.functional import PositionEmbeddingType from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping @@ -186,7 +187,7 @@ def from_hugging_face( norm_epsilon=hf_config.rms_norm_eps, num_key_value_heads=getattr(hf_config, "num_key_value_heads", hf_config.num_attention_heads), - rotary_base=getattr(hf_config, "rope_theta", 10000.0), + rotary_base=get_hf_rope_theta(hf_config, 10000.0), rotary_scaling=getattr(hf_config, "rotary_scaling", None), quantization=quant_config, mapping=mapping, diff --git a/tensorrt_llm/models/gpt/config.py b/tensorrt_llm/models/gpt/config.py index e89dddd5efe..ba09d1f8694 100644 --- a/tensorrt_llm/models/gpt/config.py +++ b/tensorrt_llm/models/gpt/config.py @@ -17,6 +17,7 @@ import torch +from ..._utils import get_hf_rope_theta from ...layers import MoeConfig from ...logger import logger from ...mapping import Mapping @@ -134,7 +135,7 @@ def from_hugging_face( hf_config.layer_norm_epsilon = hf_config.norm_epsilon if gpt_variant == 'starcoder2' else hf_config.layer_norm_eps hf_config.bias = hf_config.use_bias if gpt_variant == 'starcoder2' else 
gpt_variant != 'nemotron' hf_config.position_embedding_type = 'rope_gpt_neox' - hf_config.rotary_base = hf_config.rope_theta + hf_config.rotary_base = get_hf_rope_theta(hf_config, 10000.0) hf_config.rotary_pct = getattr( hf_config, 'partial_rotary_factor', getattr(hf_config, 'rope_percent', 1.0)) diff --git a/tensorrt_llm/models/llama/config.py b/tensorrt_llm/models/llama/config.py index 6db265dbd73..54038e32c4f 100644 --- a/tensorrt_llm/models/llama/config.py +++ b/tensorrt_llm/models/llama/config.py @@ -18,6 +18,7 @@ from pathlib import Path from typing import Optional, Union +from ..._utils import get_hf_rope_theta from ...layers import MoeConfig from ...mapping import Mapping from ..convert_utils import infer_dtype @@ -161,7 +162,7 @@ def from_hugging_face( attn_bias = getattr(hf_config, 'bias', False) or getattr( hf_config, 'attention_bias', False) rotary_scaling = getattr(hf_config, "rope_scaling", None) - rotary_base = getattr(hf_config, "rope_theta", 10000.0) + rotary_base = get_hf_rope_theta(hf_config, 10000.0) residual_mlp = getattr(hf_config, "parallel_attn_mlp_res", False) disable_weight_only_quant_plugin = kwargs.pop( 'disable_weight_only_quant_plugin', False) diff --git a/tensorrt_llm/models/mllama/config.py b/tensorrt_llm/models/mllama/config.py index 5fb24f6fac7..cbd7f1b8f38 100644 --- a/tensorrt_llm/models/mllama/config.py +++ b/tensorrt_llm/models/mllama/config.py @@ -16,6 +16,7 @@ from pathlib import Path from typing import List, Optional, Union +from ..._utils import get_hf_rope_theta from ...functional import LayerNormPositionType, LayerNormType, MLPType from ...mapping import Mapping from ..convert_utils import infer_dtype @@ -166,7 +167,7 @@ def from_hugging_face( attn_bias = getattr(hf_text_config, 'bias', False) or getattr( hf_text_config, 'attention_bias', False) rotary_scaling = getattr(hf_text_config, "rope_scaling", None) - rotary_base = getattr(hf_text_config, "rope_theta", 10000.0) + rotary_base = get_hf_rope_theta(hf_text_config, 
10000.0) residual_mlp = getattr(hf_text_config, "parallel_attn_mlp_res", False) disable_weight_only_quant_plugin = kwargs.pop( 'disable_weight_only_quant_plugin', False) diff --git a/tensorrt_llm/models/nemotron_nas/config.py b/tensorrt_llm/models/nemotron_nas/config.py index 139b052c7bc..11d02df84b0 100644 --- a/tensorrt_llm/models/nemotron_nas/config.py +++ b/tensorrt_llm/models/nemotron_nas/config.py @@ -15,6 +15,7 @@ from dataclasses import asdict from typing import Any, Dict, List, Optional, Union +from tensorrt_llm._utils import get_hf_rope_theta from tensorrt_llm.functional import PositionEmbeddingType from tensorrt_llm.mapping import Mapping from tensorrt_llm.models.convert_utils import infer_dtype @@ -198,7 +199,7 @@ def from_hugging_face( num_key_value_heads=hf_config.num_key_value_heads, norm_epsilon=hf_config.rms_norm_eps, rotary_scaling=hf_config.rope_scaling, - rotary_base=hf_config.rope_theta, + rotary_base=get_hf_rope_theta(hf_config, 10000.0), vocab_size=hf_config.vocab_size, max_position_embeddings=hf_config.max_position_embeddings, mapping=mapping, diff --git a/tensorrt_llm/models/phi/config.py b/tensorrt_llm/models/phi/config.py index 3d38db0fa7b..583de15fadf 100644 --- a/tensorrt_llm/models/phi/config.py +++ b/tensorrt_llm/models/phi/config.py @@ -15,6 +15,7 @@ from typing import Optional, Union +from ..._utils import get_hf_rope_theta from ...mapping import Mapping from ..convert_utils import infer_dtype from ..modeling_utils import PretrainedConfig, QuantConfig @@ -64,7 +65,7 @@ def from_hugging_face( num_key_value_heads = getattr(hf_config, "num_key_value_heads", hf_config.num_attention_heads) rotary_scaling = getattr(hf_config, "rope_scaling", None) - rotary_base = getattr(hf_config, "rope_theta", 10000.0) + rotary_base = get_hf_rope_theta(hf_config, 10000.0) dtype = infer_dtype(dtype, getattr(hf_config, 'torch_dtype', None)) return cls(architecture=hf_config.architectures[0], diff --git a/tensorrt_llm/models/phi/convert.py 
b/tensorrt_llm/models/phi/convert.py index 0d1ec78bfd7..4bf3406c726 100644 --- a/tensorrt_llm/models/phi/convert.py +++ b/tensorrt_llm/models/phi/convert.py @@ -1,6 +1,6 @@ import torch -from ..._utils import pad_vocab_size, str_dtype_to_torch +from ..._utils import get_hf_rope_theta, pad_vocab_size, str_dtype_to_torch def split(v, tp_size, idx, dim=0): @@ -129,7 +129,7 @@ def convert_hf_config(hf_config, dtype, args): 'num_hidden_layers': hf_config.num_hidden_layers, 'num_attention_heads': hf_config.num_key_value_heads, 'rotary_pct': hf_config.partial_rotary_factor, - 'rope_theta': hf_config.rope_theta, + 'rope_theta': get_hf_rope_theta(hf_config, 10000.0), 'hidden_size': hf_config.hidden_size, 'intermediate_size': hf_config.intermediate_size, 'vocab_size': hf_config.vocab_size, diff --git a/tensorrt_llm/models/phi3/config.py b/tensorrt_llm/models/phi3/config.py index c824e921720..42d3954092e 100644 --- a/tensorrt_llm/models/phi3/config.py +++ b/tensorrt_llm/models/phi3/config.py @@ -15,6 +15,7 @@ from typing import Optional, Union +from ..._utils import get_hf_rope_theta from ...layers import MoeConfig from ...mapping import Mapping from ..convert_utils import infer_dtype @@ -96,7 +97,7 @@ def from_hugging_face( hf_config, "dense_attention_every_n_layers", None) kwargs['norm_epsilon'] = hf_config.layer_norm_epsilon else: - kwargs['rotary_base'] = hf_config.rope_theta + kwargs['rotary_base'] = get_hf_rope_theta(hf_config, 10000.0) kwargs['norm_epsilon'] = hf_config.rms_norm_eps moe_variant = hf_config.architectures[0] == "PhiMoEForCausalLM" if moe_variant: diff --git a/tensorrt_llm/models/qwen/config.py b/tensorrt_llm/models/qwen/config.py index e2c22909538..0f1bd34606b 100644 --- a/tensorrt_llm/models/qwen/config.py +++ b/tensorrt_llm/models/qwen/config.py @@ -14,6 +14,7 @@ # limitations under the License. 
from typing import Optional, Union +from ..._utils import get_hf_rope_theta from ...layers import MoeConfig from ...mapping import Mapping from ..convert_utils import infer_dtype @@ -138,7 +139,7 @@ def from_hugging_face(cls, rotary_base = getattr(hf_config, "rotary_emb_base", 10000.0) else: rms_norm_eps = hf_config.rms_norm_eps - rotary_base = getattr(hf_config, "rope_theta", 100000.0) + rotary_base = get_hf_rope_theta(hf_config, 100000.0) num_labels = 1 if hf_config.architectures[0] == "Qwen2ForSequenceClassification": diff --git a/tensorrt_llm/quantization/quantize_by_modelopt.py b/tensorrt_llm/quantization/quantize_by_modelopt.py index 302eb74533f..8c1aa57efa6 100755 --- a/tensorrt_llm/quantization/quantize_by_modelopt.py +++ b/tensorrt_llm/quantization/quantize_by_modelopt.py @@ -34,7 +34,7 @@ from transformers import (AutoConfig, AutoModelForCausalLM, AutoProcessor, AutoTokenizer) -from .._utils import release_gc, str_dtype_to_torch +from .._utils import get_hf_rope_theta, release_gc, str_dtype_to_torch from ..logger import logger from ..mapping import Mapping from .image_processing import MllamaImageProcessor @@ -888,7 +888,8 @@ def quantize_and_export(*, if qwen_config.model_type == "qwen2": tensorrt_llm_config[ "norm_epsilon"] = qwen_config.rms_norm_eps - tensorrt_llm_config["rotary_base"] = qwen_config.rope_theta + tensorrt_llm_config["rotary_base"] = get_hf_rope_theta( + qwen_config, 100000.0) tensorrt_llm_config[ "intermediate_size"] = qwen_config.intermediate_size with open(f"{export_path}/config.json", "w") as f: diff --git a/tensorrt_llm/runtime/multimodal_model_runner.py b/tensorrt_llm/runtime/multimodal_model_runner.py index 5a88cc9dd80..47d9bd66ad9 100644 --- a/tensorrt_llm/runtime/multimodal_model_runner.py +++ b/tensorrt_llm/runtime/multimodal_model_runner.py @@ -28,8 +28,8 @@ from .. 
import profiler from .._deprecation import emit_engine_arch_deprecation -from .._utils import (maybe_pin_memory, mpi_rank, prefer_pinned, - str_dtype_to_torch, str_dtype_to_trt, +from .._utils import (get_hf_rope_theta, maybe_pin_memory, mpi_rank, + prefer_pinned, str_dtype_to_torch, str_dtype_to_trt, supports_inflight_batching, torch_dtype_to_trt, trt_dtype_to_torch) from ..functional import RopeEmbeddingUtils, RotaryScalingType @@ -415,7 +415,7 @@ def __init__(self, args): self.max_position_embeddings = hf_config.max_position_embeddings self.hidden_size = hf_config.hidden_size self.num_attention_heads = hf_config.num_attention_heads - self.rope_theta = hf_config.rope_theta + self.rope_theta = get_hf_rope_theta(hf_config, 10000.0) if self.model_type == 'llava_onevision': self.num_frames = self.args.video_num_frames if self.num_frames is None: diff --git a/tests/unittest/trt/model/test_phi.py b/tests/unittest/trt/model/test_phi.py index 9db18f4e46e..b3cf8d28f2f 100644 --- a/tests/unittest/trt/model/test_phi.py +++ b/tests/unittest/trt/model/test_phi.py @@ -24,6 +24,7 @@ import tensorrt_llm from tensorrt_llm import Builder +from tensorrt_llm._utils import get_hf_rope_theta from tensorrt_llm.models.phi.convert import load_weights_from_hf_model from tensorrt_llm.network import net_guard from tensorrt_llm.plugin.plugin import ContextFMHAType @@ -61,7 +62,7 @@ def initialize_network(self, network: tensorrt_llm.Network, hf_model, 'num_attention_heads': hf_config.num_key_value_heads, 'rotary_pct': hf_config.partial_rotary_factor, 'position_embedding_type': 'rope_gpt_neox', - 'rope_theta': hf_config.rope_theta, + 'rope_theta': get_hf_rope_theta(hf_config, 10000.0), 'hidden_size': hf_config.hidden_size, 'intermediate_size': hf_config.intermediate_size, 'vocab_size': hf_config.vocab_size, diff --git a/triton_backend/all_models/multimodal/multimodal_encoders/1/multimodal_utils.py b/triton_backend/all_models/multimodal/multimodal_encoders/1/multimodal_utils.py index 
4caff0bbffc..85600193bfe 100644 --- a/triton_backend/all_models/multimodal/multimodal_encoders/1/multimodal_utils.py +++ b/triton_backend/all_models/multimodal/multimodal_encoders/1/multimodal_utils.py @@ -2,6 +2,8 @@ import torch +from tensorrt_llm._utils import get_hf_rope_theta + class LlavaOnevisionUtils: # https://github.com/huggingface/transformers/blob/main/src/transformers/models/llava_onevision/modeling_llava_onevision.py @@ -129,7 +131,7 @@ def __init__(self, config): self.max_position_embeddings = config.max_position_embeddings self.hidden_size = config.hidden_size self.num_attention_heads = config.num_attention_heads - self.rope_theta = config.rope_theta + self.rope_theta = get_hf_rope_theta(config, 10000.0) def get_rope_index( self, From 08074a935a3abaeede0bd214acdc021e5933d208 Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Thu, 9 Apr 2026 00:15:03 -0700 Subject: [PATCH 11/22] test: compat helpers for DynamicCache legacy API (Transformers v5) Transformers v5 removed DynamicCache.from_legacy_cache and to_legacy_cache. Add hf_dynamic_cache_compat helpers and switch TRT attention unit tests to use them so behavior matches v4 when the methods are absent. Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> Made-with: Cursor --- .../trt/attention/hf_dynamic_cache_compat.py | 56 +++++++++++++++++++ .../trt/attention/test_gpt_attention.py | 29 +++++----- .../trt/attention/test_gpt_attention_IFB.py | 8 ++- 3 files changed, 78 insertions(+), 15 deletions(-) create mode 100644 tests/unittest/trt/attention/hf_dynamic_cache_compat.py diff --git a/tests/unittest/trt/attention/hf_dynamic_cache_compat.py b/tests/unittest/trt/attention/hf_dynamic_cache_compat.py new file mode 100644 index 00000000000..d2a6005e8c3 --- /dev/null +++ b/tests/unittest/trt/attention/hf_dynamic_cache_compat.py @@ -0,0 +1,56 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""DynamicCache legacy tuple format for tests (removed from Transformers v5+).""" + +from __future__ import annotations + +from typing import List, Optional, Sequence, Tuple, Union + +import torch +from transformers.cache_utils import DynamicCache + +LegacyLayerKV = Tuple[torch.Tensor, torch.Tensor] +LegacyCache = Tuple[LegacyLayerKV, ...] + + +def dynamic_cache_from_legacy( + past_key_values: Optional[Union[LegacyCache, + Sequence[LegacyLayerKV]]]) -> DynamicCache: + """Match pre-v5 ``DynamicCache.from_legacy_cache`` (see transformers v4.48 ``cache_utils``).""" + if past_key_values is None: + return DynamicCache() + if hasattr(DynamicCache, "from_legacy_cache"): + return DynamicCache.from_legacy_cache(past_key_values) + cache = DynamicCache() + for layer_idx in range(len(past_key_values)): + key_states, value_states = past_key_values[layer_idx] + cache.update(key_states, value_states, layer_idx) + return cache + + +def dynamic_cache_to_legacy(cache: DynamicCache) -> LegacyCache: + """Match pre-v5 ``DynamicCache.to_legacy_cache``.""" + if hasattr(cache, "to_legacy_cache"): + return cache.to_legacy_cache() + layers: List[LegacyLayerKV] = [] + for layer in cache.layers: + if not getattr(layer, "is_initialized", False): + continue + keys = layer.keys + values = layer.values + if keys is None or values is None: + continue + layers.append((keys, values)) + return tuple(layers)
diff --git a/tests/unittest/trt/attention/test_gpt_attention.py b/tests/unittest/trt/attention/test_gpt_attention.py index 349bf6b752d..d44411b060c 100644 --- a/tests/unittest/trt/attention/test_gpt_attention.py +++ b/tests/unittest/trt/attention/test_gpt_attention.py @@ -31,6 +31,9 @@ from transformers.models.gptj.modeling_gptj import GPTJAttention from transformers.models.llama.modeling_llama import (LlamaAttention, LlamaRotaryEmbedding) + +from tests.unittest.trt.attention.hf_dynamic_cache_compat import ( + dynamic_cache_from_legacy, dynamic_cache_to_legacy) from utils.util import (getSMVersion, skip_bf16_fp32_accum, skip_blackwell_for_fmha_tests, skip_fp8_pre_ada, unittest_name_func) @@ -1236,13 +1239,13 @@ def verify_kv_cache(torch_present): attention_packed_mask = None if attention_type == 'gpt2_attention': # gpt2 uses DynamicCache - torch_present = DynamicCache.from_legacy_cache( + torch_present = dynamic_cache_from_legacy( torch_present) torch_output = attention(input_tensor, past_key_value=torch_present, use_cache=True, attention_mask=attention_mask)[0] - torch_present = torch_present.to_legacy_cache() + torch_present = dynamic_cache_to_legacy(torch_present) elif attention_type == 'llama_attention': position_embeddings = rotary_emb(input_tensor, position_ids) attention_mask = attention_mask + AttentionMaskConverter._make_causal_mask( @@ -1257,7 +1260,7 @@ def verify_kv_cache(torch_present): position_embeddings=position_embeddings, attention_mask=attention_mask, use_cache=True)[0] - torch_present = torch_present.to_legacy_cache() + torch_present = dynamic_cache_to_legacy(torch_present) elif attention_type == 'gptj_attention': torch_present = DynamicCache() torch_output = attention(input_tensor, @@ -1265,7 +1268,7 @@ def verify_kv_cache(torch_present): position_ids=position_ids, attention_mask=attention_mask, use_cache=True)[0] - torch_present = torch_present.to_legacy_cache() + torch_present = dynamic_cache_to_legacy(torch_present) elif attention_type == 
'gpt_bigcode_attention': attention_mask = _prepare_4d_attention_mask( ctx_attention_mask, @@ -1280,7 +1283,7 @@ def verify_kv_cache(torch_present): layer_past=torch_present, attention_mask=attention_mask, use_cache=True)[0] - torch_present = torch_present.to_legacy_cache() + torch_present = dynamic_cache_to_legacy(torch_present) else: raise RuntimeError("attention_type not properly set") @@ -1377,13 +1380,13 @@ def verify_kv_cache(torch_present): # torch execution if attention_type == 'gpt2_attention': # gpt2 uses DynamicCache - torch_present = DynamicCache.from_legacy_cache( + torch_present = dynamic_cache_from_legacy( torch_present) torch_output = attention(input_tensor, past_key_value=torch_present, use_cache=True, attention_mask=attention_mask)[0] - torch_present = torch_present.to_legacy_cache() + torch_present = dynamic_cache_to_legacy(torch_present) elif attention_type == 'llama_attention': position_embeddings = rotary_emb(input_tensor, position_ids) attention_mask = attention_mask + AttentionMaskConverter._make_causal_mask( @@ -1392,7 +1395,7 @@ def verify_kv_cache(torch_present): device='cuda', past_key_values_length=in_len + step - 1) # llama uses DynamicCache - torch_present = DynamicCache.from_legacy_cache( + torch_present = dynamic_cache_from_legacy( torch_present) torch_output = attention( input_tensor, @@ -1400,29 +1403,29 @@ def verify_kv_cache(torch_present): position_embeddings=position_embeddings, attention_mask=attention_mask, use_cache=True)[0] - torch_present = torch_present.to_legacy_cache() + torch_present = dynamic_cache_to_legacy(torch_present) elif attention_type == 'gptj_attention': - torch_present = DynamicCache.from_legacy_cache( + torch_present = dynamic_cache_from_legacy( torch_present) torch_output = attention(input_tensor, layer_past=torch_present, position_ids=position_ids, attention_mask=attention_mask, use_cache=True)[0] - torch_present = torch_present.to_legacy_cache() + torch_present = dynamic_cache_to_legacy(torch_present) 
elif attention_type == 'gpt_bigcode_attention': # target shape = (b, h, 1, s_key) key_seqlen = in_len + step # ctx_attention_mask.shape[1] attention_mask = (attention_mask >= 0).expand(batch_size, num_heads, 1, key_seqlen) - torch_present = DynamicCache.from_legacy_cache( + torch_present = dynamic_cache_from_legacy( torch_present) torch_output = attention(input_tensor, layer_past=torch_present, use_cache=True, attention_mask=attention_mask)[0] - torch_present = torch_present.to_legacy_cache() + torch_present = dynamic_cache_to_legacy(torch_present) def tile_beam_width(tensor: torch.Tensor, num_beams: int): if num_beams == 1: diff --git a/tests/unittest/trt/attention/test_gpt_attention_IFB.py b/tests/unittest/trt/attention/test_gpt_attention_IFB.py index cda9025a8b9..12a551ee9ea 100644 --- a/tests/unittest/trt/attention/test_gpt_attention_IFB.py +++ b/tests/unittest/trt/attention/test_gpt_attention_IFB.py @@ -35,6 +35,9 @@ from transformers.models.gptj.modeling_gptj import GPTJAttention from transformers.models.llama.modeling_llama import (LlamaAttention, LlamaRotaryEmbedding) + +from tests.unittest.trt.attention.hf_dynamic_cache_compat import ( + dynamic_cache_from_legacy, dynamic_cache_to_legacy) from utils.util import (skip_bf16_fp32_accum, skip_fp8_pre_ada, unittest_name_func) @@ -1010,7 +1013,7 @@ def torch_exec(step: int, (local_beam_width, input_length, hidden_size)) # llama/gpt2 uses DynamicCache - past_key_values = DynamicCache.from_legacy_cache( + past_key_values = dynamic_cache_from_legacy( torch_cache_list[req_idx]) torch_out, past_key_values = torch_exec( @@ -1018,7 +1021,8 @@ def torch_exec(step: int, past_key_values) # llama/gpt2 uses DynamicCache - torch_cache_list[req_idx] = past_key_values.to_legacy_cache() + torch_cache_list[req_idx] = dynamic_cache_to_legacy( + past_key_values) past_key_values = torch_cache_list[req_idx][0] if use_fp8_kv_cache or use_int8_kv_cache: From 44a2d53b157bc15d3f4bece67b507fee6b84731b Mon Sep 17 00:00:00 2001 From: Jonas 
Li <6110159+longlee0622@users.noreply.github.com> Date: Thu, 9 Apr 2026 00:19:09 -0700 Subject: [PATCH 12/22] fix(auto_deploy): patch BambaModel when _update_causal_mask is absent Transformers versions that removed BambaModel._update_causal_mask only expose _update_mamba_mask. Gate the causal-mask patch on hasattr so export patches apply cleanly on newer HF releases. Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> Made-with: Cursor --- .../_torch/auto_deploy/models/patches/bamba.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tensorrt_llm/_torch/auto_deploy/models/patches/bamba.py b/tensorrt_llm/_torch/auto_deploy/models/patches/bamba.py index 47d7eacd47a..61f5c309195 100644 --- a/tensorrt_llm/_torch/auto_deploy/models/patches/bamba.py +++ b/tensorrt_llm/_torch/auto_deploy/models/patches/bamba.py @@ -205,20 +205,27 @@ class BambaModelPatch(BaseExportPatch): def _apply_patch(self): self.original_values["BambaMixer.torch_forward"] = BambaMixer.torch_forward self.original_values["BambaModel._update_mamba_mask"] = BambaModel._update_mamba_mask - self.original_values["BambaModel._update_causal_mask"] = BambaModel._update_causal_mask + # Older transformers expose both; newer releases dropped `_update_causal_mask` on `BambaModel` + # (mask handling consolidated under `_update_mamba_mask`). + if hasattr(BambaModel, "_update_causal_mask"): + self.original_values["BambaModel._update_causal_mask"] = ( + BambaModel._update_causal_mask) # NOTE: there is `HybridMambaAttentionDynamicCache.__bool__` to save. 
# self.original_values["BambaPreTrainedModel._init_weights"] = BambaPreTrainedModel._init_weights BambaMixer.torch_forward = _bamba_mixer_torch_forward BambaModel._update_mamba_mask = _bamba_model_update_mamba_mask - BambaModel._update_causal_mask = _bamba_model_update_causal_mask + if hasattr(BambaModel, "_update_causal_mask"): + BambaModel._update_causal_mask = _bamba_model_update_causal_mask HybridMambaAttentionDynamicCache.__bool__ = _cache_bool # BambaPreTrainedModel._init_weights = _bamba_pretrained_model_init_weights def _revert_patch(self): BambaMixer.torch_forward = self.original_values["BambaMixer.torch_forward"] BambaModel._update_mamba_mask = self.original_values["BambaModel._update_mamba_mask"] - BambaModel._update_causal_mask = self.original_values["BambaModel._update_causal_mask"] + if "BambaModel._update_causal_mask" in self.original_values: + BambaModel._update_causal_mask = self.original_values[ + "BambaModel._update_causal_mask"] del HybridMambaAttentionDynamicCache.__bool__ # BambaPreTrainedModel._init_weights = self.original_values[ # "BambaPreTrainedModel._init_weights" From 5f2643b7b3c163d087ab399c92e5cf5cd238a06e Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Thu, 9 Apr 2026 13:38:14 +0000 Subject: [PATCH 13/22] fix: add SlidingWindowCache compatibility shim for Transformers v5 The Phi-4 multimodal model's custom modeling_phi4mm.py imports SlidingWindowCache from transformers.cache_utils, which was removed in transformers 5.3.0 (its functionality was merged into StaticCache). Inject a compatibility alias before executing the model's custom code so that the import succeeds on both old and new transformers versions. 
Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> --- tensorrt_llm/_torch/models/modeling_phi4mm.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tensorrt_llm/_torch/models/modeling_phi4mm.py b/tensorrt_llm/_torch/models/modeling_phi4mm.py index 268ef6ce5f5..08c7303bdbb 100644 --- a/tensorrt_llm/_torch/models/modeling_phi4mm.py +++ b/tensorrt_llm/_torch/models/modeling_phi4mm.py @@ -114,6 +114,12 @@ def _load_phi4mm_classes(local_path): spec = importlib.util.spec_from_file_location( f"{package_name}.hf_modeling_phi4mm", modeling_phi4mm_path) hf_modeling_phi4mm = importlib.util.module_from_spec(spec) + # Inject compatibility shims for classes removed in transformers 5.x. + # The model's custom modeling_phi4mm.py may import SlidingWindowCache + # which was removed in transformers 5.3.0 (merged into StaticCache). + _cache_utils = importlib.import_module("transformers.cache_utils") + if not hasattr(_cache_utils, "SlidingWindowCache"): + _cache_utils.SlidingWindowCache = _cache_utils.StaticCache spec.loader.exec_module(hf_modeling_phi4mm) Phi4MMAudioEmbedding = hf_modeling_phi4mm.Phi4MMAudioEmbedding Phi4MMImageEmbedding = hf_modeling_phi4mm.Phi4MMImageEmbedding From e95b30204a80c5760c823aa7e417a0de0ed80466 Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Thu, 9 Apr 2026 13:42:19 +0000 Subject: [PATCH 14/22] test: update test_gpt_attention rope config for Transformers v5 In transformers 5.x, rope_theta and rope_scaling are unified into the rope_parameters dict. Setting rope_theta directly on a config object after construction no longer populates rope_parameters, causing LlamaRotaryEmbedding to fail with a NoneType error. Build the rope_parameters dict explicitly and read from it when extracting rope_base/rope_scale_type/rope_scale for the test. 
Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> --- .../trt/attention/test_gpt_attention.py | 47 +++++++++++-------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/tests/unittest/trt/attention/test_gpt_attention.py b/tests/unittest/trt/attention/test_gpt_attention.py index d44411b060c..328ce022118 100644 --- a/tests/unittest/trt/attention/test_gpt_attention.py +++ b/tests/unittest/trt/attention/test_gpt_attention.py @@ -31,9 +31,6 @@ from transformers.models.gptj.modeling_gptj import GPTJAttention from transformers.models.llama.modeling_llama import (LlamaAttention, LlamaRotaryEmbedding) - -from tests.unittest.trt.attention.hf_dynamic_cache_compat import ( - dynamic_cache_from_legacy, dynamic_cache_to_legacy) from utils.util import (getSMVersion, skip_bf16_fp32_accum, skip_blackwell_for_fmha_tests, skip_fp8_pre_ada, unittest_name_func) @@ -51,6 +48,8 @@ MemoryPoolsAllocator from tensorrt_llm.runtime.memory_pools.pools_kv_cache_manager import \ PoolsKVCacheManager +from tests.unittest.trt.attention.hf_dynamic_cache_compat import ( + dynamic_cache_from_legacy, dynamic_cache_to_legacy) class TestFunctional(unittest.TestCase): @@ -633,13 +632,19 @@ def _construct_execution( rope_scale_type = RotaryScalingType.none rope_scale = 1.0 if attention_type == "llama_attention": - rope_base = configuration.rope_theta - if configuration.rope_scaling is not None: + rope_params = getattr(configuration, 'rope_parameters', + None) or {} + rope_base = rope_params.get( + 'rope_theta', + getattr(configuration, 'rope_theta', 10000.0)) + rope_type = rope_params.get( + 'rope_type', rope_params.get('type', 'default')) + if rope_type not in ('default', None): rope_scale_type = { "linear": RotaryScalingType.linear, "dynamic": RotaryScalingType.dynamic - }[configuration.rope_scaling["type"]] - rope_scale = configuration.rope_scaling["factor"] + }[rope_type] + rope_scale = rope_params.get("factor", 1.0) rotary_inv_freq, embed_positions_for_gpt_attention = 
RopeEmbeddingUtils.create_sinusoidal_positions_for_attention_plugin( configuration.max_position_embeddings, rotary_embedding_dim, rope_base, rope_scale) @@ -894,8 +899,17 @@ def _construct_execution( if attention_type == 'llama_attention': configuration.num_key_value_heads = num_kv_heads - configuration.rope_theta = rope_base - configuration.rope_scaling = rope_scaling + # In transformers 5.x, rope_theta/rope_scaling are unified into + # rope_parameters. Build the dict so LlamaRotaryEmbedding works. + if rope_scaling is not None: + rope_params = {**rope_scaling, "rope_theta": rope_base} + else: + rope_params = { + "rope_type": "default", + "rope_theta": rope_base, + } + configuration.rope_parameters = rope_params + configuration.rope_scaling = rope_params if rope_scaling is not None: # scaling is typically used for supporting longer seq lens than max_position_embeddings # so we set the max_position_embeddings to be smaller than total seq len @@ -1239,8 +1253,7 @@ def verify_kv_cache(torch_present): attention_packed_mask = None if attention_type == 'gpt2_attention': # gpt2 uses DynamicCache - torch_present = dynamic_cache_from_legacy( - torch_present) + torch_present = dynamic_cache_from_legacy(torch_present) torch_output = attention(input_tensor, past_key_value=torch_present, use_cache=True, @@ -1380,8 +1393,7 @@ def verify_kv_cache(torch_present): # torch execution if attention_type == 'gpt2_attention': # gpt2 uses DynamicCache - torch_present = dynamic_cache_from_legacy( - torch_present) + torch_present = dynamic_cache_from_legacy(torch_present) torch_output = attention(input_tensor, past_key_value=torch_present, use_cache=True, @@ -1395,8 +1407,7 @@ def verify_kv_cache(torch_present): device='cuda', past_key_values_length=in_len + step - 1) # llama uses DynamicCache - torch_present = dynamic_cache_from_legacy( - torch_present) + torch_present = dynamic_cache_from_legacy(torch_present) torch_output = attention( input_tensor, past_key_value=torch_present, @@ 
-1405,8 +1416,7 @@ def verify_kv_cache(torch_present): use_cache=True)[0] torch_present = dynamic_cache_to_legacy(torch_present) elif attention_type == 'gptj_attention': - torch_present = dynamic_cache_from_legacy( - torch_present) + torch_present = dynamic_cache_from_legacy(torch_present) torch_output = attention(input_tensor, layer_past=torch_present, position_ids=position_ids, @@ -1419,8 +1429,7 @@ def verify_kv_cache(torch_present): attention_mask = (attention_mask >= 0).expand(batch_size, num_heads, 1, key_seqlen) - torch_present = dynamic_cache_from_legacy( - torch_present) + torch_present = dynamic_cache_from_legacy(torch_present) torch_output = attention(input_tensor, layer_past=torch_present, use_cache=True, From dcdbb3465cadabeaffe044eba6666d95bb68b1a5 Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Thu, 9 Apr 2026 13:43:58 +0000 Subject: [PATCH 15/22] fix: map rope_type "default" to rope_gpt_neox in PositionEmbeddingType Transformers 5.x unified rope_theta/rope_scaling into rope_parameters, which always contains a "rope_type" field. Standard RoPE (no scaling) now uses rope_type="default" instead of rope_scaling=None. Since many model files check `rope_scaling is not None` and then pass rope_type to PositionEmbeddingType.from_string(), this centralized mapping avoids updating every model file individually. Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> --- tensorrt_llm/functional.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorrt_llm/functional.py b/tensorrt_llm/functional.py index 694aab45f2d..df80459359a 100755 --- a/tensorrt_llm/functional.py +++ b/tensorrt_llm/functional.py @@ -730,6 +730,9 @@ def __str__(self): @staticmethod def from_string(s): + # Transformers 5.x uses "default" for standard RoPE (no scaling). 
+ if s == "default": + return PositionEmbeddingType.rope_gpt_neox try: return PositionEmbeddingType[s] except KeyError: From 2bc4ca9f84ce55a2f745504073d07e59e47e11fe Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Thu, 9 Apr 2026 13:57:10 +0000 Subject: [PATCH 16/22] test: fix test_gpt_attention_IFB for Transformers v5 Three issues fixed: 1. past_key_value (singular) renamed to past_key_values (plural) for LlamaAttention and GPT2Attention in transformers 5.x. 2. use_cache parameter removed from attention forward calls in transformers 5.x (cache is always updated in-place). 3. rope_theta/rope_scaling config attributes replaced with unified rope_parameters dict (same fix as test_gpt_attention.py). Without fix #1-2, the DynamicCache was never populated because the kwarg was silently ignored, causing an empty tuple from dynamic_cache_to_legacy and an IndexError at line 1026. Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> --- .../trt/attention/test_gpt_attention_IFB.py | 41 ++++++++++++------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/tests/unittest/trt/attention/test_gpt_attention_IFB.py b/tests/unittest/trt/attention/test_gpt_attention_IFB.py index 12a551ee9ea..cacad8c35aa 100644 --- a/tests/unittest/trt/attention/test_gpt_attention_IFB.py +++ b/tests/unittest/trt/attention/test_gpt_attention_IFB.py @@ -26,7 +26,6 @@ from parameterized import parameterized from transformers import GPT2Config, GPTBigCodeConfig, GPTJConfig, LlamaConfig -from transformers.cache_utils import DynamicCache from transformers.modeling_attn_mask_utils import (AttentionMaskConverter, _prepare_4d_attention_mask) from transformers.models.gpt2.modeling_gpt2 import GPT2Attention @@ -35,9 +34,6 @@ from transformers.models.gptj.modeling_gptj import GPTJAttention from transformers.models.llama.modeling_llama import (LlamaAttention, LlamaRotaryEmbedding) - -from tests.unittest.trt.attention.hf_dynamic_cache_compat 
import ( - dynamic_cache_from_legacy, dynamic_cache_to_legacy) from utils.util import (skip_bf16_fp32_accum, skip_fp8_pre_ada, unittest_name_func) @@ -54,6 +50,8 @@ MemoryPoolsAllocator from tensorrt_llm.runtime.memory_pools.pools_kv_cache_manager import \ PoolsKVCacheManager +from tests.unittest.trt.attention.hf_dynamic_cache_compat import ( + dynamic_cache_from_legacy, dynamic_cache_to_legacy) class TestFunctional(unittest.TestCase): @@ -384,13 +382,19 @@ def _construct_execution(session, rope_scale_type = RotaryScalingType.none rope_scale = 1.0 if attention_type == "llama_attention": - rope_base = configuration.rope_theta - if configuration.rope_scaling is not None: + rope_params = getattr(configuration, 'rope_parameters', + None) or {} + rope_base = rope_params.get( + 'rope_theta', + getattr(configuration, 'rope_theta', 10000.0)) + rope_type = rope_params.get( + 'rope_type', rope_params.get('type', 'default')) + if rope_type not in ('default', None): rope_scale_type = { "linear": RotaryScalingType.linear, "dynamic": RotaryScalingType.dynamic - }[configuration.rope_scaling["type"]] - rope_scale = configuration.rope_scaling["factor"] + }[rope_type] + rope_scale = rope_params.get("factor", 1.0) rotary_inv_freq, embed_positions_for_gpt_attention = RopeEmbeddingUtils.create_sinusoidal_positions_for_attention_plugin( configuration.max_position_embeddings, rotary_embedding_dim, rope_base, rope_scale) @@ -582,8 +586,17 @@ def _construct_execution(session, attn_implementation='eager') if attention_type == 'llama_attention': configuration.num_key_value_heads = num_kv_heads - configuration.rope_theta = rope_base - configuration.rope_scaling = rope_scaling + # In transformers 5.x, rope_theta/rope_scaling are unified into + # rope_parameters. Build the dict so LlamaRotaryEmbedding works. 
+ if rope_scaling is not None: + rope_params = {**rope_scaling, "rope_theta": rope_base} + else: + rope_params = { + "rope_type": "default", + "rope_theta": rope_base, + } + configuration.rope_parameters = rope_params + configuration.rope_scaling = rope_params if rope_scaling is not None: # scaling is typically used for supporting longer seq lens than max_position_embeddings # so we set the max_position_embeddings to be smaller than total seq len @@ -763,8 +776,7 @@ def torch_exec(step: int, tgt_len=(in_len if step == 0 else 1)) if attention_type == 'gpt2_attention': torch_output = attention(input, - past_key_value=layer_past, - use_cache=True, + past_key_values=layer_past, attention_mask=attention_mask)[0] torch_present = layer_past elif attention_type == 'llama_attention': @@ -777,10 +789,9 @@ def torch_exec(step: int, 1)) torch_output = attention( input, - past_key_value=layer_past, + past_key_values=layer_past, position_embeddings=position_embeddings, - attention_mask=attention_mask, - use_cache=True)[0] + attention_mask=attention_mask)[0] torch_present = layer_past elif attention_type == 'gptj_attention': torch_output, torch_present = attention( From 5a2a0be75c1464e286c07af8e7c3ccf51173f15e Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Thu, 9 Apr 2026 13:58:28 +0000 Subject: [PATCH 17/22] fix: use getattr for pad_token_id in MllamaConfig for Transformers v5 In transformers 5.x, pad_token_id was removed from the top-level MllamaConfig and moved into text_config. Use getattr with fallback to text_config.pad_token_id to support both old and new versions. 
Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> --- tensorrt_llm/_torch/models/modeling_mllama.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorrt_llm/_torch/models/modeling_mllama.py b/tensorrt_llm/_torch/models/modeling_mllama.py index 16ec672539a..21a5fc447f4 100644 --- a/tensorrt_llm/_torch/models/modeling_mllama.py +++ b/tensorrt_llm/_torch/models/modeling_mllama.py @@ -274,8 +274,10 @@ def __init__( self.hidden_size = pretrained_config.text_config.hidden_size self.max_num_tiles = pretrained_config.vision_config.max_num_tiles self.vision_output_dim = pretrained_config.vision_config.vision_output_dim - self.pad_token_id = (pretrained_config.pad_token_id if - pretrained_config.pad_token_id is not None else -1) + self.pad_token_id = getattr(pretrained_config, 'pad_token_id', None) + if self.pad_token_id is None: + self.pad_token_id = getattr(pretrained_config.text_config, + 'pad_token_id', -1) or -1 self.image_size = pretrained_config.vision_config.image_size # hack config From 50c991bef48d2187b59125e94a4fb4f0386cdf2d Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Thu, 9 Apr 2026 14:06:06 +0000 Subject: [PATCH 18/22] style: apply pre-commit formatting fixes Fix isort import ordering and yapf formatting issues flagged by pre-commit hooks on prior Transformers v5 compatibility commits. 
Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> --- tensorrt_llm/_torch/auto_deploy/models/patches/bamba.py | 6 ++---- tests/unittest/_torch/modeling/test_modeling_cohere2.py | 2 +- tests/unittest/_torch/modeling/test_modeling_exaone4.py | 2 +- tests/unittest/_torch/modeling/test_modeling_gemma3.py | 2 +- tests/unittest/trt/attention/hf_dynamic_cache_compat.py | 4 ++-- 5 files changed, 7 insertions(+), 9 deletions(-) diff --git a/tensorrt_llm/_torch/auto_deploy/models/patches/bamba.py b/tensorrt_llm/_torch/auto_deploy/models/patches/bamba.py index 61f5c309195..779622a2a0b 100644 --- a/tensorrt_llm/_torch/auto_deploy/models/patches/bamba.py +++ b/tensorrt_llm/_torch/auto_deploy/models/patches/bamba.py @@ -208,8 +208,7 @@ def _apply_patch(self): # Older transformers expose both; newer releases dropped `_update_causal_mask` on `BambaModel` # (mask handling consolidated under `_update_mamba_mask`). if hasattr(BambaModel, "_update_causal_mask"): - self.original_values["BambaModel._update_causal_mask"] = ( - BambaModel._update_causal_mask) + self.original_values["BambaModel._update_causal_mask"] = BambaModel._update_causal_mask # NOTE: there is `HybridMambaAttentionDynamicCache.__bool__` to save. 
# self.original_values["BambaPreTrainedModel._init_weights"] = BambaPreTrainedModel._init_weights @@ -224,8 +223,7 @@ def _revert_patch(self): BambaMixer.torch_forward = self.original_values["BambaMixer.torch_forward"] BambaModel._update_mamba_mask = self.original_values["BambaModel._update_mamba_mask"] if "BambaModel._update_causal_mask" in self.original_values: - BambaModel._update_causal_mask = self.original_values[ - "BambaModel._update_causal_mask"] + BambaModel._update_causal_mask = self.original_values["BambaModel._update_causal_mask"] del HybridMambaAttentionDynamicCache.__bool__ # BambaPreTrainedModel._init_weights = self.original_values[ # "BambaPreTrainedModel._init_weights" diff --git a/tests/unittest/_torch/modeling/test_modeling_cohere2.py b/tests/unittest/_torch/modeling/test_modeling_cohere2.py index 783c0939251..1a839cacd6c 100644 --- a/tests/unittest/_torch/modeling/test_modeling_cohere2.py +++ b/tests/unittest/_torch/modeling/test_modeling_cohere2.py @@ -1,11 +1,11 @@ from copy import deepcopy import torch +from _torch.helpers import make_hf_hybrid_cache_for_tests from transformers import Cohere2Config from transformers import Cohere2ForCausalLM as HFCohere2ForCausalLM import tensorrt_llm -from _torch.helpers import make_hf_hybrid_cache_for_tests from tensorrt_llm._torch.attention_backend.utils import get_attention_backend from tensorrt_llm._torch.metadata import KVCacheParams from tensorrt_llm._torch.model_config import ModelConfig diff --git a/tests/unittest/_torch/modeling/test_modeling_exaone4.py b/tests/unittest/_torch/modeling/test_modeling_exaone4.py index a9f1517a30f..7ea88c93c57 100644 --- a/tests/unittest/_torch/modeling/test_modeling_exaone4.py +++ b/tests/unittest/_torch/modeling/test_modeling_exaone4.py @@ -26,7 +26,7 @@ class Exaone4Config(PretrainedConfig): SKIP_EXAONE4_HF_ACCURACY_TEST = True from _torch.helpers import (create_mock_cuda_graph_runner, - make_hf_hybrid_cache_for_tests) + make_hf_hybrid_cache_for_tests) from 
utils.util import getSMVersion import tensorrt_llm diff --git a/tests/unittest/_torch/modeling/test_modeling_gemma3.py b/tests/unittest/_torch/modeling/test_modeling_gemma3.py index 7d164b7a6d2..f252f4cbd2d 100644 --- a/tests/unittest/_torch/modeling/test_modeling_gemma3.py +++ b/tests/unittest/_torch/modeling/test_modeling_gemma3.py @@ -3,13 +3,13 @@ from dataclasses import dataclass import torch +from _torch.helpers import make_hf_hybrid_cache_for_tests from parameterized import parameterized from transformers import Gemma3Config from transformers import Gemma3ForCausalLM as HFGemma3ForCausalLM from transformers import Gemma3TextConfig import tensorrt_llm -from _torch.helpers import make_hf_hybrid_cache_for_tests from tensorrt_llm._torch.attention_backend import (AttentionMetadata, FlashInferAttentionMetadata) from tensorrt_llm._torch.attention_backend.utils import get_attention_backend diff --git a/tests/unittest/trt/attention/hf_dynamic_cache_compat.py b/tests/unittest/trt/attention/hf_dynamic_cache_compat.py index d2a6005e8c3..6866feb8ba6 100644 --- a/tests/unittest/trt/attention/hf_dynamic_cache_compat.py +++ b/tests/unittest/trt/attention/hf_dynamic_cache_compat.py @@ -26,8 +26,8 @@ def dynamic_cache_from_legacy( - past_key_values: Optional[Union[LegacyCache, - Sequence[LegacyLayerKV]]]) -> DynamicCache: + past_key_values: Optional[Union[LegacyCache, Sequence[LegacyLayerKV]]], +) -> DynamicCache: """Match pre-v5 ``DynamicCache.from_legacy_cache`` (see transformers v4.48 ``cache_utils``).""" if past_key_values is None: return DynamicCache() From d0b445ca768f45fe436b8d89b10ececeb06ca2d7 Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Fri, 10 Apr 2026 03:35:52 +0000 Subject: [PATCH 19/22] fix: prevent duplicate 'disable' kwarg in DisabledTqdm Newer huggingface_hub versions pass 'disable' explicitly to tqdm_class.__init__() via snapshot_download. 
Using **kwargs with an additional disable=True keyword caused a TypeError ("multiple values for keyword argument 'disable'"). Set disable in kwargs dict before forwarding to super().__init__() so that any caller-provided value is overridden rather than duplicated. Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> --- tensorrt_llm/llmapi/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorrt_llm/llmapi/utils.py b/tensorrt_llm/llmapi/utils.py index 569a4406bba..6765617a180 100644 --- a/tensorrt_llm/llmapi/utils.py +++ b/tensorrt_llm/llmapi/utils.py @@ -231,7 +231,8 @@ def get_file_lock(model_name: str, class DisabledTqdm(tqdm): def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs, disable=True) + kwargs["disable"] = True + super().__init__(*args, **kwargs) def download_hf_model(model: str, revision: Optional[str] = None) -> Path: From af7fe585d9463ab4c6166c5c51164d3e0cfd8c2a Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Fri, 10 Apr 2026 03:39:15 +0000 Subject: [PATCH 20/22] fix: handle rope_scaling key changes in Qwen models for Transformers v5 In transformers 5.x, config.rope_scaling is always a dict (never None) and uses "rope_type" key instead of "type". The dict also contains rope_type="default" for standard RoPE (no scaling). 
Update QwenAttention and QwenMoeAttention to: - Look up both "type" and "rope_type" keys with fallback - Treat rope_type="default" the same as no scaling (use rope_gpt_neox) - Fix QwenDecoderLayer's yarn detection to use dict.get() instead of getattr() on a dict object Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> --- tensorrt_llm/_torch/models/modeling_qwen.py | 20 +++++++++++++------ .../_torch/models/modeling_qwen_moe.py | 9 ++++++--- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/tensorrt_llm/_torch/models/modeling_qwen.py b/tensorrt_llm/_torch/models/modeling_qwen.py index df6d83e5b75..1ec1323019c 100644 --- a/tensorrt_llm/_torch/models/modeling_qwen.py +++ b/tensorrt_llm/_torch/models/modeling_qwen.py @@ -29,12 +29,17 @@ def __init__( layer_idx: Optional[int] = None, ): config = model_config.pretrained_config - if getattr(config, "rope_scaling", None) is not None: + rope_scaling = getattr(config, "rope_scaling", None) + # In transformers 5.x, rope_scaling is always a dict (never None) + # and uses "rope_type" key instead of "type". 
+ rope_type = None + if rope_scaling is not None: + rope_type = rope_scaling.get("type", rope_scaling.get("rope_type")) + if rope_type is not None and rope_type != "default": pos_embd_params = PositionalEmbeddingParams( - type=PositionEmbeddingType.from_string( - config.rope_scaling["type"]), + type=PositionEmbeddingType.from_string(rope_type), rope=RopeParams.from_config(config), - mrope_section=config.rope_scaling.get('mrope_section', None)) + mrope_section=rope_scaling.get('mrope_section', None)) else: pos_embd_params = PositionalEmbeddingParams( type=PositionEmbeddingType.rope_gpt_neox, @@ -116,8 +121,11 @@ def __init__( self.layer_idx = layer_idx config = model_config.pretrained_config - if getattr(config, "rope_scaling", None) is not None and getattr( - config.rope_scaling, "rope_type", None) == "yarn": + rope_scaling = getattr(config, "rope_scaling", None) + rope_type = rope_scaling.get("rope_type", + rope_scaling.get("type")) \ + if isinstance(rope_scaling, dict) else None + if rope_type == "yarn": self.self_attn = QwenYarnAttention( model_config, layer_idx=layer_idx, diff --git a/tensorrt_llm/_torch/models/modeling_qwen_moe.py b/tensorrt_llm/_torch/models/modeling_qwen_moe.py index d19c2602ce9..dda335a962a 100644 --- a/tensorrt_llm/_torch/models/modeling_qwen_moe.py +++ b/tensorrt_llm/_torch/models/modeling_qwen_moe.py @@ -114,10 +114,13 @@ def __init__( layer_idx: Optional[int] = None, ): config = model_config.pretrained_config - if getattr(config, "rope_scaling", None) is not None: + rope_scaling = getattr(config, "rope_scaling", None) + rope_type = None + if rope_scaling is not None: + rope_type = rope_scaling.get("type", rope_scaling.get("rope_type")) + if rope_type is not None and rope_type != "default": pos_embd_params = PositionalEmbeddingParams( - type=PositionEmbeddingType.from_string( - config.rope_scaling["type"]), + type=PositionEmbeddingType.from_string(rope_type), rope=RopeParams.from_config(config), ) else: From 
2153a7357e7dcfcd43364c0f00e95372ba3955e1 Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Fri, 10 Apr 2026 03:40:59 +0000 Subject: [PATCH 21/22] fix: use getattr for tie_word_embeddings for Transformers v5 In transformers 5.x, tie_word_embeddings is no longer a default attribute on all config classes (e.g. CLIPVisionConfig). Use getattr with a False default in generic code paths that may receive any config type (modeling_utils, weight mappers). Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> --- .../_torch/models/checkpoints/hf/gemma3_weight_mapper.py | 4 ++-- tensorrt_llm/_torch/models/checkpoints/hf/weight_mapper.py | 4 ++-- tensorrt_llm/_torch/models/modeling_utils.py | 5 +++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/tensorrt_llm/_torch/models/checkpoints/hf/gemma3_weight_mapper.py b/tensorrt_llm/_torch/models/checkpoints/hf/gemma3_weight_mapper.py index 8382588dc24..c03b8a7b73c 100644 --- a/tensorrt_llm/_torch/models/checkpoints/hf/gemma3_weight_mapper.py +++ b/tensorrt_llm/_torch/models/checkpoints/hf/gemma3_weight_mapper.py @@ -10,8 +10,8 @@ class Gemma3HfWeightMapper(HfWeightMapper): def should_skip_module(self, module_name: str) -> bool: - if self.model.config.tie_word_embeddings and module_name.startswith( - "lm_head"): + if getattr(self.model.config, 'tie_word_embeddings', + False) and module_name.startswith("lm_head"): return True # Skip loading weights for embedding and lm_head if LoRA is enabled and has custom values diff --git a/tensorrt_llm/_torch/models/checkpoints/hf/weight_mapper.py b/tensorrt_llm/_torch/models/checkpoints/hf/weight_mapper.py index cb1d8671a80..94a1eb986d5 100644 --- a/tensorrt_llm/_torch/models/checkpoints/hf/weight_mapper.py +++ b/tensorrt_llm/_torch/models/checkpoints/hf/weight_mapper.py @@ -37,8 +37,8 @@ def apply_callbacks(self, module: nn.Module, module_name: str, return module_weights def should_skip_module(self, module_name: str) -> 
bool: - if self.model.config.tie_word_embeddings and module_name.startswith( - "lm_head"): + if getattr(self.model.config, 'tie_word_embeddings', + False) and module_name.startswith("lm_head"): return True # Skip loading weights for embedding and lm_head if LoRA is enabled and has custom values diff --git a/tensorrt_llm/_torch/models/modeling_utils.py b/tensorrt_llm/_torch/models/modeling_utils.py index e285c32ec29..5e9e618bf10 100755 --- a/tensorrt_llm/_torch/models/modeling_utils.py +++ b/tensorrt_llm/_torch/models/modeling_utils.py @@ -401,7 +401,7 @@ def __init__(self, model: TModel, *, config: ModelConfig[TConfig], self.lm_head.weight.data.copy_(x) # use embedding weights in lm_head if tie word embedding is enabled - if config.pretrained_config.tie_word_embeddings: + if getattr(config.pretrained_config, 'tie_word_embeddings', False): assert self.lm_head.tp_size == self.model.embed_tokens.tp_size, ( "lm_head and vocab embedding should use the same TP size") assert self.lm_head.tp_mode == self.model.embed_tokens.tp_mode, ( @@ -896,7 +896,8 @@ def load_single_module(name, module): return # skip load weights if tie word embeddings is enabled and layer is lm_head - if model.config.tie_word_embeddings and name.startswith("lm_head"): + if getattr(model.config, 'tie_word_embeddings', + False) and name.startswith("lm_head"): return # Skip loading weights for embedding and lm_head if LoRA is enabled and has custom values From 8ee65631c3e45b2f50610f9d3fd1cf383a8ce6db Mon Sep 17 00:00:00 2001 From: Jonas Li <6110159+longlee0622@users.noreply.github.com> Date: Fri, 10 Apr 2026 03:44:49 +0000 Subject: [PATCH 22/22] fix: support per-layer-type RoPE config (Gemma3) for Transformers v5 In transformers 5.x, Gemma3's rope_parameters is a nested dict keyed by attention layer type (full_attention, sliding_attention) instead of a flat dict. Also, rope_local_base_freq was removed and its value moved into rope_parameters["sliding_attention"]["rope_theta"]. 
Changes: - RopeParams.from_config: flatten per-layer-type rope_parameters by picking "full_attention" as the default, instead of asserting. - Gemma3Attention: fall back to rope_parameters["sliding_attention"] when rope_local_base_freq is absent. Signed-off-by: Jonas Li <6110159+longlee0622@users.noreply.github.com> --- tensorrt_llm/_torch/attention_backend/interface.py | 14 ++++++++++---- tensorrt_llm/_torch/models/modeling_gemma3.py | 9 ++++++++- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/tensorrt_llm/_torch/attention_backend/interface.py b/tensorrt_llm/_torch/attention_backend/interface.py index f86b550482f..2ea9d4261d1 100644 --- a/tensorrt_llm/_torch/attention_backend/interface.py +++ b/tensorrt_llm/_torch/attention_backend/interface.py @@ -485,10 +485,16 @@ def from_config(config) -> "RopeParams": hf_rope_parameters = getattr(config, 'rope_parameters', None) if hf_rope_parameters is not None: - assert not set(hf_rope_parameters.keys()).issubset( - ALLOWED_ATTENTION_LAYER_TYPES), ( - "Per-layer-type RoPE configuration is not supported yet.") - config.update(hf_rope_parameters) + if set(hf_rope_parameters.keys()).issubset( + ALLOWED_ATTENTION_LAYER_TYPES): + # Per-layer-type RoPE config (e.g. Gemma3 in transformers 5.x). + # Pick "full_attention" as the default; callers override theta + # for sliding-window layers independently. + flat = hf_rope_parameters.get( + "full_attention", next(iter(hf_rope_parameters.values()))) + config.update(flat) + else: + config.update(hf_rope_parameters) # get rotary parameters. 
hidden_size = config.hidden_size diff --git a/tensorrt_llm/_torch/models/modeling_gemma3.py b/tensorrt_llm/_torch/models/modeling_gemma3.py index 24ba665afbf..3612136e7b1 100644 --- a/tensorrt_llm/_torch/models/modeling_gemma3.py +++ b/tensorrt_llm/_torch/models/modeling_gemma3.py @@ -65,7 +65,14 @@ def __init__( rope_params = RopeParams.from_config(config) self.attention_window_size = None if is_sliding: - rope_params.theta = config.rope_local_base_freq + # transformers 5.x moved rope_local_base_freq into + # rope_parameters["sliding_attention"]["rope_theta"] + local_freq = getattr(config, 'rope_local_base_freq', None) + if local_freq is None: + rp = getattr(config, 'rope_parameters', {}) + local_freq = rp.get('sliding_attention', + {}).get('rope_theta', 10000.0) + rope_params.theta = local_freq rope_params.scale_type = RotaryScalingType.none rope_params.scale = 1.0 self.attention_window_size = config.sliding_window