diff --git a/examples/eagle/convert_checkpoint.py b/examples/eagle/convert_checkpoint.py index 217144e1ae5..130faee4453 100644 --- a/examples/eagle/convert_checkpoint.py +++ b/examples/eagle/convert_checkpoint.py @@ -9,6 +9,7 @@ import tensorrt_llm from tensorrt_llm._deprecation import emit_engine_arch_deprecation +from tensorrt_llm._utils import get_hf_rope_theta from tensorrt_llm.mapping import Mapping from tensorrt_llm.models.eagle.config import EagleConfig from tensorrt_llm.models.eagle.model import EagleForCausalLM @@ -293,7 +294,7 @@ def copy(tensors): args.rms_norm_eps = hf_config.rms_norm_eps args.vocab_size = hf_config.vocab_size args.rotary_scaling = hf_config.rope_scaling - args.rotary_base = hf_config.rope_theta + args.rotary_base = get_hf_rope_theta(hf_config, 10000.0) args.n_positions = hf_config.max_position_embeddings args.dtype = str( hf_config.torch_dtype)[6:] if args.dtype == 'auto' else args.dtype diff --git a/examples/medusa/convert_checkpoint.py b/examples/medusa/convert_checkpoint.py index 48dcc6fd400..09eb55b4610 100644 --- a/examples/medusa/convert_checkpoint.py +++ b/examples/medusa/convert_checkpoint.py @@ -13,7 +13,7 @@ import tensorrt_llm from tensorrt_llm._deprecation import emit_engine_arch_deprecation -from tensorrt_llm._utils import numpy_to_torch +from tensorrt_llm._utils import get_hf_rope_theta, numpy_to_torch from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping from tensorrt_llm.models import (LLaMAForCausalLM, PretrainedConfig, @@ -209,7 +209,7 @@ def main(): args.rms_norm_eps = hf_config.rms_norm_eps args.vocab_size = hf_config.vocab_size args.n_positions = hf_config.max_position_embeddings - args.rotary_base = hf_config.rope_theta + args.rotary_base = get_hf_rope_theta(hf_config, 10000.0) args.rotary_scaling = hf_config.rope_scaling elif args.meta_ckpt_dir is not None: diff --git a/examples/models/contrib/dbrx/convert_checkpoint.py b/examples/models/contrib/dbrx/convert_checkpoint.py index 
ad487a50c76..1ca287f2588 100644 --- a/examples/models/contrib/dbrx/convert_checkpoint.py +++ b/examples/models/contrib/dbrx/convert_checkpoint.py @@ -18,7 +18,7 @@ import tensorrt_llm from tensorrt_llm._deprecation import emit_engine_arch_deprecation -from tensorrt_llm._utils import release_gc +from tensorrt_llm._utils import get_hf_rope_theta, release_gc from tensorrt_llm.layers import MoeConfig from tensorrt_llm.mapping import Mapping from tensorrt_llm.models.convert_utils import (generate_int8, @@ -557,7 +557,7 @@ def execute(workers, func, hf_model): args.moe_top_k = 1 args.clip_qkv = hf_config.attn_config.clip_qkv args.hidden_act = 'swiglu' - args.rotary_base = hf_config.attn_config.rope_theta + args.rotary_base = get_hf_rope_theta(hf_config.attn_config, 10000.0) args.moe_config = MoeConfig( num_experts=args.moe_num_experts, top_k=args.moe_top_k, diff --git a/examples/models/core/internlm2/convert_checkpoint.py b/examples/models/core/internlm2/convert_checkpoint.py index 151a1afe85c..44c80d6d51f 100644 --- a/examples/models/core/internlm2/convert_checkpoint.py +++ b/examples/models/core/internlm2/convert_checkpoint.py @@ -14,7 +14,7 @@ import tensorrt_llm from tensorrt_llm._deprecation import emit_engine_arch_deprecation -from tensorrt_llm._utils import release_gc +from tensorrt_llm._utils import get_hf_rope_theta, release_gc from tensorrt_llm.mapping import Mapping from tensorrt_llm.models.llama import convert @@ -480,7 +480,7 @@ def convert_from_hf(hf_model, 'norm_epsilon': hf_config.rms_norm_eps, 'vocab_size': hf_config.vocab_size, 'position_embedding_type': 'rope_gpt_neox', - 'rotary_base': hf_config.rope_theta, + 'rotary_base': get_hf_rope_theta(hf_config, 10000.0), 'max_position_embeddings': hf_config.max_position_embeddings, 'hidden_act': hf_config.hidden_act, 'use_parallel_embedding': args.use_parallel_embedding, diff --git a/requirements.txt b/requirements.txt index b76e28208bd..bfe364fde97 100644 --- a/requirements.txt +++ b/requirements.txt @@ -30,7 
+30,7 @@ nvidia-modelopt[torch]~=0.37.0 # torch 2.10.0+cu130 depends on nvidia-nccl-cu13==2.28.9 nvidia-nccl-cu13>=2.28.9,<=2.29.2 nvidia-cuda-nvrtc -transformers==4.57.3 +transformers==5.3.0 prometheus_client prometheus_fastapi_instrumentator pydantic>=2.9.1 diff --git a/tensorrt_llm/_torch/attention_backend/interface.py b/tensorrt_llm/_torch/attention_backend/interface.py index 600f655bc51..f86b550482f 100644 --- a/tensorrt_llm/_torch/attention_backend/interface.py +++ b/tensorrt_llm/_torch/attention_backend/interface.py @@ -14,7 +14,7 @@ from ..speculative.interface import SpecMetadata from ..speculative.spec_tree_manager import SpecTreeManager -from tensorrt_llm._utils import maybe_pin_memory +from tensorrt_llm._utils import get_hf_rope_theta, maybe_pin_memory from tensorrt_llm.functional import (PositionEmbeddingType, RopeEmbeddingUtils, RotaryScalingType) from tensorrt_llm.mapping import Mapping @@ -498,7 +498,7 @@ def from_config(config) -> "RopeParams": head_dim = hidden_size // num_attention_heads rope_scaling = getattr(config, 'rope_scaling', None) rope_params.max_positions = config.max_position_embeddings - rope_params.theta = getattr(config, 'rope_theta', 10000.0) + rope_params.theta = get_hf_rope_theta(config, 10000.0) rope_percentage = (getattr(config, 'rotary_pct', None) or getattr(config, 'partial_rotary_factor', None) or 1.0) diff --git a/tensorrt_llm/_torch/auto_deploy/export/library/transformers_sdpa_mask.py b/tensorrt_llm/_torch/auto_deploy/export/library/transformers_sdpa_mask.py index fd21604d1b6..d6272be573d 100644 --- a/tensorrt_llm/_torch/auto_deploy/export/library/transformers_sdpa_mask.py +++ b/tensorrt_llm/_torch/auto_deploy/export/library/transformers_sdpa_mask.py @@ -1,6 +1,7 @@ """Patch for transformers SDPA mask to be export-compatible.""" import importlib.metadata +from functools import partial from packaging import version @@ -29,7 +30,14 @@ def _apply_patch(self): try: # imports only after version check from transformers import 
masking_utils - from transformers.integrations.executorch import sdpa_mask_without_vmap + + # Up to ~4.53+, HF exposed this helper next to ExecuTorch export utilities. + # Transformers 5.x removed it; sdpa_mask now supports use_vmap=False (the default), + # which is export-compatible without vmap. + try: + from transformers.integrations.executorch import sdpa_mask_without_vmap + except ImportError: + sdpa_mask_without_vmap = partial(masking_utils.sdpa_mask, use_vmap=False) # recall original implementation self.original_values["masking_utils.sdpa_mask"] = masking_utils.sdpa_mask diff --git a/tensorrt_llm/_torch/auto_deploy/models/custom/modeling_qwen3_5_moe.py b/tensorrt_llm/_torch/auto_deploy/models/custom/modeling_qwen3_5_moe.py index e227bc7ebec..71c11814379 100644 --- a/tensorrt_llm/_torch/auto_deploy/models/custom/modeling_qwen3_5_moe.py +++ b/tensorrt_llm/_torch/auto_deploy/models/custom/modeling_qwen3_5_moe.py @@ -2854,8 +2854,8 @@ def init_input_processor(self, base): # Registration # ============================================================================= -AutoConfig.register("qwen3_5_moe", Qwen3_5MoeConfig) -AutoConfig.register("qwen3_5_moe_text", Qwen3_5MoeTextConfig) +AutoConfig.register("qwen3_5_moe", Qwen3_5MoeConfig, exist_ok=True) +AutoConfig.register("qwen3_5_moe_text", Qwen3_5MoeTextConfig, exist_ok=True) AutoModelForCausalLMFactory.register_custom_model_cls("Qwen3_5MoeTextConfig", Qwen3_5MoeForCausalLM) Qwen3_5MoeFactory.register_custom_model_cls("Qwen3_5MoeConfig", Qwen3_5MoeForConditionalGeneration) diff --git a/tensorrt_llm/_torch/auto_deploy/models/patches/bamba.py b/tensorrt_llm/_torch/auto_deploy/models/patches/bamba.py index 47d7eacd47a..779622a2a0b 100644 --- a/tensorrt_llm/_torch/auto_deploy/models/patches/bamba.py +++ b/tensorrt_llm/_torch/auto_deploy/models/patches/bamba.py @@ -205,20 +205,25 @@ class BambaModelPatch(BaseExportPatch): def _apply_patch(self): self.original_values["BambaMixer.torch_forward"] = 
BambaMixer.torch_forward self.original_values["BambaModel._update_mamba_mask"] = BambaModel._update_mamba_mask - self.original_values["BambaModel._update_causal_mask"] = BambaModel._update_causal_mask + # Older transformers expose both; newer releases dropped `_update_causal_mask` on `BambaModel` + # (mask handling consolidated under `_update_mamba_mask`). + if hasattr(BambaModel, "_update_causal_mask"): + self.original_values["BambaModel._update_causal_mask"] = BambaModel._update_causal_mask # NOTE: there is `HybridMambaAttentionDynamicCache.__bool__` to save. # self.original_values["BambaPreTrainedModel._init_weights"] = BambaPreTrainedModel._init_weights BambaMixer.torch_forward = _bamba_mixer_torch_forward BambaModel._update_mamba_mask = _bamba_model_update_mamba_mask - BambaModel._update_causal_mask = _bamba_model_update_causal_mask + if hasattr(BambaModel, "_update_causal_mask"): + BambaModel._update_causal_mask = _bamba_model_update_causal_mask HybridMambaAttentionDynamicCache.__bool__ = _cache_bool # BambaPreTrainedModel._init_weights = _bamba_pretrained_model_init_weights def _revert_patch(self): BambaMixer.torch_forward = self.original_values["BambaMixer.torch_forward"] BambaModel._update_mamba_mask = self.original_values["BambaModel._update_mamba_mask"] - BambaModel._update_causal_mask = self.original_values["BambaModel._update_causal_mask"] + if "BambaModel._update_causal_mask" in self.original_values: + BambaModel._update_causal_mask = self.original_values["BambaModel._update_causal_mask"] del HybridMambaAttentionDynamicCache.__bool__ # BambaPreTrainedModel._init_weights = self.original_values[ # "BambaPreTrainedModel._init_weights" diff --git a/tensorrt_llm/_torch/models/hf_parameter_utils.py b/tensorrt_llm/_torch/models/hf_parameter_utils.py new file mode 100644 index 00000000000..8a163c4f917 --- /dev/null +++ b/tensorrt_llm/_torch/models/hf_parameter_utils.py @@ -0,0 +1,35 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & 
AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Compatibility for Hugging Face ``get_parameter_device`` / ``get_parameter_dtype``. + +Transformers v5 no longer exports these from ``transformers.modeling_utils``; they +match ``ModuleUtilsMixin`` behavior for plain ``nn.Module`` stacks. +""" + +from __future__ import annotations + +import torch +import torch.nn as nn + +try: + from transformers.modeling_utils import get_parameter_device, get_parameter_dtype +except ImportError: + + def get_parameter_device(module: nn.Module) -> torch.device: + return next(module.parameters()).device + + def get_parameter_dtype(module: nn.Module) -> torch.dtype: + return next((param.dtype for param in module.parameters() if param.is_floating_point()), next(module.parameters()).dtype) diff --git a/tensorrt_llm/_torch/models/modeling_clip.py b/tensorrt_llm/_torch/models/modeling_clip.py index 1e203eda8b7..9e73dcc2dd3 100644 --- a/tensorrt_llm/_torch/models/modeling_clip.py +++ b/tensorrt_llm/_torch/models/modeling_clip.py @@ -4,8 +4,6 @@ import torch.nn as nn from transformers.activations import ACT2FN from transformers.modeling_outputs import BaseModelOutput -from transformers.modeling_utils import (get_parameter_device, - get_parameter_dtype) from transformers.models.clip.configuration_clip import CLIPVisionConfig from transformers.models.clip.modeling_clip import CLIPVisionEmbeddings @@ -17,6 +15,7 @@ from ..model_config import ModelConfig from
..modules.attention import Attention from ..modules.mlp import MLP +from .hf_parameter_utils import get_parameter_device, get_parameter_dtype from .modeling_utils import _load_weights_impl, register_auto_model diff --git a/tensorrt_llm/_torch/models/modeling_exaone4.py b/tensorrt_llm/_torch/models/modeling_exaone4.py index 07951fc28a4..2b49b9bb475 100644 --- a/tensorrt_llm/_torch/models/modeling_exaone4.py +++ b/tensorrt_llm/_torch/models/modeling_exaone4.py @@ -29,7 +29,7 @@ class Exaone4Config(PretrainedConfig): model_type = "exaone4" - AutoConfig.register(Exaone4Config.model_type, Exaone4Config) + AutoConfig.register(Exaone4Config.model_type, Exaone4Config, exist_ok=True) def check_is_sliding(config: Exaone4Config, layer_idx: int) -> bool: diff --git a/tensorrt_llm/_torch/models/modeling_exaone_moe.py b/tensorrt_llm/_torch/models/modeling_exaone_moe.py index fe420178558..621065ccb47 100644 --- a/tensorrt_llm/_torch/models/modeling_exaone_moe.py +++ b/tensorrt_llm/_torch/models/modeling_exaone_moe.py @@ -53,7 +53,7 @@ class ExaoneMoEConfig(PretrainedConfig): "Register ExaoneMoEConfig to mimic the ExaoneMoE model.", key="EXAONE_MOE_REGISTER_WARNING" ) -AutoConfig.register(ExaoneMoEConfig.model_type, ExaoneMoEConfig) +AutoConfig.register(ExaoneMoEConfig.model_type, ExaoneMoEConfig, exist_ok=True) # End of the config register. 
# fmt: on diff --git a/tensorrt_llm/_torch/models/modeling_gpt_oss.py b/tensorrt_llm/_torch/models/modeling_gpt_oss.py index 4d46611a7fd..7edeb3e73be 100644 --- a/tensorrt_llm/_torch/models/modeling_gpt_oss.py +++ b/tensorrt_llm/_torch/models/modeling_gpt_oss.py @@ -7,7 +7,7 @@ from tqdm import tqdm from transformers import GptOssConfig -from tensorrt_llm._utils import get_sm_version +from tensorrt_llm._utils import get_hf_rope_theta, get_sm_version from tensorrt_llm.functional import PositionEmbeddingType, RotaryScalingType from ..attention_backend import AttentionMetadata @@ -55,7 +55,7 @@ def __init__( type=PositionEmbeddingType.yarn, rope=RopeParams( dim=pretrained_config.head_dim, - theta=pretrained_config.rope_theta, + theta=get_hf_rope_theta(pretrained_config, 10000.0), scale_type=RotaryScalingType.yarn, scale=pretrained_config.rope_scaling['factor'], max_positions=pretrained_config.max_position_embeddings, diff --git a/tensorrt_llm/_torch/models/modeling_llama.py b/tensorrt_llm/_torch/models/modeling_llama.py index b13c2e3de91..8dff496563a 100644 --- a/tensorrt_llm/_torch/models/modeling_llama.py +++ b/tensorrt_llm/_torch/models/modeling_llama.py @@ -7,8 +7,9 @@ from torch import nn from transformers import (AutoProcessor, AutoTokenizer, Llama4Config, Llama4VisionModel, LlamaConfig, PretrainedConfig) -from transformers.modeling_utils import load_sharded_checkpoint from transformers.models.llama4.modeling_llama4 import Llama4MultiModalProjector +from transformers.utils import (SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, + WEIGHTS_INDEX_NAME, WEIGHTS_NAME) from tensorrt_llm._torch.distributed import (AllReduce, AllReduceFusionOp, AllReduceParams, MoEAllReduce) @@ -1118,6 +1119,56 @@ def post_load_weights(self): layer.next_attn = self.model.layers[idx + 1].self_attn +def _load_checkpoint_into_module(module: nn.Module, + folder: str, + strict: bool = True) -> None: + """Load a sharded HuggingFace checkpoint into a module. 
+ + This replaces the removed ``transformers.modeling_utils.load_sharded_checkpoint`` + function. It supports both safetensors and PyTorch checkpoint formats. + """ + folder = str(folder) + + # Determine checkpoint format and collect shard files + index_file = os.path.join(folder, SAFE_WEIGHTS_INDEX_NAME) + if os.path.isfile(index_file): + import json + with open(index_file) as f: + shard_files = sorted(set(json.load(f)["weight_map"].values())) + shard_paths = [os.path.join(folder, s) for s in shard_files] + use_safetensors = True + elif os.path.isfile(os.path.join(folder, WEIGHTS_INDEX_NAME)): + import json + with open(os.path.join(folder, WEIGHTS_INDEX_NAME)) as f: + shard_files = sorted(set(json.load(f)["weight_map"].values())) + shard_paths = [os.path.join(folder, s) for s in shard_files] + use_safetensors = False + elif os.path.isfile(os.path.join(folder, SAFE_WEIGHTS_NAME)): + shard_paths = [os.path.join(folder, SAFE_WEIGHTS_NAME)] + use_safetensors = True + elif os.path.isfile(os.path.join(folder, WEIGHTS_NAME)): + shard_paths = [os.path.join(folder, WEIGHTS_NAME)] + use_safetensors = False + else: + raise FileNotFoundError( + f"No checkpoint found in {folder}. Expected " + f"{SAFE_WEIGHTS_INDEX_NAME}, {WEIGHTS_INDEX_NAME}, " + f"{SAFE_WEIGHTS_NAME}, or {WEIGHTS_NAME}.") + + # Load state dict from all shards and merge + full_state_dict: Dict[str, torch.Tensor] = {} + if use_safetensors: + from safetensors.torch import load_file + for path in shard_paths: + full_state_dict.update(load_file(path)) + else: + for path in shard_paths: + full_state_dict.update( + torch.load(path, map_location="cpu", weights_only=True)) + + module.load_state_dict(full_state_dict, strict=strict) + + class Llama4VisionEncoder(nn.Module): def __init__(self, model_config: ModelConfig[Llama4Config], *args, @@ -1148,9 +1199,9 @@ def load_weights(self, weights: Dict): # Otherwise, load the weights from the checkpoint. 
else: - load_sharded_checkpoint(module_dict, - self.pretrained_config._name_or_path, - strict=False) + _load_checkpoint_into_module(module_dict, + self.pretrained_config._name_or_path, + strict=False) self.vision_model = module_dict["vision_model"].to(self.device) self.mm_projector = module_dict["multi_modal_projector"].to(self.device) diff --git a/tensorrt_llm/_torch/models/modeling_mllama.py b/tensorrt_llm/_torch/models/modeling_mllama.py index 16ec672539a..21a5fc447f4 100644 --- a/tensorrt_llm/_torch/models/modeling_mllama.py +++ b/tensorrt_llm/_torch/models/modeling_mllama.py @@ -274,8 +274,10 @@ def __init__( self.hidden_size = pretrained_config.text_config.hidden_size self.max_num_tiles = pretrained_config.vision_config.max_num_tiles self.vision_output_dim = pretrained_config.vision_config.vision_output_dim - self.pad_token_id = (pretrained_config.pad_token_id if - pretrained_config.pad_token_id is not None else -1) + self.pad_token_id = getattr(pretrained_config, 'pad_token_id', None) + if self.pad_token_id is None: + text_pad = getattr(pretrained_config.text_config, 'pad_token_id', None) + self.pad_token_id = -1 if text_pad is None else text_pad self.image_size = pretrained_config.vision_config.image_size # hack config diff --git a/tensorrt_llm/_torch/models/modeling_nemotron_h.py b/tensorrt_llm/_torch/models/modeling_nemotron_h.py index 623195da94a..446a7383a0a 100644 --- a/tensorrt_llm/_torch/models/modeling_nemotron_h.py +++ b/tensorrt_llm/_torch/models/modeling_nemotron_h.py @@ -1073,4 +1073,4 @@ def forward( return hidden_states -AutoConfig.register(NemotronHConfig.model_type, NemotronHConfig) +AutoConfig.register(NemotronHConfig.model_type, NemotronHConfig, exist_ok=True) diff --git a/tensorrt_llm/_torch/models/modeling_phi4mm.py b/tensorrt_llm/_torch/models/modeling_phi4mm.py index 268ef6ce5f5..08c7303bdbb 100644 --- a/tensorrt_llm/_torch/models/modeling_phi4mm.py +++ b/tensorrt_llm/_torch/models/modeling_phi4mm.py @@ -114,6 +114,12 @@ def _load_phi4mm_classes(local_path): spec =
importlib.util.spec_from_file_location( f"{package_name}.hf_modeling_phi4mm", modeling_phi4mm_path) hf_modeling_phi4mm = importlib.util.module_from_spec(spec) + # Inject compatibility shims for classes removed in transformers 5.x. + # The model's custom modeling_phi4mm.py may import SlidingWindowCache + # which was removed in transformers 5.3.0 (merged into StaticCache). + _cache_utils = importlib.import_module("transformers.cache_utils") + if not hasattr(_cache_utils, "SlidingWindowCache"): + _cache_utils.SlidingWindowCache = _cache_utils.StaticCache spec.loader.exec_module(hf_modeling_phi4mm) Phi4MMAudioEmbedding = hf_modeling_phi4mm.Phi4MMAudioEmbedding Phi4MMImageEmbedding = hf_modeling_phi4mm.Phi4MMImageEmbedding diff --git a/tensorrt_llm/_torch/models/modeling_siglip.py b/tensorrt_llm/_torch/models/modeling_siglip.py index e4ed6d462b8..071ee7f03d4 100644 --- a/tensorrt_llm/_torch/models/modeling_siglip.py +++ b/tensorrt_llm/_torch/models/modeling_siglip.py @@ -2,8 +2,6 @@ import torch import torch.nn as nn -from transformers.modeling_utils import (get_parameter_device, - get_parameter_dtype) from transformers.models.siglip.configuration_siglip import SiglipVisionConfig from transformers.models.siglip.modeling_siglip import (SiglipVisionConfig, SiglipVisionEmbeddings) @@ -13,6 +11,7 @@ from ..attention_backend.interface import AttentionMetadata from ..attention_backend.utils import get_attention_backend from ..model_config import ModelConfig +from .hf_parameter_utils import get_parameter_device, get_parameter_dtype from .modeling_clip import CLIPEncoder from .modeling_utils import _load_weights_impl, register_auto_model diff --git a/tensorrt_llm/_torch/models/modeling_vila.py b/tensorrt_llm/_torch/models/modeling_vila.py index 8b634229237..1e3bcab02b5 100644 --- a/tensorrt_llm/_torch/models/modeling_vila.py +++ b/tensorrt_llm/_torch/models/modeling_vila.py @@ -1252,5 +1252,5 @@ def post_config(self): self.model_config.pretrained_config = self.llm.config 
-AutoConfig.register(VilaConfig.model_type, VilaConfig) +AutoConfig.register(VilaConfig.model_type, VilaConfig, exist_ok=True) AutoModel.register(VilaConfig, VilaModel) diff --git a/tensorrt_llm/_torch/visual_gen/models/wan/transformer_wan.py b/tensorrt_llm/_torch/visual_gen/models/wan/transformer_wan.py index 49e56c4d23d..67d805416cf 100644 --- a/tensorrt_llm/_torch/visual_gen/models/wan/transformer_wan.py +++ b/tensorrt_llm/_torch/visual_gen/models/wan/transformer_wan.py @@ -6,8 +6,8 @@ import torch.nn.functional as F from diffusers.models.embeddings import PixArtAlphaTextProjection, TimestepEmbedding, Timesteps from tqdm import tqdm -from transformers.modeling_utils import get_parameter_device +from tensorrt_llm._torch.models.hf_parameter_utils import get_parameter_device from tensorrt_llm._torch.modules.layer_norm import LayerNorm from tensorrt_llm._torch.modules.linear import Linear from tensorrt_llm._torch.modules.mlp import MLP diff --git a/tensorrt_llm/_utils.py b/tensorrt_llm/_utils.py index 47a6a88499e..c53e7a08504 100644 --- a/tensorrt_llm/_utils.py +++ b/tensorrt_llm/_utils.py @@ -70,6 +70,23 @@ np_float8 = np.dtype('V1', metadata={"dtype": "float8"}) +def get_hf_rope_theta(config: Any, default: float = 10000.0) -> float: + """Return RoPE ``theta`` from a Hugging Face ``PreTrainedConfig``-like object. + + Transformers v5+ nests ``rope_theta`` under ``rope_parameters`` for several + models (e.g. LLaMA); older releases expose ``config.rope_theta`` directly. + """ + theta = getattr(config, "rope_theta", None) + if theta is not None: + return float(theta) + rope_params = getattr(config, "rope_parameters", None) + if isinstance(rope_params, dict): + theta = rope_params.get("rope_theta") + if theta is not None: + return float(theta) + return default + + def torch_to_numpy(x: torch.Tensor): assert isinstance(x, torch.Tensor), \ f'x must be a torch.Tensor object, but got {type(x)}.' 
diff --git a/tensorrt_llm/functional.py b/tensorrt_llm/functional.py index 5dd99755dc6..df80459359a 100755 --- a/tensorrt_llm/functional.py +++ b/tensorrt_llm/functional.py @@ -680,8 +680,16 @@ class RotaryScalingType(IntEnum): @staticmethod def from_string(s): + if isinstance(s, RotaryScalingType): + return s + if s is None: + return RotaryScalingType.none + key = str(s).lower() + # Hugging Face Transformers v5+ uses type "default" for unscaled / standard RoPE. + if key == "default": + return RotaryScalingType.none try: - return RotaryScalingType[s] + return RotaryScalingType[key] except KeyError: raise ValueError(f'Unsupported rotary scaling type: {s}') @@ -722,6 +730,9 @@ def __str__(self): @staticmethod def from_string(s): + # Transformers 5.x uses "default" for standard RoPE (no scaling). + if s == "default": + return PositionEmbeddingType.rope_gpt_neox try: return PositionEmbeddingType[s] except KeyError: diff --git a/tensorrt_llm/models/commandr/config.py b/tensorrt_llm/models/commandr/config.py index a2edca61fb7..511640c2249 100644 --- a/tensorrt_llm/models/commandr/config.py +++ b/tensorrt_llm/models/commandr/config.py @@ -16,6 +16,7 @@ import transformers +from ..._utils import get_hf_rope_theta from ...mapping import Mapping from ..convert_utils import infer_dtype from ..modeling_utils import PretrainedConfig, QuantConfig @@ -79,7 +80,7 @@ def from_hugging_face( hidden_act=hf_config.hidden_act, norm_epsilon=hf_config.layer_norm_eps, output_multiplier_scale=hf_config.logit_scale, - rotary_base=hf_config.rope_theta, + rotary_base=get_hf_rope_theta(hf_config, 10000.0), attn_bias=hf_config.attention_bias, qk_layernorm=hf_config.use_qk_norm, mapping=mapping, diff --git a/tensorrt_llm/models/deepseek_v1/config.py b/tensorrt_llm/models/deepseek_v1/config.py index b47fa91a43d..e7bff0d9aab 100755 --- a/tensorrt_llm/models/deepseek_v1/config.py +++ b/tensorrt_llm/models/deepseek_v1/config.py @@ -15,6 +15,7 @@ from typing import Optional, Union +from ..._utils 
import get_hf_rope_theta from ...layers import MoeConfig from ...mapping import Mapping from ..convert_utils import infer_dtype @@ -70,7 +71,7 @@ def from_hugging_face( num_key_value_heads = getattr(hf_config, "num_key_value_heads", hf_config.num_attention_heads) rotary_scaling = getattr(hf_config, "rope_scaling", None) - rotary_base = getattr(hf_config, "rope_theta", 10000.0) + rotary_base = get_hf_rope_theta(hf_config, 10000.0) dtype = infer_dtype(dtype, getattr(hf_config, 'torch_dtype', None)) moe_config = MoeConfig( num_experts=getattr(hf_config, 'n_routed_experts', 0), diff --git a/tensorrt_llm/models/deepseek_v2/config.py b/tensorrt_llm/models/deepseek_v2/config.py index edaf21f128c..c110df0d53f 100644 --- a/tensorrt_llm/models/deepseek_v2/config.py +++ b/tensorrt_llm/models/deepseek_v2/config.py @@ -17,6 +17,7 @@ from transformers import AutoConfig +from ..._utils import get_hf_rope_theta from ...layers import MoeConfig from ...mapping import Mapping from ..modeling_utils import PretrainedConfig, QuantConfig @@ -129,7 +130,7 @@ def from_hugging_face( max_position_embeddings=hf_config.max_position_embeddings, hidden_act='swiglu', norm_epsilon=hf_config.rms_norm_eps, - rotary_base=hf_config.rope_theta, + rotary_base=get_hf_rope_theta(hf_config, 10000.0), rotary_scaling=rotary_scaling, moe_inter_size=hf_config.moe_intermediate_size, moe=moe_config, diff --git a/tensorrt_llm/models/deepseek_v2/convert.py b/tensorrt_llm/models/deepseek_v2/convert.py index 697040d3b75..5a23130fc52 100755 --- a/tensorrt_llm/models/deepseek_v2/convert.py +++ b/tensorrt_llm/models/deepseek_v2/convert.py @@ -20,7 +20,8 @@ from tensorrt_llm.layers import MoeConfig -from ..._utils import pad_vocab_size, release_gc, str_dtype_to_torch +from ..._utils import (get_hf_rope_theta, pad_vocab_size, release_gc, + str_dtype_to_torch) from ...logger import logger from ...mapping import Mapping from ..convert_utils import get_tllm_linear_weight @@ -52,7 +53,7 @@ def 
create_trt_config_from_hf(model_dir, vocab_size = hf_config.vocab_size n_positions = hf_config.max_position_embeddings hidden_act = 'swiglu' # TRT-LLM request make gated activation explicit for MOE implementation - rotary_base = hf_config.rope_theta + rotary_base = get_hf_rope_theta(hf_config, 10000.0) rms_norm_eps = hf_config.rms_norm_eps rotary_scaling_beta_fast = hf_config.rope_scaling['beta_fast'] rotary_scaling_beta_slow = hf_config.rope_scaling['beta_slow'] diff --git a/tensorrt_llm/models/eagle/config.py b/tensorrt_llm/models/eagle/config.py index f81e43bb03f..e7a559f3469 100644 --- a/tensorrt_llm/models/eagle/config.py +++ b/tensorrt_llm/models/eagle/config.py @@ -18,6 +18,7 @@ from transformers import LlamaConfig +from ..._utils import get_hf_rope_theta from ...mapping import Mapping from ..convert_utils import infer_dtype from ..llama.config import LLaMAConfig @@ -84,7 +85,7 @@ def from_hugging_face( rms_norm_eps = hf_config.rms_norm_eps vocab_size = hf_config.vocab_size rotary_scaling = hf_config.rope_scaling - rotary_base = hf_config.rope_theta + rotary_base = get_hf_rope_theta(hf_config, 10000.0) n_positions = hf_config.max_position_embeddings hidden_act = hf_config.hidden_act dtype = str(hf_config.torch_dtype)[6:] if dtype == 'auto' else dtype diff --git a/tensorrt_llm/models/falcon/config.py b/tensorrt_llm/models/falcon/config.py index c96bd517cc4..1ff2ff0391c 100644 --- a/tensorrt_llm/models/falcon/config.py +++ b/tensorrt_llm/models/falcon/config.py @@ -14,6 +14,7 @@ # limitations under the License. 
from typing import Optional, Union +from ..._utils import get_hf_rope_theta from ...mapping import Mapping from ..convert_utils import infer_dtype from ..modeling_utils import PretrainedConfig, QuantConfig @@ -109,7 +110,7 @@ def from_hugging_face( max_position_embeddings=getattr(hf_config, 'max_position_embeddings', 2048), - rotary_base=getattr(hf_config, 'rope_theta', 10000.0), + rotary_base=get_hf_rope_theta(hf_config, 10000.0), intermediate_size=getattr(hf_config, 'ffn_hidden_size', None), mapping=mapping, diff --git a/tensorrt_llm/models/gemma/config.py b/tensorrt_llm/models/gemma/config.py index 8e176c4ed7e..3b0d8d6218c 100644 --- a/tensorrt_llm/models/gemma/config.py +++ b/tensorrt_llm/models/gemma/config.py @@ -16,6 +16,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Optional, Union +from tensorrt_llm._utils import get_hf_rope_theta from tensorrt_llm.functional import PositionEmbeddingType from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping @@ -186,7 +187,7 @@ def from_hugging_face( norm_epsilon=hf_config.rms_norm_eps, num_key_value_heads=getattr(hf_config, "num_key_value_heads", hf_config.num_attention_heads), - rotary_base=getattr(hf_config, "rope_theta", 10000.0), + rotary_base=get_hf_rope_theta(hf_config, 10000.0), rotary_scaling=getattr(hf_config, "rotary_scaling", None), quantization=quant_config, mapping=mapping, diff --git a/tensorrt_llm/models/gpt/config.py b/tensorrt_llm/models/gpt/config.py index e89dddd5efe..ba09d1f8694 100644 --- a/tensorrt_llm/models/gpt/config.py +++ b/tensorrt_llm/models/gpt/config.py @@ -17,6 +17,7 @@ import torch +from ..._utils import get_hf_rope_theta from ...layers import MoeConfig from ...logger import logger from ...mapping import Mapping @@ -134,7 +135,7 @@ def from_hugging_face( hf_config.layer_norm_epsilon = hf_config.norm_epsilon if gpt_variant == 'starcoder2' else hf_config.layer_norm_eps hf_config.bias = hf_config.use_bias if gpt_variant == 'starcoder2' else 
gpt_variant != 'nemotron' hf_config.position_embedding_type = 'rope_gpt_neox' - hf_config.rotary_base = hf_config.rope_theta + hf_config.rotary_base = get_hf_rope_theta(hf_config, 10000.0) hf_config.rotary_pct = getattr( hf_config, 'partial_rotary_factor', getattr(hf_config, 'rope_percent', 1.0)) diff --git a/tensorrt_llm/models/gpt/convert.py b/tensorrt_llm/models/gpt/convert.py index 1e2bc4b999d..aa7d9a89e0d 100644 --- a/tensorrt_llm/models/gpt/convert.py +++ b/tensorrt_llm/models/gpt/convert.py @@ -29,8 +29,14 @@ import torch.nn as nn import yaml from tqdm import tqdm -from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq, - AutoTokenizer) + +try: + from transformers import AutoModelForVision2Seq +except ImportError: + # Transformers v5+: vision-to-seq auto models use AutoModelForImageTextToText + from transformers import AutoModelForImageTextToText as AutoModelForVision2Seq + +from transformers import AutoModelForCausalLM, AutoTokenizer from transformers.models.gpt2.modeling_gpt2 import GPT2Block from transformers.pytorch_utils import Conv1D diff --git a/tensorrt_llm/models/llama/config.py b/tensorrt_llm/models/llama/config.py index 7e0369a4ba0..54038e32c4f 100644 --- a/tensorrt_llm/models/llama/config.py +++ b/tensorrt_llm/models/llama/config.py @@ -18,6 +18,7 @@ from pathlib import Path from typing import Optional, Union +from ..._utils import get_hf_rope_theta from ...layers import MoeConfig from ...mapping import Mapping from ..convert_utils import infer_dtype @@ -112,7 +113,8 @@ def from_hugging_face( from llava.model import LlavaLlamaConfig # noqa from llava.model import LlavaLlamaModel transformers.AutoConfig.register("llava_llama", - LlavaLlamaConfig) + LlavaLlamaConfig, + exist_ok=True) transformers.AutoModelForCausalLM.register( LlavaLlamaConfig, LlavaLlamaModel) @@ -160,7 +162,7 @@ def from_hugging_face( attn_bias = getattr(hf_config, 'bias', False) or getattr( hf_config, 'attention_bias', False) rotary_scaling = getattr(hf_config, 
"rope_scaling", None) - rotary_base = getattr(hf_config, "rope_theta", 10000.0) + rotary_base = get_hf_rope_theta(hf_config, 10000.0) residual_mlp = getattr(hf_config, "parallel_attn_mlp_res", False) disable_weight_only_quant_plugin = kwargs.pop( 'disable_weight_only_quant_plugin', False) diff --git a/tensorrt_llm/models/mllama/config.py b/tensorrt_llm/models/mllama/config.py index 5fb24f6fac7..cbd7f1b8f38 100644 --- a/tensorrt_llm/models/mllama/config.py +++ b/tensorrt_llm/models/mllama/config.py @@ -16,6 +16,7 @@ from pathlib import Path from typing import List, Optional, Union +from ..._utils import get_hf_rope_theta from ...functional import LayerNormPositionType, LayerNormType, MLPType from ...mapping import Mapping from ..convert_utils import infer_dtype @@ -166,7 +167,7 @@ def from_hugging_face( attn_bias = getattr(hf_text_config, 'bias', False) or getattr( hf_text_config, 'attention_bias', False) rotary_scaling = getattr(hf_text_config, "rope_scaling", None) - rotary_base = getattr(hf_text_config, "rope_theta", 10000.0) + rotary_base = get_hf_rope_theta(hf_text_config, 10000.0) residual_mlp = getattr(hf_text_config, "parallel_attn_mlp_res", False) disable_weight_only_quant_plugin = kwargs.pop( 'disable_weight_only_quant_plugin', False) diff --git a/tensorrt_llm/models/nemotron_nas/config.py b/tensorrt_llm/models/nemotron_nas/config.py index 139b052c7bc..11d02df84b0 100644 --- a/tensorrt_llm/models/nemotron_nas/config.py +++ b/tensorrt_llm/models/nemotron_nas/config.py @@ -15,6 +15,7 @@ from dataclasses import asdict from typing import Any, Dict, List, Optional, Union +from tensorrt_llm._utils import get_hf_rope_theta from tensorrt_llm.functional import PositionEmbeddingType from tensorrt_llm.mapping import Mapping from tensorrt_llm.models.convert_utils import infer_dtype @@ -198,7 +199,7 @@ def from_hugging_face( num_key_value_heads=hf_config.num_key_value_heads, norm_epsilon=hf_config.rms_norm_eps, rotary_scaling=hf_config.rope_scaling, - 
rotary_base=hf_config.rope_theta, + rotary_base=get_hf_rope_theta(hf_config, 10000.0), vocab_size=hf_config.vocab_size, max_position_embeddings=hf_config.max_position_embeddings, mapping=mapping, diff --git a/tensorrt_llm/models/phi/config.py b/tensorrt_llm/models/phi/config.py index 3d38db0fa7b..583de15fadf 100644 --- a/tensorrt_llm/models/phi/config.py +++ b/tensorrt_llm/models/phi/config.py @@ -15,6 +15,7 @@ from typing import Optional, Union +from ..._utils import get_hf_rope_theta from ...mapping import Mapping from ..convert_utils import infer_dtype from ..modeling_utils import PretrainedConfig, QuantConfig @@ -64,7 +65,7 @@ def from_hugging_face( num_key_value_heads = getattr(hf_config, "num_key_value_heads", hf_config.num_attention_heads) rotary_scaling = getattr(hf_config, "rope_scaling", None) - rotary_base = getattr(hf_config, "rope_theta", 10000.0) + rotary_base = get_hf_rope_theta(hf_config, 10000.0) dtype = infer_dtype(dtype, getattr(hf_config, 'torch_dtype', None)) return cls(architecture=hf_config.architectures[0], diff --git a/tensorrt_llm/models/phi/convert.py b/tensorrt_llm/models/phi/convert.py index 0d1ec78bfd7..4bf3406c726 100644 --- a/tensorrt_llm/models/phi/convert.py +++ b/tensorrt_llm/models/phi/convert.py @@ -1,6 +1,6 @@ import torch -from ..._utils import pad_vocab_size, str_dtype_to_torch +from ..._utils import get_hf_rope_theta, pad_vocab_size, str_dtype_to_torch def split(v, tp_size, idx, dim=0): @@ -129,7 +129,7 @@ def convert_hf_config(hf_config, dtype, args): 'num_hidden_layers': hf_config.num_hidden_layers, 'num_attention_heads': hf_config.num_key_value_heads, 'rotary_pct': hf_config.partial_rotary_factor, - 'rope_theta': hf_config.rope_theta, + 'rope_theta': get_hf_rope_theta(hf_config, 10000.0), 'hidden_size': hf_config.hidden_size, 'intermediate_size': hf_config.intermediate_size, 'vocab_size': hf_config.vocab_size, diff --git a/tensorrt_llm/models/phi3/config.py b/tensorrt_llm/models/phi3/config.py index 
c824e921720..42d3954092e 100644 --- a/tensorrt_llm/models/phi3/config.py +++ b/tensorrt_llm/models/phi3/config.py @@ -15,6 +15,7 @@ from typing import Optional, Union +from ..._utils import get_hf_rope_theta from ...layers import MoeConfig from ...mapping import Mapping from ..convert_utils import infer_dtype @@ -96,7 +97,7 @@ def from_hugging_face( hf_config, "dense_attention_every_n_layers", None) kwargs['norm_epsilon'] = hf_config.layer_norm_epsilon else: - kwargs['rotary_base'] = hf_config.rope_theta + kwargs['rotary_base'] = get_hf_rope_theta(hf_config, 10000.0) kwargs['norm_epsilon'] = hf_config.rms_norm_eps moe_variant = hf_config.architectures[0] == "PhiMoEForCausalLM" if moe_variant: diff --git a/tensorrt_llm/models/qwen/config.py b/tensorrt_llm/models/qwen/config.py index e2c22909538..0f1bd34606b 100644 --- a/tensorrt_llm/models/qwen/config.py +++ b/tensorrt_llm/models/qwen/config.py @@ -14,6 +14,7 @@ # limitations under the License. from typing import Optional, Union +from ..._utils import get_hf_rope_theta from ...layers import MoeConfig from ...mapping import Mapping from ..convert_utils import infer_dtype @@ -138,7 +139,7 @@ def from_hugging_face(cls, rotary_base = getattr(hf_config, "rotary_emb_base", 10000.0) else: rms_norm_eps = hf_config.rms_norm_eps - rotary_base = getattr(hf_config, "rope_theta", 100000.0) + rotary_base = get_hf_rope_theta(hf_config, 100000.0) num_labels = 1 if hf_config.architectures[0] == "Qwen2ForSequenceClassification": diff --git a/tensorrt_llm/quantization/quantize_by_modelopt.py b/tensorrt_llm/quantization/quantize_by_modelopt.py index 302eb74533f..8c1aa57efa6 100755 --- a/tensorrt_llm/quantization/quantize_by_modelopt.py +++ b/tensorrt_llm/quantization/quantize_by_modelopt.py @@ -34,7 +34,7 @@ from transformers import (AutoConfig, AutoModelForCausalLM, AutoProcessor, AutoTokenizer) -from .._utils import release_gc, str_dtype_to_torch +from .._utils import get_hf_rope_theta, release_gc, str_dtype_to_torch from ..logger 
import logger from ..mapping import Mapping from .image_processing import MllamaImageProcessor @@ -888,7 +888,8 @@ def quantize_and_export(*, if qwen_config.model_type == "qwen2": tensorrt_llm_config[ "norm_epsilon"] = qwen_config.rms_norm_eps - tensorrt_llm_config["rotary_base"] = qwen_config.rope_theta + tensorrt_llm_config["rotary_base"] = get_hf_rope_theta( + qwen_config, 100000.0) tensorrt_llm_config[ "intermediate_size"] = qwen_config.intermediate_size with open(f"{export_path}/config.json", "w") as f: diff --git a/tensorrt_llm/runtime/multimodal_model_runner.py b/tensorrt_llm/runtime/multimodal_model_runner.py index 5a88cc9dd80..47d9bd66ad9 100644 --- a/tensorrt_llm/runtime/multimodal_model_runner.py +++ b/tensorrt_llm/runtime/multimodal_model_runner.py @@ -28,8 +28,8 @@ from .. import profiler from .._deprecation import emit_engine_arch_deprecation -from .._utils import (maybe_pin_memory, mpi_rank, prefer_pinned, - str_dtype_to_torch, str_dtype_to_trt, +from .._utils import (get_hf_rope_theta, maybe_pin_memory, mpi_rank, + prefer_pinned, str_dtype_to_torch, str_dtype_to_trt, supports_inflight_batching, torch_dtype_to_trt, trt_dtype_to_torch) from ..functional import RopeEmbeddingUtils, RotaryScalingType @@ -415,7 +415,7 @@ def __init__(self, args): self.max_position_embeddings = hf_config.max_position_embeddings self.hidden_size = hf_config.hidden_size self.num_attention_heads = hf_config.num_attention_heads - self.rope_theta = hf_config.rope_theta + self.rope_theta = get_hf_rope_theta(hf_config, 10000.0) if self.model_type == 'llava_onevision': self.num_frames = self.args.video_num_frames if self.num_frames is None: diff --git a/tensorrt_llm/tools/multimodal_builder.py b/tensorrt_llm/tools/multimodal_builder.py index bf948eb2506..c2006f5f2c0 100644 --- a/tensorrt_llm/tools/multimodal_builder.py +++ b/tensorrt_llm/tools/multimodal_builder.py @@ -14,10 +14,15 @@ from tensorrt_llm._utils import torch_dtype_to_str, to_json_file from tensorrt_llm.builder import 
Builder from tensorrt_llm.logger import logger +try: + from transformers import AutoModelForVision2Seq +except ImportError: + # Transformers v5+: vision-to-seq auto models use AutoModelForImageTextToText + from transformers import AutoModelForImageTextToText as AutoModelForVision2Seq + from transformers import (AutoConfig, AutoModel, AutoModelForCausalLM, - AutoModelForVision2Seq, AutoProcessor, - Blip2ForConditionalGeneration, Blip2Processor, - FuyuForCausalLM, FuyuProcessor, + AutoProcessor, Blip2ForConditionalGeneration, + Blip2Processor, FuyuForCausalLM, FuyuProcessor, LlavaForConditionalGeneration, NougatProcessor, Pix2StructForConditionalGeneration, VisionEncoderDecoderModel, CLIPVisionModel) diff --git a/tests/unittest/_torch/helpers.py b/tests/unittest/_torch/helpers.py index 743f3998f26..f7dde64c669 100644 --- a/tests/unittest/_torch/helpers.py +++ b/tests/unittest/_torch/helpers.py @@ -1,4 +1,4 @@ -from typing import Dict, Tuple +from typing import Dict, Optional, Tuple import torch import torch.nn.functional as F @@ -256,3 +256,34 @@ def create_mock_cuda_graph_runner(batch_size: int, use_mrope: bool = False): dist=None, kv_cache_manager_key=ResourceManagerType.KV_CACHE_MANAGER) return CUDAGraphRunner(config) + + +def make_hf_hybrid_cache_for_tests( + config, + *, + max_cache_len: int, + max_batch_size: Optional[int] = None, + device=None, + dtype=None, +): + """Build Hugging Face ``past_key_values`` for hybrid / sliding-window models in tests. + + Transformers v4 exposes ``HybridCache``; v5 removes it in favor of ``StaticCache`` + for fixed-length pre-allocated KV (see HF cache refactor). 
+ """ + try: + from transformers.cache_utils import HybridCache + except ImportError: + from transformers.cache_utils import StaticCache + + return StaticCache(config=config, max_cache_len=max_cache_len) + + kwargs = { + "config": config, + "max_cache_len": max_cache_len, + "device": device, + "dtype": dtype, + } + if max_batch_size is not None: + kwargs["max_batch_size"] = max_batch_size + return HybridCache(**kwargs) diff --git a/tests/unittest/_torch/modeling/test_modeling_cohere2.py b/tests/unittest/_torch/modeling/test_modeling_cohere2.py index 20c2e88fe69..1a839cacd6c 100644 --- a/tests/unittest/_torch/modeling/test_modeling_cohere2.py +++ b/tests/unittest/_torch/modeling/test_modeling_cohere2.py @@ -1,9 +1,9 @@ from copy import deepcopy import torch +from _torch.helpers import make_hf_hybrid_cache_for_tests from transformers import Cohere2Config from transformers import Cohere2ForCausalLM as HFCohere2ForCausalLM -from transformers.cache_utils import HybridCache import tensorrt_llm from tensorrt_llm._torch.attention_backend.utils import get_attention_backend @@ -161,8 +161,8 @@ def test_cohere2_allclose_to_hf(self) -> None: # Initialize the hugging face model hf_cohere2 = HFCohere2ForCausalLM(cohere2_config).to(dtype).to(device).eval() - hf_cache = HybridCache( - config=cohere2_config, + hf_cache = make_hf_hybrid_cache_for_tests( + cohere2_config, max_batch_size=batch_size, max_cache_len=10, device=device, diff --git a/tests/unittest/_torch/modeling/test_modeling_exaone4.py b/tests/unittest/_torch/modeling/test_modeling_exaone4.py index 931828be848..7ea88c93c57 100644 --- a/tests/unittest/_torch/modeling/test_modeling_exaone4.py +++ b/tests/unittest/_torch/modeling/test_modeling_exaone4.py @@ -25,8 +25,8 @@ class Exaone4Config(PretrainedConfig): # TODO: Remove this once we have a proper config for Exaone4 SKIP_EXAONE4_HF_ACCURACY_TEST = True -from _torch.helpers import create_mock_cuda_graph_runner -from transformers.cache_utils import HybridCache +from 
_torch.helpers import (create_mock_cuda_graph_runner, + make_hf_hybrid_cache_for_tests) from utils.util import getSMVersion import tensorrt_llm @@ -248,11 +248,13 @@ def test_exaone4_allclose_to_hf(self, scenario: Scenario) -> None: num_kv_heads = exaone4.config.num_key_value_heads max_seq_len = num_blocks * tokens_per_block batch_size = 1 - hf_cache = HybridCache(config=exaone4_config, - max_batch_size=batch_size, - max_cache_len=max_seq_len, - device=device, - dtype=dtype) + hf_cache = make_hf_hybrid_cache_for_tests( + exaone4_config, + max_batch_size=batch_size, + max_cache_len=max_seq_len, + device=device, + dtype=dtype, + ) if dtype == torch.half: kv_cache_dtype = tensorrt_llm.bindings.DataType.HALF elif dtype == torch.bfloat16: diff --git a/tests/unittest/_torch/modeling/test_modeling_gemma3.py b/tests/unittest/_torch/modeling/test_modeling_gemma3.py index 6b532b9b1c6..f252f4cbd2d 100644 --- a/tests/unittest/_torch/modeling/test_modeling_gemma3.py +++ b/tests/unittest/_torch/modeling/test_modeling_gemma3.py @@ -3,11 +3,11 @@ from dataclasses import dataclass import torch +from _torch.helpers import make_hf_hybrid_cache_for_tests from parameterized import parameterized from transformers import Gemma3Config from transformers import Gemma3ForCausalLM as HFGemma3ForCausalLM from transformers import Gemma3TextConfig -from transformers.cache_utils import HybridCache import tensorrt_llm from tensorrt_llm._torch.attention_backend import (AttentionMetadata, @@ -285,11 +285,13 @@ def test_gemma3_allclose_to_hf(self, scenario: Scenario) -> None: hf_gemma3 = HFGemma3ForCausalLM(gemma3_config).to(dtype).to( device).eval() - hf_cache = HybridCache(config=gemma3_config, - max_batch_size=batch_size, - max_cache_len=10, - device=device, - dtype=dtype) + hf_cache = make_hf_hybrid_cache_for_tests( + gemma3_config, + max_batch_size=batch_size, + max_cache_len=10, + device=device, + dtype=dtype, + ) model_config = ModelConfig(pretrained_config=gemma3_config, attn_backend=backend) 
diff --git a/tests/unittest/trt/attention/hf_dynamic_cache_compat.py b/tests/unittest/trt/attention/hf_dynamic_cache_compat.py new file mode 100644 index 00000000000..6866feb8ba6 --- /dev/null +++ b/tests/unittest/trt/attention/hf_dynamic_cache_compat.py @@ -0,0 +1,56 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""DynamicCache legacy tuple format for tests (removed from Transformers v5+).""" +
+from __future__ import annotations +
+from typing import List, Optional, Sequence, Tuple, Union +
+import torch +from transformers.cache_utils import DynamicCache +
+LegacyLayerKV = Tuple[torch.Tensor, torch.Tensor] +LegacyCache = Tuple[LegacyLayerKV, ...] 
+ + +def dynamic_cache_from_legacy( + past_key_values: Optional[Union[LegacyCache, Sequence[LegacyLayerKV]]], +) -> DynamicCache: + """Match pre-v5 ``DynamicCache.from_legacy_cache`` (see transformers v4.48 ``cache_utils``).""" + if past_key_values is None: + return DynamicCache() + if hasattr(DynamicCache, "from_legacy_cache"): + return DynamicCache.from_legacy_cache(past_key_values) + cache = DynamicCache() + for layer_idx in range(len(past_key_values)): + key_states, value_states = past_key_values[layer_idx] + cache.update(key_states, value_states, layer_idx) + return cache + + +def dynamic_cache_to_legacy(cache: DynamicCache) -> LegacyCache: + """Match pre-v5 ``DynamicCache.to_legacy_cache``.""" + if hasattr(cache, "to_legacy_cache"): + return cache.to_legacy_cache() + layers: List[LegacyLayerKV] = [] + for layer in cache.layers: + if not getattr(layer, "is_initialized", False): + continue + keys = layer.keys + values = layer.values + if keys is None or values is None: + continue + layers.append((keys, values)) + return tuple(layers) diff --git a/tests/unittest/trt/attention/test_gpt_attention.py b/tests/unittest/trt/attention/test_gpt_attention.py index 349bf6b752d..328ce022118 100644 --- a/tests/unittest/trt/attention/test_gpt_attention.py +++ b/tests/unittest/trt/attention/test_gpt_attention.py @@ -48,6 +48,8 @@ MemoryPoolsAllocator from tensorrt_llm.runtime.memory_pools.pools_kv_cache_manager import \ PoolsKVCacheManager +from tests.unittest.trt.attention.hf_dynamic_cache_compat import ( + dynamic_cache_from_legacy, dynamic_cache_to_legacy) class TestFunctional(unittest.TestCase): @@ -630,13 +632,19 @@ def _construct_execution( rope_scale_type = RotaryScalingType.none rope_scale = 1.0 if attention_type == "llama_attention": - rope_base = configuration.rope_theta - if configuration.rope_scaling is not None: + rope_params = getattr(configuration, 'rope_parameters', + None) or {} + rope_base = rope_params.get( + 'rope_theta', + getattr(configuration, 
'rope_theta', 10000.0)) + rope_type = rope_params.get( + 'rope_type', rope_params.get('type', 'default')) + if rope_type not in ('default', None): rope_scale_type = { "linear": RotaryScalingType.linear, "dynamic": RotaryScalingType.dynamic - }[configuration.rope_scaling["type"]] - rope_scale = configuration.rope_scaling["factor"] + }[rope_type] + rope_scale = rope_params.get("factor", 1.0) rotary_inv_freq, embed_positions_for_gpt_attention = RopeEmbeddingUtils.create_sinusoidal_positions_for_attention_plugin( configuration.max_position_embeddings, rotary_embedding_dim, rope_base, rope_scale) @@ -891,8 +899,17 @@ def _construct_execution( if attention_type == 'llama_attention': configuration.num_key_value_heads = num_kv_heads - configuration.rope_theta = rope_base - configuration.rope_scaling = rope_scaling + # In transformers 5.x, rope_theta/rope_scaling are unified into + # rope_parameters. Build the dict so LlamaRotaryEmbedding works. + if rope_scaling is not None: + rope_params = {**rope_scaling, "rope_theta": rope_base} + else: + rope_params = { + "rope_type": "default", + "rope_theta": rope_base, + } + configuration.rope_parameters = rope_params + configuration.rope_scaling = rope_params if rope_scaling is not None: # scaling is typically used for supporting longer seq lens than max_position_embeddings # so we set the max_position_embeddings to be smaller than total seq len @@ -1236,13 +1253,12 @@ def verify_kv_cache(torch_present): attention_packed_mask = None if attention_type == 'gpt2_attention': # gpt2 uses DynamicCache - torch_present = DynamicCache.from_legacy_cache( - torch_present) + torch_present = dynamic_cache_from_legacy(torch_present) torch_output = attention(input_tensor, past_key_value=torch_present, use_cache=True, attention_mask=attention_mask)[0] - torch_present = torch_present.to_legacy_cache() + torch_present = dynamic_cache_to_legacy(torch_present) elif attention_type == 'llama_attention': position_embeddings = rotary_emb(input_tensor, 
position_ids) attention_mask = attention_mask + AttentionMaskConverter._make_causal_mask( @@ -1257,7 +1273,7 @@ def verify_kv_cache(torch_present): position_embeddings=position_embeddings, attention_mask=attention_mask, use_cache=True)[0] - torch_present = torch_present.to_legacy_cache() + torch_present = dynamic_cache_to_legacy(torch_present) elif attention_type == 'gptj_attention': torch_present = DynamicCache() torch_output = attention(input_tensor, @@ -1265,7 +1281,7 @@ def verify_kv_cache(torch_present): position_ids=position_ids, attention_mask=attention_mask, use_cache=True)[0] - torch_present = torch_present.to_legacy_cache() + torch_present = dynamic_cache_to_legacy(torch_present) elif attention_type == 'gpt_bigcode_attention': attention_mask = _prepare_4d_attention_mask( ctx_attention_mask, @@ -1280,7 +1296,7 @@ def verify_kv_cache(torch_present): layer_past=torch_present, attention_mask=attention_mask, use_cache=True)[0] - torch_present = torch_present.to_legacy_cache() + torch_present = dynamic_cache_to_legacy(torch_present) else: raise RuntimeError("attention_type not properly set") @@ -1377,13 +1393,12 @@ def verify_kv_cache(torch_present): # torch execution if attention_type == 'gpt2_attention': # gpt2 uses DynamicCache - torch_present = DynamicCache.from_legacy_cache( - torch_present) + torch_present = dynamic_cache_from_legacy(torch_present) torch_output = attention(input_tensor, past_key_value=torch_present, use_cache=True, attention_mask=attention_mask)[0] - torch_present = torch_present.to_legacy_cache() + torch_present = dynamic_cache_to_legacy(torch_present) elif attention_type == 'llama_attention': position_embeddings = rotary_emb(input_tensor, position_ids) attention_mask = attention_mask + AttentionMaskConverter._make_causal_mask( @@ -1392,37 +1407,34 @@ def verify_kv_cache(torch_present): device='cuda', past_key_values_length=in_len + step - 1) # llama uses DynamicCache - torch_present = DynamicCache.from_legacy_cache( - torch_present) + 
torch_present = dynamic_cache_from_legacy(torch_present) torch_output = attention( input_tensor, past_key_value=torch_present, position_embeddings=position_embeddings, attention_mask=attention_mask, use_cache=True)[0] - torch_present = torch_present.to_legacy_cache() + torch_present = dynamic_cache_to_legacy(torch_present) elif attention_type == 'gptj_attention': - torch_present = DynamicCache.from_legacy_cache( - torch_present) + torch_present = dynamic_cache_from_legacy(torch_present) torch_output = attention(input_tensor, layer_past=torch_present, position_ids=position_ids, attention_mask=attention_mask, use_cache=True)[0] - torch_present = torch_present.to_legacy_cache() + torch_present = dynamic_cache_to_legacy(torch_present) elif attention_type == 'gpt_bigcode_attention': # target shape = (b, h, 1, s_key) key_seqlen = in_len + step # ctx_attention_mask.shape[1] attention_mask = (attention_mask >= 0).expand(batch_size, num_heads, 1, key_seqlen) - torch_present = DynamicCache.from_legacy_cache( - torch_present) + torch_present = dynamic_cache_from_legacy(torch_present) torch_output = attention(input_tensor, layer_past=torch_present, use_cache=True, attention_mask=attention_mask)[0] - torch_present = torch_present.to_legacy_cache() + torch_present = dynamic_cache_to_legacy(torch_present) def tile_beam_width(tensor: torch.Tensor, num_beams: int): if num_beams == 1: diff --git a/tests/unittest/trt/attention/test_gpt_attention_IFB.py b/tests/unittest/trt/attention/test_gpt_attention_IFB.py index cda9025a8b9..cacad8c35aa 100644 --- a/tests/unittest/trt/attention/test_gpt_attention_IFB.py +++ b/tests/unittest/trt/attention/test_gpt_attention_IFB.py @@ -26,7 +26,6 @@ from parameterized import parameterized from transformers import GPT2Config, GPTBigCodeConfig, GPTJConfig, LlamaConfig -from transformers.cache_utils import DynamicCache from transformers.modeling_attn_mask_utils import (AttentionMaskConverter, _prepare_4d_attention_mask) from 
transformers.models.gpt2.modeling_gpt2 import GPT2Attention @@ -51,6 +50,8 @@ MemoryPoolsAllocator from tensorrt_llm.runtime.memory_pools.pools_kv_cache_manager import \ PoolsKVCacheManager +from tests.unittest.trt.attention.hf_dynamic_cache_compat import ( + dynamic_cache_from_legacy, dynamic_cache_to_legacy) class TestFunctional(unittest.TestCase): @@ -381,13 +382,19 @@ def _construct_execution(session, rope_scale_type = RotaryScalingType.none rope_scale = 1.0 if attention_type == "llama_attention": - rope_base = configuration.rope_theta - if configuration.rope_scaling is not None: + rope_params = getattr(configuration, 'rope_parameters', + None) or {} + rope_base = rope_params.get( + 'rope_theta', + getattr(configuration, 'rope_theta', 10000.0)) + rope_type = rope_params.get( + 'rope_type', rope_params.get('type', 'default')) + if rope_type not in ('default', None): rope_scale_type = { "linear": RotaryScalingType.linear, "dynamic": RotaryScalingType.dynamic - }[configuration.rope_scaling["type"]] - rope_scale = configuration.rope_scaling["factor"] + }[rope_type] + rope_scale = rope_params.get("factor", 1.0) rotary_inv_freq, embed_positions_for_gpt_attention = RopeEmbeddingUtils.create_sinusoidal_positions_for_attention_plugin( configuration.max_position_embeddings, rotary_embedding_dim, rope_base, rope_scale) @@ -579,8 +586,17 @@ def _construct_execution(session, attn_implementation='eager') if attention_type == 'llama_attention': configuration.num_key_value_heads = num_kv_heads - configuration.rope_theta = rope_base - configuration.rope_scaling = rope_scaling + # In transformers 5.x, rope_theta/rope_scaling are unified into + # rope_parameters. Build the dict so LlamaRotaryEmbedding works. 
+ if rope_scaling is not None: + rope_params = {**rope_scaling, "rope_theta": rope_base} + else: + rope_params = { + "rope_type": "default", + "rope_theta": rope_base, + } + configuration.rope_parameters = rope_params + configuration.rope_scaling = rope_params if rope_scaling is not None: # scaling is typically used for supporting longer seq lens than max_position_embeddings # so we set the max_position_embeddings to be smaller than total seq len @@ -760,8 +776,7 @@ def torch_exec(step: int, tgt_len=(in_len if step == 0 else 1)) if attention_type == 'gpt2_attention': torch_output = attention(input, - past_key_value=layer_past, - use_cache=True, + past_key_values=layer_past, attention_mask=attention_mask)[0] torch_present = layer_past elif attention_type == 'llama_attention': @@ -774,10 +789,9 @@ def torch_exec(step: int, 1)) torch_output = attention( input, - past_key_value=layer_past, + past_key_values=layer_past, position_embeddings=position_embeddings, - attention_mask=attention_mask, - use_cache=True)[0] + attention_mask=attention_mask)[0] torch_present = layer_past elif attention_type == 'gptj_attention': torch_output, torch_present = attention( @@ -1010,7 +1024,7 @@ def torch_exec(step: int, (local_beam_width, input_length, hidden_size)) # llama/gpt2 uses DynamicCache - past_key_values = DynamicCache.from_legacy_cache( + past_key_values = dynamic_cache_from_legacy( torch_cache_list[req_idx]) torch_out, past_key_values = torch_exec( @@ -1018,7 +1032,8 @@ def torch_exec(step: int, past_key_values) # llama/gpt2 uses DynamicCache - torch_cache_list[req_idx] = past_key_values.to_legacy_cache() + torch_cache_list[req_idx] = dynamic_cache_to_legacy( + past_key_values) past_key_values = torch_cache_list[req_idx][0] if use_fp8_kv_cache or use_int8_kv_cache: diff --git a/tests/unittest/trt/model/test_phi.py b/tests/unittest/trt/model/test_phi.py index 9db18f4e46e..b3cf8d28f2f 100644 --- a/tests/unittest/trt/model/test_phi.py +++ b/tests/unittest/trt/model/test_phi.py 
@@ -24,6 +24,7 @@ import tensorrt_llm from tensorrt_llm import Builder +from tensorrt_llm._utils import get_hf_rope_theta from tensorrt_llm.models.phi.convert import load_weights_from_hf_model from tensorrt_llm.network import net_guard from tensorrt_llm.plugin.plugin import ContextFMHAType @@ -61,7 +62,7 @@ def initialize_network(self, network: tensorrt_llm.Network, hf_model, 'num_attention_heads': hf_config.num_key_value_heads, 'rotary_pct': hf_config.partial_rotary_factor, 'position_embedding_type': 'rope_gpt_neox', - 'rope_theta': hf_config.rope_theta, + 'rope_theta': get_hf_rope_theta(hf_config, 10000.0), 'hidden_size': hf_config.hidden_size, 'intermediate_size': hf_config.intermediate_size, 'vocab_size': hf_config.vocab_size, diff --git a/triton_backend/all_models/multimodal/multimodal_encoders/1/multimodal_utils.py b/triton_backend/all_models/multimodal/multimodal_encoders/1/multimodal_utils.py index 4caff0bbffc..85600193bfe 100644 --- a/triton_backend/all_models/multimodal/multimodal_encoders/1/multimodal_utils.py +++ b/triton_backend/all_models/multimodal/multimodal_encoders/1/multimodal_utils.py @@ -2,6 +2,8 @@ import torch +from tensorrt_llm._utils import get_hf_rope_theta + class LlavaOnevisionUtils: # https://github.com/huggingface/transformers/blob/main/src/transformers/models/llava_onevision/modeling_llava_onevision.py @@ -129,7 +131,7 @@ def __init__(self, config): self.max_position_embeddings = config.max_position_embeddings self.hidden_size = config.hidden_size self.num_attention_heads = config.num_attention_heads - self.rope_theta = config.rope_theta + self.rope_theta = get_hf_rope_theta(config, 10000.0) def get_rope_index( self,