diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt index 93565ae099b..fd627e5a104 100644 --- a/3rdparty/CMakeLists.txt +++ b/3rdparty/CMakeLists.txt @@ -108,11 +108,16 @@ FetchContent_Declare( SOURCE_SUBDIR dont-add-this-project-with-add-subdirectory) +set(_patch_file "${CMAKE_CURRENT_SOURCE_DIR}/patches/xgrammar_constexpr.patch") FetchContent_Declare( xgrammar GIT_REPOSITORY https://github.com/mlc-ai/xgrammar - GIT_TAG v0.1.25 # e4e816f5f0fe39f5b1601a17a4552307fa3b70ff + GIT_TAG v0.1.32 # 62e13551b9b63251114894c5ee638564b160dd48 GIT_SHALLOW TRUE # NOTE: TensorRT-LLM only uses the headers SOURCE_SUBDIR - dont-add-this-project-with-add-subdirectory) + dont-add-this-project-with-add-subdirectory + PATCH_COMMAND + bash -c "patch -p1 --forward --batch --dry-run -i '${_patch_file}' >/dev/null && \ + exec patch -p1 --forward --batch -i '${_patch_file}' || \ + patch -p1 -R --batch --dry-run -i '${_patch_file}' >/dev/null && echo 'Patch already applied, skipping.'") diff --git a/3rdparty/patches/xgrammar_constexpr.patch b/3rdparty/patches/xgrammar_constexpr.patch new file mode 100644 index 00000000000..f4e5be99a21 --- /dev/null +++ b/3rdparty/patches/xgrammar_constexpr.patch @@ -0,0 +1,19 @@ +--- a/cpp/grammar_functor.cc ++++ b/cpp/grammar_functor.cc +@@ -1750,11 +1750,11 @@ + void Apply(Grammar* grammar); + static std::optional HashSequence(const Grammar& grammar, int32_t sequence_id); + +- static const int16_t kNotEndStateFlag = -0x100; +- static const int16_t kEndStateFlag = -0x200; +- static const int16_t kSelfRecursionFlag = -0x300; +- static const int16_t kSimpleCycleFlag = -0x400; +- static const int16_t kUnKnownFlag = -0x500; ++ static constexpr int16_t kNotEndStateFlag = -0x100; ++ static constexpr int16_t kEndStateFlag = -0x200; ++ static constexpr int16_t kSelfRecursionFlag = -0x300; ++ static constexpr int16_t kSimpleCycleFlag = -0x400; ++ static constexpr int16_t kUnKnownFlag = -0x500; + + private: + Grammar* grammar_; diff --git a/constraints.txt b/constraints.txt index 3586deaf81d..17a66eff253 100644 --- a/constraints.txt +++ 
b/constraints.txt @@ -4,3 +4,11 @@ urllib3>=2.6.3 # WAR against https://github.com/advisories/GHSA-8rrh-rw8j-w5fx wheel>=0.46.2 +# WAR against https://github.com/advisories/GHSA-7gcm-g887-7qv7 +protobuf>=6.33.5 +# WAR against https://github.com/advisories/GHSA-6mq8-rvhq-8wgg +aiohttp>=3.13.3 +# WAR against https://github.com/advisories/GHSA-qjxf-f2mg-c6mc +tornado>=6.5.5 +# WAR against https://github.com/advisories/GHSA-3936-cmfr-pm3m +black>=26.3.1 diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi index f5d7915b02e..c2a2b851338 100644 --- a/docker/Dockerfile.multi +++ b/docker/Dockerfile.multi @@ -76,7 +76,8 @@ COPY constraints.txt /tmp/constraints.txt RUN pip3 install --no-cache-dir -r /tmp/constraints.txt && rm /tmp/constraints.txt # Remove nbconvert to avoid https://github.com/advisories/GHSA-xm59-rqc7-hhvf in the base NGC PyTorch image. -RUN pip3 uninstall -y nbconvert || true +# Remove pillow to avoid https://github.com/advisories/GHSA-cfh3-3jmp-rvhc in the base NGC PyTorch image. 
+RUN pip3 uninstall -y nbconvert pillow || true # Install UCX, NIXL, etcd # TODO: Combine these into the main install.sh script diff --git a/docker/common/install_etcd.sh b/docker/common/install_etcd.sh index a598daacc85..663650dff43 100644 --- a/docker/common/install_etcd.sh +++ b/docker/common/install_etcd.sh @@ -2,7 +2,7 @@ set -ex -ETCD_VER=v3.5.21 +ETCD_VER=v3.6.9 # choose either URL DOWNLOAD_URL=https://storage.googleapis.com/etcd diff --git a/examples/models/core/gemma/requirements.txt b/examples/models/core/gemma/requirements.txt index 20f8719a379..a1bbed25b68 100644 --- a/examples/models/core/gemma/requirements.txt +++ b/examples/models/core/gemma/requirements.txt @@ -5,7 +5,6 @@ nvidia-cudnn-cu12~=8.9; platform_machine == "x86_64" tensorrt_llm>=0.0.0.dev0 flax~=0.8.0 -numpy<2 # jax[cuda12_pip]~=0.4.19 safetensors~=0.4.1 sentencepiece>=0.1.99 diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index ba1440547fa..f5b0f7b4bd7 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -3614,9 +3614,9 @@ def launchTestJobs(pipeline, testFilter) // Use internal mirror instead of https://download.pytorch.org/whl/cu130 for better network stability. // PyTorch CUDA 13.0 package and torchvision package can be installed as expected. 
if (k8s_arch == "amd64") { - trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.1+cu130 torchvision==0.24.1+cu130 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple") + trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.1+cu130 torchvision==0.24.1+cu130 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple --extra-index-url https://download.pytorch.org/whl/cu130") } else { - trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.1+cu130 torchvision==0.24.1 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple") + trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.1+cu130 torchvision==0.24.1 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple --extra-index-url https://download.pytorch.org/whl/cu130") } } diff --git a/jenkins/current_image_tags.properties b/jenkins/current_image_tags.properties index da06d20e717..02c35d1f960 100644 --- a/jenkins/current_image_tags.properties +++ b/jenkins/current_image_tags.properties @@ -13,7 +13,7 @@ # images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead. 
IMAGE_NAME=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm -LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.12-py3-x86_64-ubuntu24.04-trt10.14.1.48-skip-tritondevel-202602011118-10901 -LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.12-py3-aarch64-ubuntu24.04-trt10.14.1.48-skip-tritondevel-202602011118-10901 -LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py310-trt10.14.1.48-skip-tritondevel-202602011118-10901 -LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py312-trt10.14.1.48-skip-tritondevel-202602011118-10901 +LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.12-py3-x86_64-ubuntu24.04-trt10.14.1.48-skip-tritondevel-202604100821-12812 +LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.12-py3-sbsa-ubuntu24.04-trt10.14.1.48-skip-tritondevel-202604100821-12812 +LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py310-trt10.14.1.48-skip-tritondevel-202604100821-12812 +LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py312-trt10.14.1.48-skip-tritondevel-202604100821-12812 diff --git a/requirements-dev.txt b/requirements-dev.txt index eae33fa6c92..bcdbb8a8fb8 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -36,8 +36,8 @@ opentelemetry-api>=1.26.0 opentelemetry-exporter-otlp>=1.26.0 opentelemetry-semantic-conventions-ai>=0.4.1 fuzzywuzzy==0.18.0 -aiperf==0.3.0 +aiperf==0.6.0 nanobind>=2.9.0 -nixl==0.8.0 +nixl==0.9.0 hf-transfer==0.1.9 line_profiler diff --git a/requirements.txt b/requirements.txt index 1b0455b72d3..3f90ba50c5a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ cuda-python>=13 diffusers>=0.27.0 lark mpi4py -numpy<2 
+numpy>=2.0.0,<2.4 # numba 0.63.1 requires numpy<2.4 onnx>=1.18.0,<1.20.0 onnx_graphsurgeon>=0.5.2 onnxscript==0.5.4 @@ -30,7 +30,7 @@ nvidia-modelopt[torch]~=0.37.0 # torch 2.9.1+cu130 depends on nvidia-nccl-cu13==2.27.7 nvidia-nccl-cu13>=2.27.7,<=2.28.9 nvidia-cuda-nvrtc -transformers==4.57.1 +transformers==4.57.3 prometheus_client prometheus_fastapi_instrumentator pydantic>=2.9.1 @@ -56,7 +56,7 @@ patchelf einops flashinfer-python==0.6.4 opencv-python-headless -xgrammar==0.1.25 +xgrammar==0.1.32 llguidance==0.7.29 jsonschema backoff @@ -72,7 +72,7 @@ blobfile openai-harmony==0.0.4 nvidia-cutlass-dsl==4.3.4; python_version >= "3.10" plotly -numexpr<2.14.0 # WAR for attempted use of nonexistent numpy.typing +numexpr partial_json_parser apache-tvm-ffi==0.1.6 # used for reduce nvidia-cutlass-dsl host overhead torch-c-dlpack-ext==0.1.3 # used for reduce nvidia-cutlass-dsl host overhead, optional package for improved torch tensor calling perf diff --git a/security_scanning/pyproject.toml b/security_scanning/pyproject.toml index e7820420f1c..a55e0205f66 100644 --- a/security_scanning/pyproject.toml +++ b/security_scanning/pyproject.toml @@ -56,8 +56,8 @@ dependencies = [ "peft (>=0.18.1,<0.19.0)", "patchelf (>=0.17.2.4,<0.18.0.0)", "einops (>=0.8.2,<0.9.0)", - "flashinfer-python (==0.6.4)", - "xgrammar (==0.1.25)", + "flashinfer-python (>=0.6.1,<0.7.0)", + "xgrammar (==0.1.32)", "llguidance (==0.7.29)", "jsonschema (>=4.26.0,<5.0.0)", "backoff (>=2.2.1,<3.0.0)", diff --git a/tensorrt_llm/_torch/disaggregation/nixl/_agent_py.py b/tensorrt_llm/_torch/disaggregation/nixl/_agent_py.py index cd01f3024a9..dcfa28210f8 100644 --- a/tensorrt_llm/_torch/disaggregation/nixl/_agent_py.py +++ b/tensorrt_llm/_torch/disaggregation/nixl/_agent_py.py @@ -4,6 +4,7 @@ from nixl import nixl_agent, nixl_agent_config, nixl_xfer_handle from tensorrt_llm._utils import nvtx_range +from tensorrt_llm.logger import logger # Import base classes for type compatibility from ..base.agent import 
BaseTransferAgent, RegMemoryDescs, TransferRequest, TransferStatus @@ -36,9 +37,11 @@ def wait(self, timeout_ms=None): while status in (TransferState.PENDING, TransferState.PROCESSING): status = TransferState(self.agent.check_xfer_state(self.handle)) if status == TransferState.ERROR: - return False # Transfer failed + logger.error("NIXL transfer entered ERROR state (agent=%s).", self.agent.name) + return False if timeout is not None and (time.time() - start_time > timeout): - return False # Timeout + logger.warning("NIXL transfer wait timed out after %s ms.", timeout_ms) + return False time.sleep(sleep_time) sleep_time = min(sleep_time * 2, max_sleep_time) return status == TransferState.DONE @@ -61,23 +64,25 @@ def __init__(self, name: str, use_prog_thread: bool = True, num_threads: int = 1 ) self.agent = nixl_agent(name, agent_config) - def register_memory(self, descs: RegMemoryDescs): + def _get_validated_reg_descs(self, descs: RegMemoryDescs): if not descs.descs: raise ValueError("descs.descs must not be empty") - if isinstance(descs.descs[0], tuple): - assert len(descs.descs[0]) == 4, f"Expected 4 elements per desc, got {descs.descs[0]}" + if isinstance(descs.descs[0], tuple) and len(descs.descs[0]) != 4: + raise ValueError( + f"Expected 4 elements per desc, got {len(descs.descs[0])}: {descs.descs[0]}" + ) reg_descs = self.agent.get_reg_descs(descs.descs, descs.type) - assert reg_descs is not None, "Failed to get reg_descs" - self.agent.register_memory(reg_descs) + if reg_descs is None: + raise RuntimeError( + f"nixl get_reg_descs returned None for type={descs.type}, count={len(descs.descs)}" + ) + return reg_descs + + def register_memory(self, descs: RegMemoryDescs): + self.agent.register_memory(self._get_validated_reg_descs(descs)) def deregister_memory(self, descs: RegMemoryDescs): - if not descs.descs: - raise ValueError("descs.descs must not be empty") - if isinstance(descs.descs[0], tuple): - assert len(descs.descs[0]) == 4, f"Expected 4 elements per 
desc, got {descs.descs[0]}" - reg_descs = self.agent.get_reg_descs(descs.descs, descs.type) - assert reg_descs is not None, "Failed to get reg_descs" - self.agent.deregister_memory(reg_descs) + self.agent.deregister_memory(self._get_validated_reg_descs(descs)) def load_remote_agent(self, name: str, agent_desc: bytes): self.agent.add_remote_agent(agent_desc) @@ -97,9 +102,15 @@ def notify_sync_message(self, name: str, sync_message: str): @nvtx_range("NixlTransferAgent.submit_transfer_requests") def submit_transfer_requests(self, request: TransferRequest) -> TransferStatus: src_xfer_descs = self.agent.get_xfer_descs(request.src_descs.descs, request.src_descs.type) + if src_xfer_descs is None: + raise RuntimeError( + f"nixl get_xfer_descs returned None for src type={request.src_descs.type}" + ) dst_xfer_descs = self.agent.get_xfer_descs(request.dst_descs.descs, request.dst_descs.type) - assert src_xfer_descs is not None, "Failed to get src_xfer_descs" - assert dst_xfer_descs is not None, "Failed to get dst_xfer_descs" + if dst_xfer_descs is None: + raise RuntimeError( + f"nixl get_xfer_descs returned None for dst type={request.dst_descs.type}" + ) sync_message = "" if request.sync_message is None else request.sync_message handle = self.agent.initialize_xfer( request.op, @@ -110,5 +121,7 @@ def submit_transfer_requests(self, request: TransferRequest) -> TransferStatus: ) status = self.agent.transfer(handle) if status == "ERROR": - raise RuntimeError("NIXL transfer initialization failed.") + raise RuntimeError( + f"NIXL transfer failed: op={request.op}, remote={request.remote_name}" + ) return NixlTransferStatus(self.agent, handle) diff --git a/tensorrt_llm/_torch/disaggregation/nixl/agent.py b/tensorrt_llm/_torch/disaggregation/nixl/agent.py index 5f6f3db1547..8be3c7c5775 100644 --- a/tensorrt_llm/_torch/disaggregation/nixl/agent.py +++ b/tensorrt_llm/_torch/disaggregation/nixl/agent.py @@ -13,31 +13,45 @@ """ -def _load_agent(module_name, required_attributes): +def 
_load_agent( + module_name: str, required_attributes: list[str] +) -> tuple[object, ImportError | None]: try: module = __import__(module_name, fromlist=required_attributes, level=0) if all(hasattr(module, attr) for attr in required_attributes): - return module + return module, None + missing = [a for a in required_attributes if not hasattr(module, a)] + err = ImportError(f"Module {module_name} is missing required attributes: {missing}") + logger.warning("%s", err) + return None, err except ImportError as e: - logger.info("Failed to import module: %s. Error: %s", module_name, str(e)) - return None + logger.warning("Failed to import module: %s. Error: %s", module_name, str(e)) + return None, e NixlTransferStatus, NixlTransferAgent = None, None if use_pure_python_transfer_agent(): - _py_agent = _load_agent( + _py_agent, _py_agent_err = _load_agent( module_name="tensorrt_llm._torch.disaggregation.nixl._agent_py", required_attributes=["NixlTransferAgent", "NixlTransferStatus"], ) - assert _py_agent is not None, "Failed to load pure Python NIXL Transfer Agent." + if _py_agent is None: + raise ImportError( + "Failed to load pure Python NIXL Transfer Agent." + + (f" Caused by: {_py_agent_err}" if _py_agent_err else "") + ) NixlTransferStatus = _py_agent.NixlTransferStatus NixlTransferAgent = _py_agent.NixlTransferAgent else: - _cpp_agent = _load_agent( + _cpp_agent, _cpp_agent_err = _load_agent( module_name="tensorrt_llm._torch.disaggregation.nixl._agent_cpp", required_attributes=["BindingsNixlTransferAgent", "BindingsNixlTransferStatus"], ) - assert _cpp_agent is not None, "Failed to load C++ NIXL Transfer Agent bindings." + if _cpp_agent is None: + raise ImportError( + "Failed to load C++ NIXL Transfer Agent bindings." 
+ + (f" Caused by: {_cpp_agent_err}" if _cpp_agent_err else "") + ) NixlTransferStatus = _cpp_agent.BindingsNixlTransferStatus NixlTransferAgent = _cpp_agent.BindingsNixlTransferAgent diff --git a/tensorrt_llm/functional.py b/tensorrt_llm/functional.py index f341d752206..981d3b13695 100755 --- a/tensorrt_llm/functional.py +++ b/tensorrt_llm/functional.py @@ -33,7 +33,7 @@ fp16_array, fp32_array, get_sm_version, int32_array, int64_array, np_dtype_to_trt, str_dtype_to_trt, trt_dtype_to_np, trt_dtype_to_str) -from .network import PluginInfo, set_np_weight, set_plugin_info +from .network import PluginInfo, get_np_weight, set_np_weight, set_plugin_info from .plugin import TRT_LLM_PLUGIN_NAMESPACE, current_all_reduce_helper from .quantization import QuantMode @@ -3543,6 +3543,24 @@ def avg_pool2d(input: Tensor, return output +def _get_trt_weight(weight: Tensor) -> Tuple[trt.Weights, bool]: + is_weight_constant = (weight.producer is not None + and weight.producer.type == trt.LayerType.CONSTANT) + if is_weight_constant: + ndarray = get_np_weight(default_trtnet(), weight.producer.name) + if ndarray is not None: + trt_weight = trt.Weights(np_dtype_to_trt(ndarray.dtype), + ndarray.ctypes.data, + int(np.prod(ndarray.shape))) + else: + weight.producer.__class__ = trt.IConstantLayer + trt_weight = weight.producer.weights + else: + trt_weight = trt.Weights() + + return trt_weight, is_weight_constant + + def conv1d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, @@ -3553,30 +3571,32 @@ def conv1d(input: Tensor, noutput = weight.size()[0] kernel_size = weight.size()[-2] - is_weight_constant = (weight.producer is not None - and weight.producer.type == trt.LayerType.CONSTANT) - weight = weight.producer.weights if is_weight_constant else trt.Weights() + kernel_shape = trt.Dims([kernel_size, 1]) + + trt_weight, is_weight_constant = _get_trt_weight(weight) + weight_tensor = weight if bias is not None: - is_bias_constant = (bias.producer is not None - and bias.producer.type 
== trt.LayerType.CONSTANT) - bias = bias.producer.weights if is_bias_constant else trt.Weights() + bias_tensor = bias + trt_bias, is_bias_constant = _get_trt_weight(bias) + else: + bias_tensor = None + trt_bias = None input_shuffled = stack([input], dim=input.ndim()) - kernel_size = trt.Dims([kernel_size, 1]) layer = default_trtnet().add_convolution_nd(input_shuffled.trt_tensor, - noutput, kernel_size, weight, - bias) + noutput, kernel_shape, + trt_weight, trt_bias) layer.stride_nd = (stride, 2) layer.padding_nd = (padding, 0) layer.dilation_nd = (dilation, 2) layer.num_groups = groups if not is_weight_constant: - layer.set_input(1, weight.trt_tensor) - if bias is not None and not is_bias_constant: - layer.set_input(2, bias.trt_tensor) + layer.set_input(1, weight_tensor.trt_tensor) + if bias_tensor is not None and not is_bias_constant: + layer.set_input(2, bias_tensor.trt_tensor) output_2d = _create_tensor(layer.get_output(0), layer) output_1d = squeeze(output_2d, dim=-1) @@ -3602,18 +3622,21 @@ def conv2d(input: Tensor, noutput = weight.size()[0] kernel_size = (weight.size()[-2], weight.size()[-1]) + kernel_shape = trt.Dims(list(kernel_size)) - is_weight_constant = (weight.producer is not None - and weight.producer.type == trt.LayerType.CONSTANT) - weight = weight.producer.weights if is_weight_constant else trt.Weights() + trt_weight, is_weight_constant = _get_trt_weight(weight) + weight_tensor = weight if bias is not None: - is_bias_constant = (bias.producer is not None - and bias.producer.type == trt.LayerType.CONSTANT) - bias = bias.producer.weights if is_bias_constant else trt.Weights() + bias_tensor = bias + trt_bias, is_bias_constant = _get_trt_weight(bias) + else: + bias_tensor = None + trt_bias = None layer = default_trtnet().add_convolution_nd(input.trt_tensor, noutput, - kernel_size, weight, bias) + kernel_shape, trt_weight, + trt_bias) layer.stride_nd = stride layer.padding_nd = padding layer.dilation_nd = dilation @@ -3625,9 +3648,9 @@ def 
conv2d(input: Tensor, layer.post_padding = post_padding if not is_weight_constant: - layer.set_input(1, weight.trt_tensor) - if bias is not None and not is_bias_constant: - layer.set_input(2, bias.trt_tensor) + layer.set_input(1, weight_tensor.trt_tensor) + if bias_tensor is not None and not is_bias_constant: + layer.set_input(2, bias_tensor.trt_tensor) output = _create_tensor(layer.get_output(0), layer) @@ -3666,18 +3689,21 @@ def conv3d(input: Tensor, noutput = weight.size()[0] kernel_size = (weight.size()[-3], weight.size()[-2], weight.size()[-1]) + kernel_shape = trt.Dims(list(kernel_size)) - is_weight_constant = (weight.producer is not None - and weight.producer.type == trt.LayerType.CONSTANT) - weight = weight.producer.weights if is_weight_constant else trt.Weights() + trt_weight, is_weight_constant = _get_trt_weight(weight) + weight_tensor = weight if bias is not None: - is_bias_constant = (bias.producer is not None - and bias.producer.type == trt.LayerType.CONSTANT) - bias = bias.producer.weights if is_bias_constant else trt.Weights() + bias_tensor = bias + trt_bias, is_bias_constant = _get_trt_weight(bias) + else: + bias_tensor = None + trt_bias = None layer = default_trtnet().add_convolution_nd(input.trt_tensor, noutput, - kernel_size, weight, bias) + kernel_shape, trt_weight, + trt_bias) layer.stride_nd = stride layer.padding_nd = padding layer.dilation_nd = dilation @@ -3685,9 +3711,9 @@ def conv3d(input: Tensor, layer.dilation_nd = dilation if not is_weight_constant: - layer.set_input(1, weight.trt_tensor) - if bias is not None and not is_bias_constant: - layer.set_input(2, bias.trt_tensor) + layer.set_input(1, weight_tensor.trt_tensor) + if bias_tensor is not None and not is_bias_constant: + layer.set_input(2, bias_tensor.trt_tensor) output = _create_tensor(layer.get_output(0), layer) return output @@ -3713,26 +3739,29 @@ def conv_transpose2d(input: Tensor, noutput = weight.size()[1] kernel_size = (weight.size()[-2], weight.size()[-1]) + kernel_shape 
= trt.Dims(list(kernel_size)) - is_weight_constant = (weight.producer is not None - and weight.producer.type == trt.LayerType.CONSTANT) - weight = weight.producer.weights if is_weight_constant else trt.Weights() + trt_weight, is_weight_constant = _get_trt_weight(weight) + weight_tensor = weight if bias is not None: - is_bias_constant = (bias.producer is not None - and bias.producer.type == trt.LayerType.CONSTANT) - bias = bias.producer.weights if is_bias_constant else trt.Weights() + bias_tensor = bias + trt_bias, is_bias_constant = _get_trt_weight(bias) + else: + bias_tensor = None + trt_bias = None layer = default_trtnet().add_deconvolution_nd(input.trt_tensor, noutput, - kernel_size, weight, bias) + kernel_shape, trt_weight, + trt_bias) layer.stride_nd = stride layer.padding_nd = padding layer.num_groups = groups if not is_weight_constant: - layer.set_input(1, weight.trt_tensor) - if bias is not None and not is_bias_constant: - layer.set_input(2, bias.trt_tensor) + layer.set_input(1, weight_tensor.trt_tensor) + if bias_tensor is not None and not is_bias_constant: + layer.set_input(2, bias_tensor.trt_tensor) output = _create_tensor(layer.get_output(0), layer) diff --git a/tensorrt_llm/parameter.py b/tensorrt_llm/parameter.py index 7859eff9365..d740cbb0bdb 100644 --- a/tensorrt_llm/parameter.py +++ b/tensorrt_llm/parameter.py @@ -243,16 +243,23 @@ def set_value_or_dummy(self, v: Union[np.ndarray, torch.Tensor]): self.value = v - def set_name(self, name: str, network): + def set_name(self, name: str, network: Network): self._name = name if self.is_managed(network): self._get_weights(network).name = name return True else: - return network.trt_network.set_weights_name( - self._get_weights(network), name) - - def _get_weights(self, network) -> trt.Weights | Tensor | None: + weights = self._get_weights(network) + # TensorRT bindings may return numpy array instead of trt.Weights + if isinstance(weights, np.ndarray): + trt_dtype = np_dtype_to_trt( + weights.dtype + ) if 
weights.dtype != np.object_ else self._dtype + trt_count = int(np.prod(weights.shape)) + weights = trt.Weights(trt_dtype, weights.ctypes.data, trt_count) + return network.trt_network.set_weights_name(weights, name) + + def _get_weights(self, network: Network) -> trt.Weights | Tensor | None: tensor = network.get_parameter_tensor(self) if self.is_managed(network): return tensor diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 78ce434542f..c9a84e658de 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -386,3 +386,5 @@ perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_v32_fp4_grace_blackwell- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_v32_fp4_blackwell-v32_fp4_tep8_mtp3_8k1k] SKIP (https://nvbugspro.nvidia.com/bug/5919026) perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_gpt-oss-120b-fp4_1k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX] SKIP (https://nvbugspro.nvidia.com/bug/5919026) perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_gpt-oss-120b-fp4_1k1k_con2048_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX] SKIP (https://nvbugspro.nvidia.com/bug/5919026) +unittest/_torch/flashinfer/test_trtllm_flashinfer_symbol_collision.py::test_flashinfer_fused_moe_matches_torch_moe SKIP (https://nvbugs/5920779) +unittest/disaggregated/test_agent_multi_backends.py::test_run_with_different_env[1] SKIP (https://nvbugs/5979673) diff --git a/tests/unittest/_torch/modeling/test_modeling_siglip.py b/tests/unittest/_torch/modeling/test_modeling_siglip.py index 40a7dd1399b..e9b6788f538 100644 --- a/tests/unittest/_torch/modeling/test_modeling_siglip.py +++ b/tests/unittest/_torch/modeling/test_modeling_siglip.py @@ -135,10 +135,19 @@ def test_siglip_vision_allclose_to_hf(self, scenario: Scenario): attn_metadata=attn_metadata, ) - # Compare all hidden states + # Compare all hidden states. 
+ # TRT-LLM applies post_layernorm to the last encoder hidden state + # (matching production usage), so the last element must be compared + # against HF's post_layernormed last_hidden_state rather than the + # raw hidden_states[-1]. + num_states = len(tllm_outputs) + for i in range(num_states): + tllm_hs = tllm_outputs[i] + if i < num_states - 1: + hf_hs = hf_outputs.hidden_states[i] + else: + hf_hs = hf_outputs.last_hidden_state - for i, (hf_hs, tllm_hs) in enumerate( - zip(hf_outputs.hidden_states, tllm_outputs)): self.assertEqual(hf_hs.shape, tllm_hs.shape, f"Shape mismatch for hidden state {i}") diff --git a/tests/unittest/disaggregated/test_agent_multi_backends.py b/tests/unittest/disaggregated/test_agent_multi_backends.py index 0a95bad03bc..5bf1da73d34 100644 --- a/tests/unittest/disaggregated/test_agent_multi_backends.py +++ b/tests/unittest/disaggregated/test_agent_multi_backends.py @@ -4,6 +4,43 @@ import pytest +def test_load_agent_missing_module(): + """_load_agent returns (None, ImportError) for a non-existent module. + + Regression test: previously a missing nixl package caused an AssertionError + at module import time, making pytest exit with code 2 (collection failure) + instead of a clear ImportError with a descriptive message. 
+ """ + from tensorrt_llm._torch.disaggregation.nixl.agent import _load_agent + + agent, err = _load_agent("_trtllm_nonexistent_module_xyz_", ["SomeClass"]) + assert agent is None + assert isinstance(err, ImportError), f"Expected ImportError, got {type(err)}: {err}" + assert "No module named" in str(err) or "_trtllm_nonexistent_module_xyz_" in str(err) + + +def test_load_agent_missing_attributes(): + """_load_agent returns (None, ImportError) and logs a warning when attributes are missing.""" + from tensorrt_llm._torch.disaggregation.nixl.agent import _load_agent + + # 'os' exists but has no NixlTransferAgent attribute + agent, err = _load_agent("os", ["NixlTransferAgent"]) + assert agent is None + assert isinstance(err, ImportError), f"Expected ImportError, got {type(err)}: {err}" + assert "NixlTransferAgent" in str(err) + + +def test_load_agent_success(): + """_load_agent returns (module, None) on success.""" + from tensorrt_llm._torch.disaggregation.nixl.agent import _load_agent + + agent, err = _load_agent("os", ["path", "getcwd"]) + assert agent is not None + assert err is None + assert hasattr(agent, "path") + assert hasattr(agent, "getcwd") + + @pytest.mark.parametrize("use_py_nixl", ["0", "1"]) def test_run_with_different_env(use_py_nixl): os.environ["TRTLLM_USE_PY_NIXL_KVCACHE"] = use_py_nixl diff --git a/triton_backend/requirements.txt b/triton_backend/requirements.txt index 7daa868ed48..4375447772c 100644 --- a/triton_backend/requirements.txt +++ b/triton_backend/requirements.txt @@ -1,7 +1,7 @@ regex fire tritonclient[all] -transformers==4.57.1 +transformers==4.57.3 pandas tabulate flash_attn