Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions 3rdparty/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -108,11 +108,16 @@ FetchContent_Declare(
SOURCE_SUBDIR
dont-add-this-project-with-add-subdirectory)

set(_patch_file "${CMAKE_CURRENT_SOURCE_DIR}/patches/xgrammar_constexpr.patch")
FetchContent_Declare(
xgrammar
GIT_REPOSITORY https://github.com/mlc-ai/xgrammar
GIT_TAG v0.1.25 # e4e816f5f0fe39f5b1601a17a4552307fa3b70ff
GIT_TAG v0.1.32 # 62e13551b9b63251114894c5ee638564b160dd48
GIT_SHALLOW TRUE
# NOTE: TensorRT-LLM only uses the headers
SOURCE_SUBDIR
dont-add-this-project-with-add-subdirectory)
dont-add-this-project-with-add-subdirectory
PATCH_COMMAND
bash -c "patch -p1 --forward --batch --dry-run -i '${_patch_file}' && \
patch -p1 --forward --batch -i '${_patch_file}' || \
echo 'Patch already applied, skipping.'")
19 changes: 19 additions & 0 deletions 3rdparty/patches/xgrammar_constexpr.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
--- a/cpp/grammar_functor.cc
+++ b/cpp/grammar_functor.cc
@@ -1750,11 +1750,11 @@
void Apply(Grammar* grammar);
static std::optional<uint64_t> HashSequence(const Grammar& grammar, int32_t sequence_id);

- static const int16_t kNotEndStateFlag = -0x100;
- static const int16_t kEndStateFlag = -0x200;
- static const int16_t kSelfRecursionFlag = -0x300;
- static const int16_t kSimpleCycleFlag = -0x400;
- static const int16_t kUnKnownFlag = -0x500;
+ static constexpr int16_t kNotEndStateFlag = -0x100;
+ static constexpr int16_t kEndStateFlag = -0x200;
+ static constexpr int16_t kSelfRecursionFlag = -0x300;
+ static constexpr int16_t kSimpleCycleFlag = -0x400;
+ static constexpr int16_t kUnKnownFlag = -0x500;

private:
Grammar* grammar_;
4 changes: 4 additions & 0 deletions constraints.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,7 @@
urllib3>=2.6.3
# WAR against https://github.com/advisories/GHSA-8rrh-rw8j-w5fx
wheel>=0.46.2
# WAR against https://github.com/advisories/GHSA-7gcm-g887-7qv7
protobuf>=6.33.5
# WAR against https://github.com/advisories/GHSA-6mq8-rvhq-8wgg
aiohttp>=3.13.3
4 changes: 2 additions & 2 deletions jenkins/L0_Test.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -3614,9 +3614,9 @@ def launchTestJobs(pipeline, testFilter)
// Use internal mirror instead of https://download.pytorch.org/whl/cu130 for better network stability.
// PyTorch CUDA 13.0 package and torchvision package can be installed as expected.
if (k8s_arch == "amd64") {
trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.1+cu130 torchvision==0.24.1+cu130 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple")
trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.1+cu130 torchvision==0.24.1+cu130 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple --extra-index-url https://download.pytorch.org/whl/cu130")
} else {
trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.1+cu130 torchvision==0.24.1 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple")
trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.1+cu130 torchvision==0.24.1 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple --extra-index-url https://download.pytorch.org/whl/cu130")
}
}

Expand Down
4 changes: 2 additions & 2 deletions jenkins/current_image_tags.properties
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,5 @@ IMAGE_NAME=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm

LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.12-py3-x86_64-ubuntu24.04-trt10.14.1.48-skip-tritondevel-202602011118-10901
LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.12-py3-aarch64-ubuntu24.04-trt10.14.1.48-skip-tritondevel-202602011118-10901
LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py310-trt10.14.1.48-skip-tritondevel-202602011118-10901
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py312-trt10.14.1.48-skip-tritondevel-202602011118-10901
LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py310-trt10.14.1.48-skip-tritondevel-202603011156-11778
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py312-trt10.14.1.48-skip-tritondevel-202603011156-11778
4 changes: 2 additions & 2 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ opentelemetry-api>=1.26.0
opentelemetry-exporter-otlp>=1.26.0
opentelemetry-semantic-conventions-ai>=0.4.1
fuzzywuzzy==0.18.0
aiperf==0.3.0
aiperf==0.4.0
nanobind>=2.9.0
nixl==0.8.0
nixl==0.9.0
hf-transfer==0.1.9
line_profiler
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ nvidia-modelopt[torch]~=0.37.0
# torch 2.9.1+cu130 depends on nvidia-nccl-cu13==2.27.7
nvidia-nccl-cu13>=2.27.7,<=2.28.9
nvidia-cuda-nvrtc
transformers==4.57.1
transformers==4.57.3
prometheus_client
prometheus_fastapi_instrumentator
pydantic>=2.9.1
Expand All @@ -56,7 +56,7 @@ patchelf
einops
flashinfer-python==0.6.4
opencv-python-headless
xgrammar==0.1.25
xgrammar==0.1.32
llguidance==0.7.29
jsonschema
backoff
Expand Down
4 changes: 2 additions & 2 deletions security_scanning/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ dependencies = [
"peft (>=0.18.1,<0.19.0)",
"patchelf (>=0.17.2.4,<0.18.0.0)",
"einops (>=0.8.2,<0.9.0)",
"flashinfer-python (==0.6.4)",
"xgrammar (==0.1.25)",
"flashinfer-python (>=0.6.1,<0.7.0)",
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@thorjohnsen I thought we wanted to pin this to 0.6.4 because of some additional issues?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

security_scanning is auto-generated and will be cherry-picked; it is only used for NSPECT scanning, so this should not affect TRT-LLM itself

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"xgrammar (==0.1.32)",
"llguidance (==0.7.29)",
"jsonschema (>=4.26.0,<5.0.0)",
"backoff (>=2.2.1,<3.0.0)",
Expand Down
47 changes: 30 additions & 17 deletions tensorrt_llm/_torch/disaggregation/nixl/_agent_py.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from nixl import nixl_agent, nixl_agent_config, nixl_xfer_handle

from tensorrt_llm._utils import nvtx_range
from tensorrt_llm.logger import logger

# Import base classes for type compatibility
from ..base.agent import BaseTransferAgent, RegMemoryDescs, TransferRequest, TransferStatus
Expand Down Expand Up @@ -36,9 +37,11 @@ def wait(self, timeout_ms=None):
while status in (TransferState.PENDING, TransferState.PROCESSING):
status = TransferState(self.agent.check_xfer_state(self.handle))
if status == TransferState.ERROR:
return False # Transfer failed
logger.error("NIXL transfer entered ERROR state (agent=%s).", self.agent.name)
return False
if timeout is not None and (time.time() - start_time > timeout):
return False # Timeout
logger.warning("NIXL transfer wait timed out after %s ms.", timeout_ms)
return False
time.sleep(sleep_time)
sleep_time = min(sleep_time * 2, max_sleep_time)
return status == TransferState.DONE
Expand All @@ -61,23 +64,25 @@ def __init__(self, name: str, use_prog_thread: bool = True, num_threads: int = 1
)
self.agent = nixl_agent(name, agent_config)

def register_memory(self, descs: RegMemoryDescs):
def _get_validated_reg_descs(self, descs: RegMemoryDescs):
if not descs.descs:
raise ValueError("descs.descs must not be empty")
if isinstance(descs.descs[0], tuple):
assert len(descs.descs[0]) == 4, f"Expected 4 elements per desc, got {descs.descs[0]}"
if isinstance(descs.descs[0], tuple) and len(descs.descs[0]) != 4:
raise ValueError(
f"Expected 4 elements per desc, got {len(descs.descs[0])}: {descs.descs[0]}"
)
reg_descs = self.agent.get_reg_descs(descs.descs, descs.type)
assert reg_descs is not None, "Failed to get reg_descs"
self.agent.register_memory(reg_descs)
if reg_descs is None:
raise RuntimeError(
f"nixl get_reg_descs returned None for type={descs.type}, count={len(descs.descs)}"
)
return reg_descs

def register_memory(self, descs: RegMemoryDescs):
self.agent.register_memory(self._get_validated_reg_descs(descs))

def deregister_memory(self, descs: RegMemoryDescs):
if not descs.descs:
raise ValueError("descs.descs must not be empty")
if isinstance(descs.descs[0], tuple):
assert len(descs.descs[0]) == 4, f"Expected 4 elements per desc, got {descs.descs[0]}"
reg_descs = self.agent.get_reg_descs(descs.descs, descs.type)
assert reg_descs is not None, "Failed to get reg_descs"
self.agent.deregister_memory(reg_descs)
self.agent.deregister_memory(self._get_validated_reg_descs(descs))

def load_remote_agent(self, name: str, agent_desc: bytes):
self.agent.add_remote_agent(agent_desc)
Expand All @@ -97,9 +102,15 @@ def notify_sync_message(self, name: str, sync_message: str):
@nvtx_range("NixlTransferAgent.submit_transfer_requests")
def submit_transfer_requests(self, request: TransferRequest) -> TransferStatus:
src_xfer_descs = self.agent.get_xfer_descs(request.src_descs.descs, request.src_descs.type)
if src_xfer_descs is None:
raise RuntimeError(
f"nixl get_xfer_descs returned None for src type={request.src_descs.type}"
)
dst_xfer_descs = self.agent.get_xfer_descs(request.dst_descs.descs, request.dst_descs.type)
assert src_xfer_descs is not None, "Failed to get src_xfer_descs"
assert dst_xfer_descs is not None, "Failed to get dst_xfer_descs"
if dst_xfer_descs is None:
raise RuntimeError(
f"nixl get_xfer_descs returned None for dst type={request.dst_descs.type}"
)
sync_message = "" if request.sync_message is None else request.sync_message
handle = self.agent.initialize_xfer(
request.op,
Expand All @@ -110,5 +121,7 @@ def submit_transfer_requests(self, request: TransferRequest) -> TransferStatus:
)
status = self.agent.transfer(handle)
if status == "ERROR":
raise RuntimeError("NIXL transfer initialization failed.")
raise RuntimeError(
f"NIXL transfer failed: op={request.op}, remote={request.remote_name}"
)
return NixlTransferStatus(self.agent, handle)
30 changes: 22 additions & 8 deletions tensorrt_llm/_torch/disaggregation/nixl/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,31 +13,45 @@
"""


def _load_agent(module_name, required_attributes):
def _load_agent(
module_name: str, required_attributes: list[str]
) -> tuple[object, ImportError | None]:
try:
module = __import__(module_name, fromlist=required_attributes, level=0)
if all(hasattr(module, attr) for attr in required_attributes):
return module
return module, None
missing = [a for a in required_attributes if not hasattr(module, a)]
err = ImportError(f"Module {module_name} is missing required attributes: {missing}")
logger.warning("%s", err)
return None, err
except ImportError as e:
logger.info("Failed to import module: %s. Error: %s", module_name, str(e))
return None
logger.warning("Failed to import module: %s. Error: %s", module_name, str(e))
return None, e


NixlTransferStatus, NixlTransferAgent = None, None

if use_pure_python_transfer_agent():
_py_agent = _load_agent(
_py_agent, _py_agent_err = _load_agent(
module_name="tensorrt_llm._torch.disaggregation.nixl._agent_py",
required_attributes=["NixlTransferAgent", "NixlTransferStatus"],
)
assert _py_agent is not None, "Failed to load pure Python NIXL Transfer Agent."
if _py_agent is None:
raise ImportError(
"Failed to load pure Python NIXL Transfer Agent."
+ (f" Caused by: {_py_agent_err}" if _py_agent_err else "")
)
NixlTransferStatus = _py_agent.NixlTransferStatus
NixlTransferAgent = _py_agent.NixlTransferAgent
else:
_cpp_agent = _load_agent(
_cpp_agent, _cpp_agent_err = _load_agent(
module_name="tensorrt_llm._torch.disaggregation.nixl._agent_cpp",
required_attributes=["BindingsNixlTransferAgent", "BindingsNixlTransferStatus"],
)
assert _cpp_agent is not None, "Failed to load C++ NIXL Transfer Agent bindings."
if _cpp_agent is None:
raise ImportError(
"Failed to load C++ NIXL Transfer Agent bindings."
+ (f" Caused by: {_cpp_agent_err}" if _cpp_agent_err else "")
)
NixlTransferStatus = _cpp_agent.BindingsNixlTransferStatus
NixlTransferAgent = _cpp_agent.BindingsNixlTransferAgent
2 changes: 2 additions & 0 deletions tests/integration/test_lists/waives.txt
Original file line number Diff line number Diff line change
Expand Up @@ -386,3 +386,5 @@ perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_v32_fp4_grace_blackwell-
perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_v32_fp4_blackwell-v32_fp4_tep8_mtp3_8k1k] SKIP (https://nvbugspro.nvidia.com/bug/5919026)
perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_gpt-oss-120b-fp4_1k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX] SKIP (https://nvbugspro.nvidia.com/bug/5919026)
perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_gpt-oss-120b-fp4_1k1k_con2048_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX] SKIP (https://nvbugspro.nvidia.com/bug/5919026)
unittest/_torch/flashinfer/test_trtllm_flashinfer_symbol_collision.py::test_flashinfer_fused_moe_matches_torch_moe SKIP (https://nvbugs/5920779)
unittest/disaggregated/test_agent_multi_backends.py::test_run_with_different_env[1] SKIP (https://nvbugs/5979673)
15 changes: 12 additions & 3 deletions tests/unittest/_torch/modeling/test_modeling_siglip.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,19 @@ def test_siglip_vision_allclose_to_hf(self, scenario: Scenario):
attn_metadata=attn_metadata,
)

# Compare all hidden states
# Compare all hidden states.
# TRT-LLM applies post_layernorm to the last encoder hidden state
# (matching production usage), so the last element must be compared
# against HF's post_layernormed last_hidden_state rather than the
# raw hidden_states[-1].
num_states = len(tllm_outputs)
for i in range(num_states):
tllm_hs = tllm_outputs[i]
if i < num_states - 1:
hf_hs = hf_outputs.hidden_states[i]
else:
hf_hs = hf_outputs.last_hidden_state

for i, (hf_hs, tllm_hs) in enumerate(
zip(hf_outputs.hidden_states, tllm_outputs)):
self.assertEqual(hf_hs.shape, tllm_hs.shape,
f"Shape mismatch for hidden state {i}")

Expand Down
37 changes: 37 additions & 0 deletions tests/unittest/disaggregated/test_agent_multi_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,43 @@
import pytest


def test_load_agent_missing_module():
    """_load_agent yields (None, ImportError) when the target module does not exist.

    Regression test: a missing nixl package used to trigger an AssertionError
    during module import, so pytest aborted collection with exit code 2 instead
    of surfacing a clear ImportError with a descriptive message.
    """
    from tensorrt_llm._torch.disaggregation.nixl.agent import _load_agent

    bogus_module = "_trtllm_nonexistent_module_xyz_"
    module, error = _load_agent(bogus_module, ["SomeClass"])
    assert module is None
    assert isinstance(error,
                      ImportError), f"Expected ImportError, got {type(error)}: {error}"
    message = str(error)
    assert "No module named" in message or bogus_module in message


def test_load_agent_missing_attributes():
    """_load_agent reports (None, ImportError) and logs a warning when attributes are missing."""
    from tensorrt_llm._torch.disaggregation.nixl.agent import _load_agent

    # The stdlib 'os' module imports fine but defines no NixlTransferAgent.
    module, error = _load_agent("os", ["NixlTransferAgent"])
    assert module is None
    assert isinstance(error,
                      ImportError), f"Expected ImportError, got {type(error)}: {error}"
    assert "NixlTransferAgent" in str(error)


def test_load_agent_success():
    """_load_agent yields (module, None) when every required attribute resolves."""
    from tensorrt_llm._torch.disaggregation.nixl.agent import _load_agent

    module, error = _load_agent("os", ["path", "getcwd"])
    assert error is None
    assert module is not None
    for required_attr in ("path", "getcwd"):
        assert hasattr(module, required_attr)


@pytest.mark.parametrize("use_py_nixl", ["0", "1"])
def test_run_with_different_env(use_py_nixl):
os.environ["TRTLLM_USE_PY_NIXL_KVCACHE"] = use_py_nixl
Expand Down
2 changes: 1 addition & 1 deletion triton_backend/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
regex
fire
tritonclient[all]
transformers==4.57.1
transformers==4.57.3
pandas
tabulate
flash_attn
Expand Down
Loading