Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions 3rdparty/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -108,11 +108,16 @@ FetchContent_Declare(
SOURCE_SUBDIR
dont-add-this-project-with-add-subdirectory)

set(_patch_file "${CMAKE_CURRENT_SOURCE_DIR}/patches/xgrammar_constexpr.patch")
FetchContent_Declare(
xgrammar
GIT_REPOSITORY https://github.com/mlc-ai/xgrammar
GIT_TAG v0.1.25 # e4e816f5f0fe39f5b1601a17a4552307fa3b70ff
GIT_TAG v0.1.32 # 62e13551b9b63251114894c5ee638564b160dd48
GIT_SHALLOW TRUE
# NOTE: TensorRT-LLM only uses the headers
SOURCE_SUBDIR
dont-add-this-project-with-add-subdirectory)
dont-add-this-project-with-add-subdirectory
PATCH_COMMAND
bash -c "patch -p1 --forward --batch --dry-run -i '${_patch_file}' && \
patch -p1 --forward --batch -i '${_patch_file}' || \
echo 'Patch already applied, skipping.'")
19 changes: 19 additions & 0 deletions 3rdparty/patches/xgrammar_constexpr.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
--- a/cpp/grammar_functor.cc
+++ b/cpp/grammar_functor.cc
@@ -1750,11 +1750,11 @@
void Apply(Grammar* grammar);
static std::optional<uint64_t> HashSequence(const Grammar& grammar, int32_t sequence_id);

- static const int16_t kNotEndStateFlag = -0x100;
- static const int16_t kEndStateFlag = -0x200;
- static const int16_t kSelfRecursionFlag = -0x300;
- static const int16_t kSimpleCycleFlag = -0x400;
- static const int16_t kUnKnownFlag = -0x500;
+ static constexpr int16_t kNotEndStateFlag = -0x100;
+ static constexpr int16_t kEndStateFlag = -0x200;
+ static constexpr int16_t kSelfRecursionFlag = -0x300;
+ static constexpr int16_t kSimpleCycleFlag = -0x400;
+ static constexpr int16_t kUnKnownFlag = -0x500;

private:
Grammar* grammar_;
4 changes: 4 additions & 0 deletions constraints.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,7 @@
urllib3>=2.6.3
# WAR against https://github.com/advisories/GHSA-8rrh-rw8j-w5fx
wheel>=0.46.2
# WAR against https://github.com/advisories/GHSA-7gcm-g887-7qv7
protobuf>=6.33.5
# WAR against https://github.com/advisories/GHSA-6mq8-rvhq-8wgg
aiohttp>=3.13.3
4 changes: 2 additions & 2 deletions jenkins/L0_Test.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -3614,9 +3614,9 @@ def launchTestJobs(pipeline, testFilter)
// Use internal mirror instead of https://download.pytorch.org/whl/cu130 for better network stability.
// PyTorch CUDA 13.0 package and torchvision package can be installed as expected.
if (k8s_arch == "amd64") {
trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.1+cu130 torchvision==0.24.1+cu130 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple")
trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.1+cu130 torchvision==0.24.1+cu130 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple --extra-index-url https://download.pytorch.org/whl/cu130")
} else {
trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.1+cu130 torchvision==0.24.1 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple")
trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.1+cu130 torchvision==0.24.1 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple --extra-index-url https://download.pytorch.org/whl/cu130")
}
}

Expand Down
4 changes: 2 additions & 2 deletions jenkins/current_image_tags.properties
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,5 @@ IMAGE_NAME=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm

LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.12-py3-x86_64-ubuntu24.04-trt10.14.1.48-skip-tritondevel-202602011118-10901
LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.12-py3-aarch64-ubuntu24.04-trt10.14.1.48-skip-tritondevel-202602011118-10901
LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py310-trt10.14.1.48-skip-tritondevel-202602011118-10901
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py312-trt10.14.1.48-skip-tritondevel-202602011118-10901
LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py310-trt10.14.1.48-skip-tritondevel-202603011156-11778
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py312-trt10.14.1.48-skip-tritondevel-202603011156-11778
4 changes: 2 additions & 2 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ opentelemetry-api>=1.26.0
opentelemetry-exporter-otlp>=1.26.0
opentelemetry-semantic-conventions-ai>=0.4.1
fuzzywuzzy==0.18.0
aiperf==0.3.0
aiperf==0.4.0
nanobind>=2.9.0
nixl==0.8.0
nixl==0.9.0
hf-transfer==0.1.9
line_profiler
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ nvidia-modelopt[torch]~=0.37.0
# torch 2.9.1+cu130 depends on nvidia-nccl-cu13==2.27.7
nvidia-nccl-cu13>=2.27.7,<=2.28.9
nvidia-cuda-nvrtc
transformers==4.57.1
transformers==4.57.3
prometheus_client
prometheus_fastapi_instrumentator
pydantic>=2.9.1
Expand All @@ -56,7 +56,7 @@ patchelf
einops
flashinfer-python==0.6.4
opencv-python-headless
xgrammar==0.1.25
xgrammar==0.1.32
llguidance==0.7.29
jsonschema
backoff
Expand Down
4 changes: 2 additions & 2 deletions security_scanning/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ dependencies = [
"peft (>=0.18.1,<0.19.0)",
"patchelf (>=0.17.2.4,<0.18.0.0)",
"einops (>=0.8.2,<0.9.0)",
"flashinfer-python (==0.6.4)",
"xgrammar (==0.1.25)",
"flashinfer-python (>=0.6.1,<0.7.0)",
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@thorjohnsen I thought we wanted to pin this to 0.6.4 because of some additional issues?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

security_scanning is auto-generated and will be cherry-picked; it is only used for NSPECT scanning, so this should not affect TRT-LLM itself

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"xgrammar (==0.1.32)",
"llguidance (==0.7.29)",
"jsonschema (>=4.26.0,<5.0.0)",
"backoff (>=2.2.1,<3.0.0)",
Expand Down
47 changes: 30 additions & 17 deletions tensorrt_llm/_torch/disaggregation/nixl/_agent_py.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from nixl import nixl_agent, nixl_agent_config, nixl_xfer_handle

from tensorrt_llm._utils import nvtx_range
from tensorrt_llm.logger import logger

# Import base classes for type compatibility
from ..base.agent import BaseTransferAgent, RegMemoryDescs, TransferRequest, TransferStatus
Expand Down Expand Up @@ -36,9 +37,11 @@ def wait(self, timeout_ms=None):
while status in (TransferState.PENDING, TransferState.PROCESSING):
status = TransferState(self.agent.check_xfer_state(self.handle))
if status == TransferState.ERROR:
return False # Transfer failed
logger.error("NIXL transfer entered ERROR state (agent=%s).", self.agent.name)
return False
if timeout is not None and (time.time() - start_time > timeout):
return False # Timeout
logger.warning("NIXL transfer wait timed out after %s ms.", timeout_ms)
return False
time.sleep(sleep_time)
sleep_time = min(sleep_time * 2, max_sleep_time)
return status == TransferState.DONE
Expand All @@ -61,23 +64,25 @@ def __init__(self, name: str, use_prog_thread: bool = True, num_threads: int = 1
)
self.agent = nixl_agent(name, agent_config)

def register_memory(self, descs: RegMemoryDescs):
def _get_validated_reg_descs(self, descs: RegMemoryDescs):
if not descs.descs:
raise ValueError("descs.descs must not be empty")
if isinstance(descs.descs[0], tuple):
assert len(descs.descs[0]) == 4, f"Expected 4 elements per desc, got {descs.descs[0]}"
if isinstance(descs.descs[0], tuple) and len(descs.descs[0]) != 4:
raise ValueError(
f"Expected 4 elements per desc, got {len(descs.descs[0])}: {descs.descs[0]}"
)
reg_descs = self.agent.get_reg_descs(descs.descs, descs.type)
assert reg_descs is not None, "Failed to get reg_descs"
self.agent.register_memory(reg_descs)
if reg_descs is None:
raise RuntimeError(
f"nixl get_reg_descs returned None for type={descs.type}, count={len(descs.descs)}"
)
return reg_descs

def register_memory(self, descs: RegMemoryDescs):
self.agent.register_memory(self._get_validated_reg_descs(descs))

def deregister_memory(self, descs: RegMemoryDescs):
if not descs.descs:
raise ValueError("descs.descs must not be empty")
if isinstance(descs.descs[0], tuple):
assert len(descs.descs[0]) == 4, f"Expected 4 elements per desc, got {descs.descs[0]}"
reg_descs = self.agent.get_reg_descs(descs.descs, descs.type)
assert reg_descs is not None, "Failed to get reg_descs"
self.agent.deregister_memory(reg_descs)
self.agent.deregister_memory(self._get_validated_reg_descs(descs))

def load_remote_agent(self, name: str, agent_desc: bytes):
self.agent.add_remote_agent(agent_desc)
Expand All @@ -97,9 +102,15 @@ def notify_sync_message(self, name: str, sync_message: str):
@nvtx_range("NixlTransferAgent.submit_transfer_requests")
def submit_transfer_requests(self, request: TransferRequest) -> TransferStatus:
src_xfer_descs = self.agent.get_xfer_descs(request.src_descs.descs, request.src_descs.type)
if src_xfer_descs is None:
raise RuntimeError(
f"nixl get_xfer_descs returned None for src type={request.src_descs.type}"
)
dst_xfer_descs = self.agent.get_xfer_descs(request.dst_descs.descs, request.dst_descs.type)
assert src_xfer_descs is not None, "Failed to get src_xfer_descs"
assert dst_xfer_descs is not None, "Failed to get dst_xfer_descs"
if dst_xfer_descs is None:
raise RuntimeError(
f"nixl get_xfer_descs returned None for dst type={request.dst_descs.type}"
)
sync_message = "" if request.sync_message is None else request.sync_message
handle = self.agent.initialize_xfer(
request.op,
Expand All @@ -110,5 +121,7 @@ def submit_transfer_requests(self, request: TransferRequest) -> TransferStatus:
)
status = self.agent.transfer(handle)
if status == "ERROR":
raise RuntimeError("NIXL transfer initialization failed.")
raise RuntimeError(
f"NIXL transfer failed: op={request.op}, remote={request.remote_name}"
)
return NixlTransferStatus(self.agent, handle)
30 changes: 22 additions & 8 deletions tensorrt_llm/_torch/disaggregation/nixl/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,31 +13,45 @@
"""


def _load_agent(module_name, required_attributes):
def _load_agent(
module_name: str, required_attributes: list[str]
) -> tuple[object, ImportError | None]:
try:
module = __import__(module_name, fromlist=required_attributes, level=0)
if all(hasattr(module, attr) for attr in required_attributes):
return module
return module, None
missing = [a for a in required_attributes if not hasattr(module, a)]
err = ImportError(f"Module {module_name} is missing required attributes: {missing}")
logger.warning("%s", err)
return None, err
except ImportError as e:
logger.info("Failed to import module: %s. Error: %s", module_name, str(e))
return None
logger.warning("Failed to import module: %s. Error: %s", module_name, str(e))
return None, e


NixlTransferStatus, NixlTransferAgent = None, None

if use_pure_python_transfer_agent():
_py_agent = _load_agent(
_py_agent, _py_agent_err = _load_agent(
module_name="tensorrt_llm._torch.disaggregation.nixl._agent_py",
required_attributes=["NixlTransferAgent", "NixlTransferStatus"],
)
assert _py_agent is not None, "Failed to load pure Python NIXL Transfer Agent."
if _py_agent is None:
raise ImportError(
"Failed to load pure Python NIXL Transfer Agent."
+ (f" Caused by: {_py_agent_err}" if _py_agent_err else "")
)
NixlTransferStatus = _py_agent.NixlTransferStatus
NixlTransferAgent = _py_agent.NixlTransferAgent
else:
_cpp_agent = _load_agent(
_cpp_agent, _cpp_agent_err = _load_agent(
module_name="tensorrt_llm._torch.disaggregation.nixl._agent_cpp",
required_attributes=["BindingsNixlTransferAgent", "BindingsNixlTransferStatus"],
)
assert _cpp_agent is not None, "Failed to load C++ NIXL Transfer Agent bindings."
if _cpp_agent is None:
raise ImportError(
"Failed to load C++ NIXL Transfer Agent bindings."
+ (f" Caused by: {_cpp_agent_err}" if _cpp_agent_err else "")
)
NixlTransferStatus = _cpp_agent.BindingsNixlTransferStatus
NixlTransferAgent = _cpp_agent.BindingsNixlTransferAgent
2 changes: 2 additions & 0 deletions tests/integration/test_lists/waives.txt
Original file line number Diff line number Diff line change
Expand Up @@ -386,3 +386,5 @@ perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_v32_fp4_grace_blackwell-
perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_v32_fp4_blackwell-v32_fp4_tep8_mtp3_8k1k] SKIP (https://nvbugspro.nvidia.com/bug/5919026)
perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_gpt-oss-120b-fp4_1k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX] SKIP (https://nvbugspro.nvidia.com/bug/5919026)
perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_gpt-oss-120b-fp4_1k1k_con2048_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX] SKIP (https://nvbugspro.nvidia.com/bug/5919026)
unittest/_torch/flashinfer/test_trtllm_flashinfer_symbol_collision.py::test_flashinfer_fused_moe_matches_torch_moe SKIP (https://nvbugs/5920779)
unittest/disaggregated/test_agent_multi_backends.py::test_run_with_different_env[1] SKIP (https://nvbugs/5979673)
15 changes: 12 additions & 3 deletions tests/unittest/_torch/modeling/test_modeling_siglip.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,19 @@ def test_siglip_vision_allclose_to_hf(self, scenario: Scenario):
attn_metadata=attn_metadata,
)

# Compare all hidden states
# Compare all hidden states.
# TRT-LLM applies post_layernorm to the last encoder hidden state
# (matching production usage), so the last element must be compared
# against HF's post_layernormed last_hidden_state rather than the
# raw hidden_states[-1].
num_states = len(tllm_outputs)
for i in range(num_states):
tllm_hs = tllm_outputs[i]
if i < num_states - 1:
hf_hs = hf_outputs.hidden_states[i]
else:
hf_hs = hf_outputs.last_hidden_state

for i, (hf_hs, tllm_hs) in enumerate(
zip(hf_outputs.hidden_states, tllm_outputs)):
self.assertEqual(hf_hs.shape, tllm_hs.shape,
f"Shape mismatch for hidden state {i}")

Expand Down
37 changes: 37 additions & 0 deletions tests/unittest/disaggregated/test_agent_multi_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,43 @@
import pytest


def test_load_agent_missing_module():
    """_load_agent yields (None, ImportError) when the target module does not exist.

    Regression test: a missing nixl package used to trigger an AssertionError
    during module import, so pytest aborted collection with exit code 2 instead
    of surfacing a clear ImportError with a descriptive message.
    """
    from tensorrt_llm._torch.disaggregation.nixl.agent import _load_agent

    bogus_module = "_trtllm_nonexistent_module_xyz_"
    module, error = _load_agent(bogus_module, ["SomeClass"])
    assert module is None
    assert isinstance(error,
                      ImportError), f"Expected ImportError, got {type(error)}: {error}"
    message = str(error)
    assert "No module named" in message or bogus_module in message


def test_load_agent_missing_attributes():
    """_load_agent reports (None, ImportError) and logs a warning when attributes are missing."""
    from tensorrt_llm._torch.disaggregation.nixl.agent import _load_agent

    # The stdlib 'os' module imports fine but defines no NixlTransferAgent.
    module, error = _load_agent("os", ["NixlTransferAgent"])
    assert module is None
    assert isinstance(error,
                      ImportError), f"Expected ImportError, got {type(error)}: {error}"
    assert "NixlTransferAgent" in str(error)


def test_load_agent_success():
    """_load_agent yields (module, None) when every required attribute resolves."""
    from tensorrt_llm._torch.disaggregation.nixl.agent import _load_agent

    module, error = _load_agent("os", ["path", "getcwd"])
    assert error is None
    assert module is not None
    for required_attr in ("path", "getcwd"):
        assert hasattr(module, required_attr)


@pytest.mark.parametrize("use_py_nixl", ["0", "1"])
def test_run_with_different_env(use_py_nixl):
os.environ["TRTLLM_USE_PY_NIXL_KVCACHE"] = use_py_nixl
Expand Down
2 changes: 1 addition & 1 deletion triton_backend/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
regex
fire
tritonclient[all]
transformers==4.57.1
transformers==4.57.3
pandas
tabulate
flash_attn
Expand Down
Loading