diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt index 93565ae099b..fd627e5a104 100644 --- a/3rdparty/CMakeLists.txt +++ b/3rdparty/CMakeLists.txt @@ -108,11 +108,16 @@ FetchContent_Declare( SOURCE_SUBDIR dont-add-this-project-with-add-subdirectory) +set(_patch_file "${CMAKE_CURRENT_SOURCE_DIR}/patches/xgrammar_constexpr.patch") FetchContent_Declare( xgrammar GIT_REPOSITORY https://github.com/mlc-ai/xgrammar - GIT_TAG v0.1.25 # e4e816f5f0fe39f5b1601a17a4552307fa3b70ff + GIT_TAG v0.1.32 # 62e13551b9b63251114894c5ee638564b160dd48 GIT_SHALLOW TRUE # NOTE: TensorRT-LLM only uses the headers SOURCE_SUBDIR - dont-add-this-project-with-add-subdirectory) + dont-add-this-project-with-add-subdirectory + PATCH_COMMAND + bash -c "patch -p1 --forward --batch --dry-run -i '${_patch_file}' >/dev/null && \ + exec patch -p1 --forward --batch -i '${_patch_file}' || \ + patch -p1 -R --batch --dry-run -i '${_patch_file}' >/dev/null && echo 'Patch already applied, skipping.'") diff --git a/3rdparty/patches/xgrammar_constexpr.patch b/3rdparty/patches/xgrammar_constexpr.patch new file mode 100644 index 00000000000..f4e5be99a21 --- /dev/null +++ b/3rdparty/patches/xgrammar_constexpr.patch @@ -0,0 +1,19 @@ +--- a/cpp/grammar_functor.cc ++++ b/cpp/grammar_functor.cc +@@ -1750,11 +1750,11 @@ + void Apply(Grammar* grammar); + static std::optional HashSequence(const Grammar& grammar, int32_t sequence_id); + +- static const int16_t kNotEndStateFlag = -0x100; +- static const int16_t kEndStateFlag = -0x200; +- static const int16_t kSelfRecursionFlag = -0x300; +- static const int16_t kSimpleCycleFlag = -0x400; +- static const int16_t kUnKnownFlag = -0x500; ++ static constexpr int16_t kNotEndStateFlag = -0x100; ++ static constexpr int16_t kEndStateFlag = -0x200; ++ static constexpr int16_t kSelfRecursionFlag = -0x300; ++ static constexpr int16_t kSimpleCycleFlag = -0x400; ++ static constexpr int16_t kUnKnownFlag = -0x500; + + private: + Grammar* grammar_; diff --git a/constraints.txt b/constraints.txt index 3586deaf81d..17a66eff253 100644 --- a/constraints.txt +++ 
b/constraints.txt @@ -4,3 +4,11 @@ urllib3>=2.6.3 # WAR against https://github.com/advisories/GHSA-8rrh-rw8j-w5fx wheel>=0.46.2 +# WAR against https://github.com/advisories/GHSA-7gcm-g887-7qv7 +protobuf>=6.33.5 +# WAR against https://github.com/advisories/GHSA-6mq8-rvhq-8wgg +aiohttp>=3.13.3 +# WAR against https://github.com/advisories/GHSA-qjxf-f2mg-c6mc +tornado>=6.5.5 +# WAR against https://github.com/advisories/GHSA-3936-cmfr-pm3m +black>=26.3.1 diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi index f5d7915b02e..c2a2b851338 100644 --- a/docker/Dockerfile.multi +++ b/docker/Dockerfile.multi @@ -76,7 +76,8 @@ COPY constraints.txt /tmp/constraints.txt RUN pip3 install --no-cache-dir -r /tmp/constraints.txt && rm /tmp/constraints.txt # Remove nbconvert to avoid https://github.com/advisories/GHSA-xm59-rqc7-hhvf in the base NGC PyTorch image. -RUN pip3 uninstall -y nbconvert || true +# Remove pillow to avoid https://github.com/advisories/GHSA-cfh3-3jmp-rvhc in the base NGC PyTorch image. 
+RUN pip3 uninstall -y nbconvert pillow || true # Install UCX, NIXL, etcd # TODO: Combine these into the main install.sh script diff --git a/docker/common/install_etcd.sh b/docker/common/install_etcd.sh index a598daacc85..663650dff43 100644 --- a/docker/common/install_etcd.sh +++ b/docker/common/install_etcd.sh @@ -2,7 +2,7 @@ set -ex -ETCD_VER=v3.5.21 +ETCD_VER=v3.6.9 # choose either URL DOWNLOAD_URL=https://storage.googleapis.com/etcd diff --git a/examples/models/core/gemma/requirements.txt b/examples/models/core/gemma/requirements.txt index 20f8719a379..a1bbed25b68 100644 --- a/examples/models/core/gemma/requirements.txt +++ b/examples/models/core/gemma/requirements.txt @@ -5,7 +5,6 @@ nvidia-cudnn-cu12~=8.9; platform_machine == "x86_64" tensorrt_llm>=0.0.0.dev0 flax~=0.8.0 -numpy<2 # jax[cuda12_pip]~=0.4.19 safetensors~=0.4.1 sentencepiece>=0.1.99 diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index ba1440547fa..f5b0f7b4bd7 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -3614,9 +3614,9 @@ def launchTestJobs(pipeline, testFilter) // Use internal mirror instead of https://download.pytorch.org/whl/cu130 for better network stability. // PyTorch CUDA 13.0 package and torchvision package can be installed as expected. 
if (k8s_arch == "amd64") { - trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.1+cu130 torchvision==0.24.1+cu130 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple") + trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.1+cu130 torchvision==0.24.1+cu130 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple --extra-index-url https://download.pytorch.org/whl/cu130") } else { - trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.1+cu130 torchvision==0.24.1 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple") + trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.1+cu130 torchvision==0.24.1 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple --extra-index-url https://download.pytorch.org/whl/cu130") } } diff --git a/jenkins/current_image_tags.properties b/jenkins/current_image_tags.properties index da06d20e717..02c35d1f960 100644 --- a/jenkins/current_image_tags.properties +++ b/jenkins/current_image_tags.properties @@ -13,7 +13,7 @@ # images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead. 
IMAGE_NAME=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm -LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.12-py3-x86_64-ubuntu24.04-trt10.14.1.48-skip-tritondevel-202602011118-10901 -LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.12-py3-aarch64-ubuntu24.04-trt10.14.1.48-skip-tritondevel-202602011118-10901 -LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py310-trt10.14.1.48-skip-tritondevel-202602011118-10901 -LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py312-trt10.14.1.48-skip-tritondevel-202602011118-10901 +LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.12-py3-x86_64-ubuntu24.04-trt10.14.1.48-skip-tritondevel-202604100821-12812 +LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.12-py3-sbsa-ubuntu24.04-trt10.14.1.48-skip-tritondevel-202604100821-12812 +LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py310-trt10.14.1.48-skip-tritondevel-202604100821-12812 +LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py312-trt10.14.1.48-skip-tritondevel-202604100821-12812 diff --git a/requirements-dev.txt b/requirements-dev.txt index eae33fa6c92..bcdbb8a8fb8 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -36,8 +36,8 @@ opentelemetry-api>=1.26.0 opentelemetry-exporter-otlp>=1.26.0 opentelemetry-semantic-conventions-ai>=0.4.1 fuzzywuzzy==0.18.0 -aiperf==0.3.0 +aiperf==0.6.0 nanobind>=2.9.0 -nixl==0.8.0 +nixl==0.9.0 hf-transfer==0.1.9 line_profiler diff --git a/requirements.txt b/requirements.txt index 1b0455b72d3..3f90ba50c5a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ cuda-python>=13 diffusers>=0.27.0 lark mpi4py -numpy<2 
+numpy>=2.0.0,<2.4 # numba 0.63.1 requires numpy<2.4 onnx>=1.18.0,<1.20.0 onnx_graphsurgeon>=0.5.2 onnxscript==0.5.4 @@ -30,7 +30,7 @@ nvidia-modelopt[torch]~=0.37.0 # torch 2.9.1+cu130 depends on nvidia-nccl-cu13==2.27.7 nvidia-nccl-cu13>=2.27.7,<=2.28.9 nvidia-cuda-nvrtc -transformers==4.57.1 +transformers==4.57.3 prometheus_client prometheus_fastapi_instrumentator pydantic>=2.9.1 @@ -56,7 +56,7 @@ patchelf einops flashinfer-python==0.6.4 opencv-python-headless -xgrammar==0.1.25 +xgrammar==0.1.32 llguidance==0.7.29 jsonschema backoff @@ -72,7 +72,7 @@ blobfile openai-harmony==0.0.4 nvidia-cutlass-dsl==4.3.4; python_version >= "3.10" plotly -numexpr<2.14.0 # WAR for attempted use of nonexistent numpy.typing +numexpr partial_json_parser apache-tvm-ffi==0.1.6 # used for reduce nvidia-cutlass-dsl host overhead torch-c-dlpack-ext==0.1.3 # used for reduce nvidia-cutlass-dsl host overhead, optional package for improved torch tensor calling perf diff --git a/security_scanning/pyproject.toml b/security_scanning/pyproject.toml index e7820420f1c..a55e0205f66 100644 --- a/security_scanning/pyproject.toml +++ b/security_scanning/pyproject.toml @@ -56,8 +56,8 @@ dependencies = [ "peft (>=0.18.1,<0.19.0)", "patchelf (>=0.17.2.4,<0.18.0.0)", "einops (>=0.8.2,<0.9.0)", - "flashinfer-python (==0.6.4)", - "xgrammar (==0.1.25)", + "flashinfer-python (>=0.6.1,<0.7.0)", + "xgrammar (==0.1.32)", "llguidance (==0.7.29)", "jsonschema (>=4.26.0,<5.0.0)", "backoff (>=2.2.1,<3.0.0)", diff --git a/tensorrt_llm/_torch/disaggregation/nixl/_agent_py.py b/tensorrt_llm/_torch/disaggregation/nixl/_agent_py.py index cd01f3024a9..dcfa28210f8 100644 --- a/tensorrt_llm/_torch/disaggregation/nixl/_agent_py.py +++ b/tensorrt_llm/_torch/disaggregation/nixl/_agent_py.py @@ -4,6 +4,7 @@ from nixl import nixl_agent, nixl_agent_config, nixl_xfer_handle from tensorrt_llm._utils import nvtx_range +from tensorrt_llm.logger import logger # Import base classes for type compatibility from ..base.agent import 
BaseTransferAgent, RegMemoryDescs, TransferRequest, TransferStatus @@ -36,9 +37,11 @@ def wait(self, timeout_ms=None): while status in (TransferState.PENDING, TransferState.PROCESSING): status = TransferState(self.agent.check_xfer_state(self.handle)) if status == TransferState.ERROR: - return False # Transfer failed + logger.error("NIXL transfer entered ERROR state (agent=%s).", self.agent.name) + return False if timeout is not None and (time.time() - start_time > timeout): - return False # Timeout + logger.warning("NIXL transfer wait timed out after %s ms.", timeout_ms) + return False time.sleep(sleep_time) sleep_time = min(sleep_time * 2, max_sleep_time) return status == TransferState.DONE @@ -61,23 +64,25 @@ def __init__(self, name: str, use_prog_thread: bool = True, num_threads: int = 1 ) self.agent = nixl_agent(name, agent_config) - def register_memory(self, descs: RegMemoryDescs): + def _get_validated_reg_descs(self, descs: RegMemoryDescs): if not descs.descs: raise ValueError("descs.descs must not be empty") - if isinstance(descs.descs[0], tuple): - assert len(descs.descs[0]) == 4, f"Expected 4 elements per desc, got {descs.descs[0]}" + if isinstance(descs.descs[0], tuple) and len(descs.descs[0]) != 4: + raise ValueError( + f"Expected 4 elements per desc, got {len(descs.descs[0])}: {descs.descs[0]}" + ) reg_descs = self.agent.get_reg_descs(descs.descs, descs.type) - assert reg_descs is not None, "Failed to get reg_descs" - self.agent.register_memory(reg_descs) + if reg_descs is None: + raise RuntimeError( + f"nixl get_reg_descs returned None for type={descs.type}, count={len(descs.descs)}" + ) + return reg_descs + + def register_memory(self, descs: RegMemoryDescs): + self.agent.register_memory(self._get_validated_reg_descs(descs)) def deregister_memory(self, descs: RegMemoryDescs): - if not descs.descs: - raise ValueError("descs.descs must not be empty") - if isinstance(descs.descs[0], tuple): - assert len(descs.descs[0]) == 4, f"Expected 4 elements per 
desc, got {descs.descs[0]}" - reg_descs = self.agent.get_reg_descs(descs.descs, descs.type) - assert reg_descs is not None, "Failed to get reg_descs" - self.agent.deregister_memory(reg_descs) + self.agent.deregister_memory(self._get_validated_reg_descs(descs)) def load_remote_agent(self, name: str, agent_desc: bytes): self.agent.add_remote_agent(agent_desc) @@ -97,9 +102,15 @@ def notify_sync_message(self, name: str, sync_message: str): @nvtx_range("NixlTransferAgent.submit_transfer_requests") def submit_transfer_requests(self, request: TransferRequest) -> TransferStatus: src_xfer_descs = self.agent.get_xfer_descs(request.src_descs.descs, request.src_descs.type) + if src_xfer_descs is None: + raise RuntimeError( + f"nixl get_xfer_descs returned None for src type={request.src_descs.type}" + ) dst_xfer_descs = self.agent.get_xfer_descs(request.dst_descs.descs, request.dst_descs.type) - assert src_xfer_descs is not None, "Failed to get src_xfer_descs" - assert dst_xfer_descs is not None, "Failed to get dst_xfer_descs" + if dst_xfer_descs is None: + raise RuntimeError( + f"nixl get_xfer_descs returned None for dst type={request.dst_descs.type}" + ) sync_message = "" if request.sync_message is None else request.sync_message handle = self.agent.initialize_xfer( request.op, @@ -110,5 +121,7 @@ def submit_transfer_requests(self, request: TransferRequest) -> TransferStatus: ) status = self.agent.transfer(handle) if status == "ERROR": - raise RuntimeError("NIXL transfer initialization failed.") + raise RuntimeError( + f"NIXL transfer failed: op={request.op}, remote={request.remote_name}" + ) return NixlTransferStatus(self.agent, handle) diff --git a/tensorrt_llm/_torch/disaggregation/nixl/agent.py b/tensorrt_llm/_torch/disaggregation/nixl/agent.py index 5f6f3db1547..8be3c7c5775 100644 --- a/tensorrt_llm/_torch/disaggregation/nixl/agent.py +++ b/tensorrt_llm/_torch/disaggregation/nixl/agent.py @@ -13,31 +13,45 @@ """ -def _load_agent(module_name, required_attributes): +def 
_load_agent( + module_name: str, required_attributes: list[str] +) -> tuple[object, ImportError | None]: try: module = __import__(module_name, fromlist=required_attributes, level=0) if all(hasattr(module, attr) for attr in required_attributes): - return module + return module, None + missing = [a for a in required_attributes if not hasattr(module, a)] + err = ImportError(f"Module {module_name} is missing required attributes: {missing}") + logger.warning("%s", err) + return None, err except ImportError as e: - logger.info("Failed to import module: %s. Error: %s", module_name, str(e)) - return None + logger.warning("Failed to import module: %s. Error: %s", module_name, str(e)) + return None, e NixlTransferStatus, NixlTransferAgent = None, None if use_pure_python_transfer_agent(): - _py_agent = _load_agent( + _py_agent, _py_agent_err = _load_agent( module_name="tensorrt_llm._torch.disaggregation.nixl._agent_py", required_attributes=["NixlTransferAgent", "NixlTransferStatus"], ) - assert _py_agent is not None, "Failed to load pure Python NIXL Transfer Agent." + if _py_agent is None: + raise ImportError( + "Failed to load pure Python NIXL Transfer Agent." + + (f" Caused by: {_py_agent_err}" if _py_agent_err else "") + ) NixlTransferStatus = _py_agent.NixlTransferStatus NixlTransferAgent = _py_agent.NixlTransferAgent else: - _cpp_agent = _load_agent( + _cpp_agent, _cpp_agent_err = _load_agent( module_name="tensorrt_llm._torch.disaggregation.nixl._agent_cpp", required_attributes=["BindingsNixlTransferAgent", "BindingsNixlTransferStatus"], ) - assert _cpp_agent is not None, "Failed to load C++ NIXL Transfer Agent bindings." + if _cpp_agent is None: + raise ImportError( + "Failed to load C++ NIXL Transfer Agent bindings." 
+ + (f" Caused by: {_cpp_agent_err}" if _cpp_agent_err else "") + ) NixlTransferStatus = _cpp_agent.BindingsNixlTransferStatus NixlTransferAgent = _cpp_agent.BindingsNixlTransferAgent diff --git a/tensorrt_llm/functional.py b/tensorrt_llm/functional.py index f341d752206..981d3b13695 100755 --- a/tensorrt_llm/functional.py +++ b/tensorrt_llm/functional.py @@ -33,7 +33,7 @@ fp16_array, fp32_array, get_sm_version, int32_array, int64_array, np_dtype_to_trt, str_dtype_to_trt, trt_dtype_to_np, trt_dtype_to_str) -from .network import PluginInfo, set_np_weight, set_plugin_info +from .network import PluginInfo, get_np_weight, set_np_weight, set_plugin_info from .plugin import TRT_LLM_PLUGIN_NAMESPACE, current_all_reduce_helper from .quantization import QuantMode @@ -3543,6 +3543,24 @@ def avg_pool2d(input: Tensor, return output +def _get_trt_weight(weight: Tensor) -> Tuple[trt.Weights, bool]: + is_weight_constant = (weight.producer is not None + and weight.producer.type == trt.LayerType.CONSTANT) + if is_weight_constant: + ndarray = get_np_weight(default_trtnet(), weight.producer.name) + if ndarray is not None: + trt_weight = trt.Weights(np_dtype_to_trt(ndarray.dtype), + ndarray.ctypes.data, + int(np.prod(ndarray.shape))) + else: + weight.producer.__class__ = trt.IConstantLayer + trt_weight = weight.producer.weights + else: + trt_weight = trt.Weights() + + return trt_weight, is_weight_constant + + def conv1d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, @@ -3553,30 +3571,32 @@ def conv1d(input: Tensor, noutput = weight.size()[0] kernel_size = weight.size()[-2] - is_weight_constant = (weight.producer is not None - and weight.producer.type == trt.LayerType.CONSTANT) - weight = weight.producer.weights if is_weight_constant else trt.Weights() + kernel_shape = trt.Dims([kernel_size, 1]) + + trt_weight, is_weight_constant = _get_trt_weight(weight) + weight_tensor = weight if bias is not None: - is_bias_constant = (bias.producer is not None - and bias.producer.type 
== trt.LayerType.CONSTANT) - bias = bias.producer.weights if is_bias_constant else trt.Weights() + bias_tensor = bias + trt_bias, is_bias_constant = _get_trt_weight(bias) + else: + bias_tensor = None + trt_bias = None input_shuffled = stack([input], dim=input.ndim()) - kernel_size = trt.Dims([kernel_size, 1]) layer = default_trtnet().add_convolution_nd(input_shuffled.trt_tensor, - noutput, kernel_size, weight, - bias) + noutput, kernel_shape, + trt_weight, trt_bias) layer.stride_nd = (stride, 2) layer.padding_nd = (padding, 0) layer.dilation_nd = (dilation, 2) layer.num_groups = groups if not is_weight_constant: - layer.set_input(1, weight.trt_tensor) - if bias is not None and not is_bias_constant: - layer.set_input(2, bias.trt_tensor) + layer.set_input(1, weight_tensor.trt_tensor) + if bias_tensor is not None and not is_bias_constant: + layer.set_input(2, bias_tensor.trt_tensor) output_2d = _create_tensor(layer.get_output(0), layer) output_1d = squeeze(output_2d, dim=-1) @@ -3602,18 +3622,21 @@ def conv2d(input: Tensor, noutput = weight.size()[0] kernel_size = (weight.size()[-2], weight.size()[-1]) + kernel_shape = trt.Dims(list(kernel_size)) - is_weight_constant = (weight.producer is not None - and weight.producer.type == trt.LayerType.CONSTANT) - weight = weight.producer.weights if is_weight_constant else trt.Weights() + trt_weight, is_weight_constant = _get_trt_weight(weight) + weight_tensor = weight if bias is not None: - is_bias_constant = (bias.producer is not None - and bias.producer.type == trt.LayerType.CONSTANT) - bias = bias.producer.weights if is_bias_constant else trt.Weights() + bias_tensor = bias + trt_bias, is_bias_constant = _get_trt_weight(bias) + else: + bias_tensor = None + trt_bias = None layer = default_trtnet().add_convolution_nd(input.trt_tensor, noutput, - kernel_size, weight, bias) + kernel_shape, trt_weight, + trt_bias) layer.stride_nd = stride layer.padding_nd = padding layer.dilation_nd = dilation @@ -3625,9 +3648,9 @@ def 
conv2d(input: Tensor, layer.post_padding = post_padding if not is_weight_constant: - layer.set_input(1, weight.trt_tensor) - if bias is not None and not is_bias_constant: - layer.set_input(2, bias.trt_tensor) + layer.set_input(1, weight_tensor.trt_tensor) + if bias_tensor is not None and not is_bias_constant: + layer.set_input(2, bias_tensor.trt_tensor) output = _create_tensor(layer.get_output(0), layer) @@ -3666,18 +3689,21 @@ def conv3d(input: Tensor, noutput = weight.size()[0] kernel_size = (weight.size()[-3], weight.size()[-2], weight.size()[-1]) + kernel_shape = trt.Dims(list(kernel_size)) - is_weight_constant = (weight.producer is not None - and weight.producer.type == trt.LayerType.CONSTANT) - weight = weight.producer.weights if is_weight_constant else trt.Weights() + trt_weight, is_weight_constant = _get_trt_weight(weight) + weight_tensor = weight if bias is not None: - is_bias_constant = (bias.producer is not None - and bias.producer.type == trt.LayerType.CONSTANT) - bias = bias.producer.weights if is_bias_constant else trt.Weights() + bias_tensor = bias + trt_bias, is_bias_constant = _get_trt_weight(bias) + else: + bias_tensor = None + trt_bias = None layer = default_trtnet().add_convolution_nd(input.trt_tensor, noutput, - kernel_size, weight, bias) + kernel_shape, trt_weight, + trt_bias) layer.stride_nd = stride layer.padding_nd = padding layer.dilation_nd = dilation @@ -3685,9 +3711,9 @@ def conv3d(input: Tensor, layer.dilation_nd = dilation if not is_weight_constant: - layer.set_input(1, weight.trt_tensor) - if bias is not None and not is_bias_constant: - layer.set_input(2, bias.trt_tensor) + layer.set_input(1, weight_tensor.trt_tensor) + if bias_tensor is not None and not is_bias_constant: + layer.set_input(2, bias_tensor.trt_tensor) output = _create_tensor(layer.get_output(0), layer) return output @@ -3713,26 +3739,29 @@ def conv_transpose2d(input: Tensor, noutput = weight.size()[1] kernel_size = (weight.size()[-2], weight.size()[-1]) + kernel_shape 
= trt.Dims(list(kernel_size)) - is_weight_constant = (weight.producer is not None - and weight.producer.type == trt.LayerType.CONSTANT) - weight = weight.producer.weights if is_weight_constant else trt.Weights() + trt_weight, is_weight_constant = _get_trt_weight(weight) + weight_tensor = weight if bias is not None: - is_bias_constant = (bias.producer is not None - and bias.producer.type == trt.LayerType.CONSTANT) - bias = bias.producer.weights if is_bias_constant else trt.Weights() + bias_tensor = bias + trt_bias, is_bias_constant = _get_trt_weight(bias) + else: + bias_tensor = None + trt_bias = None layer = default_trtnet().add_deconvolution_nd(input.trt_tensor, noutput, - kernel_size, weight, bias) + kernel_shape, trt_weight, + trt_bias) layer.stride_nd = stride layer.padding_nd = padding layer.num_groups = groups if not is_weight_constant: - layer.set_input(1, weight.trt_tensor) - if bias is not None and not is_bias_constant: - layer.set_input(2, bias.trt_tensor) + layer.set_input(1, weight_tensor.trt_tensor) + if bias_tensor is not None and not is_bias_constant: + layer.set_input(2, bias_tensor.trt_tensor) output = _create_tensor(layer.get_output(0), layer) diff --git a/tensorrt_llm/parameter.py b/tensorrt_llm/parameter.py index 7859eff9365..d740cbb0bdb 100644 --- a/tensorrt_llm/parameter.py +++ b/tensorrt_llm/parameter.py @@ -243,16 +243,23 @@ def set_value_or_dummy(self, v: Union[np.ndarray, torch.Tensor]): self.value = v - def set_name(self, name: str, network): + def set_name(self, name: str, network: Network): self._name = name if self.is_managed(network): self._get_weights(network).name = name return True else: - return network.trt_network.set_weights_name( - self._get_weights(network), name) - - def _get_weights(self, network) -> trt.Weights | Tensor | None: + weights = self._get_weights(network) + # TensorRT bindings may return numpy array instead of trt.Weights + if isinstance(weights, np.ndarray): + trt_dtype = np_dtype_to_trt( + weights.dtype + ) if 
weights.dtype != np.object_ else self._dtype + trt_count = int(np.prod(weights.shape)) + weights = trt.Weights(trt_dtype, weights.ctypes.data, trt_count) + return network.trt_network.set_weights_name(weights, name) + + def _get_weights(self, network: Network) -> trt.Weights | Tensor | None: tensor = network.get_parameter_tensor(self) if self.is_managed(network): return tensor diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 78ce434542f..c9a84e658de 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -386,3 +386,5 @@ perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_v32_fp4_grace_blackwell- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_v32_fp4_blackwell-v32_fp4_tep8_mtp3_8k1k] SKIP (https://nvbugspro.nvidia.com/bug/5919026) perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_gpt-oss-120b-fp4_1k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX] SKIP (https://nvbugspro.nvidia.com/bug/5919026) perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_gpt-oss-120b-fp4_1k1k_con2048_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX] SKIP (https://nvbugspro.nvidia.com/bug/5919026) +unittest/_torch/flashinfer/test_trtllm_flashinfer_symbol_collision.py::test_flashinfer_fused_moe_matches_torch_moe SKIP (https://nvbugs/5920779) +unittest/disaggregated/test_agent_multi_backends.py::test_run_with_different_env[1] SKIP (https://nvbugs/5979673) diff --git a/tests/unittest/_torch/modeling/test_modeling_siglip.py b/tests/unittest/_torch/modeling/test_modeling_siglip.py index 40a7dd1399b..e9b6788f538 100644 --- a/tests/unittest/_torch/modeling/test_modeling_siglip.py +++ b/tests/unittest/_torch/modeling/test_modeling_siglip.py @@ -135,10 +135,19 @@ def test_siglip_vision_allclose_to_hf(self, scenario: Scenario): attn_metadata=attn_metadata, ) - # Compare all hidden states + # Compare all hidden states. 
+ # TRT-LLM applies post_layernorm to the last encoder hidden state + # (matching production usage), so the last element must be compared + # against HF's post_layernormed last_hidden_state rather than the + # raw hidden_states[-1]. + num_states = len(tllm_outputs) + for i in range(num_states): + tllm_hs = tllm_outputs[i] + if i < num_states - 1: + hf_hs = hf_outputs.hidden_states[i] + else: + hf_hs = hf_outputs.last_hidden_state - for i, (hf_hs, tllm_hs) in enumerate( - zip(hf_outputs.hidden_states, tllm_outputs)): self.assertEqual(hf_hs.shape, tllm_hs.shape, f"Shape mismatch for hidden state {i}") diff --git a/tests/unittest/disaggregated/test_agent_multi_backends.py b/tests/unittest/disaggregated/test_agent_multi_backends.py index 0a95bad03bc..5bf1da73d34 100644 --- a/tests/unittest/disaggregated/test_agent_multi_backends.py +++ b/tests/unittest/disaggregated/test_agent_multi_backends.py @@ -4,6 +4,43 @@ import pytest +def test_load_agent_missing_module(): + """_load_agent returns (None, ImportError) for a non-existent module. + + Regression test: previously a missing nixl package caused an AssertionError + at module import time, making pytest exit with code 2 (collection failure) + instead of a clear ImportError with a descriptive message. 
+ """ + from tensorrt_llm._torch.disaggregation.nixl.agent import _load_agent + + agent, err = _load_agent("_trtllm_nonexistent_module_xyz_", ["SomeClass"]) + assert agent is None + assert isinstance(err, ImportError), f"Expected ImportError, got {type(err)}: {err}" + assert "No module named" in str(err) or "_trtllm_nonexistent_module_xyz_" in str(err) + + +def test_load_agent_missing_attributes(): + """_load_agent returns (None, ImportError) and logs a warning when attributes are missing.""" + from tensorrt_llm._torch.disaggregation.nixl.agent import _load_agent + + # 'os' exists but has no NixlTransferAgent attribute + agent, err = _load_agent("os", ["NixlTransferAgent"]) + assert agent is None + assert isinstance(err, ImportError), f"Expected ImportError, got {type(err)}: {err}" + assert "NixlTransferAgent" in str(err) + + +def test_load_agent_success(): + """_load_agent returns (module, None) on success.""" + from tensorrt_llm._torch.disaggregation.nixl.agent import _load_agent + + agent, err = _load_agent("os", ["path", "getcwd"]) + assert agent is not None + assert err is None + assert hasattr(agent, "path") + assert hasattr(agent, "getcwd") + + @pytest.mark.parametrize("use_py_nixl", ["0", "1"]) def test_run_with_different_env(use_py_nixl): os.environ["TRTLLM_USE_PY_NIXL_KVCACHE"] = use_py_nixl diff --git a/triton_backend/requirements.txt b/triton_backend/requirements.txt index 7daa868ed48..4375447772c 100644 --- a/triton_backend/requirements.txt +++ b/triton_backend/requirements.txt @@ -1,7 +1,7 @@ regex fire tritonclient[all] -transformers==4.57.1 +transformers==4.57.3 pandas tabulate flash_attn