Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ATTRIBUTIONS-Python.md
Original file line number Diff line number Diff line change
Expand Up @@ -5261,7 +5261,7 @@ For more information, please refer to <http://unlicense.org>
- `Tracker`: https://github.com/tox-dev/py-filelock/issues


## flashinfer-python (0.6.6)
## flashinfer-python (0.6.7)

### Licenses
License: `Apache-2.0`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,30 @@ using tensorrt_llm::common::launchWithPdlWhenEnabled;
__VA_ARGS__; \
break; \
} \
case 18: \
{ \
constexpr int TOP_K = 18; \
__VA_ARGS__; \
break; \
} \
case 16: \
{ \
constexpr int TOP_K = 16; \
__VA_ARGS__; \
break; \
} \
case 14: \
{ \
constexpr int TOP_K = 14; \
__VA_ARGS__; \
break; \
} \
case 12: \
{ \
constexpr int TOP_K = 12; \
__VA_ARGS__; \
break; \
} \
case 10: \
{ \
constexpr int TOP_K = 10; \
Expand Down
5 changes: 3 additions & 2 deletions cpp/tensorrt_llm/kernels/noAuxTcKernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -287,8 +287,9 @@ void invokeNoAuxTc(InputT* scores, BiasT* bias, OutputT* topk_values, IdxT* topk
int num_threads = NumDeepseekExperts;
if (is_single_group)
{
// Special case for Nemotron, which selects top 22 from 512 experts, and 1 group only.
if (num_experts == NumNemotronExperts && n_group == 1 && topk == MaxSupportedTopExperts)
// Nemotron models: 512 experts, 1 group, top_k up to 22.
// Variants use varying top_k (4..22) across layers.
if (num_experts == NumNemotronExperts && n_group == 1 && topk <= MaxSupportedTopExperts)
{
kernel_instance = &deepseek_v3_topk_kernel<InputT, BiasT, OutputT, IdxT, NumNemotronExperts, false,
MaxSupportedTopExperts>;
Expand Down
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ ordered-set
peft
patchelf
einops
flashinfer-python==0.6.6
flashinfer-python @ https://github.com/flashinfer-ai/flashinfer/releases/download/nightly-v0.6.7-20260406/flashinfer_python-0.6.7.dev20260406-py3-none-any.whl
opencv-python-headless
xgrammar==0.1.32
llguidance==0.7.29
Expand All @@ -71,7 +71,7 @@ xdsl>=0.59.0 # Optional: required for MLIR-based elementwise fusion in AutoDeplo
tiktoken
blobfile
openai-harmony==0.0.4
nvidia-cutlass-dsl==4.3.4; python_version >= "3.10"
nvidia-cutlass-dsl>=4.4.2; python_version >= "3.10"
plotly
numexpr
partial_json_parser
Expand Down
2 changes: 1 addition & 1 deletion security_scanning/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ dependencies = [
"peft (>=0.18.1,<0.19.0)",
"patchelf (>=0.17.2.4,<0.18.0.0)",
"einops (>=0.8.2,<0.9.0)",
"flashinfer-python (==0.6.6)",
"flashinfer-python (==0.6.7)",
"opencv-python-headless (>=4.13.0.92,<5.0.0.0)",
"xgrammar (==0.1.32)",
"llguidance (==0.7.29)",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from tensorrt_llm._torch.utils import split


@register_mapper("HF", "NemotronHPuzzleForCausalLM")
@register_mapper("HF", "NemotronHForCausalLM")
class NemotronHHfWeightMapper(HfWeightMapper):

Expand Down
77 changes: 66 additions & 11 deletions tensorrt_llm/_torch/models/modeling_nemotron_h.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,27 @@ class NemotronHConfig(PretrainedConfig):
model_type = "nemotron_h"


class NemotronHPuzzleConfig(PretrainedConfig):
    """HF config for NemotronH "Puzzle" checkpoints.

    Registered with ``AutoConfig`` at module bottom; presumably these
    checkpoints carry per-layer ``block_configs`` with heterogeneous MoE
    parameters — TODO confirm against an actual checkpoint config.
    """
    # HF model_type key used by AutoConfig dispatch.
    model_type = "nemotron_h_puzzle"


def _bc_getattr(bc, key, default=None):
"""Get attribute from a block_config entry (dict or dataclass)."""
if isinstance(bc, dict):
return bc.get(key, default)
return getattr(bc, key, default)


def _get_layer_moe_param(config, layer_idx: int, param_name: str):
    """Resolve an MoE parameter for one layer.

    Prefers the per-layer value from ``config.block_configs[layer_idx]``
    when present and non-None; otherwise falls back to the global
    attribute on *config* (or None when neither exists).
    """
    per_layer = getattr(config, 'block_configs', None) or []
    if layer_idx < len(per_layer):
        value = _bc_getattr(per_layer[layer_idx], param_name)
        if value is not None:
            return value
    return getattr(config, param_name, None)


class MLPLayer(MLP):

def __init__(
Expand Down Expand Up @@ -152,22 +173,26 @@ def __init__(
self.hidden_dim = config.hidden_size
self.ffn_dim = config.intermediate_size
self.layer_idx = layer_idx
self.moe_intermediate_size = (config.moe_intermediate_size[0]
if isinstance(
config.moe_intermediate_size, list)
else config.moe_intermediate_size)
self.use_latent_moe: bool = getattr(config, "moe_latent_size",
None) is not None
self.moe_hidden_size: int = (config.moe_latent_size
if self.use_latent_moe else

# Per-layer MoE params (models with block_configs have varying params).
def _moe(name):
return _get_layer_moe_param(config, layer_idx, name)

moe_intermediate = _moe('moe_intermediate_size')
self.moe_intermediate_size = (moe_intermediate[0] if isinstance(
moe_intermediate, list) else moe_intermediate)

moe_latent = _moe('moe_latent_size')
self.use_latent_moe: bool = moe_latent is not None
self.moe_hidden_size: int = (moe_latent if self.use_latent_moe else
config.hidden_size)
self.mlp_bias = config.mlp_bias if hasattr(config,
"mlp_bias") else False
self.moe_n_group = config.n_group
self.num_experts = config.n_routed_experts
self.num_experts = _moe('n_routed_experts')
self.hidden_size = config.hidden_size
self.num_shared_experts = config.n_shared_experts
self.top_k = config.num_experts_per_tok
self.top_k = _moe('num_experts_per_tok')
self.enable_attention_dp = model_config.mapping.enable_attention_dp
self.routed_scaling_factor = config.routed_scaling_factor
self.mapping = model_config.mapping
Expand All @@ -177,7 +202,7 @@ def __init__(
self.shared_experts = None
else:
shared_expert_intermediate_size = (
config.moe_shared_expert_intermediate_size *
_moe('moe_shared_expert_intermediate_size') *
config.n_shared_experts)

self.shared_experts = MLP(
Expand Down Expand Up @@ -703,6 +728,7 @@ def forward(
return hidden_states


@register_auto_model("NemotronHPuzzleForCausalLM")
@register_auto_model("NemotronHForCausalLM")
class NemotronHForCausalLM(SpecDecOneEngineForCausalLM[NemotronHModel,
NemotronHConfig]):
Expand All @@ -720,6 +746,9 @@ def __init__(
raise ValueError("layer_norm_epsilon or rms_norm_eps is not set")
model_config.pretrained_config.rms_norm_eps = rms_epsilon

# Normalize per-layer block_configs into global config attributes.
self._normalize_puzzle_config(model_config.pretrained_config)

if (not model_config.mapping.enable_attention_dp
and model_config.mapping.tp_size not in [1, 2, 4, 8]):
raise ValueError("TP has to be either 1, 2, 4 or 8")
Expand Down Expand Up @@ -776,6 +805,31 @@ def __init__(
self.epilogue.extend(self.draft_model.mtp_layers)
self.epilogue.append(self.spec_worker)

@staticmethod
def _normalize_puzzle_config(config):
    """Backfill global MoE attributes on *config* from per-layer block_configs.

    No-op for models without ``block_configs`` or without any MoE block.
    Only attributes that are missing or None on *config* are set, so an
    explicit global value always wins.
    """
    per_layer = getattr(config, 'block_configs', None)
    if not per_layer:
        return

    def _is_moe_block(entry):
        return _bc_getattr(entry, 'block_type') == 'moe'

    main_moe = next(filter(_is_moe_block, per_layer), None)
    if main_moe is None:
        return

    # Defaults are sourced from an MTP MoE block when one exists (MTP
    # layers sit beyond the block_configs range), otherwise from the
    # first MoE block of the main model.
    mtp_blocks = getattr(config, 'mtp_block_configs', None) or []
    source = next(filter(_is_moe_block, mtp_blocks), main_moe)

    for attr in ('n_routed_experts', 'moe_intermediate_size',
                 'num_experts_per_tok', 'moe_latent_size',
                 'moe_shared_expert_intermediate_size'):
        if getattr(config, attr, None) is None:
            setattr(config, attr, _bc_getattr(source, attr))

def load_weights(self, weights: dict, weight_mapper: BaseWeightMapper):
    """Preprocess raw checkpoint weights via the mapper, then delegate to the base loader."""
    preprocessed = weight_mapper.preprocess_weights(weights)
    super().load_weights(weights=preprocessed, weight_mapper=weight_mapper)
Expand Down Expand Up @@ -1074,3 +1128,4 @@ def forward(


AutoConfig.register(NemotronHConfig.model_type, NemotronHConfig)
AutoConfig.register(NemotronHPuzzleConfig.model_type, NemotronHPuzzleConfig)
2 changes: 1 addition & 1 deletion tensorrt_llm/_torch/models/modeling_speculative.py
Original file line number Diff line number Diff line change
Expand Up @@ -804,7 +804,7 @@ def __init__(
case "exaone_moe":
from .modeling_exaone_moe import ExaoneMoeMTP
mtp_layer = ExaoneMoeMTP
case "nemotron_h":
case "nemotron_h" | "nemotron_h_puzzle":
from .modeling_nemotron_h import NemotronHMTP
mtp_layer = NemotronHMTP
case "qwen3_next":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,32 @@ def __init__(
# Initialize or reuse workspace
MnnvlMemory.initialize()

if self._WORKSPACE is None:
need_alloc = self._WORKSPACE is None
if not need_alloc:
assert self._WORKSPACE["max_num_tokens_per_rank"] == self.max_num_tokens_per_rank, (
"reuse workspace with different max_num_tokens_per_rank"
)
assert self._WORKSPACE["ep_rank"] == self.ep_rank, (
"reuse workspace with different ep_rank"
)
assert self._WORKSPACE["ep_size"] == self.ep_size, (
"reuse workspace with different ep_size"
)
assert self._WORKSPACE["eplb_stats_num_experts"] == self.eplb_stats_num_experts, (
"reuse workspace with different eplb_stats_num_experts"
)

# Models with per-layer MoE params may request different workspace sizes across layers.
# Reallocate when a larger workspace is needed; reuse otherwise.
if self._WORKSPACE["workspace_size_per_rank"] < self.workspace_size_per_rank:
tllm_logger.info(
f"NVLinkOneSided: Reallocating workspace "
f"{self._WORKSPACE['workspace_size_per_rank']} -> "
f"{self.workspace_size_per_rank} bytes."
)
need_alloc = True

if need_alloc:
tllm_logger.info(
f"NVLinkOneSided: Allocating workspace with size {self.workspace_size_per_rank} bytes."
f"ep_rank: {self.ep_rank}, ep_size: {self.ep_size}, top_k: {self.top_k}, max_num_tokens_per_rank: {self.max_num_tokens_per_rank}"
Expand All @@ -248,26 +273,8 @@ def __init__(
"workspace": workspace,
"metainfo": metainfo,
}
else:
assert self._WORKSPACE["workspace_size_per_rank"] == self.workspace_size_per_rank, (
"reuse workspace with different workspace_size_per_rank"
)
assert self._WORKSPACE["max_num_tokens_per_rank"] == self.max_num_tokens_per_rank, (
"reuse workspace with different max_num_tokens_per_rank"
)
assert self._WORKSPACE["ep_rank"] == self.ep_rank, (
"reuse workspace with different ep_rank"
)
assert self._WORKSPACE["ep_size"] == self.ep_size, (
"reuse workspace with different ep_size"
)
assert self._WORKSPACE["eplb_stats_num_experts"] == self.eplb_stats_num_experts, (
"reuse workspace with different eplb_stats_num_experts"
)

self.mnnvl_mem = self._WORKSPACE["mnnvl_mem"]
self.workspace = self._WORKSPACE["workspace"]
self.moe_a2a_metainfo = self._WORKSPACE["metainfo"]
# Read max_num_tokens_per_rank from the (possibly grown) workspace.
self.max_num_tokens_per_rank = self._WORKSPACE["max_num_tokens_per_rank"]

# Initialize dispatch state
Expand All @@ -276,6 +283,21 @@ def __init__(
# Invalid token expert ID (default to -1), the kernels in TRTLLM-gen is hard-code to support -1 only.
self.invalid_token_expert_id: int = -1

# Properties delegate to _WORKSPACE so all instances see the latest
# allocation (workspace may be reallocated when layers need more space).

@property
def mnnvl_mem(self):
    """MNNVL memory object of the shared workspace record (latest allocation)."""
    record = self._WORKSPACE
    return record["mnnvl_mem"]

@property
def workspace(self):
    """Workspace tensor/buffer of the shared workspace record (latest allocation)."""
    record = self._WORKSPACE
    return record["workspace"]

@property
def moe_a2a_metainfo(self):
    """Metainfo entry of the shared workspace record (latest allocation)."""
    record = self._WORKSPACE
    return record["metainfo"]

@staticmethod
def is_platform_supported() -> bool:
"""
Expand Down
5 changes: 2 additions & 3 deletions tensorrt_llm/_torch/modules/fused_moe/routing.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,9 +264,8 @@ def noaux_tc(self, logits, e_score_correction_bias):
"The configuration is not supported by the fused routing kernel. We have to use the original pytorch implementation."
)
self.is_fused = False
elif (num_experts > 512 or (self.top_k > 8 and self.top_k != 22)
or (self.topk_group == 1 and self.top_k != 22)):
# We have special implementation for n_group == 1, top_k == 22 and num_experts == 512 for Nemotron Super v3.
elif num_experts > 512 or self.top_k > 22 or (self.top_k > 8
and num_experts != 512):
if self.is_fused:
warnings.warn(
"The configuration is not supported by the fused routing kernel. We have to use the original pytorch implementation."
Expand Down
9 changes: 3 additions & 6 deletions tensorrt_llm/_torch/modules/mamba/mamba2_mixer.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,13 +150,10 @@ def __init__(

# Choose between flashinfer and native implementation. (default to flashinfer)
self._mamba_ssm_cache_dtype = config.quant_config.mamba_ssm_cache_dtype
# TODO: Update head_dims and head_group_ratios once flashinfer is updated.
# TODO: Update head_dims once flashinfer is updated.
# Nemotron-v2-Nano (mamba_head_dim=80) is not supported by flashinfer yet.
supported_head_dims = [64, 128]
supported_head_group_ratios = [1, 8, 16]
head_group_ratio = (self.tp_nheads //
self.tp_ngroups if self.tp_ngroups > 0 else 0)
self._use_flashinfer = (head_dim in supported_head_dims and
head_group_ratio in supported_head_group_ratios)
self._use_flashinfer = head_dim in supported_head_dims
# Stochastic rounding requires FlashInfer and fp16 cache
self._use_stochastic_rounding = (
config.quant_config.mamba_ssm_stochastic_rounding
Expand Down
Loading
Loading