NVIDIA · JacobHu-NV · Mar 9, 2026 · Mar 10, 2026 · Mar 10, 2026 · Mar 13, 2026
@@ -61,7 +61,7 @@ def add_llm_args(parser):
         default='AUTO',
         choices=[
             'AUTO', 'CUTLASS', 'TRTLLM', 'VANILLA', 'WIDEEP', 'DEEPGEMM',
-            'CUTEDSL', 'TRITON'
+            'CUTEDSL', 'TRITON', 'DENSEGEMM'
         ],
         help=
         'MoE backend to use. AUTO selects default backend based on model. It currently doesn\'t always give the best choice for all scenarios. The capabilities of auto selection will be improved in future releases.'
@@ -203,6 +203,19 @@ def add_llm_args(parser):
     parser.add_argument('--relaxed_topk', type=int, default=1)
     parser.add_argument('--relaxed_delta', type=float, default=0.)
 
+    # CuTe DSL
+    parser.add_argument(
+        '--use_cute_dsl_bf16_bmm',
+        default=False,
+        action='store_true',
+        help='Use CuTe DSL bf16 persistent GEMM for BMM on Blackwell.')
+    parser.add_argument(
+        '--use_cute_dsl_bf16_gemm',
+        default=False,
+        action='store_true',
+        help='Use CuTe DSL bf16 persistent GEMM for Linear layers on Blackwell.'
+    )
+
     # HF
     parser.add_argument('--trust_remote_code',
                         default=False,
@@ -331,6 +344,8 @@ def setup_llm(args, **kwargs):
         gather_generation_logits=args.return_generation_logits,
         max_beam_width=args.max_beam_width,
         orchestrator_type=args.orchestrator_type,
+        use_cute_dsl_bf16_bmm=args.use_cute_dsl_bf16_bmm,
+        use_cute_dsl_bf16_gemm=args.use_cute_dsl_bf16_gemm,
         **kwargs)
 
     use_beam_search = args.max_beam_width > 1