Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/pr_ete_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ jobs:
exit 1
- name: Test restful server - turbomind InternVL3-38B
run: |
CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/OpenGVLab/InternVL3-38B --tp 2 --backend turbomind --logprobs-mode raw_logprobs --allow-terminate-by-client > ${{env.SERVER_LOG}}/turbomind_InternVL3-38B_start_restful.log 2>&1 &
CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/OpenGVLab/InternVL3-38B --tp 2 --backend turbomind --logprobs-mode raw_logprobs --allow-terminate-by-client --trust-remote-code > ${{env.SERVER_LOG}}/turbomind_InternVL3-38B_start_restful.log 2>&1 &
echo "restful_pid=$!"
for i in $(seq 1 180)
do
Expand Down Expand Up @@ -169,7 +169,7 @@ jobs:
exit 1
- name: Test restful server - pytorch InternVL3_5-30B-A3B
run: |
CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/OpenGVLab/InternVL3_5-30B-A3B --tp 2 --backend pytorch --logprobs-mode raw_logprobs --allow-terminate-by-client > ${{env.SERVER_LOG}}/pytorch_InternVL3_5-30B-A3B_start_restful.log 2>&1 &
CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/OpenGVLab/InternVL3_5-30B-A3B --tp 2 --backend pytorch --logprobs-mode raw_logprobs --allow-terminate-by-client --trust-remote-code > ${{env.SERVER_LOG}}/pytorch_InternVL3_5-30B-A3B_start_restful.log 2>&1 &
echo "restful_pid=$!"
for i in $(seq 1 180)
do
Expand Down
3 changes: 2 additions & 1 deletion autotest/tools/pipeline/llm_case.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@ def run_pipeline_chat_test(model_path, run_config, cases_path, is_pr_test: bool

print('backend_config config: ' + str(backend_config))
print('speculative_config config: ' + str(speculative_config))
pipe = pipeline(model_path, backend_config=backend_config, speculative_config=speculative_config)
pipe = pipeline(model_path, backend_config=backend_config, speculative_config=speculative_config,
trust_remote_code=True)

cases_path = os.path.join(cases_path)
with open(cases_path) as f:
Expand Down
2 changes: 1 addition & 1 deletion autotest/tools/pipeline/mllm_case.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def run_pipeline_mllm_test(model_path, run_config, resource_path, is_pr_test: bo
print(f"Warning: Cannot set attribute '{attr_name}' on backend_config. Skipping.")

print('backend_config config: ' + str(backend_config))
pipe = pipeline(model_path, backend_config=backend_config)
pipe = pipeline(model_path, backend_config=backend_config, trust_remote_code=True)

image = load_image(f'{resource_path}/{PIC1}')

Expand Down
1 change: 1 addition & 0 deletions autotest/utils/config_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ def get_cli_common_param(run_config: dict[str, Any]) -> str:

# Extra params
cli_params.append(get_cli_str(extra_params))
cli_params.append('--trust-remote-code')

return ' '.join(cli_params).strip()

Expand Down
2 changes: 2 additions & 0 deletions autotest/utils/quantization_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ def quantization(config,
else:
quantization_cmd += ' --batch-size 32'

quantization_cmd += ' --trust-remote-code'

with open(quantization_log, 'w') as f:
# remove existing folder
subprocess.run([' '.join(['rm -rf', quantization_model_path])],
Expand Down
12 changes: 8 additions & 4 deletions benchmark/profile_pipeline_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,12 +131,14 @@ def sample_random_requests(

class Engine:

def __init__(self, model_path: str, engine_config, csv: str, speculative_config: SpeculativeConfig | None = None):
def __init__(self, model_path: str, engine_config, csv: str, speculative_config: SpeculativeConfig | None = None,
trust_remote_code: bool = False):
self.pipe = pipeline(model_path,
backend_config=engine_config,
log_level='ERROR',
speculative_config=speculative_config)
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
speculative_config=speculative_config,
trust_remote_code=trust_remote_code)
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=trust_remote_code)
self.return_routed_experts = getattr(self.pipe.backend_config, 'enable_return_routed_experts', False)
self.csv = csv

Expand Down Expand Up @@ -254,6 +256,7 @@ def parse_args():
ArgumentHelper.top_k(parser)
ArgumentHelper.log_level(parser)
ArgumentHelper.backend(parser)
ArgumentHelper.trust_remote_code(parser)

# pytorch engine args
pt_group = parser.add_argument_group('PyTorch engine arguments')
Expand Down Expand Up @@ -319,7 +322,8 @@ def main():
)

speculative_config = get_speculative_config(args)
engine = Engine(args.model_path, engine_config, csv=args.csv, speculative_config=speculative_config)
engine = Engine(args.model_path, engine_config, csv=args.csv, speculative_config=speculative_config,
trust_remote_code=args.trust_remote_code)

profiler = Profiler(args.stream_output, [50, 75, 95, 99])

Expand Down
26 changes: 17 additions & 9 deletions benchmark/profile_restful_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,18 +441,20 @@ def get_model(pretrained_model_name_or_path: str) -> str:
return pretrained_model_name_or_path


def get_tokenizer(pretrained_model_name_or_path: str,
                  trust_remote_code: bool = False) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
    """Load the tokenizer for a model directory, hub id, or raw tokenizer file.

    Args:
        pretrained_model_name_or_path: a local model dir, a hub model id, or a
            raw ``.json`` / ``.model`` tokenizer file (delegated to sglang).
        trust_remote_code: forwarded to ``AutoTokenizer.from_pretrained``;
            defaults to False so remote code only runs when explicitly allowed.

    Returns:
        A ``PreTrainedTokenizer`` or ``PreTrainedTokenizerFast`` instance.
    """
    if pretrained_model_name_or_path.endswith('.json') or pretrained_model_name_or_path.endswith('.model'):
        # Raw tokenizer files go through sglang's generic loader.
        # NOTE(review): this branch does not forward `trust_remote_code`; it
        # loads a plain tokenizer file, so no remote code should be involved —
        # confirm against sglang's loader signature.
        from sglang.srt.hf_transformers_utils import get_tokenizer

        return get_tokenizer(pretrained_model_name_or_path)

    if pretrained_model_name_or_path is not None and not os.path.exists(pretrained_model_name_or_path):
        # Not a local path: resolve (download) the model first.
        pretrained_model_name_or_path = get_model(pretrained_model_name_or_path)
    return AutoTokenizer.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust_remote_code)


def get_processor(pretrained_model_name_or_path: str, ) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
def get_processor(pretrained_model_name_or_path: str,
trust_remote_code: bool = False) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
assert (pretrained_model_name_or_path is not None and pretrained_model_name_or_path != '')
if pretrained_model_name_or_path.endswith('.json') or pretrained_model_name_or_path.endswith('.model'):
from sglang.srt.utils.hf_transformers_utils import get_processor
Expand All @@ -461,7 +463,7 @@ def get_processor(pretrained_model_name_or_path: str, ) -> PreTrainedTokenizer |

if pretrained_model_name_or_path is not None and not os.path.exists(pretrained_model_name_or_path):
pretrained_model_name_or_path = get_model(pretrained_model_name_or_path)
return AutoProcessor.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
return AutoProcessor.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust_remote_code)


ASYNC_REQUEST_FUNCS = {
Expand Down Expand Up @@ -1172,9 +1174,9 @@ def parse_request_rate_range(request_rate_range):
return list(map(int, request_rate_range.split(',')))


def check_chat_template(model_path, trust_remote_code: bool = False):
    """Return True if the model's tokenizer config defines a chat template.

    Args:
        model_path: local path or hub id of the model whose tokenizer is loaded.
        trust_remote_code: forwarded to ``AutoTokenizer.from_pretrained``;
            defaults to False so remote code only runs when explicitly allowed.
    """
    try:
        # BUG FIX: the previous revision read the module-level `args` namespace
        # (`args.trust_remote_code`) instead of this function's parameter, which
        # silently ignored the value passed by callers and would raise
        # NameError if the function were called before `args` is parsed.
        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=trust_remote_code)
        return 'chat_template' in tokenizer.init_kwargs
    except Exception as e:
        print(f'Fail to load tokenizer config with error={e}')
Expand Down Expand Up @@ -1256,15 +1258,15 @@ def run_benchmark(args_: argparse.Namespace):
'using `--model`.')
sys.exit(1)

if not check_chat_template(model_path):
if not check_chat_template(model_path, args.trust_remote_code):
print('\nWARNING It is recommended to use the `Chat` or `Instruct` '
'model for benchmarking.\n'
'Because when the tokenizer counts the output tokens, if '
'there is gibberish, it might count incorrectly.\n')

print(f'{args}\n')

tokenizer = get_tokenizer(tokenizer_id)
tokenizer = get_tokenizer(tokenizer_id, args.trust_remote_code)

if args.dataset_name == 'sharegpt':
assert args.random_input_len is None and args.random_output_len is None
Expand All @@ -1286,7 +1288,7 @@ def run_benchmark(args_: argparse.Namespace):
dataset_path=args.dataset_path,
)
elif args.dataset_name == 'image':
processor = get_processor(model_path)
processor = get_processor(model_path, args.trust_remote_code)
input_requests = sample_image_requests(
num_requests=args.num_prompts,
image_count=args.image_count,
Expand Down Expand Up @@ -1502,5 +1504,11 @@ def set_ulimit(target_soft_limit=65535):
default=None,
help='Disable a warmup request before the benchmark. ',
)
parser.add_argument(
'--trust-remote-code',
action='store_true',
default=False,
help='Trust remote code.',
)
args = parser.parse_args()
run_benchmark(args)
17 changes: 13 additions & 4 deletions benchmark/profile_throughput.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,17 +135,20 @@ class Engine:

def __init__(self, model_path: str,
engine_config: PytorchEngineConfig | TurbomindEngineConfig,
speculative_config: SpeculativeConfig):
speculative_config: SpeculativeConfig,
trust_remote_code: bool = False):
self.tokenizer = Tokenizer(model_path)
if isinstance(engine_config, TurbomindEngineConfig):
from lmdeploy.turbomind import TurboMind
tm_model = TurboMind.from_pretrained(model_path, engine_config=engine_config)
tm_model = TurboMind.from_pretrained(model_path, engine_config=engine_config,
trust_remote_code=trust_remote_code)
self.backend = 'turbomind'
elif isinstance(engine_config, PytorchEngineConfig):
from lmdeploy.pytorch.engine import Engine as PytorchEngine
tm_model = PytorchEngine.from_pretrained(model_path,
engine_config=engine_config,
speculative_config=speculative_config)
speculative_config=speculative_config,
trust_remote_code=trust_remote_code)
self.backend = 'pytorch'

self.tm_model = tm_model
Expand Down Expand Up @@ -295,6 +298,12 @@ def parse_args():
help='Range of sampled ratio of input/output length, '
'used only for random dataset.',
)
parser.add_argument(
'--trust-remote-code',
action='store_true',
default=False,
help='Trust remote code.',
)
# other args
ArgumentHelper.top_p(parser)
ArgumentHelper.temperature(parser)
Expand Down Expand Up @@ -382,7 +391,7 @@ def main():
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())

speculative_config = get_speculative_config(args)
engine = Engine(args.model_path, engine_config, speculative_config)
engine = Engine(args.model_path, engine_config, speculative_config, trust_remote_code=args.trust_remote_code)

if args.dataset_name == 'sharegpt':
assert args.random_input_len is None and args.random_output_len is None
Expand Down
3 changes: 3 additions & 0 deletions lmdeploy/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def pipeline(model_path: str,
chat_template_config: ChatTemplateConfig | None = None,
log_level: str = 'WARNING',
max_log_len: int | None = None,
trust_remote_code: bool = False,
speculative_config: SpeculativeConfig | None = None,
**kwargs):
"""Create a pipeline for inference.
Expand All @@ -41,6 +42,7 @@ def pipeline(model_path: str,
``WARNING``, ``INFO``, ``DEBUG``]
max_log_len: Max number of prompt characters or prompt tokens
being printed in log.
trust_remote_code: whether to trust remote code from model repositories.
speculative_config: speculative decoding configuration.
**kwargs: additional keyword arguments passed to the pipeline.

Expand Down Expand Up @@ -73,6 +75,7 @@ def pipeline(model_path: str,
chat_template_config=chat_template_config,
log_level=log_level,
max_log_len=max_log_len,
trust_remote_code=trust_remote_code,
speculative_config=speculative_config,
**kwargs)

Expand Down
19 changes: 10 additions & 9 deletions lmdeploy/archs.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
logger = get_logger('lmdeploy')


def autoget_backend(model_path: str) -> Literal['turbomind', 'pytorch']:
def autoget_backend(model_path: str, trust_remote_code: bool = False):
"""Get backend type in auto backend mode.

Args:
Expand All @@ -36,7 +36,7 @@ def autoget_backend(model_path: str) -> Literal['turbomind', 'pytorch']:
is_turbomind_installed = True
try:
from lmdeploy.turbomind.supported_models import is_supported as is_supported_turbomind
turbomind_has = is_supported_turbomind(model_path)
turbomind_has = is_supported_turbomind(model_path, trust_remote_code=trust_remote_code)
except ImportError:
is_turbomind_installed = False

Expand All @@ -57,7 +57,8 @@ def autoget_backend(model_path: str) -> Literal['turbomind', 'pytorch']:

def autoget_backend_config(
model_path: str,
backend_config: PytorchEngineConfig | TurbomindEngineConfig | None = None
backend_config: PytorchEngineConfig | TurbomindEngineConfig | None = None,
trust_remote_code: bool = False
) -> tuple[Literal['turbomind', 'pytorch'], PytorchEngineConfig | TurbomindEngineConfig]:
"""Get backend config automatically.

Expand All @@ -75,7 +76,7 @@ def autoget_backend_config(
if isinstance(backend_config, PytorchEngineConfig):
return 'pytorch', backend_config

backend = autoget_backend(model_path)
backend = autoget_backend(model_path, trust_remote_code=trust_remote_code)
config = PytorchEngineConfig() if backend == 'pytorch' else TurbomindEngineConfig()
if backend_config is not None:
if type(backend_config) is type(config):
Expand Down Expand Up @@ -128,14 +129,14 @@ def check_vl_llm(backend: str, config: dict) -> bool:
return False


def get_task(backend: str, model_path: str):
def get_task(backend: str, model_path: str, trust_remote_code: bool = False):
"""Get pipeline type and pipeline class from model config."""
from lmdeploy.serve.core import AsyncEngine

if os.path.exists(os.path.join(model_path, 'triton_models', 'weights')):
# workspace model
return 'llm', AsyncEngine
_, config = get_model_arch(model_path)
_, config = get_model_arch(model_path, trust_remote_code=trust_remote_code)
if check_vl_llm(backend, config.to_dict()):
from lmdeploy.serve.core import VLAsyncEngine
return 'vlm', VLAsyncEngine
Expand All @@ -144,17 +145,17 @@ def get_task(backend: str, model_path: str):
return 'llm', AsyncEngine


def get_model_arch(model_path: str):
def get_model_arch(model_path: str, trust_remote_code: bool = False):
"""Get a model's architecture and configuration.

Args:
model_path(str): the model path
"""
try:
cfg = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
cfg = AutoConfig.from_pretrained(model_path, trust_remote_code=trust_remote_code)
except Exception as e: # noqa
from transformers import PretrainedConfig
cfg = PretrainedConfig.from_pretrained(model_path, trust_remote_code=True)
cfg = PretrainedConfig.from_pretrained(model_path, trust_remote_code=trust_remote_code)

_cfg = cfg.to_dict()
if _cfg.get('architectures', None):
Expand Down
9 changes: 5 additions & 4 deletions lmdeploy/cli/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def input_prompt():
return '\n'.join(iter(input, sentinel))


def build_pipe(model_path, backend, **kwargs):
def build_pipe(model_path, backend, trust_remote_code=False, **kwargs):
engine_config = None
if kwargs.get('enable_prefix_caching', False):
print('interactive chat cannot be used when prefix caching is enabled')
Expand Down Expand Up @@ -48,6 +48,7 @@ def build_pipe(model_path, backend, **kwargs):
backend_config=engine_config,
chat_template_config=chat_template_config,
log_level='ERROR',
trust_remote_code=trust_remote_code,
**kwargs)
return pipe

Expand All @@ -68,12 +69,12 @@ def get_adapter_name(adapters=None, **kwargs):
return list(adapters.keys())[0]


def main(model_path, backend, **kwargs):
def main(model_path, backend, trust_remote_code=False, **kwargs):
if backend != 'pytorch':
# set auto backend mode
backend = autoget_backend(model_path)
backend = autoget_backend(model_path, trust_remote_code=trust_remote_code)
quit = False
with build_pipe(model_path, backend, **kwargs) as pipe:
with build_pipe(model_path, backend, trust_remote_code=trust_remote_code, **kwargs) as pipe:
gen_config = build_gen_config(**kwargs)
adapter_name = get_adapter_name(**kwargs)
while not quit:
Expand Down
1 change: 1 addition & 0 deletions lmdeploy/cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ def add_parser_chat():
# model args
ArgumentHelper.revision(parser)
ArgumentHelper.download_dir(parser)
ArgumentHelper.trust_remote_code(parser)

# pytorch engine args
pt_group = parser.add_argument_group('PyTorch engine arguments')
Expand Down
4 changes: 4 additions & 0 deletions lmdeploy/cli/lite.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def add_parser_auto_awq():
ArgumentHelper.calib_batchsize(parser)
ArgumentHelper.calib_search_scale(parser)
ArgumentHelper.dtype(parser)
ArgumentHelper.trust_remote_code(parser)
parser.add_argument('--device', type=str, default='cuda', help='Device for weight quantization (cuda or npu)')
parser.add_argument('--w-bits', type=int, default=4, help='Bit number for weight quantization')
parser.add_argument('--w-sym', action='store_true', help='Whether to do symmetric quantization')
Expand All @@ -56,6 +57,7 @@ def add_parser_auto_gptq():
ArgumentHelper.calib_seqlen(parser)
ArgumentHelper.calib_batchsize(parser)
ArgumentHelper.dtype(parser)
ArgumentHelper.trust_remote_code(parser)
parser.add_argument('--w-bits', type=int, default=4, help='Bit number for weight quantization')
parser.add_argument('--w-group-size',
type=int,
Expand All @@ -78,6 +80,7 @@ def add_parser_calibrate():
ArgumentHelper.calib_batchsize(parser)
ArgumentHelper.calib_search_scale(parser)
ArgumentHelper.dtype(parser)
ArgumentHelper.trust_remote_code(parser)

@staticmethod
def add_parser_smooth_quant():
Expand All @@ -102,6 +105,7 @@ def add_parser_smooth_quant():
ArgumentHelper.quant_dtype(parser)
ArgumentHelper.revision(parser)
ArgumentHelper.download_dir(parser)
ArgumentHelper.trust_remote_code(parser)

@staticmethod
def auto_awq(args):
Expand Down
Loading
Loading