Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions lmdeploy/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def pipeline(model_path: str,
chat_template_config: ChatTemplateConfig | None = None,
log_level: str = 'WARNING',
max_log_len: int | None = None,
trust_remote_code: bool = False,
speculative_config: SpeculativeConfig | None = None,
**kwargs):
"""Create a pipeline for inference.
Expand All @@ -41,6 +42,7 @@ def pipeline(model_path: str,
``WARNING``, ``INFO``, ``DEBUG``]
max_log_len: Max number of prompt characters or prompt tokens
being printed in log.
trust_remote_code: whether to trust remote code from model repositories.
speculative_config: speculative decoding configuration.
**kwargs: additional keyword arguments passed to the pipeline.

Expand Down Expand Up @@ -73,6 +75,7 @@ def pipeline(model_path: str,
chat_template_config=chat_template_config,
log_level=log_level,
max_log_len=max_log_len,
trust_remote_code=trust_remote_code,
speculative_config=speculative_config,
**kwargs)

Expand Down
10 changes: 5 additions & 5 deletions lmdeploy/archs.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,14 +128,14 @@ def check_vl_llm(backend: str, config: dict) -> bool:
return False


def get_task(backend: str, model_path: str):
def get_task(backend: str, model_path: str, trust_remote_code: bool = False):
"""Get pipeline type and pipeline class from model config."""
from lmdeploy.serve.core import AsyncEngine

if os.path.exists(os.path.join(model_path, 'triton_models', 'weights')):
# workspace model
return 'llm', AsyncEngine
_, config = get_model_arch(model_path)
_, config = get_model_arch(model_path, trust_remote_code=trust_remote_code)
if check_vl_llm(backend, config.to_dict()):
from lmdeploy.serve.core import VLAsyncEngine
return 'vlm', VLAsyncEngine
Expand All @@ -144,17 +144,17 @@ def get_task(backend: str, model_path: str):
return 'llm', AsyncEngine


def get_model_arch(model_path: str):
def get_model_arch(model_path: str, trust_remote_code: bool = False):
"""Get a model's architecture and configuration.

Args:
model_path(str): the model path
"""
try:
cfg = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
cfg = AutoConfig.from_pretrained(model_path, trust_remote_code=trust_remote_code)
except Exception as e: # noqa
from transformers import PretrainedConfig
cfg = PretrainedConfig.from_pretrained(model_path, trust_remote_code=True)
cfg = PretrainedConfig.from_pretrained(model_path, trust_remote_code=trust_remote_code)

_cfg = cfg.to_dict()
if _cfg.get('architectures', None):
Expand Down
5 changes: 5 additions & 0 deletions lmdeploy/cli/serve.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@ def add_parser_api_server():
default=['*'],
help='A list of allowed http headers for cors')
parser.add_argument('--proxy-url', type=str, default=None, help='The proxy url for api server.')
parser.add_argument('--trust-remote-code',
action='store_true',
help='Whether to trust remote code from model repositories.')
parser.add_argument('--max-concurrent-requests',
type=int,
default=None,
Expand Down Expand Up @@ -303,6 +306,7 @@ def api_server(args):
max_log_len=args.max_log_len,
disable_fastapi_docs=args.disable_fastapi_docs,
max_concurrent_requests=args.max_concurrent_requests,
trust_remote_code=args.trust_remote_code,
reasoning_parser=args.reasoning_parser,
tool_call_parser=args.tool_call_parser,
speculative_config=speculative_config,
Expand Down Expand Up @@ -334,6 +338,7 @@ def api_server(args):
max_log_len=args.max_log_len,
disable_fastapi_docs=args.disable_fastapi_docs,
max_concurrent_requests=args.max_concurrent_requests,
trust_remote_code=args.trust_remote_code,
reasoning_parser=args.reasoning_parser,
tool_call_parser=args.tool_call_parser,
speculative_config=speculative_config,
Expand Down
5 changes: 4 additions & 1 deletion lmdeploy/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ def __init__(self,
chat_template_config: ChatTemplateConfig | None = None,
log_level: str = 'WARNING',
max_log_len: int | None = None,
trust_remote_code: bool = False,
speculative_config: SpeculativeConfig | None = None,
**kwargs):
"""Initialize Pipeline.
Expand All @@ -49,6 +50,7 @@ def __init__(self,
chat_template_config: Chat template configuration.
log_level: Log level.
max_log_len: Max number of prompt characters or prompt tokens being printed in log.
trust_remote_code: whether to trust remote code from model repositories.
speculative_config: Speculative decoding configuration.
**kwargs: Additional keyword arguments.
"""
Expand All @@ -69,12 +71,13 @@ def __init__(self,

# Create inference engine
backend, backend_config = autoget_backend_config(model_path, backend_config)
_, pipeline_class = get_task(backend, model_path)
_, pipeline_class = get_task(backend, model_path, trust_remote_code=trust_remote_code)
self.async_engine = pipeline_class(model_path,
backend=backend,
backend_config=backend_config,
chat_template_config=chat_template_config,
max_log_len=max_log_len,
trust_remote_code=trust_remote_code,
speculative_config=speculative_config,
**kwargs)
Comment on lines 36 to 83
Copy link

Copilot AI Apr 8, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`trust_remote_code` is now a user-facing switch that changes how configs, tokenizers, and models are loaded, and it defaults to `False`. Integration tests for `pipeline()` behavior already exist (e.g. `tests/test_lmdeploy/test_pipeline.py`), but the new parameter and its default are not exercised by any test in this change.

Please add or adjust tests to cover both cases: (1) the default `trust_remote_code=False` behavior, and (2) successful loading when `trust_remote_code=True` is set explicitly for a model that requires remote code. This helps prevent regressions and ensures the security control remains effective.

Copilot uses AI. Check for mistakes.
self.internal_thread = _EventLoopThread(daemon=True)
Expand Down
5 changes: 3 additions & 2 deletions lmdeploy/pytorch/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,7 @@ def get_head_size(self):
def from_pretrained(
cls,
pretrained_model_name_or_path: str,
trust_remote_code: bool = True,
trust_remote_code: bool = False,
dtype: str = 'auto',
dist_config: DistConfig = None,
hf_overrides: dict[str, Any] = None,
Expand Down Expand Up @@ -563,10 +563,11 @@ def from_config(
target_cache_cfg: CacheConfig,
target_model: str = None,
dtype: str = 'auto',
trust_remote_code: bool = False,
):
model = model or target_model
model_config = ModelConfig.from_pretrained(model,
trust_remote_code=True,
trust_remote_code=trust_remote_code,
dtype=dtype,
is_draft_model=True,
spec_method=method,
Expand Down
3 changes: 2 additions & 1 deletion lmdeploy/pytorch/engine/config_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def build_misc_config(engine_config: PytorchEngineConfig):

@staticmethod
def build_specdecode_config(target_model, speculative_config: SpeculativeConfig, engine_config: PytorchEngineConfig,
cache_config: CacheConfig):
cache_config: CacheConfig, trust_remote_code: bool = False):
"""Build spec decode config."""
specdecode_config = None
if speculative_config is not None:
Expand All @@ -113,5 +113,6 @@ def build_specdecode_config(target_model, speculative_config: SpeculativeConfig,
target_model=target_model,
target_cache_cfg=cache_config,
dtype=engine_config.dtype,
trust_remote_code=trust_remote_code,
)
return specdecode_config
7 changes: 4 additions & 3 deletions lmdeploy/pytorch/engine/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def __init__(
self,
model_path: str,
engine_config: PytorchEngineConfig = None,
trust_remote_code: bool = True,
trust_remote_code: bool = False,
speculative_config: SpeculativeConfig = None,
) -> None:
# make sure engine config exist
Expand Down Expand Up @@ -133,7 +133,7 @@ def __init__(
misc_config = ConfigBuilder.build_misc_config(engine_config)
# spec decode
self.specdecode_config = ConfigBuilder.build_specdecode_config(model_path, speculative_config, engine_config,
cache_config)
cache_config, trust_remote_code)

# build model agent
self.executor = build_executor(
Expand All @@ -147,6 +147,7 @@ def __init__(
distributed_executor_backend=engine_config.distributed_executor_backend,
dtype=engine_config.dtype,
specdecode_config=self.specdecode_config,
trust_remote_code=trust_remote_code,
)
self.executor.init()

Expand Down Expand Up @@ -198,7 +199,7 @@ def __init__(
def from_pretrained(cls,
pretrained_model_name_or_path: str,
engine_config: PytorchEngineConfig = None,
trust_remote_code: bool = True,
trust_remote_code: bool = False,
speculative_config: SpeculativeConfig = None,
**kwargs):
"""Lmdeploy python inference engine.
Expand Down
3 changes: 2 additions & 1 deletion lmdeploy/pytorch/engine/executor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ def build_executor(
distributed_executor_backend: str = None,
dtype: str = 'auto',
specdecode_config: SpecDecodeConfig = None,
trust_remote_code: bool = False,
) -> ExecutorBase:
"""Build model agent executor."""
logger = get_logger('lmdeploy')
Expand All @@ -71,7 +72,7 @@ def build_executor(

model_config = ModelConfig.from_pretrained(
model_path,
trust_remote_code=True,
trust_remote_code=trust_remote_code,
dtype=dtype,
hf_overrides=misc_config.hf_overrides,
dist_config=dist_config,
Expand Down
30 changes: 23 additions & 7 deletions lmdeploy/serve/core/async_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ def __init__(self,
backend_config: TurbomindEngineConfig | PytorchEngineConfig | None = None,
chat_template_config: ChatTemplateConfig | None = None,
max_log_len: int | None = None,
trust_remote_code: bool = False,
speculative_config: SpeculativeConfig | None = None,
**kwargs) -> None:
logger.info(f'input backend={backend}, backend_config={backend_config}')
Expand All @@ -118,21 +119,25 @@ def __init__(self,
if backend == 'turbomind' else PytorchEngineConfig())
self.model_name = model_name if model_name else model_path
self.chat_template = get_chat_template(model_path, chat_template_config)
self.tokenizer = Tokenizer(model_path)
self.tokenizer = Tokenizer(model_path, trust_remote_code=trust_remote_code)
self.prompt_processor = MultimodalProcessor(self.tokenizer, self.chat_template)
self.hf_gen_cfg = get_hf_gen_cfg(model_path)
self.arch, self.hf_cfg = get_model_arch(model_path)
self.hf_gen_cfg = get_hf_gen_cfg(model_path, trust_remote_code=trust_remote_code)
self.arch, self.hf_cfg = get_model_arch(model_path, trust_remote_code=trust_remote_code)
self.session_len = (_get_and_verify_max_len(self.hf_cfg, None)
if backend_config.session_len is None else backend_config.session_len)
backend_config.session_len = self.session_len
if speculative_config is not None and backend == 'turbomind':
logger.warning('speculative decoding is not supported by turbomind ')
# build backend engine
if backend == 'turbomind':
self.engine = self._build_turbomind(model_path=model_path, backend_config=backend_config, **kwargs)
self.engine = self._build_turbomind(model_path=model_path,
backend_config=backend_config,
trust_remote_code=trust_remote_code,
**kwargs)
elif backend == 'pytorch':
self.engine = self._build_pytorch(model_path=model_path,
backend_config=backend_config,
trust_remote_code=trust_remote_code,
speculative_config=speculative_config,
**kwargs)
else:
Expand Down Expand Up @@ -169,19 +174,30 @@ def __enter__(self):
def __exit__(self, exc_type, exc_value, traceback):
self.close()

def _build_turbomind(self, model_path: str, backend_config: TurbomindEngineConfig | None = None, **kwargs):
def _build_turbomind(self,
model_path: str,
backend_config: TurbomindEngineConfig | None = None,
trust_remote_code: bool = False,
**kwargs):
"""Inner build method for turbomind backend."""
from lmdeploy import turbomind as tm
return tm.TurboMind.from_pretrained(model_path, engine_config=backend_config, **kwargs)
return tm.TurboMind.from_pretrained(model_path,
engine_config=backend_config,
trust_remote_code=trust_remote_code,
**kwargs)

def _build_pytorch(self,
model_path: str,
backend_config: PytorchEngineConfig | None = None,
trust_remote_code: bool = False,
speculative_config: SpeculativeConfig | None = None,
**kwargs):
"""Inner build method for pytorch backend."""
from lmdeploy.pytorch.engine import Engine
return Engine.from_pretrained(model_path, engine_config=backend_config, speculative_config=speculative_config)
return Engine.from_pretrained(model_path,
engine_config=backend_config,
trust_remote_code=trust_remote_code,
speculative_config=speculative_config)

def _build_stat_loggers(self):
self.stat_loggers = []
Expand Down
13 changes: 11 additions & 2 deletions lmdeploy/serve/core/vl_async_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def __init__(self,
backend: Literal['turbomind', 'pytorch'] = 'turbomind',
backend_config: TurbomindEngineConfig | PytorchEngineConfig | None = None,
vision_config: VisionConfig | None = None,
trust_remote_code: bool = False,
**kwargs) -> None:
from lmdeploy.serve.processors import MultimodalProcessor
from lmdeploy.utils import try_import_deeplink
Expand All @@ -27,8 +28,16 @@ def __init__(self,
if backend_config and backend_config.enable_prefix_caching:
backend_config.enable_prefix_caching = False
logger.warning('Prefix caching is disabled since LMDeploy hasn\'t support in on VL models yet')
self.vl_encoder = ImageEncoder(model_path, backend, vision_config, backend_config=backend_config)
super().__init__(model_path, backend=backend, backend_config=backend_config, **kwargs)
self.vl_encoder = ImageEncoder(model_path,
backend,
vision_config,
backend_config=backend_config,
trust_remote_code=trust_remote_code)
super().__init__(model_path,
backend=backend,
backend_config=backend_config,
trust_remote_code=trust_remote_code,
**kwargs)
# Update prompt_processor to support multimodal processing
self.prompt_processor = MultimodalProcessor(self.tokenizer,
self.chat_template,
Expand Down
4 changes: 3 additions & 1 deletion lmdeploy/serve/openai/api_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -1415,6 +1415,7 @@ def serve(model_path: str,
max_log_len: int | None = None,
disable_fastapi_docs: bool = False,
max_concurrent_requests: int | None = None,
trust_remote_code: bool = False,
reasoning_parser: str | None = None,
tool_call_parser: str | None = None,
allow_terminate_by_client: bool = False,
Expand Down Expand Up @@ -1487,7 +1488,7 @@ def serve(model_path: str,
http_or_https = 'https'

handle_torchrun()
_, pipeline_class = get_task(backend, model_path)
_, pipeline_class = get_task(backend, model_path, trust_remote_code=trust_remote_code)
if isinstance(backend_config, PytorchEngineConfig):
backend_config.enable_mp_engine = True
# router replay
Expand All @@ -1499,6 +1500,7 @@ def serve(model_path: str,
backend_config=backend_config,
chat_template_config=chat_template_config,
max_log_len=max_log_len,
trust_remote_code=trust_remote_code,
speculative_config=speculative_config,
**kwargs)
# set reasoning parser and tool parser
Expand Down
Loading
Loading