Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/pr_ete_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ jobs:
exit 1
- name: Test restful server - turbomind InternVL3-38B
run: |
CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/OpenGVLab/InternVL3-38B --tp 2 --backend turbomind --logprobs-mode raw_logprobs --allow-terminate-by-client > ${{env.SERVER_LOG}}/turbomind_InternVL3-38B_start_restful.log 2>&1 &
CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/OpenGVLab/InternVL3-38B --tp 2 --backend turbomind --logprobs-mode raw_logprobs --allow-terminate-by-client --trust-remote-code > ${{env.SERVER_LOG}}/turbomind_InternVL3-38B_start_restful.log 2>&1 &
echo "restful_pid=$!"
for i in $(seq 1 180)
do
Expand Down Expand Up @@ -169,7 +169,7 @@ jobs:
exit 1
- name: Test restful server - pytorch InternVL3_5-30B-A3B
run: |
CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/OpenGVLab/InternVL3_5-30B-A3B --tp 2 --backend pytorch --logprobs-mode raw_logprobs --allow-terminate-by-client > ${{env.SERVER_LOG}}/pytorch_InternVL3_5-30B-A3B_start_restful.log 2>&1 &
CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/OpenGVLab/InternVL3_5-30B-A3B --tp 2 --backend pytorch --logprobs-mode raw_logprobs --allow-terminate-by-client --trust-remote-code > ${{env.SERVER_LOG}}/pytorch_InternVL3_5-30B-A3B_start_restful.log 2>&1 &
echo "restful_pid=$!"
for i in $(seq 1 180)
do
Expand Down
3 changes: 2 additions & 1 deletion autotest/tools/pipeline/llm_case.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@ def run_pipeline_chat_test(model_path, run_config, cases_path, is_pr_test: bool

print('backend_config config: ' + str(backend_config))
print('speculative_config config: ' + str(speculative_config))
pipe = pipeline(model_path, backend_config=backend_config, speculative_config=speculative_config)
pipe = pipeline(model_path, backend_config=backend_config, speculative_config=speculative_config,
trust_remote_code=True)

cases_path = os.path.join(cases_path)
with open(cases_path) as f:
Expand Down
2 changes: 1 addition & 1 deletion autotest/tools/pipeline/mllm_case.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def run_pipeline_mllm_test(model_path, run_config, resource_path, is_pr_test: bo
print(f"Warning: Cannot set attribute '{attr_name}' on backend_config. Skipping.")

print('backend_config config: ' + str(backend_config))
pipe = pipeline(model_path, backend_config=backend_config)
pipe = pipeline(model_path, backend_config=backend_config, trust_remote_code=True)

image = load_image(f'{resource_path}/{PIC1}')

Expand Down
1 change: 1 addition & 0 deletions autotest/utils/config_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ def get_cli_common_param(run_config: dict[str, Any]) -> str:

# Extra params
cli_params.append(get_cli_str(extra_params))
cli_params.append('--trust-remote-code')

return ' '.join(cli_params).strip()

Expand Down
2 changes: 2 additions & 0 deletions autotest/utils/quantization_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ def quantization(config,
else:
quantization_cmd += ' --batch-size 32'

quantization_cmd += ' --trust-remote-code'

with open(quantization_log, 'w') as f:
# remove existing folder
subprocess.run([' '.join(['rm -rf', quantization_model_path])],
Expand Down
12 changes: 8 additions & 4 deletions benchmark/profile_pipeline_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,12 +131,14 @@ def sample_random_requests(

class Engine:

def __init__(self, model_path: str, engine_config, csv: str, speculative_config: SpeculativeConfig | None = None):
def __init__(self, model_path: str, engine_config, csv: str, speculative_config: SpeculativeConfig | None = None,
trust_remote_code: bool = False):
self.pipe = pipeline(model_path,
backend_config=engine_config,
log_level='ERROR',
speculative_config=speculative_config)
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
speculative_config=speculative_config,
trust_remote_code=trust_remote_code)
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=trust_remote_code)
self.return_routed_experts = getattr(self.pipe.backend_config, 'enable_return_routed_experts', False)
self.csv = csv

Expand Down Expand Up @@ -254,6 +256,7 @@ def parse_args():
ArgumentHelper.top_k(parser)
ArgumentHelper.log_level(parser)
ArgumentHelper.backend(parser)
ArgumentHelper.trust_remote_code(parser)

# pytorch engine args
pt_group = parser.add_argument_group('PyTorch engine arguments')
Expand Down Expand Up @@ -319,7 +322,8 @@ def main():
)

speculative_config = get_speculative_config(args)
engine = Engine(args.model_path, engine_config, csv=args.csv, speculative_config=speculative_config)
engine = Engine(args.model_path, engine_config, csv=args.csv, speculative_config=speculative_config,
trust_remote_code=args.trust_remote_code)

profiler = Profiler(args.stream_output, [50, 75, 95, 99])

Expand Down
26 changes: 17 additions & 9 deletions benchmark/profile_restful_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,18 +441,20 @@ def get_model(pretrained_model_name_or_path: str) -> str:
return pretrained_model_name_or_path


def get_tokenizer(pretrained_model_name_or_path: str,
                  trust_remote_code: bool = False) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
    """Load the tokenizer for a model directory, hub id, or raw tokenizer file.

    Args:
        pretrained_model_name_or_path: a local model dir, a hub model id, or a
            raw ``.json`` / ``.model`` tokenizer file (delegated to sglang).
        trust_remote_code: forwarded to ``AutoTokenizer.from_pretrained``;
            defaults to False so remote code only runs when explicitly allowed.

    Returns:
        A ``PreTrainedTokenizer`` or ``PreTrainedTokenizerFast`` instance.
    """
    if pretrained_model_name_or_path.endswith('.json') or pretrained_model_name_or_path.endswith('.model'):
        # Raw tokenizer files go through sglang's generic loader.
        # NOTE(review): this branch does not forward `trust_remote_code`; it
        # loads a plain tokenizer file, so no remote code should be involved —
        # confirm against sglang's loader signature.
        from sglang.srt.hf_transformers_utils import get_tokenizer

        return get_tokenizer(pretrained_model_name_or_path)

    if pretrained_model_name_or_path is not None and not os.path.exists(pretrained_model_name_or_path):
        # Not a local path: resolve (download) the model first.
        pretrained_model_name_or_path = get_model(pretrained_model_name_or_path)
    return AutoTokenizer.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust_remote_code)


def get_processor(pretrained_model_name_or_path: str, ) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
def get_processor(pretrained_model_name_or_path: str,
trust_remote_code: bool = False) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
assert (pretrained_model_name_or_path is not None and pretrained_model_name_or_path != '')
if pretrained_model_name_or_path.endswith('.json') or pretrained_model_name_or_path.endswith('.model'):
from sglang.srt.utils.hf_transformers_utils import get_processor
Expand All @@ -461,7 +463,7 @@ def get_processor(pretrained_model_name_or_path: str, ) -> PreTrainedTokenizer |

if pretrained_model_name_or_path is not None and not os.path.exists(pretrained_model_name_or_path):
pretrained_model_name_or_path = get_model(pretrained_model_name_or_path)
return AutoProcessor.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
return AutoProcessor.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust_remote_code)


ASYNC_REQUEST_FUNCS = {
Expand Down Expand Up @@ -1172,9 +1174,9 @@ def parse_request_rate_range(request_rate_range):
return list(map(int, request_rate_range.split(',')))


def check_chat_template(model_path, trust_remote_code: bool = False):
    """Return True if the model's tokenizer config defines a chat template.

    Args:
        model_path: local path or hub id of the model whose tokenizer is loaded.
        trust_remote_code: forwarded to ``AutoTokenizer.from_pretrained``;
            defaults to False so remote code only runs when explicitly allowed.
    """
    try:
        # BUG FIX: the previous revision read the module-level `args` namespace
        # (`args.trust_remote_code`) instead of this function's parameter, which
        # silently ignored the value passed by callers and would raise
        # NameError if the function were called before `args` is parsed.
        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=trust_remote_code)
        return 'chat_template' in tokenizer.init_kwargs
    except Exception as e:
        print(f'Fail to load tokenizer config with error={e}')
Expand Down Expand Up @@ -1256,15 +1258,15 @@ def run_benchmark(args_: argparse.Namespace):
'using `--model`.')
sys.exit(1)

if not check_chat_template(model_path):
if not check_chat_template(model_path, args.trust_remote_code):
print('\nWARNING It is recommended to use the `Chat` or `Instruct` '
'model for benchmarking.\n'
'Because when the tokenizer counts the output tokens, if '
'there is gibberish, it might count incorrectly.\n')

print(f'{args}\n')

tokenizer = get_tokenizer(tokenizer_id)
tokenizer = get_tokenizer(tokenizer_id, args.trust_remote_code)

if args.dataset_name == 'sharegpt':
assert args.random_input_len is None and args.random_output_len is None
Expand All @@ -1286,7 +1288,7 @@ def run_benchmark(args_: argparse.Namespace):
dataset_path=args.dataset_path,
)
elif args.dataset_name == 'image':
processor = get_processor(model_path)
processor = get_processor(model_path, args.trust_remote_code)
input_requests = sample_image_requests(
num_requests=args.num_prompts,
image_count=args.image_count,
Expand Down Expand Up @@ -1502,5 +1504,11 @@ def set_ulimit(target_soft_limit=65535):
default=None,
help='Disable a warmup request before the benchmark. ',
)
parser.add_argument(
'--trust-remote-code',
action='store_true',
default=False,
help='Trust remote code.',
)
args = parser.parse_args()
run_benchmark(args)
17 changes: 13 additions & 4 deletions benchmark/profile_throughput.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,17 +135,20 @@ class Engine:

def __init__(self, model_path: str,
engine_config: PytorchEngineConfig | TurbomindEngineConfig,
speculative_config: SpeculativeConfig):
speculative_config: SpeculativeConfig,
trust_remote_code: bool = False):
self.tokenizer = Tokenizer(model_path)
if isinstance(engine_config, TurbomindEngineConfig):
from lmdeploy.turbomind import TurboMind
tm_model = TurboMind.from_pretrained(model_path, engine_config=engine_config)
tm_model = TurboMind.from_pretrained(model_path, engine_config=engine_config,
trust_remote_code=trust_remote_code)
self.backend = 'turbomind'
elif isinstance(engine_config, PytorchEngineConfig):
from lmdeploy.pytorch.engine import Engine as PytorchEngine
tm_model = PytorchEngine.from_pretrained(model_path,
engine_config=engine_config,
speculative_config=speculative_config)
speculative_config=speculative_config,
trust_remote_code=trust_remote_code)
self.backend = 'pytorch'

self.tm_model = tm_model
Expand Down Expand Up @@ -295,6 +298,12 @@ def parse_args():
help='Range of sampled ratio of input/output length, '
'used only for random dataset.',
)
parser.add_argument(
'--trust-remote-code',
action='store_true',
default=False,
help='Trust remote code.',
)
# other args
ArgumentHelper.top_p(parser)
ArgumentHelper.temperature(parser)
Expand Down Expand Up @@ -382,7 +391,7 @@ def main():
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())

speculative_config = get_speculative_config(args)
engine = Engine(args.model_path, engine_config, speculative_config)
engine = Engine(args.model_path, engine_config, speculative_config, trust_remote_code=args.trust_remote_code)

if args.dataset_name == 'sharegpt':
assert args.random_input_len is None and args.random_output_len is None
Expand Down
3 changes: 3 additions & 0 deletions lmdeploy/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def pipeline(model_path: str,
chat_template_config: ChatTemplateConfig | None = None,
log_level: str = 'WARNING',
max_log_len: int | None = None,
trust_remote_code: bool = False,
speculative_config: SpeculativeConfig | None = None,
**kwargs):
"""Create a pipeline for inference.
Expand All @@ -41,6 +42,7 @@ def pipeline(model_path: str,
``WARNING``, ``INFO``, ``DEBUG``]
max_log_len: Max number of prompt characters or prompt tokens
being printed in log.
trust_remote_code: whether to trust remote code from model repositories.
speculative_config: speculative decoding configuration.
**kwargs: additional keyword arguments passed to the pipeline.

Expand Down Expand Up @@ -73,6 +75,7 @@ def pipeline(model_path: str,
chat_template_config=chat_template_config,
log_level=log_level,
max_log_len=max_log_len,
trust_remote_code=trust_remote_code,
speculative_config=speculative_config,
**kwargs)

Expand Down
19 changes: 10 additions & 9 deletions lmdeploy/archs.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
logger = get_logger('lmdeploy')


def autoget_backend(model_path: str) -> Literal['turbomind', 'pytorch']:
def autoget_backend(model_path: str, trust_remote_code: bool = False):
"""Get backend type in auto backend mode.

Args:
Expand All @@ -36,7 +36,7 @@ def autoget_backend(model_path: str) -> Literal['turbomind', 'pytorch']:
is_turbomind_installed = True
try:
from lmdeploy.turbomind.supported_models import is_supported as is_supported_turbomind
turbomind_has = is_supported_turbomind(model_path)
turbomind_has = is_supported_turbomind(model_path, trust_remote_code=trust_remote_code)
except ImportError:
is_turbomind_installed = False

Expand All @@ -57,7 +57,8 @@ def autoget_backend(model_path: str) -> Literal['turbomind', 'pytorch']:

def autoget_backend_config(
model_path: str,
backend_config: PytorchEngineConfig | TurbomindEngineConfig | None = None
backend_config: PytorchEngineConfig | TurbomindEngineConfig | None = None,
trust_remote_code: bool = False
) -> tuple[Literal['turbomind', 'pytorch'], PytorchEngineConfig | TurbomindEngineConfig]:
"""Get backend config automatically.

Expand All @@ -75,7 +76,7 @@ def autoget_backend_config(
if isinstance(backend_config, PytorchEngineConfig):
return 'pytorch', backend_config

backend = autoget_backend(model_path)
backend = autoget_backend(model_path, trust_remote_code=trust_remote_code)
config = PytorchEngineConfig() if backend == 'pytorch' else TurbomindEngineConfig()
if backend_config is not None:
if type(backend_config) is type(config):
Expand Down Expand Up @@ -128,14 +129,14 @@ def check_vl_llm(backend: str, config: dict) -> bool:
return False


def get_task(backend: str, model_path: str):
def get_task(backend: str, model_path: str, trust_remote_code: bool = False):
"""Get pipeline type and pipeline class from model config."""
from lmdeploy.serve.core import AsyncEngine

if os.path.exists(os.path.join(model_path, 'triton_models', 'weights')):
# workspace model
return 'llm', AsyncEngine
_, config = get_model_arch(model_path)
_, config = get_model_arch(model_path, trust_remote_code=trust_remote_code)
if check_vl_llm(backend, config.to_dict()):
from lmdeploy.serve.core import VLAsyncEngine
return 'vlm', VLAsyncEngine
Expand All @@ -144,17 +145,17 @@ def get_task(backend: str, model_path: str):
return 'llm', AsyncEngine


def get_model_arch(model_path: str):
def get_model_arch(model_path: str, trust_remote_code: bool = False):
"""Get a model's architecture and configuration.

Args:
model_path(str): the model path
"""
try:
cfg = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
cfg = AutoConfig.from_pretrained(model_path, trust_remote_code=trust_remote_code)
except Exception as e: # noqa
from transformers import PretrainedConfig
cfg = PretrainedConfig.from_pretrained(model_path, trust_remote_code=True)
cfg = PretrainedConfig.from_pretrained(model_path, trust_remote_code=trust_remote_code)

_cfg = cfg.to_dict()
if _cfg.get('architectures', None):
Expand Down
9 changes: 5 additions & 4 deletions lmdeploy/cli/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def input_prompt():
return '\n'.join(iter(input, sentinel))


def build_pipe(model_path, backend, **kwargs):
def build_pipe(model_path, backend, trust_remote_code=False, **kwargs):
engine_config = None
if kwargs.get('enable_prefix_caching', False):
print('interactive chat cannot be used when prefix caching is enabled')
Expand Down Expand Up @@ -48,6 +48,7 @@ def build_pipe(model_path, backend, **kwargs):
backend_config=engine_config,
chat_template_config=chat_template_config,
log_level='ERROR',
trust_remote_code=trust_remote_code,
**kwargs)
return pipe

Expand All @@ -68,12 +69,12 @@ def get_adapter_name(adapters=None, **kwargs):
return list(adapters.keys())[0]


def main(model_path, backend, **kwargs):
def main(model_path, backend, trust_remote_code=False, **kwargs):
if backend != 'pytorch':
# set auto backend mode
backend = autoget_backend(model_path)
backend = autoget_backend(model_path, trust_remote_code=trust_remote_code)
quit = False
with build_pipe(model_path, backend, **kwargs) as pipe:
with build_pipe(model_path, backend, trust_remote_code=trust_remote_code, **kwargs) as pipe:
gen_config = build_gen_config(**kwargs)
adapter_name = get_adapter_name(**kwargs)
while not quit:
Expand Down
1 change: 1 addition & 0 deletions lmdeploy/cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ def add_parser_chat():
# model args
ArgumentHelper.revision(parser)
ArgumentHelper.download_dir(parser)
ArgumentHelper.trust_remote_code(parser)

# pytorch engine args
pt_group = parser.add_argument_group('PyTorch engine arguments')
Expand Down
4 changes: 4 additions & 0 deletions lmdeploy/cli/lite.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def add_parser_auto_awq():
ArgumentHelper.calib_batchsize(parser)
ArgumentHelper.calib_search_scale(parser)
ArgumentHelper.dtype(parser)
ArgumentHelper.trust_remote_code(parser)
parser.add_argument('--device', type=str, default='cuda', help='Device for weight quantization (cuda or npu)')
parser.add_argument('--w-bits', type=int, default=4, help='Bit number for weight quantization')
parser.add_argument('--w-sym', action='store_true', help='Whether to do symmetric quantization')
Expand All @@ -56,6 +57,7 @@ def add_parser_auto_gptq():
ArgumentHelper.calib_seqlen(parser)
ArgumentHelper.calib_batchsize(parser)
ArgumentHelper.dtype(parser)
ArgumentHelper.trust_remote_code(parser)
parser.add_argument('--w-bits', type=int, default=4, help='Bit number for weight quantization')
parser.add_argument('--w-group-size',
type=int,
Expand All @@ -78,6 +80,7 @@ def add_parser_calibrate():
ArgumentHelper.calib_batchsize(parser)
ArgumentHelper.calib_search_scale(parser)
ArgumentHelper.dtype(parser)
ArgumentHelper.trust_remote_code(parser)

@staticmethod
def add_parser_smooth_quant():
Expand All @@ -102,6 +105,7 @@ def add_parser_smooth_quant():
ArgumentHelper.quant_dtype(parser)
ArgumentHelper.revision(parser)
ArgumentHelper.download_dir(parser)
ArgumentHelper.trust_remote_code(parser)

@staticmethod
def auto_awq(args):
Expand Down
Loading
Loading