diff --git a/.github/scripts/action_tools.py b/.github/scripts/action_tools.py
index e296c4d760..96754200b7 100644
--- a/.github/scripts/action_tools.py
+++ b/.github/scripts/action_tools.py
@@ -94,7 +94,7 @@ def evaluate(models: list[str],
     for idx, ori_model in enumerate(models):
         print()
         print(50 * '==')
-        print(f'Start evaluating {idx+1}/{num_model} {ori_model} ...')
+        print(f'Start evaluating {idx + 1}/{num_model} {ori_model} ...')
         model = ori_model.lower()

         lmdeploy_dir = os.path.abspath(os.environ['LMDEPLOY_DIR'])
diff --git a/.github/scripts/eval_chat_config.py b/.github/scripts/eval_chat_config.py
index 26caa0b103..750968530b 100644
--- a/.github/scripts/eval_chat_config.py
+++ b/.github/scripts/eval_chat_config.py
@@ -133,7 +133,7 @@
         dict(role='HUMAN', begin='[INST] ', end=' [/INST]'),
         dict(role='BOT', begin='', end='', generate=True),
     ],
-                 eos_token_id=2)
+    eos_token_id=2)

 MAX_SESSION_LEN = 2048
 MAX_NEW_TOKENS = 1024
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 0d4cf15d30..be07bba848 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,4 +1,8 @@
 repos:
+  - repo: https://github.com/hhatto/autopep8
+    rev: v2.3.2
+    hooks:
+      - id: autopep8
   - repo: https://github.com/astral-sh/ruff-pre-commit
     rev: v0.15.4
     hooks:
diff --git a/autotest/interface/restful/test_restful_chat_completions_v1.py b/autotest/interface/restful/test_restful_chat_completions_v1.py
index 67e166a568..dfff5597e1 100644
--- a/autotest/interface/restful/test_restful_chat_completions_v1.py
+++ b/autotest/interface/restful/test_restful_chat_completions_v1.py
@@ -227,14 +227,14 @@ def test_array_stopwords_streaming(self, backend, model_case):
     @pytest.mark.internlm2_5
     def test_special_words(self, backend, model_case):
         message = '<|im_start|>system\n当开启工具以及代码时,根据需求选择合适的工具进行调用\n' \
-                  '<|im_end|><|im_start|>system name=<|interpreter|>\n你现在已经' \
-                  '能够在一个有状态的 Jupyter 笔记本环境中运行 Python 代码。当你向 python ' \
-                  '发送含有 Python >代码的消息时,它将在该环境中执行。这个工具适用于多种场景,' \
-                  '如数据分析或处理(包括数据操作、统计分析、图表绘制),复杂的计算问题(解决数学和物理' \
-                  '难题),编程示例(理解编程概念或特性),文本处理和分析(比如文本解析和自然语言处理),' \
-                  '机器学习和数据科学(用于展示模型训练和数据可视化),以及文件操作和数据导入(处理CSV、' \
-                  'JSON等格式的文件)。<|im_end|>\n<|im_start|>user\n设 $L$ 为圆周$x^2+y^2=2x$,' \
-                  '计算曲线积分:$I=\\int_L{x\\mathrm{d}s}=$<|im_end|>\n<|im_start|>assistant'
+            '<|im_end|><|im_start|>system name=<|interpreter|>\n你现在已经' \
+            '能够在一个有状态的 Jupyter 笔记本环境中运行 Python 代码。当你向 python ' \
+            '发送含有 Python >代码的消息时,它将在该环境中执行。这个工具适用于多种场景,' \
+            '如数据分析或处理(包括数据操作、统计分析、图表绘制),复杂的计算问题(解决数学和物理' \
+            '难题),编程示例(理解编程概念或特性),文本处理和分析(比如文本解析和自然语言处理),' \
+            '机器学习和数据科学(用于展示模型训练和数据可视化),以及文件操作和数据导入(处理CSV、' \
+            'JSON等格式的文件)。<|im_end|>\n<|im_start|>user\n设 $L$ 为圆周$x^2+y^2=2x$,' \
+            '计算曲线积分:$I=\\int_L{x\\mathrm{d}s}=$<|im_end|>\n<|im_start|>assistant'
         api_client = APIClient(BASE_URL)
         model_name = api_client.available_models[0]
         for output in api_client.chat_completions_v1(model=model_name,
diff --git a/autotest/interface/restful/test_restful_generate.py b/autotest/interface/restful/test_restful_generate.py
index e08b5c3a92..65f1638c39 100644
--- a/autotest/interface/restful/test_restful_generate.py
+++ b/autotest/interface/restful/test_restful_generate.py
@@ -748,7 +748,7 @@ def single_request(idx):

         success_rate = success_count / 20
         assert success_rate == 1.0, \
-            f'Stress test failed: success rate {success_rate*100}% < 80%'
+            f'Stress test failed: success rate {success_rate * 100}% < 80%'

         if success_count > 0:
             avg_latency = total_latency / success_count
diff --git a/autotest/tools/pipeline/mllm_case.py b/autotest/tools/pipeline/mllm_case.py
index 2ac134f440..cbf01f4211 100644
--- a/autotest/tools/pipeline/mllm_case.py
+++ b/autotest/tools/pipeline/mllm_case.py
@@ -210,7 +210,7 @@ def load_video(video_path, bound=None, num_segments=32):

     question = ''
     for i in range(len(imgs)):
-        question = question + f'Frame{i+1}: {IMAGE_TOKEN}\n'
+        question = question + f'Frame{i + 1}: {IMAGE_TOKEN}\n'

     if lang == 'cn':
         question += '视频里有什么动物,它在做什么?'
diff --git a/lmdeploy/api.py b/lmdeploy/api.py
index 11f31c1de4..72d107c2ee 100644
--- a/lmdeploy/api.py
+++ b/lmdeploy/api.py
@@ -95,7 +95,7 @@ def serve(model_path: str,
     This function has been removed. Please use alternative methods.

     This will run the api_server in a subprocess.
-    """ # noqa E501
+    """  # noqa E501
     raise NotImplementedError("The 'serve' function is no longer available. "
                               'This function has been deprecated and removed.')
diff --git a/lmdeploy/cli/utils.py b/lmdeploy/cli/utils.py
index 39469c6f07..0d134a0c40 100644
--- a/lmdeploy/cli/utils.py
+++ b/lmdeploy/cli/utils.py
@@ -61,7 +61,7 @@ def get_lora_adapters(adapters: list[str]):
     else:
         for pair in adapters:
             assert '=' in pair, f'Multiple lora paths must in format of ' \
-                                f'xxx=yyy. But given: {pair}'
+                f'xxx=yyy. But given: {pair}'
             name, path = pair.strip().split('=', 1)
             assert name not in output, f'Multiple lora paths with repeated lora name: {name}'
             output[name] = path
@@ -420,8 +420,7 @@ def calib_batchsize(parser):
             '--batch-size',
             type=int,
             default=1,
-            help=\
-            'The batch size for running the calib samples. Low GPU mem requires small batch_size. Large batch_size reduces the calibration time while costs more VRAM'  # noqa
+            help='The batch size for running the calib samples. Low GPU mem requires small batch_size. Large batch_size reduces the calibration time while costs more VRAM'  # noqa
         )

     @staticmethod
@@ -432,8 +431,7 @@ def calib_search_scale(parser):
             '--search-scale',
             action='store_true',
             default=False,
-            help=\
-            'Whether search scale ratio. Default to be disabled, which means only smooth quant with 0.5 ratio will be applied'  # noqa
+            help='Whether search scale ratio. Default to be disabled, which means only smooth quant with 0.5 ratio will be applied'  # noqa
         )

     @staticmethod
@@ -454,8 +452,7 @@ def chat_template(parser):
             '--chat-template',
             type=str,
             default=None,
-            help=\
-            'A JSON file or string that specifies the chat template configuration. '  # noqa
+            help='A JSON file or string that specifies the chat template configuration. '  # noqa
             'Please refer to https://lmdeploy.readthedocs.io/en/latest/advance/chat_template.html for the specification'  # noqa
         )
diff --git a/lmdeploy/lite/utils/__init__.py b/lmdeploy/lite/utils/__init__.py
index d801a1f447..1d2df866f8 100644
--- a/lmdeploy/lite/utils/__init__.py
+++ b/lmdeploy/lite/utils/__init__.py
@@ -2,14 +2,14 @@

 from .batch_split import concat_decoder_layer_outputs, split_decoder_layer_inputs
 from .cal_qparams import (
-        QParams,
-        cal_qparams_per_channel_absmax,
-        cal_qparams_per_channel_minmax,
-        cal_qparams_per_group_absmax,
-        cal_qparams_per_group_minmax,
-        cal_qparams_per_tensor_absmax,
-        cal_qparams_per_tensor_minmax,
-        precise_round,
+    QParams,
+    cal_qparams_per_channel_absmax,
+    cal_qparams_per_channel_minmax,
+    cal_qparams_per_group_absmax,
+    cal_qparams_per_group_minmax,
+    cal_qparams_per_tensor_absmax,
+    cal_qparams_per_tensor_minmax,
+    precise_round,
 )
 from .calib_dataloader import get_calib_loaders
 from .collect import bimap_name_mod, collect_target_modules, collect_target_weights
diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py
index 3c7b43b5a4..77db60dfa6 100644
--- a/lmdeploy/messages.py
+++ b/lmdeploy/messages.py
@@ -451,16 +451,16 @@ def __post_init__(self):
         assert self.quant_policy in (0, 4, 8), 'invalid quant_policy'
         assert self.device_type in ['cuda', 'ascend', 'maca', 'camb'], (f'invalid device_type: {self.device_type}')
         assert self.kernel_block_size >= 16 and \
-               (self.kernel_block_size & (self.kernel_block_size - 1)) == 0, \
-               f'kernel_block_size must be >= 16 and a power of 2, but got {self.kernel_block_size}'
+            (self.kernel_block_size & (self.kernel_block_size - 1)) == 0, \
+            f'kernel_block_size must be >= 16 and a power of 2, but got {self.kernel_block_size}'
         assert self.block_size >= self.kernel_block_size and \
-               self.block_size % self.kernel_block_size == 0, \
-               (f'block_size must be >= kernel_block_size and an integer multiple '
-                f'of kernel_block_size, but got block_size {self.block_size} '
-                f'and kernel_block_size {self.kernel_block_size}')
+            self.block_size % self.kernel_block_size == 0, \
+            (f'block_size must be >= kernel_block_size and an integer multiple '
+             f'of kernel_block_size, but got block_size {self.block_size} '
+             f'and kernel_block_size {self.kernel_block_size}')
         if self.quant_policy > 0 and self.device_type not in ['cuda', 'ascend']:
             assert False, \
-                   'kv cache quantization only works for CUDA and ASCEND.'
+                'kv cache quantization only works for CUDA and ASCEND.'
         if self.device_type == 'camb' and self.block_size != 16:
             self.block_size = 16
             logger.warning('Currently, camb device requires block size to be 16, \
diff --git a/lmdeploy/metrics/loggers.py b/lmdeploy/metrics/loggers.py
index bd8c78d8c1..f28bc0dbe2 100644
--- a/lmdeploy/metrics/loggers.py
+++ b/lmdeploy/metrics/loggers.py
@@ -116,10 +116,10 @@ def log(self):
             f'{scheduler_stats.num_api_routed_reqs} / {scheduler_stats.num_api_waiting_reqs}, '
             f'Engine (running/waiting): '
             f'{scheduler_stats.num_running_reqs} / {scheduler_stats.num_waiting_reqs}, '
-            f'KV cache: {scheduler_stats.gpu_cache_usage * 100 :.1f}%, ')
+            f'KV cache: {scheduler_stats.gpu_cache_usage * 100:.1f}%, ')

         if scheduler_stats.prefix_cache_hit_rate != 0:
-            log_msg += f'Prefix cache hit rate: {scheduler_stats.prefix_cache_hit_rate * 100 :.1f}%, '
+            log_msg += f'Prefix cache hit rate: {scheduler_stats.prefix_cache_hit_rate * 100:.1f}%, '

         if spec_msg is not None:
             log_msg += spec_msg
diff --git a/lmdeploy/model.py b/lmdeploy/model.py
index dbd8939ecf..873c3524b4 100644
--- a/lmdeploy/model.py
+++ b/lmdeploy/model.py
@@ -162,10 +162,10 @@ def get_prompt(self, prompt, sequence_start=True):
                        f'{self.assistant}'
             else:
                 return f'{self.user}{prompt}{self.eoh}' \
-                       f'{self.assistant}'
+                    f'{self.assistant}'
         else:
             return f'{self.separator}{self.user}{prompt}{self.eoh}' \
-                   f'{self.assistant}'
+                f'{self.assistant}'

     def messages2prompt(self, messages, sequence_start=True, **kwargs):
         """Return the prompt that is concatenated with other elements in the
diff --git a/lmdeploy/pipeline.py b/lmdeploy/pipeline.py
index ca4c42bba0..818248a374 100644
--- a/lmdeploy/pipeline.py
+++ b/lmdeploy/pipeline.py
@@ -122,7 +122,8 @@ def infer(self,
                 res = res.extend(out) if res else out
                 outputs.append(res)
         finally:
-            if pbar: pbar.close()  # noqa
+            if pbar:
+                pbar.close()  # noqa
         if is_single:
             return outputs[0]
         return outputs
diff --git a/lmdeploy/profiler.py b/lmdeploy/profiler.py
index d687d5a485..b011e54b48 100644
--- a/lmdeploy/profiler.py
+++ b/lmdeploy/profiler.py
@@ -166,8 +166,8 @@ def save_csv(self, csv_file: str, hyperparams):
                 f'{self.rps:.3f}',
                 f'{(self.input_throughput):.3f}',
                 f'{self.output_throughput:.3f}',
-                f'{self.e2e_mean*1000:.3f}',
-                f'{self.ttft_mean*1000:.3f}' if self.stream_output else '-',
-                f'{self.tpot_mean*1000:.3f}',
-                f'{self.itls_mean*1000:.3f}' if self.stream_output else '-',
+                f'{self.e2e_mean * 1000:.3f}',
+                f'{self.ttft_mean * 1000:.3f}' if self.stream_output else '-',
+                f'{self.tpot_mean * 1000:.3f}',
+                f'{self.itls_mean * 1000:.3f}' if self.stream_output else '-',
             ])
diff --git a/lmdeploy/pytorch/backends/dlinfer/maca/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/maca/op_backend.py
index 78376df43f..7266b37d25 100644
--- a/lmdeploy/pytorch/backends/dlinfer/maca/op_backend.py
+++ b/lmdeploy/pytorch/backends/dlinfer/maca/op_backend.py
@@ -55,7 +55,7 @@ def get_total_slots():
         is_prefill_no_cache = False
         if not step_context.is_decoding:
             is_prefill_no_cache = \
-            all((step_context.q_seqlens ==
+                all((step_context.q_seqlens ==
                      step_context.kv_seqlens).tolist())
         q_start_loc = step_context.q_start_loc
         cu_seqlens = torch.cat((q_start_loc, step_context.q_seqlens.sum().unsqueeze(0))).int()
diff --git a/lmdeploy/pytorch/engine/model_agent/agent.py b/lmdeploy/pytorch/engine/model_agent/agent.py
index 55fb6e807b..fbd0de9e66 100644
--- a/lmdeploy/pytorch/engine/model_agent/agent.py
+++ b/lmdeploy/pytorch/engine/model_agent/agent.py
@@ -1180,7 +1180,7 @@ def wakeup(self, tags: list[str] | None = None):
         if 'weights' in tags:
             device = next(self.patched_model.get_model().parameters()).device
             assert device.type in ['cpu', 'meta']
-            spec_model =  self.spec_agent.get_model()
+            spec_model = self.spec_agent.get_model()

             if device.type == 'cpu':
                 self.patched_model.get_model().to(torch.cuda.current_device())
diff --git a/lmdeploy/pytorch/kernels/__init__.py b/lmdeploy/pytorch/kernels/__init__.py
index ae4a278777..4befc9b134 100644
--- a/lmdeploy/pytorch/kernels/__init__.py
+++ b/lmdeploy/pytorch/kernels/__init__.py
@@ -1,10 +1,10 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from .w8a8_triton_kernels import (
-        matmul_kernel_dynamic_quant,
-        per_channel_quant,
-        per_token_quant_int8,
-        rms_norm_dynamic_quant,
+    matmul_kernel_dynamic_quant,
+    per_channel_quant,
+    per_token_quant_int8,
+    rms_norm_dynamic_quant,
 )

 __all__ = [
diff --git a/lmdeploy/pytorch/kernels/cuda/blocked_gemm_fp8.py b/lmdeploy/pytorch/kernels/cuda/blocked_gemm_fp8.py
index e6717b5ef4..474e9bd0ea 100644
--- a/lmdeploy/pytorch/kernels/cuda/blocked_gemm_fp8.py
+++ b/lmdeploy/pytorch/kernels/cuda/blocked_gemm_fp8.py
@@ -208,7 +208,7 @@ def _gemm_fp8_tma_pre_hook(nargs):
                          'BLOCK_N': 64,
                      }, num_stages=3, num_warps=4, pre_hook=_gemm_fp8_tma_pre_hook)
                  ],
-                 key=['N', 'K'])
+    key=['N', 'K'])
 @triton.jit
 def _gemm_fp8_tma_kernel(
     desc_a,
@@ -296,7 +296,7 @@ def _gemm_fp8_tma_kernel(
                          'BLOCK_N': 64,
                      }, num_stages=3, num_warps=4)
                  ],
-                 key=['N', 'K'])
+    key=['N', 'K'])
 @triton.jit
 def _gemm_fp8_kernel(
     A,
diff --git a/lmdeploy/pytorch/kernels/cuda/causal_conv1d.py b/lmdeploy/pytorch/kernels/cuda/causal_conv1d.py
index 0e7a853a90..1e42d0f7e5 100644
--- a/lmdeploy/pytorch/kernels/cuda/causal_conv1d.py
+++ b/lmdeploy/pytorch/kernels/cuda/causal_conv1d.py
@@ -121,7 +121,7 @@ def causal_conv1d_fwd_main(
                 init_col = k_val + w
                 if init_col < width - 1:
                     out_vals[i] += w_local[w] * T.cast(Init_states[seq_idx_cur, c_idx, init_col],
-                                                       T.float32)
+                                                      T.float32)
             else:
                 for w in T.unroll(width):
                     out_vals[i] += T.if_then_else(seq_idx_local[i + w] == seq_idx_cur,
diff --git a/lmdeploy/pytorch/kernels/cuda/fused_moe.py b/lmdeploy/pytorch/kernels/cuda/fused_moe.py
index 8437c16541..310dc23d6d 100644
--- a/lmdeploy/pytorch/kernels/cuda/fused_moe.py
+++ b/lmdeploy/pytorch/kernels/cuda/fused_moe.py
@@ -17,16 +17,16 @@ def get_cuda_autotune_config():
             'BLOCK_SIZE_K': 64,
             'GROUP_SIZE_M': 1,
         },
-                      num_stages=3,
-                      num_warps=8),
+            num_stages=3,
+            num_warps=8),
         triton.Config({
             'BLOCK_SIZE_M': 64,
             'BLOCK_SIZE_N': 256,
             'BLOCK_SIZE_K': 32,
             'GROUP_SIZE_M': 1,
         },
-                      num_stages=4,
-                      num_warps=4),
+            num_stages=4,
+            num_warps=4),
         # SM8
         triton.Config({
             'BLOCK_SIZE_M': 128,
@@ -34,24 +34,24 @@ def get_cuda_autotune_config():
             'BLOCK_SIZE_K': 32,
             'GROUP_SIZE_M': 1,
         },
-                      num_stages=4,
-                      num_warps=4),
+            num_stages=4,
+            num_warps=4),
         triton.Config({
             'BLOCK_SIZE_M': 64,
             'BLOCK_SIZE_N': 256,
             'BLOCK_SIZE_K': 32,
             'GROUP_SIZE_M': 1,
         },
-                      num_stages=4,
-                      num_warps=4),
+            num_stages=4,
+            num_warps=4),
         triton.Config({
             'BLOCK_SIZE_M': 64,
             'BLOCK_SIZE_N': 128,
             'BLOCK_SIZE_K': 64,
             'GROUP_SIZE_M': 1,
         },
-                      num_stages=4,
-                      num_warps=4),
+            num_stages=4,
+            num_warps=4),
         # SM7-
         triton.Config({
             'BLOCK_SIZE_M': 64,
@@ -59,24 +59,24 @@ def get_cuda_autotune_config():
             'BLOCK_SIZE_K': 32,
             'GROUP_SIZE_M': 1,
         },
-                      num_stages=4,
-                      num_warps=4),
+            num_stages=4,
+            num_warps=4),
         triton.Config({
             'BLOCK_SIZE_M': 128,
             'BLOCK_SIZE_N': 32,
             'BLOCK_SIZE_K': 32,
             'GROUP_SIZE_M': 1,
         },
-                      num_stages=4,
-                      num_warps=4),
+            num_stages=4,
+            num_warps=4),
         triton.Config({
             'BLOCK_SIZE_M': 64,
             'BLOCK_SIZE_N': 32,
             'BLOCK_SIZE_K': 32,
             'GROUP_SIZE_M': 1,
         },
-                      num_stages=5,
-                      num_warps=2),
+            num_stages=5,
+            num_warps=2),
     ]
diff --git a/lmdeploy/pytorch/kernels/cuda/rms_norm.py b/lmdeploy/pytorch/kernels/cuda/rms_norm.py
index 5238bab261..8376686e6a 100644
--- a/lmdeploy/pytorch/kernels/cuda/rms_norm.py
+++ b/lmdeploy/pytorch/kernels/cuda/rms_norm.py
@@ -191,8 +191,8 @@ def test_rms_norm(bsz, ctx_len, feat_len, dtype):
     torch_cost = (t1 - t0) / N_REPEATS * 1000
     triton_cost = (t2 - t1) / N_REPEATS * 1000

-    print(f'input {input.shape} weight {weight.shape} dtype {dtype}\n' \
-          f'    torch {torch_cost:.3f} triton {triton_cost:.3f} (ms)\n')
+    print(f'input {input.shape} weight {weight.shape} dtype {dtype}\n'
+          f'    torch {torch_cost:.3f} triton {triton_cost:.3f} (ms)\n')

 test_rms_norm(1, 8128, 5120, torch.float16)
 test_rms_norm(1, 8128, 5120, torch.float32)
diff --git a/lmdeploy/pytorch/kernels/cuda/w8a8_fused_moe.py b/lmdeploy/pytorch/kernels/cuda/w8a8_fused_moe.py
index dd8c08109c..503ae85677 100644
--- a/lmdeploy/pytorch/kernels/cuda/w8a8_fused_moe.py
+++ b/lmdeploy/pytorch/kernels/cuda/w8a8_fused_moe.py
@@ -17,32 +17,32 @@ def get_cuda_autotune_config():
             'BLOCK_SIZE_K': 32,
             'GROUP_SIZE_M': 1,
         },
-                      num_stages=4,
-                      num_warps=4),
+            num_stages=4,
+            num_warps=4),
         triton.Config({
             'BLOCK_SIZE_M': 64,
             'BLOCK_SIZE_N': 256,
             'BLOCK_SIZE_K': 32,
             'GROUP_SIZE_M': 1,
         },
-                      num_stages=4,
-                      num_warps=4),
+            num_stages=4,
+            num_warps=4),
         triton.Config({
             'BLOCK_SIZE_M': 64,
             'BLOCK_SIZE_N': 128,
             'BLOCK_SIZE_K': 64,
             'GROUP_SIZE_M': 1,
         },
-                      num_stages=4,
-                      num_warps=4),
+            num_stages=4,
+            num_warps=4),
         triton.Config({
             'BLOCK_SIZE_M': 128,
             'BLOCK_SIZE_N': 128,
             'BLOCK_SIZE_K': 128,
             'GROUP_SIZE_M': 1,
         },
-                      num_stages=3,
-                      num_warps=8),
+            num_stages=3,
+            num_warps=8),
     ]
diff --git a/lmdeploy/pytorch/kernels/cuda/w8a8_triton_kernels.py b/lmdeploy/pytorch/kernels/cuda/w8a8_triton_kernels.py
index bdff352823..6bdaf6de40 100644
--- a/lmdeploy/pytorch/kernels/cuda/w8a8_triton_kernels.py
+++ b/lmdeploy/pytorch/kernels/cuda/w8a8_triton_kernels.py
@@ -599,6 +599,6 @@ def perf(ms):
         plot_name='bench-triton',
         args={
             'dtype': torch.float16,
-        })
+    })
 bench_funch = (triton.testing.perf_report(config))(bench_rms_and_linear)
 bench_funch.run(print_data=True)
diff --git a/lmdeploy/pytorch/models/deepseek_mtp.py b/lmdeploy/pytorch/models/deepseek_mtp.py
index a36a14cd34..7cb5653af6 100644
--- a/lmdeploy/pytorch/models/deepseek_mtp.py
+++ b/lmdeploy/pytorch/models/deepseek_mtp.py
@@ -753,7 +753,7 @@ def __skip_nextn(name, nextn_keys):

         num_hidden_layers = self.config.num_hidden_layers
         num_nextn_predict_layers = getattr(self.config, 'num_nextn_predict_layers', 1)
-        nextn_keys = [f'.layers.{num_hidden_layers+i}' for i in range(num_nextn_predict_layers)]
+        nextn_keys = [f'.layers.{num_hidden_layers + i}' for i in range(num_nextn_predict_layers)]

         params_dict = dict(self.named_parameters())
         for name, loaded_weight in weights:
diff --git a/lmdeploy/pytorch/models/deepseek_v2.py b/lmdeploy/pytorch/models/deepseek_v2.py
index 087d744bf9..29e9048599 100644
--- a/lmdeploy/pytorch/models/deepseek_v2.py
+++ b/lmdeploy/pytorch/models/deepseek_v2.py
@@ -1345,7 +1345,7 @@ def __skip_layers():

         num_hidden_layers = self.config.num_hidden_layers
         num_nextn_predict_layers = getattr(self.config, 'num_nextn_predict_layers', 1)
-        nextn_keys = [f'.layers.{num_hidden_layers+i}' for i in range(num_nextn_predict_layers)]
+        nextn_keys = [f'.layers.{num_hidden_layers + i}' for i in range(num_nextn_predict_layers)]

         params_dict = dict(self.named_parameters())
         for name, loaded_weight in weights:
diff --git a/lmdeploy/pytorch/models/glm4_moe.py b/lmdeploy/pytorch/models/glm4_moe.py
index 671a4e6e9a..2d332fcfce 100644
--- a/lmdeploy/pytorch/models/glm4_moe.py
+++ b/lmdeploy/pytorch/models/glm4_moe.py
@@ -550,7 +550,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         if hasattr(self.config, 'num_nextn_predict_layers'):
             num_hidden_layers = self.config.num_hidden_layers
             num_nextn_predict_layers = self.config.num_nextn_predict_layers
-            mtp_param_list = [f'.layers.{num_hidden_layers+i}' for i in range(num_nextn_predict_layers)]
+            mtp_param_list = [f'.layers.{num_hidden_layers + i}' for i in range(num_nextn_predict_layers)]

         # expert map
         num_experts = self.config.n_routed_experts
diff --git a/lmdeploy/pytorch/models/glm4moe_mtp.py b/lmdeploy/pytorch/models/glm4moe_mtp.py
index 4e5ec8c818..1cc5b51f18 100644
--- a/lmdeploy/pytorch/models/glm4moe_mtp.py
+++ b/lmdeploy/pytorch/models/glm4moe_mtp.py
@@ -110,7 +110,7 @@ def __skip_nextn(name, nextn_keys):

         num_hidden_layers = self.config.num_hidden_layers
         num_nextn_predict_layers = getattr(self.config, 'num_nextn_predict_layers', 1)
-        nextn_keys = [f'.layers.{num_hidden_layers+i}' for i in range(num_nextn_predict_layers)]
+        nextn_keys = [f'.layers.{num_hidden_layers + i}' for i in range(num_nextn_predict_layers)]

         # expert map
         num_experts = self.config.n_routed_experts
diff --git a/lmdeploy/pytorch/models/qwen3_5.py b/lmdeploy/pytorch/models/qwen3_5.py
index aae94cb6d7..88d90a5690 100644
--- a/lmdeploy/pytorch/models/qwen3_5.py
+++ b/lmdeploy/pytorch/models/qwen3_5.py
@@ -433,13 +433,12 @@ def __init__(
                                                   quant_config=quantization_config)

         self.in_proj_ba = build_merged_colwise_linear(self.hidden_size,
-                                                      [self.num_v_heads, self.num_v_heads],
-                                                      bias=False,
-                                                      dtype=dtype,
-                                                      device=device,
-                                                      is_tp=True,
-                                                      out_names=['b', 'a'])
-
+            [self.num_v_heads, self.num_v_heads],
+            bias=False,
+            dtype=dtype,
+            device=device,
+            is_tp=True,
+            out_names=['b', 'a'])
         # time step projection (discretization)
         # instantiate once and copy inv_dt in init_weights of PretrainedModel
@@ -1113,7 +1112,6 @@ def __init__(self,
         self.enable_return_routed_experts = False
         self.is_spec_decoding = get_build_model_context().num_spec_tokens > 0

-
     def forward(
         self,
         input_ids: torch.Tensor,
diff --git a/lmdeploy/pytorch/strategies/ar/step_inputs.py b/lmdeploy/pytorch/strategies/ar/step_inputs.py
index 5fad97dcf2..96b74506ff 100644
--- a/lmdeploy/pytorch/strategies/ar/step_inputs.py
+++ b/lmdeploy/pytorch/strategies/ar/step_inputs.py
@@ -26,7 +26,7 @@


 def step_sampling_delta(sampling_delta: SamplingInputsDelta,
-                            next_token_ids: torch.Tensor) -> SamplingInputsDelta:
+                        next_token_ids: torch.Tensor) -> SamplingInputsDelta:
     """Advance sampling delta for one decode step."""
     sampling_delta.num_ignore_eos = sampling_delta.num_ignore_eos - 1
     if sampling_delta.random_offsets is not None:
diff --git a/lmdeploy/pytorch/strategies/ar_spec/model_agent.py b/lmdeploy/pytorch/strategies/ar_spec/model_agent.py
index df749d019e..234ff92bd5 100644
--- a/lmdeploy/pytorch/strategies/ar_spec/model_agent.py
+++ b/lmdeploy/pytorch/strategies/ar_spec/model_agent.py
@@ -12,7 +12,7 @@
 from lmdeploy.pytorch.messages import SchedulerSequence
 from lmdeploy.pytorch.model_inputs import ModelInputs, ModelInputsDelta

-from ..ar.model_agent import ARStoppingCriteria
+from ..ar.model_agent import ARStoppingCriteria, get_model_inputs_next_decoding
 from ..base.model_agent import ExtraInputs, ExtraOutputs, ModelAgentStrategy

 SeqList = list[SchedulerSequence]
@@ -179,11 +179,62 @@ def make_extra_inputs(self, seqs: 'SeqList', model_inputs: 'ModelInputs') -> Ext
         """Create extra inputs."""
         return ARSpecExtraInputs()

+    def update_extra_inputs(self, extra_inputs: ARSpecExtraInputs, delta: 'ModelInputsDelta') -> ARSpecExtraInputs:
+        """Update extra inputs with model inputs delta."""
+        return extra_inputs.update(delta)
+
     def make_extra_outputs(self, extra_inputs: ARSpecExtraInputs) -> ARSpecExtraOutputs:
         """Create extra outputs."""
         output = ARSpecExtraOutputs(draft_token_ids=extra_inputs.output_draft_token_ids)
         return output

+    def update_prefill_for_next_step(
+        self,
+        model_inputs: 'ModelInputs',
+        extra_inputs: ARSpecExtraInputs,
+        next_token_ids: torch.Tensor,
+        model_metas: Any,
+        extra_outputs: ARSpecExtraOutputs,
+    ) -> tuple['ModelInputs', ARSpecExtraInputs]:
+        """Step next decoding."""
+        next_token_ids = next_token_ids[:, None]
+        next_token_ids = torch.cat([next_token_ids, extra_outputs.draft_token_ids], dim=-1)
+        max_q_seqlen = next_token_ids.size(-1)
+        next_token_ids = next_token_ids.flatten()[None, :]
+        inputs = get_model_inputs_next_decoding(model_inputs,
+                                                next_token_ids,
+                                                max_q_seqlen=max_q_seqlen,
+                                                model_metas=model_metas)
+
+        # update mrope pos ids
+        mrope_pos_ids = inputs.mrope_pos_ids
+        if mrope_pos_ids is not None:
+            offsets = torch.arange(max_q_seqlen, dtype=mrope_pos_ids.dtype, device=mrope_pos_ids.device)[None, None, :]
+            mrope_pos_ids = mrope_pos_ids.unflatten(1, (-1, 1)).repeat(1, 1, max_q_seqlen) + offsets
+            inputs.mrope_pos_ids = mrope_pos_ids.flatten(1, 2)
+
+        extra_inputs = extra_inputs.clone()
+        return inputs, extra_inputs
+
+    def update_decoding_for_next_step(self, model_inputs: 'ModelInputs', next_token_ids: torch.Tensor, model_metas: Any,
+                                      extra_inputs: ARSpecExtraInputs, extra_outputs: ARSpecExtraOutputs):
+        """Step next inputs."""
+        model_inputs.model_metas = model_metas
+        step_seqlens = model_inputs.seq_length
+        batch_size = step_seqlens.size(0)
+
+        # update extra inputs
+        extra_inputs.output_token_ids = extra_outputs.draft_token_ids
+
+        # update inputs
+        step_seqlens = model_inputs.seq_length - extra_inputs.num_rejected_tokens
+        input_ids = next_token_ids.new_empty((batch_size, self.num_spec_tokens + 1))
+        input_ids[:, 0] = next_token_ids
+        input_ids[:, 1:] = extra_inputs.output_draft_token_ids
+        input_ids = input_ids.flatten()[None, :]
+        model_inputs = model_inputs.step(input_ids, step_seqlens)
+        return model_inputs, extra_inputs
+
     def post_sampling(self, inputs: 'ModelInputs', logits: torch.Tensor, next_token_ids: torch.LongTensor,
                       extra_inputs: ARSpecExtraInputs):
         """Post sampling."""
diff --git a/lmdeploy/serve/core/async_engine.py b/lmdeploy/serve/core/async_engine.py
index a259fbdd90..d87cfce38b 100644
--- a/lmdeploy/serve/core/async_engine.py
+++ b/lmdeploy/serve/core/async_engine.py
@@ -202,7 +202,7 @@ def _build_stat_loggers(self):
         metrics_processor.stat_loggers = self.stat_loggers

     def _if_session_stale(self, session: Session,
-                input_token_len: int) -> GenOut | None:
+                          input_token_len: int) -> GenOut | None:
         """If ``session.epoch`` was stamped by api_server and
         ``stop_all_session`` ran since then (the engine epoch changed),
         drop the session."""
@@ -449,7 +449,6 @@ def is_error(status):

             if not gen_config.ignore_eos:
                 stop_ids = gen_config.stop_token_ids or []
-
             stale = self._if_session_stale(session, len(prompt_input['input_ids']))
             if stale is not None:
                 metrics_processor.increase_failed_requests('abort')
@@ -460,7 +459,7 @@ def is_error(status):
         async with session.request_handle() as handle:
             if session.epoch is not None and session.epoch != self.epoch:
                 logger.info(f'[generate] session {session_id} got aborted before starting inference, '
-                        f'session.epoch={session.epoch}, async_engine.epoch={self.epoch}')
+                            f'session.epoch={session.epoch}, async_engine.epoch={self.epoch}')
                 metrics_processor.increase_failed_requests('abort')
                 yield GenOut(response='',
                              history_token_len=0,
diff --git a/lmdeploy/serve/proxy/proxy.py b/lmdeploy/serve/proxy/proxy.py
index 667886273e..4af1d08e10 100644
--- a/lmdeploy/serve/proxy/proxy.py
+++ b/lmdeploy/serve/proxy/proxy.py
@@ -146,8 +146,8 @@ def update_config_file(self):
                     node_url: node_status.model_dump_json()
                     for node_url, node_status in nodes.items()
                 },
-                      config_file,
-                      indent=2)
+                config_file,
+                indent=2)

     def add(self, node_url: str, status: Status | None = None):
         """Add a node to the manager.
diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py
index 5271262f7b..2c94bfb18f 100644
--- a/lmdeploy/turbomind/turbomind.py
+++ b/lmdeploy/turbomind/turbomind.py
@@ -720,7 +720,7 @@ async def async_stream_infer(self,
                     grammar = compiler.compile_json_schema(decode_grammar)
                 else:
                     assert False, f'Decode grammar type {decode_grammar_type} should be in ' \
-                                  '["json_schema", "regex_schema", "json_object"]'
+                        '["json_schema", "regex_schema", "json_object"]'

                 self.model_inst.set_grammar(grammar)
             except ValueError as e:
diff --git a/lmdeploy/vl/__init__.py b/lmdeploy/vl/__init__.py
index a62115171d..b5b3247d60 100644
--- a/lmdeploy/vl/__init__.py
+++ b/lmdeploy/vl/__init__.py
@@ -1,11 +1,11 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from .utils import (
-        encode_image_base64,
-        encode_time_series_base64,
-        encode_video_base64,
-        load_image,
-        load_time_series,
-        load_video,
+    encode_image_base64,
+    encode_time_series_base64,
+    encode_video_base64,
+    load_image,
+    load_time_series,
+    load_video,
 )

 __all__ = [
diff --git a/lmdeploy/vl/model/interns1_pro.py b/lmdeploy/vl/model/interns1_pro.py
index 6534886017..c15bbfe04b 100644
--- a/lmdeploy/vl/model/interns1_pro.py
+++ b/lmdeploy/vl/model/interns1_pro.py
@@ -12,7 +12,6 @@

 logger = get_logger('lmdeploy')

-
 @VISION_MODELS.register_module()
 class InternS1ProVisionModel(Qwen3VLModel):
     """InternS1Pro model.
diff --git a/lmdeploy/vl/model/llava.py b/lmdeploy/vl/model/llava.py
index 6dc5eff4c4..49fe39ca05 100644
--- a/lmdeploy/vl/model/llava.py
+++ b/lmdeploy/vl/model/llava.py
@@ -364,7 +364,7 @@ def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]:
                         feat = unpad_image(feat, image_sizes[img_idx])
                         feat = torch.cat((feat, self.model.image_newline[:, None, None].expand(
                             *feat.shape[:-1], 1).to(feat.device)),
-                                         dim=-1)
+                            dim=-1)
                         feat = feat.flatten(1, 2).transpose(0, 1)
                     else:
                         feat = feat.permute(0, 2, 1, 3, 4).contiguous()
diff --git a/lmdeploy/vl/model/xcomposer2.py b/lmdeploy/vl/model/xcomposer2.py
index 89eaa7659a..ef2be1c2da 100644
--- a/lmdeploy/vl/model/xcomposer2.py
+++ b/lmdeploy/vl/model/xcomposer2.py
@@ -271,7 +271,7 @@ def proc_messages(messages, chat_template, sequence_start, model_type):
             if n_images == 1:
                 prefix_image_token, prompt = IMAGE_TOKEN, content[0]
             else:
-                prompt = ''.join([f'Image{i+1} {IMAGE_TOKEN}; ' for i in range(n_images)]) + content[0]
+                prompt = ''.join([f'Image{i + 1} {IMAGE_TOKEN}; ' for i in range(n_images)]) + content[0]
         else:
             prompt = ''.join([IMAGE_TOKEN] * n_images) + content[0]
     else:
diff --git a/pyproject.toml b/pyproject.toml
index 43b200dd4c..8d84226a55 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,3 +20,7 @@ select = [
 ignore = [
     "E231", "E741"
 ]
+
+[tool.autopep8]
+max_line_length = 120
+aggressive = 3
diff --git a/tests/pytorch/engine/test_logits_process.py b/tests/pytorch/engine/test_logits_process.py
index 01c6f9777c..3ac2c7d2ef 100644
--- a/tests/pytorch/engine/test_logits_process.py
+++ b/tests/pytorch/engine/test_logits_process.py
@@ -150,7 +150,7 @@ def _get_emtas(n, window_size):
         [9, 8, 7, 3, 8, 7, 5, 9, 8, 7],
         [9, 8, 7, 3, 8, 7, 5, 9, 8, 7],
     ],
-                              dtype=torch.int64)
+        dtype=torch.int64)

     n = torch.tensor([3, 3, 2], dtype=torch.int64)
     threshold = torch.tensor([3, 3, 3], dtype=torch.int64)
diff --git a/tests/pytorch/kernel/test_fill_kv_cache.py b/tests/pytorch/kernel/test_fill_kv_cache.py
index 43204af183..59c721a752 100644
--- a/tests/pytorch/kernel/test_fill_kv_cache.py
+++ b/tests/pytorch/kernel/test_fill_kv_cache.py
@@ -140,7 +140,7 @@ def gt(self, k_states, v_states, k_caches, v_caches, seq_lens, history_lens, blo
                              ((1, 1, 1, 1), (1, 16, 31, 24)),
                              ((1, 8, 16, 24), (1, 16, 31, 24)),
                          ],
-                         indirect=True)
+        indirect=True)
     def test_fill_kv_cache(self, k_states, v_states, k_caches, v_caches, block_offsets, q_start_loc, q_seq_length,
                            kv_seq_length, max_q_seq_length, gt):
         from lmdeploy.pytorch.kernels.cuda.fill_kv_cache import fill_kv_cache
@@ -229,7 +229,7 @@ def gt(self, k_states, v_states, k_caches, v_caches, seq_lens, history_lens, blo
                              ((1, 1, 1, 1), (1, 16, 31, 24)),
                              ((1, 8, 16, 24), (1, 16, 31, 24)),
                          ],
-                         indirect=True)
+        indirect=True)
     def test_fill_kv_cache(self, k_states, v_states, k_caches, v_caches, k_scales_zeros, v_scales_zeros,
                            block_offsets, q_start_loc, q_seq_length, kv_seq_length, max_q_seq_length, gt):
         from lmdeploy.pytorch.kernels.cuda.fill_kv_cache import fill_kv_cache
@@ -258,7 +258,7 @@ def nbits(self):
                              ((1, 1, 1, 1), (1, 16, 31, 24)),
                              ((1, 8, 16, 24), (1, 16, 31, 24)),
                          ],
-                         indirect=True)
+        indirect=True)
     def test_fill_kv_cache(self, k_states, v_states, k_caches, v_caches, k_scales_zeros, v_scales_zeros,
                            block_offsets, q_start_loc, q_seq_length, kv_seq_length, max_q_seq_length, gt, nbits):
         from lmdeploy.pytorch.kernels.cuda.fill_kv_cache import fill_kv_cache
@@ -382,7 +382,7 @@ def uncache(self, k_caches, ks_caches, v_caches, vs_caches, cu_seqlen_q, kv_seql
                              ((1, 1, 1, 1), (1, 128, 256, 200)),
                              ((1, 64, 128, 50), (1, 128, 256, 200)),
                          ],
-                         indirect=True)
+        indirect=True)
     def test_fill_kv_cache(self, k_states, v_states, k_caches, v_caches, ks_caches, vs_caches, block_offsets,
                            cu_seqlen_q, kv_seq_length, max_q_seq_length, gt, group_size, scale_fmt):
         from lmdeploy.pytorch.kernels.cuda.fill_kv_cache import fill_kv_cache_blocked_fp8
diff --git a/tests/pytorch/spec_decode/test_reject_sample.py b/tests/pytorch/spec_decode/test_reject_sample.py
index c46ded9bcd..34f22000fe 100644
--- a/tests/pytorch/spec_decode/test_reject_sample.py
+++ b/tests/pytorch/spec_decode/test_reject_sample.py
@@ -100,8 +100,8 @@ def test_greedy_mixed_mismatch_positions(self):
             [10, 21, 30],
             [10, 20, 31],
         ],
-                               dtype=torch.long,
-                               device=device)
+            dtype=torch.long,
+            device=device)
         bonus = torch.tensor([99, 88, 77, 66], dtype=torch.long, device=device)

         si = SamplingInputs(max_top_k=1)
diff --git a/tests/test_lmdeploy/test_vl/test_nonhf_chat_template.py b/tests/test_lmdeploy/test_vl/test_nonhf_chat_template.py
index 1b575e574d..1a8827700b 100644
--- a/tests/test_lmdeploy/test_vl/test_nonhf_chat_template.py
+++ b/tests/test_lmdeploy/test_vl/test_nonhf_chat_template.py
@@ -93,8 +93,8 @@ def mock_messages(self):
     @pytest.fixture(scope='module')
     def mock_IMAGE_TOKEN_messages(self):
         return [
-            dict(role='system', content='你是书生·万象,英文名是InternVL,是由上海人工智能实验室、' \
-                 '清华大学及多家合作单位联合开发的多模态大语言模型。'),
+            dict(role='system', content='你是书生·万象,英文名是InternVL,是由上海人工智能实验室、'
+                 '清华大学及多家合作单位联合开发的多模态大语言模型。'),
             dict(role='user',
                  content=[
                      dict(type='text', text='\nDescribe the following images in detail'),