diff --git a/.github/scripts/action_tools.py b/.github/scripts/action_tools.py
index e296c4d760..96754200b7 100644
--- a/.github/scripts/action_tools.py
+++ b/.github/scripts/action_tools.py
@@ -94,7 +94,7 @@ def evaluate(models: list[str],
     for idx, ori_model in enumerate(models):
         print()
         print(50 * '==')
-        print(f'Start evaluating {idx+1}/{num_model} {ori_model} ...')
+        print(f'Start evaluating {idx + 1}/{num_model} {ori_model} ...')
         model = ori_model.lower()

         lmdeploy_dir = os.path.abspath(os.environ['LMDEPLOY_DIR'])
diff --git a/.github/scripts/eval_chat_config.py b/.github/scripts/eval_chat_config.py
index 26caa0b103..750968530b 100644
--- a/.github/scripts/eval_chat_config.py
+++ b/.github/scripts/eval_chat_config.py
@@ -133,7 +133,7 @@
         dict(role='HUMAN', begin='[INST] ', end=' [/INST]'),
         dict(role='BOT', begin='', end='', generate=True),
     ],
-                 eos_token_id=2)
+    eos_token_id=2)

 MAX_SESSION_LEN = 2048
 MAX_NEW_TOKENS = 1024
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 0d4cf15d30..be07bba848 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,4 +1,8 @@
 repos:
+  - repo: https://github.com/hhatto/autopep8
+    rev: v2.3.2
+    hooks:
+      - id: autopep8
   - repo: https://github.com/astral-sh/ruff-pre-commit
     rev: v0.15.4
     hooks:
diff --git a/autotest/interface/restful/test_restful_chat_completions_v1.py b/autotest/interface/restful/test_restful_chat_completions_v1.py
index 67e166a568..dfff5597e1 100644
--- a/autotest/interface/restful/test_restful_chat_completions_v1.py
+++ b/autotest/interface/restful/test_restful_chat_completions_v1.py
@@ -227,14 +227,14 @@ def test_array_stopwords_streaming(self, backend, model_case):
     @pytest.mark.internlm2_5
     def test_special_words(self, backend, model_case):
         message = '<|im_start|>system\n当开启工具以及代码时,根据需求选择合适的工具进行调用\n' \
-                  '<|im_end|><|im_start|>system name=<|interpreter|>\n你现在已经' \
-                  '能够在一个有状态的 Jupyter 笔记本环境中运行 Python 代码。当你向 python ' \
-                  '发送含有 Python >代码的消息时,它将在该环境中执行。这个工具适用于多种场景,' \
-                  '如数据分析或处理(包括数据操作、统计分析、图表绘制),复杂的计算问题(解决数学和物理' \
-                  '难题),编程示例(理解编程概念或特性),文本处理和分析(比如文本解析和自然语言处理),' \
-                  '机器学习和数据科学(用于展示模型训练和数据可视化),以及文件操作和数据导入(处理CSV、' \
-                  'JSON等格式的文件)。<|im_end|>\n<|im_start|>user\n设 $L$ 为圆周$x^2+y^2=2x$,' \
-                  '计算曲线积分:$I=\\int_L{x\\mathrm{d}s}=$<|im_end|>\n<|im_start|>assistant'
+            '<|im_end|><|im_start|>system name=<|interpreter|>\n你现在已经' \
+            '能够在一个有状态的 Jupyter 笔记本环境中运行 Python 代码。当你向 python ' \
+            '发送含有 Python >代码的消息时,它将在该环境中执行。这个工具适用于多种场景,' \
+            '如数据分析或处理(包括数据操作、统计分析、图表绘制),复杂的计算问题(解决数学和物理' \
+            '难题),编程示例(理解编程概念或特性),文本处理和分析(比如文本解析和自然语言处理),' \
+            '机器学习和数据科学(用于展示模型训练和数据可视化),以及文件操作和数据导入(处理CSV、' \
+            'JSON等格式的文件)。<|im_end|>\n<|im_start|>user\n设 $L$ 为圆周$x^2+y^2=2x$,' \
+            '计算曲线积分:$I=\\int_L{x\\mathrm{d}s}=$<|im_end|>\n<|im_start|>assistant'
         api_client = APIClient(BASE_URL)
         model_name = api_client.available_models[0]
         for output in api_client.chat_completions_v1(model=model_name,
diff --git a/autotest/interface/restful/test_restful_generate.py b/autotest/interface/restful/test_restful_generate.py
index e08b5c3a92..65f1638c39 100644
--- a/autotest/interface/restful/test_restful_generate.py
+++ b/autotest/interface/restful/test_restful_generate.py
@@ -748,7 +748,7 @@ def single_request(idx):

         success_rate = success_count / 20
         assert success_rate == 1.0, \
-            f'Stress test failed: success rate {success_rate*100}% < 80%'
+            f'Stress test failed: success rate {success_rate * 100}% < 80%'

         if success_count > 0:
             avg_latency = total_latency / success_count
diff --git a/autotest/tools/pipeline/mllm_case.py b/autotest/tools/pipeline/mllm_case.py
index 2ac134f440..cbf01f4211 100644
--- a/autotest/tools/pipeline/mllm_case.py
+++ b/autotest/tools/pipeline/mllm_case.py
@@ -210,7 +210,7 @@ def load_video(video_path, bound=None, num_segments=32):

     question = ''
     for i in range(len(imgs)):
-        question = question + f'Frame{i+1}: {IMAGE_TOKEN}\n'
+        question = question + f'Frame{i + 1}: {IMAGE_TOKEN}\n'

     if lang == 'cn':
         question += '视频里有什么动物,它在做什么?'
diff --git a/lmdeploy/api.py b/lmdeploy/api.py
index 11f31c1de4..72d107c2ee 100644
--- a/lmdeploy/api.py
+++ b/lmdeploy/api.py
@@ -95,7 +95,7 @@ def serve(model_path: str,
     This function has been removed. Please use alternative methods.

     This will run the api_server in a subprocess.
-    """ # noqa E501
+    """  # noqa E501
     raise NotImplementedError("The 'serve' function is no longer available. "
                               'This function has been deprecated and removed.')
diff --git a/lmdeploy/cli/utils.py b/lmdeploy/cli/utils.py
index 39469c6f07..0d134a0c40 100644
--- a/lmdeploy/cli/utils.py
+++ b/lmdeploy/cli/utils.py
@@ -61,7 +61,7 @@ def get_lora_adapters(adapters: list[str]):
     else:
         for pair in adapters:
             assert '=' in pair, f'Multiple lora paths must in format of ' \
-                                f'xxx=yyy. But given: {pair}'
+                f'xxx=yyy. But given: {pair}'
             name, path = pair.strip().split('=', 1)
             assert name not in output, f'Multiple lora paths with repeated lora name: {name}'
             output[name] = path
@@ -420,8 +420,7 @@ def calib_batchsize(parser):
             '--batch-size',
             type=int,
             default=1,
-            help=\
-            'The batch size for running the calib samples. Low GPU mem requires small batch_size. Large batch_size reduces the calibration time while costs more VRAM'  # noqa
+            help='The batch size for running the calib samples. Low GPU mem requires small batch_size. Large batch_size reduces the calibration time while costs more VRAM'  # noqa
         )

     @staticmethod
@@ -432,8 +431,7 @@ def calib_search_scale(parser):
             '--search-scale',
             action='store_true',
             default=False,
-            help=\
-            'Whether search scale ratio. Default to be disabled, which means only smooth quant with 0.5 ratio will be applied'  # noqa
+            help='Whether search scale ratio. Default to be disabled, which means only smooth quant with 0.5 ratio will be applied'  # noqa
         )

     @staticmethod
@@ -454,8 +452,7 @@ def chat_template(parser):
             '--chat-template',
             type=str,
             default=None,
-            help=\
-            'A JSON file or string that specifies the chat template configuration. '  # noqa
+            help='A JSON file or string that specifies the chat template configuration. '  # noqa
             'Please refer to https://lmdeploy.readthedocs.io/en/latest/advance/chat_template.html for the specification'  # noqa
         )
diff --git a/lmdeploy/lite/utils/__init__.py b/lmdeploy/lite/utils/__init__.py
index d801a1f447..1d2df866f8 100644
--- a/lmdeploy/lite/utils/__init__.py
+++ b/lmdeploy/lite/utils/__init__.py
@@ -2,14 +2,14 @@

 from .batch_split import concat_decoder_layer_outputs, split_decoder_layer_inputs
 from .cal_qparams import (
-        QParams,
-        cal_qparams_per_channel_absmax,
-        cal_qparams_per_channel_minmax,
-        cal_qparams_per_group_absmax,
-        cal_qparams_per_group_minmax,
-        cal_qparams_per_tensor_absmax,
-        cal_qparams_per_tensor_minmax,
-        precise_round,
+    QParams,
+    cal_qparams_per_channel_absmax,
+    cal_qparams_per_channel_minmax,
+    cal_qparams_per_group_absmax,
+    cal_qparams_per_group_minmax,
+    cal_qparams_per_tensor_absmax,
+    cal_qparams_per_tensor_minmax,
+    precise_round,
 )
 from .calib_dataloader import get_calib_loaders
 from .collect import bimap_name_mod, collect_target_modules, collect_target_weights
diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py
index 3c7b43b5a4..77db60dfa6 100644
--- a/lmdeploy/messages.py
+++ b/lmdeploy/messages.py
@@ -451,16 +451,16 @@ def __post_init__(self):
         assert self.quant_policy in (0, 4, 8), 'invalid quant_policy'
         assert self.device_type in ['cuda', 'ascend', 'maca', 'camb'], (f'invalid device_type: {self.device_type}')
         assert self.kernel_block_size >= 16 and \
-               (self.kernel_block_size & (self.kernel_block_size - 1)) == 0, \
-               f'kernel_block_size must be >= 16 and a power of 2, but got {self.kernel_block_size}'
+            (self.kernel_block_size & (self.kernel_block_size - 1)) == 0, \
+            f'kernel_block_size must be >= 16 and a power of 2, but got {self.kernel_block_size}'
         assert self.block_size >= self.kernel_block_size and \
-               self.block_size % self.kernel_block_size == 0, \
-               (f'block_size must be >= kernel_block_size and an integer multiple '
-                f'of kernel_block_size, but got block_size {self.block_size} '
-                f'and kernel_block_size {self.kernel_block_size}')
+            self.block_size % self.kernel_block_size == 0, \
+            (f'block_size must be >= kernel_block_size and an integer multiple '
+             f'of kernel_block_size, but got block_size {self.block_size} '
+             f'and kernel_block_size {self.kernel_block_size}')
         if self.quant_policy > 0 and self.device_type not in ['cuda', 'ascend']:
             assert False, \
-                   'kv cache quantization only works for CUDA and ASCEND.'
+                'kv cache quantization only works for CUDA and ASCEND.'
         if self.device_type == 'camb' and self.block_size != 16:
             self.block_size = 16
             logger.warning('Currently, camb device requires block size to be 16, \
diff --git a/lmdeploy/metrics/loggers.py b/lmdeploy/metrics/loggers.py
index bd8c78d8c1..f28bc0dbe2 100644
--- a/lmdeploy/metrics/loggers.py
+++ b/lmdeploy/metrics/loggers.py
@@ -116,10 +116,10 @@ def log(self):
             f'{scheduler_stats.num_api_routed_reqs} / {scheduler_stats.num_api_waiting_reqs}, '
             f'Engine (running/waiting): '
             f'{scheduler_stats.num_running_reqs} / {scheduler_stats.num_waiting_reqs}, '
-            f'KV cache: {scheduler_stats.gpu_cache_usage * 100 :.1f}%, ')
+            f'KV cache: {scheduler_stats.gpu_cache_usage * 100:.1f}%, ')

         if scheduler_stats.prefix_cache_hit_rate != 0:
-            log_msg += f'Prefix cache hit rate: {scheduler_stats.prefix_cache_hit_rate * 100 :.1f}%, '
+            log_msg += f'Prefix cache hit rate: {scheduler_stats.prefix_cache_hit_rate * 100:.1f}%, '

         if spec_msg is not None:
             log_msg += spec_msg
diff --git a/lmdeploy/model.py b/lmdeploy/model.py
index dbd8939ecf..873c3524b4 100644
--- a/lmdeploy/model.py
+++ b/lmdeploy/model.py
@@ -162,10 +162,10 @@ def get_prompt(self, prompt, sequence_start=True):
                        f'{self.assistant}'
             else:
                 return f'{self.user}{prompt}{self.eoh}' \
-                       f'{self.assistant}'
+                    f'{self.assistant}'
         else:
             return f'{self.separator}{self.user}{prompt}{self.eoh}' \
-                   f'{self.assistant}'
+                f'{self.assistant}'

     def messages2prompt(self, messages, sequence_start=True, **kwargs):
         """Return the prompt that is concatenated with other elements in the
diff --git a/lmdeploy/pipeline.py b/lmdeploy/pipeline.py
index ca4c42bba0..818248a374 100644
--- a/lmdeploy/pipeline.py
+++ b/lmdeploy/pipeline.py
@@ -122,7 +122,8 @@ def infer(self,
                 res = res.extend(out) if res else out
                 outputs.append(res)
         finally:
-            if pbar: pbar.close()  # noqa
+            if pbar:
+                pbar.close()  # noqa
         if is_single:
             return outputs[0]
         return outputs
diff --git a/lmdeploy/profiler.py b/lmdeploy/profiler.py
index d687d5a485..b011e54b48 100644
--- a/lmdeploy/profiler.py
+++ b/lmdeploy/profiler.py
@@ -166,8 +166,8 @@ def save_csv(self, csv_file: str, hyperparams):
                 f'{self.rps:.3f}',
                 f'{(self.input_throughput):.3f}',
                 f'{self.output_throughput:.3f}',
-                f'{self.e2e_mean*1000:.3f}',
-                f'{self.ttft_mean*1000:.3f}' if self.stream_output else '-',
-                f'{self.tpot_mean*1000:.3f}',
-                f'{self.itls_mean*1000:.3f}' if self.stream_output else '-',
+                f'{self.e2e_mean * 1000:.3f}',
+                f'{self.ttft_mean * 1000:.3f}' if self.stream_output else '-',
+                f'{self.tpot_mean * 1000:.3f}',
+                f'{self.itls_mean * 1000:.3f}' if self.stream_output else '-',
             ])
diff --git a/lmdeploy/pytorch/backends/dlinfer/maca/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/maca/op_backend.py
index 78376df43f..7266b37d25 100644
--- a/lmdeploy/pytorch/backends/dlinfer/maca/op_backend.py
+++ b/lmdeploy/pytorch/backends/dlinfer/maca/op_backend.py
@@ -55,7 +55,7 @@ def get_total_slots():
         is_prefill_no_cache = False
         if not step_context.is_decoding:
             is_prefill_no_cache = \
-            all((step_context.q_seqlens ==
+                all((step_context.q_seqlens ==
                      step_context.kv_seqlens).tolist())
         q_start_loc = step_context.q_start_loc
         cu_seqlens = torch.cat((q_start_loc, step_context.q_seqlens.sum().unsqueeze(0))).int()
diff --git a/lmdeploy/pytorch/engine/model_agent/agent.py b/lmdeploy/pytorch/engine/model_agent/agent.py
index 55fb6e807b..fbd0de9e66 100644
--- a/lmdeploy/pytorch/engine/model_agent/agent.py
+++ b/lmdeploy/pytorch/engine/model_agent/agent.py
@@ -1180,7 +1180,7 @@ def wakeup(self, tags: list[str] | None = None):
         if 'weights' in tags:
             device = next(self.patched_model.get_model().parameters()).device
             assert device.type in ['cpu', 'meta']
-            spec_model =  self.spec_agent.get_model()
+            spec_model = self.spec_agent.get_model()

             if device.type == 'cpu':
                 self.patched_model.get_model().to(torch.cuda.current_device())
diff --git a/lmdeploy/pytorch/kernels/__init__.py b/lmdeploy/pytorch/kernels/__init__.py
index ae4a278777..4befc9b134 100644
--- a/lmdeploy/pytorch/kernels/__init__.py
+++ b/lmdeploy/pytorch/kernels/__init__.py
@@ -1,10 +1,10 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from .w8a8_triton_kernels import (
-        matmul_kernel_dynamic_quant,
-        per_channel_quant,
-        per_token_quant_int8,
-        rms_norm_dynamic_quant,
+    matmul_kernel_dynamic_quant,
+    per_channel_quant,
+    per_token_quant_int8,
+    rms_norm_dynamic_quant,
 )

 __all__ = [
diff --git a/lmdeploy/pytorch/kernels/cuda/blocked_gemm_fp8.py b/lmdeploy/pytorch/kernels/cuda/blocked_gemm_fp8.py
index e6717b5ef4..474e9bd0ea 100644
--- a/lmdeploy/pytorch/kernels/cuda/blocked_gemm_fp8.py
+++ b/lmdeploy/pytorch/kernels/cuda/blocked_gemm_fp8.py
@@ -208,7 +208,7 @@ def _gemm_fp8_tma_pre_hook(nargs):
                          'BLOCK_N': 64,
                      }, num_stages=3, num_warps=4, pre_hook=_gemm_fp8_tma_pre_hook)
                  ],
-                 key=['N', 'K'])
+    key=['N', 'K'])
 @triton.jit
 def _gemm_fp8_tma_kernel(
     desc_a,
@@ -296,7 +296,7 @@ def _gemm_fp8_tma_kernel(
                          'BLOCK_N': 64,
                      }, num_stages=3, num_warps=4)
                  ],
-                 key=['N', 'K'])
+    key=['N', 'K'])
 @triton.jit
 def _gemm_fp8_kernel(
     A,
diff --git a/lmdeploy/pytorch/kernels/cuda/causal_conv1d.py b/lmdeploy/pytorch/kernels/cuda/causal_conv1d.py
index 0e7a853a90..1e42d0f7e5 100644
--- a/lmdeploy/pytorch/kernels/cuda/causal_conv1d.py
+++ b/lmdeploy/pytorch/kernels/cuda/causal_conv1d.py
@@ -121,7 +121,7 @@ def causal_conv1d_fwd_main(
                 init_col = k_val + w
                 if init_col < width - 1:
                     out_vals[i] += w_local[w] * T.cast(Init_states[seq_idx_cur, c_idx, init_col],
-                                                       T.float32)
+                                                      T.float32)
             else:
                 for w in T.unroll(width):
                     out_vals[i] += T.if_then_else(seq_idx_local[i + w] == seq_idx_cur,
diff --git a/lmdeploy/pytorch/kernels/cuda/fused_moe.py b/lmdeploy/pytorch/kernels/cuda/fused_moe.py
index 8437c16541..310dc23d6d 100644
--- a/lmdeploy/pytorch/kernels/cuda/fused_moe.py
+++ b/lmdeploy/pytorch/kernels/cuda/fused_moe.py
@@ -17,16 +17,16 @@ def get_cuda_autotune_config():
             'BLOCK_SIZE_K': 64,
             'GROUP_SIZE_M': 1,
         },
-                      num_stages=3,
-                      num_warps=8),
+            num_stages=3,
+            num_warps=8),
         triton.Config({
             'BLOCK_SIZE_M': 64,
             'BLOCK_SIZE_N': 256,
             'BLOCK_SIZE_K': 32,
             'GROUP_SIZE_M': 1,
         },
-                      num_stages=4,
-                      num_warps=4),
+            num_stages=4,
+            num_warps=4),
         # SM8
         triton.Config({
             'BLOCK_SIZE_M': 128,
@@ -34,24 +34,24 @@ def get_cuda_autotune_config():
             'BLOCK_SIZE_K': 32,
             'GROUP_SIZE_M': 1,
         },
-                      num_stages=4,
-                      num_warps=4),
+            num_stages=4,
+            num_warps=4),
         triton.Config({
             'BLOCK_SIZE_M': 64,
             'BLOCK_SIZE_N': 256,
             'BLOCK_SIZE_K': 32,
             'GROUP_SIZE_M': 1,
         },
-                      num_stages=4,
-                      num_warps=4),
+            num_stages=4,
+            num_warps=4),
         triton.Config({
             'BLOCK_SIZE_M': 64,
             'BLOCK_SIZE_N': 128,
             'BLOCK_SIZE_K': 64,
             'GROUP_SIZE_M': 1,
         },
-                      num_stages=4,
-                      num_warps=4),
+            num_stages=4,
+            num_warps=4),
         # SM7-
         triton.Config({
             'BLOCK_SIZE_M': 64,
@@ -59,24 +59,24 @@ def get_cuda_autotune_config():
             'BLOCK_SIZE_K': 32,
             'GROUP_SIZE_M': 1,
         },
-                      num_stages=4,
-                      num_warps=4),
+            num_stages=4,
+            num_warps=4),
         triton.Config({
             'BLOCK_SIZE_M': 128,
             'BLOCK_SIZE_N': 32,
             'BLOCK_SIZE_K': 32,
             'GROUP_SIZE_M': 1,
         },
-                      num_stages=4,
-                      num_warps=4),
+            num_stages=4,
+            num_warps=4),
         triton.Config({
             'BLOCK_SIZE_M': 64,
             'BLOCK_SIZE_N': 32,
             'BLOCK_SIZE_K': 32,
             'GROUP_SIZE_M': 1,
         },
-                      num_stages=5,
-                      num_warps=2),
+            num_stages=5,
+            num_warps=2),
     ]
diff --git a/lmdeploy/pytorch/kernels/cuda/rms_norm.py b/lmdeploy/pytorch/kernels/cuda/rms_norm.py
index 5238bab261..8376686e6a 100644
--- a/lmdeploy/pytorch/kernels/cuda/rms_norm.py
+++ b/lmdeploy/pytorch/kernels/cuda/rms_norm.py
@@ -191,8 +191,8 @@ def test_rms_norm(bsz, ctx_len, feat_len, dtype):
     torch_cost = (t1 - t0) / N_REPEATS * 1000
     triton_cost = (t2 - t1) / N_REPEATS * 1000

-    print(f'input {input.shape} weight {weight.shape} dtype {dtype}\n' \
-          f'    torch {torch_cost:.3f} triton {triton_cost:.3f} (ms)\n')
+    print(f'input {input.shape} weight {weight.shape} dtype {dtype}\n'
+          f'    torch {torch_cost:.3f} triton {triton_cost:.3f} (ms)\n')

 test_rms_norm(1, 8128, 5120, torch.float16)
 test_rms_norm(1, 8128, 5120, torch.float32)
diff --git a/lmdeploy/pytorch/kernels/cuda/w8a8_fused_moe.py b/lmdeploy/pytorch/kernels/cuda/w8a8_fused_moe.py
index dd8c08109c..503ae85677 100644
--- a/lmdeploy/pytorch/kernels/cuda/w8a8_fused_moe.py
+++ b/lmdeploy/pytorch/kernels/cuda/w8a8_fused_moe.py
@@ -17,32 +17,32 @@ def get_cuda_autotune_config():
             'BLOCK_SIZE_K': 32,
             'GROUP_SIZE_M': 1,
         },
-                      num_stages=4,
-                      num_warps=4),
+            num_stages=4,
+            num_warps=4),
         triton.Config({
             'BLOCK_SIZE_M': 64,
             'BLOCK_SIZE_N': 256,
             'BLOCK_SIZE_K': 32,
             'GROUP_SIZE_M': 1,
         },
-                      num_stages=4,
-                      num_warps=4),
+            num_stages=4,
+            num_warps=4),
         triton.Config({
             'BLOCK_SIZE_M': 64,
             'BLOCK_SIZE_N': 128,
             'BLOCK_SIZE_K': 64,
             'GROUP_SIZE_M': 1,
         },
-                      num_stages=4,
-                      num_warps=4),
+            num_stages=4,
+            num_warps=4),
         triton.Config({
             'BLOCK_SIZE_M': 128,
             'BLOCK_SIZE_N': 128,
             'BLOCK_SIZE_K': 128,
             'GROUP_SIZE_M': 1,
         },
-                      num_stages=3,
-                      num_warps=8),
+            num_stages=3,
+            num_warps=8),
     ]
diff --git a/lmdeploy/pytorch/kernels/cuda/w8a8_triton_kernels.py b/lmdeploy/pytorch/kernels/cuda/w8a8_triton_kernels.py
index bdff352823..6bdaf6de40 100644
--- a/lmdeploy/pytorch/kernels/cuda/w8a8_triton_kernels.py
+++ b/lmdeploy/pytorch/kernels/cuda/w8a8_triton_kernels.py
@@ -599,6 +599,6 @@ def perf(ms):
         plot_name='bench-triton',
         args={
             'dtype': torch.float16,
-        })
+    })
 bench_funch = (triton.testing.perf_report(config))(bench_rms_and_linear)
 bench_funch.run(print_data=True)
diff --git a/lmdeploy/pytorch/models/deepseek_mtp.py b/lmdeploy/pytorch/models/deepseek_mtp.py
index a36a14cd34..7cb5653af6 100644
--- a/lmdeploy/pytorch/models/deepseek_mtp.py
+++ b/lmdeploy/pytorch/models/deepseek_mtp.py
@@ -753,7 +753,7 @@ def __skip_nextn(name, nextn_keys):

         num_hidden_layers = self.config.num_hidden_layers
         num_nextn_predict_layers = getattr(self.config, 'num_nextn_predict_layers', 1)
-        nextn_keys = [f'.layers.{num_hidden_layers+i}' for i in range(num_nextn_predict_layers)]
+        nextn_keys = [f'.layers.{num_hidden_layers + i}' for i in range(num_nextn_predict_layers)]

         params_dict = dict(self.named_parameters())
         for name, loaded_weight in weights:
diff --git a/lmdeploy/pytorch/models/deepseek_v2.py b/lmdeploy/pytorch/models/deepseek_v2.py
index 087d744bf9..29e9048599 100644
--- a/lmdeploy/pytorch/models/deepseek_v2.py
+++ b/lmdeploy/pytorch/models/deepseek_v2.py
@@ -1345,7 +1345,7 @@ def __skip_layers():

         num_hidden_layers = self.config.num_hidden_layers
         num_nextn_predict_layers = getattr(self.config, 'num_nextn_predict_layers', 1)
-        nextn_keys = [f'.layers.{num_hidden_layers+i}' for i in range(num_nextn_predict_layers)]
+        nextn_keys = [f'.layers.{num_hidden_layers + i}' for i in range(num_nextn_predict_layers)]

         params_dict = dict(self.named_parameters())
         for name, loaded_weight in weights:
diff --git a/lmdeploy/pytorch/models/glm4_moe.py b/lmdeploy/pytorch/models/glm4_moe.py
index 671a4e6e9a..2d332fcfce 100644
--- a/lmdeploy/pytorch/models/glm4_moe.py
+++ b/lmdeploy/pytorch/models/glm4_moe.py
@@ -550,7 +550,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         if hasattr(self.config, 'num_nextn_predict_layers'):
             num_hidden_layers = self.config.num_hidden_layers
             num_nextn_predict_layers = self.config.num_nextn_predict_layers
-            mtp_param_list = [f'.layers.{num_hidden_layers+i}' for i in range(num_nextn_predict_layers)]
+            mtp_param_list = [f'.layers.{num_hidden_layers + i}' for i in range(num_nextn_predict_layers)]

         # expert map
         num_experts = self.config.n_routed_experts
diff --git a/lmdeploy/pytorch/models/glm4moe_mtp.py b/lmdeploy/pytorch/models/glm4moe_mtp.py
index 4e5ec8c818..1cc5b51f18 100644
--- a/lmdeploy/pytorch/models/glm4moe_mtp.py
+++ b/lmdeploy/pytorch/models/glm4moe_mtp.py
@@ -110,7 +110,7 @@ def __skip_nextn(name, nextn_keys):

         num_hidden_layers = self.config.num_hidden_layers
         num_nextn_predict_layers = getattr(self.config, 'num_nextn_predict_layers', 1)
-        nextn_keys = [f'.layers.{num_hidden_layers+i}' for i in range(num_nextn_predict_layers)]
+        nextn_keys = [f'.layers.{num_hidden_layers + i}' for i in range(num_nextn_predict_layers)]

         # expert map
         num_experts = self.config.n_routed_experts
diff --git a/lmdeploy/pytorch/models/qwen3_5.py b/lmdeploy/pytorch/models/qwen3_5.py
index aae94cb6d7..88d90a5690 100644
--- a/lmdeploy/pytorch/models/qwen3_5.py
+++ b/lmdeploy/pytorch/models/qwen3_5.py
@@ -433,13 +433,12 @@ def __init__(
                                                   quant_config=quantization_config)

         self.in_proj_ba = build_merged_colwise_linear(self.hidden_size,
-                                                      [self.num_v_heads, self.num_v_heads],
-                                                      bias=False,
-                                                      dtype=dtype,
-                                                      device=device,
-                                                      is_tp=True,
-                                                      out_names=['b', 'a'])
-
+            [self.num_v_heads, self.num_v_heads],
+            bias=False,
+            dtype=dtype,
+            device=device,
+            is_tp=True,
+            out_names=['b', 'a'])
         # time step projection (discretization)
         # instantiate once and copy inv_dt in init_weights of PretrainedModel
@@ -1113,7 +1112,6 @@ def __init__(self,
         self.enable_return_routed_experts = False
         self.is_spec_decoding = get_build_model_context().num_spec_tokens > 0

-
     def forward(
         self,
         input_ids: torch.Tensor,
diff --git a/lmdeploy/pytorch/strategies/ar/step_inputs.py b/lmdeploy/pytorch/strategies/ar/step_inputs.py
index 5fad97dcf2..96b74506ff 100644
--- a/lmdeploy/pytorch/strategies/ar/step_inputs.py
+++ b/lmdeploy/pytorch/strategies/ar/step_inputs.py
@@ -26,7 +26,7 @@


 def step_sampling_delta(sampling_delta: SamplingInputsDelta,
-                            next_token_ids: torch.Tensor) -> SamplingInputsDelta:
+                        next_token_ids: torch.Tensor) -> SamplingInputsDelta:
     """Advance sampling delta for one decode step."""
     sampling_delta.num_ignore_eos = sampling_delta.num_ignore_eos - 1
     if sampling_delta.random_offsets is not None:
diff --git a/lmdeploy/pytorch/strategies/ar_spec/model_agent.py b/lmdeploy/pytorch/strategies/ar_spec/model_agent.py
index df749d019e..234ff92bd5 100644
--- a/lmdeploy/pytorch/strategies/ar_spec/model_agent.py
+++ b/lmdeploy/pytorch/strategies/ar_spec/model_agent.py
@@ -12,7 +12,7 @@
 from lmdeploy.pytorch.messages import SchedulerSequence
 from lmdeploy.pytorch.model_inputs import ModelInputs, ModelInputsDelta

-from ..ar.model_agent import ARStoppingCriteria
+from ..ar.model_agent import ARStoppingCriteria, get_model_inputs_next_decoding
 from ..base.model_agent import ExtraInputs, ExtraOutputs, ModelAgentStrategy

 SeqList = list[SchedulerSequence]
@@ -179,11 +179,62 @@ def make_extra_inputs(self, seqs: 'SeqList', model_inputs: 'ModelInputs') -> Ext
         """Create extra inputs."""
         return ARSpecExtraInputs()

+    def update_extra_inputs(self, extra_inputs: ARSpecExtraInputs, delta: 'ModelInputsDelta') -> ARSpecExtraInputs:
+        """Update extra inputs with model inputs delta."""
+        return extra_inputs.update(delta)
+
     def make_extra_outputs(self, extra_inputs: ARSpecExtraInputs) -> ARSpecExtraOutputs:
         """Create extra outputs."""
         output = ARSpecExtraOutputs(draft_token_ids=extra_inputs.output_draft_token_ids)
         return output

+    def update_prefill_for_next_step(
+        self,
+        model_inputs: 'ModelInputs',
+        extra_inputs: ARSpecExtraInputs,
+        next_token_ids: torch.Tensor,
+        model_metas: Any,
+        extra_outputs: ARSpecExtraOutputs,
+    ) -> tuple['ModelInputs', ARSpecExtraInputs]:
+        """Step next decoding."""
+        next_token_ids = next_token_ids[:, None]
+        next_token_ids = torch.cat([next_token_ids, extra_outputs.draft_token_ids], dim=-1)
+        max_q_seqlen = next_token_ids.size(-1)
+        next_token_ids = next_token_ids.flatten()[None, :]
+        inputs = get_model_inputs_next_decoding(model_inputs,
+                                                next_token_ids,
+                                                max_q_seqlen=max_q_seqlen,
+                                                model_metas=model_metas)
+
+        # update mrope pos ids
+        mrope_pos_ids = inputs.mrope_pos_ids
+        if mrope_pos_ids is not None:
+            offsets = torch.arange(max_q_seqlen, dtype=mrope_pos_ids.dtype, device=mrope_pos_ids.device)[None, None, :]
+            mrope_pos_ids = mrope_pos_ids.unflatten(1, (-1, 1)).repeat(1, 1, max_q_seqlen) + offsets
+            inputs.mrope_pos_ids = mrope_pos_ids.flatten(1, 2)
+
+        extra_inputs = extra_inputs.clone()
+        return inputs, extra_inputs
+
+    def update_decoding_for_next_step(self, model_inputs: 'ModelInputs', next_token_ids: torch.Tensor, model_metas: Any,
+                                      extra_inputs: ARSpecExtraInputs, extra_outputs: ARSpecExtraOutputs):
+        """Step next inputs."""
+        model_inputs.model_metas = model_metas
+        step_seqlens = model_inputs.seq_length
+        batch_size = step_seqlens.size(0)
+
+        # update extra inputs
+        extra_inputs.output_token_ids = extra_outputs.draft_token_ids
+
+        # update inputs
+        step_seqlens = model_inputs.seq_length - extra_inputs.num_rejected_tokens
+        input_ids = next_token_ids.new_empty((batch_size, self.num_spec_tokens + 1))
+        input_ids[:, 0] = next_token_ids
+        input_ids[:, 1:] = extra_inputs.output_draft_token_ids
+        input_ids = input_ids.flatten()[None, :]
+        model_inputs = model_inputs.step(input_ids, step_seqlens)
+        return model_inputs, extra_inputs
+
     def post_sampling(self, inputs: 'ModelInputs', logits: torch.Tensor, next_token_ids: torch.LongTensor,
                       extra_inputs: ARSpecExtraInputs):
         """Post sampling."""
diff --git a/lmdeploy/serve/core/async_engine.py b/lmdeploy/serve/core/async_engine.py
index a259fbdd90..d87cfce38b 100644
--- a/lmdeploy/serve/core/async_engine.py
+++ b/lmdeploy/serve/core/async_engine.py
@@ -202,7 +202,7 @@ def _build_stat_loggers(self):
         metrics_processor.stat_loggers = self.stat_loggers

     def _if_session_stale(self, session: Session,
-                input_token_len: int) -> GenOut | None:
+                          input_token_len: int) -> GenOut | None:
         """If ``session.epoch`` was stamped by api_server and
         ``stop_all_session`` ran since then (the engine epoch changed),
         drop the session."""
@@ -449,7 +449,6 @@ def is_error(status):

             if not gen_config.ignore_eos:
                 stop_ids = gen_config.stop_token_ids or []
-
             stale = self._if_session_stale(session, len(prompt_input['input_ids']))
             if stale is not None:
                 metrics_processor.increase_failed_requests('abort')
@@ -460,7 +459,7 @@ def is_error(status):
         async with session.request_handle() as handle:
             if session.epoch is not None and session.epoch != self.epoch:
                 logger.info(f'[generate] session {session_id} got aborted before starting inference, '
-                        f'session.epoch={session.epoch}, async_engine.epoch={self.epoch}')
+                            f'session.epoch={session.epoch}, async_engine.epoch={self.epoch}')
                 metrics_processor.increase_failed_requests('abort')
                 yield GenOut(response='',
                              history_token_len=0,
diff --git a/lmdeploy/serve/proxy/proxy.py b/lmdeploy/serve/proxy/proxy.py
index 667886273e..4af1d08e10 100644
--- a/lmdeploy/serve/proxy/proxy.py
+++ b/lmdeploy/serve/proxy/proxy.py
@@ -146,8 +146,8 @@ def update_config_file(self):
                     node_url: node_status.model_dump_json()
                     for node_url, node_status in nodes.items()
                 },
-                      config_file,
-                      indent=2)
+                config_file,
+                indent=2)

     def add(self, node_url: str, status: Status | None = None):
         """Add a node to the manager.
diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py
index 5271262f7b..2c94bfb18f 100644
--- a/lmdeploy/turbomind/turbomind.py
+++ b/lmdeploy/turbomind/turbomind.py
@@ -720,7 +720,7 @@ async def async_stream_infer(self,
                     grammar = compiler.compile_json_schema(decode_grammar)
                 else:
                     assert False, f'Decode grammar type {decode_grammar_type} should be in ' \
-                                  '["json_schema", "regex_schema", "json_object"]'
+                        '["json_schema", "regex_schema", "json_object"]'

                 self.model_inst.set_grammar(grammar)
             except ValueError as e:
diff --git a/lmdeploy/vl/__init__.py b/lmdeploy/vl/__init__.py
index a62115171d..b5b3247d60 100644
--- a/lmdeploy/vl/__init__.py
+++ b/lmdeploy/vl/__init__.py
@@ -1,11 +1,11 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from .utils import (
-        encode_image_base64,
-        encode_time_series_base64,
-        encode_video_base64,
-        load_image,
-        load_time_series,
-        load_video,
+    encode_image_base64,
+    encode_time_series_base64,
+    encode_video_base64,
+    load_image,
+    load_time_series,
+    load_video,
 )

 __all__ = [
diff --git a/lmdeploy/vl/model/interns1_pro.py b/lmdeploy/vl/model/interns1_pro.py
index 6534886017..c15bbfe04b 100644
--- a/lmdeploy/vl/model/interns1_pro.py
+++ b/lmdeploy/vl/model/interns1_pro.py
@@ -12,7 +12,6 @@

 logger = get_logger('lmdeploy')

-
 @VISION_MODELS.register_module()
 class InternS1ProVisionModel(Qwen3VLModel):
     """InternS1Pro model.
diff --git a/lmdeploy/vl/model/llava.py b/lmdeploy/vl/model/llava.py
index 6dc5eff4c4..49fe39ca05 100644
--- a/lmdeploy/vl/model/llava.py
+++ b/lmdeploy/vl/model/llava.py
@@ -364,7 +364,7 @@ def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]:
                         feat = unpad_image(feat, image_sizes[img_idx])
                         feat = torch.cat((feat, self.model.image_newline[:, None, None].expand(
                             *feat.shape[:-1], 1).to(feat.device)),
-                                         dim=-1)
+                            dim=-1)
                         feat = feat.flatten(1, 2).transpose(0, 1)
                     else:
                         feat = feat.permute(0, 2, 1, 3, 4).contiguous()
diff --git a/lmdeploy/vl/model/xcomposer2.py b/lmdeploy/vl/model/xcomposer2.py
index 89eaa7659a..ef2be1c2da 100644
--- a/lmdeploy/vl/model/xcomposer2.py
+++ b/lmdeploy/vl/model/xcomposer2.py
@@ -271,7 +271,7 @@ def proc_messages(messages, chat_template, sequence_start, model_type):
             if n_images == 1:
                 prefix_image_token, prompt = IMAGE_TOKEN, content[0]
             else:
-                prompt = ''.join([f'Image{i+1} {IMAGE_TOKEN}; ' for i in range(n_images)]) + content[0]
+                prompt = ''.join([f'Image{i + 1} {IMAGE_TOKEN}; ' for i in range(n_images)]) + content[0]
         else:
             prompt = ''.join([IMAGE_TOKEN] * n_images) + content[0]
     else:
diff --git a/pyproject.toml b/pyproject.toml
index 43b200dd4c..8d84226a55 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,3 +20,7 @@ select = [
 ignore = [
     "E231", "E741"
 ]
+
+[tool.autopep8]
+max_line_length = 120
+aggressive = 3
diff --git a/tests/pytorch/engine/test_logits_process.py b/tests/pytorch/engine/test_logits_process.py
index 01c6f9777c..3ac2c7d2ef 100644
--- a/tests/pytorch/engine/test_logits_process.py
+++ b/tests/pytorch/engine/test_logits_process.py
@@ -150,7 +150,7 @@ def _get_emtas(n, window_size):
         [9, 8, 7, 3, 8, 7, 5, 9, 8, 7],
         [9, 8, 7, 3, 8, 7, 5, 9, 8, 7],
     ],
-                              dtype=torch.int64)
+        dtype=torch.int64)

     n = torch.tensor([3, 3, 2], dtype=torch.int64)
     threshold = torch.tensor([3, 3, 3], dtype=torch.int64)
diff --git a/tests/pytorch/kernel/test_fill_kv_cache.py b/tests/pytorch/kernel/test_fill_kv_cache.py
index 43204af183..59c721a752 100644
--- a/tests/pytorch/kernel/test_fill_kv_cache.py
+++ b/tests/pytorch/kernel/test_fill_kv_cache.py
@@ -140,7 +140,7 @@ def gt(self, k_states, v_states, k_caches, v_caches, seq_lens, history_lens, blo
                              ((1, 1, 1, 1), (1, 16, 31, 24)),
                              ((1, 8, 16, 24), (1, 16, 31, 24)),
                          ],
-                         indirect=True)
+        indirect=True)
     def test_fill_kv_cache(self, k_states, v_states, k_caches, v_caches, block_offsets, q_start_loc, q_seq_length,
                            kv_seq_length, max_q_seq_length, gt):
         from lmdeploy.pytorch.kernels.cuda.fill_kv_cache import fill_kv_cache
@@ -229,7 +229,7 @@ def gt(self, k_states, v_states, k_caches, v_caches, seq_lens, history_lens, blo
                              ((1, 1, 1, 1), (1, 16, 31, 24)),
                              ((1, 8, 16, 24), (1, 16, 31, 24)),
                          ],
-                         indirect=True)
+        indirect=True)
     def test_fill_kv_cache(self, k_states, v_states, k_caches, v_caches, k_scales_zeros, v_scales_zeros,
                            block_offsets, q_start_loc, q_seq_length, kv_seq_length, max_q_seq_length, gt):
         from lmdeploy.pytorch.kernels.cuda.fill_kv_cache import fill_kv_cache
@@ -258,7 +258,7 @@ def nbits(self):
                              ((1, 1, 1, 1), (1, 16, 31, 24)),
                              ((1, 8, 16, 24), (1, 16, 31, 24)),
                          ],
-                         indirect=True)
+        indirect=True)
     def test_fill_kv_cache(self, k_states, v_states, k_caches, v_caches, k_scales_zeros, v_scales_zeros,
                            block_offsets, q_start_loc, q_seq_length, kv_seq_length, max_q_seq_length, gt, nbits):
         from lmdeploy.pytorch.kernels.cuda.fill_kv_cache import fill_kv_cache
@@ -382,7 +382,7 @@ def uncache(self, k_caches, ks_caches, v_caches, vs_caches, cu_seqlen_q, kv_seql
                              ((1, 1, 1, 1), (1, 128, 256, 200)),
                              ((1, 64, 128, 50), (1, 128, 256, 200)),
                          ],
-                         indirect=True)
+        indirect=True)
     def test_fill_kv_cache(self, k_states, v_states, k_caches, v_caches, ks_caches, vs_caches, block_offsets,
                            cu_seqlen_q, kv_seq_length, max_q_seq_length, gt, group_size, scale_fmt):
         from lmdeploy.pytorch.kernels.cuda.fill_kv_cache import fill_kv_cache_blocked_fp8
diff --git a/tests/pytorch/spec_decode/test_reject_sample.py b/tests/pytorch/spec_decode/test_reject_sample.py
index c46ded9bcd..34f22000fe 100644
--- a/tests/pytorch/spec_decode/test_reject_sample.py
+++ b/tests/pytorch/spec_decode/test_reject_sample.py
@@ -100,8 +100,8 @@ def test_greedy_mixed_mismatch_positions(self):
             [10, 21, 30],
             [10, 20, 31],
         ],
-                               dtype=torch.long,
-                               device=device)
+            dtype=torch.long,
+            device=device)
         bonus = torch.tensor([99, 88, 77, 66], dtype=torch.long, device=device)

         si = SamplingInputs(max_top_k=1)
diff --git a/tests/test_lmdeploy/test_vl/test_nonhf_chat_template.py b/tests/test_lmdeploy/test_vl/test_nonhf_chat_template.py
index 1b575e574d..1a8827700b 100644
--- a/tests/test_lmdeploy/test_vl/test_nonhf_chat_template.py
+++ b/tests/test_lmdeploy/test_vl/test_nonhf_chat_template.py
@@ -93,8 +93,8 @@ def mock_messages(self):
     @pytest.fixture(scope='module')
     def mock_IMAGE_TOKEN_messages(self):
         return [
-            dict(role='system', content='你是书生·万象,英文名是InternVL,是由上海人工智能实验室、' \
-                 '清华大学及多家合作单位联合开发的多模态大语言模型。'),
+            dict(role='system', content='你是书生·万象,英文名是InternVL,是由上海人工智能实验室、'
+                 '清华大学及多家合作单位联合开发的多模态大语言模型。'),
             dict(role='user',
                  content=[
                      dict(type='text', text='\nDescribe the following images in detail'),