From 9326ac2019956440e563fe5c1c97dc15cc8bee5a Mon Sep 17 00:00:00 2001 From: 43758726 <1462774833@qq.com> Date: Wed, 8 Apr 2026 09:22:08 +0000 Subject: [PATCH 1/2] make fp8 model quantized by llm-compressor can be inferenced in turbomind --- lmdeploy/turbomind/deploy/converter.py | 46 ++++++++++++++++++-------- lmdeploy/turbomind/deploy/parameter.py | 11 ++++++ lmdeploy/turbomind/deploy/policy.py | 9 +++-- 3 files changed, 49 insertions(+), 17 deletions(-) diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py index 9d697ef567..1a3a3e840f 100644 --- a/lmdeploy/turbomind/deploy/converter.py +++ b/lmdeploy/turbomind/deploy/converter.py @@ -31,7 +31,8 @@ def get_input_model_registered_name(model_path: str, model_format: str): return register_name -def get_output_model_registered_name_and_config(model_path: str, model_format: str, dtype: str, group_size: int): +def get_output_model_registered_name_and_config(model_path: str, model_format: str, dtype: str, group_size: int, + quantized_format: str): """Get the registered name of the turbomind model and its configuration according to the input model path, format and user-input config. The name will be used to access the OUTPUT_MODELS registry. 
@@ -42,6 +43,8 @@ def get_output_model_registered_name_and_config(model_path: str, model_format: s ['hf', 'awq', 'gptq'] dtype (str): the data type of the model's weights and activations group_size (int): the size of group used by awq model + quantized_format (str): the quantized format of compressed-tensors model, + which can be one of ['pack-quantized', 'float-quantized'] """ register_name = 'tm' @@ -75,11 +78,18 @@ def get_output_model_registered_name_and_config(model_path: str, model_format: s session_len = _get_and_verify_max_len(model_config, None) if model_format in ['awq', 'gptq', 'compressed-tensors']: - weight_type = 'int4' - dtype = 'float16' # force float16 for int4 quantized weights + if model_format in ['awq', 'gptq']: + weight_type = 'int4' + dtype = 'float16' # force float16 for int4 quantized weights + elif model_format == 'compressed-tensors': + if quantized_format == 'pack-quantized': + weight_type = 'int4' + model_format = 'awq' + dtype = 'float16' # force float16 for int4 quantized weights + elif quantized_format == 'float-quantized': + weight_type = 'fp8' + model_format = 'fp8' group_size = 128 if group_size == 0 else group_size - if model_format == 'compressed-tensors': - model_format = 'awq' elif model_format == 'fp8': weight_type = 'fp8' group_size = 128 @@ -196,18 +206,23 @@ def get_tm_model(model_path, _group_size = 32 elif quant_method == 'compressed-tensors': _format = quant_config['config_groups']['group_0']['format'] - assert _format == 'pack-quantized', ('compressed-tennsors only supports pack-quantized format, ' - f'but got {_format}') + assert _format in ['pack-quantized', 'float-quantized' + ], ('compressed-tennsors only supports pack-quantized/float-quantized format, ' + f'but got {_format}') _weights = quant_config['config_groups']['group_0']['weights'] _group_size = _weights['group_size'] _num_bits = _weights['num_bits'] _type = _weights['type'] - assert _num_bits == 4 and _type == 'int', ('pack-quantized requires 4-bit int, ' 
- f'but got {_num_bits}-bit {_type}') + assert (_num_bits == 4 and _type == 'int') or (_num_bits == 8 and _type == 'float'), ( + 'pack-quantized requires 4-bit int, ' + f'but got {_num_bits}-bit {_type}. ' + 'or float-quantized requires 8-bit float, ' + f'but got {_num_bits}-bit {_type}') else: assert 0, f'unsupported quant_config: {quant_config}' engine_config.model_format = quant_method + quantized_format = _format if quant_method == 'compressed-tensors' else None group_size = _group_size if engine_config.model_format in ['awq', 'gptq', 'compressed-tensors']: @@ -221,16 +236,19 @@ def get_tm_model(model_path, input_model_name = get_input_model_registered_name(model_path, engine_config.model_format) fp8_quant = (engine_config.model_format == 'fp8' and not quant_config) - input_policy = get_input_policy(engine_config.model_format) + input_policy = get_input_policy(engine_config.model_format, + quantized_format=quantized_format if quant_config else None) input_model = INPUT_MODELS.get(input_model_name)(model_path=model_path, tokenizer_path=model_path, input_policy=input_policy, fp8_quant=fp8_quant) - output_model_name, tm_cfg = get_output_model_registered_name_and_config(model_path=model_path, - model_format=engine_config.model_format, - dtype=engine_config.dtype, - group_size=group_size) + output_model_name, tm_cfg = get_output_model_registered_name_and_config( + model_path=model_path, + model_format=engine_config.model_format, + dtype=engine_config.dtype, + group_size=group_size, + quantized_format=quantized_format if quant_config else None) if mixed_awq: # Mixed-precision AWQ: attention weights are fp16 (not quantized), diff --git a/lmdeploy/turbomind/deploy/parameter.py b/lmdeploy/turbomind/deploy/parameter.py index 734b2d6de6..fc77bef2bf 100644 --- a/lmdeploy/turbomind/deploy/parameter.py +++ b/lmdeploy/turbomind/deploy/parameter.py @@ -78,6 +78,15 @@ def __call__(self, f, g, i): f(i, g('weight'), 'weight', identity) +class WeightScale(Parameter): + KEYS = 
'.weight_scale', '.weight' + + # TODO: flag any operations crossing the quant blocks as illegal + def __call__(self, f, g, i): + f(i, g('weight_scale'), 'scales', to_float, apply_gs=['w1', 'w3', 'w2']) + f(i, g('weight'), 'weight', identity) + + class CompressedWeight(Parameter): KEYS = '.weight_packed', '.weight_scale', '.weight_zero_point' @@ -133,6 +142,8 @@ def get_params(keys: list[str], bias=0): ps.append(QuantWeightOnly()) if WeightScaleInv.take(keys): ps.append(WeightScaleInv()) + if WeightScale.take(keys): + ps.append(WeightScale()) xs = CompressedWeight.take(keys) if xs: ps.append(CompressedWeight(xs)) diff --git a/lmdeploy/turbomind/deploy/policy.py b/lmdeploy/turbomind/deploy/policy.py index 4082df36d3..78c0a9fd49 100644 --- a/lmdeploy/turbomind/deploy/policy.py +++ b/lmdeploy/turbomind/deploy/policy.py @@ -68,7 +68,7 @@ def process_fp8(x: torch.Tensor, kind: str): return x.to(dtype=torch.bfloat16) -def process_compressed_tensor(x: torch.Tensor, kind: str): +def process_compressed_packed_tensor(x: torch.Tensor, kind: str): x = x.cuda() if x.dtype == torch.int32: xs = get_u4_slices(x, torch.uint8) @@ -79,7 +79,7 @@ def process_compressed_tensor(x: torch.Tensor, kind: str): return x -def get_input_policy(model_format): +def get_input_policy(model_format, quantized_format=None): if model_format == 'awq': return process_awq_gemm elif model_format == 'gptq': @@ -89,6 +89,9 @@ def get_input_policy(model_format): elif model_format == 'fp8': return process_fp8 elif model_format == 'compressed-tensors': - return process_compressed_tensor + if quantized_format == 'pack-quantized': + return process_compressed_packed_tensor + elif quantized_format == 'float-quantized': + return process_fp8 else: return to_cuda From 23b59e4dc0b5b4a1623b723befe1a48fce672f97 Mon Sep 17 00:00:00 2001 From: 43758726 <1462774833@qq.com> Date: Sat, 18 Apr 2026 16:16:21 +0000 Subject: [PATCH 2/2] add documents for llm-compressor fp8 quant --- docs/en/quantization/llm_compressor_fp8.md | 98 
+++++++++++++++++++ ...m_compressor.md => llm_compressor_int4.md} | 18 ++-- docs/zh_cn/quantization/llm_compressor_fp8.md | 96 ++++++++++++++++++ ...m_compressor.md => llm_compressor_int4.md} | 16 +-- examples/lite/fp8/qwen3_30b_a3b_fp8.py | 68 +++++++++++++ examples/lite/{ => int4}/qwen3_30b_a3b_awq.py | 0 .../lite/{ => int4}/qwen3_30b_a3b_gptq.py | 0 lmdeploy/lite/apis/auto_awq.py | 3 +- lmdeploy/turbomind/deploy/converter.py | 14 +-- lmdeploy/turbomind/deploy/parameter.py | 18 ---- 10 files changed, 287 insertions(+), 44 deletions(-) create mode 100644 docs/en/quantization/llm_compressor_fp8.md rename docs/en/quantization/{llm_compressor.md => llm_compressor_int4.md} (72%) create mode 100644 docs/zh_cn/quantization/llm_compressor_fp8.md rename docs/zh_cn/quantization/{llm_compressor.md => llm_compressor_int4.md} (85%) create mode 100644 examples/lite/fp8/qwen3_30b_a3b_fp8.py rename examples/lite/{ => int4}/qwen3_30b_a3b_awq.py (100%) rename examples/lite/{ => int4}/qwen3_30b_a3b_gptq.py (100%) diff --git a/docs/en/quantization/llm_compressor_fp8.md b/docs/en/quantization/llm_compressor_fp8.md new file mode 100644 index 0000000000..ba282fd840 --- /dev/null +++ b/docs/en/quantization/llm_compressor_fp8.md @@ -0,0 +1,98 @@ +# llm-compressor-fp8 Support + +This guide aims to introduce how to use LMDeploy's TurboMind inference engine to run models quantized by the [vllm-project/llm-compressor](https://github.com/vllm-project/llm-compressor) tool. 
+ +Currently supported `llm-compressor-fp8` quantization types include: + +- FP8 (e.g. the FP8_BLOCK scheme, saved in the compressed-tensors float-quantized format) + +These quantized models can run via the TurboMind engine on the following NVIDIA GPU architectures: + +| Compute Capability | Micro-architecture | GPUs | +| ------------------ | ------------------ | ------------------------------- | +| 7.0 | Volta | V100 | +| 7.2 | Volta | Jetson Xavier | +| 7.5 | Turing | GeForce RTX 20 series, T4 | +| 8.0 | Ampere | A100, A800, A30 | +| 8.6 | Ampere | GeForce RTX 30 series, A40, A10 | +| 8.7 | Ampere | Jetson Orin | +| 8.9 | Ada Lovelace | GeForce RTX 40 series, L40, L20 | +| 9.0 | Hopper | H20, H200, H100, GH200 | +| 12.0 | Blackwell | GeForce RTX 50 series | + +LMDeploy will continue to follow up and expand support for the `llm-compressor-fp8` project. + +The remainder of this document consists of the following sections: + + + +- [Model Quantization](#model-quantization) +- [Model Deployment](#model-deployment) +- [Accuracy Evaluation](#accuracy-evaluation) + + + +## Model Quantization + +`llm-compressor-fp8` provides a wealth of model quantization [examples](https://github.com/vllm-project/llm-compressor/tree/main/examples). Please refer to its tutorials to select a quantization algorithm supported by LMDeploy to complete your model quantization work.
+ +LMDeploy also provides a built-in [script](https://github.com/InternLM/lmdeploy/blob/main/examples/lite/fp8/qwen3_30b_a3b_fp8.py) for FP8 quantization of **Qwen3-30B-A3B** using `llm-compressor-fp8` for your reference: + +```shell +# Create conda environment +conda create -n lmdeploy python=3.10 -y +conda activate lmdeploy + +# Install llm-compressor +pip install llmcompressor + +# Clone lmdeploy source code and run the quantization example +git clone https://github.com/InternLM/lmdeploy +cd lmdeploy +python examples/lite/fp8/qwen3_30b_a3b_fp8.py --work-dir ./qwen3_30b_a3b_fp8 + +``` + +In the following sections, we will use this quantized model as an example to introduce model deployment and accuracy evaluation methods. + +## Model Deployment + +### Offline Inference + +With the quantized model, offline batch processing can be implemented with just a few lines of code: + +```python +from lmdeploy import pipeline, TurbomindEngineConfig +engine_config = TurbomindEngineConfig() +with pipeline("./qwen3_30b_a3b_fp8", backend_config=engine_config) as pipe: + response = pipe(["Hi, pls intro yourself", "Shanghai is"]) + print(response) +``` + +For a detailed introduction to the pipeline, please refer to [here](https://lmdeploy.readthedocs.io/en/latest/llm/pipeline.html). + +### Online Serving + +LMDeploy api_server supports encapsulating the model as a service with a single command. The provided RESTful APIs are compatible with OpenAI interfaces. Below is an example of starting the service: + +```shell +lmdeploy serve api_server ./qwen3_30b_a3b_fp8 --backend turbomind +``` + +The default service port is 23333. After the server starts, you can access the service via the OpenAI SDK. For command arguments and methods to access the service, please read [this](https://lmdeploy.readthedocs.io/en/latest/llm/api_server.html) document. 
+ +## Accuracy Evaluation + +We deployed FP8-quantized models of Qwen3-8B (Dense) and Qwen3-30B-A3B (MoE) as services via LMDeploy, and evaluated them on several academic datasets using [opencompass](https://github.com/open-compass/opencompass). The results show that the accuracy gap between the FP8-quantized models and the BF16 models is not significant, which is in line with expectations. + +| dataset | Qwen3-8B | | Qwen3-30B-A3B | | +| ----------------- | -------- | ----- | ------------- | ----- | +| | bf16 | fp8 | bf16 | fp8 | +| ifeval | 85.58 | 87.62 | 86.32 | 86.51 | +| hle | 5.05 | 5.89 | 7.00 | 7.51 | +| gpqa | 59.97 | 59.22 | 61.74 | 60.73 | +| aime2025 | 69.48 | 70.00 | 73.44 | 71.15 | +| mmlu_pro | 73.69 | 73.54 | 77.85 | 77.50 | +| LCBCodeGeneration | 50.86 | 49.81 | 56.67 | 56.86 | + +For reproduction methods, please refer to [this](https://lmdeploy.readthedocs.io/en/latest/benchmark/evaluate_with_opencompass.html) document. diff --git a/docs/en/quantization/llm_compressor.md b/docs/en/quantization/llm_compressor_int4.md similarity index 72% rename from docs/en/quantization/llm_compressor.md rename to docs/en/quantization/llm_compressor_int4.md index 2b6ab9bfc4..19a3f6d25c 100644 --- a/docs/en/quantization/llm_compressor.md +++ b/docs/en/quantization/llm_compressor_int4.md @@ -1,10 +1,10 @@ -# llm-compressor Support +# llm-compressor-int4 Support This guide aims to introduce how to use LMDeploy's TurboMind inference engine to run models quantized by the [vllm-project/llm-compressor](https://github.com/vllm-project/llm-compressor) tool. 
-Currently supported `llm-compressor` quantization types include: +Currently supported `llm-compressor-int4` quantization types include: -- int4 quantization (e.g., AWQ, GPTQ) +- AWQ、GPTQ These quantized models can run via the TurboMind engine on the following NVIDIA GPU architectures: @@ -20,7 +20,7 @@ These quantized models can run via the TurboMind engine on the following NVIDIA | 9.0 | Hopper | H20, H200, H100, GH200 | | 12.0 | Blackwell | GeForce RTX 50 series | -LMDeploy will continue to follow up and expand support for the `llm-compressor` project. +LMDeploy will continue to follow up and expand support for the `llm-compressor-int4` project. The remainder of this document consists of the following sections: @@ -34,9 +34,9 @@ The remainder of this document consists of the following sections: ## Model Quantization -`llm-compressor` provides a wealth of model quantization [examples](https://github.com/vllm-project/llm-compressor/tree/main/examples). Please refer to its tutorials to select a quantization algorithm supported by LMDeploy to complete your model quantization work. +`llm-compressor-int4` provides a wealth of model quantization [examples](https://github.com/vllm-project/llm-compressor/tree/main/examples). Please refer to its tutorials to select a quantization algorithm supported by LMDeploy to complete your model quantization work. 
-LMDeploy also provides a built-in [script](https://github.com/InternLM/lmdeploy/blob/main/examples/lite/qwen3_30b_a3b_awq.py) for AWQ quantization of **Qwen3-30B-A3B** using `llm-compressor` for your reference: +LMDeploy also provides a built-in [script](https://github.com/InternLM/lmdeploy/blob/main/examples/lite/int4/qwen3_30b_a3b_awq.py) for AWQ quantization of **Qwen3-30B-A3B** using `llm-compressor-int4` for your reference: ```shell # Create conda environment @@ -49,7 +49,8 @@ pip install llmcompressor # Clone lmdeploy source code and run the quantization example git clone https://github.com/InternLM/lmdeploy cd lmdeploy -python examples/lite/qwen3_30b_a3b_awq.py --work-dir ./qwen3_30b_a3b_awq +python examples/lite/int4/qwen3_30b_a3b_awq.py --work-dir ./qwen3_30b_a3b_awq + ``` In the following sections, we will use this quantized model as an example to introduce model deployment and accuracy evaluation methods. @@ -62,7 +63,6 @@ With the quantized model, offline batch processing can be implemented with just ```python from lmdeploy import pipeline, TurbomindEngineConfig - engine_config = TurbomindEngineConfig() with pipeline("./qwen3_30b_a3b_4bit", backend_config=engine_config) as pipe: response = pipe(["Hi, pls intro yourself", "Shanghai is"]) @@ -83,7 +83,7 @@ The default service port is 23333. After the server starts, you can access the s ## Accuracy Evaluation -Aftering deploying AWQ symmetric/asymmetric quantized models of Qwen3-8B (Dense) and Qwen3-30B-A3B (MoE) as services via LMDeploy, we evaluated their accuracy on several academic datasets using [opencompass](https://github.com/open-compass/opencompass). Results indicate that, for Qwen3-8B, asymmetric quantization generally outperforms symmetric quantization, while Qwen3-30B-A3B shows no substantial difference between symmetric and asymmetric quantization. Compared with BF16, Qwen3-8B shows a smaller accuracy gap under both symmetric and asymmetric quantization than Qwen3-30B-A3B. 
Compared with BF16, accuracy drops significantly on long-output datasets such as aime2025 (avg 17,635 tokens) and LCB (avg 14,157 tokens), while on medium/short-output datasets like ifeval (avg 1,885 tokens) and mmlu_pro (avg 2,826 tokens), the accuracy is as expected. +We deployed AWQ symmetric/asymmetric quantized models of Qwen3-8B (Dense) and Qwen3-30B-A3B (MoE) as services via LMDeploy, and evaluated their accuracy on several academic datasets using [opencompass](https://github.com/open-compass/opencompass). Results indicate that, for Qwen3-8B, asymmetric quantization generally outperforms symmetric quantization, while Qwen3-30B-A3B shows no substantial difference between symmetric and asymmetric quantization. Compared with BF16, Qwen3-8B shows a smaller accuracy gap under both symmetric and asymmetric quantization than Qwen3-30B-A3B. Compared with BF16, accuracy drops significantly on long-output datasets such as aime2025 (avg 17,635 tokens) and LCB (avg 14,157 tokens), while on medium/short-output datasets like ifeval (avg 1,885 tokens) and mmlu_pro (avg 2,826 tokens), the accuracy is as expected. 
| dataset | Qwen3-8B | | | Qwen3-30B-A3B | | | | ----------------- | -------- | ------- | -------- | ------------- | ------- | -------- | diff --git a/docs/zh_cn/quantization/llm_compressor_fp8.md b/docs/zh_cn/quantization/llm_compressor_fp8.md new file mode 100644 index 0000000000..ac907210c8 --- /dev/null +++ b/docs/zh_cn/quantization/llm_compressor_fp8.md @@ -0,0 +1,96 @@ +# llm-compressor-fp8 支持 + +本指南旨在介绍如何使用 LMDeploy 的 TurboMind 推理引擎,运行经由 [vllm-project/llm-compressor](https://github.com/vllm-project/llm-compressor)工具进行fp8量化后的模型。 +目前支持的 `llm-compressor-fp8` 量化模型包括: + +- AWQ、GPTQ + +上述量化模型通过 TurboMind 引擎可以在以下 NVIDIA GPU 架构上运行: + +| Compute Capability | Micro-architecture | GPUs | +| ------------------ | ------------------ | ------------------------------- | +| 7.0 | Volta | V100 | +| 7.2 | Volta | Jetson Xavier | +| 7.5 | Turing | GeForce RTX 20 series, T4 | +| 8.0 | Ampere | A100, A800, A30 | +| 8.6 | Ampere | GeForce RTX 30 series, A40, A10 | +| 8.7 | Ampere | Jetson Orin | +| 8.9 | Ada Lovelace | GeForce RTX 40 series, L40, L20 | +| 9.0 | Hopper | H20, H200, H100, GH200 | +| 12.0 | Blackwell | GeForce RTX 50 series | + +LMDeploy 将持续跟进并扩展对 `llm-compressor-fp8` 项目的支持。 + +本文的其余部分由以下章节组成: + + + +- [模型量化](#模型量化) +- [模型部署](#模型部署) +- [精度评测](#精度评测) + + + +## 模型量化 + +`llm-compressor-fp8` 提供了丰富的模型量化[用例](https://github.com/vllm-project/llm-compressor/tree/main/examples),请参考其教程选择 LMDeploy 支持的量化算法,完成模型量化工作。 +LMDeploy 也内置了通过 `llm-compressor-fp8` 对 Qwen3-30B-A3B 进行 fp8 量化的[脚本](https://github.com/InternLM/lmdeploy/blob/main/examples/lite/fp8/qwen3_30b_a3b_fp8.py),供大家进行参考: + +```shell +# 创建 conda 环境 +conda create -n lmdeploy python=3.10 -y +conda activate lmdeploy + +# 安装 llm-compressor +pip install llmcompressor + +# 下载 lmdeploy 源码,运行量化用用例 +git clone https://github.com/InternLM/lmdeploy +cd lmdeploy +python examples/lite/fp8/qwen3_30b_a3b_fp8.py --work-dir ./qwen3_30b_a3b_fp8 + +``` + +在接下来的章节中,我们以此量化模型为例,介绍模型部署、评测精度等方法 + +## 模型部署 + +### 离线推理 + 
+量化后的模型,通过以下几行简单的代码,可以实现离线批处理: + +```python +from lmdeploy import pipeline, TurbomindEngineConfig +engine_config = TurbomindEngineConfig() +with pipeline("./qwen3_30b_a3b_fp8", backend_config=engine_config) as pipe: + response = pipe(["Hi, pls intro yourself", "Shanghai is"]) + print(response) +``` + +关于 pipeline 的详细介绍,请参考[这里](https://lmdeploy.readthedocs.io/zh-cn/latest/llm/pipeline.html) + +### 在线服务 + +LMDeploy api_server 支持把模型一键封装为服务,对外提供的 RESTful API 兼容 openai 的接口。以下为服务启动的示例: + +```shell +lmdeploy serve api_server ./qwen3_30b_a3b_fp8 --backend turbomind +``` + +服务默认端口是23333。在 server 启动后,你可以通过 openai SDK 访问服务。关于服务的命令参数,以及访问服务的方式,可以阅读[这份](https://lmdeploy.readthedocs.io/zh-cn/latest/llm/api_server.html)文档 + +## 精度评测 + +我们将 Qwen3-8B (Dense) 与 Qwen3-30B-A3B (MoE) 的 FP8 量化模型通过 LMDeploy 部署为服务,并使用 [opencompass](https://github.com/open-compass/opencompass) 在多个学术数据集上评测。结果显示:Qwen3-8B 与 Qwen3-30B-A3B 的 FP8 量化模型精度与 BF16 模型差异不显著,精度符合预期。 + +| dataset | Qwen3-8B | | Qwen3-30B-A3B | | +| ----------------- | -------- | ----- | ------------- | ----- | +| | bf16 | fp8 | bf16 | fp8 | +| ifeval | 85.58 | 87.62 | 86.32 | 86.51 | +| hle | 5.05 | 5.89 | 7.00 | 7.51 | +| gpqa | 59.97 | 59.22 | 61.74 | 60.73 | +| aime2025 | 69.48 | 70.00 | 73.44 | 71.15 | +| mmlu_pro | 73.69 | 73.54 | 77.85 | 77.50 | +| LCBCodeGeneration | 50.86 | 49.81 | 56.67 | 56.86 | + +复现方式可以参考[这份](https://lmdeploy.readthedocs.io/zh-cn/latest/benchmark/evaluate_with_opencompass.html)文档 diff --git a/docs/zh_cn/quantization/llm_compressor.md b/docs/zh_cn/quantization/llm_compressor_int4.md similarity index 85% rename from docs/zh_cn/quantization/llm_compressor.md rename to docs/zh_cn/quantization/llm_compressor_int4.md index 8fdcbba6a4..0b0f471501 100644 --- a/docs/zh_cn/quantization/llm_compressor.md +++ b/docs/zh_cn/quantization/llm_compressor_int4.md @@ -1,9 +1,9 @@ -# llm-compressor 支持 +# llm-compressor-int4 支持 -本指南旨在介绍如何使用 LMDeploy 的 TurboMind 推理引擎,运行经由 
[vllm-project/llm-compressor](https://github.com/vllm-project/llm-compressor)工具量化后的模型。 -目前支持的 `llm-compressor` 量化模型包括: +本指南旨在介绍如何使用 LMDeploy 的 TurboMind 推理引擎,运行经由 [vllm-project/llm-compressor](https://github.com/vllm-project/llm-compressor)工具进行int4量化后的模型。 +目前支持的 `llm-compressor-int4` 量化模型包括: -- int4 量化(例如 AWQ、GPTQ) +- AWQ、GPTQ 上述量化模型通过 TurboMind 引擎可以在以下 NVIDIA GPU 架构上运行: @@ -19,7 +19,7 @@ | 9.0 | Hopper | H20, H200, H100, GH200 | | 12.0 | Blackwell | GeForce RTX 50 series | -LMDeploy 将持续跟进并扩展对 `llm-compressor` 项目的支持。 +LMDeploy 将持续跟进并扩展对 `llm-compressor-int4` 项目的支持。 本文的其余部分由以下章节组成: @@ -33,8 +33,8 @@ LMDeploy 将持续跟进并扩展对 `llm-compressor` 项目的支持。 ## 模型量化 -`llm-compressor` 提供了丰富的模型量化[用例](https://github.com/vllm-project/llm-compressor/tree/main/examples),请参考其教程选择 LMDeploy 支持的量化算法,完成模型量化工作。 -LMDeploy 也内置了通过 `llm-compressor` 对 Qwen3-30B-A3B 进行 AWQ 量化的[脚本](https://github.com/InternLM/lmdeploy/blob/main/examples/lite/qwen3_30b_a3b_awq.py),供大家进行参考: +`llm-compressor-int4` 提供了丰富的模型量化[用例](https://github.com/vllm-project/llm-compressor/tree/main/examples),请参考其教程选择 LMDeploy 支持的量化算法,完成模型量化工作。 +LMDeploy 也内置了通过 `llm-compressor-int4` 对 Qwen3-30B-A3B 进行 AWQ 量化的[脚本](https://github.com/InternLM/lmdeploy/blob/main/examples/lite/int4/qwen3_30b_a3b_awq.py),供大家进行参考: ```shell # 创建 conda 环境 @@ -47,7 +47,7 @@ pip install llmcompressor # 下载 lmdeploy 源码,运行量化用用例 git clone https://github.com/InternLM/lmdeploy cd lmdeploy -python examples/lite/qwen3_30b_a3b_awq.py --work-dir ./qwen3_30b_a3b_awq +python examples/lite/int4/qwen3_30b_a3b_awq.py --work-dir ./qwen3_30b_a3b_awq ``` diff --git a/examples/lite/fp8/qwen3_30b_a3b_fp8.py b/examples/lite/fp8/qwen3_30b_a3b_fp8.py new file mode 100644 index 0000000000..4a2c218612 --- /dev/null +++ b/examples/lite/fp8/qwen3_30b_a3b_fp8.py @@ -0,0 +1,68 @@ +import argparse + +from compressed_tensors.offload import dispatch_model +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import QuantizationModifier +from transformers import 
AutoModelForCausalLM, AutoTokenizer + + +def parse_args(): + parser = argparse.ArgumentParser(description='Run FP8 quantization for Qwen3 model') + + parser.add_argument('--work-dir', + type=str, + default='./qwen3_30b_a3b_fp8', + required=True, + help='The directory to save the quantized model') + + parser.add_argument('--model-id', + type=str, + default='Qwen/Qwen3-30B-A3B', + help='The Hugging Face model ID to quantize') + return parser.parse_args() + +def main(): + # 1. Parse command args + args = parse_args() + MODEL_ID = args.model_id + SAVE_DIR = args.work_dir + + print(f'Loading model: {MODEL_ID}') + print(f'Saving to: {SAVE_DIR}') + + # 2. Load model and tokenizer + model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype='auto', device_map='auto', trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True) + + # 3. Configure quant args + # Configure the quantization algorithm and scheme. + # In this case, we: + # * quantize the weights to fp8 block-wise (FP8_BLOCK scheme) via ptq + # * quantize the activations to fp8 dynamically per token group + recipe = QuantizationModifier( + targets='Linear', + scheme='FP8_BLOCK', + ignore=['lm_head', 're:.*mlp.gate$'], + ) + + # 4. Run quantization + print('Starting quantization...') + oneshot(model=model, recipe=recipe) + + # 5. Confirm generations of the quantized model look sane + print('========== SAMPLE GENERATION ==============') + dispatch_model(model) + input_ids = tokenizer('Hello my name is', return_tensors='pt').input_ids.to( + model.device + ) + output = model.generate(input_ids, max_new_tokens=20) + print(tokenizer.decode(output[0])) + print('==========================================') + + # 6.
Save quantized model + print('Saving model...') + model.save_pretrained(SAVE_DIR) + tokenizer.save_pretrained(SAVE_DIR) + +if __name__ == '__main__': + main() diff --git a/examples/lite/qwen3_30b_a3b_awq.py b/examples/lite/int4/qwen3_30b_a3b_awq.py similarity index 100% rename from examples/lite/qwen3_30b_a3b_awq.py rename to examples/lite/int4/qwen3_30b_a3b_awq.py diff --git a/examples/lite/qwen3_30b_a3b_gptq.py b/examples/lite/int4/qwen3_30b_a3b_gptq.py similarity index 100% rename from examples/lite/qwen3_30b_a3b_gptq.py rename to examples/lite/int4/qwen3_30b_a3b_gptq.py diff --git a/lmdeploy/lite/apis/auto_awq.py b/lmdeploy/lite/apis/auto_awq.py index 78a3ee1bbf..3bab4cf3f8 100644 --- a/lmdeploy/lite/apis/auto_awq.py +++ b/lmdeploy/lite/apis/auto_awq.py @@ -8,12 +8,11 @@ import torch from torch import nn +from lmdeploy.lite.apis.calibrate import LAYER_TYPE_MAP, calibrate from lmdeploy.lite.quantization.awq import FC_FCS_MAP, NORM_FCS_MAP, awq_layers, quant_weights, smooth_layers from lmdeploy.lite.utils import collect_target_modules from lmdeploy.utils import try_import_deeplink -from .calibrate import LAYER_TYPE_MAP, calibrate - def save_vl_model(vl_model, model_path, dst_path): vl_model.save_pretrained(dst_path, safe_serialization=True) diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py index a4afeff552..8f54aa4778 100644 --- a/lmdeploy/turbomind/deploy/converter.py +++ b/lmdeploy/turbomind/deploy/converter.py @@ -66,7 +66,7 @@ def get_input_model_registered_name(model_path: str, model_format: str): def get_output_model_registered_name_and_config(model_path: str, model_format: str, dtype: str, group_size: int, - quantized_format: str): + quantized_format: str | None): """Get the registered name of the turbomind model and its configuration according to the input model path, format and user-input config. The name will be used to access the OUTPUT_MODELS registry. 
@@ -77,7 +77,7 @@ def get_output_model_registered_name_and_config(model_path: str, model_format: s ['hf', 'awq', 'gptq', 'compressed-tensors', 'fp8', 'mxfp4'] dtype (str): the data type of the model's weights and activations group_size (int): the quantization group size used by grouped formats - quantized_format (str): the quantized format of compressed-tensors model, + quantized_format (str | None): the quantized format of compressed-tensors model, which can be one of ['pack-quantized', 'float-quantized'] """ register_name = 'tm' @@ -118,6 +118,9 @@ def get_output_model_registered_name_and_config(model_path: str, model_format: s weight_type = 'int4' dtype = 'float16' # force float16 for int4 quantized weights elif model_format == 'compressed-tensors': + assert quantized_format in ['pack-quantized', 'float-quantized'], ( + f'compressed-tensors format must be specified as "pack-quantized" or "float-quantized", ' + f'but got "{quantized_format}"') if quantized_format == 'pack-quantized': weight_type = 'int4' model_format = 'awq' @@ -240,17 +243,14 @@ def get_tm_model(model_path, elif quant_method == 'compressed-tensors': _format = quant_config['config_groups']['group_0']['format'] assert _format in ['pack-quantized', 'float-quantized' - ], ('compressed-tennsors only supports pack-quantized/float-quantized format, ' + ], ('compressed-tensors only supports pack-quantized/float-quantized format, ' f'but got {_format}') _weights = quant_config['config_groups']['group_0']['weights'] _group_size = _weights['group_size'] _num_bits = _weights['num_bits'] _type = _weights['type'] assert (_num_bits == 4 and _type == 'int') or (_num_bits == 8 and _type == 'float'), ( - 'pack-quantized requires 4-bit int, ' - f'but got {_num_bits}-bit {_type}. 
' - 'or float-quantized requires 8-bit float, ' - f'but got {_num_bits}-bit {_type}') + f'pack-quantized requires int4 or fp8, but got type {_type} and {_num_bits} bits') else: assert 0, f'unsupported quant_config: {quant_config}' diff --git a/lmdeploy/turbomind/deploy/parameter.py b/lmdeploy/turbomind/deploy/parameter.py index 4d6f34a895..96f4d8a3e7 100644 --- a/lmdeploy/turbomind/deploy/parameter.py +++ b/lmdeploy/turbomind/deploy/parameter.py @@ -121,21 +121,6 @@ def __call__(self, f, g, i): f(i, g('weight'), 'weight', identity) -class CompressedWeight(Parameter): - KEYS = '.weight_packed', '.weight_scale', '.weight_zero_point' - - def __init__(self, xs): - self.has_zero_point = False - if any(key.endswith(self.KEYS[2]) for key in xs): - self.has_zero_point = True - - def __call__(self, f, g, i): - f(i, g('weight_packed'), 'qweight', pack_u4_row) - f(i, g('weight_scale'), 'scales', to_half, apply_gs=['w2']) - if self.has_zero_point: - f(i, g('weight_zero_point'), 'zeros', to_half, apply_gs=['w2']) - else: - f(i, generate_zero_point(g), 'zeros', to_half, apply_gs=['w2']) class Mxfp4Weight(Parameter): @@ -179,9 +164,6 @@ def get_params(keys: list[str], bias=0): ps.append(WeightScaleInv()) if WeightScale.take(keys): ps.append(WeightScale()) - xs = CompressedWeight.take(keys) - if xs: - ps.append(CompressedWeight(xs)) if Mxfp4Weight.take(keys): ps.append(Mxfp4Weight()) if Weight.take(keys):