From 9326ac2019956440e563fe5c1c97dc15cc8bee5a Mon Sep 17 00:00:00 2001 From: 43758726 <1462774833@qq.com> Date: Wed, 8 Apr 2026 09:22:08 +0000 Subject: [PATCH 1/2] make fp8 model quantized by llm-compressor can be inferenced in turbomind --- lmdeploy/turbomind/deploy/converter.py | 46 ++++++++++++++++++-------- lmdeploy/turbomind/deploy/parameter.py | 11 ++++++ lmdeploy/turbomind/deploy/policy.py | 9 +++-- 3 files changed, 49 insertions(+), 17 deletions(-) diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py index 9d697ef567..1a3a3e840f 100644 --- a/lmdeploy/turbomind/deploy/converter.py +++ b/lmdeploy/turbomind/deploy/converter.py @@ -31,7 +31,8 @@ def get_input_model_registered_name(model_path: str, model_format: str): return register_name -def get_output_model_registered_name_and_config(model_path: str, model_format: str, dtype: str, group_size: int): +def get_output_model_registered_name_and_config(model_path: str, model_format: str, dtype: str, group_size: int, + quantized_format: str): """Get the registered name of the turbomind model and its configuration according to the input model path, format and user-input config. The name will be used to access the OUTPUT_MODELS registry. 
@@ -42,6 +43,8 @@ def get_output_model_registered_name_and_config(model_path: str, model_format: s ['hf', 'awq', 'gptq'] dtype (str): the data type of the model's weights and activations group_size (int): the size of group used by awq model + quantized_format (str): the quantized format of compressed-tensors model, + which can be one of ['pack-quantized', 'float-quantized'] """ register_name = 'tm' @@ -75,11 +78,18 @@ def get_output_model_registered_name_and_config(model_path: str, model_format: s session_len = _get_and_verify_max_len(model_config, None) if model_format in ['awq', 'gptq', 'compressed-tensors']: - weight_type = 'int4' - dtype = 'float16' # force float16 for int4 quantized weights + if model_format in ['awq', 'gptq']: + weight_type = 'int4' + dtype = 'float16' # force float16 for int4 quantized weights + elif model_format == 'compressed-tensors': + if quantized_format == 'pack-quantized': + weight_type = 'int4' + model_format = 'awq' + dtype = 'float16' # force float16 for int4 quantized weights + elif quantized_format == 'float-quantized': + weight_type = 'fp8' + model_format = 'fp8' group_size = 128 if group_size == 0 else group_size - if model_format == 'compressed-tensors': - model_format = 'awq' elif model_format == 'fp8': weight_type = 'fp8' group_size = 128 @@ -196,18 +206,23 @@ def get_tm_model(model_path, _group_size = 32 elif quant_method == 'compressed-tensors': _format = quant_config['config_groups']['group_0']['format'] - assert _format == 'pack-quantized', ('compressed-tennsors only supports pack-quantized format, ' - f'but got {_format}') + assert _format in ['pack-quantized', 'float-quantized' + ], ('compressed-tennsors only supports pack-quantized/float-quantized format, ' + f'but got {_format}') _weights = quant_config['config_groups']['group_0']['weights'] _group_size = _weights['group_size'] _num_bits = _weights['num_bits'] _type = _weights['type'] - assert _num_bits == 4 and _type == 'int', ('pack-quantized requires 4-bit int, ' 
- f'but got {_num_bits}-bit {_type}') + assert (_num_bits == 4 and _type == 'int') or (_num_bits == 8 and _type == 'float'), ( + 'pack-quantized requires 4-bit int, ' + f'but got {_num_bits}-bit {_type}. ' + 'or float-quantized requires 8-bit float, ' + f'but got {_num_bits}-bit {_type}') else: assert 0, f'unsupported quant_config: {quant_config}' engine_config.model_format = quant_method + quantized_format = _format if quant_method == 'compressed-tensors' else None group_size = _group_size if engine_config.model_format in ['awq', 'gptq', 'compressed-tensors']: @@ -221,16 +236,19 @@ def get_tm_model(model_path, input_model_name = get_input_model_registered_name(model_path, engine_config.model_format) fp8_quant = (engine_config.model_format == 'fp8' and not quant_config) - input_policy = get_input_policy(engine_config.model_format) + input_policy = get_input_policy(engine_config.model_format, + quantized_format=quantized_format if quant_config else None) input_model = INPUT_MODELS.get(input_model_name)(model_path=model_path, tokenizer_path=model_path, input_policy=input_policy, fp8_quant=fp8_quant) - output_model_name, tm_cfg = get_output_model_registered_name_and_config(model_path=model_path, - model_format=engine_config.model_format, - dtype=engine_config.dtype, - group_size=group_size) + output_model_name, tm_cfg = get_output_model_registered_name_and_config( + model_path=model_path, + model_format=engine_config.model_format, + dtype=engine_config.dtype, + group_size=group_size, + quantized_format=quantized_format if quant_config else None) if mixed_awq: # Mixed-precision AWQ: attention weights are fp16 (not quantized), diff --git a/lmdeploy/turbomind/deploy/parameter.py b/lmdeploy/turbomind/deploy/parameter.py index 734b2d6de6..fc77bef2bf 100644 --- a/lmdeploy/turbomind/deploy/parameter.py +++ b/lmdeploy/turbomind/deploy/parameter.py @@ -78,6 +78,15 @@ def __call__(self, f, g, i): f(i, g('weight'), 'weight', identity) +class WeightScale(Parameter): + KEYS = 
'.weight_scale', '.weight' + + # TODO: flag any operations crossing the quant blocks as illegal + def __call__(self, f, g, i): + f(i, g('weight_scale'), 'scales', to_float, apply_gs=['w1', 'w3', 'w2']) + f(i, g('weight'), 'weight', identity) + + class CompressedWeight(Parameter): KEYS = '.weight_packed', '.weight_scale', '.weight_zero_point' @@ -133,6 +142,8 @@ def get_params(keys: list[str], bias=0): ps.append(QuantWeightOnly()) if WeightScaleInv.take(keys): ps.append(WeightScaleInv()) + if WeightScale.take(keys): + ps.append(WeightScale()) xs = CompressedWeight.take(keys) if xs: ps.append(CompressedWeight(xs)) diff --git a/lmdeploy/turbomind/deploy/policy.py b/lmdeploy/turbomind/deploy/policy.py index 4082df36d3..78c0a9fd49 100644 --- a/lmdeploy/turbomind/deploy/policy.py +++ b/lmdeploy/turbomind/deploy/policy.py @@ -68,7 +68,7 @@ def process_fp8(x: torch.Tensor, kind: str): return x.to(dtype=torch.bfloat16) -def process_compressed_tensor(x: torch.Tensor, kind: str): +def process_compressed_packed_tensor(x: torch.Tensor, kind: str): x = x.cuda() if x.dtype == torch.int32: xs = get_u4_slices(x, torch.uint8) @@ -79,7 +79,7 @@ def process_compressed_tensor(x: torch.Tensor, kind: str): return x -def get_input_policy(model_format): +def get_input_policy(model_format, quantized_format=None): if model_format == 'awq': return process_awq_gemm elif model_format == 'gptq': @@ -89,6 +89,9 @@ def get_input_policy(model_format): elif model_format == 'fp8': return process_fp8 elif model_format == 'compressed-tensors': - return process_compressed_tensor + if quantized_format == 'pack-quantized': + return process_compressed_packed_tensor + elif quantized_format == 'float-quantized': + return process_fp8 else: return to_cuda From 23b59e4dc0b5b4a1623b723befe1a48fce672f97 Mon Sep 17 00:00:00 2001 From: 43758726 <1462774833@qq.com> Date: Sat, 18 Apr 2026 16:16:21 +0000 Subject: [PATCH 2/2] add documents for llm-compressor fp8 quant --- docs/en/quantization/llm_compressor_fp8.md | 98 
+++++++++++++++++++ ...m_compressor.md => llm_compressor_int4.md} | 18 ++-- docs/zh_cn/quantization/llm_compressor_fp8.md | 96 ++++++++++++++++++ ...m_compressor.md => llm_compressor_int4.md} | 16 +-- examples/lite/fp8/qwen3_30b_a3b_fp8.py | 68 +++++++++++++ examples/lite/{ => int4}/qwen3_30b_a3b_awq.py | 0 .../lite/{ => int4}/qwen3_30b_a3b_gptq.py | 0 lmdeploy/lite/apis/auto_awq.py | 3 +- lmdeploy/turbomind/deploy/converter.py | 14 +-- lmdeploy/turbomind/deploy/parameter.py | 18 ---- 10 files changed, 287 insertions(+), 44 deletions(-) create mode 100644 docs/en/quantization/llm_compressor_fp8.md rename docs/en/quantization/{llm_compressor.md => llm_compressor_int4.md} (72%) create mode 100644 docs/zh_cn/quantization/llm_compressor_fp8.md rename docs/zh_cn/quantization/{llm_compressor.md => llm_compressor_int4.md} (85%) create mode 100644 examples/lite/fp8/qwen3_30b_a3b_fp8.py rename examples/lite/{ => int4}/qwen3_30b_a3b_awq.py (100%) rename examples/lite/{ => int4}/qwen3_30b_a3b_gptq.py (100%) diff --git a/docs/en/quantization/llm_compressor_fp8.md b/docs/en/quantization/llm_compressor_fp8.md new file mode 100644 index 0000000000..ba282fd840 --- /dev/null +++ b/docs/en/quantization/llm_compressor_fp8.md @@ -0,0 +1,98 @@ +# llm-compressor-fp8 Support + +This guide aims to introduce how to use LMDeploy's TurboMind inference engine to run models quantized by the [vllm-project/llm-compressor](https://github.com/vllm-project/llm-compressor) tool. 
+ +Currently supported `llm-compressor-fp8` quantization types include: + +- FP8 (e.g. the FP8_BLOCK scheme, saved in the compressed-tensors float-quantized format) + +These quantized models can run via the TurboMind engine on the following NVIDIA GPU architectures: + +| Compute Capability | Micro-architecture | GPUs | +| ------------------ | ------------------ | ------------------------------- | +| 7.0 | Volta | V100 | +| 7.2 | Volta | Jetson Xavier | +| 7.5 | Turing | GeForce RTX 20 series, T4 | +| 8.0 | Ampere | A100, A800, A30 | +| 8.6 | Ampere | GeForce RTX 30 series, A40, A10 | +| 8.7 | Ampere | Jetson Orin | +| 8.9 | Ada Lovelace | GeForce RTX 40 series, L40, L20 | +| 9.0 | Hopper | H20, H200, H100, GH200 | +| 12.0 | Blackwell | GeForce RTX 50 series | + +LMDeploy will continue to follow up and expand support for the `llm-compressor-fp8` project. + +The remainder of this document consists of the following sections: + + + +- [Model Quantization](#model-quantization) +- [Model Deployment](#model-deployment) +- [Accuracy Evaluation](#accuracy-evaluation) + + + +## Model Quantization + +`llm-compressor-fp8` provides a wealth of model quantization [examples](https://github.com/vllm-project/llm-compressor/tree/main/examples). Please refer to its tutorials to select a quantization algorithm supported by LMDeploy to complete your model quantization work.
+ +LMDeploy also provides a built-in [script](https://github.com/InternLM/lmdeploy/blob/main/examples/lite/fp8/qwen3_30b_a3b_fp8.py) for FP8 quantization of **Qwen3-30B-A3B** using `llm-compressor-fp8` for your reference: + +```shell +# Create conda environment +conda create -n lmdeploy python=3.10 -y +conda activate lmdeploy + +# Install llm-compressor +pip install llmcompressor + +# Clone lmdeploy source code and run the quantization example +git clone https://github.com/InternLM/lmdeploy +cd lmdeploy +python examples/lite/fp8/qwen3_30b_a3b_fp8.py --work-dir ./qwen3_30b_a3b_fp8 + +``` + +In the following sections, we will use this quantized model as an example to introduce model deployment and accuracy evaluation methods. + +## Model Deployment + +### Offline Inference + +With the quantized model, offline batch processing can be implemented with just a few lines of code: + +```python +from lmdeploy import pipeline, TurbomindEngineConfig +engine_config = TurbomindEngineConfig() +with pipeline("./qwen3_30b_a3b_fp8", backend_config=engine_config) as pipe: + response = pipe(["Hi, pls intro yourself", "Shanghai is"]) + print(response) +``` + +For a detailed introduction to the pipeline, please refer to [here](https://lmdeploy.readthedocs.io/en/latest/llm/pipeline.html). + +### Online Serving + +LMDeploy api_server supports encapsulating the model as a service with a single command. The provided RESTful APIs are compatible with OpenAI interfaces. Below is an example of starting the service: + +```shell +lmdeploy serve api_server ./qwen3_30b_a3b_fp8 --backend turbomind +``` + +The default service port is 23333. After the server starts, you can access the service via the OpenAI SDK. For command arguments and methods to access the service, please read [this](https://lmdeploy.readthedocs.io/en/latest/llm/api_server.html) document. 
+ +## Accuracy Evaluation + +We deployed FP8-quantized models of Qwen3-8B (Dense) and Qwen3-30B-A3B (MoE) as services via LMDeploy, and evaluated them on several academic datasets using [opencompass](https://github.com/open-compass/opencompass). The results show that the accuracy gap between the FP8-quantized models and the BF16 models is not significant, which is in line with expectations. + +| dataset | Qwen3-8B | | Qwen3-30B-A3B | | +| ----------------- | -------- | ----- | ------------- | ----- | +| | bf16 | fp8 | bf16 | fp8 | +| ifeval | 85.58 | 87.62 | 86.32 | 86.51 | +| hle | 5.05 | 5.89 | 7.00 | 7.51 | +| gpqa | 59.97 | 59.22 | 61.74 | 60.73 | +| aime2025 | 69.48 | 70.00 | 73.44 | 71.15 | +| mmlu_pro | 73.69 | 73.54 | 77.85 | 77.50 | +| LCBCodeGeneration | 50.86 | 49.81 | 56.67 | 56.86 | + +For reproduction methods, please refer to [this](https://lmdeploy.readthedocs.io/en/latest/benchmark/evaluate_with_opencompass.html) document. diff --git a/docs/en/quantization/llm_compressor.md b/docs/en/quantization/llm_compressor_int4.md similarity index 72% rename from docs/en/quantization/llm_compressor.md rename to docs/en/quantization/llm_compressor_int4.md index 2b6ab9bfc4..19a3f6d25c 100644 --- a/docs/en/quantization/llm_compressor.md +++ b/docs/en/quantization/llm_compressor_int4.md @@ -1,10 +1,10 @@ -# llm-compressor Support +# llm-compressor-int4 Support This guide aims to introduce how to use LMDeploy's TurboMind inference engine to run models quantized by the [vllm-project/llm-compressor](https://github.com/vllm-project/llm-compressor) tool. 
-Currently supported `llm-compressor` quantization types include: +Currently supported `llm-compressor-int4` quantization types include: -- int4 quantization (e.g., AWQ, GPTQ) +- AWQ、GPTQ These quantized models can run via the TurboMind engine on the following NVIDIA GPU architectures: @@ -20,7 +20,7 @@ These quantized models can run via the TurboMind engine on the following NVIDIA | 9.0 | Hopper | H20, H200, H100, GH200 | | 12.0 | Blackwell | GeForce RTX 50 series | -LMDeploy will continue to follow up and expand support for the `llm-compressor` project. +LMDeploy will continue to follow up and expand support for the `llm-compressor-int4` project. The remainder of this document consists of the following sections: @@ -34,9 +34,9 @@ The remainder of this document consists of the following sections: ## Model Quantization -`llm-compressor` provides a wealth of model quantization [examples](https://github.com/vllm-project/llm-compressor/tree/main/examples). Please refer to its tutorials to select a quantization algorithm supported by LMDeploy to complete your model quantization work. +`llm-compressor-int4` provides a wealth of model quantization [examples](https://github.com/vllm-project/llm-compressor/tree/main/examples). Please refer to its tutorials to select a quantization algorithm supported by LMDeploy to complete your model quantization work. 
-LMDeploy also provides a built-in [script](https://github.com/InternLM/lmdeploy/blob/main/examples/lite/qwen3_30b_a3b_awq.py) for AWQ quantization of **Qwen3-30B-A3B** using `llm-compressor` for your reference: +LMDeploy also provides a built-in [script](https://github.com/InternLM/lmdeploy/blob/main/examples/lite/int4/qwen3_30b_a3b_awq.py) for AWQ quantization of **Qwen3-30B-A3B** using `llm-compressor-int4` for your reference: ```shell # Create conda environment @@ -49,7 +49,8 @@ pip install llmcompressor # Clone lmdeploy source code and run the quantization example git clone https://github.com/InternLM/lmdeploy cd lmdeploy -python examples/lite/qwen3_30b_a3b_awq.py --work-dir ./qwen3_30b_a3b_awq +python examples/lite/int4/qwen3_30b_a3b_awq.py --work-dir ./qwen3_30b_a3b_awq + ``` In the following sections, we will use this quantized model as an example to introduce model deployment and accuracy evaluation methods. @@ -62,7 +63,6 @@ With the quantized model, offline batch processing can be implemented with just ```python from lmdeploy import pipeline, TurbomindEngineConfig - engine_config = TurbomindEngineConfig() with pipeline("./qwen3_30b_a3b_4bit", backend_config=engine_config) as pipe: response = pipe(["Hi, pls intro yourself", "Shanghai is"]) @@ -83,7 +83,7 @@ The default service port is 23333. After the server starts, you can access the s ## Accuracy Evaluation -Aftering deploying AWQ symmetric/asymmetric quantized models of Qwen3-8B (Dense) and Qwen3-30B-A3B (MoE) as services via LMDeploy, we evaluated their accuracy on several academic datasets using [opencompass](https://github.com/open-compass/opencompass). Results indicate that, for Qwen3-8B, asymmetric quantization generally outperforms symmetric quantization, while Qwen3-30B-A3B shows no substantial difference between symmetric and asymmetric quantization. Compared with BF16, Qwen3-8B shows a smaller accuracy gap under both symmetric and asymmetric quantization than Qwen3-30B-A3B. 
Compared with BF16, accuracy drops significantly on long-output datasets such as aime2025 (avg 17,635 tokens) and LCB (avg 14,157 tokens), while on medium/short-output datasets like ifeval (avg 1,885 tokens) and mmlu_pro (avg 2,826 tokens), the accuracy is as expected. +We deployed AWQ symmetric/asymmetric quantized models of Qwen3-8B (Dense) and Qwen3-30B-A3B (MoE) as services via LMDeploy, and evaluated their accuracy on several academic datasets using [opencompass](https://github.com/open-compass/opencompass). Results indicate that, for Qwen3-8B, asymmetric quantization generally outperforms symmetric quantization, while Qwen3-30B-A3B shows no substantial difference between symmetric and asymmetric quantization. Compared with BF16, Qwen3-8B shows a smaller accuracy gap under both symmetric and asymmetric quantization than Qwen3-30B-A3B. Compared with BF16, accuracy drops significantly on long-output datasets such as aime2025 (avg 17,635 tokens) and LCB (avg 14,157 tokens), while on medium/short-output datasets like ifeval (avg 1,885 tokens) and mmlu_pro (avg 2,826 tokens), the accuracy is as expected. 
| dataset | Qwen3-8B | | | Qwen3-30B-A3B | | | | ----------------- | -------- | ------- | -------- | ------------- | ------- | -------- | diff --git a/docs/zh_cn/quantization/llm_compressor_fp8.md b/docs/zh_cn/quantization/llm_compressor_fp8.md new file mode 100644 index 0000000000..ac907210c8 --- /dev/null +++ b/docs/zh_cn/quantization/llm_compressor_fp8.md @@ -0,0 +1,96 @@ +# llm-compressor-fp8 支持 + +本指南旨在介绍如何使用 LMDeploy 的 TurboMind 推理引擎,运行经由 [vllm-project/llm-compressor](https://github.com/vllm-project/llm-compressor)工具进行fp8量化后的模型。 +目前支持的 `llm-compressor-fp8` 量化模型包括: + +- AWQ、GPTQ + +上述量化模型通过 TurboMind 引擎可以在以下 NVIDIA GPU 架构上运行: + +| Compute Capability | Micro-architecture | GPUs | +| ------------------ | ------------------ | ------------------------------- | +| 7.0 | Volta | V100 | +| 7.2 | Volta | Jetson Xavier | +| 7.5 | Turing | GeForce RTX 20 series, T4 | +| 8.0 | Ampere | A100, A800, A30 | +| 8.6 | Ampere | GeForce RTX 30 series, A40, A10 | +| 8.7 | Ampere | Jetson Orin | +| 8.9 | Ada Lovelace | GeForce RTX 40 series, L40, L20 | +| 9.0 | Hopper | H20, H200, H100, GH200 | +| 12.0 | Blackwell | GeForce RTX 50 series | + +LMDeploy 将持续跟进并扩展对 `llm-compressor-fp8` 项目的支持。 + +本文的其余部分由以下章节组成: + + + +- [模型量化](#模型量化) +- [模型部署](#模型部署) +- [精度评测](#精度评测) + + + +## 模型量化 + +`llm-compressor-fp8` 提供了丰富的模型量化[用例](https://github.com/vllm-project/llm-compressor/tree/main/examples),请参考其教程选择 LMDeploy 支持的量化算法,完成模型量化工作。 +LMDeploy 也内置了通过 `llm-compressor-fp8` 对 Qwen3-30B-A3B 进行 fp8 量化的[脚本](https://github.com/InternLM/lmdeploy/blob/main/examples/lite/fp8/qwen3_30b_a3b_fp8.py),供大家进行参考: + +```shell +# 创建 conda 环境 +conda create -n lmdeploy python=3.10 -y +conda activate lmdeploy + +# 安装 llm-compressor +pip install llmcompressor + +# 下载 lmdeploy 源码,运行量化用用例 +git clone https://github.com/InternLM/lmdeploy +cd lmdeploy +python examples/lite/fp8/qwen3_30b_a3b_fp8.py --work-dir ./qwen3_30b_a3b_fp8 + +``` + +在接下来的章节中,我们以此量化模型为例,介绍模型部署、评测精度等方法 + +## 模型部署 + +### 离线推理 + 
+量化后的模型,通过以下几行简单的代码,可以实现离线批处理: + +```python +from lmdeploy import pipeline, TurbomindEngineConfig +engine_config = TurbomindEngineConfig() +with pipeline("./qwen3_30b_a3b_fp8", backend_config=engine_config) as pipe: + response = pipe(["Hi, pls intro yourself", "Shanghai is"]) + print(response) +``` + +关于 pipeline 的详细介绍,请参考[这里](https://lmdeploy.readthedocs.io/zh-cn/latest/llm/pipeline.html) + +### 在线服务 + +LMDeploy api_server 支持把模型一键封装为服务,对外提供的 RESTful API 兼容 openai 的接口。以下为服务启动的示例: + +```shell +lmdeploy serve api_server ./qwen3_30b_a3b_fp8 --backend turbomind +``` + +服务默认端口是23333。在 server 启动后,你可以通过 openai SDK 访问服务。关于服务的命令参数,以及访问服务的方式,可以阅读[这份](https://lmdeploy.readthedocs.io/zh-cn/latest/llm/api_server.html)文档 + +## 精度评测 + +我们将 Qwen3-8B (Dense) 与 Qwen3-30B-A3B (MoE) 的 FP8 量化模型通过 LMDeploy 部署为服务,并使用 [opencompass](https://github.com/open-compass/opencompass) 在多个学术数据集上评测。结果显示:Qwen3-8B 与 Qwen3-30B-A3B 的 FP8 量化模型精度与 BF16 模型差异不显著,精度符合预期。 + +| dataset | Qwen3-8B | | Qwen3-30B-A3B | | +| ----------------- | -------- | ----- | ------------- | ----- | +| | bf16 | fp8 | bf16 | fp8 | +| ifeval | 85.58 | 87.62 | 86.32 | 86.51 | +| hle | 5.05 | 5.89 | 7.00 | 7.51 | +| gpqa | 59.97 | 59.22 | 61.74 | 60.73 | +| aime2025 | 69.48 | 70.00 | 73.44 | 71.15 | +| mmlu_pro | 73.69 | 73.54 | 77.85 | 77.50 | +| LCBCodeGeneration | 50.86 | 49.81 | 56.67 | 56.86 | + +复现方式可以参考[这份](https://lmdeploy.readthedocs.io/zh-cn/latest/benchmark/evaluate_with_opencompass.html)文档 diff --git a/docs/zh_cn/quantization/llm_compressor.md b/docs/zh_cn/quantization/llm_compressor_int4.md similarity index 85% rename from docs/zh_cn/quantization/llm_compressor.md rename to docs/zh_cn/quantization/llm_compressor_int4.md index 8fdcbba6a4..0b0f471501 100644 --- a/docs/zh_cn/quantization/llm_compressor.md +++ b/docs/zh_cn/quantization/llm_compressor_int4.md @@ -1,9 +1,9 @@ -# llm-compressor 支持 +# llm-compressor-int4 支持 -本指南旨在介绍如何使用 LMDeploy 的 TurboMind 推理引擎,运行经由 
[vllm-project/llm-compressor](https://github.com/vllm-project/llm-compressor)工具量化后的模型。 -目前支持的 `llm-compressor` 量化模型包括: +本指南旨在介绍如何使用 LMDeploy 的 TurboMind 推理引擎,运行经由 [vllm-project/llm-compressor](https://github.com/vllm-project/llm-compressor)工具进行int4量化后的模型。 +目前支持的 `llm-compressor-int4` 量化模型包括: -- int4 量化(例如 AWQ、GPTQ) +- AWQ、GPTQ 上述量化模型通过 TurboMind 引擎可以在以下 NVIDIA GPU 架构上运行: @@ -19,7 +19,7 @@ | 9.0 | Hopper | H20, H200, H100, GH200 | | 12.0 | Blackwell | GeForce RTX 50 series | -LMDeploy 将持续跟进并扩展对 `llm-compressor` 项目的支持。 +LMDeploy 将持续跟进并扩展对 `llm-compressor-int4` 项目的支持。 本文的其余部分由以下章节组成: @@ -33,8 +33,8 @@ LMDeploy 将持续跟进并扩展对 `llm-compressor` 项目的支持。 ## 模型量化 -`llm-compressor` 提供了丰富的模型量化[用例](https://github.com/vllm-project/llm-compressor/tree/main/examples),请参考其教程选择 LMDeploy 支持的量化算法,完成模型量化工作。 -LMDeploy 也内置了通过 `llm-compressor` 对 Qwen3-30B-A3B 进行 AWQ 量化的[脚本](https://github.com/InternLM/lmdeploy/blob/main/examples/lite/qwen3_30b_a3b_awq.py),供大家进行参考: +`llm-compressor-int4` 提供了丰富的模型量化[用例](https://github.com/vllm-project/llm-compressor/tree/main/examples),请参考其教程选择 LMDeploy 支持的量化算法,完成模型量化工作。 +LMDeploy 也内置了通过 `llm-compressor-int4` 对 Qwen3-30B-A3B 进行 AWQ 量化的[脚本](https://github.com/InternLM/lmdeploy/blob/main/examples/lite/int4/qwen3_30b_a3b_awq.py),供大家进行参考: ```shell # 创建 conda 环境 @@ -47,7 +47,7 @@ pip install llmcompressor # 下载 lmdeploy 源码,运行量化用用例 git clone https://github.com/InternLM/lmdeploy cd lmdeploy -python examples/lite/qwen3_30b_a3b_awq.py --work-dir ./qwen3_30b_a3b_awq +python examples/lite/int4/qwen3_30b_a3b_awq.py --work-dir ./qwen3_30b_a3b_awq ``` diff --git a/examples/lite/fp8/qwen3_30b_a3b_fp8.py b/examples/lite/fp8/qwen3_30b_a3b_fp8.py new file mode 100644 index 0000000000..4a2c218612 --- /dev/null +++ b/examples/lite/fp8/qwen3_30b_a3b_fp8.py @@ -0,0 +1,68 @@ +import argparse + +from compressed_tensors.offload import dispatch_model +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import QuantizationModifier +from transformers import 
AutoModelForCausalLM, AutoTokenizer + + +def parse_args(): + parser = argparse.ArgumentParser(description='Run FP8 quantization for Qwen3 model') + + parser.add_argument('--work-dir', + type=str, + default='./qwen3_30b_a3b_fp8', + required=True, + help='The directory to save the quantized model') + + parser.add_argument('--model-id', + type=str, + default='Qwen/Qwen3-30B-A3B', + help='The Hugging Face model ID to quantize') + return parser.parse_args() + +def main(): + # 1. Parse command args + args = parse_args() + MODEL_ID = args.model_id + SAVE_DIR = args.work_dir + + print(f'Loading model: {MODEL_ID}') + print(f'Saving to: {SAVE_DIR}') + + # 2. Load model and tokenizer + model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype='auto', device_map='auto', trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True) + + # 3. Configure quant args + # Configure the quantization algorithm and scheme. + # In this case, we: + # * quantize the weights to fp8 block-wise (FP8_BLOCK scheme) via ptq + # * quantize the activations to fp8 dynamically per token group + recipe = QuantizationModifier( + targets='Linear', + scheme='FP8_BLOCK', + ignore=['lm_head', 're:.*mlp.gate$'], + ) + + # 4. Run quantization + print('Starting quantization...') + oneshot(model=model, recipe=recipe) + + # 5. Confirm generations of the quantized model look sane + print('========== SAMPLE GENERATION ==============') + dispatch_model(model) + input_ids = tokenizer('Hello my name is', return_tensors='pt').input_ids.to( + model.device + ) + output = model.generate(input_ids, max_new_tokens=20) + print(tokenizer.decode(output[0])) + print('==========================================') + + # 6.
Save quantized model + print('Saving model...') + model.save_pretrained(SAVE_DIR) + tokenizer.save_pretrained(SAVE_DIR) + +if __name__ == '__main__': + main() diff --git a/examples/lite/qwen3_30b_a3b_awq.py b/examples/lite/int4/qwen3_30b_a3b_awq.py similarity index 100% rename from examples/lite/qwen3_30b_a3b_awq.py rename to examples/lite/int4/qwen3_30b_a3b_awq.py diff --git a/examples/lite/qwen3_30b_a3b_gptq.py b/examples/lite/int4/qwen3_30b_a3b_gptq.py similarity index 100% rename from examples/lite/qwen3_30b_a3b_gptq.py rename to examples/lite/int4/qwen3_30b_a3b_gptq.py diff --git a/lmdeploy/lite/apis/auto_awq.py b/lmdeploy/lite/apis/auto_awq.py index 78a3ee1bbf..3bab4cf3f8 100644 --- a/lmdeploy/lite/apis/auto_awq.py +++ b/lmdeploy/lite/apis/auto_awq.py @@ -8,12 +8,11 @@ import torch from torch import nn +from lmdeploy.lite.apis.calibrate import LAYER_TYPE_MAP, calibrate from lmdeploy.lite.quantization.awq import FC_FCS_MAP, NORM_FCS_MAP, awq_layers, quant_weights, smooth_layers from lmdeploy.lite.utils import collect_target_modules from lmdeploy.utils import try_import_deeplink -from .calibrate import LAYER_TYPE_MAP, calibrate - def save_vl_model(vl_model, model_path, dst_path): vl_model.save_pretrained(dst_path, safe_serialization=True) diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py index a4afeff552..8f54aa4778 100644 --- a/lmdeploy/turbomind/deploy/converter.py +++ b/lmdeploy/turbomind/deploy/converter.py @@ -66,7 +66,7 @@ def get_input_model_registered_name(model_path: str, model_format: str): def get_output_model_registered_name_and_config(model_path: str, model_format: str, dtype: str, group_size: int, - quantized_format: str): + quantized_format: str | None): """Get the registered name of the turbomind model and its configuration according to the input model path, format and user-input config. The name will be used to access the OUTPUT_MODELS registry. 
@@ -77,7 +77,7 @@ def get_output_model_registered_name_and_config(model_path: str, model_format: s ['hf', 'awq', 'gptq', 'compressed-tensors', 'fp8', 'mxfp4'] dtype (str): the data type of the model's weights and activations group_size (int): the quantization group size used by grouped formats - quantized_format (str): the quantized format of compressed-tensors model, + quantized_format (str | None): the quantized format of compressed-tensors model, which can be one of ['pack-quantized', 'float-quantized'] """ register_name = 'tm' @@ -118,6 +118,9 @@ def get_output_model_registered_name_and_config(model_path: str, model_format: s weight_type = 'int4' dtype = 'float16' # force float16 for int4 quantized weights elif model_format == 'compressed-tensors': + assert quantized_format in ['pack-quantized', 'float-quantized'], ( + f'compressed-tensors format must be specified as "pack-quantized" or "float-quantized", ' + f'but got "{quantized_format}"') if quantized_format == 'pack-quantized': weight_type = 'int4' model_format = 'awq' @@ -240,17 +243,14 @@ def get_tm_model(model_path, elif quant_method == 'compressed-tensors': _format = quant_config['config_groups']['group_0']['format'] assert _format in ['pack-quantized', 'float-quantized' - ], ('compressed-tennsors only supports pack-quantized/float-quantized format, ' + ], ('compressed-tensors only supports pack-quantized/float-quantized format, ' f'but got {_format}') _weights = quant_config['config_groups']['group_0']['weights'] _group_size = _weights['group_size'] _num_bits = _weights['num_bits'] _type = _weights['type'] assert (_num_bits == 4 and _type == 'int') or (_num_bits == 8 and _type == 'float'), ( - 'pack-quantized requires 4-bit int, ' - f'but got {_num_bits}-bit {_type}. 
' - 'or float-quantized requires 8-bit float, ' - f'but got {_num_bits}-bit {_type}') + f'pack-quantized requires int4 or fp8, but got type {_type} and {_num_bits} bits') else: assert 0, f'unsupported quant_config: {quant_config}' diff --git a/lmdeploy/turbomind/deploy/parameter.py b/lmdeploy/turbomind/deploy/parameter.py index 4d6f34a895..96f4d8a3e7 100644 --- a/lmdeploy/turbomind/deploy/parameter.py +++ b/lmdeploy/turbomind/deploy/parameter.py @@ -121,21 +121,6 @@ def __call__(self, f, g, i): f(i, g('weight'), 'weight', identity) -class CompressedWeight(Parameter): - KEYS = '.weight_packed', '.weight_scale', '.weight_zero_point' - - def __init__(self, xs): - self.has_zero_point = False - if any(key.endswith(self.KEYS[2]) for key in xs): - self.has_zero_point = True - - def __call__(self, f, g, i): - f(i, g('weight_packed'), 'qweight', pack_u4_row) - f(i, g('weight_scale'), 'scales', to_half, apply_gs=['w2']) - if self.has_zero_point: - f(i, g('weight_zero_point'), 'zeros', to_half, apply_gs=['w2']) - else: - f(i, generate_zero_point(g), 'zeros', to_half, apply_gs=['w2']) class Mxfp4Weight(Parameter): @@ -179,9 +164,6 @@ def get_params(keys: list[str], bias=0): ps.append(WeightScaleInv()) if WeightScale.take(keys): ps.append(WeightScale()) - xs = CompressedWeight.take(keys) - if xs: - ps.append(CompressedWeight(xs)) if Mxfp4Weight.take(keys): ps.append(Mxfp4Weight()) if Weight.take(keys):