diff --git a/.github/workflows/test_gptq.yml b/.github/workflows/test_gptq.yml
index bfb022420c..d4b9a15192 100644
--- a/.github/workflows/test_gptq.yml
+++ b/.github/workflows/test_gptq.yml
@@ -47,7 +47,7 @@ jobs:
           uv pip install torch torchvision --index-url https://download.pytorch.org/whl/cu128
           uv pip install .[tests]
           uv pip install pypcre "setuptools>=78.1.1,<82"
-          uv pip install "gptqmodel>=5.6.12" --no-build-isolation
+          uv pip install "gptqmodel>=7.0.0"

       - name: Run tests
         run: |
diff --git a/docs/source/llm_quantization/usage_guides/quantization.mdx b/docs/source/llm_quantization/usage_guides/quantization.mdx
index c02a402e36..29a2600799 100644
--- a/docs/source/llm_quantization/usage_guides/quantization.mdx
+++ b/docs/source/llm_quantization/usage_guides/quantization.mdx
@@ -1,27 +1,27 @@
 # Quantization

-## AutoGPTQ Integration
+## GPT-QModel Integration

-🤗 Optimum collaborated with [AutoGPTQ library](https://github.com/PanQiWei/AutoGPTQ) to provide a simple API that apply GPTQ quantization on language models. With GPTQ quantization, you can quantize your favorite language model to 8, 4, 3 or even 2 bits. This comes without a big drop of performance and with faster inference speed. This is supported by most GPU hardwares.
+🤗 Optimum integrates with [GPT-QModel](https://github.com/ModelCloud/GPTQModel) to provide a simple API for GPTQ quantization on language models. With GPTQ quantization, you can quantize your favorite language model to 8, 4, 3 or even 2 bits. This comes without a big drop in performance and with faster inference speed. This is supported by most GPU hardware.

 If you want to quantize 🤗 Transformers models with GPTQ, follow this [documentation](https://huggingface.co/docs/transformers/main_classes/quantization).

 To learn more about the quantization technique used in GPTQ, please refer to:
 - the [GPTQ](https://arxiv.org/pdf/2210.17323.pdf) paper
-- the [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) library used as the backend
+- the [GPT-QModel](https://github.com/ModelCloud/GPTQModel) library used as the backend

-Note that the AutoGPTQ library provides more advanced usage (triton backend, fused attention, fused MLP) that are not integrated with Optimum. For now, we leverage only the CUDA kernel for GPTQ.
+Optimum requires GPT-QModel both for GPTQ quantization and for loading GPTQ-quantized models.

 ### Requirements

 You need to have the following requirements installed to run the code below:

-- AutoGPTQ library:
-`pip install auto-gptq`
-
 - Optimum library:
 `pip install --upgrade optimum`

+- GPT-QModel:
+`pip install "gptqmodel>=7.0.0"`
+
 - Install latest `transformers` library from source:
 `pip install --upgrade git+https://github.com/huggingface/transformers.git`

@@ -74,22 +74,9 @@ empty_model.tie_weights()
 quantized_model = load_quantized_model(empty_model, save_folder=save_folder, device_map="auto")
 ```

-### Exllama kernels for faster inference
-
-With the release of exllamav2 kernels, you can get faster inference speed compared to exllama kernels for 4-bit model. It is activated by default: `disable_exllamav2=False` in [`~optimum.gptq.load_quantized_model`]. In order to use these kernels, you need to have the entire model on gpus.
-
-```py
-from optimum.gptq import GPTQQuantizer, load_quantized_model
-import torch
-
-from accelerate import init_empty_weights
-with init_empty_weights():
-    empty_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
-empty_model.tie_weights()
-quantized_model = load_quantized_model(empty_model, save_folder=save_folder, device_map="auto")
-```
+### Kernel selection for faster inference

-If you wish to use exllama kernels, you will have to change the version by setting `exllama_config`:
+When GPTQ models are loaded through GPT-QModel, the runtime automatically selects an appropriate inference kernel for the current hardware and quantized model. In the common case, you should not pass `backend` explicitly.

 ```py
 from optimum.gptq import GPTQQuantizer, load_quantized_model
@@ -99,10 +86,14 @@ from accelerate import init_empty_weights
 with init_empty_weights():
     empty_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
 empty_model.tie_weights()
-quantized_model = load_quantized_model(empty_model, save_folder=save_folder, device_map="auto", exllama_config = {"version":1})
+quantized_model = load_quantized_model(
+    empty_model,
+    save_folder=save_folder,
+    device_map="auto",
+)
 ```

-Note that only 4-bit models are supported with exllama/exllamav2 kernels for now. Furthermore, it is recommended to disable exllama/exllamav2 kernels when you are finetuning your model with peft.
+If you are finetuning with PEFT, prefer the default automatic backend selection unless you have a specific reason to override it.

 You can find the benchmark of these kernels [here](https://github.com/huggingface/optimum/tree/main/tests/benchmark#gptq-benchmark)
diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py
index 713997abd1..2bb5222fe3 100644
--- a/optimum/gptq/quantizer.py
+++ b/optimum/gptq/quantizer.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2023 HuggingFace Inc. team and GPTQ and AutoGPTQ authors.
+# Copyright 2023 HuggingFace Inc. team and GPTQ authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -81,7 +81,6 @@ def __init__(
         module_name_preceding_first_block: Optional[List[str]] = None,
         batch_size: int = 1,
         pad_token_id: Optional[int] = None,
-        max_input_length: Optional[int] = None,
         cache_block_outputs: Optional[bool] = True,
         modules_in_block_to_quantize: Optional[List[List[str]]] = None,
         format: str = "gptq",
@@ -125,9 +124,6 @@ def __init__(
                 The batch size of the dataset
             pad_token_id (`Optional[int]`, defaults to `None`):
                 The pad token id. Needed to prepare the dataset when `batch_size` > 1.
-            max_input_length (`Optional[int]`, defaults to `None`):
-                The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length.
-                It is specific to the exllama backend with act-order.
             cache_block_outputs (`bool`, defaults to `True`):
                 Whether to cache block outputs to reuse as inputs for the succeeding block. It allows optimization of non-standard models
                 (e.g. ChatGLM) but can require more time.
@@ -136,12 +132,12 @@ def __init__(
                 The block to quantize can be specified by setting `block_name_to_quantize`. We will quantize each list sequentially.
                 If not set, we will quantize all linear layers. Example: `inside_layer_modules=[["self_attention.query_key_value"], ["mlp.dense_h_to_4h"]]`
             format (`str`, *optional*, defaults to `gptq`):
-                GPTQ weight format. `gptq`(v1) is supported by both gptqmodel and auto-gptq. `gptq_v2` is gptqmodel only.
+                GPTQ weight format. `gptq`(v1) is used for broad checkpoint compatibility. `gptq_v2` is GPT-QModel only.
             meta (`Dict[str, any]`, *optional*):
                 Properties, such as tooling:version, that do not directly contributes to quantization or quant inference are stored in meta.
-                i.e. `meta.quantizer`: ["optimum:_version_", "gptqmodel:_version_"]
+                For example, `meta.quantizer` can store version tags for Optimum and GPT-QModel.
             backend (`str`, *optional*):
-                Controls which gptq kernel to be used. Valid values for gptqmodel are `auto`, `auto_trainable` and more. For auto-gptq, only valid value is None and `auto_trainable`. Ref gptqmodel backends: https://github.com/ModelCloud/GPTQModel/blob/main/gptqmodel/utils/backend.py
+                Controls which GPTQ kernel to use. Valid values come from GPT-QModel backends such as `auto` and `auto_trainable`. See the GPT-QModel backends reference: https://github.com/ModelCloud/GPTQModel/blob/main/gptqmodel/utils/backend.py
         """

         self.bits = bits
@@ -160,7 +156,6 @@ def __init__(
         self.module_name_preceding_first_block = module_name_preceding_first_block
         self.batch_size = batch_size
         self.pad_token_id = pad_token_id
-        self.max_input_length = max_input_length
         self.quant_method = QuantizationMethod.GPTQ
         self.cache_block_outputs = cache_block_outputs
         self.modules_in_block_to_quantize = modules_in_block_to_quantize
@@ -320,37 +315,16 @@ def _replace_by_quant_layers(self, module: nn.Module, names: List[str], name: st
             in_features = layer.weight.shape[0]
             out_features = layer.weight.shape[1]
             bias = layer.bias is not None
-            if is_gptqmodel_available():
-                new_layer = self.quant_linear(
-                    self.bits,
-                    self.group_size,
-                    self.desc_act,
-                    self.sym,
-                    in_features,
-                    out_features,
-                    bias,
-                    weight_dtype=layer.weight.dtype,
-                )
-            else:
-                if not (self.desc_act) or self.group_size == -1:
-                    new_layer = self.quant_linear(
-                        self.bits,
-                        self.group_size,
-                        in_features,
-                        out_features,
-                        bias,
-                        use_cuda_fp16=self.use_cuda_fp16,
-                        weight_dtype=layer.weight.dtype,
-                    )
-                else:
-                    new_layer = self.quant_linear(
-                        self.bits,
-                        self.group_size,
-                        in_features,
-                        out_features,
-                        bias,
-                        weight_dtype=layer.weight.dtype,
-                    )
+            new_layer = self.quant_linear(
+                self.bits,
+                self.group_size,
+                self.desc_act,
+                self.sym,
+                in_features,
+                out_features,
+                bias,
+                weight_dtype=layer.weight.dtype,
+            )
             new_layer.device = device
             setattr(module, attr, new_layer.to(device))
         for name1, child in module.named_children():
@@ -378,30 +352,13 @@ def quantize_model(self, model: nn.Module, tokenizer: Optional[Any] = None):

         if not is_gptqmodel_available():
             raise RuntimeError(
-                "gptqmodel is required in order to perform gptq quantization: `pip install gptqmodel`. Please notice that auto-gptq will be deprecated in the future."
-            )
-
-        gptq_supports_cpu = is_gptqmodel_available()
-
-        if not gptq_supports_cpu and not torch.cuda.is_available():
-            raise RuntimeError(
-                "No cuda gpu or cpu support using Intel/IPEX found. A gpu or cpu with Intel/IPEX is required for quantization."
-            )
-
-        if not self.sym and not is_gptqmodel_available():
-            raise ValueError(
-                "Asymmetric sym=False quantization is not supported with auto-gptq. Please use gptqmodel: `pip install gptqmodel`"
-            )
-
-        if self.format == "gptq_v2" and not is_gptqmodel_available():
-            raise ValueError(
-                "gptq_v2 format only supported with gptqmodel. Please install gptqmodel: `pip install gptqmodel`"
+                "GPT-QModel is required in order to perform gptq quantization: `pip install gptqmodel>=7.0.0`."
             )

         model.eval()

-        # gptqmodel internal is gptq_v2 for asym support, gptq(v1) can only support sym=True
-        if is_gptqmodel_available() and self.format != "gptq_v2":
+        # GPT-QModel internal is gptq_v2 for asym support, gptq(v1) can only support sym=True
+        if self.format != "gptq_v2":
             self.format = "gptq_v2"

         # For Transformer model
@@ -483,8 +440,6 @@ def quantize_model(self, model: nn.Module, tokenizer: Optional[Any] = None):
             blocks = recurse_getattr(model, self.block_name_to_quantize)

         cur_layer_device = get_device(blocks[0])
-        if not is_gptqmodel_available() and cur_layer_device.type == "cpu":
-            cur_layer_device = 0

         if not has_device_map:
             # put modules from module_name_preceding_first_block on cuda or xpu or cpu
@@ -555,8 +510,6 @@ def store_input_hook(module, args, kwargs):
                 block = block.to(0)
             layers = get_layers(block)
             block_device = get_device(block)
-            if not is_gptqmodel_available() and block_device.type == "cpu":
-                block_device = 0
             if isinstance(self.modules_in_block_to_quantize, list) and len(self.modules_in_block_to_quantize) > 0:
                 if self.true_sequential:
                     layers_name_list = self.modules_in_block_to_quantize
@@ -669,18 +622,6 @@ class StoreAttr(object):
             model.quantize_config = StoreAttr()
             model.quantize_config.desc_act = self.desc_act
             model = gptq_post_init(model, use_act_order=self.desc_act)
-            # Keep this compatibility guard for older gptqmodel versions where EXLLAMA_V1 still exists.
-            # This branch can be removed once we bump the minimum gptqmodel version and drop v1 support.
-            if (
-                hasattr(BACKEND, "EXLLAMA_V1")
-                and self.backend == BACKEND.EXLLAMA_V1
-                and self.desc_act
-                and self.max_input_length is not None
-            ):
-                from gptqmodel import exllama_set_max_input_length
-
-                model = exllama_set_max_input_length(model, self.max_input_length)
-
         return model

     def pack_model(
@@ -746,7 +687,7 @@ def save(self, model: nn.Module, save_dir: str, max_shard_size: str = "10GB", sa
         """

-        # convert gptqmodel internal gptq_v2 format to v1 for max compatibility
+        # Convert GPT-QModel internal gptq_v2 format to v1 for max compatibility.
         if is_gptqmodel_available():
             model, converted = hf_convert_gptq_v2_to_v1_format(
                 model, self.sym, self.bits, self.quant_linear, self.format, self.meta
@@ -772,7 +713,6 @@ def load_quantized_model(
     offload_folder: Optional[str] = None,
     offload_buffers: Optional[str] = None,
     offload_state_dict: bool = False,
-    max_input_length: Optional[int] = None,
 ):
     """
     Load quantized weights from the save_folder into the converted model and dispatch the weights according to the device_map.
@@ -805,16 +745,13 @@ def load_quantized_model(
             If `True`, will temporarily offload the CPU state dict on the hard drive to avoid getting out of CPU RAM if the weight of the CPU state dict
             + the biggest shard does not fit. Will default to `True` if the device map picked contains `"disk"` values.
-        max_input_length (`Optional[int]`, defaults to `None`):
-            The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length.
-            It is specific to the exllama backend with act-order.

     Returns:
         `nn.Module`: The quantized model
     """

     if not is_gptqmodel_available():
         raise RuntimeError(
-            "gptqmodel (`pip install gptqmodel`) is required in order to load quantized weights. Please notice that auto-gptq will be deprecated in the future."
+            "GPT-QModel (`pip install gptqmodel>=7.0.0`) is required in order to load quantized weights."
         )
     if not is_accelerate_available():
         raise RuntimeError(
@@ -844,7 +781,6 @@ def load_quantized_model(
     quantizer = GPTQQuantizer.from_dict(quantize_config_dict)
     quantizer.backend = backend
-    quantizer.max_input_length = max_input_length

     model = quantizer.convert_model(model, device_map=device_map)
diff --git a/optimum/utils/__init__.py b/optimum/utils/__init__.py
index 0377d9321e..0cdd93f56d 100644
--- a/optimum/utils/__init__.py
+++ b/optimum/utils/__init__.py
@@ -28,7 +28,6 @@
     ONNX_WEIGHTS_NAME,
 )
 from .import_utils import (
-    AUTOGPTQ_MINIMUM_VERSION,
     DIFFUSERS_MINIMUM_VERSION,
     GPTQMODEL_MINIMUM_VERSION,
     ORT_QUANTIZE_MINIMUM_VERSION,
diff --git a/optimum/utils/import_utils.py b/optimum/utils/import_utils.py
index 54f4f22709..7bc39f5634 100644
--- a/optimum/utils/import_utils.py
+++ b/optimum/utils/import_utils.py
@@ -29,8 +29,7 @@
 TORCH_MINIMUM_VERSION = version.parse("2.1.0")
 TRANSFORMERS_MINIMUM_VERSION = version.parse("4.36.0")
 DIFFUSERS_MINIMUM_VERSION = version.parse("0.22.0")
-AUTOGPTQ_MINIMUM_VERSION = version.parse("0.4.99")  # Allows 0.5.0.dev0
-GPTQMODEL_MINIMUM_VERSION = version.parse("1.6.0")
+GPTQMODEL_MINIMUM_VERSION = version.parse("7.0.0")
 ORT_QUANTIZE_MINIMUM_VERSION = version.parse("1.4.0")

 # TODO: remove as optimm-onnx requires >=1.15.0
@@ -226,7 +225,8 @@ def is_gptqmodel_available():
             return True
         else:
             raise ImportError(
-                f"Found an incompatible version of gptqmodel. Found version {v}, but only version >= {GPTQMODEL_MINIMUM_VERSION} are supported"
+                "Found an incompatible version of GPT-QModel (`gptqmodel` package). "
+                f"Found version {v}, but only versions >= {GPTQMODEL_MINIMUM_VERSION} are supported"
             )
diff --git a/optimum/utils/testing_utils.py b/optimum/utils/testing_utils.py
index 2f3a1a9408..2969c616e6 100644
--- a/optimum/utils/testing_utils.py
+++ b/optimum/utils/testing_utils.py
@@ -60,9 +60,9 @@ def require_accelerate(test_case):

 def require_gptqmodel(test_case):
     """
-    Decorator marking a test that requires auto-gptq. These tests are skipped when auto-gptq isn't installed.
+    Decorator marking a test that requires GPT-QModel. These tests are skipped when `gptqmodel` isn't installed.
     """
-    return unittest.skipUnless(is_gptqmodel_available(), "test requires gptqmodel")(test_case)
+    return unittest.skipUnless(is_gptqmodel_available(), "test requires GPT-QModel")(test_case)


 def require_torch_gpu(test_case):
diff --git a/tests/gptq/test_quantization.py b/tests/gptq/test_quantization.py
index 20e2e3083c..4a51b06ef9 100644
--- a/tests/gptq/test_quantization.py
+++ b/tests/gptq/test_quantization.py
@@ -31,7 +31,7 @@


 if is_gptqmodel_available():
-    from gptqmodel import BACKEND, GPTQModel
+    from gptqmodel import GPTQModel
     from gptqmodel.quantization import FORMAT, METHOD
     from gptqmodel.utils.importer import hf_select_quant_linear_v2


@@ -55,14 +55,12 @@ class GPTQTest(unittest.TestCase):
     sym = True
     desc_act = False
     act_group_aware = True
-    quant_backend = BACKEND.AUTO
-    load_backend = BACKEND.AUTO
     cache_block_outputs = True
     modules_in_block_to_quantize = None
     device_map_for_quantization = "cuda"
     device_for_inference = 0
     dataset = [
-        "gptqmodel is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
+        "GPT-QModel is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
     ]

     # called only once for all tests in this class
@@ -89,7 +87,6 @@ def setUpClass(cls):
             sym=cls.sym,
             desc_act=cls.desc_act,
             act_group_aware=cls.act_group_aware,
-            backend=cls.quant_backend,
             cache_block_outputs=cls.cache_block_outputs,
             modules_in_block_to_quantize=cls.modules_in_block_to_quantize,
         )
@@ -129,19 +126,24 @@ def test_quantized_layers_class(self):
             format=FORMAT.GPTQ,
             quant_method=METHOD.GPTQ,
             device_map=self.device_map_for_quantization,
-            backend=self.quant_backend,
             pack=True,
         )
         self.assertEqual(self.quantized_model.transformer.h[0].mlp.dense_4h_to_h.__class__, QuantLinear)

-    def check_quantized_layers_type(self, model, value):
-        self.assertEqual(model.transformer.h[0].mlp.dense_4h_to_h.QUANT_TYPE, value)
-
-    def test_serialization(self):
-        """
-        Test the serialization of the model and the loading of the quantized weights
-        """
+    def check_quantized_layers_class(self, model):
+        QuantLinear = hf_select_quant_linear_v2(
+            bits=self.bits,
+            group_size=self.group_size,
+            desc_act=self.desc_act,
+            sym=self.sym,
+            format=FORMAT.GPTQ,
+            quant_method=METHOD.GPTQ,
+            device_map={"": self.device_for_inference},
+            pack=True,
+        )
+        self.assertEqual(model.transformer.h[0].mlp.dense_4h_to_h.__class__, QuantLinear)
+
+    def run_serialization_round_trip(self):
         with tempfile.TemporaryDirectory() as tmpdirname:
             self.tokenizer.save_pretrained(tmpdirname)
             self.quantizer.save(self.quantized_model, tmpdirname)
@@ -155,17 +157,23 @@ def test_serialization(self):
                 empty_model,
                 save_folder=tmpdirname,
                 device_map={"": self.device_for_inference},
-                backend=self.load_backend,
             )
-            self.check_quantized_layers_type(quantized_model_from_saved, "marlin")
+            self.check_quantized_layers_class(quantized_model_from_saved)

-            # transformers and gptqmodel compatibility
+            # Transformers and GPT-QModel compatibility.
             # quantized models are more compatible with device map than
             # device context managers (they're never used in transformers testing suite)
             _ = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map={"": self.device_for_inference})
             _ = GPTQModel.load(tmpdirname, device_map={"": self.device_for_inference})

+    def test_serialization(self):
+        """
+        Test the serialization of the model and the loading of the quantized weights
+        """
+
+        self.run_serialization_round_trip()
+

 class GPTQTestCPUInit(GPTQTest):
     device_map_for_quantization = "cpu"
@@ -181,73 +189,11 @@ class GPTQTestActOrder(GPTQTest):
     expected_quantized_perplexity = 33

     def test_serialization(self):
-        # act_order don't work with qlinear_cuda kernel
-        pass
-
-    def test_exllama_serialization(self):
         """
-        Test the serialization of the model and the loading of the quantized weights with exllama kernel
+        Test the serialization and post-quant load flow for act-order models.
""" - with tempfile.TemporaryDirectory() as tmpdirname: - self.tokenizer.save_pretrained(tmpdirname) - self.quantizer.save(self.quantized_model, tmpdirname) - self.quantized_model.config.save_pretrained(tmpdirname) - with init_empty_weights(): - empty_model = AutoModelForCausalLM.from_config( - AutoConfig.from_pretrained(self.model_name), torch_dtype=torch.float16 - ) - empty_model.tie_weights() - quantized_model_from_saved = load_quantized_model( - empty_model, - save_folder=tmpdirname, - device_map={"": self.device_for_inference}, - backend=BACKEND.EXLLAMA_V2, - ) - self.check_quantized_layers_type(quantized_model_from_saved, "exllamav2") - - # transformers and gptqmodel compatibility - # quantized models are more compatible with device map than - # device context managers (they're never used in transformers testing suite) - _ = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map={"": self.device_for_inference}) - _ = GPTQModel.load(tmpdirname, device_map={"": self.device_for_inference}) - - -class GPTQTestExllamav2(GPTQTest): - desc_act = False - load_backend = BACKEND.EXLLAMA_V2 - - def test_serialization(self): - # don't need to test - pass - - def test_exllama_serialization(self): - """ - Test the serialization of the model and the loading of the quantized weights with exllamav2 kernel - """ - - with tempfile.TemporaryDirectory() as tmpdirname: - self.tokenizer.save_pretrained(tmpdirname) - self.quantizer.save(self.quantized_model, tmpdirname) - self.quantized_model.config.save_pretrained(tmpdirname) - with init_empty_weights(): - empty_model = AutoModelForCausalLM.from_config( - AutoConfig.from_pretrained(self.model_name), torch_dtype=torch.float16 - ) - empty_model.tie_weights() - quantized_model_from_saved = load_quantized_model( - empty_model, - backend=self.load_backend, - save_folder=tmpdirname, - device_map={"": self.device_for_inference}, - ) - self.check_quantized_layers_type(quantized_model_from_saved, "exllamav2") - - # transformers and gptqmodel compatibility - # quantized models are more compatible with device map than - # device context managers (they're never used in transformers testing suite) - _ = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map={"": self.device_for_inference}) - _ = GPTQModel.load(tmpdirname, device_map={"": self.device_for_inference}) + self.run_serialization_round_trip() class GPTQTestNoBlockCaching(GPTQTest): @@ -268,6 +214,50 @@ def test_not_converted_layers(self): self.assertEqual(self.quantized_model.transformer.h[0].self_attention.dense.__class__.__name__, "Linear") +@require_gptqmodel +class GPTQPostInitTest(unittest.TestCase): + def test_post_init_model_with_real_quant_linear(self): + quantizer = GPTQQuantizer( + bits=4, + dataset=["gptq"], + desc_act=True, + act_group_aware=False, + ) + quantizer.quant_linear = hf_select_quant_linear_v2( + bits=quantizer.bits, + group_size=quantizer.group_size, + desc_act=quantizer.desc_act, + sym=quantizer.sym, + format=FORMAT.GPTQ, + quant_method=METHOD.GPTQ, + device_map={"": "cpu"}, + pack=True, + ) + + class Wrapper(torch.nn.Module): + # Minimal module tree that exercises the real GPT-QModel conversion and post-init path. 
+            def __init__(self):
+                super().__init__()
+                self.layer = quantizer.quant_linear(
+                    bits=quantizer.bits,
+                    group_size=quantizer.group_size,
+                    sym=quantizer.sym,
+                    desc_act=quantizer.desc_act,
+                    in_features=32,
+                    out_features=32,
+                    bias=False,
+                )
+
+        model = Wrapper()
+        self.assertEqual(model.layer.qzero_format(), 1)
+
+        result = quantizer.post_init_model(model)
+
+        self.assertIs(result, model)
+        self.assertTrue(model.quantize_config.desc_act)
+        self.assertEqual(model.layer.qzero_format(), 2)
+
+
 class GPTQUtilsTest(unittest.TestCase):
     """
     Test utilities