From 80c92da062d7596bd3fceba77d674d97b96fe450 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 2 Nov 2025 20:49:14 -0800 Subject: [PATCH 01/57] add auto-round Signed-off-by: yiliu30 --- examples/quantization_w4a16/llama3_example.py | 2 ++ src/llmcompressor/modifiers/quantization/__init__.py | 1 + src/llmcompressor/modifiers/quantization/gptq/base.py | 2 +- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/quantization_w4a16/llama3_example.py b/examples/quantization_w4a16/llama3_example.py index b03aacee35..038c0ebc9f 100644 --- a/examples/quantization_w4a16/llama3_example.py +++ b/examples/quantization_w4a16/llama3_example.py @@ -7,6 +7,8 @@ # Select model and load it. model_id = "meta-llama/Meta-Llama-3-8B-Instruct" +model_dir="/storage/yiliu7" +model_id=f"{model_dir}/meta-llama/Meta-Llama-3.1-8B-Instruct" model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(model_id) diff --git a/src/llmcompressor/modifiers/quantization/__init__.py b/src/llmcompressor/modifiers/quantization/__init__.py index f6ad149fbb..106128f046 100644 --- a/src/llmcompressor/modifiers/quantization/__init__.py +++ b/src/llmcompressor/modifiers/quantization/__init__.py @@ -3,3 +3,4 @@ from .cache import * from .gptq import * from .quantization import * +from .autoround import * \ No newline at end of file diff --git a/src/llmcompressor/modifiers/quantization/gptq/base.py b/src/llmcompressor/modifiers/quantization/gptq/base.py index 09f3e681c4..126a1f6556 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/base.py +++ b/src/llmcompressor/modifiers/quantization/gptq/base.py @@ -262,7 +262,7 @@ def compress_modules(self): percdamp=self.dampening_frac, ) comp_logger.set_loss(loss) - + breakpoint() update_offload_parameter(module, "weight", quantized_weight) update_offload_parameter(module, "weight_scale", scale) update_offload_parameter(module, "weight_zero_point", zero_point) From 3266b79b3a038cc128fd7e506f6a5f1f78e23331 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 2 Nov 2025 21:50:33 -0800 Subject: [PATCH 02/57] add auto-round modifier Signed-off-by: yiliu30 --- .../modifiers/quantization/autoround/base.py | 415 ++++++++++++++++++ 1 file changed, 415 insertions(+) create mode 100644 src/llmcompressor/modifiers/quantization/autoround/base.py diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py new file mode 100644 index 0000000000..7ad9bf3e8d --- /dev/null +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -0,0 +1,415 @@ +import contextlib +from typing import Dict, List, Optional, Tuple, Union + +import torch +from compressed_tensors.quantization import ( + QuantizationConfig, + QuantizationScheme, + QuantizationStrategy, +) +from compressed_tensors.quantization.quant_args import ActivationOrdering +from compressed_tensors.utils import ( + align_module_device, + get_execution_device, + getattr_chain, + match_named_modules, + update_offload_parameter, +) +from loguru import logger +from pydantic import PrivateAttr + +from llmcompressor.core import Event, EventType, State +from llmcompressor.modifiers import Modifier +from llmcompressor.modifiers.quantization.gptq.gptq_quantize import ( + accumulate_hessian, + make_empty_hessian, + quantize_weight, +) +from llmcompressor.modifiers.quantization.quantization import QuantizationMixin +from llmcompressor.sentinel import Sentinel +from llmcompressor.utils.metric_logging import 
CompressionLogger + +__all__ = ["AutoRoundModifier"] + + +from collections import defaultdict +import os + +FALLBACK_CHANGE = os.environ.get("FALLBACK_CHANGE", "0").lower() in ("1", "true", "yes") +_DEBUG = os.environ.get("DEBUG", "0").lower() in ("1", "true", "yes") + +all_module_input = defaultdict(list) +all_module_output = defaultdict(list) + + +def input_capture_hook(module, *args, **kwargs): + all_module_input[module._tmp_name].append((args, kwargs)) + + +def output_capture_hook(module, *args, **kwargs): + all_module_output[module._tmp_name].append((args, kwargs)) + + +def normalize_input(cur_inputs): + # TODO: move it to auto-round + input_ids = [] + input_others = {} + positional_inputs = [] + attention_mask = None + position_ids = None + cache_position = None + position_embeddings = (None, None) + for cur_inp in cur_inputs: + input_ids.append(cur_inp[0][0][0]) + for key, val in cur_inp[0][1].items(): + if key == "position_ids": + position_ids = val + elif key == "position_embeddings": + position_embeddings = val + elif key == "cache_position": + cache_position = val + input_others["position_ids"] = position_ids + input_others["positional_inputs"] = positional_inputs + input_others["attention_mask"] = attention_mask + input_others["position_embeddings"] = position_embeddings + input_others["cache_position"] = cache_position + return input_ids, input_others + + +def _is_decoding_layer(module, name): + return "decoderlayer" in module.__class__.__name__.lower() + + +class _LLModelWrapper(torch.nn.Module): + def __init__(self): + super().__init__() + self.layers = torch.nn.ModuleList() + + def forward(self, *args, **kwargs): + for layer in self.layers: + res = layer(*args, **kwargs) + return res + + +class _PretrainModelWrapper(torch.nn.Module): + def __init__(self): + super().__init__() + self.model = _LLModelWrapper() + + def forward(self, *args, **kwargs): + return self.model(*args, **kwargs) + + +def _wrap_decoding_layer(layer: torch.nn.Module) -> _PretrainModelWrapper: + wrapped_model = _PretrainModelWrapper() + wrapped_model.model.layers.append(layer) + first_param = next(layer.parameters()) + wrapped_model.dtype = first_param.dtype + return wrapped_model + + + +class AutoRoundModifier(Modifier, QuantizationMixin): + """ + Implements the GPTQ algorithm from https://arxiv.org/abs/2210.17323. This modifier + uses activations to calibrate a hessian matrix, which is then used to determine + optimal quantizion values and orderings for the model weights. + + | Sample yaml: + | test_stage: + | obcq_modifiers: + | AutoRoundModifier: + | block_size: 128 + | dampening_frac: 0.001 + | offload_hessians: False + | actorder: static + | config_groups: + | group_0: + | targets: + | - "Linear" + | input_activations: null + | output_activations: null + | weights: + | num_bits: 8 + | type: "int" + | symmetric: true + | strategy: group + | group_size: 128 + + Lifecycle: + - on_initialize + - apply config to model + - on_start + - add activation calibration hooks + - add gptq weight calibration hooks + - on_sequential_epoch_end + - quantize_weight + - on_finalize + - remove_hooks() + - model.apply(freeze_module_quantization) + + :param sequential_targets: list of layer names to compress during GPTQ, or + '__ALL__' to compress every layer in the model + :param block_size: Used to determine number of columns to compress in one pass + :param dampening_frac: Amount of dampening to apply to H, as a fraction of the + diagonal norm + :param actorder: order in which weight columns are quantized. 
Defaults to "static" + activation ordering, which achieves best accuracy recovery with no runtime cost. + For more information, see https://github.com/vllm-project/vllm/pull/8135 + :param offload_hessians: Set to True for decreased memory usage but increased + runtime. + + :param config_groups: dictionary specifying quantization schemes to apply to target + modules. Modules not matching a scheme target will NOT be quantized. + :param targets: list of layer names to quantize if a scheme is provided. Defaults + to Linear layers + :param ignore: optional list of module class names or submodule names to not + quantize even if they match a target in config_groups. Defaults to empty list. + :param scheme: a single quantization scheme to apply to the model. This is a + dictionary that supports all keys from QuantizationScheme except targets, which + will be set to the targets parameter set at the modifier level. Can also be set + to a dictionary of the format `preset_scheme_name: targets` for example: + `W8A8: ['Linear']` for weight and activation 8-bit. + :param kv_cache_scheme: optional QuantizationArgs, that specify the + quantization of the kv cache. If None, kv cache is not quantized. + When applying kv cache quantization to transformer AutoModelForCausalLM, + the kv_cache_scheme gets converted into a QuantizationScheme that: + - targets the `q_proj` and `k_proj` modules of the model. The outputs + of those modules are the keys and values that might be cached + - quantizes the outputs of the aformentioned layers, so that + keys and values are compressed before storing them in the cache + There is an explicit assumption that the model contains modules with + `k_proj` and `v_proj` in their names. If this is not the case + and kv_cache_scheme != None, the quantization of kv cache will fail + """ + + # gptq modifier arguments + sequential_targets: Union[str, List[str], None] = None + block_size: int = 128 + dampening_frac: Optional[float] = 0.01 + # TODO: this does not serialize / will be incorrectly written + actorder: Optional[Union[ActivationOrdering, Sentinel]] = Sentinel("static") + offload_hessians: bool = False + + # private variables + _module_names: Dict[torch.nn.Module, str] = PrivateAttr(default_factory=dict) + _hessians: Dict[torch.nn.Module, torch.Tensor] = PrivateAttr(default_factory=dict) + _num_samples: Dict[torch.nn.Module, int] = PrivateAttr(default_factory=dict) + + _cur_layer_idx = PrivateAttr(default=0) + + + def resolve_quantization_config(self) -> QuantizationConfig: + config = super().resolve_quantization_config() + + def resolve_actorder(existing): + # sentinel default only overrides if existing is None + if self.actorder == Sentinel("static"): + return ActivationOrdering.STATIC if existing is None else existing + + # user-provided value always attempts to override + if existing is None or self.actorder == existing: + return self.actorder + + # if existing provided and conflicts + raise ValueError( + "Cannot resolve activation ordering when both " + "`AutoRoundModifier.actorder` and `QuantizationScheme.actorder` " + f"are provided and differ ({self.actorder}, {existing}). " + "Either unset `AutoRoundModifier.actorder` or " + "remove `actorder` from config groups." 
+ ) + + for scheme in config.config_groups.values(): + assert isinstance(scheme, QuantizationScheme) + if ( + getattr_chain(scheme, "weights.strategy", None) + == QuantizationStrategy.GROUP + ): + scheme.weights.actorder = resolve_actorder(scheme.weights.actorder) + return config + + def on_initialize(self, state: State, **kwargs) -> bool: + """ + Initialize and run the GPTQ algorithm on the current state + + :param state: session state storing input model and calibration data + """ + # apply config to model and prepare calibration hooks + if QuantizationMixin.has_config(self): + QuantizationMixin.initialize_quantization(self, state.model) + + # prepare module names + self._module_names = { + m: name + for name, m in match_named_modules( + state.model, self.targets, self.ignore + ) + } + # add tmp name for each module for debugging + for name, mod in state.model.named_modules(): + mod._tmp_name = name + # freeze all model parameters + for name, param in state.model.named_parameters(): + param.requires_grad_(False) + + return True + + + def start_calibration(self, model: torch.nn.Module): + """ + Register activation calibration hooks (including kv_cache quantization) and enable quantization as we calibrate + + :param model: model to prepare for calibration + """ + + from compressed_tensors.quantization import enable_quantization + from llmcompressor.modifiers.quantization.calibration import apply_calibration_status + for _, module in match_named_modules(model, self.targets, self.ignore): + # Note: No need to register observers for auto-round + # self._initialize_observers(module) + self._calibration_hooks |= self._initialize_hooks(module) + apply_calibration_status(module) + + model.apply(enable_quantization) # quantize at the same time as calibrate + + + def on_start(self, state: State, event: Event, **kwargs): + self.started_ = True + + # register quantization calibration hooks + # assume quantization has been initialized by this modifier or one before it + # Replace it with call to self.start_calibration + # QuantizationMixin.start_calibration(self, state.model) + self.start_calibration( state.model) + for name, module in state.model.named_modules(): + if _is_decoding_layer(module, name): + # register input/output capture hooks for decoding layers + logger.warning(f">> Registering input/output capture hooks for decoding layer {getattr(module, '_tmp_name', '')} || {name}") + module.register_forward_pre_hook(input_capture_hook, with_kwargs=True) + module.register_forward_hook(output_capture_hook, with_kwargs=True) + + + def on_event(self, state: State, event: Event, **kwargs): + if event.type_ == EventType.CALIBRATION_EPOCH_START: + if not self.started_: + self.on_start(state, None) + + if event.type_ == EventType.SEQUENTIAL_EPOCH_END: + self.autoround(state) + + if event.type_ == EventType.CALIBRATION_EPOCH_END: + if not self.ended_: + self.on_end(state, None) + + def autoround(self, state): + cur_layer_idx = self._cur_layer_idx + self._cur_layer_idx += 1 + logger.info(f">>||>> AutoRound for decoding layer index {cur_layer_idx}") + if cur_layer_idx >= len(state.model.model.layers): + logger.info( + f">>||>> All decoding layers have been processed for AutoRound." 
+ ) + # self.compress_modules(return_directly=False) + return + decoding_layer = state.model.model.layers[cur_layer_idx] + logger.debug( + f">>||>> Strating AutoRound for decoding layer {getattr(decoding_layer, '_tmp_name', '')}" + ) + + wrapped_model = _wrap_decoding_layer(decoding_layer) + + with torch.enable_grad(), align_module_device(decoding_layer): + if _DEBUG: + iters = 4 + else: + iters = 200 + import auto_round + + ar = auto_round.AutoRound( + model=wrapped_model, + tokenizer="", + scheme="W4A16", + iters=iters, + enable_quanted_input=False, + # FIXME: batch size 1 causes error, looks like related to the input_others prepare + # batch_size=1 + # enable_torch_compile=True, + # enable_deterministic_algorithms=True, + ) + + ar.configure_layer_config() + + input_name = f"model.layers.{cur_layer_idx}" + cur_inputs = all_module_input[input_name] + input_ids, input_others = normalize_input(cur_inputs) + decoding_layer.tuning_device = torch.device("cuda") + + ar.quantize_block( + block=decoding_layer, + input_ids=input_ids, + input_others=input_others, + q_input=None, + device="cuda", + ) + # Update offload parameters and remove temporary attributes + for name, module in decoding_layer.named_modules(): + if hasattr(module, "weight_scale") and hasattr( + module, "weight_zero_point" + ): + logger.debug( + f"Updating offload parameters for module {getattr(module, '_tmp_name', '')} || {name}" + ) + # weight = module.weight + weight_scale = module.scale + del module.scale + del module.zp + # TODO: update weight as well + # breakpoint() + + update_offload_parameter(module, "weight_scale", weight_scale) + + for module in list(self._num_samples.keys()): + name = self._module_names[module] + del self._num_samples[module] + decoding_layer.eval() + all_module_input.clear() + all_module_output.clear() + + + def on_end(self, state: State, event: Event, **kwargs): + """ + Finish calibrating by removing observers and calibration hooks + """ + self.ended_ = True + QuantizationMixin.end_calibration(self, state.model) + self.remove_hooks() # remove gptq hooks + + def on_finalize(self, state: State, **kwargs) -> bool: + """ + disable the quantization observers used by the OBCQ algorithm + + :param state: session state storing input model and calibration data + """ + if not self.ended_: + self.on_end(state, None) + + if len(self._num_samples) > 0: + raise ValueError(f"Failed to compress {len(self._num_samples)} modules") + + self._hessians = dict() + self._num_samples = dict() + + return True + + @contextlib.contextmanager + def _maybe_onload_hessian(self, module: torch.nn.Module): + if self.offload_hessians: + device = get_execution_device(module) + self._hessians[module] = self._hessians[module].to(device=device) + + yield + + if self.offload_hessians: + if module in self._hessians: # may have been deleted in context + self._hessians[module] = self._hessians[module].to(device="cpu") From 9c537ccaf73e7f0e785fcccd176ca0fdbf1be598 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 2 Nov 2025 22:14:42 -0800 Subject: [PATCH 03/57] refine code Signed-off-by: yiliu30 --- .../pipelines/layer_sequential/pipeline.py | 3 ++- src/llmcompressor/pipelines/sequential/pipeline.py | 10 +++++++--- src/llmcompressor/utils/helpers.py | 3 +++ 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/llmcompressor/pipelines/layer_sequential/pipeline.py b/src/llmcompressor/pipelines/layer_sequential/pipeline.py index 244edde87e..54d59e948a 100644 --- a/src/llmcompressor/pipelines/layer_sequential/pipeline.py +++ 
b/src/llmcompressor/pipelines/layer_sequential/pipeline.py
@@ -19,6 +19,7 @@
 from llmcompressor.pipelines.sequential.helpers import (
     dispatch_for_sequential,
     get_sequential_targets,
+    DISABLE_QAC_MODIFIERS,
 )
 from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context
 
@@ -72,7 +73,7 @@ def __call__(
 
     # TODO: remove this to enable quantization aware calibration for GPTQ and AWQ
     disable_qac = any(
-        type(mod).__name__ in ["GPTQModifier", "AWQModifier"]
+        type(mod).__name__ in DISABLE_QAC_MODIFIERS
         for mod in session.lifecycle.recipe.modifiers
     )
 
diff --git a/src/llmcompressor/pipelines/sequential/pipeline.py b/src/llmcompressor/pipelines/sequential/pipeline.py
index 261afd6544..1e7df2da53 100644
--- a/src/llmcompressor/pipelines/sequential/pipeline.py
+++ b/src/llmcompressor/pipelines/sequential/pipeline.py
@@ -15,7 +15,11 @@
     get_sequential_targets,
     trace_subgraphs,
 )
-from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context
+from llmcompressor.utils.helpers import (
+    DisableQuantization,
+    calibration_forward_context,
+    DISABLE_QAC_MODIFIERS,
+)
 
 if TYPE_CHECKING:
     from llmcompressor.args.dataset_arguments import DatasetArguments
@@ -74,8 +78,8 @@ def __call__(
 
     # TODO: remove this to enable quantization aware calibration for GPTQ and AWQ
     disable_qac = any(
-        type(mod).__name__ in ["GPTQModifier", "AWQModifier"]
-        for mod in session.lifecycle.recipe.modifiers
+        type(mod).__name__ in DISABLE_QAC_MODIFIERS
+        for mod in session.lifecycle.recipe.modifiers 
     )
 
     with contextlib.ExitStack() as stack:
diff --git a/src/llmcompressor/utils/helpers.py b/src/llmcompressor/utils/helpers.py
index 9aaae59eb9..b1c6c02f0f 100644
--- a/src/llmcompressor/utils/helpers.py
+++ b/src/llmcompressor/utils/helpers.py
@@ -67,6 +67,7 @@
     "calibration_forward_context",
     "patch_attr",
     "disable_hf_kernels",
+    "DISABLE_QAC_MODIFIERS"
 ]
 
 
@@ -1082,3 +1083,5 @@ def patch_attr(base: object, attr: str, value: Any):
             setattr(base, attr, original_value)
         else:
             delattr(base, attr)
+
+DISABLE_QAC_MODIFIERS = ["GPTQModifier", "AWQModifier", "AutoRoundModifier"]
\ No newline at end of file

From bebe0fa1a87321eb1472465af2441f2abaf6d00e Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Sun, 2 Nov 2025 22:23:58 -0800
Subject: [PATCH 04/57] disable qac for auto-round

Signed-off-by: yiliu30
---
 src/llmcompressor/pipelines/layer_sequential/pipeline.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/llmcompressor/pipelines/layer_sequential/pipeline.py b/src/llmcompressor/pipelines/layer_sequential/pipeline.py
index 54d59e948a..de3a093799 100644
--- a/src/llmcompressor/pipelines/layer_sequential/pipeline.py
+++ b/src/llmcompressor/pipelines/layer_sequential/pipeline.py
@@ -19,9 +19,12 @@
 from llmcompressor.pipelines.sequential.helpers import (
     dispatch_for_sequential,
     get_sequential_targets,
+)
+from llmcompressor.utils.helpers import (
+    DisableQuantization,
+    calibration_forward_context,
     DISABLE_QAC_MODIFIERS,
 )
-from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context
 
 if TYPE_CHECKING:
     from llmcompressor.args.dataset_arguments import DatasetArguments

From dfb0ff828fafbc48a845a629ddd2ea7961cda90f Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Sun, 2 Nov 2025 23:00:53 -0800
Subject: [PATCH 05/57] clean code

Signed-off-by: yiliu30
---
 .../modifiers/quantization/autoround/base.py  | 60 +------------------
 1 file changed, 2 insertions(+), 58 deletions(-)

diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py 
b/src/llmcompressor/modifiers/quantization/autoround/base.py index 7ad9bf3e8d..13b8fe013f 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -20,11 +20,6 @@ from llmcompressor.core import Event, EventType, State from llmcompressor.modifiers import Modifier -from llmcompressor.modifiers.quantization.gptq.gptq_quantize import ( - accumulate_hessian, - make_empty_hessian, - quantize_weight, -) from llmcompressor.modifiers.quantization.quantization import QuantizationMixin from llmcompressor.sentinel import Sentinel from llmcompressor.utils.metric_logging import CompressionLogger @@ -188,45 +183,15 @@ class AutoRoundModifier(Modifier, QuantizationMixin): block_size: int = 128 dampening_frac: Optional[float] = 0.01 # TODO: this does not serialize / will be incorrectly written - actorder: Optional[Union[ActivationOrdering, Sentinel]] = Sentinel("static") - offload_hessians: bool = False # private variables _module_names: Dict[torch.nn.Module, str] = PrivateAttr(default_factory=dict) - _hessians: Dict[torch.nn.Module, torch.Tensor] = PrivateAttr(default_factory=dict) - _num_samples: Dict[torch.nn.Module, int] = PrivateAttr(default_factory=dict) - + _cur_layer_idx = PrivateAttr(default=0) def resolve_quantization_config(self) -> QuantizationConfig: config = super().resolve_quantization_config() - - def resolve_actorder(existing): - # sentinel default only overrides if existing is None - if self.actorder == Sentinel("static"): - return ActivationOrdering.STATIC if existing is None else existing - - # user-provided value always attempts to override - if existing is None or self.actorder == existing: - return self.actorder - - # if existing provided and conflicts - raise ValueError( - "Cannot resolve activation ordering when both " - "`AutoRoundModifier.actorder` and `QuantizationScheme.actorder` " - f"are provided and differ ({self.actorder}, {existing}). " - "Either unset `AutoRoundModifier.actorder` or " - "remove `actorder` from config groups." 
- ) - - for scheme in config.config_groups.values(): - assert isinstance(scheme, QuantizationScheme) - if ( - getattr_chain(scheme, "weights.strategy", None) - == QuantizationStrategy.GROUP - ): - scheme.weights.actorder = resolve_actorder(scheme.weights.actorder) return config def on_initialize(self, state: State, **kwargs) -> bool: @@ -369,9 +334,6 @@ def autoround(self, state): update_offload_parameter(module, "weight_scale", weight_scale) - for module in list(self._num_samples.keys()): - name = self._module_names[module] - del self._num_samples[module] decoding_layer.eval() all_module_input.clear() all_module_output.clear() @@ -394,22 +356,4 @@ def on_finalize(self, state: State, **kwargs) -> bool: if not self.ended_: self.on_end(state, None) - if len(self._num_samples) > 0: - raise ValueError(f"Failed to compress {len(self._num_samples)} modules") - - self._hessians = dict() - self._num_samples = dict() - - return True - - @contextlib.contextmanager - def _maybe_onload_hessian(self, module: torch.nn.Module): - if self.offload_hessians: - device = get_execution_device(module) - self._hessians[module] = self._hessians[module].to(device=device) - - yield - - if self.offload_hessians: - if module in self._hessians: # may have been deleted in context - self._hessians[module] = self._hessians[module].to(device="cpu") + return True \ No newline at end of file From 513972c298f93fdf8b67a4a591bdfaea369bc463 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 2 Nov 2025 23:01:31 -0800 Subject: [PATCH 06/57] add compile after disable qac Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/quantization/autoround/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index 13b8fe013f..a8fc4cd274 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -299,7 +299,7 @@ def autoround(self, state): enable_quanted_input=False, # FIXME: batch size 1 causes error, looks like related to the input_others prepare # batch_size=1 - # enable_torch_compile=True, + enable_torch_compile=True, # enable_deterministic_algorithms=True, ) From 2291cc41da63fb67674c137d78d91497f031000f Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 3 Nov 2025 00:07:37 -0800 Subject: [PATCH 07/57] add iters and clean code Signed-off-by: yiliu30 --- .../modifiers/quantization/autoround/base.py | 20 ++++++------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index a8fc4cd274..c693246fcc 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -23,6 +23,8 @@ from llmcompressor.modifiers.quantization.quantization import QuantizationMixin from llmcompressor.sentinel import Sentinel from llmcompressor.utils.metric_logging import CompressionLogger +from compressed_tensors.quantization import enable_quantization +from llmcompressor.modifiers.quantization.calibration import apply_calibration_status __all__ = ["AutoRoundModifier"] @@ -180,13 +182,12 @@ class AutoRoundModifier(Modifier, QuantizationMixin): # gptq modifier arguments sequential_targets: Union[str, List[str], None] = None - block_size: int = 128 + iters: int = 200 dampening_frac: Optional[float] = 0.01 # TODO: this does not serialize / will 
be incorrectly written # private variables _module_names: Dict[torch.nn.Module, str] = PrivateAttr(default_factory=dict) - _cur_layer_idx = PrivateAttr(default=0) @@ -217,7 +218,6 @@ def on_initialize(self, state: State, **kwargs) -> bool: # freeze all model parameters for name, param in state.model.named_parameters(): param.requires_grad_(False) - return True @@ -228,8 +228,7 @@ def start_calibration(self, model: torch.nn.Module): :param model: model to prepare for calibration """ - from compressed_tensors.quantization import enable_quantization - from llmcompressor.modifiers.quantization.calibration import apply_calibration_status + for _, module in match_named_modules(model, self.targets, self.ignore): # Note: No need to register observers for auto-round # self._initialize_observers(module) @@ -285,17 +284,12 @@ def autoround(self, state): wrapped_model = _wrap_decoding_layer(decoding_layer) with torch.enable_grad(), align_module_device(decoding_layer): - if _DEBUG: - iters = 4 - else: - iters = 200 import auto_round - ar = auto_round.AutoRound( model=wrapped_model, tokenizer="", scheme="W4A16", - iters=iters, + iters=self.iters, enable_quanted_input=False, # FIXME: batch size 1 causes error, looks like related to the input_others prepare # batch_size=1 @@ -325,13 +319,11 @@ def autoround(self, state): logger.debug( f"Updating offload parameters for module {getattr(module, '_tmp_name', '')} || {name}" ) - # weight = module.weight + # The model's weight is already quantized and determined in auto-round weight_scale = module.scale del module.scale del module.zp # TODO: update weight as well - # breakpoint() - update_offload_parameter(module, "weight_scale", weight_scale) decoding_layer.eval() From 40288534ce0e4bd27bb2339d7b76a2fcf4423e60 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 3 Nov 2025 00:22:26 -0800 Subject: [PATCH 08/57] clean code Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/quantization/autoround/base.py | 2 +- src/llmcompressor/modifiers/quantization/gptq/base.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index c693246fcc..ae4272b4dd 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -319,7 +319,7 @@ def autoround(self, state): logger.debug( f"Updating offload parameters for module {getattr(module, '_tmp_name', '')} || {name}" ) - # The model's weight is already quantized and determined in auto-round + # Note: The model's weight is already quantized and dequantized in-place by auto-round weight_scale = module.scale del module.scale del module.zp diff --git a/src/llmcompressor/modifiers/quantization/gptq/base.py b/src/llmcompressor/modifiers/quantization/gptq/base.py index cf1e47d841..385de9840a 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/base.py +++ b/src/llmcompressor/modifiers/quantization/gptq/base.py @@ -266,7 +266,7 @@ def compress_modules(self): percdamp=self.dampening_frac, ) comp_logger.set_loss(loss) - breakpoint() + update_offload_parameter(module, "weight", quantized_weight) update_offload_parameter(module, "weight_scale", scale) update_offload_parameter(module, "weight_zero_point", zero_point) From 97ff9e02b0cdfd0ec95ed4c884ca85d76d9f8210 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 3 Nov 2025 00:23:13 -0800 Subject: [PATCH 09/57] add example Signed-off-by: yiliu30 --- 
.../auto_round_llama3_example.py | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 examples/quantization_w4a16/auto_round_llama3_example.py diff --git a/examples/quantization_w4a16/auto_round_llama3_example.py b/examples/quantization_w4a16/auto_round_llama3_example.py new file mode 100644 index 0000000000..e9d309c233 --- /dev/null +++ b/examples/quantization_w4a16/auto_round_llama3_example.py @@ -0,0 +1,149 @@ +import os +_DEBUG = os.environ.get("DEBUG", "0") == "1" +os.environ["TOKENIZERS_PARALLELISM"] = "false" +from datasets import load_dataset +from transformers import AutoModelForCausalLM, AutoTokenizer + +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import AutoRoundModifier +from llmcompressor.modifiers.quantization import AutoRoundModifier +# from llmcompressor.modifiers.quantization import QuantizationModifier as AutoRoundModifier +from llmcompressor.utils import dispatch_for_generation + +# Select model and load it. +model_id = "meta-llama/Meta-Llama-3-8B-Instruct" +model_id = "/data5/yliu7/HF_HOME/meta-llama/Llama-3.2-1B-Instruct" +model_id = "Qwen/Qwen2.5-0.5B" +model_id = "/data5/yliu7/HF_HOME/Qwen/Qwen2.5-0.5B" +model_id = "/data5/yliu7/meta-llama/meta-llama/Meta-Llama-3.1-8B-Instruct" + +model_dir="/storage/yiliu7" +# model_id=f"{model_dir}/meta-llama/Meta-Llama-3.1-8B-Instruct" + +model_dir="/storage/yiliu7" +model_name="meta-llama/Meta-Llama-3.1-8B-Instruct" +model_name="Qwen/Qwen2.5-0.5B/" + +model_id=f"{model_dir}/{model_name}" + + +# model_id = "facebook/opt-125m" +model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(model_id) + +if _DEBUG: + from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer + from transformers.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM + from transformers.models.llama.modeling_llama import LlamaForCausalLM + import torch + + config = AutoConfig.from_pretrained(model_id) + config.num_hidden_layers = 2 # Use a smaller model for testing + # Fix configuration validation issues + # config.layer_types = config.layer_types[: config.num_hidden_layers] + + # Load the tokenizer and model + if "Qwen" in model_id: + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) + model = Qwen2ForCausalLM(config) + else: + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) + model = LlamaForCausalLM(config) + model.to(torch.bfloat16) + NUM_CALIBRATION_SAMPLES = 3 + MAX_SEQUENCE_LENGTH = 16 + iters = 4 + +else: + # Select number of samples. 512 samples is a good place to start. + # Increasing the number of samples can improve accuracy. + light = {"batch_size": 8, "iters": 50, "seqlen": 2048, "nsamples": 128, "lr": 5e-3} + light = {"batch_size": 8, "iters": 200, "seqlen": 2048, "nsamples": 128, "lr": 5e-3} + + light = {"batch_size": 8, "iters": 200, "seqlen": 2048, "nsamples": 128, "lr": None} + # light = {"batch_size": 8, "iters": 200, "seqlen": 2048, "nsamples": 32, "lr": None} + NUM_CALIBRATION_SAMPLES = light["nsamples"] + MAX_SEQUENCE_LENGTH = light["seqlen"] + iters = light["iters"] + +# Select calibration dataset. 
+DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" + + + +from auto_round.calib_dataset import get_dataset + +from llmcompressor.args import DatasetArguments +ds = get_dataset( + tokenizer=tokenizer, + seqlen=MAX_SEQUENCE_LENGTH, + nsamples=NUM_CALIBRATION_SAMPLES, +) +# data_args = DatasetArguments(shuffle_calibration_samples=False) +# Load dataset and preprocess. +# ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +# ds = ds.shuffle(seed=42) + + +# def preprocess(example): +# return { +# "text": tokenizer.apply_chat_template( +# example["messages"], +# tokenize=False, +# ) +# } + + +# ds = ds.map(preprocess) + + +# # Tokenize inputs. +# def tokenize(sample): +# return tokenizer( +# sample["text"], +# padding=False, +# max_length=MAX_SEQUENCE_LENGTH, +# truncation=True, +# add_special_tokens=False, +# ) + + +# ds = ds.map(tokenize, remove_columns=ds.column_names) + +# Configure the quantization algorithm to run. +# * quantize the weights to 4 bit with GPTQ with a group size 128 +recipe = AutoRoundModifier( + targets="Linear", scheme="W4A16", ignore=["lm_head"], iters=iters +) + + +# Apply algorithms. +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + # !!! shuffle_calibration_samples: True -> mmlu 0.6574 + # !!! shuffle_calibration_samples: False -> mmlu 0.66 + shuffle_calibration_samples=False, +) + +# Confirm generations of the quantized model look sane. +print("\n\n") +print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) +sample = tokenizer("Explain AI in ", return_tensors="pt") +sample = {key: value.to(model.device) for key, value in sample.items()} +output = model.generate(**sample, max_new_tokens=100) +print(tokenizer.decode(output[0])) +print("==========================================\n\n") + +# Save to disk compressed. 
+SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128" +SAVE_DIR = f"{model_dir}/" + model_id.rstrip("/").split("/")[-1] + "-W4A16-G128-disbale-shuffule-ar" +print(f"Saving quantized model to {SAVE_DIR}") +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) From cb7a5b4b4c5731179f8f14d2b050013d09b77fed Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 3 Nov 2025 01:01:16 -0800 Subject: [PATCH 10/57] refine docs Signed-off-by: yiliu30 --- .../auto_round_llama3_example.py | 2 - .../modifiers/quantization/autoround/base.py | 67 +++++++------------ 2 files changed, 26 insertions(+), 43 deletions(-) diff --git a/examples/quantization_w4a16/auto_round_llama3_example.py b/examples/quantization_w4a16/auto_round_llama3_example.py index e9d309c233..e2e52e0a05 100644 --- a/examples/quantization_w4a16/auto_round_llama3_example.py +++ b/examples/quantization_w4a16/auto_round_llama3_example.py @@ -74,8 +74,6 @@ from auto_round.calib_dataset import get_dataset - -from llmcompressor.args import DatasetArguments ds = get_dataset( tokenizer=tokenizer, seqlen=MAX_SEQUENCE_LENGTH, diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index ae4272b4dd..4aa6124103 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -25,27 +25,18 @@ from llmcompressor.utils.metric_logging import CompressionLogger from compressed_tensors.quantization import enable_quantization from llmcompressor.modifiers.quantization.calibration import apply_calibration_status +from collections import defaultdict __all__ = ["AutoRoundModifier"] -from collections import defaultdict -import os -FALLBACK_CHANGE = os.environ.get("FALLBACK_CHANGE", "0").lower() in ("1", "true", "yes") -_DEBUG = os.environ.get("DEBUG", "0").lower() in ("1", "true", "yes") + all_module_input = defaultdict(list) all_module_output = defaultdict(list) -def input_capture_hook(module, *args, **kwargs): - all_module_input[module._tmp_name].append((args, kwargs)) - - -def output_capture_hook(module, *args, **kwargs): - all_module_output[module._tmp_name].append((args, kwargs)) - def normalize_input(cur_inputs): # TODO: move it to auto-round @@ -104,22 +95,17 @@ def _wrap_decoding_layer(layer: torch.nn.Module) -> _PretrainModelWrapper: wrapped_model.dtype = first_param.dtype return wrapped_model - - class AutoRoundModifier(Modifier, QuantizationMixin): """ - Implements the GPTQ algorithm from https://arxiv.org/abs/2210.17323. This modifier - uses activations to calibrate a hessian matrix, which is then used to determine - optimal quantizion values and orderings for the model weights. + Implements the AutoRound algorithm from https://arxiv.org/pdf/2309.05516. This modifier + leverages signed gradient descent (SignSGD) and block-wise loss to optimize rounding values + and weight clipping in a few steps. 
| Sample yaml: | test_stage: | obcq_modifiers: | AutoRoundModifier: - | block_size: 128 - | dampening_frac: 0.001 - | offload_hessians: False - | actorder: static + | iters: 200 | config_groups: | group_0: | targets: @@ -127,7 +113,7 @@ class AutoRoundModifier(Modifier, QuantizationMixin): | input_activations: null | output_activations: null | weights: - | num_bits: 8 + | num_bits: 4 | type: "int" | symmetric: true | strategy: group @@ -137,24 +123,15 @@ class AutoRoundModifier(Modifier, QuantizationMixin): - on_initialize - apply config to model - on_start - - add activation calibration hooks - - add gptq weight calibration hooks + - add input/output capture hooks to decoding layers - on_sequential_epoch_end - quantize_weight - on_finalize - remove_hooks() - model.apply(freeze_module_quantization) - :param sequential_targets: list of layer names to compress during GPTQ, or + :param sequential_targets: list of layer names to compress during AutoRound, or '__ALL__' to compress every layer in the model - :param block_size: Used to determine number of columns to compress in one pass - :param dampening_frac: Amount of dampening to apply to H, as a fraction of the - diagonal norm - :param actorder: order in which weight columns are quantized. Defaults to "static" - activation ordering, which achieves best accuracy recovery with no runtime cost. - For more information, see https://github.com/vllm-project/vllm/pull/8135 - :param offload_hessians: Set to True for decreased memory usage but increased - runtime. :param config_groups: dictionary specifying quantization schemes to apply to target modules. Modules not matching a scheme target will NOT be quantized. @@ -180,10 +157,9 @@ class AutoRoundModifier(Modifier, QuantizationMixin): and kv_cache_scheme != None, the quantization of kv cache will fail """ - # gptq modifier arguments + # AutoRound modifier arguments sequential_targets: Union[str, List[str], None] = None - iters: int = 200 - dampening_frac: Optional[float] = 0.01 + iters: Optional[int] = 200 # TODO: this does not serialize / will be incorrectly written # private variables @@ -197,7 +173,7 @@ def resolve_quantization_config(self) -> QuantizationConfig: def on_initialize(self, state: State, **kwargs) -> bool: """ - Initialize and run the GPTQ algorithm on the current state + Initialize and run the AutoRound algorithm on the current state :param state: session state storing input model and calibration data """ @@ -238,20 +214,29 @@ def start_calibration(self, model: torch.nn.Module): model.apply(enable_quantization) # quantize at the same time as calibrate + def input_capture_hook(self, module, *args, **kwargs): + all_module_input[module._tmp_name].append((args, kwargs)) + + + def output_capture_hook(self, module, *args, **kwargs): + all_module_output[module._tmp_name].append((args, kwargs)) + + + def on_start(self, state: State, event: Event, **kwargs): self.started_ = True # register quantization calibration hooks # assume quantization has been initialized by this modifier or one before it - # Replace it with call to self.start_calibration - # QuantizationMixin.start_calibration(self, state.model) - self.start_calibration( state.model) + self.start_calibration(state.model) for name, module in state.model.named_modules(): if _is_decoding_layer(module, name): # register input/output capture hooks for decoding layers logger.warning(f">> Registering input/output capture hooks for decoding layer {getattr(module, '_tmp_name', '')} || {name}") - 
module.register_forward_pre_hook(input_capture_hook, with_kwargs=True) - module.register_forward_hook(output_capture_hook, with_kwargs=True) + # module.register_forward_pre_hook(input_capture_hook, with_kwargs=True) + # module.register_forward_hook(output_capture_hook, with_kwargs=True) + self.register_hook(module, self.input_capture_hook, "forward_pre", with_kwargs=True) + self.register_hook(module, self.output_capture_hook, "forward", with_kwargs=True) def on_event(self, state: State, event: Event, **kwargs): From 5a7500ed373b0bf828dea64d9ff00cb3b3e133eb Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 3 Nov 2025 01:03:50 -0800 Subject: [PATCH 11/57] refine example Signed-off-by: yiliu30 --- .../auto_round_llama3_example.py | 40 +------------------ 1 file changed, 2 insertions(+), 38 deletions(-) diff --git a/examples/quantization_w4a16/auto_round_llama3_example.py b/examples/quantization_w4a16/auto_round_llama3_example.py index e2e52e0a05..413ce50087 100644 --- a/examples/quantization_w4a16/auto_round_llama3_example.py +++ b/examples/quantization_w4a16/auto_round_llama3_example.py @@ -22,7 +22,7 @@ model_dir="/storage/yiliu7" model_name="meta-llama/Meta-Llama-3.1-8B-Instruct" -model_name="Qwen/Qwen2.5-0.5B/" +# model_name="Qwen/Qwen2.5-0.5B/" model_id=f"{model_dir}/{model_name}" @@ -67,51 +67,15 @@ iters = light["iters"] # Select calibration dataset. -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" - - - from auto_round.calib_dataset import get_dataset ds = get_dataset( tokenizer=tokenizer, seqlen=MAX_SEQUENCE_LENGTH, nsamples=NUM_CALIBRATION_SAMPLES, ) -# data_args = DatasetArguments(shuffle_calibration_samples=False) -# Load dataset and preprocess. -# ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -# ds = ds.shuffle(seed=42) - - -# def preprocess(example): -# return { -# "text": tokenizer.apply_chat_template( -# example["messages"], -# tokenize=False, -# ) -# } - - -# ds = ds.map(preprocess) - - -# # Tokenize inputs. -# def tokenize(sample): -# return tokenizer( -# sample["text"], -# padding=False, -# max_length=MAX_SEQUENCE_LENGTH, -# truncation=True, -# add_special_tokens=False, -# ) - - -# ds = ds.map(tokenize, remove_columns=ds.column_names) # Configure the quantization algorithm to run. 
-# * quantize the weights to 4 bit with GPTQ with a group size 128 +# * quantize the weights to 4 bit with AutoRound with a group size 128 recipe = AutoRoundModifier( targets="Linear", scheme="W4A16", ignore=["lm_head"], iters=iters ) From d02a355690573ccdb0a399bd4160a7d51124c375 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 3 Nov 2025 02:14:01 -0800 Subject: [PATCH 12/57] add init Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/quantization/autoround/__init__.py | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 src/llmcompressor/modifiers/quantization/autoround/__init__.py diff --git a/src/llmcompressor/modifiers/quantization/autoround/__init__.py b/src/llmcompressor/modifiers/quantization/autoround/__init__.py new file mode 100644 index 0000000000..a4291054b4 --- /dev/null +++ b/src/llmcompressor/modifiers/quantization/autoround/__init__.py @@ -0,0 +1,3 @@ +# ruff: noqa + +from .base import * From cea9d2f3b981a201dad0c6fb56edf17a089be06a Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 3 Nov 2025 02:25:07 -0800 Subject: [PATCH 13/57] clean code Signed-off-by: yiliu30 --- .../modifiers/quantization/autoround/base.py | 48 +++++++------------ 1 file changed, 17 insertions(+), 31 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index 4aa6124103..3995e92f23 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -33,8 +33,7 @@ -all_module_input = defaultdict(list) -all_module_output = defaultdict(list) + @@ -144,17 +143,6 @@ class AutoRoundModifier(Modifier, QuantizationMixin): will be set to the targets parameter set at the modifier level. Can also be set to a dictionary of the format `preset_scheme_name: targets` for example: `W8A8: ['Linear']` for weight and activation 8-bit. - :param kv_cache_scheme: optional QuantizationArgs, that specify the - quantization of the kv cache. If None, kv cache is not quantized. - When applying kv cache quantization to transformer AutoModelForCausalLM, - the kv_cache_scheme gets converted into a QuantizationScheme that: - - targets the `q_proj` and `k_proj` modules of the model. The outputs - of those modules are the keys and values that might be cached - - quantizes the outputs of the aformentioned layers, so that - keys and values are compressed before storing them in the cache - There is an explicit assumption that the model contains modules with - `k_proj` and `v_proj` in their names. 
If this is not the case - and kv_cache_scheme != None, the quantization of kv cache will fail """ # AutoRound modifier arguments @@ -165,7 +153,8 @@ class AutoRoundModifier(Modifier, QuantizationMixin): # private variables _module_names: Dict[torch.nn.Module, str] = PrivateAttr(default_factory=dict) _cur_layer_idx = PrivateAttr(default=0) - + _all_module_input: Dict[str, List[Tuple]] = PrivateAttr(default_factory=dict) + _all_module_output: Dict[str, List[Tuple]] = PrivateAttr(default_factory=dict) def resolve_quantization_config(self) -> QuantizationConfig: config = super().resolve_quantization_config() @@ -188,7 +177,7 @@ def on_initialize(self, state: State, **kwargs) -> bool: state.model, self.targets, self.ignore ) } - # add tmp name for each module for debugging + # add temporary names to all modules for debugging for name, mod in state.model.named_modules(): mod._tmp_name = name # freeze all model parameters @@ -199,7 +188,7 @@ def on_initialize(self, state: State, **kwargs) -> bool: def start_calibration(self, model: torch.nn.Module): """ - Register activation calibration hooks (including kv_cache quantization) and enable quantization as we calibrate + Register activation calibration hooks and enable quantization as we calibrate :param model: model to prepare for calibration """ @@ -215,11 +204,11 @@ def start_calibration(self, model: torch.nn.Module): def input_capture_hook(self, module, *args, **kwargs): - all_module_input[module._tmp_name].append((args, kwargs)) + self._all_module_input[module._tmp_name].append((args, kwargs)) def output_capture_hook(self, module, *args, **kwargs): - all_module_output[module._tmp_name].append((args, kwargs)) + self._all_module_output[module._tmp_name].append((args, kwargs)) @@ -233,8 +222,6 @@ def on_start(self, state: State, event: Event, **kwargs): if _is_decoding_layer(module, name): # register input/output capture hooks for decoding layers logger.warning(f">> Registering input/output capture hooks for decoding layer {getattr(module, '_tmp_name', '')} || {name}") - # module.register_forward_pre_hook(input_capture_hook, with_kwargs=True) - # module.register_forward_hook(output_capture_hook, with_kwargs=True) self.register_hook(module, self.input_capture_hook, "forward_pre", with_kwargs=True) self.register_hook(module, self.output_capture_hook, "forward", with_kwargs=True) @@ -245,17 +232,19 @@ def on_event(self, state: State, event: Event, **kwargs): self.on_start(state, None) if event.type_ == EventType.SEQUENTIAL_EPOCH_END: - self.autoround(state) + self.apply_autoround(state) + self.post_autoround_cleanup() if event.type_ == EventType.CALIBRATION_EPOCH_END: if not self.ended_: self.on_end(state, None) - def autoround(self, state): + def apply_autoround(self, state): cur_layer_idx = self._cur_layer_idx self._cur_layer_idx += 1 logger.info(f">>||>> AutoRound for decoding layer index {cur_layer_idx}") if cur_layer_idx >= len(state.model.model.layers): + # skip the lm_head layer logger.info( f">>||>> All decoding layers have been processed for AutoRound." 
) @@ -276,16 +265,13 @@ def autoround(self, state): scheme="W4A16", iters=self.iters, enable_quanted_input=False, - # FIXME: batch size 1 causes error, looks like related to the input_others prepare - # batch_size=1 enable_torch_compile=True, - # enable_deterministic_algorithms=True, ) ar.configure_layer_config() input_name = f"model.layers.{cur_layer_idx}" - cur_inputs = all_module_input[input_name] + cur_inputs = self._all_module_input[input_name] input_ids, input_others = normalize_input(cur_inputs) decoding_layer.tuning_device = torch.device("cuda") @@ -304,16 +290,16 @@ def autoround(self, state): logger.debug( f"Updating offload parameters for module {getattr(module, '_tmp_name', '')} || {name}" ) - # Note: The model's weight is already quantized and dequantized in-place by auto-round + # Note: The model's weight is already quantized and dequantized in-place by auto-round. weight_scale = module.scale del module.scale del module.zp # TODO: update weight as well update_offload_parameter(module, "weight_scale", weight_scale) - - decoding_layer.eval() - all_module_input.clear() - all_module_output.clear() + decoding_layer.eval() + def post_autoround_cleanup(self): + self._all_module_input.clear() + self._all_module_output.clear() def on_end(self, state: State, event: Event, **kwargs): From 22be9b7c6a531552310e5d6446477170aec76991 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 3 Nov 2025 02:31:22 -0800 Subject: [PATCH 14/57] format Signed-off-by: yiliu30 --- .../auto_round_llama3_example.py | 25 +++++---- examples/quantization_w4a16/llama3_example.py | 4 +- .../modifiers/quantization/__init__.py | 2 +- .../modifiers/quantization/autoround/base.py | 51 +++++++------------ .../pipelines/layer_sequential/pipeline.py | 2 +- .../pipelines/sequential/pipeline.py | 4 +- src/llmcompressor/utils/helpers.py | 5 +- 7 files changed, 41 insertions(+), 52 deletions(-) diff --git a/examples/quantization_w4a16/auto_round_llama3_example.py b/examples/quantization_w4a16/auto_round_llama3_example.py index 413ce50087..4cb865c315 100644 --- a/examples/quantization_w4a16/auto_round_llama3_example.py +++ b/examples/quantization_w4a16/auto_round_llama3_example.py @@ -1,12 +1,12 @@ import os + _DEBUG = os.environ.get("DEBUG", "0") == "1" os.environ["TOKENIZERS_PARALLELISM"] = "false" -from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot from llmcompressor.modifiers.quantization import AutoRoundModifier -from llmcompressor.modifiers.quantization import AutoRoundModifier + # from llmcompressor.modifiers.quantization import QuantizationModifier as AutoRoundModifier from llmcompressor.utils import dispatch_for_generation @@ -17,14 +17,14 @@ model_id = "/data5/yliu7/HF_HOME/Qwen/Qwen2.5-0.5B" model_id = "/data5/yliu7/meta-llama/meta-llama/Meta-Llama-3.1-8B-Instruct" -model_dir="/storage/yiliu7" +model_dir = "/storage/yiliu7" # model_id=f"{model_dir}/meta-llama/Meta-Llama-3.1-8B-Instruct" -model_dir="/storage/yiliu7" -model_name="meta-llama/Meta-Llama-3.1-8B-Instruct" +model_dir = "/storage/yiliu7" +model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct" # model_name="Qwen/Qwen2.5-0.5B/" -model_id=f"{model_dir}/{model_name}" +model_id = f"{model_dir}/{model_name}" # model_id = "facebook/opt-125m" @@ -32,10 +32,10 @@ tokenizer = AutoTokenizer.from_pretrained(model_id) if _DEBUG: + import torch from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer - from transformers.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM from 
transformers.models.llama.modeling_llama import LlamaForCausalLM - import torch + from transformers.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM config = AutoConfig.from_pretrained(model_id) config.num_hidden_layers = 2 # Use a smaller model for testing @@ -68,6 +68,7 @@ # Select calibration dataset. from auto_round.calib_dataset import get_dataset + ds = get_dataset( tokenizer=tokenizer, seqlen=MAX_SEQUENCE_LENGTH, @@ -79,7 +80,7 @@ recipe = AutoRoundModifier( targets="Linear", scheme="W4A16", ignore=["lm_head"], iters=iters ) - + # Apply algorithms. oneshot( @@ -105,7 +106,11 @@ # Save to disk compressed. SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128" -SAVE_DIR = f"{model_dir}/" + model_id.rstrip("/").split("/")[-1] + "-W4A16-G128-disbale-shuffule-ar" +SAVE_DIR = ( + f"{model_dir}/" + + model_id.rstrip("/").split("/")[-1] + + "-W4A16-G128-disbale-shuffule-ar" +) print(f"Saving quantized model to {SAVE_DIR}") model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w4a16/llama3_example.py b/examples/quantization_w4a16/llama3_example.py index 038c0ebc9f..945335de36 100644 --- a/examples/quantization_w4a16/llama3_example.py +++ b/examples/quantization_w4a16/llama3_example.py @@ -7,8 +7,8 @@ # Select model and load it. model_id = "meta-llama/Meta-Llama-3-8B-Instruct" -model_dir="/storage/yiliu7" -model_id=f"{model_dir}/meta-llama/Meta-Llama-3.1-8B-Instruct" +model_dir = "/storage/yiliu7" +model_id = f"{model_dir}/meta-llama/Meta-Llama-3.1-8B-Instruct" model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(model_id) diff --git a/src/llmcompressor/modifiers/quantization/__init__.py b/src/llmcompressor/modifiers/quantization/__init__.py index 7e4028279e..2c10fe4b97 100644 --- a/src/llmcompressor/modifiers/quantization/__init__.py +++ b/src/llmcompressor/modifiers/quantization/__init__.py @@ -2,4 +2,4 @@ from .gptq import * from .quantization import * -from .autoround import * \ No newline at end of file +from .autoround import * diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index 3995e92f23..06c4801d6a 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -1,17 +1,12 @@ -import contextlib from typing import Dict, List, Optional, Tuple, Union import torch from compressed_tensors.quantization import ( QuantizationConfig, - QuantizationScheme, - QuantizationStrategy, + enable_quantization, ) -from compressed_tensors.quantization.quant_args import ActivationOrdering from compressed_tensors.utils import ( align_module_device, - get_execution_device, - getattr_chain, match_named_modules, update_offload_parameter, ) @@ -20,23 +15,12 @@ from llmcompressor.core import Event, EventType, State from llmcompressor.modifiers import Modifier -from llmcompressor.modifiers.quantization.quantization import QuantizationMixin -from llmcompressor.sentinel import Sentinel -from llmcompressor.utils.metric_logging import CompressionLogger -from compressed_tensors.quantization import enable_quantization from llmcompressor.modifiers.quantization.calibration import apply_calibration_status -from collections import defaultdict +from llmcompressor.modifiers.quantization.quantization import QuantizationMixin __all__ = ["AutoRoundModifier"] - - - - - - - def normalize_input(cur_inputs): # TODO: move it to 
auto-round input_ids = [] @@ -94,6 +78,7 @@ def _wrap_decoding_layer(layer: torch.nn.Module) -> _PretrainModelWrapper: wrapped_model.dtype = first_param.dtype return wrapped_model + class AutoRoundModifier(Modifier, QuantizationMixin): """ Implements the AutoRound algorithm from https://arxiv.org/pdf/2309.05516. This modifier @@ -173,9 +158,7 @@ def on_initialize(self, state: State, **kwargs) -> bool: # prepare module names self._module_names = { m: name - for name, m in match_named_modules( - state.model, self.targets, self.ignore - ) + for name, m in match_named_modules(state.model, self.targets, self.ignore) } # add temporary names to all modules for debugging for name, mod in state.model.named_modules(): @@ -185,7 +168,6 @@ def on_initialize(self, state: State, **kwargs) -> bool: param.requires_grad_(False) return True - def start_calibration(self, model: torch.nn.Module): """ Register activation calibration hooks and enable quantization as we calibrate @@ -193,7 +175,6 @@ def start_calibration(self, model: torch.nn.Module): :param model: model to prepare for calibration """ - for _, module in match_named_modules(model, self.targets, self.ignore): # Note: No need to register observers for auto-round # self._initialize_observers(module) @@ -202,16 +183,12 @@ def start_calibration(self, model: torch.nn.Module): model.apply(enable_quantization) # quantize at the same time as calibrate - def input_capture_hook(self, module, *args, **kwargs): self._all_module_input[module._tmp_name].append((args, kwargs)) - def output_capture_hook(self, module, *args, **kwargs): self._all_module_output[module._tmp_name].append((args, kwargs)) - - def on_start(self, state: State, event: Event, **kwargs): self.started_ = True @@ -221,10 +198,15 @@ def on_start(self, state: State, event: Event, **kwargs): for name, module in state.model.named_modules(): if _is_decoding_layer(module, name): # register input/output capture hooks for decoding layers - logger.warning(f">> Registering input/output capture hooks for decoding layer {getattr(module, '_tmp_name', '')} || {name}") - self.register_hook(module, self.input_capture_hook, "forward_pre", with_kwargs=True) - self.register_hook(module, self.output_capture_hook, "forward", with_kwargs=True) - + logger.warning( + f">> Registering input/output capture hooks for decoding layer {getattr(module, '_tmp_name', '')} || {name}" + ) + self.register_hook( + module, self.input_capture_hook, "forward_pre", with_kwargs=True + ) + self.register_hook( + module, self.output_capture_hook, "forward", with_kwargs=True + ) def on_event(self, state: State, event: Event, **kwargs): if event.type_ == EventType.CALIBRATION_EPOCH_START: @@ -246,7 +228,7 @@ def apply_autoround(self, state): if cur_layer_idx >= len(state.model.model.layers): # skip the lm_head layer logger.info( - f">>||>> All decoding layers have been processed for AutoRound." + ">>||>> All decoding layers have been processed for AutoRound." 
) # self.compress_modules(return_directly=False) return @@ -259,6 +241,7 @@ def apply_autoround(self, state): with torch.enable_grad(), align_module_device(decoding_layer): import auto_round + ar = auto_round.AutoRound( model=wrapped_model, tokenizer="", @@ -297,11 +280,11 @@ def apply_autoround(self, state): # TODO: update weight as well update_offload_parameter(module, "weight_scale", weight_scale) decoding_layer.eval() + def post_autoround_cleanup(self): self._all_module_input.clear() self._all_module_output.clear() - def on_end(self, state: State, event: Event, **kwargs): """ Finish calibrating by removing observers and calibration hooks @@ -319,4 +302,4 @@ def on_finalize(self, state: State, **kwargs) -> bool: if not self.ended_: self.on_end(state, None) - return True \ No newline at end of file + return True diff --git a/src/llmcompressor/pipelines/layer_sequential/pipeline.py b/src/llmcompressor/pipelines/layer_sequential/pipeline.py index de3a093799..314cd8439e 100644 --- a/src/llmcompressor/pipelines/layer_sequential/pipeline.py +++ b/src/llmcompressor/pipelines/layer_sequential/pipeline.py @@ -21,9 +21,9 @@ get_sequential_targets, ) from llmcompressor.utils.helpers import ( + DISABLE_QAC_MODIFIERS, DisableQuantization, calibration_forward_context, - DISABLE_QAC_MODIFIERS, ) if TYPE_CHECKING: diff --git a/src/llmcompressor/pipelines/sequential/pipeline.py b/src/llmcompressor/pipelines/sequential/pipeline.py index 1e7df2da53..0f341a37da 100644 --- a/src/llmcompressor/pipelines/sequential/pipeline.py +++ b/src/llmcompressor/pipelines/sequential/pipeline.py @@ -16,9 +16,9 @@ trace_subgraphs, ) from llmcompressor.utils.helpers import ( + DISABLE_QAC_MODIFIERS, DisableQuantization, calibration_forward_context, - DISABLE_QAC_MODIFIERS, ) if TYPE_CHECKING: @@ -79,7 +79,7 @@ def __call__( # TODO: remove this to enable quantization aware calibration for GPTQ and AWQ disable_qac = any( type(mod).__name__ in DISABLE_QAC_MODIFIERS - for mod in session.lifecycle.recipe.modifiers + for mod in session.lifecycle.recipe.modifiers ) with contextlib.ExitStack() as stack: diff --git a/src/llmcompressor/utils/helpers.py b/src/llmcompressor/utils/helpers.py index b1c6c02f0f..0be09bd062 100644 --- a/src/llmcompressor/utils/helpers.py +++ b/src/llmcompressor/utils/helpers.py @@ -67,7 +67,7 @@ "calibration_forward_context", "patch_attr", "disable_hf_kernels", - "DISABLE_QAC_MODIFIERS" + "DISABLE_QAC_MODIFIERS", ] @@ -1084,4 +1084,5 @@ def patch_attr(base: object, attr: str, value: Any): else: delattr(base, attr) -DISABLE_QAC_MODIFIERS = ["GPTQModifier", "AWQModifier", "AutoRoundModifier"] \ No newline at end of file + +DISABLE_QAC_MODIFIERS = ["GPTQModifier", "AWQModifier", "AutoRoundModifier"] From 6cdb402b30d4f815122afc9c9dfdba7fe0cabe95 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 3 Nov 2025 03:10:06 -0800 Subject: [PATCH 15/57] refactor Signed-off-by: yiliu30 --- .../modifiers/quantization/autoround/base.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index 06c4801d6a..e4993173af 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -109,7 +109,8 @@ class AutoRoundModifier(Modifier, QuantizationMixin): - on_start - add input/output capture hooks to decoding layers - on_sequential_epoch_end - - quantize_weight + - apply_autoround + - post_autoround_cleanup - 
on_finalize - remove_hooks() - model.apply(freeze_module_quantization) @@ -184,9 +185,13 @@ def start_calibration(self, model: torch.nn.Module): model.apply(enable_quantization) # quantize at the same time as calibrate def input_capture_hook(self, module, *args, **kwargs): + if module._tmp_name not in self._all_module_input: + self._all_module_input[module._tmp_name] = [] self._all_module_input[module._tmp_name].append((args, kwargs)) def output_capture_hook(self, module, *args, **kwargs): + if module._tmp_name not in self._all_module_output: + self._all_module_output[module._tmp_name] = [] self._all_module_output[module._tmp_name].append((args, kwargs)) def on_start(self, state: State, event: Event, **kwargs): @@ -227,10 +232,7 @@ def apply_autoround(self, state): logger.info(f">>||>> AutoRound for decoding layer index {cur_layer_idx}") if cur_layer_idx >= len(state.model.model.layers): # skip the lm_head layer - logger.info( - ">>||>> All decoding layers have been processed for AutoRound." - ) - # self.compress_modules(return_directly=False) + logger.info(">>||>> All decoding layers have been processed for AutoRound.") return decoding_layer = state.model.model.layers[cur_layer_idx] logger.debug( @@ -270,9 +272,6 @@ def apply_autoround(self, state): if hasattr(module, "weight_scale") and hasattr( module, "weight_zero_point" ): - logger.debug( - f"Updating offload parameters for module {getattr(module, '_tmp_name', '')} || {name}" - ) # Note: The model's weight is already quantized and dequantized in-place by auto-round. weight_scale = module.scale del module.scale From e2814ebc997472a53436c129eb49afd2d4915daf Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 3 Nov 2025 03:14:26 -0800 Subject: [PATCH 16/57] add ut Signed-off-by: yiliu30 --- .../transformers/autoround/test_oneshot.py | 92 +++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 tests/llmcompressor/transformers/autoround/test_oneshot.py diff --git a/tests/llmcompressor/transformers/autoround/test_oneshot.py b/tests/llmcompressor/transformers/autoround/test_oneshot.py new file mode 100644 index 0000000000..7e9adf1a76 --- /dev/null +++ b/tests/llmcompressor/transformers/autoround/test_oneshot.py @@ -0,0 +1,92 @@ +import pytest +import torch +from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme +from transformers import AutoModelForCausalLM, AutoTokenizer +from auto_round.calib_dataset import get_dataset +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization.autoround import AutoRoundModifier + +recipe_str = """ +quant_stage: + quant_modifiers: + AutoRoundModifier: + ignore: ["lm_head"] + iters: 10 + config_groups: + group_0: + targets: + - "Linear" + input_activations: null + output_activations: null + weights: + num_bits: 4 + type: "int" + symmetric: true + strategy: group + group_size: 128 +""" + +recipe_modifier_full = AutoRoundModifier( + ignore=["lm_head"], + config_groups={ + "group_0": QuantizationScheme( + targets=["Linear"], + weights=QuantizationArgs(num_bits=4, strategy="group", group_size=128), + ) + }, +) + + +@pytest.mark.parametrize( + "recipe", + [ + recipe_str, + recipe_modifier_full, + ], +) +def test_oneshot_application(recipe, tmp_path): + output = tmp_path / "oneshot_output" + model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" + tokenizer = AutoTokenizer.from_pretrained(model) + dataset = get_dataset( + tokenizer=tokenizer, + seqlen=1024, + nsamples=32, + ) + + device = "cuda:0" if torch.cuda.is_available() else "cpu" + + oneshot( + model=model, + 
dataset=dataset, + output_dir=output, + recipe=recipe, + ) + model_loaded = AutoModelForCausalLM.from_pretrained(output, device_map=device) + + # Check that the model is quantized + # for compression_config - decompress() will attach a quantization_config + # to the model as we decompress right away + # for quantization_config - we have CompressedLinear which will only + # decompress on the forward pass and does not call decompress(). Results + # in a slightly different parameter tree to access the quant config + quantization_config = model_loaded.config.quantization_config.quantization_config + assert quantization_config is not None + + # check config is set properly + assert "lm_head" in quantization_config.ignore + assert len(quantization_config.config_groups) == 1 + quant_scheme = quantization_config.config_groups["group_0"] + assert isinstance(quant_scheme, QuantizationScheme) + + weight_args = quantization_config.config_groups["group_0"].weights + assert isinstance(weight_args, QuantizationArgs) + assert weight_args.num_bits == 4 + + # Check a specific layer is quantized + targetted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj + assert hasattr(targetted_linear_layer, "quantization_scheme") + + # Check lm-head is not quantized + not_targetted = model_loaded.lm_head + assert not hasattr(not_targetted, "quantization_scheme") From 3e4a9fc5182d9cc7430daa324eccc064a0052217 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 3 Nov 2025 04:36:32 -0800 Subject: [PATCH 17/57] test llama 3 Signed-off-by: yiliu30 --- examples/quantization_w4a16/auto_round_llama3_example.py | 1 + examples/quantization_w4a16/llama3_example.py | 3 ++- src/llmcompressor/modifiers/quantization/autoround/base.py | 5 ----- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/examples/quantization_w4a16/auto_round_llama3_example.py b/examples/quantization_w4a16/auto_round_llama3_example.py index 4cb865c315..0d01643155 100644 --- a/examples/quantization_w4a16/auto_round_llama3_example.py +++ b/examples/quantization_w4a16/auto_round_llama3_example.py @@ -22,6 +22,7 @@ model_dir = "/storage/yiliu7" model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct" +model_name = "meta-llama/Meta-Llama-3-8B-Instruct" # model_name="Qwen/Qwen2.5-0.5B/" model_id = f"{model_dir}/{model_name}" diff --git a/examples/quantization_w4a16/llama3_example.py b/examples/quantization_w4a16/llama3_example.py index 945335de36..32ab8bb843 100644 --- a/examples/quantization_w4a16/llama3_example.py +++ b/examples/quantization_w4a16/llama3_example.py @@ -9,6 +9,7 @@ model_id = "meta-llama/Meta-Llama-3-8B-Instruct" model_dir = "/storage/yiliu7" model_id = f"{model_dir}/meta-llama/Meta-Llama-3.1-8B-Instruct" +model_id = f"{model_dir}/meta-llama/Meta-Llama-3-8B-Instruct" model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(model_id) @@ -75,6 +76,6 @@ def tokenize(sample): print("==========================================\n\n") # Save to disk compressed. 
-SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128" +SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128-GPTQ" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index e4993173af..5d0234510a 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -115,9 +115,6 @@ class AutoRoundModifier(Modifier, QuantizationMixin): - remove_hooks() - model.apply(freeze_module_quantization) - :param sequential_targets: list of layer names to compress during AutoRound, or - '__ALL__' to compress every layer in the model - :param config_groups: dictionary specifying quantization schemes to apply to target modules. Modules not matching a scheme target will NOT be quantized. :param targets: list of layer names to quantize if a scheme is provided. Defaults @@ -132,9 +129,7 @@ class AutoRoundModifier(Modifier, QuantizationMixin): """ # AutoRound modifier arguments - sequential_targets: Union[str, List[str], None] = None iters: Optional[int] = 200 - # TODO: this does not serialize / will be incorrectly written # private variables _module_names: Dict[torch.nn.Module, str] = PrivateAttr(default_factory=dict) From aa34b656421ce3ea37e0d36e858cfc89dcf359f6 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 3 Nov 2025 19:22:06 -0800 Subject: [PATCH 18/57] clean code Signed-off-by: yiliu30 --- .../auto_round_llama3_example.py | 7 ++- .../modifiers/quantization/autoround/base.py | 45 ++++--------------- 2 files changed, 14 insertions(+), 38 deletions(-) diff --git a/examples/quantization_w4a16/auto_round_llama3_example.py b/examples/quantization_w4a16/auto_round_llama3_example.py index 0d01643155..b717540df9 100644 --- a/examples/quantization_w4a16/auto_round_llama3_example.py +++ b/examples/quantization_w4a16/auto_round_llama3_example.py @@ -1,6 +1,7 @@ import os _DEBUG = os.environ.get("DEBUG", "0") == "1" +IS_LLAMA = os.environ.get("MODEL", "LLAMA") == "LLAMA" os.environ["TOKENIZERS_PARALLELISM"] = "false" from transformers import AutoModelForCausalLM, AutoTokenizer @@ -22,8 +23,10 @@ model_dir = "/storage/yiliu7" model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct" -model_name = "meta-llama/Meta-Llama-3-8B-Instruct" -# model_name="Qwen/Qwen2.5-0.5B/" +if IS_LLAMA: + model_name = "meta-llama/Meta-Llama-3-8B-Instruct" +else: + model_name="Qwen/Qwen2.5-0.5B/" model_id = f"{model_dir}/{model_name}" diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index 5d0234510a..fc38961d8f 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -21,32 +21,6 @@ __all__ = ["AutoRoundModifier"] -def normalize_input(cur_inputs): - # TODO: move it to auto-round - input_ids = [] - input_others = {} - positional_inputs = [] - attention_mask = None - position_ids = None - cache_position = None - position_embeddings = (None, None) - for cur_inp in cur_inputs: - input_ids.append(cur_inp[0][0][0]) - for key, val in cur_inp[0][1].items(): - if key == "position_ids": - position_ids = val - elif key == "position_embeddings": - position_embeddings = val - elif key == "cache_position": - cache_position = val - input_others["position_ids"] = position_ids - input_others["positional_inputs"] = positional_inputs - 
input_others["attention_mask"] = attention_mask - input_others["position_embeddings"] = position_embeddings - input_others["cache_position"] = cache_position - return input_ids, input_others - - def _is_decoding_layer(module, name): return "decoderlayer" in module.__class__.__name__.lower() @@ -87,7 +61,7 @@ class AutoRoundModifier(Modifier, QuantizationMixin): | Sample yaml: | test_stage: - | obcq_modifiers: + | modifiers: | AutoRoundModifier: | iters: 200 | config_groups: @@ -156,7 +130,7 @@ def on_initialize(self, state: State, **kwargs) -> bool: m: name for name, m in match_named_modules(state.model, self.targets, self.ignore) } - # add temporary names to all modules for debugging + # add temporary names to all modules for name, mod in state.model.named_modules(): mod._tmp_name = name # freeze all model parameters @@ -249,18 +223,17 @@ def apply_autoround(self, state): ) ar.configure_layer_config() - + first_param = next(decoding_layer.parameters()) + device = first_param.device input_name = f"model.layers.{cur_layer_idx}" cur_inputs = self._all_module_input[input_name] - input_ids, input_others = normalize_input(cur_inputs) - decoding_layer.tuning_device = torch.device("cuda") + decoding_layer.tuning_device = device ar.quantize_block( block=decoding_layer, - input_ids=input_ids, - input_others=input_others, - q_input=None, - device="cuda", + inputs=cur_inputs, + normalize_inputs=True, + device=device, ) # Update offload parameters and remove temporary attributes for name, module in decoding_layer.named_modules(): @@ -289,7 +262,7 @@ def on_end(self, state: State, event: Event, **kwargs): def on_finalize(self, state: State, **kwargs) -> bool: """ - disable the quantization observers used by the OBCQ algorithm + disable the quantization observers used by the AutoRound algorithm :param state: session state storing input model and calibration data """ From afe2ff79a5fd340876534d78611021f4433ae64e Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 3 Nov 2025 20:06:01 -0800 Subject: [PATCH 19/57] parse layer-wise config Signed-off-by: yiliu30 --- .../modifiers/quantization/autoround/base.py | 63 ++++++++++++------- 1 file changed, 40 insertions(+), 23 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index fc38961d8f..06e0fd3c73 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -1,13 +1,16 @@ -from typing import Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Tuple import torch from compressed_tensors.quantization import ( QuantizationConfig, + QuantizationStrategy, + QuantizationScheme, enable_quantization, ) from compressed_tensors.utils import ( align_module_device, match_named_modules, + getattr_chain, update_offload_parameter, ) from loguru import logger @@ -109,7 +112,6 @@ class AutoRoundModifier(Modifier, QuantizationMixin): _module_names: Dict[torch.nn.Module, str] = PrivateAttr(default_factory=dict) _cur_layer_idx = PrivateAttr(default=0) _all_module_input: Dict[str, List[Tuple]] = PrivateAttr(default_factory=dict) - _all_module_output: Dict[str, List[Tuple]] = PrivateAttr(default_factory=dict) def resolve_quantization_config(self) -> QuantizationConfig: config = super().resolve_quantization_config() @@ -158,11 +160,6 @@ def input_capture_hook(self, module, *args, **kwargs): self._all_module_input[module._tmp_name] = [] 
self._all_module_input[module._tmp_name].append((args, kwargs)) - def output_capture_hook(self, module, *args, **kwargs): - if module._tmp_name not in self._all_module_output: - self._all_module_output[module._tmp_name] = [] - self._all_module_output[module._tmp_name].append((args, kwargs)) - def on_start(self, state: State, event: Event, **kwargs): self.started_ = True @@ -172,15 +169,9 @@ def on_start(self, state: State, event: Event, **kwargs): for name, module in state.model.named_modules(): if _is_decoding_layer(module, name): # register input/output capture hooks for decoding layers - logger.warning( - f">> Registering input/output capture hooks for decoding layer {getattr(module, '_tmp_name', '')} || {name}" - ) self.register_hook( module, self.input_capture_hook, "forward_pre", with_kwargs=True ) - self.register_hook( - module, self.output_capture_hook, "forward", with_kwargs=True - ) def on_event(self, state: State, event: Event, **kwargs): if event.type_ == EventType.CALIBRATION_EPOCH_START: @@ -195,33 +186,60 @@ def on_event(self, state: State, event: Event, **kwargs): if not self.ended_: self.on_end(state, None) + def _mapping_config_to_autoround(self): + from auto_round.schemes import QuantizationScheme as ARQuantizationScheme + + resolved_config = self.resolved_config + quant_scheme = None + for scheme in resolved_config.config_groups.values(): + assert isinstance(scheme, QuantizationScheme), f"Expected QuantizationScheme, got {type(scheme)}" + quant_scheme = scheme + weight_args = quant_scheme.weights + # TODO: release below constraint in later PRs + assert weight_args.strategy == QuantizationStrategy.GROUP, ( + "Only group-wise quantization is supported in AutoRoundModifier for now, " + f"got {weight_args.strategy}" + ) + assert quant_scheme.input_activations is None, ( + "Input activation quantization is not supported in AutoRoundModifier, " + f"got {quant_scheme.input_activations}" + ) + assert quant_scheme.output_activations is None, ( + "Output activation quantization is not supported in AutoRoundModifier, " + f"got {quant_scheme.output_activations}" + ) + ar_quant_scheme = ARQuantizationScheme( + bits=weight_args.num_bits, + sym=weight_args.symmetric, + group_size=weight_args.group_size, + data_type=weight_args.type, + act_bits=16, + ) + return ar_quant_scheme + def apply_autoround(self, state): cur_layer_idx = self._cur_layer_idx self._cur_layer_idx += 1 logger.info(f">>||>> AutoRound for decoding layer index {cur_layer_idx}") if cur_layer_idx >= len(state.model.model.layers): # skip the lm_head layer - logger.info(">>||>> All decoding layers have been processed for AutoRound.") return decoding_layer = state.model.model.layers[cur_layer_idx] - logger.debug( - f">>||>> Strating AutoRound for decoding layer {getattr(decoding_layer, '_tmp_name', '')}" - ) wrapped_model = _wrap_decoding_layer(decoding_layer) with torch.enable_grad(), align_module_device(decoding_layer): import auto_round - + parsed_scheme = self._mapping_config_to_autoround() ar = auto_round.AutoRound( model=wrapped_model, tokenizer="", - scheme="W4A16", + scheme=parsed_scheme, iters=self.iters, enable_quanted_input=False, enable_torch_compile=True, ) - + # TODO: configure layer-wise config based on self.resolved_config ar.configure_layer_config() first_param = next(decoding_layer.parameters()) device = first_param.device @@ -236,11 +254,11 @@ def apply_autoround(self, state): device=device, ) # Update offload parameters and remove temporary attributes - for name, module in decoding_layer.named_modules(): 
+ for _, module in decoding_layer.named_modules(): if hasattr(module, "weight_scale") and hasattr( module, "weight_zero_point" ): - # Note: The model's weight is already quantized and dequantized in-place by auto-round. + # Note: The model's weight is already q-dq in-place by auto-round. weight_scale = module.scale del module.scale del module.zp @@ -250,7 +268,6 @@ def apply_autoround(self, state): def post_autoround_cleanup(self): self._all_module_input.clear() - self._all_module_output.clear() def on_end(self, state: State, event: Event, **kwargs): """ From 8e9eccc141f9e0fd8dfedcada45fb309c204eca4 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 3 Nov 2025 20:06:35 -0800 Subject: [PATCH 20/57] format Signed-off-by: yiliu30 --- examples/quantization_w4a16/auto_round_llama3_example.py | 2 +- .../modifiers/quantization/autoround/base.py | 8 +++++--- .../llmcompressor/transformers/autoround/test_oneshot.py | 3 ++- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/examples/quantization_w4a16/auto_round_llama3_example.py b/examples/quantization_w4a16/auto_round_llama3_example.py index b717540df9..d8d72c8f60 100644 --- a/examples/quantization_w4a16/auto_round_llama3_example.py +++ b/examples/quantization_w4a16/auto_round_llama3_example.py @@ -26,7 +26,7 @@ if IS_LLAMA: model_name = "meta-llama/Meta-Llama-3-8B-Instruct" else: - model_name="Qwen/Qwen2.5-0.5B/" + model_name = "Qwen/Qwen2.5-0.5B/" model_id = f"{model_dir}/{model_name}" diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index 06e0fd3c73..a366d803ff 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -3,14 +3,13 @@ import torch from compressed_tensors.quantization import ( QuantizationConfig, - QuantizationStrategy, QuantizationScheme, + QuantizationStrategy, enable_quantization, ) from compressed_tensors.utils import ( align_module_device, match_named_modules, - getattr_chain, update_offload_parameter, ) from loguru import logger @@ -192,7 +191,9 @@ def _mapping_config_to_autoround(self): resolved_config = self.resolved_config quant_scheme = None for scheme in resolved_config.config_groups.values(): - assert isinstance(scheme, QuantizationScheme), f"Expected QuantizationScheme, got {type(scheme)}" + assert isinstance( + scheme, QuantizationScheme + ), f"Expected QuantizationScheme, got {type(scheme)}" quant_scheme = scheme weight_args = quant_scheme.weights # TODO: release below constraint in later PRs @@ -230,6 +231,7 @@ def apply_autoround(self, state): with torch.enable_grad(), align_module_device(decoding_layer): import auto_round + parsed_scheme = self._mapping_config_to_autoround() ar = auto_round.AutoRound( model=wrapped_model, diff --git a/tests/llmcompressor/transformers/autoround/test_oneshot.py b/tests/llmcompressor/transformers/autoround/test_oneshot.py index 7e9adf1a76..f85ae283b0 100644 --- a/tests/llmcompressor/transformers/autoround/test_oneshot.py +++ b/tests/llmcompressor/transformers/autoround/test_oneshot.py @@ -1,8 +1,9 @@ import pytest import torch +from auto_round.calib_dataset import get_dataset from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme from transformers import AutoModelForCausalLM, AutoTokenizer -from auto_round.calib_dataset import get_dataset + from llmcompressor import oneshot from llmcompressor.modifiers.quantization.autoround import AutoRoundModifier From 
81f76affc8848f8778e702e73aad0609193473a5 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 3 Nov 2025 20:16:32 -0800 Subject: [PATCH 21/57] add docstring Signed-off-by: yiliu30 --- .../modifiers/quantization/autoround/base.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index a366d803ff..626ec920ee 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -219,6 +219,20 @@ def _mapping_config_to_autoround(self): return ar_quant_scheme def apply_autoround(self, state): + """Applies AutoRound quantization tuning on the current decoding layer. + + The tuning logic is below: + for iter in range(iters): + quant_output = forward(layer, cached_inputs) + loss = mse_loss(quant_output, original_output) + loss.backward() + optimizer.step() + if loss < best_loss: + best_params = save_params(layer) + For more details, please refer to the AutoRound repository: + https://github.com/intel/auto-round/ + """ + cur_layer_idx = self._cur_layer_idx self._cur_layer_idx += 1 logger.info(f">>||>> AutoRound for decoding layer index {cur_layer_idx}") From afa6150a235f8a25d9b1a0207b3387363cfa37e5 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 3 Nov 2025 21:11:29 -0800 Subject: [PATCH 22/57] add ar Signed-off-by: yiliu30 --- setup.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/setup.py b/setup.py index 3b68083a61..6fca6e710a 100644 --- a/setup.py +++ b/setup.py @@ -144,6 +144,10 @@ def localversion_func(version: ScmVersion) -> str: if BUILD_TYPE == "release" else "compressed-tensors>=0.12.3a2" ), + # TODO: replace it with the release version + ( + "auto_round @ git+https://github.com/intel/auto-round.git@llmc" + ), ], extras_require={ "dev": [ From 97217e78afc639f824628a915613a698f131aed4 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 3 Nov 2025 21:57:23 -0800 Subject: [PATCH 23/57] update example Signed-off-by: yiliu30 --- .../auto_round_llama3_example.py | 80 ++----------------- 1 file changed, 8 insertions(+), 72 deletions(-) diff --git a/examples/quantization_w4a16/auto_round_llama3_example.py b/examples/quantization_w4a16/auto_round_llama3_example.py index d8d72c8f60..0e598ba9bf 100644 --- a/examples/quantization_w4a16/auto_round_llama3_example.py +++ b/examples/quantization_w4a16/auto_round_llama3_example.py @@ -1,76 +1,18 @@ -import os - -_DEBUG = os.environ.get("DEBUG", "0") == "1" -IS_LLAMA = os.environ.get("MODEL", "LLAMA") == "LLAMA" -os.environ["TOKENIZERS_PARALLELISM"] = "false" from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot from llmcompressor.modifiers.quantization import AutoRoundModifier - -# from llmcompressor.modifiers.quantization import QuantizationModifier as AutoRoundModifier from llmcompressor.utils import dispatch_for_generation # Select model and load it. 
model_id = "meta-llama/Meta-Llama-3-8B-Instruct" -model_id = "/data5/yliu7/HF_HOME/meta-llama/Llama-3.2-1B-Instruct" -model_id = "Qwen/Qwen2.5-0.5B" -model_id = "/data5/yliu7/HF_HOME/Qwen/Qwen2.5-0.5B" -model_id = "/data5/yliu7/meta-llama/meta-llama/Meta-Llama-3.1-8B-Instruct" - -model_dir = "/storage/yiliu7" -# model_id=f"{model_dir}/meta-llama/Meta-Llama-3.1-8B-Instruct" - -model_dir = "/storage/yiliu7" -model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct" -if IS_LLAMA: - model_name = "meta-llama/Meta-Llama-3-8B-Instruct" -else: - model_name = "Qwen/Qwen2.5-0.5B/" - -model_id = f"{model_dir}/{model_name}" - - -# model_id = "facebook/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(model_id) -if _DEBUG: - import torch - from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer - from transformers.models.llama.modeling_llama import LlamaForCausalLM - from transformers.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM - - config = AutoConfig.from_pretrained(model_id) - config.num_hidden_layers = 2 # Use a smaller model for testing - # Fix configuration validation issues - # config.layer_types = config.layer_types[: config.num_hidden_layers] - - # Load the tokenizer and model - if "Qwen" in model_id: - tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - model = Qwen2ForCausalLM(config) - else: - tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - model = LlamaForCausalLM(config) - model.to(torch.bfloat16) - NUM_CALIBRATION_SAMPLES = 3 - MAX_SEQUENCE_LENGTH = 16 - iters = 4 - -else: - # Select number of samples. 512 samples is a good place to start. - # Increasing the number of samples can improve accuracy. - light = {"batch_size": 8, "iters": 50, "seqlen": 2048, "nsamples": 128, "lr": 5e-3} - light = {"batch_size": 8, "iters": 200, "seqlen": 2048, "nsamples": 128, "lr": 5e-3} - - light = {"batch_size": 8, "iters": 200, "seqlen": 2048, "nsamples": 128, "lr": None} - # light = {"batch_size": 8, "iters": 200, "seqlen": 2048, "nsamples": 32, "lr": None} - NUM_CALIBRATION_SAMPLES = light["nsamples"] - MAX_SEQUENCE_LENGTH = light["seqlen"] - iters = light["iters"] - # Select calibration dataset. +NUM_CALIBRATION_SAMPLES = 128 +MAX_SEQUENCE_LENGTH = 2048 +# Get aligned calibration dataset. from auto_round.calib_dataset import get_dataset ds = get_dataset( @@ -79,10 +21,11 @@ nsamples=NUM_CALIBRATION_SAMPLES, ) + # Configure the quantization algorithm to run. # * quantize the weights to 4 bit with AutoRound with a group size 128 recipe = AutoRoundModifier( - targets="Linear", scheme="W4A16", ignore=["lm_head"], iters=iters + targets="Linear", scheme="W4A16", ignore=["lm_head"], iters=200 ) @@ -93,8 +36,7 @@ recipe=recipe, max_seq_length=MAX_SEQUENCE_LENGTH, num_calibration_samples=NUM_CALIBRATION_SAMPLES, - # !!! shuffle_calibration_samples: True -> mmlu 0.6574 - # !!! 
shuffle_calibration_samples: False -> mmlu 0.66 + # disbable shuffling to get slightly better mmlu score shuffle_calibration_samples=False, ) @@ -102,19 +44,13 @@ print("\n\n") print("========== SAMPLE GENERATION ==============") dispatch_for_generation(model) -sample = tokenizer("Explain AI in ", return_tensors="pt") +sample = tokenizer("Hello my name is", return_tensors="pt") sample = {key: value.to(model.device) for key, value in sample.items()} output = model.generate(**sample, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") # Save to disk compressed. -SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128" -SAVE_DIR = ( - f"{model_dir}/" - + model_id.rstrip("/").split("/")[-1] - + "-W4A16-G128-disbale-shuffule-ar" -) -print(f"Saving quantized model to {SAVE_DIR}") +SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128-AutoRound" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) From 3dcb434cc68d556cc7cbdbab4039f9f42a24c851 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 4 Nov 2025 16:42:21 -0800 Subject: [PATCH 24/57] align api Signed-off-by: yiliu30 --- .../modifiers/quantization/autoround/base.py | 30 ++++++++++++------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index 626ec920ee..0bf36f8d75 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -83,7 +83,7 @@ class AutoRoundModifier(Modifier, QuantizationMixin): - on_initialize - apply config to model - on_start - - add input/output capture hooks to decoding layers + - add input capture hooks to decoding layers - on_sequential_epoch_end - apply_autoround - post_autoround_cleanup @@ -106,6 +106,7 @@ class AutoRoundModifier(Modifier, QuantizationMixin): # AutoRound modifier arguments iters: Optional[int] = 200 + enable_torch_compile: Optional[bool] = True # private variables _module_names: Dict[torch.nn.Module, str] = PrivateAttr(default_factory=dict) @@ -219,23 +220,28 @@ def _mapping_config_to_autoround(self): return ar_quant_scheme def apply_autoround(self, state): - """Applies AutoRound quantization tuning on the current decoding layer. + """ + Applies AutoRound quantization tuning on the current decoding layer. - The tuning logic is below: + The tuning logic is as follows: for iter in range(iters): - quant_output = forward(layer, cached_inputs) - loss = mse_loss(quant_output, original_output) - loss.backward() - optimizer.step() - if loss < best_loss: + quant_output = forward(layer, cached_inputs) + loss = mse_loss(quant_output, original_output) + loss.backward() + optimizer.step() + if loss < best_loss: best_params = save_params(layer) + + This method retrieves the current decoding layer, wraps it for compatibility with + AutoRound, and performs iterative optimization to minimize the quantization error. + The best parameters are tracked and applied to the layer after tuning. 
+ For more details, please refer to the AutoRound repository: https://github.com/intel/auto-round/ """ - cur_layer_idx = self._cur_layer_idx + logger.info("Applying AutoRound to layer index: {}", cur_layer_idx) self._cur_layer_idx += 1 - logger.info(f">>||>> AutoRound for decoding layer index {cur_layer_idx}") if cur_layer_idx >= len(state.model.model.layers): # skip the lm_head layer return @@ -253,7 +259,7 @@ def apply_autoround(self, state): scheme=parsed_scheme, iters=self.iters, enable_quanted_input=False, - enable_torch_compile=True, + enable_torch_compile=self.enable_torch_compile, ) # TODO: configure layer-wise config based on self.resolved_config ar.configure_layer_config() @@ -268,6 +274,8 @@ def apply_autoround(self, state): inputs=cur_inputs, normalize_inputs=True, device=device, + # Leave offload for LLMC + auto_offload=False, ) # Update offload parameters and remove temporary attributes for _, module in decoding_layer.named_modules(): From aef77072f5a4a3e216cf9b8c3d291a3e6ea82739 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 4 Nov 2025 16:42:49 -0800 Subject: [PATCH 25/57] format Signed-off-by: yiliu30 --- setup.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 6fca6e710a..6211ea90d8 100644 --- a/setup.py +++ b/setup.py @@ -145,9 +145,7 @@ def localversion_func(version: ScmVersion) -> str: else "compressed-tensors>=0.12.3a2" ), # TODO: replace it with the release version - ( - "auto_round @ git+https://github.com/intel/auto-round.git@llmc" - ), + ("auto_round @ git+https://github.com/intel/auto-round.git@llmc"), ], extras_require={ "dev": [ From 97e1ca26cfa22a77cc0835a0aaf97e382104717a Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 4 Nov 2025 16:57:11 -0800 Subject: [PATCH 26/57] clean code Signed-off-by: yiliu30 --- .../modifiers/quantization/autoround/base.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index 0bf36f8d75..2c9a011728 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -58,8 +58,8 @@ def _wrap_decoding_layer(layer: torch.nn.Module) -> _PretrainModelWrapper: class AutoRoundModifier(Modifier, QuantizationMixin): """ Implements the AutoRound algorithm from https://arxiv.org/pdf/2309.05516. This modifier - leverages signed gradient descent (SignSGD) and block-wise loss to optimize rounding values - and weight clipping in a few steps. + leverages signed gradient descent (SignSGD) optimizer and block-wise loss to optimize + rounding values and weight clipping in a few steps. | Sample yaml: | test_stage: @@ -99,9 +99,7 @@ class AutoRoundModifier(Modifier, QuantizationMixin): quantize even if they match a target in config_groups. Defaults to empty list. :param scheme: a single quantization scheme to apply to the model. This is a dictionary that supports all keys from QuantizationScheme except targets, which - will be set to the targets parameter set at the modifier level. Can also be set - to a dictionary of the format `preset_scheme_name: targets` for example: - `W8A8: ['Linear']` for weight and activation 8-bit. + will be set to the targets parameter set at the modifier level. 
""" # AutoRound modifier arguments @@ -149,7 +147,6 @@ def start_calibration(self, model: torch.nn.Module): for _, module in match_named_modules(model, self.targets, self.ignore): # Note: No need to register observers for auto-round - # self._initialize_observers(module) self._calibration_hooks |= self._initialize_hooks(module) apply_calibration_status(module) @@ -168,7 +165,7 @@ def on_start(self, state: State, event: Event, **kwargs): self.start_calibration(state.model) for name, module in state.model.named_modules(): if _is_decoding_layer(module, name): - # register input/output capture hooks for decoding layers + # register input capture hooks for decoding layers self.register_hook( module, self.input_capture_hook, "forward_pre", with_kwargs=True ) @@ -299,7 +296,7 @@ def on_end(self, state: State, event: Event, **kwargs): """ self.ended_ = True QuantizationMixin.end_calibration(self, state.model) - self.remove_hooks() # remove gptq hooks + self.remove_hooks() def on_finalize(self, state: State, **kwargs) -> bool: """ From c75c272e150e1a37de29b0846ad3a7486ca67904 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 4 Nov 2025 16:59:20 -0800 Subject: [PATCH 27/57] fix typo Signed-off-by: yiliu30 --- examples/quantization_w4a16/auto_round_llama3_example.py | 2 +- examples/quantization_w4a16/llama3_example.py | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/examples/quantization_w4a16/auto_round_llama3_example.py b/examples/quantization_w4a16/auto_round_llama3_example.py index 0e598ba9bf..d6d2003e6e 100644 --- a/examples/quantization_w4a16/auto_round_llama3_example.py +++ b/examples/quantization_w4a16/auto_round_llama3_example.py @@ -36,7 +36,7 @@ recipe=recipe, max_seq_length=MAX_SEQUENCE_LENGTH, num_calibration_samples=NUM_CALIBRATION_SAMPLES, - # disbable shuffling to get slightly better mmlu score + # disable shuffling to get slightly better mmlu score shuffle_calibration_samples=False, ) diff --git a/examples/quantization_w4a16/llama3_example.py b/examples/quantization_w4a16/llama3_example.py index 32ab8bb843..b03aacee35 100644 --- a/examples/quantization_w4a16/llama3_example.py +++ b/examples/quantization_w4a16/llama3_example.py @@ -7,9 +7,6 @@ # Select model and load it. model_id = "meta-llama/Meta-Llama-3-8B-Instruct" -model_dir = "/storage/yiliu7" -model_id = f"{model_dir}/meta-llama/Meta-Llama-3.1-8B-Instruct" -model_id = f"{model_dir}/meta-llama/Meta-Llama-3-8B-Instruct" model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(model_id) @@ -76,6 +73,6 @@ def tokenize(sample): print("==========================================\n\n") # Save to disk compressed. 
-SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128-GPTQ" +SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) From 3d8a0c83d74334a989c43824a27adcc8de330e5c Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 4 Nov 2025 17:02:55 -0800 Subject: [PATCH 28/57] small iters for ut Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/quantization/autoround/base.py | 2 +- tests/llmcompressor/transformers/autoround/test_oneshot.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index 2c9a011728..0f9a2894ff 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -165,7 +165,7 @@ def on_start(self, state: State, event: Event, **kwargs): self.start_calibration(state.model) for name, module in state.model.named_modules(): if _is_decoding_layer(module, name): - # register input capture hooks for decoding layers + # register input capture hook for decoding layers self.register_hook( module, self.input_capture_hook, "forward_pre", with_kwargs=True ) diff --git a/tests/llmcompressor/transformers/autoround/test_oneshot.py b/tests/llmcompressor/transformers/autoround/test_oneshot.py index f85ae283b0..f1618d2753 100644 --- a/tests/llmcompressor/transformers/autoround/test_oneshot.py +++ b/tests/llmcompressor/transformers/autoround/test_oneshot.py @@ -29,6 +29,7 @@ recipe_modifier_full = AutoRoundModifier( ignore=["lm_head"], + iters=10, config_groups={ "group_0": QuantizationScheme( targets=["Linear"], From 6729a75648493851ed88efc7b8c905a3817c525d Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 4 Nov 2025 17:12:32 -0800 Subject: [PATCH 29/57] format Signed-off-by: yiliu30 --- .../quantization_w4a16/auto_round_llama3_example.py | 2 +- .../modifiers/quantization/autoround/base.py | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/examples/quantization_w4a16/auto_round_llama3_example.py b/examples/quantization_w4a16/auto_round_llama3_example.py index d6d2003e6e..2c97ee7794 100644 --- a/examples/quantization_w4a16/auto_round_llama3_example.py +++ b/examples/quantization_w4a16/auto_round_llama3_example.py @@ -1,3 +1,4 @@ +from auto_round.calib_dataset import get_dataset from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot @@ -13,7 +14,6 @@ NUM_CALIBRATION_SAMPLES = 128 MAX_SEQUENCE_LENGTH = 2048 # Get aligned calibration dataset. -from auto_round.calib_dataset import get_dataset ds = get_dataset( tokenizer=tokenizer, diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index 0f9a2894ff..075bc7942b 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -57,9 +57,9 @@ def _wrap_decoding_layer(layer: torch.nn.Module) -> _PretrainModelWrapper: class AutoRoundModifier(Modifier, QuantizationMixin): """ - Implements the AutoRound algorithm from https://arxiv.org/pdf/2309.05516. This modifier - leverages signed gradient descent (SignSGD) optimizer and block-wise loss to optimize - rounding values and weight clipping in a few steps. + Implements the AutoRound algorithm from https://arxiv.org/pdf/2309.05516. 
+    This modifier leverages signed gradient descent (SignSGD) optimizer and
+    block-wise loss to optimize rounding values and weight clipping in a few steps.
 
     | Sample yaml:
     | test_stage:

From bb4dbe86bb914037ef7757e5f9637d793566d874 Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Tue, 4 Nov 2025 17:18:08 -0800
Subject: [PATCH 30/57] refine comment

Signed-off-by: yiliu30
---
 src/llmcompressor/modifiers/quantization/autoround/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py
index 075bc7942b..23d9cb8586 100644
--- a/src/llmcompressor/modifiers/quantization/autoround/base.py
+++ b/src/llmcompressor/modifiers/quantization/autoround/base.py
@@ -284,7 +284,7 @@ def apply_autoround(self, state):
                 weight_scale = module.scale
                 del module.scale
                 del module.zp
-                # TODO: update zero_point as well if needed
+                # TODO: update zero_point after supporting asymmetric quantization
                 update_offload_parameter(module, "weight_scale", weight_scale)
         decoding_layer.eval()

From 2adf0e77c094a3c23d69282eab3a6e96356f1414 Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Tue, 4 Nov 2025 19:33:49 -0800
Subject: [PATCH 31/57] replace paper link

Signed-off-by: yiliu30
---
 src/llmcompressor/modifiers/quantization/autoround/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py
index 23d9cb8586..744453b299 100644
--- a/src/llmcompressor/modifiers/quantization/autoround/base.py
+++ b/src/llmcompressor/modifiers/quantization/autoround/base.py
@@ -57,7 +57,7 @@ def _wrap_decoding_layer(layer: torch.nn.Module) -> _PretrainModelWrapper:
 class AutoRoundModifier(Modifier, QuantizationMixin):
     """
-    Implements the AutoRound algorithm from https://arxiv.org/pdf/2309.05516.
+    Implements the AutoRound algorithm from https://aclanthology.org/2024.findings-emnlp.662.pdf.
     This modifier leverages signed gradient descent (SignSGD) optimizer and
     block-wise loss to optimize rounding values and weight clipping in a few steps.
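[Editorial note on the two patches above] The class docstring only names the ingredients (SignSGD, block-wise loss, learnable rounding and clipping). A minimal, self-contained PyTorch sketch of that idea is given below for orientation; it is not code from auto-round or llm-compressor, the helper names (ste_round, qdq_weight, tune_rounding) and all hyperparameters are invented for this illustration, and a single Linear layer stands in for the decoder block that AutoRoundModifier actually tunes.

import torch


def ste_round(x: torch.Tensor) -> torch.Tensor:
    # Straight-through estimator: round on the forward pass, identity gradient backward.
    return (x.round() - x).detach() + x


def qdq_weight(weight, v, group_size=128, num_bits=4):
    # Symmetric per-group quantize -> dequantize with a learnable rounding offset `v`.
    qmax = 2 ** (num_bits - 1) - 1
    w = weight.reshape(weight.shape[0], -1, group_size)
    scale = w.abs().amax(dim=-1, keepdim=True).clamp_min(1e-8) / qmax
    q = ste_round(w / scale + v.reshape_as(w)).clamp(-qmax - 1, qmax)
    return (q * scale).reshape_as(weight)


def tune_rounding(layer: torch.nn.Linear, calib_x: torch.Tensor, iters=200, lr=5e-3):
    # Freeze the layer and cache its unquantized output, then optimize the rounding
    # offsets so the quantized forward pass reproduces it (block-wise MSE loss).
    for p in layer.parameters():
        p.requires_grad_(False)
    with torch.no_grad():
        target = layer(calib_x)
    v = torch.zeros_like(layer.weight, requires_grad=True)
    best_loss, best_v = float("inf"), v.detach().clone()
    for _ in range(iters):
        qw = qdq_weight(layer.weight, v)
        out = torch.nn.functional.linear(calib_x, qw, layer.bias)
        loss = torch.nn.functional.mse_loss(out, target)
        if loss.item() < best_loss:
            # Track the best rounding offsets seen so far, as in the docstring pseudocode.
            best_loss, best_v = loss.item(), v.detach().clone()
        loss.backward()
        with torch.no_grad():
            v -= lr * v.grad.sign()  # signed-gradient (SignSGD-style) update
            v.clamp_(-0.5, 0.5)
            v.grad = None
    with torch.no_grad():
        # Bake the tuned rounding into the weight (quantize-dequantize in place).
        layer.weight.copy_(qdq_weight(layer.weight, best_v))


layer = torch.nn.Linear(256, 128)
tune_rounding(layer, torch.randn(32, 256), iters=20)

In the modifier itself the same kind of loop runs once per decoder block, driven by the inputs cached through input_capture_hook, and auto_round also learns the weight-clipping range rather than only the rounding offset.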
From dd9bde9b75f621c78193430c0d55396aa2398f3c Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 4 Nov 2025 21:53:59 -0800 Subject: [PATCH 32/57] correct comments Signed-off-by: yiliu30 --- src/llmcompressor/pipelines/layer_sequential/pipeline.py | 3 ++- src/llmcompressor/pipelines/sequential/pipeline.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/llmcompressor/pipelines/layer_sequential/pipeline.py b/src/llmcompressor/pipelines/layer_sequential/pipeline.py index 314cd8439e..b8fbe32a3f 100644 --- a/src/llmcompressor/pipelines/layer_sequential/pipeline.py +++ b/src/llmcompressor/pipelines/layer_sequential/pipeline.py @@ -74,7 +74,8 @@ def __call__( LifecycleCallbacks.calibration_epoch_start() - # TODO: remove this to enable quantization aware calibration for GPTQ and AWQ + # TODO: remove this to enable quantization aware calibration + # for GPTQ, AWQ and AutoRound. disable_qac = any( type(mod).__name__ in DISABLE_QAC_MODIFIERS for mod in session.lifecycle.recipe.modifiers diff --git a/src/llmcompressor/pipelines/sequential/pipeline.py b/src/llmcompressor/pipelines/sequential/pipeline.py index 0f341a37da..de40ab1f83 100644 --- a/src/llmcompressor/pipelines/sequential/pipeline.py +++ b/src/llmcompressor/pipelines/sequential/pipeline.py @@ -76,7 +76,8 @@ def __call__( LifecycleCallbacks.calibration_epoch_start() - # TODO: remove this to enable quantization aware calibration for GPTQ and AWQ + # TODO: remove this to enable quantization aware calibration + # for GPTQ, AWQ and AutoRound. disable_qac = any( type(mod).__name__ in DISABLE_QAC_MODIFIERS for mod in session.lifecycle.recipe.modifiers From 7d972558cc5c10acde7a24b163289719378f0a57 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 5 Nov 2025 03:09:32 -0500 Subject: [PATCH 33/57] update comments Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/quantization/autoround/base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index 744453b299..8b517b2c70 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -283,8 +283,7 @@ def apply_autoround(self, state): # Note: The model's weight is already q-dq in-place by auto-round. 
weight_scale = module.scale del module.scale - del module.zp - # TODO: update zero_point as well if needed + # TODO: update zero_point after supporting asymmetric quantization update_offload_parameter(module, "weight_scale", weight_scale) decoding_layer.eval() From f298e82bbc41c7145f2ba373d02d2356c04f82c5 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 5 Nov 2025 03:12:35 -0500 Subject: [PATCH 34/57] refine code Signed-off-by: yiliu30 --- .../modifiers/quantization/autoround/base.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index 8b517b2c70..d6444a9c9f 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -115,6 +115,15 @@ def resolve_quantization_config(self) -> QuantizationConfig: config = super().resolve_quantization_config() return config + def _add_temporary_names(self, model: torch.nn.Module): + for name, mod in model.named_modules(): + mod._tmp_name = name + + def _remove_temporary_names(self, model: torch.nn.Module): + for _, mod in model.named_modules(): + if hasattr(mod, "_tmp_name"): + del mod._tmp_name + def on_initialize(self, state: State, **kwargs) -> bool: """ Initialize and run the AutoRound algorithm on the current state @@ -130,9 +139,7 @@ def on_initialize(self, state: State, **kwargs) -> bool: m: name for name, m in match_named_modules(state.model, self.targets, self.ignore) } - # add temporary names to all modules - for name, mod in state.model.named_modules(): - mod._tmp_name = name + self._add_temporary_names(state.model) # freeze all model parameters for name, param in state.model.named_parameters(): param.requires_grad_(False) @@ -296,6 +303,7 @@ def on_end(self, state: State, event: Event, **kwargs): """ self.ended_ = True QuantizationMixin.end_calibration(self, state.model) + self._remove_temporary_names(state.model) self.remove_hooks() def on_finalize(self, state: State, **kwargs) -> bool: From 73c357135e6e8d6d068f41be9e8ac9b281d63b99 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 5 Nov 2025 03:14:48 -0500 Subject: [PATCH 35/57] add more checks Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/quantization/autoround/base.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index d6444a9c9f..6581747eff 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -195,13 +195,18 @@ def _mapping_config_to_autoround(self): resolved_config = self.resolved_config quant_scheme = None + # TODO: release below constraint in later PRs + assert len(resolved_config.config_groups) == 1, ( + "AutoRoundModifier only supports one quantization scheme for now, " + f"got {len(resolved_config.config_groups)}" + ) + for scheme in resolved_config.config_groups.values(): assert isinstance( scheme, QuantizationScheme ), f"Expected QuantizationScheme, got {type(scheme)}" quant_scheme = scheme weight_args = quant_scheme.weights - # TODO: release below constraint in later PRs assert weight_args.strategy == QuantizationStrategy.GROUP, ( "Only group-wise quantization is supported in AutoRoundModifier for now, " f"got {weight_args.strategy}" From eb1639782d9b43a8d990418fcb35335c99f335c1 Mon Sep 17 00:00:00 2001 From: 
yiliu30 Date: Wed, 5 Nov 2025 23:23:07 -0800 Subject: [PATCH 36/57] update example Signed-off-by: yiliu30 --- .../auto_round_llama3_example.py => autoround/llama3_example.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/{quantization_w4a16/auto_round_llama3_example.py => autoround/llama3_example.py} (100%) diff --git a/examples/quantization_w4a16/auto_round_llama3_example.py b/examples/autoround/llama3_example.py similarity index 100% rename from examples/quantization_w4a16/auto_round_llama3_example.py rename to examples/autoround/llama3_example.py From 9cb1f062f4838c1f2e0965e67895696dc78cc97a Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 5 Nov 2025 23:25:00 -0800 Subject: [PATCH 37/57] move auto-round to modifier Signed-off-by: yiliu30 --- examples/autoround/llama3_example.py | 2 +- src/llmcompressor/modifiers/__init__.py | 2 ++ .../modifiers/{quantization => }/autoround/__init__.py | 0 .../modifiers/{quantization => }/autoround/base.py | 0 src/llmcompressor/modifiers/quantization/__init__.py | 1 - tests/llmcompressor/transformers/autoround/test_oneshot.py | 2 +- 6 files changed, 4 insertions(+), 3 deletions(-) rename src/llmcompressor/modifiers/{quantization => }/autoround/__init__.py (100%) rename src/llmcompressor/modifiers/{quantization => }/autoround/base.py (100%) diff --git a/examples/autoround/llama3_example.py b/examples/autoround/llama3_example.py index 2c97ee7794..e968066510 100644 --- a/examples/autoround/llama3_example.py +++ b/examples/autoround/llama3_example.py @@ -2,7 +2,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import AutoRoundModifier +from llmcompressor.modifiers import AutoRoundModifier from llmcompressor.utils import dispatch_for_generation # Select model and load it. diff --git a/src/llmcompressor/modifiers/__init__.py b/src/llmcompressor/modifiers/__init__.py index 65cd78b983..ec30c4174a 100644 --- a/src/llmcompressor/modifiers/__init__.py +++ b/src/llmcompressor/modifiers/__init__.py @@ -7,6 +7,7 @@ extensible compression workflows. 
""" +from .autoround import AutoRoundModifier from .factory import ModifierFactory from .interface import ModifierInterface from .modifier import Modifier @@ -15,4 +16,5 @@ "ModifierFactory", "ModifierInterface", "Modifier", + "AutoRoundModifier", ] diff --git a/src/llmcompressor/modifiers/quantization/autoround/__init__.py b/src/llmcompressor/modifiers/autoround/__init__.py similarity index 100% rename from src/llmcompressor/modifiers/quantization/autoround/__init__.py rename to src/llmcompressor/modifiers/autoround/__init__.py diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py similarity index 100% rename from src/llmcompressor/modifiers/quantization/autoround/base.py rename to src/llmcompressor/modifiers/autoround/base.py diff --git a/src/llmcompressor/modifiers/quantization/__init__.py b/src/llmcompressor/modifiers/quantization/__init__.py index 2c10fe4b97..1ca6912221 100644 --- a/src/llmcompressor/modifiers/quantization/__init__.py +++ b/src/llmcompressor/modifiers/quantization/__init__.py @@ -2,4 +2,3 @@ from .gptq import * from .quantization import * -from .autoround import * diff --git a/tests/llmcompressor/transformers/autoround/test_oneshot.py b/tests/llmcompressor/transformers/autoround/test_oneshot.py index f1618d2753..d973398d90 100644 --- a/tests/llmcompressor/transformers/autoround/test_oneshot.py +++ b/tests/llmcompressor/transformers/autoround/test_oneshot.py @@ -5,7 +5,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot -from llmcompressor.modifiers.quantization.autoround import AutoRoundModifier +from llmcompressor.modifiers import AutoRoundModifier recipe_str = """ quant_stage: From 76e0d21926c4d2b736991cf642640f85e450aa28 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 5 Nov 2025 23:34:12 -0800 Subject: [PATCH 38/57] apply untie Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/autoround/base.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index 6581747eff..ad9434680d 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -19,6 +19,9 @@ from llmcompressor.modifiers import Modifier from llmcompressor.modifiers.quantization.calibration import apply_calibration_status from llmcompressor.modifiers.quantization.quantization import QuantizationMixin +from llmcompressor.transformers.compression.compressed_tensors_utils import ( + untie_if_target_shared_embedding, +) __all__ = ["AutoRoundModifier"] @@ -151,6 +154,7 @@ def start_calibration(self, model: torch.nn.Module): :param model: model to prepare for calibration """ + untie_if_target_shared_embedding(model, self._module_names.values()) for _, module in match_named_modules(model, self.targets, self.ignore): # Note: No need to register observers for auto-round From 1cbe919ff772c684616d85f55307d269c85a7d9e Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 5 Nov 2025 23:36:42 -0800 Subject: [PATCH 39/57] correct docstring Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/autoround/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index ad9434680d..4128627d04 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -129,7 +129,7 @@ def _remove_temporary_names(self, 
model: torch.nn.Module): def on_initialize(self, state: State, **kwargs) -> bool: """ - Initialize and run the AutoRound algorithm on the current state + Initialize the model state for quantization and calibration. :param state: session state storing input model and calibration data """ From 9fa5efb7cb0c7a98ed3bfefc69bf04ccb6e8ffb3 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 5 Nov 2025 23:39:05 -0800 Subject: [PATCH 40/57] enable ci Signed-off-by: yiliu30 --- .github/workflows/test-check-transformers.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/test-check-transformers.yaml b/.github/workflows/test-check-transformers.yaml index 4ffde0b5e2..8bc7f97f7a 100644 --- a/.github/workflows/test-check-transformers.yaml +++ b/.github/workflows/test-check-transformers.yaml @@ -97,6 +97,10 @@ jobs: if: (success() || failure()) && steps.install.outcome == 'success' run: | pytest -v tests/llmcompressor/transformers/gptq + - name: Running AutoRound Tests + if: (success() || failure()) && steps.install.outcome == 'success' + run: | + pytest -v tests/llmcompressor/transformers/autoround - name: Running ONESHOT Tests if: (success() || failure()) && steps.install.outcome == 'success' run: | From 7937d807fb294983025ce8481a0d5e6d7d5b9666 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 5 Nov 2025 23:58:51 -0800 Subject: [PATCH 41/57] revert import AutoRoundModifier into modifiers directly Signed-off-by: yiliu30 --- examples/autoround/llama3_example.py | 2 +- src/llmcompressor/modifiers/__init__.py | 2 -- tests/llmcompressor/transformers/autoround/test_oneshot.py | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/examples/autoround/llama3_example.py b/examples/autoround/llama3_example.py index e968066510..9843073bdc 100644 --- a/examples/autoround/llama3_example.py +++ b/examples/autoround/llama3_example.py @@ -2,7 +2,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot -from llmcompressor.modifiers import AutoRoundModifier +from llmcompressor.modifiers.autoround import AutoRoundModifier from llmcompressor.utils import dispatch_for_generation # Select model and load it. diff --git a/src/llmcompressor/modifiers/__init__.py b/src/llmcompressor/modifiers/__init__.py index ec30c4174a..65cd78b983 100644 --- a/src/llmcompressor/modifiers/__init__.py +++ b/src/llmcompressor/modifiers/__init__.py @@ -7,7 +7,6 @@ extensible compression workflows.
""" -from .autoround import AutoRoundModifier from .factory import ModifierFactory from .interface import ModifierInterface from .modifier import Modifier @@ -16,5 +15,4 @@ "ModifierFactory", "ModifierInterface", "Modifier", - "AutoRoundModifier", ] diff --git a/tests/llmcompressor/transformers/autoround/test_oneshot.py b/tests/llmcompressor/transformers/autoround/test_oneshot.py index d973398d90..77f6c91707 100644 --- a/tests/llmcompressor/transformers/autoround/test_oneshot.py +++ b/tests/llmcompressor/transformers/autoround/test_oneshot.py @@ -5,7 +5,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot -from llmcompressor.modifiers import AutoRoundModifier +from llmcompressor.modifiers.autoround import AutoRoundModifier recipe_str = """ quant_stage: From e58b2bd03441934ab0b595a17f99b4d659f63ab7 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 6 Nov 2025 00:05:47 -0800 Subject: [PATCH 42/57] update Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/autoround/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index 4128627d04..04da438a05 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -274,6 +274,7 @@ def apply_autoround(self, state): iters=self.iters, enable_quanted_input=False, enable_torch_compile=self.enable_torch_compile, + batch_dim=0, ) # TODO: configure layer-wise config based on self.resolved_config ar.configure_layer_config() From 4c94187da0cb1afc505e919ee42e8693416212bb Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 6 Nov 2025 19:38:02 -0800 Subject: [PATCH 43/57] clean Signed-off-by: yiliu30 --- .../pipelines/layer_sequential/pipeline.py | 130 ------------------ 1 file changed, 130 deletions(-) delete mode 100644 src/llmcompressor/pipelines/layer_sequential/pipeline.py diff --git a/src/llmcompressor/pipelines/layer_sequential/pipeline.py b/src/llmcompressor/pipelines/layer_sequential/pipeline.py deleted file mode 100644 index b8fbe32a3f..0000000000 --- a/src/llmcompressor/pipelines/layer_sequential/pipeline.py +++ /dev/null @@ -1,130 +0,0 @@ -import contextlib -from typing import TYPE_CHECKING - -import torch -import tqdm -from compressed_tensors.utils import disable_offloading, get_execution_device -from torch.utils.data.dataloader import DataLoader - -from llmcompressor.core import LifecycleCallbacks, active_session -from llmcompressor.modifiers.utils.hooks import HooksMixin -from llmcompressor.pipelines.cache import IntermediatesCache -from llmcompressor.pipelines.layer_sequential.helpers import ( - capture_first_layer_intermediates, - match_modules, - maybe_inject_pos_embeddings, - to_next_layer_kwargs, -) -from llmcompressor.pipelines.registry import CalibrationPipeline -from llmcompressor.pipelines.sequential.helpers import ( - dispatch_for_sequential, - get_sequential_targets, -) -from llmcompressor.utils.helpers import ( - DISABLE_QAC_MODIFIERS, - DisableQuantization, - calibration_forward_context, -) - -if TYPE_CHECKING: - from llmcompressor.args.dataset_arguments import DatasetArguments - - -__all__ = ["LayerSequentialPipeline"] - - -@CalibrationPipeline.register("layer_sequential") -class LayerSequentialPipeline(CalibrationPipeline): - @staticmethod - def __call__( - model: torch.nn.Module, dataloader: DataLoader, dataset_args: "DatasetArguments" - ): - """ - Run a layer-wise sequential data pipeline according to the following steps: - - 1. 
Layers are identified according to `sequential_targets` - 2. A hook is attached to the first layer. This hook raises an exception which is - then caught and used to capture the input arguments to the first layer - 3. The inputs to the first layer are used to calibrate the first layer, and the - output of the previous layer is used as inputs to calibrate the next layer - - This pipeline requires that the model have distinct layers defined in its - architecture and that the outputs of the previous layer are exactly the inputs - to the next layer. This is violated by encoder-decoder architectures, among - others. - - If your model architecture violates these assumptions, consider using the - sequential pipeline (see llmcompressor.pipelines.sequential). Architectures - which are known to fail these assumptions include GPT-J and most vision models - - :param model: model being calibrated - :param dataloader: loads data for calibration - :param dataset_args: dataset arguments relevant to pipelines - """ - session = active_session() - - # prepare model for sequential onloading - dispatch_for_sequential(model) - model_device = get_execution_device(model) - - # find layers - modifiers = session.lifecycle.recipe.modifiers - sequential_targets = get_sequential_targets(modifiers, model, dataset_args) - layers = match_modules(model, sequential_targets) - - LifecycleCallbacks.calibration_epoch_start() - - # TODO: remove this to enable quantization aware calibration - # for GPTQ, AWQ and AutoRound. - disable_qac = any( - type(mod).__name__ in DISABLE_QAC_MODIFIERS - for mod in session.lifecycle.recipe.modifiers - ) - - with contextlib.ExitStack() as stack: - stack.enter_context(calibration_forward_context(model)) - if not dataset_args.quantization_aware_calibration or disable_qac: - stack.enter_context(DisableQuantization(model)) - - # prepare intermediates cache - intermediates: IntermediatesCache = capture_first_layer_intermediates( - model, layers[0], dataloader, model_device - ) - - num_layers = len(layers) - for layer_index, layer in enumerate(layers): - # prepare tqdm description texts - calib_desc = f"({layer_index + 1}/{num_layers}): Calibrating" - prop_desc = f"({layer_index + 1}/{num_layers}): Propagating" - - # reduce memory movement by keeping modules onloaded - with disable_offloading(): - # do a preliminary pass to trigger modifier hooks - for batch_idx in tqdm.tqdm(range(len(dataloader)), desc=calib_desc): - inputs = intermediates.fetch(batch_idx) - layer(**inputs) - - LifecycleCallbacks.sequential_epoch_end() - - # this pass does not trigger modifier hooks - # and is only used for capturing outputs from - # newly compressed modules - with HooksMixin.disable_hooks(): - for batch_idx in tqdm.tqdm( - range(len(dataloader)), desc=prop_desc - ): - inputs = intermediates.fetch(batch_idx) - output = layer(**inputs) - - if layer_index < num_layers - 1: - next_layer = layers[layer_index + 1] - output = to_next_layer_kwargs(output, next_layer) - output = maybe_inject_pos_embeddings( - output, next_layer, inputs - ) - - intermediates.delete(batch_idx) - intermediates.update(batch_idx, output) - - # redundant, finish any remaining compression - LifecycleCallbacks.calibration_epoch_end() From 7ea844283a1c8a1e057c5ba1a00eabb63e253884 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 6 Nov 2025 23:19:21 -0800 Subject: [PATCH 44/57] fix Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/autoround/base.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/llmcompressor/modifiers/autoround/base.py 
b/src/llmcompressor/modifiers/autoround/base.py index 04da438a05..8e45a520b2 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -114,9 +114,6 @@ class AutoRoundModifier(Modifier, QuantizationMixin): _cur_layer_idx = PrivateAttr(default=0) _all_module_input: Dict[str, List[Tuple]] = PrivateAttr(default_factory=dict) - def resolve_quantization_config(self) -> QuantizationConfig: - config = super().resolve_quantization_config() - return config def _add_temporary_names(self, model: torch.nn.Module): for name, mod in model.named_modules(): From f52c0c0cb1d23f1f950d0a7f2efcff4989aa32b3 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Fri, 7 Nov 2025 00:34:11 -0800 Subject: [PATCH 45/57] refactor Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/autoround/base.py | 150 ++++++++++-------- 1 file changed, 82 insertions(+), 68 deletions(-) diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index 8e45a520b2..39d991eb78 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -1,8 +1,7 @@ -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union import torch from compressed_tensors.quantization import ( - QuantizationConfig, QuantizationScheme, QuantizationStrategy, enable_quantization, @@ -12,6 +11,7 @@ match_named_modules, update_offload_parameter, ) +from llmcompressor.utils.pytorch.module import get_no_split_params from loguru import logger from pydantic import PrivateAttr @@ -26,10 +26,6 @@ __all__ = ["AutoRoundModifier"] -def _is_decoding_layer(module, name): - return "decoderlayer" in module.__class__.__name__.lower() - - class _LLModelWrapper(torch.nn.Module): def __init__(self): super().__init__() @@ -104,7 +100,7 @@ class AutoRoundModifier(Modifier, QuantizationMixin): dictionary that supports all keys from QuantizationScheme except targets, which will be set to the targets parameter set at the modifier level. """ - + sequential_targets: Union[str, List[str], None] = None # AutoRound modifier arguments iters: Optional[int] = 200 enable_torch_compile: Optional[bool] = True @@ -114,16 +110,6 @@ class AutoRoundModifier(Modifier, QuantizationMixin): _cur_layer_idx = PrivateAttr(default=0) _all_module_input: Dict[str, List[Tuple]] = PrivateAttr(default_factory=dict) - - def _add_temporary_names(self, model: torch.nn.Module): - for name, mod in model.named_modules(): - mod._tmp_name = name - - def _remove_temporary_names(self, model: torch.nn.Module): - for _, mod in model.named_modules(): - if hasattr(mod, "_tmp_name"): - del mod._tmp_name - def on_initialize(self, state: State, **kwargs) -> bool: """ Initialize the model state for quantization and calibration. 
@@ -141,8 +127,10 @@ def on_initialize(self, state: State, **kwargs) -> bool: } self._add_temporary_names(state.model) # freeze all model parameters - for name, param in state.model.named_parameters(): + for _, param in state.model.named_parameters(): param.requires_grad_(False) + + self.sequential_targets = self._infer_sequential_targets(state.model) return True def start_calibration(self, model: torch.nn.Module): @@ -171,8 +159,8 @@ def on_start(self, state: State, event: Event, **kwargs): # register quantization calibration hooks # assume quantization has been initialized by this modifier or one before it self.start_calibration(state.model) - for name, module in state.model.named_modules(): - if _is_decoding_layer(module, name): + for _, module in state.model.named_modules(): + if self._is_decoding_layer(module): # register input capture hook for decoding layers self.register_hook( module, self.input_capture_hook, "forward_pre", with_kwargs=True @@ -184,52 +172,15 @@ def on_event(self, state: State, event: Event, **kwargs): self.on_start(state, None) if event.type_ == EventType.SEQUENTIAL_EPOCH_END: - self.apply_autoround(state) + subgraph = kwargs.pop("subgraph", None) + self.apply_autoround(state, subgraph) self.post_autoround_cleanup() if event.type_ == EventType.CALIBRATION_EPOCH_END: if not self.ended_: self.on_end(state, None) - def _mapping_config_to_autoround(self): - from auto_round.schemes import QuantizationScheme as ARQuantizationScheme - - resolved_config = self.resolved_config - quant_scheme = None - # TODO: release below constraint in later PRs - assert len(resolved_config.config_groups) == 1, ( - "AutoRoundModifier only supports one quantization scheme for now, " - f"got {len(resolved_config.config_groups)}" - ) - - for scheme in resolved_config.config_groups.values(): - assert isinstance( - scheme, QuantizationScheme - ), f"Expected QuantizationScheme, got {type(scheme)}" - quant_scheme = scheme - weight_args = quant_scheme.weights - assert weight_args.strategy == QuantizationStrategy.GROUP, ( - "Only group-wise quantization is supported in AutoRoundModifier for now, " - f"got {weight_args.strategy}" - ) - assert quant_scheme.input_activations is None, ( - "Input activation quantization is not supported in AutoRoundModifier, " - f"got {quant_scheme.input_activations}" - ) - assert quant_scheme.output_activations is None, ( - "Output activation quantization is not supported in AutoRoundModifier, " - f"got {quant_scheme.output_activations}" - ) - ar_quant_scheme = ARQuantizationScheme( - bits=weight_args.num_bits, - sym=weight_args.symmetric, - group_size=weight_args.group_size, - data_type=weight_args.type, - act_bits=16, - ) - return ar_quant_scheme - - def apply_autoround(self, state): + def apply_autoround(self, state, subgraph): """ Applies AutoRound quantization tuning on the current decoding layer. 
@@ -250,13 +201,18 @@ def apply_autoround(self, state): For more details, please refer to the AutoRound repository: https://github.com/intel/auto-round/ """ - cur_layer_idx = self._cur_layer_idx - logger.info("Applying AutoRound to layer index: {}", cur_layer_idx) - self._cur_layer_idx += 1 - if cur_layer_idx >= len(state.model.model.layers): - # skip the lm_head layer + modules = list(subgraph.submodules(model=state.model)) + + decoding_layers = [m for m in modules if self._is_decoding_layer(m)] + if len(decoding_layers) == 0: return - decoding_layer = state.model.model.layers[cur_layer_idx] + assert len(decoding_layers) == 1, ( + "Only one decoding layer is expected in the subgraph, " + f"found {len(decoding_layers)}." + ) + decoding_layer = decoding_layers[0] + + logger.info("Applying AutoRound on layer {}", decoding_layer._tmp_name) wrapped_model = _wrap_decoding_layer(decoding_layer) @@ -277,8 +233,7 @@ def apply_autoround(self, state): ar.configure_layer_config() first_param = next(decoding_layer.parameters()) device = first_param.device - input_name = f"model.layers.{cur_layer_idx}" - cur_inputs = self._all_module_input[input_name] + cur_inputs = self._all_module_input[decoding_layer._tmp_name] decoding_layer.tuning_device = device ar.quantize_block( @@ -323,3 +278,62 @@ def on_finalize(self, state: State, **kwargs) -> bool: self.on_end(state, None) return True + + def _add_temporary_names(self, model: torch.nn.Module): + for name, mod in model.named_modules(): + mod._tmp_name = name + + def _remove_temporary_names(self, model: torch.nn.Module): + for _, mod in model.named_modules(): + if hasattr(mod, "_tmp_name"): + del mod._tmp_name + + def _is_decoding_layer(self, module: torch.nn.Module) -> bool: + return module.__class__.__name__ in self.sequential_targets + + def _infer_sequential_targets(self, model: torch.nn.Module) -> str | list[str]: + match self.sequential_targets: + case None: + return get_no_split_params(model) + case str(): + return [self.sequential_targets] + case _: + return self.sequential_targets + + def _mapping_config_to_autoround(self): + from auto_round.schemes import QuantizationScheme as ARQuantizationScheme + + resolved_config = self.resolved_config + quant_scheme = None + # TODO: release below constraint in later PRs + assert len(resolved_config.config_groups) == 1, ( + "AutoRoundModifier only supports one quantization scheme for now, " + f"got {len(resolved_config.config_groups)}" + ) + + for scheme in resolved_config.config_groups.values(): + assert isinstance( + scheme, QuantizationScheme + ), f"Expected QuantizationScheme, got {type(scheme)}" + quant_scheme = scheme + weight_args = quant_scheme.weights + assert weight_args.strategy == QuantizationStrategy.GROUP, ( + "Only group-wise quantization is supported in AutoRoundModifier for now, " + f"got {weight_args.strategy}" + ) + assert quant_scheme.input_activations is None, ( + "Input activation quantization is not supported in AutoRoundModifier, " + f"got {quant_scheme.input_activations}" + ) + assert quant_scheme.output_activations is None, ( + "Output activation quantization is not supported in AutoRoundModifier, " + f"got {quant_scheme.output_activations}" + ) + ar_quant_scheme = ARQuantizationScheme( + bits=weight_args.num_bits, + sym=weight_args.symmetric, + group_size=weight_args.group_size, + data_type=weight_args.type, + act_bits=16, + ) + return ar_quant_scheme \ No newline at end of file From 4a9c4aa53777fbd0cd50cb898cb31ab6a09c02df Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Fri, 7 Nov 2025 
00:36:53 -0800 Subject: [PATCH 46/57] format Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/autoround/base.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index 39d991eb78..f2759d9de9 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -11,7 +11,6 @@ match_named_modules, update_offload_parameter, ) -from llmcompressor.utils.pytorch.module import get_no_split_params from loguru import logger from pydantic import PrivateAttr @@ -22,6 +21,7 @@ from llmcompressor.transformers.compression.compressed_tensors_utils import ( untie_if_target_shared_embedding, ) +from llmcompressor.utils.pytorch.module import get_no_split_params __all__ = ["AutoRoundModifier"] @@ -100,6 +100,7 @@ class AutoRoundModifier(Modifier, QuantizationMixin): dictionary that supports all keys from QuantizationScheme except targets, which will be set to the targets parameter set at the modifier level. """ + sequential_targets: Union[str, List[str], None] = None # AutoRound modifier arguments iters: Optional[int] = 200 @@ -129,7 +130,7 @@ def on_initialize(self, state: State, **kwargs) -> bool: # freeze all model parameters for _, param in state.model.named_parameters(): param.requires_grad_(False) - + self.sequential_targets = self._infer_sequential_targets(state.model) return True @@ -219,11 +220,11 @@ def apply_autoround(self, state, subgraph): with torch.enable_grad(), align_module_device(decoding_layer): import auto_round - parsed_scheme = self._mapping_config_to_autoround() + ar_quant_scheme = self._mapping_config_to_autoround() ar = auto_round.AutoRound( model=wrapped_model, tokenizer="", - scheme=parsed_scheme, + scheme=ar_quant_scheme, iters=self.iters, enable_quanted_input=False, enable_torch_compile=self.enable_torch_compile, @@ -336,4 +337,4 @@ def _mapping_config_to_autoround(self): data_type=weight_args.type, act_bits=16, ) - return ar_quant_scheme \ No newline at end of file + return ar_quant_scheme From 0567df6034327eea2838c5a7b2ef7db3033c255a Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 7 Nov 2025 16:39:47 +0800 Subject: [PATCH 47/57] Update src/llmcompressor/modifiers/autoround/base.py Co-authored-by: Brian Dellabetta Signed-off-by: Yi Liu --- src/llmcompressor/modifiers/autoround/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index f2759d9de9..510f5116fd 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -104,7 +104,7 @@ class AutoRoundModifier(Modifier, QuantizationMixin): sequential_targets: Union[str, List[str], None] = None # AutoRound modifier arguments iters: Optional[int] = 200 - enable_torch_compile: Optional[bool] = True + enable_torch_compile: bool = True # private variables _module_names: Dict[torch.nn.Module, str] = PrivateAttr(default_factory=dict) From 650a19ca0a7e9e8d74806f5907540d601d119fb8 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Fri, 7 Nov 2025 00:58:33 -0800 Subject: [PATCH 48/57] refine docs Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/autoround/base.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index f2759d9de9..b6859a7049 100644 --- 
a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -192,12 +192,7 @@ def apply_autoround(self, state, subgraph): loss.backward() optimizer.step() if loss < best_loss: - best_params = save_params(layer) - - This method retrieves the current decoding layer, wraps it for - compatibility with AutoRound, and performs iterative optimization - to minimize the quantization error. The best parameters are tracked - and applied to the layer after tuning. + best_params = update_params(layer) For more details, please refer to the AutoRound repository: https://github.com/intel/auto-round/ From 5cd35a6a9023a92f7aa47a4af537dc0811180345 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Fri, 7 Nov 2025 19:55:27 -0800 Subject: [PATCH 49/57] fix import Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/autoround/base.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index 9ac1a4c0c5..be690e6344 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -1,6 +1,8 @@ from typing import Dict, List, Optional, Tuple, Union import torch +from auto_round import AutoRound +from auto_round.schemes import QuantizationScheme as ARQuantizationScheme from compressed_tensors.quantization import ( QuantizationScheme, QuantizationStrategy, @@ -213,10 +215,8 @@ def apply_autoround(self, state, subgraph): wrapped_model = _wrap_decoding_layer(decoding_layer) with torch.enable_grad(), align_module_device(decoding_layer): - import auto_round - ar_quant_scheme = self._mapping_config_to_autoround() - ar = auto_round.AutoRound( + ar = AutoRound( model=wrapped_model, tokenizer="", scheme=ar_quant_scheme, @@ -297,8 +297,6 @@ def _infer_sequential_targets(self, model: torch.nn.Module) -> str | list[str]: return self.sequential_targets def _mapping_config_to_autoround(self): - from auto_round.schemes import QuantizationScheme as ARQuantizationScheme - resolved_config = self.resolved_config quant_scheme = None # TODO: release below constraint in later PRs From 678b123363d66a0b7308d642bdf1768801b6db0b Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Sat, 8 Nov 2025 12:08:01 +0800 Subject: [PATCH 50/57] Update src/llmcompressor/modifiers/autoround/base.py Co-authored-by: Brian Dellabetta Signed-off-by: Yi Liu --- src/llmcompressor/modifiers/autoround/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index be690e6344..b235ede13d 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -105,7 +105,7 @@ class AutoRoundModifier(Modifier, QuantizationMixin): sequential_targets: Union[str, List[str], None] = None # AutoRound modifier arguments - iters: Optional[int] = 200 + iters: int = 200 enable_torch_compile: bool = True # private variables From a8c63d388b91d3002898b2be733d3acaa6f3c0ab Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 9 Nov 2025 18:13:09 -0800 Subject: [PATCH 51/57] add qinput Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/autoround/base.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index be690e6344..becdd485fb 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ 
b/src/llmcompressor/modifiers/autoround/base.py @@ -112,6 +112,7 @@ class AutoRoundModifier(Modifier, QuantizationMixin): _module_names: Dict[torch.nn.Module, str] = PrivateAttr(default_factory=dict) _cur_layer_idx = PrivateAttr(default=0) _all_module_input: Dict[str, List[Tuple]] = PrivateAttr(default_factory=dict) + _q_input: Optional[torch.Tensor] = PrivateAttr(default=None) def on_initialize(self, state: State, **kwargs) -> bool: """ @@ -221,7 +222,6 @@ def apply_autoround(self, state, subgraph): tokenizer="", scheme=ar_quant_scheme, iters=self.iters, - enable_quanted_input=False, enable_torch_compile=self.enable_torch_compile, batch_dim=0, ) @@ -232,14 +232,16 @@ def apply_autoround(self, state, subgraph): cur_inputs = self._all_module_input[decoding_layer._tmp_name] decoding_layer.tuning_device = device - ar.quantize_block( + q_input, _ = ar.quantize_block( block=decoding_layer, inputs=cur_inputs, + q_input=self._q_input, normalize_inputs=True, device=device, # Leave offload for LLMC auto_offload=False, ) + self._q_input = q_input # Update offload parameters and remove temporary attributes for _, module in decoding_layer.named_modules(): if hasattr(module, "weight_scale") and hasattr( From fbc047aaa9b7c2d4f07fc53391dea5c5b14d45c4 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 9 Nov 2025 18:14:42 -0800 Subject: [PATCH 52/57] clean cache Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/autoround/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index 6b6868a9e6..e025583864 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -265,6 +265,7 @@ def on_end(self, state: State, event: Event, **kwargs): QuantizationMixin.end_calibration(self, state.model) self._remove_temporary_names(state.model) self.remove_hooks() + self._q_input = None def on_finalize(self, state: State, **kwargs) -> bool: """ From 96b6490ca798500829e4bd4ae3ca60e5986ff602 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 9 Nov 2025 18:41:17 -0800 Subject: [PATCH 53/57] align api Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/autoround/base.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index e025583864..4b72e99b22 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -110,7 +110,6 @@ class AutoRoundModifier(Modifier, QuantizationMixin): # private variables _module_names: Dict[torch.nn.Module, str] = PrivateAttr(default_factory=dict) - _cur_layer_idx = PrivateAttr(default=0) _all_module_input: Dict[str, List[Tuple]] = PrivateAttr(default_factory=dict) _q_input: Optional[torch.Tensor] = PrivateAttr(default=None) @@ -236,7 +235,6 @@ def apply_autoround(self, state, subgraph): block=decoding_layer, inputs=cur_inputs, q_input=self._q_input, - normalize_inputs=True, device=device, # Leave offload for LLMC auto_offload=False, From d00d41b3d028c622be88c0b5e59ff4e608142035 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 9 Nov 2025 19:50:14 -0800 Subject: [PATCH 54/57] fix Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/autoround/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index 4b72e99b22..338b404832 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ 
b/src/llmcompressor/modifiers/autoround/base.py @@ -213,6 +213,7 @@ def apply_autoround(self, state, subgraph): logger.info("Applying AutoRound on layer {}", decoding_layer._tmp_name) wrapped_model = _wrap_decoding_layer(decoding_layer) + wrapped_model.name_or_path = state.model.name_or_path with torch.enable_grad(), align_module_device(decoding_layer): ar_quant_scheme = self._mapping_config_to_autoround() From d4a8fb00f209e02aab317c50024c299aeb03ee96 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 9 Nov 2025 21:00:59 -0800 Subject: [PATCH 55/57] fix Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/autoround/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index 338b404832..02801369a3 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -223,10 +223,10 @@ def apply_autoround(self, state, subgraph): scheme=ar_quant_scheme, iters=self.iters, enable_torch_compile=self.enable_torch_compile, - batch_dim=0, ) # TODO: configure layer-wise config based on self.resolved_config ar.configure_layer_config() + ar.batch_dim = 0 first_param = next(decoding_layer.parameters()) device = first_param.device cur_inputs = self._all_module_input[decoding_layer._tmp_name] From 487fcd2d624e98a137fb0d549f4dfcfab35b9463 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 9 Nov 2025 21:02:32 -0800 Subject: [PATCH 56/57] update Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/autoround/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index 02801369a3..2480751a9b 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -225,7 +225,7 @@ def apply_autoround(self, state, subgraph): enable_torch_compile=self.enable_torch_compile, ) # TODO: configure layer-wise config based on self.resolved_config - ar.configure_layer_config() + ar.configure_layer_config(enable_gguf_official_mixed=False) ar.batch_dim = 0 first_param = next(decoding_layer.parameters()) device = first_param.device From 3adc879708c7167b69a2aa4562e8b1afbd9d4c3f Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 11 Nov 2025 16:25:02 -0800 Subject: [PATCH 57/57] add requires_gpu for ut Signed-off-by: yiliu30 --- tests/llmcompressor/transformers/autoround/test_oneshot.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/llmcompressor/transformers/autoround/test_oneshot.py b/tests/llmcompressor/transformers/autoround/test_oneshot.py index 77f6c91707..ce167864e9 100644 --- a/tests/llmcompressor/transformers/autoround/test_oneshot.py +++ b/tests/llmcompressor/transformers/autoround/test_oneshot.py @@ -6,6 +6,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.autoround import AutoRoundModifier +from tests.testing_utils import requires_gpu recipe_str = """ quant_stage: @@ -39,6 +40,7 @@ ) +@requires_gpu(1) @pytest.mark.parametrize( "recipe", [