From 80c92da062d7596bd3fceba77d674d97b96fe450 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 2 Nov 2025 20:49:14 -0800 Subject: [PATCH 01/57] add auto-round Signed-off-by: yiliu30 --- examples/quantization_w4a16/llama3_example.py | 2 ++ src/llmcompressor/modifiers/quantization/__init__.py | 1 + src/llmcompressor/modifiers/quantization/gptq/base.py | 2 +- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/quantization_w4a16/llama3_example.py b/examples/quantization_w4a16/llama3_example.py index b03aacee35..038c0ebc9f 100644 --- a/examples/quantization_w4a16/llama3_example.py +++ b/examples/quantization_w4a16/llama3_example.py @@ -7,6 +7,8 @@ # Select model and load it. model_id = "meta-llama/Meta-Llama-3-8B-Instruct" +model_dir="/storage/yiliu7" +model_id=f"{model_dir}/meta-llama/Meta-Llama-3.1-8B-Instruct" model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(model_id) diff --git a/src/llmcompressor/modifiers/quantization/__init__.py b/src/llmcompressor/modifiers/quantization/__init__.py index f6ad149fbb..106128f046 100644 --- a/src/llmcompressor/modifiers/quantization/__init__.py +++ b/src/llmcompressor/modifiers/quantization/__init__.py @@ -3,3 +3,4 @@ from .cache import * from .gptq import * from .quantization import * +from .autoround import * \ No newline at end of file diff --git a/src/llmcompressor/modifiers/quantization/gptq/base.py b/src/llmcompressor/modifiers/quantization/gptq/base.py index 09f3e681c4..126a1f6556 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/base.py +++ b/src/llmcompressor/modifiers/quantization/gptq/base.py @@ -262,7 +262,7 @@ def compress_modules(self): percdamp=self.dampening_frac, ) comp_logger.set_loss(loss) - + breakpoint() update_offload_parameter(module, "weight", quantized_weight) update_offload_parameter(module, "weight_scale", scale) update_offload_parameter(module, "weight_zero_point", zero_point) From 3266b79b3a038cc128fd7e506f6a5f1f78e23331 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 2 Nov 2025 21:50:33 -0800 Subject: [PATCH 02/57] add auto-round modifier Signed-off-by: yiliu30 --- .../modifiers/quantization/autoround/base.py | 415 ++++++++++++++++++ 1 file changed, 415 insertions(+) create mode 100644 src/llmcompressor/modifiers/quantization/autoround/base.py diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py new file mode 100644 index 0000000000..7ad9bf3e8d --- /dev/null +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -0,0 +1,415 @@ +import contextlib +from typing import Dict, List, Optional, Tuple, Union + +import torch +from compressed_tensors.quantization import ( + QuantizationConfig, + QuantizationScheme, + QuantizationStrategy, +) +from compressed_tensors.quantization.quant_args import ActivationOrdering +from compressed_tensors.utils import ( + align_module_device, + get_execution_device, + getattr_chain, + match_named_modules, + update_offload_parameter, +) +from loguru import logger +from pydantic import PrivateAttr + +from llmcompressor.core import Event, EventType, State +from llmcompressor.modifiers import Modifier +from llmcompressor.modifiers.quantization.gptq.gptq_quantize import ( + accumulate_hessian, + make_empty_hessian, + quantize_weight, +) +from llmcompressor.modifiers.quantization.quantization import QuantizationMixin +from llmcompressor.sentinel import Sentinel +from llmcompressor.utils.metric_logging import 
CompressionLogger + +__all__ = ["AutoRoundModifier"] + + +from collections import defaultdict +import os + +FALLBACK_CHANGE = os.environ.get("FALLBACK_CHANGE", "0").lower() in ("1", "true", "yes") +_DEBUG = os.environ.get("DEBUG", "0").lower() in ("1", "true", "yes") + +all_module_input = defaultdict(list) +all_module_output = defaultdict(list) + + +def input_capture_hook(module, *args, **kwargs): + all_module_input[module._tmp_name].append((args, kwargs)) + + +def output_capture_hook(module, *args, **kwargs): + all_module_output[module._tmp_name].append((args, kwargs)) + + +def normalize_input(cur_inputs): + # TODO: move it to auto-round + input_ids = [] + input_others = {} + positional_inputs = [] + attention_mask = None + position_ids = None + cache_position = None + position_embeddings = (None, None) + for cur_inp in cur_inputs: + input_ids.append(cur_inp[0][0][0]) + for key, val in cur_inp[0][1].items(): + if key == "position_ids": + position_ids = val + elif key == "position_embeddings": + position_embeddings = val + elif key == "cache_position": + cache_position = val + input_others["position_ids"] = position_ids + input_others["positional_inputs"] = positional_inputs + input_others["attention_mask"] = attention_mask + input_others["position_embeddings"] = position_embeddings + input_others["cache_position"] = cache_position + return input_ids, input_others + + +def _is_decoding_layer(module, name): + return "decoderlayer" in module.__class__.__name__.lower() + + +class _LLModelWrapper(torch.nn.Module): + def __init__(self): + super().__init__() + self.layers = torch.nn.ModuleList() + + def forward(self, *args, **kwargs): + for layer in self.layers: + res = layer(*args, **kwargs) + return res + + +class _PretrainModelWrapper(torch.nn.Module): + def __init__(self): + super().__init__() + self.model = _LLModelWrapper() + + def forward(self, *args, **kwargs): + return self.model(*args, **kwargs) + + +def _wrap_decoding_layer(layer: torch.nn.Module) -> _PretrainModelWrapper: + wrapped_model = _PretrainModelWrapper() + wrapped_model.model.layers.append(layer) + first_param = next(layer.parameters()) + wrapped_model.dtype = first_param.dtype + return wrapped_model + + + +class AutoRoundModifier(Modifier, QuantizationMixin): + """ + Implements the GPTQ algorithm from https://arxiv.org/abs/2210.17323. This modifier + uses activations to calibrate a hessian matrix, which is then used to determine + optimal quantizion values and orderings for the model weights. + + | Sample yaml: + | test_stage: + | obcq_modifiers: + | AutoRoundModifier: + | block_size: 128 + | dampening_frac: 0.001 + | offload_hessians: False + | actorder: static + | config_groups: + | group_0: + | targets: + | - "Linear" + | input_activations: null + | output_activations: null + | weights: + | num_bits: 8 + | type: "int" + | symmetric: true + | strategy: group + | group_size: 128 + + Lifecycle: + - on_initialize + - apply config to model + - on_start + - add activation calibration hooks + - add gptq weight calibration hooks + - on_sequential_epoch_end + - quantize_weight + - on_finalize + - remove_hooks() + - model.apply(freeze_module_quantization) + + :param sequential_targets: list of layer names to compress during GPTQ, or + '__ALL__' to compress every layer in the model + :param block_size: Used to determine number of columns to compress in one pass + :param dampening_frac: Amount of dampening to apply to H, as a fraction of the + diagonal norm + :param actorder: order in which weight columns are quantized. 
Defaults to "static" + activation ordering, which achieves best accuracy recovery with no runtime cost. + For more information, see https://github.com/vllm-project/vllm/pull/8135 + :param offload_hessians: Set to True for decreased memory usage but increased + runtime. + + :param config_groups: dictionary specifying quantization schemes to apply to target + modules. Modules not matching a scheme target will NOT be quantized. + :param targets: list of layer names to quantize if a scheme is provided. Defaults + to Linear layers + :param ignore: optional list of module class names or submodule names to not + quantize even if they match a target in config_groups. Defaults to empty list. + :param scheme: a single quantization scheme to apply to the model. This is a + dictionary that supports all keys from QuantizationScheme except targets, which + will be set to the targets parameter set at the modifier level. Can also be set + to a dictionary of the format `preset_scheme_name: targets` for example: + `W8A8: ['Linear']` for weight and activation 8-bit. + :param kv_cache_scheme: optional QuantizationArgs, that specify the + quantization of the kv cache. If None, kv cache is not quantized. + When applying kv cache quantization to transformer AutoModelForCausalLM, + the kv_cache_scheme gets converted into a QuantizationScheme that: + - targets the `q_proj` and `k_proj` modules of the model. The outputs + of those modules are the keys and values that might be cached + - quantizes the outputs of the aformentioned layers, so that + keys and values are compressed before storing them in the cache + There is an explicit assumption that the model contains modules with + `k_proj` and `v_proj` in their names. If this is not the case + and kv_cache_scheme != None, the quantization of kv cache will fail + """ + + # gptq modifier arguments + sequential_targets: Union[str, List[str], None] = None + block_size: int = 128 + dampening_frac: Optional[float] = 0.01 + # TODO: this does not serialize / will be incorrectly written + actorder: Optional[Union[ActivationOrdering, Sentinel]] = Sentinel("static") + offload_hessians: bool = False + + # private variables + _module_names: Dict[torch.nn.Module, str] = PrivateAttr(default_factory=dict) + _hessians: Dict[torch.nn.Module, torch.Tensor] = PrivateAttr(default_factory=dict) + _num_samples: Dict[torch.nn.Module, int] = PrivateAttr(default_factory=dict) + + _cur_layer_idx = PrivateAttr(default=0) + + + def resolve_quantization_config(self) -> QuantizationConfig: + config = super().resolve_quantization_config() + + def resolve_actorder(existing): + # sentinel default only overrides if existing is None + if self.actorder == Sentinel("static"): + return ActivationOrdering.STATIC if existing is None else existing + + # user-provided value always attempts to override + if existing is None or self.actorder == existing: + return self.actorder + + # if existing provided and conflicts + raise ValueError( + "Cannot resolve activation ordering when both " + "`AutoRoundModifier.actorder` and `QuantizationScheme.actorder` " + f"are provided and differ ({self.actorder}, {existing}). " + "Either unset `AutoRoundModifier.actorder` or " + "remove `actorder` from config groups." 
+ ) + + for scheme in config.config_groups.values(): + assert isinstance(scheme, QuantizationScheme) + if ( + getattr_chain(scheme, "weights.strategy", None) + == QuantizationStrategy.GROUP + ): + scheme.weights.actorder = resolve_actorder(scheme.weights.actorder) + return config + + def on_initialize(self, state: State, **kwargs) -> bool: + """ + Initialize and run the GPTQ algorithm on the current state + + :param state: session state storing input model and calibration data + """ + # apply config to model and prepare calibration hooks + if QuantizationMixin.has_config(self): + QuantizationMixin.initialize_quantization(self, state.model) + + # prepare module names + self._module_names = { + m: name + for name, m in match_named_modules( + state.model, self.targets, self.ignore + ) + } + # add tmp name for each module for debugging + for name, mod in state.model.named_modules(): + mod._tmp_name = name + # freeze all model parameters + for name, param in state.model.named_parameters(): + param.requires_grad_(False) + + return True + + + def start_calibration(self, model: torch.nn.Module): + """ + Register activation calibration hooks (including kv_cache quantization) and enable quantization as we calibrate + + :param model: model to prepare for calibration + """ + + from compressed_tensors.quantization import enable_quantization + from llmcompressor.modifiers.quantization.calibration import apply_calibration_status + for _, module in match_named_modules(model, self.targets, self.ignore): + # Note: No need to register observers for auto-round + # self._initialize_observers(module) + self._calibration_hooks |= self._initialize_hooks(module) + apply_calibration_status(module) + + model.apply(enable_quantization) # quantize at the same time as calibrate + + + def on_start(self, state: State, event: Event, **kwargs): + self.started_ = True + + # register quantization calibration hooks + # assume quantization has been initialized by this modifier or one before it + # Replace it with call to self.start_calibration + # QuantizationMixin.start_calibration(self, state.model) + self.start_calibration( state.model) + for name, module in state.model.named_modules(): + if _is_decoding_layer(module, name): + # register input/output capture hooks for decoding layers + logger.warning(f">> Registering input/output capture hooks for decoding layer {getattr(module, '_tmp_name', '')} || {name}") + module.register_forward_pre_hook(input_capture_hook, with_kwargs=True) + module.register_forward_hook(output_capture_hook, with_kwargs=True) + + + def on_event(self, state: State, event: Event, **kwargs): + if event.type_ == EventType.CALIBRATION_EPOCH_START: + if not self.started_: + self.on_start(state, None) + + if event.type_ == EventType.SEQUENTIAL_EPOCH_END: + self.autoround(state) + + if event.type_ == EventType.CALIBRATION_EPOCH_END: + if not self.ended_: + self.on_end(state, None) + + def autoround(self, state): + cur_layer_idx = self._cur_layer_idx + self._cur_layer_idx += 1 + logger.info(f">>||>> AutoRound for decoding layer index {cur_layer_idx}") + if cur_layer_idx >= len(state.model.model.layers): + logger.info( + f">>||>> All decoding layers have been processed for AutoRound." 
+ ) + # self.compress_modules(return_directly=False) + return + decoding_layer = state.model.model.layers[cur_layer_idx] + logger.debug( + f">>||>> Strating AutoRound for decoding layer {getattr(decoding_layer, '_tmp_name', '')}" + ) + + wrapped_model = _wrap_decoding_layer(decoding_layer) + + with torch.enable_grad(), align_module_device(decoding_layer): + if _DEBUG: + iters = 4 + else: + iters = 200 + import auto_round + + ar = auto_round.AutoRound( + model=wrapped_model, + tokenizer="", + scheme="W4A16", + iters=iters, + enable_quanted_input=False, + # FIXME: batch size 1 causes error, looks like related to the input_others prepare + # batch_size=1 + # enable_torch_compile=True, + # enable_deterministic_algorithms=True, + ) + + ar.configure_layer_config() + + input_name = f"model.layers.{cur_layer_idx}" + cur_inputs = all_module_input[input_name] + input_ids, input_others = normalize_input(cur_inputs) + decoding_layer.tuning_device = torch.device("cuda") + + ar.quantize_block( + block=decoding_layer, + input_ids=input_ids, + input_others=input_others, + q_input=None, + device="cuda", + ) + # Update offload parameters and remove temporary attributes + for name, module in decoding_layer.named_modules(): + if hasattr(module, "weight_scale") and hasattr( + module, "weight_zero_point" + ): + logger.debug( + f"Updating offload parameters for module {getattr(module, '_tmp_name', '')} || {name}" + ) + # weight = module.weight + weight_scale = module.scale + del module.scale + del module.zp + # TODO: update weight as well + # breakpoint() + + update_offload_parameter(module, "weight_scale", weight_scale) + + for module in list(self._num_samples.keys()): + name = self._module_names[module] + del self._num_samples[module] + decoding_layer.eval() + all_module_input.clear() + all_module_output.clear() + + + def on_end(self, state: State, event: Event, **kwargs): + """ + Finish calibrating by removing observers and calibration hooks + """ + self.ended_ = True + QuantizationMixin.end_calibration(self, state.model) + self.remove_hooks() # remove gptq hooks + + def on_finalize(self, state: State, **kwargs) -> bool: + """ + disable the quantization observers used by the OBCQ algorithm + + :param state: session state storing input model and calibration data + """ + if not self.ended_: + self.on_end(state, None) + + if len(self._num_samples) > 0: + raise ValueError(f"Failed to compress {len(self._num_samples)} modules") + + self._hessians = dict() + self._num_samples = dict() + + return True + + @contextlib.contextmanager + def _maybe_onload_hessian(self, module: torch.nn.Module): + if self.offload_hessians: + device = get_execution_device(module) + self._hessians[module] = self._hessians[module].to(device=device) + + yield + + if self.offload_hessians: + if module in self._hessians: # may have been deleted in context + self._hessians[module] = self._hessians[module].to(device="cpu") From 9c537ccaf73e7f0e785fcccd176ca0fdbf1be598 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 2 Nov 2025 22:14:42 -0800 Subject: [PATCH 03/57] refine code Signed-off-by: yiliu30 --- .../pipelines/layer_sequential/pipeline.py | 3 ++- src/llmcompressor/pipelines/sequential/pipeline.py | 10 +++++++--- src/llmcompressor/utils/helpers.py | 3 +++ 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/llmcompressor/pipelines/layer_sequential/pipeline.py b/src/llmcompressor/pipelines/layer_sequential/pipeline.py index 244edde87e..54d59e948a 100644 --- a/src/llmcompressor/pipelines/layer_sequential/pipeline.py +++ 
b/src/llmcompressor/pipelines/layer_sequential/pipeline.py
@@ -19,6 +19,7 @@
 from llmcompressor.pipelines.sequential.helpers import (
     dispatch_for_sequential,
     get_sequential_targets,
+    DISABLE_QAC_MODIFIERS,
 )
 from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context
 
@@ -72,7 +73,7 @@ def __call__(
 
     # TODO: remove this to enable quantization aware calibration for GPTQ and AWQ
     disable_qac = any(
-        type(mod).__name__ in ["GPTQModifier", "AWQModifier"]
+        type(mod).__name__ in DISABLE_QAC_MODIFIERS
         for mod in session.lifecycle.recipe.modifiers
     )
 
diff --git a/src/llmcompressor/pipelines/sequential/pipeline.py b/src/llmcompressor/pipelines/sequential/pipeline.py
index 261afd6544..1e7df2da53 100644
--- a/src/llmcompressor/pipelines/sequential/pipeline.py
+++ b/src/llmcompressor/pipelines/sequential/pipeline.py
@@ -15,7 +15,11 @@
     get_sequential_targets,
     trace_subgraphs,
 )
-from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context
+from llmcompressor.utils.helpers import (
+    DisableQuantization,
+    calibration_forward_context,
+    DISABLE_QAC_MODIFIERS,
+)
 
 if TYPE_CHECKING:
     from llmcompressor.args.dataset_arguments import DatasetArguments
@@ -74,8 +78,8 @@ def __call__(
 
     # TODO: remove this to enable quantization aware calibration for GPTQ and AWQ
     disable_qac = any(
-        type(mod).__name__ in ["GPTQModifier", "AWQModifier"]
-        for mod in session.lifecycle.recipe.modifiers
+        type(mod).__name__ in DISABLE_QAC_MODIFIERS
+        for mod in session.lifecycle.recipe.modifiers 
     )
 
     with contextlib.ExitStack() as stack:
diff --git a/src/llmcompressor/utils/helpers.py b/src/llmcompressor/utils/helpers.py
index 9aaae59eb9..b1c6c02f0f 100644
--- a/src/llmcompressor/utils/helpers.py
+++ b/src/llmcompressor/utils/helpers.py
@@ -67,6 +67,7 @@
     "calibration_forward_context",
     "patch_attr",
     "disable_hf_kernels",
+    "DISABLE_QAC_MODIFIERS"
 ]
 
 
@@ -1082,3 +1083,5 @@ def patch_attr(base: object, attr: str, value: Any):
             setattr(base, attr, original_value)
         else:
             delattr(base, attr)
+
+DISABLE_QAC_MODIFIERS = ["GPTQModifier", "AWQModifier", "AutoRoundModifier"]
\ No newline at end of file

From bebe0fa1a87321eb1472465af2441f2abaf6d00e Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Sun, 2 Nov 2025 22:23:58 -0800
Subject: [PATCH 04/57] disable qac for auto-round

Signed-off-by: yiliu30
---
 src/llmcompressor/pipelines/layer_sequential/pipeline.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/llmcompressor/pipelines/layer_sequential/pipeline.py b/src/llmcompressor/pipelines/layer_sequential/pipeline.py
index 54d59e948a..de3a093799 100644
--- a/src/llmcompressor/pipelines/layer_sequential/pipeline.py
+++ b/src/llmcompressor/pipelines/layer_sequential/pipeline.py
@@ -19,9 +19,12 @@
 from llmcompressor.pipelines.sequential.helpers import (
     dispatch_for_sequential,
     get_sequential_targets,
+)
+from llmcompressor.utils.helpers import (
+    DisableQuantization,
+    calibration_forward_context,
     DISABLE_QAC_MODIFIERS,
 )
-from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context
 
 if TYPE_CHECKING:
     from llmcompressor.args.dataset_arguments import DatasetArguments

From dfb0ff828fafbc48a845a629ddd2ea7961cda90f Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Sun, 2 Nov 2025 23:00:53 -0800
Subject: [PATCH 05/57] clean code

Signed-off-by: yiliu30
---
 .../modifiers/quantization/autoround/base.py  | 60 +------------------
 1 file changed, 2 insertions(+), 58 deletions(-)

diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py 
b/src/llmcompressor/modifiers/quantization/autoround/base.py index 7ad9bf3e8d..13b8fe013f 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -20,11 +20,6 @@ from llmcompressor.core import Event, EventType, State from llmcompressor.modifiers import Modifier -from llmcompressor.modifiers.quantization.gptq.gptq_quantize import ( - accumulate_hessian, - make_empty_hessian, - quantize_weight, -) from llmcompressor.modifiers.quantization.quantization import QuantizationMixin from llmcompressor.sentinel import Sentinel from llmcompressor.utils.metric_logging import CompressionLogger @@ -188,45 +183,15 @@ class AutoRoundModifier(Modifier, QuantizationMixin): block_size: int = 128 dampening_frac: Optional[float] = 0.01 # TODO: this does not serialize / will be incorrectly written - actorder: Optional[Union[ActivationOrdering, Sentinel]] = Sentinel("static") - offload_hessians: bool = False # private variables _module_names: Dict[torch.nn.Module, str] = PrivateAttr(default_factory=dict) - _hessians: Dict[torch.nn.Module, torch.Tensor] = PrivateAttr(default_factory=dict) - _num_samples: Dict[torch.nn.Module, int] = PrivateAttr(default_factory=dict) - + _cur_layer_idx = PrivateAttr(default=0) def resolve_quantization_config(self) -> QuantizationConfig: config = super().resolve_quantization_config() - - def resolve_actorder(existing): - # sentinel default only overrides if existing is None - if self.actorder == Sentinel("static"): - return ActivationOrdering.STATIC if existing is None else existing - - # user-provided value always attempts to override - if existing is None or self.actorder == existing: - return self.actorder - - # if existing provided and conflicts - raise ValueError( - "Cannot resolve activation ordering when both " - "`AutoRoundModifier.actorder` and `QuantizationScheme.actorder` " - f"are provided and differ ({self.actorder}, {existing}). " - "Either unset `AutoRoundModifier.actorder` or " - "remove `actorder` from config groups." 
- ) - - for scheme in config.config_groups.values(): - assert isinstance(scheme, QuantizationScheme) - if ( - getattr_chain(scheme, "weights.strategy", None) - == QuantizationStrategy.GROUP - ): - scheme.weights.actorder = resolve_actorder(scheme.weights.actorder) return config def on_initialize(self, state: State, **kwargs) -> bool: @@ -369,9 +334,6 @@ def autoround(self, state): update_offload_parameter(module, "weight_scale", weight_scale) - for module in list(self._num_samples.keys()): - name = self._module_names[module] - del self._num_samples[module] decoding_layer.eval() all_module_input.clear() all_module_output.clear() @@ -394,22 +356,4 @@ def on_finalize(self, state: State, **kwargs) -> bool: if not self.ended_: self.on_end(state, None) - if len(self._num_samples) > 0: - raise ValueError(f"Failed to compress {len(self._num_samples)} modules") - - self._hessians = dict() - self._num_samples = dict() - - return True - - @contextlib.contextmanager - def _maybe_onload_hessian(self, module: torch.nn.Module): - if self.offload_hessians: - device = get_execution_device(module) - self._hessians[module] = self._hessians[module].to(device=device) - - yield - - if self.offload_hessians: - if module in self._hessians: # may have been deleted in context - self._hessians[module] = self._hessians[module].to(device="cpu") + return True \ No newline at end of file From 513972c298f93fdf8b67a4a591bdfaea369bc463 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 2 Nov 2025 23:01:31 -0800 Subject: [PATCH 06/57] add compile after disable qac Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/quantization/autoround/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index 13b8fe013f..a8fc4cd274 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -299,7 +299,7 @@ def autoround(self, state): enable_quanted_input=False, # FIXME: batch size 1 causes error, looks like related to the input_others prepare # batch_size=1 - # enable_torch_compile=True, + enable_torch_compile=True, # enable_deterministic_algorithms=True, ) From 2291cc41da63fb67674c137d78d91497f031000f Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 3 Nov 2025 00:07:37 -0800 Subject: [PATCH 07/57] add iters and clean code Signed-off-by: yiliu30 --- .../modifiers/quantization/autoround/base.py | 20 ++++++------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index a8fc4cd274..c693246fcc 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -23,6 +23,8 @@ from llmcompressor.modifiers.quantization.quantization import QuantizationMixin from llmcompressor.sentinel import Sentinel from llmcompressor.utils.metric_logging import CompressionLogger +from compressed_tensors.quantization import enable_quantization +from llmcompressor.modifiers.quantization.calibration import apply_calibration_status __all__ = ["AutoRoundModifier"] @@ -180,13 +182,12 @@ class AutoRoundModifier(Modifier, QuantizationMixin): # gptq modifier arguments sequential_targets: Union[str, List[str], None] = None - block_size: int = 128 + iters: int = 200 dampening_frac: Optional[float] = 0.01 # TODO: this does not serialize / will 
be incorrectly written # private variables _module_names: Dict[torch.nn.Module, str] = PrivateAttr(default_factory=dict) - _cur_layer_idx = PrivateAttr(default=0) @@ -217,7 +218,6 @@ def on_initialize(self, state: State, **kwargs) -> bool: # freeze all model parameters for name, param in state.model.named_parameters(): param.requires_grad_(False) - return True @@ -228,8 +228,7 @@ def start_calibration(self, model: torch.nn.Module): :param model: model to prepare for calibration """ - from compressed_tensors.quantization import enable_quantization - from llmcompressor.modifiers.quantization.calibration import apply_calibration_status + for _, module in match_named_modules(model, self.targets, self.ignore): # Note: No need to register observers for auto-round # self._initialize_observers(module) @@ -285,17 +284,12 @@ def autoround(self, state): wrapped_model = _wrap_decoding_layer(decoding_layer) with torch.enable_grad(), align_module_device(decoding_layer): - if _DEBUG: - iters = 4 - else: - iters = 200 import auto_round - ar = auto_round.AutoRound( model=wrapped_model, tokenizer="", scheme="W4A16", - iters=iters, + iters=self.iters, enable_quanted_input=False, # FIXME: batch size 1 causes error, looks like related to the input_others prepare # batch_size=1 @@ -325,13 +319,11 @@ def autoround(self, state): logger.debug( f"Updating offload parameters for module {getattr(module, '_tmp_name', '')} || {name}" ) - # weight = module.weight + # The model's weight is already quantized and determined in auto-round weight_scale = module.scale del module.scale del module.zp # TODO: update weight as well - # breakpoint() - update_offload_parameter(module, "weight_scale", weight_scale) decoding_layer.eval() From 40288534ce0e4bd27bb2339d7b76a2fcf4423e60 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 3 Nov 2025 00:22:26 -0800 Subject: [PATCH 08/57] clean code Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/quantization/autoround/base.py | 2 +- src/llmcompressor/modifiers/quantization/gptq/base.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index c693246fcc..ae4272b4dd 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -319,7 +319,7 @@ def autoround(self, state): logger.debug( f"Updating offload parameters for module {getattr(module, '_tmp_name', '')} || {name}" ) - # The model's weight is already quantized and determined in auto-round + # Note: The model's weight is already quantized and dequantized in-place by auto-round weight_scale = module.scale del module.scale del module.zp diff --git a/src/llmcompressor/modifiers/quantization/gptq/base.py b/src/llmcompressor/modifiers/quantization/gptq/base.py index cf1e47d841..385de9840a 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/base.py +++ b/src/llmcompressor/modifiers/quantization/gptq/base.py @@ -266,7 +266,7 @@ def compress_modules(self): percdamp=self.dampening_frac, ) comp_logger.set_loss(loss) - breakpoint() + update_offload_parameter(module, "weight", quantized_weight) update_offload_parameter(module, "weight_scale", scale) update_offload_parameter(module, "weight_zero_point", zero_point) From 97ff9e02b0cdfd0ec95ed4c884ca85d76d9f8210 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 3 Nov 2025 00:23:13 -0800 Subject: [PATCH 09/57] add example Signed-off-by: yiliu30 --- 
.../auto_round_llama3_example.py | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 examples/quantization_w4a16/auto_round_llama3_example.py diff --git a/examples/quantization_w4a16/auto_round_llama3_example.py b/examples/quantization_w4a16/auto_round_llama3_example.py new file mode 100644 index 0000000000..e9d309c233 --- /dev/null +++ b/examples/quantization_w4a16/auto_round_llama3_example.py @@ -0,0 +1,149 @@ +import os +_DEBUG = os.environ.get("DEBUG", "0") == "1" +os.environ["TOKENIZERS_PARALLELISM"] = "false" +from datasets import load_dataset +from transformers import AutoModelForCausalLM, AutoTokenizer + +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import AutoRoundModifier +from llmcompressor.modifiers.quantization import AutoRoundModifier +# from llmcompressor.modifiers.quantization import QuantizationModifier as AutoRoundModifier +from llmcompressor.utils import dispatch_for_generation + +# Select model and load it. +model_id = "meta-llama/Meta-Llama-3-8B-Instruct" +model_id = "/data5/yliu7/HF_HOME/meta-llama/Llama-3.2-1B-Instruct" +model_id = "Qwen/Qwen2.5-0.5B" +model_id = "/data5/yliu7/HF_HOME/Qwen/Qwen2.5-0.5B" +model_id = "/data5/yliu7/meta-llama/meta-llama/Meta-Llama-3.1-8B-Instruct" + +model_dir="/storage/yiliu7" +# model_id=f"{model_dir}/meta-llama/Meta-Llama-3.1-8B-Instruct" + +model_dir="/storage/yiliu7" +model_name="meta-llama/Meta-Llama-3.1-8B-Instruct" +model_name="Qwen/Qwen2.5-0.5B/" + +model_id=f"{model_dir}/{model_name}" + + +# model_id = "facebook/opt-125m" +model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(model_id) + +if _DEBUG: + from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer + from transformers.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM + from transformers.models.llama.modeling_llama import LlamaForCausalLM + import torch + + config = AutoConfig.from_pretrained(model_id) + config.num_hidden_layers = 2 # Use a smaller model for testing + # Fix configuration validation issues + # config.layer_types = config.layer_types[: config.num_hidden_layers] + + # Load the tokenizer and model + if "Qwen" in model_id: + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) + model = Qwen2ForCausalLM(config) + else: + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) + model = LlamaForCausalLM(config) + model.to(torch.bfloat16) + NUM_CALIBRATION_SAMPLES = 3 + MAX_SEQUENCE_LENGTH = 16 + iters = 4 + +else: + # Select number of samples. 512 samples is a good place to start. + # Increasing the number of samples can improve accuracy. + light = {"batch_size": 8, "iters": 50, "seqlen": 2048, "nsamples": 128, "lr": 5e-3} + light = {"batch_size": 8, "iters": 200, "seqlen": 2048, "nsamples": 128, "lr": 5e-3} + + light = {"batch_size": 8, "iters": 200, "seqlen": 2048, "nsamples": 128, "lr": None} + # light = {"batch_size": 8, "iters": 200, "seqlen": 2048, "nsamples": 32, "lr": None} + NUM_CALIBRATION_SAMPLES = light["nsamples"] + MAX_SEQUENCE_LENGTH = light["seqlen"] + iters = light["iters"] + +# Select calibration dataset. 
+DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" + + + +from auto_round.calib_dataset import get_dataset + +from llmcompressor.args import DatasetArguments +ds = get_dataset( + tokenizer=tokenizer, + seqlen=MAX_SEQUENCE_LENGTH, + nsamples=NUM_CALIBRATION_SAMPLES, +) +# data_args = DatasetArguments(shuffle_calibration_samples=False) +# Load dataset and preprocess. +# ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +# ds = ds.shuffle(seed=42) + + +# def preprocess(example): +# return { +# "text": tokenizer.apply_chat_template( +# example["messages"], +# tokenize=False, +# ) +# } + + +# ds = ds.map(preprocess) + + +# # Tokenize inputs. +# def tokenize(sample): +# return tokenizer( +# sample["text"], +# padding=False, +# max_length=MAX_SEQUENCE_LENGTH, +# truncation=True, +# add_special_tokens=False, +# ) + + +# ds = ds.map(tokenize, remove_columns=ds.column_names) + +# Configure the quantization algorithm to run. +# * quantize the weights to 4 bit with GPTQ with a group size 128 +recipe = AutoRoundModifier( + targets="Linear", scheme="W4A16", ignore=["lm_head"], iters=iters +) + + +# Apply algorithms. +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + # !!! shuffle_calibration_samples: True -> mmlu 0.6574 + # !!! shuffle_calibration_samples: False -> mmlu 0.66 + shuffle_calibration_samples=False, +) + +# Confirm generations of the quantized model look sane. +print("\n\n") +print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) +sample = tokenizer("Explain AI in ", return_tensors="pt") +sample = {key: value.to(model.device) for key, value in sample.items()} +output = model.generate(**sample, max_new_tokens=100) +print(tokenizer.decode(output[0])) +print("==========================================\n\n") + +# Save to disk compressed. 
+SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128" +SAVE_DIR = f"{model_dir}/" + model_id.rstrip("/").split("/")[-1] + "-W4A16-G128-disbale-shuffule-ar" +print(f"Saving quantized model to {SAVE_DIR}") +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) From cb7a5b4b4c5731179f8f14d2b050013d09b77fed Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 3 Nov 2025 01:01:16 -0800 Subject: [PATCH 10/57] refine docs Signed-off-by: yiliu30 --- .../auto_round_llama3_example.py | 2 - .../modifiers/quantization/autoround/base.py | 67 +++++++------------ 2 files changed, 26 insertions(+), 43 deletions(-) diff --git a/examples/quantization_w4a16/auto_round_llama3_example.py b/examples/quantization_w4a16/auto_round_llama3_example.py index e9d309c233..e2e52e0a05 100644 --- a/examples/quantization_w4a16/auto_round_llama3_example.py +++ b/examples/quantization_w4a16/auto_round_llama3_example.py @@ -74,8 +74,6 @@ from auto_round.calib_dataset import get_dataset - -from llmcompressor.args import DatasetArguments ds = get_dataset( tokenizer=tokenizer, seqlen=MAX_SEQUENCE_LENGTH, diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index ae4272b4dd..4aa6124103 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -25,27 +25,18 @@ from llmcompressor.utils.metric_logging import CompressionLogger from compressed_tensors.quantization import enable_quantization from llmcompressor.modifiers.quantization.calibration import apply_calibration_status +from collections import defaultdict __all__ = ["AutoRoundModifier"] -from collections import defaultdict -import os -FALLBACK_CHANGE = os.environ.get("FALLBACK_CHANGE", "0").lower() in ("1", "true", "yes") -_DEBUG = os.environ.get("DEBUG", "0").lower() in ("1", "true", "yes") + all_module_input = defaultdict(list) all_module_output = defaultdict(list) -def input_capture_hook(module, *args, **kwargs): - all_module_input[module._tmp_name].append((args, kwargs)) - - -def output_capture_hook(module, *args, **kwargs): - all_module_output[module._tmp_name].append((args, kwargs)) - def normalize_input(cur_inputs): # TODO: move it to auto-round @@ -104,22 +95,17 @@ def _wrap_decoding_layer(layer: torch.nn.Module) -> _PretrainModelWrapper: wrapped_model.dtype = first_param.dtype return wrapped_model - - class AutoRoundModifier(Modifier, QuantizationMixin): """ - Implements the GPTQ algorithm from https://arxiv.org/abs/2210.17323. This modifier - uses activations to calibrate a hessian matrix, which is then used to determine - optimal quantizion values and orderings for the model weights. + Implements the AutoRound algorithm from https://arxiv.org/pdf/2309.05516. This modifier + leverages signed gradient descent (SignSGD) and block-wise loss to optimize rounding values + and weight clipping in a few steps. 
| Sample yaml: | test_stage: | obcq_modifiers: | AutoRoundModifier: - | block_size: 128 - | dampening_frac: 0.001 - | offload_hessians: False - | actorder: static + | iters: 200 | config_groups: | group_0: | targets: @@ -127,7 +113,7 @@ class AutoRoundModifier(Modifier, QuantizationMixin): | input_activations: null | output_activations: null | weights: - | num_bits: 8 + | num_bits: 4 | type: "int" | symmetric: true | strategy: group @@ -137,24 +123,15 @@ class AutoRoundModifier(Modifier, QuantizationMixin): - on_initialize - apply config to model - on_start - - add activation calibration hooks - - add gptq weight calibration hooks + - add input/output capture hooks to decoding layers - on_sequential_epoch_end - quantize_weight - on_finalize - remove_hooks() - model.apply(freeze_module_quantization) - :param sequential_targets: list of layer names to compress during GPTQ, or + :param sequential_targets: list of layer names to compress during AutoRound, or '__ALL__' to compress every layer in the model - :param block_size: Used to determine number of columns to compress in one pass - :param dampening_frac: Amount of dampening to apply to H, as a fraction of the - diagonal norm - :param actorder: order in which weight columns are quantized. Defaults to "static" - activation ordering, which achieves best accuracy recovery with no runtime cost. - For more information, see https://github.com/vllm-project/vllm/pull/8135 - :param offload_hessians: Set to True for decreased memory usage but increased - runtime. :param config_groups: dictionary specifying quantization schemes to apply to target modules. Modules not matching a scheme target will NOT be quantized. @@ -180,10 +157,9 @@ class AutoRoundModifier(Modifier, QuantizationMixin): and kv_cache_scheme != None, the quantization of kv cache will fail """ - # gptq modifier arguments + # AutoRound modifier arguments sequential_targets: Union[str, List[str], None] = None - iters: int = 200 - dampening_frac: Optional[float] = 0.01 + iters: Optional[int] = 200 # TODO: this does not serialize / will be incorrectly written # private variables @@ -197,7 +173,7 @@ def resolve_quantization_config(self) -> QuantizationConfig: def on_initialize(self, state: State, **kwargs) -> bool: """ - Initialize and run the GPTQ algorithm on the current state + Initialize and run the AutoRound algorithm on the current state :param state: session state storing input model and calibration data """ @@ -238,20 +214,29 @@ def start_calibration(self, model: torch.nn.Module): model.apply(enable_quantization) # quantize at the same time as calibrate + def input_capture_hook(self, module, *args, **kwargs): + all_module_input[module._tmp_name].append((args, kwargs)) + + + def output_capture_hook(self, module, *args, **kwargs): + all_module_output[module._tmp_name].append((args, kwargs)) + + + def on_start(self, state: State, event: Event, **kwargs): self.started_ = True # register quantization calibration hooks # assume quantization has been initialized by this modifier or one before it - # Replace it with call to self.start_calibration - # QuantizationMixin.start_calibration(self, state.model) - self.start_calibration( state.model) + self.start_calibration(state.model) for name, module in state.model.named_modules(): if _is_decoding_layer(module, name): # register input/output capture hooks for decoding layers logger.warning(f">> Registering input/output capture hooks for decoding layer {getattr(module, '_tmp_name', '')} || {name}") - 
module.register_forward_pre_hook(input_capture_hook, with_kwargs=True) - module.register_forward_hook(output_capture_hook, with_kwargs=True) + # module.register_forward_pre_hook(input_capture_hook, with_kwargs=True) + # module.register_forward_hook(output_capture_hook, with_kwargs=True) + self.register_hook(module, self.input_capture_hook, "forward_pre", with_kwargs=True) + self.register_hook(module, self.output_capture_hook, "forward", with_kwargs=True) def on_event(self, state: State, event: Event, **kwargs): From 5a7500ed373b0bf828dea64d9ff00cb3b3e133eb Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 3 Nov 2025 01:03:50 -0800 Subject: [PATCH 11/57] refine example Signed-off-by: yiliu30 --- .../auto_round_llama3_example.py | 40 +------------------ 1 file changed, 2 insertions(+), 38 deletions(-) diff --git a/examples/quantization_w4a16/auto_round_llama3_example.py b/examples/quantization_w4a16/auto_round_llama3_example.py index e2e52e0a05..413ce50087 100644 --- a/examples/quantization_w4a16/auto_round_llama3_example.py +++ b/examples/quantization_w4a16/auto_round_llama3_example.py @@ -22,7 +22,7 @@ model_dir="/storage/yiliu7" model_name="meta-llama/Meta-Llama-3.1-8B-Instruct" -model_name="Qwen/Qwen2.5-0.5B/" +# model_name="Qwen/Qwen2.5-0.5B/" model_id=f"{model_dir}/{model_name}" @@ -67,51 +67,15 @@ iters = light["iters"] # Select calibration dataset. -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" - - - from auto_round.calib_dataset import get_dataset ds = get_dataset( tokenizer=tokenizer, seqlen=MAX_SEQUENCE_LENGTH, nsamples=NUM_CALIBRATION_SAMPLES, ) -# data_args = DatasetArguments(shuffle_calibration_samples=False) -# Load dataset and preprocess. -# ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -# ds = ds.shuffle(seed=42) - - -# def preprocess(example): -# return { -# "text": tokenizer.apply_chat_template( -# example["messages"], -# tokenize=False, -# ) -# } - - -# ds = ds.map(preprocess) - - -# # Tokenize inputs. -# def tokenize(sample): -# return tokenizer( -# sample["text"], -# padding=False, -# max_length=MAX_SEQUENCE_LENGTH, -# truncation=True, -# add_special_tokens=False, -# ) - - -# ds = ds.map(tokenize, remove_columns=ds.column_names) # Configure the quantization algorithm to run. 
-# * quantize the weights to 4 bit with GPTQ with a group size 128 +# * quantize the weights to 4 bit with AutoRound with a group size 128 recipe = AutoRoundModifier( targets="Linear", scheme="W4A16", ignore=["lm_head"], iters=iters ) From d02a355690573ccdb0a399bd4160a7d51124c375 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 3 Nov 2025 02:14:01 -0800 Subject: [PATCH 12/57] add init Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/quantization/autoround/__init__.py | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 src/llmcompressor/modifiers/quantization/autoround/__init__.py diff --git a/src/llmcompressor/modifiers/quantization/autoround/__init__.py b/src/llmcompressor/modifiers/quantization/autoround/__init__.py new file mode 100644 index 0000000000..a4291054b4 --- /dev/null +++ b/src/llmcompressor/modifiers/quantization/autoround/__init__.py @@ -0,0 +1,3 @@ +# ruff: noqa + +from .base import * From cea9d2f3b981a201dad0c6fb56edf17a089be06a Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 3 Nov 2025 02:25:07 -0800 Subject: [PATCH 13/57] clean code Signed-off-by: yiliu30 --- .../modifiers/quantization/autoround/base.py | 48 +++++++------------ 1 file changed, 17 insertions(+), 31 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index 4aa6124103..3995e92f23 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -33,8 +33,7 @@ -all_module_input = defaultdict(list) -all_module_output = defaultdict(list) + @@ -144,17 +143,6 @@ class AutoRoundModifier(Modifier, QuantizationMixin): will be set to the targets parameter set at the modifier level. Can also be set to a dictionary of the format `preset_scheme_name: targets` for example: `W8A8: ['Linear']` for weight and activation 8-bit. - :param kv_cache_scheme: optional QuantizationArgs, that specify the - quantization of the kv cache. If None, kv cache is not quantized. - When applying kv cache quantization to transformer AutoModelForCausalLM, - the kv_cache_scheme gets converted into a QuantizationScheme that: - - targets the `q_proj` and `k_proj` modules of the model. The outputs - of those modules are the keys and values that might be cached - - quantizes the outputs of the aformentioned layers, so that - keys and values are compressed before storing them in the cache - There is an explicit assumption that the model contains modules with - `k_proj` and `v_proj` in their names. 
If this is not the case - and kv_cache_scheme != None, the quantization of kv cache will fail """ # AutoRound modifier arguments @@ -165,7 +153,8 @@ class AutoRoundModifier(Modifier, QuantizationMixin): # private variables _module_names: Dict[torch.nn.Module, str] = PrivateAttr(default_factory=dict) _cur_layer_idx = PrivateAttr(default=0) - + _all_module_input: Dict[str, List[Tuple]] = PrivateAttr(default_factory=dict) + _all_module_output: Dict[str, List[Tuple]] = PrivateAttr(default_factory=dict) def resolve_quantization_config(self) -> QuantizationConfig: config = super().resolve_quantization_config() @@ -188,7 +177,7 @@ def on_initialize(self, state: State, **kwargs) -> bool: state.model, self.targets, self.ignore ) } - # add tmp name for each module for debugging + # add temporary names to all modules for debugging for name, mod in state.model.named_modules(): mod._tmp_name = name # freeze all model parameters @@ -199,7 +188,7 @@ def on_initialize(self, state: State, **kwargs) -> bool: def start_calibration(self, model: torch.nn.Module): """ - Register activation calibration hooks (including kv_cache quantization) and enable quantization as we calibrate + Register activation calibration hooks and enable quantization as we calibrate :param model: model to prepare for calibration """ @@ -215,11 +204,11 @@ def start_calibration(self, model: torch.nn.Module): def input_capture_hook(self, module, *args, **kwargs): - all_module_input[module._tmp_name].append((args, kwargs)) + self._all_module_input[module._tmp_name].append((args, kwargs)) def output_capture_hook(self, module, *args, **kwargs): - all_module_output[module._tmp_name].append((args, kwargs)) + self._all_module_output[module._tmp_name].append((args, kwargs)) @@ -233,8 +222,6 @@ def on_start(self, state: State, event: Event, **kwargs): if _is_decoding_layer(module, name): # register input/output capture hooks for decoding layers logger.warning(f">> Registering input/output capture hooks for decoding layer {getattr(module, '_tmp_name', '')} || {name}") - # module.register_forward_pre_hook(input_capture_hook, with_kwargs=True) - # module.register_forward_hook(output_capture_hook, with_kwargs=True) self.register_hook(module, self.input_capture_hook, "forward_pre", with_kwargs=True) self.register_hook(module, self.output_capture_hook, "forward", with_kwargs=True) @@ -245,17 +232,19 @@ def on_event(self, state: State, event: Event, **kwargs): self.on_start(state, None) if event.type_ == EventType.SEQUENTIAL_EPOCH_END: - self.autoround(state) + self.apply_autoround(state) + self.post_autoround_cleanup() if event.type_ == EventType.CALIBRATION_EPOCH_END: if not self.ended_: self.on_end(state, None) - def autoround(self, state): + def apply_autoround(self, state): cur_layer_idx = self._cur_layer_idx self._cur_layer_idx += 1 logger.info(f">>||>> AutoRound for decoding layer index {cur_layer_idx}") if cur_layer_idx >= len(state.model.model.layers): + # skip the lm_head layer logger.info( f">>||>> All decoding layers have been processed for AutoRound." 
) @@ -276,16 +265,13 @@ def autoround(self, state): scheme="W4A16", iters=self.iters, enable_quanted_input=False, - # FIXME: batch size 1 causes error, looks like related to the input_others prepare - # batch_size=1 enable_torch_compile=True, - # enable_deterministic_algorithms=True, ) ar.configure_layer_config() input_name = f"model.layers.{cur_layer_idx}" - cur_inputs = all_module_input[input_name] + cur_inputs = self._all_module_input[input_name] input_ids, input_others = normalize_input(cur_inputs) decoding_layer.tuning_device = torch.device("cuda") @@ -304,16 +290,16 @@ def autoround(self, state): logger.debug( f"Updating offload parameters for module {getattr(module, '_tmp_name', '')} || {name}" ) - # Note: The model's weight is already quantized and dequantized in-place by auto-round + # Note: The model's weight is already quantized and dequantized in-place by auto-round. weight_scale = module.scale del module.scale del module.zp # TODO: update weight as well update_offload_parameter(module, "weight_scale", weight_scale) - - decoding_layer.eval() - all_module_input.clear() - all_module_output.clear() + decoding_layer.eval() + def post_autoround_cleanup(self): + self._all_module_input.clear() + self._all_module_output.clear() def on_end(self, state: State, event: Event, **kwargs): From 22be9b7c6a531552310e5d6446477170aec76991 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 3 Nov 2025 02:31:22 -0800 Subject: [PATCH 14/57] format Signed-off-by: yiliu30 --- .../auto_round_llama3_example.py | 25 +++++---- examples/quantization_w4a16/llama3_example.py | 4 +- .../modifiers/quantization/__init__.py | 2 +- .../modifiers/quantization/autoround/base.py | 51 +++++++------------ .../pipelines/layer_sequential/pipeline.py | 2 +- .../pipelines/sequential/pipeline.py | 4 +- src/llmcompressor/utils/helpers.py | 5 +- 7 files changed, 41 insertions(+), 52 deletions(-) diff --git a/examples/quantization_w4a16/auto_round_llama3_example.py b/examples/quantization_w4a16/auto_round_llama3_example.py index 413ce50087..4cb865c315 100644 --- a/examples/quantization_w4a16/auto_round_llama3_example.py +++ b/examples/quantization_w4a16/auto_round_llama3_example.py @@ -1,12 +1,12 @@ import os + _DEBUG = os.environ.get("DEBUG", "0") == "1" os.environ["TOKENIZERS_PARALLELISM"] = "false" -from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot from llmcompressor.modifiers.quantization import AutoRoundModifier -from llmcompressor.modifiers.quantization import AutoRoundModifier + # from llmcompressor.modifiers.quantization import QuantizationModifier as AutoRoundModifier from llmcompressor.utils import dispatch_for_generation @@ -17,14 +17,14 @@ model_id = "/data5/yliu7/HF_HOME/Qwen/Qwen2.5-0.5B" model_id = "/data5/yliu7/meta-llama/meta-llama/Meta-Llama-3.1-8B-Instruct" -model_dir="/storage/yiliu7" +model_dir = "/storage/yiliu7" # model_id=f"{model_dir}/meta-llama/Meta-Llama-3.1-8B-Instruct" -model_dir="/storage/yiliu7" -model_name="meta-llama/Meta-Llama-3.1-8B-Instruct" +model_dir = "/storage/yiliu7" +model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct" # model_name="Qwen/Qwen2.5-0.5B/" -model_id=f"{model_dir}/{model_name}" +model_id = f"{model_dir}/{model_name}" # model_id = "facebook/opt-125m" @@ -32,10 +32,10 @@ tokenizer = AutoTokenizer.from_pretrained(model_id) if _DEBUG: + import torch from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer - from transformers.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM from 
transformers.models.llama.modeling_llama import LlamaForCausalLM - import torch + from transformers.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM config = AutoConfig.from_pretrained(model_id) config.num_hidden_layers = 2 # Use a smaller model for testing @@ -68,6 +68,7 @@ # Select calibration dataset. from auto_round.calib_dataset import get_dataset + ds = get_dataset( tokenizer=tokenizer, seqlen=MAX_SEQUENCE_LENGTH, @@ -79,7 +80,7 @@ recipe = AutoRoundModifier( targets="Linear", scheme="W4A16", ignore=["lm_head"], iters=iters ) - + # Apply algorithms. oneshot( @@ -105,7 +106,11 @@ # Save to disk compressed. SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128" -SAVE_DIR = f"{model_dir}/" + model_id.rstrip("/").split("/")[-1] + "-W4A16-G128-disbale-shuffule-ar" +SAVE_DIR = ( + f"{model_dir}/" + + model_id.rstrip("/").split("/")[-1] + + "-W4A16-G128-disbale-shuffule-ar" +) print(f"Saving quantized model to {SAVE_DIR}") model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w4a16/llama3_example.py b/examples/quantization_w4a16/llama3_example.py index 038c0ebc9f..945335de36 100644 --- a/examples/quantization_w4a16/llama3_example.py +++ b/examples/quantization_w4a16/llama3_example.py @@ -7,8 +7,8 @@ # Select model and load it. model_id = "meta-llama/Meta-Llama-3-8B-Instruct" -model_dir="/storage/yiliu7" -model_id=f"{model_dir}/meta-llama/Meta-Llama-3.1-8B-Instruct" +model_dir = "/storage/yiliu7" +model_id = f"{model_dir}/meta-llama/Meta-Llama-3.1-8B-Instruct" model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(model_id) diff --git a/src/llmcompressor/modifiers/quantization/__init__.py b/src/llmcompressor/modifiers/quantization/__init__.py index 7e4028279e..2c10fe4b97 100644 --- a/src/llmcompressor/modifiers/quantization/__init__.py +++ b/src/llmcompressor/modifiers/quantization/__init__.py @@ -2,4 +2,4 @@ from .gptq import * from .quantization import * -from .autoround import * \ No newline at end of file +from .autoround import * diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index 3995e92f23..06c4801d6a 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -1,17 +1,12 @@ -import contextlib from typing import Dict, List, Optional, Tuple, Union import torch from compressed_tensors.quantization import ( QuantizationConfig, - QuantizationScheme, - QuantizationStrategy, + enable_quantization, ) -from compressed_tensors.quantization.quant_args import ActivationOrdering from compressed_tensors.utils import ( align_module_device, - get_execution_device, - getattr_chain, match_named_modules, update_offload_parameter, ) @@ -20,23 +15,12 @@ from llmcompressor.core import Event, EventType, State from llmcompressor.modifiers import Modifier -from llmcompressor.modifiers.quantization.quantization import QuantizationMixin -from llmcompressor.sentinel import Sentinel -from llmcompressor.utils.metric_logging import CompressionLogger -from compressed_tensors.quantization import enable_quantization from llmcompressor.modifiers.quantization.calibration import apply_calibration_status -from collections import defaultdict +from llmcompressor.modifiers.quantization.quantization import QuantizationMixin __all__ = ["AutoRoundModifier"] - - - - - - - def normalize_input(cur_inputs): # TODO: move it to 
auto-round input_ids = [] @@ -94,6 +78,7 @@ def _wrap_decoding_layer(layer: torch.nn.Module) -> _PretrainModelWrapper: wrapped_model.dtype = first_param.dtype return wrapped_model + class AutoRoundModifier(Modifier, QuantizationMixin): """ Implements the AutoRound algorithm from https://arxiv.org/pdf/2309.05516. This modifier @@ -173,9 +158,7 @@ def on_initialize(self, state: State, **kwargs) -> bool: # prepare module names self._module_names = { m: name - for name, m in match_named_modules( - state.model, self.targets, self.ignore - ) + for name, m in match_named_modules(state.model, self.targets, self.ignore) } # add temporary names to all modules for debugging for name, mod in state.model.named_modules(): @@ -185,7 +168,6 @@ def on_initialize(self, state: State, **kwargs) -> bool: param.requires_grad_(False) return True - def start_calibration(self, model: torch.nn.Module): """ Register activation calibration hooks and enable quantization as we calibrate @@ -193,7 +175,6 @@ def start_calibration(self, model: torch.nn.Module): :param model: model to prepare for calibration """ - for _, module in match_named_modules(model, self.targets, self.ignore): # Note: No need to register observers for auto-round # self._initialize_observers(module) @@ -202,16 +183,12 @@ def start_calibration(self, model: torch.nn.Module): model.apply(enable_quantization) # quantize at the same time as calibrate - def input_capture_hook(self, module, *args, **kwargs): self._all_module_input[module._tmp_name].append((args, kwargs)) - def output_capture_hook(self, module, *args, **kwargs): self._all_module_output[module._tmp_name].append((args, kwargs)) - - def on_start(self, state: State, event: Event, **kwargs): self.started_ = True @@ -221,10 +198,15 @@ def on_start(self, state: State, event: Event, **kwargs): for name, module in state.model.named_modules(): if _is_decoding_layer(module, name): # register input/output capture hooks for decoding layers - logger.warning(f">> Registering input/output capture hooks for decoding layer {getattr(module, '_tmp_name', '')} || {name}") - self.register_hook(module, self.input_capture_hook, "forward_pre", with_kwargs=True) - self.register_hook(module, self.output_capture_hook, "forward", with_kwargs=True) - + logger.warning( + f">> Registering input/output capture hooks for decoding layer {getattr(module, '_tmp_name', '')} || {name}" + ) + self.register_hook( + module, self.input_capture_hook, "forward_pre", with_kwargs=True + ) + self.register_hook( + module, self.output_capture_hook, "forward", with_kwargs=True + ) def on_event(self, state: State, event: Event, **kwargs): if event.type_ == EventType.CALIBRATION_EPOCH_START: @@ -246,7 +228,7 @@ def apply_autoround(self, state): if cur_layer_idx >= len(state.model.model.layers): # skip the lm_head layer logger.info( - f">>||>> All decoding layers have been processed for AutoRound." + ">>||>> All decoding layers have been processed for AutoRound." 
) # self.compress_modules(return_directly=False) return @@ -259,6 +241,7 @@ def apply_autoround(self, state): with torch.enable_grad(), align_module_device(decoding_layer): import auto_round + ar = auto_round.AutoRound( model=wrapped_model, tokenizer="", @@ -297,11 +280,11 @@ def apply_autoround(self, state): # TODO: update weight as well update_offload_parameter(module, "weight_scale", weight_scale) decoding_layer.eval() + def post_autoround_cleanup(self): self._all_module_input.clear() self._all_module_output.clear() - def on_end(self, state: State, event: Event, **kwargs): """ Finish calibrating by removing observers and calibration hooks @@ -319,4 +302,4 @@ def on_finalize(self, state: State, **kwargs) -> bool: if not self.ended_: self.on_end(state, None) - return True \ No newline at end of file + return True diff --git a/src/llmcompressor/pipelines/layer_sequential/pipeline.py b/src/llmcompressor/pipelines/layer_sequential/pipeline.py index de3a093799..314cd8439e 100644 --- a/src/llmcompressor/pipelines/layer_sequential/pipeline.py +++ b/src/llmcompressor/pipelines/layer_sequential/pipeline.py @@ -21,9 +21,9 @@ get_sequential_targets, ) from llmcompressor.utils.helpers import ( + DISABLE_QAC_MODIFIERS, DisableQuantization, calibration_forward_context, - DISABLE_QAC_MODIFIERS, ) if TYPE_CHECKING: diff --git a/src/llmcompressor/pipelines/sequential/pipeline.py b/src/llmcompressor/pipelines/sequential/pipeline.py index 1e7df2da53..0f341a37da 100644 --- a/src/llmcompressor/pipelines/sequential/pipeline.py +++ b/src/llmcompressor/pipelines/sequential/pipeline.py @@ -16,9 +16,9 @@ trace_subgraphs, ) from llmcompressor.utils.helpers import ( + DISABLE_QAC_MODIFIERS, DisableQuantization, calibration_forward_context, - DISABLE_QAC_MODIFIERS, ) if TYPE_CHECKING: @@ -79,7 +79,7 @@ def __call__( # TODO: remove this to enable quantization aware calibration for GPTQ and AWQ disable_qac = any( type(mod).__name__ in DISABLE_QAC_MODIFIERS - for mod in session.lifecycle.recipe.modifiers + for mod in session.lifecycle.recipe.modifiers ) with contextlib.ExitStack() as stack: diff --git a/src/llmcompressor/utils/helpers.py b/src/llmcompressor/utils/helpers.py index b1c6c02f0f..0be09bd062 100644 --- a/src/llmcompressor/utils/helpers.py +++ b/src/llmcompressor/utils/helpers.py @@ -67,7 +67,7 @@ "calibration_forward_context", "patch_attr", "disable_hf_kernels", - "DISABLE_QAC_MODIFIERS" + "DISABLE_QAC_MODIFIERS", ] @@ -1084,4 +1084,5 @@ def patch_attr(base: object, attr: str, value: Any): else: delattr(base, attr) -DISABLE_QAC_MODIFIERS = ["GPTQModifier", "AWQModifier", "AutoRoundModifier"] \ No newline at end of file + +DISABLE_QAC_MODIFIERS = ["GPTQModifier", "AWQModifier", "AutoRoundModifier"] From 6cdb402b30d4f815122afc9c9dfdba7fe0cabe95 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 3 Nov 2025 03:10:06 -0800 Subject: [PATCH 15/57] refactor Signed-off-by: yiliu30 --- .../modifiers/quantization/autoround/base.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index 06c4801d6a..e4993173af 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -109,7 +109,8 @@ class AutoRoundModifier(Modifier, QuantizationMixin): - on_start - add input/output capture hooks to decoding layers - on_sequential_epoch_end - - quantize_weight + - apply_autoround + - post_autoround_cleanup - 
on_finalize - remove_hooks() - model.apply(freeze_module_quantization) @@ -184,9 +185,13 @@ def start_calibration(self, model: torch.nn.Module): model.apply(enable_quantization) # quantize at the same time as calibrate def input_capture_hook(self, module, *args, **kwargs): + if module._tmp_name not in self._all_module_input: + self._all_module_input[module._tmp_name] = [] self._all_module_input[module._tmp_name].append((args, kwargs)) def output_capture_hook(self, module, *args, **kwargs): + if module._tmp_name not in self._all_module_output: + self._all_module_output[module._tmp_name] = [] self._all_module_output[module._tmp_name].append((args, kwargs)) def on_start(self, state: State, event: Event, **kwargs): @@ -227,10 +232,7 @@ def apply_autoround(self, state): logger.info(f">>||>> AutoRound for decoding layer index {cur_layer_idx}") if cur_layer_idx >= len(state.model.model.layers): # skip the lm_head layer - logger.info( - ">>||>> All decoding layers have been processed for AutoRound." - ) - # self.compress_modules(return_directly=False) + logger.info(">>||>> All decoding layers have been processed for AutoRound.") return decoding_layer = state.model.model.layers[cur_layer_idx] logger.debug( @@ -270,9 +272,6 @@ def apply_autoround(self, state): if hasattr(module, "weight_scale") and hasattr( module, "weight_zero_point" ): - logger.debug( - f"Updating offload parameters for module {getattr(module, '_tmp_name', '')} || {name}" - ) # Note: The model's weight is already quantized and dequantized in-place by auto-round. weight_scale = module.scale del module.scale From e2814ebc997472a53436c129eb49afd2d4915daf Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 3 Nov 2025 03:14:26 -0800 Subject: [PATCH 16/57] add ut Signed-off-by: yiliu30 --- .../transformers/autoround/test_oneshot.py | 92 +++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 tests/llmcompressor/transformers/autoround/test_oneshot.py diff --git a/tests/llmcompressor/transformers/autoround/test_oneshot.py b/tests/llmcompressor/transformers/autoround/test_oneshot.py new file mode 100644 index 0000000000..7e9adf1a76 --- /dev/null +++ b/tests/llmcompressor/transformers/autoround/test_oneshot.py @@ -0,0 +1,92 @@ +import pytest +import torch +from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme +from transformers import AutoModelForCausalLM, AutoTokenizer +from auto_round.calib_dataset import get_dataset +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization.autoround import AutoRoundModifier + +recipe_str = """ +quant_stage: + quant_modifiers: + AutoRoundModifier: + ignore: ["lm_head"] + iters: 10 + config_groups: + group_0: + targets: + - "Linear" + input_activations: null + output_activations: null + weights: + num_bits: 4 + type: "int" + symmetric: true + strategy: group + group_size: 128 +""" + +recipe_modifier_full = AutoRoundModifier( + ignore=["lm_head"], + config_groups={ + "group_0": QuantizationScheme( + targets=["Linear"], + weights=QuantizationArgs(num_bits=4, strategy="group", group_size=128), + ) + }, +) + + +@pytest.mark.parametrize( + "recipe", + [ + recipe_str, + recipe_modifier_full, + ], +) +def test_oneshot_application(recipe, tmp_path): + output = tmp_path / "oneshot_output" + model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" + tokenizer = AutoTokenizer.from_pretrained(model) + dataset = get_dataset( + tokenizer=tokenizer, + seqlen=1024, + nsamples=32, + ) + + device = "cuda:0" if torch.cuda.is_available() else "cpu" + + oneshot( + model=model, + 
dataset=dataset, + output_dir=output, + recipe=recipe, + ) + model_loaded = AutoModelForCausalLM.from_pretrained(output, device_map=device) + + # Check that the model is quantized + # for compression_config - decompress() will attach a quantization_config + # to the model as we decompress right away + # for quantization_config - we have CompressedLinear which will only + # decompress on the forward pass and does not call decompress(). Results + # in a slightly different parameter tree to access the quant config + quantization_config = model_loaded.config.quantization_config.quantization_config + assert quantization_config is not None + + # check config is set properly + assert "lm_head" in quantization_config.ignore + assert len(quantization_config.config_groups) == 1 + quant_scheme = quantization_config.config_groups["group_0"] + assert isinstance(quant_scheme, QuantizationScheme) + + weight_args = quantization_config.config_groups["group_0"].weights + assert isinstance(weight_args, QuantizationArgs) + assert weight_args.num_bits == 4 + + # Check a specific layer is quantized + targetted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj + assert hasattr(targetted_linear_layer, "quantization_scheme") + + # Check lm-head is not quantized + not_targetted = model_loaded.lm_head + assert not hasattr(not_targetted, "quantization_scheme") From 3e4a9fc5182d9cc7430daa324eccc064a0052217 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 3 Nov 2025 04:36:32 -0800 Subject: [PATCH 17/57] test llama 3 Signed-off-by: yiliu30 --- examples/quantization_w4a16/auto_round_llama3_example.py | 1 + examples/quantization_w4a16/llama3_example.py | 3 ++- src/llmcompressor/modifiers/quantization/autoround/base.py | 5 ----- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/examples/quantization_w4a16/auto_round_llama3_example.py b/examples/quantization_w4a16/auto_round_llama3_example.py index 4cb865c315..0d01643155 100644 --- a/examples/quantization_w4a16/auto_round_llama3_example.py +++ b/examples/quantization_w4a16/auto_round_llama3_example.py @@ -22,6 +22,7 @@ model_dir = "/storage/yiliu7" model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct" +model_name = "meta-llama/Meta-Llama-3-8B-Instruct" # model_name="Qwen/Qwen2.5-0.5B/" model_id = f"{model_dir}/{model_name}" diff --git a/examples/quantization_w4a16/llama3_example.py b/examples/quantization_w4a16/llama3_example.py index 945335de36..32ab8bb843 100644 --- a/examples/quantization_w4a16/llama3_example.py +++ b/examples/quantization_w4a16/llama3_example.py @@ -9,6 +9,7 @@ model_id = "meta-llama/Meta-Llama-3-8B-Instruct" model_dir = "/storage/yiliu7" model_id = f"{model_dir}/meta-llama/Meta-Llama-3.1-8B-Instruct" +model_id = f"{model_dir}/meta-llama/Meta-Llama-3-8B-Instruct" model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(model_id) @@ -75,6 +76,6 @@ def tokenize(sample): print("==========================================\n\n") # Save to disk compressed. 
-SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128" +SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128-GPTQ" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index e4993173af..5d0234510a 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -115,9 +115,6 @@ class AutoRoundModifier(Modifier, QuantizationMixin): - remove_hooks() - model.apply(freeze_module_quantization) - :param sequential_targets: list of layer names to compress during AutoRound, or - '__ALL__' to compress every layer in the model - :param config_groups: dictionary specifying quantization schemes to apply to target modules. Modules not matching a scheme target will NOT be quantized. :param targets: list of layer names to quantize if a scheme is provided. Defaults @@ -132,9 +129,7 @@ class AutoRoundModifier(Modifier, QuantizationMixin): """ # AutoRound modifier arguments - sequential_targets: Union[str, List[str], None] = None iters: Optional[int] = 200 - # TODO: this does not serialize / will be incorrectly written # private variables _module_names: Dict[torch.nn.Module, str] = PrivateAttr(default_factory=dict) From aa34b656421ce3ea37e0d36e858cfc89dcf359f6 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 3 Nov 2025 19:22:06 -0800 Subject: [PATCH 18/57] clean code Signed-off-by: yiliu30 --- .../auto_round_llama3_example.py | 7 ++- .../modifiers/quantization/autoround/base.py | 45 ++++--------------- 2 files changed, 14 insertions(+), 38 deletions(-) diff --git a/examples/quantization_w4a16/auto_round_llama3_example.py b/examples/quantization_w4a16/auto_round_llama3_example.py index 0d01643155..b717540df9 100644 --- a/examples/quantization_w4a16/auto_round_llama3_example.py +++ b/examples/quantization_w4a16/auto_round_llama3_example.py @@ -1,6 +1,7 @@ import os _DEBUG = os.environ.get("DEBUG", "0") == "1" +IS_LLAMA = os.environ.get("MODEL", "LLAMA") == "LLAMA" os.environ["TOKENIZERS_PARALLELISM"] = "false" from transformers import AutoModelForCausalLM, AutoTokenizer @@ -22,8 +23,10 @@ model_dir = "/storage/yiliu7" model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct" -model_name = "meta-llama/Meta-Llama-3-8B-Instruct" -# model_name="Qwen/Qwen2.5-0.5B/" +if IS_LLAMA: + model_name = "meta-llama/Meta-Llama-3-8B-Instruct" +else: + model_name="Qwen/Qwen2.5-0.5B/" model_id = f"{model_dir}/{model_name}" diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index 5d0234510a..fc38961d8f 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -21,32 +21,6 @@ __all__ = ["AutoRoundModifier"] -def normalize_input(cur_inputs): - # TODO: move it to auto-round - input_ids = [] - input_others = {} - positional_inputs = [] - attention_mask = None - position_ids = None - cache_position = None - position_embeddings = (None, None) - for cur_inp in cur_inputs: - input_ids.append(cur_inp[0][0][0]) - for key, val in cur_inp[0][1].items(): - if key == "position_ids": - position_ids = val - elif key == "position_embeddings": - position_embeddings = val - elif key == "cache_position": - cache_position = val - input_others["position_ids"] = position_ids - input_others["positional_inputs"] = positional_inputs - 
input_others["attention_mask"] = attention_mask - input_others["position_embeddings"] = position_embeddings - input_others["cache_position"] = cache_position - return input_ids, input_others - - def _is_decoding_layer(module, name): return "decoderlayer" in module.__class__.__name__.lower() @@ -87,7 +61,7 @@ class AutoRoundModifier(Modifier, QuantizationMixin): | Sample yaml: | test_stage: - | obcq_modifiers: + | modifiers: | AutoRoundModifier: | iters: 200 | config_groups: @@ -156,7 +130,7 @@ def on_initialize(self, state: State, **kwargs) -> bool: m: name for name, m in match_named_modules(state.model, self.targets, self.ignore) } - # add temporary names to all modules for debugging + # add temporary names to all modules for name, mod in state.model.named_modules(): mod._tmp_name = name # freeze all model parameters @@ -249,18 +223,17 @@ def apply_autoround(self, state): ) ar.configure_layer_config() - + first_param = next(decoding_layer.parameters()) + device = first_param.device input_name = f"model.layers.{cur_layer_idx}" cur_inputs = self._all_module_input[input_name] - input_ids, input_others = normalize_input(cur_inputs) - decoding_layer.tuning_device = torch.device("cuda") + decoding_layer.tuning_device = device ar.quantize_block( block=decoding_layer, - input_ids=input_ids, - input_others=input_others, - q_input=None, - device="cuda", + inputs=cur_inputs, + normalize_inputs=True, + device=device, ) # Update offload parameters and remove temporary attributes for name, module in decoding_layer.named_modules(): @@ -289,7 +262,7 @@ def on_end(self, state: State, event: Event, **kwargs): def on_finalize(self, state: State, **kwargs) -> bool: """ - disable the quantization observers used by the OBCQ algorithm + disable the quantization observers used by the AutoRound algorithm :param state: session state storing input model and calibration data """ From afe2ff79a5fd340876534d78611021f4433ae64e Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 3 Nov 2025 20:06:01 -0800 Subject: [PATCH 19/57] parse layer-wise config Signed-off-by: yiliu30 --- .../modifiers/quantization/autoround/base.py | 63 ++++++++++++------- 1 file changed, 40 insertions(+), 23 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index fc38961d8f..06e0fd3c73 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -1,13 +1,16 @@ -from typing import Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Tuple import torch from compressed_tensors.quantization import ( QuantizationConfig, + QuantizationStrategy, + QuantizationScheme, enable_quantization, ) from compressed_tensors.utils import ( align_module_device, match_named_modules, + getattr_chain, update_offload_parameter, ) from loguru import logger @@ -109,7 +112,6 @@ class AutoRoundModifier(Modifier, QuantizationMixin): _module_names: Dict[torch.nn.Module, str] = PrivateAttr(default_factory=dict) _cur_layer_idx = PrivateAttr(default=0) _all_module_input: Dict[str, List[Tuple]] = PrivateAttr(default_factory=dict) - _all_module_output: Dict[str, List[Tuple]] = PrivateAttr(default_factory=dict) def resolve_quantization_config(self) -> QuantizationConfig: config = super().resolve_quantization_config() @@ -158,11 +160,6 @@ def input_capture_hook(self, module, *args, **kwargs): self._all_module_input[module._tmp_name] = [] 
self._all_module_input[module._tmp_name].append((args, kwargs)) - def output_capture_hook(self, module, *args, **kwargs): - if module._tmp_name not in self._all_module_output: - self._all_module_output[module._tmp_name] = [] - self._all_module_output[module._tmp_name].append((args, kwargs)) - def on_start(self, state: State, event: Event, **kwargs): self.started_ = True @@ -172,15 +169,9 @@ def on_start(self, state: State, event: Event, **kwargs): for name, module in state.model.named_modules(): if _is_decoding_layer(module, name): # register input/output capture hooks for decoding layers - logger.warning( - f">> Registering input/output capture hooks for decoding layer {getattr(module, '_tmp_name', '')} || {name}" - ) self.register_hook( module, self.input_capture_hook, "forward_pre", with_kwargs=True ) - self.register_hook( - module, self.output_capture_hook, "forward", with_kwargs=True - ) def on_event(self, state: State, event: Event, **kwargs): if event.type_ == EventType.CALIBRATION_EPOCH_START: @@ -195,33 +186,60 @@ def on_event(self, state: State, event: Event, **kwargs): if not self.ended_: self.on_end(state, None) + def _mapping_config_to_autoround(self): + from auto_round.schemes import QuantizationScheme as ARQuantizationScheme + + resolved_config = self.resolved_config + quant_scheme = None + for scheme in resolved_config.config_groups.values(): + assert isinstance(scheme, QuantizationScheme), f"Expected QuantizationScheme, got {type(scheme)}" + quant_scheme = scheme + weight_args = quant_scheme.weights + # TODO: release below constraint in later PRs + assert weight_args.strategy == QuantizationStrategy.GROUP, ( + "Only group-wise quantization is supported in AutoRoundModifier for now, " + f"got {weight_args.strategy}" + ) + assert quant_scheme.input_activations is None, ( + "Input activation quantization is not supported in AutoRoundModifier, " + f"got {quant_scheme.input_activations}" + ) + assert quant_scheme.output_activations is None, ( + "Output activation quantization is not supported in AutoRoundModifier, " + f"got {quant_scheme.output_activations}" + ) + ar_quant_scheme = ARQuantizationScheme( + bits=weight_args.num_bits, + sym=weight_args.symmetric, + group_size=weight_args.group_size, + data_type=weight_args.type, + act_bits=16, + ) + return ar_quant_scheme + def apply_autoround(self, state): cur_layer_idx = self._cur_layer_idx self._cur_layer_idx += 1 logger.info(f">>||>> AutoRound for decoding layer index {cur_layer_idx}") if cur_layer_idx >= len(state.model.model.layers): # skip the lm_head layer - logger.info(">>||>> All decoding layers have been processed for AutoRound.") return decoding_layer = state.model.model.layers[cur_layer_idx] - logger.debug( - f">>||>> Strating AutoRound for decoding layer {getattr(decoding_layer, '_tmp_name', '')}" - ) wrapped_model = _wrap_decoding_layer(decoding_layer) with torch.enable_grad(), align_module_device(decoding_layer): import auto_round - + parsed_scheme = self._mapping_config_to_autoround() ar = auto_round.AutoRound( model=wrapped_model, tokenizer="", - scheme="W4A16", + scheme=parsed_scheme, iters=self.iters, enable_quanted_input=False, enable_torch_compile=True, ) - + # TODO: configure layer-wise config based on self.resolved_config ar.configure_layer_config() first_param = next(decoding_layer.parameters()) device = first_param.device @@ -236,11 +254,11 @@ def apply_autoround(self, state): device=device, ) # Update offload parameters and remove temporary attributes - for name, module in decoding_layer.named_modules(): 
+ for _, module in decoding_layer.named_modules(): if hasattr(module, "weight_scale") and hasattr( module, "weight_zero_point" ): - # Note: The model's weight is already quantized and dequantized in-place by auto-round. + # Note: The model's weight is already q-dq in-place by auto-round. weight_scale = module.scale del module.scale del module.zp @@ -250,7 +268,6 @@ def apply_autoround(self, state): def post_autoround_cleanup(self): self._all_module_input.clear() - self._all_module_output.clear() def on_end(self, state: State, event: Event, **kwargs): """ From 8e9eccc141f9e0fd8dfedcada45fb309c204eca4 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 3 Nov 2025 20:06:35 -0800 Subject: [PATCH 20/57] format Signed-off-by: yiliu30 --- examples/quantization_w4a16/auto_round_llama3_example.py | 2 +- .../modifiers/quantization/autoround/base.py | 8 +++++--- .../llmcompressor/transformers/autoround/test_oneshot.py | 3 ++- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/examples/quantization_w4a16/auto_round_llama3_example.py b/examples/quantization_w4a16/auto_round_llama3_example.py index b717540df9..d8d72c8f60 100644 --- a/examples/quantization_w4a16/auto_round_llama3_example.py +++ b/examples/quantization_w4a16/auto_round_llama3_example.py @@ -26,7 +26,7 @@ if IS_LLAMA: model_name = "meta-llama/Meta-Llama-3-8B-Instruct" else: - model_name="Qwen/Qwen2.5-0.5B/" + model_name = "Qwen/Qwen2.5-0.5B/" model_id = f"{model_dir}/{model_name}" diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index 06e0fd3c73..a366d803ff 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -3,14 +3,13 @@ import torch from compressed_tensors.quantization import ( QuantizationConfig, - QuantizationStrategy, QuantizationScheme, + QuantizationStrategy, enable_quantization, ) from compressed_tensors.utils import ( align_module_device, match_named_modules, - getattr_chain, update_offload_parameter, ) from loguru import logger @@ -192,7 +191,9 @@ def _mapping_config_to_autoround(self): resolved_config = self.resolved_config quant_scheme = None for scheme in resolved_config.config_groups.values(): - assert isinstance(scheme, QuantizationScheme), f"Expected QuantizationScheme, got {type(scheme)}" + assert isinstance( + scheme, QuantizationScheme + ), f"Expected QuantizationScheme, got {type(scheme)}" quant_scheme = scheme weight_args = quant_scheme.weights # TODO: release below constraint in later PRs @@ -230,6 +231,7 @@ def apply_autoround(self, state): with torch.enable_grad(), align_module_device(decoding_layer): import auto_round + parsed_scheme = self._mapping_config_to_autoround() ar = auto_round.AutoRound( model=wrapped_model, diff --git a/tests/llmcompressor/transformers/autoround/test_oneshot.py b/tests/llmcompressor/transformers/autoround/test_oneshot.py index 7e9adf1a76..f85ae283b0 100644 --- a/tests/llmcompressor/transformers/autoround/test_oneshot.py +++ b/tests/llmcompressor/transformers/autoround/test_oneshot.py @@ -1,8 +1,9 @@ import pytest import torch +from auto_round.calib_dataset import get_dataset from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme from transformers import AutoModelForCausalLM, AutoTokenizer -from auto_round.calib_dataset import get_dataset + from llmcompressor import oneshot from llmcompressor.modifiers.quantization.autoround import AutoRoundModifier From 
81f76affc8848f8778e702e73aad0609193473a5 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 3 Nov 2025 20:16:32 -0800 Subject: [PATCH 21/57] add docstring Signed-off-by: yiliu30 --- .../modifiers/quantization/autoround/base.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index a366d803ff..626ec920ee 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -219,6 +219,20 @@ def _mapping_config_to_autoround(self): return ar_quant_scheme def apply_autoround(self, state): + """Applies AutoRound quantization tuning on the current decoding layer. + + The tuning logic is below: + for iter in range(iters): + quant_output = forward(layer, cached_inputs) + loss = mse_loss(quant_output, original_output) + loss.backward() + optimizer.step() + if loss < best_loss: + best_params = save_params(layer) + For more details, please refer to the AutoRound repository: + https://github.com/intel/auto-round/ + """ + cur_layer_idx = self._cur_layer_idx self._cur_layer_idx += 1 logger.info(f">>||>> AutoRound for decoding layer index {cur_layer_idx}") From afa6150a235f8a25d9b1a0207b3387363cfa37e5 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 3 Nov 2025 21:11:29 -0800 Subject: [PATCH 22/57] add ar Signed-off-by: yiliu30 --- setup.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/setup.py b/setup.py index 3b68083a61..6fca6e710a 100644 --- a/setup.py +++ b/setup.py @@ -144,6 +144,10 @@ def localversion_func(version: ScmVersion) -> str: if BUILD_TYPE == "release" else "compressed-tensors>=0.12.3a2" ), + # TODO: replace it with the release version + ( + "auto_round @ git+https://github.com/intel/auto-round.git@llmc" + ), ], extras_require={ "dev": [ From 97217e78afc639f824628a915613a698f131aed4 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 3 Nov 2025 21:57:23 -0800 Subject: [PATCH 23/57] update example Signed-off-by: yiliu30 --- .../auto_round_llama3_example.py | 80 ++----------------- 1 file changed, 8 insertions(+), 72 deletions(-) diff --git a/examples/quantization_w4a16/auto_round_llama3_example.py b/examples/quantization_w4a16/auto_round_llama3_example.py index d8d72c8f60..0e598ba9bf 100644 --- a/examples/quantization_w4a16/auto_round_llama3_example.py +++ b/examples/quantization_w4a16/auto_round_llama3_example.py @@ -1,76 +1,18 @@ -import os - -_DEBUG = os.environ.get("DEBUG", "0") == "1" -IS_LLAMA = os.environ.get("MODEL", "LLAMA") == "LLAMA" -os.environ["TOKENIZERS_PARALLELISM"] = "false" from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot from llmcompressor.modifiers.quantization import AutoRoundModifier - -# from llmcompressor.modifiers.quantization import QuantizationModifier as AutoRoundModifier from llmcompressor.utils import dispatch_for_generation # Select model and load it. 
model_id = "meta-llama/Meta-Llama-3-8B-Instruct" -model_id = "/data5/yliu7/HF_HOME/meta-llama/Llama-3.2-1B-Instruct" -model_id = "Qwen/Qwen2.5-0.5B" -model_id = "/data5/yliu7/HF_HOME/Qwen/Qwen2.5-0.5B" -model_id = "/data5/yliu7/meta-llama/meta-llama/Meta-Llama-3.1-8B-Instruct" - -model_dir = "/storage/yiliu7" -# model_id=f"{model_dir}/meta-llama/Meta-Llama-3.1-8B-Instruct" - -model_dir = "/storage/yiliu7" -model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct" -if IS_LLAMA: - model_name = "meta-llama/Meta-Llama-3-8B-Instruct" -else: - model_name = "Qwen/Qwen2.5-0.5B/" - -model_id = f"{model_dir}/{model_name}" - - -# model_id = "facebook/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(model_id) -if _DEBUG: - import torch - from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer - from transformers.models.llama.modeling_llama import LlamaForCausalLM - from transformers.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM - - config = AutoConfig.from_pretrained(model_id) - config.num_hidden_layers = 2 # Use a smaller model for testing - # Fix configuration validation issues - # config.layer_types = config.layer_types[: config.num_hidden_layers] - - # Load the tokenizer and model - if "Qwen" in model_id: - tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - model = Qwen2ForCausalLM(config) - else: - tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - model = LlamaForCausalLM(config) - model.to(torch.bfloat16) - NUM_CALIBRATION_SAMPLES = 3 - MAX_SEQUENCE_LENGTH = 16 - iters = 4 - -else: - # Select number of samples. 512 samples is a good place to start. - # Increasing the number of samples can improve accuracy. - light = {"batch_size": 8, "iters": 50, "seqlen": 2048, "nsamples": 128, "lr": 5e-3} - light = {"batch_size": 8, "iters": 200, "seqlen": 2048, "nsamples": 128, "lr": 5e-3} - - light = {"batch_size": 8, "iters": 200, "seqlen": 2048, "nsamples": 128, "lr": None} - # light = {"batch_size": 8, "iters": 200, "seqlen": 2048, "nsamples": 32, "lr": None} - NUM_CALIBRATION_SAMPLES = light["nsamples"] - MAX_SEQUENCE_LENGTH = light["seqlen"] - iters = light["iters"] - # Select calibration dataset. +NUM_CALIBRATION_SAMPLES = 128 +MAX_SEQUENCE_LENGTH = 2048 +# Get aligned calibration dataset. from auto_round.calib_dataset import get_dataset ds = get_dataset( @@ -79,10 +21,11 @@ nsamples=NUM_CALIBRATION_SAMPLES, ) + # Configure the quantization algorithm to run. # * quantize the weights to 4 bit with AutoRound with a group size 128 recipe = AutoRoundModifier( - targets="Linear", scheme="W4A16", ignore=["lm_head"], iters=iters + targets="Linear", scheme="W4A16", ignore=["lm_head"], iters=200 ) @@ -93,8 +36,7 @@ recipe=recipe, max_seq_length=MAX_SEQUENCE_LENGTH, num_calibration_samples=NUM_CALIBRATION_SAMPLES, - # !!! shuffle_calibration_samples: True -> mmlu 0.6574 - # !!! 
shuffle_calibration_samples: False -> mmlu 0.66 + # disbable shuffling to get slightly better mmlu score shuffle_calibration_samples=False, ) @@ -102,19 +44,13 @@ print("\n\n") print("========== SAMPLE GENERATION ==============") dispatch_for_generation(model) -sample = tokenizer("Explain AI in ", return_tensors="pt") +sample = tokenizer("Hello my name is", return_tensors="pt") sample = {key: value.to(model.device) for key, value in sample.items()} output = model.generate(**sample, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") # Save to disk compressed. -SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128" -SAVE_DIR = ( - f"{model_dir}/" - + model_id.rstrip("/").split("/")[-1] - + "-W4A16-G128-disbale-shuffule-ar" -) -print(f"Saving quantized model to {SAVE_DIR}") +SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128-AutoRound" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) From 3dcb434cc68d556cc7cbdbab4039f9f42a24c851 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 4 Nov 2025 16:42:21 -0800 Subject: [PATCH 24/57] align api Signed-off-by: yiliu30 --- .../modifiers/quantization/autoround/base.py | 30 ++++++++++++------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index 626ec920ee..0bf36f8d75 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -83,7 +83,7 @@ class AutoRoundModifier(Modifier, QuantizationMixin): - on_initialize - apply config to model - on_start - - add input/output capture hooks to decoding layers + - add input capture hooks to decoding layers - on_sequential_epoch_end - apply_autoround - post_autoround_cleanup @@ -106,6 +106,7 @@ class AutoRoundModifier(Modifier, QuantizationMixin): # AutoRound modifier arguments iters: Optional[int] = 200 + enable_torch_compile: Optional[bool] = True # private variables _module_names: Dict[torch.nn.Module, str] = PrivateAttr(default_factory=dict) @@ -219,23 +220,28 @@ def _mapping_config_to_autoround(self): return ar_quant_scheme def apply_autoround(self, state): - """Applies AutoRound quantization tuning on the current decoding layer. + """ + Applies AutoRound quantization tuning on the current decoding layer. - The tuning logic is below: + The tuning logic is as follows: for iter in range(iters): - quant_output = forward(layer, cached_inputs) - loss = mse_loss(quant_output, original_output) - loss.backward() - optimizer.step() - if loss < best_loss: + quant_output = forward(layer, cached_inputs) + loss = mse_loss(quant_output, original_output) + loss.backward() + optimizer.step() + if loss < best_loss: best_params = save_params(layer) + + This method retrieves the current decoding layer, wraps it for compatibility with + AutoRound, and performs iterative optimization to minimize the quantization error. + The best parameters are tracked and applied to the layer after tuning. 
+ For more details, please refer to the AutoRound repository: https://github.com/intel/auto-round/ """ - cur_layer_idx = self._cur_layer_idx + logger.info("Applying AutoRound to layer index: {}", cur_layer_idx) self._cur_layer_idx += 1 - logger.info(f">>||>> AutoRound for decoding layer index {cur_layer_idx}") if cur_layer_idx >= len(state.model.model.layers): # skip the lm_head layer return @@ -253,7 +259,7 @@ def apply_autoround(self, state): scheme=parsed_scheme, iters=self.iters, enable_quanted_input=False, - enable_torch_compile=True, + enable_torch_compile=self.enable_torch_compile, ) # TODO: configure layer-wise config based on self.resolved_config ar.configure_layer_config() @@ -268,6 +274,8 @@ def apply_autoround(self, state): inputs=cur_inputs, normalize_inputs=True, device=device, + # Leave offload for LLMC + auto_offload=False, ) # Update offload parameters and remove temporary attributes for _, module in decoding_layer.named_modules(): From aef77072f5a4a3e216cf9b8c3d291a3e6ea82739 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 4 Nov 2025 16:42:49 -0800 Subject: [PATCH 25/57] format Signed-off-by: yiliu30 --- setup.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 6fca6e710a..6211ea90d8 100644 --- a/setup.py +++ b/setup.py @@ -145,9 +145,7 @@ def localversion_func(version: ScmVersion) -> str: else "compressed-tensors>=0.12.3a2" ), # TODO: replace it with the release version - ( - "auto_round @ git+https://github.com/intel/auto-round.git@llmc" - ), + ("auto_round @ git+https://github.com/intel/auto-round.git@llmc"), ], extras_require={ "dev": [ From 97e1ca26cfa22a77cc0835a0aaf97e382104717a Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 4 Nov 2025 16:57:11 -0800 Subject: [PATCH 26/57] clean code Signed-off-by: yiliu30 --- .../modifiers/quantization/autoround/base.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index 0bf36f8d75..2c9a011728 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -58,8 +58,8 @@ def _wrap_decoding_layer(layer: torch.nn.Module) -> _PretrainModelWrapper: class AutoRoundModifier(Modifier, QuantizationMixin): """ Implements the AutoRound algorithm from https://arxiv.org/pdf/2309.05516. This modifier - leverages signed gradient descent (SignSGD) and block-wise loss to optimize rounding values - and weight clipping in a few steps. + leverages signed gradient descent (SignSGD) optimizer and block-wise loss to optimize + rounding values and weight clipping in a few steps. | Sample yaml: | test_stage: @@ -99,9 +99,7 @@ class AutoRoundModifier(Modifier, QuantizationMixin): quantize even if they match a target in config_groups. Defaults to empty list. :param scheme: a single quantization scheme to apply to the model. This is a dictionary that supports all keys from QuantizationScheme except targets, which - will be set to the targets parameter set at the modifier level. Can also be set - to a dictionary of the format `preset_scheme_name: targets` for example: - `W8A8: ['Linear']` for weight and activation 8-bit. + will be set to the targets parameter set at the modifier level. 
""" # AutoRound modifier arguments @@ -149,7 +147,6 @@ def start_calibration(self, model: torch.nn.Module): for _, module in match_named_modules(model, self.targets, self.ignore): # Note: No need to register observers for auto-round - # self._initialize_observers(module) self._calibration_hooks |= self._initialize_hooks(module) apply_calibration_status(module) @@ -168,7 +165,7 @@ def on_start(self, state: State, event: Event, **kwargs): self.start_calibration(state.model) for name, module in state.model.named_modules(): if _is_decoding_layer(module, name): - # register input/output capture hooks for decoding layers + # register input capture hooks for decoding layers self.register_hook( module, self.input_capture_hook, "forward_pre", with_kwargs=True ) @@ -299,7 +296,7 @@ def on_end(self, state: State, event: Event, **kwargs): """ self.ended_ = True QuantizationMixin.end_calibration(self, state.model) - self.remove_hooks() # remove gptq hooks + self.remove_hooks() def on_finalize(self, state: State, **kwargs) -> bool: """ From c75c272e150e1a37de29b0846ad3a7486ca67904 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 4 Nov 2025 16:59:20 -0800 Subject: [PATCH 27/57] fix typo Signed-off-by: yiliu30 --- examples/quantization_w4a16/auto_round_llama3_example.py | 2 +- examples/quantization_w4a16/llama3_example.py | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/examples/quantization_w4a16/auto_round_llama3_example.py b/examples/quantization_w4a16/auto_round_llama3_example.py index 0e598ba9bf..d6d2003e6e 100644 --- a/examples/quantization_w4a16/auto_round_llama3_example.py +++ b/examples/quantization_w4a16/auto_round_llama3_example.py @@ -36,7 +36,7 @@ recipe=recipe, max_seq_length=MAX_SEQUENCE_LENGTH, num_calibration_samples=NUM_CALIBRATION_SAMPLES, - # disbable shuffling to get slightly better mmlu score + # disable shuffling to get slightly better mmlu score shuffle_calibration_samples=False, ) diff --git a/examples/quantization_w4a16/llama3_example.py b/examples/quantization_w4a16/llama3_example.py index 32ab8bb843..b03aacee35 100644 --- a/examples/quantization_w4a16/llama3_example.py +++ b/examples/quantization_w4a16/llama3_example.py @@ -7,9 +7,6 @@ # Select model and load it. model_id = "meta-llama/Meta-Llama-3-8B-Instruct" -model_dir = "/storage/yiliu7" -model_id = f"{model_dir}/meta-llama/Meta-Llama-3.1-8B-Instruct" -model_id = f"{model_dir}/meta-llama/Meta-Llama-3-8B-Instruct" model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(model_id) @@ -76,6 +73,6 @@ def tokenize(sample): print("==========================================\n\n") # Save to disk compressed. 
-SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128-GPTQ" +SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) From 3d8a0c83d74334a989c43824a27adcc8de330e5c Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 4 Nov 2025 17:02:55 -0800 Subject: [PATCH 28/57] small iters for ut Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/quantization/autoround/base.py | 2 +- tests/llmcompressor/transformers/autoround/test_oneshot.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index 2c9a011728..0f9a2894ff 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -165,7 +165,7 @@ def on_start(self, state: State, event: Event, **kwargs): self.start_calibration(state.model) for name, module in state.model.named_modules(): if _is_decoding_layer(module, name): - # register input capture hooks for decoding layers + # register input capture hook for decoding layers self.register_hook( module, self.input_capture_hook, "forward_pre", with_kwargs=True ) diff --git a/tests/llmcompressor/transformers/autoround/test_oneshot.py b/tests/llmcompressor/transformers/autoround/test_oneshot.py index f85ae283b0..f1618d2753 100644 --- a/tests/llmcompressor/transformers/autoround/test_oneshot.py +++ b/tests/llmcompressor/transformers/autoround/test_oneshot.py @@ -29,6 +29,7 @@ recipe_modifier_full = AutoRoundModifier( ignore=["lm_head"], + iters=10, config_groups={ "group_0": QuantizationScheme( targets=["Linear"], From 6729a75648493851ed88efc7b8c905a3817c525d Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 4 Nov 2025 17:12:32 -0800 Subject: [PATCH 29/57] format Signed-off-by: yiliu30 --- .../quantization_w4a16/auto_round_llama3_example.py | 2 +- .../modifiers/quantization/autoround/base.py | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/examples/quantization_w4a16/auto_round_llama3_example.py b/examples/quantization_w4a16/auto_round_llama3_example.py index d6d2003e6e..2c97ee7794 100644 --- a/examples/quantization_w4a16/auto_round_llama3_example.py +++ b/examples/quantization_w4a16/auto_round_llama3_example.py @@ -1,3 +1,4 @@ +from auto_round.calib_dataset import get_dataset from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot @@ -13,7 +14,6 @@ NUM_CALIBRATION_SAMPLES = 128 MAX_SEQUENCE_LENGTH = 2048 # Get aligned calibration dataset. -from auto_round.calib_dataset import get_dataset ds = get_dataset( tokenizer=tokenizer, diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index 0f9a2894ff..075bc7942b 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -57,9 +57,9 @@ def _wrap_decoding_layer(layer: torch.nn.Module) -> _PretrainModelWrapper: class AutoRoundModifier(Modifier, QuantizationMixin): """ - Implements the AutoRound algorithm from https://arxiv.org/pdf/2309.05516. This modifier - leverages signed gradient descent (SignSGD) optimizer and block-wise loss to optimize - rounding values and weight clipping in a few steps. + Implements the AutoRound algorithm from https://arxiv.org/pdf/2309.05516. 
+    This modifier leverages signed gradient descent (SignSGD) optimizer and
+    block-wise loss to optimize rounding values and weight clipping in a few steps.
 
     | Sample yaml:
     | test_stage:

From bb4dbe86bb914037ef7757e5f9637d793566d874 Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Tue, 4 Nov 2025 17:18:08 -0800
Subject: [PATCH 30/57] refine comment

Signed-off-by: yiliu30
---
 src/llmcompressor/modifiers/quantization/autoround/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py
index 075bc7942b..23d9cb8586 100644
--- a/src/llmcompressor/modifiers/quantization/autoround/base.py
+++ b/src/llmcompressor/modifiers/quantization/autoround/base.py
@@ -284,7 +284,7 @@ def apply_autoround(self, state):
                 weight_scale = module.scale
                 del module.scale
                 del module.zp
-                # TODO: update zero_point as well if needed
+                # TODO: update zero_point after supporting asymmetric quantization
                 update_offload_parameter(module, "weight_scale", weight_scale)
         decoding_layer.eval()

From 2adf0e77c094a3c23d69282eab3a6e96356f1414 Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Tue, 4 Nov 2025 19:33:49 -0800
Subject: [PATCH 31/57] replace paper link

Signed-off-by: yiliu30
---
 src/llmcompressor/modifiers/quantization/autoround/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py
index 23d9cb8586..744453b299 100644
--- a/src/llmcompressor/modifiers/quantization/autoround/base.py
+++ b/src/llmcompressor/modifiers/quantization/autoround/base.py
@@ -57,7 +57,7 @@ def _wrap_decoding_layer(layer: torch.nn.Module) -> _PretrainModelWrapper:
 class AutoRoundModifier(Modifier, QuantizationMixin):
     """
-    Implements the AutoRound algorithm from https://arxiv.org/pdf/2309.05516.
+    Implements the AutoRound algorithm from https://aclanthology.org/2024.findings-emnlp.662.pdf.
     This modifier leverages signed gradient descent (SignSGD) optimizer and
     block-wise loss to optimize rounding values and weight clipping in a few steps.
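[Editorial note on the two patches above] The class docstring only names the ingredients (SignSGD, block-wise loss, learnable rounding and clipping). A minimal, self-contained PyTorch sketch of that idea is given below for orientation; it is not code from auto-round or llm-compressor, the helper names (ste_round, qdq_weight, tune_rounding) and all hyperparameters are invented for this illustration, and a single Linear layer stands in for the decoder block that AutoRoundModifier actually tunes.

import torch


def ste_round(x: torch.Tensor) -> torch.Tensor:
    # Straight-through estimator: round on the forward pass, identity gradient backward.
    return (x.round() - x).detach() + x


def qdq_weight(weight, v, group_size=128, num_bits=4):
    # Symmetric per-group quantize -> dequantize with a learnable rounding offset `v`.
    qmax = 2 ** (num_bits - 1) - 1
    w = weight.reshape(weight.shape[0], -1, group_size)
    scale = w.abs().amax(dim=-1, keepdim=True).clamp_min(1e-8) / qmax
    q = ste_round(w / scale + v.reshape_as(w)).clamp(-qmax - 1, qmax)
    return (q * scale).reshape_as(weight)


def tune_rounding(layer: torch.nn.Linear, calib_x: torch.Tensor, iters=200, lr=5e-3):
    # Freeze the layer and cache its unquantized output, then optimize the rounding
    # offsets so the quantized forward pass reproduces it (block-wise MSE loss).
    for p in layer.parameters():
        p.requires_grad_(False)
    with torch.no_grad():
        target = layer(calib_x)
    v = torch.zeros_like(layer.weight, requires_grad=True)
    best_loss, best_v = float("inf"), v.detach().clone()
    for _ in range(iters):
        qw = qdq_weight(layer.weight, v)
        out = torch.nn.functional.linear(calib_x, qw, layer.bias)
        loss = torch.nn.functional.mse_loss(out, target)
        if loss.item() < best_loss:
            # Track the best rounding offsets seen so far, as in the docstring pseudocode.
            best_loss, best_v = loss.item(), v.detach().clone()
        loss.backward()
        with torch.no_grad():
            v -= lr * v.grad.sign()  # signed-gradient (SignSGD-style) update
            v.clamp_(-0.5, 0.5)
            v.grad = None
    with torch.no_grad():
        # Bake the tuned rounding into the weight (quantize-dequantize in place).
        layer.weight.copy_(qdq_weight(layer.weight, best_v))


layer = torch.nn.Linear(256, 128)
tune_rounding(layer, torch.randn(32, 256), iters=20)

In the modifier itself the same kind of loop runs once per decoder block, driven by the inputs cached through input_capture_hook, and auto_round also learns the weight-clipping range rather than only the rounding offset.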
From dd9bde9b75f621c78193430c0d55396aa2398f3c Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 4 Nov 2025 21:53:59 -0800 Subject: [PATCH 32/57] correct comments Signed-off-by: yiliu30 --- src/llmcompressor/pipelines/layer_sequential/pipeline.py | 3 ++- src/llmcompressor/pipelines/sequential/pipeline.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/llmcompressor/pipelines/layer_sequential/pipeline.py b/src/llmcompressor/pipelines/layer_sequential/pipeline.py index 314cd8439e..b8fbe32a3f 100644 --- a/src/llmcompressor/pipelines/layer_sequential/pipeline.py +++ b/src/llmcompressor/pipelines/layer_sequential/pipeline.py @@ -74,7 +74,8 @@ def __call__( LifecycleCallbacks.calibration_epoch_start() - # TODO: remove this to enable quantization aware calibration for GPTQ and AWQ + # TODO: remove this to enable quantization aware calibration + # for GPTQ, AWQ and AutoRound. disable_qac = any( type(mod).__name__ in DISABLE_QAC_MODIFIERS for mod in session.lifecycle.recipe.modifiers diff --git a/src/llmcompressor/pipelines/sequential/pipeline.py b/src/llmcompressor/pipelines/sequential/pipeline.py index 0f341a37da..de40ab1f83 100644 --- a/src/llmcompressor/pipelines/sequential/pipeline.py +++ b/src/llmcompressor/pipelines/sequential/pipeline.py @@ -76,7 +76,8 @@ def __call__( LifecycleCallbacks.calibration_epoch_start() - # TODO: remove this to enable quantization aware calibration for GPTQ and AWQ + # TODO: remove this to enable quantization aware calibration + # for GPTQ, AWQ and AutoRound. disable_qac = any( type(mod).__name__ in DISABLE_QAC_MODIFIERS for mod in session.lifecycle.recipe.modifiers From 7d972558cc5c10acde7a24b163289719378f0a57 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 5 Nov 2025 03:09:32 -0500 Subject: [PATCH 33/57] update comments Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/quantization/autoround/base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index 744453b299..8b517b2c70 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -283,8 +283,7 @@ def apply_autoround(self, state): # Note: The model's weight is already q-dq in-place by auto-round. 
weight_scale = module.scale del module.scale - del module.zp - # TODO: update zero_point as well if needed + # TODO: update zero_point after supporting asymmetric quantization update_offload_parameter(module, "weight_scale", weight_scale) decoding_layer.eval() From f298e82bbc41c7145f2ba373d02d2356c04f82c5 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 5 Nov 2025 03:12:35 -0500 Subject: [PATCH 34/57] refine code Signed-off-by: yiliu30 --- .../modifiers/quantization/autoround/base.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index 8b517b2c70..d6444a9c9f 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -115,6 +115,15 @@ def resolve_quantization_config(self) -> QuantizationConfig: config = super().resolve_quantization_config() return config + def _add_temporary_names(self, model: torch.nn.Module): + for name, mod in model.named_modules(): + mod._tmp_name = name + + def _remove_temporary_names(self, model: torch.nn.Module): + for _, mod in model.named_modules(): + if hasattr(mod, "_tmp_name"): + del mod._tmp_name + def on_initialize(self, state: State, **kwargs) -> bool: """ Initialize and run the AutoRound algorithm on the current state @@ -130,9 +139,7 @@ def on_initialize(self, state: State, **kwargs) -> bool: m: name for name, m in match_named_modules(state.model, self.targets, self.ignore) } - # add temporary names to all modules - for name, mod in state.model.named_modules(): - mod._tmp_name = name + self._add_temporary_names(state.model) # freeze all model parameters for name, param in state.model.named_parameters(): param.requires_grad_(False) @@ -296,6 +303,7 @@ def on_end(self, state: State, event: Event, **kwargs): """ self.ended_ = True QuantizationMixin.end_calibration(self, state.model) + self._remove_temporary_names(state.model) self.remove_hooks() def on_finalize(self, state: State, **kwargs) -> bool: From 73c357135e6e8d6d068f41be9e8ac9b281d63b99 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 5 Nov 2025 03:14:48 -0500 Subject: [PATCH 35/57] add more checks Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/quantization/autoround/base.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/quantization/autoround/base.py index d6444a9c9f..6581747eff 100644 --- a/src/llmcompressor/modifiers/quantization/autoround/base.py +++ b/src/llmcompressor/modifiers/quantization/autoround/base.py @@ -195,13 +195,18 @@ def _mapping_config_to_autoround(self): resolved_config = self.resolved_config quant_scheme = None + # TODO: release below constraint in later PRs + assert len(resolved_config.config_groups) == 1, ( + "AutoRoundModifier only supports one quantization scheme for now, " + f"got {len(resolved_config.config_groups)}" + ) + for scheme in resolved_config.config_groups.values(): assert isinstance( scheme, QuantizationScheme ), f"Expected QuantizationScheme, got {type(scheme)}" quant_scheme = scheme weight_args = quant_scheme.weights - # TODO: release below constraint in later PRs assert weight_args.strategy == QuantizationStrategy.GROUP, ( "Only group-wise quantization is supported in AutoRoundModifier for now, " f"got {weight_args.strategy}" From eb1639782d9b43a8d990418fcb35335c99f335c1 Mon Sep 17 00:00:00 2001 From: 
yiliu30 Date: Wed, 5 Nov 2025 23:23:07 -0800 Subject: [PATCH 36/57] update example Signed-off-by: yiliu30 --- .../auto_round_llama3_example.py => autoround/llama3_example.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/{quantization_w4a16/auto_round_llama3_example.py => autoround/llama3_example.py} (100%) diff --git a/examples/quantization_w4a16/auto_round_llama3_example.py b/examples/autoround/llama3_example.py similarity index 100% rename from examples/quantization_w4a16/auto_round_llama3_example.py rename to examples/autoround/llama3_example.py From 9cb1f062f4838c1f2e0965e67895696dc78cc97a Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 5 Nov 2025 23:25:00 -0800 Subject: [PATCH 37/57] move auto-round to modifier Signed-off-by: yiliu30 --- examples/autoround/llama3_example.py | 2 +- src/llmcompressor/modifiers/__init__.py | 2 ++ .../modifiers/{quantization => }/autoround/__init__.py | 0 .../modifiers/{quantization => }/autoround/base.py | 0 src/llmcompressor/modifiers/quantization/__init__.py | 1 - tests/llmcompressor/transformers/autoround/test_oneshot.py | 2 +- 6 files changed, 4 insertions(+), 3 deletions(-) rename src/llmcompressor/modifiers/{quantization => }/autoround/__init__.py (100%) rename src/llmcompressor/modifiers/{quantization => }/autoround/base.py (100%) diff --git a/examples/autoround/llama3_example.py b/examples/autoround/llama3_example.py index 2c97ee7794..e968066510 100644 --- a/examples/autoround/llama3_example.py +++ b/examples/autoround/llama3_example.py @@ -2,7 +2,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import AutoRoundModifier +from llmcompressor.modifiers import AutoRoundModifier from llmcompressor.utils import dispatch_for_generation # Select model and load it. diff --git a/src/llmcompressor/modifiers/__init__.py b/src/llmcompressor/modifiers/__init__.py index 65cd78b983..ec30c4174a 100644 --- a/src/llmcompressor/modifiers/__init__.py +++ b/src/llmcompressor/modifiers/__init__.py @@ -7,6 +7,7 @@ extensible compression workflows. 
""" +from .autoround import AutoRoundModifier from .factory import ModifierFactory from .interface import ModifierInterface from .modifier import Modifier @@ -15,4 +16,5 @@ "ModifierFactory", "ModifierInterface", "Modifier", + "AutoRoundModifier", ] diff --git a/src/llmcompressor/modifiers/quantization/autoround/__init__.py b/src/llmcompressor/modifiers/autoround/__init__.py similarity index 100% rename from src/llmcompressor/modifiers/quantization/autoround/__init__.py rename to src/llmcompressor/modifiers/autoround/__init__.py diff --git a/src/llmcompressor/modifiers/quantization/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py similarity index 100% rename from src/llmcompressor/modifiers/quantization/autoround/base.py rename to src/llmcompressor/modifiers/autoround/base.py diff --git a/src/llmcompressor/modifiers/quantization/__init__.py b/src/llmcompressor/modifiers/quantization/__init__.py index 2c10fe4b97..1ca6912221 100644 --- a/src/llmcompressor/modifiers/quantization/__init__.py +++ b/src/llmcompressor/modifiers/quantization/__init__.py @@ -2,4 +2,3 @@ from .gptq import * from .quantization import * -from .autoround import * diff --git a/tests/llmcompressor/transformers/autoround/test_oneshot.py b/tests/llmcompressor/transformers/autoround/test_oneshot.py index f1618d2753..d973398d90 100644 --- a/tests/llmcompressor/transformers/autoround/test_oneshot.py +++ b/tests/llmcompressor/transformers/autoround/test_oneshot.py @@ -5,7 +5,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot -from llmcompressor.modifiers.quantization.autoround import AutoRoundModifier +from llmcompressor.modifiers import AutoRoundModifier recipe_str = """ quant_stage: From 76e0d21926c4d2b736991cf642640f85e450aa28 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 5 Nov 2025 23:34:12 -0800 Subject: [PATCH 38/57] apply untie Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/autoround/base.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index 6581747eff..ad9434680d 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -19,6 +19,9 @@ from llmcompressor.modifiers import Modifier from llmcompressor.modifiers.quantization.calibration import apply_calibration_status from llmcompressor.modifiers.quantization.quantization import QuantizationMixin +from llmcompressor.transformers.compression.compressed_tensors_utils import ( + untie_if_target_shared_embedding, +) __all__ = ["AutoRoundModifier"] @@ -151,6 +154,7 @@ def start_calibration(self, model: torch.nn.Module): :param model: model to prepare for calibration """ + untie_if_target_shared_embedding(model, self._module_names.values()) for _, module in match_named_modules(model, self.targets, self.ignore): # Note: No need to register observers for auto-round From 1cbe919ff772c684616d85f55307d269c85a7d9e Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 5 Nov 2025 23:36:42 -0800 Subject: [PATCH 39/57] correct docstring Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/autoround/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index ad9434680d..4128627d04 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -129,7 +129,7 @@ def _remove_temporary_names(self, 
model: torch.nn.Module): def on_initialize(self, state: State, **kwargs) -> bool: """ - Initialize and run the AutoRound algorithm on the current state + Initialize the model state for quantization and calibration. :param state: session state storing input model and calibration data """ From 9fa5efb7cb0c7a98ed3bfefc69bf04ccb6e8ffb3 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 5 Nov 2025 23:39:05 -0800 Subject: [PATCH 40/57] enable ci Signed-off-by: yiliu30 --- .github/workflows/test-check-transformers.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/test-check-transformers.yaml b/.github/workflows/test-check-transformers.yaml index 4ffde0b5e2..8bc7f97f7a 100644 --- a/.github/workflows/test-check-transformers.yaml +++ b/.github/workflows/test-check-transformers.yaml @@ -97,6 +97,10 @@ jobs: if: (success() || failure()) && steps.install.outcome == 'success' run: | pytest -v tests/llmcompressor/transformers/gptq + - name: Running AutoRound Tests + if: (success() || failure()) && steps.install.outcome == 'success' + run: | + pytest -v tests/llmcompressor/transformers/autoround - name: Running ONESHOT Tests if: (success() || failure()) && steps.install.outcome == 'success' run: | From 7937d807fb294983025ce8481a0d5e6d7d5b9666 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 5 Nov 2025 23:58:51 -0800 Subject: [PATCH 41/57] revert import AutoRoundModifier into modifiers directly Signed-off-by: yiliu30 --- examples/autoround/llama3_example.py | 2 +- src/llmcompressor/modifiers/__init__.py | 2 -- tests/llmcompressor/transformers/autoround/test_oneshot.py | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/examples/autoround/llama3_example.py b/examples/autoround/llama3_example.py index e968066510..9843073bdc 100644 --- a/examples/autoround/llama3_example.py +++ b/examples/autoround/llama3_example.py @@ -2,7 +2,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot -from llmcompressor.modifiers import AutoRoundModifier +from llmcompressor.modifiers.autoround import AutoRoundModifier from llmcompressor.utils import dispatch_for_generation # Select model and load it. diff --git a/src/llmcompressor/modifiers/__init__.py b/src/llmcompressor/modifiers/__init__.py index ec30c4174a..65cd78b983 100644 --- a/src/llmcompressor/modifiers/__init__.py +++ b/src/llmcompressor/modifiers/__init__.py @@ -7,7 +7,6 @@ extensible compression workflows.
""" -from .autoround import AutoRoundModifier from .factory import ModifierFactory from .interface import ModifierInterface from .modifier import Modifier @@ -16,5 +15,4 @@ "ModifierFactory", "ModifierInterface", "Modifier", - "AutoRoundModifier", ] diff --git a/tests/llmcompressor/transformers/autoround/test_oneshot.py b/tests/llmcompressor/transformers/autoround/test_oneshot.py index d973398d90..77f6c91707 100644 --- a/tests/llmcompressor/transformers/autoround/test_oneshot.py +++ b/tests/llmcompressor/transformers/autoround/test_oneshot.py @@ -5,7 +5,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot -from llmcompressor.modifiers import AutoRoundModifier +from llmcompressor.modifiers.autoround import AutoRoundModifier recipe_str = """ quant_stage: From e58b2bd03441934ab0b595a17f99b4d659f63ab7 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 6 Nov 2025 00:05:47 -0800 Subject: [PATCH 42/57] update Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/autoround/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index 4128627d04..04da438a05 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -274,6 +274,7 @@ def apply_autoround(self, state): iters=self.iters, enable_quanted_input=False, enable_torch_compile=self.enable_torch_compile, + batch_dim=0, ) # TODO: configure layer-wise config based on self.resolved_config ar.configure_layer_config() From 4c94187da0cb1afc505e919ee42e8693416212bb Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 6 Nov 2025 19:38:02 -0800 Subject: [PATCH 43/57] clean Signed-off-by: yiliu30 --- .../pipelines/layer_sequential/pipeline.py | 130 ------------------ 1 file changed, 130 deletions(-) delete mode 100644 src/llmcompressor/pipelines/layer_sequential/pipeline.py diff --git a/src/llmcompressor/pipelines/layer_sequential/pipeline.py b/src/llmcompressor/pipelines/layer_sequential/pipeline.py deleted file mode 100644 index b8fbe32a3f..0000000000 --- a/src/llmcompressor/pipelines/layer_sequential/pipeline.py +++ /dev/null @@ -1,130 +0,0 @@ -import contextlib -from typing import TYPE_CHECKING - -import torch -import tqdm -from compressed_tensors.utils import disable_offloading, get_execution_device -from torch.utils.data.dataloader import DataLoader - -from llmcompressor.core import LifecycleCallbacks, active_session -from llmcompressor.modifiers.utils.hooks import HooksMixin -from llmcompressor.pipelines.cache import IntermediatesCache -from llmcompressor.pipelines.layer_sequential.helpers import ( - capture_first_layer_intermediates, - match_modules, - maybe_inject_pos_embeddings, - to_next_layer_kwargs, -) -from llmcompressor.pipelines.registry import CalibrationPipeline -from llmcompressor.pipelines.sequential.helpers import ( - dispatch_for_sequential, - get_sequential_targets, -) -from llmcompressor.utils.helpers import ( - DISABLE_QAC_MODIFIERS, - DisableQuantization, - calibration_forward_context, -) - -if TYPE_CHECKING: - from llmcompressor.args.dataset_arguments import DatasetArguments - - -__all__ = ["LayerSequentialPipeline"] - - -@CalibrationPipeline.register("layer_sequential") -class LayerSequentialPipeline(CalibrationPipeline): - @staticmethod - def __call__( - model: torch.nn.Module, dataloader: DataLoader, dataset_args: "DatasetArguments" - ): - """ - Run a layer-wise sequential data pipeline according to the following steps: - - 1. 
Layers are identified according to `sequential_targets` - 2. A hook is attached to the first layer. This hook raises an exception which is - then caught and used to capture the input arguments to the first layer - 3. The inputs to the first layer are used to calibrate the first layer, and the - output of the previous layer is used as inputs to calibrate the next layer - - This pipeline requires that the model have distinct layers defined in its - architecture and that the outputs of the previous layer are exactly the inputs - to the next layer. This is violated by encoder-decoder architectures, among - others. - - If your model architecture violates these assumptions, consider using the - sequential pipeline (see llmcompressor.pipelines.sequential). Architectures - which are known to fail these assumptions include GPT-J and most vision models - - :param model: model being calibrated - :param dataloader: loads data for calibration - :param dataset_args: dataset arguments relevant to pipelines - """ - session = active_session() - - # prepare model for sequential onloading - dispatch_for_sequential(model) - model_device = get_execution_device(model) - - # find layers - modifiers = session.lifecycle.recipe.modifiers - sequential_targets = get_sequential_targets(modifiers, model, dataset_args) - layers = match_modules(model, sequential_targets) - - LifecycleCallbacks.calibration_epoch_start() - - # TODO: remove this to enable quantization aware calibration - # for GPTQ, AWQ and AutoRound. - disable_qac = any( - type(mod).__name__ in DISABLE_QAC_MODIFIERS - for mod in session.lifecycle.recipe.modifiers - ) - - with contextlib.ExitStack() as stack: - stack.enter_context(calibration_forward_context(model)) - if not dataset_args.quantization_aware_calibration or disable_qac: - stack.enter_context(DisableQuantization(model)) - - # prepare intermediates cache - intermediates: IntermediatesCache = capture_first_layer_intermediates( - model, layers[0], dataloader, model_device - ) - - num_layers = len(layers) - for layer_index, layer in enumerate(layers): - # prepare tqdm description texts - calib_desc = f"({layer_index + 1}/{num_layers}): Calibrating" - prop_desc = f"({layer_index + 1}/{num_layers}): Propagating" - - # reduce memory movement by keeping modules onloaded - with disable_offloading(): - # do a preliminary pass to trigger modifier hooks - for batch_idx in tqdm.tqdm(range(len(dataloader)), desc=calib_desc): - inputs = intermediates.fetch(batch_idx) - layer(**inputs) - - LifecycleCallbacks.sequential_epoch_end() - - # this pass does not trigger modifier hooks - # and is only used for capturing outputs from - # newly compressed modules - with HooksMixin.disable_hooks(): - for batch_idx in tqdm.tqdm( - range(len(dataloader)), desc=prop_desc - ): - inputs = intermediates.fetch(batch_idx) - output = layer(**inputs) - - if layer_index < num_layers - 1: - next_layer = layers[layer_index + 1] - output = to_next_layer_kwargs(output, next_layer) - output = maybe_inject_pos_embeddings( - output, next_layer, inputs - ) - - intermediates.delete(batch_idx) - intermediates.update(batch_idx, output) - - # redundant, finish any remaining compression - LifecycleCallbacks.calibration_epoch_end() From 7ea844283a1c8a1e057c5ba1a00eabb63e253884 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 6 Nov 2025 23:19:21 -0800 Subject: [PATCH 44/57] fix Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/autoround/base.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/llmcompressor/modifiers/autoround/base.py 
b/src/llmcompressor/modifiers/autoround/base.py index 04da438a05..8e45a520b2 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -114,9 +114,6 @@ class AutoRoundModifier(Modifier, QuantizationMixin): _cur_layer_idx = PrivateAttr(default=0) _all_module_input: Dict[str, List[Tuple]] = PrivateAttr(default_factory=dict) - def resolve_quantization_config(self) -> QuantizationConfig: - config = super().resolve_quantization_config() - return config def _add_temporary_names(self, model: torch.nn.Module): for name, mod in model.named_modules(): From f52c0c0cb1d23f1f950d0a7f2efcff4989aa32b3 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Fri, 7 Nov 2025 00:34:11 -0800 Subject: [PATCH 45/57] refactor Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/autoround/base.py | 150 ++++++++++-------- 1 file changed, 82 insertions(+), 68 deletions(-) diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index 8e45a520b2..39d991eb78 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -1,8 +1,7 @@ -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union import torch from compressed_tensors.quantization import ( - QuantizationConfig, QuantizationScheme, QuantizationStrategy, enable_quantization, @@ -12,6 +11,7 @@ match_named_modules, update_offload_parameter, ) +from llmcompressor.utils.pytorch.module import get_no_split_params from loguru import logger from pydantic import PrivateAttr @@ -26,10 +26,6 @@ __all__ = ["AutoRoundModifier"] -def _is_decoding_layer(module, name): - return "decoderlayer" in module.__class__.__name__.lower() - - class _LLModelWrapper(torch.nn.Module): def __init__(self): super().__init__() @@ -104,7 +100,7 @@ class AutoRoundModifier(Modifier, QuantizationMixin): dictionary that supports all keys from QuantizationScheme except targets, which will be set to the targets parameter set at the modifier level. """ - + sequential_targets: Union[str, List[str], None] = None # AutoRound modifier arguments iters: Optional[int] = 200 enable_torch_compile: Optional[bool] = True @@ -114,16 +110,6 @@ class AutoRoundModifier(Modifier, QuantizationMixin): _cur_layer_idx = PrivateAttr(default=0) _all_module_input: Dict[str, List[Tuple]] = PrivateAttr(default_factory=dict) - - def _add_temporary_names(self, model: torch.nn.Module): - for name, mod in model.named_modules(): - mod._tmp_name = name - - def _remove_temporary_names(self, model: torch.nn.Module): - for _, mod in model.named_modules(): - if hasattr(mod, "_tmp_name"): - del mod._tmp_name - def on_initialize(self, state: State, **kwargs) -> bool: """ Initialize the model state for quantization and calibration. 
@@ -141,8 +127,10 @@ def on_initialize(self, state: State, **kwargs) -> bool: } self._add_temporary_names(state.model) # freeze all model parameters - for name, param in state.model.named_parameters(): + for _, param in state.model.named_parameters(): param.requires_grad_(False) + + self.sequential_targets = self._infer_sequential_targets(state.model) return True def start_calibration(self, model: torch.nn.Module): @@ -171,8 +159,8 @@ def on_start(self, state: State, event: Event, **kwargs): # register quantization calibration hooks # assume quantization has been initialized by this modifier or one before it self.start_calibration(state.model) - for name, module in state.model.named_modules(): - if _is_decoding_layer(module, name): + for _, module in state.model.named_modules(): + if self._is_decoding_layer(module): # register input capture hook for decoding layers self.register_hook( module, self.input_capture_hook, "forward_pre", with_kwargs=True @@ -184,52 +172,15 @@ def on_event(self, state: State, event: Event, **kwargs): self.on_start(state, None) if event.type_ == EventType.SEQUENTIAL_EPOCH_END: - self.apply_autoround(state) + subgraph = kwargs.pop("subgraph", None) + self.apply_autoround(state, subgraph) self.post_autoround_cleanup() if event.type_ == EventType.CALIBRATION_EPOCH_END: if not self.ended_: self.on_end(state, None) - def _mapping_config_to_autoround(self): - from auto_round.schemes import QuantizationScheme as ARQuantizationScheme - - resolved_config = self.resolved_config - quant_scheme = None - # TODO: release below constraint in later PRs - assert len(resolved_config.config_groups) == 1, ( - "AutoRoundModifier only supports one quantization scheme for now, " - f"got {len(resolved_config.config_groups)}" - ) - - for scheme in resolved_config.config_groups.values(): - assert isinstance( - scheme, QuantizationScheme - ), f"Expected QuantizationScheme, got {type(scheme)}" - quant_scheme = scheme - weight_args = quant_scheme.weights - assert weight_args.strategy == QuantizationStrategy.GROUP, ( - "Only group-wise quantization is supported in AutoRoundModifier for now, " - f"got {weight_args.strategy}" - ) - assert quant_scheme.input_activations is None, ( - "Input activation quantization is not supported in AutoRoundModifier, " - f"got {quant_scheme.input_activations}" - ) - assert quant_scheme.output_activations is None, ( - "Output activation quantization is not supported in AutoRoundModifier, " - f"got {quant_scheme.output_activations}" - ) - ar_quant_scheme = ARQuantizationScheme( - bits=weight_args.num_bits, - sym=weight_args.symmetric, - group_size=weight_args.group_size, - data_type=weight_args.type, - act_bits=16, - ) - return ar_quant_scheme - - def apply_autoround(self, state): + def apply_autoround(self, state, subgraph): """ Applies AutoRound quantization tuning on the current decoding layer. 
@@ -250,13 +201,18 @@ def apply_autoround(self, state): For more details, please refer to the AutoRound repository: https://github.com/intel/auto-round/ """ - cur_layer_idx = self._cur_layer_idx - logger.info("Applying AutoRound to layer index: {}", cur_layer_idx) - self._cur_layer_idx += 1 - if cur_layer_idx >= len(state.model.model.layers): - # skip the lm_head layer + modules = list(subgraph.submodules(model=state.model)) + + decoding_layers = [m for m in modules if self._is_decoding_layer(m)] + if len(decoding_layers) == 0: return - decoding_layer = state.model.model.layers[cur_layer_idx] + assert len(decoding_layers) == 1, ( + "Only one decoding layer is expected in the subgraph, " + f"found {len(decoding_layers)}." + ) + decoding_layer = decoding_layers[0] + + logger.info("Applying AutoRound on layer {}", decoding_layer._tmp_name) wrapped_model = _wrap_decoding_layer(decoding_layer) @@ -277,8 +233,7 @@ def apply_autoround(self, state): ar.configure_layer_config() first_param = next(decoding_layer.parameters()) device = first_param.device - input_name = f"model.layers.{cur_layer_idx}" - cur_inputs = self._all_module_input[input_name] + cur_inputs = self._all_module_input[decoding_layer._tmp_name] decoding_layer.tuning_device = device ar.quantize_block( @@ -323,3 +278,62 @@ def on_finalize(self, state: State, **kwargs) -> bool: self.on_end(state, None) return True + + def _add_temporary_names(self, model: torch.nn.Module): + for name, mod in model.named_modules(): + mod._tmp_name = name + + def _remove_temporary_names(self, model: torch.nn.Module): + for _, mod in model.named_modules(): + if hasattr(mod, "_tmp_name"): + del mod._tmp_name + + def _is_decoding_layer(self, module: torch.nn.Module) -> bool: + return module.__class__.__name__ in self.sequential_targets + + def _infer_sequential_targets(self, model: torch.nn.Module) -> str | list[str]: + match self.sequential_targets: + case None: + return get_no_split_params(model) + case str(): + return [self.sequential_targets] + case _: + return self.sequential_targets + + def _mapping_config_to_autoround(self): + from auto_round.schemes import QuantizationScheme as ARQuantizationScheme + + resolved_config = self.resolved_config + quant_scheme = None + # TODO: release below constraint in later PRs + assert len(resolved_config.config_groups) == 1, ( + "AutoRoundModifier only supports one quantization scheme for now, " + f"got {len(resolved_config.config_groups)}" + ) + + for scheme in resolved_config.config_groups.values(): + assert isinstance( + scheme, QuantizationScheme + ), f"Expected QuantizationScheme, got {type(scheme)}" + quant_scheme = scheme + weight_args = quant_scheme.weights + assert weight_args.strategy == QuantizationStrategy.GROUP, ( + "Only group-wise quantization is supported in AutoRoundModifier for now, " + f"got {weight_args.strategy}" + ) + assert quant_scheme.input_activations is None, ( + "Input activation quantization is not supported in AutoRoundModifier, " + f"got {quant_scheme.input_activations}" + ) + assert quant_scheme.output_activations is None, ( + "Output activation quantization is not supported in AutoRoundModifier, " + f"got {quant_scheme.output_activations}" + ) + ar_quant_scheme = ARQuantizationScheme( + bits=weight_args.num_bits, + sym=weight_args.symmetric, + group_size=weight_args.group_size, + data_type=weight_args.type, + act_bits=16, + ) + return ar_quant_scheme \ No newline at end of file From 4a9c4aa53777fbd0cd50cb898cb31ab6a09c02df Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Fri, 7 Nov 2025 
00:36:53 -0800 Subject: [PATCH 46/57] format Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/autoround/base.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index 39d991eb78..f2759d9de9 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -11,7 +11,6 @@ match_named_modules, update_offload_parameter, ) -from llmcompressor.utils.pytorch.module import get_no_split_params from loguru import logger from pydantic import PrivateAttr @@ -22,6 +21,7 @@ from llmcompressor.transformers.compression.compressed_tensors_utils import ( untie_if_target_shared_embedding, ) +from llmcompressor.utils.pytorch.module import get_no_split_params __all__ = ["AutoRoundModifier"] @@ -100,6 +100,7 @@ class AutoRoundModifier(Modifier, QuantizationMixin): dictionary that supports all keys from QuantizationScheme except targets, which will be set to the targets parameter set at the modifier level. """ + sequential_targets: Union[str, List[str], None] = None # AutoRound modifier arguments iters: Optional[int] = 200 @@ -129,7 +130,7 @@ def on_initialize(self, state: State, **kwargs) -> bool: # freeze all model parameters for _, param in state.model.named_parameters(): param.requires_grad_(False) - + self.sequential_targets = self._infer_sequential_targets(state.model) return True @@ -219,11 +220,11 @@ def apply_autoround(self, state, subgraph): with torch.enable_grad(), align_module_device(decoding_layer): import auto_round - parsed_scheme = self._mapping_config_to_autoround() + ar_quant_scheme = self._mapping_config_to_autoround() ar = auto_round.AutoRound( model=wrapped_model, tokenizer="", - scheme=parsed_scheme, + scheme=ar_quant_scheme, iters=self.iters, enable_quanted_input=False, enable_torch_compile=self.enable_torch_compile, @@ -336,4 +337,4 @@ def _mapping_config_to_autoround(self): data_type=weight_args.type, act_bits=16, ) - return ar_quant_scheme \ No newline at end of file + return ar_quant_scheme From 0567df6034327eea2838c5a7b2ef7db3033c255a Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 7 Nov 2025 16:39:47 +0800 Subject: [PATCH 47/57] Update src/llmcompressor/modifiers/autoround/base.py Co-authored-by: Brian Dellabetta Signed-off-by: Yi Liu --- src/llmcompressor/modifiers/autoround/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index f2759d9de9..510f5116fd 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -104,7 +104,7 @@ class AutoRoundModifier(Modifier, QuantizationMixin): sequential_targets: Union[str, List[str], None] = None # AutoRound modifier arguments iters: Optional[int] = 200 - enable_torch_compile: Optional[bool] = True + enable_torch_compile: bool = True # private variables _module_names: Dict[torch.nn.Module, str] = PrivateAttr(default_factory=dict) From 650a19ca0a7e9e8d74806f5907540d601d119fb8 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Fri, 7 Nov 2025 00:58:33 -0800 Subject: [PATCH 48/57] refine docs Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/autoround/base.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index f2759d9de9..b6859a7049 100644 --- 
a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -192,12 +192,7 @@ def apply_autoround(self, state, subgraph): loss.backward() optimizer.step() if loss < best_loss: - best_params = save_params(layer) - - This method retrieves the current decoding layer, wraps it for - compatibility with AutoRound, and performs iterative optimization - to minimize the quantization error. The best parameters are tracked - and applied to the layer after tuning. + best_params = update_params(layer) For more details, please refer to the AutoRound repository: https://github.com/intel/auto-round/ From 5cd35a6a9023a92f7aa47a4af537dc0811180345 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Fri, 7 Nov 2025 19:55:27 -0800 Subject: [PATCH 49/57] fix import Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/autoround/base.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index 9ac1a4c0c5..be690e6344 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -1,6 +1,8 @@ from typing import Dict, List, Optional, Tuple, Union import torch +from auto_round import AutoRound +from auto_round.schemes import QuantizationScheme as ARQuantizationScheme from compressed_tensors.quantization import ( QuantizationScheme, QuantizationStrategy, @@ -213,10 +215,8 @@ def apply_autoround(self, state, subgraph): wrapped_model = _wrap_decoding_layer(decoding_layer) with torch.enable_grad(), align_module_device(decoding_layer): - import auto_round - ar_quant_scheme = self._mapping_config_to_autoround() - ar = auto_round.AutoRound( + ar = AutoRound( model=wrapped_model, tokenizer="", scheme=ar_quant_scheme, @@ -297,8 +297,6 @@ def _infer_sequential_targets(self, model: torch.nn.Module) -> str | list[str]: return self.sequential_targets def _mapping_config_to_autoround(self): - from auto_round.schemes import QuantizationScheme as ARQuantizationScheme - resolved_config = self.resolved_config quant_scheme = None # TODO: release below constraint in later PRs From 678b123363d66a0b7308d642bdf1768801b6db0b Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Sat, 8 Nov 2025 12:08:01 +0800 Subject: [PATCH 50/57] Update src/llmcompressor/modifiers/autoround/base.py Co-authored-by: Brian Dellabetta Signed-off-by: Yi Liu --- src/llmcompressor/modifiers/autoround/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index be690e6344..b235ede13d 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -105,7 +105,7 @@ class AutoRoundModifier(Modifier, QuantizationMixin): sequential_targets: Union[str, List[str], None] = None # AutoRound modifier arguments - iters: Optional[int] = 200 + iters: int = 200 enable_torch_compile: bool = True # private variables From a8c63d388b91d3002898b2be733d3acaa6f3c0ab Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 9 Nov 2025 18:13:09 -0800 Subject: [PATCH 51/57] add qinput Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/autoround/base.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index be690e6344..becdd485fb 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ 
b/src/llmcompressor/modifiers/autoround/base.py @@ -112,6 +112,7 @@ class AutoRoundModifier(Modifier, QuantizationMixin): _module_names: Dict[torch.nn.Module, str] = PrivateAttr(default_factory=dict) _cur_layer_idx = PrivateAttr(default=0) _all_module_input: Dict[str, List[Tuple]] = PrivateAttr(default_factory=dict) + _q_input: Optional[torch.Tensor] = PrivateAttr(default=None) def on_initialize(self, state: State, **kwargs) -> bool: """ @@ -221,7 +222,6 @@ def apply_autoround(self, state, subgraph): tokenizer="", scheme=ar_quant_scheme, iters=self.iters, - enable_quanted_input=False, enable_torch_compile=self.enable_torch_compile, batch_dim=0, ) @@ -232,14 +232,16 @@ def apply_autoround(self, state, subgraph): cur_inputs = self._all_module_input[decoding_layer._tmp_name] decoding_layer.tuning_device = device - ar.quantize_block( + q_input, _ = ar.quantize_block( block=decoding_layer, inputs=cur_inputs, + q_input=self._q_input, normalize_inputs=True, device=device, # Leave offload for LLMC auto_offload=False, ) + self._q_input = q_input # Update offload parameters and remove temporary attributes for _, module in decoding_layer.named_modules(): if hasattr(module, "weight_scale") and hasattr( From fbc047aaa9b7c2d4f07fc53391dea5c5b14d45c4 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 9 Nov 2025 18:14:42 -0800 Subject: [PATCH 52/57] clean cache Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/autoround/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index 6b6868a9e6..e025583864 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -265,6 +265,7 @@ def on_end(self, state: State, event: Event, **kwargs): QuantizationMixin.end_calibration(self, state.model) self._remove_temporary_names(state.model) self.remove_hooks() + self._q_input = None def on_finalize(self, state: State, **kwargs) -> bool: """ From 96b6490ca798500829e4bd4ae3ca60e5986ff602 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 9 Nov 2025 18:41:17 -0800 Subject: [PATCH 53/57] align api Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/autoround/base.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index e025583864..4b72e99b22 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -110,7 +110,6 @@ class AutoRoundModifier(Modifier, QuantizationMixin): # private variables _module_names: Dict[torch.nn.Module, str] = PrivateAttr(default_factory=dict) - _cur_layer_idx = PrivateAttr(default=0) _all_module_input: Dict[str, List[Tuple]] = PrivateAttr(default_factory=dict) _q_input: Optional[torch.Tensor] = PrivateAttr(default=None) @@ -236,7 +235,6 @@ def apply_autoround(self, state, subgraph): block=decoding_layer, inputs=cur_inputs, q_input=self._q_input, - normalize_inputs=True, device=device, # Leave offload for LLMC auto_offload=False, From d00d41b3d028c622be88c0b5e59ff4e608142035 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 9 Nov 2025 19:50:14 -0800 Subject: [PATCH 54/57] fix Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/autoround/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index 4b72e99b22..338b404832 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ 
b/src/llmcompressor/modifiers/autoround/base.py @@ -213,6 +213,7 @@ def apply_autoround(self, state, subgraph): logger.info("Applying AutoRound on layer {}", decoding_layer._tmp_name) wrapped_model = _wrap_decoding_layer(decoding_layer) + wrapped_model.name_or_path = state.model.name_or_path with torch.enable_grad(), align_module_device(decoding_layer): ar_quant_scheme = self._mapping_config_to_autoround() From d4a8fb00f209e02aab317c50024c299aeb03ee96 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 9 Nov 2025 21:00:59 -0800 Subject: [PATCH 55/57] fix Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/autoround/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index 338b404832..02801369a3 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -223,10 +223,10 @@ def apply_autoround(self, state, subgraph): scheme=ar_quant_scheme, iters=self.iters, enable_torch_compile=self.enable_torch_compile, - batch_dim=0, ) # TODO: configure layer-wise config based on self.resolved_config ar.configure_layer_config() + ar.batch_dim = 0 first_param = next(decoding_layer.parameters()) device = first_param.device cur_inputs = self._all_module_input[decoding_layer._tmp_name] From 487fcd2d624e98a137fb0d549f4dfcfab35b9463 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 9 Nov 2025 21:02:32 -0800 Subject: [PATCH 56/57] update Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/autoround/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index 02801369a3..2480751a9b 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -225,7 +225,7 @@ def apply_autoround(self, state, subgraph): enable_torch_compile=self.enable_torch_compile, ) # TODO: configure layer-wise config based on self.resolved_config - ar.configure_layer_config() + ar.configure_layer_config(enable_gguf_official_mixed=False) ar.batch_dim = 0 first_param = next(decoding_layer.parameters()) device = first_param.device From 3adc879708c7167b69a2aa4562e8b1afbd9d4c3f Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 11 Nov 2025 16:25:02 -0800 Subject: [PATCH 57/57] add requires_gpu for ut Signed-off-by: yiliu30 --- tests/llmcompressor/transformers/autoround/test_oneshot.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/llmcompressor/transformers/autoround/test_oneshot.py b/tests/llmcompressor/transformers/autoround/test_oneshot.py index 77f6c91707..ce167864e9 100644 --- a/tests/llmcompressor/transformers/autoround/test_oneshot.py +++ b/tests/llmcompressor/transformers/autoround/test_oneshot.py @@ -6,6 +6,7 @@ from llmcompressor import oneshot from llmcompressor.modifiers.autoround import AutoRoundModifier +from tests.testing_utils import requires_gpu recipe_str = """ quant_stage: @@ -39,6 +40,7 @@ ) +@requires_gpu(1) @pytest.mark.parametrize( "recipe", [