
Commit dd4b942

Merge branch 'main' into fix_qwen3
2 parents: 13ac6ee + c254c19

10 files changed: +422 −35 lines


src/llmcompressor/args/model_arguments.py

Lines changed: 3 additions & 2 deletions
@@ -64,10 +64,11 @@ class ModelArguments:
     )

     tie_word_embeddings: bool = field(
-        default=False,
+        default=True,
         metadata={
             "help": "Whether the model's input and output word embeddings "
-            "should be tied. Note that this is only relevant if the "
+            "should be left tied if possible. False means always untie. "
+            "Note that this is only relevant if the "
             "model has an output word embedding layer."
         },
     )

src/llmcompressor/entrypoints/oneshot.py

Lines changed: 2 additions & 2 deletions
@@ -233,7 +233,7 @@ def oneshot(
     processor: Optional[Union[str, ProcessorMixin]] = None,
     use_auth_token: bool = False,
     precision: str = "auto",
-    tie_word_embeddings: bool = False,
+    tie_word_embeddings: bool = True,
     trust_remote_code_model: bool = False,
     save_compressed: bool = True,
     model_revision: str = "main",
@@ -282,7 +282,7 @@ def oneshot(
         models.
     :param precision: Precision to cast model weights to, default to auto.
     :param tie_word_embeddings: Whether the model's input and output word embeddings
-        should be tied.
+        should be left tied if possible. False means always untie.
     :param trust_remote_code_model: Whether to allow for custom models to execute
         their own modeling files.
     :param save_compressed: Whether to compress sparse models during save.
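
For orientation, a minimal usage sketch of the new default (hypothetical model id and recipe path; assumes llmcompressor's top-level oneshot export):

from llmcompressor import oneshot

# tie_word_embeddings now defaults to True: embeddings are left tied when
# possible, and modifiers untie them on demand. Passing False still forces
# an unconditional untie.
oneshot(
    model="Qwen/Qwen3-8B",     # hypothetical model id
    recipe="recipe.yaml",      # hypothetical recipe path
    tie_word_embeddings=True,  # the new default, shown explicitly
)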

src/llmcompressor/entrypoints/utils.py

Lines changed: 0 additions & 16 deletions
@@ -59,7 +59,6 @@ def pre_process(
     Raises:
         FileNotFoundError: If the model or processor path is invalid.
     """
-    _warn_tied_embeddings(model_args.tie_word_embeddings)

     # Initialize model
     if isinstance(model_args.model, (str, PosixPath)):
@@ -150,21 +149,6 @@ def post_process(
     reset_session()


-def _warn_tied_embeddings(tie_word_embeddings: bool = False):
-    """
-    Logs a warning if the model has tied word embeddings.
-    The `tie_word_embeddings` flag may cause issues during saving in the one-shot
-    calibration workflow due to shared tensor addresses.
-    """
-    if tie_word_embeddings:
-        logger.debug(
-            "The tie_word_embeddings flag is by default set to False. "
-            "This guarantees that the one-shot algorithm saves the final "
-            "weights without errors. Detected tie_word_embeddings=True. "
-            "This may cause issues with the one-shot algorithm on save."
-        )
-
-
 def initialize_model_from_path(
     model_args: ModelArguments,
     training_args: Optional[TrainingArguments] = None,

src/llmcompressor/modifiers/quantization/quantization/mixin.py

Lines changed: 9 additions & 0 deletions
@@ -34,6 +34,9 @@
     reset_quantization_status,
 )
 from llmcompressor.modifiers.utils.hooks import HooksMixin
+from llmcompressor.transformers.compression.compressed_tensors_utils import (
+    untie_if_target_shared_embedding,
+)

 __all__ = ["QuantizationMixin"]

@@ -179,6 +182,12 @@ def start_calibration(self, model: torch.nn.Module):

         :param model: model to prepare for calibration
         """
+
+        matched_module_generator = (
+            x[1] for x in match_named_modules(model, self.resolved_targets, self.ignore)
+        )
+        untie_if_target_shared_embedding(model, matched_module_generator)
+
         for _, module in match_named_modules(model, self.resolved_targets, self.ignore):
             self._initialize_observers(module)
             self._calibration_hooks |= self._initialize_hooks(module)
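
The generator expression passes modules lazily; a condensed sketch of the pattern (hypothetical standalone helper, using the same calls as the diff above): untie_if_target_shared_embedding consumes the generator only until it finds the shared embedding, and it runs before any observers or hooks are attached, so calibration starts on already-untied weights.

from compressed_tensors.utils import match_named_modules

from llmcompressor.transformers.compression.compressed_tensors_utils import (
    untie_if_target_shared_embedding,
)

def untie_targeted_embeddings(model, targets, ignore):
    # Hypothetical helper mirroring start_calibration's first step: yield
    # only the matched modules (dropping the names) and untie if the shared
    # input/output embedding is among them.
    matched = (module for _, module in match_named_modules(model, targets, ignore))
    untie_if_target_shared_embedding(model, matched)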

src/llmcompressor/modifiers/transform/quip/base.py

Lines changed: 14 additions & 1 deletion
@@ -7,11 +7,14 @@
     TransformScheme,
     apply_transform_config,
 )
-from compressed_tensors.utils import TorchDtype
+from compressed_tensors.utils import TorchDtype, match_named_modules
 from pydantic import Field, ValidationInfo, field_validator

 from llmcompressor.core import Event, EventType, State
 from llmcompressor.modifiers import Modifier
+from llmcompressor.transformers.compression.compressed_tensors_utils import (
+    untie_if_target_shared_embedding,
+)

 __all__ = ["QuIPModifier"]

@@ -100,6 +103,16 @@ def on_initialize(self, state: State, **kwargs) -> bool:
     def on_start(self, state: State, event: Event, **kwargs):
         self.started_ = True

+        def matched_module_generator():
+            for scheme in self.transform_config.config_groups.values():
+                for arg in scheme.apply:
+                    gen = match_named_modules(state.model, arg.targets, arg.ignore)
+                    for _, module in gen:
+                        yield module
+
+        # Untie embeddings if they will be targeted by transforms
+        untie_if_target_shared_embedding(state.model, matched_module_generator())
+
         apply_transform_config(state.model, self.transform_config)

     def on_event(self, state: State, event: Event, **kwargs):

src/llmcompressor/modifiers/transform/spinquant/base.py

Lines changed: 5 additions & 0 deletions
@@ -16,6 +16,9 @@
 from llmcompressor.core import Event, EventType, State
 from llmcompressor.modeling import center_embeddings, fuse_norm_linears
 from llmcompressor.modifiers import Modifier
+from llmcompressor.transformers.compression.compressed_tensors_utils import (
+    untie_word_embeddings,
+)

 from .mappings import SpinQuantMapping, infer_mapping_from_model
 from .norm_mappings import NormMapping, infer_norm_mapping_from_model
@@ -148,6 +151,8 @@ def on_initialize(self, state: State, **kwargs) -> bool:
     def on_start(self, state: State, event: Event, **kwargs):
         self.started_ = True

+        # needed any time embeddings/lm_head is modified
+        untie_word_embeddings(state.model)
         # needs to happen after the model has been hooked to execute on the GPU
         # otherwise we're applying weight transforms on CPU
         self._center_embeddings(state.model)

src/llmcompressor/transformers/compression/compressed_tensors_utils.py

Lines changed: 84 additions & 2 deletions
@@ -1,5 +1,6 @@
 import os
 import weakref
+from collections.abc import Generator
 from functools import wraps
 from typing import Optional

@@ -126,8 +127,15 @@ def untie_word_embeddings(model: PreTrainedModel):

     :param model: model to fix
     """
-    input_embed = model.get_input_embeddings()
-    output_embed = model.get_output_embeddings()
+    try:
+        input_embed = model.get_input_embeddings()
+        output_embed = model.get_output_embeddings()
+    except NotImplementedError as e:
+        logger.warning(
+            f"cannot untie model of type {model.__class__} which doesn't have "
+            f"get_input_embeddings and get_output_embeddings implemented\n{e}"
+        )
+        return

     for module in (input_embed, output_embed):
         if module is None or not hasattr(module, "weight"):
@@ -149,6 +157,80 @@ def untie_word_embeddings(model: PreTrainedModel):
     model.config.tie_word_embeddings = False


+def _get_embeddings_or_warn(
+    model: torch.nn.Module,
+) -> tuple[torch.nn.Module | None, torch.nn.Module | None]:
+    if not (
+        hasattr(model, "get_input_embeddings")
+        and hasattr(model, "get_output_embeddings")
+    ):
+        logger.warning(
+            f"{model.__class__} doesn't have get_input_embeddings and"
+            " get_output_embeddings implemented."
+            "\nThis can cause"
+            " problems when quantizing layers with shared weights"
+        )
+        return None, None
+
+    try:
+        input_embeddings, output_embeddings = (
+            model.get_input_embeddings(),
+            model.get_output_embeddings(),
+        )
+    except NotImplementedError as e:
+        logger.warning(
+            f"{model.__class__} doesn't have get_input_embeddings and "
+            "get_output_embeddings implemented."
+            "\nThis can cause"
+            " problems when quantizing layers with shared weights"
+            f"\n{e}"
+        )
+        return None, None
+
+    if not (
+        isinstance(input_embeddings, torch.nn.Module)
+        and isinstance(output_embeddings, torch.nn.Module)
+    ):
+        logger.warning(
+            f"expected modules from {model.__class__} get_input_embeddings and"
+            f" get_output_embeddings but got {type(input_embeddings)}"
+            f" and {type(output_embeddings)}."
+            "\nThis can cause"
+            " problems when quantizing layers with shared weights"
+        )
+        return None, None
+    return input_embeddings, output_embeddings
+
+
+def untie_if_target_shared_embedding(
+    model: torch.nn.Module, matched_module_generator: Generator[torch.nn.Module]
+):
+    """
+    Helper method that checks for shared input/output embeddings and unties them
+    if either shows up in the matched_module_generator.
+
+    :param model: model to untie if embeddings are shared and targeted by
+        matched_module_generator
+    :param matched_module_generator: generator of all modules (not names) which
+        will be modified by quantization or transformation
+    """
+    input_embeddings, output_embeddings = _get_embeddings_or_warn(model)
+
+    if None in (input_embeddings, output_embeddings):  # if couldn't find embeddings
+        return
+
+    if (
+        input_embeddings.weight is not output_embeddings.weight
+    ):  # if not shared, can ignore
+        return
+
+    # if shared, check whether either embedding is targeted
+    for module in matched_module_generator:
+        if module in (input_embeddings, output_embeddings):
+            untie_word_embeddings(model)
+            return
+
+
 def get_model_compressor(
     model: torch.nn.Module,
     sparsity_config: Optional[SparsityCompressionConfig] = None,
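
To make the identity check concrete, a minimal self-contained sketch (plain torch with made-up shapes, not the library helper) of what tied embeddings look like and what untying amounts to:

import torch

# Tied setup: the embedding and the LM head share one Parameter object,
# as in checkpoints saved with tie_word_embeddings=True.
embed = torch.nn.Embedding(10, 4)
lm_head = torch.nn.Linear(4, 10, bias=False)
lm_head.weight = embed.weight
assert lm_head.weight is embed.weight  # the "is" check used above

# Untying: give the head its own copy so quantizing or transforming one
# side no longer mutates the other through the shared tensor.
lm_head.weight = torch.nn.Parameter(embed.weight.detach().clone())
assert lm_head.weight is not embed.weight

The real untie_word_embeddings additionally sets model.config.tie_word_embeddings = False, as shown in the hunk above, so the config reflects the untied state.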
