Commit b5c3db4

standardize get_fused_names
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
1 parent beff73b commit b5c3db4

4 files changed: +101, -116 lines


src/llmcompressor/entrypoints/model_free/__init__.py (17 additions, 9 deletions)
@@ -11,6 +11,7 @@
 from compressed_tensors.utils.match import _match_name
 from loguru import logger
 from safetensors.torch import load_file, save_file
+from torch.nn import Module
 
 from llmcompressor.entrypoints.model_free.helpers import gpu_if_available
 from llmcompressor.entrypoints.model_free.lifecycle import (
@@ -169,11 +170,18 @@ def _process_file_microscale_scheme(
     """
     assert is_microscale_scheme(scheme), "Use `_process_file` for non microscale scheme"
     tensors = load_file(file_path)
-    fused_names = get_fused_names(tensors)
-    fused_names_to_parent = {
-        name: prefix for prefix, names in fused_names.items() for name in names
+    fused_sets, unmatched_sets = get_fused_names(tensors)
+    assert len(unmatched_sets) <= 0  # should be caught by `validate_safetensors_index`
+
+    fused_name_to_fused_index: dict[str, int]  # fused_name -> fused_index
+    fused_modules: dict[int, dict[str, Module]]  # fused_index -> named_modules
+
+    fused_name_to_fused_index = {
+        name: index
+        for index, matched_set in enumerate(fused_sets)
+        for name in matched_set.values()
     }
-    fused_parent_submodules = defaultdict(dict)
+    fused_modules = defaultdict(dict)
 
     for name in list(tensors.keys()):
         module_name, param_name = name.rsplit(".", 1)
@@ -187,9 +195,9 @@ def _process_file_microscale_scheme(
 
         # 2. calibrate weight qparams. Delay scale/zp calibration for fused modules
         calibrate_global_scale(module)
-        if name in fused_names_to_parent:
-            fused_parent = fused_names_to_parent[name]
-            fused_parent_submodules[fused_parent][name] = module
+        if name in fused_name_to_fused_index:
+            fused_index = fused_name_to_fused_index[name]
+            fused_modules[fused_index][name] = module
             continue
 
         calibrate_scale_zp(module)
@@ -204,7 +212,7 @@ def _process_file_microscale_scheme(
         tensors[key] = value.to("cpu")
 
     # compress and save microscale fused modules
-    for parent_name, named_modules in fused_parent_submodules.items():
+    for named_modules in fused_modules.values():
         # 2.1. fuse global scales
         global_scales = [m.weight_global_scale for m in named_modules.values()]
         fused_global_scale = torch.min(torch.cat(global_scales, dim=0))
@@ -216,7 +224,7 @@ def _process_file_microscale_scheme(
             # 2.2. finish calibration with fused global scales
             calibrate_scale_zp(module)
 
-            # 3. compress module using qparams
+            # 3. compress module using microscale qparams
             compress_module(module)
 
             # 4. save compressed data (on cpu)
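
For reference, a minimal standalone sketch of the new index-based grouping that replaces the parent-name keyed dict. The fused-set contents below are placeholders, not values from the commit; the comprehension itself mirrors the one added in _process_file_microscale_scheme:

from collections import defaultdict

# each MatchedNamesSet maps a target pattern to the tensor name it matched
fused_sets = [
    {"q": "l0.attn.q_proj.weight", "k": "l0.attn.k_proj.weight", "v": "l0.attn.v_proj.weight"},
    {"q": "l1.attn.q_proj.weight", "k": "l1.attn.k_proj.weight", "v": "l1.attn.v_proj.weight"},
]

# fused_name -> fused_index, as built in the diff above
fused_name_to_fused_index = {
    name: index
    for index, matched_set in enumerate(fused_sets)
    for name in matched_set.values()
}

# fused_index -> named_modules; modules sharing an index get one fused global scale
fused_modules = defaultdict(dict)
for name, index in fused_name_to_fused_index.items():
    fused_modules[index][name] = object()  # stand-in for the real torch module

assert len(fused_modules[0]) == 3  # layer 0's q/k/v end up grouped together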

src/llmcompressor/entrypoints/model_free/helpers.py (39 additions, 2 deletions)
@@ -3,6 +3,7 @@
 from typing import Mapping, TypeVar
 
 import torch
+from compressed_tensors.utils.match import _match_name
 from loguru import logger
 from transformers.file_utils import CONFIG_NAME
 
@@ -11,9 +12,15 @@
     "find_safetensors_index_path",
     "find_config_path",
     "find_safetensors_index_file",
+    "match_names_set_eager",
+    "MatchedNamesSet",
     "invert_mapping",
 ]
 
+KeyType = TypeVar("K")
+ValueType = TypeVar("V")
+MatchedNamesSet = dict[str, str | None]
+
 
 def gpu_if_available(device: torch.device | str | None) -> torch.device:
     if device is not None:
@@ -54,8 +61,38 @@ def find_safetensors_index_file(model_files: dict[str, str]) -> str | None:
     return None
 
 
-KeyType = TypeVar("K")
-ValueType = TypeVar("V")
+def match_names_set_eager(
+    names: set[str] | list[str],
+    targets: set[str] | list[str],
+    return_unmatched: bool = True,
+) -> list[MatchedNamesSet] | tuple[list[MatchedNamesSet], MatchedNamesSet]:
+    matched_sets = []
+    matches = dict.fromkeys(targets, None)
+
+    for name in names:
+        # match until we get a full set
+        for target in targets:
+            if _match_name(name, target):
+                if matches[target] is None:
+                    matches[target] = name
+                else:
+                    # matched target twice without completing a set
+                    raise ValueError(
+                        f"Matched a {target} twice before "
+                        f"completing set ({matches[target]}, {name})"
+                    )
+
+        # once we have a full set, yield and reset
+        if all((matches[target] is not None for target in targets)):
+            matched_sets.append(matches)
+            matches = dict.fromkeys(targets, None)
+
+    unmatched_set = matches if any((v is not None for v in matches.values())) else None
+
+    if return_unmatched:
+        return matched_sets, unmatched_set
+    else:
+        return matched_sets
 
 
 def invert_mapping(
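
A hedged usage sketch of the relocated helper, assuming this commit's llmcompressor is installed; the tensor names and targets below are illustrative, not from the commit:

from llmcompressor.entrypoints.model_free.helpers import match_names_set_eager

qkv_targets = [
    "re:.*(attn|attention)\.q_proj\.weight$",
    "re:.*(attn|attention)\.k_proj\.weight$",
    "re:.*(attn|attention)\.v_proj\.weight$",
]
names = [
    "model.layers.0.self_attn.q_proj.weight",
    "model.layers.0.self_attn.k_proj.weight",
    "model.layers.0.self_attn.v_proj.weight",
    "model.layers.1.self_attn.q_proj.weight",  # this set never completes
]

matched_sets, unmatched_set = match_names_set_eager(names, qkv_targets)
# matched_sets holds one complete target -> name set for layer 0;
# unmatched_set holds layer 1's partial set (k and v targets still None).
# Matching is eager: a second q_proj before k/v complete would raise ValueError.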
src/llmcompressor/entrypoints/model_free/microscale.py (29 additions, 73 deletions)
@@ -1,87 +1,43 @@
-import torch
 from compressed_tensors.quantization import QuantizationScheme, QuantizationStrategy
-from compressed_tensors.utils.match import _match_name
 
-__all__ = ["get_fused_names", "is_microscale_scheme", "match_names_set_eager"]
+from llmcompressor.entrypoints.model_free.helpers import (
+    MatchedNamesSet,
+    match_names_set_eager,
+)
 
+__all__ = ["is_microscale_scheme", "get_fused_names", "DEFAULT_FUSED_MAPPINGS"]
 
-MatchedNamesSet = dict[str, str | None]
+
+DEFAULT_FUSED_MAPPINGS = [
+    [
+        "re:.*(attn|attention)\.q_proj\.weight$",
+        "re:.*(attn|attention)\.k_proj\.weight$",
+        "re:.*(attn|attention)\.v_proj\.weight$",
+    ],
+    [
+        "re:.*(attn|attention)\.wq_a\.weight$",
+        "re:.*(attn|attention)\.wkv_a_with_mqa\.weight$",
+    ],
+    ["re:.*mlp\.gate_proj\.weight$", "re:.*attn\.up_proj\.weight$"],
+    ["re:.*w1\.weight$", "re:.*w3\.weight$"],
+]
 
 
 def is_microscale_scheme(scheme: QuantizationScheme) -> bool:
     assert scheme.weights is not None
     return scheme.weights.strategy == QuantizationStrategy.TENSOR_GROUP
 
 
-def match_names_set_eager(
+def get_fused_names(
     tensor_names: set[str] | list[str],
-    targets: set[str] | list[str],
-    return_unmatched: bool = True,
-) -> list[MatchedNamesSet] | tuple[list[MatchedNamesSet], MatchedNamesSet]:
-    matched_sets = []
-    matches = dict.fromkeys(targets, None)
-
-    for name in tensor_names:
-        # match until we get a full set
-        for target in targets:
-            if _match_name(name, target):
-                if matches[target] is None:
-                    matches[target] = name
-                else:
-                    # matched target twice without completing a set
-                    raise ValueError(
-                        f"Matched a {target} twice before "
-                        f"completing set ({matches[target]}, {name})"
-                    )
-
-        # once we have a full set, yield and reset
-        if all((matches[target] is not None for target in targets)):
-            matched_sets.append(matches)
-            matches = dict.fromkeys(targets, None)
-
-    unmatched_set = matches if any((v is not None for v in matches.values())) else None
-
-    if return_unmatched:
-        return matched_sets, unmatched_set
-    else:
-        return matched_sets
-
-
-def get_fused_names(tensors: dict[str, torch.Tensor]) -> dict[str, list[str]]:
-    fused_names = {}
-
-    for name in tensors:
-        parts = name.rsplit(".")
-        if len(parts) < 3:
-            continue
-
-        parent, module, param = parts[-3:]
-
-        if (
-            ("attn" in parent or "attention" in parent)
-            and module == "q_proj"
-            and param == "weight"
-        ):
-            parent_name = ".".join((*parts[:-3], parent))
-            q_name = ".".join((parent_name, "q_proj", param))
-            k_name = ".".join((parent_name, "k_proj", param))
-            v_name = ".".join((parent_name, "v_proj", param))
-
-            submodule_names = [q_name, k_name, v_name]
-
-            if all(name in tensors for name in submodule_names):
-                assert parent_name not in fused_names
-                fused_names[parent_name] = submodule_names
-
-        if "mlp" in parent and module == "gate_proj" and param == "weight":
-            parent_name = ".".join((*parts[:-3], parent))
-            gate_name = ".".join((parent_name, "gate_proj", param))
-            up_name = ".".join((parent_name, "up_proj", param))
-
-            submodule_names = [gate_name, up_name]
+) -> tuple[list[MatchedNamesSet], list[MatchedNamesSet]]:
+    matched = []
+    unmatched = []
+    for mapping in DEFAULT_FUSED_MAPPINGS:
+        _matched, _unmatched = match_names_set_eager(tensor_names, mapping)
 
-            if all(name in tensors for name in submodule_names):
-                assert parent_name not in fused_names
-                fused_names[parent_name] = submodule_names
+        matched.extend(_matched)
+        if _unmatched is not None:
+            unmatched.append(_unmatched)
 
-    return fused_names
+    return matched, unmatched
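
With the default mappings above, a sketch of the standardized interface; names are illustrative and assume this commit's llmcompressor is installed:

from llmcompressor.entrypoints.model_free.microscale import get_fused_names

tensor_names = [
    "model.layers.0.self_attn.q_proj.weight",
    "model.layers.0.self_attn.k_proj.weight",
    "model.layers.0.self_attn.v_proj.weight",
    "model.layers.0.mlp.gate_proj.weight",  # gate with no matching up_proj target
]

matched, unmatched = get_fused_names(tensor_names)
# matched:   one complete q/k/v MatchedNamesSet for layer 0
# unmatched: one partial gate/up set, for callers to report or carry over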

src/llmcompressor/entrypoints/model_free/reindex_fused_weights.py (16 additions, 32 deletions)
@@ -13,7 +13,10 @@
     find_safetensors_index_file,
     invert_mapping,
 )
-from llmcompressor.entrypoints.model_free.microscale import match_names_set_eager
+from llmcompressor.entrypoints.model_free.microscale import (
+    DEFAULT_FUSED_MAPPINGS,
+    get_fused_names,
+)
 from llmcompressor.entrypoints.model_free.model_utils import (
     get_checkpoint_files,
     is_weights_file,
@@ -25,23 +28,6 @@
 # 1. the incomplete set is the last set of weights (sorted alphabetically)
 # 2. the remainder of the incomplete set is the next file (sorted alphabetically)
 
-model_stub = ""
-fused_mappings: list[list[str]] = []
-
-DEFAULT_FUSED_MAPPINGS = [
-    [
-        "re:.*(attn|attention)\.q_proj\.weight$",
-        "re:.*(attn|attention)\.k_proj\.weight$",
-        "re:.*(attn|attention)\.v_proj\.weight$",
-    ],
-    [
-        "re:.*(attn|attention)\.wq_a\.weight$",
-        "re:.*(attn|attention)\.wkv_a_with_mqa\.weight$",
-    ],
-    ["re:.*mlp\.gate_proj\.weight$", "re:.*attn\.up_proj\.weight$"],
-    ["re:.*w1\.weight$", "re:.*w3\.weight$"],
-]
-
 
 def main(
     model_stub: str,
@@ -96,20 +82,18 @@ def main(
     carry_over_tensors = {}
 
     tensor_names = sorted(list(tensors.keys()))
-    for mapping in fused_mappings:
-        _matches, unmatched = match_names_set_eager(tensor_names, mapping)
-
-        if unmatched is not None:
-            # move to carry over
-            unmatched_tensors = {
-                key: tensors[key] for key in unmatched.values() if key is not None
-            }
-            carry_over_tensors.update(unmatched_tensors)
-
-            # delete from current file
-            for key in unmatched_tensors:
-                tensor_names.remove(key)
-                del tensors[key]
+    _matches, unmatched_sets = get_fused_names(tensor_names)
+    for unmatched in unmatched_sets:
+        # move to carry over
+        unmatched_tensors = {
+            key: tensors[key] for key in unmatched.values() if key is not None
+        }
+        carry_over_tensors.update(unmatched_tensors)
+
+        # delete from current file
+        for key in unmatched_tensors:
+            tensor_names.remove(key)
+            del tensors[key]
 
     # save tensors after modification
     executor.submit(with_progress(save_file, tensors, save_path, progress=progress))
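
A toy sketch of the carry-over step with fabricated shard contents (strings stand in for tensors; the real loop gets unmatched_sets from get_fused_names):

# a shard whose q/k weights arrive without their v counterpart
tensors = {"l9.attn.q_proj.weight": "Q", "l9.attn.k_proj.weight": "K"}
tensor_names = sorted(tensors)
carry_over_tensors = {}

unmatched_sets = [{"q": "l9.attn.q_proj.weight", "k": "l9.attn.k_proj.weight", "v": None}]

for unmatched in unmatched_sets:
    # move to carry over, skipping targets that never matched
    unmatched_tensors = {k: tensors[k] for k in unmatched.values() if k is not None}
    carry_over_tensors.update(unmatched_tensors)
    for key in unmatched_tensors:
        tensor_names.remove(key)
        del tensors[key]

assert not tensors  # the partial set leaves this file and is re-saved with the next shard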
