From 7115bdab421f3f7fefb0e349fe0daaf3ad7bcf07 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 18 Nov 2025 12:26:47 -0500 Subject: [PATCH 01/23] remove training support --- .../workflows/test-check-transformers.yaml | 4 +- examples/trl_mixin/README.md | 32 -- examples/trl_mixin/ex_trl_constant.py | 64 --- examples/trl_mixin/ex_trl_distillation.py | 79 --- examples/trl_mixin/sft_trainer.py | 22 - src/llmcompressor/args/dataset_arguments.py | 15 +- src/llmcompressor/args/training_arguments.py | 44 -- src/llmcompressor/entrypoints/__init__.py | 1 - src/llmcompressor/entrypoints/train.py | 146 ----- src/llmcompressor/transformers/__init__.py | 1 + .../{finetune => }/data/__init__.py | 0 .../transformers/{finetune => }/data/base.py | 0 .../transformers/{finetune => }/data/c4.py | 0 .../{finetune => }/data/cnn_dailymail.py | 0 .../{finetune => }/data/custom.py | 0 .../{finetune => }/data/data_helpers.py | 0 .../{finetune => }/data/evolcodealpaca.py | 0 .../{finetune => }/data/flickr_30k.py | 0 .../transformers/{finetune => }/data/gsm8k.py | 0 .../{finetune => }/data/open_platypus.py | 0 .../{finetune => }/data/peoples_speech.py | 0 .../{finetune => }/data/ultrachat_200k.py | 0 .../{finetune => }/data/wikitext.py | 0 .../transformers/finetune/README.md | 85 --- .../transformers/finetune/__init__.py | 4 - .../transformers/finetune/callbacks.py | 121 ---- .../transformers/finetune/session_mixin.py | 537 ------------------ .../transformers/finetune/trainer.py | 18 - .../{finetune => data}/__init__.py | 0 .../{finetune => }/data/conftest.py | 0 .../data/test_dataset_helpers.py | 0 .../data/test_dataset_loading.py | 0 .../{finetune => }/data/test_registry.py | 0 .../transformers/finetune/data/__init__.py | 0 .../finetune/finetune_custom/config1.yaml | 5 - .../finetune/finetune_custom/config2.yaml | 5 - .../finetune_custom/gpu/gpu_config.yaml | 5 - .../finetune/finetune_generic/config1.yaml | 4 - .../finetune_oneshot_configs/config.yaml | 8 - .../gpu/gpu_config.yaml | 7 - .../finetune/finetune_tokenizer/config1.yaml | 5 - .../finetune/test_alternate_recipe.yaml | 22 - .../test_finetune_no_recipe_custom_dataset.py | 137 ----- .../finetune/test_finetune_recipe.yaml | 19 - .../finetune/test_finetune_without_recipe.py | 31 - .../finetune/test_oneshot_and_finetune.py | 122 ---- ...est_oneshot_and_finetune_with_tokenizer.py | 62 -- .../finetune/test_oneshot_then_finetune.py | 160 ------ .../finetune/test_quantization.yaml | 31 - .../transformers/finetune/test_safetensors.py | 42 -- .../finetune/test_session_mixin.py | 65 --- 51 files changed, 7 insertions(+), 1896 deletions(-) delete mode 100644 examples/trl_mixin/README.md delete mode 100644 examples/trl_mixin/ex_trl_constant.py delete mode 100644 examples/trl_mixin/ex_trl_distillation.py delete mode 100644 examples/trl_mixin/sft_trainer.py delete mode 100644 src/llmcompressor/args/training_arguments.py delete mode 100644 src/llmcompressor/entrypoints/train.py rename src/llmcompressor/transformers/{finetune => }/data/__init__.py (100%) rename src/llmcompressor/transformers/{finetune => }/data/base.py (100%) rename src/llmcompressor/transformers/{finetune => }/data/c4.py (100%) rename src/llmcompressor/transformers/{finetune => }/data/cnn_dailymail.py (100%) rename src/llmcompressor/transformers/{finetune => }/data/custom.py (100%) rename src/llmcompressor/transformers/{finetune => }/data/data_helpers.py (100%) rename src/llmcompressor/transformers/{finetune => }/data/evolcodealpaca.py (100%) rename src/llmcompressor/transformers/{finetune => 
}/data/flickr_30k.py (100%) rename src/llmcompressor/transformers/{finetune => }/data/gsm8k.py (100%) rename src/llmcompressor/transformers/{finetune => }/data/open_platypus.py (100%) rename src/llmcompressor/transformers/{finetune => }/data/peoples_speech.py (100%) rename src/llmcompressor/transformers/{finetune => }/data/ultrachat_200k.py (100%) rename src/llmcompressor/transformers/{finetune => }/data/wikitext.py (100%) delete mode 100644 src/llmcompressor/transformers/finetune/README.md delete mode 100644 src/llmcompressor/transformers/finetune/__init__.py delete mode 100644 src/llmcompressor/transformers/finetune/callbacks.py delete mode 100644 src/llmcompressor/transformers/finetune/session_mixin.py delete mode 100644 src/llmcompressor/transformers/finetune/trainer.py rename tests/llmcompressor/transformers/{finetune => data}/__init__.py (100%) rename tests/llmcompressor/transformers/{finetune => }/data/conftest.py (100%) rename tests/llmcompressor/transformers/{finetune => }/data/test_dataset_helpers.py (100%) rename tests/llmcompressor/transformers/{finetune => }/data/test_dataset_loading.py (100%) rename tests/llmcompressor/transformers/{finetune => }/data/test_registry.py (100%) delete mode 100644 tests/llmcompressor/transformers/finetune/data/__init__.py delete mode 100644 tests/llmcompressor/transformers/finetune/finetune_custom/config1.yaml delete mode 100644 tests/llmcompressor/transformers/finetune/finetune_custom/config2.yaml delete mode 100644 tests/llmcompressor/transformers/finetune/finetune_custom/gpu/gpu_config.yaml delete mode 100644 tests/llmcompressor/transformers/finetune/finetune_generic/config1.yaml delete mode 100644 tests/llmcompressor/transformers/finetune/finetune_oneshot_configs/config.yaml delete mode 100644 tests/llmcompressor/transformers/finetune/finetune_oneshot_configs/gpu/gpu_config.yaml delete mode 100644 tests/llmcompressor/transformers/finetune/finetune_tokenizer/config1.yaml delete mode 100644 tests/llmcompressor/transformers/finetune/test_alternate_recipe.yaml delete mode 100644 tests/llmcompressor/transformers/finetune/test_finetune_no_recipe_custom_dataset.py delete mode 100644 tests/llmcompressor/transformers/finetune/test_finetune_recipe.yaml delete mode 100644 tests/llmcompressor/transformers/finetune/test_finetune_without_recipe.py delete mode 100644 tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune.py delete mode 100644 tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune_with_tokenizer.py delete mode 100644 tests/llmcompressor/transformers/finetune/test_oneshot_then_finetune.py delete mode 100644 tests/llmcompressor/transformers/finetune/test_quantization.yaml delete mode 100644 tests/llmcompressor/transformers/finetune/test_safetensors.py delete mode 100644 tests/llmcompressor/transformers/finetune/test_session_mixin.py diff --git a/.github/workflows/test-check-transformers.yaml b/.github/workflows/test-check-transformers.yaml index 368a85a1da..12dc6baeb9 100644 --- a/.github/workflows/test-check-transformers.yaml +++ b/.github/workflows/test-check-transformers.yaml @@ -93,10 +93,10 @@ jobs: if: (success() || failure()) && steps.install.outcome == 'success' run: | pytest -v tests/llmcompressor/transformers/compression - - name: Run Finetune Tests + - name: Run Data Tests if: (success() || failure()) && steps.install.outcome == 'success' run: | - pytest -v tests/llmcompressor/transformers/finetune + pytest -v tests/llmcompressor/transformers/data - name: Running GPTQ Tests if: (success() || failure()) && 
steps.install.outcome == 'success' run: | diff --git a/examples/trl_mixin/README.md b/examples/trl_mixin/README.md deleted file mode 100644 index fde2d3d1c8..0000000000 --- a/examples/trl_mixin/README.md +++ /dev/null @@ -1,32 +0,0 @@ -# Sparse Finetuning with TRL's SFTTrainer - -The `SessionManagerMixin` can be added to other Trainer classes that inherit from -[Hugging Face's Trainer](https://huggingface.co/docs/transformers/en/main_classes/trainer). - -For example, we can add LLM Compressor support to TRL's SFTTrainer like so: - -Note: install `trl` using `pip install trl` - -```python -from trl import SFTTrainer as TRLSFTTrainer - -class SFTTrainer(SessionManagerMixIn, TRLSFTTrainer): - ... -``` - -The new `SFTTrainer` class can now apply LLM Compressor recipes and modifiers during -supervised finetuning, will full support for all of the original TRL features. The full -class is defined in the script `sft_trainer.py` and requires very minimal -additional code: just a dataset load override to support passing in tokenized datasets -to the Trainer. - -### Examples - -* Script `ex_trl_constant.py`: finetunes a 50% sparse Llama-7b model, -using TRL's dataset preprocessing. Sparsity is maintained throughout training by -applying a `ConstantPruningModifier` recipe to the `SFTTrainer` - -* Script `ex_trl_distillation.py`: finetunes a 50% sparse Llama-7b -model using knowledge distillation from a dense Llama-7b model. Sparsity is maintained -throughout training with a `ConstantPruningModifier` and layer-wise knowledge -distillation is handled by the `OutputDistillationModifier` \ No newline at end of file diff --git a/examples/trl_mixin/ex_trl_constant.py b/examples/trl_mixin/ex_trl_constant.py deleted file mode 100644 index b0abb75202..0000000000 --- a/examples/trl_mixin/ex_trl_constant.py +++ /dev/null @@ -1,64 +0,0 @@ -# NOTE: Fine tuning can require more steps than is shown in the example -# See the Axolotl integration blog post for best fine tuning practices -# https://developers.redhat.com/articles/2025/06/17/axolotl-meets-llm-compressor-fast-sparse-open - -from datasets import load_dataset -from sft_trainer import SFTTrainer -from transformers import AutoModelForCausalLM, AutoTokenizer -from trl import DataCollatorForCompletionOnlyLM - -from llmcompressor.args import ModelArguments - -model_path = "neuralmagic/Llama-2-7b-pruned50-retrained" -output_dir = "./output_trl_sft_test_7b_gsm8k_sft_data" -model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype="auto") -tokenizer = AutoTokenizer.from_pretrained(model_path) -tokenizer.pad_token = tokenizer.eos_token - -# recipe for maintaining model sparsity during finetuning -recipe = """ -test_stage: - pruning_modifiers: - ConstantPruningModifier: - targets: ['re:.*q_proj.weight', 're:.*k_proj.weight', 're:.*v_proj.weight', - 're:.*o_proj.weight','re:.*gate_proj.weight', 're:.*up_proj.weight', - 're:.*down_proj.weight'] - start: 0 -""" - -# Load gsm8k using TRL dataset tools -dataset = load_dataset("gsm8k", "main", split="train") - - -def formatting_prompts_func(example): - output_texts = [] - for i in range(len(example["question"])): - text = f"Question: {example['question'][i]}\n Answer: {example['answer'][i]}" - output_texts.append(text) - return output_texts - - -response_template = "Answer:" -collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer) - -trl_sft_config_args = dict( - output_dir=output_dir, - num_train_epochs=0.6, - logging_steps=50, - gradient_checkpointing=True, - max_seq_length=512, -) 
-model_args = ModelArguments(model=model) - -# This step can be supplanted by fine tuning via integrated FT libraries such as Axolotl -trainer = SFTTrainer( - model=model, - processing_class=tokenizer, - recipe=recipe, - train_dataset=dataset, - formatting_func=formatting_prompts_func, - data_collator=collator, - trl_sft_config_args=trl_sft_config_args, - model_args=model_args, -) -trainer.train() diff --git a/examples/trl_mixin/ex_trl_distillation.py b/examples/trl_mixin/ex_trl_distillation.py deleted file mode 100644 index 421fa96f37..0000000000 --- a/examples/trl_mixin/ex_trl_distillation.py +++ /dev/null @@ -1,79 +0,0 @@ -# NOTE: Fine tuning can require more steps than is shown in the example -# See the Axolotl integration blog post for best fine tuning practices -# https://developers.redhat.com/articles/2025/06/17/axolotl-meets-llm-compressor-fast-sparse-open - -from sft_trainer import SFTTrainer -from transformers import AutoModelForCausalLM, AutoTokenizer - -from llmcompressor.args import DatasetArguments, ModelArguments -from llmcompressor.transformers import TextGenerationDataset - -model_path = "neuralmagic/Llama-2-7b-pruned50-retrained" -teacher_path = "neuralmagic/Llama-2-7b-gsm8k" -output_dir = "./output_trl_sft_test_7b_gsm8k" - -model = AutoModelForCausalLM.from_pretrained( - model_path, torch_dtype="auto", device_map="auto" -) -teacher = AutoModelForCausalLM.from_pretrained( - teacher_path, torch_dtype="auto", device_map="auto" -) - -tokenizer = AutoTokenizer.from_pretrained(model_path) -max_seq_length = 512 - -# Load gsm8k using SparseML dataset tools -dataset_args = DatasetArguments( - dataset="gsm8k", dataset_config_name="main", max_seq_length=max_seq_length -) -dataset_manager = TextGenerationDataset.load_from_registry( - dataset_args.dataset, - dataset_args=dataset_args, - split="train", - processor=tokenizer, -) -train_dataset = dataset_manager() -print(f"--> Training Set Length = {len(train_dataset)}") - -# recipe for maintaining model sparsity during finetuning -recipe = """ -test_stage: - pruning_modifiers: - ConstantPruningModifier: - targets: ['re:.*q_proj.weight', 're:.*k_proj.weight', 're:.*v_proj.weight', - 're:.*o_proj.weight', 're:.*gate_proj.weight', 're:.*up_proj.weight', - 're:.*down_proj.weight'] - start: 0 - OutputDistillationModifier: - targets: ['re:model.layers.\\d+$'] - comparison: "square_head" - start: 0 - orig_scale: 1.0 - distill_scale: 1.0 -""" - -trl_sft_config_args = dict( - output_dir=output_dir, - num_train_epochs=0.6, - logging_steps=50, - gradient_checkpointing=True, - bf16=True, - save_safetensors=False, # workaround for shared tensors - max_seq_length=max_seq_length, - packing=True, -) -model_args = ModelArguments(model=model, distill_teacher=teacher) - -# This step can be supplanted by fine tuning via integrated FT libraries such as Axolotl -trainer = SFTTrainer( - model=model, - teacher=teacher, - processing_class=tokenizer, - recipe=recipe, - train_dataset=train_dataset, - trl_sft_config_args=trl_sft_config_args, - dataset_args=dataset_args, - model_args=model_args, -) -trainer.train() -trainer.save_model(output_dir) diff --git a/examples/trl_mixin/sft_trainer.py b/examples/trl_mixin/sft_trainer.py deleted file mode 100644 index 5abb05f4ef..0000000000 --- a/examples/trl_mixin/sft_trainer.py +++ /dev/null @@ -1,22 +0,0 @@ -from typing import Dict, Optional - -from trl import SFTConfig as TRLSFTConfig -from trl import SFTTrainer as TRLSFTTrainer - -from llmcompressor.transformers.finetune.session_mixin import SessionManagerMixIn - 
-__all__ = ["SFTTrainer"] - - -class SFTTrainer(SessionManagerMixIn, TRLSFTTrainer): - def __init__(self, trl_sft_config_args: Optional[Dict] = None, *args, **kwargs): - if trl_sft_config_args is not None: - kwargs["args"] = TRLSFTConfig(**trl_sft_config_args) - super().__init__(*args, **kwargs) - - def _prepare_dataset(self, dataset, *args, **kwargs): - if "input_ids" in dataset.column_names: - # dataset is already tokenized, skip preprocessing - return dataset - - return super()._prepare_dataset(dataset, *args, **kwargs) diff --git a/src/llmcompressor/args/dataset_arguments.py b/src/llmcompressor/args/dataset_arguments.py index d94837b264..2618b90197 100644 --- a/src/llmcompressor/args/dataset_arguments.py +++ b/src/llmcompressor/args/dataset_arguments.py @@ -16,7 +16,7 @@ @dataclass class DVCDatasetArguments: """ - Arguments for training using DVC + Arguments for calibration using DVC """ dvc_data_repository: str | None = field( @@ -28,7 +28,7 @@ class DVCDatasetArguments: @dataclass class CustomDatasetArguments(DVCDatasetArguments): """ - Arguments for training using custom datasets + Arguments for calibration using custom datasets """ dataset_path: str | None = field( @@ -78,8 +78,8 @@ class CustomDatasetArguments(DVCDatasetArguments): @dataclass class DatasetArguments(CustomDatasetArguments): """ - Arguments pertaining to what data we are going to input our model for - calibration, training + Arguments pertaining to what data we are going to use for + calibration Using `HfArgumentParser` we can turn this class into argparse arguments to be able to specify them on the command line @@ -152,13 +152,6 @@ class DatasetArguments(CustomDatasetArguments): "in the batch (which can be faster on GPU but will be slower on TPU)." }, ) - max_train_samples: int | None = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number " - "of training examples to this value if set." - }, - ) min_tokens_per_module: float | None = field( default=None, metadata={ diff --git a/src/llmcompressor/args/training_arguments.py b/src/llmcompressor/args/training_arguments.py deleted file mode 100644 index b5fb508e73..0000000000 --- a/src/llmcompressor/args/training_arguments.py +++ /dev/null @@ -1,44 +0,0 @@ -""" -Training argument classes for LLM compression workflows. - -This module defines dataclass-based argument containers for configuring -training and one-shot calibration workflows. Extends HuggingFace's -TrainingArguments with additional parameters specific to compression and -stage-based execution. -""" - -from dataclasses import dataclass, field - -from transformers import TrainingArguments as HFTrainingArgs - -__all__ = [ - "TrainingArguments", -] - - -@dataclass -class TrainingArguments(HFTrainingArgs): - """ - Training arguments specific to LLM Compressor Transformers workflow using - HFTrainingArgs as base class - - """ - - do_oneshot: bool | None = field( - default=False, - metadata={"help": "Whether to run one-shot calibration in stages"}, - ) - run_stages: bool | None = field( - default=False, metadata={"help": "Whether to trigger recipe stage by stage"} - ) - output_dir: str = field( - default="./output", - metadata={ - "help": "The output directory where the model safetensors, " - "recipe, config, and optionally checkpoints will be written." 
- }, - ) - - @property - def place_model_on_device(self): - return False diff --git a/src/llmcompressor/entrypoints/__init__.py b/src/llmcompressor/entrypoints/__init__.py index 5f1ba6b9c7..b6c5e94ba2 100644 --- a/src/llmcompressor/entrypoints/__init__.py +++ b/src/llmcompressor/entrypoints/__init__.py @@ -8,6 +8,5 @@ """ from .oneshot import Oneshot, oneshot -from .train import train from .model_free import model_free_ptq from .utils import post_process, pre_process diff --git a/src/llmcompressor/entrypoints/train.py b/src/llmcompressor/entrypoints/train.py deleted file mode 100644 index d5b9ed951e..0000000000 --- a/src/llmcompressor/entrypoints/train.py +++ /dev/null @@ -1,146 +0,0 @@ -""" -Training entrypoint for fine-tuning models with compression support. - -Provides the main training entry point that supports both vanilla -fine-tuning and compression-aware training workflows. Integrates with -HuggingFace transformers and supports knowledge distillation, pruning, -and quantization during the training process. -""" - -import math -import os - -from compressed_tensors.utils import deprecated -from loguru import logger -from transformers import PreTrainedModel - -from llmcompressor.args import parse_args -from llmcompressor.core.session_functions import active_session -from llmcompressor.datasets.utils import get_processed_dataset -from llmcompressor.transformers.finetune.trainer import Trainer -from llmcompressor.utils.dev import dispatch_for_generation - -from .utils import post_process, pre_process - - -@deprecated( - message=( - "Training support will be removed in future releases. Please use " - "the llmcompressor Axolotl integration for fine-tuning " - "https://developers.redhat.com/articles/2025/06/17/axolotl-meets-llm-compressor-fast-sparse-open" # noqa: E501 - ) -) -def train(**kwargs) -> PreTrainedModel: - """ - Fine-tuning entrypoint that supports vanilla fine-tuning and - knowledge distillation for compressed model using `oneshot`. - - - This entrypoint is responsible the entire fine-tuning lifecycle, including - preprocessing (model and tokenizer/processor initialization), fine-tuning, - and postprocessing (saving outputs). The intructions for fine-tuning compressed - model can be specified by using a recipe. - - - **Input Keyword Arguments:** - `kwargs` are parsed into: - - `model_args`: Arguments for loading and configuring a pretrained model - (e.g., `AutoModelForCausalLM`). - - `dataset_args`: Arguments for dataset-related configurations, such as - calibration dataloaders. - - `recipe_args`: Arguments for defining and configuring recipes that specify - optimization actions. - - `training_args`: rguments for defining and configuring training parameters - - Parsers are defined in `src/llmcompressor/args/`. - - - **Lifecycle Overview:** - The fine-tuning lifecycle consists of three steps: - 1. **Preprocessing**: - - Instantiates a pretrained model and tokenizer/processor. - - Ensures input and output embedding layers are untied if they share - tensors. - - Patches the model to include additional functionality for saving with - quantization configurations. - 2. **Training**: - - Finetunes the model using a global `CompressionSession` and applies - recipe-defined modifiers (e.g., `ConstantPruningModifier`, - `OutputDistillationModifier`) - 3. **Postprocessing**: - - Saves the model, tokenizer/processor, and configuration to the specified - `output_dir`. 
- - - **Usage:** - ```python - train(model=model, recipe=recipe, dataset=dataset) - - ``` - - """ - model_args, dataset_args, recipe_args, training_args, output_dir = parse_args( - include_training_args=True, **kwargs - ) - - pre_process(model_args, dataset_args, output_dir) - dispatch_for_generation(model_args.model) # train is dispatched same as generation - - processed_dataset = get_processed_dataset( - dataset_args=dataset_args, - processor=model_args.processor, - ) - training_dataset = processed_dataset.get("train") - - # create output dir for stages - original_output_dir = output_dir = training_args.output_dir - if all([output_dir, recipe_args, getattr(recipe_args, "stage", None)]): - output_dir = os.path.join(original_output_dir, recipe_args.stage) - if not os.path.exists(output_dir): - os.makedirs(output_dir) - # update output dir in training args - logger.info( - f"Stage detected for training. Updating output dir to: {output_dir}" - ) - training_args.output_dir = output_dir - - trainer = Trainer( - model=model_args.model, - teacher=model_args.distill_teacher, - recipe=recipe_args.recipe, - recipe_args=recipe_args.recipe_args, - args=training_args, - model_args=model_args, - dataset_args=dataset_args, - train_dataset=training_dataset, - processing_class=model_args.processor, - data_collator=dataset_args.data_collator, - ) - - checkpoint = None - if training_args.resume_from_checkpoint is not None: - checkpoint = training_args.resume_from_checkpoint - - logger.info("*** Train ***") - - session = active_session() - session.reset() - train_result = trainer.train( - resume_from_checkpoint=checkpoint, - stage=recipe_args.stage, - ) - - # return output - metrics = train_result.metrics - metrics["train_samples"] = len(training_dataset) - metrics["perplexity"] = math.exp(metrics["train_loss"]) - trainer.log_metrics("train", metrics) - trainer.save_metrics("train", metrics) - - # this includes saving the state, optimizer and scheduler - # TODO: support all save args, not just skip_sparsity_compression_stats - trainer.save_model( - output_dir=training_args.output_dir, skip_sparsity_compression_stats=False - ) - - post_process(recipe_args=recipe_args) - training_args.output_dir = original_output_dir - - return model_args.model diff --git a/src/llmcompressor/transformers/__init__.py b/src/llmcompressor/transformers/__init__.py index fd751a6a26..2e018413ac 100644 --- a/src/llmcompressor/transformers/__init__.py +++ b/src/llmcompressor/transformers/__init__.py @@ -7,3 +7,4 @@ # (import order matters for circular import avoidance) from .utils import * from .finetune import * +from .data import TextGenerationDataset diff --git a/src/llmcompressor/transformers/finetune/data/__init__.py b/src/llmcompressor/transformers/data/__init__.py similarity index 100% rename from src/llmcompressor/transformers/finetune/data/__init__.py rename to src/llmcompressor/transformers/data/__init__.py diff --git a/src/llmcompressor/transformers/finetune/data/base.py b/src/llmcompressor/transformers/data/base.py similarity index 100% rename from src/llmcompressor/transformers/finetune/data/base.py rename to src/llmcompressor/transformers/data/base.py diff --git a/src/llmcompressor/transformers/finetune/data/c4.py b/src/llmcompressor/transformers/data/c4.py similarity index 100% rename from src/llmcompressor/transformers/finetune/data/c4.py rename to src/llmcompressor/transformers/data/c4.py diff --git a/src/llmcompressor/transformers/finetune/data/cnn_dailymail.py b/src/llmcompressor/transformers/data/cnn_dailymail.py 
similarity index 100% rename from src/llmcompressor/transformers/finetune/data/cnn_dailymail.py rename to src/llmcompressor/transformers/data/cnn_dailymail.py diff --git a/src/llmcompressor/transformers/finetune/data/custom.py b/src/llmcompressor/transformers/data/custom.py similarity index 100% rename from src/llmcompressor/transformers/finetune/data/custom.py rename to src/llmcompressor/transformers/data/custom.py diff --git a/src/llmcompressor/transformers/finetune/data/data_helpers.py b/src/llmcompressor/transformers/data/data_helpers.py similarity index 100% rename from src/llmcompressor/transformers/finetune/data/data_helpers.py rename to src/llmcompressor/transformers/data/data_helpers.py diff --git a/src/llmcompressor/transformers/finetune/data/evolcodealpaca.py b/src/llmcompressor/transformers/data/evolcodealpaca.py similarity index 100% rename from src/llmcompressor/transformers/finetune/data/evolcodealpaca.py rename to src/llmcompressor/transformers/data/evolcodealpaca.py diff --git a/src/llmcompressor/transformers/finetune/data/flickr_30k.py b/src/llmcompressor/transformers/data/flickr_30k.py similarity index 100% rename from src/llmcompressor/transformers/finetune/data/flickr_30k.py rename to src/llmcompressor/transformers/data/flickr_30k.py diff --git a/src/llmcompressor/transformers/finetune/data/gsm8k.py b/src/llmcompressor/transformers/data/gsm8k.py similarity index 100% rename from src/llmcompressor/transformers/finetune/data/gsm8k.py rename to src/llmcompressor/transformers/data/gsm8k.py diff --git a/src/llmcompressor/transformers/finetune/data/open_platypus.py b/src/llmcompressor/transformers/data/open_platypus.py similarity index 100% rename from src/llmcompressor/transformers/finetune/data/open_platypus.py rename to src/llmcompressor/transformers/data/open_platypus.py diff --git a/src/llmcompressor/transformers/finetune/data/peoples_speech.py b/src/llmcompressor/transformers/data/peoples_speech.py similarity index 100% rename from src/llmcompressor/transformers/finetune/data/peoples_speech.py rename to src/llmcompressor/transformers/data/peoples_speech.py diff --git a/src/llmcompressor/transformers/finetune/data/ultrachat_200k.py b/src/llmcompressor/transformers/data/ultrachat_200k.py similarity index 100% rename from src/llmcompressor/transformers/finetune/data/ultrachat_200k.py rename to src/llmcompressor/transformers/data/ultrachat_200k.py diff --git a/src/llmcompressor/transformers/finetune/data/wikitext.py b/src/llmcompressor/transformers/data/wikitext.py similarity index 100% rename from src/llmcompressor/transformers/finetune/data/wikitext.py rename to src/llmcompressor/transformers/data/wikitext.py diff --git a/src/llmcompressor/transformers/finetune/README.md b/src/llmcompressor/transformers/finetune/README.md deleted file mode 100644 index f677cfd0a3..0000000000 --- a/src/llmcompressor/transformers/finetune/README.md +++ /dev/null @@ -1,85 +0,0 @@ -# Sparse Finetuning - -## Launching from Python - -```python -from llmcompressor import train - -model = "./sparsegpt_deployment" -teacher_model = "Xenova/llama2.c-stories15M" -dataset_name = "open_platypus" -concatenate_data = False -output_dir = "./output_finetune" -recipe = "test_trainer_recipe.yaml" -num_train_epochs=2 -overwrite_output_dir = True -splits = { - "train": "train[:50%]", -} - -train( - model=model, - distill_teacher=teacher_model, - dataset=dataset_name, - output_dir=output_dir, - recipe=recipe, - num_train_epochs=num_train_epochs, - overwrite_output_dir=overwrite_output_dir, - concatenate_data = 
concatenate_data, - splits = splits -) -``` - -## Additional Configuration - -Finetuning arguments are split up into 3 groups: - -* ModelArguments: `src/llmcompressor/args/model_arguments.py` -* TrainingArguments: `src/llmcompressor/args/training_arguments.py` -* DatasetArguments: `src/llmcompressor/args/dataset_arguments.py` -* RecipeArguments: `src/llmcompressor/args/recipe_arguments.py` - - -## Running Multi-Stage Recipes - -A recipe can be run stage-by-stage by setting `run_stages` to `True` or calling the -`llmcompressor.transformers.apply/compress` pathways. Each stage in the recipe should have -a `run_type` attribute set to either `oneshot` or `train` when running in sequential -mode. - -See [example_alternating_recipe.yaml](../../../../examples/finetuning/example_alternating_recipe.yaml) for an example -of a staged recipe for Llama. - -test_multi.py -```python -from llmcompressor.transformers import apply -from transformers import AutoModelForCausalLM - -model = "../ml-experiments/nlg-text_generation/llama_pretrain-llama_7b-base/dense/training" - -dataset_name = "open_platypus" -concatenate_data = False -run_stages=True -output_dir = "./output_finetune_multi" -recipe = "example_alternating_recipe.yaml" -num_train_epochs=1 -overwrite_output_dir = True -splits = { - "train": "train[:95%]", - "calibration": "train[95%:100%]" -} - -apply( - model_name_or_path=model, - dataset_name=dataset_name, - run_stages=run_stages, - output_dir=output_dir, - recipe=recipe, - num_train_epochs=num_train_epochs, - overwrite_output_dir=overwrite_output_dir, - concatenate_data = concatenate_data, - remove_unused_columns = False, - splits = splits -) - -``` \ No newline at end of file diff --git a/src/llmcompressor/transformers/finetune/__init__.py b/src/llmcompressor/transformers/finetune/__init__.py deleted file mode 100644 index 4d76c27542..0000000000 --- a/src/llmcompressor/transformers/finetune/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# ruff: noqa - -from .data import TextGenerationDataset -from .session_mixin import SessionManagerMixIn diff --git a/src/llmcompressor/transformers/finetune/callbacks.py b/src/llmcompressor/transformers/finetune/callbacks.py deleted file mode 100644 index daed32057e..0000000000 --- a/src/llmcompressor/transformers/finetune/callbacks.py +++ /dev/null @@ -1,121 +0,0 @@ -""" -Training callbacks for compression-aware fine-tuning workflows. - -This module provides custom trainer callbacks that integrate compression -session management with HuggingFace training loops. Handles precision -control, training loop monitoring, and compression lifecycle events -during model fine-tuning. -""" - -import math - -from transformers import TrainerCallback, TrainerControl, TrainingArguments -from transformers.trainer_callback import TrainerState - -from llmcompressor.core import active_session -from llmcompressor.core import callbacks as session_callbacks - -__all__ = [ - "DisableHalfPrecisionCallback", - "TrainingLoopCallbacks", -] - - -class TrainingLoopCallbacks(TrainerCallback): - """ - TrainerCallback for triggering CompressionSession callbacks in the training loop. - Used to update the model reference(for running with FSDP) and trigger the post- - optim callbacks in each modifier. 
- - :param trainer: LLM Compressor trainer that will call back into this object - :param args: args to be passed to base TrainerCallback - :param kwargs: key word arguments to be passed to base TrainerCallback - """ - - def __init__(self, trainer, *args, **kwargs): - super().__init__(*args, **kwargs) - self.trainer = trainer - - def on_train_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - """ - Event called at the beginning of training. Update the session reference to the - model, as it will have changed to a wrapper if FSDP is enabled - """ - super().on_train_begin(args, state, control, **kwargs) - session = active_session() - session.state.model = self.trainer.model - - def on_step_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - """ - Event called at the end of a training step. If using gradient accumulation, - one training step might take several inputs. - - Triggers optimizer post_step and batch_end in the active CompressionSession - """ - super().on_step_end(args, state, control, **kwargs) - session_callbacks.optim_post_step() - session_callbacks.batch_end() - - def on_substep_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - """ - Event called at the end of an substep during gradient accumulation. - - Triggers optimizer post_step and batch_end in the active CompressionSession - """ - super().on_substep_end(args, state, control, **kwargs) - session_callbacks.optim_post_step() - session_callbacks.batch_end() - - -class DisableHalfPrecisionCallback(TrainerCallback): - """ - TrainerCallback for disabling FP16 training before QAT training begins - - :param trainer: LLM Compressor trainer that will call back into this object - :param args: args to be passed to base TrainerCallback - :param kwargs: key word arguments to be passed to base TrainerCallback - """ - - def __init__(self, trainer, *args, **kwargs): - super().__init__(*args, **kwargs) - self.trainer = trainer - self.on_begin_called = False - self.quant_start_epoch = math.inf - - def qat_active(self) -> bool: - """ - :return: True if a quantization modifier is active in the current session - """ - session = active_session() - return session.state.model.qat_active() - - def on_epoch_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - """ - Event called at the beginning of an epoch. 
- """ - super().on_epoch_begin(args, state, control, **kwargs) - self.on_begin_called = True diff --git a/src/llmcompressor/transformers/finetune/session_mixin.py b/src/llmcompressor/transformers/finetune/session_mixin.py deleted file mode 100644 index e344705d71..0000000000 --- a/src/llmcompressor/transformers/finetune/session_mixin.py +++ /dev/null @@ -1,537 +0,0 @@ -import inspect -import math -import os -from dataclasses import asdict -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union - -import torch -from loguru import logger -from torch.nn import Module -from torch.utils.data import IterableDataset -from transformers.trainer_callback import TrainerState -from transformers.trainer_utils import get_last_checkpoint - -from llmcompressor.core import active_session, callbacks, create_session -from llmcompressor.metrics import LoggerManager -from llmcompressor.modifiers.distillation.utils.pytorch.model_wrapper import ( - KDModelWrapper, -) -from llmcompressor.pytorch.model_load.helpers import get_session_model, save_checkpoint -from llmcompressor.pytorch.utils import ModuleSparsificationInfo -from llmcompressor.transformers.finetune.callbacks import ( - DisableHalfPrecisionCallback, - TrainingLoopCallbacks, -) -from llmcompressor.utils.fsdp.context import summon_full_params_context -from llmcompressor.utils.pytorch import qat_active - -if TYPE_CHECKING: - from llmcompressor.args import DatasetArguments, ModelArguments - -__all__ = [ - "SessionManagerMixIn", -] - -TRAINER_STATE_NAME = "trainer_state.json" -METADATA_ARGS = [ - "per_device_train_batch_size", - "max_seq_length", - "save_safetensors", - "fp16", -] - - -class SessionManagerMixIn: - """ - Mix-In class to extend the Hugging Face Trainer class to support LLM Compressor - recipes for one-shot and finetuning flows. - - :param recipe: path to recipe file to apply during training - :param recipe_args: additional kwargs to use for evaluating recipe - :param dataset_args: kwargs for configuring dataset loading - :param teacher: optional teacher model to use for distillation - """ - - def __init__( - self, - recipe: str, - model_args: "ModelArguments", - dataset_args: Optional["DatasetArguments"] = None, - teacher: Optional[Union[Module, str]] = None, - recipe_args: Optional[Union[Dict[str, Any], str]] = None, - **kwargs, - ): - self.recipe = recipe - self.recipe_args = recipe_args - self.model_args = model_args - self.teacher = teacher - - # parse training and metadata args - training_args = kwargs.get("args") - - self.metadata = None - if training_args is not None: - # trl_sft_trainer pathway. Both training_args and dataset_args - # have `max_seq_length` which causes collision error. This is the - # only shared parameter, where training arg is `TRLSFTConfig` that - # inherits HuggingFace's `TrainingArguments` - training_args_dict = training_args.to_dict() - if "max_seq_length" in training_args_dict: - training_args_dict["training_args_max_seq_length"] = ( - training_args_dict.pop("max_seq_length") - ) - logger.warning( - "Detected `max_seq_length` in both dataset_args ", - "and training_args. This is expected for TRL in distillation. 
", - "Updating metadata to `training_args_max_seq_length`", - ) - - self.metadata = self._extract_metadata( - metadata_args=METADATA_ARGS, - training_args_dict=training_args_dict, - dataset_args_dict=asdict(dataset_args) if dataset_args else {}, - ) - - # setup metrics and session - self.logger_manager = LoggerManager(log_python=False) - create_session() - - # call Trainer initialization - super().__init__(**kwargs) - self.accelerator.wait_for_everyone() - - # setup callbacks and loss - self.optim_callbacks = TrainingLoopCallbacks(self) - self.callback_handler.add_callback(self.optim_callbacks) - self.callback_disable_fp16 = DisableHalfPrecisionCallback(self) - self.callback_handler.add_callback(self.callback_disable_fp16) - self.criterion = torch.nn.CrossEntropyLoss() - - model_signature = inspect.signature(self.model.forward) - self._signature_columns = list(model_signature.parameters.keys()) - - if self.teacher is not None and teacher not in ("disable", "self"): - teacher_signature = inspect.signature(self.teacher.forward) - self._teacher_signature_columns = list(teacher_signature.parameters.keys()) - else: - self._teacher_signature_columns = None - - if self.is_fsdp_enabled: - self._prepare_model_for_fsdp() - - if dataset_args is not None: - self.min_tokens_per_module = dataset_args.min_tokens_per_module - - def initialize_session( - self, - epoch: float, - checkpoint: Optional[str] = None, - stage: Optional[str] = None, - ): - """ - Initialize the CompressionSession from the specified epoch, evaluates the recipe - and initialized the modifiers for the training session - - :param epoch: Epoch to initialize session from, usually 0 unless loading - from a checkpoint - :param checkpoint: Optional checkpoint to initialize from to continue training - :param stage: Optional stage of recipe to run, or None to run all stages - """ - session = active_session() - if session.lifecycle.initialized_ or session.lifecycle.finalized: - return False - - train_data = self.get_train_dataloader() - - self.accelerator.wait_for_everyone() - with summon_full_params_context(self.model, offload_to_cpu=True): - active_session().initialize( - recipe=self.recipe, - recipe_stage=stage, - recipe_args=self.recipe_args, - model=self.model, - teacher_model=self.teacher, # TODO: what about for self/disable? - train_data=train_data, - start=epoch, - copy_data=False, - attach_optim_callbacks=True, - fsdp_active=self.is_fsdp_enabled, - metadata=self.metadata, - ) - - self.accelerator.wait_for_everyone() - model = get_session_model() - self.model_wrapped = self.model = model - - if self.recipe is None: - logger.warning( - "No training recipe was provided, finetuning will be run " - "without event callbacks to LLM Compressor. To supply a recipe " - "pass a yaml file or string to the `recipe` argument." 
- ) - - if hasattr(torch, "xpu") and torch.xpu.is_available(): - torch.xpu.empty_cache() - else: - torch.cuda.empty_cache() - - def finalize_session(self): - """ - Wrap up training by finalizing all modifiers initialized in the current session - """ - session = active_session() - if not session.lifecycle.initialized_ or session.lifecycle.finalized: - return False - - with summon_full_params_context(self.model, offload_to_cpu=True): - # in order to update each layer we need to gathers all its parameters - active_session().finalize() - logger.info("Finalized LLM Compressor session") - model = get_session_model() - self.model = model - if hasattr(torch, "xpu") and torch.xpu.is_available(): - torch.xpu.empty_cache() - else: - torch.cuda.empty_cache() - - def create_optimizer(self): - """ - Override the optimizer to apply and update the recipe while training. - create_optimizer must exist in the parent class and should set - self.optimizer to the optimizer state and optionally set self.scaler - if using amp. - """ - - self._check_super_defined("create_optimizer") - super().create_optimizer() - - # n_gpu handled internally by dataloader - total_batch_size = ( - self.args.per_device_train_batch_size - * self.args.gradient_accumulation_steps - ) - - if isinstance(self.train_dataset, IterableDataset): - logger.warning( - "Training is being run with a streamed dataset, " - "steps_per_epoch cannot be determined and will default to " - "1. LLM Compressor modifiers utilizing this statistic may not " - "behave as expected. " - ) - self.total_steps_per_epoch = 1 - else: - self.total_steps_per_epoch = math.ceil( - len(self.train_dataset) / total_batch_size - ) - - active_session().initialize( - optimizer=self.optimizer, steps_per_epoch=self.total_steps_per_epoch - ) - - return self.optimizer - - def create_scheduler( - self, num_training_steps: int, optimizer: torch.optim.Optimizer = None - ): - """ - Create an LR scheduler to work with the applied recipes. This is a placeholder - that just calls the super method, but would be expanded upon if we ever - implement a LearningRateModifier. - - :param num_training_steps: the total number of training steps - :param optimizer: pre-initialized optimizer - """ - - # TODO: we don't currently have a LR scheduler in the new modifier framework - self._check_super_defined("create_scheduler") - return super().create_scheduler( - num_training_steps=num_training_steps, optimizer=optimizer - ) - - def training_step( - self, - model: torch.nn.Module, - inputs: Dict[str, Union[torch.Tensor, Any]], - num_items_in_batch: Optional[int] = None, - ) -> torch.Tensor: - """ - Overrides the Trainer's training step to trigger the batch_start callback to - the modifiers, then calls the parent function. 
- - :param model: the model to compute the loss for - :param inputs: the inputs to pass through the model for calculating the loss - :return: output of the model - """ - self._check_super_defined("training_step") - - callbacks.batch_start(batch_data=inputs, global_step=self.state.epoch) - model_outputs = super().training_step( - model=model, inputs=inputs, num_items_in_batch=num_items_in_batch - ) - - return model_outputs - - def compute_loss( - self, - model: Module, - inputs: Dict[str, Any], - return_outputs: bool = False, - num_items_in_batch: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, Tuple[torch.Tensor, Any]]: - """ - Override for the compute_loss to factor trigger callbacks and filter columns - - :param model: the model to compute the loss for - :param inputs: the inputs to pass through the model for calculating the loss - :param return_outputs: True to return the outputs with the loss, - False otherwise - :param num_items_in_batch: the number of items which contribute to loss - :return: the resulting loss if not return_outputs, otherwise a tuple - containing the loss and the model's outputs - """ - self._check_super_defined("compute_loss") - - # TODO: do we need these model signature columns? - inputs = {k: inputs[k] for k in inputs if k in self._signature_columns} - loss = super().compute_loss( - model=model, - inputs=inputs, - return_outputs=return_outputs, - num_items_in_batch=num_items_in_batch, - ) - - # take the mean across multiple GPUs - # this is done outside the compute_loss function in the parent, replicating it - # here for LLM Compressor logging and distillation - loss = loss.mean() - - # Log step-wise loss and perplexity, for llama-recipes comparison - # we want this before distillation loss so perplexity isn't thrown off - do_log = self.state.global_step % self.args.logging_steps == 0 - if do_log: - log = {} - log["step_loss"] = loss.item() - log["perplexity"] = torch.exp(loss).item() - - if active_session().lifecycle.initialized_: - state = callbacks.loss_calculated(loss=loss) - if state and state.loss is not None: - loss = state.loss - if do_log: - log["distill_step_loss"] = loss.item() - log["step_loss"] - callbacks.optim_pre_step() - - if do_log: - self.log(log) - - return loss - - def train(self, *args, stage: Optional[str] = None, **kwargs): - """ - Run a sparsification training cycle. Runs initialization for the sparse session - before calling super().train() and finalization of the session after. - - Logs sparsification details for the trained model. 
- - :param args: positional args to pass to super().train() - :param stage: Optional stage of recipe to run, or None to run all stages - :param kwargs: keyword args to pass to super().train() - :return: the output from super.train() - """ - - # lifecycle - checkpoint, epoch = self._calculate_checkpoint_info(kwargs) - self.initialize_session(epoch=epoch, checkpoint=checkpoint, stage=stage) - - # do not save checkpoints as compressed - original_save_compressed = self.model_args.save_compressed - self.model_args.save_compressed = False - - # train with accelerator - self.accelerator.wait_for_everyone() - output = super().train(*args, **kwargs) - self.accelerator.wait_for_everyone() - - # restore original setting for saving final model - self.model_args.save_compressed = original_save_compressed - - # lifecycle - self.finalize_session() - self.accelerator.wait_for_everyone() - - # log model sparsity - self.maybe_log_model_sparsification() - self.accelerator.wait_for_everyone() - - return output - - # TODO: support all save args, not just skip_sparsity_compression_stats - def save_model( - self, - output_dir: str, - _internal_call: bool = False, - skip_sparsity_compression_stats: Optional[bool] = True, - ): - """ - Override of the save_model function and expects it to exist in the parent. - Calls into super() to save the model and additionally saves any recipes - that were used with the model within the model folder. - - :param output_dir: the path to save the recipes into - :param _internal_call: True if this is an internal call from - the trainer in super(). Called from - self.save_model(output_dir, _internal_call=True) - in transformers/trainer/Trainer::_save_checkpoint - - """ - if active_session() is None: - logger.warning( - "No active session found, skipping saving of recipes and model." - ) - return - - # knowledge distillation requires making wrappers transparent during - if isinstance(self.model, KDModelWrapper): - self.model.prepare_for_save() # TODO: move to finalize - - # save checkpoint - # note that skip_sparsity_compression_stats - # is True by default to avoid high runtime cost - self.save_state() - if self.accelerator.is_main_process: - processor = getattr(self, "processing_class", self.tokenizer) - # TODO: need to port over all saving parameters so that all - # checkpoints are saved in the same way - save_checkpoint( - output_dir, - model=self.model, - processor=processor, - save_safetensors=self.args.save_safetensors, - save_compressed=self.model_args.save_compressed, - skip_sparsity_compression_stats=skip_sparsity_compression_stats, - ) - self.accelerator.wait_for_everyone() - - if isinstance(self.model, KDModelWrapper): - self.model.finish_save() - - def maybe_log_model_sparsification(self): - """ - Log info on model sparsity and quantization if possible. 
Only print logs on the - main process, and avoid logging for quantized FSDP models - """ - with summon_full_params_context(self.model, offload_to_cpu=True): - # offload to avoid OOM errors - if not self.accelerator.is_main_process: - # only calculate stats rank0 GPU - return - if self.is_fsdp_enabled and qat_active(self.model): - # due to state dict changes we can't log sparsity info with quantized - # models in FSDP - return - - self.log_model_sparsification() - - def log_model_sparsification(self): - """ - Log the current model sparsification info including pruned and quantized states - """ - sparsification_info = ModuleSparsificationInfo(self.model) - - logger.info( - f"Sparsification info for {type(self.model).__name__}: " - f"{sparsification_info.params_total} total params. " - ) - sparsity_percent_formatted = "{:.2f}".format( - sparsification_info.params_sparse_percent - ) - logger.info( - f"There are {sparsification_info.params_total} prunable " - f"params which have {sparsity_percent_formatted}% " - "avg sparsity." - ) - - quant_percent_formatted = "{:.2f}".format( - sparsification_info.params_quantized_percent - ) - logger.info( - f"There are {sparsification_info.params_total} quantizable " - f"params, with a quantization percentage of " - f"{quant_percent_formatted}%." - ) - - def _prepare_model_for_fsdp(self): - """ - Sets up FSDP ahead of time so we can run one-shot in FSDP mode - """ - self.model.to("cpu") - self.model = self.accelerator.prepare(self.model) - self.accelerator.wait_for_everyone() - - if self.teacher is not None: - self.teacher.to("cpu") - for n, p in self.teacher.named_parameters(): - p.requires_grad = False - self.teacher = self.accelerator.prepare(self.teacher) - self.teacher.eval() - self.accelerator.wait_for_everyone() - - def _extract_metadata( - self, - metadata_args: List[str], - training_args_dict: Dict[str, Any], - dataset_args_dict: Dict[str, Any], - ) -> Dict[str, Any]: - metadata = {} - if not training_args_dict.keys().isdisjoint(dataset_args_dict.keys()): - raise ValueError( - "Found common keys in `training_args` and `data args`. " - "This is prohibitive and may lead to undesired behavior." - ) - - args_dict = {**training_args_dict, **dataset_args_dict} - - for arg in metadata_args: - if arg not in args_dict.keys(): - logger.warning( - f"Required metadata argument {arg} was not found " - f"in the training arguments. Setting {arg} to None." - ) - metadata[arg] = None - else: - metadata[arg] = args_dict[arg] - - return metadata - - def _check_super_defined(self, func: str): - if not hasattr(super(), func): - raise NotImplementedError( - f"The super class for SessionManagerMixIn must define a {func} function" - ) - - def _calculate_checkpoint_info(self, kwargs) -> Tuple[Optional[str], float]: - """ - If resuming from checkpoint is set, get checkpoint and epoch to resume from - """ - checkpoint = None - epoch = 0.0 - - if not kwargs or "resume_from_checkpoint" not in kwargs: - logger.warning( - "resume_from_checkpoint not passed into LLM Compressor Trainer.train. " - "This will cause issues with restoring recipes when " - "running from a checkpoint." 
- ) - elif kwargs["resume_from_checkpoint"]: - if ( - isinstance(kwargs["resume_from_checkpoint"], bool) - and kwargs["resume_from_checkpoint"] - ): - checkpoint = get_last_checkpoint(self.args.output_dir) - else: - checkpoint = kwargs["resume_from_checkpoint"] - epoch = TrainerState.load_from_json( - os.path.join(checkpoint, TRAINER_STATE_NAME) - ).epoch - - return checkpoint, epoch diff --git a/src/llmcompressor/transformers/finetune/trainer.py b/src/llmcompressor/transformers/finetune/trainer.py deleted file mode 100644 index 6bb3a1739b..0000000000 --- a/src/llmcompressor/transformers/finetune/trainer.py +++ /dev/null @@ -1,18 +0,0 @@ -""" -Enhanced trainer class for fine-tuning with compression support. - -This module provides a Trainer class that extends HuggingFace's Trainer with -LLM compression session management capabilities. Integrates compression -workflows into the standard training loop for seamless model optimization -during fine-tuning. -""" - -from transformers import Trainer as HFTransformersTrainer - -from llmcompressor.transformers.finetune.session_mixin import SessionManagerMixIn - -__all__ = ["Trainer"] - - -class Trainer(SessionManagerMixIn, HFTransformersTrainer): - pass diff --git a/tests/llmcompressor/transformers/finetune/__init__.py b/tests/llmcompressor/transformers/data/__init__.py similarity index 100% rename from tests/llmcompressor/transformers/finetune/__init__.py rename to tests/llmcompressor/transformers/data/__init__.py diff --git a/tests/llmcompressor/transformers/finetune/data/conftest.py b/tests/llmcompressor/transformers/data/conftest.py similarity index 100% rename from tests/llmcompressor/transformers/finetune/data/conftest.py rename to tests/llmcompressor/transformers/data/conftest.py diff --git a/tests/llmcompressor/transformers/finetune/data/test_dataset_helpers.py b/tests/llmcompressor/transformers/data/test_dataset_helpers.py similarity index 100% rename from tests/llmcompressor/transformers/finetune/data/test_dataset_helpers.py rename to tests/llmcompressor/transformers/data/test_dataset_helpers.py diff --git a/tests/llmcompressor/transformers/finetune/data/test_dataset_loading.py b/tests/llmcompressor/transformers/data/test_dataset_loading.py similarity index 100% rename from tests/llmcompressor/transformers/finetune/data/test_dataset_loading.py rename to tests/llmcompressor/transformers/data/test_dataset_loading.py diff --git a/tests/llmcompressor/transformers/finetune/data/test_registry.py b/tests/llmcompressor/transformers/data/test_registry.py similarity index 100% rename from tests/llmcompressor/transformers/finetune/data/test_registry.py rename to tests/llmcompressor/transformers/data/test_registry.py diff --git a/tests/llmcompressor/transformers/finetune/data/__init__.py b/tests/llmcompressor/transformers/finetune/data/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/llmcompressor/transformers/finetune/finetune_custom/config1.yaml b/tests/llmcompressor/transformers/finetune/finetune_custom/config1.yaml deleted file mode 100644 index fd4d5f07ce..0000000000 --- a/tests/llmcompressor/transformers/finetune/finetune_custom/config1.yaml +++ /dev/null @@ -1,5 +0,0 @@ -cadence: "commit" -test_type: "sanity" -model: "nm-testing/tinysmokellama-3.2" -file_extension: json -num_train_epochs: 1 \ No newline at end of file diff --git a/tests/llmcompressor/transformers/finetune/finetune_custom/config2.yaml b/tests/llmcompressor/transformers/finetune/finetune_custom/config2.yaml deleted file mode 100644 index 
9a83729922..0000000000 --- a/tests/llmcompressor/transformers/finetune/finetune_custom/config2.yaml +++ /dev/null @@ -1,5 +0,0 @@ -cadence: "commit" -test_type: "sanity" -model: "nm-testing/tinysmokellama-3.2" -file_extension: csv -num_train_epochs: 1 \ No newline at end of file diff --git a/tests/llmcompressor/transformers/finetune/finetune_custom/gpu/gpu_config.yaml b/tests/llmcompressor/transformers/finetune/finetune_custom/gpu/gpu_config.yaml deleted file mode 100644 index 1828cc6ba3..0000000000 --- a/tests/llmcompressor/transformers/finetune/finetune_custom/gpu/gpu_config.yaml +++ /dev/null @@ -1,5 +0,0 @@ -cadence: "nightly" -test_type: "regression" -model: "neuralmagic/Llama-2-7b-ultrachat200k" -file_extension: json -num_train_epochs: 0.5 \ No newline at end of file diff --git a/tests/llmcompressor/transformers/finetune/finetune_generic/config1.yaml b/tests/llmcompressor/transformers/finetune/finetune_generic/config1.yaml deleted file mode 100644 index b7a7c87d87..0000000000 --- a/tests/llmcompressor/transformers/finetune/finetune_generic/config1.yaml +++ /dev/null @@ -1,4 +0,0 @@ -cadence: "nightly" -test_type: "regression" -model: "nm-testing/tinysmokellama-3.2" -dataset: open_platypus \ No newline at end of file diff --git a/tests/llmcompressor/transformers/finetune/finetune_oneshot_configs/config.yaml b/tests/llmcompressor/transformers/finetune/finetune_oneshot_configs/config.yaml deleted file mode 100644 index 48ae2741b9..0000000000 --- a/tests/llmcompressor/transformers/finetune/finetune_oneshot_configs/config.yaml +++ /dev/null @@ -1,8 +0,0 @@ -cadence: "commit" -test_type: "sanity" -model: "nm-testing/tinysmokellama-3.2" -dataset: wikitext -dataset_config_name: "wikitext-2-raw-v1" -recipe: "tests/llmcompressor/transformers/finetune/test_alternate_recipe.yaml" -num_train_epochs: 0.25 -concat_txt: False \ No newline at end of file diff --git a/tests/llmcompressor/transformers/finetune/finetune_oneshot_configs/gpu/gpu_config.yaml b/tests/llmcompressor/transformers/finetune/finetune_oneshot_configs/gpu/gpu_config.yaml deleted file mode 100644 index f81362ea1c..0000000000 --- a/tests/llmcompressor/transformers/finetune/finetune_oneshot_configs/gpu/gpu_config.yaml +++ /dev/null @@ -1,7 +0,0 @@ -cadence: "nightly" -test_type: "regression" -model: "neuralmagic/Llama-2-7b-ultrachat200k" -dataset: "ultrachat-200k" -recipe: "tests/llmcompressor/transformers/finetune/test_alternate_recipe.yaml" -num_train_epochs: 0.05 -concat_txt: False diff --git a/tests/llmcompressor/transformers/finetune/finetune_tokenizer/config1.yaml b/tests/llmcompressor/transformers/finetune/finetune_tokenizer/config1.yaml deleted file mode 100644 index 2b5999c3dc..0000000000 --- a/tests/llmcompressor/transformers/finetune/finetune_tokenizer/config1.yaml +++ /dev/null @@ -1,5 +0,0 @@ -cadence: "nightly" -test_type: "regression" -model: "TinyLlama/TinyLlama-1.1B-Chat-v1.0" -dataset_config_name: wikitext-2-raw-v1 -dataset: wikitext \ No newline at end of file diff --git a/tests/llmcompressor/transformers/finetune/test_alternate_recipe.yaml b/tests/llmcompressor/transformers/finetune/test_alternate_recipe.yaml deleted file mode 100644 index 96283cbfaf..0000000000 --- a/tests/llmcompressor/transformers/finetune/test_alternate_recipe.yaml +++ /dev/null @@ -1,22 +0,0 @@ -test_oneshot_stage: - obcq_modifiers: - SparseGPTModifier: - sparsity: 0.7 - block_size: 128 - dampening_frac: 0.01 - mask_structure: "0:0" - targets: ["Linear"] - ignore: ["re:.*lm_head"] -test_train_stage: - pruning_modifiers: - 
ConstantPruningModifier: - targets: [ - "re:.*self_attn.q_proj", - "re:.*self_attn.k_proj", - "re:.*self_attn.v_proj", - "re:.*self_attn.o_proj", - "re:.*mlp.down_proj", - "re:.*mlp.gate_proj", - "re:.*mlp.up_proj" - ] - start: 0 \ No newline at end of file diff --git a/tests/llmcompressor/transformers/finetune/test_finetune_no_recipe_custom_dataset.py b/tests/llmcompressor/transformers/finetune/test_finetune_no_recipe_custom_dataset.py deleted file mode 100644 index 85a0935ff9..0000000000 --- a/tests/llmcompressor/transformers/finetune/test_finetune_no_recipe_custom_dataset.py +++ /dev/null @@ -1,137 +0,0 @@ -import csv -import json -import os -import tempfile -from io import StringIO -from pathlib import Path - -import pytest -import torch -from transformers import AutoModelForCausalLM - -from llmcompressor import train -from tests.testing_utils import parse_params, requires_gpu - -CONFIGS_DIRECTORY = "tests/llmcompressor/transformers/finetune/finetune_custom" -GPU_CONFIGS_DIRECTORY = "tests/llmcompressor/transformers/finetune/finetune_custom/gpu" - - -def create_mock_file(extension, content, path, filename): - os.makedirs(path, exist_ok=True) - - if extension == "json": - mock_data = {"text": content} - mock_content = json.dumps(mock_data, indent=2) - - else: - fieldnames = ["text"] - mock_data = [{"text": content}] - csv_output = StringIO() - csv_writer = csv.DictWriter(csv_output, fieldnames=fieldnames) - csv_writer.writeheader() - csv_writer.writerows(mock_data) - mock_content = csv_output.getvalue() - - mock_filename = f"{filename}.{extension}" - mock_filepath = os.path.join(path, mock_filename) - - with open(mock_filepath, "w") as mock_file: - mock_file.write(mock_content) - - return mock_filepath # Return the file path - - -def create_mock_custom_dataset_folder_structure(tmp_dir_data, file_extension): - train_path = os.path.join(tmp_dir_data, "train") - test_path = os.path.join(tmp_dir_data, "test") - validate_path = os.path.join(tmp_dir_data, "validate") - - # create tmp mock data files - create_mock_file( - extension=file_extension, - content="text for train data 1", - path=train_path, - filename="data1", - ) - create_mock_file( - extension=file_extension, - content="text for train data 2", - path=train_path, - filename="data2", - ) - create_mock_file( - extension=file_extension, - content="text for test data 1", - path=test_path, - filename="data3", - ) - create_mock_file( - extension=file_extension, - content="text for validate data 1", - path=validate_path, - filename="data4", - ) - return True - - -def _test_finetune_wout_recipe_custom_dataset( - model, file_extension, num_train_epochs, output -): - dataset_path = Path(tempfile.mkdtemp()) - - created_success = create_mock_custom_dataset_folder_structure( - dataset_path, file_extension - ) - assert created_success - - def preprocessing_func(example): - example["text"] = "Review: " + example["text"] - return example - - concatenate_data = False - - train( - model=model, - dataset=file_extension, - output_dir=output, - recipe=None, - num_train_epochs=num_train_epochs, - concatenate_data=concatenate_data, - text_column="text", - dataset_path=dataset_path, - preprocessing_func=preprocessing_func, - precision="bfloat16", - bf16=True, - ) - - -@pytest.mark.integration -@pytest.mark.parametrize("config", parse_params(CONFIGS_DIRECTORY)) -def test_oneshot_then_finetune_small(config, tmp_path): - model = config["model"] - file_extension = config["file_extension"] - num_train_epochs = config["num_train_epochs"] - - output = tmp_path / 
"oneshot_output" - - _test_finetune_wout_recipe_custom_dataset( - model, file_extension, num_train_epochs, output - ) - - -@requires_gpu -@pytest.mark.integration -@pytest.mark.parametrize("config", parse_params(GPU_CONFIGS_DIRECTORY)) -def test_oneshot_then_finetune_gpu(config, tmp_path): - model = config["model"] - file_extension = config["file_extension"] - num_train_epochs = config["num_train_epochs"] - output = tmp_path / "oneshot_output" - - device = "cuda:0" - model = AutoModelForCausalLM.from_pretrained( - model, device_map=device, torch_dtype=torch.bfloat16 - ) - _test_finetune_wout_recipe_custom_dataset( - model, file_extension, num_train_epochs, output - ) diff --git a/tests/llmcompressor/transformers/finetune/test_finetune_recipe.yaml b/tests/llmcompressor/transformers/finetune/test_finetune_recipe.yaml deleted file mode 100644 index a0eb314988..0000000000 --- a/tests/llmcompressor/transformers/finetune/test_finetune_recipe.yaml +++ /dev/null @@ -1,19 +0,0 @@ -test_stage: - pruning_modifiers: - ConstantPruningModifier: - targets: [ - "re:.*self_attn.q_proj", - "re:.*self_attn.k_proj", - "re:.*self_attn.v_proj", - "re:.*self_attn.o_proj", - "re:.*mlp.gate_proj", - "re:.*mlp.up_proj" - ] - start: 0 - distillation_modifiers: - OutputDistillationModifier: - targets: ["re:model.layers.\\d+$"] - comparison: "square_head" - start: 0 - orig_scale: 1.0 - distill_scale: 1.0 \ No newline at end of file diff --git a/tests/llmcompressor/transformers/finetune/test_finetune_without_recipe.py b/tests/llmcompressor/transformers/finetune/test_finetune_without_recipe.py deleted file mode 100644 index e9901eb9fc..0000000000 --- a/tests/llmcompressor/transformers/finetune/test_finetune_without_recipe.py +++ /dev/null @@ -1,31 +0,0 @@ -import pytest - -from llmcompressor import train -from tests.testing_utils import parse_params, requires_gpu - -CONFIGS_DIRECTORY = "tests/llmcompressor/transformers/finetune/finetune_generic" - - -@pytest.mark.integration -@requires_gpu -@pytest.mark.parametrize("config", parse_params(CONFIGS_DIRECTORY)) -def test_finetune_without_recipe(config, tmp_path): - model = config["model"] - dataset = config["dataset"] - output = tmp_path / "finetune_output" - - recipe_str = None - - concatenate_data = False - max_steps = 50 - splits = "train" - - train( - model=model, - dataset=dataset, - output_dir=output, - recipe=recipe_str, - max_steps=max_steps, - concatenate_data=concatenate_data, - splits=splits, - ) diff --git a/tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune.py b/tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune.py deleted file mode 100644 index 32c8310332..0000000000 --- a/tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune.py +++ /dev/null @@ -1,122 +0,0 @@ -import os - -import pytest -import torch -from compressed_tensors.compressors import ModelCompressor -from transformers import AutoConfig, AutoModelForCausalLM - -from llmcompressor import oneshot, train -from llmcompressor.transformers.compression.compressed_tensors_utils import ( - get_model_compressor, -) -from tests.testing_utils import parse_params, requires_gpu - -CONFIGS_DIRECTORY = "tests/llmcompressor/transformers/finetune/finetune_oneshot_configs" -GPU_CONFIGS_DIRECTORY = ( - "tests/llmcompressor/transformers/finetune/finetune_oneshot_configs/gpu" -) - - -def _test_oneshot_and_finetune( - model, dataset, recipe, dataset_config_name, concat_txt, output, num_train_epochs -): - splits = {"train": "train[:5%]", "calibration": "train[5%:10%]"} - if 
dataset == "ultrachat-200k": - splits = {"train": "train_gen[:5%]", "calibration": "train_gen[5%:10%]"} - - oneshot_args = dict( - dataset=dataset, - splits=splits, - recipe=recipe, - num_calibration_samples=64, - dataset_config_name=dataset_config_name, - concatenate_data=concat_txt, - output_dir=output, - ) - - oneshot_model = oneshot( - model=model, - **oneshot_args, - stage="test_oneshot_stage", - ) - - compressor = get_model_compressor(model=oneshot_model, save_compressed=True) - if compressor is not None: - compressor.decompress_model(oneshot_model) - - train_args = dict( - num_train_epochs=num_train_epochs, - precision="bfloat16", - bf16=True, - ) - train( - model=oneshot_model, - **oneshot_args, - **train_args, - stage="test_train_stage", - ) - - config_sparse_applied = ModelCompressor.parse_sparsity_config( - AutoConfig.from_pretrained( - os.path.join(output, "test_oneshot_stage") - ).quantization_config - ) - config_finetune_applied = ModelCompressor.parse_sparsity_config( - AutoConfig.from_pretrained( - os.path.join(output, "test_train_stage") - ).quantization_config - ) - # model is first sparsified, then finetuned, both should have the same sparsity - assert config_sparse_applied["global_sparsity"] == pytest.approx( - config_finetune_applied["global_sparsity"], abs=1e-5 - ) - - -@pytest.mark.integration -@pytest.mark.parametrize("config", parse_params(CONFIGS_DIRECTORY)) -def test_oneshot_and_finetune_small(config, tmp_path): - model = config["model"] - dataset = config["dataset"] - recipe = config["recipe"] - dataset_config_name = config.get("dataset_config_name") - num_train_epochs = config["num_train_epochs"] - concat_txt = config["concat_txt"] - output = tmp_path / "finetune_output" - - _test_oneshot_and_finetune( - model, - dataset, - recipe, - dataset_config_name, - concat_txt, - output, - num_train_epochs, - ) - - -@requires_gpu -@pytest.mark.integration -@pytest.mark.parametrize("config", parse_params(GPU_CONFIGS_DIRECTORY)) -def test_oneshot_and_finetune_gpu(config, tmp_path): - model = config["model"] - dataset = config["dataset"] - recipe = config["recipe"] - dataset_config_name = config.get("dataset_config_name") - num_train_epochs = config["num_train_epochs"] - concat_txt = config["concat_txt"] - output = tmp_path / "finetune_output" - - device = "cuda:0" - model = AutoModelForCausalLM.from_pretrained( - model, device_map=device, torch_dtype=torch.bfloat16 - ) - - _test_oneshot_and_finetune( - model, - dataset, - recipe, - dataset_config_name, - concat_txt, - output, - num_train_epochs, - ) diff --git a/tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune_with_tokenizer.py b/tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune_with_tokenizer.py deleted file mode 100644 index 5aa8ca2743..0000000000 --- a/tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune_with_tokenizer.py +++ /dev/null @@ -1,62 +0,0 @@ -import pytest -from datasets import load_dataset -from transformers import AutoModelForCausalLM, AutoTokenizer - -from llmcompressor import oneshot, train -from tests.testing_utils import parse_params, requires_gpu - -CONFIGS_DIRECTORY = "tests/llmcompressor/transformers/finetune/finetune_tokenizer" - - -@pytest.mark.integration -@requires_gpu -@pytest.mark.parametrize("config", parse_params(CONFIGS_DIRECTORY)) -def test_oneshot_and_finetune_with_tokenizer(config, tmp_path): - model = config["model"] - dataset = config["dataset"] - dataset_config_name = config["dataset_config_name"] - - output = tmp_path / 
"sparsity_finetune_output" - # finetune workflows in general seem to have trouble with multi-gpus - # use just one atm - - recipe_str = "tests/llmcompressor/transformers/finetune/test_alternate_recipe.yaml" - tokenizer = AutoTokenizer.from_pretrained( - model, - ) - model_loaded = AutoModelForCausalLM.from_pretrained(model, torch_dtype="auto") - - dataset_loaded = load_dataset(dataset, dataset_config_name, split="train[:50%]") - - concatenate_data = True - run_stages = True - max_steps = 50 - splits = {"train": "train[:50%]", "calibration": "train[50%:60%]"} - - model_and_data_kwargs = dict( - dataset=dataset_loaded, - dataset_config_name=dataset_config_name, - recipe=recipe_str, - concatenate_data=concatenate_data, - splits=splits, - tokenizer=tokenizer, - ) - - oneshot_model = oneshot( - model=model_loaded, - **model_and_data_kwargs, - stage="test_oneshot_stage", - ) - - finetune_model = train( - run_stages=run_stages, - model=oneshot_model, - max_steps=max_steps, - stage="test_train_stage", - **model_and_data_kwargs, - output_dir=output, - ) - - input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") - output = finetune_model.generate(input_ids, max_new_tokens=20) - print(tokenizer.decode(output[0])) diff --git a/tests/llmcompressor/transformers/finetune/test_oneshot_then_finetune.py b/tests/llmcompressor/transformers/finetune/test_oneshot_then_finetune.py deleted file mode 100644 index b309c07672..0000000000 --- a/tests/llmcompressor/transformers/finetune/test_oneshot_then_finetune.py +++ /dev/null @@ -1,160 +0,0 @@ -import pytest -from transformers import AutoModelForCausalLM -from transformers.utils.quantization_config import CompressedTensorsConfig - -from llmcompressor import oneshot, train -from llmcompressor.core import create_session -from llmcompressor.modifiers.quantization import QuantizationModifier - - -@pytest.mark.unit -def test_oneshot_sparsification_then_finetune(tmp_path): - output = tmp_path / "finetune_output" - quantization_config = CompressedTensorsConfig(run_compressed=False) - - recipe_str = "tests/llmcompressor/transformers/sparsegpt/recipes/test_tiny2.yaml" - model = AutoModelForCausalLM.from_pretrained( - "nm-testing/tinysmokellama-3.2", torch_dtype="auto" - ) - dataset = "open_platypus" - concatenate_data = False - num_calibration_samples = 64 - output_dir = output / "oneshot_out" - splits = {"calibration": "train[:5%]"} - - with create_session(): - oneshot( - model=model, - dataset=dataset, - output_dir=output_dir, - num_calibration_samples=num_calibration_samples, - recipe=recipe_str, - concatenate_data=concatenate_data, - splits=splits, - ) - - recipe_str = "tests/llmcompressor/transformers/finetune/test_finetune_recipe.yaml" - - # Explictly decompress the model for training using quantization_config - model = AutoModelForCausalLM.from_pretrained( - output / "oneshot_out", - torch_dtype="auto", - quantization_config=quantization_config, - ) - distill_teacher = AutoModelForCausalLM.from_pretrained( - "nm-testing/tinysmokellama-3.2", torch_dtype="auto" - ) - dataset = "open_platypus" - concatenate_data = False - output_dir = output / "finetune_out" - splits = "train[5%:7%]" - - recipe = """ - test_stage: - pruning_modifiers: - ConstantPruningModifier: - targets: ['re:.*q_proj.weight', 're:.*k_proj.weight', - 're:.*v_proj.weight', 're:.*o_proj.weight', - 're:.*gate_proj.weight', 're:.*up_proj.weight', - 're:.*down_proj.weight'] - start: 0 - """ - - with create_session(): - train( - model=model, - distill_teacher=distill_teacher, - 
dataset=dataset, - output_dir=output_dir, - num_train_epochs=0.05, - concatenate_data=concatenate_data, - splits=splits, - recipe=recipe, - ) - - # test reloading checkpoint and final model - # verify checkpoint reloading and can carry out finetune - # with the saved model - # Explictly decompress the model for training using quantization_config - model = AutoModelForCausalLM.from_pretrained( - output_dir, - torch_dtype="auto", - quantization_config=quantization_config, - ) - - with create_session(): - train( - model=model, - distill_teacher=distill_teacher, - dataset=dataset, - output_dir=output_dir, - num_train_epochs=0.05, - concatenate_data=concatenate_data, - splits=splits, - recipe=recipe, - ) - - -def test_oneshot_quantization_then_finetune(tmp_path): - recipe = QuantizationModifier( - targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"] - ) - - model = AutoModelForCausalLM.from_pretrained( - "TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype="auto" - ) - dataset = "open_platypus" - concatenate_data = False - num_calibration_samples = 64 - output_dir = tmp_path / "oneshot_out" - splits = {"calibration": "train[:5%]"} - - with create_session(): - oneshot( - model=model, - dataset=dataset, - output_dir=output_dir, - num_calibration_samples=num_calibration_samples, - recipe=recipe, - concatenate_data=concatenate_data, - splits=splits, - ) - - quantization_config = CompressedTensorsConfig(run_compressed=False) - model = AutoModelForCausalLM.from_pretrained( - output_dir, - torch_dtype="auto", - quantization_config=quantization_config, - ) - - dataset = "open_platypus" - concatenate_data = False - output_dir = tmp_path / "finetune_out" - splits = {"calibration": "train[:5%]", "train": "train[5%:7%]"} - - with create_session(): - train( - model=model, - dataset=dataset, - output_dir=output_dir, - concatenate_data=concatenate_data, - splits=splits, - num_train_epochs=0.05, - ) - - # test reloading checkpoint and final model - model = AutoModelForCausalLM.from_pretrained( - output_dir, - torch_dtype="auto", - quantization_config=quantization_config, - ) - - with create_session(): - train( - model=model, - dataset=dataset, - output_dir=output_dir, - concatenate_data=concatenate_data, - splits=splits, - num_train_epochs=0.05, - ) diff --git a/tests/llmcompressor/transformers/finetune/test_quantization.yaml b/tests/llmcompressor/transformers/finetune/test_quantization.yaml deleted file mode 100644 index 5651232707..0000000000 --- a/tests/llmcompressor/transformers/finetune/test_quantization.yaml +++ /dev/null @@ -1,31 +0,0 @@ -test_stage: - quant_modifiers: - QuantizationModifier: - ignore: - - model.layers.0.mlp.down_proj - - model.layers.1.mlp.down_proj - - model.layers.2.mlp.down_proj - - model.layers.3.mlp.down_proj - - model.layers.4.mlp.down_proj - - model.layers.5.mlp.down_proj - config_groups: - group_0: - weights: - num_bits: 8 - type: "int" - symmetric: False - strategy: "tensor" - input_activations: null - output_activations: null - targets: ["Linear"] - pruning_modifiers: - ConstantPruningModifier: - targets: [ - "re:.*self_attn.q_proj", - "re:.*self_attn.k_proj", - "re:.*self_attn.v_proj", - "re:.*self_attn.o_proj", - "re:.*mlp.gate_proj", - "re:.*mlp.up_proj" - ] - start: 0 diff --git a/tests/llmcompressor/transformers/finetune/test_safetensors.py b/tests/llmcompressor/transformers/finetune/test_safetensors.py deleted file mode 100644 index 7036516ab8..0000000000 --- a/tests/llmcompressor/transformers/finetune/test_safetensors.py +++ /dev/null @@ -1,42 +0,0 @@ -import os - 
-import pytest - -from llmcompressor import train -from tests.testing_utils import parse_params, requires_gpu - -CONFIGS_DIRECTORY = "tests/llmcompressor/transformers/finetune/finetune_generic" - - -@pytest.mark.integration -@requires_gpu -@pytest.mark.parametrize("config", parse_params(CONFIGS_DIRECTORY)) -def test_safetensors(config, tmp_path): - model = config["model"] - dataset = config["dataset"] - output = tmp_path / "finetune_output" - - output_dir = output / "output1" - max_steps = 10 - splits = {"train": "train[:10%]"} - - train( - model=model, - dataset=dataset, - output_dir=output_dir, - max_steps=max_steps, - splits=splits, - ) - - assert os.path.exists(output_dir / "model.safetensors") - assert not os.path.exists(output_dir / "pytorch_model.bin") - - # test we can also load - new_output_dir = output / "output2" - train( - model=output_dir, - dataset=dataset, - output_dir=new_output_dir, - max_steps=max_steps, - splits=splits, - ) diff --git a/tests/llmcompressor/transformers/finetune/test_session_mixin.py b/tests/llmcompressor/transformers/finetune/test_session_mixin.py deleted file mode 100644 index 81a83ec565..0000000000 --- a/tests/llmcompressor/transformers/finetune/test_session_mixin.py +++ /dev/null @@ -1,65 +0,0 @@ -from typing import Any, Dict, Optional, Union - -import pytest -from torch.nn import Module -from transformers import AutoModelForCausalLM, Trainer - -from llmcompressor.core import active_session -from llmcompressor.transformers.finetune.session_mixin import SessionManagerMixIn - - -class MixInTest(SessionManagerMixIn, Trainer): - def __init__( - self, - model: Module, - recipe: Optional[str], - recipe_args: Optional[Union[Dict[str, Any], str]] = None, - model_args: Optional[Union[Dict[str, Any], str]] = None, - dataset_args: Optional[Union[Dict[str, Any], str]] = None, - teacher: Optional[Union[Module, str]] = None, - **kwargs, - ): - super().__init__( - model=model, - recipe=recipe, - recipe_args=recipe_args, - model_args=model_args, - dataset_args=dataset_args, - teacher=teacher, - **kwargs, - ) - - -@pytest.mark.unit -def test_mixin_init(): - model_state_path = "nm-testing/tinysmokellama-3.2" - model = AutoModelForCausalLM.from_pretrained(model_state_path) - recipe = "tests/llmcompressor/transformers/finetune/test_quantization.yaml" - - session_mixin = MixInTest(model=model, recipe=recipe) - assert isinstance(session_mixin, SessionManagerMixIn) - assert isinstance(session_mixin, Trainer) - assert session_mixin.recipe == recipe - assert session_mixin.model == model - - -@pytest.fixture -def mixin_trainer(): - model_state_path = "nm-testing/tinysmokellama-3.2" - model = AutoModelForCausalLM.from_pretrained(model_state_path) - recipe = "tests/llmcompressor/transformers/finetune/test_quantization.yaml" - train_dataset = "open-platypus" - - return MixInTest( - model=model, - recipe=recipe, - train_dataset=train_dataset, - ) - - -@pytest.mark.unit -def test_mixin_session_init(mixin_trainer): - mixin_trainer.initialize_session(epoch=0.0, checkpoint=None) - session = active_session() - - assert session.lifecycle.initialized_ From 912060906d24b1fa826f000bba6392445078a4b0 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 18 Nov 2025 12:32:51 -0500 Subject: [PATCH 02/23] fix import --- src/llmcompressor/args/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/llmcompressor/args/__init__.py b/src/llmcompressor/args/__init__.py index 22079b5139..16b605b992 100644 --- a/src/llmcompressor/args/__init__.py +++ b/src/llmcompressor/args/__init__.py @@ 
-10,5 +10,4 @@ from .dataset_arguments import DatasetArguments from .model_arguments import ModelArguments from .recipe_arguments import RecipeArguments -from .training_arguments import TrainingArguments from .utils import parse_args From b07287f585c6e42b4ae6ac8792b9e9cd0457c96d Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 18 Nov 2025 12:47:02 -0500 Subject: [PATCH 03/23] fix arg parsing --- src/llmcompressor/args/README.md | 8 ++----- src/llmcompressor/args/utils.py | 27 ++++++------------------ src/llmcompressor/entrypoints/oneshot.py | 2 +- 3 files changed, 9 insertions(+), 28 deletions(-) diff --git a/src/llmcompressor/args/README.md b/src/llmcompressor/args/README.md index 4691a615cc..d5ced3d5dd 100644 --- a/src/llmcompressor/args/README.md +++ b/src/llmcompressor/args/README.md @@ -1,4 +1,4 @@ -# Input arguments for `oneshot`, `train`, `eval` entrypoints +# Input arguments for `oneshot` and `eval` entrypoints Parsers in `llm-compressor` define the input arguments required for various entry points, including `oneshot`, `train`, and `eval`. @@ -38,8 +38,4 @@ Handles model loading and saving. For example, `ModelArguments.model` can be a H Manages data loading and preprocessing. The dataset argument can specify a Hugging Face dataset stub or a local dataset compatible with [`load_dataset`](https://github.com/huggingface/datasets/blob/3a4e74a9ace62ecd5c9cde7dcb6bcabd65cc7857/src/datasets/load.py#L1905). The preprocessing_func is a callable function that applies custom logic, such as formatting the data using a chat template. ## RecipeArguments -Defines the model recipe. A `recipe` consists of user-defined instructions for optimizing the model. Examples of recipes can be found in the `/examples` directory. - -## TrainingArguments -Specifies training parameters based on Hugging Face's [TrainingArguments class](https://github.com/huggingface/transformers/blob/main/src/transformers/training_args.py). These parameters include settings like learning rate (`learning_rate`), and the optimizer to use (`optim`). - +Defines the model recipe. A `recipe` consists of user-defined instructions for optimizing the model. Examples of recipes can be found in the `/examples` directory. \ No newline at end of file diff --git a/src/llmcompressor/args/utils.py b/src/llmcompressor/args/utils.py index fd420109f9..29d42fed05 100644 --- a/src/llmcompressor/args/utils.py +++ b/src/llmcompressor/args/utils.py @@ -14,18 +14,16 @@ DatasetArguments, ModelArguments, RecipeArguments, - TrainingArguments, ) from llmcompressor.transformers.utils.helpers import resolve_processor_from_model_args def parse_args( - include_training_args: bool = False, **kwargs + **kwargs, ) -> tuple[ ModelArguments, DatasetArguments, - RecipeArguments, - TrainingArguments | None, + RecipeArguments | None, str | None, ]: """ @@ -38,31 +36,18 @@ src/llmcompressor/args/dataset_args.py * RecipeArguments in src/llmcompressor/args/recipe_args.py - * TrainingArguments in - src/llmcompressor/args/training_args.py - ModelArguments, DatasetArguments, and RecipeArguments are used for both - `oneshot` and `train`. TrainingArguments is only used for `train`. + ModelArguments, DatasetArguments, and RecipeArguments are used for + `oneshot`. 
""" - - # pop output_dir, used as an attr in TrainingArguments, where oneshot is not used output_dir = kwargs.pop("output_dir", None) parser_args = (ModelArguments, DatasetArguments, RecipeArguments) - if include_training_args: - parser_args += (TrainingArguments,) - parser = HfArgumentParser(parser_args) parsed_args = parser.parse_dict(kwargs) - training_args = None - if include_training_args: - model_args, dataset_args, recipe_args, training_args = parsed_args - if output_dir is not None: - training_args.output_dir = output_dir - else: - model_args, dataset_args, recipe_args = parsed_args + model_args, dataset_args, recipe_args = parsed_args if recipe_args.recipe_args is not None: if not isinstance(recipe_args.recipe_args, dict): @@ -83,4 +68,4 @@ def parse_args( # silently assign tokenizer to processor resolve_processor_from_model_args(model_args) - return model_args, dataset_args, recipe_args, training_args, output_dir + return model_args, dataset_args, recipe_args, output_dir diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py index 66c320d1b3..36d5706a4d 100644 --- a/src/llmcompressor/entrypoints/oneshot.py +++ b/src/llmcompressor/entrypoints/oneshot.py @@ -140,7 +140,7 @@ def __init__( level="DEBUG", ) - model_args, dataset_args, recipe_args, _, output_dir = parse_args(**kwargs) + model_args, dataset_args, recipe_args, output_dir = parse_args(**kwargs) self.model_args = model_args self.dataset_args = dataset_args From dd5c58e62d425a6f66163e8c4e68d3a8d2f6e29a Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 18 Nov 2025 12:49:38 -0500 Subject: [PATCH 04/23] fix import --- src/llmcompressor/transformers/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/llmcompressor/transformers/__init__.py b/src/llmcompressor/transformers/__init__.py index 2e018413ac..401ebc8544 100644 --- a/src/llmcompressor/transformers/__init__.py +++ b/src/llmcompressor/transformers/__init__.py @@ -6,5 +6,4 @@ # (import order matters for circular import avoidance) from .utils import * -from .finetune import * from .data import TextGenerationDataset From 841723c91daaed8eb1a598d6960f16290670bd80 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 18 Nov 2025 13:06:24 -0500 Subject: [PATCH 05/23] update --- src/llmcompressor/datasets/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llmcompressor/datasets/utils.py b/src/llmcompressor/datasets/utils.py index 8aef67fcd4..2b80b1ed9a 100644 --- a/src/llmcompressor/datasets/utils.py +++ b/src/llmcompressor/datasets/utils.py @@ -18,7 +18,7 @@ from transformers.data import default_data_collator from llmcompressor.args import DatasetArguments -from llmcompressor.transformers.finetune.data import TextGenerationDataset +from llmcompressor.transformers.data import TextGenerationDataset from llmcompressor.typing import Processor From a68de4b7cb031a61fa1ec66eb6fdf59a5c6844b3 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 18 Nov 2025 13:08:38 -0500 Subject: [PATCH 06/23] more updates --- src/llmcompressor/transformers/data/base.py | 2 +- src/llmcompressor/transformers/utils/preprocessing_functions.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llmcompressor/transformers/data/base.py b/src/llmcompressor/transformers/data/base.py index a6300fe404..968c2555a8 100644 --- a/src/llmcompressor/transformers/data/base.py +++ b/src/llmcompressor/transformers/data/base.py @@ -18,7 +18,7 @@ from loguru import logger from llmcompressor.args import 
DatasetArguments -from llmcompressor.transformers.finetune.data.data_helpers import ( +from llmcompressor.transformers.data.data_helpers import ( LABELS_MASK_VALUE, get_custom_datasets_from_path, get_raw_dataset, diff --git a/src/llmcompressor/transformers/utils/preprocessing_functions.py b/src/llmcompressor/transformers/utils/preprocessing_functions.py index e6749d6a51..16466b6419 100644 --- a/src/llmcompressor/transformers/utils/preprocessing_functions.py +++ b/src/llmcompressor/transformers/utils/preprocessing_functions.py @@ -12,7 +12,7 @@ from compressed_tensors.registry import RegistryMixin if TYPE_CHECKING: - from llmcompressor.transformers.finetune.data.base import TextGenerationDataset + from llmcompressor.transformers.data.base import TextGenerationDataset class PreprocessingFunctionRegistry(RegistryMixin): From caeacc1526d37e75a7850f38b42ad95a97ebcb41 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 18 Nov 2025 13:11:55 -0500 Subject: [PATCH 07/23] update --- src/llmcompressor/transformers/data/c4.py | 2 +- src/llmcompressor/transformers/data/cnn_dailymail.py | 2 +- src/llmcompressor/transformers/data/custom.py | 2 +- src/llmcompressor/transformers/data/evolcodealpaca.py | 2 +- src/llmcompressor/transformers/data/flickr_30k.py | 2 +- src/llmcompressor/transformers/data/gsm8k.py | 2 +- src/llmcompressor/transformers/data/open_platypus.py | 2 +- src/llmcompressor/transformers/data/peoples_speech.py | 4 ++-- src/llmcompressor/transformers/data/ultrachat_200k.py | 2 +- src/llmcompressor/transformers/data/wikitext.py | 2 +- 10 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/llmcompressor/transformers/data/c4.py b/src/llmcompressor/transformers/data/c4.py index e4fe6431cd..52627e5985 100644 --- a/src/llmcompressor/transformers/data/c4.py +++ b/src/llmcompressor/transformers/data/c4.py @@ -1,7 +1,7 @@ from copy import deepcopy from typing import TYPE_CHECKING -from llmcompressor.transformers.finetune.data import TextGenerationDataset +from llmcompressor.transformers.data import TextGenerationDataset from llmcompressor.typing import Processor if TYPE_CHECKING: diff --git a/src/llmcompressor/transformers/data/cnn_dailymail.py b/src/llmcompressor/transformers/data/cnn_dailymail.py index fcc67482f9..d205d44507 100644 --- a/src/llmcompressor/transformers/data/cnn_dailymail.py +++ b/src/llmcompressor/transformers/data/cnn_dailymail.py @@ -1,7 +1,7 @@ from copy import deepcopy from typing import TYPE_CHECKING -from llmcompressor.transformers.finetune.data import TextGenerationDataset +from llmcompressor.transformers.data import TextGenerationDataset from llmcompressor.typing import Processor if TYPE_CHECKING: diff --git a/src/llmcompressor/transformers/data/custom.py b/src/llmcompressor/transformers/data/custom.py index 72b6ac6bb4..80a0478964 100644 --- a/src/llmcompressor/transformers/data/custom.py +++ b/src/llmcompressor/transformers/data/custom.py @@ -7,7 +7,7 @@ user-provided datasets. 
""" -from llmcompressor.transformers.finetune.data import TextGenerationDataset +from llmcompressor.transformers.data import TextGenerationDataset @TextGenerationDataset.register(name="custom", alias=["json", "csv"]) diff --git a/src/llmcompressor/transformers/data/evolcodealpaca.py b/src/llmcompressor/transformers/data/evolcodealpaca.py index 8a7892c131..014545614c 100644 --- a/src/llmcompressor/transformers/data/evolcodealpaca.py +++ b/src/llmcompressor/transformers/data/evolcodealpaca.py @@ -1,7 +1,7 @@ from copy import deepcopy from typing import TYPE_CHECKING -from llmcompressor.transformers.finetune.data import TextGenerationDataset +from llmcompressor.transformers.data import TextGenerationDataset from llmcompressor.typing import Processor if TYPE_CHECKING: diff --git a/src/llmcompressor/transformers/data/flickr_30k.py b/src/llmcompressor/transformers/data/flickr_30k.py index 8ada07a0e2..e257f17e79 100644 --- a/src/llmcompressor/transformers/data/flickr_30k.py +++ b/src/llmcompressor/transformers/data/flickr_30k.py @@ -3,7 +3,7 @@ from loguru import logger -from llmcompressor.transformers.finetune.data import TextGenerationDataset +from llmcompressor.transformers.data import TextGenerationDataset from llmcompressor.typing import Processor if TYPE_CHECKING: diff --git a/src/llmcompressor/transformers/data/gsm8k.py b/src/llmcompressor/transformers/data/gsm8k.py index ae1318571e..55396d1df5 100644 --- a/src/llmcompressor/transformers/data/gsm8k.py +++ b/src/llmcompressor/transformers/data/gsm8k.py @@ -1,7 +1,7 @@ from copy import deepcopy from typing import TYPE_CHECKING -from llmcompressor.transformers.finetune.data import TextGenerationDataset +from llmcompressor.transformers.data import TextGenerationDataset from llmcompressor.typing import Processor if TYPE_CHECKING: diff --git a/src/llmcompressor/transformers/data/open_platypus.py b/src/llmcompressor/transformers/data/open_platypus.py index 81413e7852..fcf08bbaac 100644 --- a/src/llmcompressor/transformers/data/open_platypus.py +++ b/src/llmcompressor/transformers/data/open_platypus.py @@ -1,7 +1,7 @@ from copy import deepcopy from typing import TYPE_CHECKING -from llmcompressor.transformers.finetune.data import TextGenerationDataset +from llmcompressor.transformers.data import TextGenerationDataset from llmcompressor.typing import Processor if TYPE_CHECKING: diff --git a/src/llmcompressor/transformers/data/peoples_speech.py b/src/llmcompressor/transformers/data/peoples_speech.py index 31d0668316..9e0b9e544e 100644 --- a/src/llmcompressor/transformers/data/peoples_speech.py +++ b/src/llmcompressor/transformers/data/peoples_speech.py @@ -4,8 +4,8 @@ from datasets.formatting.formatting import LazyRow from loguru import logger -from llmcompressor.transformers.finetune.data import TextGenerationDataset -from llmcompressor.transformers.finetune.data.base import get_columns +from llmcompressor.transformers.data import TextGenerationDataset +from llmcompressor.transformers.data.base import get_columns from llmcompressor.typing import DatasetType, Processor if TYPE_CHECKING: diff --git a/src/llmcompressor/transformers/data/ultrachat_200k.py b/src/llmcompressor/transformers/data/ultrachat_200k.py index 296eb3db56..308722fbfb 100644 --- a/src/llmcompressor/transformers/data/ultrachat_200k.py +++ b/src/llmcompressor/transformers/data/ultrachat_200k.py @@ -3,7 +3,7 @@ from loguru import logger -from llmcompressor.transformers.finetune.data import TextGenerationDataset +from llmcompressor.transformers.data import TextGenerationDataset from 
llmcompressor.typing import Processor if TYPE_CHECKING: diff --git a/src/llmcompressor/transformers/data/wikitext.py b/src/llmcompressor/transformers/data/wikitext.py index 73142d671c..1bce90cc20 100644 --- a/src/llmcompressor/transformers/data/wikitext.py +++ b/src/llmcompressor/transformers/data/wikitext.py @@ -1,7 +1,7 @@ from copy import deepcopy from typing import TYPE_CHECKING -from llmcompressor.transformers.finetune.data import TextGenerationDataset +from llmcompressor.transformers.data import TextGenerationDataset from llmcompressor.typing import Processor if TYPE_CHECKING: From 9246d0507f9a2c753f5f5b01f0404ad187429b43 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 18 Nov 2025 13:20:04 -0500 Subject: [PATCH 08/23] remove --- src/llmcompressor/args/model_arguments.py | 6 --- src/llmcompressor/entrypoints/utils.py | 51 +------------------ .../transformers/utils/helpers.py | 40 +-------------- 3 files changed, 3 insertions(+), 94 deletions(-) diff --git a/src/llmcompressor/args/model_arguments.py b/src/llmcompressor/args/model_arguments.py index d927bd62e7..279287524f 100644 --- a/src/llmcompressor/args/model_arguments.py +++ b/src/llmcompressor/args/model_arguments.py @@ -26,12 +26,6 @@ class ModelArguments: ) }, ) - distill_teacher: str | None = field( - default=None, - metadata={ - "help": "Teacher model (a trained text generation model)", - }, - ) config_name: str | None = field( default=None, metadata={ diff --git a/src/llmcompressor/entrypoints/utils.py b/src/llmcompressor/entrypoints/utils.py index 0b482727ee..0e6564833a 100644 --- a/src/llmcompressor/entrypoints/utils.py +++ b/src/llmcompressor/entrypoints/utils.py @@ -1,8 +1,8 @@ """ Utility functions for entrypoint pre and post-processing operations. -Provides common utility functions used by training and one-shot -compression entrypoints. Includes model loading, configuration setup, +Provides common utility functions used by the one-shot +entrypoint. Includes model loading, configuration setup, preprocessing steps, and post-processing operations for compression workflows. """ @@ -19,7 +19,6 @@ AutoModelForCausalLM, AutoProcessor, PreTrainedModel, - set_seed, ) from transformers.utils.quantization_config import CompressedTensorsConfig @@ -27,7 +26,6 @@ DatasetArguments, ModelArguments, RecipeArguments, - TrainingArguments, ) from llmcompressor.core import reset_session from llmcompressor.pytorch.model_load.helpers import parse_dtype @@ -36,7 +34,6 @@ untie_word_embeddings, ) from llmcompressor.transformers.utils.helpers import ( - detect_last_checkpoint, is_model_ct_quantized_from_path, ) from llmcompressor.typing import Processor @@ -109,8 +106,6 @@ def post_process( Saves the model and tokenizer/processor to the output directory if model_args, output_dir is provided. - Save is skipped for stage runs for `train` - saves using the trainer.save_model() - If the `output_dir` is not the default directory, the method resets lifecycle actions. The model is saved in a compressed format if specified in `model_args`. Additionally, the tokenizer or processor, if available, is also saved. 
@@ -150,7 +145,6 @@ def post_process( def initialize_model_from_path( model_args: ModelArguments, - training_args: TrainingArguments | None = None, ) -> tuple[PreTrainedModel, PreTrainedModel | None]: # Load pretrained model # The .from_pretrained methods guarantee that only one local process can @@ -167,47 +161,6 @@ def initialize_model_from_path( last_checkpoint = None teacher = None - if training_args is not None: - # Load teacher configuration if applicable - teacher_config = ( - AutoConfig.from_pretrained( - model_args.distill_teacher, - use_auth_token=True if model_args.use_auth_token else None, - trust_remote_code=model_args.trust_remote_code_model, - ) - if model_args.distill_teacher - else None - ) - - # Detect last checkpoint - last_checkpoint = detect_last_checkpoint(training_args, model_args=model_args) - - # Set seed before initializing model - set_seed(training_args.seed) - - # Initialize teacher model if teacher path is provided - if model_args.distill_teacher is not None: - teacher_device_map = ( - None - if os.environ.get("ACCELERATE_USE_FSDP", "false") == "true" - else "auto" - ) - teacher_kwargs = { - "config": teacher_config, - "cache_dir": None, - "use_auth_token": True if model_args.use_auth_token else None, - "torch_dtype": parse_dtype(model_args.precision), - "device_map": teacher_device_map, - "trust_remote_code": model_args.trust_remote_code_model, - } - - teacher = AutoModelForCausalLM.from_pretrained( - model_args.distill_teacher, - **teacher_kwargs, - ) - if "sequence_length" in teacher_kwargs: - teacher.seqlen = teacher_kwargs["sequence_length"] - model_path = ( last_checkpoint or model_args.model if hasattr(model_args, "model") diff --git a/src/llmcompressor/transformers/utils/helpers.py b/src/llmcompressor/transformers/utils/helpers.py index 5df8354870..1834c19b00 100644 --- a/src/llmcompressor/transformers/utils/helpers.py +++ b/src/llmcompressor/transformers/utils/helpers.py @@ -16,56 +16,18 @@ ) from loguru import logger from transformers import AutoConfig -from transformers.trainer_utils import get_last_checkpoint if TYPE_CHECKING: - from llmcompressor.args import ModelArguments, TrainingArguments + from llmcompressor.args import ModelArguments __all__ = [ "RECIPE_FILE_NAME", - "detect_last_checkpoint", "is_model_ct_quantized_from_path", ] RECIPE_FILE_NAME = "recipe.yaml" -def detect_last_checkpoint( - training_args: "TrainingArguments", - model_args: Optional["ModelArguments"] = None, -): - last_checkpoint = None - if ( - os.path.isdir(training_args.output_dir) - and training_args.do_train - and not training_args.overwrite_output_dir - ): - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if training_args.run_stages and model_args is not None: - model = ( - model_args.model - if hasattr(model_args, "model") - else model_args.model_name_or_path - ) - if os.path.isdir(model): - last_checkpoint = get_last_checkpoint(model_args.model_name_or_path) - if last_checkpoint is None and (len(os.listdir(training_args.output_dir)) > 0): - raise ValueError( - f"Output directory ({training_args.output_dir}) already " - "exists and is not empty. Use --overwrite_output_dir to overcome." - ) - elif ( - last_checkpoint is not None and training_args.resume_from_checkpoint is None - ): - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To " - "avoid this behavior, change the `--output_dir` or add " - "`--overwrite_output_dir` to train from scratch." 
- ) - - return last_checkpoint - - def is_model_ct_quantized_from_path(path: str) -> bool: """ Determine if model from path is quantized based From d82ca8e142c71fe6816ae6785a13b79225f784d7 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 18 Nov 2025 13:22:56 -0500 Subject: [PATCH 09/23] fix import --- src/llmcompressor/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llmcompressor/__init__.py b/src/llmcompressor/__init__.py index 2e9547a0fc..227052d94e 100644 --- a/src/llmcompressor/__init__.py +++ b/src/llmcompressor/__init__.py @@ -26,4 +26,4 @@ create_session, reset_session, ) -from llmcompressor.entrypoints import Oneshot, oneshot, train, model_free_ptq +from llmcompressor.entrypoints import Oneshot, oneshot, model_free_ptq From 39f3fcaa98f8446703c4118dc4b2337dc745cdb4 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 18 Nov 2025 13:45:28 -0500 Subject: [PATCH 10/23] update --- src/llmcompressor/entrypoints/oneshot.py | 1 - src/llmcompressor/entrypoints/utils.py | 46 +++--------------------- 2 files changed, 5 insertions(+), 42 deletions(-) diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py index 36d5706a4d..c2b29aa97c 100644 --- a/src/llmcompressor/entrypoints/oneshot.py +++ b/src/llmcompressor/entrypoints/oneshot.py @@ -229,7 +229,6 @@ def apply_recipe_modifiers( def oneshot( # Model arguments model: str | PreTrainedModel, - distill_teacher: str | None = None, config_name: str | None = None, tokenizer: str | PreTrainedTokenizerBase | None = None, processor: str | ProcessorMixin | None = None, diff --git a/src/llmcompressor/entrypoints/utils.py b/src/llmcompressor/entrypoints/utils.py index 0e6564833a..3c1b354ce7 100644 --- a/src/llmcompressor/entrypoints/utils.py +++ b/src/llmcompressor/entrypoints/utils.py @@ -7,13 +7,11 @@ workflows. """ -import inspect import os from pathlib import PosixPath from compressed_tensors.utils import remove_dispatch from loguru import logger -from torch.nn import Module from transformers import ( AutoConfig, AutoModelForCausalLM, @@ -58,14 +56,13 @@ def pre_process( # Initialize model if isinstance(model_args.model, (str, PosixPath)): - model, distill_teacher = initialize_model_from_path(model_args) + model = initialize_model_from_path(model_args) if is_fsdp_model(model): raise NotImplementedError( "FSDP models are not supported in the current release but will be " "suported in future releases of LLM Compressor." ) model_args.model = model - model_args.distill_teacher = distill_teacher # Initialize processor if dataset provided if isinstance(model_args.processor, (str, type(None))): @@ -145,7 +142,7 @@ def post_process( def initialize_model_from_path( model_args: ModelArguments, -) -> tuple[PreTrainedModel, PreTrainedModel | None]: +) -> PreTrainedModel: # Load pretrained model # The .from_pretrained methods guarantee that only one local process can # concurrently download model & vocab. 
@@ -159,7 +156,6 @@ ) last_checkpoint = None - teacher = None model_path = ( last_checkpoint or model_args.model if hasattr(model_args, "model") @@ -186,17 +182,13 @@ if "sequence_length" in model_kwargs: model.seqlen = model_kwargs["sequence_length"] - return model, teacher + return model def initialize_processor_from_path( - model_args: ModelArguments, - model: PreTrainedModel, - teacher: PreTrainedModel | None = None, + model_args: ModelArguments, model: PreTrainedModel ) -> Processor: - processor_src = model_args.processor or get_processor_name_from_model( - model, teacher - ) + processor_src = model_args.processor or model.config._name_or_path # The use_fast=True option is not currently supported safely in Transformers # See: https://github.com/huggingface/transformers/pull/34836#issuecomment-2491809727 # noqa: E501 try: @@ -229,31 +221,3 @@ ) return processor - - -def get_processor_name_from_model(student: Module, teacher: Module | None) -> str: - """ - Get a processor/tokenizer source used for both student and teacher, assuming - that they could be shared - - :param student: the student model - :param teacher: the teacher model - :return: the source for the processor/tokenizer shared between teacher and model - """ - if teacher is not None and teacher not in ("disable", "self"): - student_forward_params = list( - inspect.signature(student.forward).parameters.keys() - ) - teacher_forward_params = list( - inspect.signature(teacher.forward).parameters.keys() - ) - diff = [p for p in student_forward_params if p not in teacher_forward_params] - if diff: - raise RuntimeError( - "Teacher tokenizer cannot be used for student " - f"due to missing args: {diff}" - ) - src_model = teacher - else: - src_model = student - return src_model.config._name_or_path From 2761ebaf926f4d2bfe164e5ae38d7bbaa13d3236 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 18 Nov 2025 13:49:11 -0500 Subject: [PATCH 11/23] update --- .../llmcompressor/transformers/compression/test_quantization.py | 2 +- tests/llmcompressor/transformers/data/test_dataset_helpers.py | 2 +- tests/llmcompressor/transformers/data/test_registry.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/llmcompressor/transformers/compression/test_quantization.py b/tests/llmcompressor/transformers/compression/test_quantization.py index 60ed7ed94e..29d5c21c0f 100644 --- a/tests/llmcompressor/transformers/compression/test_quantization.py +++ b/tests/llmcompressor/transformers/compression/test_quantization.py @@ -7,7 +7,7 @@ from llmcompressor import oneshot from llmcompressor.args import DatasetArguments from llmcompressor.pytorch.utils import tensors_to_device -from llmcompressor.transformers.finetune.data import TextGenerationDataset +from llmcompressor.transformers.data import TextGenerationDataset from llmcompressor.utils.dev import dispatch_for_generation from tests.testing_utils import parse_params, requires_gpu diff --git a/tests/llmcompressor/transformers/data/test_dataset_helpers.py b/tests/llmcompressor/transformers/data/test_dataset_helpers.py index a7138b186d..ed0335bfd1 100644 --- a/tests/llmcompressor/transformers/data/test_dataset_helpers.py +++ b/tests/llmcompressor/transformers/data/test_dataset_helpers.py @@ -2,7 +2,7 @@ from llmcompressor.args import DatasetArguments from llmcompressor.datasets import make_dataset_splits -from llmcompressor.transformers.finetune.data.data_helpers import get_raw_dataset +from 
llmcompressor.transformers.data.data_helpers import get_raw_dataset @pytest.mark.unit diff --git a/tests/llmcompressor/transformers/data/test_registry.py b/tests/llmcompressor/transformers/data/test_registry.py index 29895b4a4c..d775f39441 100644 --- a/tests/llmcompressor/transformers/data/test_registry.py +++ b/tests/llmcompressor/transformers/data/test_registry.py @@ -1,7 +1,7 @@ import pytest from llmcompressor.args import DatasetArguments -from llmcompressor.transformers.finetune.data import ( +from llmcompressor.transformers.data import ( C4Dataset, OpenPlatypusDataset, TextGenerationDataset, From 6998fb9937a1e1a3f606c030074a5294bad5a8cf Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 18 Nov 2025 13:51:30 -0500 Subject: [PATCH 12/23] update --- src/llmcompressor/args/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llmcompressor/args/README.md b/src/llmcompressor/args/README.md index d5ced3d5dd..57c2a4a938 100644 --- a/src/llmcompressor/args/README.md +++ b/src/llmcompressor/args/README.md @@ -1,4 +1,4 @@ -# Input arguments for `oneshot` and `eval` entrypoints +# Input arguments for the `oneshot` entrypoint Parsers in `llm-compressor` define the input arguments required for various entry points, including `oneshot`, `train`, and `eval`. From 7958288d98486448ab06af53820368e77c813227 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 18 Nov 2025 14:11:29 -0500 Subject: [PATCH 13/23] update --- .../transformers/sparsegpt/test_sparsegpt_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/llmcompressor/transformers/sparsegpt/test_sparsegpt_completion.py b/tests/llmcompressor/transformers/sparsegpt/test_sparsegpt_completion.py index 724a7b12a2..b1e1fcd165 100644 --- a/tests/llmcompressor/transformers/sparsegpt/test_sparsegpt_completion.py +++ b/tests/llmcompressor/transformers/sparsegpt/test_sparsegpt_completion.py @@ -10,7 +10,7 @@ from llmcompressor.transformers.compression.compressed_tensors_utils import ( get_model_compressor, ) -from llmcompressor.transformers.finetune.data import TextGenerationDataset +from llmcompressor.transformers.data import TextGenerationDataset from tests.testing_utils import parse_params, requires_gpu CONFIGS_DIRECTORY = ( From bf3acb42f3ef766c17502095037670a6181c38c5 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 18 Nov 2025 14:14:55 -0500 Subject: [PATCH 14/23] remove old links --- src/llmcompressor/entrypoints/README.md | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/llmcompressor/entrypoints/README.md b/src/llmcompressor/entrypoints/README.md index f023d3c027..25a85bae30 100644 --- a/src/llmcompressor/entrypoints/README.md +++ b/src/llmcompressor/entrypoints/README.md @@ -261,11 +261,4 @@ with create_session(): distill_teacher=distill_teacher, # The teacher model recipe=recipe, # The recipe to use ) -``` - -### SFT Trainer - -TRL's SFT Trainer can be used for sparse fine-tuning or applying sparse knowledge distillation. Examples are available in the `examples/` folder. 
- -- [Sparse-fine-tune a 50% sparse Llama-7b model](../../../examples/trl_mixin/README.md) -- [Sparse-fine-tune a 50% sparse Llama-7b model using knowledge distillation](../../../examples/trl_mixin/README.md) \ No newline at end of file +``` \ No newline at end of file From 2c01fcb874a450af8a70d51ef3f2648e3389e4f8 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 18 Nov 2025 14:30:31 -0500 Subject: [PATCH 15/23] update --- .../workflows/test-check-transformers.yaml | 32 ++----------------- .github/workflows/test-check.yaml | 20 ++++++++---- 2 files changed, 16 insertions(+), 36 deletions(-) diff --git a/.github/workflows/test-check-transformers.yaml b/.github/workflows/test-check-transformers.yaml index 12dc6baeb9..53ab8e0435 100644 --- a/.github/workflows/test-check-transformers.yaml +++ b/.github/workflows/test-check-transformers.yaml @@ -73,7 +73,7 @@ jobs: run: uv pip install .[dev] - uses: actions/checkout@v4 with: - repository: "neuralmagic/compressed-tensors" + repository: "vllm/compressed-tensors" path: "compressed-tensors" fetch-depth: 0 fetch-tags: true @@ -92,35 +92,7 @@ jobs: - name: "🔬 Running transformers tests" if: (success() || failure()) && steps.install.outcome == 'success' run: | - pytest -v tests/llmcompressor/transformers/compression - - name: Run Data Tests - if: (success() || failure()) && steps.install.outcome == 'success' - run: | - pytest -v tests/llmcompressor/transformers/data - - name: Running GPTQ Tests - if: (success() || failure()) && steps.install.outcome == 'success' - run: | - pytest -v tests/llmcompressor/transformers/gptq - - name: Running AutoRound Tests - if: (success() || failure()) && steps.install.outcome == 'success' - run: | - pytest -v tests/llmcompressor/transformers/autoround - - name: Running ONESHOT Tests - if: (success() || failure()) && steps.install.outcome == 'success' - run: | - pytest -v tests/llmcompressor/transformers/oneshot - - name: Running SparseGPT Tests - if: (success() || failure()) && steps.install.outcome == 'success' - run: | - pytest -v tests/llmcompressor/transformers/sparsegpt - - name: Running Tracing Tests - if: (success() || failure()) && steps.install.outcome == 'success' - run: | - pytest -v tests/llmcompressor/transformers/tracing - - name: Running KV Cache Tests - if: (success() || failure()) && steps.install.outcome == 'success' - run: | - pytest -v tests/llmcompressor/transformers/kv_cache + pytest -v tests/llmcompressor/transformers/ - name: "Upload coverage report" if: (success() || failure()) && inputs.code_coverage uses: actions/upload-artifact@v4 diff --git a/.github/workflows/test-check.yaml b/.github/workflows/test-check.yaml index 7e12aba897..63afa9b31d 100644 --- a/.github/workflows/test-check.yaml +++ b/.github/workflows/test-check.yaml @@ -22,10 +22,14 @@ jobs: runs-on: ubuntu-22.04 env: COVERAGE_FILE: ".coverage.base" + strategy: + matrix: + python: ["3.10", "3.13"] steps: - - uses: actions/setup-python@v5 + - name: Set up Python + uses: actions/setup-python@v5 with: - python-version: '3.12' + python-version: ${{ matrix.python }} - uses: actions/checkout@v4 with: fetch-depth: 0 @@ -36,7 +40,7 @@ jobs: run: uv pip install .[dev] - uses: actions/checkout@v4 with: - repository: "neuralmagic/compressed-tensors" + repository: "vllm/compressed-tensors" path: "compressed-tensors" fetch-depth: 0 fetch-tags: true @@ -73,10 +77,14 @@ jobs: runs-on: ubuntu-22.04 env: COVERAGE_FILE: ".coverage.pytorch" + strategy: + matrix: + python: ["3.10", "3.13"] steps: - - uses: actions/setup-python@v5 + - name: Set up 
Python + uses: actions/setup-python@v5 with: - python-version: '3.11' + python-version: ${{ matrix.python }} - uses: actions/checkout@v4 with: fetch-depth: 0 @@ -87,7 +95,7 @@ run: uv pip install .[dev] - uses: actions/checkout@v4 with: - repository: "neuralmagic/compressed-tensors" + repository: "vllm/compressed-tensors" path: "compressed-tensors" fetch-depth: 0 fetch-tags: true From f4cb04021d6e20dd90be6295cfe61e92362e8894 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 18 Nov 2025 14:31:35 -0500 Subject: [PATCH 16/23] update --- .github/workflows/test-check-transformers.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-check-transformers.yaml b/.github/workflows/test-check-transformers.yaml index 53ab8e0435..c61f598647 100644 --- a/.github/workflows/test-check-transformers.yaml +++ b/.github/workflows/test-check-transformers.yaml @@ -62,7 +62,7 @@ jobs: steps: - uses: actions/setup-python@v5 with: - python-version: '3.10' + python-version: '3.12' - uses: actions/checkout@v4 with: fetch-depth: 0 From ac31948ff9f08f6954fdb06ebb2a27a152064309 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 18 Nov 2025 14:32:58 -0500 Subject: [PATCH 17/23] update --- .github/workflows/test-check-transformers.yaml | 2 +- .github/workflows/test-check.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test-check-transformers.yaml b/.github/workflows/test-check-transformers.yaml index c61f598647..96528f51a5 100644 --- a/.github/workflows/test-check-transformers.yaml +++ b/.github/workflows/test-check-transformers.yaml @@ -73,7 +73,7 @@ jobs: run: uv pip install .[dev] - uses: actions/checkout@v4 with: - repository: "vllm/compressed-tensors" + repository: "vllm-project/compressed-tensors" path: "compressed-tensors" fetch-depth: 0 fetch-tags: true diff --git a/.github/workflows/test-check.yaml b/.github/workflows/test-check.yaml index 63afa9b31d..ea6a7d4884 100644 --- a/.github/workflows/test-check.yaml +++ b/.github/workflows/test-check.yaml @@ -40,7 +40,7 @@ run: uv pip install .[dev] - uses: actions/checkout@v4 with: - repository: "vllm/compressed-tensors" + repository: "vllm-project/compressed-tensors" path: "compressed-tensors" fetch-depth: 0 fetch-tags: true @@ -95,7 +95,7 @@ run: uv pip install .[dev] - uses: actions/checkout@v4 with: - repository: "vllm/compressed-tensors" + repository: "vllm-project/compressed-tensors" path: "compressed-tensors" fetch-depth: 0 fetch-tags: true From 048fabb976721cf69bec9632b0ab1d6c0600b58f Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 18 Nov 2025 14:40:11 -0500 Subject: [PATCH 18/23] revert --- .../workflows/test-check-transformers.yaml | 30 ++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test-check-transformers.yaml b/.github/workflows/test-check-transformers.yaml index 96528f51a5..4835f5537d 100644 --- a/.github/workflows/test-check-transformers.yaml +++ b/.github/workflows/test-check-transformers.yaml @@ -92,7 +92,35 @@ jobs: - name: "🔬 Running transformers tests" if: (success() || failure()) && steps.install.outcome == 'success' run: | - pytest -v tests/llmcompressor/transformers/ + pytest -v tests/llmcompressor/transformers/compression + - name: Run Finetune Tests + if: (success() || failure()) && steps.install.outcome == 'success' 
+ run: | + pytest -v tests/llmcompressor/transformers/gptq + - name: Running AutoRound Tests + if: (success() || failure()) && steps.install.outcome == 'success' + run: | + pytest -v tests/llmcompressor/transformers/autoround + - name: Running ONESHOT Tests + if: (success() || failure()) && steps.install.outcome == 'success' + run: | + pytest -v tests/llmcompressor/transformers/oneshot + - name: Running SparseGPT Tests + if: (success() || failure()) && steps.install.outcome == 'success' + run: | + pytest -v tests/llmcompressor/transformers/sparsegpt + - name: Running Tracing Tests + if: (success() || failure()) && steps.install.outcome == 'success' + run: | + pytest -v tests/llmcompressor/transformers/tracing + - name: Running KV Cache Tests + if: (success() || failure()) && steps.install.outcome == 'success' + run: | + pytest -v tests/llmcompressor/transformers/kv_cache - name: "Upload coverage report" if: (success() || failure()) && inputs.code_coverage uses: actions/upload-artifact@v4 From 16098492e28e162cbba43aec5fde515658a96bec Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 18 Nov 2025 14:56:32 -0500 Subject: [PATCH 19/23] update --- .github/workflows/test-check-transformers.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-check-transformers.yaml b/.github/workflows/test-check-transformers.yaml index 4835f5537d..4753e9145b 100644 --- a/.github/workflows/test-check-transformers.yaml +++ b/.github/workflows/test-check-transformers.yaml @@ -93,10 +93,10 @@ jobs: if: (success() || failure()) && steps.install.outcome == 'success' run: | pytest -v tests/llmcompressor/transformers/compression - - name: Run Finetune Tests + - name: Run Data Tests if: (success() || failure()) && steps.install.outcome == 'success' run: | - pytest -v tests/llmcompressor/transformers/finetune + pytest -v tests/llmcompressor/transformers/data - name: Running GPTQ Tests if: (success() || failure()) && steps.install.outcome == 'success' run: | From 5e2fce04a76ce4086b150e34df53a3a4063a997b Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 18 Nov 2025 18:36:34 -0500 Subject: [PATCH 20/23] remove distillation modifier --- .../modifiers/distillation/__init__.py | 9 - .../modifiers/distillation/output/__init__.py | 3 - .../modifiers/distillation/output/base.py | 196 --------- .../modifiers/distillation/utils/__init__.py | 0 .../distillation/utils/pytorch/__init__.py | 5 - .../distillation/utils/pytorch/kd_factory.py | 408 ------------------ .../distillation/utils/pytorch/kd_wrapper.py | 116 ----- .../utils/pytorch/model_wrapper.py | 135 ------ 8 files changed, 872 deletions(-) delete mode 100644 src/llmcompressor/modifiers/distillation/__init__.py delete mode 100644 src/llmcompressor/modifiers/distillation/output/__init__.py delete mode 100644 src/llmcompressor/modifiers/distillation/output/base.py delete mode 100644 src/llmcompressor/modifiers/distillation/utils/__init__.py delete mode 100644 src/llmcompressor/modifiers/distillation/utils/pytorch/__init__.py delete mode 100644 src/llmcompressor/modifiers/distillation/utils/pytorch/kd_factory.py delete mode 100644 src/llmcompressor/modifiers/distillation/utils/pytorch/kd_wrapper.py delete mode 100644 src/llmcompressor/modifiers/distillation/utils/pytorch/model_wrapper.py diff --git a/src/llmcompressor/modifiers/distillation/__init__.py b/src/llmcompressor/modifiers/distillation/__init__.py deleted file mode 100644 index 735b9d3755..0000000000 --- a/src/llmcompressor/modifiers/distillation/__init__.py +++ 
/dev/null @@ -1,9 +0,0 @@ -# ruff: noqa - -""" -Provides model distillation functionality, specifically importing output-based - distillation modifiers for transferring knowledge from teacher to student - models during compression. -""" - -from .output import * diff --git a/src/llmcompressor/modifiers/distillation/output/__init__.py b/src/llmcompressor/modifiers/distillation/output/__init__.py deleted file mode 100644 index a4291054b4..0000000000 --- a/src/llmcompressor/modifiers/distillation/output/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# ruff: noqa - -from .base import * diff --git a/src/llmcompressor/modifiers/distillation/output/base.py b/src/llmcompressor/modifiers/distillation/output/base.py deleted file mode 100644 index 130e2470ca..0000000000 --- a/src/llmcompressor/modifiers/distillation/output/base.py +++ /dev/null @@ -1,196 +0,0 @@ -from typing import Any, Dict, List, Tuple, Union - -from torch.nn import Module - -from llmcompressor.core import Event, EventType, State -from llmcompressor.modifiers import Modifier -from llmcompressor.modifiers.distillation.utils.pytorch import ( - KDFactory, - KDModelWrapper, - KDModuleWrapper, -) -from llmcompressor.utils.fsdp.context import summon_full_params_context -from llmcompressor.utils.fsdp.helpers import maybe_get_wrapped, set_wrapped_model -from llmcompressor.utils.pytorch.module import get_layers, set_layer - -__all__ = ["OutputDistillationModifier"] - - -class OutputDistillationModifier(Modifier): - targets: Union[str, List[Union[str, Tuple[str, str]]]] - projection: str = None - projection_args: Dict[str, Any] = None - transforms: Union[str, List[str]] = "identity" - transforms_args: Union[Dict[str, Any], List[Dict[str, Any]]] = None - comparison: str = "kl_divergence" - comparison_args: Dict[str, Any] = None - orig_scale: float = 1.0 - distill_scale: float = 1.0 - offload_layer_output: bool = False - - wrappers_: Dict[str, Any] = None - wrapped_kd_model_: Any = None - fsdp_active_: bool = False - - def on_initialize(self, state: State, **kwargs) -> bool: - if state.model is None or state.teacher_model is None: - return False - - self.wrappers_ = {} - if kwargs.get("fsdp_active"): - self.fsdp_active_ = True - - if not hasattr(state.model.config, "hidden_size"): - raise ValueError( - "Model config must specify hidden_size in order to use " - "OutputDistillationModifier" - ) - - # needed to initialize intermediate output buffers for student and teacher - hidden_size = ( - kwargs.get("metadata").get("per_device_train_batch_size", 1), - kwargs.get("metadata").get("max_seq_length", 512), - state.model.config.hidden_size, - ) - - for target in ( - self.targets if isinstance(self.targets, list) else [self.targets] - ): - if isinstance(target, tuple): - model_target, teacher_target = target - else: - model_target, teacher_target = target, target - - model_layers = get_layers(model_target, state.model) - teacher_layers = get_layers(teacher_target, state.teacher_model) - - if len(model_layers) < 1: - raise ValueError(f"no model layers found for target {target}") - - if len(model_layers) != len(teacher_layers): - raise ValueError( - f"model and teacher model layers for target {target} do not match" - ) - - for (key, student_layer), teacher_layer in zip( - model_layers.items(), teacher_layers.values() - ): - student_wrapper = self._create_layer_wrapper( - student_layer, hidden_size, state - ) - teacher_wrapper = self._create_layer_wrapper( - teacher_layer, hidden_size, state - ) - self.wrappers_[key] = (student_wrapper, teacher_wrapper) - - 
with summon_full_params_context(state.teacher_model, offload_to_cpu=True): - for key, (student_wrapper, teacher_wrapper) in self.wrappers_.items(): - set_layer(key, student_wrapper, state.model) - set_layer(key, teacher_wrapper, state.teacher_model) - - self.wrapped_kd_model_ = self._create_model_wrapper( - student_model=maybe_get_wrapped(state.model), - teacher_model=state.teacher_model, - state=state, - ) - - set_wrapped_model(state, self.wrapped_kd_model_) - - # for square-head distillation we want to scale the loss by the number of - # layers if the user doesn't alter the default scale. This is done so the - # distillation loss is roughly equally weighted to the cross entropy loss - num_layers = len(self.wrappers_) - if self.comparison == "square_head" and self.distill_scale == 1.0: - self.distill_scale = float(num_layers) - return True - - def on_finalize(self, state: State, **kwargs) -> bool: - set_wrapped_model(state, self.wrapped_kd_model_.student_model) - - with summon_full_params_context(state.teacher_model, offload_to_cpu=True): - for key, (student_wrapper, teacher_wrapper) in self.wrappers_.items(): - set_layer(key, student_wrapper.layer, state.model) - set_layer(key, teacher_wrapper.layer, state.teacher_model) - del student_wrapper - del teacher_wrapper - - del self.wrapped_kd_model_ - return True - - def on_start(self, state: State, event: Event, **kwargs): - for student_wrapper, teacher_wrapper in self.wrappers_.values(): - student_wrapper.kd_enabled = True - teacher_wrapper.kd_enabled = True - self.wrapped_kd_model_.kd_enabled = True - - def on_update(self, state: State, event: Event, **kwargs): - if event.type_ == EventType.LOSS_CALCULATED and event.should_update( - self.start, self.end, self.update - ): - distill_loss = self.wrapped_kd_model_.kd_last_comparison - model_loss = self.orig_scale * kwargs["loss"] - distill_loss = self.distill_scale * distill_loss.to(model_loss.device) - state.loss = model_loss + distill_loss - - def on_end(self, state: State, event: Event, **kwargs): - for student_wrapper, teacher_wrapper in self.wrappers_.values(): - student_wrapper.kd_enabled = False - teacher_wrapper.kd_enabled = False - self.wrapped_kd_model_.kd_enabled = False - - def _create_model_wrapper( - self, student_model: Module, teacher_model: Module, state: State - ) -> KDModelWrapper: - comparison = KDFactory.create_comparison( - self.comparison, - student_model, - teacher_model, - state, - **(self.comparison_args or {}), - ) - - return KDModelWrapper( - student_model=student_model, - teacher_model=teacher_model, - wrappers=self.wrappers_, - comparison=comparison, - fsdp_active=self.fsdp_active_, - ) - - def _create_layer_wrapper( - self, layer: Module, hidden_size: int, state: State - ) -> KDModuleWrapper: - transforms = [] - if self.transforms: - tmp_transforms = ( - self.transforms - if isinstance(self.transforms, list) - else [self.transforms] - ) - tmp_transform_args = [ - args - for args in ( - self.transforms_args - if isinstance(self.transforms_args, list) - else [self.transforms_args if self.transforms_args else {}] - ) - for _ in range(len(tmp_transforms)) - ] - - for transform, transform_args in zip(tmp_transforms, tmp_transform_args): - transforms.append( - KDFactory.create_transform( - transform, - layer, - state, - **transform_args, - ) - ) - - return KDModuleWrapper( - layer=layer, - hidden_size=hidden_size, - transforms=transforms, - fsdp_active=self.fsdp_active_, - offload_output=self.offload_layer_output, - ) diff --git 
a/src/llmcompressor/modifiers/distillation/utils/__init__.py b/src/llmcompressor/modifiers/distillation/utils/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/src/llmcompressor/modifiers/distillation/utils/pytorch/__init__.py b/src/llmcompressor/modifiers/distillation/utils/pytorch/__init__.py deleted file mode 100644 index 1b5a1c4465..0000000000 --- a/src/llmcompressor/modifiers/distillation/utils/pytorch/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# ruff: noqa - -from .kd_factory import * -from .kd_wrapper import * -from .model_wrapper import * diff --git a/src/llmcompressor/modifiers/distillation/utils/pytorch/kd_factory.py b/src/llmcompressor/modifiers/distillation/utils/pytorch/kd_factory.py deleted file mode 100644 index 150a0e2220..0000000000 --- a/src/llmcompressor/modifiers/distillation/utils/pytorch/kd_factory.py +++ /dev/null @@ -1,408 +0,0 @@ -import re -from typing import Callable, Dict, Sequence, Tuple, Union - -import torch -import torch.nn.functional as TF -from torch import Tensor -from torch.nn import Module - -from llmcompressor.core import State - -__all__ = [ - "TensorOrCollectionType", - "ProjectionFuncType", - "CreateProjectionFuncType", - "TransformFuncType", - "CreateTransformFuncType", - "ComparisonFuncType", - "CreateComparisonFuncType", - "KDFactory", - "recursive_apply", - "recursive_combine", - "identity_transform", - "softmax_transform", - "log_softmax_transform", - "normalize_transform", - "l1_comparison", - "l2_comparison", - "inner_product_comparison", - "cosine_similarity_comparison", - "kl_divergence_comparison", - "cross_entropy_comparison", -] - - -TensorOrCollectionType = Union[Tensor, Sequence[Tensor], Dict[str, Tensor]] -ProjectionFuncType = Callable[ - [TensorOrCollectionType, TensorOrCollectionType], TensorOrCollectionType -] -CreateProjectionFuncType = Callable[ - [str, Module, Module, State], Tuple[ProjectionFuncType, ProjectionFuncType] -] -TransformFuncType = Callable[[TensorOrCollectionType], TensorOrCollectionType] -CreateTransformFuncType = Callable[[str, Module, Module, State], TransformFuncType] -ComparisonFuncType = Callable[ - [TensorOrCollectionType, TensorOrCollectionType], TensorOrCollectionType -] -CreateComparisonFuncType = Callable[[str, Module, Module, State], ComparisonFuncType] - - -class KDFactory: - registry_projections: Dict[str, CreateProjectionFuncType] = {} - registry_transforms: Dict[str, CreateTransformFuncType] = {} - registry_comparisons: Dict[str, CreateComparisonFuncType] = {} - - @staticmethod - def register_projection(name: str, func: CreateProjectionFuncType): - KDFactory.registry_projections[name] = func - - @staticmethod - def register_projection_decorator(name: str): - def inner(func: CreateProjectionFuncType): - KDFactory.registry_projections[name] = func - return func - - return inner - - @staticmethod - def create_projection( - name: str, student_layer: Module, teacher_layer: Module, state: State, **kwargs - ) -> Tuple[ProjectionFuncType, ProjectionFuncType]: - for pattern, creator in KDFactory.registry_projections: - match = pattern == name - - if not match: - try: - match = re.match(pattern, name) - except Exception: - pass - - if match: - return creator( - name=name, - student_layer=student_layer, - teacher_layer=teacher_layer, - state=state, - **kwargs, - ) - - raise ValueError(f"Invalid projection name: {name}") - - @staticmethod - def register_transform(name: str, func: CreateTransformFuncType): - KDFactory.registry_transforms[name] = func - - @staticmethod - def 
register_transform_decorator(name: str): - def inner(func: CreateTransformFuncType): - KDFactory.registry_transforms[name] = func - return func - - return inner - - @staticmethod - def create_transform( - name: str, - layer: Module, - state: State, - **kwargs, - ) -> TransformFuncType: - for pattern, creator in KDFactory.registry_transforms.items(): - match = pattern == name - - if not match: - try: - match = re.match(pattern, name) - except Exception: - pass - - if match: - return creator( - name=name, - layer=layer, - state=state, - **kwargs, - ) - - raise ValueError(f"Invalid transform name: {name}") - - @staticmethod - def register_comparison(name: str, func): - KDFactory.registry_comparisons[name] = func - - @staticmethod - def register_comparison_decorator(name: str): - def inner(func): - KDFactory.registry_comparisons[name] = func - return func - - return inner - - @staticmethod - def create_comparison( - name: str, student_layer: Module, teacher_layer: Module, state: State, **kwargs - ) -> ComparisonFuncType: - for pattern, creator in KDFactory.registry_comparisons.items(): - match = pattern == name - - if not match: - try: - match = re.match(pattern, name) - except Exception: - pass - - if match: - return creator( - name=name, - student_layer=student_layer, - teacher_layer=teacher_layer, - state=state, - **kwargs, - ) - - raise ValueError(f"Invalid comparison name: {name}") - - -def recursive_apply( - val: TensorOrCollectionType, - func: Callable[[Tensor], Tensor], -) -> TensorOrCollectionType: - if isinstance(val, Tensor): - return func(val) - - if isinstance(val, Sequence): - return [recursive_apply(item, func) for item in val] - - if isinstance(val, dict): - return {key: recursive_apply(item, func) for key, item in val.items()} - - raise ValueError(f"Unsupported type for recursive_apply: {type(val)}") - - -def recursive_combine( - val_one: TensorOrCollectionType, - val_two: TensorOrCollectionType, - func: Callable[[Tensor, Tensor], Tensor], -): - if not isinstance(val_one, type(val_two)): - raise ValueError( - f"val_one type of {type(val_one)} must match " - f"val_two type of {type(val_two)}" - ) - - if isinstance(val_one, Tensor): - return func(val_one, val_two) - - if isinstance(val_one, Sequence): - return [ - recursive_combine(item_one, item_two, func) - for item_one, item_two in zip(val_one, val_two) - ] - - if isinstance(val_one, dict): - return { - key: recursive_combine(val_one[key], val_two[key], func) - for key in val_one.keys() - } - - raise ValueError(f"Unsupported type for recursive_combine: {type(val_one)}") - - -@KDFactory.register_transform_decorator("identity") -def identity_transform(name: str, **kwargs): - if name != "identity": - raise ValueError(f"Invalid transform name: {name}") - - def _create_transform(val: TensorOrCollectionType) -> TensorOrCollectionType: - return val - - return _create_transform - - -@KDFactory.register_transform_decorator("softmax") -def softmax_transform(name: str, temperature: float = 1.0, dim: int = -1, **kwargs): - if name != "softmax": - raise ValueError(f"Invalid transform name: {name}") - - def _softmax(val: Tensor) -> Tensor: - val = val / temperature - - return torch.softmax(val, dim=dim) - - def _create_transform(val: TensorOrCollectionType) -> TensorOrCollectionType: - return recursive_apply(val, _softmax) - - return _create_transform - - -@KDFactory.register_transform_decorator("log_softmax") -def log_softmax_transform(name: str, temperature: float = 1.0, dim: int = -1, **kwargs): - if name != "log_softmax": - raise 
ValueError(f"Invalid transform name: {name}") - - def _log_softmax(val: Tensor) -> Tensor: - val = val / temperature - - return torch.log_softmax(val, dim=dim) - - def _create_transform(val: TensorOrCollectionType) -> TensorOrCollectionType: - return recursive_apply(val, _log_softmax) - - return _create_transform - - -@KDFactory.register_transform_decorator("normalize") -def normalize_transform( - name: str, - p: float = 1, - dim: int = -1, - eps: float = 1e-12, - mean: bool = False, - std: bool = False, - **kwargs, -): - if name != "normalize": - raise ValueError(f"Invalid transform name: {name}") - - def _normalize(val: Tensor) -> Tensor: - out = TF.normalize(val, p=p, dim=dim, eps=eps) - - if mean: - out = out - out.mean(dim=dim, keepdim=True) - - if std: - out = out / out.std(dim=dim, keepdim=True) - - return out - - def _create_transform(val: TensorOrCollectionType) -> TensorOrCollectionType: - return recursive_apply(val, _normalize) - - return _create_transform - - -@KDFactory.register_comparison_decorator("l1_distance") -def l1_comparison(name: str, dim: int = -1, **kwargs): - if name != "l1_distance": - raise ValueError(f"Invalid comparison name: {name}") - - def _l1(val_one: Tensor, val_two: Tensor) -> Tensor: - return torch.sum(torch.abs(val_one - val_two), dim=dim) - - def _create_comparison( - val_one: TensorOrCollectionType, val_two: TensorOrCollectionType - ) -> TensorOrCollectionType: - return recursive_combine(val_one, val_two, _l1) - - return _create_comparison - - -@KDFactory.register_comparison_decorator("l2_distance") -def l2_comparison(name: str, dim: int = -1, **kwargs): - if name != "l2_distance": - raise ValueError(f"Invalid comparison name: {name}") - - def _l2(val_one: Tensor, val_two: Tensor) -> Tensor: - return torch.sum((val_one - val_two) ** 2, dim=dim) - - def _create_comparison( - val_one: TensorOrCollectionType, val_two: TensorOrCollectionType - ) -> TensorOrCollectionType: - return recursive_combine(val_one, val_two, _l2) - - return _create_comparison - - -@KDFactory.register_comparison_decorator("inner_product") -def inner_product_comparison(name: str, dim: int = -1, **kwargs): - if name != "inner_product": - raise ValueError(f"Invalid comparison name: {name}") - - def _inner_product(val_one: Tensor, val_two: Tensor) -> Tensor: - return torch.sum(val_one * val_two, dim=dim) - - def _create_comparison( - val_one: TensorOrCollectionType, val_two: TensorOrCollectionType - ) -> TensorOrCollectionType: - return recursive_combine(val_one, val_two, _inner_product) - - return _create_comparison - - -@KDFactory.register_comparison_decorator("cosine_similarity") -def cosine_similarity_comparison(name: str, dim: int = -1, **kwargs): - if name != "cosine_similarity": - raise ValueError(f"Invalid comparison name: {name}") - - def _cosine_similarity(val_one: Tensor, val_two: Tensor) -> Tensor: - return torch.sum(val_one * val_two, dim=dim) / ( - torch.norm(val_one, dim=dim) * torch.norm(val_two, dim=dim) - ) - - def _create_comparison( - val_one: TensorOrCollectionType, val_two: TensorOrCollectionType - ) -> TensorOrCollectionType: - return recursive_combine(val_one, val_two, _cosine_similarity) - - return _create_comparison - - -@KDFactory.register_comparison_decorator("kl_divergence") -def kl_divergence_comparison( - name: str, dim: int = -1, temperature: float = 1.0, **kwargs -): - if name != "kl_divergence": - raise ValueError(f"Invalid comparison name: {name}") - - def _kl_divergence(val_one: Tensor, val_two: Tensor) -> Tensor: - val_one = val_one / temperature - 
val_two = val_two / temperature - - return torch.sum(val_one * torch.log(val_one / val_two), dim=dim) - - def _create_comparison( - val_one: TensorOrCollectionType, val_two: TensorOrCollectionType - ) -> TensorOrCollectionType: - return recursive_combine(val_one, val_two, _kl_divergence) - - return _create_comparison - - -@KDFactory.register_comparison_decorator("cross_entropy") -def cross_entropy_comparison( - name: str, temperature: float = 1.0, reduction: str = "none", **kwargs -): - if name != "cross_entropy": - raise ValueError(f"Invalid projection name: {name}") - - def _cross_entropy(val_one: Tensor, val_two: Tensor) -> Tensor: - val_one = val_one / temperature - val_two = val_two / temperature - - return TF.cross_entropy(val_one, val_two, reduction=reduction) - - def _create_comparison( - val_one: TensorOrCollectionType, val_two: TensorOrCollectionType - ) -> TensorOrCollectionType: - return recursive_combine(val_one, val_two, _cross_entropy) - - return _create_comparison - - -@KDFactory.register_comparison_decorator("square_head") -def square_head_comparison(name: str, **kwargs): - if name != "square_head": - raise ValueError(f"Invalid projection name: {name}") - - def _square_head(val_one: Tensor, val_two: Tensor) -> Tensor: - numerator = torch.sum(torch.square(val_two - val_one)) - denominator = torch.sum(torch.square(val_two)) - - return numerator / denominator - - def _create_comparison( - val_one: TensorOrCollectionType, val_two: TensorOrCollectionType - ) -> TensorOrCollectionType: - return recursive_combine(val_one, val_two, _square_head) - - return _create_comparison diff --git a/src/llmcompressor/modifiers/distillation/utils/pytorch/kd_wrapper.py b/src/llmcompressor/modifiers/distillation/utils/pytorch/kd_wrapper.py deleted file mode 100644 index ee96e4763d..0000000000 --- a/src/llmcompressor/modifiers/distillation/utils/pytorch/kd_wrapper.py +++ /dev/null @@ -1,116 +0,0 @@ -from typing import List, Optional, Set, Tuple - -import torch -from torch.nn import Module - -from llmcompressor.modifiers.distillation.utils.pytorch.kd_factory import ( - TransformFuncType, -) - -__all__ = ["KDModuleWrapper"] - - -class KDModuleWrapper(Module): - KD_TRANSFORMED_BUFFER = "kd_last_transformed" - - def __init__( - self, - layer: Module, - hidden_size: Tuple, - transforms: Optional[List[TransformFuncType]], - fsdp_active: bool, - offload_output: bool, - ): - super(KDModuleWrapper, self).__init__() - - self.layer = layer - self._save_active = False - self._fsdp_active = fsdp_active - self.offload_output = offload_output - self.kd_transforms = transforms - self.kd_enabled = False - self.register_buffer( - self.KD_TRANSFORMED_BUFFER, torch.zeros(hidden_size, device="cpu") - ) - self._init_called = True # make sure this is last property to be set - - def _clear_missing_keys(module, incompatible_keys): - incompatible_keys.missing_keys.clear() - - self.register_load_state_dict_post_hook(_clear_missing_keys) - - def forward(self, *args, **kwargs): - if not self.kd_enabled: - return self.layer(*args, **kwargs) - - org_output = self.layer(*args, **kwargs) - output = org_output if isinstance(org_output, torch.Tensor) else org_output[0] - - if self.kd_transforms is not None: - for transform in self.kd_transforms: - output = transform(output) - - if self.offload_output: - output = output.to("cpu") - setattr(self, self.KD_TRANSFORMED_BUFFER, output) - return org_output - - def state_dict(self, destination=None, prefix="", keep_vars=False, **kwargs): - return self.layer.state_dict( - 
destination=destination, prefix=prefix, keep_vars=keep_vars, **kwargs - ) - - def load_state_dict(self, state_dict, strict=True): - return self.layer.load_state_dict(state_dict, strict=strict) - - def _load_from_state_dict( - self, - state_dict, - prefix, - local_metadata, - strict, - missing_keys, - unexpected_keys, - error_msgs, - ): - self.layer._load_from_state_dict( - state_dict=state_dict, - prefix=prefix, - local_metadata=local_metadata, - strict=strict, - missing_keys=missing_keys, - unexpected_keys=unexpected_keys, - error_msgs=error_msgs, - ) - - def named_modules( - self, - memo: Optional[Set["Module"]] = None, - prefix: str = "", - remove_duplicate: bool = True, - ): - # outside of saving, we want the full names of modules in two cases: - # 1. trainer initialization, so teacher is moved to the correct device. This is - # caught by the kd_enabled flag, which is set when the modifier is started - # 2. running in DataParallel (non-FSDP) mode so the replicate function can pick - # up the teacher. - if self._save_active or (self.kd_enabled and self._fsdp_active): - return self.layer.named_modules( - memo=memo, prefix=prefix, remove_duplicate=remove_duplicate - ) - - return super().named_modules( - memo=memo, prefix=prefix, remove_duplicate=remove_duplicate - ) - - def prepare_for_save(self): - """ - Prepare model structure to be saved, specifically `self.named_modules` - """ - self._save_active = True - - def finish_save(self): - """ - Finish saving model - """ - self._save_active = False diff --git a/src/llmcompressor/modifiers/distillation/utils/pytorch/model_wrapper.py b/src/llmcompressor/modifiers/distillation/utils/pytorch/model_wrapper.py deleted file mode 100644 index 33ba6f6986..0000000000 --- a/src/llmcompressor/modifiers/distillation/utils/pytorch/model_wrapper.py +++ /dev/null @@ -1,135 +0,0 @@ -from typing import Any, Dict, Optional, Set - -import torch -from torch.nn import Module - -__all__ = ["KDModelWrapper"] - - -class KDModelWrapper(Module): - KD_LAST_COMPARISON = "kd_last_comparison" - - def __init__( - self, - student_model: Module, - teacher_model: Module, - wrappers: Dict[str, Any], - comparison, - fsdp_active: bool, - ): - super(KDModelWrapper, self).__init__() - - self.student_model = student_model - self.teacher_model = teacher_model - self.wrappers = wrappers - self.kd_comparison = comparison - self._save_active = False - self._fsdp_active = fsdp_active - self.kd_enabled = False - self.register_buffer(self.KD_LAST_COMPARISON, torch.zeros(1, device="cpu")) - self._init_called = True # make sure this is last property to be set - - def _clear_missing_keys(module, incompatible_keys): - incompatible_keys.missing_keys.clear() - - self.register_load_state_dict_post_hook(_clear_missing_keys) - - def forward(self, *args, **kwargs): - if not self.kd_enabled: - return self.student_model(*args, **kwargs) - - org_output = self.student_model(*args, **kwargs) - with torch.no_grad(): - self.teacher_model(*args, **kwargs) - - layerwise_comps = [] - nonpad_tokens = kwargs["attention_mask"] == 1 - device = nonpad_tokens.device - for key, (student_wrapper, teacher_wrapper) in self.wrappers.items(): - student_out = student_wrapper.kd_last_transformed.to(device)[nonpad_tokens] - teacher_out = teacher_wrapper.kd_last_transformed.to(device)[nonpad_tokens] - comp = self.kd_comparison(student_out, teacher_out) - layerwise_comps.append(comp) - - setattr(self, self.KD_LAST_COMPARISON, torch.stack(layerwise_comps).mean()) - - return org_output - - def state_dict(self, destination=None, 
prefix="", keep_vars=False, **kwargs): - return self.student_model.state_dict( - destination=destination, prefix=prefix, keep_vars=keep_vars, **kwargs - ) - - def load_state_dict(self, state_dict, strict=True): - return self.student_model.load_state_dict(state_dict, strict=strict) - - def _load_from_state_dict( - self, - state_dict, - prefix, - local_metadata, - strict, - missing_keys, - unexpected_keys, - error_msgs, - ): - self.student_model._load_from_state_dict( - state_dict=state_dict, - prefix=prefix, - local_metadata=local_metadata, - strict=strict, - missing_keys=missing_keys, - unexpected_keys=unexpected_keys, - error_msgs=error_msgs, - ) - - def named_modules( - self, - memo: Optional[Set["Module"]] = None, - prefix: str = "", - remove_duplicate: bool = True, - ): - # outside of saving, we want the full names of modules in two cases: - # 1. trainer initialization, so teacher is moved to the correct device. This is - # caught by the kd_enabled flag, which is set when the modifier is started - # 2. running in DataParallel (non-FSDP) mode so the replicate function can pick - # up the teacher. - if self._save_active or (self.kd_enabled and self._fsdp_active): - return self.student_model.named_modules( - memo=memo, prefix=prefix, remove_duplicate=remove_duplicate - ) - - return super().named_modules( - memo=memo, prefix=prefix, remove_duplicate=remove_duplicate - ) - - def named_children(self): - return self.student_model.named_children() - - def train(self, mode: bool = True): - self.student_model.train(mode) - return self - - def prepare_for_save(self): - """ - Prepare model structure to be saved, specifically `self.named_modules` - """ - self._save_active = True - for student_wrapper, teacher_wrapper in self.wrappers.values(): - student_wrapper.prepare_for_save() - teacher_wrapper.prepare_for_save() - - def finish_save(self): - """ - Finish saving model - """ - self._save_active = False - for student_wrapper, teacher_wrapper in self.wrappers.values(): - student_wrapper.finish_save() - teacher_wrapper.finish_save() - - def __getattr__(self, name: str) -> Any: - try: - return super().__getattr__(name) - except AttributeError: - return getattr(self.student_model, name) From 13f4a7eeb11361af3037a478416864c16067b853 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 18 Nov 2025 18:41:09 -0500 Subject: [PATCH 21/23] remove link --- src/llmcompressor/modifiers/README.md | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/llmcompressor/modifiers/README.md b/src/llmcompressor/modifiers/README.md index 72ff0b0586..fad84b5196 100644 --- a/src/llmcompressor/modifiers/README.md +++ b/src/llmcompressor/modifiers/README.md @@ -65,11 +65,4 @@ rather than the linear smoothing done by SmoothQuant. The implementation is base One-shot pruning algorithms often introduce accuracy degradation that can be recovered with finetuning. This modifier ensures that the sparsity mask of the model is maintained during finetuning, allowing a sparse model to recover accuracy while maintaining its sparsity structure. It is intended to be used after a pruning modifier -such as `SparseGPT` or `WANDA` has already been applied. - -### [Distillation](./distillation/output/base.py) -To better recover accuracy of sparse models during finetuning, we can also use a teacher model of the same architecture -to influence the loss. This modifier is intended to be used in conjunction with `ConstantPruning` modifier on a -pruned model, with the dense version of the model being used as the teacher. 
Both output distillation loss and -layer-by-layer distillation loss are supported. The layer-by-layer implementation follows the Square Head distillation -algorithm presented in [Sparse Fine-tuning for Inference Acceleration of Large Language Models](https://arxiv.org/pdf/2310.06927). \ No newline at end of file +such as `SparseGPT` or `WANDA` has already been applied. \ No newline at end of file From 5b3c280eed4bb53bcdfcfdcb932786fc4682bcc0 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Wed, 19 Nov 2025 12:59:15 -0500 Subject: [PATCH 22/23] update example --- .../2of4_w4a16_group-128_recipe.yaml | 13 ---- .../quantization_2of4_sparse_w4a16/README.md | 60 ++++--------------- .../llama7b_sparse_w4a16.py | 53 +++------------- 3 files changed, 20 insertions(+), 106 deletions(-) diff --git a/examples/quantization_2of4_sparse_w4a16/2of4_w4a16_group-128_recipe.yaml b/examples/quantization_2of4_sparse_w4a16/2of4_w4a16_group-128_recipe.yaml index 7a002633a1..bb76f11015 100644 --- a/examples/quantization_2of4_sparse_w4a16/2of4_w4a16_group-128_recipe.yaml +++ b/examples/quantization_2of4_sparse_w4a16/2of4_w4a16_group-128_recipe.yaml @@ -5,19 +5,6 @@ sparsity_stage: mask_structure: "2:4" targets: ["Linear"] ignore: ["re:.*lm_head"] -finetuning_stage: - finetuning_modifiers: - ConstantPruningModifier: - targets: [ - 're:.*q_proj.weight', - 're:.*k_proj.weight', - 're:.*v_proj.weight', - 're:.*o_proj.weight', - 're:.*gate_proj.weight', - 're:.*up_proj.weight', - 're:.*down_proj.weight', - ] - start: 0 quantization_stage: quantization_modifiers: GPTQModifier: diff --git a/examples/quantization_2of4_sparse_w4a16/README.md b/examples/quantization_2of4_sparse_w4a16/README.md index 932284fe47..8fcd880cc6 100644 --- a/examples/quantization_2of4_sparse_w4a16/README.md +++ b/examples/quantization_2of4_sparse_w4a16/README.md @@ -4,9 +4,10 @@ > `2:4 sparisty + int4/int8` mixed precision computation is supported in vLLM on Nvidia capability > 8.0 (Ampere, Ada Lovelace, Hopper). -## NOTE: -Fine tuning can require more steps than is shown in the example. -See the Axolotl integration blog post for best fine tuning practices +## NOTE: The following example no longer includes finetuning as training +training support has been deprecated as of v0.9.0. To apply finetuning +to your sparse model, see the Axolotl integration blog post for best +fine tuning practices https://developers.redhat.com/articles/2025/06/17/axolotl-meets-llm-compressor-fast-sparse-open @@ -78,22 +79,11 @@ output_path = Path(output_dir) splits = {"calibration": "train_gen[:5%]", "train": "train_gen"} max_seq_length = 512 num_calibration_samples = 512 - -# set training parameters for finetuning -# increase num_train_epochs for longer training -num_train_epochs = 0.01 -logging_steps = 500 -save_steps = 5000 -gradient_checkpointing = True # saves memory during training -learning_rate = 0.0001 -bf16 = False # using full precision for training -lr_scheduler_type = "cosine" -warmup_ratio = 0.1 preprocessing_num_workers = 8 ``` -## Step 2: Run `sparsification`, `fine-tuning`, and `quantization` -The compression process now runs in three stages: sparsification, fine-tuning, and quantization. +## Step 2: Run `sparsification` and `quantization` +The compression process now runs in two stages: sparsification and quantization. Each stage saves the intermediate model outputs to the `output_llama7b_2of4_w4a16_channel` directory. ```python @@ -106,47 +96,19 @@ output_path = Path(output_dir) # 1. 
Oneshot sparsification: apply pruning oneshot( model=model, - dataset=dataset, - recipe=recipe, - splits=splits, - num_calibration_samples=num_calibration_samples, - preprocessing_num_workers=preprocessing_num_workers, + **oneshot_kwargs, output_dir=output_dir, stage="sparsity_stage", ) -# 2. Sparse fine-tuning: improve accuracy on pruned model -train( - model=output_path / "sparsity_stage", - dataset=dataset, - recipe=recipe, - splits=splits, - num_calibration_samples=num_calibration_samples, - preprocessing_num_workers=preprocessing_num_workers, - bf16=bf16, - max_seq_length=max_seq_length, - num_train_epochs=num_train_epochs, - logging_steps=logging_steps, - save_steps=save_steps, - gradient_checkpointing=gradient_checkpointing, - learning_rate=learning_rate, - lr_scheduler_type=lr_scheduler_type, - warmup_ratio=warmup_ratio, - output_dir=output_dir, - stage="finetuning_stage", -) -# 3. Oneshot quantization: compress model weights to lower precision +# 2. Oneshot quantization: compress model weights to lower precision quantized_model = oneshot( - model=output_path / "finetuning_stage", - dataset=dataset, - recipe=recipe, - splits=splits, - num_calibration_samples=num_calibration_samples, - preprocessing_num_workers=preprocessing_num_workers, - output_dir=output_dir, + model=(output_path / "sparsity_stage"), + **oneshot_kwargs, stage="quantization_stage", ) + # skip_sparsity_compression_stats is set to False # to account for sparsity in the model when compressing quantized_model.save_pretrained( diff --git a/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py b/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py index 51e24f0063..1c54e906c6 100644 --- a/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py +++ b/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py @@ -1,5 +1,7 @@ -# NOTE: Fine tuning can require more steps than is shown in the example -# See the Axolotl integration blog post for best fine tuning practices +# NOTE: The following example no longer includes finetuning as training +# training support has been deprecated as of v0.9.0. 
To apply finetuning +# to your sparse model, see the Axolotl integration blog post for best +# fine tuning practices # https://developers.redhat.com/articles/2025/06/17/axolotl-meets-llm-compressor-fast-sparse-open from pathlib import Path @@ -8,7 +10,7 @@ from loguru import logger from transformers import AutoModelForCausalLM, AutoTokenizer -from llmcompressor import oneshot, train +from llmcompressor import oneshot # load the model in as bfloat16 to save on memory and compute model_stub = "neuralmagic/Llama-2-7b-ultrachat200k" @@ -26,22 +28,11 @@ output_path = Path(output_dir) # set dataset config parameters -splits = {"calibration": "train_gen[:5%]", "train": "train_gen"} +splits = {"calibration": "train_gen[:5%]"} max_seq_length = 512 -num_calibration_samples = 512 - -# set training parameters for finetuning -num_train_epochs = 0.01 -logging_steps = 500 -save_steps = 5000 -gradient_checkpointing = True # saves memory during training -learning_rate = 0.0001 -bf16 = False # using full precision for training -lr_scheduler_type = "cosine" -warmup_ratio = 0.1 +num_calibration_samples = 10 preprocessing_num_workers = 64 - oneshot_kwargs = dict( dataset=dataset, recipe=recipe, @@ -50,26 +41,10 @@ splits=splits, ) -training_kwargs = dict( - bf16=bf16, - max_seq_length=max_seq_length, - num_train_epochs=num_train_epochs, - logging_steps=logging_steps, - save_steps=save_steps, - gradient_checkpointing=gradient_checkpointing, - learning_rate=learning_rate, - lr_scheduler_type=lr_scheduler_type, - warmup_ratio=warmup_ratio, -) - -# This will run the targeted stage of the recipe -# oneshot sparsification -> finetuning -> oneshot quantization - # Models are automatically saved in -# ./output_llama7b_2of4_w4a16_channel/ + (finetuning/sparsity/quantization)_stage +# ./output_llama7b_2of4_w4a16_channel/ + (sparsity/quantization)_stage # Oneshot sparsification - oneshot( model=model, **oneshot_kwargs, @@ -77,19 +52,9 @@ stage="sparsity_stage", ) -# Sparse finetune -# This step can be supplanted by fine tuning via integrated FT libraries such as Axolotl -train( - model=(output_path / "sparsity_stage"), - **oneshot_kwargs, - **training_kwargs, - output_dir=output_dir, - stage="finetuning_stage", -) - # Oneshot quantization quantized_model = oneshot( - model=(output_path / "finetuning_stage"), + model=(output_path / "sparsity_stage"), **oneshot_kwargs, stage="quantization_stage", ) From 1cea473281574ff4ad8e5c17ea000774c953f417 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Wed, 19 Nov 2025 13:21:56 -0500 Subject: [PATCH 23/23] update readme --- .../quantization_2of4_sparse_w4a16/README.md | 2 +- .../llama7b_sparse_w4a16.py | 5 +- src/llmcompressor/entrypoints/README.md | 188 +----------------- 3 files changed, 13 insertions(+), 182 deletions(-) diff --git a/examples/quantization_2of4_sparse_w4a16/README.md b/examples/quantization_2of4_sparse_w4a16/README.md index 8fcd880cc6..178f4373b8 100644 --- a/examples/quantization_2of4_sparse_w4a16/README.md +++ b/examples/quantization_2of4_sparse_w4a16/README.md @@ -5,7 +5,7 @@ > `2:4 sparisty + int4/int8` mixed precision computation is supported in vLLM on Nvidia capability > 8.0 (Ampere, Ada Lovelace, Hopper). ## NOTE: The following example no longer includes finetuning as training -training support has been deprecated as of v0.9.0. To apply finetuning +Training support has been deprecated as of v0.9.0. 
To apply finetuning to your sparse model, see the Axolotl integration blog post for best fine tuning practices https://developers.redhat.com/articles/2025/06/17/axolotl-meets-llm-compressor-fast-sparse-open diff --git a/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py b/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py index 1c54e906c6..b2f4e57b64 100644 --- a/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py +++ b/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py @@ -1,5 +1,6 @@ -# NOTE: The following example no longer includes finetuning as training -# training support has been deprecated as of v0.9.0. To apply finetuning +# NOTE: The following example no longer includes finetuning as training. + +# Training support has been deprecated as of v0.9.0. To apply finetuning # to your sparse model, see the Axolotl integration blog post for best # fine tuning practices # https://developers.redhat.com/articles/2025/06/17/axolotl-meets-llm-compressor-fast-sparse-open diff --git a/src/llmcompressor/entrypoints/README.md b/src/llmcompressor/entrypoints/README.md index 25a85bae30..97da4fcaab 100644 --- a/src/llmcompressor/entrypoints/README.md +++ b/src/llmcompressor/entrypoints/README.md @@ -1,21 +1,18 @@ -# Compression and Fine-tuning Entrypoint +# Compression Entrypoints ## Oneshot - An ideal compression technique reduces memory footprint while maintaining accuracy. One-shot in LLM-Compressor supports faster inference on vLLM by applying post-training quantization (PTQ) or sparsification. ### PTQ -PTQ is performed to reduce the precision of quantizable weights (e.g., linear layers) to a lower bit-width. Supported formats are: -- [W4A16](../../../examples/quantization_w4a16/README.md) -- [W8A8-INT8](../../../examples/quantization_w8a8_int8/README.md) -- [W8A8-FP8](../../../examples/quantization_w8a8_fp8/README.md) +PTQ is performed to reduce the precision of quantizable weights (e.g., linear layers) to a lower bit-width. +A complete list of formats can be found here: https://docs.vllm.ai/projects/llm-compressor/en/latest/guides/compression_schemes/ ### Sparsification Sparsification reduces model complexity by pruning selected weight values to zero while retaining essential weights in a subset of parameters. Supported formats include: - [2:4-Sparsity with FP4 Weight](../../../examples/quantization_2of4_sparse_w4a16/README.md) - [2:4-Sparsity with FP8 Weight, FP8 Input Activation](../../../examples/sparse_2of4_quantization_fp8/README.md) -## Code +### Example Example scripts for all the above formats are located in the [examples](../../../examples/) folder. The [W8A8-FP8](../../../examples/quantization_w8a8_fp8/llama3_example.py) example is shown below: @@ -68,7 +65,6 @@ oneshot( ) ``` - ### Lifecycle The oneshot calibration lifecycle consists of three steps: @@ -88,177 +84,11 @@ The oneshot calibration lifecycle consists of three steps: This will automatically save the model weights to a compressed SafeTensors format. The tokenizer/processor, recipe, and the configuration file will also be saved. -## Train / Finetune -Compressed models can be trained to improve accuracy. Training is carried out using HuggingFace's Trainer. - -### Finetuning a Compressed Model -LLM-Compressor supports fine-tuning of quantized, sparsified, and sparse-quantized models. It offers both standard fine-tuning, knowledge distillation and SFT Trainer. 
- -## Code +## Model-Free PTQ +For certain cases, it may be beneficial to consider the `model_free_ptq` entrypoint such as when a model definition is lacking or if the `oneshot` entrypoint fails. +`model_free_ptq` can be applied for schemes that do not require data, such as Round-To-Nearest with FP8 or NVFP4A16. Examples applying the entrypoint can be found +here: https://github.com/vllm-project/llm-compressor/tree/main/examples/model_free_ptq. ### Finetuning -A compressed model generated using `oneshot` is saved to disk in a compressed format. To load it, the model must be decompressed using `CompressedTensorsConfig` with `AutoModelForCausalLM`. If the above `oneshot` example script was executed and the compressed model was saved to `./oneshot_model`, the following code is used to perform fine-tuning: - - -```python -from transformers.utils.quantization_config import CompressedTensorsConfig - -from llmcompressor import create_session, train - -# The saving directory -output_dir = "./oneshot_model" - -# The model to train -model = AutoModelForCausalLM.from_pretrained( - output_dir, - quantization_config=CompressedTensorsConfig(run_compressed=False), -) - -dataset = "open_platypus" # Define dataset to use for kd -output_dir = "./finetuned_model" -splits = "train[:50%]" # Use 50% of the training data -max_steps = ( - 25 # Number of training steps (updates) before stopping the training process -) -num_calibration_samples = 8 # Number of workers processing datasets in parallel - -# Create an isolated session independent from the previous runs -with create_session(): - train( - model=model, # The model to finetune - dataset=dataset, # The data to carry out finetuning - output_dir=output_dir, # The output directory to save - num_calibration_samples=num_calibration_samples, # The number of workers to carry out dataset processing - splits=splits, # The dataset key and percentage of samples to use - max_steps=max_steps, # The total number of iterations to carry out training - ) -``` - - -### Knowledge Distillation - -To perform knowledge distillation, a teacher model and a student model (the compressed model) must be defined. The loss between the student and the teacher can be specified in the recipe by defining the `comparison` key. In this case, KL divergence is used to compare the output distributions of the student and the teacher. -Comparisons are defined in `/src/llmcompressor/modifiers/distillation/utils/pytorch/kd_factory.py`. - -```python -# Define the teacher model -distill_teacher = AutoModelForCausalLM.from_pretrained( - "meta-llama/Meta-Llama-3-8B-Instruct", -) - -# Define the recipe, use knowledge distillation modifier and target the `model.layers` using a regex with -recipe = r""" -kd_stage: - distillation_modifiers: - OutputDistillationModifier: - targets: ["re:model.layers.\\d+$"] - comparison: "kl_divergence" - start: 0 - orig_scale: 1.0 - distill_scale: 1.0 -""" - -# Create an isolated session from the previous runs -with create_session(): - train( - ... - distill_teacher=distill_teacher, # The teacher model - recipe=recipe, # The recipe to use - ) - -``` - -The output terminal will provide the sparsification, quantization and training metrics: - -```bash -2025-02-25T18:39:08.984855-0500 | log_model_sparsification | INFO - There are 8033013760 prunable params which have 0.02% avg sparsity. -2025-02-25T18:39:08.987302-0500 | log_model_sparsification | INFO - There are 8033013760 quantizable params, with a quantization percentage of 86.88%. 
-***** train metrics ***** - epoch = 0.016 - perplexity = 1.5422 - total_flos = 3221945GF - train_loss = 0.4332 - train_runtime = 0:03:53.39 - train_samples = 12463 - train_samples_per_second = 0.857 - train_steps_per_second = 0.107 -``` - -### End-to-end Script -The end-to-end script for carrying out `oneshot` for `W8A8-FP8` and then knowledge distillation is shown below: - -```python -from transformers import AutoModelForCausalLM, AutoTokenizer - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import QuantizationModifier - -MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" - -# The directory for saving -oneshot_output_dir = "./oneshot_model" - -# Load the model -model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") -# Load the tokenizer -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Define the recipe. `scheme="FP8_DYNAMIC"` compresses to W8A8-FP8, which is -# FP8 channel-wise for weight, and FP8 dynamic per token activation -recipe = QuantizationModifier( - targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"] -) - -# compress the model -oneshot(model=model, recipe=recipe, output_dir=oneshot_output_dir) - -from transformers.utils.quantization_config import CompressedTensorsConfig - -from llmcompressor import create_session, train - -# Student model -model = AutoModelForCausalLM.from_pretrained( - oneshot_output_dir, - quantization_config=CompressedTensorsConfig(run_compressed=False), -) - -dataset = "open_platypus" # Define dataset to use for knowledge distillation -finetune_output_dir = "./finetuned_model" # The output saving directory -splits = "train[:50%]" # Use 50% of the training data -max_steps = ( - 25 # The number of training steps (updates) before stopping the training process -) -num_calibration_samples = 8 # The number of workers processing datasets in parallel - -# Define teacher model -distill_teacher = AutoModelForCausalLM.from_pretrained( - "meta-llama/Meta-Llama-3-8B-Instruct", -) - -# Define the recipe, use knowledge distillation modifier and target the `model.layers` using a regex with -# KL divergence comparison -recipe = r""" -kd_stage: - distillation_modifiers: - OutputDistillationModifier: - targets: ["re:model.layers.\\d+$"] - comparison: "kl_divergence" - start: 0 - orig_scale: 1.0 - distill_scale: 1.0 -""" - -# Create an isolated session from the previous runs -with create_session(): - train( - model=model, # The student model - dataset=dataset, # The data to carry out finetuning - output_dir=finetune_output_dir, # Output directory to save - num_calibration_samples=num_calibration_samples, # The number of workers to carry out dataset processing - splits=splits, # The percentage of the subsets of a dataset to use - max_steps=max_steps, # The number of training steps - distill_teacher=distill_teacher, # The teacher model - recipe=recipe, # The recipe to use - ) -``` \ No newline at end of file +As of LLM Compressor v0.9.0, training support has been deprecated. To apply finetuning to your model, such as in the case of sparse-finetuning, Axolotl training can be applied. A step-by-step guide explaining how to apply the Axolotl integration can be found here: https://developers.redhat.com/articles/2025/06/17/axolotl-meets-llm-compressor-fast-sparse-open# as well as in the Axolotl documentation: https://docs.axolotl.ai/docs/custom_integrations.html#llmcompressor. \ No newline at end of file
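Once the quantization stage has run, the resulting checkpoint is intended to be served with vLLM, which as noted in the example README supports `2:4 sparsity + int4/int8` mixed-precision compute on NVIDIA Ampere and newer GPUs. Below is a minimal serving sketch, assuming vLLM is installed and that the example's default output location was used (`./output_llama7b_2of4_w4a16_channel/quantization_stage`); the prompt and sampling settings are illustrative only.

```python
# Minimal vLLM inference sketch for the checkpoint produced by the quantization stage.
# Assumptions: vLLM is installed and the example's default output directory was used;
# adjust `model_path` to wherever save_pretrained() wrote the compressed model.
from vllm import LLM, SamplingParams

model_path = "./output_llama7b_2of4_w4a16_channel/quantization_stage"

# vLLM reads the compressed-tensors quantization config directly from the checkpoint.
llm = LLM(model=model_path)
sampling_params = SamplingParams(temperature=0.8, max_tokens=128)

# Illustrative prompt; any text prompt works here.
outputs = llm.generate(["What is 2:4 structured sparsity?"], sampling_params)
print(outputs[0].outputs[0].text)
```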