vllm-project · dsikka · Nov 20, 2025 · Nov 18, 2025 · Nov 18, 2025 · Nov 18, 2025
diff --git a/.github/workflows/test-check-transformers.yaml b/.github/workflows/test-check-transformers.yaml
@@ -62,7 +62,7 @@ jobs:
     steps:
       - uses: actions/setup-python@v5
         with:
-          python-version: '3.10'
+          python-version: '3.12'
       - uses: actions/checkout@v4
         with:
           fetch-depth: 0
@@ -73,7 +73,7 @@ jobs:
         run: uv pip install .[dev]
       - uses: actions/checkout@v4
         with:
-          repository: "neuralmagic/compressed-tensors"
+          repository: "vllm-project/compressed-tensors"
           path: "compressed-tensors"
           fetch-depth: 0
           fetch-tags: true
@@ -93,10 +93,10 @@ jobs:
         if: (success() || failure()) && steps.install.outcome == 'success'
         run: |
           pytest -v tests/llmcompressor/transformers/compression
-      - name: Run Finetune Tests
+      - name: Run Data Tests
         if: (success() || failure()) && steps.install.outcome == 'success'
         run: |
-          pytest -v tests/llmcompressor/transformers/finetune
+          pytest -v tests/llmcompressor/transformers/data
       - name: Running GPTQ Tests
         if: (success() || failure()) && steps.install.outcome == 'success'
         run: |

diff --git a/.github/workflows/test-check.yaml b/.github/workflows/test-check.yaml
@@ -22,10 +22,14 @@ jobs:
     runs-on: ubuntu-22.04
     env:
       COVERAGE_FILE: ".coverage.base"
+    strategy:
+      matrix:
+        python: ["3.10", "3.13"]
     steps:
-      - uses: actions/setup-python@v5
+      - name: Set up Python
+        uses: actions/setup-python@v5
         with:
-          python-version: '3.12'
+          python-version: ${{ matrix.python }}
       - uses: actions/checkout@v4
         with:
           fetch-depth: 0
@@ -36,7 +40,7 @@ jobs:
         run: uv pip install .[dev]
       - uses: actions/checkout@v4
         with:
-          repository: "neuralmagic/compressed-tensors"
+          repository: "vllm-project/compressed-tensors"
           path: "compressed-tensors"
           fetch-depth: 0
           fetch-tags: true
@@ -73,10 +77,14 @@ jobs:
     runs-on: ubuntu-22.04
     env:
       COVERAGE_FILE: ".coverage.pytorch"
+    strategy:
+      matrix:
+        python: ["3.10", "3.13"]
     steps:
-      - uses: actions/setup-python@v5
+      - name: Set up Python
+        uses: actions/setup-python@v5
         with:
-          python-version: '3.11'
+          python-version: ${{ matrix.python }}
       - uses: actions/checkout@v4
         with:
           fetch-depth: 0
@@ -87,7 +95,7 @@ jobs:
         run: uv pip install .[dev]
       - uses: actions/checkout@v4
         with:
-          repository: "neuralmagic/compressed-tensors"
+          repository: "vllm-project/compressed-tensors"
           path: "compressed-tensors"
           fetch-depth: 0
           fetch-tags: true

diff --git a/examples/quantization_2of4_sparse_w4a16/2of4_w4a16_group-128_recipe.yaml b/examples/quantization_2of4_sparse_w4a16/2of4_w4a16_group-128_recipe.yaml
@@ -5,19 +5,6 @@ sparsity_stage:
       mask_structure: "2:4"
       targets: ["Linear"]
       ignore: ["re:.*lm_head"]
-finetuning_stage:
-  finetuning_modifiers:
-    ConstantPruningModifier:
-      targets: [
-        're:.*q_proj.weight',
-        're:.*k_proj.weight', 
-        're:.*v_proj.weight',
-        're:.*o_proj.weight',
-        're:.*gate_proj.weight',
-        're:.*up_proj.weight',
-        're:.*down_proj.weight',
-      ]
-      start: 0
 quantization_stage:
   quantization_modifiers:
     GPTQModifier:

diff --git a/examples/quantization_2of4_sparse_w4a16/README.md b/examples/quantization_2of4_sparse_w4a16/README.md
@@ -4,9 +4,10 @@
 
 > `2:4 sparisty + int4/int8` mixed precision computation is supported in vLLM on Nvidia capability > 8.0 (Ampere, Ada Lovelace, Hopper).
 
-## NOTE: 
-Fine tuning can require more steps than is shown in the example.
-See the Axolotl integration blog post for best fine tuning practices
+## NOTE: This example no longer includes a fine-tuning stage
+Training support has been deprecated as of v0.9.0. To apply fine-tuning
+to your sparse model, see the Axolotl integration blog post for best
+fine-tuning practices:
 https://developers.redhat.com/articles/2025/06/17/axolotl-meets-llm-compressor-fast-sparse-open
 
 
@@ -78,22 +79,11 @@ output_path = Path(output_dir)
 splits = {"calibration": "train_gen[:5%]", "train": "train_gen"}
 max_seq_length = 512
 num_calibration_samples = 512
-
-# set training parameters for finetuning
-# increase num_train_epochs for longer training
-num_train_epochs = 0.01
-logging_steps = 500
-save_steps = 5000
-gradient_checkpointing = True  # saves memory during training
-learning_rate = 0.0001
-bf16 = False  # using full precision for training
-lr_scheduler_type = "cosine"
-warmup_ratio = 0.1
 preprocessing_num_workers = 8
 ```
 
-## Step 2: Run `sparsification`, `fine-tuning`, and `quantization`
-The compression process now runs in three stages: sparsification, fine-tuning, and quantization.
+## Step 2: Run `sparsification` and `quantization`
+The compression process now runs in two stages: sparsification and quantization.
 Each stage saves the intermediate model outputs to the `output_llama7b_2of4_w4a16_channel` directory.
 
 ```python
@@ -106,47 +96,19 @@ output_path = Path(output_dir)
 # 1. Oneshot sparsification: apply pruning
 oneshot(
     model=model,
-    dataset=dataset,
-    recipe=recipe,
-    splits=splits,
-    num_calibration_samples=num_calibration_samples,
-    preprocessing_num_workers=preprocessing_num_workers,
+    **oneshot_kwargs,
     output_dir=output_dir,
     stage="sparsity_stage",
 )
 
-# 2. Sparse fine-tuning: improve accuracy on pruned model
-train(
-    model=output_path / "sparsity_stage",
-    dataset=dataset,
-    recipe=recipe,
-    splits=splits,
-    num_calibration_samples=num_calibration_samples,
-    preprocessing_num_workers=preprocessing_num_workers,
-    bf16=bf16,
-    max_seq_length=max_seq_length,
-    num_train_epochs=num_train_epochs,
-    logging_steps=logging_steps,
-    save_steps=save_steps,
-    gradient_checkpointing=gradient_checkpointing,
-    learning_rate=learning_rate,
-    lr_scheduler_type=lr_scheduler_type,
-    warmup_ratio=warmup_ratio,
-    output_dir=output_dir,
-    stage="finetuning_stage",
-)
 
-# 3. Oneshot quantization: compress model weights to lower precision
+# 2. Oneshot quantization: compress model weights to lower precision
 quantized_model = oneshot(
-    model=output_path / "finetuning_stage",
-    dataset=dataset,
-    recipe=recipe,
-    splits=splits,
-    num_calibration_samples=num_calibration_samples,
-    preprocessing_num_workers=preprocessing_num_workers,
-    output_dir=output_dir,
+    model=(output_path / "sparsity_stage"),
+    **oneshot_kwargs,
     stage="quantization_stage",
 )
+
 # skip_sparsity_compression_stats is set to False
 # to account for sparsity in the model when compressing
 quantized_model.save_pretrained(

diff --git a/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py b/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py
@@ -1,5 +1,8 @@
-# NOTE: Fine tuning can require more steps than is shown in the example
-# See the Axolotl integration blog post for best fine tuning practices
+# NOTE: This example no longer includes a fine-tuning stage.
+
+# Training support has been deprecated as of v0.9.0. To apply fine-tuning
+# to your sparse model, see the Axolotl integration blog post for best
+# fine-tuning practices:
 # https://developers.redhat.com/articles/2025/06/17/axolotl-meets-llm-compressor-fast-sparse-open
 
 from pathlib import Path
@@ -8,7 +11,7 @@
 from loguru import logger
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
-from llmcompressor import oneshot, train
+from llmcompressor import oneshot
 
 # load the model in as bfloat16 to save on memory and compute
 model_stub = "neuralmagic/Llama-2-7b-ultrachat200k"
@@ -26,22 +29,11 @@
 output_path = Path(output_dir)
 
 # set dataset config parameters
-splits = {"calibration": "train_gen[:5%]", "train": "train_gen"}
+splits = {"calibration": "train_gen[:5%]"}
 max_seq_length = 512
-num_calibration_samples = 512
-
-# set training parameters for finetuning
-num_train_epochs = 0.01
-logging_steps = 500
-save_steps = 5000
-gradient_checkpointing = True  # saves memory during training
-learning_rate = 0.0001
-bf16 = False  # using full precision for training
-lr_scheduler_type = "cosine"
-warmup_ratio = 0.1
+num_calibration_samples = 10
 preprocessing_num_workers = 64
 
-
 oneshot_kwargs = dict(
     dataset=dataset,
     recipe=recipe,
@@ -50,46 +42,20 @@
     splits=splits,
 )
 
-training_kwargs = dict(
-    bf16=bf16,
-    max_seq_length=max_seq_length,
-    num_train_epochs=num_train_epochs,
-    logging_steps=logging_steps,
-    save_steps=save_steps,
-    gradient_checkpointing=gradient_checkpointing,
-    learning_rate=learning_rate,
-    lr_scheduler_type=lr_scheduler_type,
-    warmup_ratio=warmup_ratio,
-)
-
-# This will run the targeted stage of the recipe
-# oneshot sparsification -> finetuning -> oneshot quantization
-
 # Models are automatically saved in
-# ./output_llama7b_2of4_w4a16_channel/ + (finetuning/sparsity/quantization)_stage
+# ./output_llama7b_2of4_w4a16_channel/ + (sparsity/quantization)_stage
 
 # Oneshot sparsification
-
 oneshot(
     model=model,
     **oneshot_kwargs,
     output_dir=output_dir,
     stage="sparsity_stage",
 )
 
-# Sparse finetune
-# This step can be supplanted by fine tuning via integrated FT libraries such as Axolotl
-train(
-    model=(output_path / "sparsity_stage"),
-    **oneshot_kwargs,
-    **training_kwargs,
-    output_dir=output_dir,
-    stage="finetuning_stage",
-)
-
 # Oneshot quantization
 quantized_model = oneshot(
-    model=(output_path / "finetuning_stage"),
+    model=(output_path / "sparsity_stage"),
     **oneshot_kwargs,
     stage="quantization_stage",
 )

diff --git a/examples/trl_mixin/README.md b/examples/trl_mixin/README.md
diff --git a/examples/trl_mixin/ex_trl_constant.py b/examples/trl_mixin/ex_trl_constant.py