update qwen3_next to use moe_calibration_context

dsikka · HDCharles · commit 7772e25ce184 · 2025-11-03T19:34:33.000Z
diff --git a/examples/quantization_w4a4_fp4/qwen3_next_example.py b/examples/quantization_w4a4_fp4/qwen3_next_example.py
@@ -68,18 +68,22 @@ def tokenize(sample):
 )
 
 # Apply quantization.
-# We see `calibrate_moe_context` to True to update all `Qwen3MoeSparseMoeBlock`
-# during calibration.
+# MoE calibration is now handled automatically by the pipeline.
+# We set `moe_calibrate_all_experts` to True to ensure all experts receive
+# calibration data. This temporarily updates the model definition to use
+# `CalibrationQwen3NextSparseMoeBlock` (from `llmcompressor.modeling.qwen3_next_moe`)
+# which replaces the original `Qwen3NextSparseMoeBlock` class.
+# This updates how the forward pass is handled in the MoE block during calibration.
 # Feel free to update the definition under
-# llm-compressor/src/llmcompressor/modeling/qwen3_moe.py` to play around with
-# this behaviour and evaluate its impact on quantization performance
+# llm-compressor/src/llmcompressor/modeling/qwen3_next_moe.py to play around with
+# this behavior and evaluate its impact on quantization performance.
 oneshot(
     model=model,
     dataset=ds,
     recipe=recipe,
     max_seq_length=MAX_SEQUENCE_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
-    calibrate_moe_context=True,
+    moe_calibrate_all_experts=True,
 )
 
 
diff --git a/src/llmcompressor/modeling/prepare.py b/src/llmcompressor/modeling/prepare.py
@@ -29,6 +29,9 @@
 from llmcompressor.modeling.qwen3_moe import (  # noqa: F401
     CalibrationQwen3MoeSparseMoeBlock,
 )
+from llmcompressor.modeling.qwen3_next_moe import (  # noqa: F401
+    CalibrationQwen3NextSparseMoeBlock,
+)
 from llmcompressor.modeling.qwen3_vl_moe import (
     replace as replace_Qwen3VLMoE,
 )
diff --git a/src/llmcompressor/modeling/qwen3_next_moe.py b/src/llmcompressor/modeling/qwen3_next_moe.py
@@ -16,13 +16,25 @@
 
 import torch
 
+from llmcompressor.modeling.moe_context import (
+    MoECalibrationModule,
+    register_moe_calibration,
+)
+
+
+@register_moe_calibration("Qwen3NextSparseMoeBlock")
+class CalibrationQwen3NextSparseMoeBlock(MoECalibrationModule):
+    """
+    Calibration version of Qwen3NextSparseMoeBlock that sends all tokens to all experts.
+    """
+
+    is_permanent = False
 
-class Qwen3NextSparseMoeBlock(torch.nn.Module):
     def __init__(
         self,
-        config,
         original,
-        calibrate_all_experts: bool,
+        config,
+        calibrate_all_experts: bool = True,
     ):
         super().__init__()
         self.num_experts = config.num_experts
@@ -109,6 +121,6 @@ def replace(
     module,
     calibrate_all_experts,
 ):
-    return Qwen3NextSparseMoeBlock(
+    return CalibrationQwen3NextSparseMoeBlock(
         config=config, original=module, calibrate_all_experts=calibrate_all_experts
     )

Original file line number	Diff line number	Diff line change
`@@ -29,6 +29,9 @@`
`29`	`29`	`from llmcompressor.modeling.qwen3_moe import (  # noqa: F401`
`30`	`30`	`    CalibrationQwen3MoeSparseMoeBlock,`
`31`	`31`	`)`
	`32`	`+from llmcompressor.modeling.qwen3_next_moe import (  # noqa: F401`
	`33`	`+    CalibrationQwen3NextSparseMoeBlock,`
	`34`	`+)`
`32`	`35`	`from llmcompressor.modeling.qwen3_vl_moe import (`
`33`	`36`	`    replace as replace_Qwen3VLMoE,`
`34`	`37`	`)`