
Commit e6fdfed

dsikka authored and HDCharles committed
add additional test; add hints
1 parent 9bce0d8 commit e6fdfed

File tree

2 files changed: +63 -4 lines changed


src/llmcompressor/modeling/qwen3_next_moe.py

Lines changed: 7 additions & 2 deletions
@@ -24,6 +24,11 @@
 
 @register_moe_calibration("Qwen3NextSparseMoeBlock")
 class CalibrationQwen3NextSparseMoeBlock(MoECalibrationModule):
+    from transformers import Qwen3NextConfig
+    from transformers.models.qwen3_next.modeling_qwen3_next import (
+        Qwen3NextSparseMoeBlock,
+    )
+
     """
     Calibration version of Qwen3NextSparseMoeBlock that sends all tokens to all experts.
     """
@@ -32,8 +37,8 @@ class CalibrationQwen3NextSparseMoeBlock(MoECalibrationModule):
 
     def __init__(
         self,
-        original,
-        config,
+        original: Qwen3NextSparseMoeBlock,
+        config: Qwen3NextConfig,
         calibrate_all_experts: bool = True,
     ):
         super().__init__()
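
For context, a minimal sketch of what the new hints describe: the calibration wrapper takes an original Qwen3NextSparseMoeBlock plus the model's Qwen3NextConfig. The direct construction below is illustrative only and assumes the wrapper can be built by hand (the test that follows goes through moe_calibration_context instead); variable names are not from the commit.

# Illustrative sketch, not part of the commit.
from transformers import AutoModelForCausalLM
from transformers.models.qwen3_next.modeling_qwen3_next import Qwen3NextSparseMoeBlock

from llmcompressor.modeling.qwen3_next_moe import CalibrationQwen3NextSparseMoeBlock
from llmcompressor.utils.dev import skip_weights_download

# Build the model skeleton without fetching the 80B checkpoint weights.
with skip_weights_download():
    model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-Next-80B-A3B-Instruct")

# Wrap the first sparse-MoE block found; model.config is the Qwen3NextConfig
# instance the new annotation asks for.
for name, module in model.named_modules():
    if isinstance(module, Qwen3NextSparseMoeBlock):
        calib_block = CalibrationQwen3NextSparseMoeBlock(
            original=module,
            config=model.config,
            calibrate_all_experts=True,
        )
        break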

tests/llmcompressor/modeling/test_calib_qwen3_next.py

Lines changed: 56 additions & 2 deletions
@@ -1,8 +1,62 @@
+import contextlib
+from functools import partial
+
+import pytest
 import torch
+from transformers import AutoModelForCausalLM
 
+from llmcompressor.modeling.moe_context import moe_calibration_context
 from llmcompressor.modeling.qwen3_next_moe import CalibrationQwen3NextSparseMoeBlock
-from llmcompressor.utils.helpers import calibration_forward_context
-from tests.testing_utils import requires_gpu
+from llmcompressor.utils.dev import skip_weights_download
+from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context
+from tests.testing_utils import requires_cadence, requires_gpu
+
+
+@requires_cadence("weekly")
+@pytest.mark.parametrize("model_stub", ["Qwen/Qwen3-Next-80B-A3B-Instruct"])
+def test_calib_replace_qwen3moe_all_experts(model_stub):
+    with skip_weights_download():
+        model = AutoModelForCausalLM.from_pretrained(model_stub)
+
+    # Qwen3MoE layer replacement is temporary within the context
+    with contextlib.ExitStack() as stack:
+        stack.enter_context(calibration_forward_context(model))
+        stack.enter_context(DisableQuantization(model))
+        stack.enter_context(moe_calibration_context(model, calibrate_all_experts=True))
+
+        # Find one MoE layer
+        moe_layer = None
+        for name, module in model.named_modules():
+            if isinstance(module, CalibrationQwen3NextSparseMoeBlock):
+                moe_layer = module
+                break
+
+        assert moe_layer is not None
+
+        num_experts = len(moe_layer.experts)
+        expert_triggered = [False for _ in range(num_experts)]
+
+        # Define the hook function
+        def hook_fn(i, module, input, output):
+            expert_triggered[i] = True
+
+        # Attach hooks using functools.partial to bind each index
+        for i, expert in enumerate(moe_layer.experts):
+            expert.register_forward_hook(partial(hook_fn, i))
+
+        # Create dummy input tensor that simulates hidden_states
+        hidden_dim = model.config.hidden_size
+        batch, seq_len = 4, 32
+        sample = torch.randn(batch, seq_len, hidden_dim, dtype=torch.float32)
+
+        # Forward through the MoE layer directly
+        with torch.no_grad():
+            _ = moe_layer(sample)
+
+        # Assert all experts are used
+        assert all(
+            expert_triggered
+        ), f"Not all experts were triggered: {expert_triggered}"
 
 
 @requires_gpu
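
The hook wiring in the new test relies on functools.partial to freeze each expert's index before registering the hook, sidestepping Python's late-binding closures. Below is a self-contained sketch of the same pattern, using plain Linear layers as stand-in "experts"; the names here are illustrative, not from the test.

# Stand-alone illustration of the partial-bound forward-hook pattern above.
from functools import partial

import torch

experts = torch.nn.ModuleList([torch.nn.Linear(8, 8) for _ in range(4)])
triggered = [False] * len(experts)

def hook_fn(i, module, inputs, output):
    # partial(hook_fn, i) binds i at registration time, so each hook
    # flips only its own expert's flag.
    triggered[i] = True

for i, expert in enumerate(experts):
    expert.register_forward_hook(partial(hook_fn, i))

with torch.no_grad():
    x = torch.randn(2, 8)
    for expert in experts:
        _ = expert(x)

assert all(triggered), f"Not all experts were triggered: {triggered}"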
