
Commit 9bce0d8

dsikka authored and HDCharles committed
update test
1 parent 23a613a · commit 9bce0d8

2 files changed, 8 insertions(+), 5 deletions(-)

src/llmcompressor/modeling/qwen3_next_moe.py

2 additions & 2 deletions

@@ -38,7 +38,7 @@ def __init__(
     ):
         super().__init__()
         self.num_experts = config.num_experts
-        self.top_k = config.top_k
+        self.top_k = original.top_k
         self.norm_topk_prob = config.norm_topk_prob

         # gating
@@ -56,7 +56,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         router_logits = self.gate(hidden_states)

         routing_weights = torch.nn.functional.softmax(
-            router_logits, dim=1, dtype=torch.float
+            router_logits, dim=-1, dtype=torch.float
        )
         routing_weights, selected_experts = torch.topk(
             routing_weights, self.top_k, dim=-1
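
For reference, the router math these hunks touch can be sketched in isolation. The snippet below is an illustrative reconstruction, not the repository's CalibrationQwen3NextSparseMoeBlock: it assumes router_logits is a 2D tensor of shape (num_tokens, num_experts), and mirrors the softmax-then-topk lines from the second hunk, with the softmax taken over the expert axis (dim=-1).

import torch

# Illustrative sketch of MoE router math; the shapes below are assumptions,
# not taken from llm-compressor's CalibrationQwen3NextSparseMoeBlock.
num_tokens, num_experts, top_k = 4, 8, 2
router_logits = torch.randn(num_tokens, num_experts)

# Softmax over the expert axis turns logits into per-token routing
# probabilities; each row sums to 1.
routing_weights = torch.nn.functional.softmax(
    router_logits, dim=-1, dtype=torch.float
)

# Select the top_k experts per token, as in the block's forward pass.
routing_weights, selected_experts = torch.topk(
    routing_weights, top_k, dim=-1
)
print(selected_experts.shape)  # torch.Size([4, 2])

For a 2D logits tensor, dim=1 and dim=-1 coincide; dim=-1 additionally stays on the expert axis if the logits keep a leading batch or sequence dimension.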

tests/llmcompressor/modeling/test_calib_qwen3_next.py

6 additions & 3 deletions

@@ -8,7 +8,10 @@
 @requires_gpu
 def test_calib_qwen3_moe_module():
     from transformers import Qwen3NextConfig
-    from transformers.models.qwen3_next.modeling_qwen3_next import Qwen3NextSparseMoeBlock
+    from transformers.models.qwen3_next.modeling_qwen3_next import (
+        Qwen3NextSparseMoeBlock,
+    )
+
     config = Qwen3NextConfig()
     with torch.device("cuda"):
         original = Qwen3NextSparseMoeBlock(config).eval()
@@ -27,13 +30,13 @@ def test_calib_qwen3_moe_module():

     with calibration_forward_context(module):
         output = module(sample)
-        #assert torch.nn.functional.mse_loss(true_output[0], output[0]) < 1e-10
+        assert torch.nn.functional.mse_loss(true_output[0], output[0]) < 1e-10
         assert torch.nn.functional.mse_loss(true_output[1], output[1]) < 1e-10

     module = CalibrationQwen3NextSparseMoeBlock(
         original, config, calibrate_all_experts=False
     )
     with calibration_forward_context(module):
         output = module(sample)
-        #assert torch.nn.functional.mse_loss(true_output[0], output[0]) < 1e-10
+        assert torch.nn.functional.mse_loss(true_output[0], output[0]) < 1e-10
         assert torch.nn.functional.mse_loss(true_output[1], output[1]) < 1e-10
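
As a side note on the re-enabled assertions: the pattern is a plain output-equivalence check, running the original block and the calibration wrapper on the same input and requiring the MSE between their outputs to be near zero. Below is a minimal, self-contained sketch of that pattern using a stand-in torch.nn.Linear rather than llm-compressor's modules.

import torch

# Minimal sketch of the "wrapper matches the original" check used above;
# the modules here are stand-ins, not llm-compressor APIs.
original = torch.nn.Linear(8, 8).eval()
wrapped = original  # imagine a calibration wrapper expected to be output-equivalent

sample = torch.randn(2, 8)
with torch.no_grad():
    true_output = original(sample)
    output = wrapped(sample)

# mse_loss returns a 0-dim tensor; comparing against a small float threshold
# asserts near-exact numerical agreement between the two forward passes.
assert torch.nn.functional.mse_loss(true_output, output) < 1e-10

The test applies the same comparison to both elements of the block's output tuple, with calibrate_all_experts set to True and then False.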
