@@ -67,6 +67,34 @@ def _swiglu(x: Float, alpha: Float, limit: Float) -> Float:
6767 return gated_activation * (x_linear + 1 )
6868
6969
@dataclass(kw_only=True)
class CombineExperts(nnx.Module):
    """Combines per-expert outputs into a single per-token output.

    Given the dense per-expert projections, the router's selected expert
    indices, and the router weights, produces the weighted sum over each
    token's selected experts.
    """
    # Dtype the combined output is cast to before returning.
    dtype: jnp.dtype

    def __call__(self, down_proj_TED: Float, weights_TX: Float,
                 indices_TX: jax.Array) -> Float:
        """Weighted-sums the selected experts' outputs for every token.

        Args:
          down_proj_TED: Expert outputs, shape (tokens, experts, hidden_dim)
          weights_TX: Router weights, shape (tokens, experts_per_token)
          indices_TX: Selected expert indices, shape (tokens, experts_per_token)

        Returns:
          Combined output, shape (tokens, hidden_dim), cast to `self.dtype`.
        """
        with jax.named_scope("combine_experts"):
            # Gather each token's selected experts: (T, X, D).
            expanded_indices = jnp.expand_dims(indices_TX, axis=-1)
            selected_TXD = jnp.take_along_axis(
                down_proj_TED, expanded_indices, axis=1)
            # Contract over the experts-per-token (X) axis.
            combined_TD = jnp.einsum('TXD,TX -> TD', selected_TXD, weights_TX)

        return combined_TD.astype(self.dtype)
97+
7098@dataclass (kw_only = True )
7199class GptOssMoE (nnx .Module ):
72100 """
@@ -114,20 +142,16 @@ def __call__(self, x_TD: Float) -> Float:
114142 down_proj_TED += self .mlp2_bias_ED .value
115143
116144 # Weighted sum of expert outputs
117- with jax .named_scope ("sum" ):
118- indices_for_gather = indices_TX [..., None ]
119- gathered_down_proj_TED = jnp .take_along_axis (down_proj_TED ,
120- indices_for_gather ,
121- axis = 1 )
122- output_TD = jnp .einsum ('TXD,TX -> TD' , gathered_down_proj_TED ,
123- weights_TX )
145+ output_TD = self .combine_experts (down_proj_TED , weights_TX , indices_TX )
124146
125- return output_TD . astype ( self . dtype )
147+ return output_TD
126148
127149 def __post_init__ (self , rngs : nnx .Rngs ):
128150 """Initializes all weights and biases for the MoE block."""
129151 D , F , E = self .hidden_size , self .intermediate_size_moe , self .num_local_experts
130152
153+ self .combine_experts = CombineExperts (dtype = self .dtype )
154+
131155 # MLP #1 Weights (Combined Gate and Up-projection) and Bias
132156 self .mlp1_weight_EDF2 = create_param (
133157 rngs ,
0 commit comments