
Commit 649912e

[Torchax] Add ability to load MoE bias
Signed-off-by: Kyuyeun Kim <kyuyeunk@google.com>
1 parent 62763b5 commit 649912e

3 files changed: +70 -23 lines changed

tpu_inference/layers/vllm/fused_moe.py

Lines changed: 12 additions & 13 deletions

@@ -272,7 +272,7 @@ def _ragged_all_to_all(operand, input_offsets, send_sizes, output_offsets,
     )(gmm_res, input_offsets, send_sizes, output_offsets, recv_sizes)


-def jax_fused_moe_func(
+def fused_moe_func(
     hidden_states: jax.Array,
     w1: jax.Array,
     w2: jax.Array,
@@ -368,11 +368,10 @@ def jax_fused_moe_func(
     return x


-def jax_fused_moe_func_padded(hidden_states: jax.Array, w1: jax.Array,
-                              w2: jax.Array, gating_output: jax.Array,
-                              topk: int, global_num_experts: int,
-                              renormalize: bool, reduce_results: bool,
-                              mesh: Mesh, use_ep: bool):
+def fused_moe_func_padded(hidden_states: jax.Array, w1: jax.Array,
+                          w2: jax.Array, gating_output: jax.Array, topk: int,
+                          global_num_experts: int, renormalize: bool,
+                          reduce_results: bool, mesh: Mesh, use_ep: bool):
     # TODO(fanhongmin@google.com): Once the jax runner pads the input, we no longer need this.
     hidden_size = hidden_states.shape[-1]
     num_tokens = hidden_states.size // hidden_size
@@ -387,13 +386,13 @@ def jax_fused_moe_func_padded(hidden_states: jax.Array, w1: jax.Array,
         reps = (n_repeats, ) + (1, ) * (gating_output.ndim - 1)
         expanded_gating_output = jnp.tile(gating_output, reps)

-        expanded_x = jax_fused_moe_func(expanded_hidden_states, w1, w2,
-                                        expanded_gating_output, topk,
-                                        global_num_experts, renormalize,
-                                        reduce_results, mesh, use_ep)
+        expanded_x = fused_moe_func(expanded_hidden_states, w1, w2,
+                                    expanded_gating_output, topk,
+                                    global_num_experts, renormalize,
+                                    reduce_results, mesh, use_ep)
         x = expanded_x[:hidden_states.shape[0]]
         return x
     else:
-        return jax_fused_moe_func(hidden_states, w1, w2, gating_output, topk,
-                                  global_num_experts, renormalize,
-                                  reduce_results, mesh, use_ep)
+        return fused_moe_func(hidden_states, w1, w2, gating_output, topk,
+                              global_num_experts, renormalize, reduce_results,
+                              mesh, use_ep)
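
For reference, the renamed padded entry point is consumed by wrapping it in jax.jit with the non-array arguments marked static, the same pattern that appears in unquantized.py below. The following is a minimal sketch of that wiring; the mesh construction and the concrete argument values are illustrative assumptions, not part of this commit.

import functools

import jax
import numpy as np
from jax.sharding import Mesh

from tpu_inference.layers.vllm.fused_moe import fused_moe_func_padded

# Single-axis mesh named "model"; the axis name matches the shardings used
# elsewhere in this commit, but building it here is an assumption.
mesh = Mesh(np.array(jax.devices()), axis_names=("model",))

_fused_moe_func = functools.partial(
    jax.jit(fused_moe_func_padded,
            static_argnames=[
                "topk", "global_num_experts", "renormalize",
                "reduce_results", "mesh", "use_ep"
            ]),
    topk=2,                  # placeholder values
    global_num_experts=8,
    renormalize=True,
    reduce_results=True,
    mesh=mesh,
    use_ep=False,
)
# The array arguments (hidden_states, w1, w2, gating_output) are supplied
# when the partial is invoked inside the MoE method's apply().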

tpu_inference/layers/vllm/quantization/unquantized.py

Lines changed: 53 additions & 10 deletions

@@ -24,7 +24,7 @@
     QuantizationConfig, QuantizeMethodBase)

 from tpu_inference.kernels.fused_moe.v1.kernel import fused_ep_moe
-from tpu_inference.layers.vllm.fused_moe import jax_fused_moe_func_padded
+from tpu_inference.layers.vllm.fused_moe import fused_moe_func_padded
 from tpu_inference.layers.vllm.linear_common import (
     reorder_concatenated_tensor_for_sharding,
     slice_sharded_tensor_for_concatenation, torch_to_jax_param)
@@ -191,8 +191,12 @@ def select_gemm_impl(
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         assert isinstance(layer, FusedMoE)

-        w2_weight = t2j(layer.w2_weight, use_dlpack=False)
         w13_weight = t2j(layer.w13_weight, use_dlpack=False)
+        w2_weight = t2j(layer.w2_weight, use_dlpack=False)
+
+        if self.moe.has_bias:
+            w13_bias = t2j(layer.w13_bias, use_dlpack=False)
+            w2_bias = t2j(layer.w2_bias, use_dlpack=False)

         if self.use_kernel and layer.use_ep:
             # Kernel expects:
@@ -208,25 +212,34 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
             # Reshape and transpose w13_weight to (num_experts, 2, hidden_size, intermediate_size)
             w13_reshaped = w13_weight.reshape(num_experts, 2,
                                               intermediate_size, hidden_size)
-            w13_weight = jnp.transpose(w13_reshaped, (0, 1, 3, 2))
+            w13_weight_transposed = jnp.transpose(w13_reshaped, (0, 1, 3, 2))

             # Transpose w2_weight to (num_experts, intermediate_size, hidden_size)
             w2_weight_transposed = jnp.transpose(w2_weight, (0, 2, 1))

             # Apply EP sharding
             w13_weight = jax.device_put(
-                w13_weight,
+                w13_weight_transposed,
                 Format(Layout((0, 1, 2, 3)),
                        NamedSharding(self.mesh, P("model", None, None, None))))
-            w2_weight_transposed = jax.device_put(
+            w2_weight = jax.device_put(
                 w2_weight_transposed,
                 Format(Layout((0, 1, 2)),
                        NamedSharding(self.mesh, P("model", None, None))))

-            layer.w13_weight = Parameter(torch_view(w13_weight),
-                                         requires_grad=False)
-            layer.w2_weight = Parameter(torch_view(w2_weight_transposed),
-                                        requires_grad=False)
+            if self.moe.has_bias:
+                w13_bias = w13_bias.reshape(num_experts, 2, intermediate_size)
+
+                # Apply EP sharding
+                w13_bias = jax.device_put(
+                    w13_bias,
+                    Format(Layout((0, 1, 2)),
+                           NamedSharding(self.mesh, P("model", None, None))))
+                w2_bias = jax.device_put(
+                    w2_bias,
+                    Format(Layout((0, 1)),
+                           NamedSharding(self.mesh, P("model", None))))
+
         else:
             # Original logic for non-kernel path
             if layer.use_ep:
@@ -238,6 +251,17 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
                     w2_weight,
                     Format(Layout((0, 1, 2)),
                            NamedSharding(self.mesh, P("model", None, None))))
+
+                if self.moe.has_bias:
+                    w13_bias = jax.device_put(
+                        w13_bias,
+                        Format(Layout((0, 1)),
+                               NamedSharding(self.mesh, P("model", None))))
+                    w2_bias = jax.device_put(
+                        w2_bias,
+                        Format(Layout((0, 1)),
+                               NamedSharding(self.mesh, P("model", None))))
+
             else:
                 intermediate_size = w13_weight.shape[1] // 2
                 assert intermediate_size == w2_weight.shape[-1]
@@ -255,11 +279,27 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
                     Format(Layout((0, 1, 2)),
                            NamedSharding(self.mesh, P(None, None, "model"))))

+                if self.moe.has_bias:
+                    w13_bias = jax.device_put(
+                        w13_bias,
+                        Format(Layout((0, 1)),
+                               NamedSharding(self.mesh, P(None, "model"))))
+                    w2_bias = jax.device_put(
+                        w2_bias,
+                        Format(Layout((0, 1)),
+                               NamedSharding(self.mesh, P(None, None))))
+
         layer.w13_weight = Parameter(torch_view(w13_weight),
                                      requires_grad=False)
         layer.w2_weight = Parameter(torch_view(w2_weight),
                                     requires_grad=False)

+        if self.moe.has_bias:
+            layer.w13_bias = Parameter(torch_view(w13_bias),
+                                       requires_grad=False)
+            layer.w2_bias = Parameter(torch_view(w2_bias),
+                                      requires_grad=False)
+
     def apply(
         self,
         layer: torch.nn.Module,
@@ -290,6 +330,9 @@ def apply(
         if scoring_func != "softmax":
             raise NotImplementedError(
                 "Only softmax is supported for scoring_func")
+        # TODO(kyuyeunk): Remove this check once MoE bias support has landed.
+        if self.moe.has_bias:
+            raise NotImplementedError("Bias is not currently supported.")

         if self.use_kernel and layer.use_ep:
             output = fused_ep_moe(
@@ -305,7 +348,7 @@ def apply(
         else:
             # Use the original implementation
             _fused_moe_func = functools.partial(
-                jax.jit(jax_fused_moe_func_padded,
+                jax.jit(fused_moe_func_padded,
                         static_argnames=[
                             "topk", "global_num_experts", "renormalize",
                             "reduce_results", "mesh", "use_ep"

tpu_inference/models/vllm/vllm_model_wrapper.py

Lines changed: 5 additions & 0 deletions

@@ -86,6 +86,11 @@ def load_weights(self):
         assert self.vllm_config.model_config.dtype in TORCH_DTYPE_TO_JAX, "The model_config.dtype must be a PyTorch dtype."
         vllm_config_for_load.device_config.device = "cpu"

+        # When expert parallelism is enabled, vLLM loads weights in a sharding-
+        # aware manner. Since tpu-inference has its own sharding logic, this
+        # may cause errors. Therefore, we disable it during weight loading.
+        vllm_config_for_load.parallel_config.enable_expert_parallel = False
+
         if os.getenv("JAX_RANDOM_WEIGHTS", False):
             vllm_config_for_load.load_config.load_format = "dummy"
             use_random_weights = True
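
A rough illustration (an assumption, not from this commit) of the conflict the comment describes: with enable_expert_parallel set, vLLM's FusedMoE loader keeps only each rank's local slice of experts, whereas tpu-inference expects the full expert dimension on CPU and applies its own sharding in process_weights_after_loading above.

# Rough shape arithmetic only; numbers are placeholders.
global_num_experts = 64
ep_size = 8

# What vLLM's EP-aware loader would materialize per rank.
experts_per_rank_with_vllm_ep = global_num_experts // ep_size   # 8

# What tpu-inference's process_weights_after_loading expects to reshard.
experts_expected_on_cpu = global_num_experts                    # 64

assert experts_per_rank_with_vllm_ep != experts_expected_on_cpu
# With enable_expert_parallel = False, vLLM loads all experts on CPU and
# tpu-inference then partitions them itself via jax.device_put.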
