
Commit 81954b4

[GPT OSS] Add support for both BF16 and MXFP4
Signed-off-by: Jordan Dotzel <amishacorns@users.noreply.github.com>
1 parent 6e96676 commit 81954b4

File tree

3 files changed: +214 -77 lines changed


tpu_inference/models/jax/gpt_oss.py

Lines changed: 159 additions & 69 deletions
@@ -11,7 +11,8 @@
 from jax.sharding import PartitionSpec as P
 from vllm.config import VllmConfig
 from tpu_inference.models.jax.utils.quantization.mxfp4_utils import (
-    dequant_mxfp4_to_bf16,
+    MXFP4_QUANT_METHOD, dequant_mxfp4_to_bf16,
+    unpack_mxfp4_to_fp32,
 )

 from tpu_inference.layers.jax.attention.gpt_oss_attention import (
@@ -188,6 +189,13 @@ def load_weights(self, rng: PRNGKey, cache_dir: Optional[str] = None):
         """Loads and transforms all weights from a checkpoint"""
         self.rng = nnx.Rngs(rng)

+        # Determine quantization method from HF config (config.json)
+        quant_method = (
+            self.hf_config.quantization_config["quant_method"]
+            if hasattr(self.hf_config, "quantization_config")
+            else None
+        )
+
         # Format: 'hf_key': ('jax_model_path', transform_function, target_shape)
         transforms = {
             "transpose_reshape": lambda w, shape: w.T.reshape(shape),
@@ -196,6 +204,9 @@ def load_weights(self, rng: PRNGKey, cache_dir: Optional[str] = None):
             "swap_last2": lambda w, _: w.swapaxes(-1, -2),
         }

+        # MXFP4 checkpoints swap the last two MoE dims so the packed dim is the most minor
+        swap_mlp_transform = transforms["swap_last2"] if quant_method == MXFP4_QUANT_METHOD else None
+
         mappings = {
             # Embeddings, Norms, and LM Head
             "model.embed_tokens.weight": ("embedder.input_embedding_table_VD",
@@ -251,11 +262,11 @@ def load_weights(self, rng: PRNGKey, cache_dir: Optional[str] = None):
             "model.layers.*.mlp.router.bias":
             ("layers.*.custom_module.router.bias_E", None, None),
             "model.layers.*.mlp.experts.gate_up_proj":
-            ("layers.*.custom_module.mlp1_weight_EDF2", transforms["swap_last2"], None),
+            ("layers.*.custom_module.mlp1_weight_EDF2", swap_mlp_transform, None),
             "model.layers.*.mlp.experts.gate_up_proj_bias":
             ("layers.*.custom_module.mlp1_bias_EF2", None, None),
             "model.layers.*.mlp.experts.down_proj":
-            ("layers.*.custom_module.mlp2_weight_EFD", transforms["swap_last2"], None),
+            ("layers.*.custom_module.mlp2_weight_EFD", swap_mlp_transform, None),
             "model.layers.*.mlp.experts.down_proj_bias":
             ("layers.*.custom_module.mlp2_bias_ED", None, None),
         }
@@ -269,9 +280,76 @@ def load_weights(self, rng: PRNGKey, cache_dir: Optional[str] = None):
             framework="pt",
             download_dir=self.vllm_config.load_config.download_dir)

-        # Single pass: build a unified pool. Combine MXFP4 expert blocks/scales
-        # into a dequantized bf16 tensor as soon as both are seen.
-        pool: dict[str, torch.Tensor] = {}
+        # Build a pool of weights, with MXFP4 experts combined if needed
+        pool: dict[str, torch.Tensor | tuple] = (
+            self._build_mxfp4_pool(names_and_weights_generator, mappings)
+            if quant_method == MXFP4_QUANT_METHOD
+            else {loaded_name: loaded_weight
+                  for loaded_name, loaded_weight in names_and_weights_generator}
+        )
+
+        with jax.default_device(jax.devices("cpu")[0]):
+            for loaded_name, loaded_weight in pool.items():
+                hf_pattern = re.sub(r"layers\.(\d+)", "layers.*", loaded_name)
+                if hf_pattern not in mappings:
+                    logger.warning(
+                        f"No mapping found for checkpoint tensor: {loaded_name}. Skipping."
+                    )
+                    continue
+
+                jax_path_template, transform_fn, target_shape = mappings[
+                    hf_pattern]
+
+                layer_num_match = re.search(r"layers\.(\d+)", loaded_name)
+                jax_path = jax_path_template
+                if layer_num_match:
+                    jax_path = jax_path_template.replace(
+                        "*", layer_num_match.group(1))
+
+                model_weight = get_param(model_params, jax_path)
+
+                prepared_weight = loaded_weight
+                if isinstance(loaded_weight, tuple):
+                    # Loaded weight is an MXFP4 tuple
+                    blocks_u8, scales_u8 = loaded_weight
+                    # Quantized param (QArray): set qvalue/scale directly and skip regular path
+                    if hasattr(model_weight, "array"):  # QArray check
+                        codes_fp32_t, scales_fp32_t = unpack_mxfp4_to_fp32(blocks_u8, scales_u8)
+                        self._load_mxfp4(
+                            model_weight=model_weight,
+                            codes_fp32_t=codes_fp32_t,
+                            scales_fp32_t=scales_fp32_t,
+                            transform_fn=transform_fn,
+                        )
+                        if is_verbose:
+                            print_param_info(model_weight, loaded_name)
+                        continue
+                    # Not a QArray: dequantize MXFP4 to BF16 full weights
+                    prepared_weight = dequant_mxfp4_to_bf16(blocks_u8, scales_u8)
+
+                # Single regular-tensor load call (BF16 or dequantized MXFP4)
+                cast_type = model_weight.value.dtype
+                self._load_regular_param(
+                    model_weight=model_weight,
+                    loaded_weight=prepared_weight,
+                    cast_type=cast_type,
+                    transform_fn=transform_fn,
+                    target_shape=target_shape,
+                    jax_path_template=jax_path_template,
+                )
+
+                if is_verbose:
+                    print_param_info(model_weight, loaded_name)
+
+        nnx.update(self, model_params)
+
+    def _build_mxfp4_pool(self, names_and_weights_generator, mappings):
+        """Collect MXFP4 weights into a pool, keeping (blocks_u8, scales_u8) tuples.
+
+        Combines *_blocks and *_scales pairs and stores the uint8 tensors together.
+        Non-expert tensors are kept as-is. Raises if any expert bundle is incomplete.
+        """
+        pool: dict[str, torch.Tensor | tuple] = {}
         pending_experts: dict[str, dict[str, torch.Tensor]] = {}
         for loaded_name, loaded_weight in names_and_weights_generator:
             if loaded_name.endswith("_blocks") or loaded_name.endswith("_scales"):
@@ -282,14 +360,12 @@ def load_weights(self, rng: PRNGKey, cache_dir: Optional[str] = None):
                 else:
                     entry["scales"] = loaded_weight

-                # If we have both parts, dequantize now and place into the main pool
+                # If we have both parts, place the raw pair into the main pool
                 if "blocks" in entry and "scales" in entry:
                     hf_pattern = re.sub(r"layers\.(\d+)", "layers.*", base)
                     if hf_pattern not in mappings:
-                        logger.warning(f"No mapping found for expert tensor: {base}. Skipping.")
-                    else:
-                        deq = dequant_mxfp4_to_bf16(entry["blocks"], entry["scales"])  # torch.bfloat16
-                        pool[base] = deq
+                        raise ValueError(f"No mapping found for expert tensor: {base}")
+                    pool[base] = (entry["blocks"], entry["scales"])
                     # Remove from pending to free memory
                     pending_experts.pop(base, None)
             else:
@@ -304,68 +380,82 @@ def load_weights(self, rng: PRNGKey, cache_dir: Optional[str] = None):
             raise RuntimeError(
                 "Incomplete MXFP4 expert bundle(s) encountered: " + ", ".join(details)
             )
+        return pool
+
+    def _load_mxfp4(self,
+                    model_weight,
+                    codes_fp32_t,
+                    scales_fp32_t,
+                    transform_fn=None):
+        """Assign decoded MXFP4 codes/scales into a QArray (qvalue/scale)."""
+
+        qv = model_weight.array.qvalue
+        sv = model_weight.array.scale
+        q_dtype = qv.value.dtype
+        s_dtype = sv.value.dtype
+
+        exp_q_shape = tuple(qv.value.shape)
+        exp_s_shape = tuple(sv.value.shape)
+
+        # Apply optional transform (e.g., swap last two dims) before conversion
+        if transform_fn is not None:
+            codes_fp32_t = transform_fn(codes_fp32_t, None)
+            scales_fp32_t = transform_fn(scales_fp32_t, None)
+
+        # Convert from torch.Tensor to numpy before creating JAX arrays
+        codes_fp32_t = codes_fp32_t.detach().cpu().numpy()
+        scales_fp32_t = scales_fp32_t.detach().cpu().numpy()
+
+        codes_jnp = jnp.asarray(codes_fp32_t).astype(q_dtype)
+        scales_jnp = jnp.asarray(scales_fp32_t).astype(s_dtype)
+
+        def get_q_slice(index):
+            return codes_jnp[index]
+
+        def get_s_slice(index):
+            return scales_jnp[index]
+
+        q_sharded = jax.make_array_from_callback(
+            exp_q_shape, NamedSharding(self.mesh, P(*qv.sharding)), get_q_slice)
+        s_sharded = jax.make_array_from_callback(
+            exp_s_shape, NamedSharding(self.mesh, P(*sv.sharding)), get_s_slice)
+
+        model_weight.array.qvalue.value = q_sharded
+        model_weight.array.scale.value = s_sharded
+
+    def _load_regular_param(self,
+                            model_weight,
+                            loaded_weight: torch.Tensor,
+                            cast_type,
+                            transform_fn,
+                            target_shape,
+                            jax_path_template: str):
+        """Assign a regular tensor (non-MXFP4) into the model param with transform applied."""
+        if jax_path_template == "layers.*.attn.sinks_N":
+            # Checkpoint is bf16, but we have to upcast sinks to f32, as required by RPA_v3 kernel
+            weight_np = jnp.array(loaded_weight.to(torch.float32).numpy())
+        else:
+            torch_view_type = DTYPE_VIEW_MAP.get(jnp.dtype(cast_type))
+            if torch_view_type:
+                weight_np = jnp.array(loaded_weight.view(torch_view_type).numpy()).view(cast_type)
+            else:
+                raise ValueError(
+                    f"Unsupported dtype for tensor conversion: {cast_type}")

-        with jax.default_device(jax.devices("cpu")[0]):
-            for loaded_name, loaded_weight in pool.items():
-                hf_pattern = re.sub(r"layers\.(\d+)", "layers.*", loaded_name)
-                if hf_pattern not in mappings:
-                    logger.warning(
-                        f"No mapping found for checkpoint tensor: {loaded_name}. Skipping."
-                    )
-                    continue
-
-                jax_path_template, transform_fn, target_shape = mappings[
-                    hf_pattern]
-
-                layer_num_match = re.search(r"layers\.(\d+)", loaded_name)
-                jax_path = jax_path_template
-                if layer_num_match:
-                    jax_path = jax_path_template.replace(
-                        "*", layer_num_match.group(1))
-
-                model_weight = get_param(model_params, jax_path)
-                cast_type = model_weight.value.dtype
-
-                if jax_path_template == "layers.*.attn.sinks_N":
-                    # Checkpoint is bf16, but we have to upcast sinks to f32, as required by RPA_v3 kernel
-                    weight_np = jnp.array(
-                        loaded_weight.to(torch.float32).numpy())
-                else:
-                    torch_view_type = DTYPE_VIEW_MAP.get(jnp.dtype(cast_type))
-                    if torch_view_type:
-                        # Avoid unnecessary upcasting and mem copy by viewing the tensor's
-                        # raw data as integers before converting to a JAX array.
-                        weight_np = jnp.array(
-                            loaded_weight.view(torch_view_type).numpy()).view(
-                                cast_type)
-                    else:
-                        raise ValueError(
-                            f"Unsupported dtype for tensor conversion: {cast_type}"
-                        )
-
-                if transform_fn:
-                    transformed_weight = transform_fn(weight_np, target_shape)
-                else:
-                    transformed_weight = weight_np
-
-                if model_weight.value.shape != transformed_weight.shape:
-                    raise ValueError(
-                        f"Shape mismatch for '{jax_path}': Model expects {model_weight.value.shape}, but got {transformed_weight.shape} after transformation."
-                    )
-
-                def get_slice(index):
-                    return transformed_weight[index]
+        transformed_weight = transform_fn(weight_np, target_shape) if transform_fn else weight_np

-                sharded_array = jax.make_array_from_callback(
-                    transformed_weight.shape,
-                    NamedSharding(self.mesh, P(*model_weight.sharding)),
-                    get_slice)
-                model_weight.value = sharded_array
+        if model_weight.value.shape != transformed_weight.shape:
+            raise ValueError(
+                f"Shape mismatch: model expects {model_weight.value.shape}, but got {transformed_weight.shape} after transform.")

-                if is_verbose:
-                    print_param_info(model_weight, loaded_name)
+        def get_slice(index):
+            return transformed_weight[index]

-        nnx.update(self, model_params)
+        sharded_array = jax.make_array_from_callback(
+            transformed_weight.shape,
+            NamedSharding(self.mesh, P(*model_weight.sharding)),
+            get_slice)
+        model_weight.value = sharded_array

     def __call__(
         self,
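
For orientation, here is a minimal standalone sketch of the blocks/scales pairing that the new _build_mxfp4_pool performs; the helper name pair_mxfp4_entries, the example tensor names, and the shapes below are illustrative only, not part of tpu_inference:

import torch

def pair_mxfp4_entries(named_tensors):
    """Group '<base>_blocks' / '<base>_scales' uint8 pairs; pass other tensors through."""
    pool, pending = {}, {}
    for name, tensor in named_tensors:
        if name.endswith("_blocks") or name.endswith("_scales"):
            base, kind = name.rsplit("_", 1)       # '..._blocks' -> ('...', 'blocks')
            entry = pending.setdefault(base, {})
            entry[kind] = tensor
            if "blocks" in entry and "scales" in entry:
                pool[base] = (entry["blocks"], entry["scales"])  # keep the raw uint8 pair
                pending.pop(base, None)
        else:
            pool[name] = tensor                    # regular (e.g. BF16) tensor, kept as-is
    if pending:
        raise RuntimeError(f"Incomplete MXFP4 bundles: {sorted(pending)}")
    return pool

# Usage: one expert weight split into blocks/scales, plus a regular bias tensor.
tensors = [
    ("model.layers.0.mlp.experts.down_proj_blocks", torch.zeros(8, 16, dtype=torch.uint8)),
    ("model.layers.0.mlp.experts.down_proj_scales", torch.zeros(8, dtype=torch.uint8)),
    ("model.layers.0.mlp.experts.down_proj_bias", torch.zeros(4)),
]
pool = pair_mxfp4_entries(tensors)
assert isinstance(pool["model.layers.0.mlp.experts.down_proj"], tuple)
assert pool["model.layers.0.mlp.experts.down_proj_bias"].dtype == torch.float32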

tpu_inference/models/jax/utils/quantization/mxfp4_utils.py

Lines changed: 31 additions & 8 deletions
@@ -7,6 +7,8 @@
 MXFP4_BLOCK_SIZE: int = 32
 # Exponent-only e8m0 scale bias used by MXFP4 scales
 MXFP4_SCALE_BIAS: int = 127
+# Name used in config.json quantization_config["quant_method"]
+MXFP4_QUANT_METHOD: str = "mxfp4"


 # Precompute a small LUT once; move to device on demand (cheap 16-element copy)
@@ -16,7 +18,7 @@
 ], dtype=torch.float32)


-def _unpack_uint8_to_fp4_values(packed: torch.Tensor) -> torch.Tensor:
+def unpack_mxfp4(packed: torch.Tensor) -> torch.Tensor:
     """Unpack uint8 (..., 16) -> fp4 values (..., 32) using low->high nibble order.

     Returns float32 values corresponding to FP4 codebook entries.
@@ -29,7 +31,7 @@ def _unpack_uint8_to_fp4_values(packed: torch.Tensor) -> torch.Tensor:
     return lut[idx.long()]


-def _e8m0_to_scale(u8: torch.Tensor) -> torch.Tensor:
+def e8m0_to_fp32(u8: torch.Tensor) -> torch.Tensor:
     """Convert e8m0 uint8 exponents to power-of-two scales using MXFP4_SCALE_BIAS.

     Uses ldexp for exact power-of-two scaling: 1.0 * 2**(u8 - bias).
@@ -43,17 +45,38 @@ def dequant_mxfp4_to_bf16(blocks_u8: torch.Tensor, scales_u8: torch.Tensor) -> t
     """Dequantize MXFP4 blocks/scales into bfloat16 values.

     Args:
-        blocks_u8: uint8 tensor shaped [..., Kb, 16], each byte holds 2 FP4 codes.
-        scales_u8: uint8 tensor shaped [..., Kb], exponent-only e8m0 per 32-value block.
+        blocks_u8: uint8 tensor shaped [..., Kb, 16], each byte holds 2 FP4 codes.
+        scales_u8: uint8 tensor shaped [..., Kb], exponent-only e8m0 per 32-value block.

     Returns:
-        torch.bfloat16 tensor with last logical dimension K = Kb * 32.
+        torch.bfloat16 tensor with last logical dimension K = Kb * 32.
     """
     if blocks_u8.dtype != torch.uint8 or scales_u8.dtype != torch.uint8:
-        raise ValueError(f"Expected uint8 inputs, got blocks={blocks_u8.dtype}, scales={scales_u8.dtype}")
+        raise ValueError(f"Expected uint8 inputs, got blocks={blocks_u8.dtype}, scales={scales_u8.dtype}")
     # Unpack FP4 codes to float32 values [..., Kb, 32]
-    fp4_vals = _unpack_uint8_to_fp4_values(blocks_u8)  # (..., Kb, 32)
+    fp4_vals = unpack_mxfp4(blocks_u8)  # (..., Kb, 32)
     # Compute power-of-two scales and apply per block
-    scales = _e8m0_to_scale(scales_u8).unsqueeze(-1)  # (..., Kb, 1)
+    scales = e8m0_to_fp32(scales_u8).unsqueeze(-1)  # (..., Kb, 1)
     full = (fp4_vals * scales).reshape(*fp4_vals.shape[:-2], fp4_vals.shape[-2] * MXFP4_BLOCK_SIZE)
     return full.to(torch.bfloat16)
+
+
+def unpack_mxfp4_to_fp32(blocks_u8: torch.Tensor, scales_u8: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+    """Decode MXFP4 packed blocks and e8m0 scales to float32 codes and scales.
+
+    Args:
+        blocks_u8: uint8 tensor shaped [..., Kb, 16], each byte packs two FP4 codes.
+        scales_u8: uint8 tensor shaped [..., Kb], exponent-only e8m0 per block.
+
+    Returns:
+        (codes_fp32, scales_fp32), where
+        - codes_fp32 has shape [..., Kb*32] and dtype float32
+        - scales_fp32 has shape [..., Kb] and dtype float32
+    """
+    if blocks_u8.dtype != torch.uint8 or scales_u8.dtype != torch.uint8:
+        raise ValueError(
+            f"Expected uint8 inputs, got blocks={blocks_u8.dtype}, scales={scales_u8.dtype}")
+    fp4_vals = unpack_mxfp4(blocks_u8)  # (..., Kb, 32) float32
+    codes_fp32 = fp4_vals.reshape(*fp4_vals.shape[:-2], fp4_vals.shape[-2] * MXFP4_BLOCK_SIZE)
+    scales_fp32 = e8m0_to_fp32(scales_u8)  # (..., Kb) float32
+    return codes_fp32, scales_fp32
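
As a worked illustration of the decode math these helpers implement (each uint8 packs two FP4 e2m1 codes, low nibble first, and each 32-value block shares one e8m0 scale, i.e. 2**(u8 - 127)), here is a hedged standalone sketch; the LUT is the standard OCP MX FP4 codebook written out for illustration, and the low/high interleaving is an assumption rather than a copy of the repo's unpack layout:

import torch

# Standard OCP MX FP4 (e2m1) codebook, written out here for illustration.
FP4_LUT = torch.tensor(
    [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0,
     -0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0], dtype=torch.float32)

def dequant_one_block(block_u8: torch.Tensor, scale_u8: torch.Tensor) -> torch.Tensor:
    """Decode one 16-byte MXFP4 block plus its e8m0 scale byte into 32 bf16 values."""
    lo = block_u8 & 0x0F                              # low nibble decoded first
    hi = block_u8 >> 4
    idx = torch.stack([lo, hi], dim=-1).reshape(-1)   # (32,) codes, low/high interleaved
    vals = FP4_LUT[idx.long()]
    scale = torch.ldexp(torch.ones(()), scale_u8.to(torch.int32) - 127)  # 2**(u8 - bias)
    return (vals * scale).to(torch.bfloat16)

# Byte 0x32 packs codes 0x2 (-> 1.0, low nibble) and 0x3 (-> 1.5, high nibble);
# with scale byte 128 (2**1) the first two decoded values are 2.0 and 3.0.
block = torch.full((16,), 0x32, dtype=torch.uint8)
out = dequant_one_block(block, torch.tensor(128, dtype=torch.uint8))
print(out[:2])  # tensor([2., 3.], dtype=torch.bfloat16)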

tpu_inference/models/jax/utils/quantization/quantization_utils.py

Lines changed: 24 additions & 0 deletions
@@ -71,6 +71,27 @@
     }
 }

+# Default Qwix config for GPT-OSS MXFP4 checkpoints.
+# Notes:
+# - We quantize only the MoE expert weights by default (router stays in BF16).
+# - We use Qwix's abstract-model path so weights can be set directly into QArray
+#   fields during weight loading (similar to DeepSeek's flow).
+# - Activation quantization is not set, but Qwix would pick up the MoE sum if it were activated
+DEFAULT_GPT_OSS_FP4_CONFIG = {
+    "qwix": {
+        "use_abstract_model": True,
+        "scale_dtype": "bfloat16",
+        "rules": [
+            {
+                "module_path": ".*custom_module",
+                "weight_qtype": "float4_e2m1fn",
+                "act_qtype": None,
+                "tile_size": 32,
+            },
+        ],
+    }
+}
+

 def parse_qwix_config_to_rules(
     qwix_config: List[dict]) -> List[qwix.QuantizationRule]:
@@ -400,6 +421,9 @@ def get_default_qwix_quantization_config(
         return DEFAULT_DEEPSEEK_FP8_CONFIG
     elif model_type == "llama4" and quant_method == "compressed-tensors":
         return DEFAULT_LLAMA4_FP8_CONFIG
+    # MXFP4 (GPT-OSS): provide a default configuration to quantize MoE experts via Qwix
+    elif model_type == "gpt_oss" and quant_method == "mxfp4":
+        return DEFAULT_GPT_OSS_FP4_CONFIG


 def update_vllm_config_for_qwix_quantization(vllm_config: "VllmConfig"):
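
To show how the new branch is exercised, here is a small standalone sketch of the model_type/quant_method dispatch; the helper name pick_default_qwix_config is illustrative, while the dict mirrors DEFAULT_GPT_OSS_FP4_CONFIG from the diff above:

from typing import Optional

DEFAULT_GPT_OSS_FP4_CONFIG = {
    "qwix": {
        "use_abstract_model": True,
        "scale_dtype": "bfloat16",
        "rules": [
            {
                "module_path": ".*custom_module",   # MoE experts module only; router stays BF16
                "weight_qtype": "float4_e2m1fn",    # FP4 e2m1 weights
                "act_qtype": None,                  # no activation quantization
                "tile_size": 32,                    # one scale per 32-value MXFP4 block
            },
        ],
    }
}

def pick_default_qwix_config(model_type: str, quant_method: Optional[str]) -> Optional[dict]:
    """Return the default Qwix config for a model/quantization pair, if one applies."""
    if model_type == "gpt_oss" and quant_method == "mxfp4":
        return DEFAULT_GPT_OSS_FP4_CONFIG
    return None  # other model/quant combinations handled elsewhere

assert pick_default_qwix_config("gpt_oss", "mxfp4") is DEFAULT_GPT_OSS_FP4_CONFIG
assert pick_default_qwix_config("gpt_oss", None) is None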
