Commit 3a94b7e

initial commit on compressed-tensors quantization support for fp8
1 parent 60c14f5 commit 3a94b7e

5 files changed (+546, -104 lines changed)
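
For context, a minimal usage sketch of what this change enables, assuming a checkpoint that declares fp8 weights. The model name and prompt are placeholders; the only part taken from this commit is that quantization="fp8" now resolves to VllmCompressedTensorsConfig instead of raising NotImplementedError.

# Hypothetical usage sketch (placeholder model name); with this commit,
# quantization="fp8" is dispatched to VllmCompressedTensorsConfig on TPU.
from vllm import LLM, SamplingParams

llm = LLM(
    model="some-org/some-fp8-model",  # placeholder fp8-quantized checkpoint
    quantization="fp8",
)
outputs = llm.generate(["Hello"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)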

tpu_inference/layers/vllm/quantization/__init__.py

Lines changed: 16 additions & 11 deletions

@@ -2,33 +2,38 @@
 
 from jax.sharding import Mesh
 from vllm.config import VllmConfig
-from vllm.model_executor.layers.quantization.base_config import \
-    QuantizationConfig
+from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 
 from tpu_inference.layers.vllm.quantization.awq import VllmAWQConfig
 from tpu_inference.layers.vllm.quantization.common import JaxCommonConfig
-from tpu_inference.layers.vllm.quantization.compressed_tensors.compressed_tensors import \
-    VllmCompressedTensorsConfig  # noqa: E501
-from tpu_inference.layers.vllm.quantization.unquantized import \
-    VllmUnquantizedConfig
+from tpu_inference.layers.vllm.quantization.compressed_tensors.compressed_tensors import (
+    VllmCompressedTensorsConfig,
+)  # noqa: E501
+from tpu_inference.layers.vllm.quantization.unquantized import VllmUnquantizedConfig
 
 
-def get_tpu_quantization_config(vllm_config: VllmConfig,
-                                mesh: Mesh) -> QuantizationConfig:
+def get_tpu_quantization_config(
+    vllm_config: VllmConfig, mesh: Mesh
+) -> QuantizationConfig:
     model_config = copy.deepcopy(vllm_config.model_config)
     # TODO(kyuyeunk): Add support for "tpu_int8".
     method_to_config: dict[str, str] = {
         None: VllmUnquantizedConfig,
         "compressed-tensors": VllmCompressedTensorsConfig,
         "awq": VllmAWQConfig,
+        "fp8": VllmCompressedTensorsConfig,
     }
+    # import sys
 
+    # sys.stdin = open(0)
+    # breakpoint()
     if model_config.quantization not in method_to_config:
-        raise NotImplementedError
+        raise NotImplementedError(
+            f"{model_config.quantization} quantization method not supported."
+        )
     quant_config = method_to_config[model_config.quantization]
     assert issubclass(quant_config, JaxCommonConfig)
     quant_config.set_configs(vllm_config, mesh)
 
     model_config.quantization = quant_config.get_name()
-    return VllmConfig.get_quantization_config(model_config,
-                                              vllm_config.load_config)
+    return VllmConfig.get_quantization_config(model_config, vllm_config.load_config)
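
To make the dispatch above easier to follow, here is a small self-contained sketch of the same method_to_config lookup. The config classes are stand-ins rather than the real vLLM/tpu_inference classes; only the "fp8" entry and the error message mirror the diff.

# Minimal sketch of the method_to_config dispatch pattern used above.
# The config classes here are stand-ins, not the real vLLM/tpu_inference classes.
from typing import Optional


class UnquantizedConfig:
    name = "unquantized"


class CompressedTensorsConfig:
    name = "jax-compressed-tensors"


METHOD_TO_CONFIG: dict[Optional[str], type] = {
    None: UnquantizedConfig,
    "compressed-tensors": CompressedTensorsConfig,
    # With this commit, checkpoints that report "fp8" reuse the
    # compressed-tensors path rather than raising NotImplementedError.
    "fp8": CompressedTensorsConfig,
}


def resolve(method: Optional[str]) -> type:
    if method not in METHOD_TO_CONFIG:
        raise NotImplementedError(f"{method} quantization method not supported.")
    return METHOD_TO_CONFIG[method]


assert resolve("fp8") is CompressedTensorsConfig
assert resolve(None) is UnquantizedConfig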

tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py

Lines changed: 47 additions & 36 deletions

@@ -6,40 +6,45 @@
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.layer import FusedMoE
 from vllm.model_executor.layers.linear import LinearBase
-from vllm.model_executor.layers.quantization import \
-    register_quantization_config
-from vllm.model_executor.layers.quantization.base_config import \
-    QuantizeMethodBase  # noqa: E501
+from vllm.model_executor.layers.quantization import register_quantization_config
+from vllm.model_executor.layers.quantization.base_config import QuantizeMethodBase  # noqa: E501
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (
-    CompressedTensorsConfig, CompressedTensorsKVCacheMethod,
-    CompressedTensorsLinearMethod, CompressedTensorsScheme)
+    CompressedTensorsConfig,
+    CompressedTensorsKVCacheMethod,
+    CompressedTensorsLinearMethod,
+    CompressedTensorsScheme,
+)
+from tpu_inference.layers.vllm.quantization.compressed_tensors.compressed_tensors_moe import (
+    CompressedTensorsW8A8Fp8MoEMethod,
+)
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    find_matched_target, is_activation_quantization_format,
-    should_ignore_layer)
+    find_matched_target,
+    is_activation_quantization_format,
+    should_ignore_layer,
+)
 
 from tpu_inference.layers.vllm.quantization.common import JaxCommonConfig
-from tpu_inference.layers.vllm.quantization.compressed_tensors.schemes.compressed_tensors_w8a8_fp8 import \
-    VllmCompressedTensorsW8A8Fp8
-from tpu_inference.layers.vllm.quantization.compressed_tensors.schemes.compressed_tensors_w8a8_int8 import \
-    VllmCompressedTensorsW8A8Int8
-from tpu_inference.layers.vllm.quantization.unquantized import \
-    VllmUnquantizedConfig
+from tpu_inference.layers.vllm.quantization.compressed_tensors.schemes.compressed_tensors_w8a8_fp8 import (
+    VllmCompressedTensorsW8A8Fp8,
+)
+from tpu_inference.layers.vllm.quantization.compressed_tensors.schemes.compressed_tensors_w8a8_int8 import (
+    VllmCompressedTensorsW8A8Int8,
+)
+from tpu_inference.layers.vllm.quantization.unquantized import VllmUnquantizedConfig
 
 P = PartitionSpec
 logger = init_logger(__name__)
 
 
 @register_quantization_config("jax-compressed-tensors")
 class VllmCompressedTensorsConfig(CompressedTensorsConfig, JaxCommonConfig):
-
     @classmethod
     def get_name(cls) -> str:
         return "jax-compressed-tensors"
 
-    def get_scheme(self,
-                   layer: torch.nn.Module,
-                   layer_name: Optional[str] = None
-                   ) -> Optional["CompressedTensorsScheme"]:
+    def get_scheme(
+        self, layer: torch.nn.Module, layer_name: Optional[str] = None
+    ) -> Optional["CompressedTensorsScheme"]:
         """
         compressed-tensors supports non uniform in the following way:
 
@@ -60,24 +65,30 @@ def get_scheme(self,
             layer_name=layer_name,
             module=layer,
             targets=self.target_scheme_map.keys(),
-            fused_mapping=self.packed_modules_mapping)
+            fused_mapping=self.packed_modules_mapping,
+        )
 
         scheme_dict = self.target_scheme_map[matched_target]
         weight_quant = scheme_dict.get("weights")
         input_quant = scheme_dict.get("input_activations")
         format = scheme_dict.get("format")
 
         if weight_quant is None:
-            logger.warning_once("Acceleration for non-quantized schemes is "
-                                "not supported by Compressed Tensors. "
-                                "Falling back to UnquantizedLinearMethod")
+            logger.warning_once(
+                "Acceleration for non-quantized schemes is "
+                "not supported by Compressed Tensors. "
+                "Falling back to UnquantizedLinearMethod"
+            )
             return None
 
         # TODO(kyuyeunk): Add support for different act_quant_format
-        act_quant_format = is_activation_quantization_format(  # noqa: F841
-            format
-        ) if format is not None else is_activation_quantization_format(
-            self.quant_format)
+        act_quant_format = (
+            is_activation_quantization_format(  # noqa: F841
+                format
+            )
+            if format is not None
+            else is_activation_quantization_format(self.quant_format)
+        )
 
         linear_config = self.get_linear_config(layer)
         if self._is_fp8_w8a8(weight_quant, input_quant):
@@ -94,28 +105,28 @@ def get_scheme(self,
                 input_symmetric=input_quant.symmetric,
                 jax_config=linear_config,
             )
-        raise NotImplementedError(
-            "No compressed-tensors compatible scheme was found.")
+        raise NotImplementedError("No compressed-tensors compatible scheme was found.")
 
     def get_quant_method(
         self,
         layer: torch.nn.Module,
         prefix: str,
     ) -> Optional[QuantizeMethodBase]:
-        if should_ignore_layer(prefix,
-                               ignore=self.ignore,
-                               fused_mapping=self.packed_modules_mapping):
+        if should_ignore_layer(
+            prefix, ignore=self.ignore, fused_mapping=self.packed_modules_mapping
+        ):
             return VllmUnquantizedConfig.get_quant_method(self, layer, prefix)
         if isinstance(layer, LinearBase):
            scheme = self.get_scheme(layer=layer, layer_name=prefix)
            if scheme is None:
-                return VllmUnquantizedConfig.get_quant_method(
-                    self, layer, prefix)
+                return VllmUnquantizedConfig.get_quant_method(self, layer, prefix)
            layer.scheme = scheme
            return CompressedTensorsLinearMethod(self)
         if isinstance(layer, FusedMoE):
-            raise NotImplementedError(
-                "FusedMoE quantization is currently not supported.")
+            print("HERE", layer)
+            return CompressedTensorsW8A8Fp8MoEMethod(
+                self, layer.quant_config, self.mesh
+            )
         if isinstance(layer, Attention):
             return CompressedTensorsKVCacheMethod(self)
         return None
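
For reference, a small numeric sketch of the per-tensor fp8 (e4m3) quantize/dequantize round trip that a W8A8-Fp8 scheme such as VllmCompressedTensorsW8A8Fp8 builds on. This is illustrative plain JAX, not the actual tpu_inference kernels, and the max-abs scale choice is an assumption.

# Illustrative per-tensor fp8 (e4m3) quantize/dequantize, not the actual
# tpu_inference implementation.
import jax.numpy as jnp

FP8_E4M3_MAX = 448.0  # largest finite value representable in float8_e4m3fn


def quantize_fp8(w: jnp.ndarray) -> tuple[jnp.ndarray, jnp.ndarray]:
    # Per-tensor scale chosen so the largest weight maps to the fp8 max value.
    scale = jnp.max(jnp.abs(w)) / FP8_E4M3_MAX
    w_fp8 = (w / scale).astype(jnp.float8_e4m3fn)
    return w_fp8, scale


def dequantize_fp8(w_fp8: jnp.ndarray, scale: jnp.ndarray) -> jnp.ndarray:
    return w_fp8.astype(jnp.float32) * scale


w = jnp.array([[0.1, -2.5], [3.0, 0.02]], dtype=jnp.float32)
w_fp8, scale = quantize_fp8(w)
print(dequantize_fp8(w_fp8, scale))  # close to w, within fp8 rounding error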
