Commit d1be882

[Quantization][Feature] Add AWQ quantization in vllm-ascend.
Signed-off-by: menogrey <1299267905@qq.com>
1 parent: 5db33d3

2 files changed: 78 additions, 134 deletions

vllm_ascend/ops/fused_moe/fused_moe.py

Lines changed: 1 addition & 55 deletions
@@ -100,7 +100,7 @@ def process_weights_after_loading(self, layer):
                 1, 2).contiguous()
             layer.w2_weight = torch.nn.Parameter(w2_data, requires_grad=False)
 
-            self.transpose = False
+            #self.transpose = False
         else:
             w13_data = self._maybe_pad_weight(layer.w13_weight.data)
             layer.w13_weight = torch.nn.Parameter(w13_data,
@@ -402,60 +402,6 @@ def forward_impl(self, hidden_states: torch.Tensor,
 
         return final_hidden_states
 
-    def transpose_weight(self, loaded_weight, expert_data, shard_dim):
-        # Ensure training and inference weight shapes match during RL weight updates
-        if (
-            loaded_weight.shape[1] != expert_data.shape[1] and \
-            loaded_weight.shape[0] != expert_data.shape[0]
-        ):
-            shard_dim = int(not shard_dim)
-            loaded_weight = loaded_weight.transpose(0, 1).contiguous()
-        return loaded_weight, shard_dim
-
-    def _load_w13(self,
-                  expert_data: torch.Tensor,
-                  shard_dim: int,
-                  shard_id: str,
-                  loaded_weight: torch.Tensor,
-                  tp_rank: int,
-                  load_full: bool = False):
-        # Index the loaded weight for tp sharding.
-        # gate_up_proj: "MergedColumnParallel", so tp sharding on output_dim
-        loaded_weight, shard_dim = self.transpose_weight(
-            loaded_weight, expert_data, shard_dim)
-        shard_size = expert_data.shape[shard_dim] // 2
-        if not load_full:
-            loaded_weight = loaded_weight.narrow(shard_dim,
-                                                 shard_size * tp_rank,
-                                                 shard_size)
-        # Narrow parameter and load.
-        # w1, gate_proj: Load into first logical weight of w13.
-        if shard_id == "w1":
-            expert_data = expert_data.narrow(shard_dim, 0, shard_size)
-        # w3, up_proj: Load into second logical weight of w13.
-        else:
-            assert shard_id == "w3"
-            expert_data = expert_data.narrow(shard_dim, shard_size, shard_size)
-        expert_data.copy_(loaded_weight)
-
-    def _load_w2(self,
-                 expert_data: torch.Tensor,
-                 shard_dim: int,
-                 loaded_weight: torch.Tensor,
-                 tp_rank: int,
-                 load_full: bool = False):
-        # Index the loaded weight for tp sharding.
-        # down_proj: "RowParallel" so tp sharding on input_dim
-        # Narrow parameter and load.
-        loaded_weight, shard_dim = self.transpose_weight(
-            loaded_weight, expert_data, shard_dim)
-        shard_size = expert_data.shape[shard_dim]
-        if not load_full:
-            loaded_weight = loaded_weight.narrow(shard_dim,
-                                                 shard_size * tp_rank,
-                                                 shard_size)
-        # w2, down_proj: Load into only logical weight of w2.
-        expert_data.copy_(loaded_weight)
 
 
 class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):
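
For illustration only (not from this diff): the deleted transpose_weight / _load_w13 / _load_w2 helpers narrowed each checkpoint tensor to the current rank's shard before copying it into the fused w13 / w2 expert parameters. A minimal sketch of that narrow-based sharding, with made-up sizes:

import torch

# Sketch only: mimics the removed _load_w13 logic with hypothetical sizes.
tp_size, tp_rank = 2, 1
hidden_size, intermediate_full = 8, 4
intermediate_pp = intermediate_full // tp_size         # per-partition size

# Runtime parameter: w1 (gate_proj) and w3 (up_proj) fused along the output dim.
expert_w13 = torch.zeros(2 * intermediate_pp, hidden_size)

# Checkpoint tensor for w1: full output dim, sharded at load time.
loaded_w1 = torch.randn(intermediate_full, hidden_size)

shard_dim = 0                                           # MergedColumnParallel: shard the output dim
shard_size = expert_w13.shape[shard_dim] // 2           # half of w13 belongs to w1
loaded_w1 = loaded_w1.narrow(shard_dim, shard_size * tp_rank, shard_size)
# w1 fills the first half of w13; a w3 shard would fill the second half.
expert_w13.narrow(shard_dim, 0, shard_size).copy_(loaded_w1)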

vllm_ascend/quantization/awq/awq.py

Lines changed: 77 additions & 79 deletions
@@ -5,10 +5,12 @@
 from torch.nn.modules import Module
 import torch_npu
 from vllm.config import get_current_vllm_config
-from vllm.distributed import get_tensor_model_parallel_rank
+from vllm.distributed import get_tensor_model_parallel_rank, get_tp_group
 from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase,
                                                   FusedMoeWeightScaleSupported)
-from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
+from vllm.model_executor.layers.fused_moe.config import (FusedMoEConfig, FusedMoEQuantConfig,
+                                                         int4_w4a16_moe_quant_config,
+                                                         int8_w8a16_moe_quant_config,)
 from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
                                                RowParallelLinear, UnquantizedLinearMethod)
 from vllm.model_executor.layers.quantization import \
@@ -76,7 +78,6 @@ def npu_fused_experts(
     )
     expert_tokens = expert_tokens.to(torch.int64)
     # gmm1: gate_up_proj
-    hidden_states, pertoken_scale = torch_npu.npu_dynamic_quant(hidden_states)
     if not use_wna16:
         hidden_states, pertoken_scale = torch_npu.npu_dynamic_quant(hidden_states)
     scale_args13 = {
@@ -92,8 +93,6 @@ def npu_fused_experts(
     hidden_states = torch_npu.npu_grouped_matmul(
         x=[hidden_states],
         weight=[w13],
-        scale=[w13_scale.to(scale_dtype)],
-        per_token_scale=[pertoken_scale],
         **scale_args13,
         split_item=2,
         group_list_type=0,
@@ -103,7 +102,6 @@ def npu_fused_experts(
     )[0]
     # act_fn: swiglu
     hidden_states = torch_npu.npu_swiglu(hidden_states)
-    hidden_states, pertoken_scale = torch_npu.npu_dynamic_quant(hidden_states)
     if not use_wna16:
         hidden_states, pertoken_scale = torch_npu.npu_dynamic_quant(hidden_states)
 
@@ -117,15 +115,14 @@ def npu_fused_experts(
     hidden_states = torch_npu.npu_grouped_matmul(
         x=[hidden_states],
         weight=[w2],
-        scale=[w2_scale.to(scale_dtype)],
-        per_token_scale=[pertoken_scale],
         **scale_args2,
         split_item=2,
         group_list_type=0,
         group_type=0,
         group_list=expert_tokens,
         output_dtype=original_dtype,
     )[0]
+
     final_hidden_states = torch_npu.npu_moe_finalize_routing(
         hidden_states,
         skip1=None,
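
For illustration only (not from this diff): with the scale and per_token_scale keywords removed above, both grouped matmuls now take their quantization arguments through the scale_args13 / scale_args2 dicts, and npu_dynamic_quant runs only when use_wna16 is false. A hedged sketch of how such a dict could be assembled; the scale / per_token_scale names come from the deleted lines, while the antiquant_scale key for the weight-only branch is an assumption, not the commit's actual code:

def build_gmm_scale_args(use_wna16, weight_scale, pertoken_scale, scale_dtype):
    # Sketch only: per-scheme kwargs splatted into torch_npu.npu_grouped_matmul.
    if not use_wna16:
        # W8A8 path: activations were quantized with npu_dynamic_quant, so pass
        # the per-channel weight scale plus the per-token activation scale.
        return {
            "scale": [weight_scale.to(scale_dtype)],
            "per_token_scale": [pertoken_scale],
        }
    # Weight-only path (int4/int8 weights, bf16/fp16 activations): assumed to
    # rely on a weight-dequant argument instead of activation scales.
    return {"antiquant_scale": [weight_scale]}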
@@ -270,91 +267,86 @@ def __init__(self, quant_config: AWQQuantConfig):
     def create_weights(self, layer: torch.nn.Module, num_experts: int,
                        hidden_size: int, intermediate_size_per_partition: int,
                        params_dtype: torch.dtype, **extra_weight_attrs):
-        self.moe = layer
-        layer.quant_config = self.quant_config
-        bit8_pack_factor = self.quant_config.pack_factor
-        group_size = self.quant_config.group_size
-        group_size_div_factor = 1
-
-        # make intermediate_size and hidden_size divisible by group_size
-        # we reduce the group size to ensure that
-        # and we would repeat the loaded_weight later
-        while intermediate_size_per_partition % group_size or \
-                hidden_size % group_size:
-            group_size = group_size // 2
-            group_size_div_factor *= 2
-            assert group_size >= 32
-        layer.group_size = group_size
-        layer.group_size_div_factor = group_size_div_factor
-
-        strategy = FusedMoeWeightScaleSupported.GROUP.value
-        extra_weight_attrs.update({
-            "quant_method": strategy,
-            "is_transposed": False
-        })
-
-        assert 'weight_loader' in extra_weight_attrs
-        weight_loader = extra_weight_attrs['weight_loader']
-        wrapped_weight_loader = MoeWNA16Method.get_weight_loader(
-            layer, weight_loader)
-        extra_weight_attrs['weight_loader'] = wrapped_weight_loader
-
-        # Fused gate_up_proj (column parallel)
-        w13_qweight = torch.nn.Parameter(torch.empty(
-            num_experts,
-            2 * intermediate_size_per_partition,
-            hidden_size // bit8_pack_factor,
-            dtype=torch.uint8),
-                                         requires_grad=False)
+        extra_weight_attrs.update(
+            {
+                "is_transposed": True,
+                "quant_method": FusedMoeWeightScaleSupported.GROUP.value,
+            }
+        )
+
+        w13_qweight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                2 * intermediate_size_per_partition // self.quant_config.pack_factor,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w13_qweight", w13_qweight)
         set_weight_attrs(w13_qweight, extra_weight_attrs)
 
-        # down_proj (row parallel)
-        w2_qweight = torch.nn.Parameter(torch.empty(
-            num_experts,
-            hidden_size,
-            intermediate_size_per_partition // bit8_pack_factor,
-            dtype=torch.uint8),
-                                        requires_grad=False)
+        w2_qweight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                intermediate_size_per_partition,
+                hidden_size // self.quant_config.pack_factor,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w2_qweight", w2_qweight)
         set_weight_attrs(w2_qweight, extra_weight_attrs)
 
-        w13_scales = torch.nn.Parameter(torch.zeros(
-            num_experts,
-            2 * intermediate_size_per_partition,
-            hidden_size // group_size,
-            dtype=params_dtype),
-                                        requires_grad=False)
+        num_groups_w13 = hidden_size // self.quant_config.group_size
+        num_groups_w2 = intermediate_size_per_partition // self.quant_config.group_size
+
+        # WEIGHT_SCALES
+        # Allocate 2 scales for w1 and w3 respectively.
+        w13_scales = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                num_groups_w13,
+                intermediate_size_per_partition * 2,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
         layer.register_parameter("w13_scales", w13_scales)
         set_weight_attrs(w13_scales, extra_weight_attrs)
 
-        w2_scales = torch.nn.Parameter(torch.zeros(
-            num_experts,
-            hidden_size,
-            intermediate_size_per_partition // group_size,
-            dtype=params_dtype),
-                                       requires_grad=False)
+        w2_scales = torch.nn.Parameter(
+            torch.empty(num_experts, num_groups_w2, hidden_size, dtype=params_dtype),
+            requires_grad=False,
+        )
         layer.register_parameter("w2_scales", w2_scales)
         set_weight_attrs(w2_scales, extra_weight_attrs)
 
-        if self.quant_config.zero_point:
-            w13_qzeros = torch.nn.Parameter(torch.zeros(
+        # WEIGHT_ZERO_POINT
+        # Allocate 2 zero points for w1 and w3 respectively.
+        w13_qzeros = torch.nn.Parameter(
+            torch.empty(
                 num_experts,
-                2 * intermediate_size_per_partition // bit8_pack_factor,
-                hidden_size // group_size,
-                dtype=torch.uint8),
-                                            requires_grad=False)
-            layer.register_parameter("w13_qzeros", w13_qzeros)
-            set_weight_attrs(w13_qzeros, extra_weight_attrs)
-
-            w2_qzeros = torch.nn.Parameter(torch.zeros(
+                num_groups_w13,
+                2 * intermediate_size_per_partition // self.quant_config.pack_factor,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_qzeros", w13_qzeros)
+        set_weight_attrs(w13_qzeros, extra_weight_attrs)
+
+        w2_qzeros = torch.nn.Parameter(
+            torch.empty(
                 num_experts,
-                hidden_size // bit8_pack_factor,
-                intermediate_size_per_partition // group_size,
-                dtype=torch.uint8),
-                                           requires_grad=False)
-            layer.register_parameter("w2_qzeros", w2_qzeros)
-            set_weight_attrs(w2_qzeros, extra_weight_attrs)
+                num_groups_w2,
+                hidden_size // self.quant_config.pack_factor,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_qzeros", w2_qzeros)
+        set_weight_attrs(w2_qzeros, extra_weight_attrs)
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         w13_qweight_tmp = torch.zeros_like(layer.w13_qweight.data)
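
For illustration only (not from this diff): a small shape sanity check for the AWQ MoE parameters created above, using hypothetical sizes. It assumes 4-bit AWQ values packed into int32, i.e. pack_factor = 32 // 4 = 8; none of the numbers below come from the commit.

# Shape check for the AWQ MoE parameters created above (hypothetical sizes).
num_experts = 64
hidden_size = 2048
intermediate_size_per_partition = 1408   # per tensor-parallel rank
group_size = 128
pack_factor = 8                          # 32 // 4 for 4-bit AWQ packed into int32

num_groups_w13 = hidden_size // group_size                      # 16
num_groups_w2 = intermediate_size_per_partition // group_size   # 11

shapes = {
    # qweight is stored transposed: (experts, input dim, packed output dim)
    "w13_qweight": (num_experts, hidden_size,
                    2 * intermediate_size_per_partition // pack_factor),   # (64, 2048, 352)
    "w2_qweight": (num_experts, intermediate_size_per_partition,
                   hidden_size // pack_factor),                            # (64, 1408, 256)
    "w13_scales": (num_experts, num_groups_w13,
                   2 * intermediate_size_per_partition),                   # (64, 16, 2816)
    "w2_scales": (num_experts, num_groups_w2, hidden_size),                # (64, 11, 2048)
    "w13_qzeros": (num_experts, num_groups_w13,
                   2 * intermediate_size_per_partition // pack_factor),    # (64, 16, 352)
    "w2_qzeros": (num_experts, num_groups_w2,
                  hidden_size // pack_factor),                             # (64, 11, 256)
}
for name, shape in shapes.items():
    print(name, shape)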
@@ -406,6 +398,11 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
             "w2_qweight", torch.nn.Parameter(w2_qweight_tmp, requires_grad=False)
         )
 
+    def get_fused_moe_quant_config(
+        self, layer: torch.nn.Module
+    ) -> FusedMoEQuantConfig | None:
+        return None
+
     def apply(
         self,
         layer: torch.nn.Module,
@@ -428,6 +425,7 @@ def apply(
         expert_load_view: Optional[torch.Tensor] = None,
         logical_to_physical_map: Optional[torch.Tensor] = None,
         logical_replica_count: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> torch.Tensor:
         assert self.fused_experts is None
 