
Commit 9fe852c

[Quantization][Feature] Add AWQ quantization in vllm-ascend.
Signed-off-by: menogrey <1299267905@qq.com>
1 parent 3653f33 commit 9fe852c

File tree: 7 files changed, +486 -12 lines changed

vllm_ascend/attention/mla_v1.py

Lines changed: 9 additions & 8 deletions
@@ -1389,9 +1389,10 @@ def _mla_preprocess(self, layer_name, hidden_states, kv_cache,
         num_decode_tokens = attn_metadata.num_decode_tokens
         num_actual_tokens = attn_metadata.num_actual_tokens
         if self.fused_qkv_a_proj is not None:
-            maybe_npu_prefetch(inputs=self.fused_qkv_a_proj.weight,
-                               dependency=hidden_states,
-                               enabled=self.enable_prefetch)
+            if hasattr(self.fused_qkv_a_proj, 'weight'):
+                maybe_npu_prefetch(inputs=self.fused_qkv_a_proj.weight,
+                                   dependency=hidden_states,
+                                   enabled=self.enable_prefetch)
             qkv_lora = self.fused_qkv_a_proj(hidden_states)[0]
             q_c, kv_no_split = qkv_lora.split(
                 [self.q_lora_rank, self.kv_lora_rank + self.qk_rope_head_dim],
@@ -1593,11 +1594,11 @@ def forward(
             o_proj_input[num_decode_tokens:num_actual_tokens] = output_prefill
         # O proj
         MAX_O_PROJ_PREFETCH_SIZE = 16 * 1024 * 1024
-        maybe_npu_prefetch(inputs=self.o_proj.weight,
-                           dependency=o_proj_input,
-                           max_size=MAX_O_PROJ_PREFETCH_SIZE,
-                           enabled=self.enable_prefetch)
-
+        if hasattr(self.o_proj, 'weight'):
+            maybe_npu_prefetch(inputs=self.o_proj.weight,
+                               dependency=o_proj_input,
+                               max_size=MAX_O_PROJ_PREFETCH_SIZE,
+                               enabled=self.enable_prefetch)
         output[...] = self.o_proj(o_proj_input,
                                   is_prefill=prefill_preprocess_res
                                   is not None)[0]
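Note on the guard: the hasattr(..., 'weight') checks are presumably needed because AWQ-quantized linear layers carry packed parameters (commonly qweight plus scales/zeros) instead of a plain weight tensor, so weight prefetch is simply skipped for them. A minimal, self-contained sketch of the same pattern; maybe_prefetch, AwqPackedLinear and its qweight/scales fields are illustrative stand-ins, not vllm-ascend code:

# Sketch of the guard pattern above, with hypothetical stand-in classes.
import torch


def maybe_prefetch(weight: torch.Tensor, enabled: bool = True) -> None:
    # Stand-in for maybe_npu_prefetch: a no-op placeholder here.
    if enabled:
        pass  # real code would issue an NPU prefetch for `weight`


class FloatLinear(torch.nn.Linear):
    """Unquantized layer: exposes a plain .weight tensor."""


class AwqPackedLinear(torch.nn.Module):
    """Hypothetical AWQ-style layer: packed qweight/scales, no .weight."""

    def __init__(self, in_features: int, out_features: int):
        super().__init__()
        self.qweight = torch.zeros(in_features, out_features // 8,
                                   dtype=torch.int32)
        self.scales = torch.ones(in_features // 128, out_features)


for layer in (FloatLinear(16, 16), AwqPackedLinear(128, 128)):
    # Same guard as in the diff: only prefetch when a dense weight exists.
    if hasattr(layer, "weight"):
        maybe_prefetch(layer.weight)

The point is only that prefetch degrades to a no-op for layers without a dense weight, rather than raising AttributeError.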

vllm_ascend/ops/layernorm.py

Lines changed: 2 additions & 1 deletion
@@ -96,7 +96,8 @@ def __init__(
         vllm_config = get_current_vllm_config()
         self.bias = None
         # quantization with anti_method m4 will generate none-zero norm bias
-        if vllm_config.quant_config is not None and \
+        if vllm_config.quant_config is not None and hasattr(vllm_config.quant_config, "quant_description") and \
+           vllm_config.quant_config.quant_description is not None and \
            any("norm.bias" in name for name in vllm_config.quant_config.quant_description.keys()):
             self.bias = torch.nn.Parameter(torch.zeros(hidden_size),
                                            requires_grad=False)
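The layernorm change follows the same defensive pattern: quant_description is specific to the Ascend quant config, so once third-party configs such as AWQ can reach this code path, the attribute has to be probed before use. A small sketch of the guarded lookup, with hypothetical stand-in config classes:

# Hedged sketch of the guarded lookup above; the fake configs are
# illustrative stand-ins, not actual vLLM/vllm-ascend classes.
from typing import Any, Optional


def needs_norm_bias(quant_config: Optional[Any]) -> bool:
    # Mirrors the diff: only Ascend-style configs carry quant_description;
    # AWQ (or no quantization at all) must not trip an AttributeError.
    return (quant_config is not None
            and hasattr(quant_config, "quant_description")
            and quant_config.quant_description is not None
            and any("norm.bias" in name
                    for name in quant_config.quant_description))  # dict keys


class _FakeAscendConfig:
    quant_description = {"model.layers.0.input_layernorm.norm.bias": "m4"}


class _FakeAwqConfig:
    pass  # no quant_description attribute, like an AWQ config


print(needs_norm_bias(_FakeAscendConfig()))  # True
print(needs_norm_bias(_FakeAwqConfig()))     # False
print(needs_norm_bias(None))                 # False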

vllm_ascend/platform.py

Lines changed: 7 additions & 2 deletions
@@ -30,7 +30,8 @@
                                        init_ascend_config)
 from vllm_ascend.torchair.utils import (check_torchair_cache_exist,
                                         delete_torchair_cache_file)
-from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, enable_sp, is_310p,
+from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD,
+                               AWQ_QUANTIZATION_METHOD, enable_sp, is_310p,
                                prefill_context_parallel_enable,
                                update_aclgraph_sizes,
                                update_cudagraph_capture_sizes,
@@ -55,7 +56,9 @@ class NPUPlatform(Platform):
     device_control_env_var: str = "ASCEND_RT_VISIBLE_DEVICES"
     dispatch_key: str = "PrivateUse1"

-    supported_quantization: list[str] = [ASCEND_QUANTIZATION_METHOD]
+    supported_quantization: list[str] = [
+        ASCEND_QUANTIZATION_METHOD, AWQ_QUANTIZATION_METHOD
+    ]

     def is_sleep_mode_available(self) -> bool:
         return True
@@ -80,6 +83,8 @@ def pre_register_and_update(cls,

         from vllm_ascend.quantization.quant_config import \
             AscendQuantConfig  # noqa: F401
+        from vllm_ascend.quantization.awq.awq import \
+            AWQQuantConfig  # noqa: F401

     @classmethod
     def get_device_capability(cls, device_id: int = 0):
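With AWQ_QUANTIZATION_METHOD added to supported_quantization and AWQQuantConfig imported in pre_register_and_update, an AWQ checkpoint should pass platform validation on Ascend. A hedged usage sketch using vLLM's standard offline API; the model name is a placeholder, not something exercised by this commit:

# Illustrative only: "awq" is now accepted by the Ascend platform,
# so a run along these lines should no longer be rejected up front.
from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen2.5-7B-Instruct-AWQ", quantization="awq")
outputs = llm.generate(["What does AWQ quantize?"],
                       SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)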

vllm_ascend/quantization/awq/__init__.py

Whitespace-only changes.
