
Commit 9fe852c

[Quantization][Feature] Add AWQ quantization in vllm-ascend.
Signed-off-by: menogrey <1299267905@qq.com>
1 parent 3653f33 commit 9fe852c

File tree: 7 files changed, +486 -12 lines changed

vllm_ascend/attention/mla_v1.py

Lines changed: 9 additions & 8 deletions
@@ -1389,9 +1389,10 @@ def _mla_preprocess(self, layer_name, hidden_states, kv_cache,
         num_decode_tokens = attn_metadata.num_decode_tokens
         num_actual_tokens = attn_metadata.num_actual_tokens
         if self.fused_qkv_a_proj is not None:
-            maybe_npu_prefetch(inputs=self.fused_qkv_a_proj.weight,
-                               dependency=hidden_states,
-                               enabled=self.enable_prefetch)
+            if hasattr(self.fused_qkv_a_proj, 'weight'):
+                maybe_npu_prefetch(inputs=self.fused_qkv_a_proj.weight,
+                                   dependency=hidden_states,
+                                   enabled=self.enable_prefetch)
             qkv_lora = self.fused_qkv_a_proj(hidden_states)[0]
             q_c, kv_no_split = qkv_lora.split(
                 [self.q_lora_rank, self.kv_lora_rank + self.qk_rope_head_dim],
@@ -1593,11 +1594,11 @@ def forward(
             o_proj_input[num_decode_tokens:num_actual_tokens] = output_prefill
         # O proj
         MAX_O_PROJ_PREFETCH_SIZE = 16 * 1024 * 1024
-        maybe_npu_prefetch(inputs=self.o_proj.weight,
-                           dependency=o_proj_input,
-                           max_size=MAX_O_PROJ_PREFETCH_SIZE,
-                           enabled=self.enable_prefetch)
-
+        if hasattr(self.o_proj, 'weight'):
+            maybe_npu_prefetch(inputs=self.o_proj.weight,
+                               dependency=o_proj_input,
+                               max_size=MAX_O_PROJ_PREFETCH_SIZE,
+                               enabled=self.enable_prefetch)
         output[...] = self.o_proj(o_proj_input,
                                   is_prefill=prefill_preprocess_res
                                   is not None)[0]
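Note on the guard: the hasattr(..., 'weight') checks are presumably needed because AWQ-quantized linear layers carry packed parameters (commonly qweight plus scales/zeros) instead of a plain weight tensor, so weight prefetch is simply skipped for them. A minimal, self-contained sketch of the same pattern; maybe_prefetch, AwqPackedLinear and its qweight/scales fields are illustrative stand-ins, not vllm-ascend code:

# Sketch of the guard pattern above, with hypothetical stand-in classes.
import torch


def maybe_prefetch(weight: torch.Tensor, enabled: bool = True) -> None:
    # Stand-in for maybe_npu_prefetch: a no-op placeholder here.
    if enabled:
        pass  # real code would issue an NPU prefetch for `weight`


class FloatLinear(torch.nn.Linear):
    """Unquantized layer: exposes a plain .weight tensor."""


class AwqPackedLinear(torch.nn.Module):
    """Hypothetical AWQ-style layer: packed qweight/scales, no .weight."""

    def __init__(self, in_features: int, out_features: int):
        super().__init__()
        self.qweight = torch.zeros(in_features, out_features // 8,
                                   dtype=torch.int32)
        self.scales = torch.ones(in_features // 128, out_features)


for layer in (FloatLinear(16, 16), AwqPackedLinear(128, 128)):
    # Same guard as in the diff: only prefetch when a dense weight exists.
    if hasattr(layer, "weight"):
        maybe_prefetch(layer.weight)

The point is only that prefetch degrades to a no-op for layers without a dense weight, rather than raising AttributeError.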

vllm_ascend/ops/layernorm.py

Lines changed: 2 additions & 1 deletion
@@ -96,7 +96,8 @@ def __init__(
         vllm_config = get_current_vllm_config()
         self.bias = None
         # quantization with anti_method m4 will generate none-zero norm bias
-        if vllm_config.quant_config is not None and \
+        if vllm_config.quant_config is not None and hasattr(vllm_config.quant_config, "quant_description") and \
+           vllm_config.quant_config.quant_description is not None and \
            any("norm.bias" in name for name in vllm_config.quant_config.quant_description.keys()):
             self.bias = torch.nn.Parameter(torch.zeros(hidden_size),
                                            requires_grad=False)
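The layernorm change follows the same defensive pattern: quant_description is specific to the Ascend quant config, so once third-party configs such as AWQ can reach this code path, the attribute has to be probed before use. A small sketch of the guarded lookup, with hypothetical stand-in config classes:

# Hedged sketch of the guarded lookup above; the fake configs are
# illustrative stand-ins, not actual vLLM/vllm-ascend classes.
from typing import Any, Optional


def needs_norm_bias(quant_config: Optional[Any]) -> bool:
    # Mirrors the diff: only Ascend-style configs carry quant_description;
    # AWQ (or no quantization at all) must not trip an AttributeError.
    return (quant_config is not None
            and hasattr(quant_config, "quant_description")
            and quant_config.quant_description is not None
            and any("norm.bias" in name
                    for name in quant_config.quant_description))  # dict keys


class _FakeAscendConfig:
    quant_description = {"model.layers.0.input_layernorm.norm.bias": "m4"}


class _FakeAwqConfig:
    pass  # no quant_description attribute, like an AWQ config


print(needs_norm_bias(_FakeAscendConfig()))  # True
print(needs_norm_bias(_FakeAwqConfig()))     # False
print(needs_norm_bias(None))                 # False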

vllm_ascend/platform.py

Lines changed: 7 additions & 2 deletions
@@ -30,7 +30,8 @@
                                        init_ascend_config)
 from vllm_ascend.torchair.utils import (check_torchair_cache_exist,
                                         delete_torchair_cache_file)
-from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, enable_sp, is_310p,
+from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD,
+                               AWQ_QUANTIZATION_METHOD, enable_sp, is_310p,
                                prefill_context_parallel_enable,
                                update_aclgraph_sizes,
                                update_cudagraph_capture_sizes,
@@ -55,7 +56,9 @@ class NPUPlatform(Platform):
     device_control_env_var: str = "ASCEND_RT_VISIBLE_DEVICES"
     dispatch_key: str = "PrivateUse1"

-    supported_quantization: list[str] = [ASCEND_QUANTIZATION_METHOD]
+    supported_quantization: list[str] = [
+        ASCEND_QUANTIZATION_METHOD, AWQ_QUANTIZATION_METHOD
+    ]

     def is_sleep_mode_available(self) -> bool:
         return True
@@ -80,6 +83,8 @@ def pre_register_and_update(cls,

         from vllm_ascend.quantization.quant_config import \
             AscendQuantConfig  # noqa: F401
+        from vllm_ascend.quantization.awq.awq import \
+            AWQQuantConfig  # noqa: F401

     @classmethod
     def get_device_capability(cls, device_id: int = 0):
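With AWQ_QUANTIZATION_METHOD added to supported_quantization and AWQQuantConfig imported in pre_register_and_update, an AWQ checkpoint should pass platform validation on Ascend. A hedged usage sketch using vLLM's standard offline API; the model name is a placeholder, not something exercised by this commit:

# Illustrative only: "awq" is now accepted by the Ascend platform,
# so a run along these lines should no longer be rejected up front.
from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen2.5-7B-Instruct-AWQ", quantization="awq")
outputs = llm.generate(["What does AWQ quantize?"],
                       SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)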

vllm_ascend/quantization/awq/__init__.py

Whitespace-only changes.
