|
9 | 9 | from vllm.logger import init_logger |
10 | 10 | from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( |
11 | 11 | CompressedTensorsScheme) |
12 | | -from vllm.model_executor.layers.quantization.utils.nvfp4_emulation_utils import ( # noqa: E501 |
13 | | - dequantize_to_dtype, ref_nvfp4_quant) |
14 | 12 | from vllm.model_executor.parameter import (GroupQuantScaleParameter, |
15 | 13 | ModelWeightParameter, |
16 | 14 | PerTensorScaleParameter) |
|
21 | 19 | __all__ = ["CompressedTensorsW4A4Fp4"] |
22 | 20 |
|
23 | 21 |
|
24 | | -def cutlass_fp4_supported() -> bool: |
25 | | - if not current_platform.is_cuda(): |
26 | | - return False |
27 | | - capability_tuple = current_platform.get_device_capability() |
28 | | - capability = -1 if capability_tuple is None else capability_tuple.to_int() |
29 | | - return cutlass_scaled_mm_supports_fp4(capability) |
30 | | - |
31 | | - |
32 | 22 | class CompressedTensorsW4A4Fp4(CompressedTensorsScheme): |
33 | 23 |
|
34 | 24 | def __init__(self): |
35 | 25 | self.group_size = 16 |
36 | | - self.cutlass_nvfp4_supported = cutlass_fp4_supported() |
37 | | - if not self.cutlass_nvfp4_supported: |
38 | | - logger.warning("Current platform does not support cutlass NVFP4." |
39 | | - " Running emulations.") |
40 | 26 |
|
41 | 27 | @classmethod |
42 | 28 | def get_min_capability(cls) -> int: |
43 | | - # dont restrict as emulations |
44 | | - return 80 |
45 | | - |
46 | | - def run_nvfp4_emulations(self, x: torch.Tensor, layer): |
47 | | - x_m, x_k = x.shape |
48 | | - output_dtype = x.dtype |
49 | | - |
50 | | - # quantize input to (FP4 and interleaved block scale) |
51 | | - x_fp4, x_blockscale = ref_nvfp4_quant(x, layer.input_global_scale, |
52 | | - self.group_size) |
| 29 | + return 100 |
53 | 30 |
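With the emulation fallback gone, the scheme now gates itself to Blackwell-class GPUs: `get_min_capability` moves from 80 to 100 (SM100), the generation with native FP4 tensor-core support. Below is a minimal sketch of the integer encoding this threshold is compared against, assuming the usual `major * 10 + minor` convention; the class is illustrative, not vLLM's actual `DeviceCapability`:

```python
from typing import NamedTuple

class CapabilitySketch(NamedTuple):
    """Illustrative stand-in for the platform capability tuple."""
    major: int
    minor: int

    def to_int(self) -> int:
        # Assumed encoding: compute capability (major, minor) -> major * 10 + minor.
        return self.major * 10 + self.minor

assert CapabilitySketch(10, 0).to_int() == 100  # Blackwell (SM100) meets the new floor
assert CapabilitySketch(9, 0).to_int() == 90    # Hopper now falls below it
```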
|
54 | | - # dequantize input |
55 | | - x_fp4 = x_fp4.reshape(x_m, x_k // self.group_size, self.group_size) |
56 | | - x_blockscale = x_blockscale.unsqueeze(-1) / layer.input_global_scale |
57 | | - x_dq = (x_fp4 * x_blockscale).reshape(x_m, x_k).to(output_dtype) |
58 | | - del x_fp4, x_blockscale |
59 | | - |
60 | | - # dequantize weight |
61 | | - w_fp4 = layer.weight.data.view(torch.uint8) |
62 | | - w_blockscale = layer.weight_scale_swizzled.data |
63 | | - w_global_scale = layer.weight_global_scale |
64 | | - w_dq = dequantize_to_dtype(w_fp4, w_blockscale, w_global_scale, |
65 | | - output_dtype, x.device, self.group_size) |
66 | | - |
67 | | - # matmul |
68 | | - out = torch.matmul(x_dq, w_dq.t()) |
69 | | - del w_dq, x_dq |
70 | | - return out |
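For reference, the deleted emulation path dequantized both operands back to the activation dtype and ran a dense `matmul`. The sketch below is a rough, self-contained reconstruction of its per-group dequantization step; it assumes the FP4 payload and block scales are already decoded/upcast to a float dtype, whereas the removed helpers (`ref_nvfp4_quant`, `dequantize_to_dtype`) also handled the bit packing and the swizzled scale layout:

```python
import torch

def dequant_nvfp4_blocks_sketch(x_fp4: torch.Tensor,
                                block_scale: torch.Tensor,
                                global_scale: torch.Tensor,
                                group_size: int = 16) -> torch.Tensor:
    # x_fp4:        (M, K) FP4 values already decoded to float.
    # block_scale:  (M, K // group_size) per-group scales, upcast to float.
    # global_scale: scalar tensor; mirrors `x_blockscale / input_global_scale`
    #               in the removed code.
    m, k = x_fp4.shape
    groups = x_fp4.reshape(m, k // group_size, group_size)
    scale = (block_scale / global_scale).unsqueeze(-1)
    return (groups * scale).reshape(m, k)

# Toy shapes: 2 rows, 32 columns -> 2 groups of 16 per row.
x = torch.randn(2, 32)
scales = torch.rand(2, 2) + 0.5
assert dequant_nvfp4_blocks_sketch(x, scales, torch.tensor(4.0)).shape == (2, 32)
```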
| 31 | + @classmethod |
| 32 | + def cutlass_fp4_supported(cls) -> bool: |
| 33 | + if not current_platform.is_cuda(): |
| 34 | + return False |
| 35 | + capability_tuple = current_platform.get_device_capability() |
| 36 | + capability = -1 if capability_tuple is None else capability_tuple.to_int( # noqa: E501 |
| 37 | + ) |
| 38 | + return cutlass_scaled_mm_supports_fp4(capability) |
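The support probe also moves from a module-level helper onto the scheme as a classmethod. Since there is no emulation fallback any more, a caller that cannot use the CUTLASS kernel has nothing to fall back to; a hypothetical caller-side guard (not part of this diff, error text illustrative) would look like:

```python
# Hypothetical guard; only the requirement itself comes from this change.
if not CompressedTensorsW4A4Fp4.cutlass_fp4_supported():
    raise ValueError(
        "W4A4 NVFP4 requires CUTLASS FP4 support "
        "(NVIDIA compute capability >= 100); the emulation path was removed.")
```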
71 | 39 |
|
72 | 40 | def create_weights(self, layer: torch.nn.Module, |
73 | 41 | output_partition_sizes: list[int], |
@@ -152,27 +120,24 @@ def process_weights_after_loading(self, layer) -> None: |
152 | 120 | # required by cutlass kernel; need Parameter, not ModelWeightParameter |
153 | 121 | layer.weight = Parameter(layer.weight_packed.data, requires_grad=False) |
154 | 122 |
|
155 | | - if self.cutlass_nvfp4_supported: |
156 | | - layer.alpha = Parameter(layer.input_global_scale * |
157 | | - layer.weight_global_scale, |
158 | | - requires_grad=False) |
| 123 | + layer.alpha = Parameter(layer.input_global_scale * |
| 124 | + layer.weight_global_scale, |
| 125 | + requires_grad=False) |
159 | 126 |
|
160 | 127 | def apply_weights(self, |
161 | 128 | layer: torch.nn.Module, |
162 | 129 | x: torch.Tensor, |
163 | 130 | bias: Optional[torch.Tensor] = None) -> torch.Tensor: |
164 | 131 |
|
165 | | - if self.cutlass_nvfp4_supported: |
166 | | - output_dtype = x.dtype |
167 | | - output_shape = [x.shape[0], layer.weight.shape[0]] |
| 132 | + output_dtype = x.dtype |
| 133 | + output_shape = [x.shape[0], layer.weight.shape[0]] |
168 | 134 |
|
169 | | - # quantize BF16 or FP16 to (FP4 and interleaved block scale) |
170 | | - x_fp4, x_blockscale = scaled_fp4_quant(x, layer.input_global_scale) |
| 135 | + # quantize BF16 or FP16 to (FP4 and interleaved block scale) |
| 136 | + x_fp4, x_blockscale = scaled_fp4_quant(x, layer.input_global_scale) |
171 | 137 |
|
172 | | - out = cutlass_scaled_fp4_mm(x_fp4, layer.weight, x_blockscale, |
173 | | - layer.weight_scale_swizzled, |
174 | | - 1 / layer.alpha, output_dtype) |
175 | | - if bias is not None: |
176 | | - out = out + bias |
177 | | - return out.view(*output_shape) |
178 | | - return self.run_nvfp4_emulations(x, layer) |
| 138 | + out = cutlass_scaled_fp4_mm(x_fp4, layer.weight, x_blockscale, |
| 139 | + layer.weight_scale_swizzled, |
| 140 | + 1 / layer.alpha, output_dtype) |
| 141 | + if bias is not None: |
| 142 | + out = out + bias |
| 143 | + return out.view(*output_shape) |
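`apply_weights` is now unconditionally the CUTLASS path: quantize the BF16/FP16 activations to FP4 plus an interleaved block scale, run `cutlass_scaled_fp4_mm`, and undo the two per-tensor global scales via `1 / layer.alpha`, where `alpha = input_global_scale * weight_global_scale` was folded in `process_weights_after_loading`. A toy sketch of that scale algebra only, with FP4 packing and block scales elided:

```python
import torch

# Both operands are quantized after being scaled by their per-tensor global
# scales, so a GEMM on the scaled values carries the product of the two
# scales; multiplying by 1 / alpha recovers the unscaled result.
torch.manual_seed(0)
x, w = torch.randn(4, 8), torch.randn(16, 8)
input_global_scale, weight_global_scale = 3.0, 5.0

scaled_out = (x * input_global_scale) @ (w * weight_global_scale).t()
alpha = input_global_scale * weight_global_scale
assert torch.allclose(scaled_out * (1 / alpha), x @ w.t(), atol=1e-5)
```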