diff --git a/tests/e2e/e2e_utils.py b/tests/e2e/e2e_utils.py index 92a272737f..765d864cc4 100644 --- a/tests/e2e/e2e_utils.py +++ b/tests/e2e/e2e_utils.py @@ -84,11 +84,13 @@ def data_collator(batch): targets="Linear", scheme=scheme, actorder=None, # added for consistency with past testing configs - ignore=["lm_head"], + ignore=["lm_head", "re:.*mlp.gate[.].*"], ) else: oneshot_kwargs["recipe"] = QuantizationModifier( - targets="Linear", scheme=scheme, ignore=["lm_head"] + targets="Linear", + scheme=scheme, + ignore=["lm_head", "re:.*mlp.gate[.].*"], ) # Apply quantization. diff --git a/tests/e2e/vLLM/configs/qwen3_fp4_nvfp4.yaml b/tests/e2e/vLLM/configs/qwen3_fp4_nvfp4.yaml new file mode 100644 index 0000000000..81fa03ad7d --- /dev/null +++ b/tests/e2e/vLLM/configs/qwen3_fp4_nvfp4.yaml @@ -0,0 +1,7 @@ +cadence: "nightly" +test_type: "regression" +model: Qwen/Qwen3-30B-A3B +scheme: NVFP4 +dataset_id: HuggingFaceH4/ultrachat_200k +dataset_split: train_sft +num_calibration_samples: 20 diff --git a/tests/e2e/vLLM/configs/qwen3_fp8_dynamic_per_token.yaml b/tests/e2e/vLLM/configs/qwen3_fp8_dynamic_per_token.yaml new file mode 100644 index 0000000000..2a8fd2bd05 --- /dev/null +++ b/tests/e2e/vLLM/configs/qwen3_fp8_dynamic_per_token.yaml @@ -0,0 +1,4 @@ +cadence: "nightly" +test_type: "regression" +model: Qwen/Qwen3-30B-A3B +scheme: FP8_DYNAMIC diff --git a/tests/e2e/vLLM/run_vllm.py b/tests/e2e/vLLM/run_vllm.py index 4daa93db10..aed83f2e56 100644 --- a/tests/e2e/vLLM/run_vllm.py +++ b/tests/e2e/vLLM/run_vllm.py @@ -18,7 +18,7 @@ def parse_args(): except json.JSONDecodeError as e: raise ValueError(f"Invalid JSON input: {e}") - if "W4A16_2of4" in scheme: + if scheme is not None and "W4A16_2of4" in scheme: # required by the kernel llm_kwargs["dtype"] = torch.float16