[tests] Update lm_eval VL tests to qwen 3 (#1953)

brian-dellabetta · web-flow · commit 51ff37dde83c · 2025-10-30T10:48:45.000-04:00
SUMMARY: Upgrade the lm_eval vision language tests from Qwen 2.5 to Qwen 3. After updating to include `apply_chat_template`, the scores closely align with what was achieved with Qwen 2.5. - [x] switch to `neuralmagic/calibration` dataset, based on suggestion [here](#1941 (comment)), to avoid tracing issues related to VL dataset. - [x] switch to `chartqa` task, to increase number of samples to 500 and reduce variance in accuracy. - [x] pruned unused datasets (slimorca and llm_compression_calibration) TEST PLAN: The 3 lm_eval VL tests were run, and the accuracies were updated - vl_fp8_dynamic_per_token.yaml runs in ~29m - vl_int8_w8a8_dynamic_per_token.yaml runs in ~37m - vl_w4a16_actorder_weight.yaml runs in ~34m --------- Signed-off-by: Brian Dellabetta <bdellabe@redhat.com>
diff --git a/tests/e2e/e2e_utils.py b/tests/e2e/e2e_utils.py
@@ -62,6 +62,21 @@ def data_collator(batch):
 
             oneshot_kwargs["data_collator"] = data_collator
 
+        elif "calibration" in dataset_id:
+
+            def data_collator(batch):
+                assert len(batch) == 1
+                return {
+                    key: (
+                        torch.tensor(value)
+                        if key != "pixel_values"
+                        else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
+                    )
+                    for key, value in batch[0].items()
+                }
+
+            oneshot_kwargs["data_collator"] = data_collator
+
     oneshot_kwargs["model"] = loaded_model
     if recipe:
         oneshot_kwargs["recipe"] = recipe
diff --git a/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml b/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml
@@ -1,18 +1,22 @@
 cadence: weekly
-model: Qwen/Qwen2.5-VL-7B-Instruct
-model_class: Qwen2_5_VLForConditionalGeneration
+model: Qwen/Qwen3-VL-8B-Instruct
+model_class: Qwen3VLForConditionalGeneration
 scheme: FP8_DYNAMIC
 recipe: tests/e2e/vLLM/recipes/FP8/recipe_fp8_dynamic.yaml
 lmeval:
   model: "hf-multimodal"
   model_args:
     dtype: bfloat16
-    add_bos_token: True
     convert_img_format: True
-  task: mmmu_val_literature
+  task: chartqa
+  apply_chat_template: True
   num_fewshot: 0
-  batch_size: 8
-  # dense model achieves accuracy of 0.9 +/ 0.0557
+  batch_size: 100
+  limit: 500
+  # dense model achieves exact_match accuracy of 0.576
+  # dense model achieves relaxed_accuracy of 0.780
+  # dense model achieves anywhere_accuracy of 0.806
   metrics:
-    acc,none: 0.8333
-    acc_stderr,none: 0.0557
+    exact_match,none: 0.596
+    relaxed_accuracy,none: 0.784
+    anywhere_accuracy,none: 0.810
diff --git a/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml b/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml
@@ -1,20 +1,25 @@
 cadence: "weekly"
-model: Qwen/Qwen2.5-VL-7B-Instruct
-model_class: Qwen2_5_VLForConditionalGeneration
+model: Qwen/Qwen3-VL-8B-Instruct
+model_class: Qwen3VLForConditionalGeneration
 scheme: INT8_dyn_per_token
 recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml
-dataset_id: lmms-lab/flickr30k
-dataset_split: "test[:512]"
+dataset_id: neuralmagic/calibration
+dataset_config: LLM
+dataset_split: "train[:512]"
 lmeval:
   model: "hf-multimodal"
   model_args:
     dtype: bfloat16
-    add_bos_token: True
     convert_img_format: True
-  task: mmmu_val_literature
+  task: chartqa
+  apply_chat_template: True
   num_fewshot: 0
-  batch_size: 8
-  # dense model achieves accuracy of 0.9 +/ 0.0557
+  batch_size: 100
+  limit: 500
+  # dense model achieves exact_match accuracy of 0.576
+  # dense model achieves relaxed_accuracy of 0.780
+  # dense model achieves anywhere_accuracy of 0.806
   metrics:
-    acc,none: 0.833
-    acc_stderr,none: 0.0557
+    exact_match,none: 0.608
+    relaxed_accuracy,none: 0.806
+    anywhere_accuracy,none: 0.824
diff --git a/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml b/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml
@@ -1,20 +1,25 @@
 cadence: "weekly"
-model: Qwen/Qwen2.5-VL-7B-Instruct
-model_class: Qwen2_5_VLForConditionalGeneration
+model: Qwen/Qwen3-VL-8B-Instruct
+model_class: Qwen3VLForConditionalGeneration
 scheme: W4A16_actorder_weight
 recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml
-dataset_id: lmms-lab/flickr30k
-dataset_split: "test[:512]"
+dataset_id: neuralmagic/calibration
+dataset_config: LLM
+dataset_split: "train[:512]"
 lmeval:
   model: "hf-multimodal"
   model_args:
     dtype: bfloat16
-    add_bos_token: True
     convert_img_format: True
-  task: mmmu_val_literature
+  task: chartqa
+  apply_chat_template: True
   num_fewshot: 0
-  batch_size: 8
-  # dense model achieves accuracy of 0.9 +/ 0.0557
+  batch_size: 100
+  limit: 500
+  # dense model achieves exact_match accuracy of 0.576
+  # dense model achieves relaxed_accuracy of 0.780
+  # dense model achieves anywhere_accuracy of 0.806
   metrics:
-    acc,none: 0.8333
-    acc_stderr,none: 0.0557
+    exact_match,none: 0.588
+    relaxed_accuracy,none: 0.782
+    anywhere_accuracy,none: 0.808
diff --git a/tests/lmeval/test_lmeval.py b/tests/lmeval/test_lmeval.py
@@ -25,6 +25,7 @@ class LmEvalConfig(BaseModel):
     num_fewshot: int = 5
     limit: int = 1000
     batch_size: int = 100
+    apply_chat_template: bool = False
     # Recovery testing (default): compare against base model performance
     # Default threshold is 0.95 (retain ≥95% of base), can be overridden
     recovery_threshold: Union[float, dict] = 0.95
@@ -160,6 +161,7 @@ def _eval_base_model(self):
             num_fewshot=self.lmeval.num_fewshot,
             limit=self.lmeval.limit,
             device="cuda:0",
+            apply_chat_template=self.lmeval.apply_chat_template,
             batch_size=self.lmeval.batch_size,
         )
 
@@ -190,6 +192,7 @@ def _run_lm_eval(self):
             num_fewshot=self.lmeval.num_fewshot,
             limit=self.lmeval.limit,
             device="cuda:0",
+            apply_chat_template=self.lmeval.apply_chat_template,
             batch_size=self.lmeval.batch_size,
         )
 
diff --git a/tests/testing_utils.py b/tests/testing_utils.py
@@ -218,20 +218,6 @@ def process(sample):
                 add_special_tokens=False,
             )
 
-    elif ds_name == "llm_compression_calibration":
-
-        def process(sample):
-            return processor(
-                processor.apply_chat_template(
-                    sample["text"],
-                    tokenize=False,
-                ),
-                padding=False,
-                max_length=max_seq_length,
-                truncation=True,
-                add_special_tokens=False,
-            )
-
     elif ds_name == "open-platypus":
         # use the output rather than the instruction
         def process(sample):
@@ -246,25 +232,6 @@ def process(sample):
                 add_special_tokens=False,
             )
 
-    elif ds_name == "slimorca-deduped-cleaned-corrected":
-        # find the first element corresponding to a message from a human
-        def process(sample):
-            conversation_idx = 0
-            for idx, conversation in enumerate(sample["conversations"]):
-                if conversation["from"] == "human":
-                    conversation_idx = idx
-                    break
-            return processor(
-                processor.apply_chat_template(
-                    sample["conversations"][conversation_idx]["value"],
-                    tokenize=False,
-                ),
-                padding=False,
-                max_length=max_seq_length,
-                truncation=True,
-                add_special_tokens=False,
-            )
-
     elif ds_name == "flickr30k":
 
         def process(sample):
@@ -285,6 +252,31 @@ def process(sample):
                 "images": sample["image"],
             }
 
+    # "neuralmagic/calibration"
+    elif ds_name == "calibration":
+
+        def process(example):
+            messages = []
+            for message in example["messages"]:
+                messages.append(
+                    {
+                        "role": message["role"],
+                        "content": [{"type": "text", "text": message["content"]}],
+                    }
+                )
+
+            return processor.apply_chat_template(
+                messages,
+                return_tensors="pt",
+                padding=False,
+                truncation=True,
+                max_length=max_seq_length,
+                tokenize=True,
+                add_special_tokens=False,
+                return_dict=True,
+                add_generation_prompt=False,
+            )
+
     else:
         raise NotImplementedError(f"Cannot preprocess dataset {ds.info.dataset_name}")