neuralmagic/calibration dataset

brian-dellabetta · HDCharles · commit 6ff11186de33 · 2025-10-30T13:59:50.000Z
Signed-off-by: Brian Dellabetta <bdellabe@redhat.com>
diff --git a/tests/e2e/e2e_utils.py b/tests/e2e/e2e_utils.py
@@ -62,6 +62,21 @@ def data_collator(batch):
 
             oneshot_kwargs["data_collator"] = data_collator
 
+        elif "calibration" in dataset_id:
+
+            def data_collator(batch):
+                assert len(batch) == 1
+                return {
+                    key: (
+                        torch.tensor(value)
+                        if key != "pixel_values"
+                        else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
+                    )
+                    for key, value in batch[0].items()
+                }
+
+            oneshot_kwargs["data_collator"] = data_collator
+
     oneshot_kwargs["model"] = loaded_model
     if recipe:
         oneshot_kwargs["recipe"] = recipe
diff --git a/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml b/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml
@@ -13,7 +13,6 @@ lmeval:
   num_fewshot: 0
   batch_size: 100
   limit: 100
-  # test runs in 26m
   # dense model achieves exact_match accuracy of 0.530
   # dense model achieves relaxed_accuracy of 0.780
   # dense model achieves anywhere_accuracy of 0.800
diff --git a/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml b/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml
@@ -3,8 +3,9 @@ model: Qwen/Qwen3-VL-8B-Instruct
 model_class: Qwen3VLForConditionalGeneration
 scheme: INT8_dyn_per_token
 recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml
-dataset_id: lmms-lab/flickr30k
-dataset_split: "test[:512]"
+dataset_id: neuralmagic/calibration
+dataset_config: LLM
+dataset_split: "train[:512]"
 lmeval:
   model: "hf-multimodal"
   model_args:
@@ -15,11 +16,10 @@ lmeval:
   num_fewshot: 0
   batch_size: 100
   limit: 100
-  # test runs in m
-  # dense model achieves exact_match accuracy of 0.
-  # dense model achieves relaxed_accuracy of 0.
-  # dense model achieves anywhere_accuracy of 0.
+  # dense model achieves exact_match accuracy of 0.520
+  # dense model achieves relaxed_accuracy of 0.780
+  # dense model achieves anywhere_accuracy of 0.800
   metrics:
-    exact_match,none: 0.
-    relaxed_accuracy,none: 0.
-    anywhere_accuracy,none: 0.
+    exact_match,none: 0.550
+    relaxed_accuracy,none: 0.770
+    anywhere_accuracy,none: 0.770
diff --git a/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml b/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml
@@ -3,17 +3,23 @@ model: Qwen/Qwen3-VL-8B-Instruct
 model_class: Qwen3VLForConditionalGeneration
 scheme: W4A16_actorder_weight
 recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml
-dataset_id: lmms-lab/flickr30k
-dataset_split: "test[:512]"
+dataset_id: neuralmagic/calibration
+dataset_config: LLM
+dataset_split: "train[:512]"
 lmeval:
   model: "hf-multimodal"
   model_args:
     dtype: bfloat16
     convert_img_format: True
-  task: mmmu_val_literature
+  task: chartqa
   apply_chat_template: True
   num_fewshot: 0
-  batch_size: 8
-  # dense model achieves accuracy of 0.8333
+  batch_size: 100
+  limit: 100
+  # dense model achieves exact_match accuracy of 0.520
+  # dense model achieves relaxed_accuracy of 0.780
+  # dense model achieves anywhere_accuracy of 0.800
   metrics:
-    acc,none: 0.800
+    exact_match,none: 0.540
+    relaxed_accuracy,none: 0.780
+    anywhere_accuracy,none: 0.800
diff --git a/tests/testing_utils.py b/tests/testing_utils.py
@@ -285,6 +285,31 @@ def process(sample):
                 "images": sample["image"],
             }
 
+    # "neuralmagic/calibration"
+    elif ds_name == "calibration":
+
+        def process(example):
+            messages = []
+            for message in example["messages"]:
+                messages.append(
+                    {
+                        "role": message["role"],
+                        "content": [{"type": "text", "text": message["content"]}],
+                    }
+                )
+
+            return processor.apply_chat_template(
+                messages,
+                return_tensors="pt",
+                padding=False,
+                truncation=True,
+                max_length=max_seq_length,
+                tokenize=True,
+                add_special_tokens=False,
+                return_dict=True,
+                add_generation_prompt=False,
+            )
+
     else:
         raise NotImplementedError(f"Cannot preprocess dataset {ds.info.dataset_name}")