Skip to content

Commit 51ff37d

Browse files
[tests] Update lm_eval VL tests to qwen 3 (#1953)
SUMMARY:
Upgrade the lm_eval vision language tests from Qwen 2.5 to Qwen 3. After updating to include `apply_chat_template`, the scores closely align with what was achieved with Qwen 2.5.
- [x] switch to `neuralmagic/calibration` dataset, based on suggestion [here](#1941 (comment)), to avoid tracing issues related to VL dataset.
- [x] switch to `chartqa` task, to increase number of samples to 500 and reduce variance in accuracy.
- [x] pruned unused datasets (slimorca and llm_compression_calibration)

TEST PLAN:
The 3 lm_eval VL tests were run, and the accuracies were updated:
- vl_fp8_dynamic_per_token.yaml runs in ~29m
- vl_int8_w8a8_dynamic_per_token.yaml runs in ~37m
- vl_w4a16_actorder_weight.yaml runs in ~34m

---------

Signed-off-by: Brian Dellabetta <bdellabe@redhat.com>
1 parent 1aa196f commit 51ff37d

File tree

6 files changed

+85
-61
lines changed

6 files changed

+85
-61
lines changed

tests/e2e/e2e_utils.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,21 @@ def data_collator(batch):
6262

6363
oneshot_kwargs["data_collator"] = data_collator
6464

65+
elif "calibration" in dataset_id:
66+
67+
def data_collator(batch):
68+
assert len(batch) == 1
69+
return {
70+
key: (
71+
torch.tensor(value)
72+
if key != "pixel_values"
73+
else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
74+
)
75+
for key, value in batch[0].items()
76+
}
77+
78+
oneshot_kwargs["data_collator"] = data_collator
79+
6580
oneshot_kwargs["model"] = loaded_model
6681
if recipe:
6782
oneshot_kwargs["recipe"] = recipe
Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,22 @@
11
cadence: weekly
2-
model: Qwen/Qwen2.5-VL-7B-Instruct
3-
model_class: Qwen2_5_VLForConditionalGeneration
2+
model: Qwen/Qwen3-VL-8B-Instruct
3+
model_class: Qwen3VLForConditionalGeneration
44
scheme: FP8_DYNAMIC
55
recipe: tests/e2e/vLLM/recipes/FP8/recipe_fp8_dynamic.yaml
66
lmeval:
77
model: "hf-multimodal"
88
model_args:
99
dtype: bfloat16
10-
add_bos_token: True
1110
convert_img_format: True
12-
task: mmmu_val_literature
11+
task: chartqa
12+
apply_chat_template: True
1313
num_fewshot: 0
14-
batch_size: 8
15-
# dense model achieves accuracy of 0.9 +/ 0.0557
14+
batch_size: 100
15+
limit: 500
16+
# dense model achieves exact_match accuracy of 0.576
17+
# dense model achieves relaxed_accuracy of 0.780
18+
# dense model achieves anywhere_accuracy of 0.806
1619
metrics:
17-
acc,none: 0.8333
18-
acc_stderr,none: 0.0557
20+
exact_match,none: 0.596
21+
relaxed_accuracy,none: 0.784
22+
anywhere_accuracy,none: 0.810
Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,25 @@
11
cadence: "weekly"
2-
model: Qwen/Qwen2.5-VL-7B-Instruct
3-
model_class: Qwen2_5_VLForConditionalGeneration
2+
model: Qwen/Qwen3-VL-8B-Instruct
3+
model_class: Qwen3VLForConditionalGeneration
44
scheme: INT8_dyn_per_token
55
recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml
6-
dataset_id: lmms-lab/flickr30k
7-
dataset_split: "test[:512]"
6+
dataset_id: neuralmagic/calibration
7+
dataset_config: LLM
8+
dataset_split: "train[:512]"
89
lmeval:
910
model: "hf-multimodal"
1011
model_args:
1112
dtype: bfloat16
12-
add_bos_token: True
1313
convert_img_format: True
14-
task: mmmu_val_literature
14+
task: chartqa
15+
apply_chat_template: True
1516
num_fewshot: 0
16-
batch_size: 8
17-
# dense model achieves accuracy of 0.9 +/ 0.0557
17+
batch_size: 100
18+
limit: 500
19+
# dense model achieves exact_match accuracy of 0.576
20+
# dense model achieves relaxed_accuracy of 0.780
21+
# dense model achieves anywhere_accuracy of 0.806
1822
metrics:
19-
acc,none: 0.833
20-
acc_stderr,none: 0.0557
23+
exact_match,none: 0.608
24+
relaxed_accuracy,none: 0.806
25+
anywhere_accuracy,none: 0.824
Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,25 @@
11
cadence: "weekly"
2-
model: Qwen/Qwen2.5-VL-7B-Instruct
3-
model_class: Qwen2_5_VLForConditionalGeneration
2+
model: Qwen/Qwen3-VL-8B-Instruct
3+
model_class: Qwen3VLForConditionalGeneration
44
scheme: W4A16_actorder_weight
55
recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml
6-
dataset_id: lmms-lab/flickr30k
7-
dataset_split: "test[:512]"
6+
dataset_id: neuralmagic/calibration
7+
dataset_config: LLM
8+
dataset_split: "train[:512]"
89
lmeval:
910
model: "hf-multimodal"
1011
model_args:
1112
dtype: bfloat16
12-
add_bos_token: True
1313
convert_img_format: True
14-
task: mmmu_val_literature
14+
task: chartqa
15+
apply_chat_template: True
1516
num_fewshot: 0
16-
batch_size: 8
17-
# dense model achieves accuracy of 0.9 +/ 0.0557
17+
batch_size: 100
18+
limit: 500
19+
# dense model achieves exact_match accuracy of 0.576
20+
# dense model achieves relaxed_accuracy of 0.780
21+
# dense model achieves anywhere_accuracy of 0.806
1822
metrics:
19-
acc,none: 0.8333
20-
acc_stderr,none: 0.0557
23+
exact_match,none: 0.588
24+
relaxed_accuracy,none: 0.782
25+
anywhere_accuracy,none: 0.808

tests/lmeval/test_lmeval.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ class LmEvalConfig(BaseModel):
2525
num_fewshot: int = 5
2626
limit: int = 1000
2727
batch_size: int = 100
28+
apply_chat_template: bool = False
2829
# Recovery testing (default): compare against base model performance
2930
# Default threshold is 0.95 (retain ≥95% of base), can be overridden
3031
recovery_threshold: Union[float, dict] = 0.95
@@ -160,6 +161,7 @@ def _eval_base_model(self):
160161
num_fewshot=self.lmeval.num_fewshot,
161162
limit=self.lmeval.limit,
162163
device="cuda:0",
164+
apply_chat_template=self.lmeval.apply_chat_template,
163165
batch_size=self.lmeval.batch_size,
164166
)
165167

@@ -190,6 +192,7 @@ def _run_lm_eval(self):
190192
num_fewshot=self.lmeval.num_fewshot,
191193
limit=self.lmeval.limit,
192194
device="cuda:0",
195+
apply_chat_template=self.lmeval.apply_chat_template,
193196
batch_size=self.lmeval.batch_size,
194197
)
195198

tests/testing_utils.py

Lines changed: 25 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -218,20 +218,6 @@ def process(sample):
218218
add_special_tokens=False,
219219
)
220220

221-
elif ds_name == "llm_compression_calibration":
222-
223-
def process(sample):
224-
return processor(
225-
processor.apply_chat_template(
226-
sample["text"],
227-
tokenize=False,
228-
),
229-
padding=False,
230-
max_length=max_seq_length,
231-
truncation=True,
232-
add_special_tokens=False,
233-
)
234-
235221
elif ds_name == "open-platypus":
236222
# use the output rather than the instruction
237223
def process(sample):
@@ -246,25 +232,6 @@ def process(sample):
246232
add_special_tokens=False,
247233
)
248234

249-
elif ds_name == "slimorca-deduped-cleaned-corrected":
250-
# find the first element corresponding to a message from a human
251-
def process(sample):
252-
conversation_idx = 0
253-
for idx, conversation in enumerate(sample["conversations"]):
254-
if conversation["from"] == "human":
255-
conversation_idx = idx
256-
break
257-
return processor(
258-
processor.apply_chat_template(
259-
sample["conversations"][conversation_idx]["value"],
260-
tokenize=False,
261-
),
262-
padding=False,
263-
max_length=max_seq_length,
264-
truncation=True,
265-
add_special_tokens=False,
266-
)
267-
268235
elif ds_name == "flickr30k":
269236

270237
def process(sample):
@@ -285,6 +252,31 @@ def process(sample):
285252
"images": sample["image"],
286253
}
287254

255+
# "neuralmagic/calibration"
256+
elif ds_name == "calibration":
257+
258+
def process(example):
259+
messages = []
260+
for message in example["messages"]:
261+
messages.append(
262+
{
263+
"role": message["role"],
264+
"content": [{"type": "text", "text": message["content"]}],
265+
}
266+
)
267+
268+
return processor.apply_chat_template(
269+
messages,
270+
return_tensors="pt",
271+
padding=False,
272+
truncation=True,
273+
max_length=max_seq_length,
274+
tokenize=True,
275+
add_special_tokens=False,
276+
return_dict=True,
277+
add_generation_prompt=False,
278+
)
279+
288280
else:
289281
raise NotImplementedError(f"Cannot preprocess dataset {ds.info.dataset_name}")
290282

0 commit comments

Comments
 (0)