[LM Eval] Update / fix testing (#1992)

dsikka · web-flow · commit 71cd6aea1037 · 2025-11-04T16:11:29.000-05:00
SUMMARY: - Bump LM Eval to support chartqa - #1953 landed without updating lm-eval, which is required to support chartqa for VL models - Update recovery metrics for two cases to use different thresholds, not 95%
diff --git a/setup.py b/setup.py
@@ -151,7 +151,7 @@ def localversion_func(version: ScmVersion) -> str:
             "pytest>=6.0.0",
             "pytest-mock>=3.6.0",
             "pytest-rerunfailures>=13.0",
-            "lm_eval==0.4.5",
+            "lm_eval==0.4.9",
             # test dependencies
             "beautifulsoup4~=4.12.3",
             "cmarkgfm>=2024.1.14",
diff --git a/tests/lmeval/configs/w4a16_actorder_weight.yaml b/tests/lmeval/configs/w4a16_actorder_weight.yaml
@@ -5,6 +5,9 @@ recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml
 dataset_id: HuggingFaceH4/ultrachat_200k
 dataset_split: train_sft
 lmeval:
+  recovery_threshold:
+    exact_match,strict-match: 0.94
+    exact_match,flexible-extract: 0.94
   metrics:
     exact_match,flexible-extract: 0.72
     exact_match,strict-match: 0.72
diff --git a/tests/lmeval/configs/w4a16_awq_sym.yaml b/tests/lmeval/configs/w4a16_awq_sym.yaml
@@ -5,6 +5,9 @@ recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_awq_sym.yaml
 dataset_id: HuggingFaceH4/ultrachat_200k
 dataset_split: train_sft
 lmeval:
+  recovery_threshold:
+    exact_match,strict-match: 0.92
+    exact_match,flexible-extract: 0.93
   metrics:
     exact_match,flexible-extract: 0.70
     exact_match,strict-match: 0.70