EleutherAI · MengAiDev · Oct 19, 2025
@@ -0,0 +1,17 @@
+from lm_eval.api.task import TaskConfig, get_task_dict
+from lm_eval.tasks.gsm_symbolic.gsm_symbolic import *
+from lm_eval.tasks.gsm_symbolic.gsm_symbolic_cot import *
+
+CONFIG = TaskConfig(  # Module-level config object describing the plain (non-CoT) GSM-Symbolic task.
+    name="gsm_symbolic",  # NOTE(review): the YAML configs key the task as `task:`; confirm TaskConfig accepts `name`.
+    description="GSM-Symbolic math word problems dataset",
+    keywords=["math", "word problems", "symbolic reasoning"],  # NOTE(review): verify `keywords` is a real TaskConfig field.
+    metrics=["exact_match"],  # NOTE(review): the YAML configs use `metric_list`; confirm `metrics` is accepted here.
+)
+
+CONFIG_COT = TaskConfig(  # Module-level config object describing the chain-of-thought variant of the task.
+    name="gsm_symbolic_cot",  # NOTE(review): the YAML configs key the task as `task:`; confirm TaskConfig accepts `name`.
+    description="GSM-Symbolic math word problems dataset with Chain-of-Thought prompting",
+    keywords=["math", "word problems", "symbolic reasoning", "chain of thought"],  # NOTE(review): verify `keywords` is a real TaskConfig field.
+    metrics=["exact_match"],  # NOTE(review): the YAML configs use `metric_list`; confirm `metrics` is accepted here.
+)
@@ -0,0 +1,45 @@
+tag:  # Task configuration for GSM-Symbolic with a plain "Question/Answer" prompt.
+  - math_word_problems
+task: gsm_symbolic  # Task identifier.
+dataset_path: apple/GSM-Symbolic  # Presumably the Hugging Face Hub dataset id -- confirm.
+dataset_name: p1  # NOTE(review): selects the `p1` configuration; confirm this (rather than the dataset's default subset) is intended for the base task name.
+output_type: generate_until  # Free-form generation that ends at one of the `until` strings below.
+training_split: test  # All three splits point at `test`.
+fewshot_split: test  # NOTE(review): few-shot examples come from the same split that is evaluated -- confirm the harness excludes the current doc.
+test_split: test
+doc_to_text: "Question: {{question}}\nAnswer:"  # Prompt template rendered from the `question` field.
+doc_to_target: "{{answer}}"  # Target is the raw `answer` field; presumably GSM8K-style text ending in "#### <number>" -- confirm.
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: false
+    regexes_to_ignore:  # Patterns listed for the metric to ignore when comparing strings.
+      - ","  # commas (e.g. thousands separators)
+      - "\\$"  # literal dollar signs
+      - "(?s).*#### "  # everything up to and including the last "#### " marker, across newlines
+      - "\\.$"  # a trailing period
+generation_kwargs:
+  until:  # Stop strings: the start of a new question, or end-of-sequence markers.
+    - "Question:"
+    - "</s>"
+    - "<|end|>"
+  do_sample: false  # Sampling disabled.
+  temperature: 0.0
+repeats: 1
+num_fewshot: 5  # Five in-context examples by default.
+filter_list:  # Two named answer-extraction pipelines applied to the generations.
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "#### (\\-?[0-9\\.\\,]+)"  # Captures an optionally negative run of digits, dots and commas after "#### ".
+      - function: "take_first"
+  - name: "flexible-extract"
+    filter:
+      - function: "regex"
+        group_select: -1  # Presumably selects the last match in the generation -- confirm against the regex filter docs.
+        regex_pattern: "(-?[$0-9.,]{2,})|(-?[0-9]+)"  # Either a 2+ character run of $, digits, dots, commas, or a bare integer.
+      - function: "take_first"
+metadata:
+  version: 1.0
@@ -0,0 +1,46 @@
+tag:  # Task configuration for GSM-Symbolic with a chain-of-thought prompt suffix.
+  - math_word_problems
+  - chain_of_thought
+task: gsm_symbolic_cot  # Task identifier.
+dataset_path: apple/GSM-Symbolic  # Presumably the Hugging Face Hub dataset id -- confirm.
+dataset_name: p1  # NOTE(review): selects the `p1` configuration; confirm this (rather than the dataset's default subset) is intended.
+output_type: generate_until  # Free-form generation that ends at one of the `until` strings below.
+training_split: test  # All three splits point at `test`.
+fewshot_split: test  # NOTE(review): few-shot examples come from the same split that is evaluated -- confirm the harness excludes the current doc.
+test_split: test
+doc_to_text: "Question: {{question}}\nAnswer: Let's think step by step."  # Prompt template; the suffix asks for step-by-step reasoning.
+doc_to_target: "{{answer}}"  # Target is the raw `answer` field; presumably GSM8K-style text ending in "#### <number>" -- confirm.
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: false
+    regexes_to_ignore:  # Patterns listed for the metric to ignore when comparing strings.
+      - ","  # commas (e.g. thousands separators)
+      - "\\$"  # literal dollar signs
+      - "(?s).*#### "  # everything up to and including the last "#### " marker, across newlines
+      - "\\.$"  # a trailing period
+generation_kwargs:
+  until:  # Stop strings: the start of a new question, or end-of-sequence markers.
+    - "Question:"
+    - "</s>"
+    - "<|end|>"
+  do_sample: false  # Sampling disabled.
+  temperature: 0.0
+repeats: 1
+num_fewshot: 5  # Five in-context examples by default.
+filter_list:  # Two named answer-extraction pipelines applied to the generations.
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "#### (\\-?[0-9\\.\\,]+)"  # Captures an optionally negative run of digits, dots and commas after "#### ".
+      - function: "take_first"
+  - name: "flexible-extract"
+    filter:
+      - function: "regex"
+        group_select: -1  # Presumably selects the last match in the generation -- confirm against the regex filter docs.
+        regex_pattern: "(-?[$0-9.,]{2,})|(-?[0-9]+)"  # Either a 2+ character run of $, digits, dots, commas, or a bare integer.
+      - function: "take_first"
+metadata:
+  version: 1.0