diff --git a/lm_eval/tasks/gsm_symbolic/__init__.py b/lm_eval/tasks/gsm_symbolic/__init__.py
new file mode 100644
index 0000000000..b638b1d0a3
--- /dev/null
+++ b/lm_eval/tasks/gsm_symbolic/__init__.py
@@ -0,0 +1,28 @@
+"""GSM-Symbolic task package.
+
+The runnable task definitions are the YAML files added alongside this module
+(``gsm_symbolic.yaml`` and ``gsm_symbolic_cot.yaml``). ``CONFIG`` and
+``CONFIG_COT`` are programmatic summaries of those two tasks.
+"""
+
+from lm_eval.api.task import TaskConfig
+
+# NOTE(review): TaskConfig is expected to accept ``task`` / ``metric_list`` /
+# ``metadata``; the free-form description and keywords are kept in ``metadata``.
+CONFIG = TaskConfig(
+    task="gsm_symbolic",
+    metric_list=[{"metric": "exact_match"}],
+    metadata={
+        "description": "GSM-Symbolic math word problems dataset",
+        "keywords": ["math", "word problems", "symbolic reasoning"],
+    },
+)
+
+CONFIG_COT = TaskConfig(
+    task="gsm_symbolic_cot",
+    metric_list=[{"metric": "exact_match"}],
+    metadata={
+        "description": "GSM-Symbolic math word problems dataset with Chain-of-Thought prompting",
+        "keywords": ["math", "word problems", "symbolic reasoning", "chain of thought"],
+    },
+)
diff --git a/lm_eval/tasks/gsm_symbolic/gsm_symbolic.yaml b/lm_eval/tasks/gsm_symbolic/gsm_symbolic.yaml
new file mode 100644
index 0000000000..8afc7b549a
--- /dev/null
+++ b/lm_eval/tasks/gsm_symbolic/gsm_symbolic.yaml
@@ -0,0 +1,50 @@
+# GSM-Symbolic (dataset config "p1"): few-shot generation scored by exact match.
+tag:
+  - math_word_problems
+task: gsm_symbolic
+dataset_path: apple/GSM-Symbolic
+dataset_name: p1
+output_type: generate_until
+# NOTE(review): all three splits point at "test", so few-shot examples are drawn
+# from the same split that is evaluated.
+training_split: test
+fewshot_split: test
+test_split: test
+doc_to_text: "Question: {{question}}\nAnswer:"
+doc_to_target: "{{answer}}"
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: false
+    regexes_to_ignore:
+      - ","
+      - "\\$"
+      - "(?s).*#### "
+      - "\\.$"
+generation_kwargs:
+  until:
+    - "Question:"
+    - "</s>"
+    - "<|end|>"
+  do_sample: false
+  temperature: 0.0
+repeats: 1
+num_fewshot: 5
+filter_list:
+  # Capture the number that follows the "#### " answer marker.
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "#### (\\-?[0-9\\.\\,]+)"
+      - function: "take_first"
+  # Looser pattern: a run of digits/"$"/"."/"," (2+ chars) or a plain integer.
+  - name: "flexible-extract"
+    filter:
+      - function: "regex"
+        group_select: -1
+        regex_pattern: "(-?[$0-9.,]{2,})|(-?[0-9]+)"
+      - function: "take_first"
+metadata:
+  version: 1.0
diff --git a/lm_eval/tasks/gsm_symbolic/gsm_symbolic_cot.yaml b/lm_eval/tasks/gsm_symbolic/gsm_symbolic_cot.yaml
new file mode 100644
index 0000000000..2b2d3cee10
--- /dev/null
+++ b/lm_eval/tasks/gsm_symbolic/gsm_symbolic_cot.yaml
@@ -0,0 +1,51 @@
+# GSM-Symbolic (dataset config "p1") with a "Let's think step by step." prompt cue.
+tag:
+  - math_word_problems
+  - chain_of_thought
+task: gsm_symbolic_cot
+dataset_path: apple/GSM-Symbolic
+dataset_name: p1
+output_type: generate_until
+# NOTE(review): all three splits point at "test", so few-shot examples are drawn
+# from the same split that is evaluated.
+training_split: test
+fewshot_split: test
+test_split: test
+doc_to_text: "Question: {{question}}\nAnswer: Let's think step by step."
+doc_to_target: "{{answer}}"
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: false
+    regexes_to_ignore:
+      - ","
+      - "\\$"
+      - "(?s).*#### "
+      - "\\.$"
+generation_kwargs:
+  until:
+    - "Question:"
+    - "</s>"
+    - "<|end|>"
+  do_sample: false
+  temperature: 0.0
+repeats: 1
+num_fewshot: 5
+filter_list:
+  # Capture the number that follows the "#### " answer marker.
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "#### (\\-?[0-9\\.\\,]+)"
+      - function: "take_first"
+  # Looser pattern: a run of digits/"$"/"."/"," (2+ chars) or a plain integer.
+  - name: "flexible-extract"
+    filter:
+      - function: "regex"
+        group_select: -1
+        regex_pattern: "(-?[$0-9.,]{2,})|(-?[0-9]+)"
+      - function: "take_first"
+metadata:
+  version: 1.0