forked from EleutherAI/lm-evaluation-harness
-
Notifications
You must be signed in to change notification settings - Fork 10
/
math_sat_cot.py
210 lines (169 loc) · 7.36 KB
/
math_sat_cot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
"""
SAT Math May 2023 questions that do not have figures.
We use the version of the dataset found in the Huggingface dataset `mcaleste/sat_multiple_choice_math_may_23`.
Our prompt is taken from appendix G of Lewkowycz et al. (2022).
"""
from lm_eval.base import Task, rf
from lm_eval.metrics import mean
from lm_eval.mixins import MajorityVotingMixin
import re
# BibTeX entry for Lewkowycz et al. (2022), the paper the module docstring
# credits as the source of the few-shot prompt. Not referenced by any other
# code visible in this file.
_CITATION = """
@misc{lewkowycz2022solving,
title={Solving Quantitative Reasoning Problems with Language Models},
author={Aitor Lewkowycz and Anders Andreassen and David Dohan and Ethan Dyer and Henryk Michalewski and Vinay Ramasesh and Ambrose Slone and Cem Anil and Imanol Schlag and Theo Gutman-Solo and Yuhuai Wu and Behnam Neyshabur and Guy Gur-Ari and Vedant Misra},
year={2022},
eprint={2206.14858},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""
# Fixed few-shot prompt: four worked multiple-choice problems, each closing
# with "Final Answer: The final answer is (<letter>). I hope it is correct."
# MathSATCoT._process_doc prepends this text to every query it builds.
# NOTE(review): this is a raw string, so a backslash at the end of a line is
# NOT a line continuation -- both the backslash and the newline stay in the
# prompt text. Confirm that is intended, in particular where a LaTeX command
# appears to be split across two lines (the "ge 0$" line below).
MCQA_PROMPT = r"""Problem:
Find the domain of the expression $\frac{\sqrt{x-2}}{\sqrt{5-x}}$.
What of the following is the right choice? Explain you answer.
(A) [-5,-2), (B) [2,5), (C) [-2,-5), (D) [5,2)
Solution:
The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \
ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$.
Therefore, the domain of the expression is $\boxed{[2,5)}$.
Final Answer: The final answer is (B). I hope it is correct.
Problem:
If $\det \mathbf{A} = 2$ and $\det \mathbf{B} = 12,$ then find $\det (\mathbf{A} \mathbf{B}).$
What of the following is the right choice? Explain you answer.
(A) 14, (B) 4, (C) 2, (D) 24
Solution:
We have that $\det (\mathbf{A} \mathbf{B}) = (\det \mathbf{A})(\det \mathbf{B}) = (2)(12) = \boxed{24}.$
Final Answer: The final answer is (D). I hope it is correct.
Problem:
Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times \
must Terrell lift them in order to lift the same total weight?
What of the following is the right choice? Explain you answer.
(A) 12, (B) 20, (C) 16, (D) 15
Solution:
If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. \
If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ \
pounds of weight. Equating this to 480 pounds, we can solve for $n$: \begin{align*}
30n&=480\\
\Rightarrow\qquad n&=480/30=\boxed{16}
\end{align*}
Final Answer: The final answer is (C). I hope it is correct.
Problem:
If the system of equations
\begin{align*}
6x-4y&=a,\\
6y-9x &=b.
\end{align*}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\frac{a}{b},$ assuming $b$ is
nonzero.
What of the following is the right choice? Explain you answer.
(A) $-\frac{2}{3}$, (B) $\frac{2}{3}$, (C) $\frac{1}{3}$, (D) $\frac{4}{9}$
Solution:
If we multiply the first equation by $-\frac{3}{2}$, we obtain
$$6y-9x=-\frac{3}{2}a.$$Since we also know that $6y-9x=b$, we have
$$-\frac{3}{2}a=b\Rightarrow\frac{a}{b}=\boxed{-\frac{2}{3}}.$$
Final Answer: The final answer is (A). I hope it is correct."""
class MathSATCoT(MajorityVotingMixin, Task):
    """Chain-of-thought evaluation on SAT multiple-choice math questions.

    Every query is the fixed few-shot ``MCQA_PROMPT`` followed by the
    question and its four choices. A completion is scored by pulling the
    parenthesised letter out of its closing "Final Answer" sentence and
    comparing it with the gold letter stored on the document.
    """

    VERSION = 0
    DATASET_PATH = "mcaleste/sat_multiple_choice_math_may_23"
    DATASET_NAME = None

    # Group 1 captures the parenthesised letter, e.g. "(B)". The periods are
    # escaped so that only the literal sentence used in the prompt matches.
    ANS_RE = re.compile(
        r"Final Answer: The final answer is (\([ABCD]\))\. I hope it is correct\."
    )
    # Sentinel returned when a completion contains no parseable final answer.
    INVALID_ANS = "[not found]"

    def __init__(self):
        print("WARNING: math_sat_cot ignores --num-fewshot argument and uses a fixed prompt")
        super().__init__()

    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return False

    def has_test_docs(self):
        return True

    def validation_docs(self):
        """Return processed docs; reads the same "train" split as test_docs."""
        return map(self._process_doc, self.dataset["train"])

    def test_docs(self):
        """Return processed docs built from the dataset's "train" split."""
        # NOTE(review): presumably "train" is the only split the dataset
        # ships -- confirm against the dataset card.
        return map(self._process_doc, self.dataset["train"])

    def fewshot_context(
        self, doc, num_fewshot, provide_description=None, rnd=None, description=None
    ):
        """Return the pre-built query; every few-shot argument is ignored.

        The few-shot examples are already baked into ``doc["query"]`` by
        ``_process_doc``, so ``num_fewshot`` has no effect.
        """
        return doc["query"]

    def _process_doc(self, doc):
        """Convert a raw dataset row into the dict the task works with.

        Reads ``doc["Question"]``, ``doc["Possible Answers"]`` and
        ``doc["Answer"]``.

        Returns:
            A dict with ``query`` (full prompt text), ``choices`` (the raw
            possible answers) and ``gold`` (the answer wrapped in
            parentheses, matching what ``_extract_answer`` returns).
        """

        def format_example(doc, keys):
            """Build the full prompt for one question.

            Layout produced after the few-shot prompt:

            Problem:
            <question>
            What of the following is the right choice? Explain you answer.
            A <choice1>, B <choice2>, C <choice3>, D <choice4>
            Solution:
            """
            # NOTE(review): labels are emitted as "A <choice>" while the
            # few-shot examples in MCQA_PROMPT use "(A) <choice>". Left
            # unchanged because altering the prompt changes evaluation
            # results; confirm whether the dataset's choice strings already
            # carry their own labelling.
            prompt = MCQA_PROMPT + "\n\n" + "Problem:\n" + doc["Question"] + "\nWhat of the following is the right choice? Explain you answer.\n"
            prompt += ", ".join(
                [f"{key} {choice}" for key, choice in zip(keys, doc["Possible Answers"])]
            )
            prompt += "\nSolution:"
            return prompt

        keys = ["A", "B", "C", "D"]
        return {
            "query": format_example(doc, keys),
            "choices": doc["Possible Answers"],
            "gold": "(" + doc["Answer"] + ")",
        }

    def doc_to_text(self, doc):
        return doc["query"]

    @property
    def end_seq(self):
        # A blank line or the start of another "Problem:" block; presumably
        # consumed by the harness as generation stop strings -- verify.
        return ["\n\n", "Problem:"]

    def process_results(self, doc, results, params=None):
        """Score the model output for one document.

        Args:
            doc: Processed document; only ``doc["gold"]`` is read.
            results: Sequence whose first element is a single completion
                string (no params) or an iterable of completion strings
                (majority voting).
            params: Empty/``None`` for single-sample scoring, or a dict
                containing ``self.MAJORITY_VOTING``.

        Returns:
            A dict with ``acc``, ``pass_rate`` and ``metadata`` (selected
            answer, raw candidates, plus ``votes`` under majority voting).

        Raises:
            AssertionError: If ``params`` is not a dict, or is non-empty
                without the majority-voting key.
        """
        if params is None:
            params = {}
        assert isinstance(params, dict)

        candidates = results[0]
        votes = None

        if not params:
            completion = self._extract_answer(candidates)
            acc = self._is_correct(completion, doc["gold"])
            pass_rate = acc
        elif self.MAJORITY_VOTING in params:
            # Extract once per candidate and drop unparseable completions
            # before voting.
            extracted = (self._extract_answer(c) for c in candidates)
            valid_answers = [ans for ans in extracted if ans != self.INVALID_ANS]
            # is_equiv is deliberately not overridden with _is_correct: the
            # voting helper assumes a symmetric equivalence.
            acc, pass_rate, votes = self.majority_vote(
                valid_answers,
                correct_answer=doc["gold"],
            )
            # The first vote entry's first element is taken as the selected
            # answer; with no valid answers fall back to the sentinel.
            completion = votes[0][0] if votes else self.INVALID_ANS
        else:
            raise AssertionError(
                "process_results expects empty params or a majority-voting entry"
            )

        return_dict = {
            "acc": acc,
            "pass_rate": pass_rate,
            "metadata": {
                "selected_answer": completion,
                "candidates": candidates,
            },
        }
        if self.MAJORITY_VOTING in params:
            return_dict["metadata"]["votes"] = votes
        return return_dict

    def _extract_answer(self, completion):
        """Return the parenthesised answer letter, e.g. ``"(B)"``.

        Returns ``self.INVALID_ANS`` when the completion does not contain
        the expected "Final Answer" sentence.
        """
        match = self.ANS_RE.search(completion)
        if match is None:
            return self.INVALID_ANS
        # Use the capture group rather than str.lstrip/rstrip: those treat
        # their argument as a set of characters, not as a prefix/suffix.
        return match.group(1)

    def _is_correct(self, completion, answer):
        """Return True when the extracted answer equals the gold answer."""
        gold = answer
        assert gold != self.INVALID_ANS, "No ground truth answer found in the document."
        return completion == gold

    def fewshot_examples(self, k, rnd):
        # The task always uses the fixed MCQA_PROMPT, so sampling few-shot
        # examples from the dataset is not supported.
        raise NotImplementedError

    def doc_to_target(self, doc):
        raise NotImplementedError("Should not rely on doc_to_target for fixed-prompt math_sat_cot")

    def should_decontaminate(self):
        return True

    def doc_to_decontamination_query(self, doc):
        return doc["query"]

    def aggregation(self):
        return {"acc": mean, "pass_rate": mean}

    def higher_is_better(self):
        return {"acc": True, "pass_rate": True}