[evals] added multilingual example and support
rlbayes committed Mar 16, 2023
1 parent a7fe8e0 commit c4bcb9d
Showing 13 changed files with 302 additions and 56 deletions.
52 changes: 29 additions & 23 deletions evals/elsuite/modelgraded/classify.py
@@ -2,6 +2,7 @@
Generic eval that uses a prompt + classification.
"""
import itertools
import logging
import string
from collections import Counter
from random import Random
@@ -25,23 +26,24 @@
"include": lambda x, y: float(x in y),
"exact": lambda x, y: float(x == y),
"endswith": lambda x, y: x.endswith(y),
}
CHOICE_FNS = {
# e.g. "Yes"
"classify": lambda x: x.strip(),
# e.g. "Yes\n The reasons are: ..."
"classify_cot": lambda x: x.strip().split("\n")[0].strip(),
# e.g. "Let's think step by step. ...\nYes"
"cot_classify": lambda x: x.strip().split("\n")[-1].strip(),
"starts_or_endswith": lambda x, y: x.startswith(y) or x.endswith(y),
}
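Note (illustrative sketch, not part of the diff): the new "starts_or_endswith" matcher accepts a choice that appears at either end of a cleaned line, which is more forgiving than the old "endswith" default. A minimal demonstration of the lambda's behavior:

# Sketch: exercising the "starts_or_endswith" matcher defined above.
fn = lambda x, y: x.startswith(y) or x.endswith(y)
assert fn("Yes, definitely", "Yes")         # prefix match
assert fn("The answer is Yes", "Yes")       # suffix match
assert not fn("Maybe yes, maybe no", "Yes") # neither end; matching is case-sensitive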

ANSWER_PROMPTS = {
# e.g. "Yes"
"classify": "Answer the question by printing only a single choice from {choices} (without quotes or punctuation) corresponding to the correct answer with no other text.".strip(),
# e.g. "Yes\n The reasons are: ..."
"classify_cot": "First, answer by printing a single choice from {choices} (without quotes or punctuation) corresponding to the correct answer. Then, from the next line, explain your reasonings step by step.".strip(),
# e.g. "Let's think step by step. ...\nYes"
"cot_classify": """
First, write out in a step by step manner your reasoning to be sure that your conclusion is correct. Avoid simply stating the correct answer at the outset. Then print only a single choice from {choices} (without quotes or punctuation) on its own line corresponding to the correct answer. At the end, repeat just the answer by itself on a new line.
Reasoning:""".strip(),
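# Japanese variant of "cot_classify"; the prompt below says, roughly:
# "First, write out your reasoning step by step. Avoid simply stating the
# correct answer at the outset. Then print a single choice from {choices}
# (without quotes or punctuation) on its own line, corresponding to the
# correct answer. Finally, repeat just the answer by itself on a new line.
# Reasoning:"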
"cot_classify_jp": """
まず、一歩一歩あなたの推論を書き出してください。単に正しい答えを最初に述べることを避けてください。次に、{choices}(引用符や句読点なし)から正しい答えに対応する1つの選択肢を単独の行に印刷します。最後に、答えだけを新しい行に繰り返してください。
推論:
""".strip(),
}

EVAL_MODELSPEC = ModelSpec(name="gpt-3.5-turbo", model="gpt-3.5-turbo", is_chat=True)
@@ -52,15 +54,19 @@ def choice_to_str(choice_strings: Iterable[str]) -> str:
return " or ".join(f'"{choice}"' for choice in choice_strings)


def clean_choice(raw_choice: str, match_fn: Callable, choice_strings: Iterable[str]) -> str:
"""Clean a choice string to one of choice_strings. Return '__invalid__.' if no match."""
raw_choice = raw_choice.strip()
raw_choice = "".join(c for c in raw_choice if c not in string.punctuation)
if not raw_choice:
return INVALID_STR
for choice in choice_strings:
if match_fn(raw_choice, choice):
return choice
def get_choice(text: str, eval_type: str, match_fn: Callable, choice_strings: Iterable[str]) -> str:
"""Clean the answer string to a choice string to one of choice_strings. Return '__invalid__.' if no match."""
lines = text.strip().split("\n")
if eval_type.startswith("cot_classify"):
lines = lines[::-1] # reverse lines
for line in lines:
line = line.strip()
line = "".join(c for c in line if c not in string.punctuation)
if not line:
continue
for choice in choice_strings:
if match_fn(line, choice):
return choice
return INVALID_STR
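Illustrative sketch (not part of the diff) of the new parsing: for cot_* eval types the lines are scanned in reverse, so the final standalone answer is picked up even after a long chain of thought, and anything unmatched falls through to INVALID_STR:

match_fn = lambda x, y: x.startswith(y) or x.endswith(y)  # "starts_or_endswith"
text = "Let's think step by step.\nThe joke has a clear punchline.\nYes"
get_choice(text, "cot_classify", match_fn, ["Yes", "No"])  # -> "Yes"
# Case-sensitive: lowercase "no" does not match "No", so this is invalid.
get_choice("no idea...", "classify", match_fn, ["Yes", "No"])  # -> INVALID_STR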


@@ -94,7 +100,7 @@ def __init__(
samples_jsonl: str,
modelgraded_spec_file: str,
*args,
match_fn: str = "endswith",
match_fn: str = "starts_or_endswith",
max_tokens: int = 1024,
multicomp_n: int = 1,
multicomp_temperature: float = 0.4,
Expand Down Expand Up @@ -135,7 +141,6 @@ def __init__(
# - "classify_cot": answer then reason (explanation)
# if 'eval_type' is not supplied from modelgraded_specs, then it must be supplied as an argument.
# - Importantly, it also assumes the answer prompt needs to be appended to the prompt.
# 'eval_type' sets 'choice_fn', a function that takes the model's raw response and returns the choice string
self.eval_type = modelgraded_specs.pop("eval_type", None)
if not self.eval_type:
append_answer_prompt = True # append answer prompt to prompt
@@ -148,8 +153,6 @@
not eval_type
), f"eval_type must be unspecified, if it is specified in modelgraded_spec_file"
append_answer_prompt = False
assert self.eval_type in CHOICE_FNS, f"eval_type must be one of {list(CHOICE_FNS.keys())}"
self.choice_fn = CHOICE_FNS[self.eval_type]

# 'prompt' is a string that specifies the model-graded evaluation
prompt = modelgraded_specs.pop("prompt")
@@ -259,8 +262,11 @@ def eval_sample(self, test_sample: dict, rng: Random) -> None:
for metric, args in args_dict.items():
args = {k: v[1] for k, v in args.items()}
evaluation, _ = evaluate(**args, **eval_kwargs)
raw_choice = self.choice_fn(evaluation)
choice = clean_choice(raw_choice, self.match_fn, self.choice_strings)
choice = get_choice(evaluation, self.eval_type, self.match_fn, self.choice_strings)
if choice == INVALID_STR:
logging.warn(
f"Choices {self.choice_strings} not parsable for {self.eval_type}: {evaluation}"
)
metrics[metric] = choice
if self.metaeval:
assert (
14 changes: 9 additions & 5 deletions evals/record.py
@@ -292,19 +292,23 @@ def __init__(self, log_path: Optional[str], run_spec: RunSpec):
super().__init__(run_spec)
self.event_file_path = log_path
if log_path is not None:
with bf.BlobFile(log_path, "w") as f:
f.write(jsondumps({"spec": dataclasses.asdict(run_spec)}) + "\n")
with bf.BlobFile(log_path, "wb") as f:
f.write(
(
jsondumps({"spec": dataclasses.asdict(run_spec)}, ensure_ascii=False) + "\n"
).encode("utf-8")
)

def _flush_events_internal(self, events_to_write: Sequence[Event]):
start = time.time()
try:
lines = [jsondumps(event) + "\n" for event in events_to_write]
lines = [jsondumps(event, ensure_ascii=False) + "\n" for event in events_to_write]
except TypeError as e:
logger.error(f"Failed to serialize events: {events_to_write}")
raise e

with bf.BlobFile(self.event_file_path, "a") as f:
f.writelines(lines)
with bf.BlobFile(self.event_file_path, "ab") as f:
f.write(b"".join([l.encode("utf-8") for l in lines]))

logger.info(
f"Logged {len(lines)} rows of events to {self.event_file_path}: insert_time={t(time.time()-start)}"
3 changes: 3 additions & 0 deletions evals/registry/data/test_modelgraded/humor_people_jp.jsonl
Git LFS file not shown
(4 more changed data files stored in Git LFS not shown)
2 changes: 1 addition & 1 deletion evals/registry/eval_sets/test-all.yaml
@@ -15,4 +15,4 @@ test:
- joke-fruits-expl-meta
- diversity
- joke-animals-vs-fruits
- rap-people-vs-people
- rap-people-vs-people
1 change: 1 addition & 0 deletions evals/registry/evals/test-modelgraded-battle.yaml
@@ -39,3 +39,4 @@ rap-people-vs-fruits.dev.v0:
samples_jsonl: test_multiio/battles/rap_people_vs_fruits.jsonl
eval_type: cot_classify
modelgraded_spec_file: battle

12 changes: 12 additions & 0 deletions evals/registry/evals/test-modelgraded-generated.yaml
@@ -0,0 +1,12 @@
# This file is generated by modelgraded_generator.py

mg-humor-people_jp:
id: mg-humor-people_jp.dev.v0
metrics: [accuracy]
mg-humor-people_jp.dev.v0:
class: evals.elsuite.modelgraded.classify:ModelBasedClassify
args:
samples_jsonl: test_modelgraded/humor_people_jp.jsonl
eval_type: cot_classify_jp
modelgraded_spec_file: humor_jp
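Once registered, the new eval should be runnable by name through the usual CLI, e.g. oaieval gpt-3.5-turbo mg-humor-people_jp (command shape assumed from the repo's standard workflow).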

24 changes: 24 additions & 0 deletions evals/registry/modelgraded/humor_jp.yaml
@@ -0,0 +1,24 @@
prompt: |-
以下の文章って面白い?
(始まり)
{completion}
(終わり)
5段階の中から選んでください。
1. 退屈 - このレベルの面白さはほとんど感じられず、退屈で時間が過ぎるのが遅いと感じるでしょう。興味を惹かれる要素がほとんどなく、全体的に魅力に欠けます。
2. やや面白い - このレベルでは、若干の面白さが感じられますが、それでも十分ではありません。物語性やコンセプトには一定の魅力があるものの、期待外れであり、時間を過ごすにはまだ物足りないかもしれません。
3. まあまあ面白い - このレベルの面白さは、平均的で満足できる範囲です。ある程度の興味深い要素やストーリーがあり、時間を過ごすのに適していますが、特別印象に残るものではないかもしれません。
4. 面白い - このレベルでは、かなりの面白さが感じられ、魅力的なストーリーやキャラクターが存在します。多くの人が楽しめる内容であり、興味を持続させる要素が豊富に含まれています。ただし、最高の評価には僅かに及ばない部分が残っています。
5. 大変面白い - このレベルの面白さは、非常に優れており、観る者を魅了し、心に強く残る体験ができます。独創的なストーリーや魅力的なキャラクターが際立ち、多くの人が共感や感動を覚えるでしょう。このレベルの面白さは、他のものと比較しても突出していると言えます。
choice_scores: from_strings
choice_strings: "12345"
input_outputs:
input: completion
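For non-Japanese readers, the prompt above roughly reads: "Is the following passage funny? (begin) {completion} (end) Choose one of 5 levels: 1. Boring: little to no humor, dull, lacking appeal. 2. Slightly funny: some appeal in the story or concept, but underwhelming. 3. Moderately funny: average and satisfactory, though not especially memorable. 4. Funny: engaging story and characters that sustain interest, just short of the top rating. 5. Very funny: outstanding, captivating, and emotionally resonant compared to the rest." The grader answers with one of the choice_strings "1" through "5", scored directly via choice_scores: from_strings.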
24 changes: 5 additions & 19 deletions scripts/battle_generator.py
@@ -5,9 +5,7 @@
import string
from typing import Any


def text_prompt_to_chat_prompt(text: str) -> list[dict[str, Any]]:
return [{"role": "system", "content": text}]
REGISTRY_PATH = os.path.join(os.path.dirname(__file__), "../evals/registry")


def format(template: str, **kwargs: dict[str, str]) -> str:
@@ -18,18 +16,6 @@ def format(template: str, **kwargs: dict[str, str]) -> str:
return template.format(**cur_keys)
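Hypothetical usage of the format helper above (its full body is elided in this view); it substitutes only the placeholders present in the template, and the prompt wording here is assumed for illustration:

format("write a rap battle between {self} and {other}", self="a cat", other="a banana")
# -> "write a rap battle between a cat and a banana"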


def get_yaml_dir() -> str:
d = os.path.join(os.path.dirname(__file__), "../registry/evals")
return d


def get_data_dir() -> str:
d = os.path.join(os.path.dirname(__file__), "../registry/data/test_multiio/battles")
if not os.path.exists(d):
os.makedirs(d)
return d


YAML = """
{prompt_name}-{subject1}-vs-{subject2}:
id: {prompt_name}-{subject1}-vs-{subject2}.dev.v0
@@ -62,14 +48,14 @@ def get_data_dir() -> str:
("rap", "people", "fruits"),
]

data_dir = get_data_dir()
data_dir = f"{REGISTRY_PATH}/data/test_multiio/battles"
yaml_str = f"# This file is generated by {os.path.basename(__file__)}\n\n"
for prompt_name, subject1, subject2 in target_sets:
prompt = prompts[prompt_name]
samples = [
{
"input1": text_prompt_to_chat_prompt(format(prompt, self=s1, other=s2)),
"input2": text_prompt_to_chat_prompt(format(prompt, self=s2, other=s1)),
"input1": format(prompt, self=s1, other=s2),
"input2": format(prompt, self=s2, other=s1),
}
for s1 in subjects[subject1]
for s2 in subjects[subject2]
@@ -83,7 +69,7 @@ def get_data_dir() -> str:
yaml_str += YAML.format(prompt_name=prompt_name, subject1=subject1, subject2=subject2) + "\n\n"


yaml_file = f"{get_yaml_dir()}/test-modelgraded-battle.yaml"
yaml_file = f"{REGISTRY_PATH}/evals/test-modelgraded-battle.yaml"
with open(yaml_file, "w") as f:
f.write(yaml_str)
print(f"wrote {yaml_file}")
(1 more changed file not shown)
