[evals] added multilingual example and support #260

Merged (5 commits) on Mar 17, 2023
8 changes: 4 additions & 4 deletions evals/data.py
@@ -200,12 +200,12 @@ def default(self, o: Any) -> str:
return _to_py_types(o)


-def jsondumps(o: Any, **kwargs: Any) -> str:
-    return json.dumps(o, cls=EnhancedJSONEncoder, **kwargs)
+def jsondumps(o: Any, ensure_ascii: bool = False, **kwargs: Any) -> str:
+    return json.dumps(o, cls=EnhancedJSONEncoder, ensure_ascii=ensure_ascii, **kwargs)


-def jsondump(o: Any, fp: Any, **kwargs: Any) -> None:
-    json.dump(o, fp, cls=EnhancedJSONEncoder, **kwargs)
+def jsondump(o: Any, fp: Any, ensure_ascii: bool = False, **kwargs: Any) -> None:
+    json.dump(o, fp, cls=EnhancedJSONEncoder, ensure_ascii=ensure_ascii, **kwargs)


def jsonloads(s: str, **kwargs: Any) -> Any:
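A note on the ensure_ascii change above: Python's json module defaults to ensure_ascii=True, which escapes every non-ASCII character, so Japanese samples and report lines come out as unreadable \uXXXX sequences. A minimal standalone sketch of the behavior the new default opts into (plain json here, not the evals wrappers):

import json

sample = {"text": "この文章は面白いですか？"}
print(json.dumps(sample))                      # {"text": "\u3053\u306e\u6587\u7ae0..."} -- escaped, unreadable
print(json.dumps(sample, ensure_ascii=False))  # {"text": "この文章は面白いですか？"} -- readable UTF-8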
52 changes: 29 additions & 23 deletions evals/elsuite/modelgraded/classify.py
@@ -2,6 +2,7 @@
Generic eval that uses a prompt + classification.
"""
import itertools
import logging
import string
from collections import Counter
from random import Random
@@ -25,23 +26,24 @@
"include": lambda x, y: float(x in y),
"exact": lambda x, y: float(x == y),
"endswith": lambda x, y: x.endswith(y),
-}
-CHOICE_FNS = {
-# e.g. "Yes"
-"classify": lambda x: x.strip(),
-# e.g. "Yes\n The reasons are: ..."
-"classify_cot": lambda x: x.strip().split("\n")[0].strip(),
-# e.g. "Let's think step by step. ...\nYes"
-"cot_classify": lambda x: x.strip().split("\n")[-1].strip(),
+"starts_or_endswith": lambda x, y: x.startswith(y) or x.endswith(y),
}

ANSWER_PROMPTS = {
# e.g. "Yes"
"classify": "Answer the question by printing only a single choice from {choices} (without quotes or punctuation) corresponding to the correct answer with no other text.".strip(),
# e.g. "Yes\n The reasons are: ..."
"classify_cot": "First, answer by printing a single choice from {choices} (without quotes or punctuation) corresponding to the correct answer. Then, from the next line, explain your reasonings step by step.".strip(),
# e.g. "Let's think step by step. ...\nYes"
"cot_classify": """
First, write out in a step by step manner your reasoning to be sure that your conclusion is correct. Avoid simply stating the correct answer at the outset. Then print only a single choice from {choices} (without quotes or punctuation) on its own line corresponding to the correct answer. At the end, repeat just the answer by itself on a new line.

Reasoning:""".strip(),
"cot_classify_jp": """
まず、一歩一歩あなたの推論を書き出してください。単に正しい答えを最初に述べることを避けてください。次に、{choices}(引用符や句読点なし)から正しい答えに対応する1つの選択肢を単独の行に書きだしてください。最後に、答えだけを新しい行に繰り返してください。

推論:
""".strip(),
}

EVAL_MODELSPEC = ModelSpec(name="gpt-3.5-turbo", model="gpt-3.5-turbo", is_chat=True)
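The added "cot_classify_jp" prompt is a Japanese rendering of the "cot_classify" instruction above: write out the reasoning step by step, avoid stating the answer up front, then print the single matching choice from {choices} on its own line and repeat the bare answer on a new line. Parsing that final line is what the new, more forgiving "starts_or_endswith" matcher helps with (it becomes the default match_fn further down); a standalone illustration, not repo code:

starts_or_endswith = lambda x, y: x.startswith(y) or x.endswith(y)

print(starts_or_endswith("4", "4"))          # True
print(starts_or_endswith("4. 面白い", "4"))    # True -- a plain endswith() check would miss this
print(starts_or_endswith("答えは 4", "4"))     # True -- endswith() alone would also accept this one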
@@ -52,15 +54,19 @@ def choice_to_str(choice_strings: Iterable[str]) -> str:
return " or ".join(f'"{choice}"' for choice in choice_strings)


-def clean_choice(raw_choice: str, match_fn: Callable, choice_strings: Iterable[str]) -> str:
-    """Clean a choice string to one of choice_strings. Return '__invalid__.' if no match."""
-    raw_choice = raw_choice.strip()
-    raw_choice = "".join(c for c in raw_choice if c not in string.punctuation)
-    if not raw_choice:
-        return INVALID_STR
-    for choice in choice_strings:
-        if match_fn(raw_choice, choice):
-            return choice
+def get_choice(text: str, eval_type: str, match_fn: Callable, choice_strings: Iterable[str]) -> str:
+    """Clean the answer string down to one of choice_strings. Return '__invalid__' if no match."""
+    lines = text.strip().split("\n")
+    if eval_type.startswith("cot_classify"):
+        lines = lines[::-1]  # reverse lines
+    for line in lines:
+        line = line.strip()
+        line = "".join(c for c in line if c not in string.punctuation)
+        if not line:
+            continue
+        for choice in choice_strings:
+            if match_fn(line, choice):
+                return choice
    return INVALID_STR
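get_choice replaces the old per-eval_type choice_fn lambdas: it walks the response line by line (last line first for the "cot_classify*" eval types), strips ASCII punctuation, and returns the first line that match_fn pairs with one of the choice strings, falling back to INVALID_STR. A rough usage sketch with invented grader responses, reusing the definitions from the diff above:

# Hypothetical grader outputs; the string "12345" iterates as ["1", "2", "3", "4", "5"].
match_fn = lambda x, y: x.startswith(y) or x.endswith(y)  # the new "starts_or_endswith" matcher

cot_response = "まず、オチに意外性があるかを考えます。\nある程度は面白いと思います。\n4"
print(get_choice(cot_response, "cot_classify_jp", match_fn, "12345"))  # -> "4" (last line checked first)

direct_response = "3. まあまあ面白い"
print(get_choice(direct_response, "classify", match_fn, "12345"))      # -> "3" (punctuation stripped, startswith match)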


@@ -94,7 +100,7 @@ def __init__(
samples_jsonl: str,
modelgraded_spec_file: str,
*args,
-match_fn: str = "endswith",
+match_fn: str = "starts_or_endswith",
max_tokens: int = 1024,
multicomp_n: int = 1,
multicomp_temperature: float = 0.4,
@@ -135,7 +141,6 @@ def __init__(
# - "classify_cot": answer then reason (explanation)
# if 'eval_type' is not supplied from modelgraded_specs, then it must be supplied as an argument.
# - Importantly, it also assumes the answer prompt needs to be appended to the prompt.
-# 'eval_type' sets 'choice_fn', a function that takes the model's raw response and returns the choice string
self.eval_type = modelgraded_specs.pop("eval_type", None)
if not self.eval_type:
append_answer_prompt = True # append answer prompt to prompt
@@ -148,8 +153,6 @@ def __init__(
not eval_type
), f"eval_type must be unspecified, if it is specified in modelgraded_spec_file"
append_answer_prompt = False
-assert self.eval_type in CHOICE_FNS, f"eval_type must be one of {list(CHOICE_FNS.keys())}"
-self.choice_fn = CHOICE_FNS[self.eval_type]

# 'prompt' is a string that specifies the model-graded evaluation
prompt = modelgraded_specs.pop("prompt")
@@ -259,8 +262,11 @@ def eval_sample(self, test_sample: dict, rng: Random) -> None:
for metric, args in args_dict.items():
args = {k: v[1] for k, v in args.items()}
evaluation, _ = evaluate(**args, **eval_kwargs)
-raw_choice = self.choice_fn(evaluation)
-choice = clean_choice(raw_choice, self.match_fn, self.choice_strings)
+choice = get_choice(evaluation, self.eval_type, self.match_fn, self.choice_strings)
+if choice == INVALID_STR:
+    logging.warn(
+        f"Choices {self.choice_strings} not parsable for {self.eval_type}: {evaluation}"
+    )
metrics[metric] = choice
if self.metaeval:
assert (
24 changes: 12 additions & 12 deletions evals/record.py
@@ -292,8 +292,8 @@ def __init__(self, log_path: Optional[str], run_spec: RunSpec):
super().__init__(run_spec)
self.event_file_path = log_path
if log_path is not None:
-with bf.BlobFile(log_path, "w") as f:
-    f.write(jsondumps({"spec": dataclasses.asdict(run_spec)}) + "\n")
+with bf.BlobFile(log_path, "wb") as f:
+    f.write((jsondumps({"spec": dataclasses.asdict(run_spec)}) + "\n").encode("utf-8"))

def _flush_events_internal(self, events_to_write: Sequence[Event]):
start = time.time()
@@ -303,8 +303,8 @@ def _flush_events_internal(self, events_to_write: Sequence[Event]):
logger.error(f"Failed to serialize events: {events_to_write}")
raise e

-with bf.BlobFile(self.event_file_path, "a") as f:
-    f.writelines(lines)
+with bf.BlobFile(self.event_file_path, "ab") as f:
+    f.write(b"".join([l.encode("utf-8") for l in lines]))

logger.info(
f"Logged {len(lines)} rows of events to {self.event_file_path}: insert_time={t(time.time()-start)}"
@@ -314,8 +314,8 @@ def _flush_events_internal(self, events_to_write: Sequence[Event]):
self._flushes_done += 1

def record_final_report(self, final_report: Any):
-with bf.BlobFile(self.event_file_path, "a") as f:
-    f.write(jsondumps({"final_report": final_report}) + "\n")
+with bf.BlobFile(self.event_file_path, "ab") as f:
+    f.write((jsondumps({"final_report": final_report}) + "\n").encode("utf-8"))

logging.info(f"Final report: {final_report}. Logged to {self.event_file_path}")
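These recorder changes pair with ensure_ascii=False above: event lines can now contain raw Japanese text, so the log files are opened in binary mode and encoded to UTF-8 explicitly instead of relying on whatever default encoding a text-mode stream would pick. A minimal sketch of the write pattern (plain open() here instead of bf.BlobFile, invented event data):

import json

event = {"data": {"choice": "4", "prompt": "以下の文章って面白い？"}}  # invented example event
line = json.dumps(event, ensure_ascii=False) + "\n"
with open("events.jsonl", "ab") as f:  # binary append, like BlobFile(path, "ab")
    f.write(line.encode("utf-8"))      # encode explicitly so the non-ASCII payload lands as UTF-8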

@@ -341,8 +341,8 @@ def __init__(
self._conn = snowflake_connection

if log_path is not None:
-with bf.BlobFile(log_path, "w") as f:
-    f.write(jsondumps({"spec": dataclasses.asdict(run_spec)}) + "\n")
+with bf.BlobFile(log_path, "wb") as f:
+    f.write((jsondumps({"spec": dataclasses.asdict(run_spec)}) + "\n").encode("utf-8"))

query = """
INSERT ALL INTO runs (run_id, model_name, eval_name, base_eval, split, run_config, settings, created_by, created_at)
@@ -407,15 +407,15 @@ def _flush_events_internal(self, events_to_write: Sequence[Event]):
)
idx_l = idx_r

-with bf.BlobFile(self.event_file_path, "a") as f:
-    f.writelines(lines)
+with bf.BlobFile(self.event_file_path, "ab") as f:
+    f.write(b"".join([l.encode("utf-8") for l in lines]))
self._last_flush_time = time.time()
self._flushes_done += 1

def record_final_report(self, final_report: Any):
with self._writing_lock:
-with bf.BlobFile(self.event_file_path, "a") as f:
-    f.write(jsondumps({"final_report": final_report}) + "\n")
+with bf.BlobFile(self.event_file_path, "ab") as f:
+    f.write((jsondumps({"final_report": final_report}) + "\n").encode("utf-8"))
query = """
UPDATE runs
SET final_report = PARSE_JSON(%(final_report)s)
3 changes: 3 additions & 0 deletions evals/registry/data/test_modelgraded/humor_people_jp.jsonl
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
2 changes: 1 addition & 1 deletion evals/registry/eval_sets/test-all.yaml
@@ -15,4 +15,4 @@ test:
- joke-fruits-expl-meta
- diversity
- joke-animals-vs-fruits
-- rap-people-vs-people
+- rap-people-vs-people
(the visible text is identical; the change most likely just adds the missing newline at the end of the file)
1 change: 1 addition & 0 deletions evals/registry/eval_sets/test-modelgraded.yaml
@@ -12,3 +12,4 @@ test-modelgraded:
- rap-people-vs-people
- rap-animals-vs-fruits
- rap-people-vs-fruits
- mg-humor-people_jp
1 change: 1 addition & 0 deletions evals/registry/evals/test-modelgraded-battle.yaml
@@ -39,3 +39,4 @@ rap-people-vs-fruits.dev.v0:
samples_jsonl: test_multiio/battles/rap_people_vs_fruits.jsonl
eval_type: cot_classify
modelgraded_spec_file: battle

12 changes: 12 additions & 0 deletions evals/registry/evals/test-modelgraded-generated.yaml
@@ -0,0 +1,12 @@
# This file is generated by modelgraded_generator.py

mg-humor-people_jp:
id: mg-humor-people_jp.dev.v0
metrics: [accuracy]
mg-humor-people_jp.dev.v0:
class: evals.elsuite.modelgraded.classify:ModelBasedClassify
args:
samples_jsonl: test_modelgraded/humor_people_jp.jsonl
eval_type: cot_classify_jp
modelgraded_spec_file: humor_jp

24 changes: 24 additions & 0 deletions evals/registry/modelgraded/humor_jp.yaml
@@ -0,0 +1,24 @@
prompt: |-
以下の文章って面白い?

(始まり)

{completion}

(終わり)

5段階の中から選んでください。

1. 退屈 - このレベルの面白さはほとんど感じられず、退屈で時間が過ぎるのが遅いと感じるでしょう。興味を惹かれる要素がほとんどなく、全体的に魅力に欠けます。

2. やや面白い - このレベルでは、若干の面白さが感じられますが、それでも十分ではありません。物語性やコンセプトには一定の魅力があるものの、期待外れであり、時間を過ごすにはまだ物足りないかもしれません。

3. まあまあ面白い - このレベルの面白さは、平均的で満足できる範囲です。ある程度の興味深い要素やストーリーがあり、時間を過ごすのに適していますが、特別印象に残るものではないかもしれません。

4. 面白い - このレベルでは、かなりの面白さが感じられ、魅力的なストーリーやキャラクターが存在します。多くの人が楽しめる内容であり、興味を持続させる要素が豊富に含まれています。ただし、最高の評価には僅かに及ばない部分が残っています。

5. 大変面白い - このレベルの面白さは、非常に優れており、観る者を魅了し、心に強く残る体験ができます。独創的なストーリーや魅力的なキャラクターが際立ち、多くの人が共感や感動を覚えるでしょう。このレベルの面白さは、他のものと比較しても突出していると言えます。
choice_scores: from_strings
choice_strings: "12345"
input_outputs:
input: completion
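Roughly translated, this grading prompt asks "Is the following passage funny?" and has the grader pick one of five levels, from 1 (boring) to 5 (very funny), for the text substituted into {completion}. choice_strings "12345" enumerates the allowed answers, and choice_scores: from_strings presumably maps each choice string to its own numeric value when results are aggregated; a hypothetical sketch of that reading (the real mapping lives in the evals modelgraded machinery):

choice_strings = list("12345")                         # ["1", "2", "3", "4", "5"]
choice_scores = {c: float(c) for c in choice_strings}  # assumed "from_strings" behavior: "4" scores as 4.0
print(choice_scores["4"])                              # 4.0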
24 changes: 5 additions & 19 deletions scripts/battle_generator.py
@@ -5,9 +5,7 @@
import string
from typing import Any


-def text_prompt_to_chat_prompt(text: str) -> list[dict[str, Any]]:
-    return [{"role": "system", "content": text}]
+REGISTRY_PATH = os.path.join(os.path.dirname(__file__), "../evals/registry")


def format(template: str, **kwargs: dict[str, str]) -> str:
@@ -18,18 +16,6 @@ def format(template: str, **kwargs: dict[str, str]) -> str:
return template.format(**cur_keys)


-def get_yaml_dir() -> str:
-    d = os.path.join(os.path.dirname(__file__), "../registry/evals")
-    return d
-
-
-def get_data_dir() -> str:
-    d = os.path.join(os.path.dirname(__file__), "../registry/data/test_multiio/battles")
-    if not os.path.exists(d):
-        os.makedirs(d)
-    return d


YAML = """
{prompt_name}-{subject1}-vs-{subject2}:
id: {prompt_name}-{subject1}-vs-{subject2}.dev.v0
@@ -62,14 +48,14 @@
("rap", "people", "fruits"),
]

-data_dir = get_data_dir()
+data_dir = f"{REGISTRY_PATH}/data/test_multiio/battles"
yaml_str = f"# This file is generated by {os.path.basename(__file__)}\n\n"
for prompt_name, subject1, subject2 in target_sets:
prompt = prompts[prompt_name]
samples = [
{
"input1": text_prompt_to_chat_prompt(format(prompt, self=s1, other=s2)),
"input2": text_prompt_to_chat_prompt(format(prompt, self=s2, other=s1)),
"input1": format(prompt, self=s1, other=s2),
"input2": format(prompt, self=s2, other=s1),
}
for s1 in subjects[subject1]
for s2 in subjects[subject2]
@@ -83,7 +69,7 @@ def get_data_dir() -> str:
yaml_str += YAML.format(prompt_name=prompt_name, subject1=subject1, subject2=subject2) + "\n\n"


yaml_file = f"{get_yaml_dir()}/test-modelgraded-battle.yaml"
yaml_file = f"{REGISTRY_PATH}/evals/test-modelgraded-battle.yaml"
with open(yaml_file, "w") as f:
f.write(yaml_str)
print(f"wrote {yaml_file}")