[evals] added multilingual example and support
rlbayes committed Mar 16, 2023
1 parent a7fe8e0 commit c4bcb9d
Showing 13 changed files with 302 additions and 56 deletions.
52 changes: 29 additions & 23 deletions evals/elsuite/modelgraded/classify.py
@@ -2,6 +2,7 @@
Generic eval that uses a prompt + classification.
"""
import itertools
import logging
import string
from collections import Counter
from random import Random
@@ -25,23 +26,24 @@
"include": lambda x, y: float(x in y),
"exact": lambda x, y: float(x == y),
"endswith": lambda x, y: x.endswith(y),
}
CHOICE_FNS = {
# e.g. "Yes"
"classify": lambda x: x.strip(),
# e.g. "Yes\n The reasons are: ..."
"classify_cot": lambda x: x.strip().split("\n")[0].strip(),
# e.g. "Let's think step by step. ...\nYes"
"cot_classify": lambda x: x.strip().split("\n")[-1].strip(),
"starts_or_endswith": lambda x, y: x.startswith(y) or x.endswith(y),
}
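Note (illustrative sketch, not part of the diff): the new "starts_or_endswith" matcher accepts a choice that appears at either end of a cleaned line, which is more forgiving than the old "endswith" default. A minimal demonstration of the lambda's behavior:

# Sketch: exercising the "starts_or_endswith" matcher defined above.
fn = lambda x, y: x.startswith(y) or x.endswith(y)
assert fn("Yes, definitely", "Yes")         # prefix match
assert fn("The answer is Yes", "Yes")       # suffix match
assert not fn("Maybe yes, maybe no", "Yes") # neither end; matching is case-sensitive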

ANSWER_PROMPTS = {
# e.g. "Yes"
"classify": "Answer the question by printing only a single choice from {choices} (without quotes or punctuation) corresponding to the correct answer with no other text.".strip(),
# e.g. "Yes\n The reasons are: ..."
"classify_cot": "First, answer by printing a single choice from {choices} (without quotes or punctuation) corresponding to the correct answer. Then, from the next line, explain your reasonings step by step.".strip(),
# e.g. "Let's think step by step. ...\nYes"
"cot_classify": """
First, write out in a step by step manner your reasoning to be sure that your conclusion is correct. Avoid simply stating the correct answer at the outset. Then print only a single choice from {choices} (without quotes or punctuation) on its own line corresponding to the correct answer. At the end, repeat just the answer by itself on a new line.
Reasoning:""".strip(),
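# Japanese variant of "cot_classify"; the prompt below says, roughly:
# "First, write out your reasoning step by step. Avoid simply stating the
# correct answer at the outset. Then print a single choice from {choices}
# (without quotes or punctuation) on its own line, corresponding to the
# correct answer. Finally, repeat just the answer by itself on a new line.
# Reasoning:"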
"cot_classify_jp": """
まず、一歩一歩あなたの推論を書き出してください。単に正しい答えを最初に述べることを避けてください。次に、{choices}(引用符や句読点なし)から正しい答えに対応する1つの選択肢を単独の行に印刷します。最後に、答えだけを新しい行に繰り返してください。
推論:
""".strip(),
}

EVAL_MODELSPEC = ModelSpec(name="gpt-3.5-turbo", model="gpt-3.5-turbo", is_chat=True)
@@ -52,15 +54,19 @@ def choice_to_str(choice_strings: Iterable[str]) -> str:
return " or ".join(f'"{choice}"' for choice in choice_strings)


def clean_choice(raw_choice: str, match_fn: Callable, choice_strings: Iterable[str]) -> str:
"""Clean a choice string to one of choice_strings. Return '__invalid__.' if no match."""
raw_choice = raw_choice.strip()
raw_choice = "".join(c for c in raw_choice if c not in string.punctuation)
if not raw_choice:
return INVALID_STR
for choice in choice_strings:
if match_fn(raw_choice, choice):
return choice
def get_choice(text: str, eval_type: str, match_fn: Callable, choice_strings: Iterable[str]) -> str:
"""Clean the answer string to a choice string to one of choice_strings. Return '__invalid__.' if no match."""
lines = text.strip().split("\n")
if eval_type.startswith("cot_classify"):
lines = lines[::-1] # reverse lines
for line in lines:
line = line.strip()
line = "".join(c for c in line if c not in string.punctuation)
if not line:
continue
for choice in choice_strings:
if match_fn(line, choice):
return choice
return INVALID_STR
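Illustrative sketch (not part of the diff) of the new parsing: for cot_* eval types the lines are scanned in reverse, so the final standalone answer is picked up even after a long chain of thought, and anything unmatched falls through to INVALID_STR:

match_fn = lambda x, y: x.startswith(y) or x.endswith(y)  # "starts_or_endswith"
text = "Let's think step by step.\nThe joke has a clear punchline.\nYes"
get_choice(text, "cot_classify", match_fn, ["Yes", "No"])  # -> "Yes"
# Case-sensitive: lowercase "no" does not match "No", so this is invalid.
get_choice("no idea...", "classify", match_fn, ["Yes", "No"])  # -> INVALID_STR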


@@ -94,7 +100,7 @@ def __init__(
samples_jsonl: str,
modelgraded_spec_file: str,
*args,
match_fn: str = "endswith",
match_fn: str = "starts_or_endswith",
max_tokens: int = 1024,
multicomp_n: int = 1,
multicomp_temperature: float = 0.4,
Expand Down Expand Up @@ -135,7 +141,6 @@ def __init__(
# - "classify_cot": answer then reason (explanation)
# if 'eval_type' is not supplied from modelgraded_specs, then it must be supplied as an argument.
# - Importantly, it also assumes the answer prompt needs to be appended to the prompt.
# 'eval_type' sets 'choice_fn', a function that takes the model's raw response and returns the choice string
self.eval_type = modelgraded_specs.pop("eval_type", None)
if not self.eval_type:
append_answer_prompt = True # append answer prompt to prompt
@@ -148,8 +153,6 @@
not eval_type
), f"eval_type must be unspecified, if it is specified in modelgraded_spec_file"
append_answer_prompt = False
assert self.eval_type in CHOICE_FNS, f"eval_type must be one of {list(CHOICE_FNS.keys())}"
self.choice_fn = CHOICE_FNS[self.eval_type]

# 'prompt' is a string that specifies the model-graded evaluation
prompt = modelgraded_specs.pop("prompt")
@@ -259,8 +262,11 @@ def eval_sample(self, test_sample: dict, rng: Random) -> None:
for metric, args in args_dict.items():
args = {k: v[1] for k, v in args.items()}
evaluation, _ = evaluate(**args, **eval_kwargs)
raw_choice = self.choice_fn(evaluation)
choice = clean_choice(raw_choice, self.match_fn, self.choice_strings)
choice = get_choice(evaluation, self.eval_type, self.match_fn, self.choice_strings)
if choice == INVALID_STR:
logging.warn(
f"Choices {self.choice_strings} not parsable for {self.eval_type}: {evaluation}"
)
metrics[metric] = choice
if self.metaeval:
assert (
14 changes: 9 additions & 5 deletions evals/record.py
@@ -292,19 +292,23 @@ def __init__(self, log_path: Optional[str], run_spec: RunSpec):
super().__init__(run_spec)
self.event_file_path = log_path
if log_path is not None:
with bf.BlobFile(log_path, "w") as f:
f.write(jsondumps({"spec": dataclasses.asdict(run_spec)}) + "\n")
with bf.BlobFile(log_path, "wb") as f:
f.write(
(
jsondumps({"spec": dataclasses.asdict(run_spec)}, ensure_ascii=False) + "\n"
).encode("utf-8")
)

def _flush_events_internal(self, events_to_write: Sequence[Event]):
start = time.time()
try:
lines = [jsondumps(event) + "\n" for event in events_to_write]
lines = [jsondumps(event, ensure_ascii=False) + "\n" for event in events_to_write]
except TypeError as e:
logger.error(f"Failed to serialize events: {events_to_write}")
raise e

with bf.BlobFile(self.event_file_path, "a") as f:
f.writelines(lines)
with bf.BlobFile(self.event_file_path, "ab") as f:
f.write(b"".join([l.encode("utf-8") for l in lines]))

logger.info(
f"Logged {len(lines)} rows of events to {self.event_file_path}: insert_time={t(time.time()-start)}"
3 changes: 3 additions & 0 deletions evals/registry/data/test_modelgraded/humor_people_jp.jsonl
Git LFS file not shown
(4 more changed data files stored in Git LFS not shown)
2 changes: 1 addition & 1 deletion evals/registry/eval_sets/test-all.yaml
@@ -15,4 +15,4 @@ test:
- joke-fruits-expl-meta
- diversity
- joke-animals-vs-fruits
- rap-people-vs-people
- rap-people-vs-people
1 change: 1 addition & 0 deletions evals/registry/evals/test-modelgraded-battle.yaml
@@ -39,3 +39,4 @@ rap-people-vs-fruits.dev.v0:
samples_jsonl: test_multiio/battles/rap_people_vs_fruits.jsonl
eval_type: cot_classify
modelgraded_spec_file: battle

12 changes: 12 additions & 0 deletions evals/registry/evals/test-modelgraded-generated.yaml
@@ -0,0 +1,12 @@
# This file is generated by modelgraded_generator.py

mg-humor-people_jp:
id: mg-humor-people_jp.dev.v0
metrics: [accuracy]
mg-humor-people_jp.dev.v0:
class: evals.elsuite.modelgraded.classify:ModelBasedClassify
args:
samples_jsonl: test_modelgraded/humor_people_jp.jsonl
eval_type: cot_classify_jp
modelgraded_spec_file: humor_jp
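Once registered, the new eval should be runnable by name through the usual CLI, e.g. oaieval gpt-3.5-turbo mg-humor-people_jp (command shape assumed from the repo's standard workflow).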

24 changes: 24 additions & 0 deletions evals/registry/modelgraded/humor_jp.yaml
@@ -0,0 +1,24 @@
prompt: |-
以下の文章って面白い?
(始まり)
{completion}
(終わり)
5段階の中から選んでください。
1. 退屈 - このレベルの面白さはほとんど感じられず、退屈で時間が過ぎるのが遅いと感じるでしょう。興味を惹かれる要素がほとんどなく、全体的に魅力に欠けます。
2. やや面白い - このレベルでは、若干の面白さが感じられますが、それでも十分ではありません。物語性やコンセプトには一定の魅力があるものの、期待外れであり、時間を過ごすにはまだ物足りないかもしれません。
3. まあまあ面白い - このレベルの面白さは、平均的で満足できる範囲です。ある程度の興味深い要素やストーリーがあり、時間を過ごすのに適していますが、特別印象に残るものではないかもしれません。
4. 面白い - このレベルでは、かなりの面白さが感じられ、魅力的なストーリーやキャラクターが存在します。多くの人が楽しめる内容であり、興味を持続させる要素が豊富に含まれています。ただし、最高の評価には僅かに及ばない部分が残っています。
5. 大変面白い - このレベルの面白さは、非常に優れており、観る者を魅了し、心に強く残る体験ができます。独創的なストーリーや魅力的なキャラクターが際立ち、多くの人が共感や感動を覚えるでしょう。このレベルの面白さは、他のものと比較しても突出していると言えます。
choice_scores: from_strings
choice_strings: "12345"
input_outputs:
input: completion
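For non-Japanese readers, the prompt above roughly reads: "Is the following passage funny? (begin) {completion} (end) Choose one of 5 levels: 1. Boring: little to no humor, dull, lacking appeal. 2. Slightly funny: some appeal in the story or concept, but underwhelming. 3. Moderately funny: average and satisfactory, though not especially memorable. 4. Funny: engaging story and characters that sustain interest, just short of the top rating. 5. Very funny: outstanding, captivating, and emotionally resonant compared to the rest." The grader answers with one of the choice_strings "1" through "5", scored directly via choice_scores: from_strings.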
24 changes: 5 additions & 19 deletions scripts/battle_generator.py
@@ -5,9 +5,7 @@
import string
from typing import Any


def text_prompt_to_chat_prompt(text: str) -> list[dict[str, Any]]:
return [{"role": "system", "content": text}]
REGISTRY_PATH = os.path.join(os.path.dirname(__file__), "../evals/registry")


def format(template: str, **kwargs: dict[str, str]) -> str:
@@ -18,18 +16,6 @@ def format(template: str, **kwargs: dict[str, str]) -> str:
return template.format(**cur_keys)
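Hypothetical usage of the format helper above (its full body is elided in this view); it substitutes only the placeholders present in the template, and the prompt wording here is assumed for illustration:

format("write a rap battle between {self} and {other}", self="a cat", other="a banana")
# -> "write a rap battle between a cat and a banana"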


def get_yaml_dir() -> str:
d = os.path.join(os.path.dirname(__file__), "../registry/evals")
return d


def get_data_dir() -> str:
d = os.path.join(os.path.dirname(__file__), "../registry/data/test_multiio/battles")
if not os.path.exists(d):
os.makedirs(d)
return d


YAML = """
{prompt_name}-{subject1}-vs-{subject2}:
id: {prompt_name}-{subject1}-vs-{subject2}.dev.v0
@@ -62,14 +48,14 @@ def get_data_dir() -> str:
("rap", "people", "fruits"),
]

data_dir = get_data_dir()
data_dir = f"{REGISTRY_PATH}/data/test_multiio/battles"
yaml_str = f"# This file is generated by {os.path.basename(__file__)}\n\n"
for prompt_name, subject1, subject2 in target_sets:
prompt = prompts[prompt_name]
samples = [
{
"input1": text_prompt_to_chat_prompt(format(prompt, self=s1, other=s2)),
"input2": text_prompt_to_chat_prompt(format(prompt, self=s2, other=s1)),
"input1": format(prompt, self=s1, other=s2),
"input2": format(prompt, self=s2, other=s1),
}
for s1 in subjects[subject1]
for s2 in subjects[subject2]
@@ -83,7 +69,7 @@ def get_data_dir() -> str:
yaml_str += YAML.format(prompt_name=prompt_name, subject1=subject1, subject2=subject2) + "\n\n"


yaml_file = f"{get_yaml_dir()}/test-modelgraded-battle.yaml"
yaml_file = f"{REGISTRY_PATH}/evals/test-modelgraded-battle.yaml"
with open(yaml_file, "w") as f:
f.write(yaml_str)
print(f"wrote {yaml_file}")
(1 more changed file not shown)
