From aae522b0c36a542294e5eb8077f2d23838bc2696 Mon Sep 17 00:00:00 2001
From: Shane Gu <343165+rlbayes@users.noreply.github.com>
Date: Wed, 22 Mar 2023 12:03:28 -0700
Subject: [PATCH] [evals] moved modelgraded specs to registry (#392)

- each Eval now keeps track of "registry"
---
 docs/build-eval.md                           |  2 +-
 evals/cli/oaieval.py                         |  8 ++-
 evals/elsuite/modelgraded/classify.py        | 11 ++--
 evals/elsuite/utils.py                       |  9 ---
 evals/eval.py                                |  9 ++-
 evals/registry.py                            | 16 +++++
 evals/registry/modelgraded/battle.yaml       | 41 ++++++------
 evals/registry/modelgraded/best.yaml         | 21 +++---
 evals/registry/modelgraded/closedqa.yaml     | 51 +++++++--------
 evals/registry/modelgraded/diversity.yaml    | 27 ++++----
 evals/registry/modelgraded/fact.yaml         | 43 +++++++------
 evals/registry/modelgraded/humor.yaml        | 67 ++++++++++++++++----
 evals/registry/modelgraded/humor_jp.yaml     | 24 -------
 evals/registry/modelgraded/humor_likert.yaml | 10 ---
 14 files changed, 181 insertions(+), 158 deletions(-)
 delete mode 100644 evals/registry/modelgraded/humor_jp.yaml
 delete mode 100644 evals/registry/modelgraded/humor_likert.yaml

diff --git a/docs/build-eval.md b/docs/build-eval.md
index ab0212b499..166fea136d 100644
--- a/docs/build-eval.md
+++ b/docs/build-eval.md
@@ -65,7 +65,7 @@ Congratulations, you have built your eval! Keep iterating on it until you are co
 
 We expect that the existing model-graded evals such as `fact`, `closedqa`, and `battle` will fit many use cases. However, other use cases may benefit from more customization, e.g., a different evaluation prompt. For these, there will be a bit more work involved, but generally still no coding required!
 
-1. If you can't use an existing model-graded eval, create a new YAML in `evals/registry/modelgraded` to specify the [parameters](eval-templates.md#parameters-for-model-graded-evals) of your eval. See [`humor.yaml`](../evals/registry/modelgraded/humor.yaml) for an example.
+1. If you can't use an existing model-graded eval, create a new YAML, or add a new entry to an existing YAML, in `evals/registry/modelgraded` to specify the [parameters](eval-templates.md#parameters-for-model-graded-evals) of your eval. See [`humor.yaml`](../evals/registry/modelgraded/humor.yaml) for an example.
    - Note that, even if you are creating a new YAML, you may find it easiest to copy an existing YAML as a starting point. For example, model-graded evals which check a model completion against a rubric can copy `closedqa.yaml` and just edit the `args`.
 2. Next, you will create your dataset and register your eval, as described above. See [`joke_fruits_labeled.jsonl`](../evals/registry/data/test_metaeval/joke_fruits_labeled.jsonl) and [`joke-fruits`](../evals/registry/evals/test-modelgraded.yaml), for example.
    - Note that it is recommended to specify `eval_type` at this step, when you register your eval, rather than step 1.
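
Note: with this patch, model-graded specs are resolved by name through the registry (see the `evals/registry.py` changes below) instead of being read from a hard-coded YAML path. A minimal sketch of how a newly registered entry could be sanity-checked from Python, assuming a hypothetical entry named `my_rubric` added to a YAML under `evals/registry/modelgraded/`:

```python
# Minimal sketch (not part of this patch): resolve a model-graded spec by name.
# "my_rubric" is a hypothetical entry name used only for illustration.
from evals.registry import Registry

registry = Registry()
spec = registry.get_modelgraded_spec("my_rubric")

# The spec is a plain dict holding the parameters referenced above.
print(spec["prompt"])
print(spec["choice_strings"])
```

If the name is missing, `get_modelgraded_spec` fails with an assertion that lists the closest matching entry names.
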
diff --git a/evals/cli/oaieval.py b/evals/cli/oaieval.py
index d96d105312..1f2eaff8eb 100644
--- a/evals/cli/oaieval.py
+++ b/evals/cli/oaieval.py
@@ -211,7 +211,13 @@ def to_number(x):
     extra_eval_params = parse_extra_eval_params(args.extra_eval_params)
 
     eval_class = registry.get_class(eval_spec)
-    eval = eval_class(model_specs=model_specs, seed=args.seed, name=eval_name, **extra_eval_params)
+    eval = eval_class(
+        model_specs=model_specs,
+        seed=args.seed,
+        name=eval_name,
+        registry=registry,
+        **extra_eval_params,
+    )
     result = eval.run(recorder)
     recorder.record_final_report(result)
 
diff --git a/evals/elsuite/modelgraded/classify.py b/evals/elsuite/modelgraded/classify.py
index 00eedfb1d8..e201384e81 100644
--- a/evals/elsuite/modelgraded/classify.py
+++ b/evals/elsuite/modelgraded/classify.py
@@ -13,12 +13,7 @@
 import evals
 import evals.record
 from evals.base import ModelSpec
-from evals.elsuite.utils import (
-    PromptFn,
-    format_necessary,
-    load_modelgraded_specs,
-    scrub_formatting_from_prompt,
-)
+from evals.elsuite.utils import PromptFn, format_necessary, scrub_formatting_from_prompt
 
 INVALID_STR = "__invalid__"
 CHOICE_KEY = "choice"
@@ -135,7 +130,7 @@ def __init__(
         )
 
         """import prompt and set attributes"""
-        modelgraded_specs = load_modelgraded_specs(modelgraded_spec_file)
+        modelgraded_specs = self.registry.get_modelgraded_spec(modelgraded_spec_file)
 
         # 'choice_strings' is a list of strings that specifies the possible choices
         self.choice_strings = modelgraded_specs.pop("choice_strings")
@@ -211,6 +206,8 @@ def __init__(
         ), "completion_sample_templates must be specified if multicomp_n > 1"
 
         # since we accept optional args, we need to check that all args are used
+        for key in ("key", "group"):
+            modelgraded_specs.pop(key, None)
         assert not modelgraded_specs, f"Unused args: {modelgraded_specs}. Typo in YAML?"
 
     def eval_sample(self, test_sample: dict, rng: Random) -> None:
diff --git a/evals/elsuite/utils.py b/evals/elsuite/utils.py
index 77daa55fed..b62068fdfd 100644
--- a/evals/elsuite/utils.py
+++ b/evals/elsuite/utils.py
@@ -1,21 +1,12 @@
 import copy
-import os
 import re
 import string
 from collections import Counter, defaultdict
 
-import yaml
-
 from evals.api import sample_freeform
 from evals.prompt.base import chat_prompt_to_text_prompt, is_chat_prompt
 
 
-def load_modelgraded_specs(spec_file: str) -> str:
-    current_dir = os.path.dirname(os.path.abspath(__file__))
-    yaml_path = os.path.join(current_dir, "../registry/modelgraded", f"{spec_file}.yaml")
-    return yaml.load(open(yaml_path, "r"), Loader=yaml.FullLoader)
-
-
 def get_answer(text, answer_prompt):
     idx = text.rfind(answer_prompt)
     if idx == -1:
diff --git a/evals/eval.py b/evals/eval.py
index fc5eff4ec2..845123e0cf 100644
--- a/evals/eval.py
+++ b/evals/eval.py
@@ -3,17 +3,18 @@
 """
 import abc
 import asyncio
+import concurrent.futures
 import logging
 import os
 import random
-import concurrent.futures
 from multiprocessing.pool import ThreadPool
-from typing import Any, Awaitable, Callable, Dict, List, Tuple
+from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple
 
 from tqdm import tqdm
 
 from .base import ModelSpec, ModelSpecs
-from .record import Recorder, RecorderBase
+from .record import RecorderBase
+from .registry import Registry
 
 logger = logging.getLogger(__name__)
 
@@ -53,6 +54,7 @@ def __init__(
         model_specs: ModelSpecs,
         seed: int = 20220722,
         name: str = "no_name_eval.default",
+        registry: Optional[Registry] = None,
     ):
         splits = name.split(".")
         if len(splits) < 2:
@@ -61,6 +63,7 @@
         self.model_specs = model_specs
         self.seed = seed
         self.name = name
+        self.registry = registry or Registry()
 
     def eval_sample(self, sample: Any, rng: random.Random):
         raise NotImplementedError()
diff --git a/evals/registry.py b/evals/registry.py
index fd6d441e49..b80d936d03 100644
--- a/evals/registry.py
+++ b/evals/registry.py
@@ -4,6 +4,7 @@
 By convention, every eval name should start with {base_eval}.{split}.
 """
 
+import difflib
 import functools
 import logging
 import os
@@ -58,6 +59,13 @@ def get_alias():
         except TypeError as e:
             raise TypeError(f"Error while processing {object} {name}: {e}")
 
+    def get_modelgraded_spec(self, name: str) -> dict[str, Any]:
+        assert name in self._modelgraded_specs, (
+            f"Modelgraded spec {name} not found. "
+            f"Closest matches: {difflib.get_close_matches(name, self._modelgraded_specs.keys(), n=5)}"
+        )
+        return self._modelgraded_specs[name]
+
     def get_eval(self, name: str) -> EvalSpec:
         return self._dereference(name, self._evals, "eval", EvalSpec)
@@ -136,6 +144,10 @@ def _process_directory(self, registry, path):
             self._process_file(registry, file)
 
     def _load_registry(self, paths):
+        """Load registry from a list of paths.
+
+        Each path or yaml specifies a dictionary of name -> spec.
+        """
         registry = {}
         for path in paths:
             logging.info(f"Loading registry from {path}")
@@ -154,5 +166,9 @@ def _eval_sets(self):
     def _evals(self):
         return self._load_registry([p / "evals" for p in self._registry_paths])
 
+    @functools.cached_property
+    def _modelgraded_specs(self):
+        return self._load_registry([p / "modelgraded" for p in self._registry_paths])
+
 
 registry = Registry()
diff --git a/evals/registry/modelgraded/battle.yaml b/evals/registry/modelgraded/battle.yaml
index 397a1fa154..f0c516b8be 100644
--- a/evals/registry/modelgraded/battle.yaml
+++ b/evals/registry/modelgraded/battle.yaml
@@ -1,24 +1,25 @@
-prompt: |-
-  You are comparing two responses to the following two instructions.
+battle:
+  prompt: |-
+    You are comparing two responses to the following two instructions.
 
-  [Instruction 1]
-  {input1}
-  [Response 1]
-  {completion1}
+    [Instruction 1]
+    {input1}
+    [Response 1]
+    {completion1}
 
-  [Instruction 2]
-  {input2}
-  [Response 2]
-  {completion2}
+    [Instruction 2]
+    {input2}
+    [Response 2]
+    {completion2}
 
-  Is the first response better than the second? You must provide one answer based on your subjective view.
-choice_strings:
-  - "Yes"
-  - "No"
-choice_scores:
-  "Yes": 1.0
-  "No": 0.0
-input_outputs:
-  input1: completion1
-  input2: completion2
+    Is the first response better than the second? You must provide one answer based on your subjective view.
+  choice_strings:
+    - "Yes"
+    - "No"
+  choice_scores:
+    "Yes": 1.0
+    "No": 0.0
+  input_outputs:
+    input1: completion1
+    input2: completion2
\ No newline at end of file
diff --git a/evals/registry/modelgraded/best.yaml b/evals/registry/modelgraded/best.yaml
index 2ba0627843..53ef265d3a 100644
--- a/evals/registry/modelgraded/best.yaml
+++ b/evals/registry/modelgraded/best.yaml
@@ -1,12 +1,13 @@
-prompt: |-
-  Which of the following {n} texts is best response to the following instruction?
+best:
+  prompt: |-
+    Which of the following {n} texts is the best response to the following instruction?
 
-  Instruction: {input}
+    Instruction: {input}
 
-  Responses:
-  {completion}
-completion_sample_templates:
-  completion: "{i}. {output}\n"
-choice_strings: from_n
-input_outputs:
-  input: completion
+    Responses:
+    {completion}
+  completion_sample_templates:
+    completion: "{i}. {output}\n"
+  choice_strings: from_n
+  input_outputs:
+    input: completion
diff --git a/evals/registry/modelgraded/closedqa.yaml b/evals/registry/modelgraded/closedqa.yaml
index 3c60488090..eaaccf3817 100644
--- a/evals/registry/modelgraded/closedqa.yaml
+++ b/evals/registry/modelgraded/closedqa.yaml
@@ -1,26 +1,27 @@
-prompt: |-
-  You are assessing a submitted answer on a given task based on a criterion. Here is the data:
-  [BEGIN DATA]
-  ***
-  [Task]: {input}
-  ***
-  [Submission]: {completion}
-  ***
-  [Criterion]: {criteria}
-  ***
-  [END DATA]
-  Does the submission meet the criterion? First, write out in a step by step manner your reasoning about the criterion to be sure that your conclusion is correct. Avoid simply stating the correct answers at the outset. Then print only the single character "Y" or "N" (without quotes or punctuation) on its own line corresponding to the correct answer. At the end, repeat just the letter again by itself on a new line.
+closedqa:
+  prompt: |-
+    You are assessing a submitted answer on a given task based on a criterion. Here is the data:
+    [BEGIN DATA]
+    ***
+    [Task]: {input}
+    ***
+    [Submission]: {completion}
+    ***
+    [Criterion]: {criteria}
+    ***
+    [END DATA]
+    Does the submission meet the criterion? First, write out in a step by step manner your reasoning about the criterion to be sure that your conclusion is correct. Avoid simply stating the correct answers at the outset. Then print only the single character "Y" or "N" (without quotes or punctuation) on its own line corresponding to the correct answer. At the end, repeat just the letter again by itself on a new line.
 
-  Reasoning:
-eval_type: cot_classify
-choice_scores:
-  "Y": 1.0
-  "N": 0.0
-choice_strings: 'YN'
-args:
-  criteria:
-    relevance: "relevance: Is the submission referring to a real quote from the text?"
-    conciseness: "conciseness: Is the answer concise and to the point?"
-    correct: "correctness: Is the answer correct?"
-input_outputs:
-  input: "completion"
+    Reasoning:
+  eval_type: cot_classify
+  choice_scores:
+    "Y": 1.0
+    "N": 0.0
+  choice_strings: 'YN'
+  args:
+    criteria:
+      relevance: "relevance: Is the submission referring to a real quote from the text?"
+      conciseness: "conciseness: Is the answer concise and to the point?"
+      correct: "correctness: Is the answer correct?"
+  input_outputs:
+    input: "completion"
diff --git a/evals/registry/modelgraded/diversity.yaml b/evals/registry/modelgraded/diversity.yaml
index 87ccd9f958..86e9870362 100644
--- a/evals/registry/modelgraded/diversity.yaml
+++ b/evals/registry/modelgraded/diversity.yaml
@@ -1,14 +1,15 @@
-prompt: |-
-  Are the following {n} texts diverse?
+diversity:
+  prompt: |-
+    Are the following {n} texts diverse?
 
-  {completion}
-completion_sample_templates:
-  completion: "{i}. {output}\n"
-choice_scores:
-  "Yes": 1.0
-  "No": 0.0
-choice_strings:
-  - "Yes"
-  - "No"
-input_outputs:
-  input: completion
+    {completion}
+  completion_sample_templates:
+    completion: "{i}. {output}\n"
+  choice_scores:
+    "Yes": 1.0
+    "No": 0.0
+  choice_strings:
+    - "Yes"
+    - "No"
+  input_outputs:
+    input: completion
diff --git a/evals/registry/modelgraded/fact.yaml b/evals/registry/modelgraded/fact.yaml
index 9a3fbdf0ae..4c9ac51075 100644
--- a/evals/registry/modelgraded/fact.yaml
+++ b/evals/registry/modelgraded/fact.yaml
@@ -1,22 +1,23 @@
-prompt: |-
-  You are comparing a submitted answer to an expert answer on a given question. Here is the data:
-  [BEGIN DATA]
-  ************
-  [Question]: {input}
-  ************
-  [Expert]: {ideal}
-  ************
-  [Submission]: {completion}
-  ************
-  [END DATA]
+fact:
+  prompt: |-
+    You are comparing a submitted answer to an expert answer on a given question. Here is the data:
+    [BEGIN DATA]
+    ************
+    [Question]: {input}
+    ************
+    [Expert]: {ideal}
+    ************
+    [Submission]: {completion}
+    ************
+    [END DATA]
 
-  Compare the factual content of the submitted answer with the expert answer. Ignore any differences in style, grammar, or punctuation.
-  The submitted answer may either be a subset or superset of the expert answer, or it may conflict with it. Determine which case applies. Answer the question by selecting one of the following options:
-  (A) The submitted answer is a subset of the expert answer and is fully consistent with it.
-  (B) The submitted answer is a superset of the expert answer and is fully consistent with it.
-  (C) The submitted answer contains all the same details as the expert answer.
-  (D) There is a disagreement between the submitted answer and the expert answer.
-  (E) The answers differ, but these differences don't matter from the perspective of factuality.
-choice_strings: ABCDE
-input_outputs:
-  input: completion
+    Compare the factual content of the submitted answer with the expert answer. Ignore any differences in style, grammar, or punctuation.
+    The submitted answer may either be a subset or superset of the expert answer, or it may conflict with it. Determine which case applies. Answer the question by selecting one of the following options:
+    (A) The submitted answer is a subset of the expert answer and is fully consistent with it.
+    (B) The submitted answer is a superset of the expert answer and is fully consistent with it.
+    (C) The submitted answer contains all the same details as the expert answer.
+    (D) There is a disagreement between the submitted answer and the expert answer.
+    (E) The answers differ, but these differences don't matter from the perspective of factuality.
+  choice_strings: ABCDE
+  input_outputs:
+    input: completion
diff --git a/evals/registry/modelgraded/humor.yaml b/evals/registry/modelgraded/humor.yaml
index e8811afefc..25201997d8 100644
--- a/evals/registry/modelgraded/humor.yaml
+++ b/evals/registry/modelgraded/humor.yaml
@@ -1,14 +1,53 @@
-prompt: |-
-  Is the following funny?
-
-  {completion}
-choice_scores:
-  "Yes": 1.0
-  "No": 0.0
-  "Unsure": 0.5
-choice_strings:
-  - "Yes"
-  - "No"
-  - "Unsure"
-input_outputs:
-  input: completion
+humor:
+  prompt: |-
+    Is the following funny?
+
+    {completion}
+  choice_scores:
+    "Yes": 1.0
+    "No": 0.0
+    "Unsure": 0.5
+  choice_strings:
+    - "Yes"
+    - "No"
+    - "Unsure"
+  input_outputs:
+    input: completion
+
+humor_likert:
+  prompt: |-
+    Is the following funny?
+
+    {completion}
+
+    Answer using the scale of 1 to 5, where 5 is the funniest.
+  choice_strings: "12345"
+  choice_scores: from_strings
+  input_outputs:
+    input: completion
+
+humor_jp:
+  prompt: |-
+    以下の文章って面白い?
+
+    (始まり)
+
+    {completion}
+
+    (終わり)
+
+    5段階の中から選んでください。
+
+    1. 退屈 - このレベルの面白さはほとんど感じられず、退屈で時間が過ぎるのが遅いと感じるでしょう。興味を惹かれる要素がほとんどなく、全体的に魅力に欠けます。
+
+    2. やや面白い - このレベルでは、若干の面白さが感じられますが、それでも十分ではありません。物語性やコンセプトには一定の魅力があるものの、期待外れであり、時間を過ごすにはまだ物足りないかもしれません。
+
+    3. まあまあ面白い - このレベルの面白さは、平均的で満足できる範囲です。ある程度の興味深い要素やストーリーがあり、時間を過ごすのに適していますが、特別印象に残るものではないかもしれません。
+
+    4. 面白い - このレベルでは、かなりの面白さが感じられ、魅力的なストーリーやキャラクターが存在します。多くの人が楽しめる内容であり、興味を持続させる要素が豊富に含まれています。ただし、最高の評価には僅かに及ばない部分が残っています。
+
+    5. 大変面白い - このレベルの面白さは、非常に優れており、観る者を魅了し、心に強く残る体験ができます。独創的なストーリーや魅力的なキャラクターが際立ち、多くの人が共感や感動を覚えるでしょう。このレベルの面白さは、他のものと比較しても突出していると言えます。
+  choice_scores: from_strings
+  choice_strings: "12345"
+  input_outputs:
+    input: completion
\ No newline at end of file
diff --git a/evals/registry/modelgraded/humor_jp.yaml b/evals/registry/modelgraded/humor_jp.yaml
deleted file mode 100644
index fd09133476..0000000000
--- a/evals/registry/modelgraded/humor_jp.yaml
+++ /dev/null
@@ -1,24 +0,0 @@
-prompt: |-
-  以下の文章って面白い?
-
-  (始まり)
-
-  {completion}
-
-  (終わり)
-
-  5段階の中から選んでください。
-
-  1. 退屈 - このレベルの面白さはほとんど感じられず、退屈で時間が過ぎるのが遅いと感じるでしょう。興味を惹かれる要素がほとんどなく、全体的に魅力に欠けます。
-
-  2. やや面白い - このレベルでは、若干の面白さが感じられますが、それでも十分ではありません。物語性やコンセプトには一定の魅力があるものの、期待外れであり、時間を過ごすにはまだ物足りないかもしれません。
-
-  3. まあまあ面白い - このレベルの面白さは、平均的で満足できる範囲です。ある程度の興味深い要素やストーリーがあり、時間を過ごすのに適していますが、特別印象に残るものではないかもしれません。
-
-  4. 面白い - このレベルでは、かなりの面白さが感じられ、魅力的なストーリーやキャラクターが存在します。多くの人が楽しめる内容であり、興味を持続させる要素が豊富に含まれています。ただし、最高の評価には僅かに及ばない部分が残っています。
-
-  5. 大変面白い - このレベルの面白さは、非常に優れており、観る者を魅了し、心に強く残る体験ができます。独創的なストーリーや魅力的なキャラクターが際立ち、多くの人が共感や感動を覚えるでしょう。このレベルの面白さは、他のものと比較しても突出していると言えます。
-choice_scores: from_strings
-choice_strings: "12345"
-input_outputs:
-  input: completion
\ No newline at end of file
diff --git a/evals/registry/modelgraded/humor_likert.yaml b/evals/registry/modelgraded/humor_likert.yaml
deleted file mode 100644
index 53a276f7fc..0000000000
--- a/evals/registry/modelgraded/humor_likert.yaml
+++ /dev/null
@@ -1,10 +0,0 @@
-prompt: |-
-  Is the following funny?
-
-  {completion}
-
-  Answer using the scale of 1 to 5, where 5 is the funniest.
-choice_strings: "12345"
-choice_scores: from_strings
-input_outputs:
-  input: completion
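
To summarize the new layout: each YAML under `evals/registry/modelgraded/` now holds one or more named entries (for example, `humor`, `humor_likert`, and `humor_jp` all live in `humor.yaml`), and evals resolve those entries through the `Registry` passed into `Eval.__init__` rather than opening YAML files directly. A rough sketch of the resulting lookup flow, assuming the default registry paths:

```python
# Rough sketch of the registry-based lookup introduced by this patch.
from evals.registry import Registry

registry = Registry()

# Specs are keyed by name inside the YAML files, so the old one-file-per-spec
# layout (humor_jp.yaml, humor_likert.yaml) is no longer needed.
spec = registry.get_modelgraded_spec("humor_likert")
assert spec["choice_strings"] == "12345"
assert spec["choice_scores"] == "from_strings"

# Inside an eval, the same lookup happens through the registry handed to
# Eval.__init__ (which falls back to a default Registry() when none is given),
# e.g. self.registry.get_modelgraded_spec(modelgraded_spec_file) in
# evals/elsuite/modelgraded/classify.py.
```
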