[evals] moved modelgraded specs to registry (openai#392)
- each Eval now keeps track of "registry"
rlbayes committed Mar 22, 2023
1 parent 1a48737 commit aae522b
Showing 14 changed files with 181 additions and 158 deletions.
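In outline, the change threads the `Registry` that resolved an eval into the eval instance itself, so model-graded evals no longer read their spec YAMLs off disk through a `utils` helper. A minimal sketch of the new call path, using only the names visible in the hunks below (the eval name is illustrative, not part of this commit):

```python
from evals.registry import Registry

registry = Registry()

# oaieval.py: resolve the eval class, then construct it with the registry, i.e.
# eval_class(model_specs=..., seed=..., name=..., registry=registry, ...).
eval_spec = registry.get_eval("joke-fruits")   # illustrative eval name
eval_class = registry.get_class(eval_spec)

# classify.py: inside a model-graded eval, specs are now fetched through the
# registry rather than via the removed utils.load_modelgraded_specs() helper.
fact_spec = registry.get_modelgraded_spec("fact")
```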
2 changes: 1 addition & 1 deletion docs/build-eval.md
@@ -65,7 +65,7 @@ Congratulations, you have built your eval! Keep iterating on it until you are co

We expect that the existing model-graded evals such as `fact`, `closedqa`, and `battle` will fit many use cases. However, other use cases may benefit from more customization, e.g., a different evaluation prompt. For these, there will be a bit more work involved, but generally still no coding required!

1. If you can't use an existing model-graded eval, create a new YAML in `evals/registry/modelgraded` to specify the [parameters](eval-templates.md#parameters-for-model-graded-evals) of your eval. See [`humor.yaml`](../evals/registry/modelgraded/humor.yaml) for an example.
1. If you can't use an existing model-graded eval, create a new YAML, or add a new entry to an existing YAML, in `evals/registry/modelgraded` to specify the [parameters](eval-templates.md#parameters-for-model-graded-evals) of your eval. See [`humor.yaml`](../evals/registry/modelgraded/humor.yaml) for an example.
- Note that, even if you are creating a new YAML, you may find it easiest to copy an existing YAML as a starting point. For example, model-graded evals which check a model completion against a rubric can copy `closedqa.yaml` and just edit the `args`.
2. Next, you will create your dataset and register your eval, as described above. See [`joke_fruits_labeled.jsonl`](../evals/registry/data/test_metaeval/joke_fruits_labeled.jsonl) and [`joke-fruits`](../evals/registry/evals/test-modelgraded.yaml), for example.
    - Note that it is recommended to specify `eval_type` at this step, when you register your eval, rather than in step 1.
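As a side note on step 2 above (not part of the diff): after this commit, the YAMLs under `evals/registry/modelgraded` are keyed by spec name (see the `battle:` and `fact:` hunks below), and the eval fetches that entry through the registry at run time. A hedged sketch, assuming `humor.yaml` keeps a top-level `humor:` key like the other files shown here:

```python
from evals.registry import Registry

# "humor" is assumed to be the top-level key in humor.yaml, mirroring battle.yaml.
spec = Registry().get_modelgraded_spec("humor")

# The result is a plain dict of the parameters from the YAML,
# e.g. "prompt" and "choice_strings" as in the specs shown in this diff.
print(sorted(spec.keys()))
```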
8 changes: 7 additions & 1 deletion evals/cli/oaieval.py
@@ -211,7 +211,13 @@ def to_number(x):
extra_eval_params = parse_extra_eval_params(args.extra_eval_params)

eval_class = registry.get_class(eval_spec)
eval = eval_class(model_specs=model_specs, seed=args.seed, name=eval_name, **extra_eval_params)
eval = eval_class(
model_specs=model_specs,
seed=args.seed,
name=eval_name,
registry=registry,
**extra_eval_params,
)
result = eval.run(recorder)
recorder.record_final_report(result)

11 changes: 4 additions & 7 deletions evals/elsuite/modelgraded/classify.py
@@ -13,12 +13,7 @@
import evals
import evals.record
from evals.base import ModelSpec
from evals.elsuite.utils import (
PromptFn,
format_necessary,
load_modelgraded_specs,
scrub_formatting_from_prompt,
)
from evals.elsuite.utils import PromptFn, format_necessary, scrub_formatting_from_prompt

INVALID_STR = "__invalid__"
CHOICE_KEY = "choice"
@@ -135,7 +130,7 @@ def __init__(
)

"""import prompt and set attributes"""
modelgraded_specs = load_modelgraded_specs(modelgraded_spec_file)
modelgraded_specs = self.registry.get_modelgraded_spec(modelgraded_spec_file)

# 'choice_strings' is a list of strings that specifies the possible choices
self.choice_strings = modelgraded_specs.pop("choice_strings")
@@ -211,6 +206,8 @@ def __init__(
), "completion_sample_templates must be specified if multicomp_n > 1"

# since we accept optional args, we need to check that all args are used
for key in ("key", "group"):
modelgraded_specs.pop(key, None)
assert not modelgraded_specs, f"Unused args: {modelgraded_specs}. Typo in YAML?"

def eval_sample(self, test_sample: dict, rng: Random) -> None:
9 changes: 0 additions & 9 deletions evals/elsuite/utils.py
@@ -1,21 +1,12 @@
import copy
import os
import re
import string
from collections import Counter, defaultdict

import yaml

from evals.api import sample_freeform
from evals.prompt.base import chat_prompt_to_text_prompt, is_chat_prompt


def load_modelgraded_specs(spec_file: str) -> str:
current_dir = os.path.dirname(os.path.abspath(__file__))
yaml_path = os.path.join(current_dir, "../registry/modelgraded", f"{spec_file}.yaml")
return yaml.load(open(yaml_path, "r"), Loader=yaml.FullLoader)


def get_answer(text, answer_prompt):
idx = text.rfind(answer_prompt)
if idx == -1:
9 changes: 6 additions & 3 deletions evals/eval.py
@@ -3,17 +3,18 @@
"""
import abc
import asyncio
import concurrent.futures
import logging
import os
import random
import concurrent.futures
from multiprocessing.pool import ThreadPool
from typing import Any, Awaitable, Callable, Dict, List, Tuple
from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple

from tqdm import tqdm

from .base import ModelSpec, ModelSpecs
from .record import Recorder, RecorderBase
from .record import RecorderBase
from .registry import Registry

logger = logging.getLogger(__name__)

@@ -53,6 +54,7 @@ def __init__(
model_specs: ModelSpecs,
seed: int = 20220722,
name: str = "no_name_eval.default",
registry: Optional[Registry] = None,
):
splits = name.split(".")
if len(splits) < 2:
@@ -61,6 +63,7 @@
self.model_specs = model_specs
self.seed = seed
self.name = name
self.registry = registry or Registry()

def eval_sample(self, sample: Any, rng: random.Random):
raise NotImplementedError()
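For downstream eval authors, the new constructor argument means `self.registry` is always available inside an eval (falling back to a fresh `Registry()` when none is passed), which is how `classify.py` above resolves its spec. A minimal illustrative subclass, assuming the base class defined in `evals/eval.py` is named `Eval`:

```python
import random
from typing import Any

from evals.eval import Eval


class SpecAwareEval(Eval):
    """Illustrative subclass; not part of this commit."""

    def __init__(self, modelgraded_spec_file: str, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # self.registry was set by Eval.__init__ (the passed registry, or Registry()).
        self.spec = self.registry.get_modelgraded_spec(modelgraded_spec_file)

    def eval_sample(self, sample: Any, rng: random.Random) -> None:
        # Grade `sample` using self.spec["prompt"], self.spec["choice_strings"], etc.
        ...
```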
16 changes: 16 additions & 0 deletions evals/registry.py
@@ -4,6 +4,7 @@
By convention, every eval name should start with {base_eval}.{split}.
"""

import difflib
import functools
import logging
import os
@@ -58,6 +59,13 @@ def get_alias():
except TypeError as e:
raise TypeError(f"Error while processing {object} {name}: {e}")

def get_modelgraded_spec(self, name: str) -> dict[str, Any]:
assert name in self._modelgraded_specs, (
f"Modelgraded spec {name} not found. "
f"Closest matches: {difflib.get_close_matches(name, self._modelgraded_specs.keys(), n=5)}"
)
return self._modelgraded_specs[name]

def get_eval(self, name: str) -> EvalSpec:
return self._dereference(name, self._evals, "eval", EvalSpec)

@@ -136,6 +144,10 @@ def _process_directory(self, registry, path):
self._process_file(registry, file)

def _load_registry(self, paths):
"""Load registry from a list of paths.
Each path or yaml specifies a dictionary of name -> spec.
"""
registry = {}
for path in paths:
logging.info(f"Loading registry from {path}")
@@ -154,5 +166,9 @@ def _eval_sets(self):
def _evals(self):
return self._load_registry([p / "evals" for p in self._registry_paths])

@functools.cached_property
def _modelgraded_specs(self):
return self._load_registry([p / "modelgraded" for p in self._registry_paths])


registry = Registry()
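One detail of the new lookup worth noting: unknown spec names fail with an assertion that suggests near misses via `difflib`. Roughly (the message format comes from the assert above; the typo is deliberate):

```python
from evals.registry import Registry

try:
    Registry().get_modelgraded_spec("factt")  # deliberate typo
except AssertionError as err:
    # Expected to read something like:
    #   Modelgraded spec factt not found. Closest matches: ['fact']
    print(err)
```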
41 changes: 21 additions & 20 deletions evals/registry/modelgraded/battle.yaml
@@ -1,24 +1,25 @@
prompt: |-
You are comparing two responses to the following two instructions.
battle:
prompt: |-
You are comparing two responses to the following two instructions.
[Instruction 1]
{input1}
[Response 1]
{completion1}
[Instruction 1]
{input1}
[Response 1]
{completion1}
[Instruction 2]
{input2}
[Response 2]
{completion2}
[Instruction 2]
{input2}
[Response 2]
{completion2}
Is the first response better than the second? You must provide one answer based on your subjective view.
choice_strings:
- "Yes"
- "No"
choice_scores:
"Yes": 1.0
"No": 0.0
input_outputs:
input1: completion1
input2: completion2
Is the first response better than the second? You must provide one answer based on your subjective view.
choice_strings:
- "Yes"
- "No"
choice_scores:
"Yes": 1.0
"No": 0.0
input_outputs:
input1: completion1
input2: completion2
21 changes: 11 additions & 10 deletions evals/registry/modelgraded/best.yaml
@@ -1,12 +1,13 @@
prompt: |-
Which of the following {n} texts is the best response to the following instruction?
best:
prompt: |-
Which of the following {n} texts is the best response to the following instruction?
Instruction: {input}
Instruction: {input}
Responses:
{completion}
completion_sample_templates:
completion: "{i}. {output}\n"
choice_strings: from_n
input_outputs:
input: completion
Responses:
{completion}
completion_sample_templates:
completion: "{i}. {output}\n"
choice_strings: from_n
input_outputs:
input: completion
51 changes: 26 additions & 25 deletions evals/registry/modelgraded/closedqa.yaml
@@ -1,26 +1,27 @@
prompt: |-
You are assessing a submitted answer on a given task based on a criterion. Here is the data:
[BEGIN DATA]
***
[Task]: {input}
***
[Submission]: {completion}
***
[Criterion]: {criteria}
***
[END DATA]
Does the submission meet the criterion? First, write out in a step by step manner your reasoning about the criterion to be sure that your conclusion is correct. Avoid simply stating the correct answers at the outset. Then print only the single character "Y" or "N" (without quotes or punctuation) on its own line corresponding to the correct answer. At the end, repeat just the letter again by itself on a new line.
closedqa:
prompt: |-
You are assessing a submitted answer on a given task based on a criterion. Here is the data:
[BEGIN DATA]
***
[Task]: {input}
***
[Submission]: {completion}
***
[Criterion]: {criteria}
***
[END DATA]
Does the submission meet the criterion? First, write out in a step by step manner your reasoning about the criterion to be sure that your conclusion is correct. Avoid simply stating the correct answers at the outset. Then print only the single character "Y" or "N" (without quotes or punctuation) on its own line corresponding to the correct answer. At the end, repeat just the letter again by itself on a new line.
Reasoning:
eval_type: cot_classify
choice_scores:
"Y": 1.0
"N": 0.0
choice_strings: 'YN'
args:
criteria:
relevance: "relevance: Is the submission referring to a real quote from the text?"
conciseness: "conciseness: Is the answer concise and to the point?"
correct: "correctness: Is the answer correct?"
input_outputs:
input: "completion"
Reasoning:
eval_type: cot_classify
choice_scores:
"Y": 1.0
"N": 0.0
choice_strings: 'YN'
args:
criteria:
relevance: "relevance: Is the submission referring to a real quote from the text?"
conciseness: "conciseness: Is the answer concise and to the point?"
correct: "correctness: Is the answer correct?"
input_outputs:
input: "completion"
27 changes: 14 additions & 13 deletions evals/registry/modelgraded/diversity.yaml
@@ -1,14 +1,15 @@
prompt: |-
Are the following {n} texts diverse?
diversity:
prompt: |-
Are the following {n} texts diverse?
{completion}
completion_sample_templates:
completion: "{i}. {output}\n"
choice_scores:
"Yes": 1.0
"No": 0.0
choice_strings:
- "Yes"
- "No"
input_outputs:
input: completion
{completion}
completion_sample_templates:
completion: "{i}. {output}\n"
choice_scores:
"Yes": 1.0
"No": 0.0
choice_strings:
- "Yes"
- "No"
input_outputs:
input: completion
43 changes: 22 additions & 21 deletions evals/registry/modelgraded/fact.yaml
@@ -1,22 +1,23 @@
prompt: |-
You are comparing a submitted answer to an expert answer on a given question. Here is the data:
[BEGIN DATA]
************
[Question]: {input}
************
[Expert]: {ideal}
************
[Submission]: {completion}
************
[END DATA]
fact:
prompt: |-
You are comparing a submitted answer to an expert answer on a given question. Here is the data:
[BEGIN DATA]
************
[Question]: {input}
************
[Expert]: {ideal}
************
[Submission]: {completion}
************
[END DATA]
Compare the factual content of the submitted answer with the expert answer. Ignore any differences in style, grammar, or punctuation.
The submitted answer may either be a subset or superset of the expert answer, or it may conflict with it. Determine which case applies. Answer the question by selecting one of the following options:
(A) The submitted answer is a subset of the expert answer and is fully consistent with it.
(B) The submitted answer is a superset of the expert answer and is fully consistent with it.
(C) The submitted answer contains all the same details as the expert answer.
(D) There is a disagreement between the submitted answer and the expert answer.
(E) The answers differ, but these differences don't matter from the perspective of factuality.
choice_strings: ABCDE
input_outputs:
input: completion
Compare the factual content of the submitted answer with the expert answer. Ignore any differences in style, grammar, or punctuation.
The submitted answer may either be a subset or superset of the expert answer, or it may conflict with it. Determine which case applies. Answer the question by selecting one of the following options:
(A) The submitted answer is a subset of the expert answer and is fully consistent with it.
(B) The submitted answer is a superset of the expert answer and is fully consistent with it.
(C) The submitted answer contains all the same details as the expert answer.
(D) There is a disagreement between the submitted answer and the expert answer.
(E) The answers differ, but these differences don't matter from the perspective of factuality.
choice_strings: ABCDE
input_outputs:
input: completion
