[evals] moved modelgraded specs to registry (openai#392)
- each Eval now keeps track of "registry"
rlbayes committed Mar 22, 2023
1 parent 1a48737 commit aae522b
Showing 14 changed files with 181 additions and 158 deletions.
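In outline, the change threads the `Registry` that resolved an eval into the eval instance itself, so model-graded evals no longer read their spec YAMLs off disk through a `utils` helper. A minimal sketch of the new call path, using only the names visible in the hunks below (the eval name is illustrative, not part of this commit):

```python
from evals.registry import Registry

registry = Registry()

# oaieval.py: resolve the eval class, then construct it with the registry, i.e.
# eval_class(model_specs=..., seed=..., name=..., registry=registry, ...).
eval_spec = registry.get_eval("joke-fruits")   # illustrative eval name
eval_class = registry.get_class(eval_spec)

# classify.py: inside a model-graded eval, specs are now fetched through the
# registry rather than via the removed utils.load_modelgraded_specs() helper.
fact_spec = registry.get_modelgraded_spec("fact")
```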
2 changes: 1 addition & 1 deletion docs/build-eval.md
@@ -65,7 +65,7 @@ Congratulations, you have built your eval! Keep iterating on it until you are co

We expect that the existing model-graded evals such as `fact`, `closedqa`, and `battle` will fit many use cases. However, other use cases may benefit from more customization, e.g., a different evaluation prompt. For these, there will be a bit more work involved, but generally still no coding required!

1. If you can't use an existing model-graded eval, create a new YAML in `evals/registry/modelgraded` to specify the [parameters](eval-templates.md#parameters-for-model-graded-evals) of your eval. See [`humor.yaml`](../evals/registry/modelgraded/humor.yaml) for an example.
1. If you can't use an existing model-graded eval, create a new YAML, or add a new entry to an existing YAML, in `evals/registry/modelgraded` to specify the [parameters](eval-templates.md#parameters-for-model-graded-evals) of your eval. See [`humor.yaml`](../evals/registry/modelgraded/humor.yaml) for an example.
- Note that, even if you are creating a new YAML, you may find it easiest to copy an existing YAML as a starting point. For example, model-graded evals which check a model completion against a rubric can copy `closedqa.yaml` and just edit the `args`.
2. Next, you will create your dataset and register your eval, as described above. See [`joke_fruits_labeled.jsonl`](../evals/registry/data/test_metaeval/joke_fruits_labeled.jsonl) and [`joke-fruits`](../evals/registry/evals/test-modelgraded.yaml), for example.
    - Note that it is recommended to specify `eval_type` at this step, when you register your eval, rather than in step 1.
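As a side note on step 2 above (not part of the diff): after this commit, the YAMLs under `evals/registry/modelgraded` are keyed by spec name (see the `battle:` and `fact:` hunks below), and the eval fetches that entry through the registry at run time. A hedged sketch, assuming `humor.yaml` keeps a top-level `humor:` key like the other files shown here:

```python
from evals.registry import Registry

# "humor" is assumed to be the top-level key in humor.yaml, mirroring battle.yaml.
spec = Registry().get_modelgraded_spec("humor")

# The result is a plain dict of the parameters from the YAML,
# e.g. "prompt" and "choice_strings" as in the specs shown in this diff.
print(sorted(spec.keys()))
```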
8 changes: 7 additions & 1 deletion evals/cli/oaieval.py
@@ -211,7 +211,13 @@ def to_number(x):
extra_eval_params = parse_extra_eval_params(args.extra_eval_params)

eval_class = registry.get_class(eval_spec)
eval = eval_class(model_specs=model_specs, seed=args.seed, name=eval_name, **extra_eval_params)
eval = eval_class(
model_specs=model_specs,
seed=args.seed,
name=eval_name,
registry=registry,
**extra_eval_params,
)
result = eval.run(recorder)
recorder.record_final_report(result)

11 changes: 4 additions & 7 deletions evals/elsuite/modelgraded/classify.py
@@ -13,12 +13,7 @@
import evals
import evals.record
from evals.base import ModelSpec
from evals.elsuite.utils import (
PromptFn,
format_necessary,
load_modelgraded_specs,
scrub_formatting_from_prompt,
)
from evals.elsuite.utils import PromptFn, format_necessary, scrub_formatting_from_prompt

INVALID_STR = "__invalid__"
CHOICE_KEY = "choice"
@@ -135,7 +130,7 @@ def __init__(
)

"""import prompt and set attributes"""
modelgraded_specs = load_modelgraded_specs(modelgraded_spec_file)
modelgraded_specs = self.registry.get_modelgraded_spec(modelgraded_spec_file)

# 'choice_strings' is a list of strings that specifies the possible choices
self.choice_strings = modelgraded_specs.pop("choice_strings")
@@ -211,6 +206,8 @@ def __init__(
), "completion_sample_templates must be specified if multicomp_n > 1"

# since we accept optional args, we need to check that all args are used
for key in ("key", "group"):
modelgraded_specs.pop(key, None)
assert not modelgraded_specs, f"Unused args: {modelgraded_specs}. Typo in YAML?"

def eval_sample(self, test_sample: dict, rng: Random) -> None:
9 changes: 0 additions & 9 deletions evals/elsuite/utils.py
@@ -1,21 +1,12 @@
import copy
import os
import re
import string
from collections import Counter, defaultdict

import yaml

from evals.api import sample_freeform
from evals.prompt.base import chat_prompt_to_text_prompt, is_chat_prompt


def load_modelgraded_specs(spec_file: str) -> str:
current_dir = os.path.dirname(os.path.abspath(__file__))
yaml_path = os.path.join(current_dir, "../registry/modelgraded", f"{spec_file}.yaml")
return yaml.load(open(yaml_path, "r"), Loader=yaml.FullLoader)


def get_answer(text, answer_prompt):
idx = text.rfind(answer_prompt)
if idx == -1:
9 changes: 6 additions & 3 deletions evals/eval.py
@@ -3,17 +3,18 @@
"""
import abc
import asyncio
import concurrent.futures
import logging
import os
import random
import concurrent.futures
from multiprocessing.pool import ThreadPool
from typing import Any, Awaitable, Callable, Dict, List, Tuple
from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple

from tqdm import tqdm

from .base import ModelSpec, ModelSpecs
from .record import Recorder, RecorderBase
from .record import RecorderBase
from .registry import Registry

logger = logging.getLogger(__name__)

@@ -53,6 +54,7 @@ def __init__(
model_specs: ModelSpecs,
seed: int = 20220722,
name: str = "no_name_eval.default",
registry: Optional[Registry] = None,
):
splits = name.split(".")
if len(splits) < 2:
@@ -61,6 +63,7 @@
self.model_specs = model_specs
self.seed = seed
self.name = name
self.registry = registry or Registry()

def eval_sample(self, sample: Any, rng: random.Random):
raise NotImplementedError()
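For downstream eval authors, the new constructor argument means `self.registry` is always available inside an eval (falling back to a fresh `Registry()` when none is passed), which is how `classify.py` above resolves its spec. A minimal illustrative subclass, assuming the base class defined in `evals/eval.py` is named `Eval`:

```python
import random
from typing import Any

from evals.eval import Eval


class SpecAwareEval(Eval):
    """Illustrative subclass; not part of this commit."""

    def __init__(self, modelgraded_spec_file: str, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # self.registry was set by Eval.__init__ (the passed registry, or Registry()).
        self.spec = self.registry.get_modelgraded_spec(modelgraded_spec_file)

    def eval_sample(self, sample: Any, rng: random.Random) -> None:
        # Grade `sample` using self.spec["prompt"], self.spec["choice_strings"], etc.
        ...
```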
16 changes: 16 additions & 0 deletions evals/registry.py
@@ -4,6 +4,7 @@
By convention, every eval name should start with {base_eval}.{split}.
"""

import difflib
import functools
import logging
import os
@@ -58,6 +59,13 @@ def get_alias():
except TypeError as e:
raise TypeError(f"Error while processing {object} {name}: {e}")

def get_modelgraded_spec(self, name: str) -> dict[str, Any]:
assert name in self._modelgraded_specs, (
f"Modelgraded spec {name} not found. "
f"Closest matches: {difflib.get_close_matches(name, self._modelgraded_specs.keys(), n=5)}"
)
return self._modelgraded_specs[name]

def get_eval(self, name: str) -> EvalSpec:
return self._dereference(name, self._evals, "eval", EvalSpec)

@@ -136,6 +144,10 @@ def _process_directory(self, registry, path):
self._process_file(registry, file)

def _load_registry(self, paths):
"""Load registry from a list of paths.
Each path or yaml specifies a dictionary of name -> spec.
"""
registry = {}
for path in paths:
logging.info(f"Loading registry from {path}")
@@ -154,5 +166,9 @@ def _eval_sets(self):
def _evals(self):
return self._load_registry([p / "evals" for p in self._registry_paths])

@functools.cached_property
def _modelgraded_specs(self):
return self._load_registry([p / "modelgraded" for p in self._registry_paths])


registry = Registry()
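One detail of the new lookup worth noting: unknown spec names fail with an assertion that suggests near misses via `difflib`. Roughly (the message format comes from the assert above; the typo is deliberate):

```python
from evals.registry import Registry

try:
    Registry().get_modelgraded_spec("factt")  # deliberate typo
except AssertionError as err:
    # Expected to read something like:
    #   Modelgraded spec factt not found. Closest matches: ['fact']
    print(err)
```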
41 changes: 21 additions & 20 deletions evals/registry/modelgraded/battle.yaml
@@ -1,24 +1,25 @@
prompt: |-
You are comparing two responses to the following two instructions.
battle:
prompt: |-
You are comparing two responses to the following two instructions.
[Instruction 1]
{input1}
[Response 1]
{completion1}
[Instruction 1]
{input1}
[Response 1]
{completion1}
[Instruction 2]
{input2}
[Response 2]
{completion2}
[Instruction 2]
{input2}
[Response 2]
{completion2}
Is the first response better than the second? You must provide one answer based on your subjective view.
choice_strings:
- "Yes"
- "No"
choice_scores:
"Yes": 1.0
"No": 0.0
input_outputs:
input1: completion1
input2: completion2
Is the first response better than the second? You must provide one answer based on your subjective view.
choice_strings:
- "Yes"
- "No"
choice_scores:
"Yes": 1.0
"No": 0.0
input_outputs:
input1: completion1
input2: completion2
21 changes: 11 additions & 10 deletions evals/registry/modelgraded/best.yaml
@@ -1,12 +1,13 @@
prompt: |-
Which of the following {n} texts is the best response to the following instruction?
best:
prompt: |-
Which of the following {n} texts is the best response to the following instruction?
Instruction: {input}
Instruction: {input}
Responses:
{completion}
completion_sample_templates:
completion: "{i}. {output}\n"
choice_strings: from_n
input_outputs:
input: completion
Responses:
{completion}
completion_sample_templates:
completion: "{i}. {output}\n"
choice_strings: from_n
input_outputs:
input: completion
51 changes: 26 additions & 25 deletions evals/registry/modelgraded/closedqa.yaml
@@ -1,26 +1,27 @@
prompt: |-
You are assessing a submitted answer on a given task based on a criterion. Here is the data:
[BEGIN DATA]
***
[Task]: {input}
***
[Submission]: {completion}
***
[Criterion]: {criteria}
***
[END DATA]
Does the submission meet the criterion? First, write out in a step by step manner your reasoning about the criterion to be sure that your conclusion is correct. Avoid simply stating the correct answers at the outset. Then print only the single character "Y" or "N" (without quotes or punctuation) on its own line corresponding to the correct answer. At the end, repeat just the letter again by itself on a new line.
closedqa:
prompt: |-
You are assessing a submitted answer on a given task based on a criterion. Here is the data:
[BEGIN DATA]
***
[Task]: {input}
***
[Submission]: {completion}
***
[Criterion]: {criteria}
***
[END DATA]
Does the submission meet the criterion? First, write out in a step by step manner your reasoning about the criterion to be sure that your conclusion is correct. Avoid simply stating the correct answers at the outset. Then print only the single character "Y" or "N" (without quotes or punctuation) on its own line corresponding to the correct answer. At the end, repeat just the letter again by itself on a new line.
Reasoning:
eval_type: cot_classify
choice_scores:
"Y": 1.0
"N": 0.0
choice_strings: 'YN'
args:
criteria:
relevance: "relevance: Is the submission referring to a real quote from the text?"
conciseness: "conciseness: Is the answer concise and to the point?"
correct: "correctness: Is the answer correct?"
input_outputs:
input: "completion"
Reasoning:
eval_type: cot_classify
choice_scores:
"Y": 1.0
"N": 0.0
choice_strings: 'YN'
args:
criteria:
relevance: "relevance: Is the submission referring to a real quote from the text?"
conciseness: "conciseness: Is the answer concise and to the point?"
correct: "correctness: Is the answer correct?"
input_outputs:
input: "completion"
27 changes: 14 additions & 13 deletions evals/registry/modelgraded/diversity.yaml
@@ -1,14 +1,15 @@
prompt: |-
Are the following {n} texts diverse?
diversity:
prompt: |-
Are the following {n} texts diverse?
{completion}
completion_sample_templates:
completion: "{i}. {output}\n"
choice_scores:
"Yes": 1.0
"No": 0.0
choice_strings:
- "Yes"
- "No"
input_outputs:
input: completion
{completion}
completion_sample_templates:
completion: "{i}. {output}\n"
choice_scores:
"Yes": 1.0
"No": 0.0
choice_strings:
- "Yes"
- "No"
input_outputs:
input: completion
43 changes: 22 additions & 21 deletions evals/registry/modelgraded/fact.yaml
@@ -1,22 +1,23 @@
prompt: |-
You are comparing a submitted answer to an expert answer on a given question. Here is the data:
[BEGIN DATA]
************
[Question]: {input}
************
[Expert]: {ideal}
************
[Submission]: {completion}
************
[END DATA]
fact:
prompt: |-
You are comparing a submitted answer to an expert answer on a given question. Here is the data:
[BEGIN DATA]
************
[Question]: {input}
************
[Expert]: {ideal}
************
[Submission]: {completion}
************
[END DATA]
Compare the factual content of the submitted answer with the expert answer. Ignore any differences in style, grammar, or punctuation.
The submitted answer may either be a subset or superset of the expert answer, or it may conflict with it. Determine which case applies. Answer the question by selecting one of the following options:
(A) The submitted answer is a subset of the expert answer and is fully consistent with it.
(B) The submitted answer is a superset of the expert answer and is fully consistent with it.
(C) The submitted answer contains all the same details as the expert answer.
(D) There is a disagreement between the submitted answer and the expert answer.
(E) The answers differ, but these differences don't matter from the perspective of factuality.
choice_strings: ABCDE
input_outputs:
input: completion
Compare the factual content of the submitted answer with the expert answer. Ignore any differences in style, grammar, or punctuation.
The submitted answer may either be a subset or superset of the expert answer, or it may conflict with it. Determine which case applies. Answer the question by selecting one of the following options:
(A) The submitted answer is a subset of the expert answer and is fully consistent with it.
(B) The submitted answer is a superset of the expert answer and is fully consistent with it.
(C) The submitted answer contains all the same details as the expert answer.
(D) There is a disagreement between the submitted answer and the expert answer.
(E) The answers differ, but these differences don't matter from the perspective of factuality.
choice_strings: ABCDE
input_outputs:
input: completion
