[evals] added format() to ModelGradedSpec (openai#597)
- 'in_message' and 'out_message' formatting for modelgraded evals
- factored out append_answer_prompt function
rlbayes committed Apr 6, 2023
1 parent b928cd4 commit f7ebbe8
Showing 7 changed files with 122 additions and 46 deletions.
88 changes: 78 additions & 10 deletions evals/elsuite/modelgraded/base.py
@@ -2,7 +2,8 @@
from typing import TYPE_CHECKING, Optional, Union

from evals.elsuite.modelgraded.classify_utils import ANSWER_PROMPTS, choice_to_str, expand_args_dict
from evals.prompt.base import OpenAICreateChatPrompt
from evals.elsuite.utils import format_prompt
from evals.prompt.base import OpenAICreateChatPrompt, is_chat_prompt

if TYPE_CHECKING:
from dataclasses import dataclass
@@ -14,12 +15,12 @@
class ModelGradedSpec:
prompt: Union[str, OpenAICreateChatPrompt]
choice_strings: Union[list[str], str]
eval_type: str
input_outputs: dict[str, str]

eval_type: Optional[str] = None
format_type: str = "in_message"
choice_scores: Optional[Union[dict[str, Union[float, int]], str]] = None
multicomp_n: Optional[int] = None
append_answer_prompt: bool = False
args: Optional[dict[str, dict[str, str]]] = None
expand_args_dict: Optional[dict[str, dict[str, tuple[str]]]] = None
completion_sample_templates: Optional[dict[str, str]] = None
@@ -45,13 +46,9 @@ def __post_init__(self):
if self.choice_scores == "from_strings":
self.choice_scores = {c: float(c) for c in self.choice_strings}

# 'prompt' is a string that specifies the model-graded evaluation
assert isinstance(self.prompt, str), f"prompt must be a string, not {type(self.prompt)}"
if self.append_answer_prompt:
self.prompt += "\n\n" + ANSWER_PROMPTS[self.eval_type].format(
choices=choice_to_str(self.choice_strings)
)
self.prompt = [{"role": "user", "content": self.prompt}]
if isinstance(self.prompt, str):
self.prompt = [{"role": "user", "content": self.prompt}]
assert is_chat_prompt(self.prompt)

# 'input_outputs' is a dict that specifies the input and output keys in the sample
# output key is the model's raw response to input key. These are used for filling 'prompt' template.
@@ -75,3 +72,74 @@ def __post_init__(self):
assert (
self.completion_sample_templates
), "completion_sample_templates must be specified if multicomp_n > 1"

def append_answer_prompt(
self,
eval_type: str,
append_type: str = "as_content",
prompt: Optional[OpenAICreateChatPrompt] = None,
):
"""Append answer prompt to prompt. Can only be called once."""
assert self.eval_type is None, f"eval_type already set: {eval_type}"
prompt = prompt or ANSWER_PROMPTS[eval_type]
prompt = format_prompt(prompt, choices=choice_to_str(self.choice_strings))
if append_type == "as_content":
assert isinstance(prompt, str), f"prompt must be str, not {type(prompt)}"
self.prompt[-1]["content"] += "\n\n" + prompt
elif append_type == "as_message":
assert is_chat_prompt(prompt), f"prompt must be chat prompt, not {prompt}"
self.prompt += prompt
else:
raise ValueError(f"append_type must be 'as_content' or 'as_message', not {append_type}")
self.eval_type = eval_type

def format(self, **kwargs: dict[str, OpenAICreateChatPrompt]) -> OpenAICreateChatPrompt:
"""Return an OpenAICreateChatPrompt that can be passed PromptFn for modelgraded eval.
'in_message' returns: [
{
"role": "user",
"content": \"""
User: {input}
Assistant: {completion}
Was the assistant response helpful?
\""".strip(),
}
]
'out_message' returns: [
{"role": "user", "content": "{input}"},
{"role": "assistant", "content": "{completion}"},
{"role": "user", "content": "Was the last assistant response helpful?"},
]
"""
if self.format_type == "in_message":
return format_prompt(self.prompt, **kwargs)
elif self.format_type == "out_message":
assert len(self.input_outputs) == 1, "out_message only supports one input/output pair"
            # extract input-output data, as they are treated specially
input_completions = {
k: (k, kwargs[k], v, kwargs[v]) for k, v in self.input_outputs.items()
}
kwargs = {
k: v
for k, v in kwargs.items()
if k not in self.input_outputs.values() and k not in self.input_outputs
}
convo = []
for input_key, input, completion_key, completion in input_completions.values():
del input_key, completion_key
assert isinstance(
completion, str
), f"completion must be str, not {type(completion)}"
if is_chat_prompt(input):
convo += input
else:
convo.append({"role": "user", "content": input})
convo.append({"role": "assistant", "content": completion})
return convo + format_prompt(self.prompt, **kwargs)
else:
raise ValueError(
f"format_type must be 'in_message' or 'out_message', not {self.format_type}"
)
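
A minimal usage sketch of the reworked spec, pieced together from the diff above; the prompt string, sample text, and choice set are illustrative placeholders rather than values from the repository:

from evals.elsuite.modelgraded.base import ModelGradedSpec

spec = ModelGradedSpec(
    prompt="User: {input}\nAssistant: {completion}\nWas the assistant response helpful?",
    choice_strings=["Yes", "No"],
    input_outputs={"input": "completion"},  # sample key -> completion key used to fill the template
    multicomp_n=1,  # classify.py always passes this via spec_kwargs
)

# The answer prompt is now appended explicitly, replacing the old append_answer_prompt=True flag.
spec.append_answer_prompt("cot_classify")

# With the default format_type="in_message", this fills the template and returns a one-message
# chat prompt; with format_type="out_message", the input/completion pair is replayed as chat turns
# and the grading question is sent as a final user message.
messages = spec.format(input="Tell me a joke.", completion="Why did the chicken cross the road?")
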
9 changes: 4 additions & 5 deletions evals/elsuite/modelgraded/classify.py
@@ -19,7 +19,7 @@
concat_n_completions,
get_choice,
)
from evals.elsuite.utils import PromptFn, format_prompt, scrub_formatting_from_prompt
from evals.elsuite.utils import PromptFn, scrub_formatting_from_prompt


class ModelBasedClassify(evals.Eval):
@@ -72,14 +72,13 @@ def __init__(
self.eval_modelspec = ModelSpec(name=eval_model, model=eval_model, is_chat=True)

spec_kwargs = {"multicomp_n": self.multicomp_n}
if eval_type:
spec_kwargs["eval_type"] = eval_type
spec_kwargs["append_answer_prompt"] = True # append answer prompt to prompt
if modelgraded_spec_args:
spec_kwargs["args"] = modelgraded_spec_args
self.mg: ModelGradedSpec = self.registry.get_modelgraded_spec(
modelgraded_spec, **spec_kwargs
)
if eval_type:
self.mg.append_answer_prompt(eval_type)

def eval_sample(self, test_sample: dict, rng: Random) -> None:
"""Evaluate a single sample.
@@ -148,7 +147,7 @@ def eval_sample(self, test_sample: dict, rng: Random) -> None:
args_dict = {CHOICE_KEY: {}}
for metric, args in args_dict.items():
args = {k: v[1] for k, v in args.items()}
prompt = format_prompt(self.mg.prompt, **args, **completions, **test_sample)
prompt = self.mg.format(**args, **completions, **test_sample)
evaluate = PromptFn(
prompt,
model_spec=self.eval_modelspec,
3 changes: 3 additions & 0 deletions evals/registry/data/test_modelgraded/joke_fruits.jsonl
Git LFS file not shown
4 changes: 2 additions & 2 deletions evals/registry/eval_sets/test-all.yaml
@@ -9,9 +9,9 @@ test:
- coqa-closedqa
- coqa-closedqa-correct
- logic-fact
- joke-animals
- joke-animals-likert
- joke-fruits
- joke-fruits-v2
- joke-fruits-likert
- joke-fruits-meta
- joke-fruits-expl-meta
- diversity
4 changes: 2 additions & 2 deletions evals/registry/eval_sets/test-modelgraded.yaml
@@ -1,9 +1,9 @@
test-modelgraded:
evals:
- logic-fact
- joke-animals
- joke-animals-likert
- joke-fruits
- joke-fruits-v2
- joke-fruits-likert
- joke-fruits-meta
- joke-fruits-expl-meta
- joke-fruits-ans-meta
45 changes: 18 additions & 27 deletions evals/registry/evals/test-modelgraded.yaml
@@ -1,42 +1,35 @@
# a simple modelgraded eval checking if a completion is funny or not
joke-animals:
id: joke-animals.dev.v0
joke-fruits:
id: joke-fruits.dev.v0
metrics: [accuracy]
joke-animals.dev.v0:
joke-fruits.dev.v0:
class: evals.elsuite.modelgraded.classify:ModelBasedClassify
args:
samples_jsonl: test_multiio/battles/joke_animals_vs_fruits.jsonl
samples_renamings:
input1: "input"
completion1: "completion"
samples_jsonl: test_modelgraded/joke_fruits.jsonl
eval_type: cot_classify
modelgraded_spec: humor

# (same eval as above, but with likert scale of 1-5)
joke-animals-likert:
id: joke-animals-likert.dev.v0
# (same eval as above, but with format_type="out_message")
joke-fruits-v2:
id: joke-fruits-v2.dev.v0
metrics: [accuracy]
joke-animals-likert.dev.v0:
joke-fruits-v2.dev.v0:
class: evals.elsuite.modelgraded.classify:ModelBasedClassify
args:
samples_jsonl: test_multiio/battles/joke_animals_vs_fruits.jsonl
samples_renamings:
input1: "input"
completion1: "completion"
samples_jsonl: test_modelgraded/joke_fruits.jsonl
eval_type: cot_classify
modelgraded_spec: humor_likert
modelgraded_spec: humor_out_message

# a simple modelgraded eval checking if a completion is funny or not
# this example uses a labeled dataset, but ignores "completion" and "choice"
joke-fruits:
id: joke-fruits.dev.v0
# (same eval as above, but with likert scale of 1-5)
joke-fruits-likert:
id: joke-fruits-likert.dev.v0
metrics: [accuracy]
joke-fruits.dev.v0:
joke-fruits-likert.dev.v0:
class: evals.elsuite.modelgraded.classify:ModelBasedClassify
args:
samples_jsonl: test_metaeval/joke_fruits_labeled.jsonl
samples_jsonl: test_modelgraded/joke_fruits.jsonl
eval_type: cot_classify
modelgraded_spec: humor
modelgraded_spec: humor_likert

# a meta-evaluation of a modelgraded eval checking if a completion is funny or not
# this example uses a labeled dataset with "completion" and "choice"
@@ -76,29 +69,27 @@ joke-fruits-ans-meta.dev.v0:
metaeval: true

# a simple modelgraded eval checking if 4 completions to the sample prompt are diverse
# this example uses a labeled dataset, but ignore "completion" and "choice"
diversity:
id: diversity.dev.v0
metrics: [accuracy]
diversity.dev.v0:
class: evals.elsuite.modelgraded.classify:ModelBasedClassify
args:
samples_jsonl: test_metaeval/joke_fruits_labeled.jsonl
samples_jsonl: test_modelgraded/joke_fruits.jsonl
eval_type: cot_classify
modelgraded_spec: diversity
multicomp_n: 4
multicomp_temperature: 0.4

# a simple modelgraded eval checking which of N completions to the sample prompt is the best response
# this example uses a labeled dataset, but ignore "completion" and "choice"
# command: `oaieval gpt-3.5-turbo,gpt-4 best`
best:
id: best.dev.v0
metrics: [accuracy]
best.dev.v0:
class: evals.elsuite.modelgraded.classify:ModelBasedClassify
args:
samples_jsonl: test_metaeval/joke_fruits_labeled.jsonl
samples_jsonl: test_modelgraded/joke_fruits.jsonl
eval_type: cot_classify
modelgraded_spec: best
multicomp_n: from_models
15 changes: 15 additions & 0 deletions evals/registry/modelgraded/humor.yaml
@@ -14,6 +14,21 @@ humor:
input_outputs:
input: completion

humor_out_message:
prompt: |-
Was the assistant's (your) last response above to the user funny?
choice_scores:
"Yes": 1.0
"No": 0.0
"Unsure": 0.5
choice_strings:
- "Yes"
- "No"
- "Unsure"
format_type: out_message
input_outputs:
input: completion

humor_likert:
prompt: |-
Is the following funny?
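
For concreteness: the joke-fruits-v2 eval above pairs this humor_out_message spec (format_type: out_message) with eval_type: cot_classify, so ModelGradedSpec.format() replays each graded sample as real chat turns and asks the humor question last. The sketch below shows the rough shape of the resulting grading prompt; the joke and completion are made-up placeholders, and the exact cot_classify answer-prompt wording comes from classify_utils.ANSWER_PROMPTS, which is not part of this diff:

# Approximate shape of the grading prompt built from humor_out_message (placeholder text marked).
grading_prompt = [
    {"role": "user", "content": "Tell me a joke about fruit."},  # the sample's "input" (placeholder)
    {"role": "assistant", "content": "Why did the banana go to the doctor? It wasn't peeling well."},  # the graded "completion" (placeholder)
    {
        "role": "user",
        "content": "Was the assistant's (your) last response above to the user funny?"
        "\n\n<cot_classify answer prompt listing the choices Yes, No, Unsure>",  # appended by append_answer_prompt()
    },
]
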
