[evals] added format() to ModelGradedSpec (openai#597)
- 'in_message' and 'out_message' formatting for modelgraded evals
- factored out append_answer_prompt function
rlbayes committed Apr 6, 2023
1 parent b928cd4 commit f7ebbe8
Showing 7 changed files with 122 additions and 46 deletions.
88 changes: 78 additions & 10 deletions evals/elsuite/modelgraded/base.py
@@ -2,7 +2,8 @@
from typing import TYPE_CHECKING, Optional, Union

from evals.elsuite.modelgraded.classify_utils import ANSWER_PROMPTS, choice_to_str, expand_args_dict
from evals.prompt.base import OpenAICreateChatPrompt
from evals.elsuite.utils import format_prompt
from evals.prompt.base import OpenAICreateChatPrompt, is_chat_prompt

if TYPE_CHECKING:
from dataclasses import dataclass
@@ -14,12 +15,12 @@
class ModelGradedSpec:
prompt: Union[str, OpenAICreateChatPrompt]
choice_strings: Union[list[str], str]
eval_type: str
input_outputs: dict[str, str]

eval_type: Optional[str] = None
format_type: str = "in_message"
choice_scores: Optional[Union[dict[str, Union[float, int]], str]] = None
multicomp_n: Optional[int] = None
append_answer_prompt: bool = False
args: Optional[dict[str, dict[str, str]]] = None
expand_args_dict: Optional[dict[str, dict[str, tuple[str]]]] = None
completion_sample_templates: Optional[dict[str, str]] = None
@@ -45,13 +46,9 @@ def __post_init__(self):
if self.choice_scores == "from_strings":
self.choice_scores = {c: float(c) for c in self.choice_strings}

# 'prompt' is a string that specifies the model-graded evaluation
assert isinstance(self.prompt, str), f"prompt must be a string, not {type(self.prompt)}"
if self.append_answer_prompt:
self.prompt += "\n\n" + ANSWER_PROMPTS[self.eval_type].format(
choices=choice_to_str(self.choice_strings)
)
self.prompt = [{"role": "user", "content": self.prompt}]
if isinstance(self.prompt, str):
self.prompt = [{"role": "user", "content": self.prompt}]
assert is_chat_prompt(self.prompt)

# 'input_outputs' is a dict that specifies the input and output keys in the sample
# output key is the model's raw response to input key. These are used for filling 'prompt' template.
@@ -75,3 +72,74 @@ def __post_init__(self):
assert (
self.completion_sample_templates
), "completion_sample_templates must be specified if multicomp_n > 1"

def append_answer_prompt(
self,
eval_type: str,
append_type: str = "as_content",
prompt: Optional[OpenAICreateChatPrompt] = None,
):
"""Append answer prompt to prompt. Can only be called once."""
assert self.eval_type is None, f"eval_type already set: {eval_type}"
prompt = prompt or ANSWER_PROMPTS[eval_type]
prompt = format_prompt(prompt, choices=choice_to_str(self.choice_strings))
if append_type == "as_content":
assert isinstance(prompt, str), f"prompt must be str, not {type(prompt)}"
self.prompt[-1]["content"] += "\n\n" + prompt
elif append_type == "as_message":
assert is_chat_prompt(prompt), f"prompt must be chat prompt, not {prompt}"
self.prompt += prompt
else:
raise ValueError(f"append_type must be 'as_content' or 'as_message', not {append_type}")
self.eval_type = eval_type

def format(self, **kwargs: dict[str, OpenAICreateChatPrompt]) -> OpenAICreateChatPrompt:
"""Return an OpenAICreateChatPrompt that can be passed PromptFn for modelgraded eval.
'in_message' returns: [
{
"role": "user",
"content": \"""
User: {input}
Assistant: {completion}
Was the assistant response helpful?
\""".strip(),
}
]
'out_message' returns: [
{"role": "user", "content": "{input}"},
{"role": "assistant", "content": "{completion}"},
{"role": "user", "content": "Was the last assistant response helpful?"},
]
"""
if self.format_type == "in_message":
return format_prompt(self.prompt, **kwargs)
elif self.format_type == "out_message":
assert len(self.input_outputs) == 1, "out_message only supports one input/output pair"
            # extract input-output data, as they are treated specially
input_completions = {
k: (k, kwargs[k], v, kwargs[v]) for k, v in self.input_outputs.items()
}
kwargs = {
k: v
for k, v in kwargs.items()
if k not in self.input_outputs.values() and k not in self.input_outputs
}
convo = []
for input_key, input, completion_key, completion in input_completions.values():
del input_key, completion_key
assert isinstance(
completion, str
), f"completion must be str, not {type(completion)}"
if is_chat_prompt(input):
convo += input
else:
convo.append({"role": "user", "content": input})
convo.append({"role": "assistant", "content": completion})
return convo + format_prompt(self.prompt, **kwargs)
else:
raise ValueError(
f"format_type must be 'in_message' or 'out_message', not {self.format_type}"
)
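
A minimal usage sketch of the reworked spec, pieced together from the diff above; the prompt string, sample text, and choice set are illustrative placeholders rather than values from the repository:

from evals.elsuite.modelgraded.base import ModelGradedSpec

spec = ModelGradedSpec(
    prompt="User: {input}\nAssistant: {completion}\nWas the assistant response helpful?",
    choice_strings=["Yes", "No"],
    input_outputs={"input": "completion"},  # sample key -> completion key used to fill the template
    multicomp_n=1,  # classify.py always passes this via spec_kwargs
)

# The answer prompt is now appended explicitly, replacing the old append_answer_prompt=True flag.
spec.append_answer_prompt("cot_classify")

# With the default format_type="in_message", this fills the template and returns a one-message
# chat prompt; with format_type="out_message", the input/completion pair is replayed as chat turns
# and the grading question is sent as a final user message.
messages = spec.format(input="Tell me a joke.", completion="Why did the chicken cross the road?")
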
9 changes: 4 additions & 5 deletions evals/elsuite/modelgraded/classify.py
@@ -19,7 +19,7 @@
concat_n_completions,
get_choice,
)
from evals.elsuite.utils import PromptFn, format_prompt, scrub_formatting_from_prompt
from evals.elsuite.utils import PromptFn, scrub_formatting_from_prompt


class ModelBasedClassify(evals.Eval):
@@ -72,14 +72,13 @@ def __init__(
self.eval_modelspec = ModelSpec(name=eval_model, model=eval_model, is_chat=True)

spec_kwargs = {"multicomp_n": self.multicomp_n}
if eval_type:
spec_kwargs["eval_type"] = eval_type
spec_kwargs["append_answer_prompt"] = True # append answer prompt to prompt
if modelgraded_spec_args:
spec_kwargs["args"] = modelgraded_spec_args
self.mg: ModelGradedSpec = self.registry.get_modelgraded_spec(
modelgraded_spec, **spec_kwargs
)
if eval_type:
self.mg.append_answer_prompt(eval_type)

def eval_sample(self, test_sample: dict, rng: Random) -> None:
"""Evaluate a single sample.
@@ -148,7 +147,7 @@ def eval_sample(self, test_sample: dict, rng: Random) -> None:
args_dict = {CHOICE_KEY: {}}
for metric, args in args_dict.items():
args = {k: v[1] for k, v in args.items()}
prompt = format_prompt(self.mg.prompt, **args, **completions, **test_sample)
prompt = self.mg.format(**args, **completions, **test_sample)
evaluate = PromptFn(
prompt,
model_spec=self.eval_modelspec,
3 changes: 3 additions & 0 deletions evals/registry/data/test_modelgraded/joke_fruits.jsonl
Git LFS file not shown
4 changes: 2 additions & 2 deletions evals/registry/eval_sets/test-all.yaml
@@ -9,9 +9,9 @@ test:
- coqa-closedqa
- coqa-closedqa-correct
- logic-fact
- joke-animals
- joke-animals-likert
- joke-fruits
- joke-fruits-v2
- joke-fruits-likert
- joke-fruits-meta
- joke-fruits-expl-meta
- diversity
4 changes: 2 additions & 2 deletions evals/registry/eval_sets/test-modelgraded.yaml
@@ -1,9 +1,9 @@
test-modelgraded:
evals:
- logic-fact
- joke-animals
- joke-animals-likert
- joke-fruits
- joke-fruits-v2
- joke-fruits-likert
- joke-fruits-meta
- joke-fruits-expl-meta
- joke-fruits-ans-meta
45 changes: 18 additions & 27 deletions evals/registry/evals/test-modelgraded.yaml
@@ -1,42 +1,35 @@
# a simple modelgraded eval checking if a completion is funny or not
joke-animals:
id: joke-animals.dev.v0
joke-fruits:
id: joke-fruits.dev.v0
metrics: [accuracy]
joke-animals.dev.v0:
joke-fruits.dev.v0:
class: evals.elsuite.modelgraded.classify:ModelBasedClassify
args:
samples_jsonl: test_multiio/battles/joke_animals_vs_fruits.jsonl
samples_renamings:
input1: "input"
completion1: "completion"
samples_jsonl: test_modelgraded/joke_fruits.jsonl
eval_type: cot_classify
modelgraded_spec: humor

# (same eval as above, but with likert scale of 1-5)
joke-animals-likert:
id: joke-animals-likert.dev.v0
# (same eval as above, but with format_type="out_message")
joke-fruits-v2:
id: joke-fruits-v2.dev.v0
metrics: [accuracy]
joke-animals-likert.dev.v0:
joke-fruits-v2.dev.v0:
class: evals.elsuite.modelgraded.classify:ModelBasedClassify
args:
samples_jsonl: test_multiio/battles/joke_animals_vs_fruits.jsonl
samples_renamings:
input1: "input"
completion1: "completion"
samples_jsonl: test_modelgraded/joke_fruits.jsonl
eval_type: cot_classify
modelgraded_spec: humor_likert
modelgraded_spec: humor_out_message

# a simple modelgraded eval checking if a completion is funny or not
# this example uses a labeled dataset, but ignores "completion" and "choice"
joke-fruits:
id: joke-fruits.dev.v0
# (same eval as above, but with likert scale of 1-5)
joke-fruits-likert:
id: joke-fruits-likert.dev.v0
metrics: [accuracy]
joke-fruits.dev.v0:
joke-fruits-likert.dev.v0:
class: evals.elsuite.modelgraded.classify:ModelBasedClassify
args:
samples_jsonl: test_metaeval/joke_fruits_labeled.jsonl
samples_jsonl: test_modelgraded/joke_fruits.jsonl
eval_type: cot_classify
modelgraded_spec: humor
modelgraded_spec: humor_likert

# a meta-evaluation of a modelgraded eval checking if a completion is funny or not
# this example uses a labeled dataset with "completion" and "choice"
@@ -76,29 +69,27 @@ joke-fruits-ans-meta.dev.v0:
metaeval: true

# a simple modelgraded eval checking if 4 completions to the sample prompt are diverse
# this example uses a labeled dataset, but ignore "completion" and "choice"
diversity:
id: diversity.dev.v0
metrics: [accuracy]
diversity.dev.v0:
class: evals.elsuite.modelgraded.classify:ModelBasedClassify
args:
samples_jsonl: test_metaeval/joke_fruits_labeled.jsonl
samples_jsonl: test_modelgraded/joke_fruits.jsonl
eval_type: cot_classify
modelgraded_spec: diversity
multicomp_n: 4
multicomp_temperature: 0.4

# a simple modelgraded eval checking which of N completions to the sample prompt is the best response
# this example uses a labeled dataset, but ignore "completion" and "choice"
# command: `oaieval gpt-3.5-turbo,gpt-4 best`
best:
id: best.dev.v0
metrics: [accuracy]
best.dev.v0:
class: evals.elsuite.modelgraded.classify:ModelBasedClassify
args:
samples_jsonl: test_metaeval/joke_fruits_labeled.jsonl
samples_jsonl: test_modelgraded/joke_fruits.jsonl
eval_type: cot_classify
modelgraded_spec: best
multicomp_n: from_models
15 changes: 15 additions & 0 deletions evals/registry/modelgraded/humor.yaml
@@ -14,6 +14,21 @@ humor:
input_outputs:
input: completion

humor_out_message:
prompt: |-
Was the assistant's (your) last response above to the user funny?
choice_scores:
"Yes": 1.0
"No": 0.0
"Unsure": 0.5
choice_strings:
- "Yes"
- "No"
- "Unsure"
format_type: out_message
input_outputs:
input: completion

humor_likert:
prompt: |-
Is the following funny?
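
For concreteness: the joke-fruits-v2 eval above pairs this humor_out_message spec (format_type: out_message) with eval_type: cot_classify, so ModelGradedSpec.format() replays each graded sample as real chat turns and asks the humor question last. The sketch below shows the rough shape of the resulting grading prompt; the joke and completion are made-up placeholders, and the exact cot_classify answer-prompt wording comes from classify_utils.ANSWER_PROMPTS, which is not part of this diff:

# Approximate shape of the grading prompt built from humor_out_message (placeholder text marked).
grading_prompt = [
    {"role": "user", "content": "Tell me a joke about fruit."},  # the sample's "input" (placeholder)
    {"role": "assistant", "content": "Why did the banana go to the doctor? It wasn't peeling well."},  # the graded "completion" (placeholder)
    {
        "role": "user",
        "content": "Was the assistant's (your) last response above to the user funny?"
        "\n\n<cot_classify answer prompt listing the choices Yes, No, Unsure>",  # appended by append_answer_prompt()
    },
]
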
