[evals] added model-graded model comparison example (openai#373)
- added model-graded model comparison example
- example: oaieval gpt-3.5-turbo,gpt-4 best
rlbayes committed Mar 20, 2023
1 parent 3747371 commit 1475a9e
Showing 4 changed files with 46 additions and 3 deletions.
evals/base.py (2 changes: 0 additions & 2 deletions)

```diff
@@ -111,8 +111,6 @@ def ranking(self) -> ModelSpec:
     def completion(self) -> ModelSpec:
         if self.completions_ is None:
             raise ValueError("Completion model was not specified")
-        if len(self.completions_) != 1:
-            raise ValueError("ModelSpecs.completion only works with a single completion model")
         return self.completions_[0]
 
     @property
```
evals/elsuite/modelgraded/classify.py (20 changes: 19 additions & 1 deletion)

```diff
@@ -117,6 +117,16 @@ def __init__(
         self.multicomp_temperature = multicomp_temperature
         self.samples_renamings = samples_renamings or {}
 
+        # check if multiple models are specified
+        if len(self.model_specs.completions) > 1:
+            assert self.multicomp_n == len(
+                self.model_specs.completions
+            ), f"multicomp_n={self.multicomp_n} must be equal to the number of models={len(self.model_specs.completions)} if multiple models are specified."
+        if self.multicomp_n > 1 and self.multicomp_temperature == 0:
+            logging.warning(
+                f"multicomp_temperature={self.multicomp_temperature} is 0 for {self.multicomp_n} model outputs. Specify multiple completion models, e.g. 'oaieval gpt-3.5-turbo,gpt-4 ...'?"
+            )
+
         if self.model_spec.name == "dummy-completion" or self.model_spec.name == "dummy-chat":
             self.eval_modelspec = self.model_spec
         else:
```
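For orientation, a minimal standalone sketch of what this validation does; the names here are illustrative stand-ins, not the repo's classes:

```python
import logging

# Illustrative values, as if invoked with `oaieval gpt-3.5-turbo,gpt-4 best`.
completions = ["gpt-3.5-turbo", "gpt-4"]  # completion models under comparison
multicomp_n = 2                           # completions graded per sample
multicomp_temperature = 0.0

# With several models, multicomp_n must equal their count (one completion each).
if len(completions) > 1:
    assert multicomp_n == len(completions), "multicomp_n must match model count"

# Temperature 0 with a single model would yield n identical samples, which is
# why the warning suggests a comma-separated model list instead.
if multicomp_n > 1 and multicomp_temperature == 0:
    logging.warning("multicomp_temperature=0 for %d model outputs", multicomp_n)
```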
```diff
@@ -129,6 +139,8 @@ def __init__(
 
         # 'choice_strings' is a list of strings that specifies the possible choices
         self.choice_strings = modelgraded_specs.pop("choice_strings")
+        if self.choice_strings == "from_n":
+            self.choice_strings = [str(i + 1) for i in range(self.multicomp_n)]
         # make sure each choice doesn't contain any punctuation
         for s in self.choice_strings:
             assert not any(c in s for c in string.punctuation), f"{s} contains punctuation"
```
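A quick standalone illustration of the new `from_n` option, assuming `multicomp_n: 2`:

```python
multicomp_n = 2
choice_strings = "from_n"

# "from_n" expands to one numeric label per graded completion.
if choice_strings == "from_n":
    choice_strings = [str(i + 1) for i in range(multicomp_n)]

print(choice_strings)  # ['1', '2'] -- the grader answers with the winning index
```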
```diff
@@ -230,9 +242,15 @@ def eval_sample(self, test_sample: dict, rng: Random) -> None:
             completion = ""
             completion_i_template = self.completion_sample_templates[v]
             for i in range(self.multicomp_n):
+                if len(self.model_specs.completions) > 1:
+                    # use a separate model for each completion
+                    model_spec = self.model_specs.completions[i]
+                else:
+                    # use the single model for all completions
+                    model_spec = self.model_spec
                 get_input_completion = PromptFn(
                     test_sample[k],
-                    model_spec=self.model_spec,
+                    model_spec=model_spec,
                     max_tokens=self.max_tokens,
                     temperature=self.multicomp_temperature,
                 )
```
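A condensed sketch of the dispatch above, with `get_completion` as a hypothetical stand-in for the repo's `PromptFn` call:

```python
from typing import Callable, List

def sample_completions(
    prompt: str,
    completion_models: List[str],               # e.g. ["gpt-3.5-turbo", "gpt-4"]
    multicomp_n: int,
    get_completion: Callable[[str, str], str],  # (model, prompt) -> text; a stub
) -> List[str]:
    """Collect one completion per slot, one model per slot when several are given."""
    outputs = []
    for i in range(multicomp_n):
        if len(completion_models) > 1:
            model = completion_models[i]   # a separate model for each completion
        else:
            model = completion_models[0]   # the single model sampled n times
        outputs.append(get_completion(model, prompt))
    return outputs
```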
evals/registry/evals/test-modelgraded.yaml (15 changes: 15 additions & 0 deletions)

```diff
@@ -88,3 +88,18 @@ diversity.dev.v0:
     modelgraded_spec_file: diversity
     multicomp_n: 4
     multicomp_temperature: 0.4
+
+# a simple modelgraded eval checking which of 2 completions to the sample prompt is the best response
+# this example uses a labeled dataset, but ignores "completion" and "choice"
+# command: `oaieval gpt-3.5-turbo,gpt-4 best`
+best:
+  id: best.dev.v0
+  metrics: [accuracy]
+best.dev.v0:
+  class: evals.elsuite.modelgraded.classify:ModelBasedClassify
+  args:
+    samples_jsonl: test_metaeval/joke_fruits_labeled.jsonl
+    eval_type: cot_classify
+    modelgraded_spec_file: best
+    multicomp_n: 2
+    multicomp_temperature: 0.0
```
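The commit doesn't show the dataset itself; assuming the usual evals chat-format JSONL, a row of `joke_fruits_labeled.jsonl` might look roughly like the following (field values are hypothetical):

```python
import json

# Hypothetical row shape for test_metaeval/joke_fruits_labeled.jsonl; the real
# file may differ. Per the YAML comment, "completion" and "choice" are ignored
# by the `best` eval.
sample = {
    "input": [{"role": "user", "content": "Tell me a joke about fruit."}],
    "completion": "...",  # label consumed by other metaevals, unused by `best`
    "choice": "...",      # likewise unused by `best`
}
print(json.dumps(sample))
```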
evals/registry/modelgraded/best.yaml (12 changes: 12 additions & 0 deletions)

```diff
@@ -0,0 +1,12 @@
+prompt: |-
+  Which of the following {n} texts is the best response to the following instruction?
+
+  Instruction: {input}
+
+  Responses:
+  {completion}
+completion_sample_templates:
+  completion: "{i}. {output}\n"
+choice_strings: from_n
+input_outputs:
+  input: completion
```

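To make the templating concrete, here is a hand-rendered sketch of the grading prompt the `best` spec would produce for two completions (all values illustrative):

```python
# Illustrative inputs; real completions come from the two models being compared.
instruction = "Tell me a joke about fruit."
completions = ["Why did the apple stop? It ran out of juice.", "Bananas."]

# completion_sample_templates -> completion: "{i}. {output}\n"
numbered = "".join(f"{i + 1}. {out}\n" for i, out in enumerate(completions))

prompt = (
    f"Which of the following {len(completions)} texts is the best response "
    f"to the following instruction?\n\n"
    f"Instruction: {instruction}\n\n"
    f"Responses:\n{numbered}"
)
print(prompt)  # the grader then answers with one of the choice strings: "1" or "2"
```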