[evals] added model-graded model comparison example (openai#373)
- added model-graded model comparison example
- example: oaieval gpt-3.5-turbo,gpt-4 best
rlbayes committed Mar 20, 2023
1 parent 3747371 commit 1475a9e
Showing 4 changed files with 46 additions and 3 deletions.
evals/base.py (2 changes: 0 additions & 2 deletions)

```diff
@@ -111,8 +111,6 @@ def ranking(self) -> ModelSpec:
     def completion(self) -> ModelSpec:
         if self.completions_ is None:
             raise ValueError("Completion model was not specified")
-        if len(self.completions_) != 1:
-            raise ValueError("ModelSpecs.completion only works with a single completion model")
         return self.completions_[0]
 
     @property
```
evals/elsuite/modelgraded/classify.py (20 changes: 19 additions & 1 deletion)

```diff
@@ -117,6 +117,16 @@ def __init__(
         self.multicomp_temperature = multicomp_temperature
         self.samples_renamings = samples_renamings or {}
 
+        # check if multiple models are specified
+        if len(self.model_specs.completions) > 1:
+            assert self.multicomp_n == len(
+                self.model_specs.completions
+            ), f"multicomp_n={self.multicomp_n} must be equal to the number of models={len(self.model_specs.completions)} if multiple models are specified."
+        if self.multicomp_n > 1 and self.multicomp_temperature == 0:
+            logging.warning(
+                f"multicomp_temperature={self.multicomp_temperature} is 0 for {self.multicomp_n} model outputs. Specify multiple completion models, e.g. 'oaieval gpt-3.5-turbo,gpt-4 ...'?"
+            )
+
         if self.model_spec.name == "dummy-completion" or self.model_spec.name == "dummy-chat":
             self.eval_modelspec = self.model_spec
         else:
```
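For orientation, a minimal standalone sketch of what this validation does; the names here are illustrative stand-ins, not the repo's classes:

```python
import logging

# Illustrative values, as if invoked with `oaieval gpt-3.5-turbo,gpt-4 best`.
completions = ["gpt-3.5-turbo", "gpt-4"]  # completion models under comparison
multicomp_n = 2                           # completions graded per sample
multicomp_temperature = 0.0

# With several models, multicomp_n must equal their count (one completion each).
if len(completions) > 1:
    assert multicomp_n == len(completions), "multicomp_n must match model count"

# Temperature 0 with a single model would yield n identical samples, which is
# why the warning suggests a comma-separated model list instead.
if multicomp_n > 1 and multicomp_temperature == 0:
    logging.warning("multicomp_temperature=0 for %d model outputs", multicomp_n)
```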
```diff
@@ -129,6 +139,8 @@ def __init__(
 
         # 'choice_strings' is a list of strings that specifies the possible choices
         self.choice_strings = modelgraded_specs.pop("choice_strings")
+        if self.choice_strings == "from_n":
+            self.choice_strings = [str(i + 1) for i in range(self.multicomp_n)]
         # make sure each choice doesn't contain any punctuation
         for s in self.choice_strings:
             assert not any(c in s for c in string.punctuation), f"{s} contains punctuation"
```
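A quick standalone illustration of the new `from_n` option, assuming `multicomp_n: 2`:

```python
multicomp_n = 2
choice_strings = "from_n"

# "from_n" expands to one numeric label per graded completion.
if choice_strings == "from_n":
    choice_strings = [str(i + 1) for i in range(multicomp_n)]

print(choice_strings)  # ['1', '2'] -- the grader answers with the winning index
```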
```diff
@@ -230,9 +242,15 @@ def eval_sample(self, test_sample: dict, rng: Random) -> None:
             completion = ""
             completion_i_template = self.completion_sample_templates[v]
             for i in range(self.multicomp_n):
+                if len(self.model_specs.completions) > 1:
+                    # use a separate model for each completion
+                    model_spec = self.model_specs.completions[i]
+                else:
+                    # use the single model for all completions
+                    model_spec = self.model_spec
                 get_input_completion = PromptFn(
                     test_sample[k],
-                    model_spec=self.model_spec,
+                    model_spec=model_spec,
                     max_tokens=self.max_tokens,
                     temperature=self.multicomp_temperature,
                 )
```
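A condensed sketch of the dispatch above, with `get_completion` as a hypothetical stand-in for the repo's `PromptFn` call:

```python
from typing import Callable, List

def sample_completions(
    prompt: str,
    completion_models: List[str],               # e.g. ["gpt-3.5-turbo", "gpt-4"]
    multicomp_n: int,
    get_completion: Callable[[str, str], str],  # (model, prompt) -> text; a stub
) -> List[str]:
    """Collect one completion per slot, one model per slot when several are given."""
    outputs = []
    for i in range(multicomp_n):
        if len(completion_models) > 1:
            model = completion_models[i]   # a separate model for each completion
        else:
            model = completion_models[0]   # the single model sampled n times
        outputs.append(get_completion(model, prompt))
    return outputs
```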
evals/registry/evals/test-modelgraded.yaml (15 changes: 15 additions & 0 deletions)

```diff
@@ -88,3 +88,18 @@ diversity.dev.v0:
     modelgraded_spec_file: diversity
     multicomp_n: 4
     multicomp_temperature: 0.4
+
+# a simple modelgraded eval checking which of 2 completions to the sample prompt is the best response
+# this example uses a labeled dataset, but ignores "completion" and "choice"
+# command: `oaieval gpt-3.5-turbo,gpt-4 best`
+best:
+  id: best.dev.v0
+  metrics: [accuracy]
+best.dev.v0:
+  class: evals.elsuite.modelgraded.classify:ModelBasedClassify
+  args:
+    samples_jsonl: test_metaeval/joke_fruits_labeled.jsonl
+    eval_type: cot_classify
+    modelgraded_spec_file: best
+    multicomp_n: 2
+    multicomp_temperature: 0.0
```
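The commit doesn't show the dataset itself; assuming the usual evals chat-format JSONL, a row of `joke_fruits_labeled.jsonl` might look roughly like the following (field values are hypothetical):

```python
import json

# Hypothetical row shape for test_metaeval/joke_fruits_labeled.jsonl; the real
# file may differ. Per the YAML comment, "completion" and "choice" are ignored
# by the `best` eval.
sample = {
    "input": [{"role": "user", "content": "Tell me a joke about fruit."}],
    "completion": "...",  # label consumed by other metaevals, unused by `best`
    "choice": "...",      # likewise unused by `best`
}
print(json.dumps(sample))
```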
evals/registry/modelgraded/best.yaml (12 changes: 12 additions & 0 deletions)

```diff
@@ -0,0 +1,12 @@
+prompt: |-
+  Which of the following {n} texts is the best response to the following instruction?
+
+  Instruction: {input}
+
+  Responses:
+  {completion}
+completion_sample_templates:
+  completion: "{i}. {output}\n"
+choice_strings: from_n
+input_outputs:
+  input: completion
```

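To make the templating concrete, here is a hand-rendered sketch of the grading prompt the `best` spec would produce for two completions (all values illustrative):

```python
# Illustrative inputs; real completions come from the two models being compared.
instruction = "Tell me a joke about fruit."
completions = ["Why did the apple stop? It ran out of juice.", "Bananas."]

# completion_sample_templates -> completion: "{i}. {output}\n"
numbered = "".join(f"{i + 1}. {out}\n" for i, out in enumerate(completions))

prompt = (
    f"Which of the following {len(completions)} texts is the best response "
    f"to the following instruction?\n\n"
    f"Instruction: {instruction}\n\n"
    f"Responses:\n{numbered}"
)
print(prompt)  # the grader then answers with one of the choice strings: "1" or "2"
```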