[evals] added model-graded model comparison example #373

Merged 1 commit on Mar 20, 2023
2 changes: 0 additions & 2 deletions evals/base.py
@@ -111,8 +111,6 @@ def ranking(self) -> ModelSpec:
def completion(self) -> ModelSpec:
if self.completions_ is None:
raise ValueError("Completion model was not specified")
if len(self.completions_) != 1:
raise ValueError("ModelSpecs.completion only works with a single completion model")
return self.completions_[0]

@property
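For context, the two deleted lines removed the restriction that ModelSpecs.completion only works when exactly one completion model is configured. Below is a minimal standalone sketch of the behavior after this change; it is not the real evals/base.py, and the completions property is an assumption that mirrors how classify.py uses self.model_specs.completions further down.

from dataclasses import dataclass
from typing import Optional, Sequence


@dataclass
class ModelSpec:
    name: str


class ModelSpecs:
    def __init__(self, completions: Optional[Sequence[ModelSpec]] = None):
        self.completions_ = list(completions) if completions else None

    @property
    def completions(self) -> Sequence[ModelSpec]:
        if self.completions_ is None:
            raise ValueError("Completion model was not specified")
        return self.completions_

    @property
    def completion(self) -> ModelSpec:
        if self.completions_ is None:
            raise ValueError("Completion model was not specified")
        # the single-model assertion was removed: callers that only need one
        # spec now get the first of possibly several completion models
        return self.completions_[0]


specs = ModelSpecs(completions=[ModelSpec("gpt-3.5-turbo"), ModelSpec("gpt-4")])
print(specs.completion.name)                 # gpt-3.5-turbo
print([m.name for m in specs.completions])   # ['gpt-3.5-turbo', 'gpt-4']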
20 changes: 19 additions & 1 deletion evals/elsuite/modelgraded/classify.py
@@ -117,6 +117,16 @@ def __init__(
self.multicomp_temperature = multicomp_temperature
self.samples_renamings = samples_renamings or {}

# check if multiple models are specified
if len(self.model_specs.completions) > 1:
assert self.multicomp_n == len(
self.model_specs.completions
), f"multicomp_n={self.multicomp_n} must be equal to the number of models={len(self.model_specs.completions)} if multiple models are specified."
if self.multicomp_n > 1 and self.multicomp_temperature == 0:
logging.warning(
f"multicomp_temperature={self.multicomp_temperature} is 0 for {self.multicomp_n} model outputs. Specify multiple completion models, e.g. 'oaieval gpt-3.5-turbo,gpt-4 ...'?"
)

if self.model_spec.name == "dummy-completion" or self.model_spec.name == "dummy-chat":
self.eval_modelspec = self.model_spec
else:
@@ -129,6 +139,8 @@ def __init__(

# 'choice_strings' is a list of strings that specifies the possible choices
self.choice_strings = modelgraded_specs.pop("choice_strings")
if self.choice_strings == "from_n":
self.choice_strings = [str(i + 1) for i in range(self.multicomp_n)]
# make sure each choice doesn't contain any punctuation
for s in self.choice_strings:
assert not any(c in s for c in string.punctuation), f"{s} contains punctuation"
@@ -230,9 +242,15 @@ def eval_sample(self, test_sample: dict, rng: Random) -> None:
completion = ""
completion_i_template = self.completion_sample_templates[v]
for i in range(self.multicomp_n):
if len(self.model_specs.completions) > 1:
# use a separate model for each completion
model_spec = self.model_specs.completions[i]
else:
# use the single model for all completions
model_spec = self.model_spec
get_input_completion = PromptFn(
test_sample[k],
model_spec=self.model_spec,
model_spec=model_spec,
max_tokens=self.max_tokens,
temperature=self.multicomp_temperature,
)
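Taken together, the classify.py changes pick one model per completion when several completion models are configured (e.g. run with `oaieval gpt-3.5-turbo,gpt-4 best`), and reuse the single configured model for all multicomp_n completions otherwise. A standalone sketch of just that selection logic follows; PromptFn and the surrounding eval loop are not reproduced here, and the function name is invented for illustration.

from typing import List


def pick_completion_models(completion_models: List[str], multicomp_n: int) -> List[str]:
    """Return the model used for each of the multicomp_n completions (sketch)."""
    if len(completion_models) > 1:
        # one model per completion; the counts must line up, as asserted in __init__
        assert multicomp_n == len(completion_models), (
            f"multicomp_n={multicomp_n} must equal the number of models"
        )
        return list(completion_models)
    # a single model produces every completion; variety comes from temperature
    return completion_models * multicomp_n


print(pick_completion_models(["gpt-3.5-turbo", "gpt-4"], 2))  # ['gpt-3.5-turbo', 'gpt-4']
print(pick_completion_models(["gpt-4"], 2))                   # ['gpt-4', 'gpt-4']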
15 changes: 15 additions & 0 deletions evals/registry/evals/test-modelgraded.yaml
@@ -88,3 +88,18 @@ diversity.dev.v0:
modelgraded_spec_file: diversity
multicomp_n: 4
multicomp_temperature: 0.4

# a simple modelgraded eval checking which of 2 completions to the sample prompt is the best response
# this example uses a labeled dataset, but ignores "completion" and "choice"
# command: `oaieval gpt-3.5-turbo,gpt-4 best`
best:
id: best.dev.v0
metrics: [accuracy]
best.dev.v0:
class: evals.elsuite.modelgraded.classify:ModelBasedClassify
args:
samples_jsonl: test_metaeval/joke_fruits_labeled.jsonl
eval_type: cot_classify
modelgraded_spec_file: best
multicomp_n: 2
multicomp_temperature: 0.0
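For reference, a hypothetical row of the labeled dataset this config points at. The real contents of test_metaeval/joke_fruits_labeled.jsonl may differ; "completion" and "choice" appear here because the file is labeled, but this eval ignores them and only uses "input".

import json

# hypothetical JSONL row, one object per line in samples_jsonl
sample = {
    "input": [{"role": "user", "content": "Tell me a joke about apples."}],
    "completion": "Why did the apple stop in the road? It ran out of juice.",
    "choice": "Yes",
}
print(json.dumps(sample))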
12 changes: 12 additions & 0 deletions evals/registry/modelgraded/best.yaml
@@ -0,0 +1,12 @@
prompt: |-
Which of the following {n} texts is the best response to the following instruction?

Instruction: {input}

Responses:
{completion}
completion_sample_templates:
completion: "{i}. {output}\n"
choice_strings: from_n
input_outputs:
input: completion
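To make the templating concrete, a rough sketch of how the grading prompt could be assembled for multicomp_n: 2. The real assembly happens in classify.py/PromptFn, which this diff only partially shows, and the instruction and response texts below are invented.

prompt_template = (
    "Which of the following {n} texts is the best response to the following instruction?\n\n"
    "Instruction: {input}\n\n"
    "Responses:\n{completion}"
)
completion_sample_template = "{i}. {output}\n"

# one output per completion model, e.g. gpt-3.5-turbo then gpt-4
outputs = ["An apple joke from the first model.", "An apple joke from the second model."]
completion = "".join(
    completion_sample_template.format(i=i + 1, output=out) for i, out in enumerate(outputs)
)
print(prompt_template.format(n=len(outputs), input="Tell me a joke about apples.", completion=completion))
# with `choice_strings: from_n`, the grader is then asked to answer "1" or "2"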