Skip to content

Commit

Permalink
[evals] added from_models option to multicomp_n (openai#407)
Browse files Browse the repository at this point in the history
  • Loading branch information
rlbayes committed Mar 23, 2023
1 parent 5a52a66 commit 882a2af
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 11 deletions.
22 changes: 13 additions & 9 deletions evals/elsuite/modelgraded/classify.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import string
from collections import Counter
from random import Random
from typing import Callable, Iterable, Optional
from typing import Callable, Iterable, Optional, Union

import openai

Expand Down Expand Up @@ -95,7 +95,7 @@ def __init__(
*args,
match_fn: str = "starts_or_endswith",
max_tokens: int = 1024,
multicomp_n: int = 1,
multicomp_n: Union[int, str] = 1,
multicomp_temperature: float = 0.4,
samples_renamings: Optional[dict[str, str]] = None,
eval_type: Optional[str] = None,
Expand All @@ -104,23 +104,27 @@ def __init__(
**kwargs,
):
super().__init__(model_specs, *args, **kwargs)
n_models = len(self.model_specs.completions)
self.max_tokens = max_tokens
self.samples_jsonl = samples_jsonl
self.match_fn = MATCH_FNS[match_fn]
self.metaeval = metaeval
self.multicomp_n = multicomp_n
if multicomp_n == "from_models":
assert n_models > 1, f"multicomp_n='from_models' but only 1 model is specified."
self.multicomp_n = n_models
else:
assert isinstance(
multicomp_n, int
), f"multicomp_n={multicomp_n} must be an int or 'from_models'."
self.multicomp_n = multicomp_n
self.multicomp_temperature = multicomp_temperature
self.samples_renamings = samples_renamings or {}

# check if multiple models are specified
if len(self.model_specs.completions) > 1:
assert self.multicomp_n == len(
self.model_specs.completions
assert (
self.multicomp_n == n_models
), f"multicomp_n={self.multicomp_n} must be equal to the number of models={len(self.model_specs.completions)} if multiple models are specified."
if self.multicomp_n > 1 and self.multicomp_temperature == 0:
logging.warning(
f"multicomp_temperature={self.multicomp_temperature} is 0 for {self.multicomp_n} model outputs. Specify multiple completion models, e.g. 'oaieval gpt-3.5-turbo,gpt-4 ...'?"
)

if self.model_spec.name == "dummy-completion" or self.model_spec.name == "dummy-chat":
self.eval_modelspec = self.model_spec
Expand Down
4 changes: 2 additions & 2 deletions evals/registry/evals/test-modelgraded.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ diversity.dev.v0:
multicomp_n: 4
multicomp_temperature: 0.4

# a simple modelgraded eval checking which of 2 completions to the sample prompt is the best response
# a simple modelgraded eval checking which of N completions to the sample prompt is the best response
# this example uses a labeled dataset, but ignore "completion" and "choice"
# command: `oaieval gpt-3.5-turbo,gpt-4 best`
best:
Expand All @@ -101,5 +101,5 @@ best.dev.v0:
samples_jsonl: test_metaeval/joke_fruits_labeled.jsonl
eval_type: cot_classify
modelgraded_spec_file: best
multicomp_n: 2
multicomp_n: from_models
multicomp_temperature: 0.0

0 comments on commit 882a2af

Please sign in to comment.