[evals] minor fixes to modelgraded (openai#460)
- renamed modelgraded_spec_file -> modelgraded_spec
- if the modelgraded_spec_args expansion has len == 1, simply use the default CHOICE_KEY (see the sketch below)
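
A minimal sketch of that second change, for orientation only — this is not the library code; the value CHOICE_KEY = "choice" and the key shape of the expanded args dict are assumptions:

CHOICE_KEY = "choice"  # assumed default metric name

def select_args_dict(expanded_args_dict: dict) -> dict:
    # Mirrors the new branching added to ModelBasedClassify.eval_sample below.
    if expanded_args_dict and len(expanded_args_dict) > 1:
        # several expanded combinations: keep one metric per combination name
        return expanded_args_dict
    elif expanded_args_dict and len(expanded_args_dict) == 1:
        # a single combination: skip the combination-specific metric name
        return {CHOICE_KEY: v for v in expanded_args_dict.values()}
    else:
        # no modelgraded_spec_args at all
        return {CHOICE_KEY: {}}

# Hypothetical example: a one-entry expansion collapses to the default key.
select_args_dict({"criteria=correct": {"criteria": "Is the answer correct?"}})
# -> {"choice": {"criteria": "Is the answer correct?"}}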
rlbayes authored Mar 27, 2023
1 parent 81b959c commit 93ca09d
Showing 8 changed files with 31 additions and 31 deletions.
17 changes: 9 additions & 8 deletions evals/elsuite/modelgraded/classify.py
@@ -91,7 +91,7 @@ def __init__(
         self,
         model_specs: evals.ModelSpecs,
         samples_jsonl: str,
-        modelgraded_spec_file: str,
+        modelgraded_spec: str,
         *args,
         match_fn: str = "starts_or_endswith",
         max_tokens: int = 1024,
@@ -134,7 +134,7 @@ def __init__(
         )

         """import prompt and set attributes"""
-        modelgraded_specs = self.registry.get_modelgraded_spec(modelgraded_spec_file)
+        modelgraded_specs = self.registry.get_modelgraded_spec(modelgraded_spec)

         # 'choice_strings' is a list of strings that specifies the possible choices
         self.choice_strings = modelgraded_specs.pop("choice_strings")
@@ -165,14 +165,12 @@ def __init__(
         self.eval_type = modelgraded_specs.pop("eval_type", None)
         if not self.eval_type:
             append_answer_prompt = True  # append answer prompt to prompt
-            assert (
-                eval_type
-            ), "eval_type must be specified, in modelgraded_spec_file or as an argument"
+            assert eval_type, "eval_type must be specified, in modelgraded_spec or as an argument"
             self.eval_type = eval_type
         else:
             assert (
                 not eval_type
-            ), f"eval_type must be unspecified, if it is specified in modelgraded_spec_file"
+            ), f"eval_type must be unspecified, if it is specified in modelgraded_spec"
             append_answer_prompt = False

         # 'prompt' is a string that specifies the model-graded evaluation
@@ -288,8 +286,11 @@ def eval_sample(self, test_sample: dict, rng: Random) -> None:
             max_tokens=self.max_tokens,
         )
         eval_kwargs = dict(**completions, **test_sample)
-        if self.expanded_args_dict:
+        if self.expanded_args_dict and len(self.expanded_args_dict) > 1:
             args_dict = self.expanded_args_dict
+        elif self.expanded_args_dict and len(self.expanded_args_dict) == 1:
+            # if there is only one combination, don't bother with the metric name
+            args_dict = {CHOICE_KEY: v for v in self.expanded_args_dict.values()}
         else:
             args_dict = {CHOICE_KEY: {}}
         for metric, args in args_dict.items():
@@ -322,7 +323,7 @@ def run(self, recorder):
         all_sample_metrics = recorder.get_metrics()

         record_metrics = {}
-        if self.expanded_args_dict:
+        if self.expanded_args_dict and len(self.expanded_args_dict) > 1:
             metrics = sorted(self.expanded_args_dict)
         else:
             metrics = [CHOICE_KEY]
8 changes: 4 additions & 4 deletions evals/registry/evals/coqa-ex.yaml
@@ -14,7 +14,7 @@ coqa-fact.dev.v0:
   args:
     samples_jsonl: coqa/samples.jsonl
     eval_type: cot_classify
-    modelgraded_spec_file: fact
+    modelgraded_spec: fact

 coqa-fact-expl:
   id: coqa-fact-expl.dev.v0
@@ -24,7 +24,7 @@ coqa-fact-expl.dev.v0:
   args:
     samples_jsonl: coqa/samples.jsonl
     eval_type: classify_cot
-    modelgraded_spec_file: fact
+    modelgraded_spec: fact

 coqa-closedqa:
   id: coqa-closedqa.dev.v0
@@ -33,7 +33,7 @@ coqa-closedqa.dev.v0:
   class: evals.elsuite.modelgraded.classify:ModelBasedClassify
   args:
     samples_jsonl: coqa/samples.jsonl
-    modelgraded_spec_file: closedqa
+    modelgraded_spec: closedqa

 # (same as above, but only evaluate criteria=correct)
 coqa-closedqa-correct:
@@ -43,7 +43,7 @@ coqa-closedqa-correct.dev.v0:
   class: evals.elsuite.modelgraded.classify:ModelBasedClassify
   args:
     samples_jsonl: coqa/samples.jsonl
-    modelgraded_spec_file: closedqa
+    modelgraded_spec: closedqa
     modelgraded_spec_args:
       criteria:
         correct: "correctness: Is the answer correct?"
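
Note: because the criteria block above expands to a single combination, the classify.py change in this commit records this eval's result under the default choice key rather than a combination-specific metric name. A rough, hypothetical illustration (the exact shape of the expanded dict is assumed, not taken from the source):

expanded = {"criteria=correct": {"criteria": "correctness: Is the answer correct?"}}  # assumed shape
args_dict = {"choice": v for v in expanded.values()} if len(expanded) == 1 else expanded
# args_dict == {"choice": {"criteria": "correctness: Is the answer correct?"}}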
2 changes: 1 addition & 1 deletion evals/registry/evals/logic.yaml
@@ -6,4 +6,4 @@ logic-fact.dev.v0:
   args:
     samples_jsonl: logic/samples.jsonl
     eval_type: cot_classify
-    modelgraded_spec_file: fact
+    modelgraded_spec: fact
8 changes: 4 additions & 4 deletions evals/registry/evals/test-modelgraded-battle.yaml
@@ -8,7 +8,7 @@ joke-animals-vs-fruits.dev.v0:
   args:
     samples_jsonl: test_multiio/battles/joke_animals_vs_fruits.jsonl
     eval_type: cot_classify
-    modelgraded_spec_file: battle
+    modelgraded_spec: battle

 rap-people-vs-people:
   id: rap-people-vs-people.dev.v0
@@ -18,7 +18,7 @@ rap-people-vs-people.dev.v0:
   args:
     samples_jsonl: test_multiio/battles/rap_people_vs_people.jsonl
     eval_type: cot_classify
-    modelgraded_spec_file: battle
+    modelgraded_spec: battle

 rap-animals-vs-fruits:
   id: rap-animals-vs-fruits.dev.v0
@@ -28,7 +28,7 @@ rap-animals-vs-fruits.dev.v0:
   args:
     samples_jsonl: test_multiio/battles/rap_animals_vs_fruits.jsonl
     eval_type: cot_classify
-    modelgraded_spec_file: battle
+    modelgraded_spec: battle

 rap-people-vs-fruits:
   id: rap-people-vs-fruits.dev.v0
@@ -38,5 +38,5 @@ rap-people-vs-fruits.dev.v0:
   args:
     samples_jsonl: test_multiio/battles/rap_people_vs_fruits.jsonl
     eval_type: cot_classify
-    modelgraded_spec_file: battle
+    modelgraded_spec: battle

2 changes: 1 addition & 1 deletion evals/registry/evals/test-modelgraded-generated.yaml
@@ -8,5 +8,5 @@ mg-humor-people_jp.dev.v0:
   args:
     samples_jsonl: test_modelgraded/humor_people_jp.jsonl
     eval_type: cot_classify_jp
-    modelgraded_spec_file: humor_jp
+    modelgraded_spec: humor_jp

16 changes: 8 additions & 8 deletions evals/registry/evals/test-modelgraded.yaml
@@ -10,7 +10,7 @@ joke-animals.dev.v0:
     input1: "input"
     completion1: "completion"
     eval_type: cot_classify
-    modelgraded_spec_file: humor
+    modelgraded_spec: humor

 # (same eval as above, but with likert scale of 1-5)
 joke-animals-likert:
@@ -24,7 +24,7 @@ joke-animals-likert.dev.v0:
     input1: "input"
     completion1: "completion"
     eval_type: cot_classify
-    modelgraded_spec_file: humor_likert
+    modelgraded_spec: humor_likert

 # a simple modelgraded eval checking if a completion is funny or not
 # this example uses a labeled dataset, but ignores "completion" and "choice"
@@ -36,7 +36,7 @@ joke-fruits.dev.v0:
   args:
     samples_jsonl: test_metaeval/joke_fruits_labeled.jsonl
     eval_type: cot_classify
-    modelgraded_spec_file: humor
+    modelgraded_spec: humor

 # a meta-evaluation of a modelgraded eval checking if a completion is funny or not
 # this example uses a labeled dataset with "completion" and "choice"
@@ -48,7 +48,7 @@ joke-fruits-meta.dev.v0:
   args:
     samples_jsonl: test_metaeval/joke_fruits_labeled.jsonl
     eval_type: cot_classify
-    modelgraded_spec_file: humor
+    modelgraded_spec: humor
     metaeval: true

 # (above, but with "answer then explain", rather than "reason then answer")
@@ -60,7 +60,7 @@ joke-fruits-expl-meta.dev.v0:
   args:
     samples_jsonl: test_metaeval/joke_fruits_labeled.jsonl
     eval_type: classify_cot
-    modelgraded_spec_file: humor
+    modelgraded_spec: humor
     metaeval: true

 # (above, but with "answer" only)
@@ -72,7 +72,7 @@ joke-fruits-ans-meta.dev.v0:
   args:
     samples_jsonl: test_metaeval/joke_fruits_labeled.jsonl
     eval_type: classify
-    modelgraded_spec_file: humor
+    modelgraded_spec: humor
     metaeval: true

 # a simple modelgraded eval checking if 4 completions to the sample prompt is diverse
@@ -85,7 +85,7 @@ diversity.dev.v0:
   args:
     samples_jsonl: test_metaeval/joke_fruits_labeled.jsonl
     eval_type: cot_classify
-    modelgraded_spec_file: diversity
+    modelgraded_spec: diversity
     multicomp_n: 4
     multicomp_temperature: 0.4

@@ -100,6 +100,6 @@ best.dev.v0:
   args:
     samples_jsonl: test_metaeval/joke_fruits_labeled.jsonl
     eval_type: cot_classify
-    modelgraded_spec_file: best
+    modelgraded_spec: best
     multicomp_n: from_models
     multicomp_temperature: 0.0
3 changes: 1 addition & 2 deletions scripts/battle_generator.py
@@ -3,7 +3,6 @@
 import json
 import os
 import string
-from typing import Any

 REGISTRY_PATH = os.path.join(os.path.dirname(__file__), "../evals/registry")

@@ -25,7 +24,7 @@ def format(template: str, **kwargs: dict[str, str]) -> str:
   args:
     samples_jsonl: test_multiio/battles/{prompt_name}_{subject1}_vs_{subject2}.jsonl
     eval_type: cot_classify
-    modelgraded_spec_file: battle
+    modelgraded_spec: battle
 """.strip()


6 changes: 3 additions & 3 deletions scripts/modelgraded_generator.py
@@ -24,13 +24,13 @@ def format(template: str, **kwargs: dict[str, str]) -> str:
   args:
     samples_jsonl: test_modelgraded/{prompt_name}_{subject}.jsonl
     eval_type: {eval_type}
-    modelgraded_spec_file: {modelgraded_spec_file}
+    modelgraded_spec: {modelgraded_spec}
 """.strip()


 unlabeled_prompts = dict(
     humor=dict(
-        modelgraded_spec_file="humor_jp",
+        modelgraded_spec="humor_jp",
         prompt="{subject}で数行の大喜利を書いて。敬語じゃなく関西弁で吉本のM1チャンピョンみたいに書いて。",
         eval_type="cot_classify_jp",
     ),
@@ -197,7 +197,7 @@ def format(template: str, **kwargs: dict[str, str]) -> str:
         YAML.format(
             prompt_name=prompt_name,
             subject=subject,
-            modelgraded_spec_file=unlabeled_prompts[prompt_name]["modelgraded_spec_file"],
+            modelgraded_spec=unlabeled_prompts[prompt_name]["modelgraded_spec"],
             eval_type=unlabeled_prompts[prompt_name]["eval_type"],
         )
         + "\n\n"
