[evals] Refactor evals package to expose completion_fn. #515

Merged: 23 commits, Apr 11, 2023

Changes from 3 commits

Commits
d87a056
[evals] Refactor evals package to expose `completion_fn`.
hwchung27 Mar 29, 2023
d9c1395
Add `record_raw_samples`
hwchung27 Apr 2, 2023
a1c6207
Andrew/evals refactor (#579)
andrew-openai Apr 5, 2023
deb29d3
update manifest and pyproject to support fetching data on pip install…
andrew-openai Apr 5, 2023
9b1c350
we need to still use the interop for string/list[dicts] for modelgrad…
andrew-openai Apr 5, 2023
c470d52
refactor simple evals to not use result.prompt (#593)
andrew-openai Apr 5, 2023
b691cfa
Clean up duplicate recordings
hwchung27 Apr 6, 2023
7266049
Replace ModelSpecs with CompletionFn (#594)
jwang47 Apr 6, 2023
b2a45cf
Add --registry_path CLI arg (#601)
jwang47 Apr 6, 2023
924d2d4
Andrew/langchain llms (#602)
andrew-openai Apr 7, 2023
4401cce
rm sample freeform, some docs (#603)
andrew-openai Apr 7, 2023
013d636
Update completion-fn-protocol.md
andrew-openai Apr 7, 2023
08062bc
some documentation cleanup
joe-at-openai Apr 10, 2023
3367006
some documentation cleanup
joe-at-openai Apr 10, 2023
5e71a76
some documentation cleanup
joe-at-openai Apr 10, 2023
e621b6f
inner monologue example (#610)
andrew-openai Apr 10, 2023
49d17ed
Update README.md
andrew-openai Apr 10, 2023
1bfba77
Update run-evals.md
andrew-openai Apr 10, 2023
b018aff
cleanup
andrew-openai Apr 10, 2023
5222f2c
Merge branch 'main' into evals_refactor_merge_main
andrew-openai Apr 10, 2023
9db703d
get oaieval to run
andrew-openai Apr 10, 2023
02bc2cb
address comments
andrew-openai Apr 11, 2023
50114a5
bump version
andrew-openai Apr 11, 2023
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -2,3 +2,4 @@ recursive-include evals *.py
recursive-include evals *.yaml
recursive-include evals *.sql
recursive-include evals/registry/data *.jsonl
recursive-include evals *.jsonl
Contributor comment: We probably don't need this; we can just keep the previous line: `recursive-include evals/registry/data *.jsonl`

2 changes: 1 addition & 1 deletion docs/run-evals.md
@@ -10,7 +10,7 @@ oaieval gpt-3.5-turbo test-match
```
The valid eval names are specified in the YAML files under `evals/registry/evals` and their corresponding implementations can be found in `evals/elsuite`.

In this example, `gpt-3.5-turbo` is an OpenAI model that we dynamically instantiate as a completion function using `OpenAIChatCompletionFn(model=gpt-3.5-turbo)`. Any implementation of the `CompletionFn` protocol can be run against `oaieval`. By default, we support calling `oaieval` with any model availableon the OpenAI API or with CompletionFunctions available in [`evals/registry/completion_fns`](../evals/registry/completion_fns/). We are always interested in adding more completion functions and we encourage you to implement you own to reflect specific use cases.
In this example, `gpt-3.5-turbo` is an OpenAI model that we dynamically instantiate as a completion function using `OpenAIChatCompletionFn(model=gpt-3.5-turbo)`. Any implementation of the `CompletionFn` protocol can be run against `oaieval`. By default, we support calling `oaieval` with any model available on the OpenAI API or with CompletionFunctions available in [`evals/registry/completion_fns`](../evals/registry/completion_fns/). We are always interested in adding more completion functions and we encourage you to implement your own to reflect specific use cases.

More details on `CompletionFn` found here: [`completion-fns.md`](completion-fns.md)
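
To make the protocol concrete, the sketch below shows roughly what a custom completion function could look like. The `EchoCompletionFn` and `EchoCompletionResult` names are hypothetical; only the `__call__` / `get_completions` shape follows the `CompletionFn` protocol the docs refer to, and a real implementation would call a model rather than echo the prompt back.

```python
from typing import Union


class EchoCompletionResult:
    """Wraps completion text so callers can retrieve it uniformly."""

    def __init__(self, text: str):
        self.text = text

    def get_completions(self) -> list[str]:
        # Evals consume completions as a list of strings.
        return [self.text]


class EchoCompletionFn:
    def __call__(
        self, prompt: Union[str, list[dict[str, str]]], **kwargs
    ) -> EchoCompletionResult:
        # Chat-style prompts arrive as a list of {"role", "content"} messages;
        # take the last message's content as the text to echo back.
        if isinstance(prompt, list):
            prompt = prompt[-1]["content"]
        return EchoCompletionResult(text=prompt)
```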

1 change: 0 additions & 1 deletion evals/base.py
@@ -17,7 +17,6 @@ class CompletionFnSpec:
"""
Specification for a CompletionFn.
"""

cls: str
args: Optional[Dict[str, Any]] = None
key: Optional[str] = None
24 changes: 21 additions & 3 deletions evals/cli/oaieval.py
@@ -27,11 +27,10 @@ def get_parser() -> argparse.ArgumentParser:
parser.add_argument(
"completion_fn",
type=str,
help="One or more CompletionFn URLs, separated by commas (,). The format of a CompletionFn URL can be two forms: 1) an OpenAI API model followed by query parameters (e.g. `gpt-3.5-turbo?api_key=..`) or 2) a path to a Python class followed by query parameters (e.g. `evals:OpenAICompletionFn?model=text-davinci-003`).",
help="One or more CompletionFn URLs, separated by commas (,). A CompletionFn can either be the name of a model available in the OpenAI API or a key in the registry (see evals/registry/completion_fns).",
)
parser.add_argument("eval", type=str, help="Name of an eval. See registry.")
parser.add_argument("--extra_eval_params", type=str, default="")
parser.add_argument("--modelspec_extra_options", type=str, default="")
parser.add_argument("--max_samples", type=int, default=None)
parser.add_argument("--cache", action=argparse.BooleanOptionalAction, default=True)
parser.add_argument("--visible", action=argparse.BooleanOptionalAction, default=None)
@@ -110,6 +109,25 @@ def run(args, registry: Optional[Registry] = None):
run_url = f"{run_spec.run_id}"
logger.info(_purple(f"Run started: {run_url}"))

def parse_extra_eval_params(param_str: Optional[str]) -> Mapping[str, Any]:
"""Parse a string of the form "key1=value1,key2=value2" into a dict."""
if not param_str:
return {}

def to_number(x):
try:
return int(x)
except:
pass
try:
return float(x)
except:
pass
return x

str_dict = dict(kv.split("=") for kv in param_str.split(","))
return {k: to_number(v) for k, v in str_dict.items()}
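# Illustrative only (not part of the diff): parse_extra_eval_params("max_tokens=256,temperature=0.5")
# returns {"max_tokens": 256, "temperature": 0.5}; int and float values are coerced, other values stay strings.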

extra_eval_params = parse_extra_eval_params(args.extra_eval_params)

eval_class = registry.get_class(eval_spec)
@@ -143,7 +161,7 @@ def main():
logging.getLogger("openai").setLevel(logging.WARN)
if hasattr(openai.error, "set_display_cause"):
openai.error.set_display_cause()
run(args, model_resolver=ModelResolver())
run(args)


if __name__ == "__main__":
4 changes: 2 additions & 2 deletions evals/completion_fns/openai.py
@@ -61,7 +61,7 @@ def __init__(

def __call__(
self,
prompt: Union[OpenAICreatePrompt, Prompt],
prompt: Union[str, OpenAICreateChatPrompt],
**kwargs,
) -> OpenAICompletionResult:
if not isinstance(prompt, Prompt):
@@ -108,7 +108,7 @@ def __init__(

def __call__(
self,
prompt: Union[OpenAICreateChatPrompt, Prompt],
prompt: Union[str, OpenAICreateChatPrompt],
**kwargs,
) -> OpenAIChatCompletionResult:
if not isinstance(prompt, Prompt):
79 changes: 10 additions & 69 deletions evals/elsuite/modelgraded/classify.py
@@ -11,74 +11,15 @@
import evals
import evals.record
from evals import CompletionFn, DummyCompletionFn, OpenAIChatCompletionFn
from evals.elsuite.utils import PromptFn, format_necessary, scrub_formatting_from_prompt

INVALID_STR = "__invalid__"
CHOICE_KEY = "choice"
MATCH_FNS = {
"include": lambda x, y: float(x in y),
"exact": lambda x, y: float(x == y),
"endswith": lambda x, y: x.endswith(y),
"starts_or_endswith": lambda x, y: x.startswith(y) or x.endswith(y),
}

ANSWER_PROMPTS = {
# e.g. "Yes"
"classify": "Answer the question by printing only a single choice from {choices} (without quotes or punctuation) corresponding to the correct answer with no other text.".strip(),
# e.g. "Yes\n The reasons are: ..."
"classify_cot": "First, answer by printing a single choice from {choices} (without quotes or punctuation) corresponding to the correct answer. Then, from the next line, explain your reasonings step by step.".strip(),
# e.g. "Let's think step by step. ...\nYes"
"cot_classify": """
First, write out in a step by step manner your reasoning to be sure that your conclusion is correct. Avoid simply stating the correct answer at the outset. Then print only a single choice from {choices} (without quotes or punctuation) on its own line corresponding to the correct answer. At the end, repeat just the answer by itself on a new line.

Reasoning:""".strip(),
"cot_classify_jp": """
まず、一歩一歩あなたの推論を書き出してください。単に正しい答えを最初に述べることを避けてください。次に、{choices}(引用符や句読点なし)から正しい答えに対応する1つの選択肢を単独の行に書きだしてください。最後に、答えだけを新しい行に繰り返してください。

推論:
""".strip(),
}


def choice_to_str(choice_strings: Iterable[str]) -> str:
"""Return a string of choices, e.g. '"Yes" or "No" or "Maybe"'."""
return " or ".join(f'"{choice}"' for choice in choice_strings)


def get_choice(text: str, eval_type: str, match_fn: Callable, choice_strings: Iterable[str]) -> str:
"""Clean the answer string to a choice string to one of choice_strings. Return '__invalid__.' if no match."""
lines = text.strip().split("\n")
if eval_type.startswith("cot_classify"):
lines = lines[::-1] # reverse lines
for line in lines:
line = line.strip()
line = "".join(c for c in line if c not in string.punctuation)
if not line:
continue
for choice in choice_strings:
if match_fn(line, choice):
return choice
return INVALID_STR


def expand_args_dict(args_dict):
"""Expand a dict of dicts, with namings.

args_dict = {
"a": {"a1": 1, "a2": 2},
"b": {"b1": 3, "b2": 4},
}
expand_args_dict(args_dict) = {
"a=a1:b=b1": {"a": ("a1", 1), "b": ("b1", 3)},
"a=a1:b=b2": {"a": ("a1", 1), "b": ("b2", 4)},
...}
"""
args_dict = {k: list(v.items()) for k, v in args_dict.items()}
keys = list(args_dict.keys())
values = list(args_dict.values())
new_values = [dict(zip(keys, v)) for v in itertools.product(*values)]
new_names = [":".join([f"{k}={v[0]}" for k, v in sorted(d.items())]) for d in new_values]
return dict(zip(new_names, new_values))
from evals.elsuite.modelgraded.base import ModelGradedSpec
from evals.elsuite.modelgraded.classify_utils import (
CHOICE_KEY,
INVALID_STR,
MATCH_FNS,
concat_n_completions,
get_choice,
)
from evals.elsuite.utils import PromptFn, scrub_formatting_from_prompt


class ModelBasedClassify(evals.Eval):
@@ -208,7 +149,7 @@ def eval_sample(self, test_sample: dict, rng: Random) -> None:
args = {k: v[1] for k, v in args.items()}
prompt = self.mg.format(**args, **completions, **test_sample)
evaluate = PromptFn(
self.prompt,
prompt,
completion_fn=self.eval_completion_fn,
max_tokens=self.max_tokens,
)
2 changes: 1 addition & 1 deletion evals/prompt/base.py
@@ -12,7 +12,7 @@
ENCODER_LOCK = threading.Lock()

# This is an approximation to the type accepted as the `prompt` field to `openai.Completion.create` calls
OpenAICreatePrompt = Union[str, list[str]]
OpenAICreatePrompt = Union[str, list[str], list[int], list[list[int]]]
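# e.g. a single string, a list of strings, a list of token ids, or a list of token-id lists (illustrative)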

# This is the type accepted as the `prompt` field to `openai.ChatCompletion.create` calls
OpenAIChatMessage = Dict[str, str] # A message is a dictionary with "role" and "content" keys
12 changes: 5 additions & 7 deletions evals/registry.py
@@ -20,6 +20,7 @@
from evals import OpenAIChatCompletionFn, OpenAICompletionFn
from evals.api import CompletionFn, DummyCompletionFn
from evals.base import BaseEvalSpec, CompletionFnSpec, EvalSetSpec, EvalSpec
from evals.elsuite.modelgraded.base import ModelGradedSpec
from evals.utils.misc import make_object

logger = logging.getLogger(__name__)
@@ -147,10 +148,11 @@ def get_alias():
except TypeError as e:
raise TypeError(f"Error while processing {object} '{name}': {e}")

def get_model(self, name: str) -> ModelSpec:
return self._dereference(name, self._models, "model", ModelSpec)

def get_modelgraded_spec(self, name: str, **kwargs: dict) -> dict[str, Any]:
assert name in self._modelgraded_specs, (
f"Modelgraded spec {name} not found. "
f"Closest matches: {difflib.get_close_matches(name, self._modelgraded_specs.keys(), n=5)}"
)
return self._dereference(
name, self._modelgraded_specs, "modelgraded spec", ModelGradedSpec, **kwargs
)
@@ -266,9 +268,5 @@ def _evals(self):
def _modelgraded_specs(self):
return self._load_registry([p / "modelgraded" for p in self._registry_paths])

@functools.cached_property
def _models(self):
return self._load_registry([p / "models" for p in self._registry_paths])


registry = Registry()
6 changes: 0 additions & 6 deletions evals/registry/completion_fns/tmp.yaml

This file was deleted.