Skip to content

Commit

Permalink
Change CI to use a dummy model call
Browse files · Browse the repository at this point in the history
  • Loading branch information
andrew-openai committed Mar 18, 2023
1 parent 1838a5f commit feed6c7
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 8 deletions.
7 changes: 2 additions & 5 deletions .github/workflows/test_eval.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,18 +40,15 @@ jobs:
echo "new_files=$(cat new_files)" >> $GITHUB_ENV
- name: Run oaieval command for each new YAML file
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
run: |
files="${{ env.new_files }}"
if [ -n "$files" ]; then
for file in $files; do
echo "Processing $file"
first_key=$(python .github/workflows/parse_yaml.py $file)
echo "Eval Name: $first_key"
# Replace the following line with the actual oaieval command if needed
echo "Running: oaieval gpt-4 $first_key"
oaieval gpt-4 $first_key --max_samples 10
oaieval dummy-chat $first_key --max_samples 10
oaieval dummy-completion $first_key --max_samples 10
done
else
echo "No new YAML files found in evals/registry/evals"
Expand Down
11 changes: 11 additions & 0 deletions evals/cli/oaieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def n_ctx_from_model_name(model_name: str) -> Optional[int]:
"""Returns n_ctx for a given API model name. Model list last updated 2023-03-14."""
# note that for most models, the max tokens is n_ctx + 1
DICT_OF_N_CTX_BY_MODEL_NAME_PREFIX: dict[str, int] = {
"dummy-": 2048,
"gpt-3.5-turbo-": 4096,
"gpt-4-": 8192,
"gpt-4-32k-": 32768,
Expand Down Expand Up @@ -92,9 +93,19 @@ class ModelResolver:
"gpt-4-0314",
"gpt-4-32k",
"gpt-4-32k-0314",
"dummy-chat",
}

DUMMY_MODELS = {
"dummy-chat",
"dummy-completion",
}

def resolve(self, name: str) -> ModelSpec:
if name in self.DUMMY_MODELS:
result = ModelSpec(name=name, model=name, is_chat=(name in self.CHAT_MODELS))
return result

if name in self.api_model_ids:
result = ModelSpec(
name=name,
Expand Down
11 changes: 8 additions & 3 deletions evals/elsuite/modelgraded/classify.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,6 @@
""".strip(),
}

EVAL_MODELSPEC = ModelSpec(name="gpt-3.5-turbo", model="gpt-3.5-turbo", is_chat=True)


def choice_to_str(choice_strings: Iterable[str]) -> str:
"""Return a string of choices, e.g. '"Yes" or "No" or "Maybe"'."""
Expand Down Expand Up @@ -119,6 +117,13 @@ def __init__(
self.multicomp_temperature = multicomp_temperature
self.samples_renamings = samples_renamings or {}

if self.model_spec.name == "dummy-completion" or self.model_spec.name == "dummy-chat":
self.eval_modelspec = self.model_spec
else:
self.eval_modelspec = ModelSpec(
name="gpt-3.5-turbo", model="gpt-3.5-turbo", is_chat=True
)

"""import prompt and set attributes"""
modelgraded_specs = load_modelgraded_specs(modelgraded_spec_file)

Expand Down Expand Up @@ -254,7 +259,7 @@ def eval_sample(self, test_sample: dict, rng: Random) -> None:
metrics = {}
evaluate = PromptFn(
self.prompt,
model_spec=EVAL_MODELSPEC,
model_spec=self.eval_modelspec,
max_tokens=self.max_tokens,
)
eval_kwargs = dict(**completions, **test_sample)
Expand Down
41 changes: 41 additions & 0 deletions evals/utils/api_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,41 @@
import openai


def generate_dummy_chat_completion():
    """Return a canned payload shaped like an ``openai.ChatCompletion.create`` result.

    Used in place of a real API call for the ``dummy-chat`` model so evals
    can run end-to-end (e.g. in CI) without network access or credentials.
    """
    dummy_choice = {
        "message": {"role": "assistant", "content": "This is a dummy response."},
        "finish_reason": "stop",
        "index": 0,
    }
    return {
        "id": "dummy-id",
        "object": "chat.completion",
        "created": 12345,
        "model": "dummy-chat",
        "usage": {"prompt_tokens": 56, "completion_tokens": 6, "total_tokens": 62},
        "choices": [dummy_choice],
    }


def generate_dummy_completion():
    """Return a canned payload shaped like an ``openai.Completion.create`` result.

    Counterpart of ``generate_dummy_chat_completion`` for the
    ``dummy-completion`` (non-chat) model; lets evals run without hitting
    the OpenAI API.
    """
    dummy_choice = {
        "text": "This is a dummy response.",
        "index": 0,
        "logprobs": None,
        "finish_reason": "stop",
    }
    return {
        "id": "dummy-id",
        "object": "text_completion",
        "created": 12345,
        "model": "dummy-completion",
        "usage": {"prompt_tokens": 5, "completion_tokens": 6, "total_tokens": 11},
        "choices": [dummy_choice],
    }


@backoff.on_exception(
wait_gen=backoff.expo,
exception=(
Expand All @@ -24,6 +59,9 @@ def openai_completion_create_retrying(*args, **kwargs):
Helper function for creating a completion.
`args` and `kwargs` match what is accepted by `openai.Completion.create`.
"""
if kwargs["model"] == "dummy-completion":
return generate_dummy_completion()

result = openai.Completion.create(*args, **kwargs)
if "error" in result:
logging.warning(result)
Expand All @@ -48,6 +86,9 @@ def openai_chat_completion_create_retrying(*args, **kwargs):
Helper function for creating a chat completion.
`args` and `kwargs` match what is accepted by `openai.ChatCompletion.create`.
"""
if kwargs["model"] == "dummy-chat":
return generate_dummy_chat_completion()

result = openai.ChatCompletion.create(*args, **kwargs)
if "error" in result:
logging.warning(result)
Expand Down

0 comments on commit feed6c7

Please sign in to comment.