autogen subpackage #968

Merged · 31 commits · Apr 8, 2023
Changes from 1 commit
rename
sonichi committed Apr 7, 2023
commit ff8126babd295c9803ef72282acfa0513469d9d8
README.md (2 changes: 1 addition & 1 deletion)
@@ -104,7 +104,7 @@ config, analysis = oai.Completion.tune(
data=tune_data,
metric="success",
mode="max",
eval_func=success_metrics,
eval_func=eval_func,
inference_budget=0.05,
optimization_budget=3,
num_samples=-1,
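For context on this README change: oai.Completion.tune passes the list of responses plus each data instance's fields to eval_func as keyword arguments and expects back a dict containing the metric being optimized ("success" with mode="max" here). Below is a minimal, hypothetical sketch of such a function; the "expected" field is an illustrative assumption, not part of the README example.

from typing import Dict, List

def eval_func(responses: List[str], expected: str = "", **data) -> Dict:
    """Toy evaluation function: succeed if any response contains an expected substring.

    "expected" stands in for whatever fields a real data instance carries.
    """
    success = bool(expected) and any(expected in r for r in responses)
    return {"success": float(success)}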
flaml/autogen/code_utils.py (10 changes: 6 additions & 4 deletions)
@@ -66,14 +66,14 @@ def _remove_check(response):
return response[:pos]


def success_metrics(
def eval_function_completions(
responses: List[str],
definition: str,
test: Optional[str] = None,
entry_point: Optional[str] = None,
assertions: Optional[Union[str, Callable[[str], Tuple[str, float]]]] = None,
) -> Dict:
"""Check if the task is successful.
"""Select a response from a list of responses for the function completion task (using generated assertions), and/or evaluate if the task is successful using a gold test.

Args:
responses (list): The list of responses.
@@ -153,7 +153,7 @@ def implement(
Union[str, Callable[[str], Tuple[str, float]]]
] = generate_assertions,
) -> Tuple[str, float]:
"""Implement a function.
"""Implement a function from a definition.

Args:
definition (str): The function definition, including the signature and docstr.
@@ -172,7 +172,9 @@
response = oai.Completion.create({"definition": definition}, **config)
cost += oai.Completion.cost(config["model"], response)
responses = oai.Completion.extract_text(response)
metrics = success_metrics(responses, definition, assertions=assertions)
metrics = eval_function_completions(
responses, definition, assertions=assertions
)
assertions = metrics["assertions"]
cost += metrics["gen_cost"]
if metrics["succeed_assertions"] or i == len(configs) - 1:
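To illustrate the renamed API in this file, here is a hedged usage sketch of eval_function_completions with a gold test. The toy definition, the candidate completions, and the HumanEval-style check(candidate) convention are assumptions for illustration, not something this diff specifies.

from flaml.autogen.code_utils import eval_function_completions

# A toy task: a definition (signature + docstring), two candidate completions,
# and a gold test defining check(candidate).
definition = 'def add(a, b):\n    """Return the sum of a and b."""\n'
responses = ["    return a + b", "    return a - b"]
test = "def check(candidate):\n    assert candidate(1, 2) == 3\n"

metrics = eval_function_completions(responses, definition, test=test, entry_point="add")
print(metrics)  # a dict of success metrics for the evaluated completions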
flaml/autogen/math_utils.py (13 changes: 7 additions & 6 deletions)
@@ -280,8 +280,8 @@ def voting_counts(responses):
return answers


def success_metrics(responses, solution, **args):
"""Check if each response is correct.
def eval_math_responses(responses, solution=None, **args):
"""Select a response for a math problem using voting, and check if the response is correct if the solution is provided.

Args:
responses (list): The list of responses.
@@ -292,10 +292,11 @@ def success_metrics(responses, solution, **args):
"""
success_list = []
n = len(responses)
for i in range(n):
response = responses[i]
succeed = is_equiv_chain_of_thought(response, solution)
success_list.append(succeed)
if solution is not None:
for i in range(n):
response = responses[i]
succeed = is_equiv_chain_of_thought(response, solution)
success_list.append(succeed)
# voting
answers = voting_counts(responses)
# find the answer with highest votes in answers
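A hedged usage sketch of the renamed eval_math_responses follows. The \boxed{} answer convention and the toy responses are assumptions based on the MATH-style utilities in this module, not part of the diff.

from flaml.autogen.math_utils import eval_math_responses

responses = [
    "We compute 2 + 2 = 4, so the answer is \\boxed{4}.",
    "The answer is \\boxed{5}.",
    "Therefore the answer is \\boxed{4}.",
]
solution = "Adding gives \\boxed{4}."

# A response is selected by voting over the extracted answers; because a
# solution is provided, per-response correctness is also checked.
metrics = eval_math_responses(responses, solution=solution)
print(metrics)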
notebook/autogen_chatgpt.ipynb (14 changes: 7 additions & 7 deletions)
@@ -277,7 +277,7 @@
},
"outputs": [],
"source": [
"from flaml.autogen.math_utils import success_metrics"
"from flaml.autogen.math_utils import eval_math_responses"
]
},
{
@@ -435,7 +435,7 @@
" data=tune_data, # the data for tuning\n",
" metric=\"success_vote\", # the metric to optimize\n",
" mode=\"max\", # the optimization mode\n",
" eval_func=success_metrics, # the evaluation function to return the success metrics\n",
" eval_func=eval_math_responses, # the evaluation function to return the success metrics\n",
" # log_file_name=\"logs/math.log\", # the log file name\n",
" inference_budget=0.02, # the inference budget (dollar)\n",
" optimization_budget=1, # the optimization budget (dollar)\n",
@@ -970,7 +970,7 @@
],
"source": [
"response = oai.ChatCompletion.create(context=tune_data[1], **config)\n",
"metric_results = success_metrics(oai.ChatCompletion.extract_text(response), **tune_data[1])\n",
"metric_results = eval_math_responses(oai.ChatCompletion.extract_text(response), **tune_data[1])\n",
"print(\"response on an example data instance:\", response)\n",
"print(\"metric_results on the example data instance:\", metric_results)\n"
]
@@ -1006,7 +1006,7 @@
}
],
"source": [
"# result = oai.Completion.test(test_data, config, success_metrics)\n",
"# result = oai.Completion.test(test_data, config, eval_math_responses)\n",
"# print(\"performance on test data with the tuned config:\", result)"
]
},
@@ -1036,7 +1036,7 @@
"# the following code will cost roughly $2 if uncommented and run.\n",
"\n",
"# default_config = {\"model\": 'gpt-4', \"prompt\": prompts[0]}\n",
"# default_result = oai.Completion.test(test_data, default_config, success_metrics)\n",
"# default_result = oai.Completion.test(test_data, default_config, eval_math_responses)\n",
"# print(\"performance on test data from gpt-4 with a default config:\", default_result)"
]
},
@@ -1084,7 +1084,7 @@
"# The following evaluation costs $3 and longer than one hour if you uncomment it and run it.\n",
"\n",
"# config_n2 = {\"model\": 'gpt-4', \"prompt\": prompts[0], \"n\": 2}\n",
"# result_n2 = oai.ChatCompletion.test(test_data, config_n2, success_metrics)\n",
"# result_n2 = oai.ChatCompletion.test(test_data, config_n2, eval_math_responses)\n",
"# print(\"performance on test data from gpt-4 with a default config and n=2:\", result_n2)\n"
]
},
@@ -1113,7 +1113,7 @@
"# The following evaluation costs $8 and longer than one hour if you uncomment it and run it.\n",
"\n",
"# config_n5 = {\"model\": 'gpt-4', \"prompt\": prompts[0], \"n\": 5}\n",
"# result_n5 = oai.ChatCompletion.test(test_data, config_n5, success_metrics)\n",
"# result_n5 = oai.ChatCompletion.test(test_data, config_n5, eval_math_responses)\n",
"# print(\"performance on test data from gpt-4 with a default config and n=5:\", result_n5)"
]
},
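Across these notebook cells, the same evaluation function serves both tuning (eval_func=eval_math_responses with metric="success_vote") and testing via oai.Completion.test / oai.ChatCompletion.test, so any drop-in replacement must accept the responses plus a data instance's fields and return a dict containing the optimized metric. A toy stand-in showing only that interface (its voting over whole response strings and its extra key are illustrative; eval_math_responses votes over extracted answers):

from collections import Counter
from typing import Dict, List

def toy_eval(responses: List[str], solution: str = "", **args) -> Dict:
    """Majority vote over raw response strings, then compare with the solution."""
    voted_answer, _ = Counter(responses).most_common(1)[0]
    return {
        "success_vote": float(voted_answer.strip() == solution.strip()),
        "voted_answer": voted_answer,
    }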