autogen subpackage #968

Merged · 31 commits · Apr 8, 2023
Changes from 1 commit
rename
sonichi committed Apr 7, 2023
commit ff8126babd295c9803ef72282acfa0513469d9d8
README.md (2 changes: 1 addition & 1 deletion)
@@ -104,7 +104,7 @@ config, analysis = oai.Completion.tune(
data=tune_data,
metric="success",
mode="max",
eval_func=success_metrics,
eval_func=eval_func,
inference_budget=0.05,
optimization_budget=3,
num_samples=-1,
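For context on this README change: oai.Completion.tune passes the list of responses plus each data instance's fields to eval_func as keyword arguments and expects back a dict containing the metric being optimized ("success" with mode="max" here). Below is a minimal, hypothetical sketch of such a function; the "expected" field is an illustrative assumption, not part of the README example.

from typing import Dict, List

def eval_func(responses: List[str], expected: str = "", **data) -> Dict:
    """Toy evaluation function: succeed if any response contains an expected substring.

    "expected" stands in for whatever fields a real data instance carries.
    """
    success = bool(expected) and any(expected in r for r in responses)
    return {"success": float(success)}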
flaml/autogen/code_utils.py (10 changes: 6 additions & 4 deletions)
@@ -66,14 +66,14 @@ def _remove_check(response):
return response[:pos]


def success_metrics(
def eval_function_completions(
responses: List[str],
definition: str,
test: Optional[str] = None,
entry_point: Optional[str] = None,
assertions: Optional[Union[str, Callable[[str], Tuple[str, float]]]] = None,
) -> Dict:
"""Check if the task is successful.
"""Select a response from a list of responses for the function completion task (using generated assertions), and/or evaluate if the task is successful using a gold test.

Args:
responses (list): The list of responses.
@@ -153,7 +153,7 @@ def implement(
Union[str, Callable[[str], Tuple[str, float]]]
] = generate_assertions,
) -> Tuple[str, float]:
"""Implement a function.
"""Implement a function from a definition.

Args:
definition (str): The function definition, including the signature and docstr.
@@ -172,7 +172,9 @@
response = oai.Completion.create({"definition": definition}, **config)
cost += oai.Completion.cost(config["model"], response)
responses = oai.Completion.extract_text(response)
metrics = success_metrics(responses, definition, assertions=assertions)
metrics = eval_function_completions(
responses, definition, assertions=assertions
)
assertions = metrics["assertions"]
cost += metrics["gen_cost"]
if metrics["succeed_assertions"] or i == len(configs) - 1:
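To illustrate the renamed API in this file, here is a hedged usage sketch of eval_function_completions with a gold test. The toy definition, the candidate completions, and the HumanEval-style check(candidate) convention are assumptions for illustration, not something this diff specifies.

from flaml.autogen.code_utils import eval_function_completions

# A toy task: a definition (signature + docstring), two candidate completions,
# and a gold test defining check(candidate).
definition = 'def add(a, b):\n    """Return the sum of a and b."""\n'
responses = ["    return a + b", "    return a - b"]
test = "def check(candidate):\n    assert candidate(1, 2) == 3\n"

metrics = eval_function_completions(responses, definition, test=test, entry_point="add")
print(metrics)  # a dict of success metrics for the evaluated completions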
flaml/autogen/math_utils.py (13 changes: 7 additions & 6 deletions)
@@ -280,8 +280,8 @@ def voting_counts(responses):
return answers


def success_metrics(responses, solution, **args):
"""Check if each response is correct.
def eval_math_responses(responses, solution=None, **args):
"""Select a response for a math problem using voting, and check if the response is correct if the solution is provided.

Args:
responses (list): The list of responses.
@@ -292,10 +292,11 @@ def success_metrics(responses, solution, **args):
"""
success_list = []
n = len(responses)
for i in range(n):
response = responses[i]
succeed = is_equiv_chain_of_thought(response, solution)
success_list.append(succeed)
if solution is not None:
for i in range(n):
response = responses[i]
succeed = is_equiv_chain_of_thought(response, solution)
success_list.append(succeed)
# voting
answers = voting_counts(responses)
# find the answer with highest votes in answers
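A hedged usage sketch of the renamed eval_math_responses follows. The \boxed{} answer convention and the toy responses are assumptions based on the MATH-style utilities in this module, not part of the diff.

from flaml.autogen.math_utils import eval_math_responses

responses = [
    "We compute 2 + 2 = 4, so the answer is \\boxed{4}.",
    "The answer is \\boxed{5}.",
    "Therefore the answer is \\boxed{4}.",
]
solution = "Adding gives \\boxed{4}."

# A response is selected by voting over the extracted answers; because a
# solution is provided, per-response correctness is also checked.
metrics = eval_math_responses(responses, solution=solution)
print(metrics)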
notebook/autogen_chatgpt.ipynb (14 changes: 7 additions & 7 deletions)
@@ -277,7 +277,7 @@
},
"outputs": [],
"source": [
"from flaml.autogen.math_utils import success_metrics"
"from flaml.autogen.math_utils import eval_math_responses"
]
},
{
@@ -435,7 +435,7 @@
" data=tune_data, # the data for tuning\n",
" metric=\"success_vote\", # the metric to optimize\n",
" mode=\"max\", # the optimization mode\n",
" eval_func=success_metrics, # the evaluation function to return the success metrics\n",
" eval_func=eval_math_responses, # the evaluation function to return the success metrics\n",
" # log_file_name=\"logs/math.log\", # the log file name\n",
" inference_budget=0.02, # the inference budget (dollar)\n",
" optimization_budget=1, # the optimization budget (dollar)\n",
@@ -970,7 +970,7 @@
],
"source": [
"response = oai.ChatCompletion.create(context=tune_data[1], **config)\n",
"metric_results = success_metrics(oai.ChatCompletion.extract_text(response), **tune_data[1])\n",
"metric_results = eval_math_responses(oai.ChatCompletion.extract_text(response), **tune_data[1])\n",
"print(\"response on an example data instance:\", response)\n",
"print(\"metric_results on the example data instance:\", metric_results)\n"
]
@@ -1006,7 +1006,7 @@
}
],
"source": [
"# result = oai.Completion.test(test_data, config, success_metrics)\n",
"# result = oai.Completion.test(test_data, config, eval_math_responses)\n",
"# print(\"performance on test data with the tuned config:\", result)"
]
},
@@ -1036,7 +1036,7 @@
"# the following code will cost roughly $2 if uncommented and run.\n",
"\n",
"# default_config = {\"model\": 'gpt-4', \"prompt\": prompts[0]}\n",
"# default_result = oai.Completion.test(test_data, default_config, success_metrics)\n",
"# default_result = oai.Completion.test(test_data, default_config, eval_math_responses)\n",
"# print(\"performance on test data from gpt-4 with a default config:\", default_result)"
]
},
@@ -1084,7 +1084,7 @@
"# The following evaluation costs $3 and longer than one hour if you uncomment it and run it.\n",
"\n",
"# config_n2 = {\"model\": 'gpt-4', \"prompt\": prompts[0], \"n\": 2}\n",
"# result_n2 = oai.ChatCompletion.test(test_data, config_n2, success_metrics)\n",
"# result_n2 = oai.ChatCompletion.test(test_data, config_n2, eval_math_responses)\n",
"# print(\"performance on test data from gpt-4 with a default config and n=2:\", result_n2)\n"
]
},
@@ -1113,7 +1113,7 @@
"# The following evaluation costs $8 and longer than one hour if you uncomment it and run it.\n",
"\n",
"# config_n5 = {\"model\": 'gpt-4', \"prompt\": prompts[0], \"n\": 5}\n",
"# result_n5 = oai.ChatCompletion.test(test_data, config_n5, success_metrics)\n",
"# result_n5 = oai.ChatCompletion.test(test_data, config_n5, eval_math_responses)\n",
"# print(\"performance on test data from gpt-4 with a default config and n=5:\", result_n5)"
]
},
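Across these notebook cells, the same evaluation function serves both tuning (eval_func=eval_math_responses with metric="success_vote") and testing via oai.Completion.test / oai.ChatCompletion.test, so any drop-in replacement must accept the responses plus a data instance's fields and return a dict containing the optimized metric. A toy stand-in showing only that interface (its voting over whole response strings and its extra key are illustrative; eval_math_responses votes over extracted answers):

from collections import Counter
from typing import Dict, List

def toy_eval(responses: List[str], solution: str = "", **args) -> Dict:
    """Majority vote over raw response strings, then compare with the solution."""
    voted_answer, _ = Counter(responses).most_common(1)[0]
    return {
        "success_vote": float(voted_answer.strip() == solution.strip()),
        "voted_answer": voted_answer,
    }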