In [25]:
%load_ext autoreload
%autoreload 2
%pip install datasets

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Note: you may need to restart the kernel to use updated packages.


Let's get the dataset and see what it looks like.

In [26]:
import datasets
ds = datasets.load_dataset("openai_humaneval")
ds['test'][0]


{'task_id': 'HumanEval/0',
 'prompt': 'from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    """ Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    """\n',
 'canonical_solution': '    for idx, elem in enumerate(numbers):\n        for idx2, elem2 in enumerate(numbers):\n            if idx != idx2:\n                distance = abs(elem - elem2)\n                if distance < threshold:\n                    return True\n\n    return False\n',
 'test': "\n\nMETADATA = {\n    'author': 'jt',\n    'dataset': 'test'\n}\n\n\ndef check(candidate):\n    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n    assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n    assert 

Before we try to solve the problem, let's just load a language model and make sure everything works.

In [27]:
import dspy
import dotenv
import os
# load OpenAI API key from .env file in the user's home directory
env_path = os.path.expanduser("~/.env")
dotenv.load_dotenv(env_path)

# Register the language model as the default for all dspy modules.
lm = dspy.OpenAI(model="gpt-3.5-turbo", max_tokens=4000)
dspy.settings.configure(lm=lm)

# Smoke test: a minimal untyped question -> answer predictor.
predictor = dspy.Predict("question -> answer")
smoke_answer = predictor(question="What is the capital of France?")
print(smoke_answer)

Prediction(
    answer='Paris'
)


Next let's write a program that actually outputs code.

In [30]:
from dspy import InputField, OutputField, Signature
from dspy.functional import TypedPredictor
import pydantic

# We define a pydantic type that automatically checks whether its argument is valid Python code.
# NOTE: no class docstring is added on purpose — pydantic would emit it as a
# "description" in the JSON schema, and that schema is pasted into the LM prompt.
class PythonCode(pydantic.BaseModel):
    # Source text of a Python snippet; syntax-checked by `check_syntax` below.
    code: str

    @pydantic.field_validator('code')
    @classmethod
    def check_syntax(cls, v):
        """Reject `code` values that do not compile as Python.

        Args:
            v: Candidate source string.

        Returns:
            `v` unchanged when it compiles.

        Raises:
            ValueError: If `compile` raises a SyntaxError for `v`.
        """
        try:
            # Compile only (nothing is executed) to find out whether the snippet parses.
            compile(v, "<string>", "exec")
        except SyntaxError as e:
            # Chain the original exception so the line/offset details stay in the traceback.
            raise ValueError(f"Code is not syntactically valid: {e}") from e

        return v

# The signature is the main DSPy object. Note that we have types for the input and output fields,
# which was not possible before.
# NOTE(review): this class has no docstring; dspy appears to derive the instruction line of the
# prompt from the signature, so adding one would presumably change the prompt — confirm first.
class CodeSignature(Signature):
    prompt: PythonCode = InputField()  # input: the task's starter code
    test: PythonCode = InputField()  # input: the task's test code
    entry_point: str = InputField()  # input: name of the function the tests call
    solution: PythonCode = OutputField()  # output: generated code, syntax-checked by PythonCode

# Build the typed predictor and try it once on the first HumanEval task.
predictor = TypedPredictor(CodeSignature)

first_task = ds['test'][0]  # fetch the row once instead of re-indexing per field
prediction = predictor(
    prompt=PythonCode(code=first_task['prompt']),
    test=PythonCode(code=first_task['test']),
    entry_point=first_task['entry_point'],
)


Now parsing: '{"code": "from typing import List\\n\\n\\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\\n    \\"\\"\\" Check if in given list of numbers, are any two numbers closer to each other than\\n    given threshold.\\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\\n    False\\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\\n    True\\n    \\"\\"\\"\\n    for i in range(len(numbers)):\\n        for j in range(i+1, len(numbers)):\\n            if abs(numbers[i] - numbers[j]) < threshold:\\n                return True\\n    return False\\n"}'
Parsed: PythonCode(code='from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    """ Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    """\n    for i in rang

Let's see what's happening under the hood

In [None]:
lm.inspect_history(n=3)






Given the fields `prompt`, `test`, `entry_point`, produce the fields `solution`.

---

Follow the following format.

Prompt: ${prompt}

Test: ${test}

Entry Point: ${entry_point}

Past Error (solution): An error to avoid in the future

Past Error (solution, 2): An error to avoid in the future

Solution:
${solution}. Respond with a single JSON object. 
You MUST use this format: {"code": "print('Hello, World!')"}
JSON Schema: {"properties": {"code": {"title": "Code", "type": "string"}}, "required": ["code"], "title": "PythonCode", "type": "object"}

---

Prompt: code='from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    """ Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    """\n'

Test: {"code":"\n\nMETADATA = {\n    'author': 'jt',\n    'da

In [None]:
d = {"code": "def has_close_elements(numbers: List[float], threshold: float) -> bool:\n    for i in range(len(numbers)):\n        for j in range(i+1, len(numbers)):\n            if abs(numbers[i] - numbers[j]) < threshold:\n                return True\n    return False"}
print(d["code"])

def has_close_elements(numbers: List[float], threshold: float) -> bool:
    for i in range(len(numbers)):
        for j in range(i+1, len(numbers)):
            if abs(numbers[i] - numbers[j]) < threshold:
                return True
    return False


In [None]:
import json
# This cell is expected to raise: the literal below is a normal (non-raw) Python string, so each
# `\n` becomes a real newline character inside the JSON string value, which the strict `json`
# parser rejects as an invalid control character.
json.loads('{"code": "def has_close_elements(numbers: List[float], threshold: float) -> bool:\n    for i in range(len(numbers)):\n        for j in range(i+1, len(numbers)):\n            if abs(numbers[i] - numbers[j]) < threshold:\n                return True\n    return False"}')

JSONDecodeError: Invalid control character at: line 1 column 82 (char 81)

In [None]:
import ujson
ujson.loads('{"code": "def has_close_elements(numbers: List[float], threshold: float) -> bool:\n    for i in range(len(numbers)):\n        for j in range(i+1, len(numbers)):\n            if abs(numbers[i] - numbers[j]) < threshold:\n                return True\n    return False"} ')

{'code': 'def has_close_elements(numbers: List[float], threshold: float) -> bool:\n    for i in range(len(numbers)):\n        for j in range(i+1, len(numbers)):\n            if abs(numbers[i] - numbers[j]) < threshold:\n                return True\n    return False'}

In [None]:
json.loads(ujson.dumps(ujson.loads('{"code": "def has_close_elements(numbers: List[float], threshold: float) -> bool:\n    for i in range(len(numbers)):\n        for j in range(i+1, len(numbers)):\n            if abs(numbers[i] - numbers[j]) < threshold:\n                return True\n    return False"} ')))

{'code': 'def has_close_elements(numbers: List[float], threshold: float) -> bool:\n    for i in range(len(numbers)):\n        for j in range(i+1, len(numbers)):\n            if abs(numbers[i] - numbers[j]) < threshold:\n                return True\n    return False'}

We can see that `functional` first created an example value, {"code": "print('Hello, World!')"}, which is useful for bootstrapping the JSON generation.
Even with that example, the model still failed to generate valid JSON at first.
It apparently decided to first repeat the schema, and then give the actual code "as an example".
The validator caught the error and fed it back as a "Past Error", which finally made the model produce a valid output.

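We need a way to run code. This is actually super tricky to do safely in Python (see https://stackoverflow.com/questions/3068139/how-can-i-sandbox-python-in-pure-python), so we'll just YOLO and call `exec` with `globals={}`. Note that this is not a real sandbox, so only run it on code you are comfortable executing on your own machine.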

In [None]:
from repl import execute_code
print(execute_code("print(3)"))
print(execute_code("assert False"))

3
None
AssertionError()


Let's run the evaluator on all the "canonical solutions" from HumanEval to check that everything is working.

In [None]:
from dspy import Example

# Wrap every HumanEval task in a dspy Example. The reference solution is the task
# prompt followed by the canonical solution text.
devset = [
    Example(
        prompt=PythonCode(code=task['prompt']),
        test=PythonCode(code=task['test']),
        entry_point=task['entry_point'],
        solution=PythonCode(code=task['prompt'] + task['canonical_solution']),
    ).with_inputs('prompt', 'test', 'entry_point')
    for task in ds['test']
]

# The first 40 examples go to the optimizer; the remainder is held out for evaluation.
trainset, testset = devset[:40], devset[40:]

def test_code(timeout=5):
    """Build a metric that runs a predicted solution against the example's tests.

    Args:
        timeout: Forwarded to `execute_code` as its `timeout` argument.

    Returns:
        A callable `metric(example, pred, trace=None)` that returns 1 when
        `execute_code` returns None for the assembled program and 0 otherwise.
    """
    def metric(example, pred, trace=None):
        # Score a prediction with no usable solution as a failure instead of
        # raising AttributeError when `solution` itself is missing or None.
        solution = getattr(pred, "solution", None)
        code = getattr(solution, "code", None)
        if code is None:
            return 0

        # Program layout: typing import, candidate code, test code, then the
        # call that runs the checks against the entry point.
        program = "\n".join([
            "from typing import List",
            f"{code}",
            f"{example.test.code}",
            f"check({example.entry_point})",
        ])
        error = execute_code(program, timeout=timeout)
        return int(error is None)
    return metric

# Shared metric; timeout=5 is forwarded to execute_code for every program it runs.
metric5s = test_code(timeout=5)

# Sanity check: score each example's own reference solution on the held-out split.
print("Score with the original model:")
print(100 * sum(metric5s(example, example) for example in testset) / len(testset))

# If any reference solution fails, print the first offending program and its error.
for example in devset:
    if not metric5s(example, example):
        print("Bad example:")
        code = (
            "from typing import List\n"
            + f"{example.solution.code}\n"
            + f"{example.test.code}\n"
            + f"check({example.entry_point})"
        )
        print(code)
        error = execute_code(code)
        print(f"{error=}")
        break


Score with the original model:
100.0


Now test our program.

In [None]:
from dspy.evaluate.evaluate import Evaluate
# Evaluation settings, collected in one place so they are easy to tweak.
eval_options = dict(
    devset=testset,
    num_threads=30,
    display_progress=True,
    display_table=5,
    max_errors=100,
)
evaluator = Evaluate(**eval_options)

# Baseline: the uncompiled predictor scored with the shared metric5s metric.
res = evaluator(predictor, metric5s)
print(res)

Error for example in dev set: 		 Too many retries


  0%|          | 0/124 [00:00<?, ?it/s]

Error for example in dev set: 		 Too many retries


Average Metric: 0.0 / 3  (0.0):   2%|▏         | 2/124 [00:00<00:01, 77.99it/s]

Error for example in dev set: 		 Too many retries


Average Metric: 24.0 / 37  (64.9):  29%|██▉       | 36/124 [00:00<00:01, 81.56it/s]

Error for example in dev set: 		 Too many retries


Average Metric: 25.0 / 40  (62.5):  31%|███▏      | 39/124 [00:00<00:01, 81.56it/s]

Error for example in dev set: 		 Too many retries


Average Metric: 45.0 / 80  (56.2):  64%|██████▎   | 79/124 [00:00<00:00, 109.09it/s]

Error for example in dev set: 		 Too many retries

Average Metric: 45.0 / 81  (55.6):  65%|██████▍   | 80/124 [00:00<00:00, 109.09it/s]




Average Metric: 64.0 / 119  (53.8):  95%|█████████▌| 118/124 [00:01<00:00, 110.58it/s]

37
15
8


Average Metric: 66.0 / 124  (53.2): 100%|██████████| 124/124 [00:01<00:00, 116.06it/s]
  df = df.applymap(truncate_cell)


Average Metric: 66.0 / 124  (53.2%)


Unnamed: 0,prompt,test,entry_point,example_solution,pred_solution,metric,solution
0,"code='\n\ndef triples_sum_to_zero(l: list):\n """"""\n triples_sum_to_zero takes a list of integers as an input.\n it returns True if there are three distinct elements in the list...","code='\n\nMETADATA = {}\n\n\ndef check(candidate):\n assert candidate([1, 3, 5, 0]) == False\n assert candidate([1, 3, 5, -1]) == False\n assert candidate([1, 3, -2, 1]) == True\n...",triples_sum_to_zero,"code='\n\ndef triples_sum_to_zero(l: list):\n """"""\n triples_sum_to_zero takes a list of integers as an input.\n it returns True if there are three distinct elements in the list...","code='def triples_sum_to_zero(l: list):\n for i in range(len(l)):\n for j in range(i+1, len(l)):\n for k in range(j+1, len(l)):\n if l[i] + l[j] + l[k] == 0:\n...",1.0,
1,"code='\n\ndef car_race_collision(n: int):\n """"""\n Imagine a road that\'s a perfectly straight infinitely long line.\n n cars are driving left to right; simultaneously, a different set...",code='\n\nMETADATA = {}\n\n\ndef check(candidate):\n assert candidate(2) == 4\n assert candidate(3) == 9\n assert candidate(4) == 16\n assert candidate(8) == 64\n assert candidate(10) == 100\n\n',car_race_collision,"code='\n\ndef car_race_collision(n: int):\n """"""\n Imagine a road that\'s a perfectly straight infinitely long line.\n n cars are driving left to right; simultaneously, a different set...",code='def car_race_collision(n: int):\n return n ** 2',1.0,
2,"code='\n\ndef incr_list(l: list):\n """"""Return list with elements incremented by 1.\n >>> incr_list([1, 2, 3])\n [2, 3, 4]\n >>> incr_list([5, 3, 5, 2, 3, 3, 9,...","code='\n\nMETADATA = {}\n\n\ndef check(candidate):\n assert candidate([]) == []\n assert candidate([3, 2, 1]) == [4, 3, 2]\n assert candidate([5, 2, 5, 2, 3, 3, 9, 0,...",incr_list,"code='\n\ndef incr_list(l: list):\n """"""Return list with elements incremented by 1.\n >>> incr_list([1, 2, 3])\n [2, 3, 4]\n >>> incr_list([5, 3, 5, 2, 3, 3, 9,...",code='def incr_list(l: list):\n return [x + 1 for x in l]',1.0,
3,"code='\n\ndef pairs_sum_to_zero(l):\n """"""\n pairs_sum_to_zero takes a list of integers as an input.\n it returns True if there are two distinct elements in the list that\n...","code='\n\nMETADATA = {}\n\n\ndef check(candidate):\n assert candidate([1, 3, 5, 0]) == False\n assert candidate([1, 3, -2, 1]) == False\n assert candidate([1, 2, 3, 7]) == False\n...",pairs_sum_to_zero,"code='\n\ndef pairs_sum_to_zero(l):\n """"""\n pairs_sum_to_zero takes a list of integers as an input.\n it returns True if there are two distinct elements in the list that\n...",code='def pairs_sum_to_zero(l):\n return any(-x in l for x in l if x != 0)\n',1.0,
4,"code='\n\ndef change_base(x: int, base: int):\n """"""Change numerical base of input number x to base.\n return string representation after the conversion.\n base numbers are less than...","code='\n\nMETADATA = {}\n\n\ndef check(candidate):\n assert candidate(8, 3) == ""22""\n assert candidate(9, 3) == ""100""\n assert candidate(234, 2) == ""11101010""\n assert candidate(16, 2) == ""10000""\n assert...",change_base,"code='\n\ndef change_base(x: int, base: int):\n """"""Change numerical base of input number x to base.\n return string representation after the conversion.\n base numbers are less than...","code='def change_base(x: int, base: int):\n return str(int(str(x), base))'",0.0,


53.23


Let's try to optimize it a bit

In [None]:
from dspy.teleprompt.random_search import BootstrapFewShotWithRandomSearch

print("Compiling...")
# Configure the optimizer first, then run it, so each step can be inspected on its own.
# (A plain BootstrapFewShot was the alternative previously tried here.)
optimizer = BootstrapFewShotWithRandomSearch(
    metric=metric5s,
    num_threads=30,
    num_candidate_programs=5,
    max_labeled_demos=8,
)
compiled = optimizer.compile(predictor, trainset=trainset)

Compiling...
Going to sample between 1 and 4 traces per predictor.
Will attempt to train 5 candidate sets.
Error for example in dev set: 		 Too many retries


Average Metric: 25.0 / 40  (62.5): 100%|██████████| 40/40 [00:01<00:00, 22.16it/s] 
  df = df.applymap(truncate_cell)


Average Metric: 25.0 / 40  (62.5%)
Score: 62.5 for set: [0]
New best score: 62.5 for seed -3
Scores so far: [62.5]
Best score: 62.5


Average Metric: 34.0 / 40  (85.0): 100%|██████████| 40/40 [00:18<00:00,  2.17it/s]


Error for example in dev set: 		 Too many retries
Average Metric: 34.0 / 40  (85.0%)
Score: 85.0 for set: [8]
New best score: 85.0 for seed -2
Scores so far: [62.5, 85.0]
Best score: 85.0


 10%|█         | 4/40 [00:02<00:19,  1.82it/s]


Bootstrapped 4 full traces after 5 examples in round 0.


Average Metric: 33 / 40  (82.5): 100%|██████████| 40/40 [01:48<00:00,  2.72s/it] 


Average Metric: 33 / 40  (82.5%)
Score: 82.5 for set: [8]
Scores so far: [62.5, 85.0, 82.5]
Best score: 85.0
Average of max per entry across top 1 scores: 0.85
Average of max per entry across top 2 scores: 0.975
Average of max per entry across top 3 scores: 1.0
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


 10%|█         | 4/40 [00:10<01:37,  2.70s/it]


Bootstrapped 4 full traces after 5 examples in round 0.


Average Metric: 35 / 40  (87.5): 100%|██████████| 40/40 [00:12<00:00,  3.15it/s] 


Average Metric: 35 / 40  (87.5%)
Score: 87.5 for set: [8]
New best score: 87.5 for seed 0
Scores so far: [62.5, 85.0, 82.5, 87.5]
Best score: 87.5
Average of max per entry across top 1 scores: 0.875
Average of max per entry across top 2 scores: 0.975
Average of max per entry across top 3 scores: 1.0
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  5%|▌         | 2/40 [00:03<01:07,  1.79s/it]


Bootstrapped 2 full traces after 3 examples in round 0.


Average Metric: 36 / 40  (90.0): 100%|██████████| 40/40 [00:09<00:00,  4.35it/s] 


Average Metric: 36 / 40  (90.0%)
Score: 90.0 for set: [8]
New best score: 90.0 for seed 1
Scores so far: [62.5, 85.0, 82.5, 87.5, 90.0]
Best score: 90.0
Average of max per entry across top 1 scores: 0.9
Average of max per entry across top 2 scores: 0.95
Average of max per entry across top 3 scores: 0.975
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  5%|▌         | 2/40 [00:05<01:37,  2.57s/it]


Bootstrapped 1 full traces after 3 examples in round 0.


Average Metric: 34 / 40  (85.0): 100%|██████████| 40/40 [00:12<00:00,  3.31it/s] 


Average Metric: 34 / 40  (85.0%)
Score: 85.0 for set: [8]
Scores so far: [62.5, 85.0, 82.5, 87.5, 90.0, 85.0]
Best score: 90.0
Average of max per entry across top 1 scores: 0.9
Average of max per entry across top 2 scores: 0.95
Average of max per entry across top 3 scores: 0.975
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  8%|▊         | 3/40 [00:07<01:34,  2.56s/it]


Bootstrapped 2 full traces after 4 examples in round 0.


Average Metric: 32 / 40  (80.0): 100%|██████████| 40/40 [00:18<00:00,  2.16it/s] 


Average Metric: 32 / 40  (80.0%)
Score: 80.0 for set: [8]
Scores so far: [62.5, 85.0, 82.5, 87.5, 90.0, 85.0, 80.0]
Best score: 90.0
Average of max per entry across top 1 scores: 0.9
Average of max per entry across top 2 scores: 0.95
Average of max per entry across top 3 scores: 0.975
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  5%|▌         | 2/40 [00:03<01:07,  1.76s/it]


Bootstrapped 2 full traces after 3 examples in round 0.


Average Metric: 35 / 40  (87.5): 100%|██████████| 40/40 [00:12<00:00,  3.18it/s] 

Average Metric: 35 / 40  (87.5%)
Score: 87.5 for set: [8]
Scores so far: [62.5, 85.0, 82.5, 87.5, 90.0, 85.0, 80.0, 87.5]
Best score: 90.0
Average of max per entry across top 1 scores: 0.9
Average of max per entry across top 2 scores: 0.95
Average of max per entry across top 3 scores: 0.975
Average of max per entry across top 5 scores: 0.975
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0
8 candidate programs found.





Finally, evaluate the compiled program on the held-out test set.

In [None]:
print("Evaluating...")
# NOTE(review): the baseline run used metric5s (timeout=5) while this run uses timeout=100,
# so the two scores are not measured under the same limit — confirm this is intended.
final_metric = test_code(timeout=100)
compiled_score = evaluator(compiled, metric=final_metric)
print("Compiled HumanEval score:", compiled_score)

Evaluating...


Average Metric: 59.0 / 95  (62.1):  77%|███████▋  | 95/124 [00:16<00:04,  7.14it/s]

Error for example in dev set: 		 Too many retries


Average Metric: 74.0 / 120  (61.7):  97%|█████████▋| 120/124 [00:22<00:01,  2.43it/s]

Error for example in dev set: 		 Too many retries


Average Metric: 75.0 / 123  (61.0):  99%|█████████▉| 123/124 [00:25<00:00,  1.39it/s]

Error for example in dev set: 		 Too many retries


Average Metric: 75.0 / 124  (60.5): 100%|██████████| 124/124 [00:30<00:00,  4.07it/s]

Error for example in dev set: 		 Too many retries
Average Metric: 75.0 / 124  (60.5%)





Unnamed: 0,prompt,test,entry_point,example_solution,pred_solution,metric,solution
0,"code='\n\ndef triples_sum_to_zero(l: list):\n """"""\n triples_sum_to_zero takes a list of integers as an input.\n it returns True if there are three distinct elements in the list...","code='\n\nMETADATA = {}\n\n\ndef check(candidate):\n assert candidate([1, 3, 5, 0]) == False\n assert candidate([1, 3, 5, -1]) == False\n assert candidate([1, 3, -2, 1]) == True\n...",triples_sum_to_zero,"code='\n\ndef triples_sum_to_zero(l: list):\n """"""\n triples_sum_to_zero takes a list of integers as an input.\n it returns True if there are three distinct elements in the list...","code='\n\ndef triples_sum_to_zero(l: list):\n """"""\n triples_sum_to_zero takes a list of integers as an input.\n it returns True if there are three distinct elements in the list...",1.0,
1,"code='\n\ndef car_race_collision(n: int):\n """"""\n Imagine a road that\'s a perfectly straight infinitely long line.\n n cars are driving left to right; simultaneously, a different set...",code='\n\nMETADATA = {}\n\n\ndef check(candidate):\n assert candidate(2) == 4\n assert candidate(3) == 9\n assert candidate(4) == 16\n assert candidate(8) == 64\n assert candidate(10) == 100\n\n',car_race_collision,"code='\n\ndef car_race_collision(n: int):\n """"""\n Imagine a road that\'s a perfectly straight infinitely long line.\n n cars are driving left to right; simultaneously, a different set...","code='\n\ndef car_race_collision(n: int):\n """"""\n Imagine a road that\'s a perfectly straight infinitely long line.\n n cars are driving left to right; simultaneously, a different set...",1.0,
2,"code='\n\ndef incr_list(l: list):\n """"""Return list with elements incremented by 1.\n >>> incr_list([1, 2, 3])\n [2, 3, 4]\n >>> incr_list([5, 3, 5, 2, 3, 3, 9,...","code='\n\nMETADATA = {}\n\n\ndef check(candidate):\n assert candidate([]) == []\n assert candidate([3, 2, 1]) == [4, 3, 2]\n assert candidate([5, 2, 5, 2, 3, 3, 9, 0,...",incr_list,"code='\n\ndef incr_list(l: list):\n """"""Return list with elements incremented by 1.\n >>> incr_list([1, 2, 3])\n [2, 3, 4]\n >>> incr_list([5, 3, 5, 2, 3, 3, 9,...","code='\n\ndef incr_list(l: list):\n """"""Return list with elements incremented by 1.\n >>> incr_list([1, 2, 3])\n [2, 3, 4]\n >>> incr_list([5, 3, 5, 2, 3, 3, 9,...",1.0,
3,"code='\n\ndef pairs_sum_to_zero(l):\n """"""\n pairs_sum_to_zero takes a list of integers as an input.\n it returns True if there are two distinct elements in the list that\n...","code='\n\nMETADATA = {}\n\n\ndef check(candidate):\n assert candidate([1, 3, 5, 0]) == False\n assert candidate([1, 3, -2, 1]) == False\n assert candidate([1, 2, 3, 7]) == False\n...",pairs_sum_to_zero,"code='\n\ndef pairs_sum_to_zero(l):\n """"""\n pairs_sum_to_zero takes a list of integers as an input.\n it returns True if there are two distinct elements in the list that\n...","code='\n\ndef pairs_sum_to_zero(l):\n """"""\n pairs_sum_to_zero takes a list of integers as an input.\n it returns True if there are two distinct elements in the list that\n...",1.0,
4,"code='\n\ndef change_base(x: int, base: int):\n """"""Change numerical base of input number x to base.\n return string representation after the conversion.\n base numbers are less than...","code='\n\nMETADATA = {}\n\n\ndef check(candidate):\n assert candidate(8, 3) == ""22""\n assert candidate(9, 3) == ""100""\n assert candidate(234, 2) == ""11101010""\n assert candidate(16, 2) == ""10000""\n assert...",change_base,"code='\n\ndef change_base(x: int, base: int):\n """"""Change numerical base of input number x to base.\n return string representation after the conversion.\n base numbers are less than...","code='\n\ndef change_base(x: int, base: int):\n """"""Change numerical base of input number x to base.\n return string representation after the conversion.\n base numbers are less than...",1.0,


Compiled HumanEval score: 60.48
