Changed MIPRO and COPRO fields
XenonMolecule committed Mar 7, 2024
1 parent 5f4cd19 commit 3b7f285
Showing 4 changed files with 69 additions and 48 deletions.
8 changes: 4 additions & 4 deletions dspy/teleprompt/copro_optimizer.py
@@ -13,7 +13,7 @@
teleprompter = COPRO(prompt_model=prompt_model, metric=metric, breadth=BREADTH, depth=DEPTH, init_temperature=INIT_TEMPERATURE)
kwargs = dict(num_threads=NUM_THREADS, display_progress=True, display_table=0)
- compiled_prompt_opt = teleprompter.compile(program.deepcopy(), devset=devset[:DEV_NUM], eval_kwargs=kwargs)
+ compiled_prompt_opt = teleprompter.compile(program.deepcopy(), trainset=trainset[:DEV_NUM], eval_kwargs=kwargs)
eval_score = evaluate(compiled_prompt_opt, devset=evalset[:EVAL_NUM], **kwargs)
Note that this teleprompter takes in the following parameters:
@@ -109,10 +109,10 @@ def _set_signature(self, predictor, updated_signature):
predictor.signature = updated_signature


- def compile(self, student, *, devset, eval_kwargs):
+ def compile(self, student, *, trainset, eval_kwargs):
"""student is a program that needs to be optimized, note that it may be zero-shot or already pre-optimized for demos != []"""
module = student.deepcopy()
- evaluate = Evaluate(devset=devset, metric=self.metric, **eval_kwargs)
+ evaluate = Evaluate(devset=trainset, metric=self.metric, **eval_kwargs)
total_calls = 0
results_best = {id(p):{"depth": [], "max": [], "average": [], "min":[], "std": []} for p in module.predictors()}
results_latest = {id(p):{"depth": [], "max": [], "average": [], "min":[], "std": []} for p in module.predictors()}
@@ -179,7 +179,7 @@ def compile(self, student, *, devset, eval_kwargs):
if self.verbose: print(f"Predictor {i}")
self._print_signature(predictor)
if self.verbose: print(f"At Depth {d}/{self.depth}, Evaluating Prompt Candidate #{c_i}/{len(candidates_)} for Predictor {p_i} of {len(module.predictors())}.")
- score = evaluate(module_clone, devset=devset, **eval_kwargs)
+ score = evaluate(module_clone, devset=trainset, **eval_kwargs)
if self.verbose and self.prompt_model: print(f"prompt_model.inspect_history(n=1) {self.prompt_model.inspect_history(n=1)}")
total_calls += 1
if self.verbose: print("----------------")
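For orientation, here is a minimal sketch of calling COPRO under the renamed keyword. The model choice, metric, toy program, and dataset below are illustrative assumptions, not part of this commit:

```python
# Hedged sketch of the renamed COPRO interface; the model, metric,
# program, and dataset are assumed for illustration.
import dspy
from dspy.teleprompt import COPRO

lm = dspy.OpenAI(model="gpt-3.5-turbo")  # hypothetical model choice
dspy.settings.configure(lm=lm)

class SimpleQA(dspy.Module):
    def __init__(self):
        super().__init__()
        self.generate = dspy.Predict("question -> answer")

    def forward(self, question):
        return self.generate(question=question)

def exact_match(example, prediction, trace=None):  # hypothetical metric
    return example.answer.lower() == prediction.answer.lower()

trainset = [
    dspy.Example(question="What is 2 + 2?", answer="4").with_inputs("question"),
    dspy.Example(question="What is the capital of France?", answer="Paris").with_inputs("question"),
]

teleprompter = COPRO(prompt_model=lm, metric=exact_match, breadth=10, depth=3, init_temperature=1.4)
eval_kwargs = dict(num_threads=4, display_progress=True, display_table=0)

# The keyword-only argument is now `trainset`; passing `devset=` raises a
# TypeError after this change.
compiled = teleprompter.compile(SimpleQA(), trainset=trainset, eval_kwargs=eval_kwargs)
```

Because `trainset` is keyword-only, old `devset=` call sites fail fast with a TypeError rather than silently misbehaving.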
65 changes: 23 additions & 42 deletions dspy/teleprompt/mipro_optimizer.py
@@ -23,17 +23,17 @@
from dspy.teleprompt import MIPROOptimizer
- teleprompter = MIPROOptimizer(prompt_model=prompt_model, task_model=task_model, metric=metric, n=10, init_temperature=1.0)
+ teleprompter = MIPROOptimizer(prompt_model=prompt_model, task_model=task_model, metric=metric, num_candidates=10, init_temperature=1.0)
kwargs = dict(num_threads=NUM_THREADS, display_progress=True, display_table=0)
- compiled_prompt_opt = teleprompter.compile(program, devset=devset[:DEV_NUM], trials_num=100, max_bootstrapped_demos=3, max_labeled_demos=5, eval_kwargs=kwargs)
+ compiled_prompt_opt = teleprompter.compile(program, trainset=trainset[:TRAIN_NUM], num_trials=100, max_bootstrapped_demos=3, max_labeled_demos=5, eval_kwargs=kwargs)
eval_score = evaluate(compiled_prompt_opt, devset=evalset[:EVAL_NUM], **kwargs)
Note that this teleprompter takes in the following parameters:
* prompt_model: The model used for prompt generation. When unspecified, defaults to the model set in settings (ie. dspy.settings.configure(lm=task_model)).
* task_model: The model used for running the task. When unspecified, defaults to the model set in settings (ie. dspy.settings.configure(lm=task_model)).
* metric: The task metric used for optimization.
- * n: The number of new prompts and sets of fewshot examples to generate and evaluate. Default=10.
+ * num_candidates: The number of new prompts and sets of fewshot examples to generate and evaluate. Default=10.
* init_temperature: The temperature used to generate new prompts. Higher roughly equals more creative. Default=1.0.
* verbose: Tells the method whether or not to print intermediate steps.
* track_stats: Tells the method whether or not to track statistics about the optimization process.
@@ -105,8 +105,8 @@ class DatasetDescriptorWithPriorObservations(dspy.Signature):
observations = dspy.OutputField(desc="Some things that hold true for most or all of the data you observed or COMPLETE if you have nothing to add")

class MIPRO(Teleprompter):
- def __init__(self, prompt_model=None, task_model=None, teacher_settings={}, n=10, metric=None, init_temperature=1.0, verbose=False, track_stats=True, view_data_batch_size=10):
- self.n = n
+ def __init__(self, prompt_model=None, task_model=None, teacher_settings={}, num_candidates=10, metric=None, init_temperature=1.0, verbose=False, track_stats=True, view_data_batch_size=10):
+ self.n = num_candidates
self.metric = metric
self.init_temperature = init_temperature
self.prompt_model = prompt_model if prompt_model is not None else dspy.settings.lm
@@ -279,43 +279,24 @@ def _generate_first_N_candidates(self, module, N, view_data, view_examples, demo

return candidates, evaluated_candidates

- def compile(self, student, *, devset, max_bootstrapped_demos, max_labeled_demos, eval_kwargs, seed=42, view_data=True, view_examples=True, requires_permission_to_run=True, trials_num=None, optuna_trials_num=None):
+ def compile(self, student, *, trainset, max_bootstrapped_demos, max_labeled_demos, eval_kwargs, seed=42, view_data=True, view_examples=True, requires_permission_to_run=True, num_trials=None):
# Define ANSI escape codes for colors
YELLOW = '\033[93m'
BLUE = '\033[94m'
BOLD = '\033[1m'
ENDC = '\033[0m' # Resets the color to default

- # Check if both trials_num and optuna_trials_num are None
- if trials_num is None and optuna_trials_num is None:
- raise ValueError(f"{YELLOW}{BOLD}You must specify the number of trials using the 'trials_num' parameter.{ENDC}")
-
- # Check if the deprecated parameter is used
- if optuna_trials_num is not None:
- print("in it!")
- # Issue a deprecation warning
- warnings.warn(
- "`trials_num` is deprecated and will be removed in a future version. "
- "Use `trials_num` instead.",
- DeprecationWarning
- )
- # Use trials_num as a fallback if trials_num is not provided
- if trials_num is None:
- trials_num = optuna_trials_num

random.seed(seed)

- estimated_task_model_calls_wo_module_calls = len(devset) * trials_num # M * T * P
+ estimated_task_model_calls_wo_module_calls = len(trainset) * num_trials # M * T * P
estimated_prompt_model_calls = 10 + self.n * len(student.predictors()) # num data summary calls + N * P



user_message = textwrap.dedent(f"""\
{YELLOW}{BOLD}WARNING: Projected Language Model (LM) Calls{ENDC}
Please be advised that based on the parameters you have set, the maximum number of LM calls is projected as follows:
- {YELLOW}- Task Model: {BLUE}{BOLD}{len(devset)}{ENDC}{YELLOW} examples in dev set * {BLUE}{BOLD}{trials_num}{ENDC}{YELLOW} trials * {BLUE}{BOLD}# of LM calls in your program{ENDC}{YELLOW} = ({BLUE}{BOLD}{estimated_task_model_calls_wo_module_calls} * # of LM calls in your program{ENDC}{YELLOW}) task model calls{ENDC}
+ {YELLOW}- Task Model: {BLUE}{BOLD}{len(trainset)}{ENDC}{YELLOW} examples in dev set * {BLUE}{BOLD}{num_trials}{ENDC}{YELLOW} trials * {BLUE}{BOLD}# of LM calls in your program{ENDC}{YELLOW} = ({BLUE}{BOLD}{estimated_task_model_calls_wo_module_calls} * # of LM calls in your program{ENDC}{YELLOW}) task model calls{ENDC}
{YELLOW}- Prompt Model: # data summarizer calls (max {BLUE}{BOLD}10{ENDC}{YELLOW}) + {BLUE}{BOLD}{self.n}{ENDC}{YELLOW} * {BLUE}{BOLD}{len(student.predictors())}{ENDC}{YELLOW} lm calls in program = {BLUE}{BOLD}{estimated_prompt_model_calls}{ENDC}{YELLOW} prompt model calls{ENDC}
{YELLOW}{BOLD}Estimated Cost Calculation:{ENDC}
@@ -326,7 +307,7 @@ def compile(self, student, *, devset, max_bootstrapped_demos, max_labeled_demos,
For a preliminary estimate of potential costs, we recommend you perform your own calculations based on the task
and prompt models you intend to use. If the projected costs exceed your budget or expectations, you may consider:
- {YELLOW}- Reducing the number of trials (`trials_num`), the size of the trainset, or the number of LM calls in your program.{ENDC}
+ {YELLOW}- Reducing the number of trials (`num_trials`), the size of the trainset, or the number of LM calls in your program.{ENDC}
{YELLOW}- Using a cheaper task model to optimize the prompt.{ENDC}
To proceed with the execution of this program, please confirm by typing {BLUE}'y'{ENDC} for yes or {BLUE}'n'{ENDC} for no.
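To make the projected-call arithmetic in this warning concrete, here is a small worked instance; every number below is an assumption, not a value from the commit:

```python
# Worked example of the LM-call projection above, with assumed sizes.
trainset_size = 200    # len(trainset)
num_trials = 30        # optimization trials
program_lm_calls = 2   # LM calls per program invocation (program-dependent)
num_candidates = 10    # self.n
num_predictors = 2     # len(student.predictors())

# Task model: len(trainset) * num_trials * (# of LM calls in your program)
task_model_calls = trainset_size * num_trials * program_lm_calls   # 12000

# Prompt model: up to 10 data-summary calls + num_candidates * num_predictors
prompt_model_calls = 10 + num_candidates * num_predictors          # 30
```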
@@ -348,7 +329,7 @@ def compile(self, student, *, devset, max_bootstrapped_demos, max_labeled_demos,
else:
# Set up program and evaluation function
module = student.deepcopy()
- evaluate = Evaluate(devset=devset, metric=self.metric, **eval_kwargs)
+ evaluate = Evaluate(devset=trainset, metric=self.metric, **eval_kwargs)

# In the case where the bootstrapped and labeled demos are set to 0, we'll still bootstrap examples to use in our meta prompt
if max_bootstrapped_demos==0 and max_labeled_demos==0: #TODO: address case when max_bootstrapped alone is 0
@@ -371,10 +352,10 @@ def compile(self, student, *, devset, max_bootstrapped_demos, max_labeled_demos,

# Create a new basic bootstrap few-shot program.
rng = random.Random(i)
- shuffled_devset = devset[:] # Create a copy of devset
- rng.shuffle(shuffled_devset) # Shuffle the copy
+ shuffled_trainset = trainset[:] # Create a copy of trainset
+ rng.shuffle(shuffled_trainset) # Shuffle the copy
tp = BootstrapFewShot(metric = self.metric, max_bootstrapped_demos=max_bootstrapped_demos_for_candidate_gen, max_labeled_demos=max_labeled_demos_for_candidate_gen, teacher_settings=self.teacher_settings)
- candidate_program = tp.compile(student=module.deepcopy(), trainset=shuffled_devset)
+ candidate_program = tp.compile(student=module.deepcopy(), trainset=shuffled_trainset)

# Store the candidate demos
for module_p, candidate_p in zip(module.predictors(), candidate_program.predictors()):
@@ -383,7 +364,7 @@ def compile(self, student, *, devset, max_bootstrapped_demos, max_labeled_demos,
demo_candidates[id(module_p)].append(candidate_p.demos)

# Generate N candidate prompts
- instruction_candidates, _ = self._generate_first_N_candidates(module, self.n, view_data, view_examples, demo_candidates, devset)
+ instruction_candidates, _ = self._generate_first_N_candidates(module, self.n, view_data, view_examples, demo_candidates, trainset)

# Reset demo_candidates to None for our optimization if the user asked for no fewshot examples
if max_bootstrapped_demos==0 and max_labeled_demos==0:
@@ -397,7 +378,7 @@ def compile(self, student, *, devset, max_bootstrapped_demos, max_labeled_demos,
trial_logs = {}

# Define our trial objective
- def create_objective(baseline_program, instruction_candidates, demo_candidates, evaluate, devset):
+ def create_objective(baseline_program, instruction_candidates, demo_candidates, evaluate, trainset):
def objective(trial):
nonlocal best_program, best_score, trial_num, trial_logs # Allow access to the outer variables
candidate_program = baseline_program.deepcopy()
@@ -444,17 +425,17 @@ def objective(trial):
# Evaluate with the new prompts
total_score = 0
batch_size = 100
- num_batches = math.ceil(len(devset) / batch_size)
+ num_batches = math.ceil(len(trainset) / batch_size)

for i in range(num_batches):
start_index = i * batch_size
- end_index = min((i + 1) * batch_size, len(devset))
- split_dev = devset[start_index:end_index]
- split_score = evaluate(candidate_program, devset=split_dev, display_table=0)
+ end_index = min((i + 1) * batch_size, len(trainset))
+ split_trainset = trainset[start_index:end_index]
+ split_score = evaluate(candidate_program, devset=split_trainset, display_table=0)
if self.verbose: print(f"{i}st split score: {split_score}")

- total_score += split_score * len(split_dev)
- curr_weighted_avg_score = total_score / min((i+1)*100,len(devset))
+ total_score += split_score * len(split_trainset)
+ curr_weighted_avg_score = total_score / min((i+1)*100,len(trainset))
if self.verbose: print(f"curr average score: {curr_weighted_avg_score}")

trial.report(curr_weighted_avg_score, i)
@@ -487,10 +468,10 @@ def objective(trial):
return objective

# Run the trial
- objective_function = create_objective(module, instruction_candidates, demo_candidates, evaluate, devset)
+ objective_function = create_objective(module, instruction_candidates, demo_candidates, evaluate, trainset)
sampler = optuna.samplers.TPESampler(seed=seed)
study = optuna.create_study(direction="maximize", sampler=sampler)
- score = study.optimize(objective_function, n_trials=trials_num)
+ score = study.optimize(objective_function, n_trials=num_trials)

if best_program is not None and self.track_stats:
best_program.trial_logs = trial_logs
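Putting the renames together, a hedged sketch of driving MIPRO after this commit, reusing the assumed `lm`, `exact_match`, `SimpleQA`, `trainset`, and `eval_kwargs` from the COPRO sketch above:

```python
# Hedged sketch of the renamed MIPRO interface (names assumed as above).
from dspy.teleprompt import MIPRO

teleprompter = MIPRO(prompt_model=lm, task_model=lm, metric=exact_match,
                     num_candidates=10, init_temperature=1.0)

# `trainset` replaces `devset`, and `num_trials` replaces both `trials_num`
# and `optuna_trials_num`. With the in-method validation removed by this
# commit, leaving `num_trials` unset fails with a TypeError when the call
# estimate is computed, so pass it explicitly.
compiled = teleprompter.compile(SimpleQA(), trainset=trainset, num_trials=30,
                                max_bootstrapped_demos=3, max_labeled_demos=5,
                                eval_kwargs=eval_kwargs,
                                requires_permission_to_run=False)  # skip the interactive confirmation
```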
12 changes: 11 additions & 1 deletion dspy/teleprompt/signature_opt.py
@@ -1,4 +1,5 @@
from .copro_optimizer import COPRO
+ import warnings
"""
===============================================================
DEPRECATED!!!
@@ -32,4 +33,13 @@

class SignatureOptimizer(COPRO):
def __init__(self, prompt_model=None, metric=None, breadth=10, depth=3, init_temperature=1.4, verbose=False, track_stats=False):
- super().__init__(prompt_model, metric, breadth, depth, init_temperature, verbose, track_stats)
+ # warnings.warn(
+ # "`SignatureOptimizer` is deprecated and will be removed in a future version. "
+ # "Use `COPRO` instead.",
+ # DeprecationWarning
+ # )
+ print(u"\u001b[31m[WARNING] SignatureOptimizer has been deprecated and replaced with COPRO. SignatureOptimizer will be removed in a future release. \u001b[31m")
+ super().__init__(prompt_model, metric, breadth, depth, init_temperature, verbose, track_stats)
+
+ def compile(self, student, *, devset, eval_kwargs):
+ super().compile(student, trainset=devset, eval_kwargs=eval_kwargs)
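The override keeps legacy SignatureOptimizer call sites compiling unchanged; a sketch, reusing the assumed names from the COPRO example above:

```python
# Legacy call site: `devset` is still accepted here and forwarded to
# COPRO.compile as `trainset`, after the deprecation warning is printed.
from dspy.teleprompt import SignatureOptimizer

optimizer = SignatureOptimizer(prompt_model=lm, metric=exact_match)
optimizer.compile(SimpleQA(), devset=trainset, eval_kwargs=eval_kwargs)
# Caveat: as written in this commit, the overridden compile() does not
# return COPRO.compile's result, so this call yields None.
```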
32 changes: 31 additions & 1 deletion dspy/teleprompt/signature_opt_bayesian.py
@@ -1,4 +1,5 @@
from dspy.teleprompt.mipro_optimizer import MIPRO
+ import warnings

"""
===============================================================
@@ -36,6 +37,35 @@

class BayesianSignatureOptimizer(MIPRO):
def __init__(self, prompt_model=None, task_model=None, teacher_settings={}, n=10, metric=None, init_temperature=1.0, verbose=False, track_stats=True, view_data_batch_size=10):
+ # warnings.warn(
+ # "`BayesianSignatureOptimizer` is deprecated and will be removed in a future version. "
+ # "Use `MIPRO` instead.",
+ # DeprecationWarning
+ # )
+ print(u"\u001b[31m[WARNING] BayesianSignatureOptimizer has been deprecated and replaced with MIPRO. BayesianSignatureOptimizer will be removed in a future release. \u001b[31m")

super().__init__(prompt_model, task_model, teacher_settings,n,metric,init_temperature,verbose,track_stats,view_data_batch_size)

+ def compile(self, student, *, devset, max_bootstrapped_demos, max_labeled_demos, eval_kwargs, seed=42, view_data=True, view_examples=True, requires_permission_to_run=True, trials_num=None, optuna_trials_num=None):
+ # Define ANSI escape codes for colors
+ YELLOW = '\033[93m'
+ BLUE = '\033[94m'
+ BOLD = '\033[1m'
+ ENDC = '\033[0m' # Resets the color to default
+
+ # Check if both trials_num and optuna_trials_num are None
+ if trials_num is None and optuna_trials_num is None:
+ raise ValueError(f"{YELLOW}{BOLD}You must specify the number of trials using the 'trials_num' parameter.{ENDC}")
+
+ # Check if the deprecated parameter is used
+ if optuna_trials_num is not None:
+ # Issue a deprecation warning
+ warnings.warn(
+ "`optuna_trials_num` is deprecated and will be removed in a future version. "
+ "Use `trials_num` instead.",
+ DeprecationWarning
+ )
+ # Use trials_num as a fallback if trials_num is not provided
+ if trials_num is None:
+ trials_num = optuna_trials_num
+ super().compile(student, trainset=devset, max_bootstrapped_demos=max_bootstrapped_demos, max_labeled_demos=max_labeled_demos, eval_kwargs=eval_kwargs, seed=seed, view_data=view_data, view_examples=view_examples, requires_permission_to_run=requires_permission_to_run, num_trials=trials_num)
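And the corresponding legacy path for BayesianSignatureOptimizer, which folds both old trial keywords into MIPRO's `num_trials`; again a sketch with the names assumed in the earlier examples:

```python
# Legacy call site: `optuna_trials_num` triggers a DeprecationWarning and is
# folded into `trials_num`, which MIPRO.compile receives as `num_trials`;
# `devset` is likewise forwarded as `trainset`.
from dspy.teleprompt import BayesianSignatureOptimizer

optimizer = BayesianSignatureOptimizer(prompt_model=lm, task_model=lm,
                                       n=10, metric=exact_match)
optimizer.compile(SimpleQA(), devset=trainset, max_bootstrapped_demos=3,
                  max_labeled_demos=5, eval_kwargs=eval_kwargs,
                  optuna_trials_num=30)
# Like the SignatureOptimizer shim, this compile() does not return the
# optimized program in this commit.
```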
