Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sweep MVP #191

Merged
merged 7 commits into from
Apr 16, 2023
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Support pooled datasets, --name, and --add_pooled in sweep
  • Loading branch information
norabelrose committed Apr 16, 2023
commit d78b13772732421c1da594575eaf047f18c765ca
32 changes: 25 additions & 7 deletions elk/training/sweep.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from dataclasses import dataclass
from dataclasses import InitVar, dataclass

from ..extraction import Extract, PromptConfig
from ..files import elk_reporter_dir, memorably_named_dir
Expand All @@ -8,20 +8,34 @@
@dataclass
class Sweep:
models: list[str]
"""List of Huggingface model strings to sweep over."""
datasets: list[str]
"""List of dataset strings to sweep over. Each dataset string can contain
multiple datasets, separated by plus signs. For example, "sst2+imdb" will
pool SST-2 and IMDB together."""
add_pooled: InitVar[bool] = False
"""Whether to add a dataset that pools all of the other datasets together."""

def __post_init__(self):
if not self.models:
raise ValueError("No models specified")
name: str | None = None

def __post_init__(self, add_pooled: bool):
if not self.datasets:
raise ValueError("No datasets specified")
if not self.models:
raise ValueError("No models specified")

# Add an additional dataset that pools all of the datasets together.
if add_pooled:
self.datasets.append("+".join(self.datasets))

def execute(self):
M, D = len(self.models), len(self.datasets)
print(f"Starting sweep over {M} models and {D} datasets ({M * D} runs))")
print(f"Starting sweep over {M} models and {D} datasets ({M * D} runs)")
print(f"Models: {self.models}")
print(f"Datasets: {self.datasets}")

root_dir = elk_reporter_dir() / "sweeps"
sweep_dir = memorably_named_dir(root_dir)
sweep_dir = root_dir / self.name if self.name else memorably_named_dir(root_dir)
print(f"Saving sweep results to \033[1m{sweep_dir}\033[0m") # bold

for i, model_str in enumerate(self.models):
Expand All @@ -31,11 +45,15 @@ def execute(self):
for dataset_str in self.datasets:
out_dir = sweep_dir / model_str / dataset_str

# Allow for multiple datasets to be specified in a single string with
# plus signs. This means we can pool datasets together inside of a
# single sweep.
datasets = [ds.strip() for ds in dataset_str.split("+")]
Elicit(
data=Extract(
model=model_str,
prompts=PromptConfig(
datasets=[dataset_str],
datasets=datasets,
),
),
out_dir=out_dir,
Expand Down