Add Python 3.11 to CI tests #91

Merged · 18 commits · Feb 19, 2023
Changes from 1 commit
split dataset before multiprocessing
AlexWan0 committed Feb 18, 2023
commit 189bb30ed4d9471f300276bd4c6e027b119958f8

elk/extraction/extraction.py (13 changes: 9 additions & 4 deletions)

@@ -15,7 +15,6 @@ class ExtractionParameters:
     model: PreTrainedModel
     tokenizer: PreTrainedTokenizerBase
     collator: PromptCollator
-    indices: int
     batch_size: int = 1
     layers: Sequence[int] = ()
     prompt_suffix: str = ""
@@ -59,7 +58,8 @@ def extract_hiddens(
     prompt_suffix: str = "",
     token_loc: Literal["first", "last", "mean"] = "last",
     use_encoder_states: bool = False,
-    num_procs: int = 1
+    num_procs: int = 1,
+    seed_start: int = 42
 ):
     """Run inference on a model with a set of prompts, yielding the hidden states."""
 
@@ -72,12 +72,14 @@ def extract_hiddens(
 
     all_params = []
 
+    # use a different random seed for each process
+    curr_seed = seed_start
+
     for proc_indices in all_proc_indices:
         params = ExtractionParameters(
             model=model,
             tokenizer=tokenizer,
-            collator=collator,
-            indices=proc_indices,
+            collator=collator.split_and_copy(proc_indices, curr_seed),
             batch_size=batch_size,
             layers=layers,
             prompt_suffix=prompt_suffix,
@@ -86,6 +88,8 @@ def extract_hiddens(
         )
 
         all_params.append(params)
+
+        curr_seed += 1
 
     # each list needs to have length num_procs
     multiprocess_kwargs = {
@@ -94,6 +98,7 @@ def extract_hiddens(
        'wrapped_num_procs': [num_procs] * num_procs
    }
 
+    # CUDA requires the "spawn" start method rather than the default "fork" (on Unix)
    mp.set_start_method("spawn")
 
    return Dataset.from_generator(
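
The hunks above hand each worker a disjoint slice of the dataset plus its own seed. A minimal sketch of that pattern follows; note that this commit does not show how all_proc_indices is computed, so the split_indices helper and the np.array_split partition below are assumptions for illustration only.

import numpy as np

# Hypothetical stand-in for the partition that produces all_proc_indices;
# an even split of the dataset indices across workers is assumed here.
def split_indices(num_examples: int, num_procs: int) -> list:
    return [chunk.tolist() for chunk in np.array_split(np.arange(num_examples), num_procs)]

seed_start = 42
all_proc_indices = split_indices(num_examples=10, num_procs=3)

# one (seed, indices) pair per worker, mirroring the loop in extract_hiddens
for curr_seed, proc_indices in enumerate(all_proc_indices, start=seed_start):
    print(curr_seed, proc_indices)
# 42 [0, 1, 2, 3]
# 43 [4, 5, 6]
# 44 [7, 8, 9]

The "spawn" start method is required because a forked child would inherit the parent's CUDA context, which CUDA does not support; "spawn" starts each worker in a fresh interpreter.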

elk/extraction/prompt_collator.py (35 changes: 30 additions & 5 deletions)

@@ -5,6 +5,7 @@
 from typing import Literal, Optional
 import numpy as np
 from torch.utils.data import Dataset
+import copy
 
 from elk.extraction.dataset_preprocessing import undersample
 
@@ -34,6 +35,8 @@ def __init__(
         strategy: Literal["all", "randomize"] = "randomize",
         balance: bool = False,
     ):
+        self.label_column = label_column
+
         data = load_dataset(path, name)
         assert isinstance(data, DatasetDict)
 
@@ -42,7 +45,7 @@ def __init__(
         if not others:
             print("Creating a train-test split...")
             data = data[train_name].train_test_split(
-                seed=seed, shuffle=False, stratify_by_column=label_column
+                seed=seed, shuffle=False, stratify_by_column=self.label_column
             )
 
         if split not in data and split == "validation":
@@ -52,10 +55,9 @@ def __init__(
         self.dataset = data[split]
 
         if balance:
-            self.dataset = undersample(self.dataset, seed, label_column)
+            self.dataset = undersample(self.dataset, seed, self.label_column)
 
-        self.labels, counts = np.unique(self.dataset[label_column], return_counts=True)
-        self.label_fracs = counts / counts.sum()
+        self.set_labels()
 
         print(f"Class balance '{split}': {[f'{x:.2%}' for x in self.label_fracs]}")
         pivot, *rest = self.label_fracs
@@ -69,7 +71,6 @@ def __init__(
         if max_examples:
             self.dataset = self.dataset.select(range(max_examples))
 
-        self.label_column = label_column
         self.prompter = DatasetTemplates(path, subset_name=name)  # type: ignore
         self.rng = Random(seed)
         self.strategy = strategy
@@ -111,3 +112,27 @@ def __len__(self):
             N *= len(self.prompter.templates)
 
         return N
+
+    def set_labels(self):
+        self.labels, counts = np.unique(self.dataset[self.label_column], return_counts=True)
+        self.label_fracs = counts / counts.sum()
+
+    def split_and_copy(self, indices, new_seed):
+        """
+        To avoid copying the entire dataset num_processes times when multiprocessing,
+        this makes a shallow copy of self, but with self.dataset restricted to the
+        given indices.
+        """
+        dataset_split = self.dataset.select(indices)
+
+        # only a shallow copy is needed -- multiprocess will pickle (dill) the objects
+        self_copy = copy.copy(self)
+        self_copy.dataset = dataset_split
+
+        # recompute label counts for the new split
+        self_copy.set_labels()
+
+        # give the copy its own rng
+        self_copy.rng = Random(new_seed)
+
+        return self_copy
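
The shallow copy is the key trick: copy.copy duplicates only the object's attribute table, so heavyweight shared members stay shared while dataset and rng are rebound per worker. A toy illustration of the same pattern, using a hypothetical ToyCollator class that is not part of elk:

import copy
from random import Random

class ToyCollator:
    def __init__(self, data, seed):
        self.data = data
        self.rng = Random(seed)

    def split_and_copy(self, indices, new_seed):
        clone = copy.copy(self)                       # shallow copy: attributes only
        clone.data = [self.data[i] for i in indices]  # rebind, never mutate in place
        clone.rng = Random(new_seed)                  # independent randomness per worker
        return clone

full = ToyCollator(list(range(100)), seed=42)
part = full.split_and_copy([0, 1, 2], new_seed=43)
assert part.data == [0, 1, 2]
assert len(full.data) == 100  # the original collator is untouched

Because each worker rebinds rather than mutates, no locking is needed, and pickling the copy for the spawned process serializes only the selected shard rather than the full dataset.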