Add Python 3.11 to CI tests #91

Merged · 18 commits · Feb 19, 2023
Changes from 1 commit
split dataset before multiprocessing
AlexWan0 committed Feb 18, 2023
commit 189bb30ed4d9471f300276bd4c6e027b119958f8

elk/extraction/extraction.py (13 changes: 9 additions & 4 deletions)

@@ -15,7 +15,6 @@ class ExtractionParameters:
     model: PreTrainedModel
     tokenizer: PreTrainedTokenizerBase
     collator: PromptCollator
-    indices: int
     batch_size: int = 1
     layers: Sequence[int] = ()
     prompt_suffix: str = ""
@@ -59,7 +58,8 @@ def extract_hiddens(
     prompt_suffix: str = "",
     token_loc: Literal["first", "last", "mean"] = "last",
     use_encoder_states: bool = False,
-    num_procs: int = 1
+    num_procs: int = 1,
+    seed_start: int = 42
 ):
     """Run inference on a model with a set of prompts, yielding the hidden states."""
 
@@ -72,12 +72,14 @@ def extract_hiddens(
 
     all_params = []
 
+    # use a different random seed for each process
+    curr_seed = seed_start
+
     for proc_indices in all_proc_indices:
         params = ExtractionParameters(
             model=model,
             tokenizer=tokenizer,
-            collator=collator,
-            indices=proc_indices,
+            collator=collator.split_and_copy(proc_indices, curr_seed),
             batch_size=batch_size,
             layers=layers,
             prompt_suffix=prompt_suffix,
@@ -86,6 +88,8 @@ def extract_hiddens(
         )
 
         all_params.append(params)
+
+        curr_seed += 1
 
     # each list needs to have length num_procs
     multiprocess_kwargs = {
@@ -94,6 +98,7 @@ def extract_hiddens(
        'wrapped_num_procs': [num_procs] * num_procs
    }
 
+    # CUDA requires the "spawn" start method rather than the default "fork" (on Unix)
    mp.set_start_method("spawn")
 
    return Dataset.from_generator(
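
The hunks above hand each worker a disjoint slice of the dataset plus its own seed. A minimal sketch of that pattern follows; note that this commit does not show how all_proc_indices is computed, so the split_indices helper and the np.array_split partition below are assumptions for illustration only.

import numpy as np

# Hypothetical stand-in for the partition that produces all_proc_indices;
# an even split of the dataset indices across workers is assumed here.
def split_indices(num_examples: int, num_procs: int) -> list:
    return [chunk.tolist() for chunk in np.array_split(np.arange(num_examples), num_procs)]

seed_start = 42
all_proc_indices = split_indices(num_examples=10, num_procs=3)

# one (seed, indices) pair per worker, mirroring the loop in extract_hiddens
for curr_seed, proc_indices in enumerate(all_proc_indices, start=seed_start):
    print(curr_seed, proc_indices)
# 42 [0, 1, 2, 3]
# 43 [4, 5, 6]
# 44 [7, 8, 9]

The "spawn" start method is required because a forked child would inherit the parent's CUDA context, which CUDA does not support; "spawn" starts each worker in a fresh interpreter.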

elk/extraction/prompt_collator.py (35 changes: 30 additions & 5 deletions)

@@ -5,6 +5,7 @@
 from typing import Literal, Optional
 import numpy as np
 from torch.utils.data import Dataset
+import copy
 
 from elk.extraction.dataset_preprocessing import undersample
 
@@ -34,6 +35,8 @@ def __init__(
         strategy: Literal["all", "randomize"] = "randomize",
         balance: bool = False,
     ):
+        self.label_column = label_column
+
         data = load_dataset(path, name)
         assert isinstance(data, DatasetDict)
 
@@ -42,7 +45,7 @@ def __init__(
         if not others:
             print("Creating a train-test split...")
             data = data[train_name].train_test_split(
-                seed=seed, shuffle=False, stratify_by_column=label_column
+                seed=seed, shuffle=False, stratify_by_column=self.label_column
             )
 
         if split not in data and split == "validation":
@@ -52,10 +55,9 @@ def __init__(
         self.dataset = data[split]
 
         if balance:
-            self.dataset = undersample(self.dataset, seed, label_column)
+            self.dataset = undersample(self.dataset, seed, self.label_column)
 
-        self.labels, counts = np.unique(self.dataset[label_column], return_counts=True)
-        self.label_fracs = counts / counts.sum()
+        self.set_labels()
 
         print(f"Class balance '{split}': {[f'{x:.2%}' for x in self.label_fracs]}")
         pivot, *rest = self.label_fracs
@@ -69,7 +71,6 @@ def __init__(
         if max_examples:
             self.dataset = self.dataset.select(range(max_examples))
 
-        self.label_column = label_column
         self.prompter = DatasetTemplates(path, subset_name=name)  # type: ignore
         self.rng = Random(seed)
         self.strategy = strategy
@@ -111,3 +112,27 @@ def __len__(self):
             N *= len(self.prompter.templates)
 
         return N
+
+    def set_labels(self):
+        self.labels, counts = np.unique(self.dataset[self.label_column], return_counts=True)
+        self.label_fracs = counts / counts.sum()
+
+    def split_and_copy(self, indices, new_seed):
+        """
+        To avoid copying the entire dataset num_processes times when multiprocessing,
+        this makes a shallow copy of self, but with self.dataset restricted to the
+        given indices.
+        """
+        dataset_split = self.dataset.select(indices)
+
+        # only a shallow copy is needed -- multiprocess will pickle (dill) the objects
+        self_copy = copy.copy(self)
+        self_copy.dataset = dataset_split
+
+        # recompute label counts for the new split
+        self_copy.set_labels()
+
+        # give the copy its own rng
+        self_copy.rng = Random(new_seed)
+
+        return self_copy
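
The shallow copy is the key trick: copy.copy duplicates only the object's attribute table, so heavyweight shared members stay shared while dataset and rng are rebound per worker. A toy illustration of the same pattern, using a hypothetical ToyCollator class that is not part of elk:

import copy
from random import Random

class ToyCollator:
    def __init__(self, data, seed):
        self.data = data
        self.rng = Random(seed)

    def split_and_copy(self, indices, new_seed):
        clone = copy.copy(self)                       # shallow copy: attributes only
        clone.data = [self.data[i] for i in indices]  # rebind, never mutate in place
        clone.rng = Random(new_seed)                  # independent randomness per worker
        return clone

full = ToyCollator(list(range(100)), seed=42)
part = full.split_and_copy([0, 1, 2], new_seed=43)
assert part.data == [0, 1, 2]
assert len(full.data) == 100  # the original collator is untouched

Because each worker rebinds rather than mutates, no locking is needed, and pickling the copy for the spawned process serializes only the selected shard rather than the full dataset.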