Multi datasets #123
Merged: 46 commits, Mar 28, 2023

Changes from 1 of the 46 commits are shown below the commit list.

Commits:
681698d  add multiple datasets support (Benw8888, Mar 9, 2023)
ac1b9f1  Merge branch 'main' of github.com:EleutherAI/elk into multi-datasets (Benw8888, Mar 9, 2023)
b864c77  train_reporter works on a list of layers now (Benw8888, Mar 10, 2023)
7d7d97c  changing printed layer names (Benw8888, Mar 10, 2023)
4fe61e9  fixed concatenation bug (Benw8888, Mar 11, 2023)
fe61d67  minor edits (Benw8888, Mar 13, 2023)
74da878  fixed pyright issues (Benw8888, Mar 13, 2023)
569ef05  Merge branch 'main' of github.com:EleutherAI/elk into multi-datasets (Benw8888, Mar 14, 2023)
b62b679  Merge branch 'main' into multi-datasets (norabelrose, Mar 20, 2023)
fe94c22  Fix tests (norabelrose, Mar 20, 2023)
bba24d8  Now working sorta (norabelrose, Mar 22, 2023)
03ba6e0  Skip slow BalancedBatchSampler test (norabelrose, Mar 22, 2023)
15ab351  Slightly relax test_output_is_roughly_balanced (norabelrose, Mar 22, 2023)
a80369e  Make BalancedSampler deterministic (norabelrose, Mar 22, 2023)
d304ab3  InitVar (norabelrose, Mar 22, 2023)
761c82d  Support multi class again (norabelrose, Mar 22, 2023)
f29743b  Fix naming issue (norabelrose, Mar 22, 2023)
b7b7e23  Support few shot prompts (norabelrose, Mar 23, 2023)
1afb563  Merge branch 'main' into multi-datasets (norabelrose, Mar 23, 2023)
225d4c7  fix multiclass labels (AlexTMallen, Mar 23, 2023)
9368dc8  Merge branch 'multi-datasets' of github.com:EleutherAI/elk into multi… (AlexTMallen, Mar 23, 2023)
a858b65  Merge branch 'main' into multi-datasets (norabelrose, Mar 24, 2023)
5dc2ec6  Merge branch 'multi-datasets' of github.com:EleutherAI/elk into multi… (norabelrose, Mar 24, 2023)
b1b95e5  Fix dumb part of test failures (norabelrose, Mar 25, 2023)
ee3911e  Fix assert_allclose warning (norabelrose, Mar 25, 2023)
a55b3de  Switch to torch.testing.assert_close in EigenReporter test (norabelrose, Mar 25, 2023)
44dc25c  Shuffle load_prompts output by default (norabelrose, Mar 25, 2023)
93d8d87  Fix smoke test failure (norabelrose, Mar 25, 2023)
fad4d74  Remove debug prints (AlexTMallen, Mar 25, 2023)
0a054f4  Remove more debug print statements (AlexTMallen, Mar 25, 2023)
177eec2  make min_memory usable; broadcast mmax_examples in __post_init__ (AlexTMallen, Mar 26, 2023)
3a762b0  prompt loading refactor to enable better streaming (AlexTMallen, Mar 26, 2023)
f66c054  remove shuffle arg (AlexTMallen, Mar 26, 2023)
d3d87fc  remove unused @dataclass (lauritowal, Mar 26, 2023)
3d08147  merge (lauritowal, Mar 27, 2023)
c9a43e1  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Mar 27, 2023)
94290aa  add concatenated_layer_offset to eval (lauritowal, Mar 27, 2023)
f9298e4  Merge branch 'multi-datasets' of https://github.com/EleutherAI/elk in… (lauritowal, Mar 27, 2023)
3765c4f  add self. (lauritowal, Mar 27, 2023)
2b05193  replace target with data (lauritowal, Mar 27, 2023)
83731bb  add self. (lauritowal, Mar 27, 2023)
764fda9  remove second arg (lauritowal, Mar 27, 2023)
d2c66b0  fix passing the wrong params for world size / rank (thejaminator, Mar 28, 2023)
9186326  Update prompt_loading.py (lauritowal, Mar 28, 2023)
3f99a4d  fix pre-commit errors (lauritowal, Mar 28, 2023)
148130d  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Mar 28, 2023)
Commit 3a762b0dd995ed61ee61820214ee67f804647b35
prompt loading refactor to enable better streaming
(AlexTMallen committed Mar 26, 2023)
elk/extraction/balanced_sampler.py (8 changes: 4 additions & 4 deletions)
@@ -6,7 +6,7 @@
 from itertools import cycle
 from random import Random
 from torch.utils.data import IterableDataset as TorchIterableDataset
-from typing import Iterator, Optional
+from typing import Iterator, Optional, Iterable


 @dataclass
Review thread on the @dataclass line:

Collaborator: small detail, but maybe we should just remove @dataclass, if we use __init__ anyway... (?)

Member: Go ahead and change it

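For readers skimming the thread: @dataclass generates __init__ (along with __repr__ and __eq__) from class-level field annotations, so a class that writes its own __init__, as BalancedSampler does below, gets little from the decorator. A minimal sketch of the two forms (illustrative names, not from this repo):

    from dataclasses import dataclass


    @dataclass
    class WithDecorator:
        # __init__, __repr__, and __eq__ are generated from these annotations
        data: list
        buffer_size: int = 1000


    class WithInit:
        # A hand-written __init__ does the construction itself; stacking
        # @dataclass on top would add little beyond __repr__ and __eq__
        def __init__(self, data: list, buffer_size: int = 1000):
            self.data = data
            self.buffer_size = buffer_size

The decorator was in fact dropped in a later commit in this PR (d3d87fc, "remove unused @dataclass").
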
@@ -25,14 +25,14 @@ class BalancedSampler(TorchIterableDataset):
         divided between the two binary label values (0 and 1). Defaults to 1000.
     """

-    def __init__(self, dataset: IterableDataset, buffer_size: int = 1000):
-        self.dataset = dataset
+    def __init__(self, data: Iterable[dict], buffer_size: int = 1000):
+        self.data = data

         self.neg_buffer = deque(maxlen=buffer_size)
         self.pos_buffer = deque(maxlen=buffer_size)

     def __iter__(self):
-        for sample in self.dataset:
+        for sample in self.data:
             label = sample["label"]

             # Add the sample to the appropriate buffer
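
With the new signature, BalancedSampler accepts any iterable of dicts carrying a "label" key rather than only a HuggingFace IterableDataset. A usage sketch under that assumption (toy generator; the exact yielding behavior depends on the buffering logic elided above):

    from elk.extraction.balanced_sampler import BalancedSampler


    def toy_stream():
        # Any iterable of dicts with a "label" key is acceptable now
        for i in range(10):
            yield {"label": i % 2, "text": f"example {i}"}


    for sample in BalancedSampler(toy_stream(), buffer_size=4):
        print(sample["label"], sample["text"])
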
elk/extraction/extraction.py (14 changes: 6 additions & 8 deletions)
@@ -33,6 +33,7 @@
 import logging
 import os
 import torch
+from itertools import islice


 @dataclass
@@ -90,19 +91,14 @@ def extract_hiddens(
     # Silence datasets logging messages from all but the first process
     if rank != 0:
         logging.disable(logging.CRITICAL)
-    if rank == 0 and cfg.prompts.num_variants >= 1:
-        print(f"Using {cfg.prompts.num_variants} prompts per example")

-    limits = cfg.prompts.max_examples
     prompt_ds = load_prompts(
         *cfg.prompts.datasets,
-        max_examples=limits[0 if split_type == "train" else 1],
         split_type=split_type,
         stream=cfg.prompts.stream,
         rank=rank,
         world_size=world_size,
-    )
-    num_variants = prompt_ds.features["prompts"].length
+    )  # this dataset is already sharded, but hasn't been truncated to max_examples

     # AutoModel should do the right thing here in nearly all cases. We don't actually
     # care what head the model has, since we are just extracting hidden states.
@@ -131,7 +127,9 @@
     layer_indices = cfg.layers or tuple(range(model.config.num_hidden_layers))
     # print(f"Using {prompt_ds} variants for each dataset")

-    for example in BalancedSampler(prompt_ds):
+    max_examples = cfg.prompts.max_examples[0 if split_type == "train" else 1]
+    for example in islice(BalancedSampler(prompt_ds), max_examples):
+        num_variants = len(example["prompts"])
         hidden_dict = {
             f"hidden_{layer_idx}": torch.empty(
                 num_variants,
@@ -150,7 +148,7 @@

             # Iterate over answers
             for j in range(2):
-                text = record["text"][j]
+                text = record[j]["text"]
                 variant_inputs.append(text)

                 inputs = tokenizer(
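
The substantive change here is that truncation moves from the dataset to the consumer: instead of load_prompts taking max_examples and truncating internally, extract_hiddens caps the stream with itertools.islice. Because islice works on any iterator and stops after exactly n items without exhausting the source, the balanced sampler can keep buffering from the sharded stream underneath. A toy illustration of the pattern (stand-in names only, not from the codebase):

    from itertools import count, islice

    # Stand-in for BalancedSampler(prompt_ds): an effectively unbounded stream
    stream = ({"label": i % 2} for i in count())

    max_examples = 5
    for example in islice(stream, max_examples):
        print(example)
    # Stops after exactly max_examples items; the source is never fully consumed
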
elk/extraction/prompt_loading.py (138 changes: 62 additions & 76 deletions)
@@ -73,21 +73,20 @@ def __post_init__(self):

 def load_prompts(
     *dataset_strings: str,
-    max_examples: int = 0,
     num_shots: int = 0,
+    num_variants: int = -1,
     seed: int = 42,
     shuffle: bool = True,
     split_type: Literal["train", "val"] = "train",
     stream: bool = False,
     rank: int = 0,
     world_size: int = 1,
-) -> IterableDataset:
+) -> Iterator[dict]:
     """Load a dataset full of prompts generated from the specified datasets.

     Args:
         dataset_strings: Space-delimited names of the HuggingFace datasets to use,
             e.g. `"super_glue boolq"` or `"imdb"`.
-        max_examples: The maximum number of examples to use from the dataset.
         num_shots: The number of examples to use in few-shot prompts. If zero, prompts
             are zero-shot.
         seed: The seed to use for prompt randomization.
@@ -99,7 +98,6 @@ def load_prompts(
     Returns:
         An iterable dataset of prompts.
     """
-    prompt_datasets = []
     prompters = []
     raw_datasets = []
     train_datasets = []
@@ -117,94 +115,82 @@
         train_name, val_name = select_train_val_splits(ds_dict)
         split_name = val_name if split_type == "val" else train_name

-        # If we're not streaming, take the opportunity to shuffle the dataset.
-        # Note that when streaming we can only approximately shuffle the dataset
-        # using a buffer. Streaming shuffling is NOT an adequate shuffle for
-        # datasets like IMDB, which are sorted by label.
+        bad_streaming_datasets = ["imdb"]
+        assert not (
+            stream and ds_name in bad_streaming_datasets
+        ), f"Streaming is not supported for {ds_name}."
+        split = ds_dict[split_name].shuffle(seed=seed)
+        train_ds = ds_dict[train_name].shuffle(seed=seed)
         if not stream:
-            ds = assert_type(Dataset, ds_dict[split_name].shuffle(seed=seed))
-            train_ds = assert_type(Dataset, ds_dict[train_name].shuffle(seed=seed))
-            split = ds.to_iterable_dataset().cast(ds.features)
-        else:
-            train_ds = assert_type(IterableDataset, ds_dict[train_name])
-            split = assert_type(IterableDataset, ds_dict[split_name])
+            split = assert_type(Dataset, split)
+            split = split.to_iterable_dataset().cast(split.features)
+
+        # only keep the datapoints relevant to the current process
+        if world_size > 1:
+            # This prints to stdout which is slightly annoying
+            split = split_dataset_by_node(split, world_size, rank)

         raw_datasets.append(split)
         train_datasets.append(train_ds)

-    num_variants = min(len(prompter.templates) for prompter in prompters)
-
-    for ds, train_ds, prompter in zip(raw_datasets, train_datasets, prompters):
-        label_column = infer_label_column(ds.features)
-        num_classes = infer_num_classes(ds.features[label_column])
-
-        # Remove everything except the label column
-        extra_cols = list(assert_type(Features, ds.features))
-        extra_cols.remove(label_column)
-
-        if label_column != "label":
-            ds = ds.rename_column(label_column, "label")
-        if num_shots > 0:
-            fewshot = FewShotSampler(
-                train_ds,
-                num_shots=num_shots,
-                rng=rng,
-            )
-            fewshot_iter = iter(fewshot)
-        else:
-            fewshot_iter = None
-
-        # Canonicalize the name and dtype of the label column
-        ds = ds.map(
-            _convert_to_prompts,
-            fn_kwargs=dict(
-                label_column=label_column,
-                num_classes=num_classes,
-                num_variants=num_variants,
-                prompter=prompter,
-                rng=rng,
-                fewshot_iter=fewshot_iter,
-            ),
-            remove_columns=extra_cols,
-        ).map(
-            # Add the builder and config name to the records directly to make
-            # sure we don't forget what dataset they came from.
-            lambda _: dict(
-                builder_name=ds.info.builder_name,
-                config_name=ds.info.config_name,
-            ),
-            # Explicit typing makes interleave_datasets work a lot faster
-            features=Features(
-                {
-                    label_column: ClassLabel(names=["neg", "pos"]),
-                    "builder_name": "string",
-                    "config_name": "string",
-                    "prompts": Sequence(
-                        Sequence(
-                            {"answer": "string", "text": "string"},
-                            length=2,  # contrast pair
-                        ),
-                        length=num_variants,
-                    ),
-                    "template_names": Sequence("string"),
-                }
-            ),
-        )
-        prompt_datasets.append(ds)
-
-    master_ds = interleave_datasets(prompt_datasets)
-    if max_examples > 0:
-        master_ds = master_ds.take(max_examples)
-    if world_size > 1:
-        # This prints to stdout which is slightly annoying
-        master_ds = split_dataset_by_node(master_ds, rank, world_size)
-    if shuffle:
-        master_ds = master_ds.shuffle(seed=seed)
-
-    # Try to approximately shuffle the dataset if we're streaming. Note that this is
-    # NOT an adequate shuffle for datasets like IMDB, which are sorted by label.
-    if stream:
-        master_ds = master_ds.shuffle(seed=seed)
-
-    return master_ds
+    min_num_templates = min(len(prompter.templates) for prompter in prompters)
+    num_variants = (
+        min_num_templates
+        if num_variants == -1
+        else min(num_variants, min_num_templates)
+    )
+    assert num_variants > 0
+    if rank == 0:
+        print(f"Using {num_variants} variants of each prompt")
+
+    ds_iterators = [iter(ds) for ds in raw_datasets]
+    while True:  # terminates when the first dataset runs out of examples
+        for ds_iterator, ds, train_ds, prompter in zip(
+            ds_iterators, raw_datasets, train_datasets, prompters
+        ):
+            label_column = infer_label_column(ds.features)
+            num_classes = infer_num_classes(ds.features[label_column])
+
+            # Remove everything except the label column
+            extra_cols = list(assert_type(Features, ds.features))
+            extra_cols.remove(label_column)
+
+            if label_column != "label":
+                ds = ds.rename_column(label_column, "label")
+            if num_shots > 0:
+                fewshot = FewShotSampler(
+                    train_ds,  # TODO: not iterator
+                    num_shots=num_shots,
+                    rng=rng,
+                )
+                fewshot_iter = iter(fewshot)
+            else:
+                fewshot_iter = None
+
+            try:
+                example = next(ds_iterator)
+            except StopIteration:
+                return
+
+            example = _convert_to_prompts(
+                example,
+                label_column=label_column,
+                num_classes=num_classes,
+                num_variants=num_variants,
+                prompter=prompter,
+                rng=rng,
+                fewshot_iter=fewshot_iter,
+            )
+
+            # Add the builder and config name to the records directly to make
+            # sure we don't forget what dataset they came from.
+            example["builder_name"] = ds.info.builder_name
+            example["config_name"] = ds.info.config_name
+
+            yield example


 def _convert_to_prompts(
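
The shape of the refactor: load_prompts is now a generator that round-robins over one iterator per dataset, converts each raw example to prompts on the fly, and returns as soon as any dataset is exhausted, rather than building and interleaving materialized datasets. A self-contained sketch of that control flow (toy iterables standing in for the HuggingFace datasets):

    def round_robin(*iterables):
        """Yield one item from each iterable in turn, stopping as soon as
        any of them is exhausted, like the while True loop above."""
        iterators = [iter(it) for it in iterables]
        while True:
            for it in iterators:
                try:
                    item = next(it)
                except StopIteration:
                    return  # ends the whole generator, as in load_prompts
                yield item


    print(list(round_robin("abc", "xy")))  # ['a', 'x', 'b', 'y', 'c']
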
tests/test_load_prompts.py (6 changes: 3 additions & 3 deletions)
@@ -1,6 +1,6 @@
 from elk.extraction import load_prompts, PromptConfig
 from elk.promptsource.templates import DatasetTemplates
-from itertools import cycle
+from itertools import cycle, islice
 from typing import Literal
 import pytest
@@ -10,7 +10,6 @@ def test_load_prompts():
 def test_single_split(cfg: PromptConfig, split_type: Literal["train", "val"]):
     prompt_ds = load_prompts(
         *cfg.datasets,
-        max_examples=cfg.max_examples[0],
         shuffle=False,
         split_type=split_type,
     )
@@ -21,7 +20,8 @@ def test_single_split(cfg: PromptConfig, split_type: Literal["train", "val"]):
         prompter = DatasetTemplates(ds_name, config_name or None)
         prompters.append(prompter)

-    for prompter, record in zip(cycle(prompters), prompt_ds):
+    limit = cfg.max_examples[0 if split_type == "train" else 1]
+    for prompter, record in zip(cycle(prompters), islice(prompt_ds, limit)):
         true_template_names = prompter.all_template_names
         returned_template_names = record["template_names"]
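
The updated test mirrors how extraction.py now consumes load_prompts: the per-dataset prompters are cycled in lockstep with the interleaved record stream, and islice enforces the example cap that load_prompts no longer applies itself. The pairing pattern in isolation (stand-in values):

    from itertools import cycle, islice

    prompters = ["prompter_a", "prompter_b"]  # one per source dataset
    records = iter(range(100))  # stand-in for the interleaved prompt stream

    limit = 6
    for prompter, record in zip(cycle(prompters), islice(records, limit)):
        # The stream round-robins over datasets, so cycling the prompter
        # list in step keeps each record paired with its own prompter
        print(prompter, record)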