check is streamable #166

Closed
wants to merge 7 commits
Changes from 1 commit
check is streamable
AlexTMallen committed Apr 4, 2023
commit 3a24b56e32d15de881c1738ee92150ec53cf6cff
elk/extraction/__init__.py (2 changes: 1 addition & 1 deletion)

@@ -1,4 +1,4 @@
 from .balanced_sampler import BalancedSampler, FewShotSampler
 from .extraction import Extract, extract_hiddens, extract
 from .generator import _GeneratorConfig, _GeneratorBuilder
-from .prompt_loading import PromptConfig, load_prompts
+from .prompt_loading import PromptConfig, yield_prompts
elk/extraction/extraction.py (21 changes: 10 additions & 11 deletions)

@@ -30,7 +30,7 @@
 )
 from .balanced_sampler import BalancedSampler
 from .generator import _GeneratorBuilder
-from .prompt_loading import PromptConfig, load_prompts
+from .prompt_loading import PromptConfig, yield_prompts


 @dataclass
@@ -93,10 +93,18 @@ def extract_hiddens(
     if rank != 0:
         logging.disable(logging.CRITICAL)

-    prompt_ds = load_prompts(
+    global_max_examples = cfg.prompts.max_examples[0 if split_type == "train" else 1]
+    # break `max_examples` among the processes roughly equally
+    max_examples = global_max_examples // world_size
+    # the last process gets the remainder (which is usually small)
+    if rank == world_size - 1:
+        max_examples += global_max_examples % world_size
+
+    prompt_ds = yield_prompts(
         *cfg.prompts.datasets,
         split_type=split_type,
         stream=cfg.prompts.stream,
+        max_examples=max_examples,
         rank=rank,
         world_size=world_size,
     )  # this dataset is already sharded, but hasn't been truncated to max_examples

@@ -128,15 +136,6 @@ def extract_hiddens(
     layer_indices = cfg.layers or tuple(range(model.config.num_hidden_layers))
     # print(f"Using {prompt_ds} variants for each dataset")

-    global_max_examples = cfg.prompts.max_examples[0 if split_type == "train" else 1]
-    # break `max_examples` among the processes roughly equally
-    max_examples = global_max_examples // world_size
-    # the last process gets the remainder (which is usually small)
-    if rank == world_size - 1:
-        max_examples += global_max_examples % world_size
-
-    print(f"Extracting {max_examples} examples from {prompt_ds} on {device}")
-
     for example in islice(BalancedSampler(prompt_ds), max_examples):
         num_variants = len(example["prompts"])
         hidden_dict = {
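As a sanity check on the sharding arithmetic moved above: each rank takes the floor of an even share, and the last rank absorbs the remainder, so the per-rank budgets always sum to `global_max_examples`. A minimal standalone sketch (the `shard_max_examples` helper is hypothetical, written only for illustration):

# Hypothetical helper mirroring the arithmetic in extract_hiddens above.
def shard_max_examples(global_max_examples: int, rank: int, world_size: int) -> int:
    """Split a global example budget roughly equally across ranks."""
    max_examples = global_max_examples // world_size
    # The last rank also takes the remainder, which is at most world_size - 1.
    if rank == world_size - 1:
        max_examples += global_max_examples % world_size
    return max_examples

# A budget of 750 examples over 4 ranks: the shares sum back to 750.
assert [shard_max_examples(750, r, 4) for r in range(4)] == [187, 187, 187, 189]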
elk/extraction/prompt_loading.py (17 changes: 9 additions & 8 deletions)

@@ -4,6 +4,7 @@
     binarize,
     infer_label_column,
     infer_num_classes,
+    is_streamable,
     select_train_val_splits,
 )
 from .balanced_sampler import FewShotSampler

@@ -71,13 +72,14 @@ def __post_init__(self):
         self.max_examples *= 2


-def load_prompts(
+def yield_prompts(
     *dataset_strings: str,
     num_shots: int = 0,
     num_variants: int = -1,
     seed: int = 42,
     split_type: Literal["train", "val"] = "train",
     stream: bool = False,
+    max_examples: int = 750,
     rank: int = 0,
     world_size: int = 1,
 ) -> Iterator[dict]:
@@ -114,18 +116,17 @@ def load_prompts(
         train_name, val_name = select_train_val_splits(ds_dict)
         split_name = val_name if split_type == "val" else train_name

-        # Note that when streaming we can only approximately shuffle the dataset
-        # using a buffer. Streaming shuffling is NOT an adequate shuffle for
-        # datasets like IMDB, which are sorted by label.
-        bad_streaming_datasets = ["imdb"]
-        assert not (
-            stream and ds_name in bad_streaming_datasets
-        ), f"Streaming is not supported for {ds_name}."
         split = ds_dict[split_name].shuffle(seed=seed)
         train_ds = ds_dict[train_name].shuffle(seed=seed)
         if not stream:
             split = assert_type(Dataset, split)
             split = split.to_iterable_dataset().cast(split.features)
+        else:
+            if not is_streamable(split, max_examples=max_examples):
+                raise ValueError(
+                    f"Streaming dataset {ds_name} is not streamable because the first "
+                    f"{max_examples} examples are all of the same label."
+                )

         # only keep the datapoints relevant to the current process
         if world_size > 1:
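To see why the old blanket IMDB ban existed, and what the new is_streamable check guards against, consider a toy model of buffered streaming shuffling. The `buffered_shuffle` function below is a simplified stand-in for the reservoir-style shuffle that `datasets` applies to streamed data, not the library's actual implementation: when the buffer is far smaller than a label-sorted run, the first examples to stream out all share one label.

import random
from itertools import islice

def buffered_shuffle(items, buffer_size: int, seed: int = 42):
    """Simplified buffer shuffle: yield a random element of a sliding buffer."""
    rng = random.Random(seed)
    buffer = []
    for item in items:
        buffer.append(item)
        if len(buffer) > buffer_size:
            yield buffer.pop(rng.randrange(len(buffer)))
    rng.shuffle(buffer)
    yield from buffer

# IMDB's train split is sorted by label: 12,500 negatives then 12,500 positives.
labels = [0] * 12_500 + [1] * 12_500
first_750 = list(islice(buffered_shuffle(labels, buffer_size=1000), 750))
assert set(first_750) == {0}  # all one label, exactly what is_streamable rejects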
elk/utils/__init__.py (1 change: 1 addition & 0 deletions)

@@ -3,6 +3,7 @@
     get_columns_all_equal,
     infer_label_column,
     infer_num_classes,
+    is_streamable,
     select_train_val_splits,
 )
elk/utils/data_utils.py (18 changes: 15 additions & 3 deletions)

@@ -4,13 +4,13 @@
     ClassLabel,
     DatasetDict,
     Features,
+    IterableDataset,
     Split,
     Value,
 )
 from random import Random
-import torch
-from typing import Iterable, Optional, List, Any
-import numpy as np
+from itertools import islice
+from typing import Iterable, List, Any
 import copy

@@ -120,3 +120,15 @@ def binarize(template: Template, label: int, new_label: int, rng: Random) -> Template:
     )

     return new_template
+
+
+def is_streamable(ds: IterableDataset, max_examples: int) -> bool:
+    """Checks that the first `max_examples` examples are not all of the same label.
+
+    Note that when streaming we can only approximately shuffle the dataset
+    using a buffer. Streaming shuffling is NOT an adequate shuffle for
+    datasets like IMDB, which are sorted by label.
+    """
+    label_column = infer_label_column(assert_type(Features, ds.features))
+    labels = [ex[label_column] for ex in islice(ds, max_examples)]
+    return len(set(labels)) > 1
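A hedged usage sketch of the new helper, assuming `datasets` is installed and `is_streamable` is imported via the elk/utils/__init__.py export added above; the streamed IMDB load is illustrative, not part of this PR:

from datasets import load_dataset
from elk.utils import is_streamable  # exported in elk/utils/__init__.py above

# Stream IMDB with the library's approximate buffer shuffle, then check it.
ds = load_dataset("imdb", split="train", streaming=True).shuffle(seed=42)
if not is_streamable(ds, max_examples=750):
    raise ValueError("imdb: the first 750 streamed examples all share one label")

With the default shuffle buffer of 1,000 examples against IMDB's 12,500-example label runs, the check fails and this sketch raises, which is the failure mode the PR replaces the hard-coded `bad_streaming_datasets` list with.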