-
Notifications
You must be signed in to change notification settings - Fork 32
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Multi datasets #123
Multi datasets #123
Changes from 15 commits
681698d
ac1b9f1
b864c77
7d7d97c
4fe61e9
fe61d67
74da878
569ef05
b62b679
fe94c22
bba24d8
03ba6e0
15ab351
a80369e
d304ab3
761c82d
f29743b
b7b7e23
1afb563
225d4c7
9368dc8
a858b65
5dc2ec6
b1b95e5
ee3911e
a55b3de
44dc25c
93d8d87
fad4d74
0a054f4
177eec2
3a762b0
f66c054
d3d87fc
3d08147
c9a43e1
94290aa
f9298e4
3765c4f
2b05193
83731bb
764fda9
d2c66b0
9186326
3f99a4d
148130d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
from .extraction import extract_hiddens, ExtractionConfig, PromptDataset | ||
from .extraction import extract_hiddens, ExtractionConfig |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
from .balanced_sampler import BalancedBatchSampler, BalancedSampler | ||
from .extraction import ExtractionConfig, extract_hiddens, extract | ||
from .generator import _GeneratorConfig, _GeneratorBuilder | ||
from .prompt_dataset import PromptDataset, PromptConfig | ||
from .prompt_loading import PromptConfig, load_prompts |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
from ..utils import infer_label_column | ||
from collections import Counter | ||
from dataclasses import dataclass, field, InitVar | ||
from datasets import IterableDataset | ||
from itertools import cycle | ||
from torch.utils.data import IterableDataset as TorchIterableDataset | ||
from typing import Iterator, Optional | ||
import numpy as np | ||
|
||
|
||
@dataclass
class BalancedSampler(TorchIterableDataset):
    """
    Approximately balances a binary classification dataset in a streaming fashion.
    Written mostly by GPT-4.

    Samples whose class is currently over-represented are dropped
    stochastically so that the yielded stream converges to a 50/50 mix.

    Attributes:
        dataset: The HuggingFace IterableDataset to balance. Each sample must
            carry an integer ``"label"`` key equal to 0 or 1.
        label_counts: Running per-class counts of samples *seen* so far
            (dropped samples included) — this is the estimate of the raw
            stream's class balance, not of the output's.
        seed: Init-only seed for the RNG used for stochastic dropping.
    """

    dataset: IterableDataset
    label_counts: np.ndarray = field(default_factory=lambda: np.zeros(2))
    seed: InitVar[int] = 42

    def __post_init__(self, seed: int):
        # Dedicated generator so sampling is reproducible and independent
        # of the global numpy RNG state.
        self.rng = np.random.default_rng(seed)

    def __iter__(self):
        for sample in self.dataset:
            label = sample["label"]

            # Update class counts (before the keep/drop decision, so the
            # balance estimate tracks the raw stream).
            self.label_counts[label] += 1
            current_balance = self.label_counts / self.label_counts.sum()

            # Check if the sample should be dropped
            majority_class = np.argmax(current_balance)
            if label == majority_class:
                # Solution of n * p * q / [n * (1 - p) + n * p * q] = 0.5 for q
                keep_prob = 1 / current_balance[majority_class] - 1
                if self.rng.uniform() < 1 - keep_prob:
                    continue

            yield sample
||
|
||
class BalancedBatchSampler:
    """Yields precisely balanced batches from a binary classification dataset.

    Written by a human being because GPT-4 couldn't figure out how to do it.

    Args:
        dataset: Dataset to draw samples from. It is cycled over indefinitely,
            so iteration never terminates on its own — the caller must stop
            consuming.
        label_col: Name of the binary label column. If ``None``, it is
            inferred from ``dataset.features``.
        batch_size: Number of samples per batch. Must be even, since each
            batch holds exactly ``batch_size // 2`` samples of each class.

    Raises:
        ValueError: If ``batch_size`` is odd. An odd batch could never be
            filled (each class is capped at ``batch_size // 2``), so
            iteration would spin forever without yielding anything.
    """

    def __init__(
        self,
        dataset: IterableDataset,
        label_col: Optional[str] = None,
        batch_size: int = 32,
    ):
        if batch_size % 2:
            raise ValueError(f"batch_size must be even, got {batch_size}")

        self.batch_size = batch_size
        self.dataset = dataset
        self.label_col = label_col or infer_label_column(dataset.features)

    def __iter__(self) -> Iterator[list[dict]]:
        batch = []

        # Each class may fill at most half of the batch.
        max_count = self.batch_size // 2
        label_counts = Counter()

        # Infinite loop!
        for sample in cycle(self.dataset):
            label = sample[self.label_col]
            # Skip samples whose class already hit its per-batch quota.
            if label_counts[label] >= max_count:
                continue

            batch.append(sample)
            label_counts[label] += 1

            if len(batch) == self.batch_size:
                yield batch

                batch = []
                label_counts.clear()
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
small detail, but maybe we should just remove @DataClass, if we use init anyway... (?)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Go ahead and change it