Remove .predict and .predict_prob on Reporter; trying to get SciQ to work
EleutherAI · norabelrose · Apr 16, 2023 · Apr 4, 2023 · Apr 4, 2023 · Apr 4, 2023
commit a8c21a6aec422610e05ddba8ac8623310a0a281c
diff --git a/elk/extraction/balanced_sampler.py b/elk/extraction/balanced_sampler.py
@@ -29,6 +29,7 @@ class BalancedSampler(TorchIterableDataset):
  num_classes: int
  buffer_size: int = 1000
  buffers: dict[int, deque[dict]] = field(default_factory=dict, init=False)
+ label_col: str = "label"
 
  def __post_init__(self):
  # Initialize empty buffers
@@ -38,7 +39,12 @@ def __post_init__(self):
 
  def __iter__(self):
  for sample in self.data:
- label = sample["label"]
+ label = sample[self.label_col]
+
+ # This whole class is a no-op if the label is not an integer
+ if not isinstance(label, int):
+ yield sample
+ continue
 
  # Add the sample to the buffer for its class label
  self.buffers[label].append(sample)

diff --git a/elk/extraction/extraction.py b/elk/extraction/extraction.py
@@ -99,6 +99,8 @@ def extract_hiddens(
 
  prompt_ds = load_prompts(
  *cfg.prompts.datasets,
+ label_column=cfg.prompts.label_column,
+ num_classes=cfg.prompts.num_classes,
  split_type=split_type,
  stream=cfg.prompts.stream,
  rank=rank,
@@ -273,8 +275,8 @@ def get_splits() -> SplitDict:
  info = get_dataset_config_info(ds_name, config_name or None)
 
  ds_features = assert_type(Features, info.features)
- label_col = infer_label_column(ds_features)
- num_classes = infer_num_classes(ds_features[label_col])
+ label_col = cfg.prompts.label_column or infer_label_column(ds_features)
+ num_classes = cfg.prompts.num_classes or infer_num_classes(ds_features[label_col])
 
  layer_cols = {
  f"hidden_{layer}": Array3D(

diff --git a/elk/extraction/prompt_loading.py b/elk/extraction/prompt_loading.py
@@ -50,6 +50,7 @@ class PromptConfig(Serializable):
  data_dir: Optional[str] = None
  label_column: Optional[str] = None
  max_examples: list[int] = field(default_factory=lambda: [750, 250])
+ num_classes: int = 0
  num_shots: int = 0
  num_variants: int = -1
  seed: int = 42
@@ -71,6 +72,8 @@ def __post_init__(self):
 
 def load_prompts(
  *dataset_strings: str,
+ label_column: Optional[str] = None,
+ num_classes: int = 0,
  num_shots: int = 0,
  num_variants: int = -1,
  seed: int = 42,
@@ -98,6 +101,7 @@ def load_prompts(
  class_counts = []
  prompters = []
  datasets = []
+ label_cols = []
  train_datasets = []
  rng = Random(seed)
  assert num_shots == 0
@@ -128,14 +132,11 @@ def load_prompts(
  # This prints to stdout which is slightly annoying
  ds = split_dataset_by_node(dataset=ds, rank=rank, world_size=world_size)
 
- label_column = infer_label_column(ds.features)
- num_classes = infer_num_classes(ds.features[label_column])
- if label_column != "label":
- ds = ds.rename_column(label_column, "label")
- train_ds = train_ds.rename_column(label_column, "label")
-
+ ds_label_col = label_column or infer_label_column(ds.features)
+ num_classes = num_classes or infer_num_classes(ds.features[ds_label_col])
  class_counts.append(num_classes)
  datasets.append(ds)
+ label_cols.append(ds_label_col)
  train_datasets.append(train_ds)
 
  # Number of classes should be the same for all datasets
@@ -155,16 +156,21 @@ def load_prompts(
  if rank == 0:
  print(f"Using {num_variants} variants of each prompt")
 
- ds_iters = [iter(BalancedSampler(ds, num_classes)) for ds in datasets]
- for ds_iter, ds, prompter in cycle(zip(ds_iters, datasets, prompters)):
+ ds_iters = [
+ iter(BalancedSampler(ds, num_classes, label_col=label_col))
+ for ds, label_col in zip(datasets, label_cols)
+ ]
+ for ds_iter, ds, label_col, prompter in cycle(
+ zip(ds_iters, datasets, label_cols, prompters)
+ ):
  try:
  example = next(ds_iter)
  except StopIteration:
  return
 
  example = _convert_to_prompts(
  example,
- label_column="label",
+ label_column=label_col,
  num_classes=num_classes,
  num_variants=num_variants,
  prompter=prompter,
@@ -190,7 +196,7 @@ def _convert_to_prompts(
  fewshot_iter: Optional[Iterator[list[dict]]] = None,
 ) -> dict[str, Any]:
  """Prompt-generating function to pass to `IterableDataset.map`."""
- label = assert_type(int, example[label_column])
+ labels_are_strings = isinstance(example[label_column], str)
  prompts = []
  templates = list(prompter.templates.values())
  if num_variants < len(templates):
@@ -203,15 +209,24 @@ def qa_cat(q: str, a: str) -> str:
 
  # For sanity checking that prompts are unique
  prompt_counter = Counter()
+ label_indices = set()
+
  for template in templates:
  choices = []
+ string_choices = template.get_answer_choices_list(example)
+
+ label = example[label_column]
+ label_indices.add(string_choices.index(label) if labels_are_strings else label)
 
  for answer_idx in range(num_classes):
  fake_example = example.copy()
- fake_example[label_column] = answer_idx
+ if labels_are_strings:
+ fake_example[label_column] = string_choices[answer_idx]
+ else:
+ fake_example[label_column] = answer_idx
 
  q, a = template.apply(fake_example)
- text = qa_cat(q, a)
+ text = qa_cat(q, a or string_choices[answer_idx])
  prompt_counter[text] += 1
 
  if fewshot_iter is not None:
@@ -238,8 +253,14 @@ def qa_cat(q: str, a: str) -> str:
  if dup_count > 1:
  raise ValueError(f'Prompt duplicated {dup_count} times! "{maybe_dup}"')
 
+ # Sanity check: label should be the same across all variants
+ if len(label_indices) > 1:
+ raise ValueError(
+ f"Label index should be the same all variants, but got {label_indices}"
+ )
+
  return dict(
- label=label,
+ label=label_indices.pop(),
  prompts=prompts,
- template_names=prompter.all_template_names,
+ template_names=[template.name for template in templates],
  )
diff --git a/elk/training/ccs_reporter.py b/elk/training/ccs_reporter.py
@@ -164,13 +164,6 @@ def forward(self, x: Tensor) -> Tensor:
  """Return the raw score output of the probe on `x`."""
  return self.probe(x).squeeze(-1)
 
- def predict(self, hiddens: Tensor) -> Tensor:
- return self.predict_prob(hiddens).logit()
-
- def predict_prob(self, hiddens: Tensor) -> Tensor:
- x_pos, x_neg = hiddens.unbind(2)
- return 0.5 * (self(x_pos).sigmoid() + (1 - self(x_neg).sigmoid()))
-
  def loss(
  self,
  logit0: Tensor,

diff --git a/elk/training/eigen_reporter.py b/elk/training/eigen_reporter.py
@@ -103,16 +103,6 @@ def forward(self, hiddens: Tensor) -> Tensor:
  raw_scores = hiddens @ self.weight.mT
  return raw_scores.mul(self.scale).add(self.bias).squeeze(-1)
 
- predict = forward
-
- def predict_prob(self, hiddens: Tensor) -> Tensor:
- """Return the predicted probabilities on the contrast set `hiddens`."""
- logits = self(hiddens)
- if len(hiddens) == 2:
- return logits.sigmoid()
- else:
- return logits.softmax(dim=-1)
-
  @property
  def contrastive_xcov(self) -> Tensor:
  return self.contrastive_xcov_M2 / self.n

diff --git a/elk/training/reporter.py b/elk/training/reporter.py
@@ -13,7 +13,7 @@
 from torch import Tensor
 
 from ..calibration import CalibrationError
-from ..metrics import mean_auc, to_one_hot
+from ..metrics import to_one_hot
 from .classifier import Classifier
 
 
@@ -96,42 +96,43 @@ def check_separability(
  """Measure how linearly separable the pseudo-labels are for a contrast pair.
 
  Args:
- train_hiddens: Tensor of shape [n, ], where x0 and x1 are the
- contrastive representations. Used for training the classifier.
- val_pair: A tuple of tensors, (x0, x1), where x0 and x1 are the
- contrastive representations. Used for evaluating the classifier.
+ train_hiddens: Contrast set of shape [n, v, k, d]. Used for training the
+ classifier.
+ val_hiddens: Contrast set of shape [n, v, k, d]. Used for evaluating the
+ classifier.
 
  Returns:
  The AUROC of a linear classifier fit on the pseudo-labels.
  """
- x0, x1 = train_hiddens
- val_x0, val_x1 = val_hiddens
+ (n_train, v, k, d) = train_hiddens.shape
+ (n_val, _, k_val, d_val) = val_hiddens.shape
+ assert d == d_val, "Must have the same number of features in each split"
+ assert k == k_val == 2, "Must be a binary contrast set"
 
- pseudo_clf = Classifier(x0.shape[-1], device=x0.device) # type: ignore
+ pseudo_clf = Classifier(d, device=train_hiddens.device)
  pseudo_train_labels = torch.cat(
  [
- x0.new_zeros(x0.shape[0]),
- x0.new_ones(x0.shape[0]),
+ train_hiddens.new_zeros(n_train),
+ train_hiddens.new_ones(n_train),
  ]
  ).repeat_interleave(
- x0.shape[1]
+ v
  ) # make num_variants copies of each pseudo-label
+
  pseudo_val_labels = torch.cat(
  [
- val_x0.new_zeros(val_x0.shape[0]),
- val_x0.new_ones(val_x0.shape[0]),
+ val_hiddens.new_zeros(n_val),
+ val_hiddens.new_ones(n_val),
  ]
- ).repeat_interleave(val_x0.shape[1])
+ ).repeat_interleave(v)
 
  pseudo_clf.fit(
- # b v d -> (b v) d
- torch.cat([x0, x1]).flatten(0, 1),
+ rearrange(train_hiddens, "n v k d -> (k n v) d"),
  pseudo_train_labels,
  )
  with torch.no_grad():
  pseudo_preds = pseudo_clf(
- # b v d -> (b v) d
- torch.cat([val_x0, val_x1]).flatten(0, 1)
+ rearrange(val_hiddens, "n v k d -> (k n v) d"),
  )
  return float(roc_auc_score(pseudo_val_labels.cpu(), pseudo_preds.cpu()))
 
@@ -170,35 +171,27 @@ def fit(
  ) -> float:
  ...
 
- @abstractmethod
- def predict(self, hiddens: Tensor) -> Tensor:
- """Return pooled logits for the contrast set `hiddens`."""
-
- @abstractmethod
- def predict_prob(self, hiddens: Tensor) -> Tensor:
- """Like `predict` but returns normalized probabilities, not logits."""
-
  @torch.no_grad()
  def score(self, labels: Tensor, hiddens: Tensor) -> EvalResult:
  """Score the probe on the contrast set `hiddens`.
 
  Args:
  labels: The labels of the contrast pair.
- hiddens: The hidden representations of the contrast set.
+ hiddens: Contrast set of shape [n, v, k, d].
 
  Returns:
  an instance of EvalResult containing the loss, accuracy, calibrated
  accuracy, and AUROC of the probe on `hiddens`.
  """
- pred_probs = self.predict_prob(hiddens)
- (_, v, c) = pred_probs.shape
+ logits = self(hiddens)
+ (_, v, c) = logits.shape
 
  # makes `num_variants` copies of each label
+ logits = rearrange(logits, "n v c -> (n v) c")
  Y = repeat(labels, "n -> (n v)", v=v).float()
- to_one_hot(Y, n_classes=c).long().flatten()
 
  if c == 2:
- pos_probs = pred_probs[..., 1].flatten()
+ pos_probs = logits[..., 1].flatten().sigmoid()
  cal_err = CalibrationError().update(Y.cpu(), pos_probs.cpu()).compute().ece
 
  # Calibrated accuracy
@@ -212,12 +205,10 @@ def score(self, labels: Tensor, hiddens: Tensor) -> EvalResult:
  cal_acc = 0.0
  cal_err = 0.0
 
- raw_preds = pred_probs.argmax(dim=-1)
+ raw_preds = to_one_hot(logits.argmax(dim=-1), c).long()
+ Y = to_one_hot(Y, c).long().flatten()
 
- # roc_auc_score only takes flattened input
- auroc = mean_auc(
- Y.cpu(), rearrange(pred_probs.cpu(), "n v ... -> (n v) ..."), curve="roc"
- )
+ auroc = roc_auc_score(Y.cpu(), logits.cpu().flatten())
  raw_acc = raw_preds.flatten().eq(Y).float().mean()
 
  return EvalResult(