EleutherAI · norabelrose · Apr 16, 2023 · Apr 4, 2023 · Apr 4, 2023 · Apr 4, 2023
diff --git a/elk/evaluation/evaluate.py b/elk/evaluation/evaluate.py
@@ -28,19 +28,20 @@ class Eval(Serializable):
  `elk.training.preprocessing.normalize()` for details.
  num_gpus: The number of GPUs to use. Defaults to -1, which means
  "use all available GPUs".
+ skip_supervised: Whether to skip evaluation of the supervised classifier.
  debug: When in debug mode, a useful log file is saved to the memorably-named
  output directory. Defaults to False.
  """
 
  data: Extract
  source: str = field(positional=True)
 
+ concatenated_layer_offset: int = 0
  debug: bool = False
- out_dir: Path | None = None
- num_gpus: int = -1
  min_gpu_mem: int | None = None
- skip_baseline: bool = False
- concatenated_layer_offset: int = 0
+ num_gpus: int = -1
+ out_dir: Path | None = None
+ skip_supervised: bool = False
 
  def execute(self):
  datasets = self.data.prompts.datasets
@@ -82,7 +83,7 @@ def evaluate_reporter(
  )
 
  lr_dir = experiment_dir / "lr_models"
- if not self.cfg.skip_baseline and lr_dir.exists():
+ if not self.cfg.skip_supervised and lr_dir.exists():
  with open(lr_dir / f"layer_{layer}.pt", "rb") as f:
  lr_model = torch.load(f, map_location=device).eval()
 

diff --git a/elk/extraction/prompt_loading.py b/elk/extraction/prompt_loading.py
@@ -76,8 +76,19 @@ def explode(self) -> list["PromptConfig"]:
  # Broadcast the dataset name to all data_dirs and label_columns
  if len(self.data_dirs) == 1:
  self.data_dirs *= len(self.datasets)
+ elif self.data_dirs and len(self.data_dirs) != len(self.datasets):
+ raise ValueError(
+ "data_dirs should be a list of length 0, 1, or len(datasets),"
+ f" but got {len(self.data_dirs)}"
+ )
+
  if len(self.label_columns) == 1:
  self.label_columns *= len(self.datasets)
+ elif self.label_columns and len(self.label_columns) != len(self.datasets):
+ raise ValueError(
+ "label_columns should be a list of length 0, 1, or len(datasets),"
+ f" but got {len(self.label_columns)}"
+ )
 
  for ds, data_dir, col in zip_longest(
  self.datasets, self.data_dirs, self.label_columns

diff --git a/elk/logging.py b/elk/logging.py
@@ -1,9 +1,9 @@
 import logging
 
-from .utils import select_train_val_splits
+from .utils import get_dataset_name, select_train_val_splits
 
 
-def save_debug_log(ds, out_dir):
+def save_debug_log(datasets, out_dir):
  """
  Save a debug log to the output directory. This is useful for debugging
  training issues.
@@ -16,32 +16,39 @@ def save_debug_log(ds, out_dir):
  filemode="w",
  )
 
- train_split, val_split = select_train_val_splits(ds)
- text_inputs = ds[val_split][0]["text_inputs"]
- template_ids = ds[val_split][0]["variant_ids"]
- label = ds[val_split][0]["label"]
-
- # log the train size and val size
- logging.info(f"Train size: {len(ds[train_split])}")
- logging.info(f"Val size: {len(ds[val_split])}")
-
- templates_text = f"{len(text_inputs)} templates used:\n"
- trailing_whitespace = False
- for (text0, text1), id in zip(text_inputs, template_ids):
- templates_text += (
- f'***---TEMPLATE "{id}"---***\n'
- f"{'false' if label else 'true'}:\n"
- f'"""{text0}"""\n'
- f"{'true' if label else 'false'}:\n"
- f'"""{text1}"""\n\n\n'
+ for ds in datasets:
+ logging.info(
+ "=========================================\n"
+ f"Dataset: {get_dataset_name(ds)}\n"
+ "========================================="
  )
- if text0[-1].isspace() or text1[-1].isspace():
- trailing_whitespace = True
- if trailing_whitespace:
- logging.warning(
- "Some inputs to the model have trailing whitespace! "
- "Check that the jinja templates are not adding "
- "trailing whitespace. If `token_loc` is 'last', this "
- "will extract hidden states from the whitespace token."
- )
- logging.info(templates_text)
+
+ train_split, val_split = select_train_val_splits(ds)
+ text_inputs = ds[val_split][0]["text_inputs"]
+ template_ids = ds[val_split][0]["variant_ids"]
+ label = ds[val_split][0]["label"]
+
+ # log the train size and val size
+ logging.info(f"Train size: {len(ds[train_split])}")
+ logging.info(f"Val size: {len(ds[val_split])}")
+
+ templates_text = f"{len(text_inputs)} templates used:\n"
+ trailing_whitespace = False
+ for (text0, text1), id in zip(text_inputs, template_ids):
+ templates_text += (
+ f'***---TEMPLATE "{id}"---***\n'
+ f"{'false' if label else 'true'}:\n"
+ f'"""{text0}"""\n'
+ f"{'true' if label else 'false'}:\n"
+ f'"""{text1}"""\n\n\n'
+ )
+ if text0[-1].isspace() or text1[-1].isspace():
+ trailing_whitespace = True
+ if trailing_whitespace:
+ logging.warning(
+ "Some inputs to the model have trailing whitespace! "
+ "Check that the jinja templates are not adding "
+ "trailing whitespace. If `token_loc` is 'last', this "
+ "will extract hidden states from the whitespace token."
+ )
+ logging.info(templates_text)
diff --git a/elk/training/ccs_reporter.py b/elk/training/ccs_reporter.py
@@ -7,11 +7,13 @@
 
 import torch
 import torch.nn as nn
+from sklearn.metrics import roc_auc_score
 from torch import Tensor
 from torch.nn.functional import binary_cross_entropy as bce
 
 from ..parsing import parse_loss
 from ..utils.typing import assert_type
+from .classifier import Classifier
 from .losses import LOSSES
 from .normalizer import Normalizer
 from .reporter import Reporter, ReporterConfig
@@ -55,7 +57,6 @@ class CcsReporterConfig(ReporterConfig):
  init: Literal["default", "pca", "spherical", "zero"] = "default"
  loss: list[str] = field(default_factory=lambda: ["ccs"])
  loss_dict: dict[str, float] = field(default_factory=dict, init=False)
- normalization: Literal["none", "meanonly", "full"] = "full"
  num_layers: int = 1
  pre_ln: bool = False
  seed: int = 42
@@ -96,12 +97,8 @@ def __init__(
 
  hidden_size = cfg.hidden_size or 4 * in_features // 3
 
- self.neg_norm = Normalizer(
- (in_features,), device=device, dtype=dtype, mode=cfg.normalization
- )
- self.pos_norm = Normalizer(
- (in_features,), device=device, dtype=dtype, mode=cfg.normalization
- )
+ self.neg_norm = Normalizer((in_features,), device=device, dtype=dtype)
+ self.pos_norm = Normalizer((in_features,), device=device, dtype=dtype)
 
  self.probe = nn.Sequential(
  nn.Linear(
@@ -131,6 +128,56 @@ def __init__(
  )
  )
 
+ def check_separability(
+ self,
+ train_pair: tuple[Tensor, Tensor],
+ val_pair: tuple[Tensor, Tensor],
+ ) -> float:
+ """Measure how linearly separable the pseudo-labels are for a contrast pair.
+
+ Args:
+ train_pair: A tuple of tensors, (x0, x1), each handled as (b, v, d) below.
+ Passed through neg_norm/pos_norm, then fit with labels x0 -> 0, x1 -> 1.
+ val_pair: A tuple of tensors, (x0, x1), handled like `train_pair`.
+ Passed through the same normalizers, then scored by the classifier.
+
+ Returns:
+ The AUROC on `val_pair` of a linear classifier fit on the pseudo-labels.
+ """
+ _x0, _x1 = train_pair
+ _val_x0, _val_x1 = val_pair
+
+ x0, x1 = self.neg_norm(_x0), self.pos_norm(_x1)
+ val_x0, val_x1 = self.neg_norm(_val_x0), self.pos_norm(_val_x1)
+
+ pseudo_clf = Classifier(x0.shape[-1], device=x0.device) # type: ignore
+ pseudo_train_labels = torch.cat(
+ [
+ x0.new_zeros(x0.shape[0]),
+ x0.new_ones(x0.shape[0]),
+ ]
+ ).repeat_interleave(
+ x0.shape[1]
+ ) # make num_variants copies of each pseudo-label
+ pseudo_val_labels = torch.cat(
+ [
+ val_x0.new_zeros(val_x0.shape[0]),
+ val_x0.new_ones(val_x0.shape[0]),
+ ]
+ ).repeat_interleave(val_x0.shape[1])
+
+ pseudo_clf.fit(
+ # b v d -> (b v) d
+ torch.cat([x0, x1]).flatten(0, 1),
+ pseudo_train_labels,
+ )
+ with torch.no_grad():
+ pseudo_preds = pseudo_clf(
+ # b v d -> (b v) d
+ torch.cat([val_x0, val_x1]).flatten(0, 1)
+ )
+ return float(roc_auc_score(pseudo_val_labels.cpu(), pseudo_preds.cpu()))
+
  def unsupervised_loss(self, logit0: Tensor, logit1: Tensor) -> Tensor:
  loss = sum(
  LOSSES[name](logit0, logit1, coef)
@@ -224,10 +271,10 @@ def fit(
  hiddens: Tensor,
  labels: Optional[Tensor] = None,
  ) -> float:
- """Fit the probe to the contrast pair (x0, x1).
+ """Fit the probe to the contrast pair (neg, pos).
 
  Args:
- contrast_pair: A tuple of tensors, (x0, x1), where x0 and x1 are the
+ contrast_pair: A tuple of tensors, (neg, pos), where neg and pos are the
  contrastive representations.
  labels: The labels of the contrast pair. Defaults to None.
 
@@ -275,8 +322,8 @@ def fit(
 
  def train_loop_adam(
  self,
- x_pos: Tensor,
  x_neg: Tensor,
+ x_pos: Tensor,
  labels: Optional[Tensor] = None,
  ) -> float:
  """Adam train loop, returning the final loss. Modifies params in-place."""
@@ -297,8 +344,8 @@ def train_loop_adam(
 
  def train_loop_lbfgs(
  self,
- x_pos: Tensor,
  x_neg: Tensor,
+ x_pos: Tensor,
  labels: Optional[Tensor] = None,
  ) -> float:
  """LBFGS train loop, returning the final loss. Modifies params in-place."""

diff --git a/elk/training/reporter.py b/elk/training/reporter.py
@@ -14,7 +14,6 @@
 
 from ..calibration import CalibrationError
 from ..metrics import accuracy, to_one_hot
-from .classifier import Classifier
 
 
 class EvalResult(NamedTuple):
@@ -62,55 +61,6 @@ class OptimConfig(Serializable):
 class Reporter(nn.Module, ABC):
  """An ELK reporter network."""
 
- @classmethod
- def check_separability(
- cls,
- train_hiddens: Tensor,
- val_hiddens: Tensor,
- ) -> float:
- """Measure how linearly separable the pseudo-labels are for a contrast pair.
-
- Args:
- train_hiddens: Contrast set of shape [n, v, k, d]. Used for training the
- classifier.
- val_hiddens: Contrast set of shape [n, v, k, d]. Used for evaluating the
- classifier.
-
- Returns:
- The AUROC of a linear classifier fit on the pseudo-labels.
- """
- (n_train, v, k, d) = train_hiddens.shape
- (n_val, _, k_val, d_val) = val_hiddens.shape
- assert d == d_val, "Must have the same number of features in each split"
- assert k == k_val == 2, "Must be a binary contrast set"
-
- pseudo_clf = Classifier(d, device=train_hiddens.device)
- pseudo_train_labels = torch.cat(
- [
- train_hiddens.new_zeros(n_train),
- train_hiddens.new_ones(n_train),
- ]
- ).repeat_interleave(
- v
- ) # make num_variants copies of each pseudo-label
-
- pseudo_val_labels = torch.cat(
- [
- val_hiddens.new_zeros(n_val),
- val_hiddens.new_ones(n_val),
- ]
- ).repeat_interleave(v)
-
- pseudo_clf.fit(
- rearrange(train_hiddens, "n v k d -> (k n v) d"),
- pseudo_train_labels,
- )
- with torch.no_grad():
- pseudo_preds = pseudo_clf(
- rearrange(val_hiddens, "n v k d -> (k n v) d"),
- )
- return float(roc_auc_score(pseudo_val_labels.cpu(), pseudo_preds.cpu()))
-
  def reset_parameters(self):
  """Reset the parameters of the probe."""
 

diff --git a/elk/training/supervised.py b/elk/training/supervised.py
@@ -26,7 +26,7 @@ def evaluate_supervised(
  return assert_type(float, lr_auroc), assert_type(float, lr_acc)
 
 
-def train_supervised(data: dict[str, tuple], device: str) -> Classifier:
+def train_supervised(data: dict[str, tuple], device: str, cv: bool) -> Classifier:
  Xs, train_labels = [], []
 
  for train_h, labels, _ in data.values():
@@ -41,6 +41,9 @@ def train_supervised(data: dict[str, tuple], device: str) -> Classifier:
 
  X, train_labels = torch.cat(Xs), torch.cat(train_labels)
  lr_model = Classifier(X.shape[-1], device=device)
- lr_model.fit_cv(X, train_labels)
+ if cv:
+ lr_model.fit_cv(X, train_labels)
+ else:
+ lr_model.fit(X, train_labels)
 
  return lr_model