EleutherAI · norabelrose · Jul 10, 2023 · Jun 8, 2023 · Jun 11, 2023 · Jun 14, 2023
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -24,4 +24,4 @@ repos:
  hooks:
  - id: codespell
  # The promptsource templates spuriously get flagged without this
- args: ["-L fpr", "--skip=*.yaml"]
+ args: ["-L fpr,leace", "--skip=*.yaml"]
diff --git a/elk/training/__init__.py b/elk/training/__init__.py
@@ -1,14 +1,12 @@
 from .ccs_reporter import CcsReporter, CcsReporterConfig
 from .classifier import Classifier
-from .concept_eraser import ConceptEraser
 from .eigen_reporter import EigenReporter, EigenReporterConfig
 from .reporter import Reporter, ReporterConfig
 
 __all__ = [
  "CcsReporter",
  "CcsReporterConfig",
  "Classifier",
- "ConceptEraser",
  "EigenReporter",
  "EigenReporterConfig",
  "Reporter",

diff --git a/elk/training/ccs_reporter.py b/elk/training/ccs_reporter.py
@@ -8,13 +8,11 @@
 
 import torch
 import torch.nn as nn
+from concept_erasure import ConceptEraser
 from torch import Tensor
 
-from ..metrics import roc_auc
 from ..parsing import parse_loss
 from ..utils.typing import assert_type
-from .classifier import Classifier
-from .concept_eraser import ConceptEraser
 from .losses import LOSSES
 from .reporter import Reporter, ReporterConfig
 
@@ -142,60 +140,6 @@ def __init__(
  )
  )
 
- @torch.no_grad()
- def check_separability(
- self,
- train_pair: tuple[Tensor, Tensor],
- val_pair: tuple[Tensor, Tensor],
- ) -> float:
- """Measure how linearly separable the pseudo-labels are for a contrast pair.
-
- Args:
- train_pair: A tuple of tensors, (x0, x1), where x0 and x1 are the
- contrastive representations. Used for training the classifier.
- val_pair: A tuple of tensors, (x0, x1), where x0 and x1 are the
- contrastive representations. Used for evaluating the classifier.
-
- Returns:
- The AUROC of a linear classifier fit on the pseudo-labels.
- """
- x0, x1 = map(self.norm, train_pair)
- val_x0, val_x1 = map(self.norm, val_pair)
-
- pseudo_clf = Classifier(x0.shape[-1], device=x0.device) # type: ignore
- pseudo_train = torch.cat(
- [
- torch.zeros_like(x0[..., 0]),
- torch.ones_like(x1[..., 0]),
- ]
- ).flatten()
- pseudo_val = torch.cat(
- [
- torch.zeros_like(val_x0[..., 0]),
- torch.ones_like(val_x1[..., 0]),
- ]
- ).flatten()
-
- pseudo_clf.fit(
- # b v d -> (b v) d
- torch.cat([x0, x1]).flatten(0, 1),
- pseudo_train,
- # Use the same weight decay as the reporter
- l2_penalty=self.config.weight_decay,
- )
- pseudo_preds = pseudo_clf(
- # b v d -> (b v) d
- torch.cat([val_x0, val_x1]).flatten(0, 1)
- ).squeeze(-1)
-
- # Edge case where the classifier learns to set its weights to zero
- # Technically AUROC is not defined here but we "fill in" the value of 0.5
- # since this is the limit as the weights approach zero
- if not pseudo_preds.any():
- return 0.5
- else:
- return roc_auc(pseudo_val, pseudo_preds).item()
-
  def reset_parameters(self):
  """Reset the parameters of the probe.
 
@@ -265,12 +209,12 @@ def fit(self, hiddens: Tensor) -> float:
  self.norm.update(
  x=x_neg,
  # Independent indicator for each (template, pseudo-label) pair
- y=torch.cat([torch.zeros_like(prompt_ids), prompt_ids], dim=-1),
+ z=torch.cat([torch.zeros_like(prompt_ids), prompt_ids], dim=-1),
  )
  self.norm.update(
  x=x_pos,
  # Independent indicator for each (template, pseudo-label) pair
- y=torch.cat([prompt_ids, torch.zeros_like(prompt_ids)], dim=-1),
+ z=torch.cat([prompt_ids, torch.zeros_like(prompt_ids)], dim=-1),
  )
  x_neg, x_pos = self.norm(x_neg), self.norm(x_pos)
 

diff --git a/elk/training/concept_eraser.py b/elk/training/concept_eraser.py
diff --git a/elk/training/eigen_reporter.py b/elk/training/eigen_reporter.py
@@ -4,12 +4,12 @@
 from pathlib import Path
 
 import torch
+from concept_erasure import ConceptEraser
 from einops import rearrange
 from torch import Tensor, nn
 
 from ..truncated_eigh import truncated_eigh
 from ..utils.math_util import cov_mean_fused
-from .concept_eraser import ConceptEraser
 from .reporter import Reporter, ReporterConfig
 
 
@@ -184,12 +184,12 @@ def update(self, hiddens: Tensor) -> None:
  if self.config.erase_prompts:
  # Independent indicator for each (template, pseudo-label) pair
  indicators = torch.eye(k * v, device=hiddens.device).expand(n, -1, -1)
- self.norm.update(x=hiddens, y=indicators)
+ self.norm.update(x=hiddens, z=indicators)
  else:
  # Only use indicators for each pseudo-label
  indicators = torch.eye(k, device=hiddens.device).expand(n, v, -1, -1)
 
- self.norm.update(x=hiddens, y=indicators)
+ self.norm.update(x=hiddens, z=indicators)
 
  # *** Invariance (intra-cluster) ***
  # This is just a standard online *mean* update, since we're computing the

diff --git a/elk/training/train.py b/elk/training/train.py
@@ -82,13 +82,6 @@ def apply_to_layer(
  train_loss = reporter.fit(first_train_h)
 
  (val_h, val_gt, _) = next(iter(val_dict.values()))
- x0, x1 = first_train_h.unbind(2)
- val_x0, val_x1 = val_h.unbind(2)
- pseudo_auroc = reporter.check_separability(
- train_pair=(x0, x1),
- val_pair=(val_x0, val_x1),
- )
-
  (_, v, k, _) = first_train_h.shape
  reporter.platt_scale(
  to_one_hot(repeat(train_gt, "n -> (n v)", v=v), k).flatten(),
@@ -112,7 +105,6 @@ def apply_to_layer(
  )
  reporter.update(train_h)
 
- pseudo_auroc = None
  train_loss = reporter.fit_streaming()
  reporter.platt_scale(
  torch.cat(label_list),
@@ -150,7 +142,6 @@ def apply_to_layer(
  **meta,
  "ensembling": mode,
  **evaluate_preds(val_gt, val_credences, mode).to_dict(),
- "pseudo_auroc": pseudo_auroc,
  "train_loss": train_loss,
  }
  )

diff --git a/pyproject.toml b/pyproject.toml
@@ -12,6 +12,9 @@ license = { text = "MIT License" }
 dependencies = [
  # Allows us to use device_map in from_pretrained. Also needed for 8bit
  "accelerate",
+ # For pseudo-label and prompt normalization. We're picky about the version because
+ # the package isn't guaranteed to be stable yet.
+ "concept-erasure==0.0.2",
  # Added distributed.split_dataset_by_node for IterableDatasets
  "datasets>=2.9.0",
  "einops",

diff --git a/tests/test_concept_eraser.py b/tests/test_concept_eraser.py