Use concept-erasure implementation of LEACE and SAL

EleutherAI · norabelrose · Jul 10, 2023 · Jun 8, 2023 · Jun 11, 2023 · Jun 14, 2023
commit dc2cc499a331dcc6a1cee9d53bed2bebced37881
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -24,4 +24,4 @@ repos:
  hooks:
  - id: codespell
  # The promptsource templates spuriously get flagged without this
- args: ["-L fpr", "--skip=*.yaml"]
+ args: ["-L fpr,leace", "--skip=*.yaml"]
diff --git a/elk/training/__init__.py b/elk/training/__init__.py
@@ -1,14 +1,12 @@
 from .ccs_reporter import CcsReporter, CcsReporterConfig
 from .classifier import Classifier
-from .concept_eraser import ConceptEraser
 from .eigen_reporter import EigenReporter, EigenReporterConfig
 from .reporter import Reporter, ReporterConfig
 
 __all__ = [
  "CcsReporter",
  "CcsReporterConfig",
  "Classifier",
- "ConceptEraser",
  "EigenReporter",
  "EigenReporterConfig",
  "Reporter",

diff --git a/elk/training/ccs_reporter.py b/elk/training/ccs_reporter.py
@@ -8,13 +8,13 @@
 
 import torch
 import torch.nn as nn
+from concept_erasure import ConceptEraser
 from torch import Tensor
 
 from ..metrics import roc_auc
 from ..parsing import parse_loss
 from ..utils.typing import assert_type
 from .classifier import Classifier
-from .concept_eraser import ConceptEraser
 from .losses import LOSSES
 from .reporter import Reporter, ReporterConfig
 
@@ -302,6 +302,8 @@ def fit(self, hiddens: Tensor) -> float:
  raise RuntimeError("Got NaN/infinite loss during training")
 
  self.load_state_dict(best_state)
+ self.norm.finalize() # Save disk space by dropping covariance stats
+
  return best_loss
 
  def train_loop_adam(self, x_neg: Tensor, x_pos: Tensor) -> float:

diff --git a/elk/training/concept_eraser.py b/elk/training/concept_eraser.py
diff --git a/elk/training/eigen_reporter.py b/elk/training/eigen_reporter.py
@@ -4,12 +4,12 @@
 from pathlib import Path
 
 import torch
+from concept_erasure import ConceptEraser
 from einops import rearrange
 from torch import Tensor, nn
 
 from ..truncated_eigh import truncated_eigh
 from ..utils.math_util import cov_mean_fused
-from .concept_eraser import ConceptEraser
 from .reporter import Reporter, ReporterConfig
 
 
@@ -184,12 +184,12 @@ def update(self, hiddens: Tensor) -> None:
  if self.config.erase_prompts:
  # Independent indicator for each (template, pseudo-label) pair
  indicators = torch.eye(k * v, device=hiddens.device).expand(n, -1, -1)
- self.norm.update(x=hiddens, y=indicators)
+ self.norm.update(x=hiddens, z=indicators)
  else:
  # Only use indicators for each pseudo-label
  indicators = torch.eye(k, device=hiddens.device).expand(n, v, -1, -1)
 
- self.norm.update(x=hiddens, y=indicators)
+ self.norm.update(x=hiddens, z=indicators)
 
  # *** Invariance (intra-cluster) ***
  # This is just a standard online *mean* update, since we're computing the
@@ -283,6 +283,10 @@ def fit(self, hiddens: Tensor) -> float:
  Returns:
  loss: Negative eigenvalue associated with the VINC direction.
  """
+ # Save disk space by dropping covariance stats for LEACE
+ if not self.config.save_reporter_stats:
+ self.norm.finalize()
+
  self.update(hiddens)
  return self.fit_streaming()
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -12,6 +12,8 @@ license = { text = "MIT License" }
 dependencies = [
  # Allows us to use device_map in from_pretrained. Also needed for 8bit
  "accelerate",
+ # For pseudo-label and prompt normalization
+ "concept-erasure>=0.0.2",
  # Added distributed.split_dataset_by_node for IterableDatasets
  "datasets>=2.9.0",
  "einops",

diff --git a/tests/test_concept_eraser.py b/tests/test_concept_eraser.py