Support encoder-decoder model LM output

EleutherAI · norabelrose · Apr 16, 2023 · Apr 4, 2023 · Apr 4, 2023 · Apr 4, 2023
commit a20d4ca4504f78dec4f6c5f7c52c729eb075c8aa
diff --git a/elk/evaluation/evaluate.py b/elk/evaluation/evaluate.py
@@ -7,7 +7,7 @@
 from simple_parsing.helpers import Serializable, field
 
 from elk.evaluation.evaluate_log import EvalLog
-from elk.extraction.extraction import Extract
+from elk.extraction import Extract
 from elk.run import Run
 from elk.training import Reporter
 

diff --git a/elk/extraction/extraction.py b/elk/extraction/extraction.py
@@ -1,7 +1,6 @@
 """Functions for extracting the hidden states of a model."""
 from dataclasses import InitVar, dataclass
 from datasets import (
- Array2D,
  Array3D,
  ClassLabel,
  DatasetDict,
@@ -14,17 +13,22 @@
 )
 from itertools import islice
 from simple_parsing import Serializable, field
-from transformers import AutoConfig, AutoTokenizer, PreTrainedModel
+from torch import Tensor
+from transformers import AutoConfig, AutoTokenizer
+from transformers.modeling_outputs import Seq2SeqLMOutput
 from typing import Iterable, Literal, Optional, Union
 import logging
 import os
 import torch
 
+# import torch.nn.functional as F
+
 from ..utils import (
  assert_type,
  convert_span,
  float32_to_int16,
- get_model_class,
+ instantiate_model,
+ is_autoregressive,
  select_train_val_splits,
  select_usable_devices,
 )
@@ -101,30 +105,12 @@ def extract_hiddens(
  world_size=world_size,
  ) # this dataset is already sharded, but hasn't been truncated to max_examples
 
- model_cls = get_model_class(cfg.model)
- model = assert_type(
- PreTrainedModel,
- model_cls.from_pretrained(
- cfg.model, torch_dtype="auto" if device != "cpu" else torch.float32
- ),
+ model = instantiate_model(
+ cfg.model, torch_dtype="auto" if device != "cpu" else torch.float32
  ).to(device)
- # TODO: Maybe also make this configurable?
- # We want to make sure the answer is never truncated
  tokenizer = AutoTokenizer.from_pretrained(
  cfg.model, truncation_side="left", verbose=False
  )
- is_enc_dec = model.config.is_encoder_decoder
-
- # If this is an encoder-decoder model we don't need to run the decoder at all.
- # Just strip it off, making the problem equivalent to a regular encoder-only model.
- if is_enc_dec:
- # This isn't actually *guaranteed* by HF, but it's true for all existing models
- if not hasattr(model, "get_encoder") or not callable(model.get_encoder):
- raise ValueError(
- "Encoder-decoder model doesn't have expected get_encoder() method"
- )
-
- model = assert_type(PreTrainedModel, model.get_encoder())
 
  # Iterating over questions
  layer_indices = cfg.layers or tuple(range(model.config.num_hidden_layers))
@@ -148,7 +134,7 @@ def extract_hiddens(
  )
  for layer_idx in layer_indices
  }
- model_preds = torch.empty(
+ lm_preds = torch.empty(
  num_variants,
  2, # contrast pair
  device=device,
@@ -165,29 +151,51 @@ def extract_hiddens(
  text = choice["text"]
  variant_inputs.append(text)
 
+ # TODO: Do something smarter than "rindex" here. Really we want to
+ # get the span of the answer directly from Jinja, but that doesn't
+ # seem possible. This approach may fail for complex templates.
+ answer_start = text.rindex(choice["answer"])
+
+ # Only feed question, not the answer, to the encoder for enc-dec models
+ if model.config.is_encoder_decoder:
+ # TODO: Maybe make this more generic for complex templates?
+ text = text[:answer_start].rstrip()
+ target = choice["answer"]
+ else:
+ target = None
+
  inputs = tokenizer(
  text,
  return_offsets_mapping=True,
  return_tensors="pt",
+ text_target=target, # type: ignore[arg-type]
  truncation=True,
- ).to(device)
+ )
 
  # The offset_mapping is a sorted list of (start, end) tuples. We locate
  # the start of the answer in the tokenized sequence with binary search.
  offsets = inputs.pop("offset_mapping").squeeze().tolist()
+ inputs = inputs.to(device)
 
+ # Run the forward pass
  outputs = model(**inputs, output_hidden_states=True)
 
- # TODO: Do something smarter than "rindex" here. Really we'd like to
- # get the span of the answer directly from Jinja, but that doesn't seem
- # to be supported. The current approach may fail for complex templates.
- answer_start = text.rindex(choice["answer"])
- start, end = convert_span(
- offsets, (answer_start, answer_start + len(choice["answer"]))
- )
- log_p = outputs.logits[..., start - 1 : end - 1, :].log_softmax(dim=-1)
- tokens = inputs.input_ids[..., start:end, None]
- model_preds[i, j] = log_p.gather(-1, tokens).sum()
+ # Compute the log probability of the answer tokens if available
+ if type(outputs).__name__.startswith("CausalLMOutput"):
+ start, end = convert_span(
+ offsets, (answer_start, answer_start + len(choice["answer"]))
+ )
+ log_p = outputs.logits[..., start - 1 : end - 1, :].log_softmax(
+ dim=-1
+ )
+ tokens = inputs.input_ids[..., start:end, None]
+ lm_preds[i, j] = log_p.gather(-1, tokens).sum()
+
+ elif isinstance(outputs, Seq2SeqLMOutput):
+ # The cross entropy loss is averaged over tokens, so we need to
+ # multiply by the length to get the total log probability.
+ length = inputs.labels.shape[-1]
+ lm_preds[i, j] = -assert_type(Tensor, outputs.loss) * length
 
  hiddens = (
  outputs.get("decoder_hidden_states") or outputs["hidden_states"]
@@ -216,7 +224,7 @@ def extract_hiddens(
  yield dict(
  label=example["label"],
  # We only need the probability of the positive example since this is binary
- model_preds=model_preds.softmax(dim=-1)[..., 1],
+ model_preds=lm_preds.softmax(dim=-1)[..., 1],
  variant_ids=example["template_names"],
  text_inputs=text_inputs,
  **hidden_dict,
@@ -269,10 +277,6 @@ def get_splits() -> SplitDict:
  length=num_variants,
  ),
  "label": ClassLabel(names=["neg", "pos"]),
- "model_preds": Sequence(
- Value(dtype="float32"),
- length=num_variants,
- ),
  "text_inputs": Sequence(
  Sequence(
  Value(dtype="string"),
@@ -281,6 +285,14 @@ def get_splits() -> SplitDict:
  length=num_variants,
  ),
  }
+
+ # Only add model_preds if the model is an autoregressive model
+ if is_autoregressive(model_cfg):
+ other_cols["model_preds"] = Sequence(
+ Value(dtype="float32"),
+ length=num_variants,
+ )
+
  devices = select_usable_devices(num_gpus, min_memory=cfg.min_gpu_mem)
  builders = {
  split_name: _GeneratorBuilder(

diff --git a/elk/training/train.py b/elk/training/train.py
@@ -144,15 +144,21 @@ def train_reporter(
  )
 
  reporter_dir, lr_dir = self.create_models_dir(assert_type(Path, self.out_dir))
- val_gt_cpu = val_gt.repeat_interleave(val_lm_preds.shape[1]).float().cpu()
+ if val_lm_preds is not None:
+ val_gt_cpu = val_gt.repeat_interleave(val_lm_preds.shape[1]).float().cpu()
+ val_lm_auroc = float(roc_auc_score(val_gt_cpu, val_lm_preds.flatten()))
+ val_lm_acc = float(accuracy_score(val_gt_cpu, val_lm_preds.flatten() > 0.5))
+ else:
+ val_lm_auroc = None
+ val_lm_acc = None
 
  stats = ElicitLog(
  layer=layer,
  pseudo_auroc=pseudo_auroc,
  train_loss=train_loss,
  eval_result=val_result,
- lm_auroc=float(roc_auc_score(val_gt_cpu, val_lm_preds.flatten())),
- lm_acc=float(accuracy_score(val_gt_cpu, val_lm_preds.flatten() > 0.5)),
+ lm_auroc=val_lm_auroc,
+ lm_acc=val_lm_acc,
  )
 
  if not self.cfg.skip_baseline:
@@ -165,8 +171,8 @@ def train_reporter(
  val_gt,
  device,
  )
- stats.lr_auroc = lr_auroc
- stats.lr_acc = lr_acc
+ stats.lr_auroc = float(lr_auroc)
+ stats.lr_acc = float(lr_acc)
  self.save_baseline(lr_dir, layer, lr_model)
 
  with open(reporter_dir / f"layer_{layer}.pt", "wb") as file:

diff --git a/elk/training/train_log.py b/elk/training/train_log.py
@@ -12,8 +12,9 @@ class ElicitLog:
  train_loss: float
  eval_result: EvalResult
 
- lm_auroc: float
- lm_acc: float
+ # Only available when the LM is autoregressive
+ lm_auroc: Optional[float] = None
+ lm_acc: Optional[float] = None
 
  # Only available if reporting baseline
  lr_auroc: Optional[float] = None
@@ -48,9 +49,9 @@ def to_csv_line(self, skip_baseline: bool) -> list[str]:
  self.eval_result.cal_acc,
  self.eval_result.auroc,
  self.eval_result.ece,
- self.lm_auroc,
- self.lm_acc,
  ]
+ if self.lm_auroc is not None and self.lm_acc is not None:
+ items += [self.lm_auroc, self.lm_acc]
  if not skip_baseline:
  items += [self.lr_auroc, self.lr_acc]
 

diff --git a/elk/utils/__init__.py b/elk/utils/__init__.py
@@ -8,6 +8,6 @@
 )
 
 from .gpu_utils import select_usable_devices
-from .hf_utils import get_model_class
+from .hf_utils import instantiate_model, is_autoregressive
 from .tree_utils import pytree_map
 from .typing import assert_type, float32_to_int16, int16_to_float32
diff --git a/elk/utils/hf_utils.py b/elk/utils/hf_utils.py
@@ -1,32 +1,39 @@
 from .typing import assert_type
-from transformers import AutoConfig, PreTrainedModel
-from typing import Type
+from transformers import AutoConfig, AutoModel, PretrainedConfig, PreTrainedModel
 import transformers
 
 
-def get_model_class(model_str: str) -> Type[PreTrainedModel]:
- """Get the appropriate model class for a model string."""
+# Ordered by preference
+_AUTOREGRESSIVE_SUFFIXES = [
+ # Encoder-decoder models
+ "ConditionalGeneration",
+ # Autoregressive models
+ "CausalLM",
+ "LMHeadModel",
+]
+
+
+def instantiate_model(model_str: str, **kwargs) -> PreTrainedModel:
+ """Load `model_str` with its autoregressive class, falling back to `AutoModel`."""
  model_cfg = AutoConfig.from_pretrained(model_str)
  archs = assert_type(list, model_cfg.architectures)
 
- # Ordered by preference
- suffixes = [
- # Fine-tuned for classification
- "SequenceClassification",
- # Encoder-decoder models
- "ConditionalGeneration",
- # Autoregressive models
- "CausalLM",
- "LMHeadModel",
- ]
-
- for suffix in suffixes:
+ for suffix in _AUTOREGRESSIVE_SUFFIXES:
  # Check if any of the architectures in the config end with the suffix.
  # If so, return the corresponding model class.
  for arch_str in archs:
  if arch_str.endswith(suffix):
- return getattr(transformers, arch_str)
+ model_cls = getattr(transformers, arch_str)
+ return model_cls.from_pretrained(model_str, **kwargs)
+
+ return AutoModel.from_pretrained(model_str, **kwargs)
 
- raise ValueError(
- f"'{model_str}' does not have any supported architectures: {archs}"
+
+def is_autoregressive(model_cfg: PretrainedConfig) -> bool:
+ """Check if the config lists an architecture with an autoregressive suffix."""
+ archs = assert_type(list, model_cfg.architectures)
+ return any(
+ arch_str.endswith(suffix)
+ for arch_str in archs
+ for suffix in _AUTOREGRESSIVE_SUFFIXES
  )