label_choices

EleutherAI · norabelrose · Apr 22, 2023 · Apr 22, 2023 · Apr 22, 2023 · Apr 22, 2023
commit 756fa532c48731d951e9791d8eba761f564043b2
diff --git a/elk/extraction/extraction.py b/elk/extraction/extraction.py
@@ -30,6 +30,7 @@
  assert_type,
  float32_to_int16,
  infer_label_column,
+ infer_num_classes,
  instantiate_model,
  instantiate_tokenizer,
  is_autoregressive,
@@ -216,13 +217,13 @@ def extract_hiddens(
 
  log_p = outputs.logits[..., -answer_len:, :].log_softmax(dim=-1)
  tokens = answer[..., None]
- lm_logits[i, j] = log_p.gather(-1, tokens).sum()
+ lm_logits[i, j] = log_p.gather(-1, tokens).mean()
 
  elif isinstance(outputs, Seq2SeqLMOutput):
  # The cross entropy loss is already averaged over tokens, so negating it
  # gives the mean per-token log probability. It is not multiplied by the
  # length, which matches the .mean() used in the branch above.
- length = encoding.labels.shape[-1]
- lm_logits[i, j] = -assert_type(Tensor, outputs.loss) * length
+ # length = encoding.labels.shape[-1]
+ lm_logits[i, j] = -assert_type(Tensor, outputs.loss)  # * length
 
  hiddens = (
  outputs.get("decoder_hidden_states") or outputs["hidden_states"]
@@ -300,8 +301,10 @@ def get_splits() -> SplitDict:
 
  prompter = DatasetTemplates(ds_name, config_name)
  ds_features = assert_type(Features, info.features)
- prompter.label_column or infer_label_column(ds_features)
- num_classes = 2 # prompter.num_classes or infer_num_classes(ds_features[label_col])
+ label_col = prompter.label_column or infer_label_column(ds_features)
+ num_classes = len(prompter.label_choices) or infer_num_classes(
+ ds_features[label_col]
+ )
 
  num_variants = cfg.prompts.num_variants
  if num_variants < 0:

diff --git a/elk/promptsource/templates.py b/elk/promptsource/templates.py
@@ -386,7 +386,7 @@ class DatasetTemplates:
  TEMPLATE_FILENAME = "templates.yaml"
 
  label_column: str | None
- label_choices: list[str] | None
+ label_choices: list[str]
 
  def __init__(self, dataset_name: str, subset_name: str | None = None):
  self.dataset_name = dataset_name
@@ -400,7 +400,7 @@ def __init__(self, dataset_name: str, subset_name: str | None = None):
 
  # Optional fields; label_column may be None, label_choices falls back to []
  self.label_column = yaml_dict.get(self.LABEL_COLUMN_KEY)
- self.label_choices = yaml_dict.get(self.LABEL_CHOICES_KEY)
+ self.label_choices = yaml_dict.get(self.LABEL_CHOICES_KEY, [])
 
  # Mapping from template name to template id
  self.name_to_id_mapping = {}

diff --git a/elk/promptsource/templates/glue/mnli/templates.yaml b/elk/promptsource/templates/glue/mnli/templates.yaml
@@ -1,5 +1,6 @@
 dataset: glue
 subset: mnli
+label_choices: [0, 2]
 templates:
  02b4c44e-52cb-417b-b069-5d334b1f1a91: !Template
  answer_choices: Always ||| Sometimes ||| Never