Merge branch 'main' into multiprocessing

EleutherAI · norabelrose · Feb 19, 2023 · Feb 16, 2023 · Feb 18, 2023 · Feb 18, 2023
commit 03b782d80ae59bda25debe1cdee88858b677d537
diff --git a/elk/__main__.py b/elk/__main__.py
@@ -4,9 +4,7 @@
 from .files import args_to_uuid
 from .list import list_runs
 from argparse import ArgumentParser
-from contextlib import nullcontext, redirect_stdout
-import logging
-import os
+
 
 def run():
  """Run `elk`.
@@ -45,6 +43,9 @@ def run():
  list_runs(args)
  return
 
+ # Import here and not at the top to speed up `elk list`
+ from .extraction.extraction_main import run as run_extraction
+ from .training.train import train
  from transformers import AutoConfig, PretrainedConfig
 
  if model := getattr(args, "model", None):
@@ -62,14 +63,14 @@ def run():
  args.layers = list(range(0, num_layers, args.layer_stride))
 
  # Default to CUDA iff available
- # TODO: args.device isn't used right now because model needs to be loaded on CPU first
+ # args.device isn't used right now because the model needs to be loaded on CPU first
  if args.device is None:
  import torch
 
  if not torch.cuda.is_available():
  args.device = "cpu"
  else:
- args.device = f"cuda"
+ args.device = "cuda"
 
  # Prevent printing from processes other than the first one
  for key in list(vars(args).keys()):

diff --git a/elk/extraction/extraction_main.py b/elk/extraction/extraction_main.py
@@ -48,7 +48,7 @@ def extract(args, split: str):
  prompt_suffix=args.prompt_suffix,
  token_loc=args.token_loc,
  use_encoder_states=args.use_encoder_states,
- num_procs=torch.cuda.device_count()
+ num_procs=torch.cuda.device_count(),
  )
  ]
  save_dir.mkdir(parents=True, exist_ok=True)

diff --git a/elk/extraction/prompt_collator.py b/elk/extraction/prompt_collator.py
@@ -150,7 +150,9 @@ def __len__(self):
  return N
 
  def set_labels(self):
- self.labels, counts = np.unique(self.dataset[self.label_column], return_counts=True)
+ self.labels, counts = np.unique(
+ self.dataset[self.label_column], return_counts=True
+ )
  self.label_fracs = counts / counts.sum()
 
  def split_and_copy(self, indices, new_seed):
@@ -160,7 +162,7 @@ def split_and_copy(self, indices, new_seed):
  according to given indices.
  """
  dataset_split = self.dataset.select(indices)
- 
+
  # only shallow copy is needed -- multiprocess will pickle (dill) objects
  self_copy = copy.copy(self)
  self_copy.dataset = dataset_split