make min_memory usable; broadcast max_examples in __post_init__

EleutherAI · lauritowal · Mar 28, 2023 · Mar 9, 2023 · Mar 9, 2023 · Mar 10, 2023
commit 177eec2e02f11366f70b9c9213a267bec170eb59
diff --git a/elk/evaluation/evaluate.py b/elk/evaluation/evaluate.py
@@ -77,7 +77,7 @@ def evaluate_reporters(cfg: EvaluateConfig, out_dir: Optional[Path] = None):
  if feat.startswith("hidden_")
  ]
 
- devices = select_usable_devices(cfg.num_gpus)
+ devices = select_usable_devices(cfg.num_gpus, min_memory=cfg.target.min_gpu_mem)
  num_devices = len(devices)
 
  transfer_eval = elk_reporter_dir() / cfg.source / "transfer_eval"

diff --git a/elk/extraction/extraction.py b/elk/extraction/extraction.py
@@ -29,7 +29,7 @@
  AutoTokenizer,
  PreTrainedModel,
 )
-from typing import Iterable, Literal, Union
+from typing import Iterable, Literal, Union, Optional
 import logging
 import os
 import torch
@@ -46,6 +46,7 @@ class ExtractionConfig(Serializable):
  layer_stride: Shortcut for setting `layers` to `range(0, num_layers, stride)`.
  token_loc: The location of the token to extract hidden states from. Can be
  either "first", "last", or "mean". Defaults to "last".
+ min_gpu_mem: Minimum amount of free memory (in bytes) required to select a GPU.
  """
 
  prompts: PromptConfig
@@ -54,6 +55,7 @@ class ExtractionConfig(Serializable):
  layers: tuple[int, ...] = ()
  layer_stride: InitVar[int] = 1
  token_loc: Literal["first", "last", "mean"] = "last"
+ min_gpu_mem: Optional[int] = None
 
  def __post_init__(self, layer_stride: int):
  if self.layers and layer_stride > 1:
@@ -203,18 +205,7 @@ def get_splits() -> SplitDict:
  train_name, val_name = select_train_val_splits(available_splits)
  print(f"Using '{train_name}' for training and '{val_name}' for validation")
 
- out_splits = SplitDict(
- train=available_splits[train_name], val=available_splits[val_name]
- )
-
- # Empty list means no limit
  limit_list = cfg.prompts.max_examples
- if not limit_list:
- limit_list = [int(1e100)]
-
- # Broadcast the limit to all splits
- if len(limit_list) == 1:
- limit_list *= len(out_splits)
 
  return SplitDict(
  {
@@ -255,7 +246,7 @@ def get_splits() -> SplitDict:
  length=num_variants,
  ),
  }
- devices = select_usable_devices(num_gpus)
+ devices = select_usable_devices(num_gpus, min_memory=cfg.min_gpu_mem)
  builders = {
  split_name: _GeneratorBuilder(
  cache_dir=None,

diff --git a/elk/extraction/prompt_loading.py b/elk/extraction/prompt_loading.py
@@ -63,6 +63,12 @@ def __post_init__(self):
  "max_examples should be a list of length 0, 1, or 2,"
  f"but got {len(self.max_examples)}"
  )
+ if not self.max_examples:
+ self.max_examples = [int(1e100)]
+
+ # Broadcast the limit to all splits
+ if len(self.max_examples) == 1:
+ self.max_examples *= 2
 
 
 def load_prompts(

diff --git a/elk/training/train.py b/elk/training/train.py
@@ -200,7 +200,7 @@ def train(cfg: RunConfig, out_dir: Optional[Path] = None):
  with open(out_dir / "metadata.yaml", "w") as meta_f:
  yaml.dump(meta, meta_f)
 
- devices = select_usable_devices(cfg.num_gpus)
+ devices = select_usable_devices(cfg.num_gpus, min_memory=cfg.data.min_gpu_mem)
  num_devices = len(devices)
 
  cols = [

diff --git a/elk/utils/gpu_utils.py b/elk/utils/gpu_utils.py
@@ -1,15 +1,19 @@
 """Utilities that use PyNVML to get GPU usage info, and select GPUs accordingly."""
 
 from .typing import assert_type
+from typing import Optional
 import os
 import pynvml
 import torch
 import warnings
 import time
 
 
-def select_usable_devices(num_gpus: int = -1, *, min_memory: int = -1) -> list[str]:
+def select_usable_devices(
+ num_gpus: int = -1, *, min_memory: Optional[int] = None
+) -> list[str]:
  """Select a set of devices that have at least `min_memory` bytes of free memory.
+ Blocks until at least `num_gpus` devices are available.
 
  When there are more than enough GPUs to satisfy the request, the GPUs with the
  most free memory will be selected. With default arguments, this function will
@@ -30,7 +34,7 @@ def select_usable_devices(num_gpus: int = -1, *, min_memory: int = -1) -> list[s
  num_gpus: Number of GPUs to select. If negative, all available GPUs
  meeting the criteria will be selected.
  min_memory: Minimum amount of free memory (in bytes) required to select a GPU.
- If negative, `min_memory` is set to 90% of the per-GPU memory.
+ If None, `min_memory` is set to 90% of the per-GPU memory.
 
  Returns:
  A list of suitable PyTorch device strings, in ascending numerical order, with
@@ -85,7 +89,7 @@ def select_usable_devices(num_gpus: int = -1, *, min_memory: int = -1) -> list[s
  assert num_installed == num_visible, "PyNVML and PyTorch disagree on GPU count"
 
  # Set default value for `min_memory`
- if min_memory < 0:
+ if min_memory is None:
  min_device_ram = min(
  (
  assert_type(

diff --git a/tests/test_smoke_elicit.py b/tests/test_smoke_elicit.py
@@ -7,39 +7,45 @@
 
 
 def test_smoke_elicit_run_tiny_gpt2_ccs(tmp_path: Path):
- model_path = "sshleifer/tiny-gpt2"
+ # we need about 5 mb of gpu memory to run this test
+ model_path, min_mem = "sshleifer/tiny-gpt2", 10 * 1024**2
  dataset_name = "imdb"
  config = RunConfig(
  data=ExtractionConfig(
  model=model_path,
  prompts=PromptConfig(datasets=[dataset_name], max_examples=[10]),
+ min_gpu_mem=min_mem,
  # run on all layers, tiny-gpt only has 2 layers
  ),
+ num_gpus=2,
  net=CcsReporterConfig(),
  )
  train(config, tmp_path)
  # get the files in the tmp_path
- files: Path = list(tmp_path.iterdir())
+ files: list[Path] = list(tmp_path.iterdir())
  created_file_names = {file.name for file in files}
  expected_files = ["cfg.yaml", "metadata.yaml", "lr_models", "reporters", "eval.csv"]
  for file in expected_files:
  assert file in created_file_names
 
 
 def test_smoke_elicit_run_tiny_gpt2_eigen(tmp_path: Path):
- model_path = "sshleifer/tiny-gpt2"
+ # we need about 5 mb of gpu memory to run this test
+ model_path, min_mem = "sshleifer/tiny-gpt2", 10 * 1024**2
  dataset_name = "imdb"
  config = RunConfig(
  data=ExtractionConfig(
  model=model_path,
  prompts=PromptConfig(datasets=[dataset_name], max_examples=[10]),
+ min_gpu_mem=min_mem,
  # run on all layers, tiny-gpt only has 2 layers
  ),
+ num_gpus=2,
  net=EigenReporterConfig(),
  )
  train(config, tmp_path)
  # get the files in the tmp_path
- files: Path = list(tmp_path.iterdir())
+ files: list[Path] = list(tmp_path.iterdir())
  created_file_names = {file.name for file in files}
  expected_files = ["cfg.yaml", "metadata.yaml", "lr_models", "reporters", "eval.csv"]
  for file in expected_files: