evaluate.py
from dataclasses import dataclass
from functools import partial
from pathlib import Path
from typing import Callable

import pandas as pd
import torch
from simple_parsing.helpers import Serializable, field

from ..extraction.extraction import Extract
from ..files import elk_reporter_dir
from ..metrics import evaluate_preds
from ..run import Run
from ..training import Reporter
from ..utils import select_usable_devices


@dataclass
class Eval(Serializable):
    """
    Full specification of a reporter evaluation run.

    Args:
        data: Config specifying hidden states on which the reporter will be evaluated.
        source: The name of the source run directory
            which contains the reporters directory.
        num_gpus: The number of GPUs to use. Defaults to -1, which means
            "use all available GPUs".
        skip_supervised: Whether to skip evaluation of the supervised classifier.
        debug: When in debug mode, a useful log file is saved to the memorably-named
            output directory. Defaults to False.
    """

    data: Extract
    source: str = field(positional=True)

    concatenated_layer_offset: int = 0
    debug: bool = False
    min_gpu_mem: int | None = None
    num_gpus: int = -1
    out_dir: Path | None = None
    skip_supervised: bool = False
    combine_evals: bool = False

    def execute(self):
        datasets = self.data.prompts.datasets

        transfer_dir = elk_reporter_dir() / self.source / "transfer_eval"

        if self.combine_evals:
            # eval on all datasets together, in a single output directory
            run = Evaluate(cfg=self, out_dir=transfer_dir / ", ".join(datasets))
            run.evaluate()
        else:
            # eval on each dataset separately
            for dataset in datasets:
                self.data.prompts.datasets = [dataset]
                run = Evaluate(cfg=self, out_dir=transfer_dir / dataset)
                run.evaluate()
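
# Usage sketch: this config is normally populated from the command line via
# simple_parsing, but it can also be constructed directly. The Extract argument
# below is a placeholder, not a real default:
#
#     cfg = Eval(
#         data=Extract(...),         # hidden-state extraction config (placeholder)
#         source="my_training_run",  # run directory containing reporters/
#         num_gpus=1,
#     )
#     cfg.execute()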


@dataclass
class Evaluate(Run):
    cfg: Eval

    def evaluate_reporter(
        self, layer: int, devices: list[str], world_size: int = 1
    ) -> pd.DataFrame:
        """Evaluate a single reporter on a single layer."""
        device = self.get_device(devices, world_size)
        val_output = self.prepare_data(device, layer, "val")

        experiment_dir = elk_reporter_dir() / self.cfg.source

        # Load the trained reporter checkpoint for this layer
        reporter_path = experiment_dir / "reporters" / f"layer_{layer}.pt"
        reporter: Reporter = torch.load(reporter_path, map_location=device)
        reporter.eval()

        row_buf = []
        for ds_name, (val_h, val_gt, _) in val_output.items():
            val_result = evaluate_preds(val_gt, reporter(val_h))

            stats_row = {
                "dataset": ds_name,
                "layer": layer,
                **val_result.to_dict(),
            }

            # Optionally evaluate the supervised classifier saved in lr_models/,
            # if its checkpoint exists
            lr_dir = experiment_dir / "lr_models"
            if not self.cfg.skip_supervised and lr_dir.exists():
                with open(lr_dir / f"layer_{layer}.pt", "rb") as f:
                    lr_model = torch.load(f, map_location=device).eval()

                lr_result = evaluate_preds(val_gt, lr_model(val_h))
                stats_row.update(lr_result.to_dict(prefix="lr_"))

            row_buf.append(stats_row)

        return pd.DataFrame.from_records(row_buf)

    def evaluate(self):
        """Evaluate the reporter on all layers."""
        devices = select_usable_devices(
            self.cfg.num_gpus, min_memory=self.cfg.min_gpu_mem
        )

        num_devices = len(devices)
        func: Callable[[int], pd.DataFrame] = partial(
            self.evaluate_reporter, devices=devices, world_size=num_devices
        )
        self.apply_to_layers(func=func, num_devices=num_devices)
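
# Rough flow, for reference: `evaluate()` selects usable GPUs, then hands a
# partially-applied `evaluate_reporter` to `apply_to_layers`, which is assumed
# (it lives on the base `Run` class, not in this file) to call it once per
# layer on the selected devices and to collect the per-layer DataFrames into
# the run's output directory.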