forked from EleutherAI/elk
-
Notifications
You must be signed in to change notification settings - Fork 0
/
evaluate.py
119 lines (96 loc) · 3.73 KB
/
evaluate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
from dataclasses import dataclass
from functools import partial
from pathlib import Path
from typing import Callable, Literal, Optional
import pandas as pd
import torch
from simple_parsing.helpers import Serializable, field
from ..extraction.extraction import Extract
from ..files import elk_reporter_dir
from ..run import Run
from ..training import Reporter
from ..training.baseline import evaluate_baseline, load_baseline
from ..utils import select_usable_devices
@dataclass
class Eval(Serializable):
    """
    Full specification of a reporter evaluation run.

    Args:
        data: Config specifying hidden states on which the reporter will be evaluated.
        source: The name of the source run directory
            which contains the reporters directory.
        normalization: The normalization method to use. Defaults to "meanonly". See
            `elk.training.preprocessing.normalize()` for details.
        debug: When in debug mode, a useful log file is saved to the memorably-named
            output directory. Defaults to False.
        out_dir: Optional explicit output directory. When None, results are written
            under the source run's "transfer_eval" directory.
        num_gpus: The number of GPUs to use. Defaults to -1, which means
            "use all available GPUs".
        skip_baseline: If True, skip evaluating the logistic-regression baseline
            models even when they exist. Defaults to False.
        concatenated_layer_offset: Offset used when layers were concatenated —
            presumably consumed by `Run`/`prepare_data`; confirm against `Run`.
            Defaults to 0.
        combine_evals: If True, evaluate on all datasets together in a single run;
            otherwise evaluate on each dataset separately. Defaults to False.
    """

    data: Extract
    source: str = field(positional=True)
    normalization: Literal["legacy", "none", "elementwise", "meanonly"] = "meanonly"

    debug: bool = False
    out_dir: Optional[Path] = None
    num_gpus: int = -1
    skip_baseline: bool = False
    concatenated_layer_offset: int = 0
    combine_evals: bool = False

    def execute(self):
        """Run the evaluation, combined over all datasets or one run per dataset."""
        datasets = self.data.prompts.datasets
        transfer_dir = elk_reporter_dir() / self.source / "transfer_eval"

        if self.combine_evals:
            # One run over all datasets; the output dir name joins the dataset names.
            run = Evaluate(cfg=self, out_dir=transfer_dir / ", ".join(datasets))
            run.evaluate()
        else:
            # Eval on each dataset separately, temporarily narrowing the prompt
            # config to a single dataset for each run.
            for dataset in datasets:
                self.data.prompts.datasets = [dataset]
                run = Evaluate(cfg=self, out_dir=transfer_dir / dataset)
                run.evaluate()
            # BUG FIX: restore the original dataset list — the loop above used to
            # leave the shared config permanently mutated to the last dataset only.
            self.data.prompts.datasets = datasets
@dataclass
class Evaluate(Run):
    """Evaluation run: scores the saved reporters (and, optionally, the saved
    logistic-regression baselines) of a source experiment on newly extracted
    hidden states, one layer at a time."""

    cfg: Eval

    def evaluate_reporter(
        self, layer: int, devices: list[str], world_size: int = 1
    ) -> pd.Series:
        """Evaluate a single reporter on a single layer.

        Args:
            layer: Index of the layer whose reporter is evaluated.
            devices: Devices usable by this run; one is picked via `get_device`.
            world_size: Number of parallel workers sharing `devices`.

        Returns:
            A pandas Series with the layer index, the reporter's score fields,
            and (when present and not skipped) the LR baseline's AUROC/accuracy.
        """
        device = self.get_device(devices, world_size)
        _, _, test_x0, test_x1, _, test_labels, _ = self.prepare_data(
            device,
            layer,
        )

        experiment_dir = elk_reporter_dir() / self.cfg.source

        reporter_path = experiment_dir / "reporters" / f"layer_{layer}.pt"
        # NOTE(review): torch.load unpickles arbitrary objects — only load
        # reporters from trusted experiment directories.
        reporter: Reporter = torch.load(reporter_path, map_location=device)
        reporter.eval()

        test_result = reporter.score(
            test_labels,
            test_x0,
            test_x1,
        )

        stats_row = pd.Series(
            {
                "layer": layer,
                **test_result._asdict(),
            }
        )

        lr_dir = experiment_dir / "lr_models"
        if not self.cfg.skip_baseline and lr_dir.exists():
            lr_model = load_baseline(lr_dir, layer)
            lr_model.eval()
            # BUG FIX: the original hard-coded `.cuda()` here, which crashes on
            # CPU-only machines and can land on the wrong GPU when `get_device`
            # selected a specific device. Move to the selected device instead
            # (`.to(device)` is a no-op for tensors already placed there).
            lr_auroc, lr_acc = evaluate_baseline(
                lr_model.to(device),
                test_x0.to(device),
                test_x1.to(device),
                test_labels,
            )

            stats_row["lr_auroc"] = lr_auroc
            stats_row["lr_acc"] = lr_acc

        return stats_row

    def evaluate(self):
        """Evaluate the reporter on all layers, fanning out across usable devices."""
        devices = select_usable_devices(
            self.cfg.num_gpus, min_memory=self.cfg.data.min_gpu_mem
        )
        num_devices = len(devices)

        func: Callable[[int], pd.Series] = partial(
            self.evaluate_reporter, devices=devices, world_size=num_devices
        )
        self.apply_to_layers(func=func, num_devices=num_devices)