"""
This file defines various common metrics of interest.
"""
import random
from typing import Optional, Sequence, Set
import numpy as np
from evals.record import Event


def get_accuracy(events: Sequence[Event]) -> float:
    """Return the fraction of events marked correct, or NaN if there are none."""
    num_correct = 0
    num_total = 0
    for event in events:
        num_total += 1
        num_correct += int(event.data["correct"])
    if num_total == 0:
        return float("nan")
    return num_correct / num_total


def get_bootstrap_accuracy_std(events: Sequence[Event], num_samples: int = 1000) -> float:
    """Estimate the variability of the accuracy by half-sample bootstrap.

    Each of num_samples rounds draws half of the events without replacement
    and averages their correctness; the spread of those means approximates
    the standard error of the accuracy.
    """
    vals = [event.data["correct"] for event in events]
    return np.std([np.mean(random.sample(vals, len(vals) // 2)) for _ in range(num_samples)])


def get_confusion_matrix(
    matches: Sequence[Event], class_labels: Optional[Set] = None
) -> np.ndarray:
    """Build a confusion matrix with one row per expected label.

    Columns follow the same label order, plus one trailing column counting
    picks that fall outside the known labels.
    """
    labels = set()
    for match in matches:
        labels.add(match.data["expected"])
    if class_labels is None:
        labels = {label: i for i, label in enumerate(sorted(labels))}
    else:
        assert labels.issubset(class_labels)
        # Sort for a deterministic row/column order; iterating a set directly
        # would make the layout vary between runs.
        labels = {label: i for i, label in enumerate(sorted(class_labels))}
    result = np.zeros((len(labels), len(labels) + 1), dtype=int)
    for match in matches:
        i = labels[match.data["expected"]]
        # Picks outside the known labels land in the extra final column.
        j = labels.get(match.data["picked"], len(labels))
        result[i, j] += 1
    return result
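
# Illustrative layout (hypothetical data): with class_labels == {"no", "yes"}
# the rows and columns are ordered ["no", "yes"], and a picked answer of
# "maybe" lands in the trailing "other" column:
#
#                  picked: no   yes  other
#     expected no      [[  1,    0,    1],
#     expected yes      [  1,    1,    0]]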


def compute_matthew_corr(confusion_matrix):
    """Compute the Matthews correlation coefficient from a binary 2x3 matrix.

    The trailing "other" column is folded into column 0, then the standard
    formula MCC = (TP*TN - FP*FN) / sqrt((TP+FN)(TN+FP)(TN+FN)(TP+FP)) is
    applied with class 1 treated as positive.
    """
    assert confusion_matrix.shape == (2, 3), f"Got shape: {confusion_matrix.shape}"
    # Copy so that folding the "other" column does not mutate the caller's matrix.
    r = confusion_matrix[:, :2].copy()
    r[:, 0] += confusion_matrix[:, 2]
    return (r[1, 1] * r[0, 0] - r[1, 0] * r[0, 1]) / np.sqrt(
        r[1, :].sum() * r[0, :].sum() * r[:, 0].sum() * r[:, 1].sum()
    )
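
# Worked example (hypothetical numbers): folding the illustrative matrix
# above gives r == [[2, 0], [1, 1]], so
#     MCC = (1*2 - 1*0) / sqrt(2 * 2 * 3 * 1) = 2 / sqrt(12) ~= 0.577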


def compute_precision(confusion_matrix, idx=0):
    """Precision for class idx: true positives over everything picked as idx."""
    return confusion_matrix[idx, idx] / confusion_matrix[:, idx].sum()


def compute_recall(confusion_matrix, idx=0):
    """Recall for class idx: true positives over everything expected to be idx."""
    return confusion_matrix[idx, idx] / confusion_matrix[idx, :].sum()


def compute_f_score(confusion_matrix, idx=0, beta=1.0):
    """F-beta score for class idx; beta=1.0 gives the usual F1 score."""
    precision = compute_precision(confusion_matrix, idx=idx)
    recall = compute_recall(confusion_matrix, idx=idx)
    return (1 + beta**2) * (precision * recall) / (beta**2 * precision + recall)


def compute_averaged_f_score(confusion_matrix, beta=1.0, average="macro"):
    """Macro-averaged F-beta score: the unweighted mean of per-class F scores."""
    assert average in ["macro"]
    f_scores = []
    for i in range(confusion_matrix.shape[0]):
        f_scores.append(compute_f_score(confusion_matrix, idx=i, beta=beta))
    return np.array(f_scores).mean()
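

if __name__ == "__main__":
    # Minimal usage sketch with hypothetical data. Real callers pass
    # evals.record.Event objects; since these metrics only read each event's
    # .data dict, SimpleNamespace stand-ins are enough for a smoke test.
    from types import SimpleNamespace

    events = [
        SimpleNamespace(data={"correct": True, "expected": "yes", "picked": "yes"}),
        SimpleNamespace(data={"correct": False, "expected": "yes", "picked": "no"}),
        SimpleNamespace(data={"correct": True, "expected": "no", "picked": "no"}),
        SimpleNamespace(data={"correct": False, "expected": "no", "picked": "maybe"}),
    ]
    print(get_accuracy(events))  # 0.5
    print(get_bootstrap_accuracy_std(events, num_samples=200))  # stochastic
    cm = get_confusion_matrix(events, class_labels={"no", "yes"})
    print(cm)  # [[1 0 1] [1 1 0]] with rows/columns ordered ["no", "yes"]
    print(compute_matthew_corr(cm))  # ~0.577, as in the worked example above
    print(compute_averaged_f_score(cm))  # mean of per-class F1, ~0.583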