sacred_output_to_markdown_summary.py

"""Generate a markdown summary of the results of a benchmarking run."""

import argparse
import pathlib
from collections import Counter
from functools import lru_cache
from typing import Generator, Sequence, cast

import datasets
import numpy as np
from huggingface_sb3 import EnvironmentName
from rliable import library as rly
from rliable import metrics

from imitation.data import rollout, types
from imitation.data.huggingface_utils import TrajectoryDatasetSequence
from imitation.util.sacred_file_parsing import (
    find_sacred_runs,
    group_runs_by_algo_and_env,
)
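

# Random-agent baseline scores come from the HumanCompatibleAI/random-<env>
# datasets on the HuggingFace Hub; lru_cache ensures each environment's
# rollouts are only downloaded and evaluated once.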
@lru_cache(maxsize=None)
def get_random_agent_score(env: str):
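    # rollout_stats expects a sequence of TrajectoryWithRew, so the dataset's
    # "train" split is wrapped in a TrajectoryDatasetSequence first.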
stats = rollout.rollout_stats(
cast(
Sequence[types.TrajectoryWithRew],
TrajectoryDatasetSequence(
datasets.load_dataset(
f"HumanCompatibleAI/random-{EnvironmentName(env)}",
)["train"],
),
),
)
return stats["monitor_return_mean"]
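

# Generate the summary lazily, one markdown line at a time, so the caller can
# stream it straight to a file (see the __main__ block below).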
def print_markdown_summary(path: pathlib.Path) -> Generator[str, None, None]:
    if not path.is_dir():
        raise NotADirectoryError(f"Path {path} is not an existing directory.")
yield "# Benchmark Summary"
yield ""
yield (
f"This is a summary of the sacred runs in `{path}` generated by "
f"`sacred_output_to_markdown_summary.py`."
)
runs_by_algo_and_env = group_runs_by_algo_and_env(path)
algos = sorted(runs_by_algo_and_env.keys())
status_counts = Counter((run["status"] for _, run in find_sacred_runs(path)))
statuses = sorted(list(status_counts))
    # Note: the status sections are only printed if at least one run has a
    # status other than COMPLETED.
    if not (len(statuses) == 1 and statuses[0] == "COMPLETED"):
        yield "## Run Status"
        yield ""
yield "Status | Count"
yield "--- | ---"
for status in statuses:
yield f"{status} | {status_counts[status]}"
yield ""
yield "## Detailed Run Status"
yield f"Algorithm | Environment | {' | '.join(statuses)}"
yield "--- | --- " + " | --- " * len(statuses)
for algo in algos:
envs = sorted(runs_by_algo_and_env[algo].keys())
for env in envs:
status_counts = Counter(
(run["status"] for run in runs_by_algo_and_env[algo][env]),
)
yield (
f"{algo} | {env} | "
f"{' | '.join([str(status_counts[status]) for status in statuses])}"
)
yield "## Scores"
yield ""
    yield (
        "Scores are normalized using a random agent's performance as the baseline "
        "and the expert's performance as the maximum, as explained "
        "[in this blog post](https://araffin.github.io/post/rliable/):"
    )
yield "> `(score - random_score) / (expert_score - random_score)`"
yield ""
yield (
"Aggregate scores and confidence intervals are computed using the "
"[rliable library](https://agarwl.github.io/rliable/)."
)
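
    # One score table per algorithm, followed by rliable aggregate statistics.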
for algo in algos:
yield f"### {algo.upper()}"
yield "Environment | Score (mean/std)| Normalized Score (mean/std) | N"
yield " --- | --- | --- | --- "
envs = sorted(runs_by_algo_and_env[algo].keys())
accumulated_normalized_scores = []
for env in envs:
scores = [
run["result"]["imit_stats"]["monitor_return_mean"]
for run in runs_by_algo_and_env[algo][env]
]
expert_scores = [
run["result"]["expert_stats"]["monitor_return_mean"]
for run in runs_by_algo_and_env[algo][env]
]
random_score = get_random_agent_score(env)
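            # Normalize returns so a random agent scores 0 and the expert scores 1.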
normalized_score = [
(score - random_score) / (expert_score - random_score)
for score, expert_score in zip(scores, expert_scores)
]
accumulated_normalized_scores.append(normalized_score)
yield (
f"{env} | "
f"{np.mean(scores):.3f} / {np.std(scores):.3f} | "
f"{np.mean(normalized_score):.3f} / {np.std(normalized_score):.3f} | "
f"{len(scores)}"
)
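
        # Stack per-environment scores into a (runs, envs) matrix and compute the
        # mean and IQM with bootstrapped 95% confidence intervals via rliable.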
aggregate_scores, aggregate_score_cis = rly.get_interval_estimates(
{"normalized_score": np.asarray(accumulated_normalized_scores).T},
lambda x: np.array([metrics.aggregate_mean(x), metrics.aggregate_iqm(x)]),
reps=1000,
)
yield ""
yield "#### Aggregate Normalized scores"
yield "Metric | Value | 95% CI"
yield " --- | --- | --- "
yield (
f"Mean | "
f"{aggregate_scores['normalized_score'][0]:.3f} | "
f"[{aggregate_score_cis['normalized_score'][0][0]:.3f}, "
f"{aggregate_score_cis['normalized_score'][0][1]:.3f}]"
)
yield (
f"IQM | "
f"{aggregate_scores['normalized_score'][1]:.3f} | "
f"[{aggregate_score_cis['normalized_score'][1][0]:.3f}, "
f"{aggregate_score_cis['normalized_score'][1][1]:.3f}]"
)
yield ""


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Generate a markdown summary of the results of a benchmarking run.",
)
parser.add_argument("path", type=pathlib.Path)
parser.add_argument("--output", type=pathlib.Path, default="summary.md")
args = parser.parse_args()
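    # Stream the generated markdown to the output file, one line at a time.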
with open(args.output, "w") as fh:
for line in print_markdown_summary(pathlib.Path(args.path)):
fh.write(line)
fh.write("\n")
fh.flush()