Standardize metrics #1167

Draft

lintangsutawika wants to merge 26 commits into main from standardize_metrics
Changes from 1 commit
Commits
26 commits
e7cd7d6
sample metrics that have both sample-wise and set-wise operations
lintangsutawika Dec 19, 2023
1d262a5
change how metrics are registered
lintangsutawika Dec 19, 2023
028f04c
loglikelihood and loglikelihood rolling modified
lintangsutawika Dec 19, 2023
6117c50
changed how metrics are calculated
lintangsutawika Dec 19, 2023
a808c66
Merge branch 'main' of https://github.com/EleutherAI/lm-evaluation-ha…
lintangsutawika Dec 19, 2023
c6a9158
update
lintangsutawika Dec 27, 2023
4d49dd0
aggregation to compute_metric
lintangsutawika Dec 28, 2023
9d6bc92
aggregation to compute_metric
lintangsutawika Dec 28, 2023
3888193
simplify registry
lintangsutawika Dec 28, 2023
039832e
removed passthrough fn
lintangsutawika Dec 28, 2023
e5b245c
remove aggregation
lintangsutawika Dec 28, 2023
20c10df
kwargs are added to metric_fn through partial at the beginning
lintangsutawika Dec 28, 2023
6a336b1
use HFEvaluateAdaptor for hf metrics
lintangsutawika Dec 28, 2023
150f11f
revert to just load metric_fn
lintangsutawika Dec 28, 2023
99ce4ef
process hf evaluate metrics
lintangsutawika Dec 28, 2023
439dca5
list tuple for string based multigpu collection
lintangsutawika Dec 29, 2023
aaf64aa
readded suport for aggregation
lintangsutawika Jan 2, 2024
787b23f
readd aggregation
lintangsutawika Jan 2, 2024
703e0d5
adjusted aggregation config
lintangsutawika Jan 2, 2024
2a573a1
adjust to be backwards compatible
lintangsutawika Jan 2, 2024
2054c2e
revert
lintangsutawika Jan 2, 2024
dfb4183
revert
lintangsutawika Jan 2, 2024
cda25fe
Merge branch 'main' into standardize_metrics
lintangsutawika Jan 2, 2024
470fb31
resolved git conflict
lintangsutawika Jan 2, 2024
dfb036b
resolved again
lintangsutawika Jan 2, 2024
de46fb9
reformat
lintangsutawika Jan 2, 2024
changed how metrics are calculated
lintangsutawika committed Dec 19, 2023
commit 6117c50786725553bda2dfd95d484c7ea6154abe
26 changes: 15 additions & 11 deletions lm_eval/evaluator.py
@@ -370,7 +370,7 @@ def evaluate(
# subset instances to only this document id ; sort by idx
requests = list(filter(lambda x: x.doc_id == doc_id, task.instances))
requests.sort(key=lambda x: x.idx)
- metrics = task.process_results(
+ items = task.process_results(
doc, [req.filtered_resps[key] for req in requests]
)
if log_samples:
@@ -383,10 +383,11 @@
"resps": [req.resps for req in requests],
"filtered_resps": [req.filtered_resps[key] for req in requests],
}
- example.update(metrics)
+ example.update(items)
samples[task_name].append(example)
- for metric, value in metrics.items():
- vals[(task_name, key, metric)].append(value)
+ vals[(task_name, key)].append(items)
+ # for metric, value in results.items():
+ # vals[(task_name, key, metric)].append(value)

if lm.world_size > 1:
# if multigpu, then gather data across all ranks
@@ -399,7 +400,8 @@ def evaluate(

# then collect metrics across all ranks
vals_torch = collections.defaultdict(list)
- for (task_name, key, metric), items in vals.items():
+ # for (task_name, key, metric), items in vals.items():
+ for (task_name, key), items in vals.items():
numitem = 0
if type(items[0]) == tuple:
numitem = len(items[0])
@@ -435,7 +437,8 @@ def evaluate(
gathered_item = [tuple(g) for g in gathered_item]

if lm.rank == 0:
- vals_torch[(task_name, key, metric)] = gathered_item
+ # vals_torch[(task_name, key, metric)] = gathered_item
+ vals_torch[(task_name, key)] = gathered_item

vals = vals_torch

@@ -469,18 +472,19 @@ def evaluate(

### Aggregate results over all datapoints ###
# aggregate results ; run bootstrap CIs
- for (task_name, key, metric), items in vals.items():
+ # for (task_name, key, metric), items in vals.items():
+ for (task_name, key), items in vals.items():
task = task_dict[task_name]
- metric_key = metric + "," + key
+ # metric_key = metric + "," + key

if type(task) == tuple:
group_name, task = task
else:
group_name = None

- agg_fn = task.aggregation()[metric]
- results[task_name][metric_key] = agg_fn(items)
- results[task_name]["samples"] = len(items)
+ for metric_key, metric_fn in task.aggregation().items():
+ results[task_name][metric_key] = metric_fn(*list(zip(*items)))
+ results[task_name]["samples"] = len(items)

# hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
# so we run them less iterations. still looking for a cleaner way to do this
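For orientation, the new aggregation loop above no longer stores one list per metric; vals is keyed only by (task_name, key), each entry holds one item per document, and zip(*items) transposes those per-sample items into per-position columns that are passed positionally to each aggregation function. Below is a minimal standalone sketch of that pattern, with a hypothetical task name, filter key, metric function, and sample data (not the harness's actual API):

import collections

def acc(golds, preds):
    # Set-wise metric: fraction of predictions that match the gold labels.
    return sum(g == p for g, p in zip(golds, preds)) / len(golds)

# Pretend task.process_results() returned one (gold, pred) tuple per document.
vals = collections.defaultdict(list)
for item in [("A", "A"), ("B", "C"), ("D", "D")]:
    vals[("demo_task", "none")].append(item)

# Stand-in for task.aggregation(): maps a metric key to its aggregation function.
aggregation = {"acc,none": acc}

results = collections.defaultdict(dict)
for (task_name, key), items in vals.items():
    # zip(*items) turns the list of per-sample tuples into per-position columns
    # (here: all golds, then all preds), passed positionally to the metric function.
    for metric_key, metric_fn in aggregation.items():
        results[task_name][metric_key] = metric_fn(*list(zip(*items)))
    results[task_name]["samples"] = len(items)

print(dict(results))  # {'demo_task': {'acc,none': 0.666..., 'samples': 3}}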