
Refactor evaluate.py. #10

Merged · 16 commits · Apr 22, 2021
Changes from 1 commit
Put CAML metrics into metrics.py (these give different results if changed to sklearn's)

Eleven1Liu committed Apr 18, 2021
commit 3dd78ca42ea8c58e1691992d7faf7a40d75ace6d
53 changes: 11 additions & 42 deletions evaluate.py
@@ -5,6 +5,7 @@
 from sklearn.metrics import f1_score
 from tqdm import tqdm

+from metrics import macro_f1, precision_at_k, recall_at_k
 from utils import log
 from utils.utils import Timer, dump_log

@@ -33,12 +34,12 @@ def evaluate(config, model, dataset_loader, eval_metric, split='dev', dump=True)


 class FewShotMetrics():
-    def __init__(self, config, dataset, few_shot_limit=5, target_label='label'):
-        # read train / test labels
-        # dev is not considered for now
-        test_labels = np.hstack([instance[target_label]
+    def __init__(self, config, dataset, few_shot_limit=5):
+        # if dataset does not have train in the test mode?
+
+        test_labels = np.hstack([instance['label']
                                  for instance in dataset['test']])
-        train_labels = np.hstack([instance[target_label]
+        train_labels = np.hstack([instance['label']
                                   for instance in dataset['train']])

         self.config = config
@@ -96,16 +97,18 @@ def eval(self, y_true, y_pred, threshold=0.5):

 # micro/macro f1 of the target groups
 result['Micro-F1'] = f1_score(y_true=target_y_true, y_pred=target_y_pred > threshold, average='micro')
-result['Macro-F1'] = f1_score(y_true=target_y_true, y_pred=target_y_pred > threshold, average='macro')
+# result['Macro-F1'] = f1_score(y_true=target_y_true, y_pred=target_y_pred > threshold, average='macro')
+# result['Micro-F1'] = micro_f1((target_y_pred > threshold).ravel(), target_y_true.ravel())
+result['Macro-F1'] = macro_f1(target_y_true, target_y_pred > threshold)

 # find all metrics starting with P (Precision) or R (Recall)
 pattern = re.compile('(?:P|R)@\d+')
 for metric in self.config.monitor_metrics:
     for pr_metric in re.findall(pattern, metric):
         metric_type, top_k = pr_metric.split('@')
         top_k = int(top_k)
-        metric_at_k = precision_at_k(target_y_pred, target_y_true, k=top_k) if metric_type == 'P' \
-            else recall_at_k(target_y_pred, target_y_true, k=top_k)
+        metric_at_k = precision_at_k(target_y_true, target_y_pred, k=top_k) if metric_type == 'P' \
+            else recall_at_k(target_y_true, target_y_pred, k=top_k)
         result[pr_metric] = metric_at_k

 results.append(result)
@@ -122,37 +125,3 @@ def __repr__(self):
df = pd.DataFrame(results).applymap(
lambda x: f'{x * 100:.4f}' if isinstance(x, (np.floating, float)) else x)
return df.to_markdown(index=False)


-def recall_at_k(yhat_raw, y, k):
-    #num true labels in top k predictions / num true labels
-    sortd = np.argsort(yhat_raw)[:,::-1]
-    topk = sortd[:,:k]
-
-    #get recall at k for each example
-    vals = []
-    for i, tk in enumerate(topk):
-        num_true_in_top_k = y[i,tk].sum()
-        denom = y[i,:].sum()
-        vals.append(num_true_in_top_k / float(denom))
-
-    vals = np.array(vals)
-    vals[np.isnan(vals)] = 0.
-
-    return np.mean(vals)
-
-
-def precision_at_k(yhat_raw, y, k):
-    #num true labels in top k predictions / k
-    sortd = np.argsort(yhat_raw)[:,::-1]
-    topk = sortd[:,:k]
-
-    # get precision at k for each example
-    vals = []
-    for i, tk in enumerate(topk):
-        if len(tk) > 0:
-            num_true_in_top_k = y[i,tk].sum()
-            denom = len(tk)
-            vals.append(num_true_in_top_k / float(denom))
-
-    return np.mean(vals)
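For reference, the call-site change above follows the argument order of the new helpers in metrics.py (shown below), which take (y_true, y_pred, k); the old local helpers in evaluate.py took (yhat_raw, y, k). A minimal sketch of the new usage, with toy arrays that are not from the repository:

import numpy as np

from metrics import precision_at_k, recall_at_k  # module added in this commit

# Toy data only: 2 instances, 4 labels.
y_true = np.array([[1, 0, 1, 0],
                   [0, 1, 0, 0]])
y_pred = np.array([[0.9, 0.2, 0.8, 0.1],   # top-2: labels 0 and 2, both true
                   [0.6, 0.5, 0.3, 0.1]])  # top-2: labels 0 and 1, one true

print(precision_at_k(y_true, y_pred, k=2))  # (2/2 + 1/2) / 2 = 0.75
print(recall_at_k(y_true, y_pred, k=2))     # (2/2 + 1/1) / 2 = 1.0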
65 changes: 65 additions & 0 deletions metrics.py
@@ -0,0 +1,65 @@
"""Metrics different to sklearn are placed here.
Some of the functions are from CAML-MIMIC:
(https://github.com/jamesmullenbach/caml-mimic/blob/master/evaluation.py)."""


import numpy as np


def intersect_size(y_true, y_pred, axis):
    # axis=0 for label-level union (macro). axis=1 for instance-level
    return np.logical_and(y_pred, y_true).sum(axis=axis).astype(float)


def macro_precision(y_true, y_pred):
    num = intersect_size(y_pred, y_true, 0) / (y_pred.sum(axis=0) + 1e-10)
    return np.mean(num)


def macro_recall(y_true, y_pred):
    num = intersect_size(y_pred, y_true, 0) / (y_true.sum(axis=0) + 1e-10)
    return np.mean(num)


def macro_f1(y_true, y_pred):
    prec = macro_precision(y_pred, y_true)
    rec = macro_recall(y_pred, y_true)
    if prec + rec == 0:
        f1 = 0.
    else:
        f1 = 2*(prec*rec)/(prec+rec)
    return f1


def precision_at_k(y_true, y_pred, k):
    # num true labels in top k predictions / k
    sortd = np.argsort(y_pred)[:,::-1]
    topk = sortd[:,:k]

    # get precision at k for each example
    vals = []
    for i, tk in enumerate(topk):
        if len(tk) > 0:
            num_true_in_top_k = y_true[i,tk].sum()
            denom = len(tk)
            vals.append(num_true_in_top_k / float(denom))

    return np.mean(vals)

Inline review thread on the "if len(tk) > 0:" line:

Reviewer: Shouldn't len(tk) always be k?

Eleven1Liu (Collaborator, Author): There could be multiple top-ks in CAML, like [8, 15] for the full code and [5] for the top-50 code. If we don't use multiple top-ks in the future, I should remove this.

Eleven1Liu (Collaborator, Author): Wait, you're right!

Remember to solve this!
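A quick check of the point raised in the thread above, using toy values that are not from the repository: the slice sortd[:, :k] always yields rows of length min(k, number of labels), so len(tk) equals k in the usual case and is never zero as long as there is at least one label column.

import numpy as np

# Toy scores: 2 instances, 4 labels (illustration only).
y_pred = np.array([[0.9, 0.2, 0.8, 0.1],
                   [0.6, 0.5, 0.3, 0.1]])

for k in (2, 5):
    topk = np.argsort(y_pred)[:, ::-1][:, :k]
    print(k, topk.shape)  # (2, 2) for k=2; (2, 4) for k=5, since slicing just truncates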


def recall_at_k(y_true, y_pred, k):
    # num true labels in top k predictions / num true labels
    sortd = np.argsort(y_pred)[:,::-1]
    topk = sortd[:,:k]

    # get recall at k for each example
    vals = []
    for i, tk in enumerate(topk):
        num_true_in_top_k = y_true[i,tk].sum()
        denom = y_true[i,:].sum()
        vals.append(num_true_in_top_k / float(denom))

    vals = np.array(vals)
    vals[np.isnan(vals)] = 0.

    return np.mean(vals)
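On the commit message's point that these metrics "give different results" than sklearn's: macro_f1 above averages precision and recall across labels first and then takes a single harmonic mean, while sklearn's f1_score(average='macro') computes a per-label F1 and then averages those, so the two generally disagree. A small sketch with toy data (not from the repository):

import numpy as np
from sklearn.metrics import f1_score

from metrics import macro_f1  # module added in this commit

# Toy data: 3 instances, 2 labels.
y_true = np.array([[1, 0],
                   [1, 0],
                   [0, 1]])
y_pred = np.array([[1, 1],
                   [0, 0],
                   [0, 1]])

# Per-label F1 is 2/3 for both labels, so the sklearn macro average is ~0.667.
print(f1_score(y_true, y_pred, average='macro'))

# CAML-style: macro precision = mean(1, 0.5) = 0.75 and macro recall = mean(0.5, 1) = 0.75,
# so the harmonic mean is ~0.75.
print(macro_f1(y_true, y_pred))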