Add switch to QA pred head for ranking by confidence scores #836

Merged
merged 3 commits into from
Aug 18, 2021
farm/modeling/prediction_head.py (6 changes: 5 additions & 1 deletion)
@@ -941,6 +941,7 @@ def __init__(self, layer_dims=[768,2],
n_best_per_sample=None,
duplicate_filtering=-1,
temperature_for_confidence=1.0,
use_confidence_scores_for_ranking=False,
**kwargs):
"""
:param layer_dims: dimensions of Feed Forward block, e.g. [768,2], for adjusting to BERT embedding. Output should be always 2
@@ -963,6 +964,8 @@ def __init__(self, layer_dims=[768,2],
:type duplicate_filtering: int
:param temperature_for_confidence: The divisor that is used to scale logits to calibrate confidence scores
:type temperature_for_confidence: float
:param use_confidence_scores_for_ranking: Whether to sort answers by confidence score (normalized between 0 and 1) or by standard score (unbounded)
Member

Can we document somewhere a bit more clearly what these different scores are exactly and how they are calculated? If I remember correctly, we have these three cases:
a) score = start + end logit (unbounded)
b) confidence (default) = logits scaled to 0-1 and incorporating no_answer
c) confidence (calibrated) = same as b), but the logits are first scaled with a learned temperature parameter
I am sure I will forget about this in a couple of weeks, and it would be helpful to have it documented for others. Probably it's best to do that in the general prediction head docstring (+ Haystack's FARMReader).
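To make the three cases concrete, here is a minimal, hypothetical sketch. The function names and the softmax squashing are illustrative choices rather than FARM's exact implementation (which is not shown in this diff); only temperature_for_confidence mirrors a name from the code above.

import torch

def raw_score(start_logit: float, end_logit: float) -> float:
    # a) unbounded score: simply the sum of the start and end logits
    return start_logit + end_logit

def confidences(span_scores: torch.Tensor, no_answer_score: float,
                temperature_for_confidence: float = 1.0) -> torch.Tensor:
    # b) default confidence: scores (including no_answer) squashed into the 0-1 range
    # c) calibrated confidence: same as b), but the scores are first divided by a
    #    learned temperature; temperature_for_confidence=1.0 gives the uncalibrated default
    scores = torch.cat([span_scores, torch.tensor([no_answer_score])])
    return torch.softmax(scores / temperature_for_confidence, dim=0)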

Member Author

Good point. I added an explanation of the three kinds of scores to the docstring of the QAPredictionHead.

:type use_confidence_scores_for_ranking: bool
"""
super(QuestionAnsweringHead, self).__init__()
if len(kwargs) > 0:
@@ -988,6 +991,7 @@ def __init__(self, layer_dims=[768,2],
self.duplicate_filtering = duplicate_filtering
self.generate_config()
self.temperature_for_confidence = nn.Parameter(torch.ones(1) * temperature_for_confidence)
self.use_confidence_scores_for_ranking = use_confidence_scores_for_ranking


@classmethod
@@ -1472,7 +1476,7 @@ def reduce_preds(self, preds):

# Add no answer to positive answers, sort the order and return the n_best
n_preds = [no_answer_pred] + pos_answer_dedup
- n_preds_sorted = sorted(n_preds, key=lambda x: x.score, reverse=True)
+ n_preds_sorted = sorted(n_preds, key=lambda x: x.confidence if self.use_confidence_scores_for_ranking else x.score, reverse=True)
n_preds_reduced = n_preds_sorted[:self.n_best]
return n_preds_reduced, no_ans_gap

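The behavioral change boils down to the sort key above. A self-contained sketch of that switch, with a hypothetical Candidate dataclass standing in for FARM's QACandidate (only the two attributes used by the sort key are modelled):

from dataclasses import dataclass

@dataclass
class Candidate:          # stand-in for FARM's QACandidate
    answer: str
    score: float          # unbounded sum of start and end logits
    confidence: float     # bounded between 0 and 1

def rank(preds, use_confidence_scores_for_ranking=False, n_best=5):
    # mirror the change in reduce_preds: choose the sort key based on the switch
    key = (lambda x: x.confidence) if use_confidence_scores_for_ranking else (lambda x: x.score)
    return sorted(preds, key=key, reverse=True)[:n_best]

preds = [Candidate("no_answer", 4.1, 0.35), Candidate("GameTrailers", 3.9, 0.72)]
assert rank(preds)[0].answer == "no_answer"  # default: ranked by raw score
assert rank(preds, use_confidence_scores_for_ranking=True)[0].answer == "GameTrailers"

The two orderings can genuinely differ, which is exactly what the new test below asserts.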
test/test_question_answering.py (18 changes: 18 additions & 0 deletions)
@@ -58,6 +58,24 @@ def span_inference_result(bert_base_squad2, caplog=None):
return result


def test_span_inference_result_ranking_by_confidence(bert_base_squad2, caplog=None):
if caplog:
caplog.set_level(logging.CRITICAL)
obj_input = [QAInput(doc_text="Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created.",
questions=Question("Who counted the game among the best ever made?", uid="best_id_ever"))]
result = bert_base_squad2.inference_from_objects(obj_input, return_json=False)[0]

# by default, result is sorted by score and not by confidence
assert all(result.prediction[i].score >= result.prediction[i + 1].score for i in range(len(result.prediction) - 1))
assert not all(result.prediction[i].confidence >= result.prediction[i + 1].confidence for i in range(len(result.prediction) - 1))

# ranking can be adjusted so that result is sorted by confidence
bert_base_squad2.model.prediction_heads[0].use_confidence_scores_for_ranking = True
result_ranked_by_confidence = bert_base_squad2.inference_from_objects(obj_input, return_json=False)[0]
assert all(result_ranked_by_confidence.prediction[i].confidence >= result_ranked_by_confidence.prediction[i + 1].confidence for i in range(len(result_ranked_by_confidence.prediction) - 1))
assert not all(result_ranked_by_confidence.prediction[i].score >= result_ranked_by_confidence.prediction[i + 1].score for i in range(len(result_ranked_by_confidence.prediction) - 1))


@pytest.fixture()
def no_answer_inference_result(bert_base_squad2, caplog=None):
if caplog:
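As a usage note: besides flipping the attribute on an already loaded model, as the test does via model.prediction_heads[0], the switch can also be set when the prediction head is constructed, since use_confidence_scores_for_ranking is an ordinary constructor argument. A sketch, assuming the remaining constructor arguments can stay at the defaults shown in the diff:

from farm.modeling.prediction_head import QuestionAnsweringHead

# rank answers by confidence (0-1) instead of the unbounded raw score
qa_head = QuestionAnsweringHead(layer_dims=[768, 2], use_confidence_scores_for_ranking=True)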