Add switch to QA pred head for ranking by confidence scores #836

Merged
merged 3 commits into from
Aug 18, 2021
farm/modeling/prediction_head.py (6 changes: 5 additions & 1 deletion)
@@ -941,6 +941,7 @@ def __init__(self, layer_dims=[768,2],
n_best_per_sample=None,
duplicate_filtering=-1,
temperature_for_confidence=1.0,
use_confidence_scores_for_ranking=False,
**kwargs):
"""
:param layer_dims: dimensions of Feed Forward block, e.g. [768,2], for adjusting to BERT embedding. Output should be always 2
@@ -963,6 +964,8 @@ def __init__(self, layer_dims=[768,2],
:type duplicate_filtering: int
:param temperature_for_confidence: The divisor that is used to scale logits to calibrate confidence scores
:type temperature_for_confidence: float
:param use_confidence_scores_for_ranking: Whether to sort answers by confidence score (normalized between 0 and 1) or by standard score (unbounded)
Member

Can we document somewhere a bit more clearly what these different scores are exactly and how they are calculated? If I remember correctly, we have these three cases:
a) score = start + end logit (unbounded)
b) confidence (default) = logits scaled to 0-1 and incorporating no_answer
c) confidence (calibrated) = same as b), but the logits are first scaled with a learned temperature parameter
I am sure I will forget about this in a couple of weeks, and it would be helpful to have it documented for others. Probably it's best to do that in the general prediction head docstring (+ Haystack's FARMReader).
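To make the three cases concrete, here is a minimal, hypothetical sketch. The function names and the softmax squashing are illustrative choices rather than FARM's exact implementation (which is not shown in this diff); only temperature_for_confidence mirrors a name from the code above.

import torch

def raw_score(start_logit: float, end_logit: float) -> float:
    # a) unbounded score: simply the sum of the start and end logits
    return start_logit + end_logit

def confidences(span_scores: torch.Tensor, no_answer_score: float,
                temperature_for_confidence: float = 1.0) -> torch.Tensor:
    # b) default confidence: scores (including no_answer) squashed into the 0-1 range
    # c) calibrated confidence: same as b), but the scores are first divided by a
    #    learned temperature; temperature_for_confidence=1.0 gives the uncalibrated default
    scores = torch.cat([span_scores, torch.tensor([no_answer_score])])
    return torch.softmax(scores / temperature_for_confidence, dim=0)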

Member Author

Good point. I added an explanation of the three kinds of scores to the docstring of the QAPredictionHead.

:type use_confidence_scores_for_ranking: bool
"""
super(QuestionAnsweringHead, self).__init__()
if len(kwargs) > 0:
@@ -988,6 +991,7 @@ def __init__(self, layer_dims=[768,2],
self.duplicate_filtering = duplicate_filtering
self.generate_config()
self.temperature_for_confidence = nn.Parameter(torch.ones(1) * temperature_for_confidence)
self.use_confidence_scores_for_ranking = use_confidence_scores_for_ranking


@classmethod
@@ -1472,7 +1476,7 @@ def reduce_preds(self, preds):

# Add no answer to positive answers, sort the order and return the n_best
n_preds = [no_answer_pred] + pos_answer_dedup
- n_preds_sorted = sorted(n_preds, key=lambda x: x.score, reverse=True)
+ n_preds_sorted = sorted(n_preds, key=lambda x: x.confidence if self.use_confidence_scores_for_ranking else x.score, reverse=True)
n_preds_reduced = n_preds_sorted[:self.n_best]
return n_preds_reduced, no_ans_gap

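The behavioral change boils down to the sort key above. A self-contained sketch of that switch, with a hypothetical Candidate dataclass standing in for FARM's QACandidate (only the two attributes used by the sort key are modelled):

from dataclasses import dataclass

@dataclass
class Candidate:          # stand-in for FARM's QACandidate
    answer: str
    score: float          # unbounded sum of start and end logits
    confidence: float     # bounded between 0 and 1

def rank(preds, use_confidence_scores_for_ranking=False, n_best=5):
    # mirror the change in reduce_preds: choose the sort key based on the switch
    key = (lambda x: x.confidence) if use_confidence_scores_for_ranking else (lambda x: x.score)
    return sorted(preds, key=key, reverse=True)[:n_best]

preds = [Candidate("no_answer", 4.1, 0.35), Candidate("GameTrailers", 3.9, 0.72)]
assert rank(preds)[0].answer == "no_answer"  # default: ranked by raw score
assert rank(preds, use_confidence_scores_for_ranking=True)[0].answer == "GameTrailers"

The two orderings can genuinely differ, which is exactly what the new test below asserts.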
test/test_question_answering.py (18 changes: 18 additions & 0 deletions)
@@ -58,6 +58,24 @@ def span_inference_result(bert_base_squad2, caplog=None):
return result


def test_span_inference_result_ranking_by_confidence(bert_base_squad2, caplog=None):
if caplog:
caplog.set_level(logging.CRITICAL)
obj_input = [QAInput(doc_text="Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created.",
questions=Question("Who counted the game among the best ever made?", uid="best_id_ever"))]
result = bert_base_squad2.inference_from_objects(obj_input, return_json=False)[0]

# by default, result is sorted by score and not by confidence
assert all(result.prediction[i].score >= result.prediction[i + 1].score for i in range(len(result.prediction) - 1))
assert not all(result.prediction[i].confidence >= result.prediction[i + 1].confidence for i in range(len(result.prediction) - 1))

# ranking can be adjusted so that result is sorted by confidence
bert_base_squad2.model.prediction_heads[0].use_confidence_scores_for_ranking = True
result_ranked_by_confidence = bert_base_squad2.inference_from_objects(obj_input, return_json=False)[0]
assert all(result_ranked_by_confidence.prediction[i].confidence >= result_ranked_by_confidence.prediction[i + 1].confidence for i in range(len(result_ranked_by_confidence.prediction) - 1))
assert not all(result_ranked_by_confidence.prediction[i].score >= result_ranked_by_confidence.prediction[i + 1].score for i in range(len(result_ranked_by_confidence.prediction) - 1))


@pytest.fixture()
def no_answer_inference_result(bert_base_squad2, caplog=None):
if caplog:
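As a usage note: besides flipping the attribute on an already loaded model, as the test does via model.prediction_heads[0], the switch can also be set when the prediction head is constructed, since use_confidence_scores_for_ranking is an ordinary constructor argument. A sketch, assuming the remaining constructor arguments can stay at the defaults shown in the diff:

from farm.modeling.prediction_head import QuestionAnsweringHead

# rank answers by confidence (0-1) instead of the unbounded raw score
qa_head = QuestionAnsweringHead(layer_dims=[768, 2], use_confidence_scores_for_ranking=True)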