import logging
# We configure how logging messages should be displayed and which log level should be used before importing Haystack.
# Example log message:
# INFO - haystack.utils.preprocessing - Converting data/tutorial1/218_Olenna_Tyrell.txt
# Default log level in basicConfig is WARNING so the explicit parameter is not necessary but can be changed easily:
logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)
import tempfile
from pathlib import Path
from haystack.document_stores import ElasticsearchDocumentStore, InMemoryDocumentStore
from haystack.pipelines import Pipeline, ExtractiveQAPipeline, DocumentSearchPipeline
from haystack.nodes import (
BM25Retriever,
DensePassageRetriever,
EmbeddingRetriever,
FARMReader,
PreProcessor,
TextConverter,
)
from haystack.utils import fetch_archive_from_http, launch_es
from haystack.schema import Answer, Document, EvaluationResult, Label, MultiLabel, Span
def tutorial5_evaluation():
# Make sure these indices do not collide with existing ones; they will be wiped clean before data is inserted
doc_index = "tutorial5_docs"
label_index = "tutorial5_labels"
##############################################
# Code
##############################################
launch_es()
# Download evaluation data, which is a subset of the Natural Questions development set containing 50 documents with one question per document and multiple annotated answers
doc_dir = "data/tutorial5"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/nq_dev_subset_v2.json.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
# Connect to Elasticsearch
document_store = ElasticsearchDocumentStore(
host="localhost",
username="",
password="",
index=doc_index,
label_index=label_index,
embedding_field="emb",
embedding_dim=768,
excluded_meta_data=["emb"],
)
# Add evaluation data to Elasticsearch document store
# We first delete the custom tutorial indices to avoid duplicate elements
# and also split our documents into shorter passages using the PreProcessor
preprocessor = PreProcessor(
split_by="word",
split_length=200,
split_overlap=0,
split_respect_sentence_boundary=False,
clean_empty_lines=False,
clean_whitespace=False,
)
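# As a quick sanity check of these split settings, you can run the PreProcessor on a toy document.
# A minimal sketch (the sample text is made up; Document is imported above):
# sample = Document(content="word " * 450)
# print(len(preprocessor.process([sample])))  # 450 words at split_length=200 -> 3 passages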
document_store.delete_documents(index=doc_index)
document_store.delete_documents(index=label_index)
# The add_eval_data() method converts the given dataset in JSON format into Haystack document and label objects.
# Those objects are then indexed in their respective document and label index in the document store.
# The method can be used with any dataset in SQuAD format.
document_store.add_eval_data(
filename="data/tutorial5/nq_dev_subset_v2.json",
doc_index=doc_index,
label_index=label_index,
preprocessor=preprocessor,
)
# Initialize Retriever
retriever = BM25Retriever(document_store=document_store)
# Alternative: Evaluate dense retrievers (EmbeddingRetriever or DensePassageRetriever)
# The EmbeddingRetriever uses a single transformer-based encoder model for both query and document.
# In contrast, DensePassageRetriever uses two separate encoders, one for the query and one for the passage.
# Please make sure the "embedding_dim" parameter in the DocumentStore above matches the output dimension of your models!
# Please also take care that the PreProcessor splits your files into chunks that fit within
# the max_seq_len limit of the Transformer models
# The SentenceTransformer model "sentence-transformers/multi-qa-mpnet-base-dot-v1" generally works well with the EmbeddingRetriever on any kind of English text.
# For more information and suggestions on different models check out the documentation at: https://www.sbert.net/docs/pretrained_models.html
# EmbeddingRetriever and DensePassageRetriever are already imported from haystack.nodes above.
# retriever = EmbeddingRetriever(document_store=document_store,
# embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1")
# retriever = DensePassageRetriever(document_store=document_store,
# query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
# passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
# use_gpu=True,
# max_seq_len_passage=256,
# embed_title=True)
# document_store.update_embeddings(retriever, index=doc_index)
# Initialize Reader
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", top_k=4, return_no_answer=True)
# Define a pipeline consisting of the initialized retriever and reader
# Here we evaluate retriever and reader in an integrated (a.k.a. open-domain) fashion on the full corpus of documents,
# i.e. a document is considered
# correctly retrieved if it contains the gold answer string within it. The reader is evaluated based purely on the
# predicted answer string, regardless of which document it came from and the position of the extracted span.
# The generation of predictions is separated from the calculation of metrics.
# This allows you to run the computation-heavy model predictions only once and then iterate flexibly on the metrics or reports you want to generate.
pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever)
# The evaluation also works with any other pipeline.
# For example you could use a DocumentSearchPipeline as an alternative:
# pipeline = DocumentSearchPipeline(retriever=retriever)
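# Because prediction generation and metric calculation are decoupled, the typical workflow is to call
# eval() once and then iterate on calculate_metrics(). A minimal sketch of the pattern used below
# (eval_labels is loaded in the next step):
# eval_result = pipeline.eval(labels=eval_labels, params={"Retriever": {"top_k": 5}})
# print(eval_result.calculate_metrics()["Reader"]["f1"])
# print(eval_result.calculate_metrics(answer_scope="context")["Reader"]["f1"])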
# We can load evaluation labels from the document store
# We are also opting to filter out no_answer samples
eval_labels = document_store.get_all_labels_aggregated(drop_negative_labels=True, drop_no_answers=True)
## Alternative: Define queries and labels directly
# eval_labels = [
# MultiLabel(
# labels=[
# Label(
# query="who is written in the book of life",
# answer=Answer(
# answer="every person who is destined for Heaven or the World to Come",
# offsets_in_context=[Span(374, 434)]
# ),
# document=Document(
# id='1b090aec7dbd1af6739c4c80f8995877-0',
# content_type="text",
# content='Book of Life - wikipedia Book of Life Jump to: navigation, search This article is '
# 'about the book mentioned in Christian and Jewish religious teachings...'
# ),
# is_correct_answer=True,
# is_correct_document=True,
# origin="gold-label"
# )
# ]
# )
# ]
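# Either way, we can inspect one aggregated label to see what the pipeline is evaluated against.
# A minimal sketch, assuming the MultiLabel/Label/Answer structure shown in the alternative above:
# label = eval_labels[0]
# print(label.query)
# print([l.answer.answer for l in label.labels])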
# Similar to pipeline.run(), we can execute pipeline.eval()
eval_result = pipeline.eval(labels=eval_labels, params={"Retriever": {"top_k": 5}})
# The EvaluationResult contains a pandas dataframe for each pipeline node.
# That's why there are two dataframes in the EvaluationResult of an ExtractiveQAPipeline.
retriever_result = eval_result["Retriever"]
retriever_result.head()
reader_result = eval_result["Reader"]
reader_result.head()
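# The dataframes contain one row per retrieved document or predicted answer.
# A minimal sketch for inspecting a few informative columns (column names taken from
# Haystack's EvaluationResult dataframes and may vary slightly between versions):
# reader_result[["query", "answer", "gold_answers", "exact_match", "f1"]].head()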
# We can filter for all documents retrieved for a given query
query = "who is written in the book of life"
retriever_book_of_life = retriever_result[retriever_result["query"] == query]
# We can also filter for all answers predicted for a given query
reader_book_of_life = reader_result[reader_result["query"] == "who is written in the book of life"]
# Save the evaluation result so that we can reload it later
# and calculate evaluation metrics without running the pipeline again.
eval_result.save("../")
## Calculating Evaluation Metrics
# Load an EvaluationResult to quickly calculate standard evaluation metrics for all predictions,
# such as F1-score of each individual prediction of the Reader node or recall of the retriever.
# To learn more about the metrics, see [Evaluation Metrics](https://haystack.deepset.ai/guides/evaluation#metrics-retrieval)
saved_eval_result = EvaluationResult.load("../")
metrics = saved_eval_result.calculate_metrics()
print(f'Retriever - Recall (single relevant document): {metrics["Retriever"]["recall_single_hit"]}')
print(f'Retriever - Recall (multiple relevant documents): {metrics["Retriever"]["recall_multi_hit"]}')
print(f'Retriever - Mean Reciprocal Rank: {metrics["Retriever"]["mrr"]}')
print(f'Retriever - Precision: {metrics["Retriever"]["precision"]}')
print(f'Retriever - Mean Average Precision: {metrics["Retriever"]["map"]}')
print(f'Reader - F1-Score: {metrics["Reader"]["f1"]}')
print(f'Reader - Exact Match: {metrics["Reader"]["exact_match"]}')
## Generating an Evaluation Report
# A summary of the evaluation results can be printed to get a quick overview.
# It includes some aggregated metrics and also shows a few wrongly predicted examples.
pipeline.print_eval_report(saved_eval_result)
## Advanced Evaluation Metrics
# Semantic Answer Similarity (SAS) is an advanced evaluation metric that can be calculated in Haystack.
# This metric takes into account whether the meaning of a predicted answer is similar to the annotated gold answer
# rather than just doing string comparison. To this end, SAS relies on pre-trained models.
# For English, we recommend "cross-encoder/stsb-roberta-large", whereas for German we recommend "deepset/gbert-large-sts".
# A good multilingual model is "sentence-transformers/paraphrase-multilingual-mpnet-base-v2".
# More info on this metric can be found in our [paper](https://arxiv.org/abs/2108.06130)
# or in our [blog post](https://www.deepset.ai/blog/semantic-answer-similarity-to-evaluate-qa).
advanced_eval_result = pipeline.eval(
labels=eval_labels,
params={"Retriever": {"top_k": 5}},
sas_model_name_or_path="cross-encoder/stsb-roberta-large",
)
metrics = advanced_eval_result.calculate_metrics()
print(metrics["Reader"]["sas"])
## Isolated Evaluation Mode
# The isolated node evaluation uses labels as input to the Reader node instead of the output of the preceding Retriever node.
# This way, we can additionally calculate the upper bounds of the Reader's evaluation metrics.
# Note that even with isolated evaluation enabled, integrated evaluation will still be run.
eval_result_with_upper_bounds = pipeline.eval(
labels=eval_labels, params={"Retriever": {"top_k": 5}}, add_isolated_node_eval=True
)
pipeline.print_eval_report(eval_result_with_upper_bounds)
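# The isolated (upper-bound) Reader metrics can also be accessed directly by selecting the eval mode
# when calculating metrics, as done again at the end of this tutorial:
# upper_bound_metrics = eval_result_with_upper_bounds.calculate_metrics(eval_mode="isolated")
# print(upper_bound_metrics["Reader"]["f1"])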
## Advanced Label Scopes
# Answers are considered correct if the predicted answer matches the gold answer in the labels.
# Documents are considered correct if the predicted document ID matches the gold document ID in the labels.
# Sometimes, these simple definitions of "correctness" are not sufficient.
# There are cases where you want to further specify the "scope" within which an answer or a document is considered correct.
# For this reason, `EvaluationResult.calculate_metrics()` offers the parameters `answer_scope` and `document_scope`.
#
# Say you want to ensure that an answer is only considered correct if it stems from a specific context of surrounding words.
# This is especially useful if your answer is very short, like a date (for example, "2011") or a place ("Berlin").
# Such a short answer might easily appear in multiple completely different contexts.
# Some of those contexts might perfectly fit the actual question and answer it.
# Some others might not: they don't relate to the question at all but still contain the answer string.
# In that case, you might want to ensure that only answers that stem from the correct context are considered correct.
# To do that, specify `answer_scope="context"` in `calculate_metrics()`.
#
# `answer_scope` takes the following values:
# - `any` (default): Any matching answer is considered correct.
# - `context`: The answer is only considered correct if its context matches as well. It uses fuzzy matching (see `context_matching` parameters of `pipeline.eval()`).
# - `document_id`: The answer is only considered correct if its document ID matches as well. You can specify a custom document ID through the `custom_document_id_field` parameter of `pipeline.eval()`.
# - `document_id_and_context`: The answer is only considered correct if its document ID and its context match as well.
#
# In Question Answering, to enforce that the retrieved document is considered correct whenever the answer is correct, set `document_scope` to `answer` or `document_id_or_answer`.
#
# `document_scope` takes the following values:
# - `document_id`: Specifies that the document ID must match. You can specify a custom document ID through the `custom_document_id_field` parameter of `pipeline.eval()`.
# - `context`: Specifies that the content of the document must match. It uses fuzzy matching (see the `context_matching` parameters of `pipeline.eval()`).
# - `document_id_and_context`: A Boolean operation specifying that both `'document_id' AND 'context'` must match.
# - `document_id_or_context`: A Boolean operation specifying that either `'document_id' OR 'context'` must match.
# - `answer`: Specifies that the document contents must include the answer. The selected `answer_scope` is enforced.
# - `document_id_or_answer` (default): A Boolean operation specifying that either `'document_id' OR 'answer'` must match.
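# A quick way to see how the chosen scope affects a metric is to compute it under several scopes.
# A minimal sketch, reusing `saved_eval_result` from above with the scope values documented in the list:
# for scope in ["any", "context", "document_id", "document_id_and_context"]:
#     scoped_metrics = saved_eval_result.calculate_metrics(answer_scope=scope)
#     print(scope, scoped_metrics["Reader"]["exact_match"])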
metrics = saved_eval_result.calculate_metrics(answer_scope="context")
print(f'Retriever - Recall (single relevant document): {metrics["Retriever"]["recall_single_hit"]}')
print(f'Retriever - Recall (multiple relevant documents): {metrics["Retriever"]["recall_multi_hit"]}')
print(f'Retriever - Mean Reciprocal Rank: {metrics["Retriever"]["mrr"]}')
print(f'Retriever - Precision: {metrics["Retriever"]["precision"]}')
print(f'Retriever - Mean Average Precision: {metrics["Retriever"]["map"]}')
print(f'Reader - F1-Score: {metrics["Reader"]["f1"]}')
print(f'Reader - Exact Match: {metrics["Reader"]["exact_match"]}')
# Inspect a document to see its meta fields (for example, "name"), which we use as a custom document ID below
document_store.get_all_documents()[0]
# Let's try document retrieval on the file level: it's sufficient if the correct file,
# identified by its name (for example, 'Book of Life'), was retrieved.
eval_result_custom_doc_id = pipeline.eval(
labels=eval_labels, params={"Retriever": {"top_k": 5}}, custom_document_id_field="name"
)
metrics = eval_result_custom_doc_id.calculate_metrics(document_scope="document_id")
print(f'Retriever - Recall (single relevant document): {metrics["Retriever"]["recall_single_hit"]}')
print(f'Retriever - Recall (multiple relevant documents): {metrics["Retriever"]["recall_multi_hit"]}')
print(f'Retriever - Mean Reciprocal Rank: {metrics["Retriever"]["mrr"]}')
print(f'Retriever - Precision: {metrics["Retriever"]["precision"]}')
print(f'Retriever - Mean Average Precision: {metrics["Retriever"]["map"]}')
# Let's enforce the context again:
metrics = eval_result_custom_doc_id.calculate_metrics(document_scope="document_id_and_context")
print(f'Retriever - Recall (single relevant document): {metrics["Retriever"]["recall_single_hit"]}')
print(f'Retriever - Recall (multiple relevant documents): {metrics["Retriever"]["recall_multi_hit"]}')
print(f'Retriever - Mean Reciprocal Rank: {metrics["Retriever"]["mrr"]}')
print(f'Retriever - Precision: {metrics["Retriever"]["precision"]}')
print(f'Retriever - Mean Average Precision: {metrics["Retriever"]["map"]}')
## Storing results in MLflow
# Storing evaluation results in CSVs is fine but not enough if you want to compare and track multiple evaluation runs. MLflow is a handy tool when it comes to tracking experiments. So we decided to use it to track all runs of `Pipeline.eval()` with reproducibility of your experiments in mind.
### Host your own MLflow or use deepset's public MLflow
# If you don't want to use deepset's public MLflow instance under https://public-mlflow.deepset.ai, you can easily host it yourself.
# !pip install mlflow
# !mlflow server --serve-artifacts
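# If you host MLflow yourself, point the experiment_tracking_uri argument of execute_eval_run() below
# at your own server instead, e.g. "http://localhost:5000" (the default address of `mlflow server`).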
### Preprocessing the dataset
# Preprocessing the dataset works a bit differently than before. Instead of directly generating documents (and labels) out of a SQuAD file, we first save them to disk. This is necessary to experiment with different indexing pipelines.
document_store = InMemoryDocumentStore()
label_preprocessor = PreProcessor(
split_length=200,
split_overlap=0,
split_respect_sentence_boundary=False,
clean_empty_lines=False,
clean_whitespace=False,
)
# The add_eval_data() method converts the given dataset in JSON format into Haystack document and label objects.
# Those objects are then indexed in their respective document and label index in the document store.
# The method can be used with any dataset in SQuAD format.
# We only use it to get the evaluation set labels and the corpus files.
document_store.add_eval_data(
filename="data/tutorial5/nq_dev_subset_v2.json",
doc_index=document_store.index,
label_index=document_store.label_index,
preprocessor=label_preprocessor,
)
# the evaluation set to evaluate the pipelines on
evaluation_set_labels = document_store.get_all_labels_aggregated(drop_negative_labels=True, drop_no_answers=True)
# Pipelines need files as input to be able to test different preprocessors.
# Even though writing the documents back to files looks a bit cumbersome, we gain a lot of evaluation flexibility and reproducibility.
docs = document_store.get_all_documents()
temp_dir = tempfile.TemporaryDirectory()
file_paths = []
for doc in docs:
file_name = doc.id + ".txt"
file_path = Path(temp_dir.name) / file_name
file_paths.append(file_path)
with open(file_path, "w") as f:
f.write(doc.content)
file_metas = [d.meta for d in docs]
### Run experiments
# In this experiment we evaluate extractive QA pipelines with two different retrievers on the evaluation set given the corpus:
# **BM25Retriever vs. EmbeddingRetriever**
# Helper function to create the query and index pipelines
def create_pipelines(document_store, preprocessor, retriever, reader):
query_pipeline = Pipeline()
query_pipeline.add_node(component=retriever, inputs=["Query"], name="Retriever")
query_pipeline.add_node(component=reader, inputs=["Retriever"], name="Reader")
index_pipeline = Pipeline()
index_pipeline.add_node(component=TextConverter(), inputs=["File"], name="TextConverter")
index_pipeline.add_node(component=preprocessor, inputs=["TextConverter"], name="Preprocessor")
index_pipeline.add_node(component=retriever, inputs=["Preprocessor"], name="Retriever")
index_pipeline.add_node(component=document_store, inputs=["Retriever"], name="DocumentStore")
return query_pipeline, index_pipeline
# Name of the experiment in MLflow
EXPERIMENT_NAME = "haystack-tutorial-5"
#### Run using BM25Retriever
document_store = ElasticsearchDocumentStore(index="sparse_index", recreate_index=True)
preprocessor = PreProcessor(
split_length=200,
split_overlap=0,
split_respect_sentence_boundary=False,
clean_empty_lines=False,
clean_whitespace=False,
)
es_retriever = BM25Retriever(document_store=document_store)
reader = FARMReader("deepset/roberta-base-squad2", top_k=3, return_no_answer=True, batch_size=8)
query_pipeline, index_pipeline = create_pipelines(document_store, preprocessor, es_retriever, reader)
sparse_eval_result = Pipeline.execute_eval_run(
index_pipeline=index_pipeline,
query_pipeline=query_pipeline,
evaluation_set_labels=evaluation_set_labels,
corpus_file_paths=file_paths,
corpus_file_metas=file_metas,
experiment_name=EXPERIMENT_NAME,
experiment_run_name="sparse",
corpus_meta={"name": "nq_dev_subset_v2.json"},
evaluation_set_meta={"name": "nq_dev_subset_v2.json"},
pipeline_meta={"name": "sparse-pipeline"},
add_isolated_node_eval=True,
experiment_tracking_tool="mlflow",
experiment_tracking_uri="https://public-mlflow.deepset.ai",
reuse_index=True,
)
#### Run using EmbeddingRetriever
document_store = ElasticsearchDocumentStore(index="dense_index", recreate_index=True)
emb_retriever = EmbeddingRetriever(
document_store=document_store,
model_format="sentence_transformers",
embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
batch_size=8,
)
query_pipeline, index_pipeline = create_pipelines(document_store, preprocessor, emb_retriever, reader)
dense_eval_result = Pipeline.execute_eval_run(
index_pipeline=index_pipeline,
query_pipeline=query_pipeline,
evaluation_set_labels=evaluation_set_labels,
corpus_file_paths=file_paths,
corpus_file_metas=file_metas,
experiment_name=EXPERIMENT_NAME,
experiment_run_name="embedding",
corpus_meta={"name": "nq_dev_subset_v2.json"},
evaluation_set_meta={"name": "nq_dev_subset_v2.json"},
pipeline_meta={"name": "embedding-pipeline"},
add_isolated_node_eval=True,
experiment_tracking_tool="mlflow",
experiment_tracking_uri="https://public-mlflow.deepset.ai",
reuse_index=True,
answer_scope="context",
)
# You can now open MLflow (e.g. https://public-mlflow.deepset.ai/ if you used the public instance hosted by deepset) and look for the "haystack-tutorial-5" experiment.
# Try out MLflow's compare function and have fun...
#
# Note that on our public MLflow instance we are not able to log artifacts like the evaluation results or the pipelines.yaml file.
## Evaluation of Individual Components
# Sometimes you might want to evaluate individual components,
# for example, if you don't have a pipeline but only a retriever or a reader with a model that you trained yourself.
# Evaluate Retriever on its own
# Here we evaluate only the retriever, based on whether the gold_label document is retrieved.
# Note that no_answer samples are omitted when evaluation is performed with this method
retriever_eval_results = retriever.eval(top_k=5, label_index=label_index, doc_index=doc_index)
## Retriever Recall is the proportion of questions for which the correct document containing the answer is
## among the retrieved documents
print("Retriever Recall:", retriever_eval_results["recall"])
## Retriever Mean Avg Precision rewards retrievers that give relevant documents a higher rank
print("Retriever Mean Avg Precision:", retriever_eval_results["map"])
# Just as a sanity check, we can compare the recall from `retriever.eval()` with the multi-hit recall from `pipeline.eval(add_isolated_node_eval=True)`.
# These two recall metrics are only comparable because we filtered out no_answer samples when generating eval_labels and because we set document_scope to `"document_id"`.
# By default, `calculate_metrics()` has document_scope set to `"document_id_or_answer"`, which counts documents as relevant if they either match the gold document ID or contain the answer.
metrics = eval_result_with_upper_bounds.calculate_metrics(document_scope="document_id")
print(metrics["Retriever"]["recall_multi_hit"])
# Evaluate Reader on its own
# Here we evaluate only the reader in a closed-domain fashion, i.e. the reader is given one query
# and its corresponding relevant document, and metrics are calculated on whether the model selects the right
# position in this text as the answer span (i.e. SQuAD style)
reader_eval_results = reader.eval(document_store=document_store, label_index=label_index, doc_index=doc_index)
top_n = reader_eval_results["top_n"]
# Evaluation of the Reader can also be done directly on a SQuAD-formatted file without passing the data to Elasticsearch
# reader_eval_results = reader.eval_on_file("data/tutorial5", "nq_dev_subset_v2.json")
# Reader Top-N-Accuracy is the proportion of predicted answers that match their corresponding correct answers, including no_answers
print(f"Reader Top-{top_n}-Accuracy:", reader_eval_results["top_n_accuracy"])
# Reader Top-1-Exact Match is the proportion of questions where the first predicted answer is exactly the same as the correct answer including no_answers
print("Reader Top-1-Exact Match:", reader_eval_results["EM"])
# Reader Top-1-F1-Score is the average overlap between the first predicted answers and the correct answers including no_answers
print("Reader Top-1-F1-Score:", reader_eval_results["f1"])
# Reader Top-N-Accuracy is the proportion of predicted answers that match their corresponding correct answers, excluding no_answers
print(f"Reader Top-{top_n}-Accuracy (without no_answers):", reader_eval_results["top_n_accuracy_text_answer"])
# Reader Top-N-Exact Match is the proportion of questions where the predicted answer within the first n results is exactly the same as the correct answer excluding no_answers (no_answers are always present within top n).
print(f"Reader Top-{top_n}-Exact Match (without no_answers):", reader_eval_results["top_n_EM_text_answer"])
# Reader Top-N-F1-Score is the average overlap between the top n predicted answers and the correct answers excluding no_answers (no_answers are always present within top n).
print(f"Reader Top-{top_n}-F1-Score (without no_answers):", reader_eval_results["top_n_f1_text_answer"])
# Just as a sanity check, we can compare the top-n exact_match and f1 metrics from `reader.eval()` with the exact_match and f1 from `pipeline.eval(add_isolated_node_eval=True)`.
# These two approaches return the same values because pipeline.eval() calculates top-n metrics by default.
# Small discrepancies might occur due to string normalization in pipeline.eval()'s answer-to-label comparison;
# reader.eval() does not use string normalization.
metrics = eval_result_with_upper_bounds.calculate_metrics(eval_mode="isolated")
print(metrics["Reader"]["exact_match"])
print(metrics["Reader"]["f1"])
if __name__ == "__main__":
tutorial5_evaluation()
# This Haystack script was made with love by deepset in Berlin, Germany
# Haystack: https://github.com/deepset-ai/haystack
# deepset: https://deepset.ai/