
Docs V0.10 #164

Merged (7 commits) Sep 23, 2021
Add evaluation guide
brandenchan committed Sep 21, 2021
commit e944e76fd816aa0462fba984ae8d93ae734b2c69
92 changes: 92 additions & 0 deletions docs/latest/guides/evaluation.mdx
@@ -0,0 +1,92 @@
# Evaluation

Haystack has all the tools needed to evaluate Retrievers, Readers, and Generators in both
open-domain and closed-domain modes.
Evaluation and the metrics it generates are vital for:
- judging how well your system is performing in a given domain
- comparing the performance of different models
- identifying underperforming components in your pipeline

<div className="max-w-xl bg-yellow-light-theme border-l-8 border-yellow-dark-theme px-6 pt-6 pb-4 my-4 rounded-md dark:bg-yellow-900">

**Tutorial:** This documentation page is meant to give an in-depth understanding of the concepts involved in evaluation.
To get started using Haystack for evaluation, we recommend having a look at our [evaluation tutorial](/tutorials/evaluation).

</div>

## Datasets

Annotated datasets are crucial for evaluating the retrieval as well as the question answering capabilities of your system.
Haystack is designed to work with question answering datasets that follow the SQuAD format.
Please check out our [annotation tool](/guides/annotation) if you're interested in creating your own dataset.

<div className="max-w-xl bg-yellow-light-theme border-l-8 border-yellow-dark-theme px-6 pt-6 pb-4 my-4 rounded-md dark:bg-yellow-900">

**Data Tool:** Have a look at our `SquadData` object in `haystack/squad_data.py` if you'd like to manipulate SQuAD-style data using Pandas dataframes.

</div>
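
If you want to inspect such a dataset programmatically, the sketch below shows the general idea.
It assumes that `SquadData` offers a `from_file` loader and exposes its contents as a Pandas dataframe (here via a `df` attribute);
these names, like the column names in the example, are assumptions, so check `haystack/squad_data.py` for the exact interface.

```python
# Minimal sketch: loading SQuAD-style annotations for inspection.
# Assumes SquadData.from_file() and a .df dataframe attribute exist;
# see haystack/squad_data.py for the actual interface.
from haystack.squad_data import SquadData

squad = SquadData.from_file("data/my_annotations.json")  # hypothetical path

# Work with the underlying Pandas dataframe, e.g. count questions per document
# title (column names are assumptions based on the SQuAD format).
df = squad.df
print(df.groupby("title")["question"].nunique())
```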

## Open vs Closed Domain

There are two evaluation modes, known as **open domain** and **closed domain**.

**Closed domain** means single-document QA.
In this setting, you want to make sure the correct instance of a string is highlighted as the answer,
so you compare the indices of the predicted answer against those of the labeled answer.
Even if the two strings have identical content, if they occur in different documents,
or in different positions in the same document, they count as wrong.

**Open domain** means multiple-document QA (typically over the entire database).
Here, you only look for a match or overlap between the two answer strings.
Even if the predicted answer is extracted from a different position than the correct answer,
that's fine as long as the strings match.
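
The difference boils down to what counts as a correct prediction. The snippet below is an illustration only, not Haystack code; the dictionary keys are made up for the example.

```python
# Illustration only (not the Haystack API): how a single prediction is judged
# in closed-domain vs. open-domain evaluation.

def closed_domain_match(pred: dict, label: dict) -> bool:
    # The answer must come from the same document AND the same character span.
    return (
        pred["document_id"] == label["document_id"]
        and pred["offset_start"] == label["offset_start"]
        and pred["offset_end"] == label["offset_end"]
    )

def open_domain_match(pred: dict, label: dict) -> bool:
    # Only the answer strings need to match (here: case-insensitive equality).
    return pred["answer"].strip().lower() == label["answer"].strip().lower()
```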

## Metrics: Retrieval

### Recall

Recall measures how many times the correct document was among the retrieved documents over a set of queries.
For a single query, the output is binary: either the correct document is contained in the selection, or it is not.
Over the entire dataset, the recall score amounts to a number between zero (no query retrieved the right document) and one (all queries retrieved the right documents).

Note that recall is affected by the number of documents that the Retriever returns.
If the Retriever returns only one or a few documents, retrieving the correct documents is a tougher task.
Make sure to set the Retriever's `top_k` to an appropriate value and to also define the `top_k` in `Retriever.eval()` or `EvalDocuments`.
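
As a reference point, recall over a set of queries can be computed as follows. This is plain Python, independent of the Haystack API.

```python
# Recall over a set of queries: the fraction of queries for which at least one
# relevant document appears among the retrieved documents.
def recall(retrieved_per_query, relevant_per_query):
    hits = sum(
        1
        for retrieved, relevant in zip(retrieved_per_query, relevant_per_query)
        if any(doc_id in relevant for doc_id in retrieved)
    )
    return hits / len(retrieved_per_query)

# Two of three queries retrieved a correct document -> recall = 0.67
print(recall([["d1", "d2"], ["d5"], ["d9", "d3"]], [{"d2"}, {"d4"}, {"d3"}]))
```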

### Mean Reciprocal Rank (MRR)

In contrast to the recall metric, mean reciprocal rank takes the position of the top correctly retrieved document (the “rank”) into account.
It does this to account for the fact that a query elicits multiple responses of varying relevance.
Like recall, MRR can be a value between zero (no matches) and one (the system retrieved a correct document for all queries as the top result).
For more details, check out [this page](https://en.wikipedia.org/wiki/Mean_reciprocal_rank).
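
In plain Python, MRR can be computed like this (no Haystack API involved).

```python
# Mean reciprocal rank: the average of 1 / rank of the first relevant document
# per query, counting 0 if no relevant document was retrieved at all.
def mean_reciprocal_rank(retrieved_per_query, relevant_per_query):
    total = 0.0
    for retrieved, relevant in zip(retrieved_per_query, relevant_per_query):
        for rank, doc_id in enumerate(retrieved, start=1):
            if doc_id in relevant:
                total += 1.0 / rank
                break
    return total / len(retrieved_per_query)

# First query: correct doc at rank 2 -> 0.5; second query: rank 1 -> 1.0; MRR = 0.75
print(mean_reciprocal_rank([["d1", "d2"], ["d3"]], [{"d2"}, {"d3"}]))
```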

### Mean Average Precision (mAP)

Mean average precision is similar to mean reciprocal rank but takes into account the position of every correctly retrieved document.
Like MRR, mAP can be a value between zero (no matches) and one (the system retrieved correct documents for all top results).
mAP is particularly useful in cases where there is more than one correct document to be retrieved.
For more details, check out [this page](https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Mean_average_precision).
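
The plain-Python sketch below shows one common way of computing it: average the precision at each rank where a relevant document appears, normalize by the number of relevant documents, then average over queries.

```python
# Mean average precision: for each query, average the precision at every rank
# at which a relevant document appears, then average that over all queries.
def mean_average_precision(retrieved_per_query, relevant_per_query):
    ap_scores = []
    for retrieved, relevant in zip(retrieved_per_query, relevant_per_query):
        hits, precisions = 0, []
        for rank, doc_id in enumerate(retrieved, start=1):
            if doc_id in relevant:
                hits += 1
                precisions.append(hits / rank)
        ap_scores.append(sum(precisions) / len(relevant) if relevant else 0.0)
    return sum(ap_scores) / len(ap_scores)

# Relevant docs d1 and d3 found at ranks 1 and 3 -> AP = (1/1 + 2/3) / 2 ≈ 0.83
print(mean_average_precision([["d1", "d2", "d3"]], [{"d1", "d3"}]))
```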


## Metrics: Question Answering

### Exact Match (EM)

Exact match measures the proportion of cases where the predicted answer is identical to the correct answer.
For example, for the annotated question-answer pair "What is Haystack?" + "A question answering library in Python",
even a predicted answer like "A Python question answering library" would yield a zero score because it does not match the expected answer 100 percent.
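
A simplified version of the computation looks like this; SQuAD-style implementations usually also strip punctuation and articles before comparing.

```python
# Exact match over a dataset: the share of predictions that are identical to the
# gold answer after light normalization (lowercasing and whitespace collapsing).
def exact_match(predictions, gold_answers):
    def normalize(text):
        return " ".join(text.lower().split())

    matches = sum(
        normalize(pred) == normalize(gold)
        for pred, gold in zip(predictions, gold_answers)
    )
    return matches / len(predictions)

# The word order differs, so this pair scores 0.0
print(exact_match(["A Python question answering library"],
                  ["A question answering library in Python"]))
```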

### F1

The F1 score is more forgiving and measures the word overlap between the labeled and the predicted answer.
Whenever the EM is 1, F1 will also be 1.
To learn more about the F1 score, check out this guide.
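
The SQuAD-style token-level F1 can be sketched as follows.

```python
# Token-level F1: the harmonic mean of precision and recall over the tokens
# shared between the predicted and the gold answer.
from collections import Counter

def f1_score(prediction, gold):
    pred_tokens = prediction.lower().split()
    gold_tokens = gold.lower().split()
    common = Counter(pred_tokens) & Counter(gold_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)

# All five predicted tokens also occur in the six-token gold answer -> F1 ≈ 0.91
print(f1_score("A Python question answering library",
               "A question answering library in Python"))
```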

### Semantic Answer Similarity (SAS)

Semantic Answer Similarity uses a transformer-based cross-encoder architecture to evaluate the semantic similarity of two answers rather than their lexical overlap.
While F1 and EM would both score "one hundred percent" as sharing zero similarity with "100 %", SAS is trained to assign this pair a high score.

SAS is particularly useful for identifying cases where F1 doesn't give a good indication of the validity of a predicted answer.

You can read more about SAS in [this paper](https://arxiv.org/abs/2108.06130).
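
The idea can be sketched with the `sentence-transformers` library, where a cross-encoder scores an answer pair directly. The checkpoint name below is an illustrative assumption, not necessarily the model used by Haystack or the paper.

```python
# Sketch of cross-encoder answer scoring in the spirit of SAS.
# Assumes sentence-transformers is installed; the checkpoint is an
# illustrative choice, not necessarily the one Haystack uses.
from sentence_transformers import CrossEncoder

model = CrossEncoder("cross-encoder/stsb-roberta-large")

# Lexical overlap is zero, but the semantic similarity score should be high.
score = model.predict([("one hundred percent", "100 %")])
print(score)
```
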
46 changes: 13 additions & 33 deletions docs/latest/menu.json
@@ -39,6 +39,7 @@
{"slug": "languages", "title": "Languages Other Than English"},
{"slug": "domain-adaptation","title": "Domain Adaptation"},
{"slug": "optimization", "title": "Optimization"},
{"slug": "evaluation", "title": "Evaluation"},
{"slug": "annotation", "title": "Annotation Tool"},
{"slug": "rest-api", "title": "REST API"},
{"slug": "chatbots", "title": "Chatbot Integration"}
@@ -107,43 +108,22 @@
"subMenuTitle": "API Reference",
"pathPrefix": "/reference/",
"items": [
{
"slug": "document-store",
"title": "Document Store"
},
{"slug": "document-store", "title": "Document Store"},
{ "slug": "retriever", "title": "Retriever" },
{ "slug": "reader", "title": "Reader" },
{ "slug": "generator", "title": "Generator" },
{
"slug": "summarizer",
"title": "Summarizer"
},
{
"slug": "translator",
"title": "Translator"
},
{
"slug": "preprocessor",
"title": "Preprocessor"
},
{
"slug": "file-converters",
"title": "File Converters"
},
{ "slug": "crawler", "title": "Crawler" },
{
"slug": "evaluation",
"title": "Evaluation"
},
{"slug": "summarizer", "title": "Summarizer"},
{"slug": "translator", "title": "Translator"},
{"slug": "preprocessor", "title": "Preprocessor"},
{"slug": "file-converters", "title": "File Converters"},
{"slug": "crawler", "title": "Crawler" },
{"slug": "evaluation", "title": "Evaluation"},
{ "slug": "pipelines", "title": "Pipelines" },
{
"slug": "knowledge-graph",
"title": "Knowledge Graph"
},
{
"slug": "graph-retriever",
"title": "Graph Retriever"
}
{"slug": "knowledge-graph", "title": "Knowledge Graph"},
{"slug": "graph-retriever", "title": "Graph Retriever"},
{"slug": "classifier", "title": "Classifier"},
{"slug": "question-generator", "title": "Question Generator"},
{"slug": "ranker", "title": "Ranker"}
]
}
]
15 changes: 15 additions & 0 deletions lib/constants.ts
@@ -75,6 +75,21 @@ export const referenceFiles: Meta = {
filename: "graph_retriever.md",
title: "Graph Retriever",
},
{
slug: "question-generator",
filename: "question_generator.md",
title: "Question Generator",
},
{
slug: "classifier",
filename: "classifier.md",
title: "Classifier",
},
{
slug: "ranker",
filename: "ranker.md",
title: "Ranker",
},
],
};
