diff --git a/docs/latest/components/classifier.mdx b/docs/latest/components/classifier.mdx new file mode 100644 index 000000000..55c793f40 --- /dev/null +++ b/docs/latest/components/classifier.mdx @@ -0,0 +1,43 @@ +# Classifier + +The Classifier Node is a transformer-based classification model used to create predictions that can be attached to retrieved documents as metadata. +For example, by using a sentiment model, you can label each document as being either positive or negative in sentiment. +Through a tight integration with the HuggingFace model hub, you can load any classification model simply by supplying the model name. + +![image](/img/classifier.png) + +
+ +Note that the Classifier is different from the Query Classifier. +While the Query Classifier categorizes incoming queries in order to route them to different parts of the pipeline, +the Classifier is used to create classification labels that can be attached to retrieved documents as metadata. + +
+ +## Usage + +Initialize it as follows: + +``` python +from haystack.classifier import FARMClassifier + +classifier_model = 'textattack/bert-base-uncased-imdb' +classifier = FARMClassifier(model_name_or_path=classifier_model) +``` + +It can be slotted into a pipeline as follows: + +``` python +from haystack.pipeline import Pipeline + +pipeline = Pipeline() +pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"]) +pipeline.add_node(component=classifier, name='Classifier', inputs=['Retriever']) +``` + +It can also be run in isolation: + +``` python +documents = classifier.predict( + query="", + documents=[doc1, doc2, doc3, ...] +) +``` \ No newline at end of file diff --git a/docs/latest/components/document_store.mdx b/docs/latest/components/document_store.mdx index c9b618662..91e4e7fe9 100644 --- a/docs/latest/components/document_store.mdx +++ b/docs/latest/components/document_store.mdx @@ -29,12 +29,48 @@ docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2 Next you can initialize the Haystack object that will connect to this instance. ```python +from haystack.document_store import ElasticsearchDocumentStore + document_store = ElasticsearchDocumentStore() ``` -Note that we also support [Open Distro for Elasticsearch](https://opendistro.github.io/for-elasticsearch-docs/). -Follow [their documentation](https://opendistro.github.io/for-elasticsearch-docs/docs/install/) -to run it and connect to it using Haystack's `OpenDistroElasticsearchDocumentStore` class. +### Open Distro for Elasticsearch + +Learn how to get started [here](https://opendistro.github.io/for-elasticsearch-docs/#get-started). + +If you have Docker set up, we recommend pulling the Docker image and running it. + +```bash +docker pull amazon/opendistro-for-elasticsearch:1.13.2 +docker run -p 9200:9200 -p 9600:9600 -e "discovery.type=single-node" amazon/opendistro-for-elasticsearch:1.13.2 +``` + +Next you can initialize the Haystack object that will connect to this instance. + +```python +from haystack.document_store import OpenDistroElasticsearchDocumentStore + +document_store = OpenDistroElasticsearchDocumentStore() +``` + +### OpenSearch + +Learn how to get started [here](https://opensearch.org/docs/#docker-quickstart). + +If you have Docker set up, we recommend pulling the Docker image and running it. + +```bash +docker pull opensearchproject/opensearch:1.0.1 +docker run -p 9200:9200 -p 9600:9600 -e "discovery.type=single-node" opensearchproject/opensearch:1.0.1 +``` + +Next you can initialize the Haystack object that will connect to this instance. + +```python +from haystack.document_store import OpenSearchDocumentStore + +document_store = OpenSearchDocumentStore() +```
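+Once a store is initialized, the next step is usually to write documents into it. Below is a minimal sketch, assuming the `document_store` object created above and the pre-1.0 dictionary format with `text` and `meta` fields:
+
+```python
+# Hypothetical example documents; any list of dicts with a "text" field (and optional "meta") works.
+docs = [
+    {"text": "Haystack is a framework for building search systems.", "meta": {"name": "intro.txt"}},
+    {"text": "Document Stores hold the texts that Retrievers search over.", "meta": {"name": "stores.txt"}},
+]
+
+# Write the example documents into the store and verify that they arrived.
+document_store.write_documents(docs)
+print(document_store.get_document_count())
+```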
@@ -210,12 +246,34 @@ The Document Stores have different characteristics. You should choose one depend - Fast & accurate sparse retrieval with many tuning options - Basic support for dense retrieval - Production-ready -- Support also for Open Distro **Cons:** - Slow for dense retrieval with more than ~1 million documents +### Open Distro for Elasticsearch + +**Pros:** + +- Fully open source (Apache 2.0 license) +- Essentially the same features as Elasticsearch + +**Cons:** + +- Slow for dense retrieval with more than ~1 million documents + +### OpenSearch + +**Pros:** + +- Fully open source (Apache 2.0 license) +- Essentially the same features as Elasticsearch +- More support for vector similarity comparisons and approximate nearest neighbour algorithms (see the sketch below) + +**Cons:** + +- Not as optimized as dedicated vector similarity options like Milvus and FAISS +
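+To illustrate the vector similarity support mentioned in the pros above, an `OpenSearchDocumentStore` can be paired with a dense retriever and then populated with embeddings. This is only a minimal sketch, assuming pre-1.0 import paths, default connection settings and a local OpenSearch instance started as shown further up this page:
+
+```python
+from haystack.document_store import OpenSearchDocumentStore
+from haystack.retriever.dense import DensePassageRetriever
+
+document_store = OpenSearchDocumentStore()
+retriever = DensePassageRetriever(document_store=document_store)
+
+# Compute embeddings for all indexed documents so that vector similarity
+# queries can be answered directly by the store.
+document_store.update_embeddings(retriever)
+```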
### Milvus diff --git a/docs/latest/components/generator.mdx index a0b1c639a..73bab5c44 100644 --- a/docs/latest/components/generator.mdx +++ b/docs/latest/components/generator.mdx @@ -2,27 +2,61 @@ While extractive QA highlights the span of text that answers a query, generative QA can return a novel text answer that it has composed. + The best current approaches, such as [Retrieval-Augmented Generation](https://arxiv.org/abs/2005.11401) and [LFQA](https://yjernite.github.io/lfqa.html), can draw upon both the knowledge they gained during language model pretraining (parametric memory) and the passages provided to them by a retriever (non-parametric memory). + With the advent of Transformer-based retrieval methods such as [Dense Passage Retrieval](https://arxiv.org/abs/2004.04906), retriever and generator can be trained concurrently from a single loss signal.
-**Tutorial** - -Checkout our tutorial notebooks for a guide on how to build your own generative QA system with RAG ([here](/tutorials/retrieval-augmented-generation)) +**Tutorial:** Check out our tutorial notebooks for a guide on how to build your own generative QA system with RAG ([here](/tutorials/retrieval-augmented-generation)) or with LFQA ([here](/tutorials/pipelines)).
-Pros +**Pros** + - More appropriately phrased answers - Able to synthesize information from different texts - Can draw on latent knowledge stored in the language model -Cons +**Cons** - Not easy to track what piece of information the generator is basing its response on + +## Usage + +Initialize a Generator as follows: + +``` python +from haystack.generator.transformers import RAGenerator + +generator = RAGenerator( + model_name_or_path="facebook/rag-sequence-nq", + retriever=dpr_retriever, + top_k=1, + min_length=2 +) +``` + +Running a Generator in a pipeline: + +``` python +from haystack.pipeline import GenerativeQAPipeline + +pipeline = GenerativeQAPipeline(generator=generator, retriever=dpr_retriever) +result = pipeline.run(query='What are the best party games for adults?', top_k_retriever=20) +``` + +Running a stand-alone Generator: + +``` python +result = generator.predict( + query='What are the best party games for adults?', + documents=[doc1, doc2, doc3, ...], + top_k=top_k +) +``` diff --git a/docs/latest/components/preprocessing.mdx index 4043b33f6..3c0c4acd1 100644 --- a/docs/latest/components/preprocessing.mdx +++ b/docs/latest/components/preprocessing.mdx @@ -50,6 +50,17 @@ Please refer to [the API docs](/reference/file-converters) to see which converte valid_languages=["de","en"]) doc = converter.convert(file_path=file, meta=None) + + # Alternatively, if you have PDFs containing images, Haystack uses Tesseract under the hood to OCR them. + + + from haystack.file_converter import PDFToTextOCRConverter + + + converter = PDFToTextOCRConverter(remove_numeric_tables=False, + valid_languages=["deu","eng"]) + + doc = converter.convert(file_path=file, meta=None) ), }, @@ -71,7 +82,7 @@ Please refer to [the API docs](/reference/file-converters) to see which converte content: (

- Haystack also has a`convert_files_to_dicts()` utility function that + Haystack also has a `convert_files_to_dicts()` utility function that will convert all txt or pdf files in a given folder into this dictionary format.

@@ -84,6 +95,26 @@ Please refer to [the API docs](/reference/file-converters) to see which converte
), }, + { + title: "Image", + content: ( +
+

+ Haystack supports extraction of text from images using OCR. +

+
+            
+              from haystack.file_converter import ImageToTextConverter
+            
+            
+            converter = ImageToTextConverter(remove_numeric_tables=True,
+            valid_languages=["de","en"])
+          
+          doc = converter.convert(file_path=file, meta=None)
+          
+
+ ), + }, ]} /> diff --git a/docs/latest/components/ready_made_pipelines.mdx b/docs/latest/components/ready_made_pipelines.mdx index 8cd1438bf..c2a650773 100644 --- a/docs/latest/components/ready_made_pipelines.mdx +++ b/docs/latest/components/ready_made_pipelines.mdx @@ -43,7 +43,7 @@ We typically pass the output of the Retriever to another component such as the R `DocumentSearchPipeline` wraps the [Retriever](/components/retriever) into a pipeline. Note that this wrapper does not endow the Retrievers with additional functionality but instead allows them to be used consistently with other Haystack Pipeline objects and with the same familiar syntax. Creating this pipeline is as simple as passing the Retriever into the pipeline’s constructor: -```python +``` python pipeline = DocumentSearchPipeline(retriever=retriever) query = "Tell me something about that time when they play chess." @@ -128,7 +128,7 @@ result = pipeline.run(query=query, params={"retriever": {"top_k": 10}, "reader": You may access the answer and other information like the model’s confidence and original context via the `answers` key, in this manner: -```python +``` python result["answers"] >>> [{'answer': 'der Klang der Musik', 'score': 9.269367218017578, @@ -209,4 +209,33 @@ Output: ], ... } +``` + +## MostSimilarDocumentsPipeline + +This pipeline is used to find the most similar documents to a given document in your document store. + +You will need to first make sure that your indexed documents have attached embeddings. +You can generate and store their embeddings using the `DocumentStore.update_embeddings()` method. + +``` python +from haystack.pipeline import MostSimilarDocumentsPipeline + +msd_pipeline = MostSimilarDocumentsPipeline(document_store) +result = msd_pipeline.run(document_ids=[doc_id1, doc_id2, ...]) +print(result) +``` + +Output: + +``` python +[[ + {'text': "Southern California's economy is diver...", + 'score': 0.8605178832348279, + 'question': None, + 'meta': {'name': 'Southern_California'}, + 'embedding': ..., + 'id': '6e26b1b78c48efc6dd6c888e72d0970b'}, + ... +]] ``` \ No newline at end of file diff --git a/docs/latest/guides/evaluation.mdx b/docs/latest/guides/evaluation.mdx new file mode 100644 index 000000000..ea93dbf8e --- /dev/null +++ b/docs/latest/guides/evaluation.mdx @@ -0,0 +1,92 @@ +# Evaluation + +Haystack has all the tools needed to evaluate Retrievers, Readers and Generators in both +open domain and closed domain modes. +Evaluation and the metrics that it generates are vital for: +- judging how well your system is performing on a given domain. +- comparing the performance of different models +- identifying underperforming components in your pipeline + +
+ +**Tutorial:** This documentation page is meant to give an in-depth understanding of the concepts involved in evaluation. +To get started using Haystack for evaluation, we recommend having a look at our [evaluation tutorial](/tutorials/evaluation). + +
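+As a preview of what the tutorial covers, evaluation nodes can be added to a pipeline right after the components they measure. Below is a minimal sketch, assuming the pre-1.0 `EvalDocuments` and `EvalAnswers` nodes, an already initialized `retriever` and `reader`, and `print()` helpers on the eval nodes (check the evaluation API reference for the exact interface):
+
+``` python
+from haystack.pipeline import Pipeline
+from haystack.eval import EvalDocuments, EvalAnswers
+
+# Each eval node accumulates metrics for the component directly before it.
+eval_retriever = EvalDocuments()
+eval_reader = EvalAnswers()
+
+pipeline = Pipeline()
+pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
+pipeline.add_node(component=eval_retriever, name="EvalDocuments", inputs=["Retriever"])
+pipeline.add_node(component=reader, name="Reader", inputs=["EvalDocuments"])
+pipeline.add_node(component=eval_reader, name="EvalAnswers", inputs=["Reader"])
+
+# After running the pipeline over a set of annotated queries and labels,
+# each eval node can report its aggregated metrics (e.g. recall, EM, F1).
+eval_retriever.print()
+eval_reader.print()
+```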
+ +## Open vs Closed Domain + +There are two evaluation modes known as **open domain** and **closed domain**. + +**Closed domain** means single-document QA. +In this setting, you want to make sure the correct instance of a string is highlighted as the answer. +So you compare the positions of the predicted answers against those of the labeled answers. +Even if two strings have identical content, they count as wrong if they occur in different documents +or at different positions in the same document. + +**Open domain** means multiple-document QA (typically over the entire database). +Here, you only look for a match or overlap between the two answer strings. +Even if the predicted answer is extracted from a different position than the correct answer, +that's fine as long as the strings match. + +## Metrics: Retrieval + +### Recall + +Recall measures how many times the correct document was among the retrieved documents over a set of queries. +For a single query, the output is binary: either the correct document is contained in the selection, or it is not. +Over the entire dataset, the recall score amounts to a number between zero (no query retrieved the right document) and one (all queries retrieved the right documents). + +Note that recall is affected by the number of documents that the retriever returns. +If the retriever returns only one or a few documents, it is a tougher task to retrieve the correct documents. +Make sure to set the Retriever's `top_k` to an appropriate value and to also define the `top_k` in `Retriever.eval()` or `EvalDocuments`. + +### Mean Reciprocal Rank (MRR) + +In contrast to the recall metric, mean reciprocal rank takes the position of the top correctly retrieved document (the “rank”) into account. +It does this to account for the fact that a query elicits multiple responses of varying relevance. +Like recall, MRR can be a value between zero (no matches) and one (the system retrieved a correct document for all queries as the top result). +For more details, check out [this page](https://en.wikipedia.org/wiki/Mean_reciprocal_rank). + +### Mean Average Precision (mAP) + +Mean average precision is similar to mean reciprocal rank but takes into account the position of every correctly retrieved document. +Like MRR, mAP can be a value between zero (no matches) and one (the system retrieved correct documents for all top results). +mAP is particularly useful in cases where there is more than one correct document to be retrieved. +For more details, check out [this page](https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Mean_average_precision). + + +## Metrics: Question Answering + +### Exact Match (EM) + +Exact match measures the proportion of cases where the predicted answer is identical to the correct answer. +For example, for the annotated question-answer pair "What is Haystack?" + "A question answering library in Python", +even a predicted answer like "A Python question answering library" would yield a zero score because it does not match the expected answer 100 percent. + +### F1 + +The F1 score is more forgiving and measures the word overlap between the labeled and the predicted answer. +Whenever the EM is 1, F1 will also be 1. +To learn more about the F1 score, check out this guide. + +### Semantic Answer Similarity (SAS) + +Semantic Answer Similarity uses a transformer-based cross-encoder architecture to evaluate the semantic similarity of two answers rather than their lexical overlap.
+While F1 and EM would both score "one hundred percent" as sharing zero similarity with "100 %", SAS is trained to assign this pair a high score. + +SAS is particularly useful for seeking out cases where F1 doesn't give a good indication of the validity of a predicted answer. + +You can read more about SAS in [this paper](https://arxiv.org/abs/2108.06130). + +## Datasets + +Annotated datasets are crucial for evaluating the retrieval as well as the question answering capabilities of your system. +Haystack is designed to work with question answering datasets that follow the SQuAD format. +Please check out our [annotation tool](/guides/annotation) if you're interested in creating your own dataset. + +
+ +**Data Tool:** Have a look at our `SquadData` object in `haystack/squad_data.py` if you'd like to manipulate SQuAD-style data using Pandas dataframes (see the sketch below). + +
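+For instance, a SQuAD-format file could be loaded and inspected roughly as follows. This is only a sketch: the `from_file()` classmethod and the `df` attribute are assumptions based on the description above, so check `haystack/squad_data.py` for the exact interface.
+
+``` python
+from haystack.squad_data import SquadData
+
+# "my_annotations.json" is a placeholder for your own SQuAD-format file.
+squad = SquadData.from_file("my_annotations.json")
+
+# The underlying Pandas dataframe can be filtered, sampled and inspected
+# like any other dataframe.
+df = squad.df
+print(df.head())
+```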
\ No newline at end of file diff --git a/docs/latest/menu.json b/docs/latest/menu.json index fbdbefc34..4b8768109 100644 --- a/docs/latest/menu.json +++ b/docs/latest/menu.json @@ -26,6 +26,7 @@ {"slug": "reader", "title": "Reader"}, {"slug": "generator", "title": "Generator" }, {"slug": "summarizer", "title": "Summarizer"}, + {"slug": "classifier", "title": "Classifier"}, {"slug": "translator", "title": "Translator"}, {"slug": "knowledge-graph", "title": "Knowledge Graph"}, {"slug": "ranker", "title": "Ranker"}, @@ -39,6 +40,7 @@ {"slug": "languages", "title": "Languages Other Than English"}, {"slug": "domain-adaptation","title": "Domain Adaptation"}, {"slug": "optimization", "title": "Optimization"}, + {"slug": "evaluation", "title": "Evaluation"}, {"slug": "annotation", "title": "Annotation Tool"}, {"slug": "rest-api", "title": "REST API"}, {"slug": "chatbots", "title": "Chatbot Integration"} @@ -107,43 +109,22 @@ "subMenuTitle": "API Reference", "pathPrefix": "/reference/", "items": [ - { - "slug": "document-store", - "title": "Document Store" - }, + {"slug": "document-store", "title": "Document Store"}, { "slug": "retriever", "title": "Retriever" }, { "slug": "reader", "title": "Reader" }, { "slug": "generator", "title": "Generator" }, - { - "slug": "summarizer", - "title": "Summarizer" - }, - { - "slug": "translator", - "title": "Translator" - }, - { - "slug": "preprocessor", - "title": "Preprocessor" - }, - { - "slug": "file-converters", - "title": "File Converters" - }, - { "slug": "crawler", "title": "Crawler" }, - { - "slug": "evaluation", - "title": "Evaluation" - }, + {"slug": "summarizer", "title": "Summarizer"}, + {"slug": "translator", "title": "Translator"}, + {"slug": "preprocessor", "title": "Preprocessor"}, + {"slug": "file-converters", "title": "File Converters"}, + {"slug": "crawler", "title": "Crawler" }, + {"slug": "evaluation", "title": "Evaluation"}, { "slug": "pipelines", "title": "Pipelines" }, - { - "slug": "knowledge-graph", - "title": "Knowledge Graph" - }, - { - "slug": "graph-retriever", - "title": "Graph Retriever" - } + {"slug": "knowledge-graph", "title": "Knowledge Graph"}, + {"slug": "graph-retriever", "title": "Graph Retriever"}, + {"slug": "classifier", "title": "Classifier"}, + {"slug": "question-generator", "title": "Question Generator"}, + {"slug": "ranker", "title": "Ranker"} ] } ] diff --git a/lib/constants.ts b/lib/constants.ts index 2b5f01b6c..8d6257411 100644 --- a/lib/constants.ts +++ b/lib/constants.ts @@ -75,6 +75,21 @@ export const referenceFiles: Meta = { filename: "graph_retriever.md", title: "Graph Retriever", }, + { + slug: "question-generator", + filename: "question_generator.md", + title: "Question Generator", + }, + { + slug: "classifier", + filename: "classifier.md", + title: "Classifier", + }, + { + slug: "ranker", + filename: "ranker.md", + title: "Ranker", + }, ], }; diff --git a/public/img/classifier.png b/public/img/classifier.png new file mode 100644 index 000000000..6897f5d7a Binary files /dev/null and b/public/img/classifier.png differ