From d2689386c70d6d7a9bc65fbb156807ed29454547 Mon Sep 17 00:00:00 2001 From: Markus Paff Date: Thu, 23 Sep 2021 16:32:14 +0200 Subject: [PATCH] New docs version (#159) * new docs version * add version manually for testing * add more information to read.me * documentstores as dropdown * dorpdown for pros and cons documentstore * fix link * missing ) * point again to haystack master --- README.md | 5 +- benchmarks/latest/map/retriever_map.json | 40 ++ .../performance/retriever_performance.json | 14 + benchmarks/latest/speed/retriever_speed.json | 40 ++ benchmarks/v0.10.0/map/retriever_map.json | 204 +++++++ .../performance/reader_performance.json | 44 ++ .../performance/retriever_performance.json | 88 +++ benchmarks/v0.10.0/speed/retriever_speed.json | 204 +++++++ components/VersionSelect.tsx | 2 + docs/latest/components/document_store.mdx | 570 +++++++++--------- docs/v0.10.0/components/classifier.mdx | 43 ++ docs/v0.10.0/components/document_store.mdx | 385 ++++++++++++ docs/v0.10.0/components/generator.mdx | 62 ++ docs/v0.10.0/components/knowledge_graph.mdx | 107 ++++ docs/v0.10.0/components/pipelines.mdx | 461 ++++++++++++++ docs/v0.10.0/components/preprocessing.mdx | 172 ++++++ docs/v0.10.0/components/query_classifier.mdx | 178 ++++++ .../v0.10.0/components/question_generator.mdx | 69 +++ docs/v0.10.0/components/ranker.mdx | 50 ++ docs/v0.10.0/components/reader.mdx | 357 +++++++++++ .../components/ready_made_pipelines.mdx | 241 ++++++++ docs/v0.10.0/components/retriever.mdx | 286 +++++++++ docs/v0.10.0/components/summarizer.mdx | 47 ++ docs/v0.10.0/components/translator.mdx | 52 ++ docs/v0.10.0/guides/annotation.mdx | 82 +++ docs/v0.10.0/guides/chatbots.mdx | 151 +++++ docs/v0.10.0/guides/domain_adaptation.mdx | 148 +++++ docs/v0.10.0/guides/evaluation.mdx | 92 +++ docs/v0.10.0/guides/languages.mdx | 196 ++++++ docs/v0.10.0/guides/optimization.mdx | 109 ++++ docs/v0.10.0/guides/rest_api.mdx | 183 ++++++ docs/v0.10.0/menu.json | 130 ++++ docs/v0.10.0/overview/faq.mdx | 105 ++++ docs/v0.10.0/overview/get_started.mdx | 209 +++++++ docs/v0.10.0/overview/glossary.mdx | 56 ++ docs/v0.10.0/overview/intro.mdx | 55 ++ docs/v0.10.0/overview/roadmap.mdx | 47 ++ docs/v0.10.0/overview/use_cases.mdx | 52 ++ lib/github.ts | 2 +- lib/utils.ts | 1 + package-lock.json | 393 ++++++++---- package.json | 6 +- pages/overview/[...slug].tsx | 1 + yarn.lock | 97 +-- 44 files changed, 5381 insertions(+), 455 deletions(-) create mode 100644 benchmarks/v0.10.0/map/retriever_map.json create mode 100644 benchmarks/v0.10.0/performance/reader_performance.json create mode 100644 benchmarks/v0.10.0/performance/retriever_performance.json create mode 100644 benchmarks/v0.10.0/speed/retriever_speed.json create mode 100644 docs/v0.10.0/components/classifier.mdx create mode 100644 docs/v0.10.0/components/document_store.mdx create mode 100644 docs/v0.10.0/components/generator.mdx create mode 100644 docs/v0.10.0/components/knowledge_graph.mdx create mode 100644 docs/v0.10.0/components/pipelines.mdx create mode 100644 docs/v0.10.0/components/preprocessing.mdx create mode 100644 docs/v0.10.0/components/query_classifier.mdx create mode 100644 docs/v0.10.0/components/question_generator.mdx create mode 100644 docs/v0.10.0/components/ranker.mdx create mode 100644 docs/v0.10.0/components/reader.mdx create mode 100644 docs/v0.10.0/components/ready_made_pipelines.mdx create mode 100644 docs/v0.10.0/components/retriever.mdx create mode 100644 docs/v0.10.0/components/summarizer.mdx create mode 100644 docs/v0.10.0/components/translator.mdx 
    create mode 100644 docs/v0.10.0/guides/annotation.mdx create mode 100644 docs/v0.10.0/guides/chatbots.mdx create mode 100644 docs/v0.10.0/guides/domain_adaptation.mdx create mode 100644 docs/v0.10.0/guides/evaluation.mdx create mode 100644 docs/v0.10.0/guides/languages.mdx create mode 100644 docs/v0.10.0/guides/optimization.mdx create mode 100644 docs/v0.10.0/guides/rest_api.mdx create mode 100644 docs/v0.10.0/menu.json create mode 100644 docs/v0.10.0/overview/faq.mdx create mode 100644 docs/v0.10.0/overview/get_started.mdx create mode 100644 docs/v0.10.0/overview/glossary.mdx create mode 100644 docs/v0.10.0/overview/intro.mdx create mode 100644 docs/v0.10.0/overview/roadmap.mdx create mode 100644 docs/v0.10.0/overview/use_cases.mdx diff --git a/README.md b/README.md index 65123bf84..ec958b423 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,10 @@ To preview docs that are on a non-master branch of the Haystack repo, you run th ### Updating docs after a release -When there's a new Haystack release, we need to create a directory for the new version within the local `/docs` directory. In this directory, we can write new overview and usage docs in .mdx (or manually copy over the ones from the previous version directory). Once this is done, the project will automatically fetch the reference and tutorial docs for the new version from GitHub. Bear in mind that a `menu.json` file needs to exist in every new version directory so that our Menu components know which page links to display. Additionally, the `referenceFiles` and `tutorialFiles` constants in `lib/constants` need to be updated with any new reference or tutorial docs that get created as part of a new release. **Lastly**, we have to update the constant specified in the `components/VersionSelect` component, so that we default to the new version when navigating between pages. +When there's a new Haystack release, we need to create a directory for the new version within the local `/docs` directory. In this directory, we can write new overview and usage docs in .mdx (or manually copy over the ones from the previous version directory). Once this is done, the project will automatically fetch the reference and tutorial docs for the new version from GitHub. Bear in mind that a `menu.json` file needs to exist in every new version directory so that our Menu components know which page links to display. Moreover, we need to point the links that currently point to the latest version to the new version instead. Currently, we do not have a script for this process. Therefore, you need to use the search function of your IDE. Additionally, the `referenceFiles` and `tutorialFiles` constants in `lib/constants` need to be updated with any new reference or tutorial docs that get created as part of a new release. In the [haystack](https://github.com/deepset-ai/haystack) repo, we have to release the API and tutorial docs by copying them to a new version folder as well. If you want to include files from a branch other than master, follow **Preview from non-master branches**. **Lastly**, we have to update the constant specified in the `components/VersionSelect` component, so that we default to the new version when navigating between pages. +After releasing the docs, we need to release the benchmarks. Create a new version folder in the folder `benchmarks` and copy all folders from `latest` to the new folder. +If you now start the local server and go to the new version, you will see the 404 page. We pull the version from the haystack release tags.
    
Most likely, the newest version is not released yet. Therefore, you have to add it manually to the array `tagNames` in the function `getDocsVersions` by adding the command `tagNames.push('v0.10.0');`. + ## Styling diff --git a/benchmarks/latest/map/retriever_map.json b/benchmarks/latest/map/retriever_map.json index 1d12c90f0..51e0687cf 100644 --- a/benchmarks/latest/map/retriever_map.json +++ b/benchmarks/latest/map/retriever_map.json @@ -159,6 +159,46 @@ "model": "BM25 / Elasticsearch", "n_docs": 1000, "map": 74.20444712972909 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 1000, + "map": 92.95105322830891 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 10000, + "map": 89.8709701490436 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 100000, + "map": 86.54014997282701 + }, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 1000, + "map": 92.76308330349686 + }, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 10000, + "map": 89.00403653862938 + }, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 100000, + "map": 85.7342431384476 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 500000, + "map": 80.85588135082547 + }, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 500000, + "map": 77.5426462347698 } ] } \ No newline at end of file diff --git a/benchmarks/latest/performance/retriever_performance.json b/benchmarks/latest/performance/retriever_performance.json index 166e0b32a..dbb934048 100644 --- a/benchmarks/latest/performance/retriever_performance.json +++ b/benchmarks/latest/performance/retriever_performance.json @@ -69,6 +69,20 @@ "index_speed": 115.61076852516383, "query_speed": 38.80526238789059, "map": 81.63864883662649 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 100000, + "index_speed": 70.05381128388427, + "query_speed": 15.306895223372484, + "map": 86.54014997282701 + }, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 100000, + "index_speed": 70.31004397719536, + "query_speed": 24.95733865947408, + "map": 85.7342431384476 } ] } \ No newline at end of file diff --git a/benchmarks/latest/speed/retriever_speed.json b/benchmarks/latest/speed/retriever_speed.json index 2660ceac4..7877d2a35 100644 --- a/benchmarks/latest/speed/retriever_speed.json +++ b/benchmarks/latest/speed/retriever_speed.json @@ -159,6 +159,46 @@ "model": "BM25 / Elasticsearch", "n_docs": 1000, "query_speed": 282.95914917837337 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 1000, + "query_speed": 29.061163356184426 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 10000, + "query_speed": 24.834414667596725 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 100000, + "query_speed": 15.306895223372484 + }, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 1000, + "query_speed": 29.10621389658101 + }, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 10000, + "query_speed": 26.92417300437131 + }, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 100000, + "query_speed": 24.95733865947408 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 500000, + "query_speed": 11.33271222977541 + }, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 500000, + "query_speed": 24.13921492357397 } ] } \ No newline at end of file diff --git a/benchmarks/v0.10.0/map/retriever_map.json b/benchmarks/v0.10.0/map/retriever_map.json new file mode 100644 index 000000000..51e0687cf --- /dev/null +++ b/benchmarks/v0.10.0/map/retriever_map.json @@ -0,0 +1,204 @@ +{ + "chart_type": "LineChart", + "title": "Retriever 
Accuracy", + "subtitle": "mAP at different number of docs", + "description": "Here you can see how the mean avg. precision (mAP) of the retriever decays as the number of documents increases. The set up is the same as the above querying benchmark except that a varying number of negative documents are used to fill the document store.", + "columns": [ + "n_docs", + "BM25 / Elasticsearch", + "DPR / Elasticsearch", + "DPR / FAISS (flat)", + "DPR / FAISS (HNSW)", + "DPR / Milvus (flat)", + "DPR / Milvus (HNSW)", + "Sentence Transformers / Elasticsearch" + ], + "axis": [ + { + "x": "Number of docs", + "y": "mAP" + } + ], + "data": [ + { + "model": "DPR / Elasticsearch", + "n_docs": 1000, + "map": 92.95105322830891 + }, + { + "model": "DPR / Elasticsearch", + "n_docs": 10000, + "map": 89.87097014904354 + }, + { + "model": "BM25 / Elasticsearch", + "n_docs": 10000, + "map": 66.26543444531747 + }, + { + "model": "Sentence Transformers / Elasticsearch", + "n_docs": 1000, + "map": 90.06638620360428 + }, + { + "model": "Sentence Transformers / Elasticsearch", + "n_docs": 10000, + "map": 87.11255142468549 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 1000, + "map": 92.95105322830891 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 10000, + "map": 89.87097014904354 + }, + { + "model": "DPR / FAISS (HNSW)", + "n_docs": 1000, + "map": 92.95105322830891 + }, + { + "model": "DPR / FAISS (HNSW)", + "n_docs": 10000, + "map": 89.51337675393017 + }, + { + "model": "DPR / Milvus (flat)", + "n_docs": 1000, + "map": 92.95105322830891 + }, + { + "model": "DPR / Milvus (flat)", + "n_docs": 10000, + "map": 89.87097014904354 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 1000, + "map": 92.95105322830891 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 10000, + "map": 88.24421129104469 + }, + { + "model": "DPR / Elasticsearch", + "n_docs": 100000, + "map": 86.54606328368976 + }, + { + "model": "DPR / Elasticsearch", + "n_docs": 500000, + "map": 80.86137228234091 + }, + { + "model": "BM25 / Elasticsearch", + "n_docs": 100000, + "map": 56.25299537353825 + }, + { + "model": "BM25 / Elasticsearch", + "n_docs": 500000, + "map": 45.595090262466535 + }, + { + "model": "Sentence Transformers / Elasticsearch", + "n_docs": 100000, + "map": 82.74686664920836 + }, + { + "model": "Sentence Transformers / Elasticsearch", + "n_docs": 500000, + "map": 76.49564526892904 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 100000, + "map": 86.54606328368973 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 500000, + "map": 80.86137228234091 + }, + { + "model": "DPR / FAISS (HNSW)", + "n_docs": 100000, + "map": 84.33419639513305 + }, + { + "model": "DPR / FAISS (HNSW)", + "n_docs": 500000, + "map": 75.73062475537202 + }, + { + "model": "DPR / Milvus (flat)", + "n_docs": 100000, + "map": 86.54606328368973 + }, + { + "model": "DPR / Milvus (flat)", + "n_docs": 500000, + "map": 80.86137228234091 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 100000, + "map": 81.63864883662649 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 500000, + "map": 73.57986207906387 + }, + { + "model": "BM25 / Elasticsearch", + "n_docs": 1000, + "map": 74.20444712972909 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 1000, + "map": 92.95105322830891 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 10000, + "map": 89.8709701490436 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 100000, + "map": 86.54014997282701 + }, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 1000, + "map": 
92.76308330349686 + }, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 10000, + "map": 89.00403653862938 + }, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 100000, + "map": 85.7342431384476 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 500000, + "map": 80.85588135082547 + }, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 500000, + "map": 77.5426462347698 + } + ] +} \ No newline at end of file diff --git a/benchmarks/v0.10.0/performance/reader_performance.json b/benchmarks/v0.10.0/performance/reader_performance.json new file mode 100644 index 000000000..be935fe27 --- /dev/null +++ b/benchmarks/v0.10.0/performance/reader_performance.json @@ -0,0 +1,44 @@ +{ + "chart_type": "BarChart", + "title": "Reader Performance", + "subtitle": "Time and Accuracy Benchmarks", + "description": "Performance benchmarks of different Readers that can be used off-the-shelf in Haystack. Some models are geared towards speed, while others are more performance-focused. Accuracy is measured as F1 score and speed as passages/sec (with passages of 384 tokens). Each Reader is benchmarked using the SQuAD v2.0 development set, which contains 11866 question answer pairs. When tokenized using the BERT tokenizer and split using a sliding window approach, these become 12350 passages that are passed into the model. We set max_seq_len=384 and doc_stride=128. These benchmarking tests are run using an AWS p3.2xlarge instance with a Nvidia V100 GPU with this script. Please note that we are using the FARMReader class rather than the TransformersReader class. Also, the F1 measure that is reported here is in fact calculated on token level, rather than word level as is done in the official SQuAD script.", + "bars": "horizontal", + "columns": [ + "Model", + "F1", + "Speed (passages/sec)" + ], + "data": [ + { + "F1": 82.58860575299658, + "Speed": 125.81040525892848, + "Model": "RoBERTa" + }, + { + "F1": 78.87858491007042, + "Speed": 260.6443097981493, + "Model": "MiniLM" + }, + { + "F1": 74.31182400443286, + "Speed": 121.08066567525722, + "Model": "BERT base" + }, + { + "F1": 83.26306774734308, + "Speed": 42.21949937744112, + "Model": "BERT large" + }, + { + "F1": 84.50422699207468, + "Speed": 42.07400844838985, + "Model": "XLM-RoBERTa" + }, + { + "F1": 42.31925844723574, + "Speed": 222.91207128366702, + "Model": "DistilBERT" + } + ] +} \ No newline at end of file diff --git a/benchmarks/v0.10.0/performance/retriever_performance.json b/benchmarks/v0.10.0/performance/retriever_performance.json new file mode 100644 index 000000000..dbb934048 --- /dev/null +++ b/benchmarks/v0.10.0/performance/retriever_performance.json @@ -0,0 +1,88 @@ +{ + "chart_type": "BarChart", + "title": "Retriever Performance", + "subtitle": "Time and Accuracy Benchmarks", + "description": "Comparison of the speed and accuracy of different DocumentStore / Retriever combinations on 100k documents. Indexing speed (in docs/sec) refers to how quickly Documents can be inserted into a DocumentStore. Querying speed (in queries/sec) refers to the speed at which the system returns relevant Documents when presented with a query.\n\nThe dataset used is Wikipedia, split into 100 word passages (from here)). \n\nFor querying, we use the Natural Questions development set in combination with the wiki passages. The Document Store is populated with the 100 word passages in which the answer spans occur (i.e. gold passages) as well as a random selection of 100 word passages in which the answer spans do not occur (i.e. negative passages). 
We take a total of 100k gold and negative passages. Query and document embedding are generated by the \"facebook/dpr-question_encoder-single-nq-base\" and \"facebook/dpr-ctx_encoder-single-nq-base\" models. The retriever returns 10 candidates and both the recall and mAP scores are calculated on these 10.\n\nFor FAISS HNSW, we use n_links=128, efSearch=20 and efConstruction=80. We use a cosine similarity function with BM25 retrievers, and dot product with DPR. Both index and query benchmarks are performed on an AWS P3.2xlarge instance which is accelerated by an Nvidia V100 GPU.", + "bars": "horizontal", + "columns": [ + "Model", + "mAP", + "Index Speed (docs/sec)", + "Query Speed (queries/sec)" + ], + "series": { + "s0": "map", + "s1": "time", + "s2": "time" + }, + "axes": { + "label": "map", + "time_side": "top", + "time_label": "seconds" + }, + "data": [ + { + "model": "DPR / Elasticsearch", + "n_docs": 100000, + "index_speed": 71.36964873196698, + "query_speed": 5.192368815242574, + "map": 86.54606328368976 + }, + { + "model": "BM25 / Elasticsearch", + "n_docs": 100000, + "index_speed": 485.5602670200369, + "query_speed": 103.0884393334727, + "map": 56.25299537353825 + }, + { + "model": "Sentence Transformers / Elasticsearch", + "n_docs": 100000, + "index_speed": 119.52937722555107, + "query_speed": 6.385621466857457, + "map": 82.74686664920836 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 100000, + "index_speed": 100.01184910084558, + "query_speed": 6.6270933964840415, + "map": 86.54606328368973 + }, + { + "model": "DPR / FAISS (HNSW)", + "n_docs": 100000, + "index_speed": 89.90389306648805, + "query_speed": 39.7839528511866, + "map": 84.33419639513305 + }, + { + "model": "DPR / Milvus (flat)", + "n_docs": 100000, + "index_speed": 116.00982709720004, + "query_speed": 28.57264344960955, + "map": 86.54606328368973 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 100000, + "index_speed": 115.61076852516383, + "query_speed": 38.80526238789059, + "map": 81.63864883662649 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 100000, + "index_speed": 70.05381128388427, + "query_speed": 15.306895223372484, + "map": 86.54014997282701 + }, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 100000, + "index_speed": 70.31004397719536, + "query_speed": 24.95733865947408, + "map": 85.7342431384476 + } + ] +} \ No newline at end of file diff --git a/benchmarks/v0.10.0/speed/retriever_speed.json b/benchmarks/v0.10.0/speed/retriever_speed.json new file mode 100644 index 000000000..7877d2a35 --- /dev/null +++ b/benchmarks/v0.10.0/speed/retriever_speed.json @@ -0,0 +1,204 @@ +{ + "chart_type": "LineChart", + "title": "Retriever Speed", + "subtitle": "Query Speed at different number of docs", + "description": "Here you can see how the query speed of different Retriever / DocumentStore combinations scale as the number of documents increases. 
The set up is the same as the above querying benchmark except that a varying number of negative documents are used to fill the document store.", + "columns": [ + "n_docs", + "BM25 / Elasticsearch", + "DPR / Elasticsearch", + "DPR / FAISS (flat)", + "DPR / FAISS (HNSW)", + "DPR / Milvus (flat)", + "DPR / Milvus (HNSW)", + "Sentence Transformers / Elasticsearch" + ], + "axis": [ + { + "x": "Number of docs", + "y": "Queries/sec" + } + ], + "data": [ + { + "model": "DPR / Elasticsearch", + "n_docs": 1000, + "query_speed": 34.22768858415144 + }, + { + "model": "DPR / Elasticsearch", + "n_docs": 10000, + "query_speed": 22.197089725786853 + }, + { + "model": "BM25 / Elasticsearch", + "n_docs": 10000, + "query_speed": 127.11481826852273 + }, + { + "model": "Sentence Transformers / Elasticsearch", + "n_docs": 1000, + "query_speed": 47.51341215808855 + }, + { + "model": "Sentence Transformers / Elasticsearch", + "n_docs": 10000, + "query_speed": 29.74515869340777 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 1000, + "query_speed": 42.49634272581313 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 10000, + "query_speed": 27.684040507849826 + }, + { + "model": "DPR / FAISS (HNSW)", + "n_docs": 1000, + "query_speed": 43.36685860983961 + }, + { + "model": "DPR / FAISS (HNSW)", + "n_docs": 10000, + "query_speed": 41.819147130090286 + }, + { + "model": "DPR / Milvus (flat)", + "n_docs": 1000, + "query_speed": 41.12204778755844 + }, + { + "model": "DPR / Milvus (flat)", + "n_docs": 10000, + "query_speed": 37.86882443918513 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 1000, + "query_speed": 41.14803671045185 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 10000, + "query_speed": 40.072871546542935 + }, + { + "model": "DPR / Elasticsearch", + "n_docs": 100000, + "query_speed": 5.192368815242574 + }, + { + "model": "DPR / Elasticsearch", + "n_docs": 500000, + "query_speed": 1.0337466563959614 + }, + { + "model": "BM25 / Elasticsearch", + "n_docs": 100000, + "query_speed": 103.0884393334727 + }, + { + "model": "BM25 / Elasticsearch", + "n_docs": 500000, + "query_speed": 78.95037031647355 + }, + { + "model": "Sentence Transformers / Elasticsearch", + "n_docs": 100000, + "query_speed": 6.385621466857457 + }, + { + "model": "Sentence Transformers / Elasticsearch", + "n_docs": 500000, + "query_speed": 1.4175454254854258 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 100000, + "query_speed": 6.6270933964840415 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 500000, + "query_speed": 1.5394964631878052 + }, + { + "model": "DPR / FAISS (HNSW)", + "n_docs": 100000, + "query_speed": 39.7839528511866 + }, + { + "model": "DPR / FAISS (HNSW)", + "n_docs": 500000, + "query_speed": 39.84177061191119 + }, + { + "model": "DPR / Milvus (flat)", + "n_docs": 100000, + "query_speed": 28.57264344960955 + }, + { + "model": "DPR / Milvus (flat)", + "n_docs": 500000, + "query_speed": 15.645867393099733 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 100000, + "query_speed": 38.80526238789059 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 500000, + "query_speed": 37.15717318924075 + }, + { + "model": "BM25 / Elasticsearch", + "n_docs": 1000, + "query_speed": 282.95914917837337 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 1000, + "query_speed": 29.061163356184426 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 10000, + "query_speed": 24.834414667596725 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 100000, + "query_speed": 15.306895223372484 + 
}, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 1000, + "query_speed": 29.10621389658101 + }, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 10000, + "query_speed": 26.92417300437131 + }, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 100000, + "query_speed": 24.95733865947408 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 500000, + "query_speed": 11.33271222977541 + }, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 500000, + "query_speed": 24.13921492357397 + } + ] +} \ No newline at end of file diff --git a/components/VersionSelect.tsx b/components/VersionSelect.tsx index 05f4d35e7..b35343f27 100644 --- a/components/VersionSelect.tsx +++ b/components/VersionSelect.tsx @@ -5,6 +5,7 @@ import { Listbox, Transition } from "@headlessui/react"; const versionRoutesHaystack: { [key: string]: string } = { "latest": "/overview/intro", + "v0.10.0": "/overview/v0.10.0/intro", "v0.9.0": "/overview/v0.9.0/intro", "v0.8.0": "/overview/v0.8.0/intro", "v0.7.0": "/overview/v0.7.0/intro", @@ -15,6 +16,7 @@ const versionRoutesHaystack: { [key: string]: string } = { const versionRoutesBenchmarks: { [key: string]: string } = { "latest": "/benchmarks/latest", + "v0.10.0": "/benchmarks/v0.10.0", "v0.9.0": "/benchmarks/v0.9.0", "v0.8.0": "/benchmarks/v0.8.0", "v0.7.0": "/benchmarks/v0.7.0", diff --git a/docs/latest/components/document_store.mdx b/docs/latest/components/document_store.mdx index 91e4e7fe9..592b7fd0e 100644 --- a/docs/latest/components/document_store.mdx +++ b/docs/latest/components/document_store.mdx @@ -13,170 +13,158 @@ Initialising a new DocumentStore within Haystack is straight forward.
-### Elasticsearch - -[Install](https://www.elastic.co/guide/en/elasticsearch/reference/current/install-elasticsearch.html) -Elasticsearch and then [start](https://www.elastic.co/guide/en/elasticsearch/reference/current/starting-elasticsearch.html) -an instance. - -If you have Docker set up, we recommend pulling the Docker image and running it. - -```bash -docker pull docker.elastic.co/elasticsearch/elasticsearch:7.9.2 -docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2 -``` - -Next you can initialize the Haystack object that will connect to this instance. - -```python -from haystack.document_store import ElasticSearchDocumentStore - -document_store = ElasticsearchDocumentStore() -``` - -### Open Distro for Elasticsearch - -Learn how to get started [here](https://opendistro.github.io/for-elasticsearch-docs/#get-started) - -If you have Docker set up, we recommend pulling the Docker image and running it. - -```bash -docker pull amazon/opendistro-for-elasticsearch:1.13.2 -docker run -p 9200:9200 -p 9600:9600 -e "discovery.type=single-node" amazon/opendistro-for-elasticsearch:1.13.2 -``` - -Next you can initialize the Haystack object that will connect to this instance. - -```python -from haystack.document_store import OpenDistroElasticsearchDocumentStore - -document_store = OpenDistroElasticsearchDocumentStore() -``` - -### OpenSearch - -Learn how to get started [here](https://opensearch.org/docs/#docker-quickstart) - -If you have Docker set up, we recommend pulling the Docker image and running it. - -```bash -docker pull opensearchproject/opensearch:1.0.1 -docker run -p 9200:9200 -p 9600:9600 -e "discovery.type=single-node" opensearchproject/opensearch:1.0.1 -``` - -Next you can initialize the Haystack object that will connect to this instance. - -```bash -from haystack.document_store import OpenSearchDocumentStore - -document_store = OpenSearchDocumentStore() -``` - -
- -### Milvus - -Follow the [official documentation](https://www.milvus.io/docs/v1.0.0/milvus_docker-cpu.md) to start a Milvus instance via Docker. -Note that we also have a utility function `haystack.utils.launch_milvus` that can start up a Milvus instance. - -You can initialize the Haystack object that will connect to this instance as follows: - -```python -from haystack.document_store import MilvusDocumentStore - -document_store = MilvusDocumentStore() -``` - -
- -### FAISS - -The `FAISSDocumentStore` requires no external setup. Start it by simply using this line. - -```python -from haystack.document_store import FAISSDocumentStore - -document_store = FAISSDocumentStore(faiss_index_factory_str="Flat") -``` - -#### Save & Load - -FAISS document stores can be saved to disk and reloaded: - -```python -from haystack.document_store import FAISSDocumentStore - -document_store = FAISSDocumentStore(faiss_index_factory_str="Flat") - -# Generates two files: my_faiss_index.faiss and my_faiss_index.json -document_store.save("my_faiss_index.faiss") - -# Looks for the two files generated above -new_document_store = FAISSDocumentStore.load("my_faiss_index.faiss") - -assert new_document_store.faiss_index_factory_str == "Flat" -``` - -While `my_faiss_index.faiss` contains the index, `my_faiss_index.json` -contains the parameters used to inizialize it (like `faiss_index_factory_store`). -This configuration file is necessary for `load()` to work. It simply contains -the initial parameters in a JSON format. -For example, a hand-written configuration file for the above FAISS index could look like: - -```json -{ - faiss_index_factory_store: 'Flat' -} -``` - -
- -### In Memory - -The `InMemoryDocumentStore()` requires no external setup. Start it by simply using this line. - -```python -from haystack.document_store import InMemoryDocumentStore - -document_store = InMemoryDocumentStore() -``` - -
- -### SQL - -The `SQLDocumentStore` requires SQLite, PostgresQL or MySQL to be installed and started. -Note that SQLite already comes packaged with most operating systems. - -```python -from haystack.document_store import SQLDocumentStore - -document_store = SQLDocumentStore() -``` - -
- -### Weaviate - -The `WeaviateDocumentStore` requires a running Weaviate Server. -You can start a basic instance like this (see the [Weaviate docs](https://www.semi.technology/developers/weaviate/current/) for details): - -``` - docker run -d -p 8080:8080 --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' semitechnologies/weaviate:1.4.0 -``` - -Afterwards, you can use it in Haystack: - -```python -from haystack.document_store import WeaviateDocumentStore - -document_store = WeaviateDocumentStore() -``` - -Each DocumentStore constructor allows for arguments specifying how to connect to existing databases and the names of indexes. -See API documentation for more info. - -
+## Types + + + Install  + Elasticsearch and then start  + an instance.



+ If you have Docker set up, we recommend pulling the Docker image and running it.

+
+                    docker pull docker.elastic.co/elasticsearch/elasticsearch:7.9.2
+                    docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2
+                
+ Next you can initialize the Haystack object that will connect to this instance.

+
+                    document_store = ElasticsearchDocumentStore()
+                
+
+ ) + }, + { + title: "Open Distro for Elasticsearch", + content: ( +
+ Learn how to get started here. + If you have Docker set up, we recommend pulling the Docker image and running it. +
+                    docker pull amazon/opendistro-for-elasticsearch:1.13.2
+                    docker run -p 9200:9200 -p 9600:9600 -e "discovery.type=single-node" amazon/opendistro-for-elasticsearch:1.13.2
+                
+ Next you can initialize the Haystack object that will connect to this instance. +
+                    from haystack.document_store import OpenDistroElasticsearchDocumentStore
+                    document_store = OpenDistroElasticsearchDocumentStore()
+                
+
+ ) + }, + { + title: "OpenSearch", + content: ( +
+ Learn how to get started here. + If you have Docker set up, we recommend pulling the Docker image and running it. +
+                    docker pull opensearchproject/opensearch:1.0.1
+                    docker run -p 9200:9200 -p 9600:9600 -e "discovery.type=single-node" opensearchproject/opensearch:1.0.1
+                
+ Next you can initialize the Haystack object that will connect to this instance. +
+                    from haystack.document_store import OpenSearchDocumentStore
+                    document_store = OpenSearchDocumentStore()
+                
+
+ ) + }, + { + title: "Milvus", + content: ( +
+ Follow the official documentation to start a Milvus instance via Docker. + Note that we also have a utility function haystack.utils.launch_milvus that can start up a Milvus instance.



+ You can initialize the Haystack object that will connect to this instance as follows:

+
+                    from haystack.document_store import MilvusDocumentStore
+                    document_store = MilvusDocumentStore()
+                
+
+ ) + }, + { + title: "FAISS", + content: ( +
+ The FAISSDocumentStore requires no external setup. Start it by simply using this line.

+
+                    from haystack.document_store import FAISSDocumentStore
+                    document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")
+                
+

Save & Load

+ FAISS document stores can be saved to disk and reloaded: +
+                    from haystack.document_store import FAISSDocumentStore
+                    document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")
+                    # Generates two files: my_faiss_index.faiss and my_faiss_index.json
+                    document_store.save("my_faiss_index.faiss")
+                    # Looks for the two files generated above
+                    new_document_store = FAISSDocumentStore.load("my_faiss_index.faiss")
+                    assert new_document_store.faiss_index_factory_str == "Flat"
+                
    + While `my_faiss_index.faiss` contains the index, my_faiss_index.json + contains the parameters used to initialize it (like faiss_index_factory_str). + This configuration file is necessary for load() to work. It simply contains + the initial parameters in a JSON format.
    

+ For example, a hand-written configuration file for the above FAISS index could look like:

+
+                    {
    +                      "faiss_index_factory_str": "Flat"
    
+                    }
+                
+
+ ) + }, + { + title: "In Memory", + content: ( +
+ The InMemoryDocumentStore() requires no external setup. Start it by simply using this line. +
+                    from haystack.document_store import InMemoryDocumentStore
+                    document_store = InMemoryDocumentStore()
+                
+
+ ) + }, + { + title: "SQL", + content: ( +
    + The SQLDocumentStore requires SQLite, PostgreSQL or MySQL to be installed and started. + Note that SQLite already comes packaged with most operating systems. +
    
+                from haystack.document_store import SQLDocumentStore
+                document_store = SQLDocumentStore()
+                
+
+ ) + }, + { + title: "Weaviate", + content: ( +
+ The WeaviateDocumentStore requires a running Weaviate Server. + You can start a basic instance like this (see the Weaviate docs for details): +
+                    docker run -d -p 8080:8080 --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' semitechnologies/weaviate:1.4.0
+                
+ Afterwards, you can use it in Haystack: +
+                    from haystack.document_store import WeaviateDocumentStore
+                    document_store = WeaviateDocumentStore()
+                
+ Each DocumentStore constructor allows for arguments specifying how to connect to existing databases and the names of indexes. + See API documentation for more info. +
+ ) + } + ]} +/> ## Input Format @@ -239,116 +227,150 @@ Having GPU acceleration will significantly speed this up. The Document Stores have different characteristics. You should choose one depending on the maturity of your project, the use case and technical environment: -### Elasticsearch - -**Pros:** - -- Fast & accurate sparse retrieval with many tuning options -- Basic support for dense retrieval -- Production-ready - -**Cons:** - -- Slow for dense retrieval with more than ~ 1 Mio documents - -### Open Distro for Elasticsearch - -**Pros:** - -- Fully open source (Apache 2.0 license) -- Essentially the same features as Elasticsearch - -**Cons:** - -- Slow for dense retrieval with more than ~ 1 Mio documents - -### OpenSearch - -**Pros:** - -- Fully open source (Apache 2.0 license) -- Essentially the same features as Elasticsearch -- Has more support for vector similarity comparisons and approximate nearest neighbours algorithms - -**Cons:** - -- Not as optimized as dedicated vector similarity options like Milvus and FAISS - -
- -### Milvus - -**Pros:** - -- Scalable DocumentStore that excels at handling vectors (hence suited to dense retrieval methods like DPR) -- Encapsulates multiple ANN libraries (e.g. FAISS and ANNOY) and provides added reliability -- Runs as a separate service (e.g. a Docker container) -- Allows dynamic data management - -**Cons:** - -- No efficient sparse retrieval - -
- -### FAISS - -**Pros:** - -- Fast & accurate dense retrieval -- Highly scalable due to approximate nearest neighbour algorithms (ANN) -- Many options to tune dense retrieval via different index types (more info [here](https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index)) - -**Cons:** - -- No efficient sparse retrieval - -
- -### In Memory - -**Pros:** - -- Simple -- Exists already in many environments - -**Cons:** - -- Only compatible with minimal TF-IDF Retriever -- Bad retrieval performance -- Not recommended for production - -### SQL - -
- -**Pros:** - -- Simple & fast to test -- No database requirements -- Supports MySQL, PostgreSQL and SQLite - -**Cons:** - -- Not scalable -- Not persisting your data on disk - -
- -### Weaviate - -**Pros:** - -- Simple vector search -- Stores everything in one place: documents, meta data and vectors - so less network overhead when scaling this up -- Allows combination of vector search and scalar filtering, i.e. you can filter for a certain tag and do dense retrieval on that subset - -**Cons:** - -- Less options for ANN algorithms than FAISS or Milvus -- No BM25 / Tf-idf retrieval - -
+ + Pros: +
    +
  • Fast & accurate sparse retrieval with many tuning options
  • +
  • Basic support for dense retrieval
  • +
  • Production-ready
  • +
      • Also supports Open Distro
    
  • +
+ Cons: +
    +
      • Slow for dense retrieval with more than ~1 million documents
    
  • +
+
+ ) + }, + { + title: "Open Distro for Elasticsearch", + content: ( +
+ Pros: +
    +
  • Fully open source (Apache 2.0 license)
  • +
  • Essentially the same features as Elasticsearch
  • +
+ Cons: +
    +
      • Slow for dense retrieval with more than ~1 million documents
    
  • +
+
+ ) + }, + { + title: "OpenSearch", + content: ( +
+ Pros: +
    +
  • Fully open source (Apache 2.0 license)
  • +
  • Essentially the same features as Elasticsearch
  • +
  • Has more support for vector similarity comparisons and approximate nearest neighbours algorithms
  • +
+ Cons: +
    +
  • Not as optimized as dedicated vector similarity options like Milvus and FAISS
  • +
+
+ ) + }, + { + title: "Milvus", + content: ( +
+ Pros: +
    +
  • Scalable DocumentStore that excels at handling vectors (hence suited to dense retrieval methods like DPR)
  • +
  • Encapsulates multiple ANN libraries (e.g. FAISS and ANNOY) and provides added reliability
  • +
  • Runs as a separate service (e.g. a Docker container)
  • +
  • Allows dynamic data management
  • +
+ Cons: +
    +
  • No efficient sparse retrieval
  • +
+
+ ) + }, + { + title: "FAISS", + content: ( +
+ Pros: +
    +
  • Fast & accurate dense retrieval
  • +
  • Highly scalable due to approximate nearest neighbour algorithms (ANN)
  • +
  • Many options to tune dense retrieval via different index types (more info here)
  • +
+ Cons: +
    +
  • No efficient sparse retrieval
  • +
+
+ ) + }, + { + title: "In Memory", + content: ( +
+ Pros: +
    +
  • Simple
  • +
  • Exists already in many environments
  • +
+ Cons: +
    +
  • Only compatible with minimal TF-IDF Retriever
  • +
  • Bad retrieval performance
  • +
  • Not recommended for production
  • +
+
+ ) + }, + { + title: "SQL", + content: ( +
+ Pros: +
    +
  • Simple & fast to test
  • +
  • No database requirements
  • +
  • Supports MySQL, PostgreSQL and SQLite
  • +
+ Cons: +
    +
  • Not scalable
  • +
  • Not persisting your data on disk
  • +
+
+ ) + }, + { + title: "Weaviate", + content: ( +
+ Pros: +
    +
  • Simple vector search
  • +
  • Stores everything in one place: documents, meta data and vectors - so less network overhead when scaling this up
  • +
  • Allows combination of vector search and scalar filtering, i.e. you can filter for a certain tag and do dense retrieval on that subset
  • +
+ Cons: +
    +
      • Fewer options for ANN algorithms than FAISS or Milvus
    
  • +
  • No BM25 / Tf-idf retrieval
  • +
+
+ ) + } + ]} +/>
diff --git a/docs/v0.10.0/components/classifier.mdx b/docs/v0.10.0/components/classifier.mdx new file mode 100644 index 000000000..55c793f40 --- /dev/null +++ b/docs/v0.10.0/components/classifier.mdx @@ -0,0 +1,43 @@ +# Classifier + +The Classifier Node is a transformer based classification model used to create predictions that can be attached to retrieved documents as metadata. +For example, by using a sentiment model, you can label each document as being either positive or negative in sentiment. +Through a tight integration with the HuggingFace model hub, you can easily load any classification model by simply supplying the model name. + +![image](/img/classifier.png) + +
+ +Note that the Classifier is different from the Query Classifier. +While the Query Classifier categorizes incoming queries in order to route them to different parts of the pipeline, +the Classifier is used to create classification labels that can be attached to retrieved documents as metadata. + +
    + +## Usage + +Initialize it as follows: + +``` python +from haystack.classifier import FARMClassifier + +classifier_model = 'textattack/bert-base-uncased-imdb' +classifier = FARMClassifier(model_name_or_path=classifier_model) +``` + +It is slotted into a pipeline as follows: + +``` python +pipeline = Pipeline() +pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"]) +pipeline.add_node(component=classifier, name='Classifier', inputs=['Retriever']) +``` + +It can also be run in isolation: + +``` python +documents = classifier.predict( + query="", + documents=[doc1, doc2, doc3, ...] +) +``` \ No newline at end of file diff --git a/docs/v0.10.0/components/document_store.mdx b/docs/v0.10.0/components/document_store.mdx new file mode 100644 index 000000000..533f66d81 --- /dev/null +++ b/docs/v0.10.0/components/document_store.mdx @@ -0,0 +1,385 @@ +# DocumentStores + +You can think of the DocumentStore as a "database" that: + +- stores your texts and meta data +- provides them to the retriever at query time + +There are different DocumentStores in Haystack to fit different use cases and tech stacks. + +## Initialisation + +Initialising a new DocumentStore within Haystack is straightforward. + +
    
+ +## Types + + + Install  + Elasticsearch and then start  + an instance.



+ If you have Docker set up, we recommend pulling the Docker image and running it.

+
+                    docker pull docker.elastic.co/elasticsearch/elasticsearch:7.9.2
+                    docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2
+                
+ Next you can initialize the Haystack object that will connect to this instance.

+
+                    document_store = ElasticsearchDocumentStore()
+                
+
+ ) + }, + { + title: "Open Distro for Elasticsearch", + content: ( +
+ Learn how to get started here. + If you have Docker set up, we recommend pulling the Docker image and running it. +
+                    docker pull amazon/opendistro-for-elasticsearch:1.13.2
+                    docker run -p 9200:9200 -p 9600:9600 -e "discovery.type=single-node" amazon/opendistro-for-elasticsearch:1.13.2
+                
+ Next you can initialize the Haystack object that will connect to this instance. +
+                    from haystack.document_store import OpenDistroElasticsearchDocumentStore
+                    document_store = OpenDistroElasticsearchDocumentStore()
+                
+
+ ) + }, + { + title: "OpenSearch", + content: ( +
+ Learn how to get started here. + If you have Docker set up, we recommend pulling the Docker image and running it. +
+                    docker pull opensearchproject/opensearch:1.0.1
+                    docker run -p 9200:9200 -p 9600:9600 -e "discovery.type=single-node" opensearchproject/opensearch:1.0.1
+                
+ Next you can initialize the Haystack object that will connect to this instance. +
+                    from haystack.document_store import OpenSearchDocumentStore
+                    document_store = OpenSearchDocumentStore()
+                
+
+ ) + }, + { + title: "Milvus", + content: ( +
+ Follow the official documentation to start a Milvus instance via Docker. + Note that we also have a utility function haystack.utils.launch_milvus that can start up a Milvus instance.



+ You can initialize the Haystack object that will connect to this instance as follows:

+
+                    from haystack.document_store import MilvusDocumentStore
+                    document_store = MilvusDocumentStore()
+                
+
+ ) + }, + { + title: "FAISS", + content: ( +
+ The FAISSDocumentStore requires no external setup. Start it by simply using this line.

+
+                    from haystack.document_store import FAISSDocumentStore
+                    document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")
+                
+

Save & Load

+ FAISS document stores can be saved to disk and reloaded: +
+                    from haystack.document_store import FAISSDocumentStore
+                    document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")
+                    # Generates two files: my_faiss_index.faiss and my_faiss_index.json
+                    document_store.save("my_faiss_index.faiss")
+                    # Looks for the two files generated above
+                    new_document_store = FAISSDocumentStore.load("my_faiss_index.faiss")
+                    assert new_document_store.faiss_index_factory_str == "Flat"
+                
    + While `my_faiss_index.faiss` contains the index, my_faiss_index.json + contains the parameters used to initialize it (like faiss_index_factory_str). + This configuration file is necessary for load() to work. It simply contains + the initial parameters in a JSON format.
    

+ For example, a hand-written configuration file for the above FAISS index could look like:

+
+                    {
    +                      "faiss_index_factory_str": "Flat"
    
+                    }
+                
+
+ ) + }, + { + title: "In Memory", + content: ( +
+ The InMemoryDocumentStore() requires no external setup. Start it by simply using this line. +
+                    from haystack.document_store import InMemoryDocumentStore
+                    document_store = InMemoryDocumentStore()
+                
+
+ ) + }, + { + title: "SQL", + content: ( +
    + The SQLDocumentStore requires SQLite, PostgreSQL or MySQL to be installed and started. + Note that SQLite already comes packaged with most operating systems. +
    
+                from haystack.document_store import SQLDocumentStore
+                document_store = SQLDocumentStore()
+                
+
+ ) + }, + { + title: "Weaviate", + content: ( +
+ The WeaviateDocumentStore requires a running Weaviate Server. + You can start a basic instance like this (see the Weaviate docs for details): +
+                    docker run -d -p 8080:8080 --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' semitechnologies/weaviate:1.4.0
+                
+ Afterwards, you can use it in Haystack: +
+                    from haystack.document_store import WeaviateDocumentStore
+                    document_store = WeaviateDocumentStore()
+                
+ Each DocumentStore constructor allows for arguments specifying how to connect to existing databases and the names of indexes. + See API documentation for more info. +
+ ) + } + ]} +/> + +## Input Format + +DocumentStores expect Documents in dictionary form, like that below. +They are loaded using the `DocumentStore.write_documents()` method. +See [Preprocessing](/components/preprocessing) for more information on the cleaning and splitting steps that will help you maximize Haystack's performance. + +[//]: # "Add link to preprocessing section" + +```python +from haystack.document_store import ElasticsearchDocumentStore + +document_store = ElasticsearchDocumentStore() +dicts = [ + { + 'text': DOCUMENT_TEXT_HERE, + 'meta': {'name': DOCUMENT_NAME, ...} + }, ... +] +document_store.write_documents(dicts) +``` + +
    + +## Writing Documents (Sparse Retrievers) + +Haystack allows you to write documents to the store in an optimised fashion so that query times can be kept low. +For **sparse**, keyword based retrievers such as BM25 and TF-IDF, +you simply have to call `DocumentStore.write_documents()`. +The creation of the inverted index which optimises querying speed is handled automatically. + +```python +document_store.write_documents(dicts) +``` + +
    
+ +## Writing Documents (Dense Retrievers) + +For **dense** neural network based retrievers like Dense Passage Retrieval, or Embedding Retrieval, +indexing involves computing the Document embeddings which will be compared against the Query embedding. + +The storing of the text is handled by `DocumentStore.write_documents()` and the computation of the +embeddings is started by `DocumentStore.update_embeddings()`. + +```python +document_store.write_documents(dicts) +document_store.update_embeddings(retriever) +``` + +This step is computationally intensive since it will engage the transformer based encoders. +Having GPU acceleration will significantly speed this up. + + + + +
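    Since `update_embeddings()` needs a Retriever to produce the embeddings, it can help to see both calls in context. The following is a minimal sketch, assuming an Elasticsearch store and the `facebook/dpr-*` encoder models mentioned in the benchmark descriptions; adjust the store settings and model names to your own setup.

    ```python
    from haystack.document_store import ElasticsearchDocumentStore
    from haystack.retriever.dense import DensePassageRetriever

    document_store = ElasticsearchDocumentStore()

    # The retriever holds the query and passage encoders used to embed Documents.
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    )

    # Write the raw Documents first, then compute and store their embeddings.
    document_store.write_documents(dicts)
    document_store.update_embeddings(retriever)
    ```
    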
+ +## Choosing the Right Document Store + +The Document Stores have different characteristics. You should choose one depending on the maturity of your project, the use case and technical environment: + + + Pros: +
    +
  • Fast & accurate sparse retrieval with many tuning options
  • +
  • Basic support for dense retrieval
  • +
  • Production-ready
  • +
      • Also supports Open Distro
    
  • +
+ Cons: +
    +
      • Slow for dense retrieval with more than ~1 million documents
    
  • +
+
+ ) + }, + { + title: "Open Distro for Elasticsearch", + content: ( +
+ Pros: +
    +
  • Fully open source (Apache 2.0 license)
  • +
  • Essentially the same features as Elasticsearch
  • +
+ Cons: +
    +
      • Slow for dense retrieval with more than ~1 million documents
    
  • +
+
+ ) + }, + { + title: "OpenSearch", + content: ( +
+ Pros: +
    +
  • Fully open source (Apache 2.0 license)
  • +
  • Essentially the same features as Elasticsearch
  • +
  • Has more support for vector similarity comparisons and approximate nearest neighbours algorithms
  • +
+ Cons: +
    +
  • Not as optimized as dedicated vector similarity options like Milvus and FAISS
  • +
+
+ ) + }, + { + title: "Milvus", + content: ( +
+ Pros: +
    +
  • Scalable DocumentStore that excels at handling vectors (hence suited to dense retrieval methods like DPR)
  • +
  • Encapsulates multiple ANN libraries (e.g. FAISS and ANNOY) and provides added reliability
  • +
  • Runs as a separate service (e.g. a Docker container)
  • +
  • Allows dynamic data management
  • +
+ Cons: +
    +
  • No efficient sparse retrieval
  • +
+
+ ) + }, + { + title: "FAISS", + content: ( +
+ Pros: +
    +
  • Fast & accurate dense retrieval
  • +
  • Highly scalable due to approximate nearest neighbour algorithms (ANN)
  • +
  • Many options to tune dense retrieval via different index types (more info [here](https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index))
  • +
+ Cons: +
    +
  • No efficient sparse retrieval
  • +
+
+ ) + }, + { + title: "In Memory", + content: ( +
+ Pros: +
    +
  • Simple
  • +
  • Exists already in many environments
  • +
+ Cons: +
    +
  • Only compatible with minimal TF-IDF Retriever
  • +
  • Bad retrieval performance
  • +
  • Not recommended for production
  • +
+
+ ) + }, + { + title: "SQL", + content: ( +
+ Pros: +
    +
  • Simple & fast to test
  • +
  • No database requirements
  • +
  • Supports MySQL, PostgreSQL and SQLite
  • +
+ Cons: +
    +
  • Not scalable
  • +
  • Not persisting your data on disk
  • +
+
+ ) + }, + { + title: "Weaviate", + content: ( +
+ Pros: +
    +
  • Simple vector search
  • +
  • Stores everything in one place: documents, meta data and vectors - so less network overhead when scaling this up
  • +
  • Allows combination of vector search and scalar filtering, i.e. you can filter for a certain tag and do dense retrieval on that subset
  • +
+ Cons: +
    +
      • Fewer options for ANN algorithms than FAISS or Milvus
    
  • +
  • No BM25 / Tf-idf retrieval
  • +
+
+ ) + } + ]} +/> + +
    + +#### Our Recommendations + +**Restricted environment:** Use the `InMemoryDocumentStore` if you are just giving Haystack a quick try on a small sample and are working in a restricted environment that complicates running Elasticsearch or other databases. + +**All-rounder:** Use the `ElasticsearchDocumentStore` if you want to evaluate the performance of different retrieval options (dense vs. sparse) and are aiming for a smooth transition from PoC to production. + +**Vector Specialist:** Use the `MilvusDocumentStore` if you want to focus on dense retrieval and possibly deal with larger datasets. + +
    
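    As noted above, each DocumentStore constructor also accepts connection settings for existing databases and index names. A hedged example for Elasticsearch, where the host, port, credentials and index name below are placeholders; see the API documentation for the full list of arguments:

    ```python
    from haystack.document_store import ElasticsearchDocumentStore

    # Placeholder connection values; point these at your own cluster.
    document_store = ElasticsearchDocumentStore(
        host="localhost",
        port=9200,
        username="",
        password="",
        index="document",
    )
    ```
    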
diff --git a/docs/v0.10.0/components/generator.mdx b/docs/v0.10.0/components/generator.mdx new file mode 100644 index 000000000..73bab5c44 --- /dev/null +++ b/docs/v0.10.0/components/generator.mdx @@ -0,0 +1,62 @@ +# Generator + +While extractive QA highlights the span of text that answers a query, +generative QA can return a novel text answer that it has composed. + +The best current approaches, such as [Retriever-Augmented Generation](https://arxiv.org/abs/2005.11401) and [LFQA](https://yjernite.github.io/lfqa.html), +can draw upon both the knowledge it gained during language model pretraining (parametric memory) +as well as passages provided to it with a retriever (non-parametric memory). + +With the advent of Transformer based retrieval methods such as [Dense Passage Retrieval](https://arxiv.org/abs/2004.04906), +retriever and generator can be trained concurrently from the one loss signal. + +
    + +**Tutorial:** Check out our tutorial notebooks for a guide on how to build your own generative QA system with RAG ([here](/tutorials/retrieval-augmented-generation)) +or with LFQA ([here](/tutorials/pipelines)). + +
    
    + +**Pros** + +- More appropriately phrased answers +- Able to synthesize information from different texts +- Can draw on latent knowledge stored in the language model + +**Cons** + +- Not easy to track what piece of information the generator is basing its response off of + +## Usage + +Initialize a Generator as follows: + +``` python +from haystack.generator.transformers import RAGenerator + +generator = RAGenerator( + model_name_or_path="facebook/rag-sequence-nq", + retriever=dpr_retriever, + top_k=1, + min_length=2 +) +``` + +Running a Generator in a pipeline: + +``` python +from haystack.pipeline import GenerativeQAPipeline + +pipeline = GenerativeQAPipeline(generator=generator, retriever=dpr_retriever) +result = pipeline.run(query='What are the best party games for adults?', top_k_retriever=20) +``` + +Running a stand-alone Generator: + +``` python +result = generator.predict( + query='What are the best party games for adults?', + documents=[doc1, doc2, doc3...], + top_k=top_k +) +``` diff --git a/docs/v0.10.0/components/knowledge_graph.mdx b/docs/v0.10.0/components/knowledge_graph.mdx new file mode 100644 index 000000000..ec5c9d9f8 --- /dev/null +++ b/docs/v0.10.0/components/knowledge_graph.mdx @@ -0,0 +1,107 @@ +# Question Answering on a Knowledge Graph + +Haystack allows loading and querying knowledge graphs. In particular, Haystack can: + +- Load an existing knowledge graph given as a .ttl file +- Execute SPARQL queries on a knowledge graph +- Execute text queries on the knowledge graph by translating them to SPARQL queries with the help of a pre-trained seq2seq model + +Haystack's knowledge graph functionalities are still in a very early stage. Thus, don't expect our [exemplary tutorial](https://github.com/deepset-ai/haystack/blob/master/tutorials/Tutorial10_Knowledge_Graph.py) to work on your custom dataset out-of-the-box. +Two classes implement the functionalities: GraphDBKnowledgeGraph and Text2SparqlRetriever. + +
    
+
+## GraphDBKnowledgeGraph
+
+GraphDBKnowledgeGraph is a triple store similar to Haystack's document stores. Currently, it is the only implementation of the BaseKnowledgeGraph class.
+GraphDBKnowledgeGraph runs on GraphDB. The licensing of GraphDB is rather complicated and it's more than unfortunate that GraphDB cannot be used right away in colab notebooks.
+On your local machine, you can start a GraphDB instance by running:
+
+`docker run -d -p 7200:7200 --name graphdb-instance-tutorial docker-registry.ontotext.com/graphdb-free:9.4.1-adoptopenjdk11`
+
+By default, GraphDBKnowledgeGraph connects to a GraphDB instance running on localhost at port 7200.
+Similar to Haystack's ElasticsearchDocumentStore, the only additional setting needed is an index name.
+(Note that GraphDB internally calls these indices repositories.)
+
+`kg = GraphDBKnowledgeGraph(index="tutorial_10_index")`
+
+Indices can be deleted and created with `GraphDBKnowledgeGraph.delete_index()` and `GraphDBKnowledgeGraph.create_index(config_path)`.
+`config_path` needs to point to a .ttl file that contains configuration settings (see [GraphDB documentation](https://graphdb.ontotext.com/documentation/free/configuring-a-repository.html#configure-a-repository-programmatically) for details or use the file from our [tutorial](https://github.com/deepset-ai/haystack/blob/master/tutorials/Tutorial10_Knowledge_Graph.py)). It starts with something like:
+
+```
+#
+# Sesame configuration template for a GraphDB Free repository
+#
+@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
+@prefix rep: <http://www.openrdf.org/config/repository#> .
+@prefix sr: <http://www.openrdf.org/config/repository/sail#> .
+@prefix sail: <http://www.openrdf.org/config/sail#> .
+@prefix owlim: <http://www.ontotext.com/trree/owlim#> .
+
+[] a rep:Repository ;
+    rep:repositoryID "tutorial_10_index" ;
+    rdfs:label "tutorial 10 index" ;
+...
+```
+
+GraphDBKnowledgeGraph can load an existing knowledge graph represented in the form of a .ttl file with the method `GraphDBKnowledgeGraph.import_from_ttl_file(index, path)`, where path points to a ttl file starting with something like:
+
+```
+@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
+@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
+@prefix hp: <https://deepset.ai/harry_potter/> .
+
+hp:Gryffindor hp:source_url "https://harrypotter.fandom.com/wiki/Gryffindor"^^xsd:string .
+hp:Gryffindor rdf:type hp:House_ .
+hp:Gryffindor hp:name hp:Gryffindor .
+hp:Gryffindor hp:founder hp:Godric_gryffindor .
+...
+```
+
+`GraphDBKnowledgeGraph.get_all_triples()` returns all loaded triples in the form of subject, predicate, and object. It is helpful to check whether the loading of a .ttl file was successful.
+
+`GraphDBKnowledgeGraph.query(sparql_query)` executes SPARQL queries on the knowledge graph. However, we usually do not want to use this method directly but use it through a retriever.
+
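+To make the above concrete, here is a minimal sketch putting these calls together. The import path, index name, and file path follow the v0.10 knowledge graph tutorial and are assumptions rather than fixed requirements:
+
+```python
+from haystack.knowledge_graph.graphdb import GraphDBKnowledgeGraph
+
+# Connect to a local GraphDB instance (localhost:7200 by default) and pick an index/repository
+kg = GraphDBKnowledgeGraph(index="tutorial_10_index")
+
+# Load a knowledge graph from a .ttl file (the path is a placeholder)
+kg.import_from_ttl_file(index="tutorial_10_index", path="data/triples.ttl")
+
+# Sanity check: how many triples were loaded?
+print(len(kg.get_all_triples()))
+
+# Execute a raw SPARQL query directly on the graph
+print(kg.query(sparql_query="select ?s ?p ?o where { ?s ?p ?o . } LIMIT 5"))
+```
+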
+ +## Text2SparqlRetriever + +Text2SparqlRetriever can execute SPARQL queries translated from text but also any other custom SPARQL queries. Currently, it is the only implementation of the BaseGraphRetriever class. +Internally, Text2SparqlRetriever uses a pre-trained BART model to translate text questions to queries in SPARQL format. + +`Text2SparqlRetriever.retrieve(query)` can be called with a text query, which is then automatically translated to a SPARQL query. + +`Text2SparqlRetriever._query_kg(sparql_query)` can be called with a SPARQL query. + +
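+As a rough sketch of how the retriever is typically wired up (the import path, constructor arguments, and model directory follow the v0.10 tutorial and are assumptions; `kg` is a `GraphDBKnowledgeGraph` as above):
+
+```python
+from haystack.graph_retriever import Text2SparqlRetriever
+
+# The model directory is a placeholder; the tutorial downloads a pre-trained BART-based text2sparql model
+kgqa_retriever = Text2SparqlRetriever(knowledge_graph=kg, model_name_or_path="saved_models/text2sparql")
+
+# Text query: internally translated to SPARQL and executed on the graph
+print(kgqa_retriever.retrieve(query="What is the patronus of Hermione?"))
+
+# Custom SPARQL query: executed as-is
+print(kgqa_retriever._query_kg(sparql_query="select distinct ?obj where { hp:Hermione_granger hp:patronus ?obj . }"))
+```
+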
+
+## Trying Question Answering on Knowledge Graphs with Custom Data
+
+If you want to use your custom data, you first need your custom knowledge graph in the format of a .ttl file.
+You can load your custom graph and execute SPARQL queries with `Text2SparqlRetriever._query_kg(sparql_query)`. To allow the use of abbreviations of namespaces, GraphDBKnowledgeGraph needs to know about them:
+
+```
+prefixes = """PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
+    PREFIX hp: <https://deepset.ai/harry_potter/>
+    """
+kg.prefixes = prefixes
+```
+
+If you suspect you are having issues because abbreviations of namespaces are not mapped correctly, you can always try to execute a SPARQL query with the full namespace:
+
+`Text2SparqlRetriever._query_kg(sparql_query="select distinct ?obj where { <https://deepset.ai/harry_potter/Hermione_granger> <https://deepset.ai/harry_potter/patronus> ?obj . }")`
+
+instead of using the abbreviated form:
+
+`Text2SparqlRetriever._query_kg(sparql_query="select distinct ?obj where { hp:Hermione_granger hp:patronus ?obj . }")`
+
+If you would like to translate text queries to SPARQL queries for your custom data and use `Text2SparqlRetriever.retrieve(query)`, significantly more effort is necessary.
+We provide an exemplary pre-trained model in our [tutorial](https://github.com/deepset-ai/haystack/blob/master/tutorials/Tutorial10_Knowledge_Graph.py).
+One limitation is that this pre-trained model can only translate questions about resources it has seen during training.
+Otherwise, it cannot translate the name of the resource to the identifier used in the knowledge graph.
+For example, it can translate "Harry" to "hp:Harry_potter" only because we trained it to do so.
+
+Unfortunately, our pre-trained model for translating text queries does not work with your custom data.
+Instead, you need to train your own model. It needs to be trained according to the [seq2seq example for summarization with BART in transformers](https://github.com/huggingface/transformers/tree/master/examples/legacy/seq2seq).
+Haystack currently does not support the training of text2sparql models. We don't have concrete plans to extend the functionality, but we are more than open to contributions. Don't hesitate to reach out!
diff --git a/docs/v0.10.0/components/pipelines.mdx b/docs/v0.10.0/components/pipelines.mdx
new file mode 100644
index 000000000..59e854289
--- /dev/null
+++ b/docs/v0.10.0/components/pipelines.mdx
@@ -0,0 +1,461 @@
+# Pipelines
+
+## Flexibility powered by DAGs
+
+To build modern search pipelines, you need two things: powerful building blocks and an easy way to stick them together.
+The `Pipeline` class is precisely built for this purpose and enables many search scenarios beyond QA.
+The core idea is to build a Directed Acyclic Graph (DAG) where each Node is one building block (Reader, Retriever, Generator ...).
+Here's a simple example for a standard Open-Domain QA Pipeline:
+
+```python
+from haystack import Pipeline
+
+p = Pipeline()
+p.add_node(component=retriever, name="ESRetriever1", inputs=["Query"])
+p.add_node(component=reader, name="QAReader", inputs=["ESRetriever1"])
+res = p.run(query="What did Einstein work on?")
+```
+
+ +## Initialize a Pipeline + +To start building your custom pipeline, you’ll need to initialize an object of the base Pipeline class: + +``` python +from haystack import Pipeline + +pipeline = Pipeline() +``` + +By default, a new pipeline receives a root node called `Query` or `File` depending on whether it's a Query or Indexing Pipeline, as the entry point to the pipeline graph. You need to manually define how the information flows from one node to the next from that point on. + +## Add Nodes to a Pipeline + +Use the `add_node()` method to add new components to the pipeline graph. You may either initialize the modules before or during the call to `add_node()`. When you add a node to the pipeline, give it a name and a list of inputs containing one or more items. Note how the default `Query` node acts as the input node to the first explicitly defined node. + +``` python +pipeline.add_node(component=retriever, name='Retriever', inputs=['Query']) +``` + +Here's an example of a node with several input sources: + +``` python +pipeline.add_node(component=JoinNode(), name='Joiner', + inputs=['Retriever1', 'Retriever2']) +``` + +If the predecessor node has more than one output, you’ll need to specify the output number in the `inputs` list. For example: + +``` python +pipeline.add_node(component=Branch1(), name='Branch1', + inputs=['TopicClassifier.output_1']) +pipeline.add_node(component=Branch2(), name='Branch2', + inputs=['TopicClassifier.output_2']) +``` + +Under the hood, the nodes are placed in a queue and executed one by one when the `run()` method is invoked. The output of the last node in the queue is the output of the entire pipeline. + +When you create a custom pipeline, you need to pay extra care that each node’s output is compatible with the input of the successive node in the chain. Otherwise, your system will throw an error at runtime. + +
+
+## Arguments
+
+Each node in a Pipeline defines the arguments that its run() method accepts. The Pipeline class takes care of passing relevant
+arguments to the node. In addition to mandatory inputs like `query`, the `run()` method accepts optional node parameters like
+`top_k` via the `params` argument. For instance, `params={"top_k": 5}` sets the `top_k` of all nodes to 5. To
+target params at a specific node, the node name can be specified explicitly, as in `params={"Retriever": {"top_k": 5}}`.
+
+
+```python
+res = pipeline.run(
+    query="What did Einstein work on?",
+    params={"Retriever": {"top_k": 5}, "Reader": {"top_k": 3}}
+)
+```
+
+
+## Run a Pipeline
+
+The `run()` function is the single command that triggers the execution of the entire pipeline:
+
+``` python
+query = "What's the history of Quidditch?"
+pipeline.run(query=query)
+```
+
+Every node has its own `run()` method, and the pipeline `run()` call invokes each node, one after the other. When you `run()` a pipeline, the arguments are propagated to every node in the graph. To disambiguate, say, the `top_k` values of the retriever and the ranker, you can address each node by name in the `params` argument. This lets you dynamically modify these parameters in each call to the pipeline:
+
+``` python
+pipeline.run(query=query, params={"retriever": {"top_k": 28}, "ranker": {"top_k": 9}})
+```
+
+ +## Inspect a Pipeline + +### Using `draw()` + +The `pipeline.draw()` method generates a sketch of your pipeline. By looking at a drawing of your pipeline, you may be able to confirm that the graph is indeed structured in the way that you intended. This is especially true for customized graphs that may branch out at some point. + +![image](https://user-images.githubusercontent.com/1563902/102451716-54813700-4039-11eb-881e-f3c01b47ca15.png) + +## Accessing Pipeline Nodes + +If your custom pipeline is not working as intended, try running your nodes in isolation. You may access any pipeline node by using the `get_node()` method and specifying the component's name: + +``` python +retriever_node = pipeline.get_node('Retriever') +``` + +
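+For `draw()`, a minimal usage sketch is shown below; the `path` argument is an assumption about the method's signature:
+
+```python
+# Render the pipeline graph to an image file and inspect it to verify the structure
+pipeline.draw(path="custom_pipeline.png")
+```
+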
+
+## Add debug information
+
+Nodes in a Pipeline can add debug information that gets propagated to the final output of a Pipeline. For instance, a decision Node can append details on the decision made.
+
+To return debug data from a Node, add a `_debug` key in the output dict. The value can be a primitive or a dict. For instance,
+
+```python
+def run(self, query: str):
+    if "?" in query:
+        return {"_debug": "The query contains a question mark"}, "output_1"
+    else:
+        return {"_debug": "The query does not contain a question mark"}, "output_2"
+```
+
+This `_debug` gets appended to a "global" `_debug` dict storing per Node debug data that gets returned with the final output. The final output may look like: `{"answers": ..., "_debug": {"node_a": "my debug info", "node_b": {"key": "value"}}}`
+
+A Node in a Pipeline can access the global `_debug` from preceding nodes by adding a `_debug` parameter to the `run()` method:
+
+```python
+def run(self, query: str, _debug: dict):
+    debug_info = _debug["PrecedingNodeA"]
+    ...
+```
+
+
+
+## Running a Node in Isolation
+
+When you execute a pipeline with `run()`, it successively invokes the `run()` methods of all nodes in the queue. However, you can also use a given node's `run()` method in isolation.
+
+``` python
+retriever_node.run(query=query, pipeline_type='Query')
+```
+
+What happens during an individual run depends entirely on the given node's definition. For example, the retriever's `run()` method calls `run_query()`, which in turn calls `retrieve()` and a few other methods. Once you have extracted your node from the pipeline with the `get_node()` method, you're free to run any one of that node's class methods:
+
+``` python
+retriever_node.run_query(query=query)
+retriever_node.retrieve(query=query)
+```
+
+If you want to find out which class methods are called by a component's `run()` function, we recommend that you take a look at the definitions (e.g., [this one](https://github.com/deepset-ai/haystack/blob/4c2a0b914a0ad81d8df3deaf02c637e7c2413b00/haystack/retriever/base.py#L177)) in the source code.
+
+
+ +## YAML File Definitions + +For your convenience, there is also the option of defining and loading pipelines in YAML files. +Having your pipeline available in a YAML is particularly useful when +you move between experimentation and production environments. +Just export the YAML from your notebook / IDE and import it into your production environment. +It also helps with version control of pipelines, allows you to share your pipeline easily with colleagues, +and simplifies the configuration of pipeline parameters in production. + +For example, you can define and save a simple Retriever Reader pipeline by saving the following to a file: + +```yaml +version: "0.9" + +components: # define all the building-blocks for Pipeline + - name: MyReader # custom-name for the component; helpful for visualization & debugging + type: FARMReader # Haystack Class name for the component + params: + no_ans_boost: -10 + model_name_or_path: deepset/roberta-base-squad2 + - name: MyESRetriever + type: ElasticsearchRetriever + params: + document_store: MyDocumentStore # params can reference other components defined in the YAML + custom_query: null + - name: MyDocumentStore + type: ElasticsearchDocumentStore + params: + index: haystack_test + +pipelines: # multiple Pipelines can be defined using the components from above + - name: my_query_pipeline # a simple extractive-qa Pipeline + nodes: + - name: MyESRetriever + inputs: [Query] + - name: MyReader + inputs: [MyESRetriever] +``` + +To load, simply call: + +```python +pipeline.load_from_yaml(Path("sample.yaml")) +``` + +For another example YAML config, check out [this file](https://github.com/deepset-ai/haystack/blob/master/rest_api/pipeline/pipelines.yaml). + +
+
+## Custom Nodes
+
+Thanks to the modularity of pipelines, you can create your own nodes and comfortably integrate them into your system. You should define a `run()` function at the core of each node class that accepts a flexible number of mandatory or optional keyword arguments. That's where the entire functionality of your node will be defined. Let's look at a node class template:
+
+``` python
+class NodeTemplate(BaseComponent):
+    outgoing_edges = 1
+
+    def run(self, query: str, my_arg: Optional[int] = 10):
+        # Insert code here to manipulate the input & produce an output
+        return output, "output_1"
+```
+
+Usually, your node will have one outgoing edge and thus one return value. A node's return value should come in the form of a Python dictionary. That value is returned within a tuple, which also contains the outgoing edge name, e.g. `output_1`.
+
+It's also possible to have more than one outgoing edge, typically in a decision node. A decision node's `run()` method consists of a decision function that determines the path in the graph by which to send down its input. Such a function has more than one possible return value, and all of these will be named accordingly, i.e. `output_1`, `output_2`, and so forth.
+
+When defining your own custom nodes, you must inherit from `haystack.BaseComponent`. This registers the node as a Component that can later be added to a Pipeline.
+
+``` python
+from haystack import BaseComponent
+
+class CustomNode(BaseComponent):
+    pass
+```
+
+
+## Decision nodes
+
+You can add decision nodes where only one "branch" is executed afterwards.
+This allows you, for example, to classify an incoming query and route it to different modules depending on the result.
+To find a ready-made example of a decision node, have a look at [the page](/components/query-classifier) about the `QueryClassifier`.
+
+![image](https://user-images.githubusercontent.com/1563902/102452199-41229b80-403a-11eb-9365-7038697e7c3e.png)
+
+If you'd like to define your own, you'll need to create a class that looks something like this:
+
+```python
+class QueryClassifier(BaseComponent):
+    outgoing_edges = 2
+
+    def run(self, query):
+        if "?" in query:
+            return {}, "output_1"
+        else:
+            return {}, "output_2"
+
+pipe = Pipeline()
+pipe.add_node(component=QueryClassifier(), name="QueryClassifier", inputs=["Query"])
+pipe.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_1"])
+pipe.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_2"])
+pipe.add_node(component=JoinDocuments(join_mode="concatenate"), name="JoinResults",
+              inputs=["ESRetriever", "DPRRetriever"])
+pipe.add_node(component=reader, name="QAReader", inputs=["JoinResults"])
+res = pipe.run(query="What did Einstein work on?", params={"ESRetriever": {"top_k": 1}, "DPRRetriever": {"top_k": 3}})
+```
+
+
+## Evaluation nodes
+
+There are nodes in Haystack that are used to evaluate the performance of readers, retrievers, and the combined system.
+To get hands-on with this kind of node, have a look at the [evaluation tutorial](/tutorials/evaluation).
+
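+As a rough sketch (following the evaluation tutorial; the node and import names are taken from Haystack v0.10 and the surrounding variables are assumed to exist), evaluation nodes are placed directly after the components they measure:
+
+```python
+from haystack import Pipeline
+from haystack.eval import EvalDocuments, EvalAnswers
+
+eval_retriever = EvalDocuments()
+eval_reader = EvalAnswers()
+
+p = Pipeline()
+p.add_node(component=retriever, name="ESRetriever", inputs=["Query"])
+p.add_node(component=eval_retriever, name="EvalDocuments", inputs=["ESRetriever"])
+p.add_node(component=reader, name="QAReader", inputs=["EvalDocuments"])
+p.add_node(component=eval_reader, name="EvalAnswers", inputs=["QAReader"])
+
+# Run labeled queries (together with their gold labels) through this pipeline;
+# the evaluation nodes then hold the collected metrics (see the evaluation tutorial for details).
+```
+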
+
+## Ready-Made Pipelines
+
+Last but not least, we added some ready-made pipelines that allow you to run standard patterns with very few lines of code. See the [ready-made pipelines page](/components/ready-made-pipelines) and [pipelines API documentation](/reference/pipelines) to learn more about these.
+
+**Examples:**
+
+```python
+from haystack.pipeline import DocumentSearchPipeline, ExtractiveQAPipeline, FAQPipeline, GenerativeQAPipeline, Pipeline, JoinDocuments
+
+# Extractive QA
+qa_pipe = ExtractiveQAPipeline(reader=reader, retriever=retriever)
+res = qa_pipe.run(query="When was Kant born?", params={"retriever": {"top_k": 3}, "reader": {"top_k": 5}})
+
+# Document Search
+doc_pipe = DocumentSearchPipeline(retriever=retriever)
+res = doc_pipe.run(query="Physics Einstein", params={"retriever": {"top_k": 3}})
+
+# Generative QA
+doc_pipe = GenerativeQAPipeline(generator=rag_generator, retriever=retriever)
+res = doc_pipe.run(query="Physics Einstein", params={"retriever": {"top_k": 3}})
+
+# FAQ based QA
+doc_pipe = FAQPipeline(retriever=retriever)
+res = doc_pipe.run(query="How can I change my address?", params={"retriever": {"top_k": 3}})
+
+```
+
+ +## Example: Multiple retrievers + +You can now also use multiple Retrievers and join their results: + +```python +from haystack import Pipeline + +p = Pipeline() +p.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"]) +p.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["Query"]) +p.add_node(component=JoinDocuments(join_mode="concatenate"), name="JoinResults", inputs=["ESRetriever", "DPRRetriever"]) +p.add_node(component=reader, name="QAReader", inputs=["JoinResults"]) +res = p.run(query="What did Einstein work on?", params={"ESRetriever": {"top_k": 1}, "DPRRetriever": {"top_k": 3}}) +``` + +![image](https://user-images.githubusercontent.com/1563902/102451782-7bd80400-4039-11eb-9046-01b002a783f8.png) + +
+ +## Example: Creating a Retriever-Ranker-Summarizer Pipeline + +In this example, we'll look at how to establish a custom Retriever-Ranker-Summarizer pipeline. It's useful to add a `Ranker` to a summarization pipeline because the output of the `Summarizer` depends on the order of the documents that it receives. + +``` python +from haystack import Pipeline + +pipeline = Pipeline() +``` + +To create new pipeline nodes, we initialize the modules first. For our use case, we need a retriever, a ranker, and a summarizer. We tell the summarizer to return a single summary per query (instead of one summary for each document), and that its length should be somewhere between ten and 300 words: + +``` python +from haystack.retriever import ElasticsearchRetriever +from haystack.ranker import FARMRanker +from haystack.summarizer import TransformersSummarizer + +retriever = ElasticsearchRetriever(document_store, top_k=10) + +ranker=FARMRanker(model_name_or_path="sentence-transformers/distilbert-multilingual-nli-stsb-quora-ranking", top_k=10) + +summarizer = TransformersSummarizer(model_name_or_path='t5-large', min_length=10, max_length=300, generate_single_summary=True) +``` + +We add the nodes to the pipeline: + +``` python +pipeline.add_node(component=retriever, name='Retriever', inputs=['Query']) +pipeline.add_node(component=ranker, name='Ranker', inputs=['Retriever']) +pipeline.add_node(component=summarizer, name='Summarizer', inputs=['Ranker']) +``` + +Let's now run our custom pipeline on the Harry Potter Wiki dataset. A typical application for this pipeline would be a situation where we want some high-level information about our corpus that is not necessarily contained within one document. We therefore retrieve multiple documents, rank them, and let the summarizer return a single summary of all the texts. + +``` python +query = "What's the history of Quidditch?" +result = pipeline.run(query=query) +``` + +The pipeline returns a dictionary that contains the query, the name of the last node, and a list of documents: + +``` python +result.keys() + +>>> dict_keys(['documents', 'query', 'node_id']) +``` + +Since we requested a single summary of all the texts we inputted to the summarizer, the list of documents contains only one item. We access the summary through the text attribute: + +``` python +result['documents'][0].text + +>>> "the first record of a primitive form of Quidditch (''Kwidditch'') dates to c. 1050. the first known reference to wizards using broomsticks as a means of conveyance dates to A.D. 963. a variant of the game, Quodpot, was invented in the eighteenth century. in the middle of the 14th century it was made a protected species by the wizards council." +``` + +
+
+## Example: Creating a Custom Translation Node
+
+Let's say that we wanted to add a special translation module to our pipeline. Instead of just translating into one predefined language, our node should be able to return a summary in any language we want (i.e., for which we have a trained model). To that end, we define a CustomTranslator class. Since there's no decision function involved, we set `outgoing_edges = 1`:
+
+``` python
+class CustomTranslator(BaseComponent):
+    outgoing_edges = 1
+```
+
+Within a pipeline node, the `run()` function is where all the action happens. Our run function receives a language argument that tells the translator which translation model to initialize:
+
+``` python
+def run(self, language='fr', **kwargs):
+    translator = TransformersTranslator(model_name_or_path=f'Helsinki-NLP/opus-mt-en-{language}')
+```
+
+We run the translator with the specified model and return its output.
+
+``` python
+translation = translator.run(documents=kwargs['documents'])
+return translation
+```
+
+We initialize this node directly when adding it to the pipeline. As usual, we specify a name and the inputs for this node:
+
+``` python
+pipeline.add_node(component=CustomTranslator(), name='CustomTranslator', inputs=['Summarizer'])
+```
+
+We can now call the pipeline with any [Helsinki-NLP translation model](https://huggingface.co/models?search=helsinki-nlp/opus-mt-en-) from HuggingFace with English as a source language. Pipeline arguments are simply propagated through the pipeline. This means that if we want to pass a language value to our custom node, we simply specify it in our call to the pipeline. Let's look at the French summary of a popular wizard sport:
+
+``` python
+query = "What's the history of Quidditch?"
+result = pipeline.run(query=query, params={"retriever": {"top_k": 30}, "ranker": {"top_k": 20}, "language": "fr"})
+result['documents'][0].text
+
+>>> "''Quidditch'' a obtenu son nom du marais queerditch, l'emplacement du premier jeu enregistré. le jeu a été basé sur un jeu joué par une sorcière au 11ème siècle. un snitch d'or a été introduit à la suite d'un jeu 1269 joué en kent. on pense qu'une version balai du jeu peut avoir inspiré le mouvement du jeu moderne 'harlem shuffle'"
+```
+
+Now, how about Ukrainian?
+
+``` python
+result = pipeline.run(query=query, params={"retriever": {"top_k": 30}, "ranker": {"top_k": 30}, "language": "uk"})
+result['documents'][0].text
+
+>>> '" Quuiditch " отримала свою назву від дивного болота, місця першої в історії записаної гри. Гру було засновано на грі, яку грала відьма у XI столітті. Золотий стукач було введено у гру 1269 гри в кенті. Вважається, що версія мітла у грі, можливо, надихнула сучасну гру на " заплутування " move " гри'
+```
+
+## Distributed Pipelines with Ray
+
+Ray (https://ray.io) is a framework for distributed computing.
+
+Ray allows distributing a Pipeline's components across a cluster of machines. The individual components of a
+Pipeline can be independently scaled. For instance, an extractive QA Pipeline deployment can have three replicas
+of the Reader and a single replica for the Retriever. It enables efficient resource utilization by horizontally
+scaling Components.
+
+To set the number of replicas, add `replicas` in the YAML config for the node in a pipeline:
+
+```yaml
+components:
+    ...
+
+pipelines:
+  - name: ray_query_pipeline
+    type: RayPipeline
+    nodes:
+      - name: ESRetriever
+        replicas: 2  # number of replicas to create on the Ray cluster
+        inputs: [ Query ]
+```
+
+A RayPipeline can only be created with a YAML Pipeline config:
+
+```python
+from haystack.pipeline import RayPipeline
+pipeline = RayPipeline.load_from_yaml(path="my_pipelines.yaml", pipeline_name="my_query_pipeline")
+pipeline.run(query="What is the capital of Germany?")
+```
+
+By default, RayPipeline creates an instance of Ray Serve locally. To connect to an existing Ray instance,
+set the `address` parameter when creating the RayPipeline instance.
\ No newline at end of file
diff --git a/docs/v0.10.0/components/preprocessing.mdx b/docs/v0.10.0/components/preprocessing.mdx
new file mode 100644
index 000000000..3c0c4acd1
--- /dev/null
+++ b/docs/v0.10.0/components/preprocessing.mdx
@@ -0,0 +1,172 @@
+# Preprocessing
+
+Haystack includes a suite of tools to:
+
+- extract text from different file types,
+- normalize white space,
+- split text into smaller pieces to optimize retrieval.
+
+These data preprocessing steps can have a big impact on the system's performance,
+and effective handling of data is key to getting the most out of Haystack.
+
+ +Check out our [preprocessing tutorial](/tutorials/train-dpr) if you'd like to start working with code examples already! + +
+ +## Document Format + +The DocumentStore expects its inputs to come in the following format. +The sections below will show you all the tools you'll need to ready your data for storing. + +```python +docs = [ + { + 'text': DOCUMENT_TEXT_HERE, + 'meta': {'name': DOCUMENT_NAME, ...} + }, ... +] +``` + +
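+Once your data is in this format, it can be written to any DocumentStore. A short sketch, using the Elasticsearch store for illustration:
+
+```python
+from haystack.document_store import ElasticsearchDocumentStore
+
+# Store the prepared dictionaries so they can be retrieved later
+document_store = ElasticsearchDocumentStore()
+document_store.write_documents(docs)
+```
+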
+
+## File Conversion
+
+There is a range of different file converters in Haystack that
+can extract text from files and cast them into the unified dictionary format shown above.
+Haystack features support for txt, pdf and docx files and there is even a converter that leverages Apache Tika.
+Please refer to [the API docs](/reference/file-converters) to see which converter best suits you.
+
+#### PDF
+
+```python
+from haystack.file_converter import PDFToTextConverter
+
+converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["de", "en"])
+doc = converter.convert(file_path=file, meta=None)
+
+# Alternatively, if you have a PDF containing images, Haystack uses tesseract under the hood to OCR image PDFs.
+from haystack.file_converter import PDFToTextOCRConverter
+
+converter = PDFToTextOCRConverter(remove_numeric_tables=False, valid_languages=["deu", "eng"])
+doc = converter.convert(file_path=file, meta=None)
+```
+
+#### DOCX
+
+```python
+from haystack.file_converter import DocxToTextConverter
+
+converter = DocxToTextConverter(remove_numeric_tables=True, valid_languages=["de", "en"])
+doc = converter.convert(file_path=file, meta=None)
+```
+
+#### From a Directory
+
+Haystack also has a `convert_files_to_dicts()` utility function that will convert all txt or pdf files in a given folder into this dictionary format.
+
+```python
+from haystack.preprocessor.utils import convert_files_to_dicts
+
+docs = convert_files_to_dicts(dir_path=doc_dir)
+```
+
+#### Image
+
+Haystack supports extraction of text from images using OCR.
+
+```python
+from haystack.file_converter import ImageToTextConverter
+
+converter = ImageToTextConverter(remove_numeric_tables=True, valid_languages=["de", "en"])
+doc = converter.convert(file_path=file, meta=None)
+```
+
+ +## Web Crawler + +In Haystack, you will find a web crawler that will help you scrape text from websites and save it to file. +See the [API documentation](https://haystack.deepset.ai/reference/crawler) for more details. + +```python +from haystack.connector import Crawler + +crawler = Crawler(output_dir="crawled_files") +docs = crawler.crawl( + urls=["https://haystack.deepset.ai/overview/get-started"], + filter_urls=["haystack"], + crawler_depth=1 +) +``` + +
+
+## PreProcessor
+
+While each of the above conversion methods produces documents that are already in the format expected by the Document Store,
+it is recommended that they are further processed in order to ensure optimal Retriever and Reader performance.
+The `PreProcessor` takes one of the documents created by the converter as input,
+performs various cleaning steps and splits it into multiple smaller documents.
+
+For suggestions on how best to split your documents, see [Optimization](/guides/optimization).
+
+```python
+from haystack.preprocessor import PreProcessor
+
+doc = converter.convert(file_path=file, meta=None)
+processor = PreProcessor(
+    clean_empty_lines=True,
+    clean_whitespace=True,
+    clean_header_footer=True,
+    split_by="word",
+    split_length=200,
+    split_respect_sentence_boundary=True,
+    split_overlap=0
+)
+docs = processor.process(doc)
+```
+
+- `clean_empty_lines` will normalize 3 or more consecutive empty lines to be just two empty lines
+- `clean_whitespace` will remove any whitespace at the beginning or end of each line in the text
+- `clean_header_footer` will remove any long header or footer texts that are repeated on each page
+- `split_by` determines what unit the document is split by: `'word'`, `'sentence'` or `'passage'`
+- `split_length` sets a maximum number of `'word'`, `'sentence'` or `'passage'` units per output document
+- `split_respect_sentence_boundary` ensures that document boundaries do not fall in the middle of sentences
+- `split_overlap` sets the amount of overlap between two adjacent documents after a split. Setting this to a positive number essentially enables the sliding window approach.
diff --git a/docs/v0.10.0/components/query_classifier.mdx b/docs/v0.10.0/components/query_classifier.mdx
new file mode 100644
index 000000000..9eab63625
--- /dev/null
+++ b/docs/v0.10.0/components/query_classifier.mdx
@@ -0,0 +1,178 @@
+# Query Classifier
+
+Queries come in all shapes and forms. A keyword-based search differs from a question posed in natural language. In Haystack, we can account for these differences by integrating a special node into our QA pipeline: the query classifier.
+
+A query classifier puts each incoming query into one of two predefined classes, and routes it to the appropriate section of the pipeline.
+Haystack comes with classifiers to distinguish between the three most common query types (Keywords, Question, Statement) and allows two different types of models (Sklearn and Transformer).
+
+Using a query classifier can potentially yield the following benefits:
+
+* Getting better search results (e.g. by routing only proper questions to DPR / QA branches and not keyword queries)
+* Lower GPU costs (e.g. if 50% of your traffic consists of keyword queries, you could handle those with Elasticsearch alone and save the GPU resources for the other 50% of traffic with semantic queries)
+
+
+## Common Query Types
+
+#### 1. Keyword Queries:
+Such queries have little semantic meaning; they merely consist of keywords, and the order of words does not matter:
+* arya stark father
+* jon snow country
+* arya stark younger brothers
+
+
+#### 2. Questions (Interrogative Queries):
+In such queries, users ask a question in a complete, "natural" sentence. Regardless of whether the query contains a "?", the goal here is to detect whether the user's intent is to ask a question:
+
+* who is the father of arya stark?
+* which country was jon snow filmed in
+* who are the younger brothers of arya stark?
+
+#### 3. Statements (Declarative Queries):
+Such queries also consist of a regular, natural sentence with semantic relations between the words. However, they are statements rather than questions:
+
+* Arya stark was a daughter of a lord.
+* Show countries that Jon snow was filmed in.
+* List all brothers of Arya.
+
+ +## Stand-alone Usage +To test how a query classifier works before integrating it into a pipeline, you can run it just as an individual component: + +```python +from haystack.pipeline import TransformersQueryClassifier + +queries = ["Arya Stark father","Jon Snow UK", + "who is the father of arya stark?","Which country was jon snow filmed in?"] + +question_classifier = TransformersQueryClassifier(model_name_or_path="shahrukhx01/bert-mini-finetune-question-detection") +# Or Sklearn based: + +for query in queries: + result = question_classifier.run(query=query) + if result[1] == "output_1": + category = "question" + else: + category = "keywords" + + print(f"Query: {query}, raw_output: {result}, class: {category}") + +# Returns: +# Query: Arya Stark father, raw_output: ({'query': 'Arya Stark father'}, 'output_2'), class: keywords +# Query: Jon Snow UK, raw_output: ({'query': 'Jon Snow UK'}, 'output_2'), class: keywords +# Query: who is the father of arya stark?, raw_output: ({'query': 'who is the father of arya stark?'}, 'output_1'), class: question +# Query: Which country was jon snow filmed in?, raw_output: ({'query': 'Which country was jon snow filmed in?'}, 'output_1'), class: question + +``` +Note how the node returns two objects: the query (e.g.'Arya Stark father') and the name of the output edge (e.g. "output_2"). This information can be leveraged in a pipeline for routing the query to the next node. + +
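+The Sklearn-based classifier mentioned in the comment above can be run in the same stand-alone fashion; a sketch using the pretrained gradient boosting model listed further down this page (the import path mirrors the Transformers classifier and is an assumption):
+
+```python
+from haystack.pipeline import SklearnQueryClassifier
+
+sklearn_classifier = SklearnQueryClassifier(
+    query_classifier="https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/model.pickle",
+    query_vectorizer="https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/vectorizer.pickle"
+)
+
+# Keyword queries are routed to output_2, questions/statements to output_1
+result = sklearn_classifier.run(query="arya stark father")
+print(result)
+```
+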
+
+## Route Queries to Different Retrievers
+
+You can use a Query Classifier within a pipeline as a "decision node". Depending on the output of the classifier, other parts of the pipeline will be executed. For example, we can route keyword queries to an ElasticsearchRetriever and semantic queries (questions/statements) to DPR.
+
+![image](https://user-images.githubusercontent.com/6007894/127831511-f55bad86-4b4f-4b54-9889-7bba37e475c6.png)
+
+Below, we define a pipeline with a `TransformersQueryClassifier` that routes questions/statements to the node's `output_1` and keyword queries to `output_2`. We leverage this structure in the pipeline by connecting the DPRRetriever to `QueryClassifier.output_1` and the ESRetriever to `QueryClassifier.output_2`.
+
+```python
+from haystack.pipeline import TransformersQueryClassifier, Pipeline
+from haystack.utils import print_answers
+
+query_classifier = TransformersQueryClassifier(model_name_or_path="shahrukhx01/bert-mini-finetune-question-detection")
+
+pipe = Pipeline()
+pipe.add_node(component=query_classifier, name="QueryClassifier", inputs=["Query"])
+pipe.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_1"])
+pipe.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_2"])
+
+# Pass a question -> run DPR
+res_1 = pipe.run(query="Who is the father of Arya Stark?")
+
+# Pass keywords -> run the ElasticsearchRetriever
+res_2 = pipe.run(query="arya stark father")
+
+```
+
+## Run QA on Proper Questions Only
+
+If you add QA to an existing search system, it can make sense to only use it for real questions that come in and keep a basic document search with Elasticsearch for the remaining keyword queries. You can use a Query Classifier to build such a hybrid pipeline:
+
+```python
+from haystack.pipeline import TransformersQueryClassifier, Pipeline
+from haystack.utils import print_answers
+
+query_classifier = TransformersQueryClassifier(model_name_or_path="shahrukhx01/question-vs-statement-classifier")
+
+pipe = Pipeline()
+pipe.add_node(component=query_classifier, name="QueryClassifier", inputs=["Query"])
+pipe.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_1"])
+pipe.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_2"])
+pipe.add_node(component=reader, name="QAReader", inputs=["DPRRetriever"])
+
+# Pass a question -> run DPR + QA -> return answers
+res_1 = pipe.run(query="Who is the father of Arya Stark?")
+
+# Pass keywords -> run only ElasticsearchRetriever -> return docs
+res_2 = pipe.run(query="arya stark father")
+
+```
+
+
+## Which models are available?
+The transformer classifier is more accurate than the Sklearn classifier as it can use the context and order of words. However, it requires more memory and most probably a GPU for faster inference. You can mitigate those downsides by choosing a very small transformer model. The default models we trained use a mini BERT architecture which is only about `50 MB` in size and allows relatively fast inference on CPU.
+
+ +#### Transformers +Pass your own `Transformer` binary classification model from file/huggingface or use one of the following pretrained ones hosted on Huggingface: +1) Keywords vs. Questions/Statements (Default) + + ```python + TransformersQueryClassifier(model_name_or_path="shahrukhx01/bert-mini-finetune-question-detection") + # output_1 => question/statement + # output_2 => keyword query + ``` + + [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/readme.txt) + + +2) Questions vs. Statements + ```python + TransformersQueryClassifier(model_name_or_path="shahrukhx01/question-vs-statement-classifier") + # output_1 => question + # output_2 => statement + ``` + + [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/readme.txt) + +
+
+#### Sklearn
+Pass your own `Sklearn` binary classification model or use one of the following pretrained gradient boosting models:
+
+1) Keywords vs. Questions/Statements (Default)
+
+   ```python
+   SklearnQueryClassifier(query_classifier = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/model.pickle",
+                          query_vectorizer = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/vectorizer.pickle")
+
+   # output_1 => question/statement
+   # output_2 => keyword query
+   ```
+   [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/readme.txt)
+
+
+2) Questions vs. Statements
+
+   ```python
+   SklearnQueryClassifier(query_classifier = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/model.pickle",
+                          query_vectorizer = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/vectorizer.pickle")
+
+   # output_1 => question
+   # output_2 => statement
+   ```
+   [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/readme.txt)
\ No newline at end of file
diff --git a/docs/v0.10.0/components/question_generator.mdx b/docs/v0.10.0/components/question_generator.mdx
new file mode 100644
index 000000000..f23b4f844
--- /dev/null
+++ b/docs/v0.10.0/components/question_generator.mdx
@@ -0,0 +1,69 @@
+# Question Generator
+
+Question Answering systems are trained to find an answer given a question and a document;
+but with the recent advances in generative NLP, there are now models that can read a document
+and suggest questions that can be answered by that document.
+All this power is available to you now via the `QuestionGenerator` class.
+
+`QuestionGenerator` models can be trained using Question Answering datasets.
+Instead of predicting answers, the `QuestionGenerator` takes the document as input and is trained to output the questions.
+
+
+**Note:** The `QuestionGenerator` is different from the `Generator`.
+The `QuestionGenerator` receives only documents as input and returns questions as output,
+while the `Generator` class is an alternative to the `Reader`.
+It takes a question and documents as input and returns an answer.
+
+
+## Stand-Alone Usage
+
+``` python
+from haystack.question_generator import QuestionGenerator
+
+text = """Python is an interpreted, high-level, general-purpose programming language. Created by Guido van Rossum
+and first released in 1991, Python's design philosophy emphasizes code
+readability with its notable use of significant whitespace."""
+
+qg = QuestionGenerator()
+result = qg.generate(text)
+```
+
+The output will look like this:
+
+``` python
+[' Who created Python?',
+ ' When was Python first released?',
+ " What is Python's design philosophy?"]
+```
+
+## Ready-Made Pipelines
+
+In Haystack, there are two pipeline configurations that are already encapsulated in their own classes:
+- `QuestionGenerationPipeline`
+- `QuestionAnswerGenerationPipeline`
+
+Have a look at our [ready-made pipelines page](/components/ready-made-pipelines) to learn more about them.
+Check out the question generation [tutorial](/tutorials/question-generation) to start using them.
+
+## Use Case: Auto-Suggested Questions
+
+Generated questions can help users get closer to the information that they are looking for.
+Search engines now present auto-suggested questions alongside your top search results and even present suggested answers.
+It is possible to build this same functionality in Haystack using the `QuestionGenerator`.
+
+After your `Retriever` has returned some candidate documents, you can run the `QuestionGenerator` to suggest more answerable questions.
+By presenting these generated questions to your users, you can give them a sense of other facts and topics that are present in the documents.
+You can go even one step further by predicting answers to these questions with a `Reader` or `Generator`.
+
+## Use Case: Human in the Loop Annotation
+
+A `QuestionGenerator` can enable different annotation workflows.
+For example, given a text corpus, you could use the `QuestionGenerator` to create questions,
+and then use a `Reader` to predict answers.
+
+Correct QA pairs created in this manner might not be so effective in retraining your `Reader` model.
+However, correcting wrong QA pairs creates training samples that your model found challenging.
+These examples are likely to be impactful when it comes to retraining.
+This is also a quicker workflow than having annotators generate both question and answer.
diff --git a/docs/v0.10.0/components/ranker.mdx b/docs/v0.10.0/components/ranker.mdx
new file mode 100644
index 000000000..cc72313d7
--- /dev/null
+++ b/docs/v0.10.0/components/ranker.mdx
@@ -0,0 +1,50 @@
+# Ranker
+
+There are pure "semantic document search" use cases that do not need question answering functionality but only document ranking.
+While the [Retriever](/components/retriever) is a perfect fit for document retrieval, we can further improve its results with the Ranker.
+For example, BM25 (sparse retriever) does not take into account the semantics of the documents and the query but only their keywords.
+The Ranker can re-rank the results of the retriever step by taking semantics into account.
+Similar to the Reader, it is based on the latest language models.
+Instead of returning answers, it returns documents in re-ranked order.
+
+Without a Ranker and its re-ranking step, the querying process is faster but the query results might be of lower quality.
+If you want to do "semantic document search" instead of question answering, first try a Retriever only.
+In case the semantic similarity of the query and the resulting documents is low, add a Ranker.
+
+Note that a Ranker needs to be initialised with a model trained on a text pair classification task.
+You can also train the model with the Ranker's train() method.
+Alternatively, [this example](https://github.com/deepset-ai/FARM/blob/master/examples/text_pair_classification.py) shows how to train a text pair classification model in FARM.
+
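+As a rough sketch of in-Haystack training (assuming a `train()` signature analogous to the Reader's, with a data directory and a file of text pair classification samples; the paths are placeholders):
+
+```python
+from haystack.ranker import FARMRanker
+
+# Start from a plain language model and train it for text pair classification
+ranker = FARMRanker(model_name_or_path="bert-base-cased")
+ranker.train(
+    data_dir="data/text_pair_classification",
+    train_filename="train.tsv",
+    save_dir="saved_models/my_ranker"
+)
+```
+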
+
+## FARMRanker
+
+### Description
+
+The FARMRanker consists of a Transformer-based model for document re-ranking using the TextPairClassifier of [FARM](https://github.com/deepset-ai/FARM).
+Given a text pair of query and passage, the TextPairClassifier predicts label "1" if the pair is similar or label "0" if they are dissimilar (accompanied by a probability).
+While the underlying model can vary (BERT, Roberta, DistilBERT, ...), the interface remains the same.
+With a FARMRanker, you can:
+
+- Directly get predictions (a re-ranked version of the supplied list of Documents) via predict() if supplying a pre-trained model
+- Take a plain language model (e.g. `bert-base-cased`) and train it for TextPairClassification via train()
+
+ +### Initialisation + +```python +from haystack.document_store import ElasticsearchDocumentStore +from haystack.retriever import ElasticsearchRetriever +from haystack.ranker import FARMRanker +from haystack import Pipeline + +document_store = ElasticsearchDocumentStore() +... +retriever = ElasticsearchRetriever(document_store) +ranker = FARMRanker(model_name_or_path="saved_models/roberta-base-asnq-binary") +... +p = Pipeline() +p.add_node(component=retriever, name="ESRetriever", inputs=["Query"]) +p.add_node(component=ranker, name="Ranker", inputs=["ESRetriever"]) +``` diff --git a/docs/v0.10.0/components/reader.mdx b/docs/v0.10.0/components/reader.mdx new file mode 100644 index 000000000..4cc5ef747 --- /dev/null +++ b/docs/v0.10.0/components/reader.mdx @@ -0,0 +1,357 @@ +# Reader + +The Reader, also known as Open-Domain QA systems in Machine Learning speak, +is the core component that enables Haystack to find the answers that you need. +Haystack’s Readers are: + +- built on the latest transformer based language models + +- strong in their grasp of semantics + +- sensitive to syntactic structure + +- state-of-the-art in QA tasks like SQuAD and Natural Questions + + +
+
+#### FARM
+
+```python
+from haystack.reader import FARMReader
+
+model = "deepset/roberta-base-squad2"
+reader = FARMReader(model, use_gpu=True)
+```
+
+#### Transformers
+
+```python
+from haystack.reader import TransformersReader
+
+model = "deepset/roberta-base-squad2"
+reader = TransformersReader(model, use_gpu=1)
+```
+
+While these models can work on CPU, it is recommended that they are run using GPUs to keep query times low.
+
+
+## Choosing the Right Model
+
+In Haystack, you can start using pretrained QA models simply by providing the model's HuggingFace Model Hub name to the Reader.
+The loading of model weights is handled by Haystack,
+and you have the option of using the QA pipeline from deepset FARM or HuggingFace Transformers (see FARM vs Transformers for details).
+
+Currently, there are a lot of different models out there and it can be rather overwhelming trying to pick the one that fits your use case.
+To get you started, we have a few recommendations for you to try out.
+
+
+#### FARM
+
+**RoBERTa (base)**: An optimised variant of BERT and a great starting point.
+
+```python
+from haystack.reader import FARMReader
+reader = FARMReader("deepset/roberta-base-squad2")
+```
+
+- Pro: Strong all round model
+- Con: There are other models that are either faster or more accurate
+
+**MiniLM**: A cleverly distilled model that sacrifices a little accuracy for speed.
+
+```python
+from haystack.reader import FARMReader
+reader = FARMReader("deepset/minilm-uncased-squad2")
+```
+
+- Pro: Inference speed up to 50% faster than BERT base
+- Con: Still doesn’t match the best base sized models in accuracy
+
+**ALBERT (XXL)**: Large, powerful, SotA model.
+
+```python
+from haystack.reader import FARMReader
+reader = FARMReader("ahotrod/albert_xxlargev1_squad2_512")
+```
+
+- Pro: Better accuracy than any other open source model in QA
+- Con: The computational power needed makes it impractical for most use cases
+
+#### Transformers
+
+**RoBERTa (base)**: An optimised variant of BERT and a great starting point.
+
+```python
+from haystack.reader import TransformersReader
+reader = TransformersReader("deepset/roberta-base-squad2")
+```
+
+- Pro: Strong all round model
+- Con: There are other models that are either faster or more accurate
+
+**MiniLM**: A cleverly distilled model that sacrifices a little accuracy for speed.
+
+```python
+from haystack.reader import TransformersReader
+reader = TransformersReader("deepset/minilm-uncased-squad2")
+```
+
+- Pro: Inference speed up to 50% faster than BERT base
+- Con: Still doesn’t match the best base sized models in accuracy
+
+**ALBERT (XXL)**: Large, powerful, SotA model.
+
+```python
+from haystack.reader import TransformersReader
+reader = TransformersReader("ahotrod/albert_xxlargev1_squad2_512")
+```
+
+- Pro: Better accuracy than any other open source model in QA
+- Con: The computational power needed makes it impractical for most use cases
+
+ +
+ +**Recommendations:** + +**All-rounder**: In the class of base sized models trained on SQuAD, **RoBERTa** has shown better performance than BERT +and can be capably handled by any machine equipped with a single NVidia V100 GPU. +We recommend this as the starting point for anyone wanting to create a performant and computationally reasonable instance of Haystack. + +**Built for Speed**: If speed and GPU memory are more of a priority to you than accuracy, +you should try the MiniLM model. +It is a smaller model that is trained to mimic larger models through the distillation process, +and it outperforms the BERT base on SQuAD even though it is about 40% smaller. + + + +**State of the Art Accuracy**: For most, **ALBERT XXL** will be too large to feasibly work with. +But if performance is your sole concern, and you have the computational resources, +you might like to try ALBERT XXL which has set SoTA performance on SQuAD 2.0. + + + +
+ +
+
+## Confidence Scores
+
+When printing the full results of a Reader,
+you will see that each prediction is accompanied
+by a value in the range of 0 to 1 reflecting the model's confidence in that prediction.
+
+In the output of `print_answers()`, you will find the model's confidence score in a dictionary key called `score`.
+
+```python
+from haystack.utils import print_answers
+
+print_answers(prediction, details="all")
+```
+
+```python
+{
+    'answers': [
+        {   'answer': 'Eddard',
+            'context': 's Nymeria after a legendary warrior queen. '
+                       'She travels with her father, Eddard, to '
+                       "King's Landing when he is made Hand of the "
+                       'King. Before she leaves,',
+            'score': 0.9899835586547852,
+            ...
+        },
+    ]
+}
+```
+
+The intuition behind this score is the following: if a model has on average a confidence score of 0.9, that means we can expect the model's predictions to be correct in about 9 out of 10 cases.
+However, if the model's training data strongly differs from the data it needs to make predictions on, we cannot guarantee that the confidence score and the model's accuracy are well aligned.
+In order to better align this confidence score with the model's accuracy, finetuning needs to be performed
+on a specific dataset.
+To this end, the reader has a method `calibrate_confidence_scores(document_store, device, label_index, doc_index, label_origin)`.
+The parameters of this method are the same as for the `eval()` method because the calibration of confidence scores is performed on a dataset that comes with gold labels.
+The calibration calls the `eval()` method internally and therefore needs a DocumentStore containing labeled questions and evaluation documents.
+
+Have a look at this [FARM tutorial](https://github.com/deepset-ai/FARM/blob/master/examples/question_answering_confidence.py)
+to see how to compare calibrated confidence scores with uncalibrated confidence scores within FARM.
+Note that a finetuned confidence score is specific to the domain that it is finetuned on.
+There is no guarantee that this performance can transfer to a new domain.
+
+Having a confidence score is particularly useful in cases where you need Haystack to work with a certain accuracy threshold.
+Many of our users have built systems where predictions below a certain confidence value are routed
+on to a fallback system.
+
+
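+Returning to calibration: a sketch of a `calibrate_confidence_scores()` call using the parameters listed above (the index names are placeholders and must point to a DocumentStore that already holds your gold labels and evaluation documents):
+
+```python
+# Align the reader's confidence scores with its observed accuracy on a labeled dataset
+reader.calibrate_confidence_scores(
+    document_store=document_store,
+    device="cuda",
+    label_index="label",
+    doc_index="eval_docs"
+)
+```
+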
+ +## Deeper Dive: FARM vs Transformers + +Apart from the **model weights**, Haystack Readers contain all the components found in end-to-end open domain QA systems. +This includes **tokenization**, **embedding computation**, **span prediction** and **candidate aggregation**. +While the handling of model weights is the same between the FARM and Transformers libraries, their QA pipelines differ in some ways. +The major points are: + +- The **TransformersReader** will sometimes predict the same span twice while duplicates are removed in the **FARMReader** + +- The **FARMReader** currently uses the tokenizers from the HuggingFace Transformers library while the **TransformersReader** uses the tokenizers from the HuggingFace Tokenizers library + +- Start and end logits are normalized per passage and multiplied in the **TransformersReader** while they are summed and not normalised in the **FARMReader** + +If you’re interested in the finer details of these points, have a look at [this](https://github.com/deepset-ai/haystack/issues/248#issuecomment-661977237) GitHub comment. + +We see value in maintaining both kinds of Readers since Transformers is a very familiar library to many of Haystack’s users +but we at deepset can more easily update and optimise the FARM pipeline for speed and performance. + + + +Haystack also has a close integration with FARM which means that you can further fine-tune your Readers on labelled data using a FARMReader. +See our tutorials for an end-to-end example or below for a shortened example. + +```python +from haystack.reader import FARMReader + +# Initialise Reader +model = "deepset/roberta-base-squad2" +reader = FARMReader(model) + +# Perform finetuning +train_data = "PATH/TO_YOUR/TRAIN_DATA" +train_filename = "train.json" +save_dir = "finetuned_model" +reader.train(train_data, train_filename, save_dir=save_dir) + +# Load +finetuned_reader = FARMReader(save_dir) +``` + +
+ +## Deeper Dive: From Language Model to Haystack Reader + +Language models form the core of most modern NLP systems and that includes the Readers in Haystack. +They build a general understanding of language when performing training tasks such as Masked Language Modeling or Replaced Token Detection +on large amounts of text. +Well trained language models capture the word distribution in one or more languages +but more importantly, convert input text into a set of word vectors that capture elements of syntax and semantics. + +In order to convert a language model into a Reader model, it needs first to be trained on a Question Answering dataset. +To do so requires the addition of a question answering prediction head on top of the language model. +The task can be thought of as a token classification task where every input token is assigned a probability of being +either the start or end token of the correct answer. +In cases where the answer is not contained within the passage, the prediction head is also expected to return a `no_answer` prediction. + + + +Since language models are limited in the number of tokens which they can process in a single forward pass, +a sliding window mechanism is implemented to handle variable length documents. +This functions by slicing the document into overlapping passages of (approximately) `max_seq_length` +that are each offset by `doc_stride` number of tokens. +These can be set when the Reader is initialized. + + +
+
+FARM:
+
+```python
+from haystack.reader import FARMReader
+
+reader = FARMReader(... max_seq_len=384, doc_stride=128 ...)
+```
+
+Transformers:
+
+```python
+from haystack.reader import TransformersReader
+
+reader = TransformersReader(... max_seq_len=384, doc_stride=128 ...)
+```
+
+Predictions are made on each individual passage and the process of aggregation picks the best candidates across all passages.
+If you’d like to learn more about what is happening behind the scenes, have a look at [this](https://medium.com/deepset-ai/modern-question-answering-systems-explained-4d0913744097) article.
diff --git a/docs/v0.10.0/components/ready_made_pipelines.mdx b/docs/v0.10.0/components/ready_made_pipelines.mdx
new file mode 100644
index 000000000..c2a650773
--- /dev/null
+++ b/docs/v0.10.0/components/ready_made_pipelines.mdx
@@ -0,0 +1,241 @@
+# Ready-Made Pipelines
+
+ +**Note:** These ready-made Pipelines replace the `Finder` class which is now deprecated. + +
+
+Haystack Pipelines chain together various Haystack components to build a search system. Haystack comes with a number of predefined pipelines that fit most standard search patterns, allowing you to build a QA system in no time.
+
+## ExtractiveQAPipeline
+
+Extractive QA is the task of searching through a large collection of documents for a span of text that answers a question. The `ExtractiveQAPipeline` combines the Retriever and the Reader such that:
+- The [Retriever](/components/retriever) combs through a database and returns only the documents that it deems to be the most relevant to the query.
+- The [Reader](/components/reader) accepts the documents returned by the Retriever and selects a text span as the answer to the query.
+
+```python
+from haystack.pipeline import ExtractiveQAPipeline
+
+pipeline = ExtractiveQAPipeline(reader, retriever)
+
+query = "What is Hagrid's dog's name?"
+result = pipeline.run(query=query, params={"retriever": {"top_k": 10}, "reader": {"top_k": 1}})
+```
+
+The output of the pipeline is a Python dictionary with answers stored under the `answers` key. The output provides additional information such as the context from which the answer was extracted and the model’s confidence in the accuracy of the extracted answer.
+
+```python
+result["answers"]
+>>> [{ 'answer': 'Fang',
+       'score': 13.26807975769043,
+       'probability': 0.9657130837440491,
+       'context': """Криволапик (Kryvolapyk, kryvi lapy "crooked paws")
+                  ===Fang (Hagrid's dog)===
+                  *Chinese (PRC): 牙牙 (ya2 ya) (from 牙 "tooth", 牙,"""
+...}]
+```
+
+For more examples that showcase `ExtractiveQAPipeline`, check out one of our tutorials [here](/tutorials/first-qa-system) or [here](/tutorials/without-elasticsearch).
+
+## DocumentSearchPipeline
+
+We typically pass the output of the Retriever to another component such as the Reader or the Generator. However, we can use the Retriever by itself for semantic document search to find the documents most relevant to our query.
+
+`DocumentSearchPipeline` wraps the [Retriever](/components/retriever) into a pipeline. Note that this wrapper does not endow the Retrievers with additional functionality but instead allows them to be used consistently with other Haystack Pipeline objects and with the same familiar syntax. Creating this pipeline is as simple as passing the Retriever into the pipeline’s constructor:
+
+``` python
+from haystack.pipeline import DocumentSearchPipeline
+
+pipeline = DocumentSearchPipeline(retriever=retriever)
+
+query = "Tell me something about that time when they play chess."
+result = pipeline.run(query=query, params={"retriever": {"top_k": 2}})
+```
+
+The pipeline returns a Python dictionary with the retrieved documents accessible via the `documents` key:
+
+```python
+result["documents"]
+>>> [{'text': "used Seamus Finnigan's pieces, which offered him conflicting advice because they knew that he was not a good or experienced player. Murphy McNully and Barnaby LeeDuring the Christmas holidays in the 1984-1985 school year, Rowan Khanna got a new Wizard's Chess set from their parents...",
+      'score': 24.10395,
+      'probability': 0.9531577010470198,
+...}]
+```
+
+## GenerativeQAPipeline
+
+Unlike extractive QA, which produces an answer by extracting a text span from a collection of passages, generative QA works by producing free text answers that need not correspond to a span of any document. Because the answers are not constrained by text spans, the Generator is able to create answers that are more appropriately worded compared to those extracted by the Reader.
Therefore, it makes sense to employ a generative QA system if you expect answers to extend over multiple text spans, or if you expect answers to not be contained verbatim in the documents. + +`GenerativeQAPipeline` combines the [Retriever](/components/retriever) with the [Generator](/components/generator). To create an answer, the Generator uses the internal factual knowledge stored in the language model’s parameters in addition to the external knowledge provided by the Retriever’s output. + +You can build a `GenerativeQAPipeline` by simply placing the individual components inside the pipeline’s constructor: + +```python +pipeline = GenerativeQAPipeline(generator=generator, retriever=retriever) + +result = pipeline.run(query="Who opened the Chamber of Secrets?", params={"retriever": {"top_k": 10}, "generator": {"top_k": 1}}) +``` + +You can access the answer via the `answers` key: + +```python +result["answers"] +>>> [{'query': 'Who opened the Chamber of Secrets?', + 'answer': ' tom riddle', +...}] +``` + +For more examples on using `GenerativeQAPipeline`, check out our tutorials where we implement generative QA systems with [RAG](/tutorials/retrieval-augmented-generation ) and [LFQA](/tutorials/lfqa). + +## SearchSummarizationPipeline + +Summarizer helps make sense of the Retriever’s output by creating a summary of the retrieved documents. This is useful for performing a quick sanity check and confirming the quality of candidate documents suggested by the Retriever, without having to inspect each document individually. + +`SearchSummarizationPipeline` combines the [Retriever](/components/retriever) with the [Summarizer](/components/summarizer). Below is an example of an implementation. + +```python +pipeline = SearchSummarizationPipeline(summarizer=summarizer, retriever=retriever, generate_single_summary=True) + +result = pipeline.run(query="Describe Luna Lovegood.", params={"retriever": {"top_k": 5}}) +``` + + +You can access the output via the `documents` key. Depending on whether you set the `generate_single_summary` to `True` or `False`, the output will either be a single summary of all documents or one summary per document. + +```python +result['documents'] +>>> [{'text': "Luna Lovegood is the only known member of the Lovegood family whose first name is not of Greek origin, rather it is of Latin origin. Her nickname, 'Loony,' refers to the moon and its ties with insanity, as it is short for 'lunatic' she is the goddess of the moon, hunting, the wilderness and the gift of taming wild animals.", +...}] +``` + +## TranslationWrapperPipeline + +Translator components bring the power of machine translation into your QA systems. Say your knowledge base is in English but the majority of your user base speaks German. With a `TranslationWrapperPipeline`, you can chain together: + +- The [Translator](/components/translator), which translates a query source into a target language (e.g. German into English) +- A search pipeline such as ExtractiveQAPipeline or DocumentSearchPipeline, which executes the translated query against a knowledge base. +- Another Translator that translates the search pipeline's results from the target back into the source language (e.g. English into German) + +After wrapping your search pipeline between two translation nodes, you can query it like you normally would, that is, by calling the `run()` method with a query in the desired language. 
Here’s an example of an implementation:
+
+```python
+pipeline = TranslationWrapperPipeline(input_translator=de_en_translator,
+                                      output_translator=en_de_translator,
+                                      pipeline=extractive_qa_pipeline)
+
+query = "Was lässt den dreiköpfigen Hund weiterschlafen?"  # What keeps the three-headed dog asleep?
+
+result = pipeline.run(query=query, params={"retriever": {"top_k": 10}, "reader": {"top_k": 1}})
+```
+
+You may access the answer and other information like the model’s confidence and original context via the `answers` key, in this manner:
+
+``` python
+result["answers"]
+>>> [{'answer': 'der Klang der Musik',
+      'score': 9.269367218017578,
+      'probability': 0.4444255232810974,
+      'context': "test weakness was the inability to resist falling asleep to the sound of music,"
+      ...}]
+```
+
+## FAQPipeline
+
+FAQPipeline wraps the [Retriever](/components/retriever) into a pipeline and allows it to be used for question answering with FAQ data. Compared to other types of question answering, FAQ-style QA is significantly faster. However, it’s only able to answer FAQ-type questions because this type of QA matches queries against questions that already exist in your FAQ documents.
+
+For this task, we recommend using the Embedding Retriever with a sentence similarity model such as `sentence-transformers/all-MiniLM-L6-v2`. Here’s an example of an FAQPipeline in action:
+
+```python
+pipeline = FAQPipeline(retriever=retriever)
+query = "How to reduce stigma around Covid-19?"
+result = pipeline.run(query=query, params={"retriever": {"top_k": 1}})
+```
+
+```python
+result["answers"]
+>>> [{'answer': 'People can fight stigma and help, not hurt, others by providing social support. Counter stigma by learning and sharing facts. Communicating the facts that viruses do not target specific racial or ethnic groups and how COVID-19 actually spreads can help stop stigma.',
+...}]
+```
+
+Check out our [tutorial](/tutorials/existing-faqs) for more information on FAQPipeline.
+
+## QuestionGenerationPipeline
+
+The most basic version of a question generator pipeline takes a document as input and outputs generated questions
+which the document can answer.
+
+``` python
+from haystack.schema import Document
+
+text1 = "Python is an interpreted, high-level, general-purpose programming language. Created by Guido van Rossum and first released in 1991, Python's design philosophy emphasizes code readability with its notable use of significant whitespace."
+document = Document(text1)
+
+question_generation_pipeline = QuestionGenerationPipeline(question_generator)
+result = question_generation_pipeline.run(documents=[document])
+print(result)
+```
+Output:
+``` python
+[' Who created Python?',
+ ' When was Python first released?',
+ " What is Python's design philosophy?"]
+```
+
+## QuestionAnswerGenerationPipeline
+
+This pipeline takes a document as input, generates questions on it, and attempts to answer these questions using a Reader model.
+
+``` python
+qag_pipeline = QuestionAnswerGenerationPipeline(question_generator, reader)
+result = qag_pipeline.run(document=document)
+print(result)
+```
+Output:
+``` python
+{
+    ...
+    'query_doc_list': [{'docs': [{'text': "Python is an interpreted, high-level, general-purpose programming language. Created by Guido van Rossum and first ...", ...}],
+                        'queries': ' Who created Python?'},
+                       ...],
+    'results': [{'answers': [{'answer': 'Guido van Rossum',
+                              'context': 'eted, high-level, general-purpose '
+                                         'programming language. 
Created by Guido ' + 'van Rossum and first released in 1991, ' + "Python's design philosophy emphasizes ", + 'document_id': '2ce1de1b4d6dd8e4564795c955e0b356', + 'offset_end': 83, + 'offset_end_in_doc': 103, + 'offset_start': 67, + 'offset_start_in_doc': 87, + 'score': 0.9960587024688721}], + 'no_ans_gap': 15.335145950317383, + 'query': ' Who created Python?'}, + ... + ], + ... + } +``` + +## MostSimilarDocumentsPipeline + +This pipeline is used to find the most similar documents to a given document in your document store. + +You will need to first make sure that your indexed documents have attached embeddings. +You can generate and store their embeddings using the `DocumentStore.update_embeddings()` method. + +``` python +from haystack.pipeline import MostSimilarDocumentsPipeline + +msd_pipeline = MostSimilarDocumentsPipeline(document_store) +result = msd_pipeline.run(document_ids=[doc_id1, doc_id2, ...]) +print(result) +``` + +Output: + +``` python +[[ + {'text': "Southern California's economy is diver...", + 'score': 0.8605178832348279, + 'question': None, + 'meta': {'name': 'Southern_California'}, + 'embedding': ..., + 'id': '6e26b1b78c48efc6dd6c888e72d0970b'}, + ... +]] +``` \ No newline at end of file diff --git a/docs/v0.10.0/components/retriever.mdx b/docs/v0.10.0/components/retriever.mdx new file mode 100644 index 000000000..cd1d5f144 --- /dev/null +++ b/docs/v0.10.0/components/retriever.mdx @@ -0,0 +1,286 @@ +# Retriever + +The Retriever is a lightweight filter that can quickly go through the full document store and pass on a set of candidate documents that are relevant to the query. +When used in combination with a Reader, it is a tool for sifting out the obvious negative cases, saving the Reader from doing more work than it needs to and speeding up the querying process. + +
+ +**Recommendations** + +- BM25 (sparse) + +- Dense Passage Retrieval (dense) + +
+ + + + +Note that not all Retrievers can be paired with every DocumentStore. +Here are the combinations which are supported: + +| | Memory | Elasticsearch | SQL | FAISS | Milvus | +| --------- | ------ | ------------- | --- | ----- | ------ | +| BM25 | N | Y | N | N | N | +| TF-IDF | Y | Y | Y | N | N | +| Embedding | Y | Y | N | Y | Y | +| DPR | Y | Y | N | Y | Y | + +See [Optimization](/guides/optimization) for suggestions on how to choose top-k values. + +
+ +## TF-IDF + +### Description + +TF-IDF is a commonly used baseline for information retrieval that exploits two key intuitions: + +- documents that have more lexical overlap with the query are more likely to be relevant + +- words that occur in fewer documents are more significant than words that occur in many documents + +Given a query, a tf-idf score is computed for each document as follows: + +```python +score = tf * idf +``` + +Where: + +- `tf` is how many times words in the query occur in that document. + +- `idf` is the inverse of the fraction of documents containing the word. + +In practice, both terms are usually log normalised. + +
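+
+As a toy illustration of this scoring scheme (plain Python rather than Haystack code; the corpus and query are made up):
+
+```python
+import math
+
+docs = [
+    "the cat sat on the mat",
+    "dogs and cats are pets",
+    "the stock market fell today",
+]
+query = ["cat", "mat"]
+
+def tf_idf_score(query_terms, doc, corpus):
+    words = doc.split()
+    score = 0.0
+    for term in query_terms:
+        tf = words.count(term)                        # term frequency in this document
+        df = sum(term in d.split() for d in corpus)   # number of documents containing the term
+        idf = math.log(len(corpus) / df) if df else 0.0
+        score += math.log(1 + tf) * idf               # log-normalised tf, multiplied by idf
+    return score
+
+ranked = sorted(docs, key=lambda d: tf_idf_score(query, d, docs), reverse=True)
+```
+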
+ +### Initialisation + +```python +from haystack.document_store import InMemoryDocumentStore +from haystack.retriever.sparse import TfidfRetriever +from haystack.pipeline import ExtractiveQAPipeline + +document_store = InMemoryDocumentStore() +... +retriever = TfidfRetriever(document_store) +... +p = ExtractiveQAPipeline(reader, retriever) +``` + +
+ +## BM25 (Recommended) + +### Description + +BM25 is a variant of TF-IDF that we recommend you use if you are looking for a retrieval method that does not need a neural network for indexing. +It improves upon its predecessor in two main aspects: + +- It saturates `tf` after a set number of occurrences of the given term in the document + +- It normalises by document length so that short documents are favoured over long documents if they have the same amount of word overlap with the query + +
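+
+Both properties show up directly in the BM25 term score; here is an illustrative sketch in plain Python (not Haystack code), where `k1` controls term-frequency saturation and `b` controls document-length normalisation:
+
+```python
+import math
+
+def bm25_term_score(tf, df, doc_len, avg_doc_len, n_docs, k1=1.5, b=0.75):
+    idf = math.log(1 + (n_docs - df + 0.5) / (df + 0.5))
+    length_norm = 1 - b + b * (doc_len / avg_doc_len)
+    return idf * (tf * (k1 + 1)) / (tf + k1 * length_norm)
+
+# tf saturates: the jump from 1 to 2 occurrences is worth more than from 10 to 11
+print(bm25_term_score(tf=1, df=5, doc_len=100, avg_doc_len=120, n_docs=1000))
+print(bm25_term_score(tf=2, df=5, doc_len=100, avg_doc_len=120, n_docs=1000))
+print(bm25_term_score(tf=10, df=5, doc_len=100, avg_doc_len=120, n_docs=1000))
+print(bm25_term_score(tf=11, df=5, doc_len=100, avg_doc_len=120, n_docs=1000))
+```
+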
+ +### Initialisation + +```python +from haystack.document_store import ElasticsearchDocumentStore +from haystack.retriever import ElasticsearchRetriever +from haystack.pipeline import ExtractiveQAPipeline + +document_store = ElasticsearchDocumentStore() +... +retriever = ElasticsearchRetriever(document_store) +... +p = ExtractiveQAPipeline(reader, retriever) +``` + +See [this](https://www.elastic.co/blog/practical-bm25-part-2-the-bm25-algorithm-and-its-variables) blog post for more details about the algorithm. + + + +
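+
+Once initialised, the Retriever can also be queried on its own, which is handy for a quick sanity check. A short sketch, assuming documents have already been written to the store (the query is just an illustration):
+
+```python
+candidate_docs = retriever.retrieve(query="Who is the father of Arya Stark?", top_k=10)
+for doc in candidate_docs:
+    print(doc.score, doc.text[:100])
+```
+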
+ +## Dense Passage Retrieval (Recommended) + +### Description + +[Dense Passage Retrieval](https://arxiv.org/abs/2004.04906) is a highly performant retrieval method that calculates relevance using dense representations. +Key features: + +- One BERT base model to encode documents + +- One BERT base model to encode queries + +- Ranking of documents done by dot product similarity between query and document embeddings + + + +Indexing using DPR is comparatively expensive in terms of required computation since all documents in the database need to be processed through the transformer. +The embeddings that are created in this step can be stored in FAISS, a database optimized for vector similarity. +DPR can also work with the ElasticsearchDocumentStore or the InMemoryDocumentStore. + +There are two design decisions that have made DPR particularly performant. + +- Separate encoders for document and query helps since queries are much shorter than documents + +- Training with ‘In-batch negatives’ (gold labels are treated as negative examples for other samples in same batch) is highly efficient + +In Haystack, you can simply download the pretrained encoders needed to start using DPR. +If you’d like to learn how to set up a DPR based system, have a look at the [tutorial](/tutorials/dense-passage-retrieval)! + +
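+
+Conceptually, the dot product ranking mentioned above boils down to comparing the query vector against every document vector. A small illustration in plain NumPy (not Haystack code; random vectors stand in for real encoder outputs):
+
+```python
+import numpy as np
+
+query_emb = np.random.rand(768)          # stand-in for the query encoder output
+doc_embs = np.random.rand(1000, 768)     # stand-ins for the passage encoder outputs
+
+scores = doc_embs @ query_emb            # dot product similarity for every document
+top_k = np.argsort(scores)[::-1][:10]    # indices of the 10 highest-scoring passages
+```
+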
+ +### Initialisation + +
+ +**Tip** + +When using DPR, it is recommended that you use the dot product similarity function since that is how it is trained. +To do so, simply provide `similarity="dot_product"` when initializing the DocumentStore +as is done in the code example below. + +
+ +```python +from haystack.document_store import FAISSDocumentStore +from haystack.retriever import DensePassageRetriever +from haystack.pipeline import ExtractiveQAPipeline + +document_store = FAISSDocumentStore(similarity="dot_product") +... +retriever = DensePassageRetriever( + document_store=document_store, + query_embedding_model="facebook/dpr-question_encoder-single-nq-base", + passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base" +) +... +finder = ExtractiveQAPipeline(reader, retriever) +``` + +
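+
+After initialising the retriever on a store that already contains documents, you typically also need to compute and write the document embeddings before querying; a usage sketch:
+
+```python
+# Process all documents currently in the store with the DPR passage encoder
+# and store the resulting embeddings alongside them
+document_store.update_embeddings(retriever)
+```
+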
+ +**Training DPR:** Haystack supports training of your own DPR model! Check out the [tutorial](/tutorials/train-dpr) to see how this is done! + +
+ + + + +
+
+## Embedding Retrieval
+
+### Description
+
+In Haystack, you also have the option of using a single transformer model to encode both document and query.
+One style of model that is suited to this kind of retrieval is that of [Sentence Transformers](https://github.com/UKPLab/sentence-transformers).
+These models are trained as Siamese networks with a triplet loss such that they learn to embed similar sentences near to each other in a shared embedding space.
+
+They are particularly suited to cases where your query input is similar in style to that of the documents in your database,
+i.e. when you are searching for the most similar documents.
+They are not inherently suited to query-based search, where the length, language and format of the query usually differ significantly from the text being searched.
+
+ +**Tip** + +When using Sentence Transformer models, we recommend that you use a cosine similarity function. +To do so, simply provide `similarity="cosine"` when initializing the DocumentStore +as is done in the code example below. + +
+ +
+ +### Initialisation + +```python +from haystack.document_store import ElasticsearchDocumentStore +from haystack.retriever import EmbeddingRetriever +from haystack.pipeline import ExtractiveQAPipeline + +document_store = ElasticsearchDocumentStore(similarity="cosine") +... +retriever = EmbeddingRetriever(document_store=document_store, + embedding_model="sentence-transformers/all-MiniLM-L6-v2") +... +p = ExtractiveQAPipeline(reader, retriever) +``` + +
+
+## Deeper Dive: Dense vs Sparse
+
+Broadly speaking, retrieval methods can be split into two categories: **dense** and **sparse**.
+
+**Sparse** methods, like TF-IDF and BM25, operate by looking for shared keywords between the document and query.
+They are:
+
+- simple but effective
+
+- don’t need to be trained
+
+- work on any language
+
+More recently, **dense** approaches such as Dense Passage Retrieval (DPR) have shown even better performance than their sparse counterparts.
+These methods embed both document and query into a shared embedding space using deep neural networks
+and the top candidates are the nearest neighbour documents to the query.
+They are:
+
+- powerful but computationally more expensive, especially during indexing
+
+- trained using labelled datasets
+
+- language specific
+
+
+### Qualitative Differences
+
+There are also some qualitative differences between these two types.
+For example, sparse methods treat text as a bag-of-words, meaning that they **do not take word order and syntax into account**,
+while the latest generation of dense methods use transformer-based encoders
+which are designed to be **sensitive** to these factors.
+
+Dense methods are also very capable of building strong semantic representations of text,
+but they **struggle when encountering out-of-vocabulary** words such as new names.
+By contrast, sparse methods don’t need to learn representations of words;
+they only care about whether they are present or absent in the text.
+As such, **they handle out-of-vocabulary words with no problem**.
+
+ +### Indexing + +Dense methods perform indexing by processing all the documents through a neural network and storing the resulting vectors. +This is a much more expensive operation than the creation of the inverted-index in sparse methods +and will require significant computational power and time. + + + +
+
+### Terminology
+
+The terms **dense** and **sparse** refer to the representations that the algorithms build for each document and query.
+**Sparse** methods characterise texts using vectors with one dimension corresponding to each word in the vocabulary.
+Dimensions will be zero if the word is absent and non-zero if it is present.
+Since most documents contain only a small subset of the full vocabulary,
+these vectors are considered sparse since non-zero values are few and far between.
+
+**Dense** methods, by contrast, pass text as input into neural network encoders
+and represent text in a vector of a manually defined size (usually 768).
+Though individual dimensions are not mapped to any corresponding vocabulary or linguistic feature,
+each dimension encodes some information about the text.
+There are rarely 0s in these vectors, hence their relative density.
diff --git a/docs/v0.10.0/components/summarizer.mdx b/docs/v0.10.0/components/summarizer.mdx
new file mode 100644
index 000000000..c3c0b82c8
--- /dev/null
+++ b/docs/v0.10.0/components/summarizer.mdx
@@ -0,0 +1,47 @@
+# Summarizer
+
+Retrievers are excellent at returning a set of candidate documents,
+but you might not have the time to read through them all.
+Haystack's Summarizer is here to help you make sense of the documents at a glance.
+
+There is a full integration with HuggingFace Transformers, and using any of their summarization
+models is as simple as providing the model name.
+See the up-to-date list of available models [here](https://huggingface.co/models?filter=summarization).
+By default, the Google [Pegasus](https://ai.googleblog.com/2020/06/pegasus-state-of-art-model-for.html) model is loaded.
+
+```python
+from haystack.summarizer import TransformersSummarizer
+from haystack.schema import Document
+
+docs = [Document("PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions.\
+                  The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by\
+                  the shutoffs which were expected to last through at least midday tomorrow.")]
+
+summarizer = TransformersSummarizer(model_name_or_path="google/pegasus-xsum")
+summary = summarizer.predict(documents=docs, generate_single_summary=True)
+```
+
+The resulting `summary` contains both the summarization and the original document text.
+
+```python
+[
+    {
+        "text": "California's largest electricity provider has turned off power to hundreds of thousands of customers.",
+        "meta": {
+            "context": "PGE stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions."
+        },
+        ...
+    }
+]
+```
+
+The Summarizer can also function as a node in a pipeline.
+
+```python
+from haystack.pipeline import Pipeline
+
+p = Pipeline()
+p.add_node(component=retriever, name="ESRetriever1", inputs=["Query"])
+p.add_node(component=summarizer, name="Summarizer", inputs=["ESRetriever1"])
+res = p.run(query="What did Einstein work on?")
+```
diff --git a/docs/v0.10.0/components/translator.mdx b/docs/v0.10.0/components/translator.mdx
new file mode 100644
index 000000000..7620e6806
--- /dev/null
+++ b/docs/v0.10.0/components/translator.mdx
@@ -0,0 +1,52 @@
+# Translator
+
+Text comes in many languages, and search is no exception; there are plenty of ways to deal with this.
+One of them is to translate the incoming query, the documents or the search results.
+ +Let's imagine you have an English corpus of technical docs, but the mother tongue of many of your users is French. +You can use a `Translator` node in your pipeline to + +- Translate the incoming query from French to English +- Search in your English corpus for the right document / answer +- Translate the results back from English to French + +## Example: Stand-alone Translator + +You can use the Translator component directly to translate your query or documents: + +```python +from haystack.schema import Document +from haystack.translator import TransformersTranslator + +DOCS = [ + Document( + text="""Heinz von Foerster was an Austrian American scientist + combining physics and philosophy, and widely attributed + as the originator of Second-order cybernetics.""" + ) + ] +translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-en-fr") +res = translator.translate(documents=DOCS, query=None) +``` + +
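+
+The same component can translate an incoming query instead of documents; a small sketch along the same lines (the French query is just an illustration, and note the reversed `fr-en` model):
+
+```python
+query_translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-fr-en")
+translated_query = query_translator.translate(query="Qui était Heinz von Foerster ?", documents=None)
+```
+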
+
+## Example: Wrapping another Pipeline
+
+You can also wrap one of your existing pipelines and "add" the translation nodes at the beginning and at the end of your pipeline.
+For example, let's translate the incoming query from French to English, run our document retrieval, and then translate the results back from English to French:
+
+```python
+from haystack.pipeline import TranslationWrapperPipeline, DocumentSearchPipeline
+from haystack.translator import TransformersTranslator
+
+pipeline = DocumentSearchPipeline(retriever=my_dpr_retriever)
+
+in_translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-fr-en")
+out_translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-en-fr")
+
+pipeline_with_translation = TranslationWrapperPipeline(input_translator=in_translator,
+                                                       output_translator=out_translator,
+                                                       pipeline=pipeline)
+```
+
diff --git a/docs/v0.10.0/guides/annotation.mdx b/docs/v0.10.0/guides/annotation.mdx
new file mode 100644
index 000000000..9ae3e9c7a
--- /dev/null
+++ b/docs/v0.10.0/guides/annotation.mdx
@@ -0,0 +1,82 @@
+# Annotation Tool
+
+- Create labels with different techniques: Come up with questions (+ answers) while reading passages (SQuAD style) or have a set of predefined questions and look for answers in the document (~ Natural Questions).
+- Structure your work via organizations, projects, users
+- Upload your documents or import labels from an existing SQuAD-style dataset
+- Export your labels in SQuAD format
+
+![image](/img/annotation_tool.png)
+
+
+## Hosted version
+
+Sign up here: [Haystack Annotation Tool](https://annotate.deepset.ai/login)
+
+ +## Local version (Docker) + +1. Configure credentials & database in the [`docker-compose.yml`](https://github.com/deepset-ai/haystack/blob/master/annotation_tool/docker-compose.yml): + +The credentials should match in database image and application configuration. + + DEFAULT_ADMIN_EMAIL: "example@example.com" + DEFAULT_ADMIN_PASSWORD: "DEMO-PASSWORD" + + PROD_DB_NAME: "databasename" + PROD_DB_USERNAME: "somesafeuser" + PROD_DB_PASSWORD: "somesafepassword" + + + POSTGRES_USER: "somesafeuser" + POSTGRES_PASSWORD: "somesafepassword" + POSTGRES_DB: "databasename" + +2. Run docker-compose by executing `docker-compose up`. + +3. The UI should be available at `localhost:7001`. + +
+ +## Manual + +The manual (of a slightly earlier version) can be found [here](https://drive.google.com/file/d/1Wv3OIC0Z7ibHIzOm9Xw_r0gjTFmpl-33/view). While it doesn't include all latest features, the basic workflow and tips for label quality are still the same. + +## Annotation FAQ + +**What is a good question?** +- A good question is a fact-seeking question that can be answered with an entity (person, organisation, location, etc.) or explanation. A bad question is ambiguous, incomprehensible, dependent on clear false presuppositions, opinion seeking, or not clearly a request for factual information. +- The question should ask about information present in the text passage given. It should not be answerable only with additional knowledge or your interpretation. +- Do not copy paste answer text into the question. Good questions do not contain the exact same words as the answer or the context around the answer. The question should be a reformulation with synonyms and in different order as the context of the answer. +- Questions should be very precise natural questions you would ask when you want information from another person. + +**How many questions should you ask per text passage?** +- Maximally ask 20 questions per passage +- Some text passages are not suited for 20 questions. Do not make up very constructed and complicated questions just to fill up the 20 - move on to the next text. +- Try to ask questions covering the whole passage and focus on questions covering important information. Do not only ask questions about a single sentence in that passage. + +**What is a good answer span?** +- Always mark whole words. Do not start or end the answer within a word. +- For short answers: The answer should be as short and as close to a spoken human answer as possible. Do not include punctuation. +- For long answers: Please mark whole sentences with punctuation. The sentences can also pick up parts of the question, or mark even whole text passages. Mark passages only if they are not too large (e.g. not more than 8-10 sentences). + +**How do I differentiate long vs short answers?** +- If there is a short answer possible you should always select short answer over long answer. +- Short precise answers like numbers or a few words are short answers. +- Long answers include lists of possibilities or multiple sentences are needed to answer the question correctly. + +**How to handle multiple possible answers to a single question?** +- As of now there is no functionality to mark multiple answers per single question. +- Workaround: You can add a question with the same text but different answer selection by using the button below the question list (Button reads “custom question”) + +**What to do with grammatically wrong or incorrectly spelled questions?** +- Include them. When users use the tool and ask questions they will likely contain grammar and spelling errors, too. +- Exception: The question needs to be understandable without reading and interpretation of the corresponding text passage. If you do not understand the question, please mark the question as “I don’t understand the question”. + +**What to do with text passages that are not properly converted or contain (in part) information that cannot be labelled (e.g. just lists or garbage text)?** +- Please do not annotate this text +- You can write down what is missing, or the cause why you cannot label the text + the text number and title. + +**Which browser to use?** +- Please use the Chrome browser. The tool is not tested for other browsers. 
\ No newline at end of file
diff --git a/docs/v0.10.0/guides/chatbots.mdx b/docs/v0.10.0/guides/chatbots.mdx
new file mode 100644
index 000000000..84b6e0704
--- /dev/null
+++ b/docs/v0.10.0/guides/chatbots.mdx
@@ -0,0 +1,151 @@
+# Chatbot Integration
+
+It is simple to imbue your chatbot with the NLP power that Haystack offers.
+If your chatbot is handling a question that falls in the long tail of undefined intents, why not handle it using Haystack's question answering capabilities?
+Thanks to Haystack's REST API, chatbot frameworks can seamlessly communicate with your custom Haystack service.
+Below, we will trace through what it takes to set this up using the popular [Rasa](https://rasa.com/) framework.
+
+## Overview
+
+When a user speaks to a conversational AI program, each user message is classified and labelled with an intent.
+One approach to integrating Haystack would be to introduce a new intent, such as `knowledge_question`,
+that triggers a call to a running Haystack API.
+
+Alternatively, in cases where the chatbot is uncertain which intent fits, it can route requests to a fallback action.
+Rasa uses the [FallbackClassifier](https://rasa.com/docs/rasa/fallback-handoff/#nlu-fallback) to make this happen.
+
+In both cases, Rasa uses an [action server](https://rasa.com/docs/action-server/) to send a REST API request to Haystack.
+When it receives the result, it will base its next message to the user on the contents of this response.
+
+![image](/img/rasa_haystack.png)
+
+## Setting up Haystack with REST API
+
+To set up a Haystack instance with REST API, have a look at [this documentation page](/guides/rest-api).
+By default, the API server runs on `http://127.0.0.1:8000`.
+
+
+**Quick-start demo:** Bundled with the GitHub repository is an example Haystack service with REST API that queries the Game of Thrones Wiki.
+Simply clone the repository and run the following in its root directory:
+
+``` bash
+ cd haystack
+ docker-compose pull
+ docker-compose up
+
+ # Or on a GPU machine: docker-compose -f docker-compose-gpu.yml up
+```
+
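+
+Once the API is up, you can verify it from Python before wiring up a chatbot. This is a minimal sketch; the payload and response keys mirror those used by the Rasa action further down, and the question is just an illustration:
+
+```python
+import requests
+
+response = requests.post(
+    "http://localhost:8000/query",
+    json={"query": "Who is the father of Arya Stark?"},
+).json()
+
+print(response["answers"][0]["answer"] if response["answers"] else "No Answer Found!")
+```
+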
+ +## Setting Up a Rasa Chatbot + +
+ +**Demo:** For a bare bones example of a Rasa chatbot communicating with Haystack, have a look at [this repo](https://github.com/deepset-ai/rasa-haystack). + +
+
+After [installing](https://rasa.com/docs/rasa/installation) Rasa and [initializing](https://rasa.com/docs/rasa/command-line-interface/#rasa-init) a new project,
+there are a few steps that a developer needs to take in order to get Rasa to communicate with Haystack.
+
+In `data/nlu.yaml`, you will want to define a new intent by providing example utterances of that intent:
+```
+nlu:
+- intent: knowledge_question
+  examples: |
+    - Can you look this up: what is ?
+    - Look this up: what is?
+    - Can you check: what is?
+    - please check: What is ?
+    - Can you look this up: where is ?
+    - Look this up: where is?
+    - Can you check: where is?
+    - please check: where is ?
+```
+
+You will also want to define what action is taken when that intent is identified in the `data/rules.yml` file:
+```
+- rule: Query Haystack anytime the user has a direct question for your document base
+  steps:
+  - intent: knowledge_question
+  - action: call_haystack
+```
+
+If you want the Haystack API call to be triggered by a fallback instead of an intent, you will need to add the `FallbackClassifier` to the pipeline in `config.yml`:
+```
+pipeline:
+  - name: FallbackClassifier
+    threshold: 0.8
+    ambiguity_threshold: 0.1
+```
+You will also need to add the following to `data/rules.yml`:
+```
+- rule: Query Haystack whenever they send a message with low NLU confidence
+  steps:
+  - intent: nlu_fallback
+  - action: call_haystack
+```
+By default, the Rasa custom action server API is blocked, so you will need to uncomment this line in `endpoints.yml`:
+```
+action_endpoint:
+  url: "http://localhost:5055/webhook"
+```
+Finally, we can define an action function that calls the Haystack API and handles the response in `actions/actions.py`.
+Here is an example of what that might look like:
+
+``` python
+from typing import Any, Dict, List, Text
+
+import requests
+from rasa_sdk import Action, Tracker
+from rasa_sdk.executor import CollectingDispatcher
+
+
+class ActionHaystack(Action):
+
+    def name(self) -> Text:
+        return "call_haystack"
+
+    def run(self, dispatcher: CollectingDispatcher,
+            tracker: Tracker,
+            domain: Dict[Text, Any]) -> List[Dict[Text, Any]]:
+
+        url = "http://localhost:8000/query"
+        payload = {"query": str(tracker.latest_message["text"])}
+        headers = {
+            'Content-Type': 'application/json'
+        }
+        response = requests.request("POST", url, headers=headers, json=payload).json()
+
+        if response["answers"]:
+            answer = response["answers"][0]["answer"]
+        else:
+            answer = "No Answer Found!"
+
+        dispatcher.utter_message(text=answer)
+
+        return []
+```
+You will also want to declare this new action in the `domain.yml` file:
+```
+actions:
+- call_haystack
+```
+
+## Running the Chatbot
+
+Whenever changes are made to your Rasa project, you will want to retrain your chatbot via:
+```
+rasa train
+```
+You will need to start the Rasa action server using:
+```
+rasa run actions
+```
+To start interacting with the bot you have created, you can call
+```
+rasa shell
+```
+Now you can start talking to the Haystack-enabled chatbot!
+
+![image](/img/rasa_conversation.png)
+
+Alternatively, you can use [Rasa X](https://rasa.com/docs/rasa-x/) which allows the chatbot to be run in a GUI.
+While interacting with your chatbot in this interface, you will also see the intent being assigned to each message as well as the action being taken.
+ +![image](/img/rasa_x_conversation.png) \ No newline at end of file diff --git a/docs/v0.10.0/guides/domain_adaptation.mdx b/docs/v0.10.0/guides/domain_adaptation.mdx new file mode 100644 index 000000000..614be50d7 --- /dev/null +++ b/docs/v0.10.0/guides/domain_adaptation.mdx @@ -0,0 +1,148 @@ +# Domain Adaptation + +## Generalisation + +In our experience, language models trained on SQuAD show very strong general question answering capabilities. +Though SQuAD is composed entirely of Wikipedia articles, these models are flexible enough to deal with many different styles of text. + +Before trying to adapt these models to your domain, we’d recommend trying one of the off the shelf models. +We’ve found that these models are often flexible enough for a wide range of use cases. + +
+
+**Intuition**
+
+Most people probably don't know what an HP Valve is.
+But you don't always need to know what an HP Valve is to answer "What is connected to an HP Valve?"
+The answer might be there in plain language.
+In the same way, many QA models have a good enough grasp of language to answer questions about concepts in an unseen domain.
+
+ +
+ +## Finetuning + +Any model that can be loaded into Haystack can also be finetuned within Haystack. +Simply provide the domain specific dataset and call `Reader.train()` on an initialised model. + +``` +reader.train(data_dir=train_data, + train_filename="dev-v2.0.json", + n_epochs=1, + save_dir="my_model") +``` + +At the end of training, the finetuned model will be saved in the specified `save_dir` and can be loaded as a `Reader`. + +
+ +**Recommendation** + +See Tutorial 2 for a runnable example of this process. +If you’re interested in measuring how much your model has improved, +please also check out Tutorial 5 which walks through the steps needed to perform evaluation. + +
+ +
+ +## Generating Labels + +Using our [Haystack Annotate tool](https://annotate.deepset.ai/login) (Beta), +you can easily create a labelled dataset using your own documents featuring your own question/ answer pairs. + +![image](/img/annotation_tool.png) + +Features include: + +- Structured workspaces via organisations, projects and users + +- Easy upload of your own documents and labels in a variety of formats (txt, pdf, SQuAD style) + +- Export of labels to be used directly in Haystack + +Annotate also supports two different workflows: + +- Think up questions and answers while reading passages (SQuAD style) + +- Have a set of predefined questions and look for answers in the document (~ Natural Questions style) + +
+ +## User Feedback + +A simpler and faster process to finetune models to your domain is to utilise user feedback. +Dedicated annotation work can be costly and time consuming +but user feedback is an efficient alternative since it allows for labels to be generated by users, for users, +all while the system is already in production. +We, for example, have used a simple thumbs up / down system in our demos to allow +users to give feedback. + +![image](/img/demo.png) + +Through the Rest API, users can annotate each Haystack result as being either: + +- Correct +- Correct document but wrong text +- Wrong document and wrong text + +To get started, follow these steps: + +- Start up the REST API + - The simplest way to do this is to call `docker-compose up` from the root directory of the Haystack repository + - Alternatively, run `gunicorn rest_api.application:app -b 0.0.0.0:8000 -k uvicorn.workers.UvicornWorker -t 300` +- Make a POST request to the `doc-qa-feedback` endpoint with the following payload: + +``` +{ + "question": "string", + "is_correct_answer": true, + "document_id": "string", + "model_id": 0, + "is_correct_document": true, + "answer": "string", + "offset_start_in_doc": 0 +} +``` + +To fill out all these values, you can use the response from an executed search request on the `doc-qa` endpoint. + +- Export your labels in SQuAD format by making a GET request to the `export-doc-qa-feedback` endpoint + +```python +# SQUAD format +{ + "data": [ + { + "title": "Normans", + "paragraphs": [ + { + "context": "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the...", + "qas": [ + { + "question": "In what country is Normandy located?", + "id": "56ddde6b9a695914005b9628", + "answers": [ + { + "text": "France", + "answer_start": 159 + } + ] + } + ] + } + ] + } + ] +} +``` + +User feedback labelling also works with FAQ style QA. +Refer to the Swagger API documentation (http://127.0.0.1:8000/docs) for more details. + +This labelled data can then be used to +augment the training data and enable `Reader` finetuning. +Alternatively, they can also be used to form an evaluation set to +measure the performance of trained models, +or monitor how well the model is doing in a live environment. diff --git a/docs/v0.10.0/guides/evaluation.mdx b/docs/v0.10.0/guides/evaluation.mdx new file mode 100644 index 000000000..ea93dbf8e --- /dev/null +++ b/docs/v0.10.0/guides/evaluation.mdx @@ -0,0 +1,92 @@ +# Evaluation + +Haystack has all the tools needed to evaluate Retrievers, Readers and Generators in both +open domain and closed domain modes. +Evaluation and the metrics that it generates are vital for: +- judging how well your system is performing on a given domain. +- comparing the performance of different models +- identifying underperforming components in your pipeline + +
+ +**Tutorial:** This documentation page is meant to give an in depth understanding of the concepts involved in evaluation. +To get started using Haystack for evaluation, we recommend having a look at our [evaluation tutorial](/tutorials/evaluation) + +
+ +## Open vs Closed Domain + +There are two evaluation modes known as **open domain** and **closed domain.** + +**Closed domain** means single document QA. +In this setting, you want to make sure the correct instance of a string is highlighted as the answer. +So you compare the indexes of predicted against labeled answers. +Even if the two strings have identical content, if they occur in different documents, +or in different positions in the same document, they count as wrong. + +**Open domain** means multiple-document QA (typically over the entire database). +Here, you only look for a match or overlap between the two answer strings. +Even if the predicted answer is extracted from a different position than the correct answer, +that's fine as long as the strings match. + +## Metrics: Retrieval + +### Recall + +Recall measures how many times the correct document was among the retrieved documents over a set of queries. +For a single query, the output is binary: either a document is contained in the selection, or it is not. +Over the entire dataset, the recall score amounts to a number between zero (no query retrieved the right document) and one (all queries retrieved the right documents). + +Note that recall is affected by the number of documents that the retriever returns. +If the retriever returns only one or a few documents, it is a tougher task to retrieve correct documents. +Make sure to set the Retriever's `top_k` to an appropriate value and to also define the `top_k` in `Retriever.eval()` or `EvalDocuments` + +### Mean Reciprocal Rank (MRR) + +In contrast to the recall metric, mean reciprocal rank takes the position of the top correctly retrieved document (the “rank”) into account. +It does this to account for the fact that a query elicits multiple responses of varying relevance. +Like recall, MRR can be a value between zero (no matches) and one (the system retrieved a correct document for all queries as the top result). +For more details, check out [this page](https://en.wikipedia.org/wiki/Mean_reciprocal_rank) + +### Mean Average Precision (mAP) + +Mean average precision is similar to mean reciprocal rank but takes into account the position of every correctly retrieved document. +Like MRR, mAP can be a value between zero (no matches) and one (the system retrieved correct documents for all top results). +mAP is particularly useful in cases where there are more than one correct document to be retrieved. +For more details, check out [this page](https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Mean_average_precision) + + +## Metrics: Question Answering + +### Exact Match (EM) + +Exact match measures the proportion of cases where the predicted answer is identical to the correct answer. +For example, for the annotated question answer pair “What is Haystack?" + "A question answering library in Python”, +even a predicted answer like “A Python question answering library” would yield a zero score because it does not match the expected answer 100 percent. + +### F1 + +The F1 score is more forgiving and measures the word overlap between the labeled and the predicted answer. +Whenever the EM is 1, F1 will also be 1. +To learn more about the F1 score, check out this guide + +### Semantic Answer Similarity (SAS) + +Semantic Answer Similarity uses a transformer-based cross-encoder architecture to evaluate the semantic similarity of two answers rather than their lexical overlap. 
+While F1 and EM would both score “one hundred percent” as sharing zero similarity with “100 %", SAS is trained to assign this a high score. + +SAS is particularly useful to seek out cases where F1 doesn't give a good indication of the validity of a predicted answer. + +You can read more about SAS in [this paper](https://arxiv.org/abs/2108.06130). + +## Datasets + +Annotated datasets are crucial for evaluating the retrieval as well as the question answering capabilities of your system. +Haystack is designed to work with question answering datasets that follow SQuAD format. +Please check out our [annotation tool](/guides/annotation) if you're interested in creating your own dataset. + +
+ +**Data Tool:** have a look at our `SquadData` object in `haystack/squad_data.py` if you'd like to manipulate SQuAD style data using Pandas dataframes. + +
\ No newline at end of file diff --git a/docs/v0.10.0/guides/languages.mdx b/docs/v0.10.0/guides/languages.mdx new file mode 100644 index 000000000..bb415c821 --- /dev/null +++ b/docs/v0.10.0/guides/languages.mdx @@ -0,0 +1,196 @@ +# Languages Other Than English + +Haystack is well suited to open-domain QA on languages other than English. +While our defaults are tuned for English, +you will find some tips and tricks here for using Haystack in your language. + +
+ +## Preprocessor + +The PreProcessor's sentence tokenization is language specific. +If you are using the PreProcessor on a language other than English, +make sure to set the `language` argument when initializing it. + +``` python +preprocessor = PreProcessor(language="sv", ...) +``` +[Here](https://github.com/deepset-ai/haystack/blob/3e6def7e03097021c8efd1b5c277bec6e541c162/haystack/preprocessor/preprocessor.py#L17) you will find the list of supported languages. + +
+
+## Retrievers
+
+The sparse retriever methods themselves (BM25, TF-IDF) are language agnostic.
+Their only requirement is that the text be split into words.
+The ElasticsearchDocumentStore relies on an analyzer to impose word boundaries,
+but also to handle punctuation, casing and stop words.
+
+The default analyzer is an English analyzer.
+While it can still work decently for a large range of languages,
+you will want to set it to your language's analyzer for optimal performance.
+In some cases, such as with Thai, the default analyzer is completely incompatible.
+See [this page](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-lang-analyzer.html)
+for the full list of language specific analyzers.
+
+```python
+from haystack.document_store import ElasticsearchDocumentStore
+
+document_store = ElasticsearchDocumentStore(analyzer="thai")
+```
+
+The models used in dense retrievers are language specific.
+Be sure to check the language of the model used in your EmbeddingRetriever.
+The default model that is loaded in the DensePassageRetriever is for English.
+
+We have created a [German DensePassageRetriever model](https://deepset.ai/germanquad) and know other teams who work on further languages.
+If you have a language model and a question answering dataset in your own language, you can also train a DPR model using Haystack!
+Below is a simplified example.
+See [our tutorial](/tutorials/train-dpr) and also the [API reference](/reference/retriever#train) for `DensePassageRetriever.train()` for more details.
+
+```python
+from haystack.retriever import DensePassageRetriever
+
+dense_passage_retriever = DensePassageRetriever(document_store)
+dense_passage_retriever.train(data_dir="PATH/TO/YOUR_DATA",
+                              train_filename="train.json",
+                              dev_filename=None,
+                              test_filename=None,
+                              batch_size=16,
+                              embed_title=True,
+                              num_hard_negatives=1,
+                              n_epochs=3)
+```
+
+
+## Readers
+
+While models are comparatively more performant on English,
+thanks to a wealth of available English training data,
+there are a couple QA models that are directly usable in Haystack.
+
+**FARM**
+
+German:
+
+```python
+from haystack.reader import FARMReader
+
+reader = FARMReader("deepset/gelectra-large-germanquad")
+```
+
+Italian:
+
+```python
+from haystack.reader import FARMReader
+
+reader = FARMReader("mrm8488/bert-italian-finedtuned-squadv1-it-alfa")
+```
+
+Chinese:
+
+```python
+from haystack.reader import FARMReader
+
+reader = FARMReader("uer/roberta-base-chinese-extractive-qa")
+# or
+reader = FARMReader("wptoux/albert-chinese-large-qa")
+```
+
+Zero-shot:
+
+```python
+from haystack.reader import FARMReader
+
+reader = FARMReader("deepset/xlm-roberta-large-squad2")
+```
+
+**Transformers**
+
+French:
+
+```python
+from haystack.reader import TransformersReader
+
+reader = TransformersReader("illuin/camembert-base-fquad")
+```
+
+Italian:
+
+```python
+from haystack.reader import TransformersReader
+
+reader = TransformersReader("mrm8488/bert-italian-finedtuned-squadv1-it-alfa")
+```
+
+Zero-shot:
+
+```python
+from haystack.reader import TransformersReader
+
+reader = TransformersReader("deepset/xlm-roberta-large-squad2")
+```
+
+ +We are the creators of the **German** model and you can find out more about it [here](https://deepset.ai/germanquad) + +The **French**, **Italian**, **Spanish**, **Portuguese** and **Chinese** models are monolingual language models trained on versions of the SQuAD dataset in their respective languages +and their authors report decent results in their model cards +(e.g. [here](https://huggingface.co/illuin/camembert-base-fquad) and [here](https://huggingface.co/mrm8488/bert-italian-finedtuned-squadv1-it-alfa)). +There also exist Korean QA models on the model hub but their performance is not reported. + +The **zero-shot model** that is shown above is a **multilingual XLM-RoBERTa Large** that is trained on English SQuAD. +It is clear, from our [evaluations](https://huggingface.co/deepset/xlm-roberta-large-squad2#model_card), +that the model has been able to transfer some of its English QA capabilities to other languages, +but still its performance lags behind that of the monolingual models. +Nonetheless, if there is not yet a monolingual model for your language and it is one of the 100 supported by XLM-RoBERTa, +this zero-shot model may serve as a decent first baseline. diff --git a/docs/v0.10.0/guides/optimization.mdx b/docs/v0.10.0/guides/optimization.mdx new file mode 100644 index 000000000..8e721f5d5 --- /dev/null +++ b/docs/v0.10.0/guides/optimization.mdx @@ -0,0 +1,109 @@ +# Optimization + +## Speeding up Reader + +In most pipelines, the Reader will be the most computationally expensive component. +If this is a step that you would like to speed up, you can opt for a smaller Reader model +that can process more passages in the same amount of time. + +On our [benchmarks page](https://haystack.deepset.ai/bm/benchmarks), you will find a comparison of +many of the common model architectures. While our default recommendation is RoBERTa, +MiniLM offers much faster processing for only a minimal drop in accuracy. +You can find the models that we've trained on [the HuggingFace Model Hub](https://huggingface.co/deepset) + +
+ +## GPU acceleration + +The transformer based models used in Haystack are designed to be run on a GPU enabled machine. +The design of these models means that they greatly benefit from the parallel processing capabilities of graphics cards. +If Haystack has successfully detected a graphics card, you should see these lines in your console output. + +``` +INFO - farm.utils - Using device: CUDA +INFO - farm.utils - Number of GPUs: 1 +``` + +You can track the work load on your CUDA enabled Nvidia GPU by tracking the output of `nvidia-smi -l` on the command line +while your Haystack program is running. + +
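+
+If you want to check for yourself whether the underlying PyTorch installation can see a GPU, a quick sketch:
+
+```python
+import torch
+
+print(torch.cuda.is_available())   # True if a CUDA-capable GPU was detected
+print(torch.cuda.device_count())   # number of visible GPUs
+```
+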
+
+## Document Length
+
+Document length has a very direct impact on the speed of the Reader,
+which is why we recommend using the `PreProcessor` class to clean and split your documents.
+**If you halve the length of your documents, you will halve the workload placed onto your Reader.**
+
+For **sparse retrievers**, very long documents pose a challenge since the signal of the relevant section of text
+can get washed out by the rest of the document.
+To get a good balance between Reader speed and Retriever performance, we recommend splitting documents to a maximum of 500 words.
+If there is no Reader in the pipeline following the Retriever, we recommend that **documents be no longer than 10,000 words**.
+
+**Dense retrievers** are limited in the length of text that they can read in one pass.
+As such, it is important that documents are not longer than the dense retriever's maximum input length.
+By default, Haystack's DensePassageRetriever model has a maximum length of 256 tokens.
+As such, we recommend that documents contain significantly fewer words.
+We have found decent performance with **documents around 100 words long**.
+
+## Respecting Sentence Boundaries
+
+When splitting documents, it is generally not a good idea to let document boundaries fall in the middle of sentences.
+Doing so means that each document will contain incomplete sentence fragments
+which may be hard for both retriever and reader to interpret.
+It is therefore recommended to set `split_respect_sentence_boundary=True` when initializing your `PreProcessor`.
+
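+
+A `PreProcessor` configured along these lines might look as follows. This is a sketch: `split_length` should be tuned to your Retriever and Reader, and `your_raw_documents` is a placeholder for documents you have already converted.
+
+```python
+from haystack.preprocessor import PreProcessor
+
+preprocessor = PreProcessor(
+    split_by="word",
+    split_length=100,                       # roughly 100 words per document suits dense retrieval
+    split_respect_sentence_boundary=True,   # avoid cutting documents mid-sentence
+)
+docs = preprocessor.process(your_raw_documents)
+```
+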
+
+## Choosing the Right top-k Values
+
+The `top-k` parameter in both the `Retriever` and `Reader` determines how many results they return.
+More specifically, `Retriever` `top-k` dictates how many retrieved documents are passed on to the next stage,
+while `Reader` `top-k` determines how many answer candidates to show.
+
+In our experiments, we have found that **`Retriever` `top_k=10`
+gives decent overall performance** and so we have set this as the default in Haystack.
+
+The choice of `Retriever` `top-k` is a trade-off between speed and accuracy,
+especially when there is a `Reader` in the pipeline.
+Setting it higher means passing more documents to the `Reader`,
+thus reducing the chance that the answer-containing passage is missed.
+However, passing more documents to the `Reader` will create a larger workload for the component.
+
+These parameters can easily be tweaked as follows if using a `Pipeline`:
+
+```python
+answers = pipeline.run(query="What did Einstein work on?", params={"retriever": {"top_k": 10}, "reader": {"top_k": 5}})
+```
+
+or like this if directly calling the `Retriever`:
+
+```python
+retrieved_docs = retriever.retrieve(query="What did Einstein work on?", top_k=10)
+```
+
+ 
+## Metadata Filtering
+
+Metadata can be attached to the documents which you index into your DocumentStore (see the input data format [here](/components/retriever)).
+At query time, you can apply filters based on this metadata to limit the scope of your search and ensure your answers
+come from a specific slice of your data.
+
+For example, if you have a set of annual reports from various companies,
+you may want to perform a search on just a specific year, or on a small selection of companies.
+This can reduce the workload of the retriever and also ensure that you get more relevant results.
+
+Filters are applied via the `filters` argument of the `Retriever` class. In practice, this argument will probably
+be passed into the `Pipeline.run()` call, which will then route it on to the `Retriever` class
+(see the Arguments section on the [Pipelines page](/components/pipelines) for an explanation).
+
+```python
+pipeline.run(
+    query="Why did the revenue increase?",
+    filters={
+        "years": ["2019"],
+        "companies": ["BMW", "Mercedes"]
+    }
+)
+```
diff --git a/docs/v0.10.0/guides/rest_api.mdx b/docs/v0.10.0/guides/rest_api.mdx
new file mode 100644
index 000000000..98c152ec0
--- /dev/null
+++ b/docs/v0.10.0/guides/rest_api.mdx
@@ -0,0 +1,183 @@
+# REST API
+
+Haystack can be deployed as a REST API. Using Haystack through an API can benefit developers looking to deploy Question Answering (QA) functionality in their projects, which could be web or mobile apps.
+
+The API uses a web server to receive HTTP requests and pass them on to a running instance of Haystack for processing, before returning Haystack results as an HTTP response.
+
+The diagram below illustrates how the Haystack REST API is structured:
+
+![image](/img/rest_api.png)
+
+## Background: Haystack Pipelines
+
+The Haystack [Pipeline](/components/pipelines) is at the core of Haystack’s QA functionality, whether Haystack is used directly through the Python bindings or through a REST API.
+
+A pipeline is defined as a sequence of components where each component performs a dedicated function, e.g., retrieving documents from a document store or extracting an answer to a query from a text document. A pipeline’s components are interconnected through inputs and outputs.
+
+The Haystack REST API exposes an HTTP interface for interacting with a pipeline. For instance, you can use the REST API to send a query submitted in the body of an HTTP request to the Haystack Pipeline. Haystack will then process the request and return the answer to the query in an HTTP response.
+
+When running Haystack as a REST API, you'll need to define the pipeline you'll be using in the API as a YAML file. Check out the [Pipelines YAML doc](/components/pipelines) for more details.
+
+The example Haystack pipeline that we’ll be using below is defined in the [rest_api/pipeline/pipelines.yaml](https://github.com/deepset-ai/haystack/blob/master/rest_api/pipeline/pipelines.yaml) file.
+
+## Setting up a REST API with Haystack
+
+A simple Haystack API is already defined in the project’s default `docker-compose.yml` file. The easiest way to start this Haystack API is to clone the Haystack repository and then run docker-compose:
+
+```bash
+git clone https://github.com/deepset-ai/haystack.git
+cd haystack
+docker-compose pull
+docker-compose up
+```
+
+docker-compose will start three Docker containers:
+
+**haystack-api:** this container runs both Haystack and the HTTP API server.
+
+**elasticsearch:** the datastore that backs the Haystack QA system in this example.
+ +**ui:** a simple streamlit user interface that uses the API. The interface is intended only to be used for extractive QA (i.e., answering questions with snippets from the document store). It’s not designed to handle other Haystack use cases like document search or generative QA. + +
+ +**Note:** we recommend running Docker with at least 8GB of RAM to ensure that all containers run properly. + +
+ +By default, the Haystack API container will start with a pipeline defined in the [rest_api/pipeline/pipelines.yaml](https://github.com/deepset-ai/haystack/blob/master/rest_api/pipeline/pipelines.yaml) file. If you want to direct the API container to use a YAML file at a different location, you can set the [`PIPELINE_YAML_PATH` environment variable](https://github.com/deepset-ai/haystack/blob/master/rest_api/config.py#L3). + +### Interacting with the Haystack API through a simple user interface + +Once you’ve started the Haystack containers, you’ll see the UI by navigating to http://127.0.0.1:8501. + +![image](/img/streamlit_ui_screenshot.png) + +When you enter a query and press Run, the interface will send a REST API request to the haystack-api container that will, in turn, supply the input to the query pipeline that we defined in the YAML file. + +### Swagger documentation for the Haystack REST API + +The API server by default runs on http://127.0.0.1:8000. The API server includes a Swagger documentation site, so to view all endpoints available through the REST API, navigate to the following page: http://127.0.0.1:8000/docs. + +![image](/img/swagger.png) + +You can use the Swagger page to view the available endpoints and expected input and output formats for each endpoint. The Swagger site also includes the option to send sample API requests and inspect responses. + +### API endpoints included in the example application + +The example Haystack API includes the following endpoints: + +#### POST /query + +This endpoint receives the question as a string and allows the requester to set additional parameters that will be passed on to the Haystack pipeline. + +#### POST ​/feedback + +This endpoint allows the API user to submit feedback on an answer for a particular query. For example, the user can send feedback on whether the answer was correct and whether the right snippet was identified as the answer. Information submitted through this endpoint is used to train the underlying QA model. + +#### POST ​/eval-feedback + +This endpoint returns basic accuracy metrics based on user feedback, e.g., the ratio of correct answers or correctly identified documents. You can filter the output by document or label. + +#### GET ​/export-feedback + +This endpoint returns JSON output in the SQuAD format for question/answer pairs that were marked as "relevant" by user feedback through the /feedback endpoint. + +#### POST ​/file-upload + +You can use this endpoint to upload a file for indexing (see the section below for more details on indexing). + +### Running HTTP API without Docker + +If you prefer to not use Docker with your Haystack API, you can start the REST API server and supporting Haystack pipeline by running the gunicorn server manually with the following command: + +``` bash +gunicorn rest_api.application:app -b 0.0.0.0:8000 -k uvicorn.workers.UvicornWorker -t 300 +``` + +This is the same command that’s used in the haystack-api container definition. + +## Indexing documents in the Haystack REST API document store + +In the below example, we’ll use an ElasticSearch container pre-loaded with articles about the Game of Thrones series. In a production environment, you’d instead start with either an existing or empty datastore. + +There are two options for indexing answers when working with an empty datastore. You can either use an indexing script, or load files through another pipeline exposed through an API. 
+ +#### Option 1: Using an indexing script + +You can load documents to your datastore by using a Python script that runs before the Haystack API is initialized. You can find an example script that pre-processes a few files and saves them into the document store in the [Build Your First QA System tutorial](https://haystack.deepset.ai/docs/latest/tutorial1md#Preprocessing-of-documents). + +If you go with an indexing script, your REST API startup flow would be: + +1. Initialize the document store +2. Run indexing script +3. Start the REST API +4. Submit queries + +#### Option 2: Use the REST API to index documents + +When using the indexing pipeline, the REST API startup flow would be: + +1. Initialize the document store +2. Start the REST API +3. Add documents through the index endpoint +4. Submit queries + +Our example `pipelines.yaml` file defines an [indexing pipeline](https://github.com/deepset-ai/haystack/blob/master/rest_api/pipeline/pipelines.yaml#L38-L50)—it processes text and PDF files that are submitted to the `file-upload` API endpoint. So if you submit an API request to the indexing endpoint, the submitted content will get processed by the Indexing pipeline, loaded into Elasticsearch, and made available to the `query` pipeline. + +Here’s how you can add a document through the indexing API by using [cURL](https://curl.se/) to make the HTTP request: + +```bash +curl -X POST -H 'Accept: application/json' -F files='@/Users/alexey/Downloads/Sansa Stark - Wikipedia.pdf' http://127.0.0.1:8000/file-upload +``` +
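+If you would rather script this from Python, a rough equivalent using the `requests` library might look like
+the following (the file name is just a placeholder):
+
+```python
+import requests
+
+# Send a file to the indexing pipeline exposed at the /file-upload endpoint
+with open("Sansa Stark - Wikipedia.pdf", "rb") as f:
+    response = requests.post("http://127.0.0.1:8000/file-upload", files={"files": f})
+
+print(response.status_code)
+```
+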
+ +**Note:** you can use any other HTTP request tool in lieu of cURL. We recommend referring to the Swagger documentation page for complete examples of file-upload requests, as putting them together by hand with cURL can prove difficult. + +
+ +## Querying the Haystack REST API + +After adding documents to our document store, we can call the API endpoints directly to retrieve answers. + +When working with the API’s JSON output, we suggest that you use [jq](https://stedolan.github.io/jq/) to make the output easier to read. + +Let’s try querying the Haystack API directly without using the UI. Here’s an example of a query with cURL and jq: + +```bash +curl -H 'Content-Type: application/json' -H 'Accept: application/json' -d '{"query": "Who is the father of Arya Stark?"}' http://127.0.0.1:8000/query | jq +``` + +We get the same information in the response as what we previously saw in the UI: + +```json +{ + "query": "Who is the father of Arya Stark?", + "answers": [ + { + "answer": "Lord Eddard Stark", + "question": null, + "score": 14.684528350830078, + "probability": 0.9044522047042847, + "context": "ark daughters.\nDuring the Tourney of the Hand to honour her father Lord Eddard Stark, Sansa Stark is enchanted by the knights performing in the event.", + "offset_start": 67, + "offset_end": 84, + "offset_start_in_doc": 659, + "offset_end_in_doc": 676, + "document_id": "a413e1f3-aa2c-4e17-8a47-8b067a5440d1", + "meta": { + "name": "332_Sansa_Stark.txt" + } + }, + ... + ] +} +``` + +## Building a custom API endpoint + +Existing API endpoints are defined using FastAPI route methods, e.g., in the [rest_api/controller/search.py file](https://github.com/deepset-ai/haystack/blob/master/rest_api/controller/search.py#L52). + +You can add custom endpoints to the Haystack API by defining new API endpoints using the FastAPI methods. Creating new endpoints can be handy for making multiple pipelines available under different API endpoints, or if you need to add custom metrics or modify how parameters are being passed to the pipelines. + +Check out the [FastAPI intro tutorial](https://fastapi.tiangolo.com/tutorial/first-steps/) for details of how to use FastAPI methods. diff --git a/docs/v0.10.0/menu.json b/docs/v0.10.0/menu.json new file mode 100644 index 000000000..4b8768109 --- /dev/null +++ b/docs/v0.10.0/menu.json @@ -0,0 +1,130 @@ +[ + { + "subMenuTitle": "Overview", + "pathPrefix": "/overview/", + "items": [ + { + "slug": "get-started", + "title": "Get Started" + }, + { "slug": "intro", "title": "What is Haystack?" 
}, + { "slug": "use-cases", "title": "Use Cases" }, + { "slug": "roadmap", "title": "Roadmap" }, + { "slug": "faq", "title": "FAQ" }, + { "slug": "glossary", "title": "Glossary" } + ] + }, + { + "subMenuTitle": "Components", + "pathPrefix": "/components/", + "items": [ + {"slug": "preprocessing", "title": "Preprocessing"}, + {"slug": "pipelines", "title": "Pipelines"}, + {"slug": "ready-made-pipelines", "title": "Ready-Made Pipelines"}, + {"slug": "document-store", "title": "DocumentStore"}, + {"slug": "retriever", "title": "Retriever"}, + {"slug": "reader", "title": "Reader"}, + {"slug": "generator", "title": "Generator" }, + {"slug": "summarizer", "title": "Summarizer"}, + {"slug": "classifier", "title": "Classifier"}, + {"slug": "translator", "title": "Translator"}, + {"slug": "knowledge-graph", "title": "Knowledge Graph"}, + {"slug": "ranker", "title": "Ranker"}, + {"slug": "query-classifier", "title": "Query Classifier"}, + {"slug": "question-generator", "title": "Question Generator"} ] + }, + { + "subMenuTitle": "Guides", + "pathPrefix": "/guides/", + "items": [ + {"slug": "languages", "title": "Languages Other Than English"}, + {"slug": "domain-adaptation","title": "Domain Adaptation"}, + {"slug": "optimization", "title": "Optimization"}, + {"slug": "evaluation", "title": "Evaluation"}, + {"slug": "annotation", "title": "Annotation Tool"}, + {"slug": "rest-api", "title": "REST API"}, + {"slug": "chatbots", "title": "Chatbot Integration"} + ] + }, + { + "subMenuTitle": "Tutorials", + "pathPrefix": "/tutorials/", + "items": [ + { + "slug": "first-qa-system", + "title": "Build Your First QA System" + }, + { + "slug": "fine-tuning-a-model", + "title": "Fine-tuning a Model on Your Own Data" + }, + { + "slug": "without-elasticsearch", + "title": "Build a QA System Without Elasticsearch" + }, + { + "slug": "existing-faqs", + "title": "Utilizing existing FAQs for Question Answering" + }, + { + "slug": "evaluation", + "title": "Evaluation of a QA System" + }, + { + "slug": "dense-passage-retrieval", + "title": "Better Retrieval via \"Dense Passage Retrieval\"" + }, + { + "slug": "retrieval-augmented-generation", + "title": "Generative QA with \"Retrieval-Augmented Generation\"" + }, + { + "slug": "preprocessing", + "title": "Preprocessing your Documents" + }, + { "slug": "train-dpr", "title": "How to Train DPR" }, + { + "slug": "knowledge-graph", + "title": "Question Answering on a Knowledge Graph" + }, + { + "slug": "pipelines", + "title": "How to use Pipelines" + }, + { + "slug": "lfqa", + "title": "Generative QA with \"LFQA\"" + }, + { + "slug": "question-generation", + "title": "Question Generation" + }, + { + "slug": "query-classifier", + "title": "Query Classifier" + } + ] + }, + { + "subMenuTitle": "API Reference", + "pathPrefix": "/reference/", + "items": [ + {"slug": "document-store", "title": "Document Store"}, + { "slug": "retriever", "title": "Retriever" }, + { "slug": "reader", "title": "Reader" }, + { "slug": "generator", "title": "Generator" }, + {"slug": "summarizer", "title": "Summarizer"}, + {"slug": "translator", "title": "Translator"}, + {"slug": "preprocessor", "title": "Preprocessor"}, + {"slug": "file-converters", "title": "File Converters"}, + {"slug": "crawler", "title": "Crawler" }, + {"slug": "evaluation", "title": "Evaluation"}, + { "slug": "pipelines", "title": "Pipelines" }, + {"slug": "knowledge-graph", "title": "Knowledge Graph"}, + {"slug": "graph-retriever", "title": "Graph Retriever"}, + {"slug": "classifier", "title": "Classifier"}, + {"slug": 
"question-generator", "title": "Question Generator"}, + {"slug": "ranker", "title": "Ranker"} + ] + } +] diff --git a/docs/v0.10.0/overview/faq.mdx b/docs/v0.10.0/overview/faq.mdx new file mode 100644 index 000000000..b6eaa6833 --- /dev/null +++ b/docs/v0.10.0/overview/faq.mdx @@ -0,0 +1,105 @@ +# Frequently Asked Questions + +## Why am I seeing duplicate answers being returned? + +The ElasticsearchDocumentStore and MilvusDocumentStore rely on Elasticsearch and Milvus backend services which +persist after your Python script has finished running. +If you rerun your script without deleting documents, you could end up with duplicate +copies of your documents in your database. +The easiest way to avoid this is to call `DocumentStore.delete_documents()` after initialization +to ensure that you are working with an empty DocumentStore. + +DocumentStores also have a `duplicate_documents` argument in their `__init__()` and `write_documents` methods +where you can define whether you'd like skip writing duplicates, overwrite existing duplicates or raise an error when there are duplicates. + +
+ +## How can I make sure that my GPU is being engaged when I use Haystack? + +You will want to ensure that a CUDA enabled GPU is being engaged when Haystack is running (you can check by running `nvidia-smi -l` on your command line). +Components which can be sped up by GPU have a `use_gpu` argument in their constructor which you will want to set to `True`. + +
+ 
+## How do I speed up my predictions?
+
+There are many different ways to speed up the performance of your Haystack system.
+
+The Reader is usually the most computationally expensive component in a pipeline
+and you can often speed up your system by using a smaller model, like `deepset/minilm-uncased-squad2` (see [benchmarks](https://huggingface.co/deepset/minilm-uncased-squad2)). This usually comes with a small trade-off in accuracy.
+
+You can reduce the workload on the Reader by instructing the Retriever to pass on fewer documents.
+This is done by setting the `top_k_retriever` parameter to a lower value.
+
+Making sure that your documents are shorter can also increase the speed of your system. You can split
+your documents into smaller chunks by using the `PreProcessor` (see [tutorial](https://haystack.deepset.ai/tutorials/pipelines)).
+
+For more optimization suggestions, have a look at our [optimization page](https://haystack.deepset.ai/usage/optimization)
+and our [blogs](https://medium.com/deepset-ai).
+
+ 
+## How do I use Haystack for my language?
+
+The components in Haystack, such as the `Retriever` or the `Reader`, are designed in a language-agnostic way. However, you may
+have to set certain parameters or load models pretrained for your language in order to get good performance out of Haystack.
+See our [languages page](https://haystack.deepset.ai/usage/languages) for more details.
+
+ 
+## How can I add metadata to my documents so that I can apply filters?
+
+When providing your documents in the input format (see [here](https://haystack.deepset.ai/usage/document-store#input-format)),
+you can provide metadata information as a dictionary under the `meta` key. At query time, you can provide a `filters` argument
+(most likely through `Pipeline.run()`) that specifies the accepted values for a certain metadata field
+(for an example of what a `filters` dictionary might look like, please refer to [this example](https://haystack.deepset.ai/reference/retriever#__init__)).
+
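+For illustration, a minimal sketch combining both steps (the `companies` and `years` fields mirror the
+Metadata Filtering example in the optimization guide and are just placeholders):
+
+```python
+# Attach metadata when writing documents
+document_store.write_documents([
+    {"text": "Revenue increased by 20% in 2019 ...", "meta": {"companies": "BMW", "years": "2019"}},
+])
+
+# Restrict the search to a slice of the data at query time
+prediction = pipe.run(
+    query="Why did the revenue increase?",
+    filters={"years": ["2019"], "companies": ["BMW"]},
+)
+```
+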
+ 
+## How can I see each component's output if it's running in a pipeline?
+
+To see a pipeline component's output, you want to place an evaluation node after it in the pipeline.
+You'll want an `EvalDocuments` node after a `Retriever` and an `EvalAnswers` node after the `Reader`.
+Have a look at our [evaluation tutorial](/tutorials/5) to see how this is done.
+
+When initializing them, you want to set `debug=True`.
+This causes their `EvalDocuments.log` or `EvalAnswers.log` to be populated with a record of each prediction made.
+
+For more suggestions on how to debug a pipeline, have a look at our [blog article](https://medium.com/deepset-ai/debugging-pipelines-9a3d43f2d59c) on the topic.
+
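+A rough sketch of how these nodes might be wired in (import path and node names are assumptions; the
+evaluation tutorial linked above shows the full working setup, including how labels are passed in):
+
+```python
+from haystack.eval import EvalDocuments, EvalAnswers
+
+eval_retriever = EvalDocuments(debug=True)  # each set of retrieved documents is recorded in eval_retriever.log
+eval_reader = EvalAnswers(debug=True)       # each predicted answer is recorded in eval_reader.log
+
+p = Pipeline()
+p.add_node(component=retriever, name="Retriever", inputs=["Query"])
+p.add_node(component=eval_retriever, name="EvalDocuments", inputs=["Retriever"])
+p.add_node(component=reader, name="Reader", inputs=["EvalDocuments"])
+p.add_node(component=eval_reader, name="EvalAnswers", inputs=["Reader"])
+```
+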
+ +## How can I serve my Haystack model? + +Haystack models can be wrapped in a REST API. For basic details on how to set this up, please refer to this section +on our [Github page](https://github.com/deepset-ai/haystack/blob/master/README.md#7-rest-api). +More comprehensive documentation coming soon! + +
+ 
+## How can I interpret the confidence scores being returned by the Reader?
+
+The confidence scores are in the range of 0 to 1 and reflect how confident the model is in each prediction that it makes.
+Having a confidence score is particularly useful in cases where you need Haystack to work with a certain accuracy threshold.
+Many of our users have built systems where predictions below a certain confidence value are routed on to a fallback system.
+
+For more information on model confidence and how to tune it, please refer to [this section](https://haystack.deepset.ai/usage/reader#confidence-scores).
+
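+As a small sketch of that fallback pattern (the `probability` field matches the example response shown in the
+REST API guide; the 0.8 threshold is just a placeholder):
+
+```python
+prediction = pipe.run(query="Who is the father of Arya Stark?")
+
+CONFIDENCE_THRESHOLD = 0.8  # hypothetical value, tune to your accuracy requirements
+confident_answers = [a for a in prediction["answers"] if a["probability"] >= CONFIDENCE_THRESHOLD]
+
+if not confident_answers:
+    # route the query on to a fallback system (e.g. a human agent or plain keyword search)
+    pass
+```
+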
+ 
+## My documents aren't showing up in my DocumentStore even though I've called `DocumentStore.write_documents()`
+
+When indexing, retrieving or querying for documents from a DocumentStore, you can specify an `index` on which to perform this action.
+This can be specified in almost all methods of `DocumentStore` as well as `Retriever.retrieve()`.
+Ensure that you are performing these operations on the same index!
+Note that this also applies at evaluation, where labels are written into their own separate DocumentStore index.
+
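+For example, a minimal sketch (the index name `"document"` is an assumption, used here only to show that the
+write and retrieve calls must agree):
+
+```python
+# Write to and retrieve from the same index, otherwise documents will appear to be missing
+document_store.write_documents(dicts, index="document")
+docs = retriever.retrieve(query="Who is the father of Arya Stark?", index="document")
+```
+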
+ 
+## What is the difference between the FARMReader and the TransformersReader?
+
+In short, the FARMReader uses a QA pipeline implementation that comes from our own
+[FARM framework](https://github.com/deepset-ai/FARM) that we can more easily update and also optimize for performance.
+By contrast, the TransformersReader uses a QA pipeline implementation that comes from HuggingFace's [Transformers](https://github.com/huggingface/transformers).
+See [this section](https://haystack.deepset.ai/usage/reader#deeper-dive-farm-vs-transformers)
+for more details about their differences!
diff --git a/docs/v0.10.0/overview/get_started.mdx b/docs/v0.10.0/overview/get_started.mdx
new file mode 100644
index 000000000..a75bfc273
--- /dev/null
+++ b/docs/v0.10.0/overview/get_started.mdx
@@ -0,0 +1,209 @@
+# Get Started
+
+## Installation
+

+The most straightforward way to install Haystack is through pip:
+
+```bash
+pip install farm-haystack
+```
+
+**Editable:** If you’d like to run a specific, unreleased version of Haystack, or make edits to the way Haystack runs,
+you’ll want to install it using `git` and `pip --editable`. This clones a copy of the repo to a local directory and runs Haystack from there.
+
+```bash
+git clone https://github.com/deepset-ai/haystack.git
+cd haystack
+pip install --editable .
+```
+
+By default, this will give you the latest version of the master branch. Use regular git commands to switch between different branches and commits.
+
+**Docker:** Run a demo via Docker.
+
+```bash
+git clone https://github.com/deepset-ai/haystack.git
+cd haystack
+docker-compose pull
+docker-compose up
+```
+
+**Docker (GPU):** Run a demo via Docker using GPUs.
+
+```bash
+git clone https://github.com/deepset-ai/haystack.git
+cd haystack
+docker-compose -f docker-compose-gpu.yml pull
+docker-compose -f docker-compose-gpu.yml up
+```
+
+ 
+**Windows:** On Windows, add the argument `-f https://download.pytorch.org/whl/torch_stable.html` to make sure PyTorch is installed correctly.
+
+ +
+ +**Apple M1 (a.k.a. Apple Silicon):** Please check out [this thread](https://github.com/deepset-ai/haystack/issues/1310#issuecomment-900301287) for a guide on installation + +
+ +
+ 
+## The Building Blocks of Haystack
+
+Here’s a sample of some Haystack code showing a question answering system using a retriever and a reader.
+For a working code example, check out our [starter tutorial](/tutorials/first-qa-system).
+
+```python
+# DocumentStore: holds all your data
+document_store = ElasticsearchDocumentStore()
+
+# Clean & load your documents into the DocumentStore
+dicts = convert_files_to_dicts(doc_dir, clean_func=clean_wiki_text)
+document_store.write_documents(dicts)
+
+# Retriever: A fast and simple algorithm to identify the most promising candidate documents
+retriever = ElasticsearchRetriever(document_store)
+
+# Reader: Powerful but slower neural network trained for QA
+model_name = "deepset/roberta-base-squad2"
+reader = FARMReader(model_name)
+
+# Pipeline: Combines all the components
+pipe = ExtractiveQAPipeline(reader, retriever)
+
+# Voilà! Ask a question!
+question = "Who is the father of Sansa Stark?"
+prediction = pipe.run(query=question)
+print_answers(prediction)
+```
+
+ +## Loading Documents into the DocumentStore + +In Haystack, DocumentStores expect Documents in a dictionary format. They are loaded as follows: + +```python +document_store = ElasticsearchDocumentStore() +dicts = [ + { + 'text': DOCUMENT_TEXT_HERE, + 'meta': {'name': DOCUMENT_NAME, ...} + }, ... +] +document_store.write_documents(dicts) +``` + +When we talk about Documents in Haystack, we are referring specifically to the individual blocks of text that are being held in the DocumentStore. +You might want to use all the text in one file as a Document, or split it into multiple Documents. +This splitting can have a big impact on speed and performance. + +
+ Tip: If Haystack is running very slowly, + you might want to try splitting your text into smaller Documents. If you want + an improvement to performance, you might want to try concatenating text to + make larger Documents. See Optimization for + more details. +
+ +
+ +## Running Search Queries + +There are many different flavours of search that can be created using Haystack. +But to give just one example of what can be achieved, let's look more closely at +an Open Domain Question Answering (ODQA) Pipeline. + +**Querying** in an ODQA system involves searching for an answer to a given question within the full document store. +This process will: + +- make the Retriever filter for a small set of relevant candidate documents + +- get the Reader to process this set of candidate documents + +- return potential answers to the given question + +Usually, there are tight time constraints on querying and so it needs to be a lightweight operation. +When documents are loaded, Haystack will precompute any of the results that might be useful at query time. + +In Haystack, querying is performed with a `Pipeline` object which connects the reader to the retriever. + +```python +# Pipeline: Combines all the components +pipe = ExtractiveQAPipeline(reader, retriever) + +# Voilà! Ask a question! +question = "Who is the father of Sansa Stark?" +prediction = pipe.run(query=question) +print_answers(prediction) +``` + +When the query is complete, you can expect to see results that look something like this: + +```python +[ + { 'answer': 'Eddard', + 'context': 's Nymeria after a legendary warrior queen. She travels ' + "with her father, Eddard, to King's Landing when he is made " + 'Hand of the King. Before she leaves,' + }, ... +] +``` + +
+ 
+## Custom Search Pipelines
+
+Haystack provides many different building blocks for you to mix and match.
+They include:
+
+- Readers
+- Retrievers (sparse and dense)
+- DocumentStores
+- Summarizers
+- Generators
+- Translators
+
+These can all be combined in the configuration that you want.
+Have a look at our [Pipelines page](/components/pipelines) to see what's possible!
diff --git a/docs/v0.10.0/overview/glossary.mdx b/docs/v0.10.0/overview/glossary.mdx
new file mode 100644
index 000000000..566c89d90
--- /dev/null
+++ b/docs/v0.10.0/overview/glossary.mdx
@@ -0,0 +1,56 @@
+# Glossary
+
+**BERT** - A popular, transformer-based language model which has been improved upon but is still considered a common benchmark.
+
+**Dense** - Vectors that contain many non-zero values are considered dense.
+Retrieval methods can also be called dense if they create dense vector representations of documents.
+
+**Document** - A Document in Haystack refers to the individual pieces of text that are stored in the DocumentStore.
+Multiple Documents might originally come from the same file.
+It is ultimately up to you how to divide up your corpus into Documents.
+
+**Document Store** - The component in Haystack that stores the text documents and their metadata.
+Can have a variety of backends such as Elasticsearch, SQL or FAISS.
+
+**FARM** - An open-source transfer learning [framework](https://github.com/deepset-ai/FARM) by deepset.
+FARM’s question answering models are used in Haystack’s Readers.
+
+**Indexing** - To store data in a database in a way that optimises retrieval time.
+The exact steps involved in indexing depend on what kind of retrieval method is chosen.
+
+**Language Model** - The component in an NLP model that stores general language understanding, but no task-specific knowledge.
+
+**Model Hub** - The [repository](https://huggingface.co/models) set up by HuggingFace where trained models can be saved to and loaded from.
+With Haystack, you can directly load and use any question answering model found on the model hub.
+
+**Neural Network** - A machine learning architecture composed of artificial neurons that learn a task when exposed to labelled training data.
+
+**Prediction Head** - The modelling component that adapts the general knowledge of the language model for a specific task.
+In question answering models (and hence in Haystack Readers), this is usually a single-layer neural network.
+
+**Querying** - The task of returning relevant documents from a database.
+
+**Question Answering (QA)** - A popular task in the world of NLP where systems have to find answers to questions.
+The term is generally used to refer to extractive question answering,
+where a system has to find the minimal text span in a given document that contains the answer to the question.
+Note, however, that it may also refer to abstractive question answering or FAQ matching.
+
+**Reader** - The component in Haystack that does the closest reading of a document to extract
+the exact text which answers a question.
+It is, at its core, a trained Question Answering model.
+
+**Retriever** - A lightweight filter that selects only the most relevant documents for the Reader to further process.
+
+**Semantic Search** - A style of search that relies not on the matching of exact string forms
+but on the similarity of meaning between a query and a piece of text.
+
+**Sparse** - Vectors that are composed primarily of zeros are called sparse.
+Retrieval methods are also considered sparse if they build sparse vector representations of documents. + +**SQuAD** - The [Stanford Question Answering Dataset](https://rajpurkar.github.io/SQuAD-explorer/) is the defacto standard QA dataset. +The documents are paragraphs from Wikipedia and the question / answer pairs are created by human annotators. + +**Transformers** - Originally refers to the deep learning architecture that is composed of stacked self-attention layers +(first conceptualised [here](https://arxiv.org/pdf/1706.03762.pdf)). +Can also refer to HuggingFace’s [repository](https://github.com/huggingface/transformers) +which contains implementations of popular model architectures. diff --git a/docs/v0.10.0/overview/intro.mdx b/docs/v0.10.0/overview/intro.mdx new file mode 100644 index 000000000..d5907a188 --- /dev/null +++ b/docs/v0.10.0/overview/intro.mdx @@ -0,0 +1,55 @@ +# What is Haystack? + +Haystack is an **open-source framework** for building **search systems** that work intelligently over large document collections. +Recent advances in NLP have enabled the application of question answering, retrieval and summarization to real world settings +and Haystack is designed to be the bridge between research and industry. + +- **Latest NLP models**: Utilize all transformer based models (BERT, RoBERTa, MiniLM, DPR ...) and smoothly switch when new ones get published + +- **Flexible databases**: Load data into and query from a range of databases such as Elasticsearch, Milvus, FAISS, SQL and more + +- **Scalability**: Production-ready deployments that scale to millions of documents + +- **End-to-End**: All tooling you need to implement, evaluate, improve and run a search system + +- **Domain adaptation**: Fine-tune models to your own domain & improve them continuously via user feedback + +
+ 
+## Retriever-Reader
+
+The most common system built with Haystack is the Retriever-Reader Pipeline, which is designed to optimize for both
+speed and performance on the task of Open Domain Question Answering.
+In practice, this is a great backbone for creating a search system that can handle detailed, full-sentence queries.
+
+![image](/img/retriever_reader.png)
+
+**Readers**, also known as Open-Domain QA systems in Machine Learning speak,
+are powerful models that do close analysis of documents and perform the core task of question answering.
+The Readers in Haystack are trained from the latest transformer-based language models and can be significantly sped up using GPU acceleration.
+However, it is not currently feasible to use the Reader directly on a large collection of documents.
+
+The **Retriever** assists the Reader by acting as a lightweight filter that reduces the number of documents that the Reader has to process.
+It does this by:
+
+- Scanning through all documents in the database
+
+- Quickly identifying the relevant and dismissing the irrelevant
+
+- Passing on only a small candidate set of documents to the Reader
+
+Current methods fall into one of two categories:
+
+- sparse
+
+  - keyword based
+  - fast indexing and querying
+  - e.g. BM25
+
+- dense
+  - neural network based
+  - computationally heavy indexing but fast querying
+  - e.g. Dense Passage Retrieval
diff --git a/docs/v0.10.0/overview/roadmap.mdx b/docs/v0.10.0/overview/roadmap.mdx
new file mode 100644
index 000000000..331f49029
--- /dev/null
+++ b/docs/v0.10.0/overview/roadmap.mdx
@@ -0,0 +1,47 @@
+# Open roadmap
+
+We believe open-source is more than open source code. It's a lot about people, collaboration, transparency and trust.
+Therefore, we decided to be as open as possible with our roadmap and sprint planning.
+In fact, you can see all of it in real-time on GitHub.
+We hope this helps to clarify the direction of the open-source project and inspires discussions in the community.
+
+ +## How to access it + +We decided for Zenhub, as it allows a close integration with GitHub and real-time sharing of roadmaps and sprints. +Once you have installed the browser plugin below, you will see additional tabs and infos on the Haystack GitHub page. + +Zenhub Plugin: https://www.zenhub.com/extension + +
+ 
+## How to access it
+
+We decided on Zenhub, as it allows close integration with GitHub and real-time sharing of roadmaps and sprints.
+Once you have installed the browser plugin below, you will see additional tabs and information on the Haystack GitHub page.
+
+Zenhub Plugin: https://www.zenhub.com/extension
+
+ +## Board + +_Zenhub Tab -> Board (left menu)_ + +If you are interested in the **operational tasks** and their status, you can find our agile board here. + +![image](/img/zenhub_board.png) + +
+ 
+## Additional issue details & Releases
+
+_Right panel in regular GitHub issues_
+
+With Zenhub you can also see some additional tags in every GitHub issue.
+For those of you who wonder about the next release date: we aim for a release every ~4 weeks and tag the issues that need to be finished for it a bit in advance.
+
+![image](/img/zenhub_issue.png)
diff --git a/docs/v0.10.0/overview/use_cases.mdx b/docs/v0.10.0/overview/use_cases.mdx
new file mode 100644
index 000000000..add2223c7
--- /dev/null
+++ b/docs/v0.10.0/overview/use_cases.mdx
@@ -0,0 +1,52 @@
+# Use cases
+
+## Semantic Search System
+
+Take the leap from using keyword search on your own documents to semantic search with Haystack.
+
+- Store your documents in the database of your choice (Elasticsearch, SQL, in memory, FAISS)
+
+- Perform question-driven queries.
+
+Expect to see results that highlight the very sentence that contains the answer to your question.
+Thanks to the power of Transformer-based language models, results are chosen based on compatibility in meaning
+rather than lexical overlap.
+
+![image](/img/search.png)
+
+ +## Information Extractor + +Automate the extraction of relevant information from a set of documents that pertain to the same topics but for different entities. + +Haystack can: + +- Apply a set of standard questions to each document in a store + +- Return a NO_ANSWER if a given document does not contain the answer to a question + +Say you have the financial reports for different companies over different years. +You can gather a set of standard questions which are applicable to each financial report, +like _what is the revenue forecast for 2020?_ or _what are the main sources of income?_. +Haystack will try to find an answer for each question within each document! + +We’ve seen this style of application be particularly effective in the sphere of finance and patent law +but we see a lot of potential in using this to gain a better overview of academic papers and internal business documents. + +
+ +## FAQ Style Question Answering + +Leverage existing FAQ documents and semantic similarity search to answer new incoming questions. +The workflow is as follows: + +- Store a set of FAQ documents in Haystack + +- The user presents a new question + +- Haystack will find the closest match to the new question in the FAQ documents + +- The user will be presented with the most similar Question Answer pair + +Haystack’s flexibility allows you to give new users more dynamic access to your existing documentation. diff --git a/lib/github.ts b/lib/github.ts index d704e8363..6acd393d4 100644 --- a/lib/github.ts +++ b/lib/github.ts @@ -17,7 +17,7 @@ export const getDownloadUrl = async ({ const res = await octokit.rest.repos.getContent({ owner: "deepset-ai", repo: "haystack", - path: `docs${version && version !== "latest" ? `/${version}` : ""}${repoPath}${filename}`, + path: `docs${version && version !== "latest" ? `/${version}` : ""}${repoPath}${filename}` }); if (Array.isArray(res.data)) return; if (!res.data.download_url) return; diff --git a/lib/utils.ts b/lib/utils.ts index 9352472d7..1b4240a42 100644 --- a/lib/utils.ts +++ b/lib/utils.ts @@ -96,6 +96,7 @@ export const getMenu = async (version?: string) => { export async function getDocsVersions() { const tagNames = await getHaystackReleaseTagNames(); + tagNames.push('v0.10.0'); return tagNames.filter((tagName) => tagName.startsWith("v")); } diff --git a/package-lock.json b/package-lock.json index 4bccdec3e..7b5fdf05c 100644 --- a/package-lock.json +++ b/package-lock.json @@ -17,15 +17,15 @@ "github-slugger": "^1.4.0", "gray-matter": "^4.0.3", "markdown-to-jsx": "^7.1.3", - "next": "^11.1.0", + "next": "^11.1.2", "next-mdx-remote": "^3.0.4", "next-remote-watch": "^1.0.0", "next-sitemap": "^1.6.164", "octokit": "^1.1.0", "prism-react-renderer": "^1.2.1", "prismjs": "^1.24.1", - "react": "17.0.2", - "react-dom": "17.0.2", + "react": "^17.0.2", + "react-dom": "^17.0.2", "react-google-charts": "^3.0.15", "react-icons": "^4.2.0", "remark": "^13.0.0", @@ -533,9 +533,9 @@ } }, "node_modules/@babel/runtime": { - "version": "7.14.8", - "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.14.8.tgz", - "integrity": "sha512-twj3L8Og5SaCRCErB4x4ajbvBIVV77CGeFglHpeg5WC5FF8TZzBWXtTJ4MqaD9QszLYTtr+IsaAL2rEUevb+eg==", + "version": "7.15.3", + "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.15.3.tgz", + "integrity": "sha512-OvwMLqNXkCXSz1kSm58sEsNuhqOx/fKpnUnKnFB5v8uDda5bLNEHNgKPvhDN6IU0LDcnHQ90LlJ0Q6jnyBSIBA==", "dependencies": { "regenerator-runtime": "^0.13.4" }, @@ -1469,11 +1469,6 @@ } } }, - "node_modules/@material-ui/styles/node_modules/csstype": { - "version": "2.6.17", - "resolved": "https://registry.npmjs.org/csstype/-/csstype-2.6.17.tgz", - "integrity": "sha512-u1wmTI1jJGzCJzWndZo8mk4wnPTZd1eOIYTYvuEyOQGfmDl3TrabCCfKnOC86FZwW/9djqTl933UF/cS425i9A==" - }, "node_modules/@material-ui/system": { "version": "4.12.1", "resolved": "https://registry.npmjs.org/@material-ui/system/-/system-4.12.1.tgz", @@ -1502,11 +1497,6 @@ } } }, - "node_modules/@material-ui/system/node_modules/csstype": { - "version": "2.6.17", - "resolved": "https://registry.npmjs.org/csstype/-/csstype-2.6.17.tgz", - "integrity": "sha512-u1wmTI1jJGzCJzWndZo8mk4wnPTZd1eOIYTYvuEyOQGfmDl3TrabCCfKnOC86FZwW/9djqTl933UF/cS425i9A==" - }, "node_modules/@material-ui/types": { "version": "5.1.0", "resolved": "https://registry.npmjs.org/@material-ui/types/-/types-5.1.0.tgz", @@ -1659,9 +1649,9 @@ "integrity": 
"sha512-jDJTpta+P4p1NZTFVLHJ/TLFVYVcOqv6l8xwOeBKNPMgY/zDYH/YH7SJbvrr/h1RcS9GzbPcLKGzpuK9cV56UA==" }, "node_modules/@next/env": { - "version": "11.1.0", - "resolved": "https://registry.npmjs.org/@next/env/-/env-11.1.0.tgz", - "integrity": "sha512-zPJkMFRenSf7BLlVee8987G0qQXAhxy7k+Lb/5hLAGkPVHAHm+oFFeL+2ipbI2KTEFlazdmGY0M+AlLQn7pWaw==" + "version": "11.1.2", + "resolved": "https://registry.npmjs.org/@next/env/-/env-11.1.2.tgz", + "integrity": "sha512-+fteyVdQ7C/OoulfcF6vd1Yk0FEli4453gr8kSFbU8sKseNSizYq6df5MKz/AjwLptsxrUeIkgBdAzbziyJ3mA==" }, "node_modules/@next/eslint-plugin-next": { "version": "11.0.1", @@ -1670,14 +1660,14 @@ "dev": true }, "node_modules/@next/polyfill-module": { - "version": "11.1.0", - "resolved": "https://registry.npmjs.org/@next/polyfill-module/-/polyfill-module-11.1.0.tgz", - "integrity": "sha512-64EgW8SzJRQls2yJ5DkuljRxgE24o2kYtX/ghTkPUJYsfidHMWzQGwg26IgRbb/uHqTd1G0W5UkKag+Nt8TWaQ==" + "version": "11.1.2", + "resolved": "https://registry.npmjs.org/@next/polyfill-module/-/polyfill-module-11.1.2.tgz", + "integrity": "sha512-xZmixqADM3xxtqBV0TpAwSFzWJP0MOQzRfzItHXf1LdQHWb0yofHHC+7eOrPFic8+ZGz5y7BdPkkgR1S25OymA==" }, "node_modules/@next/react-dev-overlay": { - "version": "11.1.0", - "resolved": "https://registry.npmjs.org/@next/react-dev-overlay/-/react-dev-overlay-11.1.0.tgz", - "integrity": "sha512-h+ry0sTk1W3mJw+TwEf91aqLbBJ5oqAsxfx+QryqEItNtfW6zLSSjxkyTYTqX8DkgSssQQutQfATkzBVgOR+qQ==", + "version": "11.1.2", + "resolved": "https://registry.npmjs.org/@next/react-dev-overlay/-/react-dev-overlay-11.1.2.tgz", + "integrity": "sha512-rDF/mGY2NC69mMg2vDqzVpCOlWqnwPUXB2zkARhvknUHyS6QJphPYv9ozoPJuoT/QBs49JJd9KWaAzVBvq920A==", "dependencies": { "@babel/code-frame": "7.12.11", "anser": "1.4.9", @@ -1816,9 +1806,9 @@ } }, "node_modules/@next/react-refresh-utils": { - "version": "11.1.0", - "resolved": "https://registry.npmjs.org/@next/react-refresh-utils/-/react-refresh-utils-11.1.0.tgz", - "integrity": "sha512-g5DtFTpLTGa36iy9DuZawtJeitI11gysFGKPQQqy+mNbSFazguArcJ10gAYFlbqpIi4boUamWNI5mAoSPx3kog==", + "version": "11.1.2", + "resolved": "https://registry.npmjs.org/@next/react-refresh-utils/-/react-refresh-utils-11.1.2.tgz", + "integrity": "sha512-hsoJmPfhVqjZ8w4IFzoo8SyECVnN+8WMnImTbTKrRUHOVJcYMmKLL7xf7T0ft00tWwAl/3f3Q3poWIN2Ueql/Q==", "peerDependencies": { "react-refresh": "0.8.3", "webpack": "^4 || ^5" @@ -1829,6 +1819,66 @@ } } }, + "node_modules/@next/swc-darwin-arm64": { + "version": "11.1.2", + "resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-11.1.2.tgz", + "integrity": "sha512-hZuwOlGOwBZADA8EyDYyjx3+4JGIGjSHDHWrmpI7g5rFmQNltjlbaefAbiU5Kk7j3BUSDwt30quJRFv3nyJQ0w==", + "cpu": [ + "arm64" + ], + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@next/swc-darwin-x64": { + "version": "11.1.2", + "resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-11.1.2.tgz", + "integrity": "sha512-PGOp0E1GisU+EJJlsmJVGE+aPYD0Uh7zqgsrpD3F/Y3766Ptfbe1lEPPWnRDl+OzSSrSrX1lkyM/Jlmh5OwNvA==", + "cpu": [ + "x64" + ], + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@next/swc-linux-x64-gnu": { + "version": "11.1.2", + "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-11.1.2.tgz", + "integrity": "sha512-YcDHTJjn/8RqvyJVB6pvEKXihDcdrOwga3GfMv/QtVeLphTouY4BIcEUfrG5+26Nf37MP1ywN3RRl1TxpurAsQ==", + "cpu": [ + "x64" + ], + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + 
"node_modules/@next/swc-win32-x64-msvc": { + "version": "11.1.2", + "resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-11.1.2.tgz", + "integrity": "sha512-e/pIKVdB+tGQYa1cW3sAeHm8gzEri/HYLZHT4WZojrUxgWXqx8pk7S7Xs47uBcFTqBDRvK3EcQpPLf3XdVsDdg==", + "cpu": [ + "x64" + ], + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">= 10" + } + }, "node_modules/@node-rs/helper": { "version": "1.2.1", "resolved": "https://registry.npmjs.org/@node-rs/helper/-/helper-1.2.1.tgz", @@ -2349,6 +2399,11 @@ "@types/react": "*" } }, + "node_modules/@types/react/node_modules/csstype": { + "version": "3.0.8", + "resolved": "https://registry.npmjs.org/csstype/-/csstype-3.0.8.tgz", + "integrity": "sha512-jXKhWqXPmlUeoQnF/EhTtTl4C9SnrxSH/jZUih3jmO6lBKr99rP3/+FmrMj4EFpOXzMtXHAZkd3x0E6h6Fgflw==" + }, "node_modules/@types/scheduler": { "version": "0.16.2", "resolved": "https://registry.npmjs.org/@types/scheduler/-/scheduler-0.16.2.tgz", @@ -3928,9 +3983,9 @@ "integrity": "sha512-b0tGHbfegbhPJpxpiBPU2sCkigAqtM9O121le6bbOlgyV+NyGyCmVfJ6QW9eRjz8CpNfWEOYBIMIGRYkLwsIYg==" }, "node_modules/csstype": { - "version": "3.0.8", - "resolved": "https://registry.npmjs.org/csstype/-/csstype-3.0.8.tgz", - "integrity": "sha512-jXKhWqXPmlUeoQnF/EhTtTl4C9SnrxSH/jZUih3jmO6lBKr99rP3/+FmrMj4EFpOXzMtXHAZkd3x0E6h6Fgflw==" + "version": "2.6.17", + "resolved": "https://registry.npmjs.org/csstype/-/csstype-2.6.17.tgz", + "integrity": "sha512-u1wmTI1jJGzCJzWndZo8mk4wnPTZd1eOIYTYvuEyOQGfmDl3TrabCCfKnOC86FZwW/9djqTl933UF/cS425i9A==" }, "node_modules/damerau-levenshtein": { "version": "1.0.7", @@ -4159,6 +4214,11 @@ "csstype": "^3.0.2" } }, + "node_modules/dom-helpers/node_modules/csstype": { + "version": "3.0.8", + "resolved": "https://registry.npmjs.org/csstype/-/csstype-3.0.8.tgz", + "integrity": "sha512-jXKhWqXPmlUeoQnF/EhTtTl4C9SnrxSH/jZUih3jmO6lBKr99rP3/+FmrMj4EFpOXzMtXHAZkd3x0E6h6Fgflw==" + }, "node_modules/domain-browser": { "version": "4.19.0", "resolved": "https://registry.npmjs.org/domain-browser/-/domain-browser-4.19.0.tgz", @@ -5469,6 +5529,19 @@ "integrity": "sha1-FQStJSMVjKpA20onh8sBQRmU6k8=", "dev": true }, + "node_modules/fsevents": { + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz", + "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==", + "hasInstallScript": true, + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, "node_modules/function-bind": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.1.tgz", @@ -8922,6 +8995,11 @@ "jss": "10.7.1" } }, + "node_modules/jss/node_modules/csstype": { + "version": "3.0.8", + "resolved": "https://registry.npmjs.org/csstype/-/csstype-3.0.8.tgz", + "integrity": "sha512-jXKhWqXPmlUeoQnF/EhTtTl4C9SnrxSH/jZUih3jmO6lBKr99rP3/+FmrMj4EFpOXzMtXHAZkd3x0E6h6Fgflw==" + }, "node_modules/jsx-ast-utils": { "version": "3.2.0", "resolved": "https://registry.npmjs.org/jsx-ast-utils/-/jsx-ast-utils-3.2.0.tgz", @@ -9346,6 +9424,15 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/mdast-util-from-markdown/node_modules/mdast-util-to-string": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/mdast-util-to-string/-/mdast-util-to-string-2.0.0.tgz", + "integrity": "sha512-AW4DRS3QbBayY/jJmD8437V1Gombjf8RSOUCMFBuo5iHi58AGEgVCKQ+ezHkZZDpAQS75hcBMpLqjpJTjtUL7w==", + "funding": { + "type": "opencollective", + "url": 
"https://opencollective.com/unified" + } + }, "node_modules/mdast-util-to-hast": { "version": "10.0.1", "resolved": "https://registry.npmjs.org/mdast-util-to-hast/-/mdast-util-to-hast-10.0.1.tgz", @@ -9409,7 +9496,7 @@ "url": "https://opencollective.com/unified" } }, - "node_modules/mdast-util-to-string": { + "node_modules/mdast-util-to-markdown/node_modules/mdast-util-to-string": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/mdast-util-to-string/-/mdast-util-to-string-2.0.0.tgz", "integrity": "sha512-AW4DRS3QbBayY/jJmD8437V1Gombjf8RSOUCMFBuo5iHi58AGEgVCKQ+ezHkZZDpAQS75hcBMpLqjpJTjtUL7w==", @@ -9418,6 +9505,15 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/mdast-util-to-string": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/mdast-util-to-string/-/mdast-util-to-string-3.1.0.tgz", + "integrity": "sha512-n4Vypz/DZgwo0iMHLQL49dJzlp7YtAJP+N07MZHpjPf/5XJuHUWstviF4Mn2jEiR/GNmtnRRqnwsXExk3igfFA==", + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/mdurl": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/mdurl/-/mdurl-1.0.1.tgz", @@ -9624,16 +9720,16 @@ } }, "node_modules/next": { - "version": "11.1.0", - "resolved": "https://registry.npmjs.org/next/-/next-11.1.0.tgz", - "integrity": "sha512-GHBk/c7Wyr6YbFRFZF37I0X7HKzkHHI8pur/loyXo5AIE8wdkbGPGO0ds3vNAO6f8AxZAKGCRYtAzoGlVLoifA==", + "version": "11.1.2", + "resolved": "https://registry.npmjs.org/next/-/next-11.1.2.tgz", + "integrity": "sha512-azEYL0L+wFjv8lstLru3bgvrzPvK0P7/bz6B/4EJ9sYkXeW8r5Bjh78D/Ol7VOg0EIPz0CXoe72hzAlSAXo9hw==", "dependencies": { - "@babel/runtime": "7.12.5", + "@babel/runtime": "7.15.3", "@hapi/accept": "5.0.2", - "@next/env": "11.1.0", - "@next/polyfill-module": "11.1.0", - "@next/react-dev-overlay": "11.1.0", - "@next/react-refresh-utils": "11.1.0", + "@next/env": "11.1.2", + "@next/polyfill-module": "11.1.2", + "@next/react-dev-overlay": "11.1.2", + "@next/react-refresh-utils": "11.1.2", "@node-rs/helper": "1.2.1", "assert": "2.0.0", "ast-types": "0.13.2", @@ -9671,11 +9767,11 @@ "stream-browserify": "3.0.0", "stream-http": "3.1.1", "string_decoder": "1.3.0", - "styled-jsx": "4.0.0", + "styled-jsx": "4.0.1", "timers-browserify": "2.0.12", "tty-browserify": "0.0.1", "use-subscription": "1.5.1", - "util": "0.12.3", + "util": "0.12.4", "vm-browserify": "1.1.2", "watchpack": "2.1.1" }, @@ -9685,6 +9781,12 @@ "engines": { "node": ">=12.0.0" }, + "optionalDependencies": { + "@next/swc-darwin-arm64": "11.1.2", + "@next/swc-darwin-x64": "11.1.2", + "@next/swc-linux-x64-gnu": "11.1.2", + "@next/swc-win32-x64-msvc": "11.1.2" + }, "peerDependencies": { "fibers": ">= 3.1.0", "node-sass": "^4.0.0 || ^5.0.0", @@ -9868,14 +9970,6 @@ "next": "*" } }, - "node_modules/next/node_modules/@babel/runtime": { - "version": "7.12.5", - "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.12.5.tgz", - "integrity": "sha512-plcc+hbExy3McchJCEQG3knOsuh3HH+Prx1P6cLIkET/0dLuQDEnrT+s27Axgc9bqfsmNUNHfscgMUdBpC9xfg==", - "dependencies": { - "regenerator-runtime": "^0.13.4" - } - }, "node_modules/next/node_modules/browserslist": { "version": "4.16.6", "resolved": "https://registry.npmjs.org/browserslist/-/browserslist-4.16.6.tgz", @@ -11858,15 +11952,6 @@ "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/remark-slug/node_modules/mdast-util-to-string": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/mdast-util-to-string/-/mdast-util-to-string-3.1.0.tgz", - "integrity": 
"sha512-n4Vypz/DZgwo0iMHLQL49dJzlp7YtAJP+N07MZHpjPf/5XJuHUWstviF4Mn2jEiR/GNmtnRRqnwsXExk3igfFA==", - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, "node_modules/remark-slug/node_modules/trough": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/trough/-/trough-2.0.2.tgz", @@ -12744,9 +12829,9 @@ } }, "node_modules/styled-jsx": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/styled-jsx/-/styled-jsx-4.0.0.tgz", - "integrity": "sha512-2USeoWMoJ/Lx5s2y1PxuvLy/cz2Yrr8cTySV3ILHU1Vmaw1bnV7suKdblLPjnyhMD+qzN7B1SWyh4UZTARn/WA==", + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/styled-jsx/-/styled-jsx-4.0.1.tgz", + "integrity": "sha512-Gcb49/dRB1k8B4hdK8vhW27Rlb2zujCk1fISrizCcToIs+55B4vmUM0N9Gi4nnVfFZWe55jRdWpAqH1ldAKWvQ==", "dependencies": { "@babel/plugin-syntax-jsx": "7.14.5", "@babel/types": "7.15.0", @@ -12757,8 +12842,16 @@ "stylis": "3.5.4", "stylis-rule-sheet": "0.0.10" }, + "engines": { + "node": ">= 12.0.0" + }, "peerDependencies": { "react": ">= 16.8.0 || 17.x.x || 18.x.x" + }, + "peerDependenciesMeta": { + "@babel/core": { + "optional": true + } } }, "node_modules/styled-jsx/node_modules/@babel/plugin-syntax-jsx": { @@ -13636,9 +13729,9 @@ } }, "node_modules/util": { - "version": "0.12.3", - "resolved": "https://registry.npmjs.org/util/-/util-0.12.3.tgz", - "integrity": "sha512-I8XkoQwE+fPQEhy9v012V+TSdH2kp9ts29i20TaaDUXsg7x/onePbhFJUExBfv/2ay1ZOp/Vsm3nDlmnFGSAog==", + "version": "0.12.4", + "resolved": "https://registry.npmjs.org/util/-/util-0.12.4.tgz", + "integrity": "sha512-bxZ9qtSlGUWSOy9Qa9Xgk11kSslpuZwaxCg4sNIDj6FLucDab2JxnHwyNTCpHMtK1MjoQiWQ6DiUMZYbSrO+Sw==", "dependencies": { "inherits": "^2.0.3", "is-arguments": "^1.0.4", @@ -14424,9 +14517,9 @@ } }, "@babel/runtime": { - "version": "7.14.8", - "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.14.8.tgz", - "integrity": "sha512-twj3L8Og5SaCRCErB4x4ajbvBIVV77CGeFglHpeg5WC5FF8TZzBWXtTJ4MqaD9QszLYTtr+IsaAL2rEUevb+eg==", + "version": "7.15.3", + "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.15.3.tgz", + "integrity": "sha512-OvwMLqNXkCXSz1kSm58sEsNuhqOx/fKpnUnKnFB5v8uDda5bLNEHNgKPvhDN6IU0LDcnHQ90LlJ0Q6jnyBSIBA==", "requires": { "regenerator-runtime": "^0.13.4" } @@ -15114,13 +15207,6 @@ "jss-plugin-rule-value-function": "^10.5.1", "jss-plugin-vendor-prefixer": "^10.5.1", "prop-types": "^15.7.2" - }, - "dependencies": { - "csstype": { - "version": "2.6.17", - "resolved": "https://registry.npmjs.org/csstype/-/csstype-2.6.17.tgz", - "integrity": "sha512-u1wmTI1jJGzCJzWndZo8mk4wnPTZd1eOIYTYvuEyOQGfmDl3TrabCCfKnOC86FZwW/9djqTl933UF/cS425i9A==" - } } }, "@material-ui/system": { @@ -15132,13 +15218,6 @@ "@material-ui/utils": "^4.11.2", "csstype": "^2.5.2", "prop-types": "^15.7.2" - }, - "dependencies": { - "csstype": { - "version": "2.6.17", - "resolved": "https://registry.npmjs.org/csstype/-/csstype-2.6.17.tgz", - "integrity": "sha512-u1wmTI1jJGzCJzWndZo8mk4wnPTZd1eOIYTYvuEyOQGfmDl3TrabCCfKnOC86FZwW/9djqTl933UF/cS425i9A==" - } } }, "@material-ui/types": { @@ -15249,9 +15328,9 @@ "integrity": "sha512-jDJTpta+P4p1NZTFVLHJ/TLFVYVcOqv6l8xwOeBKNPMgY/zDYH/YH7SJbvrr/h1RcS9GzbPcLKGzpuK9cV56UA==" }, "@next/env": { - "version": "11.1.0", - "resolved": "https://registry.npmjs.org/@next/env/-/env-11.1.0.tgz", - "integrity": "sha512-zPJkMFRenSf7BLlVee8987G0qQXAhxy7k+Lb/5hLAGkPVHAHm+oFFeL+2ipbI2KTEFlazdmGY0M+AlLQn7pWaw==" + "version": "11.1.2", + "resolved": "https://registry.npmjs.org/@next/env/-/env-11.1.2.tgz", + 
"integrity": "sha512-+fteyVdQ7C/OoulfcF6vd1Yk0FEli4453gr8kSFbU8sKseNSizYq6df5MKz/AjwLptsxrUeIkgBdAzbziyJ3mA==" }, "@next/eslint-plugin-next": { "version": "11.0.1", @@ -15260,14 +15339,14 @@ "dev": true }, "@next/polyfill-module": { - "version": "11.1.0", - "resolved": "https://registry.npmjs.org/@next/polyfill-module/-/polyfill-module-11.1.0.tgz", - "integrity": "sha512-64EgW8SzJRQls2yJ5DkuljRxgE24o2kYtX/ghTkPUJYsfidHMWzQGwg26IgRbb/uHqTd1G0W5UkKag+Nt8TWaQ==" + "version": "11.1.2", + "resolved": "https://registry.npmjs.org/@next/polyfill-module/-/polyfill-module-11.1.2.tgz", + "integrity": "sha512-xZmixqADM3xxtqBV0TpAwSFzWJP0MOQzRfzItHXf1LdQHWb0yofHHC+7eOrPFic8+ZGz5y7BdPkkgR1S25OymA==" }, "@next/react-dev-overlay": { - "version": "11.1.0", - "resolved": "https://registry.npmjs.org/@next/react-dev-overlay/-/react-dev-overlay-11.1.0.tgz", - "integrity": "sha512-h+ry0sTk1W3mJw+TwEf91aqLbBJ5oqAsxfx+QryqEItNtfW6zLSSjxkyTYTqX8DkgSssQQutQfATkzBVgOR+qQ==", + "version": "11.1.2", + "resolved": "https://registry.npmjs.org/@next/react-dev-overlay/-/react-dev-overlay-11.1.2.tgz", + "integrity": "sha512-rDF/mGY2NC69mMg2vDqzVpCOlWqnwPUXB2zkARhvknUHyS6QJphPYv9ozoPJuoT/QBs49JJd9KWaAzVBvq920A==", "requires": { "@babel/code-frame": "7.12.11", "anser": "1.4.9", @@ -15377,11 +15456,35 @@ } }, "@next/react-refresh-utils": { - "version": "11.1.0", - "resolved": "https://registry.npmjs.org/@next/react-refresh-utils/-/react-refresh-utils-11.1.0.tgz", - "integrity": "sha512-g5DtFTpLTGa36iy9DuZawtJeitI11gysFGKPQQqy+mNbSFazguArcJ10gAYFlbqpIi4boUamWNI5mAoSPx3kog==", + "version": "11.1.2", + "resolved": "https://registry.npmjs.org/@next/react-refresh-utils/-/react-refresh-utils-11.1.2.tgz", + "integrity": "sha512-hsoJmPfhVqjZ8w4IFzoo8SyECVnN+8WMnImTbTKrRUHOVJcYMmKLL7xf7T0ft00tWwAl/3f3Q3poWIN2Ueql/Q==", "requires": {} }, + "@next/swc-darwin-arm64": { + "version": "11.1.2", + "resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-11.1.2.tgz", + "integrity": "sha512-hZuwOlGOwBZADA8EyDYyjx3+4JGIGjSHDHWrmpI7g5rFmQNltjlbaefAbiU5Kk7j3BUSDwt30quJRFv3nyJQ0w==", + "optional": true + }, + "@next/swc-darwin-x64": { + "version": "11.1.2", + "resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-11.1.2.tgz", + "integrity": "sha512-PGOp0E1GisU+EJJlsmJVGE+aPYD0Uh7zqgsrpD3F/Y3766Ptfbe1lEPPWnRDl+OzSSrSrX1lkyM/Jlmh5OwNvA==", + "optional": true + }, + "@next/swc-linux-x64-gnu": { + "version": "11.1.2", + "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-11.1.2.tgz", + "integrity": "sha512-YcDHTJjn/8RqvyJVB6pvEKXihDcdrOwga3GfMv/QtVeLphTouY4BIcEUfrG5+26Nf37MP1ywN3RRl1TxpurAsQ==", + "optional": true + }, + "@next/swc-win32-x64-msvc": { + "version": "11.1.2", + "resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-11.1.2.tgz", + "integrity": "sha512-e/pIKVdB+tGQYa1cW3sAeHm8gzEri/HYLZHT4WZojrUxgWXqx8pk7S7Xs47uBcFTqBDRvK3EcQpPLf3XdVsDdg==", + "optional": true + }, "@node-rs/helper": { "version": "1.2.1", "resolved": "https://registry.npmjs.org/@node-rs/helper/-/helper-1.2.1.tgz", @@ -15862,6 +15965,13 @@ "@types/prop-types": "*", "@types/scheduler": "*", "csstype": "^3.0.2" + }, + "dependencies": { + "csstype": { + "version": "3.0.8", + "resolved": "https://registry.npmjs.org/csstype/-/csstype-3.0.8.tgz", + "integrity": "sha512-jXKhWqXPmlUeoQnF/EhTtTl4C9SnrxSH/jZUih3jmO6lBKr99rP3/+FmrMj4EFpOXzMtXHAZkd3x0E6h6Fgflw==" + } } }, "@types/react-dom": { @@ -17100,9 +17210,9 @@ } }, "csstype": { - "version": "3.0.8", - "resolved": 
"https://registry.npmjs.org/csstype/-/csstype-3.0.8.tgz", - "integrity": "sha512-jXKhWqXPmlUeoQnF/EhTtTl4C9SnrxSH/jZUih3jmO6lBKr99rP3/+FmrMj4EFpOXzMtXHAZkd3x0E6h6Fgflw==" + "version": "2.6.17", + "resolved": "https://registry.npmjs.org/csstype/-/csstype-2.6.17.tgz", + "integrity": "sha512-u1wmTI1jJGzCJzWndZo8mk4wnPTZd1eOIYTYvuEyOQGfmDl3TrabCCfKnOC86FZwW/9djqTl933UF/cS425i9A==" }, "damerau-levenshtein": { "version": "1.0.7", @@ -17283,6 +17393,13 @@ "requires": { "@babel/runtime": "^7.8.7", "csstype": "^3.0.2" + }, + "dependencies": { + "csstype": { + "version": "3.0.8", + "resolved": "https://registry.npmjs.org/csstype/-/csstype-3.0.8.tgz", + "integrity": "sha512-jXKhWqXPmlUeoQnF/EhTtTl4C9SnrxSH/jZUih3jmO6lBKr99rP3/+FmrMj4EFpOXzMtXHAZkd3x0E6h6Fgflw==" + } } }, "domain-browser": { @@ -18300,6 +18417,12 @@ "integrity": "sha1-FQStJSMVjKpA20onh8sBQRmU6k8=", "dev": true }, + "fsevents": { + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz", + "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==", + "optional": true + }, "function-bind": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.1.tgz", @@ -20773,6 +20896,13 @@ "csstype": "^3.0.2", "is-in-browser": "^1.1.3", "tiny-warning": "^1.0.2" + }, + "dependencies": { + "csstype": { + "version": "3.0.8", + "resolved": "https://registry.npmjs.org/csstype/-/csstype-3.0.8.tgz", + "integrity": "sha512-jXKhWqXPmlUeoQnF/EhTtTl4C9SnrxSH/jZUih3jmO6lBKr99rP3/+FmrMj4EFpOXzMtXHAZkd3x0E6h6Fgflw==" + } } }, "jss-plugin-camel-case": { @@ -21181,6 +21311,13 @@ "micromark": "~2.11.0", "parse-entities": "^2.0.0", "unist-util-stringify-position": "^2.0.0" + }, + "dependencies": { + "mdast-util-to-string": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/mdast-util-to-string/-/mdast-util-to-string-2.0.0.tgz", + "integrity": "sha512-AW4DRS3QbBayY/jJmD8437V1Gombjf8RSOUCMFBuo5iHi58AGEgVCKQ+ezHkZZDpAQS75hcBMpLqjpJTjtUL7w==" + } } }, "mdast-util-to-hast": { @@ -21230,12 +21367,19 @@ "parse-entities": "^2.0.0", "repeat-string": "^1.0.0", "zwitch": "^1.0.0" + }, + "dependencies": { + "mdast-util-to-string": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/mdast-util-to-string/-/mdast-util-to-string-2.0.0.tgz", + "integrity": "sha512-AW4DRS3QbBayY/jJmD8437V1Gombjf8RSOUCMFBuo5iHi58AGEgVCKQ+ezHkZZDpAQS75hcBMpLqjpJTjtUL7w==" + } } }, "mdast-util-to-string": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/mdast-util-to-string/-/mdast-util-to-string-2.0.0.tgz", - "integrity": "sha512-AW4DRS3QbBayY/jJmD8437V1Gombjf8RSOUCMFBuo5iHi58AGEgVCKQ+ezHkZZDpAQS75hcBMpLqjpJTjtUL7w==" + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/mdast-util-to-string/-/mdast-util-to-string-3.1.0.tgz", + "integrity": "sha512-n4Vypz/DZgwo0iMHLQL49dJzlp7YtAJP+N07MZHpjPf/5XJuHUWstviF4Mn2jEiR/GNmtnRRqnwsXExk3igfFA==" }, "mdurl": { "version": "1.0.1", @@ -21387,16 +21531,20 @@ "integrity": "sha512-hZXc7K2e+PgeI1eDBe/10Ard4ekbfrrqG8Ep+8Jmf4JID2bNg7NvCPOZN+kfF574pFQI7mum2AUqDidoKqcTOw==" }, "next": { - "version": "11.1.0", - "resolved": "https://registry.npmjs.org/next/-/next-11.1.0.tgz", - "integrity": "sha512-GHBk/c7Wyr6YbFRFZF37I0X7HKzkHHI8pur/loyXo5AIE8wdkbGPGO0ds3vNAO6f8AxZAKGCRYtAzoGlVLoifA==", + "version": "11.1.2", + "resolved": "https://registry.npmjs.org/next/-/next-11.1.2.tgz", + "integrity": "sha512-azEYL0L+wFjv8lstLru3bgvrzPvK0P7/bz6B/4EJ9sYkXeW8r5Bjh78D/Ol7VOg0EIPz0CXoe72hzAlSAXo9hw==", 
"requires": { - "@babel/runtime": "7.12.5", + "@babel/runtime": "7.15.3", "@hapi/accept": "5.0.2", - "@next/env": "11.1.0", - "@next/polyfill-module": "11.1.0", - "@next/react-dev-overlay": "11.1.0", - "@next/react-refresh-utils": "11.1.0", + "@next/env": "11.1.2", + "@next/polyfill-module": "11.1.2", + "@next/react-dev-overlay": "11.1.2", + "@next/react-refresh-utils": "11.1.2", + "@next/swc-darwin-arm64": "11.1.2", + "@next/swc-darwin-x64": "11.1.2", + "@next/swc-linux-x64-gnu": "11.1.2", + "@next/swc-win32-x64-msvc": "11.1.2", "@node-rs/helper": "1.2.1", "assert": "2.0.0", "ast-types": "0.13.2", @@ -21434,23 +21582,15 @@ "stream-browserify": "3.0.0", "stream-http": "3.1.1", "string_decoder": "1.3.0", - "styled-jsx": "4.0.0", + "styled-jsx": "4.0.1", "timers-browserify": "2.0.12", "tty-browserify": "0.0.1", "use-subscription": "1.5.1", - "util": "0.12.3", + "util": "0.12.4", "vm-browserify": "1.1.2", "watchpack": "2.1.1" }, "dependencies": { - "@babel/runtime": { - "version": "7.12.5", - "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.12.5.tgz", - "integrity": "sha512-plcc+hbExy3McchJCEQG3knOsuh3HH+Prx1P6cLIkET/0dLuQDEnrT+s27Axgc9bqfsmNUNHfscgMUdBpC9xfg==", - "requires": { - "regenerator-runtime": "^0.13.4" - } - }, "browserslist": { "version": "4.16.6", "resolved": "https://registry.npmjs.org/browserslist/-/browserslist-4.16.6.tgz", @@ -23082,11 +23222,6 @@ "resolved": "https://registry.npmjs.org/is-plain-obj/-/is-plain-obj-4.0.0.tgz", "integrity": "sha512-NXRbBtUdBioI73y/HmOhogw/U5msYPC9DAtGkJXeFcFWSFZw0mCUsPxk/snTuJHzNKA8kLBK4rH97RMB1BfCXw==" }, - "mdast-util-to-string": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/mdast-util-to-string/-/mdast-util-to-string-3.1.0.tgz", - "integrity": "sha512-n4Vypz/DZgwo0iMHLQL49dJzlp7YtAJP+N07MZHpjPf/5XJuHUWstviF4Mn2jEiR/GNmtnRRqnwsXExk3igfFA==" - }, "trough": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/trough/-/trough-2.0.2.tgz", @@ -23761,9 +23896,9 @@ } }, "styled-jsx": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/styled-jsx/-/styled-jsx-4.0.0.tgz", - "integrity": "sha512-2USeoWMoJ/Lx5s2y1PxuvLy/cz2Yrr8cTySV3ILHU1Vmaw1bnV7suKdblLPjnyhMD+qzN7B1SWyh4UZTARn/WA==", + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/styled-jsx/-/styled-jsx-4.0.1.tgz", + "integrity": "sha512-Gcb49/dRB1k8B4hdK8vhW27Rlb2zujCk1fISrizCcToIs+55B4vmUM0N9Gi4nnVfFZWe55jRdWpAqH1ldAKWvQ==", "requires": { "@babel/plugin-syntax-jsx": "7.14.5", "@babel/types": "7.15.0", @@ -24445,9 +24580,9 @@ } }, "util": { - "version": "0.12.3", - "resolved": "https://registry.npmjs.org/util/-/util-0.12.3.tgz", - "integrity": "sha512-I8XkoQwE+fPQEhy9v012V+TSdH2kp9ts29i20TaaDUXsg7x/onePbhFJUExBfv/2ay1ZOp/Vsm3nDlmnFGSAog==", + "version": "0.12.4", + "resolved": "https://registry.npmjs.org/util/-/util-0.12.4.tgz", + "integrity": "sha512-bxZ9qtSlGUWSOy9Qa9Xgk11kSslpuZwaxCg4sNIDj6FLucDab2JxnHwyNTCpHMtK1MjoQiWQ6DiUMZYbSrO+Sw==", "requires": { "inherits": "^2.0.3", "is-arguments": "^1.0.4", diff --git a/package.json b/package.json index 588f85b54..ca1ce2a65 100644 --- a/package.json +++ b/package.json @@ -23,15 +23,15 @@ "github-slugger": "^1.4.0", "gray-matter": "^4.0.3", "markdown-to-jsx": "^7.1.3", - "next": "^11.1.0", + "next": "^11.1.2", "next-mdx-remote": "^3.0.4", "next-remote-watch": "^1.0.0", "next-sitemap": "^1.6.164", "octokit": "^1.1.0", "prism-react-renderer": "^1.2.1", "prismjs": "^1.24.1", - "react": "17.0.2", - "react-dom": "17.0.2", + "react": "^17.0.2", + "react-dom": "^17.0.2", 
"react-google-charts": "^3.0.15", "react-icons": "^4.2.0", "remark": "^13.0.0", diff --git a/pages/overview/[...slug].tsx b/pages/overview/[...slug].tsx index 45a8217ab..52af713da 100644 --- a/pages/overview/[...slug].tsx +++ b/pages/overview/[...slug].tsx @@ -81,6 +81,7 @@ export const getStaticPaths: GetStaticPaths = async () => { export const getStaticProps: GetStaticProps = async ({ params, }: GetStaticPropsContext) => { + console.log() if (!params?.slug || !Array.isArray(params.slug)) { return { notFound: true, diff --git a/yarn.lock b/yarn.lock index b9d516fc7..3cc810fb5 100644 --- a/yarn.lock +++ b/yarn.lock @@ -337,17 +337,10 @@ "core-js-pure" "^3.16.0" "regenerator-runtime" "^0.13.4" -"@babel/runtime@^7.10.2", "@babel/runtime@^7.11.2", "@babel/runtime@^7.3.1", "@babel/runtime@^7.4.4", "@babel/runtime@^7.5.5", "@babel/runtime@^7.8.3", "@babel/runtime@^7.8.7": - "integrity" "sha512-twj3L8Og5SaCRCErB4x4ajbvBIVV77CGeFglHpeg5WC5FF8TZzBWXtTJ4MqaD9QszLYTtr+IsaAL2rEUevb+eg==" - "resolved" "https://registry.npmjs.org/@babel/runtime/-/runtime-7.14.8.tgz" - "version" "7.14.8" - dependencies: - "regenerator-runtime" "^0.13.4" - -"@babel/runtime@7.12.5": - "integrity" "sha512-plcc+hbExy3McchJCEQG3knOsuh3HH+Prx1P6cLIkET/0dLuQDEnrT+s27Axgc9bqfsmNUNHfscgMUdBpC9xfg==" - "resolved" "https://registry.npmjs.org/@babel/runtime/-/runtime-7.12.5.tgz" - "version" "7.12.5" +"@babel/runtime@^7.10.2", "@babel/runtime@^7.11.2", "@babel/runtime@^7.3.1", "@babel/runtime@^7.4.4", "@babel/runtime@^7.5.5", "@babel/runtime@^7.8.3", "@babel/runtime@^7.8.7", "@babel/runtime@7.15.3": + "integrity" "sha512-OvwMLqNXkCXSz1kSm58sEsNuhqOx/fKpnUnKnFB5v8uDda5bLNEHNgKPvhDN6IU0LDcnHQ90LlJ0Q6jnyBSIBA==" + "resolved" "https://registry.npmjs.org/@babel/runtime/-/runtime-7.15.3.tgz" + "version" "7.15.3" dependencies: "regenerator-runtime" "^0.13.4" @@ -768,25 +761,25 @@ "resolved" "https://registry.npmjs.org/@napi-rs/triples/-/triples-1.0.3.tgz" "version" "1.0.3" -"@next/env@11.1.0": - "integrity" "sha512-zPJkMFRenSf7BLlVee8987G0qQXAhxy7k+Lb/5hLAGkPVHAHm+oFFeL+2ipbI2KTEFlazdmGY0M+AlLQn7pWaw==" - "resolved" "https://registry.npmjs.org/@next/env/-/env-11.1.0.tgz" - "version" "11.1.0" +"@next/env@11.1.2": + "integrity" "sha512-+fteyVdQ7C/OoulfcF6vd1Yk0FEli4453gr8kSFbU8sKseNSizYq6df5MKz/AjwLptsxrUeIkgBdAzbziyJ3mA==" + "resolved" "https://registry.npmjs.org/@next/env/-/env-11.1.2.tgz" + "version" "11.1.2" "@next/eslint-plugin-next@11.0.1": "integrity" "sha512-UzdX3y6XSrj9YuASUb/p4sRvfjP2klj2YgIOfMwrWoLTTPJQMh00hREB9Ftr7m7RIxjVSAaaLXIRLdxvq948GA==" "resolved" "https://registry.npmjs.org/@next/eslint-plugin-next/-/eslint-plugin-next-11.0.1.tgz" "version" "11.0.1" -"@next/polyfill-module@11.1.0": - "integrity" "sha512-64EgW8SzJRQls2yJ5DkuljRxgE24o2kYtX/ghTkPUJYsfidHMWzQGwg26IgRbb/uHqTd1G0W5UkKag+Nt8TWaQ==" - "resolved" "https://registry.npmjs.org/@next/polyfill-module/-/polyfill-module-11.1.0.tgz" - "version" "11.1.0" +"@next/polyfill-module@11.1.2": + "integrity" "sha512-xZmixqADM3xxtqBV0TpAwSFzWJP0MOQzRfzItHXf1LdQHWb0yofHHC+7eOrPFic8+ZGz5y7BdPkkgR1S25OymA==" + "resolved" "https://registry.npmjs.org/@next/polyfill-module/-/polyfill-module-11.1.2.tgz" + "version" "11.1.2" -"@next/react-dev-overlay@11.1.0": - "integrity" "sha512-h+ry0sTk1W3mJw+TwEf91aqLbBJ5oqAsxfx+QryqEItNtfW6zLSSjxkyTYTqX8DkgSssQQutQfATkzBVgOR+qQ==" - "resolved" "https://registry.npmjs.org/@next/react-dev-overlay/-/react-dev-overlay-11.1.0.tgz" - "version" "11.1.0" +"@next/react-dev-overlay@11.1.2": + "integrity" 
"sha512-rDF/mGY2NC69mMg2vDqzVpCOlWqnwPUXB2zkARhvknUHyS6QJphPYv9ozoPJuoT/QBs49JJd9KWaAzVBvq920A==" + "resolved" "https://registry.npmjs.org/@next/react-dev-overlay/-/react-dev-overlay-11.1.2.tgz" + "version" "11.1.2" dependencies: "@babel/code-frame" "7.12.11" "anser" "1.4.9" @@ -800,10 +793,15 @@ "stacktrace-parser" "0.1.10" "strip-ansi" "6.0.0" -"@next/react-refresh-utils@11.1.0": - "integrity" "sha512-g5DtFTpLTGa36iy9DuZawtJeitI11gysFGKPQQqy+mNbSFazguArcJ10gAYFlbqpIi4boUamWNI5mAoSPx3kog==" - "resolved" "https://registry.npmjs.org/@next/react-refresh-utils/-/react-refresh-utils-11.1.0.tgz" - "version" "11.1.0" +"@next/react-refresh-utils@11.1.2": + "integrity" "sha512-hsoJmPfhVqjZ8w4IFzoo8SyECVnN+8WMnImTbTKrRUHOVJcYMmKLL7xf7T0ft00tWwAl/3f3Q3poWIN2Ueql/Q==" + "resolved" "https://registry.npmjs.org/@next/react-refresh-utils/-/react-refresh-utils-11.1.2.tgz" + "version" "11.1.2" + +"@next/swc-linux-x64-gnu@11.1.2": + "integrity" "sha512-YcDHTJjn/8RqvyJVB6pvEKXihDcdrOwga3GfMv/QtVeLphTouY4BIcEUfrG5+26Nf37MP1ywN3RRl1TxpurAsQ==" + "resolved" "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-11.1.2.tgz" + "version" "11.1.2" "@node-rs/helper@1.2.1": "integrity" "sha512-R5wEmm8nbuQU0YGGmYVjEc0OHtYsuXdpRG+Ut/3wZ9XAvQWyThN08bTh2cBJgoZxHQUPtvRfeQuxcAgLuiBISg==" @@ -5119,17 +5117,17 @@ "matcher" "^4.0.0" "minimist" "^1.2.5" -"next@*", "next@^11.1.0", "next@>=10.2.0": - "integrity" "sha512-GHBk/c7Wyr6YbFRFZF37I0X7HKzkHHI8pur/loyXo5AIE8wdkbGPGO0ds3vNAO6f8AxZAKGCRYtAzoGlVLoifA==" - "resolved" "https://registry.npmjs.org/next/-/next-11.1.0.tgz" - "version" "11.1.0" +"next@*", "next@^11.1.2", "next@>=10.2.0": + "integrity" "sha512-azEYL0L+wFjv8lstLru3bgvrzPvK0P7/bz6B/4EJ9sYkXeW8r5Bjh78D/Ol7VOg0EIPz0CXoe72hzAlSAXo9hw==" + "resolved" "https://registry.npmjs.org/next/-/next-11.1.2.tgz" + "version" "11.1.2" dependencies: - "@babel/runtime" "7.12.5" + "@babel/runtime" "7.15.3" "@hapi/accept" "5.0.2" - "@next/env" "11.1.0" - "@next/polyfill-module" "11.1.0" - "@next/react-dev-overlay" "11.1.0" - "@next/react-refresh-utils" "11.1.0" + "@next/env" "11.1.2" + "@next/polyfill-module" "11.1.2" + "@next/react-dev-overlay" "11.1.2" + "@next/react-refresh-utils" "11.1.2" "@node-rs/helper" "1.2.1" "assert" "2.0.0" "ast-types" "0.13.2" @@ -5167,13 +5165,18 @@ "stream-browserify" "3.0.0" "stream-http" "3.1.1" "string_decoder" "1.3.0" - "styled-jsx" "4.0.0" + "styled-jsx" "4.0.1" "timers-browserify" "2.0.12" "tty-browserify" "0.0.1" "use-subscription" "1.5.1" - "util" "0.12.3" + "util" "0.12.4" "vm-browserify" "1.1.2" "watchpack" "2.1.1" + optionalDependencies: + "@next/swc-darwin-arm64" "11.1.2" + "@next/swc-darwin-x64" "11.1.2" + "@next/swc-linux-x64-gnu" "11.1.2" + "@next/swc-win32-x64-msvc" "11.1.2" "node-emoji@^1.8.1": "integrity" "sha512-Yt3384If5H6BYGVHiHwTL+99OzJKHhgp82S8/dktEK73T26BazdgZ4JZh92xSVtGNJvz9UbXdNAc5hcrXV42vw==" @@ -5940,7 +5943,7 @@ "iconv-lite" "0.4.24" "unpipe" "1.0.0" -"react-dom@^16 || ^17 || ^18", "react-dom@^16.8.0 || ^17.0.0", "react-dom@^17.0.2", "react-dom@>=16.3.0", "react-dom@>=16.6.0", "react-dom@>=16.x <=17.x", "react-dom@17.0.2": +"react-dom@^16 || ^17 || ^18", "react-dom@^16.8.0 || ^17.0.0", "react-dom@^17.0.2", "react-dom@>=16.3.0", "react-dom@>=16.6.0", "react-dom@>=16.x <=17.x": "integrity" "sha512-s4h96KtLDUQlsENhMn1ar8t2bEa+q/YAtj8pPPdIjPDGBDIVNsrD9aXNWqspUe6AzKCIG0C1HZZLqLV7qpOBGA==" "resolved" "https://registry.npmjs.org/react-dom/-/react-dom-17.0.2.tgz" "version" "17.0.2" @@ -6728,10 +6731,10 @@ dependencies: "inline-style-parser" "0.1.1" -"styled-jsx@4.0.0": 
- "integrity" "sha512-2USeoWMoJ/Lx5s2y1PxuvLy/cz2Yrr8cTySV3ILHU1Vmaw1bnV7suKdblLPjnyhMD+qzN7B1SWyh4UZTARn/WA==" - "resolved" "https://registry.npmjs.org/styled-jsx/-/styled-jsx-4.0.0.tgz" - "version" "4.0.0" +"styled-jsx@4.0.1": + "integrity" "sha512-Gcb49/dRB1k8B4hdK8vhW27Rlb2zujCk1fISrizCcToIs+55B4vmUM0N9Gi4nnVfFZWe55jRdWpAqH1ldAKWvQ==" + "resolved" "https://registry.npmjs.org/styled-jsx/-/styled-jsx-4.0.1.tgz" + "version" "4.0.1" dependencies: "@babel/plugin-syntax-jsx" "7.14.5" "@babel/types" "7.15.0" @@ -7276,10 +7279,10 @@ dependencies: "inherits" "2.0.3" -"util@^0.12.0", "util@0.12.3": - "integrity" "sha512-I8XkoQwE+fPQEhy9v012V+TSdH2kp9ts29i20TaaDUXsg7x/onePbhFJUExBfv/2ay1ZOp/Vsm3nDlmnFGSAog==" - "resolved" "https://registry.npmjs.org/util/-/util-0.12.3.tgz" - "version" "0.12.3" +"util@^0.12.0", "util@0.12.4": + "integrity" "sha512-bxZ9qtSlGUWSOy9Qa9Xgk11kSslpuZwaxCg4sNIDj6FLucDab2JxnHwyNTCpHMtK1MjoQiWQ6DiUMZYbSrO+Sw==" + "resolved" "https://registry.npmjs.org/util/-/util-0.12.4.tgz" + "version" "0.12.4" dependencies: "inherits" "^2.0.3" "is-arguments" "^1.0.4"