
added query classifier page (deepset-ai#124)
* added query classifier page

* updated tutorials
PiffPaffM committed Aug 24, 2021
1 parent fd7e9f7 commit 978a55d
Showing 27 changed files with 603 additions and 40 deletions.
@@ -178,6 +178,14 @@
"label3": "",
"order": 13
},
{
"id": "query_classifiermd",
"title": "Query Classifier",
"label1": "usage_haystack",
"label2": "",
"label3": "",
"order": 14
},
{
"id": "tutorials_haystack",
"title": "Tutorials",
@@ -133,7 +133,7 @@ dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, spl
```python
# 'meta': {'name': "<DOCUMENT_NAME_HERE>", ...}
#}
# (Optionally: you can also add more key-value-pairs here, that will be indexed as fields in Elasticsearch and
-# can be accessed later for filtering or shown in the responses of the Finder)
+# can be accessed later for filtering or shown in the responses of the Pipeline)

# Let's have a look at the first 3 entries:
print(dicts[:3])
```
@@ -142,7 +142,7 @@
```python
document_store.write_documents(dicts)
```
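
For illustration, a minimal sketch of such dictionaries (the texts and meta fields below are made-up examples):

```python
# Hypothetical example documents; extra "meta" keys are indexed in
# Elasticsearch and can later be used for filtering the search results.
dicts = [
    {
        "text": "Arya Stark is the youngest daughter of Eddard and Catelyn Stark.",
        "meta": {"name": "43_Arya_Stark.txt", "category": "characters"},
    },
    {
        "text": "Winterfell is the seat of House Stark in the North.",
        "meta": {"name": "2_Winterfell.txt", "category": "places"},
    },
]
document_store.write_documents(dicts)
```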

-## Initalize Retriever, Reader, & Finder
+## Initialize Retriever, Reader, & Pipeline
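
As an overview, here is a minimal sketch of how these pieces fit together (assuming the `document_store` from the previous step; the model name is only an example):

```python
from haystack.retriever.sparse import ElasticsearchRetriever
from haystack.reader.farm import FARMReader
from haystack.pipeline import ExtractiveQAPipeline

# The retriever narrows down candidate documents, the reader extracts answers from them
retriever = ElasticsearchRetriever(document_store=document_store)
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)
pipe = ExtractiveQAPipeline(reader=reader, retriever=retriever)

prediction = pipe.run(query="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5)
```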

### Retriever

@@ -34,7 +34,7 @@ These lines install Haystack through pip

```python
# Install the latest release of Haystack in your own environment
-#! pip install farm-haystack
+!pip install farm-haystack

# Install the latest master of Haystack
!pip install grpcio-tools==1.34.1
```
@@ -72,9 +72,9 @@ Then change the `use_gpu` arguments below to `True`

```python
reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad", use_gpu=True)
-train_data = "data/squad20"
-# train_data = "PATH/TO_YOUR/TRAIN_DATA"
-reader.train(data_dir=train_data, train_filename="dev-v2.0.json", use_gpu=True, n_epochs=1, save_dir="my_model")
+data_dir = "data/squad20"
+# data_dir = "PATH/TO_YOUR/TRAIN_DATA"
+reader.train(data_dir=data_dir, train_filename="dev-v2.0.json", use_gpu=True, n_epochs=1, save_dir="my_model")
```
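
After training completes, you can load the fine-tuned model back from `save_dir` (a sketch; `my_model` matches the `save_dir` above):

```python
# Load the fine-tuned reader from the directory it was saved to
new_reader = FARMReader(model_name_or_path="my_model", use_gpu=True)
```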


@@ -44,7 +44,6 @@ Make sure you enable the GPU runtime to experience decent speed in this tutorial


```python
-from haystack import Finder
from haystack.preprocessor.cleaning import clean_wiki_text
from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
@@ -102,7 +101,7 @@ print(dicts[:3])
document_store.write_documents(dicts)
```

-## Initalize Retriever, Reader, & Finder
+## Initialize Retriever, Reader & Pipeline

### Retriever

@@ -52,7 +52,6 @@ Make sure you enable the GPU runtime to experience decent speed in this tutorial


```python
-from haystack import Finder
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore

from haystack.retriever.dense import EmbeddingRetriever
```
@@ -148,7 +148,7 @@ retriever = ElasticsearchRetriever(document_store=document_store)
```python
# Initialize Reader
from haystack.reader.farm import FARMReader

-reader = FARMReader("deepset/roberta-base-squad2", top_k_per_candidate=4, return_no_answer=True)
+reader = FARMReader("deepset/roberta-base-squad2", top_k=4, return_no_answer=True)

```
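
If the reader predicts "no answer" too often or too rarely, the `no_ans_boost` argument can shift that balance (a sketch; the value shown is illustrative):

```python
# Positive values make "no answer" more likely, negative values make it less likely
reader = FARMReader(
    "deepset/roberta-base-squad2",
    top_k=4,
    return_no_answer=True,
    no_ans_boost=0.5,
)
```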

@@ -85,7 +85,6 @@ Make sure you enable the GPU runtime to experience decent speed in this tutorial


```python
-from haystack import Finder
from haystack.preprocessor.cleaning import clean_wiki_text
from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
@@ -146,7 +145,7 @@ dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, spl
document_store.write_documents(dicts)
```

-### Initalize Retriever, Reader, & Finder
+### Initialize Retriever, Reader & Pipeline

#### Retriever

@@ -190,8 +190,7 @@
```python
preprocessor = PreProcessor(
    split_length=100,
    split_respect_sentence_boundary=True
)
-nested_docs = [preprocessor.process(d) for d in all_docs]
-docs = [d for x in nested_docs for d in x]
+docs = preprocessor.process(all_docs)

print(f"n_files_input: {len(all_docs)}\nn_docs_output: {len(docs)}")
```
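
For reference, the full `PreProcessor` setup truncated above looks roughly like this (a sketch; the cleaning flags are illustrative, not necessarily the tutorial's exact values):

```python
from haystack.preprocessor import PreProcessor

preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,
)
# process() takes the list of document dicts and returns the flattened list of split documents
docs = preprocessor.process(all_docs)
```
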
@@ -45,3 +45,34 @@ The credentials in the database image and the application configuration should match.

# Usage
The manual (of a slightly earlier version) can be found [here](https://drive.google.com/file/d/1Wv3OIC0Z7ibHIzOm9Xw_r0gjTFmpl-33/view). While it doesn't include all latest features, the basic workflow and tips for label quality are still the same.

# Annotation FAQ

1. What is a good question?
- A good question is a fact-seeking question that can be answered with an entity (person, organisation, location, etc.) or an explanation. A bad question is ambiguous, incomprehensible, dependent on clearly false presuppositions, opinion-seeking, or not clearly a request for factual information.
- The question should ask about information present in the given text passage. It should not be answerable only with additional knowledge or your own interpretation.
- Do not copy-paste answer text into the question. Good questions do not contain exactly the same words as the answer or the context around the answer. The question should be a reformulation that uses synonyms and a different word order than the context of the answer.
- Questions should be precise, natural questions you would ask when you want information from another person.
2. How many questions should you ask per text passage?
- Ask at most 20 questions per passage.
- Some text passages are not suited to 20 questions. Do not make up contrived, complicated questions just to reach 20 - move on to the next text instead.
- Try to ask questions that cover the whole passage, focusing on the important information. Do not ask only about a single sentence in the passage.
3. What is a good answer span?
- Always mark whole words. Do not start or end the answer within a word.
- For short answers: the answer should be as short and as close to a spoken human answer as possible. Do not include punctuation.
- For long answers: please mark whole sentences with punctuation. The sentences may also pick up parts of the question, and you can even mark whole text passages. Mark whole passages only if they are not too long (e.g. no more than 8-10 sentences).
4. How do I differentiate long vs short answers?
- If a short answer is possible, always prefer it over a long answer.
- Short, precise answers like numbers or a few words are short answers.
- Long answers are lists of possibilities, or answers where multiple sentences are needed to answer the question correctly.
5. How do I handle multiple possible answers to a single question?
- As of now, there is no functionality to mark multiple answers per question.
- Workaround: you can add a question with the same text but a different answer selection by using the button below the question list (the button reads “custom question”).
6. What to do with grammatically wrong or incorrectly spelled questions?
- Include them. When users ask questions through the tool, those questions will likely contain grammar and spelling errors, too.
- Exception: the question needs to be understandable without reading and interpreting the corresponding text passage. If you do not understand the question, please mark it as “I don’t understand the question”.
7. What to do with text passages that are not properly converted or contain (in part) information that cannot be labelled (e.g. just lists or garbage text)?
- Please do not annotate such text.
- You can note what is missing, or why you cannot label the text, along with the text number and title.
8. Which browser to use?
- Please use the Chrome browser. The tool is not tested on other browsers.
@@ -47,9 +47,9 @@ from haystack.document_store import ElasticsearchDocumentStore
document_store = ElasticsearchDocumentStore()
```

-Note that we also support [Open Distro for Elasticsearch](https://opendistro.github.io/for-elasticsearch-docs/).
-Follow [their documentation](https://opendistro.github.io/for-elasticsearch-docs/docs/install/)
-to run it and connect to it using Haystack's `OpenDistroElasticsearchDocumentStore` class.
+Note that we also support [OpenSearch](https://opensearch.org/).
+Follow [their documentation](https://opensearch.org/docs/)
+to run it and connect to it using Haystack's `OpenSearchDocumentStore` class.

We further support [AWS Elasticsearch Service](https://aws.amazon.com/elasticsearch-service/) with [signed requests](https://docs.aws.amazon.com/general/latest/gr/signature-version-4.html):
Use e.g. [aws-requests-auth](https://github.com/davidmuller/aws-requests-auth) to create an auth object and pass it as `aws4auth` to the `ElasticsearchDocumentStore` constructor.
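
A sketch of what that can look like (the endpoint and credentials are placeholders):

```python
from aws_requests_auth.aws_auth import AWSRequestsAuth
from haystack.document_store import ElasticsearchDocumentStore

# Placeholder endpoint and credentials - substitute your own
auth = AWSRequestsAuth(
    aws_access_key="ACCESS_KEY",
    aws_secret_access_key="SECRET_KEY",
    aws_host="search-my-domain.us-east-1.es.amazonaws.com",
    aws_region="us-east-1",
    aws_service="es",
)
document_store = ElasticsearchDocumentStore(
    host="search-my-domain.us-east-1.es.amazonaws.com",
    port=443,
    scheme="https",
    aws4auth=auth,
)
```
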
@@ -130,7 +130,7 @@ document_store = SQLDocumentStore()
The `WeaviateDocumentStore` requires a running Weaviate Server.
You can start a basic instance like this (see the [Weaviate docs](https://www.semi.technology/developers/weaviate/current/) for details):
```
-docker run -d -p 8080:8080 --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' semitechnologies/weaviate:1.4.0
+docker run -d -p 8080:8080 --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' semitechnologies/weaviate:1.6.0
```

Afterwards, you can use it in Haystack:
@@ -143,6 +143,32 @@ document_store = WeaviateDocumentStore()
</div>
</div>

<div class="tab">
<input type="radio" id="tab-1-1" name="tab-group-1" checked>
<label class="labelouter" for="tab-1-1">OpenSearch</label>
<div class="tabcontent">

See the official [OpenSearch documentation](https://opensearch.org/docs/opensearch/install/docker/) on how to install and start an instance.

If you have Docker set up, we recommend pulling the Docker image and running it.
```bash
docker pull opensearchproject/opensearch:1.0.0
docker run -p 9200:9200 -p 9600:9600 -e "discovery.type=single-node" opensearchproject/opensearch:1.0.0
```

Note that we also have a utility function `haystack.utils.launch_opensearch` that can start up an OpenSearch instance.
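
That is, roughly (a sketch; check your Haystack version for the supported arguments):

```python
from haystack.utils import launch_opensearch

# Convenience helper that starts a local OpenSearch instance for you
launch_opensearch()
```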

Next, you can initialize the Haystack object that will connect to this instance.

```python
from haystack.document_store import OpenSearchDocumentStore

document_store = OpenSearchDocumentStore()
```

</div>
</div>

</div>

Each DocumentStore constructor allows for arguments specifying how to connect to existing databases and the names of indexes.
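
For example, a hypothetical connection to an existing Elasticsearch index might look like this (all values are illustrative):

```python
from haystack.document_store import ElasticsearchDocumentStore

# Example settings - adjust host, credentials, and index name to your deployment
document_store = ElasticsearchDocumentStore(
    host="localhost",
    port=9200,
    username="elastic",
    password="changeme",
    index="document",
)
```
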
@@ -303,6 +329,21 @@ The Document Stores have different characteristics. You should choose one depend
- Less options for ANN algorithms than FAISS or Milvus
- No BM25 / Tf-idf retrieval

</div>
</div>

<div class="tab">
<input type="radio" id="tab-2-6" name="tab-group-2">
<label class="labelouter" for="tab-2-6">OpenSearch</label>
<div class="tabcontent">

**Pros:**
- Fully open source fork of Elasticsearch
- Has support for Approximate Nearest Neighbours vector search

**Cons:**
- Its ANN algorithms seem a little less performant than FAISS or Milvus in our benchmarks

</div>
</div>

@@ -90,7 +90,7 @@ To load, simply call:
```python
pipeline.load_from_yaml(Path("sample.yaml"))
```

-For another example YAML config, check out [this file](https://github.com/deepset-ai/haystack/blob/master/rest_api/pipelines.yaml).
+For another example YAML config, check out [this file](https://github.com/deepset-ai/haystack/blob/master/rest_api/pipeline/pipelines.yaml).

### Multiple retrievers
You can now also use multiple Retrievers and join their results:
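
A minimal sketch of such a pipeline (the two retriever variables are assumed to exist already; node names are illustrative):

```python
from haystack.pipeline import Pipeline, JoinDocuments

# Route the query to both retrievers, then merge their document lists
p = Pipeline()
p.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"])
p.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["Query"])
p.add_node(
    component=JoinDocuments(join_mode="concatenate"),
    name="JoinResults",
    inputs=["ESRetriever", "DPRRetriever"],
)
res = p.run(query="Who is the father of Arya Stark?")
```
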
@@ -145,7 +145,7 @@ To get hands-on with this kind of node, have a look at the [evaluation tutorial]

### Default Pipelines (replacing the "Finder")
Last but not least, we added some "Default Pipelines" that allow you to run standard patterns with very few lines of code.
-This is replacing the `Finder` class which is now deprecated.
+This replaces the `Finder` class, which was deprecated in Haystack 0.6.0.

```python
from haystack.pipeline import DocumentSearchPipeline, ExtractiveQAPipeline, Pipeline, JoinDocuments
@@ -167,6 +167,20 @@ doc_pipe = FAQPipeline(retriever=retriever)
res = doc_pipe.run(query="How can I change my address?", top_k_retriever=3)
```
So, to migrate your QA system from the deprecated `Finder` to `ExtractiveQAPipeline`, you need to:
```python
# 1. Change import
from haystack.pipeline import ExtractiveQAPipeline
# 2. Replace the Finder
qa_pipe = ExtractiveQAPipeline(reader=reader, retriever=retriever)
# 3. Replace get_answers() with run()
res = qa_pipe.run(query="When was Kant born?", top_k_retriever=3, top_k_reader=5)
# 4. Access your results from ["documents"] rather than ["answers"]
print(res["documents"])
```
See also the [Pipelines API documentation](/docs/latest/apipipelinesmd) for more details.

We plan many more features around the new pipelines, including parallelized execution, distributed execution, and dry runs - so stay tuned...
