make tutorial 11 testable (deepset-ai#17)
* make tutorial 11 testable

* add 30s sleep

* install graph deps

* revert and comment draw calls

* update md
masci committed Sep 16, 2022
1 parent c5b0e34 commit f6c123f
Showing 4 changed files with 179 additions and 108 deletions.
1 change: 1 addition & 0 deletions .github/workflows/nightly.yml
@@ -29,6 +29,7 @@ jobs:
- 06_Better_Retrieval_via_Embedding_Retrieval
- 07_RAG_Generator
- 10_Knowledge_Graph
- 11_Pipelines
- 12_LFQA

env:
2 changes: 1 addition & 1 deletion .github/workflows/run_tutorials.yml
@@ -26,7 +26,7 @@ jobs:
- name: Checkout
uses: actions/checkout@v3

- name: Install jupyter
- name: Install dependencies
# remove pip install pyzmq when this is resolved https://github.com/zeromq/pyzmq/issues/1764
run: |
pip install pyzmq==23.2.1
110 changes: 67 additions & 43 deletions markdowns/11.md
@@ -25,26 +25,26 @@ Let's start by ensuring we have a GPU running to ensure decent speed in this tutorial.
In Google Colab, you can change to a GPU runtime in the menu:
- **Runtime -> Change Runtime type -> Hardware accelerator -> GPU**

You can double check whether the GPU runtime is enabled with the following command:

```python
# Make sure you have a GPU running
!nvidia-smi
```

```bash
%%bash

nvidia-smi
```
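If you prefer to check from Python rather than the shell, here is a quick sketch using PyTorch (already installed as a Haystack dependency):

```python
import torch

# True if a CUDA-capable GPU is visible to PyTorch
print(torch.cuda.is_available())
```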

These lines install Haystack through pip
To start, install the latest release of Haystack with `pip` along with `pygraphviz`:


```python
# Install the latest release of Haystack in your own environment
#! pip install farm-haystack

!pip install --upgrade pip
!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab]

!apt install libgraphviz-dev
!pip install pygraphviz
```

```bash
%%bash

# Install the latest main of Haystack
pip install --upgrade pip
pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab]

# Install pygraphviz
apt install libgraphviz-dev
pip install pygraphviz
```

## Logging
@@ -62,23 +62,36 @@

```python
logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)
```

If running from Colab or an environment without Docker, you will want to start Elasticsearch from source
### Start an Elasticsearch server
You can start Elasticsearch on your local machine using Docker. If Docker is not readily available in your environment (e.g., in Colab notebooks), then you can manually download and execute Elasticsearch from source.


```python
# Recommended: Start Elasticsearch using Docker via the Haystack utility function
from haystack.utils import launch_es

launch_es()
```

```python
# In Colab / No Docker environments: Start Elasticsearch from source
! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
! chown -R daemon:daemon elasticsearch-7.9.2

import os
from subprocess import Popen, PIPE, STDOUT

es_server = Popen(
    ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1)  # as daemon
)
# wait until ES has started
! sleep 30
```
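Rather than sleeping for a fixed 30 seconds, you can poll the Elasticsearch HTTP endpoint until it answers. A minimal sketch, assuming Elasticsearch is listening on `localhost:9200`:

```python
import time

import requests

# Poll Elasticsearch for up to ~60 seconds before giving up
for _ in range(30):
    try:
        if requests.get("http://localhost:9200").status_code == 200:
            print("Elasticsearch is up")
            break
    except requests.exceptions.ConnectionError:
        pass
    time.sleep(2)
else:
    raise RuntimeError("Elasticsearch did not come up in time")
```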
### Start an Elasticsearch server in Colab

If Docker is not readily available in your environment (e.g. in Colab notebooks), then you can manually download and execute Elasticsearch from source.


```bash
%%bash

wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
chown -R daemon:daemon elasticsearch-7.9.2
sudo -u daemon -- elasticsearch-7.9.2/bin/elasticsearch -d
```


```bash
%%bash --bg

sudo -u daemon -- elasticsearch-7.9.2/bin/elasticsearch
```

## Initialization
@@ -88,13 +101,7 @@ be used indexed into our `DocumentStore`


```python
from haystack.utils import (
    print_answers,
    print_documents,
    fetch_archive_from_http,
    convert_files_to_docs,
    clean_wiki_text,
)
from haystack.utils import fetch_archive_from_http, convert_files_to_docs, clean_wiki_text

# Download and prepare data - 517 Wikipedia articles for Game of Thrones
doc_dir = "data/tutorial11"
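# The diff hunk is cut off here. In the full tutorial, the archive is fetched and
# converted into Haystack Document objects roughly as follows (the dataset URL is
# elided and shown only as a placeholder):
#
# s3_url = "<game-of-thrones-archive-url>.zip"
# fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
# got_docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)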
```

@@ -112,15 +119,20 @@ to perform Open Domain Question Answering.


```python
from haystack import Pipeline
from haystack.utils import launch_es
import os
import time

from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import BM25Retriever, EmbeddingRetriever, FARMReader

# Wait 30 seconds only to be sure Elasticsearch is ready before continuing
time.sleep(30)

# Get the host where Elasticsearch is running, default to localhost
host = os.environ.get("ELASTICSEARCH_HOST", "localhost")

# Initialize DocumentStore and index documents
launch_es()
document_store = ElasticsearchDocumentStore()
document_store = ElasticsearchDocumentStore(host=host)
document_store.delete_documents()
document_store.write_documents(got_docs)
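# The hunk ends here. Below this point, the tutorial initializes the retrievers and the
# reader that the rest of the notebook relies on, roughly along these lines (model names
# are illustrative, not taken from the diff):
#
# bm25_retriever = BM25Retriever(document_store=document_store)
# embedding_retriever = EmbeddingRetriever(
#     document_store=document_store, embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1"
# )
# reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)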

```

@@ -145,6 +157,7 @@ Here we have an `ExtractiveQAPipeline` (the successor to the now deprecated `Finder` class).

```python
from haystack.pipelines import ExtractiveQAPipeline
from haystack.utils import print_answers

# Prebuilt pipeline
p_extractive_premade = ExtractiveQAPipeline(reader=reader, retriever=bm25_retriever)
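# The hunk is truncated here; running the prebuilt pipeline typically looks like this
# (the query and parameters mirror the custom pipeline shown further below):
res = p_extractive_premade.run(
    query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
)
print_answers(res, details="minimum")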
```

@@ -159,6 +172,7 @@ If you want to just do the retrieval step, you can use a `DocumentSearchPipeline`:

```python
from haystack.pipelines import DocumentSearchPipeline
from haystack.utils import print_documents

p_retrieval = DocumentSearchPipeline(bm25_retriever)
res = p_retrieval.run(query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}})
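# The hunk is truncated here; the retrieved documents are typically printed with
# something like the following:
# print_documents(res, max_text_len=200)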
```

@@ -170,7 +184,7 @@ you can initialize a `GenerativeQAPipeline` like this:


```python
from haystack.pipelines import GenerativeQAPipeline, FAQPipeline
from haystack.pipelines import GenerativeQAPipeline
from haystack.nodes import RAGenerator

# We set this to True so that the document store returns document embeddings with each document
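# The hunk is truncated here. In the full tutorial, this cell goes on to configure the
# document store to return embeddings, set up an EmbeddingRetriever and an RAGenerator,
# and combine them, roughly:
#
# generator = RAGenerator()
# p_generator = GenerativeQAPipeline(generator=generator, retriever=embedding_retriever)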
```

@@ -206,9 +220,11 @@ you can save a diagram showing how all the components are connected.


```python
p_extractive_premade.draw("pipeline_extractive_premade.png")
p_retrieval.draw("pipeline_retrieval.png")
p_generator.draw("pipeline_generator.png")
# Uncomment the following to generate the images

# p_extractive_premade.draw("pipeline_extractive_premade.png")
# p_retrieval.draw("pipeline_retrieval.png")
# p_generator.draw("pipeline_generator.png")
```

## Custom Pipelines
@@ -218,6 +234,9 @@ We do this by adding the building blocks that we initialized as nodes in the graph.


```python
from haystack.pipelines import Pipeline


# Custom built extractive QA pipeline
p_extractive = Pipeline()
p_extractive.add_node(component=bm25_retriever, name="Retriever", inputs=["Query"])
```

@@ -228,7 +247,9 @@

```python
res = p_extractive.run(
query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
)
print_answers(res, details="minimum")
p_extractive.draw("pipeline_extractive.png")

# Uncomment the following to generate the pipeline image
# p_extractive.draw("pipeline_extractive.png")
```

Pipelines offer a very simple way to ensemble different components.
Expand All @@ -253,7 +274,9 @@ p_ensemble.add_node(
component=JoinDocuments(join_mode="concatenate"), name="JoinResults", inputs=["ESRetriever", "EmbeddingRetriever"]
)
p_ensemble.add_node(component=reader, name="Reader", inputs=["JoinResults"])
p_ensemble.draw("pipeline_ensemble.png")

# Uncomment the following to generate the pipeline image
# p_ensemble.draw("pipeline_ensemble.png")

# Run pipeline
res = p_ensemble.run(
```

@@ -339,7 +362,8 @@

```python
p_classifier.add_node(component=CustomQueryClassifier(), name="QueryClassifier", inputs=["Query"])
p_classifier.add_node(component=bm25_retriever, name="ESRetriever", inputs=["QueryClassifier.output_1"])
p_classifier.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["QueryClassifier.output_2"])
p_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "EmbeddingRetriever"])
p_classifier.draw("pipeline_classifier.png")
# Uncomment the following to generate the pipeline image
# p_classifier.draw("pipeline_classifier.png")

# Run only the dense retriever on the full sentence query
res_1 = p_classifier.run(query="Who is the father of Arya Stark?")
```
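The hunk above routes queries through a `CustomQueryClassifier` node whose definition falls outside the visible diff. Below is a minimal sketch of such a node, assuming the Haystack v1 `BaseComponent` interface (a fixed number of outgoing edges, with `run` returning an output dict plus the name of the edge to route to):

```python
from typing import List

from haystack import BaseComponent


class CustomQueryClassifier(BaseComponent):
    # Route keyword-style queries to output_1 and natural-language questions to output_2
    outgoing_edges = 2

    def run(self, query: str):
        if "?" in query:
            return {}, "output_2"
        return {}, "output_1"

    def run_batch(self, queries: List[str]):
        # Split a batch of queries across the two outgoing edges
        split = {"output_1": {"queries": []}, "output_2": {"queries": []}}
        for query in queries:
            edge = "output_2" if "?" in query else "output_1"
            split[edge]["queries"].append(query)
        return split, "split"
```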
