Make tutorial 16 testable (deepset-ai#23)
* make 16 testable

* fix
masci committed Sep 16, 2022
1 parent 9d46ceb commit 529e86a
Showing 3 changed files with 192 additions and 220 deletions.
1 change: 1 addition & 0 deletions .github/workflows/nightly.yml
@@ -32,6 +32,7 @@ jobs:
- 10_Knowledge_Graph
- 11_Pipelines
- 12_LFQA
- 16_Document_Classifier_at_Index_Time

env:
HAYSTACK_TELEMETRY_ENABLED: "False"
Expand Down
96 changes: 61 additions & 35 deletions markdowns/16.md
@@ -20,20 +20,18 @@ The result can be accessed at query time: for example by applying a filter for "classification.label".
This tutorial will show you how to integrate a classification model into your preprocessing steps and how you can filter for this additional metadata at query time. In the last section we show how to put it all together and create an indexing pipeline.
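As a preview of what that looks like, a query-time filter on the classifier's output can be passed to a retriever like this (a minimal sketch; the `retriever` object and the "music" label are placeholders defined later in the tutorial):

```python
# Hypothetical preview: restrict retrieval to documents the classifier labeled "music".
# `retriever` is built later in this tutorial; "classification.label" is the
# metadata field the classifier writes into each document.
docs_about_music = retriever.retrieve(
    query="What is heavy metal?",
    filters={"classification.label": ["music"]},
)
```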


```python
# Install the latest release of Haystack in your own environment
#! pip install farm-haystack
```

```bash
%%bash

# Install the latest main of Haystack
!pip install --upgrade pip
!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,ocr]
pip install --upgrade pip
pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,ocr]

wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.04.tar.gz
tar -xvf xpdf-tools-linux-4.04.tar.gz && sudo cp xpdf-tools-linux-4.04/bin64/pdftotext /usr/local/bin

# Install pygraphviz
!apt install libgraphviz-dev
!pip install pygraphviz
apt install libgraphviz-dev
pip install pygraphviz
```

## Logging
@@ -51,31 +49,26 @@ logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logg

```python
logging.getLogger("haystack").setLevel(logging.INFO)
```

## Read and preprocess documents

```python
# Here are the imports we need
from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore
from haystack.nodes import PreProcessor, TransformersDocumentClassifier, FARMReader, BM25Retriever
from haystack.schema import Document
from haystack.utils import convert_files_to_docs, fetch_archive_from_http, print_answers
```


```python
# This fetches some sample files to work with
from haystack.utils import fetch_archive_from_http


# This fetches some sample files to work with
doc_dir = "data/tutorial16"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/preprocessing_tutorial16.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
```

## Read and preprocess documents



```python
# note that you can also use the document classifier before applying the PreProcessor, e.g. before splitting your documents
from haystack.nodes import PreProcessor
from haystack.utils import convert_files_to_docs

# note that you can also use the document classifier before applying the PreProcessor, e.g. before splitting your documents
all_docs = convert_files_to_docs(dir_path=doc_dir)
preprocessor_sliding_window = PreProcessor(split_overlap=3, split_length=10, split_respect_sentence_boundary=False)
docs_sliding_window = preprocessor_sliding_window.process(all_docs)
```
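To make the sliding window concrete: with `split_length=10` and `split_overlap=3`, consecutive splits share their last three words. A tiny illustration of the windowing arithmetic in plain Python (not the PreProcessor itself, whose exact boundary handling may differ):

```python
# Illustration of the sliding window: 10-word splits overlapping by 3 words.
words = "one two three four five six seven eight nine ten eleven twelve thirteen fourteen".split()
step = 10 - 3  # split_length - split_overlap
splits = [" ".join(words[i : i + 10]) for i in range(0, len(words), step)]
# First split: words 1-10; second split starts at word 8, repeating "eight nine ten".
print(splits)
```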
@@ -89,6 +82,9 @@ These classes can later on be accessed at query time.


```python
from haystack.nodes import TransformersDocumentClassifier


doc_classifier = TransformersDocumentClassifier(
model_name_or_path="cross-encoder/nli-distilroberta-base",
task="zero-shot-classification",
```

@@ -130,27 +126,53 @@ print(classified_docs[0].to_dict())
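As the note in the preprocessing cell says, the classifier can also run before the PreProcessor. A minimal sketch of that alternative order, assuming the `all_docs`, `preprocessor_sliding_window`, and `doc_classifier` objects from above (splits inherit their parent's metadata because the PreProcessor copies `meta` by default in Haystack v1):

```python
# Hypothetical alternative: classify whole documents first, then split them.
# Each split keeps the classification metadata of its source document.
classified_whole_docs = doc_classifier.predict(documents=all_docs)
docs_split_after_classification = preprocessor_sliding_window.process(classified_whole_docs)
```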

## Indexing

### Start an Elasticsearch server
You can start Elasticsearch on your local machine using Docker. If Docker is not readily available in your environment (e.g., in Colab notebooks), then you can manually download and execute Elasticsearch from source.


```python
# In Colab / No Docker environments: Start Elasticsearch from source
! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
! chown -R daemon:daemon elasticsearch-7.9.2
# Recommended: Start Elasticsearch using Docker via the Haystack utility function
from haystack.utils import launch_es

import os
from subprocess import Popen, PIPE, STDOUT
launch_es()
```

```python
es_server = Popen(
    ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1)  # as daemon
)
# wait until ES has started
! sleep 30
```
### Start an Elasticsearch server in Colab

If Docker is not readily available in your environment (e.g. in Colab notebooks), then you can manually download and execute Elasticsearch from source.


```bash
%%bash

wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
chown -R daemon:daemon elasticsearch-7.9.2
sudo -u daemon -- elasticsearch-7.9.2/bin/elasticsearch -d
```


```bash
%%bash --bg

sudo -u daemon -- elasticsearch-7.9.2/bin/elasticsearch
```


```python
# Connect to Elasticsearch
document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")
import os
import time

from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore

# Wait 30 seconds to make sure Elasticsearch is ready before continuing
time.sleep(30)

# Get the host where Elasticsearch is running, default to localhost
host = os.environ.get("ELASTICSEARCH_HOST", "localhost")

document_store = ElasticsearchDocumentStore(host=host, username="", password="", index="document")
```
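A fixed sleep works but wastes time when Elasticsearch comes up faster. Here is a sketch of a readiness poll using only the standard library (the port 9200 default is an assumption about your setup):

```python
import time
import urllib.request
from urllib.error import URLError


def wait_for_es(url: str = "http://localhost:9200", timeout: int = 60) -> None:
    """Poll the Elasticsearch root endpoint until it responds or the timeout expires."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            urllib.request.urlopen(url, timeout=2)  # any HTTP response means ES is up
            return
        except (URLError, ConnectionError):
            time.sleep(1)
    raise RuntimeError(f"Elasticsearch not reachable at {url} within {timeout}s")


wait_for_es(f"http://{host}:9200")
```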


@@ -177,6 +199,8 @@ All we have to do to filter for one of our classes is to set a filter on "classi
```python
# Initialize QA-Pipeline
from haystack.pipelines import ExtractiveQAPipeline
from haystack.nodes import FARMReader, BM25Retriever


retriever = BM25Retriever(document_store=document_store)
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)
```

@@ -194,6 +218,9 @@ prediction = pipe.run(
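The collapsed hunk ends inside the actual query call. A sketch of what such a filtered run can look like (the query string and the "music" label are illustrative):

```python
# Hypothetical filtered query: only documents classified as "music" are retrieved.
prediction = pipe.run(
    query="What is heavy metal?",
    params={
        "Retriever": {"top_k": 10, "filters": {"classification.label": ["music"]}},
        "Reader": {"top_k": 5},
    },
)
```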


```python
from haystack.utils import print_answers


print_answers(prediction, details="high")
```

@@ -204,10 +231,8 @@ print_answers(prediction, details="high")

```python
from pathlib import Path
from haystack.pipelines import Pipeline
from haystack.nodes import TextConverter, PreProcessor, FileTypeClassifier, PDFToTextConverter, DocxToTextConverter
```


```python
file_type_classifier = FileTypeClassifier()
text_converter = TextConverter()
pdf_converter = PDFToTextConverter()
```

@@ -237,7 +262,8 @@ indexing_pipeline_with_classification.add_node(

```python
indexing_pipeline_with_classification.add_node(
component=document_store, name="DocumentStore", inputs=["DocumentClassifier"]
)
indexing_pipeline_with_classification.draw("index_time_document_classifier.png")
# Uncomment the following to generate the pipeline image
# indexing_pipeline_with_classification.draw("index_time_document_classifier.png")

document_store.delete_documents()
txt_files = [f for f in Path(doc_dir).iterdir() if f.suffix == ".txt"]
```
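The lines collapsed above presumably feed the files into the pipeline; a sketch of such a call, restricted for illustration to the `txt_files` list built above:

```python
# Hypothetical: index the text files through the classification-enabled pipeline.
indexing_pipeline_with_classification.run(file_paths=txt_files)
```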
