make tutorial 17 testable (deepset-ai#24)
* make tutorial 17 testable

* try

* try

* try

* try
masci committed Sep 16, 2022
1 parent 529e86a commit 4c666fe
Showing 4 changed files with 136 additions and 73 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/nightly.yml
@@ -33,6 +33,7 @@ jobs:
- 11_Pipelines
- 12_LFQA
- 16_Document_Classifier_at_Index_Time
- 17_Audio

env:
HAYSTACK_TELEMETRY_ENABLED: "False"
@@ -47,6 +48,7 @@ jobs:
run: |
pip install pyzmq==23.2.1
pip install nbconvert
pip install ipython
- name: Convert notebook to Python
run: |
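The nightly job converts each tutorial notebook to a Python script and then (presumably, since the run step is truncated above) executes it, which is why `ipython` is now installed alongside `nbconvert`: nbconvert renders cell magics such as `%%bash` into `get_ipython().run_cell_magic(...)` calls that only resolve under an IPython interpreter. A minimal local sketch of that conversion step, with illustrative paths rather than the exact commands used by the workflow:

```python
# A local approximation of the nightly conversion step; paths are illustrative,
# not taken from the workflow file.
import subprocess

notebook = "tutorials/17_Audio.ipynb"  # hypothetical path to the tutorial notebook

# Convert the notebook into a plain Python script (writes tutorials/17_Audio.py)
subprocess.run(["jupyter", "nbconvert", "--to", "script", notebook], check=True)

# Execute the converted script with IPython so that converted cell magics
# (get_ipython().run_cell_magic("bash", ...)) still work.
subprocess.run(["ipython", "tutorials/17_Audio.py"], check=True)
```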
4 changes: 3 additions & 1 deletion .github/workflows/run_tutorials.yml
@@ -29,8 +29,10 @@ jobs:
- name: Install dependencies
# remove pip install pyzmq when this is resolved https://github.com/zeromq/pyzmq/issues/1764
run: |
apt-get update && apt-get install -y build-essential gcc libsndfile1 ffmpeg && rm -rf /var/lib/apt/lists/*
pip install pyzmq==23.2.1
pip install nbconvert
pip install nbconvert ipython
pip install "pyworld<=0.2.12" espnet espnet-model-zoo pydub
- name: Files changed
uses: jitterbit/get-changed-files@v1
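The extra system packages (`libsndfile1`, `ffmpeg`) and Python packages (`pyworld`, `espnet`, `espnet-model-zoo`, `pydub`) are what the audio nodes exercised by tutorial 17 rely on. A quick smoke test for the pydub/ffmpeg half of that stack — a hypothetical check, not part of the workflow itself:

```python
# Hypothetical sanity check for the audio tooling installed above.
import shutil

from pydub.generators import Sine

# pydub shells out to ffmpeg for encoding, so it must be on PATH
assert shutil.which("ffmpeg") is not None, "ffmpeg not found on PATH"

# Generate half a second of a 440 Hz tone and encode it through ffmpeg
tone = Sine(440).to_audio_segment(duration=500)
tone.export("test_tone.mp3", format="mp3")
print("audio stack looks usable")
```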
65 changes: 41 additions & 24 deletions markdowns/17.md
@@ -25,20 +25,23 @@ Make sure you enable the GPU runtime to experience decent speed in this tutorial

<img src="https://raw.githubusercontent.com/deepset-ai/haystack/main/docs/img/colab_gpu_runtime.jpg">

You can double check whether the GPU runtime is enabled with the following command:

```python
# Make sure you have a GPU running
!nvidia-smi

```bash
%%bash

nvidia-smi
```

To start, install the latest release of Haystack with `pip`:

```python
# Install the latest release of Haystack in your own environment
#! pip install farm-haystack

# Install the latest main of Haystack
!pip install --upgrade pip
!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,audio]
```bash
%%bash

pip install --upgrade pip
pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,audio]
```

## Logging
@@ -56,8 +59,8 @@ logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logg
logging.getLogger("haystack").setLevel(logging.INFO)
```

### Setup Elasticsearch

### Start an Elasticsearch server
You can start Elasticsearch on your local machine using Docker. If Docker is not readily available in your environment (e.g. in Colab notebooks), then you can manually download and execute Elasticsearch from source.


```python
@@ -67,21 +70,25 @@ from haystack.utils import launch_es
launch_es()
```

### Start an Elasticsearch server in Colab

```python
# In Colab / No Docker environments: Start Elasticsearch from source
! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
! chown -R daemon:daemon elasticsearch-7.9.2
If Docker is not readily available in your environment (e.g. in Colab notebooks), then you can manually download and execute Elasticsearch from source.

import os
from subprocess import Popen, PIPE, STDOUT

es_server = Popen(
["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1) # as daemon
)
# wait until ES has started
! sleep 30
```bash
%%bash

wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
chown -R daemon:daemon elasticsearch-7.9.2
sudo -u daemon -- elasticsearch-7.9.2/bin/elasticsearch -d
```


```bash
%%bash --bg

sudo -u daemon -- elasticsearch-7.9.2/bin/elasticsearch
```

### Populate the document store with `SpeechDocuments`
@@ -94,13 +101,23 @@ To the basic version, we can add here a DocumentToSpeech node that also generate


```python
import os
import time

from haystack.document_stores import ElasticsearchDocumentStore
from haystack.utils import fetch_archive_from_http, launch_es
from pathlib import Path
from haystack import Pipeline
from haystack.nodes import FileTypeClassifier, TextConverter, PreProcessor, DocumentToSpeech

document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")

# Wait 30 seconds just to be sure Elasticsearch is ready before continuing
time.sleep(30)

# Get the host where Elasticsearch is running, default to localhost
host = os.environ.get("ELASTICSEARCH_HOST", "localhost")

document_store = ElasticsearchDocumentStore(host=host, username="", password="", index="document")

# Get the documents
documents_path = "data/tutorial17"
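The updated markdown relies on a fixed `time.sleep(30)` before connecting to Elasticsearch. A more robust alternative, sketched below, polls the server's HTTP endpoint until it responds; the 9200 port and 60-second timeout are assumptions, not values taken from the tutorial:

```python
# Sketch of a readiness poll that could replace the fixed 30-second sleep.
import os
import time

import requests

host = os.environ.get("ELASTICSEARCH_HOST", "localhost")


def wait_for_elasticsearch(url: str, timeout: float = 60.0, interval: float = 2.0) -> None:
    """Poll the Elasticsearch root endpoint until it answers or the timeout expires."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            if requests.get(url, timeout=2).status_code == 200:
                return  # server is up and answering
        except requests.ConnectionError:
            pass  # not listening yet, keep waiting
        time.sleep(interval)
    raise TimeoutError(f"Elasticsearch at {url} did not come up within {timeout} seconds")


wait_for_elasticsearch(f"http://{host}:9200")
```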
138 changes: 90 additions & 48 deletions tutorials/17_Audio.ipynb
@@ -30,7 +30,9 @@
"Make sure you enable the GPU runtime to experience decent speed in this tutorial.\n",
"**Runtime -> Change Runtime type -> Hardware accelerator -> GPU**\n",
"\n",
"<img src=\"https://raw.githubusercontent.com/deepset-ai/haystack/main/docs/img/colab_gpu_runtime.jpg\">"
"<img src=\"https://raw.githubusercontent.com/deepset-ai/haystack/main/docs/img/colab_gpu_runtime.jpg\">\n",
"\n",
"You can double check whether the GPU runtime is enabled with the following command:"
]
},
{
@@ -40,79 +42,88 @@
"id": "uDHmaD2gB3SX",
"pycharm": {
"name": "#%%\n"
},
"vscode": {
"languageId": "shellscript"
}
},
"outputs": [],
"source": [
"# Make sure you have a GPU running\n",
"!nvidia-smi"
"%%bash\n",
"\n",
"nvidia-smi"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To start, install the latest release of Haystack with `pip`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "QsY0HC8JB3Sc"
"id": "QsY0HC8JB3Sc",
"vscode": {
"languageId": "shellscript"
}
},
"outputs": [],
"source": [
"# Install the latest release of Haystack in your own environment\n",
"#! pip install farm-haystack\n",
"%%bash\n",
"\n",
"# Install the latest main of Haystack\n",
"!pip install --upgrade pip\n",
"!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,audio]"
"pip install --upgrade pip\n",
"pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,audio]"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## Logging\n",
"\n",
"We configure how logging messages should be displayed and which log level should be used before importing Haystack.\n",
"Example log message:\n",
"INFO - haystack.utils.preprocessing - Converting data/tutorial1/218_Olenna_Tyrell.txt\n",
"Default log level in basicConfig is WARNING so the explicit parameter is not necessary but can be changed easily:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"import logging\n",
"\n",
"logging.basicConfig(format=\"%(levelname)s - %(name)s - %(message)s\", level=logging.WARNING)\n",
"logging.getLogger(\"haystack\").setLevel(logging.INFO)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ZCemC4_XB3Se"
},
"metadata": {},
"source": [
"### Setup Elasticsearch\n"
"### Start an Elasticsearch server\n",
"You can start Elasticsearch on your local machine instance using Docker. If Docker is not readily available in your environment (eg., in Colab notebooks), then you can manually download and execute Elasticsearch from source."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "BtEN_VgSB3Sg"
},
"metadata": {},
"outputs": [],
"source": [
"# Recommended: Start Elasticsearch using Docker via the Haystack utility function\n",
@@ -121,27 +132,46 @@
"launch_es()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Start an Elasticsearch server in Colab\n",
"\n",
"If Docker is not readily available in your environment (e.g. in Colab notebooks), then you can manually download and execute Elasticsearch from source."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "r-oqc2g1B3Si"
"vscode": {
"languageId": "shellscript"
}
},
"outputs": [],
"source": [
"# In Colab / No Docker environments: Start Elasticsearch from source\n",
"! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q\n",
"! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz\n",
"! chown -R daemon:daemon elasticsearch-7.9.2\n",
"%%bash\n",
"\n",
"import os\n",
"from subprocess import Popen, PIPE, STDOUT\n",
"wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q\n",
"tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz\n",
"chown -R daemon:daemon elasticsearch-7.9.2\n",
"sudo -u daemon -- elasticsearch-7.9.2/bin/elasticsearch -d"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "shellscript"
}
},
"outputs": [],
"source": [
"%%bash --bg\n",
"\n",
"es_server = Popen(\n",
" [\"elasticsearch-7.9.2/bin/elasticsearch\"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1) # as daemon\n",
")\n",
"# wait until ES has started\n",
"! sleep 30"
"sudo -u daemon -- elasticsearch-7.9.2/bin/elasticsearch"
]
},
{
@@ -170,13 +200,23 @@
},
"outputs": [],
"source": [
"import os\n",
"import time\n",
"\n",
"from haystack.document_stores import ElasticsearchDocumentStore\n",
"from haystack.utils import fetch_archive_from_http, launch_es\n",
"from pathlib import Path\n",
"from haystack import Pipeline\n",
"from haystack.nodes import FileTypeClassifier, TextConverter, PreProcessor, DocumentToSpeech\n",
"\n",
"document_store = ElasticsearchDocumentStore(host=\"localhost\", username=\"\", password=\"\", index=\"document\")\n",
"\n",
"# Wait 30 seconds only to be sure Elasticsearch is ready before continuing\n",
"time.sleep(30)\n",
"\n",
"# Get the host where Elasticsearch is running, default to localhost\n",
"host = os.environ.get(\"ELASTICSEARCH_HOST\", \"localhost\")\n",
"\n",
"document_store = ElasticsearchDocumentStore(host=host, username=\"\", password=\"\", index=\"document\")\n",
"\n",
"# Get the documents\n",
"documents_path = \"data/tutorial17\"\n",
@@ -554,11 +594,8 @@
"name": "Tutorial17_Audio.ipynb",
"provenance": []
},
"interpreter": {
"hash": "608574092bbd30ec12f87341bba285fb17e1c9fb49d850a21d7829c65ef2f8c3"
},
"kernelspec": {
"display_name": "Python 3.9.7 ('venv': venv)",
"display_name": "Python 3.10.6 64-bit",
"language": "python",
"name": "python3"
},
@@ -572,7 +609,12 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
"version": "3.10.6"
},
"vscode": {
"interpreter": {
"hash": "bda33b16be7e844498c7c2d368d72665b4f1d165582b9547ed22a0249a29ca2e"
}
}
},
"nbformat": 4,
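For completeness: the `%%bash --bg` cell added to the notebook starts Elasticsearch in the background under the `daemon` user. When the notebook is converted to a script for CI, roughly the same effect can be had with the standard library. The sketch below is an assumed equivalent, not code from the commit, and it presumes the archive was unpacked into the working directory as in the cell above:

```python
# Stdlib-only sketch of launching Elasticsearch in the background outside a notebook.
import subprocess

es_server = subprocess.Popen(
    ["sudo", "-u", "daemon", "--", "elasticsearch-7.9.2/bin/elasticsearch"],
    stdout=subprocess.DEVNULL,
    stderr=subprocess.STDOUT,
)
# The process keeps running in the background; the indexing code later waits
# (or polls) until the server accepts connections on port 9200.
```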
