Add basic telemetry features (deepset-ai#2314)
* add basic telemetry features

* change pipeline_config to _component_config

* Update Documentation & Code Style

* add super().__init__() calls to error classes

* make posthog mock work with python 3.7

* Update Documentation & Code Style

* update link to docs web page

* log exceptions, send event for raised HaystackErrors, refactor Path(CONFIG_PATH)

* add comment on send_event in BaseComponent.init() and fix mypy

* mock NonPrivateParameters and fix pylint undefined-variable

* Update Documentation & Code Style

* check model path contains multiple /

* add test for writing to file

* add test for en-/disable telemetry

* Update Documentation & Code Style

* merge file deletion methods and ignore pylint global statement

* Update Documentation & Code Style

* set env variable in demo to activate telemetry

* fix mock of HAYSTACK_TELEMETRY_ENABLED

* fix mypy and linter

* add CI as env variable to execution contexts

* remove threading, add test for custom error event

* Update Documentation & Code Style

* simplify config/log file deletion

* add test for final event being sent

* force writing config file in test

* make test compatible with python 3.7

* switch to posthog production server

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
julian-risch and github-actions[bot] committed Mar 21, 2022
1 parent e13df4b · commit ac5617e
Showing 59 changed files with 638 additions and 117 deletions.
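Most of the work in this commit lands in a new `haystack.telemetry` module, whose internals are largely outside the excerpt below. The commit message does name the opt-out switch (`HAYSTACK_TELEMETRY_ENABLED`); a minimal sketch of how such an env-var gate typically looks — the helper name and the exact truthiness rule are assumptions, not the actual implementation:

```python
import os

def telemetry_enabled() -> bool:
    # Hypothetical gate: the variable name comes from the commit message
    # ("fix mock of HAYSTACK_TELEMETRY_ENABLED"); treating any value other
    # than "False" as enabled is an assumption.
    return os.environ.get("HAYSTACK_TELEMETRY_ENABLED", "True") != "False"
```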
1 change: 1 addition & 0 deletions .github/workflows/demo/docker-compose.demo.yml
@@ -4,6 +4,7 @@ services:
     restart: always
     environment:
       CONCURRENT_REQUEST_PER_WORKER: 16
+      HAYSTACK_EXECUTION_CONTEXT: "public_demo"
     command: "/bin/bash -c 'sleep 10 && gunicorn rest_api.application:app -b 0.0.0.0 -k uvicorn.workers.UvicornWorker --workers 3 --timeout 180'"

   elasticsearch:
1 change: 1 addition & 0 deletions Dockerfile
@@ -43,6 +43,7 @@ RUN chmod 777 /home/user/rest_api/file-upload
 #COPY data /home/user/data

 EXPOSE 8000
+ENV HAYSTACK_DOCKER_CONTAINER="HAYSTACK_CPU_CONTAINER"

 # cmd for running the API
 CMD ["gunicorn", "rest_api.application:app", "-b", "0.0.0.0", "-k", "uvicorn.workers.UvicornWorker", "--workers", "1", "--timeout", "180"]
1 change: 1 addition & 0 deletions Dockerfile-GPU
@@ -60,6 +60,7 @@ RUN chmod 777 /home/user/rest_api/file-upload
 #COPY data /home/user/data

 EXPOSE 8000
+ENV HAYSTACK_DOCKER_CONTAINER="HAYSTACK_GPU_CONTAINER"

 # cmd for running the API (note: "--preload" is not working with cuda)
 CMD ["gunicorn", "rest_api.application:app", "-b", "0.0.0.0", "-k", "uvicorn.workers.UvicornWorker", "--workers", "1", "--timeout", "180"]
1 change: 1 addition & 0 deletions docs/_src/api/api/pipelines.md
@@ -439,6 +439,7 @@ then be found in the dict returned by this method under the key "_debug"
 #### eval

 ```python
+@send_event
 def eval(labels: List[MultiLabel], documents: Optional[List[List[Document]]] = None, params: Optional[dict] = None, sas_model_name_or_path: str = None, add_isolated_node_eval: bool = False) -> EvaluationResult
 ```

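For reference, a hypothetical call of the decorated method — `pipeline` and `eval_labels` are placeholders assumed to be set up elsewhere in the docs:

```python
# Hypothetical usage of Pipeline.eval(); names here are placeholders.
eval_result = pipeline.eval(
    labels=eval_labels,
    params={"Retriever": {"top_k": 5}},
    add_isolated_node_eval=True,
)
```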
4 changes: 2 additions & 2 deletions docs/_src/tutorials/tutorials/1.md
@@ -113,8 +113,8 @@ In this tutorial, we download Wikipedia articles about Game of Thrones, apply a
 ```python
 # Let's first fetch some documents that we want to query
 # Here: 517 Wikipedia articles for Game of Thrones
-doc_dir = "data/article_txt_got"
-s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
+doc_dir = "data/tutorial1"
+s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt1.zip"
 fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

 # Convert files to dicts
2 changes: 1 addition & 1 deletion docs/_src/tutorials/tutorials/10.md
@@ -44,7 +44,7 @@ from haystack.utils import fetch_archive_from_http
 ```python
 # Let's first fetch some triples that we want to store in our knowledge graph
 # Here: exemplary triples from the wizarding world
-graph_dir = "../data/tutorial10_knowledge_graph/"
+graph_dir = "data/tutorial10"
 s3_url = "https://fandom-qa.s3-eu-west-1.amazonaws.com/triples_and_config.zip"
 fetch_archive_from_http(url=s3_url, output_dir=graph_dir)

4 changes: 2 additions & 2 deletions docs/_src/tutorials/tutorials/11.md
@@ -82,8 +82,8 @@ from haystack.utils import (
 )

 # Download and prepare data - 517 Wikipedia articles for Game of Thrones
-doc_dir = "data/article_txt_got"
-s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
+doc_dir = "data/tutorial11"
+s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt11.zip"
 fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

 # convert files to dicts containing documents that can be indexed to our datastore
4 changes: 2 additions & 2 deletions docs/_src/tutorials/tutorials/12.md
@@ -65,8 +65,8 @@ Similarly to the previous tutorials, we download, convert and index some Game of

 ```python
 # Let's first get some files that we want to use
-doc_dir = "data/article_txt_got"
-s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
+doc_dir = "data/tutorial12"
+s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt12.zip"
 fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

 # Convert files to dicts
4 changes: 2 additions & 2 deletions docs/_src/tutorials/tutorials/14.md
@@ -103,8 +103,8 @@ from haystack.nodes import (
 )

 # Download and prepare data - 517 Wikipedia articles for Game of Thrones
-doc_dir = "data/article_txt_got"
-s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
+doc_dir = "data/tutorial14"
+s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt14.zip"
 fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

 # convert files to dicts containing documents that can be indexed to our datastore
2 changes: 1 addition & 1 deletion docs/_src/tutorials/tutorials/15.md
@@ -94,7 +94,7 @@ Just as text passages, tables are represented as `Document` objects in Haystack.
 # Here: 1000 tables from OTT-QA
 from haystack.utils import fetch_archive_from_http

-doc_dir = "data"
+doc_dir = "data/tutorial15"
 s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/ottqa_sample.zip"
 fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
 ```
4 changes: 2 additions & 2 deletions docs/_src/tutorials/tutorials/16.md
@@ -49,8 +49,8 @@ from haystack.utils import convert_files_to_dicts, fetch_archive_from_http, prin
 ```python
 # This fetches some sample files to work with

-doc_dir = "data/preprocessing_tutorial"
-s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/preprocessing_tutorial.zip"
+doc_dir = "data/tutorial16"
+s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/preprocessing_tutorial16.zip"
 fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
 ```

12 changes: 8 additions & 4 deletions docs/_src/tutorials/tutorials/2.md
@@ -44,6 +44,7 @@ Make sure you enable the GPU runtime to experience decent speed in this tutorial

 ```python
 from haystack.nodes import FARMReader
+from haystack.utils import fetch_archive_from_http
 ```


@@ -102,15 +103,18 @@ To get the most out of model distillation, we recommend increasing the size of y
 # Downloading script
 !wget https://raw.githubusercontent.com/deepset-ai/haystack/master/haystack/utils/augment_squad.py

+doc_dir = "data/tutorial2"
+
 # Downloading smaller glove vector file (only for demonstration purposes)
-!wget https://nlp.stanford.edu/data/glove.6B.zip
-!unzip glove.6B.zip
+glove_url = "https://nlp.stanford.edu/data/glove.6B.zip"
+fetch_archive_from_http(url=glove_url, output_dir=doc_dir)

 # Downloading very small dataset to make tutorial faster (please use a bigger dataset for real use cases)
-!wget https://raw.githubusercontent.com/deepset-ai/haystack/master/test/samples/squad/small.json
+s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/squad_small.json.zip"
+fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

 # Just replace the path with your dataset and adjust the output (also please remove glove path to use bigger glove vector file)
-!python augment_squad.py --squad_path small.json --output_path augmented_dataset.json --multiplication_factor 2 --glove_path glove.6B.300d.txt
+!python augment_squad.py --squad_path squad_small.json --output_path augmented_dataset.json --multiplication_factor 2 --glove_path glove.6B.300d.txt
 ```

 In this case, we use a multiplication factor of 2 to keep this example lightweight. Usually you would use a factor like 20 depending on the size of your training data. Augmenting this small dataset with a multiplication factor of 2, should take about 5 to 10 minutes to run on one V100 GPU.
4 changes: 2 additions & 2 deletions docs/_src/tutorials/tutorials/3.md
@@ -79,8 +79,8 @@ In this tutorial, we download Wikipedia articles on Game of Thrones, apply a bas
 ```python
 # Let's first get some documents that we want to query
 # Here: 517 Wikipedia articles for Game of Thrones
-doc_dir = "data/article_txt_got"
-s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
+doc_dir = "data/tutorial3"
+s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt3.zip"
 fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

 # convert files to dicts containing documents that can be indexed to our datastore
7 changes: 4 additions & 3 deletions docs/_src/tutorials/tutorials/4.md
@@ -64,7 +64,7 @@ You can start Elasticsearch on your local machine instance using Docker. If Dock

 ```python
 # Recommended: Start Elasticsearch using Docker via the Haystack utility function
-from haystack.utils import launch_es
+from haystack.utils import launch_es, fetch_archive_from_http

 launch_es()
 ```
@@ -126,8 +126,9 @@ Here: We download some question-answer pairs related to COVID-19

 ```python
 # Download
-temp = requests.get("https://raw.githubusercontent.com/deepset-ai/COVID-QA/master/data/faqs/faq_covidbert.csv")
-open("small_faq_covid.csv", "wb").write(temp.content)
+doc_dir = "data/tutorial4"
+s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/small_faq_covid.csv.zip"
+fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

 # Get dataframe with columns "question", "answer" and some custom metadata
 df = pd.read_csv("small_faq_covid.csv")
2 changes: 1 addition & 1 deletion docs/_src/tutorials/tutorials/5.md
@@ -69,7 +69,7 @@ es_server = Popen(
 from haystack.utils import fetch_archive_from_http

 # Download evaluation data, which is a subset of Natural Questions development set containing 50 documents with one question per document and multiple annotated answers
-doc_dir = "../data/nq"
+doc_dir = "data/tutorial5"
 s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/nq_dev_subset_v2.json.zip"
 fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
 ```
4 changes: 2 additions & 2 deletions docs/_src/tutorials/tutorials/6.md
@@ -136,8 +136,8 @@ Similarly to the previous tutorials, we download, convert and index some Game of

 ```python
 # Let's first get some files that we want to use
-doc_dir = "data/article_txt_got"
-s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
+doc_dir = "data/tutorial6"
+s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt6.zip"
 fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

 # Convert files to dicts
8 changes: 4 additions & 4 deletions docs/_src/tutorials/tutorials/7.md
@@ -51,6 +51,7 @@ import pandas as pd
 from haystack import Document
 from haystack.document_stores import FAISSDocumentStore
 from haystack.nodes import RAGenerator, DensePassageRetriever
+from haystack.utils import fetch_archive_from_http
 ```

 Let's download a csv containing some sample text and preprocess the data.
@@ -59,10 +60,9 @@ Let's download a csv containing some sample text and preprocess the data.

 ```python
 # Download sample
-temp = requests.get(
-    "https://raw.githubusercontent.com/deepset-ai/haystack/master/tutorials/small_generator_dataset.csv"
-)
-open("small_generator_dataset.csv", "wb").write(temp.content)
+doc_dir = "data/tutorial7/"
+s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/small_generator_dataset.csv.zip"
+fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

 # Create dataframe with columns "title" and "text"
 df = pd.read_csv("small_generator_dataset.csv", sep=",")
4 changes: 2 additions & 2 deletions docs/_src/tutorials/tutorials/8.md
@@ -51,8 +51,8 @@ from haystack.utils import convert_files_to_dicts, fetch_archive_from_http
 ```python
 # This fetches some sample files to work with

-doc_dir = "data/preprocessing_tutorial"
-s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/preprocessing_tutorial.zip"
+doc_dir = "data/tutorial8"
+s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/preprocessing_tutorial8.zip"
 fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
 ```

2 changes: 1 addition & 1 deletion docs/_src/tutorials/tutorials/9.md
@@ -95,7 +95,7 @@ Note that this data is probably only useful if you are trying to train from scra
 # Download original DPR data
 # WARNING: the train set is 7.4GB and the dev set is 800MB

-doc_dir = "data/dpr_training/"
+doc_dir = "data/tutorial9"

 s3_url_train = "https://dl.fbaipublicfiles.com/dpr/data/retriever/biencoder-nq-train.json.gz"
 s3_url_dev = "https://dl.fbaipublicfiles.com/dpr/data/retriever/biencoder-nq-dev.json.gz"
11 changes: 8 additions & 3 deletions haystack/errors.py
@@ -1,6 +1,7 @@
 # coding: utf8
 """Custom Errors for Haystack"""

+from haystack.telemetry import send_custom_event
 from typing import Optional


@@ -14,6 +15,7 @@ class HaystackError(Exception):
     """

     def __init__(self, message: Optional[str] = None, docs_link: Optional[str] = None):
+        send_custom_event(event=f"{type(self).__name__} raised")
         super().__init__()
         if message:
             self.message = message
@@ -45,7 +47,8 @@ def __init__(
 class PipelineSchemaError(PipelineError):
     """Exception for issues arising when reading/building the JSON schema of pipelines"""

-    pass
+    def __init__(self, message: Optional[str] = None):
+        super().__init__(message=message)


 class PipelineConfigError(PipelineError):
@@ -62,10 +65,12 @@ def __init__(
 class DocumentStoreError(HaystackError):
     """Exception for issues that occur in a document store"""

-    pass
+    def __init__(self, message: Optional[str] = None):
+        super().__init__(message=message)


 class DuplicateDocumentError(DocumentStoreError, ValueError):
     """Exception for Duplicate document"""

-    pass
+    def __init__(self, message: Optional[str] = None):
+        super().__init__(message=message)
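With this change, instantiating any `HaystackError` subclass emits a telemetry event named after the class before the exception propagates. A small usage sketch (assuming telemetry is enabled):

```python
from haystack.errors import DuplicateDocumentError

# Constructing the error runs HaystackError.__init__, which calls
# send_custom_event(event="DuplicateDocumentError raised") before the
# exception is raised to the caller.
raise DuplicateDocumentError(message="A document with this id already exists")
```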
2 changes: 1 addition & 1 deletion haystack/nodes/_json_schema.py
@@ -29,7 +29,7 @@
 )

 from haystack import __version__ as haystack_version
-from haystack.errors import HaystackError, PipelineSchemaError
+from haystack.errors import PipelineSchemaError
 from haystack.nodes.base import BaseComponent


5 changes: 5 additions & 0 deletions haystack/nodes/base.py
@@ -9,6 +9,7 @@
 import logging

 from haystack.schema import Document, MultiLabel
+from haystack.telemetry import send_custom_event
 from haystack.errors import HaystackError


@@ -55,6 +56,10 @@ class BaseComponent(ABC):
     _subclasses: dict = {}
     _component_config: dict = {}

+    def __init__(self):
+        # a small subset of the component's parameters is sent in an event after applying filters defined in haystack.telemetry.NonPrivateParameters
+        send_custom_event(event=f"{type(self).__name__} initialized", payload=self._component_config.get("params", {}))
+
     # __init_subclass__ is invoked when a subclass of BaseComponent is _imported_
     # (not instantiated). It works approximately as a metaclass.
     def __init_subclass__(cls, **kwargs):
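The in-code comment points to `haystack.telemetry.NonPrivateParameters`, whose definition is outside this diff. A minimal allowlist-style sketch of such a filter — the attribute, method, and permitted parameter names below are assumptions:

```python
from typing import Any, Dict

class NonPrivateParameters:
    # Assumed allowlist; the real set of permitted parameters in
    # haystack.telemetry is not shown in this commit excerpt.
    param_names = ["top_k", "model_name_or_path", "add_isolated_node_eval"]

    @classmethod
    def apply_filter(cls, param_dicts: Dict[str, Any]) -> Dict[str, Any]:
        # Keep only parameters explicitly considered non-private.
        return {k: v for k, v in param_dicts.items() if k in cls.param_names}
```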
2 changes: 2 additions & 0 deletions haystack/pipelines/base.py
@@ -49,6 +49,7 @@
 from haystack.nodes.base import BaseComponent
 from haystack.nodes.retriever.base import BaseRetriever
 from haystack.document_stores.base import BaseDocumentStore
+from haystack.telemetry import send_event


 logger = logging.getLogger(__name__)
@@ -690,6 +691,7 @@ def run(  # type: ignore
             i += 1  # attempt executing next node in the queue as current `node_id` has unprocessed predecessors
         return node_output

+    @send_event
     def eval(
         self,
         labels: List[MultiLabel],
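`@send_event` is only imported in this file; its body lives in `haystack.telemetry` and is not part of the diff. A plausible minimal shape for such a decorator, offered as a sketch rather than the actual implementation:

```python
import functools

from haystack.telemetry import send_custom_event  # import exists per this commit

def send_event(func):
    # Sketch: report that the wrapped method (e.g. Pipeline.eval) was invoked,
    # then execute it unchanged.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        send_custom_event(event=f"{func.__name__} executed")
        return func(*args, **kwargs)
    return wrapper
```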
(remaining changed files not shown)