
Adding tutorial 10 #108

Merged 4 commits on Apr 13, 2021
Changes from 1 commit
Adding tutorial 10
Julian Risch committed Apr 9, 2021
commit fbe02b7b3fc3d0d958baa9482f34855b9b3ca59f
@@ -219,6 +219,14 @@
"label3": "",
"order": 8
},
{
"id": "tutorial10md",
"title": "Question Answering on a Knowledge Graph",
"label1": "tutorials_haystack",
"label2": "",
"label3": "",
"order": 8
},
{
"id": "api_haystack",
"title": "API reference",
@@ -44,13 +44,11 @@ Make sure you enable the GPU runtime to experience decent speed in this tutorial
# Install the latest master of Haystack
!pip install git+https://github.com/deepset-ai/haystack.git
!pip install urllib3==1.25.4
!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html

```


```python
from haystack import Finder
from haystack.preprocessor.cleaning import clean_wiki_text
from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
@@ -74,19 +72,19 @@ You can start Elasticsearch on your local machine instance using Docker. If Dock

```python
# Recommended: Start Elasticsearch using Docker
#! docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2
#! docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2
```


```python
# In Colab / No Docker environments: Start Elasticsearch from source
! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q
! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz
! chown -R daemon:daemon elasticsearch-7.6.2
! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
! chown -R daemon:daemon elasticsearch-7.9.2

import os
from subprocess import Popen, PIPE, STDOUT
es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],
es_server = Popen(['elasticsearch-7.9.2/bin/elasticsearch'],
stdout=PIPE, stderr=STDOUT,
preexec_fn=lambda: os.setuid(1) # as daemon
)
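# Optional sketch (not part of the original diff): wait until Elasticsearch
# has started before indexing. The 30 seconds are an assumed safety margin;
# polling http://localhost:9200 until it responds would be more robust.
! sleep 30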
@@ -206,13 +204,17 @@ reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=Tr
# reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)
```

### Finder
### Pipeline

The Finder sticks together reader and retriever in a pipeline to answer our actual questions.
With a Haystack `Pipeline` you can stick together your building blocks into a search pipeline.
Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases.
To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `ExtractiveQAPipeline` that combines a retriever and a reader to answer our questions.
You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd).


```python
finder = Finder(reader, retriever)
from haystack.pipeline import ExtractiveQAPipeline
pipe = ExtractiveQAPipeline(reader, retriever)
```
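
Since `Pipelines` are DAGs, you could also wire up the same two components by hand. A minimal sketch, assuming the `Pipeline.add_node` API of Haystack's 0.x releases:

```python
from haystack.pipeline import Pipeline

# Build the equivalent graph manually: the query is routed to the retriever,
# and the retriever's documents are routed to the reader.
custom_pipe = Pipeline()
custom_pipe.add_node(component=retriever, name="Retriever", inputs=["Query"])
custom_pipe.add_node(component=reader, name="Reader", inputs=["Retriever"])
```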

## Voilà! Ask a question!
@@ -221,13 +223,13 @@
```python
# You can configure how many candidates the reader and retriever shall return
# The higher top_k_retriever, the better (but also the slower) your answers.
prediction = finder.get_answers(question="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5)
prediction = pipe.run(query="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5)
```


```python
# prediction = finder.get_answers(question="Who created the Dothraki vocabulary?", top_k_reader=5)
# prediction = finder.get_answers(question="Who is the sister of Sansa?", top_k_reader=5)
# prediction = pipe.run(query="Who created the Dothraki vocabulary?", top_k_reader=5)
# prediction = pipe.run(query="Who is the sister of Sansa?", top_k_reader=5)
```
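
To inspect the results, a minimal sketch using the `print_answers` helper (assuming it is importable from `haystack.utils`, as in the other tutorials):

```python
from haystack.utils import print_answers

# Print only the answer strings and their context snippets
print_answers(prediction, details="minimal")
```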


@@ -0,0 +1,131 @@
---
title: "Tutorial 10"
metaTitle: "Knowledge Graph QA"
metaDescription: ""
slug: "/docs/tutorial10"
date: "2021-04-06"
id: "tutorial10md"
---

# Question Answering on a Knowledge Graph

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial10_Knowledge_Graph.ipynb)

Haystack allows storing and querying knowledge graphs with the help of pre-trained models that translate text queries to SPARQL queries.
This tutorial demonstrates how to load an existing knowledge graph into Haystack, load a pre-trained retriever, and execute text queries on the knowledge graph.
The training of models that translate text queries into SPARQL queries is currently not supported.


```python
# Install the latest release of Haystack in your own environment
#! pip install farm-haystack

# Install the latest master of Haystack
!pip install git+https://github.com/deepset-ai/haystack.git
```


```python
# Here are some imports that we'll need

import subprocess
import time
from pathlib import Path

from haystack.graph_retriever.text_to_sparql import Text2SparqlRetriever
from haystack.knowledge_graph.graphdb import GraphDBKnowledgeGraph
from haystack.preprocessor.utils import fetch_archive_from_http
```

## Downloading Knowledge Graph and Model


```python
# Let's first fetch some triples that we want to store in our knowledge graph
# Here: exemplary triples from the wizarding world
graph_dir = "../data/tutorial10_knowledge_graph/"
s3_url = "https://fandom-qa.s3-eu-west-1.amazonaws.com/triples_and_config.zip"
fetch_archive_from_http(url=s3_url, output_dir=graph_dir)

# Fetch a pre-trained BART model that translates text queries to SPARQL queries
model_dir = "../saved_models/tutorial10_knowledge_graph/"
s3_url = "https://fandom-qa.s3-eu-west-1.amazonaws.com/saved_models/hp_v3.4.zip"
fetch_archive_from_http(url=s3_url, output_dir=model_dir)
```

## Launching a GraphDB instance


```python
# Unfortunately, there seems to be no good way to run GraphDB in Colab environments
# In your local environment, you could start a GraphDB server with docker
# Feel free to check GraphDB's website for the free version https://www.ontotext.com/products/graphdb/graphdb-free/
print("Starting GraphDB ...")
status = subprocess.run(
    ['docker run -d -p 7200:7200 --name graphdb-instance-tutorial docker-registry.ontotext.com/graphdb-free:9.4.1-adoptopenjdk11'], shell=True
)
if status.returncode:
    raise Exception("Failed to launch GraphDB. Maybe it is already running, or a container with that name already exists that you could start instead?")
time.sleep(5)
```
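
When you are done, you can stop and remove the container again with the standard Docker CLI (a sketch for local environments):

```python
# Clean up the GraphDB container started above
#! docker stop graphdb-instance-tutorial
#! docker rm graphdb-instance-tutorial
```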

## Creating a new GraphDB repository (also known as index in Haystack's document stores)


```python
# Initialize a knowledge graph connected to GraphDB and use "tutorial_10_index" as the name of the index
kg = GraphDBKnowledgeGraph(index="tutorial_10_index")

# Delete the index as it might have been already created in previous runs
kg.delete_index()

# Create the index based on a configuration file
kg.create_index(config_path=Path(graph_dir+"repo-config.ttl"))

# Import triples of subject, predicate, and object statements from a ttl file
kg.import_from_ttl_file(index="tutorial_10_index", path=Path(graph_dir+"triples.ttl"))
print(f"The last triple stored in the knowledge graph is: {kg.get_all_triples()[-1]}")
print(f"There are {len(kg.get_all_triples())} triples stored in the knowledge graph.")
```


```python
# Define prefixes for names of resources so that we can use shorter resource names in queries
prefixes = """PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX hp: <https://deepset.ai/harry_potter/>
"""
kg.prefixes = prefixes

# Load a pre-trained model that translates text queries to SPARQL queries
kgqa_retriever = Text2SparqlRetriever(knowledge_graph=kg, model_name_or_path=model_dir+"hp_v3.4")
```
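
Thanks to the registered prefixes, queries can refer to resources by their short names. A small sketch showing that both spellings address the same resource, reusing `_query_kg` exactly as the tutorial does below:

```python
# With the hp: prefix registered, these two queries are equivalent
result_short = kgqa_retriever._query_kg(
    sparql_query="select ?obj where { hp:Hermione_granger hp:patronus ?obj . }"
)
result_full = kgqa_retriever._query_kg(
    sparql_query="select ?obj where { <https://deepset.ai/harry_potter/Hermione_granger> <https://deepset.ai/harry_potter/patronus> ?obj . }"
)
```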

## Query Execution

We can now ask questions that will be answered by our knowledge graph!
One limitation though: our pre-trained model can only generate SPARQL queries about resources it has seen during training.
Otherwise, it cannot translate the name of the resource to the identifier used in the knowledge graph.
E.g. "Harry" -> "hp:Harry_potter"


```python
query = "In which house is Harry Potter?"
print(f"Translating the text query \"{query}\" to a SPARQL query and executing it on the knowledge graph...")
result = kgqa_retriever.retrieve(query=query)
print(result)
# Correct SPARQL query: select ?a { hp:Harry_potter hp:house ?a . }
# Correct answer: Gryffindor

print("Executing a SPARQL query with prefixed names of resources...")
result = kgqa_retriever._query_kg(sparql_query="select distinct ?sbj where { ?sbj hp:job hp:Keeper_of_keys_and_grounds . }")
print(result)
# Paraphrased question: Who is the keeper of keys and grounds?
# Correct answer: Rubeus Hagrid

print("Executing a SPARQL query with full names of resources...")
result = kgqa_retriever._query_kg(sparql_query="select distinct ?obj where { <https://deepset.ai/harry_potter/Hermione_granger> <https://deepset.ai/harry_potter/patronus> ?obj . }")
print(result)
# Paraphrased question: What is the patronus of Hermione?
# Correct answer: Otter
```
@@ -39,8 +39,6 @@ Make sure you enable the GPU runtime to experience decent speed in this tutorial
# Install the latest master of Haystack
!pip install git+https://github.com/deepset-ai/haystack.git
!pip install urllib3==1.25.4
!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html

```


@@ -39,8 +39,6 @@ Make sure you enable the GPU runtime to experience decent speed in this tutorial
# Install the latest master of Haystack
!pip install git+https://github.com/deepset-ai/haystack.git
!pip install urllib3==1.25.4
!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html

```


@@ -153,28 +151,32 @@ reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=Tr
# reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)
```

### Finder
### Pipeline

The Finder sticks together reader and retriever in a pipeline to answer our actual questions.
With a Haystack `Pipeline` you can stick together your building blocks into a search pipeline.
Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases.
To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `ExtractiveQAPipeline` that combines a retriever and a reader to answer our questions.
You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd).


```python
finder = Finder(reader, retriever)
from haystack.pipeline import ExtractiveQAPipeline
pipe = ExtractiveQAPipeline(reader, retriever)
```

## Voilà! Ask a question!


```python
# You can configure how many candidates the reader and retriever shall return
# The higher top_k_retriever, the better (but also the slower) your answers.
prediction = finder.get_answers(question="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5)
# The higher top_k_retriever, the better (but also the slower) your answers.
prediction = pipe.run(query="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5)
```
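
The returned prediction is a plain dictionary in these Haystack versions; a minimal sketch for inspecting it (the key names are an assumption based on the 0.x result format):

```python
# Each answer entry carries the answer string, a score, and its context
for answer in prediction["answers"]:
    print(answer["answer"], "|", answer["score"])
```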


```python
# prediction = finder.get_answers(question="Who created the Dothraki vocabulary?", top_k_reader=5)
# prediction = finder.get_answers(question="Who is the sister of Sansa?", top_k_reader=5)
# prediction = pipe.run(query="Who created the Dothraki vocabulary?", top_k_reader=5)
# prediction = pipe.run(query="Who is the sister of Sansa?", top_k_reader=5)
```


@@ -47,8 +47,6 @@ Make sure you enable the GPU runtime to experience decent speed in this tutorial
# Install the latest master of Haystack
!pip install git+https://github.com/deepset-ai/haystack.git
!pip install urllib3==1.25.4
!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html

```


@@ -136,19 +134,26 @@ print(df.head())
# Get embeddings for our questions from the FAQs
questions = list(df["question"].values)
df["question_emb"] = retriever.embed_queries(texts=questions)
df = df.rename(columns={"answer": "text"})
df = df.rename(columns={"question": "text"})

# Convert Dataframe to list of dicts and index them in our DocumentStore
docs_to_index = df.to_dict(orient="records")
document_store.write_documents(docs_to_index)
```
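
The rename to `text` matters here: similarity is computed against the embedded questions (`question_emb`), so the matched document's `text` must hold the question, while the answer travels along as metadata. A quick sanity check (a sketch; field names follow the dataframe above):

```python
# Each record now exposes the FAQ question as "text" and keeps its "answer"
print(docs_to_index[0]["text"])
print(docs_to_index[0]["answer"])
```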

### Ask questions
Initialize a Finder (this time without a reader) and ask questions
Initialize a Pipeline (this time without a reader) and ask questions


```python
finder = Finder(reader=None, retriever=retriever)
prediction = finder.get_answers_via_similar_questions(question="How is the virus spreading?", top_k_retriever=10)
from haystack.pipeline import FAQPipeline
pipe = FAQPipeline(retriever=retriever)
```


```python
prediction = pipe.run(query="How is the virus spreading?", top_k_retriever=10)
print_answers(prediction, details="all")


```