Improve tutorials' output (#1694)

* Modify __str__ and __repr__ for Document and Answer * Rename QueryClassifier in Tutorial11 * Improve the output of tutorial1 * Make the output of Tutorial8 a bit less dense * Add a print_questions util to print the output of question generating pipelines * Replace custom printing with the new utility in Tutorial13 * Ensure all output is printed with minimal details in Tutorial14 and add some titles * Minor change to print_answers * Make tutorial3's output the same as tutorial1 * Add __repr__ to Answer and fix to_dict() * Fix a bug in the Document and Answer's __str__ method * Improve print_answers, print_documents and print_questions * Using print_answers in Tutorial7 and fixing typo in the utils * Remove duplicate line in Tutorial12 * Use print_answers in Tutorial4 * Add explanation of what the documents in the output of the basic QA pipeline are * Move the fields constant into print_answers * Normalize all 'minimal' to 'minimum' (they were mixed up) * Improve the sample output to include all fields from Document and Answer Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
deepset-ai · Nov 9, 2021 · 91cafb4 · 91cafb4
1 parent 861522b
commit 91cafb4
Show file tree

Hide file tree

Showing 27 changed files with 487 additions and 187 deletions.
diff --git a/docs/_src/tutorials/tutorials/1.md b/docs/_src/tutorials/tutorials/1.md
@@ -237,7 +237,35 @@ prediction = pipe.run(
 
 
 ```python
-print_answers(prediction, details="minimal")
+# Now you can either print the object directly...
+from pprint import pprint
+
+pprint(prediction)
+
+# Sample output: 
+# {
+# 'answers': [ <Answer: answer='Eddard', type='extractive', score=0.9919578731060028, offsets_in_document=[{'start': 608, 'end': 615}], offsets_in_context=[{'start': 72, 'end': 79}], document_id='cc75f739897ecbf8c14657b13dda890e', meta={'name': '454_Music_of_Game_of_Thrones.txt'}, context='...' >,
+# <Answer: answer='Ned', type='extractive', score=0.9767240881919861, offsets_in_document=[{'start': 3687, 'end': 3801}], offsets_in_context=[{'start': 18, 'end': 132}], document_id='9acf17ec9083c4022f69eb4a37187080', meta={'name': '454_Music_of_Game_of_Thrones.txt'}, context='...' >,
+# ...
+# ],
+# 'documents': [ <Document: content_type='text', score=0.8034909798951382, meta={'name': '332_Sansa_Stark.txt'}, embedding=None, id='d1f36ec7170e4c46cde65787fe125dfe', content='\n===\'\'A Game of Thrones\'\'===\nSansa Stark begins the novel by being betrothed to Crown ...'>,
+# <Document: content_type='text', score=0.8002150354529785, meta={'name': '191_Gendry.txt'}, embedding=None, id='dd4e070a22896afa81748d6510006d2', content='\n===Season 2===\nGendry travels North with Yoren and other Night\'s Watch recruits, including Arya ...'>,
+# ...
+# ],
+# 'no_ans_gap': 11.688868522644043,
+# 'node_id': 'Reader',
+# 'params': {'Reader': {'top_k': 5}, 'Retriever': {'top_k': 5}},
+# 'query': 'Who is the father of Arya Stark?',
+# 'root_node': 'Query'
+# }
+
+```
+
+
+```python
+# ...or use a util to simplify the output
+# Change `minimum` to `medium` or `all` to raise the level of detail
+print_answers(prediction, details="minimum")
 ```
 
 ## About us

diff --git a/docs/_src/tutorials/tutorials/11.md b/docs/_src/tutorials/tutorials/11.md
@@ -296,7 +296,7 @@ Below, we define a very naive `QueryClassifier` and show how to use it:
 
 
 ```python
-class QueryClassifier(BaseComponent):
+class CustomQueryClassifier(BaseComponent):
  outgoing_edges = 2
 
  def run(self, query: str):
@@ -307,7 +307,7 @@ class QueryClassifier(BaseComponent):
 
 # Here we build the pipeline
 p_classifier = Pipeline()
-p_classifier.add_node(component=QueryClassifier(), name="QueryClassifier", inputs=["Query"])
+p_classifier.add_node(component=CustomQueryClassifier(), name="QueryClassifier", inputs=["Query"])
 p_classifier.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_1"])
 p_classifier.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_2"])
 p_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "DPRRetriever"])

diff --git a/docs/_src/tutorials/tutorials/13.md b/docs/_src/tutorials/tutorials/13.md
@@ -42,7 +42,8 @@ from tqdm import tqdm
 from haystack.nodes import QuestionGenerator, ElasticsearchRetriever, FARMReader
 from haystack.document_stores import ElasticsearchDocumentStore
 from haystack.pipelines import QuestionGenerationPipeline, RetrieverQuestionGenerationPipeline, QuestionAnswerGenerationPipeline
-from haystack.utils import launch_es
+from haystack.utils import launch_es, print_questions
+
 ```
 
 Let's start an Elasticsearch instance with one of the options below:
@@ -98,9 +99,11 @@ which the document can answer.
 
 ```python
 question_generation_pipeline = QuestionGenerationPipeline(question_generator)
-for document in document_store:
+for idx, document in enumerate(document_store):
+
+ print(f"\n * Generating questions for document {idx}: {document.content[:100]}...\n")
  result = question_generation_pipeline.run(documents=[document])
- pprint(result)
+ print_questions(result)
 ```
 
 ## Retriever Question Generation Pipeline
@@ -111,8 +114,10 @@ This pipeline takes a query as input. It retrieves relevant documents and then g
 ```python
 retriever = ElasticsearchRetriever(document_store=document_store)
 rqg_pipeline = RetrieverQuestionGenerationPipeline(retriever, question_generator)
+
+print(f"\n * Generating questions for documents matching the query 'Arya Stark'\n")
 result = rqg_pipeline.run(query="Arya Stark")
-pprint(result)
+print_questions(result)
 ```
 
 ## Question Answer Generation Pipeline
@@ -124,9 +129,11 @@ a Reader model
 ```python
 reader = FARMReader("deepset/roberta-base-squad2")
 qag_pipeline = QuestionAnswerGenerationPipeline(question_generator, reader)
-for document in tqdm(document_store):
+for idx, document in enumerate(tqdm(document_store)):
+
+ print(f"\n * Generating questions and answers for document {idx}: {document.content[:100]}...\n")
  result = qag_pipeline.run(documents=[document])
- pprint(result)
+ print_questions(result)
 ```
 
 ## About us

diff --git a/docs/_src/tutorials/tutorials/14.md b/docs/_src/tutorials/tutorials/14.md
@@ -161,14 +161,14 @@ res_1 = sklearn_keyword_classifier.run(
  query="Who is the father of Arya Stark?"
 )
 print("DPR Results" + "\n" + "="*15)
-print_answers(res_1)
+print_answers(res_1, details="minimum")
 
 # Run only the sparse retriever on a keyword based query
 res_2 = sklearn_keyword_classifier.run(
  query="arya stark father"
 )
 print("ES Results" + "\n" + "="*15)
-print_answers(res_2)
+print_answers(res_2, details="minimum")
 
 ```
 
@@ -180,14 +180,14 @@ res_3 = sklearn_keyword_classifier.run(
  query="which country was jon snow filmed ?"
 )
 print("DPR Results" + "\n" + "="*15)
-print_answers(res_3)
+print_answers(res_3, details="minimum")
 
 # Run only the sparse retriever on a keyword based query
 res_4 = sklearn_keyword_classifier.run(
  query="jon snow country"
 )
 print("ES Results" + "\n" + "="*15)
-print_answers(res_4)
+print_answers(res_4, details="minimum")
 ```
 
 
@@ -197,14 +197,14 @@ res_5 = sklearn_keyword_classifier.run(
  query="who are the younger brothers of arya stark ?"
 )
 print("DPR Results" + "\n" + "="*15)
-print_answers(res_5)
+print_answers(res_5, details="minimum")
 
 # Run only the sparse retriever on a keyword based query
 res_6 = sklearn_keyword_classifier.run(
  query="arya stark younger brothers"
 )
 print("ES Results" + "\n" + "="*15)
-print_answers(res_6)
+print_answers(res_6, details="minimum")
 ```
 
 ## Transformer Keyword vs Question/Statement Classifier
@@ -234,14 +234,14 @@ res_1 = transformer_keyword_classifier.run(
  query="Who is the father of Arya Stark?"
 )
 print("DPR Results" + "\n" + "="*15)
-print_answers(res_1)
+print_answers(res_1, details="minimum")
 
 # Run only the sparse retriever on a keyword based query
 res_2 = transformer_keyword_classifier.run(
  query="arya stark father"
 )
 print("ES Results" + "\n" + "="*15)
-print_answers(res_2)
+print_answers(res_2, details="minimum")
 
 ```
 
@@ -253,14 +253,14 @@ res_3 = transformer_keyword_classifier.run(
  query="which country was jon snow filmed ?"
 )
 print("DPR Results" + "\n" + "="*15)
-print_answers(res_3)
+print_answers(res_3, details="minimum")
 
 # Run only the sparse retriever on a keyword based query
 res_4 = transformer_keyword_classifier.run(
  query="jon snow country"
 )
 print("ES Results" + "\n" + "="*15)
-print_answers(res_4)
+print_answers(res_4, details="minimum")
 ```
 
 
@@ -270,14 +270,14 @@ res_5 = transformer_keyword_classifier.run(
  query="who are the younger brothers of arya stark ?"
 )
 print("DPR Results" + "\n" + "="*15)
-print_answers(res_5)
+print_answers(res_5, details="minimum")
 
 # Run only the sparse retriever on a keyword based query
 res_6 = transformer_keyword_classifier.run(
  query="arya stark younger brothers"
 )
 print("ES Results" + "\n" + "="*15)
-print_answers(res_6)
+print_answers(res_6, details="minimum")
 ```
 
 ## Question vs Statement Classifier
@@ -305,14 +305,14 @@ res_1 = transformer_question_classifier.run(
  query="Who is the father of Arya Stark?"
 )
 print("DPR Results" + "\n" + "="*15)
-print_answers(res_1)
+print_answers(res_1, details="minimum")
 
 # Show only DPR results
 res_2 = transformer_question_classifier.run(
  query="Arya Stark was the daughter of a Lord."
 )
 print("ES Results" + "\n" + "="*15)
-res_2
+print_answers(res_2, details="minimum")
 ```
 
 ## Standalone Query Classifier

diff --git a/docs/_src/tutorials/tutorials/3.md b/docs/_src/tutorials/tutorials/3.md
@@ -182,7 +182,34 @@ prediction = pipe.run(
 
 
 ```python
-print_answers(prediction, details="minimal")
+# Now you can either print the object directly...
+from pprint import pprint
+
+pprint(prediction)
+
+# Sample output: 
+# {
+# 'answers': [ <Answer: answer='Eddard', type='extractive', score=0.9919578731060028, offsets_in_document=[{'start': 608, 'end': 615}], offsets_in_context=[{'start': 72, 'end': 79}], document_id='cc75f739897ecbf8c14657b13dda890e', meta={'name': '454_Music_of_Game_of_Thrones.txt'}, context='...' >,
+# <Answer: answer='Ned', type='extractive', score=0.9767240881919861, offsets_in_document=[{'start': 3687, 'end': 3801}], offsets_in_context=[{'start': 18, 'end': 132}], document_id='9acf17ec9083c4022f69eb4a37187080', meta={'name': '454_Music_of_Game_of_Thrones.txt'}, context='...' >,
+# ...
+# ],
+# 'documents': [ <Document: content_type='text', score=0.8034909798951382, meta={'name': '332_Sansa_Stark.txt'}, embedding=None, id='d1f36ec7170e4c46cde65787fe125dfe', content='\n===\'\'A Game of Thrones\'\'===\nSansa Stark begins the novel by being betrothed to Crown ...'>,
+# <Document: content_type='text', score=0.8002150354529785, meta={'name': '191_Gendry.txt'}, embedding=None, id='dd4e070a22896afa81748d6510006d2', content='\n===Season 2===\nGendry travels North with Yoren and other Night\'s Watch recruits, including Arya ...'>,
+# ...
+# ],
+# 'no_ans_gap': 11.688868522644043,
+# 'node_id': 'Reader',
+# 'params': {'Reader': {'top_k': 5}, 'Retriever': {'top_k': 5}},
+# 'query': 'Who is the father of Arya Stark?',
+# 'root_node': 'Query'
+# }
+```
+
+
+```python
+# ...or use a util to simplify the output
+# Change `minimum` to `medium` or `all` to raise the level of detail
+print_answers(prediction, details="minimum")
 ```
 
 ## About us

diff --git a/docs/_src/tutorials/tutorials/4.md b/docs/_src/tutorials/tutorials/4.md
@@ -155,12 +155,10 @@ pipe = FAQPipeline(retriever=retriever)
 
 
 ```python
+from haystack.utils import print_answers
+
 prediction = pipe.run(query="How is the virus spreading?", params={"Retriever": {"top_k": 10}})
-for a in prediction["answers"]:
- print(f"Answer: {a.answer}")
- print(f"Question: {a.meta['query']}")
- print(f"Score: {a.score}")
- print("---------------------")
+print_answers(prediction, details="medium")
 ```
 
 ## About us

diff --git a/docs/_src/tutorials/tutorials/7.md b/docs/_src/tutorials/tutorials/7.md
@@ -193,11 +193,12 @@ for question in QUESTIONS:
 ```python
 # Or alternatively use the Pipeline class
 from haystack.pipelines import GenerativeQAPipeline
+from haystack.utils import print_answers
 
 pipe = GenerativeQAPipeline(generator=generator, retriever=retriever)
 for question in QUESTIONS:
  res = pipe.run(query=question, params={"Generator": {"top_k": 1}, "Retriever": {"top_k": 5}})
- print(res)
+ print_answers(res, details="minimum")
 ```
 
 ## About us

diff --git a/haystack/schema.py b/haystack/schema.py
@@ -186,10 +186,13 @@ def __eq__(self, other):
  getattr(other, 'id_hash_keys', None) == self.id_hash_keys)
 
  def __repr__(self):
- return str(self.to_dict())
+ return f"<Document: {str(self.to_dict())}>"
 
  def __str__(self):
- return f"content: {self.content[:100]} {'[...]' if len(self.content) > 100 else ''}"
+ # In some cases, self.content is None (therefore not subscriptable)
+ if not self.content:
+ return f"<Document: id={self.id}, content=None>"
+ return f"<Document: id={self.id}, content='{self.content[:100]} {'...' if len(self.content) > 100 else ''}'>"
 
  def __lt__(self, other):
  """ Enable sorting of Documents by score """
@@ -262,7 +265,13 @@ def __lt__(self, other):
  return self.score < other.score
 
  def __str__(self):
- return f"answer: {self.answer} \nscore: {self.score} \ncontext: {self.context}"
+ # self.context might be None (therefore not subscriptable)
+ if not self.context:
+ return f"<Answer: answer='{self.answer}', score={self.score}, context=None>"
+ return f"<Answer: answer='{self.answer}', score={self.score}, context='{self.context[:50]}{'...' if len(self.context) > 50 else ''}'>"
+
+ def __repr__(self):
+ return f"<Answer {asdict(self)}>"
 
  def to_dict(self):
  return asdict(self)

diff --git a/haystack/utils/__init__.py b/haystack/utils/__init__.py
@@ -16,6 +16,7 @@
 from haystack.utils.export_utils import (
  print_answers,
  print_documents,
+ print_questions,
  export_answers_to_csv,
  convert_labels_to_squad,
 )