Refactor communication between Pipeline Components #1321

Merged Sep 10, 2021 (55 commits)
Changes from 1 commit
Commits
324e615  Add POC for extractive-qa pipeline (oryx1729, Aug 5, 2021)
09eabbc  Remove kwargs from run (oryx1729, Aug 16, 2021)
b431e3d  Remove kwargs from reader run (oryx1729, Aug 16, 2021)
73739e4  Add handling of debug information from nodes (oryx1729, Aug 16, 2021)
97d6ab1  Fix type hints (oryx1729, Aug 16, 2021)
4de5f13  Fix standard pipelines (oryx1729, Aug 16, 2021)
0b1e8e1  Handle null params (oryx1729, Aug 16, 2021)
8d0af9f  Refactor run() for all components (oryx1729, Aug 16, 2021)
9e078eb  Fix type hint (oryx1729, Aug 16, 2021)
9406cda  Fix EvalDocuments (oryx1729, Aug 16, 2021)
525486e  Fix typing (oryx1729, Aug 17, 2021)
4466c54  Fix EvalAnswers (oryx1729, Aug 17, 2021)
7108c5f  Fix Summarizer test (oryx1729, Aug 17, 2021)
79cbb2e  Fix EvalAnswers (oryx1729, Aug 17, 2021)
a4ab9b4  Fix test (oryx1729, Aug 17, 2021)
de1cd30  Fix Ray test (oryx1729, Aug 17, 2021)
50f90be  Fix QueryClassifier (oryx1729, Aug 17, 2021)
76d82ed  Fix TransformersQueryClassifier (oryx1729, Aug 17, 2021)
bbb7886  Fix SklearnQueryClassifier (oryx1729, Aug 17, 2021)
d6ab5c1  Add support for more types as Pipeline inputs (oryx1729, Aug 18, 2021)
e2b750a  Fix eval test (oryx1729, Aug 18, 2021)
2d7ddbf  Fix RayPipeline (oryx1729, Aug 18, 2021)
94cdd59  Fix QuestionGenerator (oryx1729, Aug 18, 2021)
af03490  Cast Reader.run_batch() results to dict (oryx1729, Aug 18, 2021)
e8a9eb4  Fix Retriever.run() (oryx1729, Aug 18, 2021)
2bb3603  Fix translator (oryx1729, Aug 18, 2021)
d2a3763  Fix JoinDocuments (oryx1729, Aug 18, 2021)
0a9855c  Allows dicts as run params (oryx1729, Aug 18, 2021)
6658042  Fix typing (oryx1729, Aug 18, 2021)
5014db5  Update Pipeline tests (oryx1729, Aug 18, 2021)
b71c791  Adjust Pipeline tests (oryx1729, Aug 18, 2021)
b0b045d  Refactor REST APIs (oryx1729, Aug 19, 2021)
9d05f97  Subclass dict for primitives (oryx1729, Aug 19, 2021)
032b977  Revert dict cast for primitives (oryx1729, Aug 19, 2021)
28ae659  Update tests for rest_api (oryx1729, Aug 19, 2021)
9498fea  Fix pipeline test (oryx1729, Aug 19, 2021)
9fa4252  Fix Eval (oryx1729, Aug 19, 2021)
a960bf7  Add tests for invalid input to Pipelines (oryx1729, Aug 19, 2021)
d673516  Add docstring for _dispatch_run() (oryx1729, Aug 20, 2021)
266ba37  Adapt UI query endpoint (oryx1729, Aug 20, 2021)
6eeb4ac  Update tutorials (oryx1729, Aug 20, 2021)
fb82bb3  Fix filters dict access in query API (oryx1729, Aug 20, 2021)
b383000  Update tutorial (oryx1729, Sep 2, 2021)
45c1d55  Add type hints for run() in eval.py (oryx1729, Sep 2, 2021)
cfbcc05  Fix docstring (oryx1729, Sep 2, 2021)
7bd549c  Add explicit args for run() in BaseComponent (oryx1729, Sep 2, 2021)
8cf409a  Update docstrings for standard pipelines (oryx1729, Sep 2, 2021)
1c28554  Add missing import (oryx1729, Sep 2, 2021)
ef46880  Add test for _debug (oryx1729, Sep 2, 2021)
f97acb4  Update example in README (oryx1729, Sep 2, 2021)
c8ff595  Update Pipelines README (oryx1729, Sep 9, 2021)
d2c9755  Update Pipeline Tutorial (oryx1729, Sep 9, 2021)
ca1c214  Remove kwargs from crawler run() (oryx1729, Sep 9, 2021)
49f5d8a  Remove kwargs from FileTypeClassifier run() (oryx1729, Sep 9, 2021)
32b6aae  Fix QueryClassifier in tutorial (oryx1729, Sep 9, 2021)
Refactor run() for all components
oryx1729 committed Sep 2, 2021
commit 8d0af9f203b364bac76312a9bb48fd3bff7d5db5
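Taken together, the diffs in this commit apply one pattern: a component's run() no longer accepts a catch-all **kwargs that it has to echo back into its output; it declares explicit arguments and returns only what it produces, plus the name of its outgoing edge. A minimal before/after sketch of that contract (MyNode and its placeholder logic are illustrative, not part of this PR; the outgoing_edges attribute follows haystack's BaseComponent convention):

    from typing import List, Optional
    from haystack import BaseComponent, Document

    class MyNode(BaseComponent):
        outgoing_edges = 1

        # Before: nodes accepted **kwargs and re-emitted it so that unrelated
        # parameters could flow through the pipeline graph:
        #     def run(self, query: str, documents: List[Document], **kwargs):
        #         results = ...  # component-specific work
        #         return {"query": query, "documents": results, **kwargs}, "output_1"

        # After: explicit arguments in, own output only; the Pipeline does the routing.
        def run(self, query: str, documents: List[Document], top_k: Optional[int] = None):
            results = documents[:top_k] if top_k else documents  # placeholder logic
            return {"documents": results}, "output_1"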
8 changes: 2 additions & 6 deletions haystack/classifier/base.py
@@ -26,7 +26,7 @@ def predict(self, query: str, documents: List[Document], top_k: Optional[int] =
     def predict_batch(self, query_doc_list: List[dict], top_k: Optional[int] = None, batch_size: Optional[int] = None):
         pass
 
-    def run(self, query: str, documents: List[Document], top_k: Optional[int] = None, **kwargs):  # type: ignore
+    def run(self, query: str, documents: List[Document], top_k: Optional[int] = None):  # type: ignore
         self.query_count += 1
         if documents:
             predict = self.timing(self.predict, "query_time")
@@ -36,11 +36,7 @@ def run(self, query: str, documents: List[Document], top_k: Optional[int] = None
 
         document_ids = [doc.id for doc in results]
         logger.debug(f"Retrieved documents with IDs: {document_ids}")
-        output = {
-            "query": query,
-            "documents": results,
-            **kwargs
-        }
+        output = {"documents": results}
 
         return output, "output_1"
7 changes: 3 additions & 4 deletions haystack/classifier/farm.py
@@ -31,13 +31,12 @@ class FARMClassifier(BaseClassifier):
     retriever = ElasticsearchRetriever(document_store=document_store)
     classifier = FARMClassifier(model_name_or_path="deepset/bert-base-german-cased-sentiment-Germeval17")
     p = Pipeline()
-    p.add_node(component=retriever, name="ESRetriever", inputs=["Query"])
-    p.add_node(component=classifier, name="Classifier", inputs=["ESRetriever"])
+    p.add_node(component=retriever, name="Retriever", inputs=["Query"])
+    p.add_node(component=classifier, name="Classifier", inputs=["Retriever"])
 
     res = p_extractive.run(
         query="Who is the father of Arya Stark?",
-        top_k_retriever=10,
-        top_k_reader=5
+        params={"Retriever": {"top_k": 10}, "Classifier": {"top_k": 5}}
     )
 
     print(res["documents"][0].to_dict()["meta"]["classification"]["label"])
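The updated docstring shows the new calling convention: instead of flat arguments such as top_k_retriever, parameters are grouped under the name each node was given in add_node(). A short sketch of how such a call maps onto the graph (values mirror the docstring; treating top-level non-node keys like "filters" as shared parameters is an assumption based on the removed ExtractiveQAPipeline wrapper further down):

    res = p.run(
        query="Who is the father of Arya Stark?",
        params={
            "Retriever": {"top_k": 10},   # routed to the node added with name="Retriever"
            "Classifier": {"top_k": 5},   # routed to the node added with name="Classifier"
        },
    )
    print(res["documents"][0].to_dict()["meta"]["classification"]["label"])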
4 changes: 2 additions & 2 deletions haystack/document_store/base.py
@@ -286,9 +286,9 @@ def delete_all_documents(self, index: Optional[str] = None, filters: Optional[Di
     def delete_documents(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None):
         pass
 
-    def run(self, documents: List[dict], index: Optional[str] = None, **kwargs):  # type: ignore
+    def run(self, documents: List[dict], index: Optional[str] = None):  # type: ignore
         self.write_documents(documents=documents, index=index)
-        return kwargs, "output_1"
+        return {}, "output_1"
 
     @abstractmethod
     def get_documents_by_id(self, ids: List[str], index: Optional[str] = None,
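With the kwargs echo gone, a DocumentStore used as the terminal node of an indexing pipeline writes the incoming documents and returns an empty dict rather than reflecting its inputs. A minimal sketch (the concrete store instance, e.g. an InMemoryDocumentStore, and the index name are illustrative):

    output, edge = document_store.run(documents=[{"text": "some text"}], index="document")
    assert output == {}
    assert edge == "output_1"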
26 changes: 13 additions & 13 deletions haystack/eval.py
@@ -22,7 +22,7 @@ class EvalDocuments:
     a look at our evaluation tutorial for more info about open vs closed domain eval (
     https://haystack.deepset.ai/tutorials/evaluation).
     """
-    def __init__(self, debug: bool=False, open_domain: bool=True, top_k_eval_documents: int=10, name="EvalDocuments"):
+    def __init__(self, debug: bool=False, open_domain: bool=True, top_k: int=10, name="EvalDocuments"):
         """
         :param open_domain: When True, a document is considered correctly retrieved so long as the answer string can be found within it.
                             When False, correct retrieval is evaluated based on document_id.
@@ -35,7 +35,7 @@ def __init__(self, debug: bool=False, open_domain: bool=True, top_k_eval_documen
         self.debug = debug
         self.log: List = []
         self.open_domain = open_domain
-        self.top_k_eval_documents = top_k_eval_documents
+        self.top_k = top_k
         self.name = name
         self.too_few_docs_warning = False
         self.top_k_used = 0
@@ -53,25 +53,25 @@ def init_counts(self):
         self.reciprocal_rank_sum = 0.0
         self.has_answer_reciprocal_rank_sum = 0.0
 
-    def run(self, documents, labels: dict, top_k_eval_documents: Optional[int]=None, **kwargs):
+    def run(self, documents, labels: dict, top_k: Optional[int]=None, **kwargs):
         """Run this node on one sample and its labels"""
         self.query_count += 1
         retriever_labels = get_label(labels, kwargs["node_id"])
-        if not top_k_eval_documents:
-            top_k_eval_documents = self.top_k_eval_documents
+        if not top_k:
+            top_k = self.top_k
 
         if not self.top_k_used:
-            self.top_k_used = top_k_eval_documents
-        elif self.top_k_used != top_k_eval_documents:
+            self.top_k_used = top_k
+        elif self.top_k_used != top_k:
             logger.warning(f"EvalDocuments was last run with top_k_eval_documents={self.top_k_used} but is "
-                           f"being run again with top_k_eval_documents={self.top_k_eval_documents}. "
+                           f"being run again with top_k={self.top_k}. "
                            f"The evaluation counter is being reset from this point so that the evaluation "
                            f"metrics are interpretable.")
             self.init_counts()
 
-        if len(documents) < top_k_eval_documents and not self.too_few_docs_warning:
-            logger.warning(f"EvalDocuments is being provided less candidate documents than top_k_eval_documents "
-                           f"(currently set to {top_k_eval_documents}).")
+        if len(documents) < top_k and not self.too_few_docs_warning:
+            logger.warning(f"EvalDocuments is being provided less candidate documents than top_k "
+                           f"(currently set to {top_k}).")
             self.too_few_docs_warning = True
 
         # TODO retriever_labels is currently a Multilabel object but should eventually be a RetrieverLabel object
@@ -89,7 +89,7 @@ def run(self, documents, labels: dict, top_k_eval_documents: Optional[int]=None,
         # If there are answer span annotations in the labels
         else:
             self.has_answer_count += 1
-            retrieved_reciprocal_rank = self.reciprocal_rank_retrieved(retriever_labels, documents, top_k_eval_documents)
+            retrieved_reciprocal_rank = self.reciprocal_rank_retrieved(retriever_labels, documents, top_k)
             self.reciprocal_rank_sum += retrieved_reciprocal_rank
             correct_retrieval = True if retrieved_reciprocal_rank > 0 else False
             self.has_answer_correct += int(correct_retrieval)
@@ -101,7 +101,7 @@ def run(self, documents, labels: dict, top_k_eval_documents: Optional[int]=None,
         self.recall = self.correct_retrieval_count / self.query_count
         self.mean_reciprocal_rank = self.reciprocal_rank_sum / self.query_count
 
-        self.top_k_used = top_k_eval_documents
+        self.top_k_used = top_k
 
         if self.debug:
             self.log.append({"documents": documents, "labels": labels, "correct_retrieval": correct_retrieval, "retrieved_reciprocal_rank": retrieved_reciprocal_rank, **kwargs})
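The eval.py changes are essentially a rename: EvalDocuments' top_k_eval_documents becomes top_k in both the constructor and run(), with the warning and counter-reset logic carried over. Under the new params convention, a per-run override would presumably be addressed by node name, as in this sketch (the pipeline wiring is illustrative; "EvalDocuments" is the class's default name argument):

    eval_docs = EvalDocuments(top_k=10, open_domain=True)
    # ... added to an evaluation pipeline, e.g.:
    # pipeline.add_node(component=eval_docs, name="EvalDocuments", inputs=["Retriever"])

    # Override the configured top_k for a single run:
    output = pipeline.run(
        query="Who is the father of Arya Stark?",
        params={"EvalDocuments": {"top_k": 5}},
    )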
4 changes: 2 additions & 2 deletions haystack/file_converter/base.py
@@ -91,7 +91,7 @@ def validate_language(self, text: str) -> bool:
     def run(self, file_paths: Union[Path, List[Path]],  # type: ignore
             meta: Optional[Union[Dict[str, str], List[Dict[str, str]]]] = None,  # type: ignore
             remove_numeric_tables: Optional[bool] = None,  # type: ignore
-            valid_languages: Optional[List[str]] = None, **kwargs):  # type: ignore
+            valid_languages: Optional[List[str]] = None):  # type: ignore
 
         if isinstance(file_paths, Path):
             file_paths = [file_paths]
@@ -110,7 +110,7 @@ def run(self, file_paths: Union[Path, List[Path]],  # type: ignore
                 )
             )
 
-        result = {"documents": documents, **kwargs}
+        result = {"documents": documents}
         return result, "output_1"
5 changes: 2 additions & 3 deletions haystack/generator/base.py
@@ -23,12 +23,11 @@ def predict(self, query: str, documents: List[Document], top_k: Optional[int]) -
         """
         pass
 
-    def run(self, query: str, documents: List[Document], top_k_generator: Optional[int] = None, **kwargs):  # type: ignore
+    def run(self, query: str, documents: List[Document], top_k: Optional[int] = None):  # type: ignore
 
         if documents:
-            results = self.predict(query=query, documents=documents, top_k=top_k_generator)
+            results = self.predict(query=query, documents=documents, top_k=top_k)
         else:
            results = {"answers": []}
 
-        results.update(**kwargs)
         return results, "output_1"
6 changes: 2 additions & 4 deletions haystack/graph_retriever/base.py
@@ -15,9 +15,7 @@ def retrieve(self, query: str, top_k: int):
     def eval(self):
         raise NotImplementedError
 
-    def run(self, query: str, top_k: int, **kwargs):  # type: ignore
+    def run(self, query: str, top_k: int):  # type: ignore
         answers = self.retrieve(query=query, top_k=top_k)
-        results = {"query": query,
-                   "answers": answers,
-                   **kwargs}
+        results = {"answers": answers}
         return results, "output_1"
2 changes: 1 addition & 1 deletion haystack/knowledge_graph/base.py
@@ -8,7 +8,7 @@ class BaseKnowledgeGraph(BaseComponent):
 
     def run(self, sparql_query: str, index: Optional[str] = None, **kwargs):  # type: ignore
         result = self.query(sparql_query=sparql_query, index=index)
-        output = {"sparql_result": result, **kwargs}
+        output = {"sparql_result": result}
         return output, "output_1"
 
     def query(self, sparql_query: str, index: Optional[str] = None):
71 changes: 16 additions & 55 deletions haystack/pipeline.py
@@ -254,15 +254,15 @@ def set_node(self, name: str, component):
         """
         self.graph.nodes[name]["component"] = component
 
-    def run(self, query: Optional[str] = None, file: Optional[str] = None, params: Optional[dict] = None):  # type: ignore
+    def run(self, query: Optional[str] = None, file_paths: Optional[List[str]] = None, params: Optional[dict] = None):  # type: ignore
         node_output = None
         queue = {
             self.root_node: {"root_node": self.root_node, "params": params}
         }  # ordered dict with "node_id" -> "input" mapping that acts as a FIFO queue
         if query:
             queue[self.root_node]["query"] = query
-        if file:
-            queue[self.root_node]["file"] = file
+        if file_paths:
+            queue[self.root_node]["file_paths"] = file_paths
         i = 0  # the first item is popped off the queue unless it is a "join" node with unprocessed predecessors
         while queue:
             node_id = list(queue.keys())[i]
@@ -505,6 +505,10 @@ def draw(self, path: Path = Path("pipeline.png")):
         """
         self.pipeline.draw(path)
 
+    def run(self, query: str, params: Optional[dict] = None):
+        output = self.pipeline.run(query=query, params=params)
+        return output
+
 
 class ExtractiveQAPipeline(BaseStandardPipeline):
     def __init__(self, reader: BaseReader, retriever: BaseRetriever):
@@ -518,15 +522,6 @@ def __init__(self, reader: BaseReader, retriever: BaseRetriever):
         self.pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
         self.pipeline.add_node(component=reader, name="Reader", inputs=["Retriever"])
 
-    def run(self, query: str, filters: Optional[Dict] = None, top_k_retriever: int = 10, top_k_reader: int = 10):
-        params = {
-            "filters": filters,
-            "Retriever": {"top_k": top_k_retriever},
-            "Reader": {"top_k": top_k_reader},
-        }
-        output = self.pipeline.run(query=query, params=params)
-        return output
-
 
 class DocumentSearchPipeline(BaseStandardPipeline):
     def __init__(self, retriever: BaseRetriever):
@@ -538,8 +533,7 @@ def __init__(self, retriever: BaseRetriever):
         self.pipeline = Pipeline()
         self.pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
 
-    def run(self, query: str, filters: Optional[Dict] = None, top_k_retriever: Optional[int] = None):
-        params = {"filters": filters, "Retriever": {"top_k": top_k_retriever}}
+    def run(self, query: str, params: Optional[dict] = None):
         output = self.pipeline.run(query=query, params=params)
         document_dicts = [doc.to_dict() for doc in output["documents"]]
         output["documents"] = document_dicts
@@ -558,60 +552,28 @@ def __init__(self, generator: BaseGenerator, retriever: BaseRetriever):
         self.pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
         self.pipeline.add_node(component=generator, name="Generator", inputs=["Retriever"])
 
-    def run(
-        self,
-        query: str,
-        filters: Optional[Dict] = None,
-        top_k_retriever: Optional[int] = None,
-        top_k_generator: Optional[int] = None
-    ):
-        params = {
-            "filters": filters,
-            "Retriever": {"top_k": top_k_retriever},
-            "Generator": {"top_k": top_k_generator},
-        }
-        output = self.pipeline.run(query=query, params=params)
-        return output
-
 
 class SearchSummarizationPipeline(BaseStandardPipeline):
-    def __init__(self, summarizer: BaseSummarizer, retriever: BaseRetriever):
+    def __init__(self, summarizer: BaseSummarizer, retriever: BaseRetriever, return_in_answer_format: bool = False):
         """
         Initialize a Pipeline that retrieves documents for a query and then summarizes those documents.
 
         :param summarizer: Summarizer instance
         :param retriever: Retriever instance
+        :param return_in_answer_format: Whether the results should be returned as documents (False) or in the answer
+                                        format used in other QA pipelines (True). With the latter, you can use this
+                                        pipeline as a "drop-in replacement" for other QA pipelines.
         """
         self.pipeline = Pipeline()
         self.pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
         self.pipeline.add_node(component=summarizer, name="Summarizer", inputs=["Retriever"])
+        self.return_in_answer_format = return_in_answer_format
 
-    def run(
-        self,
-        query: str,
-        filters: Optional[Dict] = None,
-        top_k_retriever: Optional[int] = None,
-        generate_single_summary: Optional[bool] = None,
-        return_in_answer_format: bool = False,
-    ):
-        """
-        :param query: Your search query
-        :param filters:
-        :param top_k_retriever: Number of top docs the retriever should pass to the summarizer.
-                                The higher this value, the slower your pipeline.
-        :param generate_single_summary: Whether to generate single summary from all retrieved docs (True) or one per doc (False).
-        :param return_in_answer_format: Whether the results should be returned as documents (False) or in the answer format used in other QA pipelines (True).
-                                        With the latter, you can use this pipeline as a "drop-in replacement" for other QA pipelines.
-        """
-        params = {
-            "filters": filters,
-            "Retriever": {"top_k": top_k_retriever},
-            "Summarizer": {"generate_single_summary": generate_single_summary},
-        }
+    def run(self, query: str, params: Optional[dict] = None):
        output = self.pipeline.run(query=query, params=params)
 
         # Convert to answer format to allow "drop-in replacement" for other QA pipelines
-        if return_in_answer_format:
+        if self.return_in_answer_format:
             results: Dict = {"query": query, "answers": []}

[Inline review comment from a project member on the line above]: We could probably also use the new Doc2Answer node instead here. But I think it's not really the scope of this PR

             docs = deepcopy(output["documents"])
             for doc in docs:
@@ -642,8 +604,7 @@ def __init__(self, retriever: BaseRetriever):
         self.pipeline = Pipeline()
         self.pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
 
-    def run(self, query: str, filters: Optional[Dict] = None, top_k_retriever: Optional[int] = None):
-        params = {"filters": filters, "Retriever": {"top_k": top_k_retriever}}
+    def run(self, query: str, params: Optional[dict] = None):
         output = self.pipeline.run(query=query, params=params)
         documents = output["documents"]
 
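Net effect of the pipeline.py changes: Pipeline.run() gains file_paths (replacing file) so indexing pipelines share the same entry point, the standard pipelines inherit a single run(query, params) from BaseStandardPipeline instead of each re-wrapping flat keyword arguments, and SearchSummarizationPipeline's return_in_answer_format moves from a per-call flag to a constructor flag. A hedged usage sketch (the retriever, summarizer, and indexing_pipeline instances are assumed to exist):

    # Query pipelines: one shared signature, parameters addressed per node.
    pipe = DocumentSearchPipeline(retriever=retriever)
    res = pipe.run(query="Who is the father of Arya Stark?",
                   params={"Retriever": {"top_k": 10}})

    # The output format is now fixed at construction time rather than per call:
    summary_pipe = SearchSummarizationPipeline(summarizer=summarizer,
                                               retriever=retriever,
                                               return_in_answer_format=True)

    # Indexing-style runs go through the renamed argument (illustrative path):
    # indexing_pipeline.run(file_paths=["data/sample.pdf"])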
3 changes: 1 addition & 2 deletions haystack/preprocessor/base.py
@@ -47,7 +47,6 @@ def run(  # type: ignore
             split_length: Optional[int] = None,
             split_overlap: Optional[int] = None,
             split_respect_sentence_boundary: Optional[bool] = None,
-            **kwargs,
     ):
         documents = self.process(
             documents=documents,
@@ -59,5 +58,5 @@ def run(  # type: ignore
             split_overlap=split_overlap,
             split_respect_sentence_boundary=split_respect_sentence_boundary,
         )
-        result = {"documents": documents, **kwargs}
+        result = {"documents": documents}
         return result, "output_1"
8 changes: 4 additions & 4 deletions haystack/question_generator/question_generator.py
@@ -1,7 +1,8 @@
 from transformers import AutoModelForSeq2SeqLM
 from transformers import AutoTokenizer
-from haystack import BaseComponent
+from haystack import BaseComponent, Document
 from haystack.preprocessor import PreProcessor
+from typing import List
 
 
 class QuestionGenerator(BaseComponent):
@@ -50,16 +51,15 @@ def __init__(self,
         self.preprocessor = PreProcessor()
         self.prompt = prompt
 
-    def run(self, **kwargs):
-        documents = kwargs["documents"]
+    def run(self, documents: List[Document]):
         generated_questions = []
         for d in documents:
             questions = self.generate(d.text)
             curr_dict = {"document_id": d.id,
                          "document_sample": d.text[:200],
                          "questions": questions}
             generated_questions.append(curr_dict)
-        output = {"generated_questions": generated_questions, **kwargs}
+        output = {"generated_questions": generated_questions}
         return output, "output_1"
 
     def generate(self, text):
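QuestionGenerator.run() now takes a typed documents list instead of digging it out of **kwargs, which is why Document and List join the imports at the top of the file. A sketch of calling the node directly (constructor defaults are assumed; the output keys follow curr_dict above):

    from haystack import Document

    qg = QuestionGenerator()  # assuming the class's default model settings
    docs = [Document(text="Arya Stark is a daughter of Eddard Stark.")]
    output, edge = qg.run(documents=docs)
    # edge == "output_1"; each entry has "document_id", "document_sample", "questions"
    print(output["generated_questions"][0]["questions"])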
10 changes: 3 additions & 7 deletions haystack/ranker/base.py
@@ -26,21 +26,17 @@ def predict(self, query: str, documents: List[Document], top_k: Optional[int] =
     def predict_batch(self, query_doc_list: List[dict], top_k: Optional[int] = None, batch_size: Optional[int] = None):
         pass
 
-    def run(self, query: str, documents: List[Document], top_k_ranker: Optional[int] = None, **kwargs):  # type: ignore
+    def run(self, query: str, documents: List[Document], top_k: Optional[int] = None):  # type: ignore
         self.query_count += 1
         if documents:
             predict = self.timing(self.predict, "query_time")
-            results = predict(query=query, documents=documents, top_k=top_k_ranker)
+            results = predict(query=query, documents=documents, top_k=top_k)
         else:
             results = []
 
         document_ids = [doc.id for doc in results]
         logger.debug(f"Retrieved documents with IDs: {document_ids}")
-        output = {
-            "query": query,
-            "documents": results,
-            **kwargs
-        }
+        output = {"documents": results}
 
         return output, "output_1"
4 changes: 2 additions & 2 deletions haystack/reader/base.py
@@ -67,7 +67,7 @@ def run(self, query: str, documents: List[Document], top_k: Optional[int] = None
 
         return results, "output_1"
 
-    def run_batch(self, query_doc_list: List[Dict], top_k_reader: Optional[int] = None):
+    def run_batch(self, query_doc_list: List[Dict], top_k: Optional[int] = None):
         """ A unoptimized implementation of running Reader queries in batch """
         self.query_count += len(query_doc_list)
         results = []
@@ -76,7 +76,7 @@ def run_batch(self, query_doc_list: List[Dict], top_k: Optional[int] = No
             q = qd["queries"]
             docs = qd["docs"]
             predict = self.timing(self.predict, "query_time")
-            result = predict(query=q, documents=docs, top_k=top_k_reader)
+            result = predict(query=q, documents=docs, top_k=top_k)
             results.append(result)
         else:
             results = [{"answers": [], "query": ""}]