feat: Create the TextIndexingPipeline (#3473)

* Add TextIndexingPipeline
* Run Black formatting
* Incorporate reviewer feedback

Co-authored-by: ZanSara <[email protected]>
deepset-ai · Nov 1, 2022 · 0b2e71d · 0b2e71d
1 parent 6022441
commit 0b2e71d
Showing 1 changed file with 31 additions and 0 deletions.
diff --git a/haystack/pipelines/standard_pipelines.py b/haystack/pipelines/standard_pipelines.py
@@ -20,6 +20,7 @@
 from haystack.nodes.question_generator.question_generator import QuestionGenerator
 from haystack.document_stores.base import BaseDocumentStore
 from haystack.pipelines.base import Pipeline
+from haystack.nodes import PreProcessor, TextConverter
 
 
 logger = logging.getLogger(__name__)
@@ -757,3 +758,33 @@ def run_batch( # type: ignore
  :param index: Optionally specify the name of index to query the document from. If None, the DocumentStore's default index (self.index) will be used.
  """
  return self.run(document_ids=document_ids, filters=filters, top_k=top_k, index=index)
+
+
+class TextIndexingPipeline(BaseStandardPipeline):
+    """
+    Standard pipeline that converts text files into Documents, preprocesses them, and indexes them
+    into a DocumentStore.
+
+    The nodes are chained as: File -> TextConverter -> PreProcessor -> DocumentStore.
+    """
+
+    def __init__(
+        self,
+        document_store: BaseDocumentStore,
+        text_converter: Optional[TextConverter] = None,
+        preprocessor: Optional[PreProcessor] = None,
+    ):
+        """
+        Initialize a basic Pipeline that converts text files into Documents and indexes them into a DocumentStore.
+
+        :param document_store: The DocumentStore to index the Documents into.
+        :param text_converter: A TextConverter object to be used in this pipeline for converting the text files
+                               into Documents. If None, a TextConverter with default settings is created.
+        :param preprocessor: A PreProcessor object to be used in this pipeline for preprocessing Documents.
+                             If None, a PreProcessor with default settings is created.
+        """
+
+        self.pipeline = Pipeline()
+        self.document_store = document_store
+        # Compare against None explicitly instead of relying on truthiness, so that a component
+        # supplied by the caller is never silently swapped for a default one.
+        self.text_converter = TextConverter() if text_converter is None else text_converter
+        self.preprocessor = PreProcessor() if preprocessor is None else preprocessor
+        self.pipeline.add_node(component=self.text_converter, name="TextConverter", inputs=["File"])
+        self.pipeline.add_node(component=self.preprocessor, name="PreProcessor", inputs=["TextConverter"])
+        self.pipeline.add_node(component=self.document_store, name="DocumentStore", inputs=["PreProcessor"])
+
+    def run(self, file_path):
+        """
+        Run the indexing pipeline on a single file.
+
+        :param file_path: Path of the text file to index. It is wrapped in a one-element list and passed
+                          to the underlying Pipeline as `file_paths`.
+        :return: The output of the underlying `Pipeline.run()` call.
+        """
+        return self.pipeline.run(file_paths=[file_path])
+
+    def run_batch(self, file_paths):
+        """
+        Run the indexing pipeline on several files at once.
+
+        :param file_paths: Paths of the text files to index. They are passed unchanged to the underlying
+                           Pipeline as `file_paths`.
+        :return: The output of the underlying `Pipeline.run_batch()` call.
+        """
+        return self.pipeline.run_batch(file_paths=file_paths)