Skip to content

Commit

Permalink
feat: Create the TextIndexingPipeline (#3473)
Browse files Browse the repository at this point in the history
* Add TextIndexingPipeline

* Run Black formatting

* Incorporate reviewer feedback

Co-authored-by: ZanSara <[email protected]>
  • Loading branch information
brandenchan and ZanSara authored Nov 1, 2022
1 parent 6022441 commit 0b2e71d
Showing 1 changed file with 31 additions and 0 deletions.
31 changes: 31 additions & 0 deletions haystack/pipelines/standard_pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from haystack.nodes.question_generator.question_generator import QuestionGenerator
from haystack.document_stores.base import BaseDocumentStore
from haystack.pipelines.base import Pipeline
from haystack.nodes import PreProcessor, TextConverter


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -757,3 +758,33 @@ def run_batch( # type: ignore
:param index: Optionally specify the name of index to query the document from. If None, the DocumentStore's default index (self.index) will be used.
"""
return self.run(document_ids=document_ids, filters=filters, top_k=top_k, index=index)


class TextIndexingPipeline(BaseStandardPipeline):
    """
    Standard pipeline that chains a TextConverter, a PreProcessor and a DocumentStore
    so that text files are converted, preprocessed and then indexed.
    """

    def __init__(
        self,
        document_store: BaseDocumentStore,
        text_converter: Optional[TextConverter] = None,
        preprocessor: Optional[PreProcessor] = None,
    ):
        """
        Build the indexing pipeline: File -> TextConverter -> PreProcessor -> DocumentStore.

        :param document_store: The DocumentStore that receives the Documents; registered as the last node.
        :param text_converter: The TextConverter that turns the text files into Documents.
                               A default ``TextConverter()`` is created when none is supplied.
        :param preprocessor: The PreProcessor applied to the converted Documents.
                             A default ``PreProcessor()`` is created when none is supplied.
        """
        self.pipeline = Pipeline()
        self.text_converter = text_converter if text_converter else TextConverter()
        self.preprocessor = preprocessor if preprocessor else PreProcessor()
        self.document_store = document_store

        # (component, node name, name of the upstream node), listed in the order the nodes are registered.
        stages = (
            (self.text_converter, "TextConverter", "File"),
            (self.preprocessor, "PreProcessor", "TextConverter"),
            (self.document_store, "DocumentStore", "PreProcessor"),
        )
        for component, node_name, upstream in stages:
            self.pipeline.add_node(component=component, name=node_name, inputs=[upstream])

    def run(self, file_path):
        """
        Send a single file through the pipeline.

        :param file_path: Path of the file to index. It is wrapped in a one-element list
                          and handed to the underlying pipeline as ``file_paths``.
        :return: Whatever the underlying ``Pipeline.run`` call returns.
        """
        single_file = [file_path]
        return self.pipeline.run(file_paths=single_file)

    def run_batch(self, file_paths):
        """
        Send several files through the pipeline in one call.

        :param file_paths: The file paths to index, forwarded unchanged as ``file_paths``.
        :return: Whatever the underlying ``Pipeline.run_batch`` call returns.
        """
        return self.pipeline.run_batch(file_paths=file_paths)

0 comments on commit 0b2e71d

Please sign in to comment.