Skip to content

Commit

Permalink
feat: added split by page to DocumentSplitter (#6753)
Browse files Browse the repository at this point in the history
* feat-added-split-by-page-to-DocumentSplitter

* added test case and the suggested changes

* Update document_splitter.py

* Update haystack/components/preprocessors/document_splitter.py

* Update test_document_splitter.py

---------

Co-authored-by: Sebastian Husch Lee <[email protected]>
  • Loading branch information
sahusiddharth and sjrl committed Jan 17, 2024
1 parent 6a15145 commit a7ac4ed
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 8 deletions.
19 changes: 12 additions & 7 deletions haystack/components/preprocessors/document_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,21 @@ class DocumentSplitter:
"""

def __init__(
self, split_by: Literal["word", "sentence", "passage"] = "word", split_length: int = 200, split_overlap: int = 0
self,
split_by: Literal["word", "sentence", "page", "passage"] = "word",
split_length: int = 200,
split_overlap: int = 0,
):
"""
:param split_by: The unit by which the document should be split. Choose from "word" for splitting by " ",
"sentence" for splitting by ".", or "passage" for splitting by "\\n\\n".
"sentence" for splitting by ".", "page" for splitting by "\\f", or "passage" for splitting by "\\n\\n".
:param split_length: The maximum number of units in each split.
:param split_overlap: The number of units that each split should overlap.
"""

self.split_by = split_by
if split_by not in ["word", "sentence", "passage"]:
raise ValueError("split_by must be one of 'word', 'sentence' or 'passage'.")
if split_by not in ["word", "sentence", "page", "passage"]:
raise ValueError("split_by must be one of 'word', 'sentence', 'page' or 'passage'.")
if split_length <= 0:
raise ValueError("split_length must be greater than 0.")
self.split_length = split_length
Expand Down Expand Up @@ -60,16 +63,18 @@ def run(self, documents: List[Document]):
split_docs += [Document(content=txt, meta=metadata) for txt in text_splits]
return {"documents": split_docs}

def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "passage"]) -> List[str]:
if split_by == "passage":
def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "passage", "page"]) -> List[str]:
if split_by == "page":
split_at = "\f"
elif split_by == "passage":
split_at = "\n\n"
elif split_by == "sentence":
split_at = "."
elif split_by == "word":
split_at = " "
else:
raise NotImplementedError(
"DocumentSplitter only supports 'passage', 'sentence' or 'word' split_by options."
"DocumentSplitter only supports 'word', 'sentence', 'page' or 'passage' split_by options."
)
units = text.split(split_at)
# Add the delimiter back to all units except the last one
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
enhancements:
- |
Added a "page" option for `split_by` in DocumentSplitter, which splits the document at form feed characters ("\f").
16 changes: 15 additions & 1 deletion test/components/preprocessors/test_document_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def test_empty_list(self):
assert res == {"documents": []}

def test_unsupported_split_by(self):
with pytest.raises(ValueError, match="split_by must be one of 'word', 'sentence' or 'passage'."):
with pytest.raises(ValueError, match="split_by must be one of 'word', 'sentence', 'page' or 'passage'."):
DocumentSplitter(split_by="unsupported")

def test_unsupported_split_length(self):
Expand Down Expand Up @@ -94,6 +94,20 @@ def test_split_by_passage(self):
assert result["documents"][1].content == "And there is a third sentence.\n\n"
assert result["documents"][2].content == " And another passage."

def test_split_by_page(self):
    """Check that split_by="page" with split_length=1 emits one document per form-feed-delimited page."""
    source_text = "This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
    page_splitter = DocumentSplitter(split_by="page", split_length=1)
    pages = page_splitter.run(documents=[Document(content=source_text)])["documents"]

    # Every page except the last is expected to keep its trailing form feed.
    assert len(pages) == 3
    assert pages[0].content == "This is a text with some words. There is a second sentence.\x0c"
    assert pages[1].content == " And there is a third sentence.\x0c"
    assert pages[2].content == " And another passage."

def test_split_by_word_with_overlap(self):
splitter = DocumentSplitter(split_by="word", split_length=10, split_overlap=2)
result = splitter.run(
Expand Down

0 comments on commit a7ac4ed

Please sign in to comment.