Skip to content

Commit

Permalink
feat: add page_number to metadata in DocumentSplitter (#7599)
Browse files Browse the repository at this point in the history
* Add the implementation for page counting used in the v1.25.x branch. It should work as expected in issue #6705.

* Add tests that reflect the desired behabiour. This behabiour is inffered from the one it had on Haystack 1.x
Solve some minor bugs spotted by tests.

* Update docstrings.

* Add reno.

* Update haystack/components/preprocessors/document_splitter.py

Update docstring from suggestion

Co-authored-by: David S. Batista <[email protected]>

* solve suggestion to improve readability

* fragment tests

* Update haystack/components/preprocessors/document_splitter.py

Co-authored-by: David S. Batista <[email protected]>

* Update .gitignore

* Update .gitignore

* Update add-page-number-to-document-splitter-162e9dc7443575f0.yaml

* blackening

---------

Co-authored-by: David S. Batista <[email protected]>
  • Loading branch information
CarlosFerLo and davidsbatista committed Apr 29, 2024
1 parent 8d04e53 commit d2c87b2
Show file tree
Hide file tree
Showing 3 changed files with 136 additions and 7 deletions.
41 changes: 34 additions & 7 deletions haystack/components/preprocessors/document_splitter.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from copy import deepcopy
from typing import List, Literal
from typing import Dict, List, Literal, Tuple

from more_itertools import windowed

Expand Down Expand Up @@ -53,7 +53,7 @@ def run(self, documents: List[Document]):
:returns: A dictionary with the following key:
- `documents`: List of documents with the split texts. A metadata field "source_id" is added to each
document to keep track of the original document that was split. Other metadata are copied from the original
document to keep track of the original document that was split. Another metadata field "page_number" is added to each number to keep track of the page it belonged to in the original document. Other metadata are copied from the original
document.
:raises TypeError: if the input is not a list of Documents.
Expand All @@ -70,10 +70,12 @@ def run(self, documents: List[Document]):
f"DocumentSplitter only works with text documents but document.content for document ID {doc.id} is None."
)
units = self._split_into_units(doc.content, self.split_by)
text_splits = self._concatenate_units(units, self.split_length, self.split_overlap)
text_splits, splits_pages = self._concatenate_units(units, self.split_length, self.split_overlap)
metadata = deepcopy(doc.meta)
metadata["source_id"] = doc.id
split_docs += [Document(content=txt, meta=metadata) for txt in text_splits]
split_docs += self._create_docs_from_splits(
text_splits=text_splits, splits_pages=splits_pages, meta=metadata
)
return {"documents": split_docs}

def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "passage", "page"]) -> List[str]:
Expand All @@ -95,15 +97,40 @@ def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "pa
units[i] += split_at
return units

def _concatenate_units(self, elements: List[str], split_length: int, split_overlap: int) -> List[str]:
def _concatenate_units(
self, elements: List[str], split_length: int, split_overlap: int
) -> Tuple[List[str], List[int]]:
"""
Concatenates the elements into parts of split_length units.
Concatenates the elements into parts of split_length units keeping track of the original page number that each element belongs.
"""
text_splits = []
splits_pages = []
cur_page = 1
segments = windowed(elements, n=split_length, step=split_length - split_overlap)
for seg in segments:
current_units = [unit for unit in seg if unit is not None]
txt = "".join(current_units)
if len(txt) > 0:
text_splits.append(txt)
return text_splits
splits_pages.append(cur_page)
processed_units = current_units[: split_length - split_overlap]
if self.split_by == "page":
num_page_breaks = len(processed_units)
else:
num_page_breaks = sum(processed_unit.count("\f") for processed_unit in processed_units)
cur_page += num_page_breaks
return text_splits, splits_pages

@staticmethod
def _create_docs_from_splits(text_splits: List[str], splits_pages: List[int], meta: Dict) -> List[Document]:
"""
Creates Document objects from text splits enriching them with page number and the metadata of the original document.
"""
documents: List[Document] = []

for i, txt in enumerate(text_splits):
meta = deepcopy(meta)
doc = Document(content=txt, meta=meta)
doc.meta["page_number"] = splits_pages[i]
documents.append(doc)
return documents
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
highlights: >
Add the "page_number" field to the metadata of all output documents.
enhancements:
- |
Now the DocumentSplitter adds the "page_number" field to the metadata of all output documents to keep track of the page of the original document it belongs to.
95 changes: 95 additions & 0 deletions test/components/preprocessors/test_document_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,3 +141,98 @@ def test_copy_metadata(self):
for doc, split_doc in zip(documents, result["documents"]):
assert doc.meta.items() <= split_doc.meta.items()
assert split_doc.content == "Text."

def test_add_page_number_to_metadata_with_no_overlap_word_split(self):
splitter = DocumentSplitter(split_by="word", split_length=2)
doc1 = Document(content="This is some text.\f This text is on another page.")
doc2 = Document(content="This content has two.\f\f page brakes.")
result = splitter.run(documents=[doc1, doc2])

expected_pages = [1, 1, 2, 2, 2, 1, 1, 3]
for doc, p in zip(result["documents"], expected_pages):
assert doc.meta["page_number"] == p

def test_add_page_number_to_metadata_with_no_overlap_sentence_split(self):
splitter = DocumentSplitter(split_by="sentence", split_length=1)
doc1 = Document(content="This is some text.\f This text is on another page.")
doc2 = Document(content="This content has two.\f\f page brakes.")
result = splitter.run(documents=[doc1, doc2])

expected_pages = [1, 1, 1, 1]
for doc, p in zip(result["documents"], expected_pages):
assert doc.meta["page_number"] == p

def test_add_page_number_to_metadata_with_no_overlap_passage_split(self):
splitter = DocumentSplitter(split_by="passage", split_length=1)
doc1 = Document(
content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage."
)
result = splitter.run(documents=[doc1])

expected_pages = [1, 2, 2, 2]
for doc, p in zip(result["documents"], expected_pages):
assert doc.meta["page_number"] == p

def test_add_page_number_to_metadata_with_no_overlap_page_split(self):
splitter = DocumentSplitter(split_by="page", split_length=1)
doc1 = Document(
content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
)
result = splitter.run(documents=[doc1])
expected_pages = [1, 2, 3]
for doc, p in zip(result["documents"], expected_pages):
assert doc.meta["page_number"] == p

splitter = DocumentSplitter(split_by="page", split_length=2)
doc1 = Document(
content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
)
result = splitter.run(documents=[doc1])
expected_pages = [1, 3]

for doc, p in zip(result["documents"], expected_pages):
assert doc.meta["page_number"] == p

def test_add_page_number_to_metadata_with_overlap_word_split(self):
splitter = DocumentSplitter(split_by="word", split_length=3, split_overlap=1)
doc1 = Document(content="This is some text. And\f this text is on another page.")
doc2 = Document(content="This content has two.\f\f page brakes.")
result = splitter.run(documents=[doc1, doc2])

expected_pages = [1, 1, 1, 2, 2, 1, 1, 3]
for doc, p in zip(result["documents"], expected_pages):
print(doc.content, doc.meta, p)
assert doc.meta["page_number"] == p

def test_add_page_number_to_metadata_with_overlap_sentence_split(self):
splitter = DocumentSplitter(split_by="sentence", split_length=2, split_overlap=1)
doc1 = Document(content="This is some text. And this is more text.\f This text is on another page. End.")
doc2 = Document(content="This content has two.\f\f page brakes. More text.")
result = splitter.run(documents=[doc1, doc2])

expected_pages = [1, 1, 1, 2, 1, 1]
for doc, p in zip(result["documents"], expected_pages):
print(doc.content, doc.meta, p)
assert doc.meta["page_number"] == p

def test_add_page_number_to_metadata_with_overlap_passage_split(self):
splitter = DocumentSplitter(split_by="passage", split_length=2, split_overlap=1)
doc1 = Document(
content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage."
)
result = splitter.run(documents=[doc1])

expected_pages = [1, 2, 2]
for doc, p in zip(result["documents"], expected_pages):
assert doc.meta["page_number"] == p

def test_add_page_number_to_metadata_with_overlap_page_split(self):
splitter = DocumentSplitter(split_by="page", split_length=2, split_overlap=1)
doc1 = Document(
content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
)
result = splitter.run(documents=[doc1])
expected_pages = [1, 2, 3]

for doc, p in zip(result["documents"], expected_pages):
assert doc.meta["page_number"] == p

0 comments on commit d2c87b2

Please sign in to comment.