Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add page_number to metadata in DocumentSplitter #7599

Merged
merged 17 commits into from
Apr 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 34 additions & 7 deletions haystack/components/preprocessors/document_splitter.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from copy import deepcopy
from typing import List, Literal
from typing import Dict, List, Literal, Tuple

from more_itertools import windowed

Expand Down Expand Up @@ -53,7 +53,7 @@ def run(self, documents: List[Document]):

:returns: A dictionary with the following key:
- `documents`: List of documents with the split texts. A metadata field "source_id" is added to each
document to keep track of the original document that was split. Other metadata are copied from the original
document to keep track of the original document that was split. Another metadata field "page_number" is added to each document to keep track of the page it belonged to in the original document. Other metadata are copied from the original
document.

:raises TypeError: if the input is not a list of Documents.
Expand All @@ -70,10 +70,12 @@ def run(self, documents: List[Document]):
f"DocumentSplitter only works with text documents but document.content for document ID {doc.id} is None."
)
units = self._split_into_units(doc.content, self.split_by)
text_splits = self._concatenate_units(units, self.split_length, self.split_overlap)
text_splits, splits_pages = self._concatenate_units(units, self.split_length, self.split_overlap)
metadata = deepcopy(doc.meta)
metadata["source_id"] = doc.id
split_docs += [Document(content=txt, meta=metadata) for txt in text_splits]
split_docs += self._create_docs_from_splits(
text_splits=text_splits, splits_pages=splits_pages, meta=metadata
)
return {"documents": split_docs}

def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "passage", "page"]) -> List[str]:
Expand All @@ -95,15 +97,40 @@ def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "pa
units[i] += split_at
return units

def _concatenate_units(self, elements: List[str], split_length: int, split_overlap: int) -> List[str]:
def _concatenate_units(
self, elements: List[str], split_length: int, split_overlap: int
) -> Tuple[List[str], List[int]]:
"""
Concatenates the elements into parts of split_length units.
Concatenates the elements into parts of split_length units, keeping track of the page number on which each part starts.

:param elements: The units to concatenate.
:param split_length: The number of units in each part.
:param split_overlap: The number of units shared by two consecutive parts.
:returns: A tuple with the list of concatenated texts and the list of page numbers (1-based), one per text.
"""
text_splits = []
splits_pages = []
# Page numbering starts at 1 for the beginning of the text.
cur_page = 1
# Consecutive windows start split_length - split_overlap units apart.
segments = windowed(elements, n=split_length, step=split_length - split_overlap)
for seg in segments:
# Drop the None padding that windowed adds to a short final window.
current_units = [unit for unit in seg if unit is not None]
txt = "".join(current_units)
if len(txt) > 0:
text_splits.append(txt)
return text_splits
# Record the page on which this split starts, before counting the breaks it contains.
splits_pages.append(cur_page)
# Only the units before the next window's start advance the page counter;
# the remaining (overlapping) units are counted when the next window is processed.
processed_units = current_units[: split_length - split_overlap]
if self.split_by == "page":
# Each processed unit counts as one page break.
num_page_breaks = len(processed_units)
else:
# Count the form feed characters found in the processed units.
num_page_breaks = sum(processed_unit.count("\f") for processed_unit in processed_units)
cur_page += num_page_breaks
return text_splits, splits_pages

@staticmethod
def _create_docs_from_splits(text_splits: List[str], splits_pages: List[int], meta: Dict) -> List[Document]:
    """
    Builds one Document per text split, each carrying its own copy of the metadata plus a "page_number" entry.

    :param text_splits: The texts of the splits.
    :param splits_pages: The page number of each split, indexed like `text_splits`.
    :param meta: The metadata to copy into every created Document.
    :returns: The created Documents, in the order of `text_splits`.
    """
    split_docs: List[Document] = []
    current_meta = meta

    for position, content in enumerate(text_splits):
        # Every split gets a fresh deep copy, taken from the copy made for the previous split.
        current_meta = deepcopy(current_meta)
        split_doc = Document(content=content, meta=current_meta)
        # NOTE(review): the page number is attached after construction on purpose; passing it to the
        # constructor could change whatever Document derives from meta at creation time - confirm before refactoring.
        split_doc.meta["page_number"] = splits_pages[position]
        split_docs.append(split_doc)

    return split_docs
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
highlights: >
Add the "page_number" field to the metadata of all output documents.

enhancements:
- |
The DocumentSplitter now adds the "page_number" field to the metadata of every output document to keep track of the page of the original document that the split belongs to.
95 changes: 95 additions & 0 deletions test/components/preprocessors/test_document_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,3 +141,98 @@ def test_copy_metadata(self):
for doc, split_doc in zip(documents, result["documents"]):
assert doc.meta.items() <= split_doc.meta.items()
assert split_doc.content == "Text."

def test_add_page_number_to_metadata_with_no_overlap_word_split(self):
    # Word splits of length 2 without overlap over two documents containing form feed page breaks.
    splitter = DocumentSplitter(split_by="word", split_length=2)
    doc1 = Document(content="This is some text.\f This text is on another page.")
    doc2 = Document(content="This content has two.\f\f page brakes.")
    result = splitter.run(documents=[doc1, doc2])

    expected_pages = [1, 1, 2, 2, 2, 1, 1, 3]
    # Compare whole lists: zip() stops at the shorter sequence and would hide missing or extra splits.
    actual_pages = [doc.meta["page_number"] for doc in result["documents"]]
    assert actual_pages == expected_pages

def test_add_page_number_to_metadata_with_no_overlap_sentence_split(self):
    # Single-sentence splits without overlap over two documents containing form feed page breaks.
    splitter = DocumentSplitter(split_by="sentence", split_length=1)
    doc1 = Document(content="This is some text.\f This text is on another page.")
    doc2 = Document(content="This content has two.\f\f page brakes.")
    result = splitter.run(documents=[doc1, doc2])

    expected_pages = [1, 1, 1, 1]
    # Compare whole lists: zip() stops at the shorter sequence and would hide missing or extra splits.
    actual_pages = [doc.meta["page_number"] for doc in result["documents"]]
    assert actual_pages == expected_pages

def test_add_page_number_to_metadata_with_no_overlap_passage_split(self):
    # Single-passage splits without overlap over one document containing two form feed page breaks.
    splitter = DocumentSplitter(split_by="passage", split_length=1)
    doc1 = Document(
        content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage."
    )
    result = splitter.run(documents=[doc1])

    expected_pages = [1, 2, 2, 2]
    # Compare whole lists: zip() stops at the shorter sequence and would hide missing or extra splits.
    actual_pages = [doc.meta["page_number"] for doc in result["documents"]]
    assert actual_pages == expected_pages

def test_add_page_number_to_metadata_with_no_overlap_page_split(self):
    # Part 1: one page per split.
    splitter = DocumentSplitter(split_by="page", split_length=1)
    doc1 = Document(
        content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
    )
    result = splitter.run(documents=[doc1])
    expected_pages = [1, 2, 3]
    # Compare whole lists: zip() stops at the shorter sequence and would hide missing or extra splits.
    actual_pages = [doc.meta["page_number"] for doc in result["documents"]]
    assert actual_pages == expected_pages

    # Part 2: two pages per split, so the second split starts on the third page.
    splitter = DocumentSplitter(split_by="page", split_length=2)
    doc1 = Document(
        content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
    )
    result = splitter.run(documents=[doc1])
    expected_pages = [1, 3]

    actual_pages = [doc.meta["page_number"] for doc in result["documents"]]
    assert actual_pages == expected_pages

def test_add_page_number_to_metadata_with_overlap_word_split(self):
    # Word splits of length 3 overlapping by one word over two documents containing form feed page breaks.
    splitter = DocumentSplitter(split_by="word", split_length=3, split_overlap=1)
    doc1 = Document(content="This is some text. And\f this text is on another page.")
    doc2 = Document(content="This content has two.\f\f page brakes.")
    result = splitter.run(documents=[doc1, doc2])

    expected_pages = [1, 1, 1, 2, 2, 1, 1, 3]
    # Compare whole lists: zip() stops at the shorter sequence and would hide missing or extra splits.
    # On failure pytest prints both lists, so no debug print is needed.
    actual_pages = [doc.meta["page_number"] for doc in result["documents"]]
    assert actual_pages == expected_pages

def test_add_page_number_to_metadata_with_overlap_sentence_split(self):
    # Two-sentence splits overlapping by one sentence over two documents containing form feed page breaks.
    splitter = DocumentSplitter(split_by="sentence", split_length=2, split_overlap=1)
    doc1 = Document(content="This is some text. And this is more text.\f This text is on another page. End.")
    doc2 = Document(content="This content has two.\f\f page brakes. More text.")
    result = splitter.run(documents=[doc1, doc2])

    # NOTE(review): the trailing 3 is doc2's last split, which starts after both page breaks. The previous
    # zip()-based loop compared only the first six entries and never reached it - confirm against a test run.
    expected_pages = [1, 1, 1, 2, 1, 1, 3]
    # Compare whole lists: zip() stops at the shorter sequence and would hide missing or extra splits.
    # On failure pytest prints both lists, so no debug print is needed.
    actual_pages = [doc.meta["page_number"] for doc in result["documents"]]
    assert actual_pages == expected_pages

def test_add_page_number_to_metadata_with_overlap_passage_split(self):
    # Two-passage splits overlapping by one passage over one document containing two form feed page breaks.
    splitter = DocumentSplitter(split_by="passage", split_length=2, split_overlap=1)
    doc1 = Document(
        content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage."
    )
    result = splitter.run(documents=[doc1])

    expected_pages = [1, 2, 2]
    # Compare whole lists: zip() stops at the shorter sequence and would hide missing or extra splits.
    actual_pages = [doc.meta["page_number"] for doc in result["documents"]]
    assert actual_pages == expected_pages

def test_add_page_number_to_metadata_with_overlap_page_split(self):
    # Two-page splits overlapping by one page over a three-page document.
    splitter = DocumentSplitter(split_by="page", split_length=2, split_overlap=1)
    doc1 = Document(
        content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
    )
    result = splitter.run(documents=[doc1])
    # Three pages windowed with length 2 and step 1 give two splits, starting on pages 1 and 2.
    # NOTE(review): the previous expectation [1, 2, 3] had a third entry that zip() silently ignored -
    # confirm against a test run.
    expected_pages = [1, 2]

    # Compare whole lists: zip() stops at the shorter sequence and would hide missing or extra splits.
    actual_pages = [doc.meta["page_number"] for doc in result["documents"]]
    assert actual_pages == expected_pages