feat: add page_number to metadata in DocumentSplitter (#7599)

* Add the implementation for page counting used in the v1.25.x branch. It should work as expected in issue #6705. * Add tests that reflect the desired behaviour. This behaviour is inferred from the one it had on Haystack 1.x. Solve some minor bugs spotted by tests. * Update docstrings. * Add reno. * Update haystack/components/preprocessors/document_splitter.py Update docstring from suggestion Co-authored-by: David S. Batista <[email protected]> * solve suggestion to improve readability * fragment tests * Update haystack/components/preprocessors/document_splitter.py Co-authored-by: David S. Batista <[email protected]> * Update .gitignore * Update .gitignore * Update add-page-number-to-document-splitter-162e9dc7443575f0.yaml * blackening --------- Co-authored-by: David S. Batista <[email protected]>
deepset-ai · Apr 29, 2024 · d2c87b2 · d2c87b2
1 parent 8d04e53
commit d2c87b2
Show file tree

Hide file tree

Showing 3 changed files with 136 additions and 7 deletions.
diff --git a/haystack/components/preprocessors/document_splitter.py b/haystack/components/preprocessors/document_splitter.py
@@ -1,5 +1,5 @@
 from copy import deepcopy
-from typing import List, Literal
+from typing import Dict, List, Literal, Tuple
 
 from more_itertools import windowed
 
@@ -53,7 +53,7 @@ def run(self, documents: List[Document]):
 
  :returns: A dictionary with the following key:
  - `documents`: List of documents with the split texts. A metadata field "source_id" is added to each
- document to keep track of the original document that was split. Other metadata are copied from the original
+ document to keep track of the original document that was split. Another metadata field "page_number" is added to each document to keep track of the page it belonged to in the original document. Other metadata are copied from the original
  document.
 
  :raises TypeError: if the input is not a list of Documents.
@@ -70,10 +70,12 @@ def run(self, documents: List[Document]):
  f"DocumentSplitter only works with text documents but document.content for document ID {doc.id} is None."
  )
  units = self._split_into_units(doc.content, self.split_by)
- text_splits = self._concatenate_units(units, self.split_length, self.split_overlap)
+ text_splits, splits_pages = self._concatenate_units(units, self.split_length, self.split_overlap)
  metadata = deepcopy(doc.meta)
  metadata["source_id"] = doc.id
- split_docs += [Document(content=txt, meta=metadata) for txt in text_splits]
+ split_docs += self._create_docs_from_splits(
+ text_splits=text_splits, splits_pages=splits_pages, meta=metadata
+ )
  return {"documents": split_docs}
 
  def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "passage", "page"]) -> List[str]:
@@ -95,15 +97,40 @@ def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "pa
  units[i] += split_at
  return units
 
- def _concatenate_units(self, elements: List[str], split_length: int, split_overlap: int) -> List[str]:
+ def _concatenate_units(
+ self, elements: List[str], split_length: int, split_overlap: int
+ ) -> Tuple[List[str], List[int]]:
  """
- Concatenates the elements into parts of split_length units.
+ Concatenates the elements into parts of split_length units, keeping track of the original page number that each element belongs to.
  """
  text_splits = []
+ splits_pages = []
+ cur_page = 1
  segments = windowed(elements, n=split_length, step=split_length - split_overlap)
  for seg in segments:
  current_units = [unit for unit in seg if unit is not None]
  txt = "".join(current_units)
  if len(txt) > 0:
  text_splits.append(txt)
- return text_splits
+ splits_pages.append(cur_page)
+ processed_units = current_units[: split_length - split_overlap]
+ if self.split_by == "page":
+ num_page_breaks = len(processed_units)
+ else:
+ num_page_breaks = sum(processed_unit.count("\f") for processed_unit in processed_units)
+ cur_page += num_page_breaks
+ return text_splits, splits_pages
+
+ @staticmethod
+ def _create_docs_from_splits(text_splits: List[str], splits_pages: List[int], meta: Dict) -> List[Document]:
+ """
+ Creates Document objects from text splits enriching them with page number and the metadata of the original document.
+ """
+ documents: List[Document] = []
+
+ for i, txt in enumerate(text_splits):
+ meta = deepcopy(meta)
+ doc = Document(content=txt, meta=meta)
+ doc.meta["page_number"] = splits_pages[i]
+ documents.append(doc)
+ return documents
diff --git a/releasenotes/notes/add-page-number-to-document-splitter-162e9dc7443575f0.yaml b/releasenotes/notes/add-page-number-to-document-splitter-162e9dc7443575f0.yaml
@@ -0,0 +1,7 @@
+---
+highlights: >
+ Add the "page_number" field to the metadata of all output documents.
+
+enhancements:
+ - |
+ Now the DocumentSplitter adds the "page_number" field to the metadata of all output documents to keep track of the page of the original document it belongs to.
diff --git a/test/components/preprocessors/test_document_splitter.py b/test/components/preprocessors/test_document_splitter.py
@@ -141,3 +141,98 @@ def test_copy_metadata(self):
  for doc, split_doc in zip(documents, result["documents"]):
  assert doc.meta.items() <= split_doc.meta.items()
  assert split_doc.content == "Text."
+
+ def test_add_page_number_to_metadata_with_no_overlap_word_split(self):
+ splitter = DocumentSplitter(split_by="word", split_length=2)
+ doc1 = Document(content="This is some text.\f This text is on another page.")
+ doc2 = Document(content="This content has two.\f\f page brakes.")
+ result = splitter.run(documents=[doc1, doc2])
+
+ expected_pages = [1, 1, 2, 2, 2, 1, 1, 3]
+ for doc, p in zip(result["documents"], expected_pages):
+ assert doc.meta["page_number"] == p
+
+ def test_add_page_number_to_metadata_with_no_overlap_sentence_split(self):
+ splitter = DocumentSplitter(split_by="sentence", split_length=1)
+ doc1 = Document(content="This is some text.\f This text is on another page.")
+ doc2 = Document(content="This content has two.\f\f page brakes.")
+ result = splitter.run(documents=[doc1, doc2])
+
+ expected_pages = [1, 1, 1, 1]
+ for doc, p in zip(result["documents"], expected_pages):
+ assert doc.meta["page_number"] == p
+
+ def test_add_page_number_to_metadata_with_no_overlap_passage_split(self):
+ splitter = DocumentSplitter(split_by="passage", split_length=1)
+ doc1 = Document(
+ content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage."
+ )
+ result = splitter.run(documents=[doc1])
+
+ expected_pages = [1, 2, 2, 2]
+ for doc, p in zip(result["documents"], expected_pages):
+ assert doc.meta["page_number"] == p
+
+ def test_add_page_number_to_metadata_with_no_overlap_page_split(self):
+ splitter = DocumentSplitter(split_by="page", split_length=1)
+ doc1 = Document(
+ content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
+ )
+ result = splitter.run(documents=[doc1])
+ expected_pages = [1, 2, 3]
+ for doc, p in zip(result["documents"], expected_pages):
+ assert doc.meta["page_number"] == p
+
+ splitter = DocumentSplitter(split_by="page", split_length=2)
+ doc1 = Document(
+ content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
+ )
+ result = splitter.run(documents=[doc1])
+ expected_pages = [1, 3]
+
+ for doc, p in zip(result["documents"], expected_pages):
+ assert doc.meta["page_number"] == p
+
+ def test_add_page_number_to_metadata_with_overlap_word_split(self):
+ splitter = DocumentSplitter(split_by="word", split_length=3, split_overlap=1)
+ doc1 = Document(content="This is some text. And\f this text is on another page.")
+ doc2 = Document(content="This content has two.\f\f page brakes.")
+ result = splitter.run(documents=[doc1, doc2])
+
+ expected_pages = [1, 1, 1, 2, 2, 1, 1, 3]
+ for doc, p in zip(result["documents"], expected_pages):
+ print(doc.content, doc.meta, p)
+ assert doc.meta["page_number"] == p
+
+ def test_add_page_number_to_metadata_with_overlap_sentence_split(self):
+ splitter = DocumentSplitter(split_by="sentence", split_length=2, split_overlap=1)
+ doc1 = Document(content="This is some text. And this is more text.\f This text is on another page. End.")
+ doc2 = Document(content="This content has two.\f\f page brakes. More text.")
+ result = splitter.run(documents=[doc1, doc2])
+
+ expected_pages = [1, 1, 1, 2, 1, 1]
+ for doc, p in zip(result["documents"], expected_pages):
+ print(doc.content, doc.meta, p)
+ assert doc.meta["page_number"] == p
+
+ def test_add_page_number_to_metadata_with_overlap_passage_split(self):
+ splitter = DocumentSplitter(split_by="passage", split_length=2, split_overlap=1)
+ doc1 = Document(
+ content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage."
+ )
+ result = splitter.run(documents=[doc1])
+
+ expected_pages = [1, 2, 2]
+ for doc, p in zip(result["documents"], expected_pages):
+ assert doc.meta["page_number"] == p
+
+ def test_add_page_number_to_metadata_with_overlap_page_split(self):
+ splitter = DocumentSplitter(split_by="page", split_length=2, split_overlap=1)
+ doc1 = Document(
+ content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
+ )
+ result = splitter.run(documents=[doc1])
+ expected_pages = [1, 2, 3]
+
+ for doc, p in zip(result["documents"], expected_pages):
+ assert doc.meta["page_number"] == p