Add the implementation for page counting used in the v1.25.x branch. …

…It should work as expected in issue #6705.
deepset-ai · davidsbatista · Apr 29, 2024 · Apr 25, 2024 · Apr 25, 2024 · Apr 25, 2024
commit 9c1c153aa627fc296fea8c6cb069c549056072c2
@@ -1,5 +1,5 @@
 from copy import deepcopy
-from typing import List, Literal
+from typing import Dict, List, Literal, Tuple
 
 from more_itertools import windowed
 
@@ -70,10 +70,10 @@ def run(self, documents: List[Document]):
  f"DocumentSplitter only works with text documents but document.content for document ID {doc.id} is None."
  )
  units = self._split_into_units(doc.content, self.split_by)
- text_splits = self._concatenate_units(units, self.split_length, self.split_overlap)
+ text_splits, splits_pages = self._concatenate_units(units, self.split_length, self.split_overlap)
  metadata = deepcopy(doc.meta)
  metadata["source_id"] = doc.id
- split_docs += [Document(content=txt, meta=metadata) for txt in text_splits]
+ split_docs += self._create_docs_from_splits(text_splits=text_splits, splits_pages=splits_pages)
  return {"documents": split_docs}
 
  def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "passage", "page"]) -> List[str]:
@@ -95,15 +95,39 @@ def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "pa
  units[i] += split_at
  return units
 
- def _concatenate_units(self, elements: List[str], split_length: int, split_overlap: int) -> List[str]:
+ def _concatenate_units(
+ self, elements: List[str], split_length: int, split_overlap: int
+ ) -> Tuple[List[str], List[int]]:
  """
  Concatenates the elements into parts of split_length units.
  """
  text_splits = []
+ splits_pages = []
+ cur_page = 1
  segments = windowed(elements, n=split_length, step=split_length - split_overlap)
  for seg in segments:
  current_units = [unit for unit in seg if unit is not None]
  txt = "".join(current_units)
  if len(txt) > 0:
  text_splits.append(txt)
- return text_splits
+ splits_pages.append(cur_page)
+ processed_units = current_units[: split_length - split_overlap]
+ if self.split_by != "page":
+ num_page_breaks = sum(processed_unit.count("\f") for processed_unit in processed_units)
+ else:
+ num_page_breaks = len(processed_units)
+ cur_page += num_page_breaks
+ return text_splits, splits_pages
+
+ def _create_docs_from_splits(self, text_splits: List[str], splits_pages: List[int], meta: Dict) -> List[Document]:
+ """
+ Creates Document objects from text splits enriching them with page number and the metadata of the original document.
+ """
+ documents: List[Document] = []
+
+ for i, txt in enumerate(text_splits):
+ meta = deepcopy(meta)
+ doc = Document(content=txt, meta=meta)
+ doc.meta["page"] = splits_pages[i]
+ documents.append(doc)
+ return documents