deepset-ai · sjrl · Jan 17, 2024 · Jan 17, 2024 · Jan 17, 2024 · Jan 17, 2024
@@ -14,17 +14,20 @@ class DocumentSplitter:
  """
 
  def __init__(
- self, split_by: Literal["word", "sentence", "passage"] = "word", split_length: int = 200, split_overlap: int = 0
+ self,
+ split_by: Literal["word", "sentence", "page", "passage"] = "word",
+ split_length: int = 200,
+ split_overlap: int = 0,
  ):
  """
  :param split_by: The unit by which the document should be split. Choose from "word" for splitting by " ",
- "sentence" for splitting by ".", or "passage" for splitting by "\\n\\n".
+ "sentence" for splitting by ".", "page" for splitting by "\\f", or "passage" for splitting by "\\n\\n".
  :param split_length: The maximum number of units in each split.
  :param split_overlap: The number of units that each split should overlap.
  """
 
  self.split_by = split_by
- if split_by not in ["word", "sentence", "passage"]:
+ if split_by not in ["word", "sentence", "page", "passage"]:
  raise ValueError("split_by must be one of 'word', 'sentence' or 'passage'.")
  if split_length <= 0:
  raise ValueError("split_length must be greater than 0.")
@@ -60,16 +63,18 @@ def run(self, documents: List[Document]):
  split_docs += [Document(content=txt, meta=metadata) for txt in text_splits]
  return {"documents": split_docs}
 
- def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "passage"]) -> List[str]:
- if split_by == "passage":
+ def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "passage", "page"]) -> List[str]:
+ if split_by == "page":
+ split_at = "\f"
+ elif split_by == "passage":
  split_at = "\n\n"
  elif split_by == "sentence":
  split_at = "."
  elif split_by == "word":
  split_at = " "
  else:
  raise NotImplementedError(
- "DocumentSplitter only supports 'passage', 'sentence' or 'word' split_by options."
+ "DocumentSplitter only supports 'page', 'passage', 'sentence' or 'word' split_by options."
  )
  units = text.split(split_at)
  # Add the delimiter back to all units except the last one

@@ -0,0 +1,4 @@
+---
+enhancements:
+ - |
+ Added a "page" option to the `split_by` parameter of DocumentSplitter, which splits the document at the form feed character ("\f").