preprocessors: review docstrings (#7219)
anakin87 committed Feb 27, 2024
1 parent 87727e4 commit 9b1d792
Showing 4 changed files with 58 additions and 45 deletions.
4 changes: 2 additions & 2 deletions docs/pydoc/config/preprocessors_api.yml
@@ -1,7 +1,7 @@
 loaders:
   - type: haystack_pydoc_tools.loaders.CustomPythonLoader
     search_path: [../../../haystack/components/preprocessors]
-    modules: ["document_cleaner", "document_splitter"]
+    modules: ["document_cleaner", "document_splitter", "text_cleaner"]
     ignore_when_discovered: ["__init__"]
 processors:
   - type: filter
@@ -13,7 +13,7 @@ processors:
   - type: crossref
 renderer:
   type: haystack_pydoc_tools.renderers.ReadmePreviewRenderer
-  excerpt: Normalizes white spaces, gets rid of headers and footers, cleans empty lines in your Documents, or splits them into smaller pieces.
+  excerpt: "Preprocess your Documents and texts: clean, split, and more."
   category_slug: haystack-api
   title: PreProcessors
   slug: preprocessors-api
57 changes: 31 additions & 26 deletions haystack/components/preprocessors/document_cleaner.py
@@ -13,21 +13,20 @@
 @component
 class DocumentCleaner:
     """
-    Makes text documents more readable by removing extra whitespaces, empty lines, specified substrings, regexes, page headers and footers (in this order).
-    This is useful for preparing the documents for further processing by LLMs.
-    Example usage in an indexing pipeline:
+    Cleans up text documents by removing extra whitespaces, empty lines, specified substrings, regexes,
+    page headers and footers (in this order).
+
+    Usage example:
     ```python
-    document_store = InMemoryDocumentStore()
-    p = Pipeline()
-    p.add_component(instance=TextFileToDocument(), name="text_file_converter")
-    p.add_component(instance=DocumentCleaner(), name="cleaner")
-    p.add_component(instance=TextDocumentSplitter(split_by="sentence", split_length=1), name="splitter")
-    p.add_component(instance=DocumentWriter(document_store=document_store), name="writer")
-    p.connect("text_file_converter.documents", "cleaner.documents")
-    p.connect("cleaner.documents", "splitter.documents")
-    p.connect("splitter.documents", "writer.documents")
+    from haystack import Document
+    from haystack.components.preprocessors import DocumentCleaner
+
+    doc = Document(content="This is a document to clean\\n\\n\\nsubstring to remove")
+    cleaner = DocumentCleaner(remove_substrings = ["substring to remove"])
+    result = cleaner.run(documents=[doc])
+    assert result["documents"][0].content == "This is a document to clean "
     ```
     """

@@ -43,8 +42,8 @@ def __init__(
         :param remove_empty_lines: Whether to remove empty lines.
         :param remove_extra_whitespaces: Whether to remove extra whitespaces.
         :param remove_repeated_substrings: Whether to remove repeated substrings (headers/footers) from pages.
-            Pages in the text need to be separated by form feed character "\f",
-            which is supported by TextFileToDocument and AzureOCRDocumentConverter.
+            Pages in the text need to be separated by form feed character "\\f",
+            which is supported by `TextFileToDocument` and `AzureOCRDocumentConverter`.
         :param remove_substrings: List of substrings to remove from the text.
         :param remove_regex: Regex to match and replace substrings by "".
         """
@@ -58,8 +57,14 @@ def __init__(
     @component.output_types(documents=List[Document])
     def run(self, documents: List[Document]):
         """
-        Run the DocumentCleaner on the given list of documents.
-        The documents' metadata remain unchanged.
+        Cleans up the documents.
+
+        :param documents: List of Documents to clean.
+
+        :returns: A dictionary with the following key:
+            - `documents`: List of cleaned Documents.
+
+        :raises TypeError: if documents is not a list of Documents.
         """
         if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
             raise TypeError("DocumentCleaner expects a List of Documents as input.")
@@ -94,7 +99,7 @@ def _remove_empty_lines(self, text: str) -> str:
         """
         Remove empty lines and lines that contain nothing but whitespaces from text.
         :param text: Text to clean.
-        :param return: The text without empty lines.
+        :returns: The text without empty lines.
         """
         lines = text.split("\n")
         non_empty_lines = filter(lambda line: line.strip() != "", lines)
@@ -104,7 +109,7 @@ def _remove_extra_whitespaces(self, text: str) -> str:
         """
         Remove extra whitespaces from text.
         :param text: Text to clean.
-        :param return: The text without extra whitespaces.
+        :returns: The text without extra whitespaces.
         """
         return re.sub(r"\s\s+", " ", text).strip()

@@ -113,7 +118,7 @@ def _remove_regex(self, text: str, regex: str) -> str:
         Remove substrings that match the specified regex from the text.
         :param text: Text to clean.
         :param regex: Regex to match and replace substrings by "".
-        :param return: The text without any substrings that match the regex.
+        :returns: The text without the substrings that match the regex.
         """
         return re.sub(regex, "", text).strip()

@@ -122,7 +127,7 @@ def _remove_substrings(self, text: str, substrings: List[str]) -> str:
         Remove all specified substrings from the text.
         :param text: Text to clean.
         :param substrings: Substrings to remove.
-        :return: The text without the specified substrings.
+        :returns: The text without the specified substrings.
         """
         for substring in substrings:
             text = text.replace(substring, "")
@@ -133,7 +138,7 @@ def _remove_repeated_substrings(self, text: str) -> str:
         Remove any substrings from the text that occur repeatedly on every page. For example headers or footers.
         Pages in the text need to be separated by form feed character "\f".
         :param text: Text to clean.
-        :return: The text without the repeated substrings.
+        :returns: The text without the repeated substrings.
         """
         return self._find_and_remove_header_footer(
             text, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
@@ -152,7 +157,7 @@ def _find_and_remove_header_footer(
         :param n_chars: The number of first/last characters where the header/footer shall be searched in.
         :param n_first_pages_to_ignore: The number of first pages to ignore (e.g. TOCs often don't contain footer/header).
         :param n_last_pages_to_ignore: The number of last pages to ignore.
-        :return: The text without the found headers and footers.
+        :returns: The text without the found headers and footers.
         """

         pages = text.split("\f")
@@ -178,7 +183,7 @@ def _ngram(self, seq: str, n: int) -> Generator[str, None, None]:
         Return all ngrams of length n from a text sequence. Each ngram consists of n words split by whitespace.
         :param seq: The sequence to generate ngrams from.
         :param n: The length of the ngrams to generate.
-        :return: A Generator generating all ngrams of length n from the given sequence.
+        :returns: A Generator generating all ngrams of length n from the given sequence.
         """

         # In order to maintain the original whitespace, but still consider \n and \t for n-gram tokenization,
@@ -201,7 +206,7 @@ def _allngram(self, seq: str, min_ngram: int, max_ngram: int) -> Set[str]:
         :param seq: The sequence to generate ngrams from.
         :param min_ngram: The minimum length of ngram to consider.
         :param max_ngram: The maximum length of ngram to consider.
-        :return: A set of all ngrams from the given sequence.
+        :returns: A set of all ngrams from the given sequence.
         """
         lengths = range(min_ngram, max_ngram) if max_ngram else range(min_ngram, len(seq))
         ngrams = map(partial(self._ngram, seq), lengths)
@@ -217,7 +222,7 @@ def _find_longest_common_ngram(self, sequences: List[str], min_ngram: int = 3, m
         :param sequences: The list of strings that shall be searched for common n_grams.
         :param max_ngram: The maximum length of ngram to consider.
         :param min_ngram: The minimum length of ngram to consider.
-        :return: The longest ngram that all sequences have in common.
+        :returns: The longest ngram that all sequences have in common.
         """
         sequences = [s for s in sequences if s]  # filter empty sequences
         if not sequences:
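For context, a minimal, hypothetical sketch of the header/footer removal described in the docstrings above. Only `DocumentCleaner(remove_repeated_substrings=True)` and the `"\f"` page separator come from this diff; the sample pages and the expected effect are illustrative assumptions, not output from the commit.

```python
from haystack import Document
from haystack.components.preprocessors import DocumentCleaner

# Hypothetical input: five pages separated by the form feed character "\f",
# each repeating the same header line. Per _find_and_remove_header_footer
# above, the first and last pages are ignored when detecting the repetition.
pages = [f"ACME QUARTERLY REPORT\nPage {i} body text." for i in range(1, 6)]
doc = Document(content="\f".join(pages))

cleaner = DocumentCleaner(remove_repeated_substrings=True)
result = cleaner.run(documents=[doc])

# The repeated header should be stripped from the pages (illustrative).
print(result["documents"][0].content)
```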
23 changes: 16 additions & 7 deletions haystack/components/preprocessors/document_splitter.py
@@ -10,7 +10,10 @@
 class DocumentSplitter:
     """
     Splits a list of text documents into a list of text documents with shorter texts.
-    This is useful for splitting documents with long texts that otherwise would not fit into the maximum text length of language models.
+
+    Splitting documents with long texts is a common preprocessing step during indexing.
+    This allows Embedders to create significant semantic representations
+    and avoids exceeding the maximum context length of language models.
     """

     def __init__(
@@ -21,7 +24,7 @@ def __init__(
     ):
         """
         :param split_by: The unit by which the document should be split. Choose from "word" for splitting by " ",
-            "sentence" for splitting by ".", "page" for splitting by "\f" or "passage" for splitting by "\\n\\n".
+            "sentence" for splitting by ".", "page" for splitting by "\\f" or "passage" for splitting by "\\n\\n".
         :param split_length: The maximum number of units in each split.
         :param split_overlap: The number of units that each split should overlap.
         """
@@ -39,12 +42,18 @@ def __init__(
     @component.output_types(documents=List[Document])
     def run(self, documents: List[Document]):
         """
-        Splits the documents by split_by after split_length units with an overlap of split_overlap units.
-        Returns a list of documents with the split texts.
-        A metadata field "source_id" is added to each document to keep track of the original document that was split.
-        Other metadata are copied from the original document.
+        Splits documents by the unit expressed in `split_by`, with a length of `split_length`
+        and an overlap of `split_overlap`.
+
         :param documents: The documents to split.
-        :return: A list of documents with the split texts.
+
+        :returns: A dictionary with the following key:
+            - `documents`: List of documents with the split texts. A metadata field "source_id" is added to each
+              document to keep track of the original document that was split. Other metadata are copied from the original
+              document.
+
+        :raises TypeError: if the input is not a list of Documents.
+        :raises ValueError: if the content of a document is None.
         """

         if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
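A brief usage sketch of the splitter behavior documented in the new `run` docstring. The component, its parameters, and the "source_id" metadata field appear in this diff; the sample text and the exact chunk boundaries are assumptions.

```python
from haystack import Document
from haystack.components.preprocessors import DocumentSplitter

doc = Document(content="Haystack is an open source framework. It helps you build LLM applications. Splitting keeps chunks small.")

# Split into one-sentence chunks with no overlap.
splitter = DocumentSplitter(split_by="sentence", split_length=1, split_overlap=0)
result = splitter.run(documents=[doc])

for chunk in result["documents"]:
    # Each chunk's "source_id" metadata points back to the original document.
    print(chunk.meta["source_id"] == doc.id, repr(chunk.content))
```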
19 changes: 9 additions & 10 deletions haystack/components/preprocessors/text_cleaner.py
@@ -10,6 +10,7 @@ class TextCleaner:
     """
     A preprocessor component to clean text data. It can remove substrings matching a list of regular expressions,
     convert text to lowercase, remove punctuation, and remove numbers.
+    This is useful to cleanup text data before evaluation.
     """

@@ -21,13 +22,11 @@ def __init__(
         remove_numbers: bool = False,
     ):
         """
-        Creates a new instance of TextCleaner.
-
         :param remove_regexps: A list of regular expressions. If provided, it removes substrings
-            matching these regular expressions from the text. Defaults to None.
-        :param convert_to_lowercase: If True, converts all characters to lowercase. Defaults to False.
-        :param remove_punctuation: If True, removes punctuation from the text. Defaults to False.
-        :param remove_numbers: If True, removes numerical digits from the text. Defaults to False.
+            matching these regular expressions from the text.
+        :param convert_to_lowercase: If True, converts all characters to lowercase.
+        :param remove_punctuation: If True, removes punctuation from the text.
+        :param remove_numbers: If True, removes numerical digits from the text.
         """
         self._remove_regexps = remove_regexps
         self._convert_to_lowercase = convert_to_lowercase
@@ -47,12 +46,12 @@ def __init__(

     @component.output_types(texts=List[str])
     def run(self, texts: List[str]) -> Dict[str, Any]:
-        r"""
-        Run the TextCleaner on the given list of strings.
+        """
+        Cleans up the given list of strings.

         :param texts: List of strings to clean.
-        :returns: A dictionary with the following outputs:
-            * `texts` - The cleaned list of strings.
+        :returns: A dictionary with the following key:
+            - `texts`: the cleaned list of strings.
         """

         if self._regex:
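And a corresponding sketch for `TextCleaner`, using the flags documented above. The input string and the commented output are illustrative; exact whitespace in the cleaned text may differ.

```python
from haystack.components.preprocessors import TextCleaner

cleaner = TextCleaner(convert_to_lowercase=True, remove_punctuation=True, remove_numbers=True)
result = cleaner.run(texts=["ACME Corp. was founded in 2021!"])

print(result["texts"])  # e.g. ['acme corp was founded in']
```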
