preprocessors: review docstrings (#7219)
anakin87 committed Feb 27, 2024
1 parent 87727e4 commit 9b1d792
Showing 4 changed files with 58 additions and 45 deletions.
4 changes: 2 additions & 2 deletions docs/pydoc/config/preprocessors_api.yml
@@ -1,7 +1,7 @@
 loaders:
   - type: haystack_pydoc_tools.loaders.CustomPythonLoader
     search_path: [../../../haystack/components/preprocessors]
-    modules: ["document_cleaner", "document_splitter"]
+    modules: ["document_cleaner", "document_splitter", "text_cleaner"]
     ignore_when_discovered: ["__init__"]
 processors:
   - type: filter
@@ -13,7 +13,7 @@ processors:
   - type: crossref
 renderer:
   type: haystack_pydoc_tools.renderers.ReadmePreviewRenderer
-  excerpt: Normalizes white spaces, gets rid of headers and footers, cleans empty lines in your Documents, or splits them into smaller pieces.
+  excerpt: "Preprocess your Documents and texts: clean, split, and more."
   category_slug: haystack-api
   title: PreProcessors
   slug: preprocessors-api
57 changes: 31 additions & 26 deletions haystack/components/preprocessors/document_cleaner.py
@@ -13,21 +13,20 @@
 @component
 class DocumentCleaner:
     """
-    Makes text documents more readable by removing extra whitespaces, empty lines, specified substrings, regexes, page headers and footers (in this order).
-    This is useful for preparing the documents for further processing by LLMs.
-    Example usage in an indexing pipeline:
+    Cleans up text documents by removing extra whitespaces, empty lines, specified substrings, regexes,
+    page headers and footers (in this order).
+
+    Usage example:
     ```python
-    document_store = InMemoryDocumentStore()
-    p = Pipeline()
-    p.add_component(instance=TextFileToDocument(), name="text_file_converter")
-    p.add_component(instance=DocumentCleaner(), name="cleaner")
-    p.add_component(instance=TextDocumentSplitter(split_by="sentence", split_length=1), name="splitter")
-    p.add_component(instance=DocumentWriter(document_store=document_store), name="writer")
-    p.connect("text_file_converter.documents", "cleaner.documents")
-    p.connect("cleaner.documents", "splitter.documents")
-    p.connect("splitter.documents", "writer.documents")
+    from haystack import Document
+    from haystack.components.preprocessors import DocumentCleaner
+
+    doc = Document(content="This is a document to clean\\n\\n\\nsubstring to remove")
+    cleaner = DocumentCleaner(remove_substrings = ["substring to remove"])
+    result = cleaner.run(documents=[doc])
+    assert result["documents"][0].content == "This is a document to clean "
     ```
     """

@@ -43,8 +42,8 @@ def __init__(
         :param remove_empty_lines: Whether to remove empty lines.
         :param remove_extra_whitespaces: Whether to remove extra whitespaces.
         :param remove_repeated_substrings: Whether to remove repeated substrings (headers/footers) from pages.
-            Pages in the text need to be separated by form feed character "\f",
-            which is supported by TextFileToDocument and AzureOCRDocumentConverter.
+            Pages in the text need to be separated by form feed character "\\f",
+            which is supported by `TextFileToDocument` and `AzureOCRDocumentConverter`.
         :param remove_substrings: List of substrings to remove from the text.
         :param remove_regex: Regex to match and replace substrings by "".
         """
@@ -58,8 +57,14 @@ def __init__(
     @component.output_types(documents=List[Document])
     def run(self, documents: List[Document]):
         """
-        Run the DocumentCleaner on the given list of documents.
-        The documents' metadata remain unchanged.
+        Cleans up the documents.
+
+        :param documents: List of Documents to clean.
+
+        :returns: A dictionary with the following key:
+            - `documents`: List of cleaned Documents.
+
+        :raises TypeError: if documents is not a list of Documents.
         """
         if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
             raise TypeError("DocumentCleaner expects a List of Documents as input.")
@@ -94,7 +99,7 @@ def _remove_empty_lines(self, text: str) -> str:
         """
         Remove empty lines and lines that contain nothing but whitespaces from text.
         :param text: Text to clean.
-        :param return: The text without empty lines.
+        :returns: The text without empty lines.
         """
         lines = text.split("\n")
         non_empty_lines = filter(lambda line: line.strip() != "", lines)
@@ -104,7 +109,7 @@ def _remove_extra_whitespaces(self, text: str) -> str:
         """
         Remove extra whitespaces from text.
         :param text: Text to clean.
-        :param return: The text without extra whitespaces.
+        :returns: The text without extra whitespaces.
         """
         return re.sub(r"\s\s+", " ", text).strip()

@@ -113,7 +118,7 @@ def _remove_regex(self, text: str, regex: str) -> str:
         Remove substrings that match the specified regex from the text.
         :param text: Text to clean.
         :param regex: Regex to match and replace substrings by "".
-        :param return: The text without any substrings that match the regex.
+        :returns: The text without the substrings that match the regex.
         """
         return re.sub(regex, "", text).strip()

@@ -122,7 +127,7 @@ def _remove_substrings(self, text: str, substrings: List[str]) -> str:
         Remove all specified substrings from the text.
         :param text: Text to clean.
         :param substrings: Substrings to remove.
-        :return: The text without the specified substrings.
+        :returns: The text without the specified substrings.
         """
         for substring in substrings:
             text = text.replace(substring, "")
@@ -133,7 +138,7 @@ def _remove_repeated_substrings(self, text: str) -> str:
         Remove any substrings from the text that occur repeatedly on every page. For example headers or footers.
         Pages in the text need to be separated by form feed character "\f".
         :param text: Text to clean.
-        :return: The text without the repeated substrings.
+        :returns: The text without the repeated substrings.
         """
         return self._find_and_remove_header_footer(
             text, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
@@ -152,7 +157,7 @@ def _find_and_remove_header_footer(
         :param n_chars: The number of first/last characters where the header/footer shall be searched in.
         :param n_first_pages_to_ignore: The number of first pages to ignore (e.g. TOCs often don't contain footer/header).
         :param n_last_pages_to_ignore: The number of last pages to ignore.
-        :return: The text without the found headers and footers.
+        :returns: The text without the found headers and footers.
         """

         pages = text.split("\f")
@@ -178,7 +183,7 @@ def _ngram(self, seq: str, n: int) -> Generator[str, None, None]:
         Return all ngrams of length n from a text sequence. Each ngram consists of n words split by whitespace.
         :param seq: The sequence to generate ngrams from.
         :param n: The length of the ngrams to generate.
-        :return: A Generator generating all ngrams of length n from the given sequence.
+        :returns: A Generator generating all ngrams of length n from the given sequence.
         """

         # In order to maintain the original whitespace, but still consider \n and \t for n-gram tokenization,
@@ -201,7 +206,7 @@ def _allngram(self, seq: str, min_ngram: int, max_ngram: int) -> Set[str]:
         :param seq: The sequence to generate ngrams from.
         :param min_ngram: The minimum length of ngram to consider.
         :param max_ngram: The maximum length of ngram to consider.
-        :return: A set of all ngrams from the given sequence.
+        :returns: A set of all ngrams from the given sequence.
         """
         lengths = range(min_ngram, max_ngram) if max_ngram else range(min_ngram, len(seq))
         ngrams = map(partial(self._ngram, seq), lengths)
@@ -217,7 +222,7 @@ def _find_longest_common_ngram(self, sequences: List[str], min_ngram: int = 3, m
         :param sequences: The list of strings that shall be searched for common n_grams.
         :param max_ngram: The maximum length of ngram to consider.
         :param min_ngram: The minimum length of ngram to consider.
-        :return: The longest ngram that all sequences have in common.
+        :returns: The longest ngram that all sequences have in common.
         """
         sequences = [s for s in sequences if s]  # filter empty sequences
         if not sequences:
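For context, a minimal, hypothetical sketch of the header/footer removal described in the docstrings above. Only `DocumentCleaner(remove_repeated_substrings=True)` and the `"\f"` page separator come from this diff; the sample pages and the expected effect are illustrative assumptions, not output from the commit.

```python
from haystack import Document
from haystack.components.preprocessors import DocumentCleaner

# Hypothetical input: five pages separated by the form feed character "\f",
# each repeating the same header line. Per _find_and_remove_header_footer
# above, the first and last pages are ignored when detecting the repetition.
pages = [f"ACME QUARTERLY REPORT\nPage {i} body text." for i in range(1, 6)]
doc = Document(content="\f".join(pages))

cleaner = DocumentCleaner(remove_repeated_substrings=True)
result = cleaner.run(documents=[doc])

# The repeated header should be stripped from the pages (illustrative).
print(result["documents"][0].content)
```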
23 changes: 16 additions & 7 deletions haystack/components/preprocessors/document_splitter.py
@@ -10,7 +10,10 @@
 class DocumentSplitter:
     """
     Splits a list of text documents into a list of text documents with shorter texts.
-    This is useful for splitting documents with long texts that otherwise would not fit into the maximum text length of language models.
+
+    Splitting documents with long texts is a common preprocessing step during indexing.
+    This allows Embedders to create significant semantic representations
+    and avoids exceeding the maximum context length of language models.
     """

     def __init__(
@@ -21,7 +24,7 @@ def __init__(
     ):
         """
         :param split_by: The unit by which the document should be split. Choose from "word" for splitting by " ",
-            "sentence" for splitting by ".", "page" for splitting by "\f" or "passage" for splitting by "\\n\\n".
+            "sentence" for splitting by ".", "page" for splitting by "\\f" or "passage" for splitting by "\\n\\n".
         :param split_length: The maximum number of units in each split.
         :param split_overlap: The number of units that each split should overlap.
         """
@@ -39,12 +42,18 @@ def __init__(
     @component.output_types(documents=List[Document])
     def run(self, documents: List[Document]):
         """
-        Splits the documents by split_by after split_length units with an overlap of split_overlap units.
-        Returns a list of documents with the split texts.
-        A metadata field "source_id" is added to each document to keep track of the original document that was split.
-        Other metadata are copied from the original document.
+        Splits documents by the unit expressed in `split_by`, with a length of `split_length`
+        and an overlap of `split_overlap`.
+
         :param documents: The documents to split.
-        :return: A list of documents with the split texts.
+
+        :returns: A dictionary with the following key:
+            - `documents`: List of documents with the split texts. A metadata field "source_id" is added to each
+              document to keep track of the original document that was split. Other metadata are copied from the original
+              document.
+
+        :raises TypeError: if the input is not a list of Documents.
+        :raises ValueError: if the content of a document is None.
         """

         if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
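A brief usage sketch of the splitter behavior documented in the new `run` docstring. The component, its parameters, and the "source_id" metadata field appear in this diff; the sample text and the exact chunk boundaries are assumptions.

```python
from haystack import Document
from haystack.components.preprocessors import DocumentSplitter

doc = Document(content="Haystack is an open source framework. It helps you build LLM applications. Splitting keeps chunks small.")

# Split into one-sentence chunks with no overlap.
splitter = DocumentSplitter(split_by="sentence", split_length=1, split_overlap=0)
result = splitter.run(documents=[doc])

for chunk in result["documents"]:
    # Each chunk's "source_id" metadata points back to the original document.
    print(chunk.meta["source_id"] == doc.id, repr(chunk.content))
```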
19 changes: 9 additions & 10 deletions haystack/components/preprocessors/text_cleaner.py
@@ -10,6 +10,7 @@ class TextCleaner:
     """
     A preprocessor component to clean text data. It can remove substrings matching a list of regular expressions,
     convert text to lowercase, remove punctuation, and remove numbers.
+    This is useful to cleanup text data before evaluation.
     """

@@ -21,13 +22,11 @@ def __init__(
         remove_numbers: bool = False,
     ):
         """
-        Creates a new instance of TextCleaner.
-
         :param remove_regexps: A list of regular expressions. If provided, it removes substrings
-            matching these regular expressions from the text. Defaults to None.
-        :param convert_to_lowercase: If True, converts all characters to lowercase. Defaults to False.
-        :param remove_punctuation: If True, removes punctuation from the text. Defaults to False.
-        :param remove_numbers: If True, removes numerical digits from the text. Defaults to False.
+            matching these regular expressions from the text.
+        :param convert_to_lowercase: If True, converts all characters to lowercase.
+        :param remove_punctuation: If True, removes punctuation from the text.
+        :param remove_numbers: If True, removes numerical digits from the text.
         """
         self._remove_regexps = remove_regexps
         self._convert_to_lowercase = convert_to_lowercase
@@ -47,12 +46,12 @@ def __init__(

     @component.output_types(texts=List[str])
     def run(self, texts: List[str]) -> Dict[str, Any]:
-        r"""
-        Run the TextCleaner on the given list of strings.
+        """
+        Cleans up the given list of strings.

         :param texts: List of strings to clean.
-        :returns: A dictionary with the following outputs:
-            * `texts` - The cleaned list of strings.
+        :returns: A dictionary with the following key:
+            - `texts`: the cleaned list of strings.
         """

         if self._regex:
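And a corresponding sketch for `TextCleaner`, using the flags documented above. The input string and the commented output are illustrative; exact whitespace in the cleaned text may differ.

```python
from haystack.components.preprocessors import TextCleaner

cleaner = TextCleaner(convert_to_lowercase=True, remove_punctuation=True, remove_numbers=True)
result = cleaner.run(texts=["ACME Corp. was founded in 2021!"])

print(result["texts"])  # e.g. ['acme corp was founded in']
```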
