Skip to content

Commit

Permalink
docs: review DocumentLanguageClassifier docstrings (#7210)
Browse files Browse the repository at this point in the history
* review DocumentLanguageClassifier docstrings

* fix

* improve pydoc config
  • Loading branch information
anakin87 committed Feb 27, 2024
1 parent 2580e05 commit e194c08
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 17 deletions.
2 changes: 1 addition & 1 deletion docs/pydoc/config/classifiers_api.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ processors:
- type: crossref
renderer:
type: haystack_pydoc_tools.renderers.ReadmePreviewRenderer
excerpt: Detects the language of the Documents and routes them appropriately.
excerpt: Detects the language of the Documents and adds it to the metadata.
category_slug: haystack-api
title: Classifiers
slug: classifiers-api
Expand Down
49 changes: 34 additions & 15 deletions haystack/components/classifiers/document_language_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,32 +14,46 @@
class DocumentLanguageClassifier:
"""
Classify the language of documents and add the detected language to their metadata.
A MetadataRouter can then route them onto different output connections depending on their language.
This is useful to route documents to different models in a pipeline depending on their language.
A `MetadataRouter` can then route them onto different output connections depending on their language.
The set of supported languages can be specified.
For routing plain text using the same logic, use the related TextLanguageRouter component instead.
For routing plain text using the same logic, use the related `TextLanguageRouter` component instead.
Example usage within an indexing pipeline, storing in a Document Store
Usage example within an indexing pipeline, storing in a Document Store
only documents written in English:
```python
from haystack import Document, Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.classifiers import DocumentLanguageClassifier
from haystack.components.routers import MetadataRouter
from haystack.components.writers import DocumentWriter
docs = [Document(id="1", content="This is an English document"),
Document(id="2", content="Este es un documento en español")]
document_store = InMemoryDocumentStore()
p = Pipeline()
p.add_component(instance=TextFileToDocument(), name="text_file_converter")
p.add_component(instance=DocumentLanguageClassifier(), name="language_classifier")
p.add_component(instance=DocumentLanguageClassifier(languages=["en"]), name="language_classifier")
p.add_component(instance=MetadataRouter(rules={"en": {"language": {"$eq": "en"}}}), name="router")
p.add_component(instance=DocumentWriter(document_store=document_store), name="writer")
p.connect("text_file_converter.documents", "language_classifier.documents")
p.connect("language_classifier.documents", "router.documents")
p.connect("router.en", "writer.documents")
p.run({"language_classifier": {"documents": docs}})
written_docs = document_store.filter_documents()
assert len(written_docs) == 1
assert written_docs[0] == Document(id="1", content="This is an English document", meta={"language": "en"})
```
"""

def __init__(self, languages: Optional[List[str]] = None):
"""
:param languages: A list of languages in ISO code, each corresponding to a different output connection
(see [langdetect` documentation](https://github.com/Mimino666/langdetect#languages)).
By default, only ["en"] is supported and Documents of any other language are routed to "unmatched".
:param languages: A list of ISO language codes. A Document whose detected language is in this list gets that code stored in its `language` metadata field.
For supported languages, see the [`langdetect` documentation](https://github.com/Mimino666/langdetect#languages).
If not specified, the default is ["en"].
"""
langdetect_import.check()
if not languages:
Expand All @@ -49,11 +63,16 @@ def __init__(self, languages: Optional[List[str]] = None):
@component.output_types(documents=List[Document])
def run(self, documents: List[Document]):
"""
Run the DocumentLanguageClassifier. This method classifies the documents' language and adds it to their metadata.
If a Document's text does not match any of the languages specified at initialization, the metadata value "unmatched" will be stored.
This method classifies the documents' language and adds it to their metadata.
If a Document's text does not match any of the languages specified at initialization,
the metadata value "unmatched" will be stored.
:param documents: A list of Documents whose language should be classified.
:return: List of Documents with an added metadata field called language.
:returns: A dictionary with the following key:
- `documents`: List of Documents with an added metadata field called `language`.
:raises TypeError: if the input is not a list of Documents.
"""
if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
raise TypeError(
Expand All @@ -65,15 +84,15 @@ def run(self, documents: List[Document]):
output["unmatched"] = []

for document in documents:
detected_language = self.detect_language(document)
detected_language = self._detect_language(document)
if detected_language in self.languages:
document.meta["language"] = detected_language
else:
document.meta["language"] = "unmatched"

return {"documents": documents}

def detect_language(self, document: Document) -> Optional[str]:
def _detect_language(self, document: Document) -> Optional[str]:
try:
language = langdetect.detect(document.content)
except langdetect.LangDetectException:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def test_empty_list(self):

def test_detect_language(self):
classifier = DocumentLanguageClassifier()
detected_language = classifier.detect_language(Document(content="This is an english sentence."))
detected_language = classifier._detect_language(Document(content="This is an english sentence."))
assert detected_language == "en"

def test_classify_as_en_and_unmatched(self):
Expand Down

0 comments on commit e194c08

Please sign in to comment.