Skip to content

Commit

Permalink
docs: review Routers docstrings (#7234)
Browse files Browse the repository at this point in the history
* wip

* review routers

* small fixes

* Update haystack/components/routers/conditional_router.py

Co-authored-by: Madeesh Kannan <[email protected]>

* Update haystack/components/routers/conditional_router.py

Co-authored-by: Madeesh Kannan <[email protected]>

* Update haystack/components/routers/file_type_router.py

Co-authored-by: Madeesh Kannan <[email protected]>

* Update haystack/components/routers/file_type_router.py

Co-authored-by: Madeesh Kannan <[email protected]>

* Update haystack/components/routers/file_type_router.py

Co-authored-by: Madeesh Kannan <[email protected]>

* Update haystack/components/routers/file_type_router.py

Co-authored-by: Madeesh Kannan <[email protected]>

* Update haystack/components/routers/metadata_router.py

Co-authored-by: Madeesh Kannan <[email protected]>

* Update haystack/components/routers/metadata_router.py

Co-authored-by: Madeesh Kannan <[email protected]>

* Update haystack/components/routers/text_language_router.py

Co-authored-by: Madeesh Kannan <[email protected]>

* Update haystack/components/routers/text_language_router.py

Co-authored-by: Madeesh Kannan <[email protected]>

* Update haystack/components/routers/text_language_router.py

Co-authored-by: Madeesh Kannan <[email protected]>

---------

Co-authored-by: Madeesh Kannan <[email protected]>
  • Loading branch information
anakin87 and shadeMe committed Feb 28, 2024
1 parent ac4f458 commit 7b9704a
Show file tree
Hide file tree
Showing 6 changed files with 168 additions and 103 deletions.
2 changes: 1 addition & 1 deletion docs/pydoc/config/routers_api.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ processors:
- type: crossref
renderer:
type: haystack_pydoc_tools.renderers.ReadmePreviewRenderer
excerpt: Routes data to the right component based on its file type or metadata.
excerpt: Routers is a group of components that route queries or Documents to other components that can handle them best.
category_slug: haystack-api
title: Routers
slug: routers-api
Expand Down
79 changes: 45 additions & 34 deletions haystack/components/routers/conditional_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,20 +21,20 @@ class RouteConditionException(Exception):
@component
class ConditionalRouter:
"""
ConditionalRouter in Haystack 2.x pipelines is designed to manage data routing based on specific conditions.
This is achieved by defining a list named 'routes'. Each element in this list is a dictionary representing a
single route.
`ConditionalRouter` allows data routing based on specific conditions.
This is achieved by defining a list named `routes`. Each element in this list is a dictionary representing a
single route.
A route dictionary comprises four key elements:
- 'condition': A Jinja2 string expression that determines if the route is selected.
- 'output': A Jinja2 expression defining the route's output value.
- 'output_type': The type of the output data (e.g., str, List[int]).
- 'output_name': The name under which the `output` value of the route is published. This name is used to connect
- `condition`: A Jinja2 string expression that determines if the route is selected.
- `output`: A Jinja2 expression defining the route's output value.
- `output_type`: The type of the output data (e.g., `str`, `List[int]`).
- `output_name`: The name under which the `output` value of the route is published. This name is used to connect
the router to other components in the pipeline.
Here's an example:
Usage example:
```python
from typing import List
from haystack.components.routers import ConditionalRouter
routes = [
Expand Down Expand Up @@ -66,10 +66,10 @@ class ConditionalRouter:
'enough_streams' output might be connected to another component that processes the streams, while the
'insufficient_streams' output might be connected to a component that fetches more streams, and so on.
Here is a pseudocode example of a pipeline that uses the ConditionalRouter and routes fetched ByteStreams to
different components depending on the number of streams fetched:
```
Here is a pseudocode example of a pipeline that uses the `ConditionalRouter` and routes fetched `ByteStreams` to
different components depending on the number of streams fetched:
```python
from typing import List
from haystack import Pipeline
from haystack.dataclasses import ByteStream
Expand Down Expand Up @@ -101,11 +101,15 @@ class ConditionalRouter:

def __init__(self, routes: List[Dict]):
"""
Initializes the ConditionalRouter with a list of routes detailing the conditions for routing.
:param routes: A list of dictionaries, each defining a route with a boolean condition expression
('condition'), an output value ('output'), the output type ('output_type') and
('output_name') that defines the output name for the variable defined in 'output'.
Initializes the `ConditionalRouter` with a list of routes detailing the conditions for routing.
:param routes: A list of dictionaries, each defining a route.
A route dictionary comprises four key elements:
- `condition`: A Jinja2 string expression that determines if the route is selected.
- `output`: A Jinja2 expression defining the route's output value.
- `output_type`: The type of the output data (e.g., `str`, `List[int]`).
- `output_name`: The name under which the `output` value of the route is published. This name is used to connect
the router to other components in the pipeline.
"""
self._validate_routes(routes)
self.routes: List[dict] = routes
Expand All @@ -129,6 +133,12 @@ def __init__(self, routes: List[Dict]):
component.set_output_types(self, **output_types)

def to_dict(self) -> Dict[str, Any]:
"""
Serializes the component to a dictionary.
:returns:
Dictionary with serialized data.
"""
for route in self.routes:
# output_type needs to be serialized to a string
route["output_type"] = serialize_type(route["output_type"])
Expand All @@ -137,6 +147,14 @@ def to_dict(self) -> Dict[str, Any]:

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "ConditionalRouter":
"""
Deserializes the component from a dictionary.
:param data:
The dictionary to deserialize from.
:returns:
The deserialized component.
"""
init_params = data.get("init_parameters", {})
routes = init_params.get("routes")
for route in routes:
Expand All @@ -146,19 +164,17 @@ def from_dict(cls, data: Dict[str, Any]) -> "ConditionalRouter":

def run(self, **kwargs):
"""
Executes the routing logic by evaluating the specified boolean condition expressions
for each route in the order they are listed. The method directs the flow
of data to the output specified in the first route, whose expression
evaluates to True. If no route's expression evaluates to True, an exception
is raised.
Executes the routing logic by evaluating the specified boolean condition expressions for each route in the order they are listed.
The method directs the flow of data to the output specified in the first route whose `condition` is True.
:param kwargs: A dictionary containing the pipeline variables, which should
include all variables used in the "condition" templates.
:param kwargs: All variables used in the `condition` expressed in the routes. When the component is used in a
pipeline, these variables are passed from the previous component's output.
:return: A dictionary containing the output and the corresponding result,
based on the first route whose expression evaluates to True.
:returns: A dictionary where the key is the `output_name` of the selected route and the value is the `output`
of the selected route.
:raises NoRouteSelectedException: If no route's expression evaluates to True.
:raises NoRouteSelectedException: If no `condition` in the routes is `True`.
:raises RouteConditionException: If there is an error parsing or evaluating the `condition` expression in the routes.
"""
# Create a Jinja native environment to evaluate the condition templates as Python expressions
env = NativeEnvironment()
Expand All @@ -182,7 +198,6 @@ def _validate_routes(self, routes: List[Dict]):
Validates a list of routes.
:param routes: A list of routes.
:type routes: List[Dict]
"""
env = NativeEnvironment()
for route in routes:
Expand All @@ -206,10 +221,8 @@ def _extract_variables(self, env: NativeEnvironment, templates: List[str]) -> Se
Extracts all variables from a list of Jinja template strings.
:param env: A Jinja environment.
:type env: Environment
:param templates: A list of Jinja template strings.
:type templates: List[str]
:return: A set of variable names.
:returns: A set of variable names.
"""
variables = set()
for template in templates:
Expand All @@ -222,10 +235,8 @@ def _validate_template(self, env: Environment, template_text: str):
Validates a template string by parsing it with Jinja.
:param env: A Jinja environment.
:type env: Environment
:param template_text: A Jinja template string.
:type template_text: str
:return: True if the template is valid, False otherwise.
:returns: `True` if the template is valid, `False` otherwise.
"""
try:
env.parse(template_text)
Expand Down
50 changes: 33 additions & 17 deletions haystack/components/routers/file_type_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,27 +14,39 @@
class FileTypeRouter:
"""
FileTypeRouter takes a list of data sources (file paths or byte streams) and groups them by their corresponding
MIME types. For file paths, MIME types are inferred from their extensions, while for byte streams, MIME types
MIME types.
For file paths, MIME types are inferred from their extensions, while for byte streams, MIME types
are determined from the provided metadata.
The set of MIME types to consider is specified during the initialization of the component.
This component is invaluable when categorizing a large collection of files or data streams by their MIME
types and routing them to different components for further processing.
This component is useful when you need to classify a large collection of files or data streams according to their
MIME types and route them to different components for further processing.
Usage example:
```python
from haystack.components.routers import FileTypeRouter
router = FileTypeRouter(mime_types=["text/plain"])
print(router.run(sources=["text_file.txt", "pdf_file.pdf"]))
# defaultdict(<class 'list'>, {'text/plain': [PosixPath('text_file.txt')],
# 'unclassified': [PosixPath('pdf_file.pdf')]})
```
"""

def __init__(self, mime_types: List[str]):
"""
Initialize the FileTypeRouter.
:param mime_types: A list of file mime types to consider when routing
files (e.g. ["text/plain", "audio/x-wav", "image/jpeg"]).
:param mime_types: A list of file mime types to consider when routing files
(e.g. `["text/plain", "audio/x-wav", "image/jpeg"]`).
"""
if not mime_types:
raise ValueError("The list of mime types cannot be empty.")

for mime_type in mime_types:
if not self.is_valid_mime_type_format(mime_type):
if not self._is_valid_mime_type_format(mime_type):
raise ValueError(
f"Unknown mime type: '{mime_type}'. Ensure you passed a list of strings in the 'mime_types' parameter"
)
Expand All @@ -47,7 +59,8 @@ def run(self, sources: List[Union[str, Path, ByteStream]]) -> Dict[str, List[Uni
Categorizes the provided data sources by their MIME types.
:param sources: A list of file paths or byte streams to categorize.
:return: A dictionary where keys are MIME types and values are lists of data sources.
:returns: A dictionary where the keys are MIME types (or `"unclassified"`) and the values are lists of data sources.
"""

mime_types = defaultdict(list)
Expand All @@ -56,7 +69,7 @@ def run(self, sources: List[Union[str, Path, ByteStream]]) -> Dict[str, List[Uni
source = Path(source)

if isinstance(source, Path):
mime_type = self.get_mime_type(source)
mime_type = self._get_mime_type(source)
elif isinstance(source, ByteStream):
mime_type = source.meta.get("content_type")
else:
Expand All @@ -69,28 +82,31 @@ def run(self, sources: List[Union[str, Path, ByteStream]]) -> Dict[str, List[Uni

return mime_types

def get_mime_type(self, path: Path) -> Optional[str]:
def _get_mime_type(self, path: Path) -> Optional[str]:
"""
Get the MIME type of the provided file path.
:param path: The file path to get the MIME type for.
:return: The MIME type of the provided file path, or None if the MIME type cannot be determined.
:returns: The MIME type of the provided file path, or `None` if the MIME type cannot be determined.
"""
extension = path.suffix.lower()
mime_type = mimetypes.guess_type(path.as_posix())[0]
# lookup custom mappings if the mime type is not found
return self.get_custom_mime_mappings().get(extension, mime_type)
return self._get_custom_mime_mappings().get(extension, mime_type)

def is_valid_mime_type_format(self, mime_type: str) -> bool:
def _is_valid_mime_type_format(self, mime_type: str) -> bool:
"""
Check if the provided MIME type is in a valid format.
:param mime_type: The MIME type to check.
:return: True if the provided MIME type is a valid MIME type format, False otherwise.
:returns: `True` if the provided MIME type is a valid MIME type format, `False` otherwise.
"""
return mime_type in mimetypes.types_map.values() or mime_type in self.get_custom_mime_mappings().values()
return mime_type in mimetypes.types_map.values() or mime_type in self._get_custom_mime_mappings().values()

@staticmethod
def get_custom_mime_mappings() -> Dict[str, str]:
def _get_custom_mime_mappings() -> Dict[str, str]:
"""
Returns a dictionary of custom file extension to MIME type mappings.
"""
Expand Down
97 changes: 58 additions & 39 deletions haystack/components/routers/metadata_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,59 +7,78 @@
@component
class MetadataRouter:
"""
A component that routes documents to different connections based on the content of their fields.
A component that routes documents to different connections based on the content of their metadata fields.
Usage example:
```python
from haystack import Document
from haystack.components.routers import MetadataRouter
docs = [Document(content="Paris is the capital of France.", meta={"language": "en"}),
Document(content="Berlin ist die Hauptstadt von Deutschland.", meta={"language": "de"})]
router = MetadataRouter(rules={"en": {"field": "meta.language", "operator": "==", "value": "en"}})
print(router.run(documents=docs))
# {'en': [Document(id=..., content: 'Paris is the capital of France.', meta: {'language': 'en'})],
# 'unmatched': [Document(id=..., content: 'Berlin ist die Hauptstadt von Deutschland.', meta: {'language': 'de'})]}
```
"""

def __init__(self, rules: Dict[str, Dict]):
"""
Initialize the MetadataRouter.
:param rules: A dictionary of rules that specify which edge to route a document to based on its metadata.
The keys of the dictionary are the names of the output connections, and the values are dictionaries that
follow the format of filtering expressions in Haystack. For example:
```python
{
"edge_1": {
"operator": "AND",
"conditions": [
{"field": "meta.created_at", "operator": ">=", "value": "2023-01-01"},
{"field": "meta.created_at", "operator": "<", "value": "2023-04-01"},
],
},
"edge_2": {
"operator": "AND",
"conditions": [
{"field": "meta.created_at", "operator": ">=", "value": "2023-04-01"},
{"field": "meta.created_at", "operator": "<", "value": "2023-07-01"},
],
},
"edge_3": {
"operator": "AND",
"conditions": [
{"field": "meta.created_at", "operator": ">=", "value": "2023-07-01"},
{"field": "meta.created_at", "operator": "<", "value": "2023-10-01"},
],
},
"edge_4": {
"operator": "AND",
"conditions": [
{"field": "meta.created_at", "operator": ">=", "value": "2023-10-01"},
{"field": "meta.created_at", "operator": "<", "value": "2024-01-01"},
],
},
}
```
:param rules: A dictionary of rules that specify which output connection to route a document to based on its metadata.
The keys of the dictionary are the names of the output connections, and the values are dictionaries that
follow the [format of filtering expressions in Haystack](https://docs.haystack.deepset.ai/v2.0/docs/metadata-filtering).
For example:
```python
{
"edge_1": {
"operator": "AND",
"conditions": [
{"field": "meta.created_at", "operator": ">=", "value": "2023-01-01"},
{"field": "meta.created_at", "operator": "<", "value": "2023-04-01"},
],
},
"edge_2": {
"operator": "AND",
"conditions": [
{"field": "meta.created_at", "operator": ">=", "value": "2023-04-01"},
{"field": "meta.created_at", "operator": "<", "value": "2023-07-01"},
],
},
"edge_3": {
"operator": "AND",
"conditions": [
{"field": "meta.created_at", "operator": ">=", "value": "2023-07-01"},
{"field": "meta.created_at", "operator": "<", "value": "2023-10-01"},
],
},
"edge_4": {
"operator": "AND",
"conditions": [
{"field": "meta.created_at", "operator": ">=", "value": "2023-10-01"},
{"field": "meta.created_at", "operator": "<", "value": "2024-01-01"},
],
},
}
```
"""
self.rules = rules
component.set_output_types(self, unmatched=List[Document], **{edge: List[Document] for edge in rules})

def run(self, documents: List[Document]):
"""
Run the MetadataRouter. This method routes the documents to different edges based on their fields content and
the rules specified during initialization. If a document does not match any of the rules, it is routed to
a connection named "unmatched".
Route the documents to different edges based on the content of their fields and the rules specified during initialization.
If a document does not match any of the rules, it is routed to a connection named "unmatched".
:param documents: A list of documents to route to different edges.
:returns: A dictionary where the keys are the names of the output connections (including `"unmatched"`)
and the values are lists of routed documents.
"""
unmatched_documents = []
output: Dict[str, List[Document]] = {edge: [] for edge in self.rules}
Expand Down
Loading

0 comments on commit 7b9704a

Please sign in to comment.