docs: review Routers docstrings (#7234)

* wip * review routers * small fixes * Update haystack/components/routers/conditional_router.py Co-authored-by: Madeesh Kannan <[email protected]> * Update haystack/components/routers/conditional_router.py Co-authored-by: Madeesh Kannan <[email protected]> * Update haystack/components/routers/file_type_router.py Co-authored-by: Madeesh Kannan <[email protected]> * Update haystack/components/routers/file_type_router.py Co-authored-by: Madeesh Kannan <[email protected]> * Update haystack/components/routers/file_type_router.py Co-authored-by: Madeesh Kannan <[email protected]> * Update haystack/components/routers/file_type_router.py Co-authored-by: Madeesh Kannan <[email protected]> * Update haystack/components/routers/metadata_router.py Co-authored-by: Madeesh Kannan <[email protected]> * Update haystack/components/routers/metadata_router.py Co-authored-by: Madeesh Kannan <[email protected]> * Update haystack/components/routers/text_language_router.py Co-authored-by: Madeesh Kannan <[email protected]> * Update haystack/components/routers/text_language_router.py Co-authored-by: Madeesh Kannan <[email protected]> * Update haystack/components/routers/text_language_router.py Co-authored-by: Madeesh Kannan <[email protected]> --------- Co-authored-by: Madeesh Kannan <[email protected]>
deepset-ai · Feb 28, 2024 · 7b9704a · 7b9704a
1 parent ac4f458
commit 7b9704a
Show file tree

Hide file tree

Showing 6 changed files with 168 additions and 103 deletions.
diff --git a/docs/pydoc/config/routers_api.yml b/docs/pydoc/config/routers_api.yml
@@ -13,7 +13,7 @@ processors:
  - type: crossref
 renderer:
  type: haystack_pydoc_tools.renderers.ReadmePreviewRenderer
- excerpt: Routes data to the right component based on its file type or metadata.
+ excerpt: Routers is a group of components that route queries or Documents to other components that can handle them best.
  category_slug: haystack-api
  title: Routers
  slug: routers-api

diff --git a/haystack/components/routers/conditional_router.py b/haystack/components/routers/conditional_router.py
@@ -21,20 +21,20 @@ class RouteConditionException(Exception):
 @component
 class ConditionalRouter:
  """
- ConditionalRouter in Haystack 2.x pipelines is designed to manage data routing based on specific conditions.
- This is achieved by defining a list named 'routes'. Each element in this list is a dictionary representing a
- single route.
+ `ConditionalRouter` allows data routing based on specific conditions.
 
+ This is achieved by defining a list named `routes`. Each element in this list is a dictionary representing a
+ single route.
  A route dictionary comprises four key elements:
- - 'condition': A Jinja2 string expression that determines if the route is selected.
- - 'output': A Jinja2 expression defining the route's output value.
- - 'output_type': The type of the output data (e.g., str, List[int]).
- - 'output_name': The name under which the `output` value of the route is published. This name is used to connect
+ - `condition`: A Jinja2 string expression that determines if the route is selected.
+ - `output`: A Jinja2 expression defining the route's output value.
+ - `output_type`: The type of the output data (e.g., `str`, `List[int]`).
+ - `output_name`: The name under which the `output` value of the route is published. This name is used to connect
  the router to other components in the pipeline.
 
- Here's an example:
-
+ Usage example:
  ```python
+ from typing import List
  from haystack.components.routers import ConditionalRouter
 
  routes = [
@@ -66,10 +66,10 @@ class ConditionalRouter:
  'enough_streams' output might be connected to another component that processes the streams, while the
  'insufficient_streams' output might be connected to a component that fetches more streams, and so on.
 
- Here is a pseudocode example of a pipeline that uses the ConditionalRouter and routes fetched ByteStreams to
- different components depending on the number of streams fetched:
 
- ```
+ Here is a pseudocode example of a pipeline that uses the `ConditionalRouter` and routes fetched `ByteStreams` to
+ different components depending on the number of streams fetched:
+ ```python
  from typing import List
  from haystack import Pipeline
  from haystack.dataclasses import ByteStream
@@ -101,11 +101,15 @@ class ConditionalRouter:
 
  def __init__(self, routes: List[Dict]):
  """
- Initializes the ConditionalRouter with a list of routes detailing the conditions for routing.
-
- :param routes: A list of dictionaries, each defining a route with a boolean condition expression
- ('condition'), an output value ('output'), the output type ('output_type') and
- ('output_name') that defines the output name for the variable defined in 'output'.
+ Initializes the `ConditionalRouter` with a list of routes detailing the conditions for routing.
+
+ :param routes: A list of dictionaries, each defining a route.
+ A route dictionary comprises four key elements:
+ - `condition`: A Jinja2 string expression that determines if the route is selected.
+ - `output`: A Jinja2 expression defining the route's output value.
+ - `output_type`: The type of the output data (e.g., `str`, `List[int]`).
+ - `output_name`: The name under which the `output` value of the route is published. This name is used to connect
+ the router to other components in the pipeline.
  """
  self._validate_routes(routes)
  self.routes: List[dict] = routes
@@ -129,6 +133,12 @@ def __init__(self, routes: List[Dict]):
  component.set_output_types(self, **output_types)
 
  def to_dict(self) -> Dict[str, Any]:
+ """
+ Serializes the component to a dictionary.
+
+ :returns:
+ Dictionary with serialized data.
+ """
  for route in self.routes:
  # output_type needs to be serialized to a string
  route["output_type"] = serialize_type(route["output_type"])
@@ -137,6 +147,14 @@ def to_dict(self) -> Dict[str, Any]:
 
  @classmethod
  def from_dict(cls, data: Dict[str, Any]) -> "ConditionalRouter":
+ """
+ Deserializes the component from a dictionary.
+
+ :param data:
+ The dictionary to deserialize from.
+ :returns:
+ The deserialized component.
+ """
  init_params = data.get("init_parameters", {})
  routes = init_params.get("routes")
  for route in routes:
@@ -146,19 +164,17 @@ def from_dict(cls, data: Dict[str, Any]) -> "ConditionalRouter":
 
  def run(self, **kwargs):
  """
- Executes the routing logic by evaluating the specified boolean condition expressions
- for each route in the order they are listed. The method directs the flow
- of data to the output specified in the first route, whose expression
- evaluates to True. If no route's expression evaluates to True, an exception
- is raised.
+ Executes the routing logic by evaluating the specified boolean condition expressions for each route in the order they are listed.
+ The method directs the flow of data to the output specified in the first route whose `condition` is True.
 
- :param kwargs: A dictionary containing the pipeline variables, which should
-  include all variables used in the "condition" templates.
+ :param kwargs: All variables used in the `condition` expressed in the routes. When the component is used in a
+ pipeline, these variables are passed from the previous component's output.
 
- :return: A dictionary containing the output and the corresponding result,
-  based on the first route whose expression evaluates to True.
+ :returns: A dictionary where the key is the `output_name` of the selected route and the value is the `output`
+ of the selected route.
 
- :raises NoRouteSelectedException: If no route's expression evaluates to True.
+ :raises NoRouteSelectedException: If no `condition` in the routes is `True`.
+ :raises RouteConditionException: If there is an error parsing or evaluating the `condition` expression in the routes.
  """
  # Create a Jinja native environment to evaluate the condition templates as Python expressions
  env = NativeEnvironment()
@@ -182,7 +198,6 @@ def _validate_routes(self, routes: List[Dict]):
  Validates a list of routes.
 
  :param routes: A list of routes.
- :type routes: List[Dict]
  """
  env = NativeEnvironment()
  for route in routes:
@@ -206,10 +221,8 @@ def _extract_variables(self, env: NativeEnvironment, templates: List[str]) -> Se
  Extracts all variables from a list of Jinja template strings.
 
  :param env: A Jinja environment.
- :type env: Environment
  :param templates: A list of Jinja template strings.
- :type templates: List[str]
- :return: A set of variable names.
+ :returns: A set of variable names.
  """
  variables = set()
  for template in templates:
@@ -222,10 +235,8 @@ def _validate_template(self, env: Environment, template_text: str):
  Validates a template string by parsing it with Jinja.
 
  :param env: A Jinja environment.
- :type env: Environment
  :param template_text: A Jinja template string.
- :type template_text: str
- :return: True if the template is valid, False otherwise.
+ :returns: `True` if the template is valid, `False` otherwise.
  """
  try:
  env.parse(template_text)

diff --git a/haystack/components/routers/file_type_router.py b/haystack/components/routers/file_type_router.py
@@ -14,27 +14,39 @@
 class FileTypeRouter:
  """
  FileTypeRouter takes a list of data sources (file paths or byte streams) and groups them by their corresponding
- MIME types. For file paths, MIME types are inferred from their extensions, while for byte streams, MIME types
+ MIME types.
+
+ For file paths, MIME types are inferred from their extensions, while for byte streams, MIME types
  are determined from the provided metadata.
 
  The set of MIME types to consider is specified during the initialization of the component.
 
- This component is invaluable when categorizing a large collection of files or data streams by their MIME
- types and routing them to different components for further processing.
+ This component is useful when you need to classify a large collection of files or data streams according to their
+ MIME types and route them to different components for further processing.
+
+ Usage example:
+ ```python
+ from haystack.components.routers import FileTypeRouter
+
+ router = FileTypeRouter(mime_types=["text/plain"])
+
+ print(router.run(sources=["text_file.txt", "pdf_file.pdf"]))
+
+ # defaultdict(<class 'list'>, {'text/plain': [PosixPath('text_file.txt')],
+ # 'unclassified': [PosixPath('pdf_file.pdf')]})
+ ```
  """
 
  def __init__(self, mime_types: List[str]):
  """
- Initialize the FileTypeRouter.
-
- :param mime_types: A list of file mime types to consider when routing
- files (e.g. ["text/plain", "audio/x-wav", "image/jpeg"]).
+ :param mime_types: A list of file mime types to consider when routing files
+ (e.g. `["text/plain", "audio/x-wav", "image/jpeg"]`).
  """
  if not mime_types:
  raise ValueError("The list of mime types cannot be empty.")
 
  for mime_type in mime_types:
- if not self.is_valid_mime_type_format(mime_type):
+ if not self._is_valid_mime_type_format(mime_type):
  raise ValueError(
  f"Unknown mime type: '{mime_type}'. Ensure you passed a list of strings in the 'mime_types' parameter"
  )
@@ -47,7 +59,8 @@ def run(self, sources: List[Union[str, Path, ByteStream]]) -> Dict[str, List[Uni
  Categorizes the provided data sources by their MIME types.
 
  :param sources: A list of file paths or byte streams to categorize.
- :return: A dictionary where keys are MIME types and values are lists of data sources.
+
+ :returns: A dictionary where the keys are MIME types (or `"unclassified"`) and the values are lists of data sources.
  """
 
  mime_types = defaultdict(list)
@@ -56,7 +69,7 @@ def run(self, sources: List[Union[str, Path, ByteStream]]) -> Dict[str, List[Uni
  source = Path(source)
 
  if isinstance(source, Path):
- mime_type = self.get_mime_type(source)
+ mime_type = self._get_mime_type(source)
  elif isinstance(source, ByteStream):
  mime_type = source.meta.get("content_type")
  else:
@@ -69,28 +82,31 @@ def run(self, sources: List[Union[str, Path, ByteStream]]) -> Dict[str, List[Uni
 
  return mime_types
 
- def get_mime_type(self, path: Path) -> Optional[str]:
+ def _get_mime_type(self, path: Path) -> Optional[str]:
  """
  Get the MIME type of the provided file path.
 
  :param path: The file path to get the MIME type for.
- :return: The MIME type of the provided file path, or None if the MIME type cannot be determined.
+
+ :returns: The MIME type of the provided file path, or `None` if the MIME type cannot be determined.
  """
  extension = path.suffix.lower()
  mime_type = mimetypes.guess_type(path.as_posix())[0]
  # lookup custom mappings if the mime type is not found
- return self.get_custom_mime_mappings().get(extension, mime_type)
+ return self._get_custom_mime_mappings().get(extension, mime_type)
 
- def is_valid_mime_type_format(self, mime_type: str) -> bool:
+ def _is_valid_mime_type_format(self, mime_type: str) -> bool:
  """
  Check if the provided MIME type is in a valid format.
+
  :param mime_type: The MIME type to check.
- :return: True if the provided MIME type is a valid MIME type format, False otherwise.
+
+ :returns: `True` if the provided MIME type is a valid MIME type format, `False` otherwise.
  """
- return mime_type in mimetypes.types_map.values() or mime_type in self.get_custom_mime_mappings().values()
+ return mime_type in mimetypes.types_map.values() or mime_type in self._get_custom_mime_mappings().values()
 
  @staticmethod
- def get_custom_mime_mappings() -> Dict[str, str]:
+ def _get_custom_mime_mappings() -> Dict[str, str]:
  """
  Returns a dictionary of custom file extension to MIME type mappings.
  """

diff --git a/haystack/components/routers/metadata_router.py b/haystack/components/routers/metadata_router.py
@@ -7,59 +7,78 @@
 @component
 class MetadataRouter:
  """
- A component that routes documents to different connections based on the content of their fields.
+ A component that routes documents to different connections based on the content of their metadata fields.
+
+ Usage example:
+ ```python
+ from haystack import Document
+ from haystack.components.routers import MetadataRouter
+
+ docs = [Document(content="Paris is the capital of France.", meta={"language": "en"}),
+ Document(content="Berlin ist die Hauptstadt von Deutschland.", meta={"language": "de"})]
+
+ router = MetadataRouter(rules={"en": {"field": "meta.language", "operator": "==", "value": "en"}})
+
+ print(router.run(documents=docs))
+
+ # {'en': [Document(id=..., content: 'Paris is the capital of France.', meta: {'language': 'en'})],
+ # 'unmatched': [Document(id=..., content: 'Berlin ist die Hauptstadt von Deutschland.', meta: {'language': 'de'})]}
+ ```
  """
 
  def __init__(self, rules: Dict[str, Dict]):
  """
  Initialize the MetadataRouter.
 
- :param rules: A dictionary of rules that specify which edge to route a document to based on its metadata.
- The keys of the dictionary are the names of the output connections, and the values are dictionaries that
- follow the format of filtering expressions in Haystack. For example:
- ```python
- {
- "edge_1": {
- "operator": "AND",
- "conditions": [
- {"field": "meta.created_at", "operator": ">=", "value": "2023-01-01"},
- {"field": "meta.created_at", "operator": "<", "value": "2023-04-01"},
- ],
- },
- "edge_2": {
- "operator": "AND",
- "conditions": [
- {"field": "meta.created_at", "operator": ">=", "value": "2023-04-01"},
- {"field": "meta.created_at", "operator": "<", "value": "2023-07-01"},
- ],
- },
- "edge_3": {
- "operator": "AND",
- "conditions": [
- {"field": "meta.created_at", "operator": ">=", "value": "2023-07-01"},
- {"field": "meta.created_at", "operator": "<", "value": "2023-10-01"},
- ],
- },
- "edge_4": {
- "operator": "AND",
- "conditions": [
- {"field": "meta.created_at", "operator": ">=", "value": "2023-10-01"},
- {"field": "meta.created_at", "operator": "<", "value": "2024-01-01"},
- ],
- },
- }
- ```
+ :param rules: A dictionary of rules that specify which output connection to route a document to based on its metadata.
+ The keys of the dictionary are the names of the output connections, and the values are dictionaries that
+ follow the [format of filtering expressions in Haystack](https://docs.haystack.deepset.ai/v2.0/docs/metadata-filtering).
+ For example:
+ ```python
+ {
+ "edge_1": {
+ "operator": "AND",
+ "conditions": [
+ {"field": "meta.created_at", "operator": ">=", "value": "2023-01-01"},
+ {"field": "meta.created_at", "operator": "<", "value": "2023-04-01"},
+ ],
+ },
+ "edge_2": {
+ "operator": "AND",
+ "conditions": [
+ {"field": "meta.created_at", "operator": ">=", "value": "2023-04-01"},
+ {"field": "meta.created_at", "operator": "<", "value": "2023-07-01"},
+ ],
+ },
+ "edge_3": {
+ "operator": "AND",
+ "conditions": [
+ {"field": "meta.created_at", "operator": ">=", "value": "2023-07-01"},
+ {"field": "meta.created_at", "operator": "<", "value": "2023-10-01"},
+ ],
+ },
+ "edge_4": {
+ "operator": "AND",
+ "conditions": [
+ {"field": "meta.created_at", "operator": ">=", "value": "2023-10-01"},
+ {"field": "meta.created_at", "operator": "<", "value": "2024-01-01"},
+ ],
+ },
+ }
+ ```
  """
  self.rules = rules
  component.set_output_types(self, unmatched=List[Document], **{edge: List[Document] for edge in rules})
 
  def run(self, documents: List[Document]):
  """
- Run the MetadataRouter. This method routes the documents to different edges based on their fields content and
- the rules specified during initialization. If a document does not match any of the rules, it is routed to
- a connection named "unmatched".
+ Route the documents to different edges based on the content of their fields and the rules specified during initialization.
+ If a document does not match any of the rules, it is routed to a connection named "unmatched".
 
  :param documents: A list of documents to route to different edges.
+
+ :returns: A dictionary where the keys are the names of the output connections (including `"unmatched"`)
+ and the values are lists of routed documents.
  """
  unmatched_documents = []
  output: Dict[str, List[Document]] = {edge: [] for edge in self.rules}