progressing on other components

deepset-ai · anakin87 · Dec 15, 2023 · Dec 13, 2023 · Dec 13, 2023 · Dec 13, 2023
commit 989e48e55bab4678152f72bee8d08049b89384c1
@@ -56,8 +56,8 @@ def __init__(self, endpoint: str, api_key: Optional[str] = None, model_id: str =
  self.endpoint = endpoint
  self.model_id = model_id
 
- @component.output_types(documents=List[Document], azure=List[Dict])
- def run(self, sources: List[Union[str, Path, ByteStream]]):
+ @component.output_types(documents=List[Document], raw_azure_response=List[Dict])
+ def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
  """
  Convert files to Documents using Azure's Document Intelligence service.
 
@@ -66,10 +66,20 @@ def run(self, sources: List[Union[str, Path, ByteStream]]):
  the raw responses from Azure's Document Intelligence service.
 
  :param sources: List of file paths or ByteStream objects.
+ :param meta: Optional list of metadata to attach to the Documents.
+ The length of the list must match the number of sources. Defaults to `None`.
+ :return: A dictionary containing a list of Document objects under the 'documents' key.
+ It also contains the raw Azure response under the 'raw_azure_response' key.
  """
  documents = []
  azure_output = []
- for source in sources:
+
+ if meta is None:
+ meta = [{}] * len(sources)
+ elif len(sources) != len(meta):
+ raise ValueError("The length of the metadata list must match the number of sources.")
+
+ for source, metadata in zip(sources, meta):
  try:
  bytestream = get_bytestream_from_source(source=source)
  except Exception as e:
@@ -87,6 +97,8 @@ def run(self, sources: List[Union[str, Path, ByteStream]]):
  file_suffix = Path(bytestream.metadata["file_path"]).suffix
 
  document = AzureOCRDocumentConverter._convert_azure_result_to_document(result, file_suffix)
+ merged_metadata = {**bytestream.metadata, **metadata}
+ document.meta = merged_metadata
  documents.append(document)
 
  return {"documents": documents, "raw_azure_response": azure_output}

diff --git a/haystack/components/converters/html.py b/haystack/components/converters/html.py
@@ -35,8 +35,8 @@ def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[D
 
  :param sources: List of HTML file paths or ByteStream objects.
  :param meta: Optional list of metadata to attach to the Documents.
- The length of the list must match the number of sources. Defaults to `None`.
- :return: List of converted Documents.
+  The length of the list must match the number of sources. Defaults to `None`.
+ :return: A dictionary containing a list of Document objects under the 'documents' key.
  """
 
  documents = []

diff --git a/haystack/components/converters/markdown.py b/haystack/components/converters/markdown.py
@@ -51,15 +51,19 @@ def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[D
 
  :param sources: A list of markdown data sources (file paths or binary objects)
  :param meta: Optional list of metadata to attach to the Documents.
- The length of the list must match the number of paths. Defaults to `None`.
+ The length of the list must match the number of sources. Defaults to `None`.
+ :return: A dictionary containing a list of Document objects under the 'documents' key.
  """
  parser = MarkdownIt(renderer_cls=RendererPlain)
  if self.table_to_single_line:
  parser.enable("table")
 
  documents = []
+
  if meta is None:
  meta = [{}] * len(sources)
+ elif len(sources) != len(meta):
+ raise ValueError("The length of the metadata list must match the number of sources.")
 
  for source, metadata in tqdm(
  zip(sources, meta),
@@ -79,7 +83,8 @@ def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[D
  logger.warning("Failed to extract text from %s. Skipping it. Error: %s", source, conversion_e)
  continue
 
- document = Document(content=text, meta=metadata)
+ merged_metadata = {**bytestream.metadata, **metadata}
+ document = Document(content=text, meta=merged_metadata)
  documents.append(document)
 
  return {"documents": documents}
@@ -1,6 +1,6 @@
 import io
 import logging
-from typing import List, Union, Protocol, Dict
+from typing import List, Union, Protocol, Dict, Any, Optional
 from pathlib import Path
 
 from haystack.dataclasses import ByteStream
@@ -71,15 +71,23 @@ def to_dict(self):
  return default_to_dict(self, converter_name=self.converter_name)
 
  @component.output_types(documents=List[Document])
- def run(self, sources: List[Union[str, Path, ByteStream]]):
+ def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
  """
  Converts a list of PDF sources into Document objects using the configured converter.
 
  :param sources: A list of PDF data sources, which can be file paths or ByteStream objects.
+ :param meta: Optional list of metadata to attach to the Documents.
+ The length of the list must match the number of sources. Defaults to `None`.
  :return: A dictionary containing a list of Document objects under the 'documents' key.
  """
  documents = []
- for source in sources:
+
+ if meta is None:
+ meta = [{}] * len(sources)
+ elif len(sources) != len(meta):
+ raise ValueError("The length of the metadata list must match the number of sources.")
+
+ for source, metadata in zip(sources, meta):
  try:
  bytestream = get_bytestream_from_source(source)
  except Exception as e:
@@ -91,6 +99,9 @@ def run(self, sources: List[Union[str, Path, ByteStream]]):
  except Exception as e:
  logger.warning("Could not read %s and convert it to Document, skipping. %s", source, e)
  continue
+
+ merged_metadata = {**bytestream.metadata, **metadata}
+ document.meta = merged_metadata
  documents.append(document)
 
  return {"documents": documents}
@@ -1,6 +1,6 @@
 import logging
 from pathlib import Path
-from typing import List, Union
+from typing import List, Union, Dict, Any, Optional
 import io
 
 from haystack.lazy_imports import LazyImport
@@ -37,15 +37,24 @@ def __init__(self, tika_url: str = "https://localhost:9998/tika"):
  self.tika_url = tika_url
 
  @component.output_types(documents=List[Document])
- def run(self, sources: List[Union[str, Path, ByteStream]]):
+ def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
  """
  Convert files to Documents.
 
  :param sources: List of file paths or ByteStream objects.
+ :param meta: Optional list of metadata to attach to the Documents.
+ The length of the list must match the number of sources. Defaults to `None`.
+ :return: A dictionary containing a list of Document objects under the 'documents' key.
  """
 
  documents = []
- for source in sources:
+
+ if meta is None:
+ meta = [{}] * len(sources)
+ elif len(sources) != len(meta):
+ raise ValueError("The length of the metadata list must match the number of sources.")
+
+ for source, metadata in zip(sources, meta):
  try:
  bytestream = get_bytestream_from_source(source)
  except Exception as e:
@@ -56,6 +65,8 @@ def run(self, sources: List[Union[str, Path, ByteStream]]):
  except Exception as conversion_e:
  logger.warning("Failed to extract text from %s. Skipping it. Error: %s", source, conversion_e)
  continue
- document = Document(content=text)
+
+ merged_metadata = {**bytestream.metadata, **metadata}
+ document = Document(content=text, meta=merged_metadata)
  documents.append(document)
  return {"documents": documents}
@@ -1,6 +1,6 @@
 import logging
 from pathlib import Path
-from typing import List, Union
+from typing import List, Union, Dict, Any, Optional
 
 from haystack import Document, component
 from haystack.dataclasses import ByteStream
@@ -27,28 +27,39 @@ def __init__(self, encoding: str = "utf-8"):
  self.encoding = encoding
 
  @component.output_types(documents=List[Document])
- def run(self, sources: List[Union[str, Path, ByteStream]]):
+ def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
  """
  Convert text files to Documents.
 
- :param streams: A list of paths to text files or ByteStream objects.
- Note that if an encoding is specified in the metadata of a ByteStream,
- it will override the component's default.
- :return: A dictionary containing the converted documents.
+ :param sources: A list of paths to text files or ByteStream objects.
+ Note that if an encoding is specified in the metadata of a ByteStream,
+ it will override the component's default.
+ :param meta: Optional list of metadata to attach to the Documents.
+ The length of the list must match the number of sources. Defaults to `None`.
+ :return: A dictionary containing a list of Document objects under the 'documents' key.
  """
  documents = []
- for source in sources:
+
+ if meta is None:
+ meta = [{}] * len(sources)
+ elif len(sources) != len(meta):
+ raise ValueError("The length of the metadata list must match the number of sources.")
+
+ for source, metadata in zip(sources, meta):
  try:
  bytestream = get_bytestream_from_source(source)
  except Exception as e:
  logger.warning("Could not read %s. Skipping it. Error: %s", source, e)
  continue
  try:
  encoding = bytestream.metadata.get("encoding", self.encoding)
- document = Document(content=bytestream.data.decode(encoding))
- document.meta = bytestream.metadata
- documents.append(document)
+ text = bytestream.data.decode(encoding)
  except Exception as e:
  logger.warning("Could not convert file %s. Skipping it. Error message: %s", source, e)
+ continue
+
+ merged_metadata = {**bytestream.metadata, **metadata}
+ document = Document(content=text, meta=merged_metadata)
+ documents.append(document)
 
  return {"documents": documents}