feat: support single metadata dictionary in MarkdownToDocument (#6629)

* support single metadata dict in markdown2document * reno * unwrap list * direct key access * typing * add explicit test
deepset-ai · Jan 9, 2024 · abd16ab · abd16ab
1 parent 9ace6bf
commit abd16ab
Show file tree

Hide file tree

Showing 5 changed files with 48 additions and 17 deletions.
diff --git a/haystack/components/converters/html.py b/haystack/components/converters/html.py
@@ -50,7 +50,11 @@ def __init__(
  self.extractor_type = extractor_type
 
  @component.output_types(documents=List[Document])
- def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
+ def run(
+ self,
+ sources: List[Union[str, Path, ByteStream]],
+ meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+ ):
  """
  Converts a list of HTML files to Documents.
 

diff --git a/haystack/components/converters/markdown.py b/haystack/components/converters/markdown.py
@@ -7,7 +7,7 @@
 from haystack import Document, component
 from haystack.dataclasses import ByteStream
 from haystack.lazy_imports import LazyImport
-from haystack.components.converters.utils import get_bytestream_from_source
+from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
 
 with LazyImport("Run 'pip install markdown-it-py mdit_plain'") as markdown_conversion_imports:
  from markdown_it import MarkdownIt
@@ -27,7 +27,7 @@ class MarkdownToDocument:
  from haystack.components.converters.markdown import MarkdownToDocument
 
  converter = MarkdownToDocument()
- results = converter.run(sources=["sample.md"])
+ results = converter.run(sources=["sample.md"], meta={"date_added": datetime.now().isoformat()})
  documents = results["documents"]
  print(documents[0].content)
  # 'This is a text from the markdown file.'
@@ -45,28 +45,31 @@ def __init__(self, table_to_single_line: bool = False, progress_bar: bool = True
  self.progress_bar = progress_bar
 
  @component.output_types(documents=List[Document])
- def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
+ def run(
+ self,
+ sources: List[Union[str, Path, ByteStream]],
+ meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+ ):
  """
  Reads text from a markdown file and executes optional preprocessing steps.
 
  :param sources: A list of markdown data sources (file paths or binary objects)
- :param meta: Optional list of metadata to attach to the Documents.
- The length of the list must match the number of paths. Defaults to `None`.
+ :param meta: Optional metadata to attach to the Documents.
+ This value can be either a list of dictionaries or a single dictionary.
+ If it's a single dictionary, its content is added to the metadata of all produced Documents.
+ If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
+ Defaults to `None`.
  :return: A dictionary containing a list of Document objects under the 'documents' key.
  """
  parser = MarkdownIt(renderer_cls=RendererPlain)
  if self.table_to_single_line:
  parser.enable("table")
 
  documents = []
-
- if meta is None:
- meta = [{}] * len(sources)
- elif len(sources) != len(meta):
- raise ValueError("The length of the metadata list must match the number of sources.")
+ meta_list = normalize_metadata(meta=meta, sources_count=len(sources))
 
  for source, metadata in tqdm(
- zip(sources, meta),
+ zip(sources, meta_list),
  total=len(sources),
  desc="Converting markdown files to Documents",
  disable=not self.progress_bar,

diff --git a/haystack/components/converters/pypdf.py b/haystack/components/converters/pypdf.py
@@ -82,7 +82,11 @@ def to_dict(self):
  return default_to_dict(self, converter_name=self.converter_name)
 
  @component.output_types(documents=List[Document])
- def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
+ def run(
+ self,
+ sources: List[Union[str, Path, ByteStream]],
+ meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+ ):
  """
  Converts a list of PDF sources into Document objects using the configured converter.
 

diff --git a/releasenotes/notes/single-meta-in-markdown2document-082bae7b20bd605d.yaml b/releasenotes/notes/single-meta-in-markdown2document-082bae7b20bd605d.yaml
@@ -0,0 +1,4 @@
+---
+features:
+ - |
+ Adds support for single metadata dictionary input in `MarkdownToDocument`.
diff --git a/test/components/converters/test_markdown_to_document.py b/test/components/converters/test_markdown_to_document.py
@@ -31,17 +31,33 @@ def test_run(self, test_files_path):
  assert "What to build with Haystack" in doc.content
  assert "# git clone https://github.com/deepset-ai/haystack.git" in doc.content
 
- def test_run_with_meta(self):
+ def test_run_calls_normalize_metadata(self, test_files_path):
+ bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})
+
+ converter = MarkdownToDocument()
+
+ with patch("haystack.components.converters.markdown.normalize_metadata") as normalize_metadata, patch(
+ "haystack.components.converters.markdown.MarkdownIt"
+ ):
+ converter.run(sources=[bytestream, test_files_path / "markdown" / "sample.md"], meta={"language": "it"})
+
+ # check that the metadata normalizer is called properly
+ normalize_metadata.assert_called_with(meta={"language": "it"}, sources_count=2)
+
+ def test_run_with_meta(self, test_files_path):
  bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})
 
  converter = MarkdownToDocument()
 
  with patch("haystack.components.converters.markdown.MarkdownIt"):
- output = converter.run(sources=[bytestream], meta=[{"language": "it"}])
- document = output["documents"][0]
+ output = converter.run(
+ sources=[bytestream, test_files_path / "markdown" / "sample.md"], meta={"language": "it"}
+ )
 
  # check that the metadata from the bytestream is merged with that from the meta parameter
- assert document.meta == {"author": "test_author", "language": "it"}
+ assert output["documents"][0].meta["author"] == "test_author"
+ assert output["documents"][0].meta["language"] == "it"
+ assert output["documents"][1].meta["language"] == "it"
 
  @pytest.mark.integration
  def test_run_wrong_file_type(self, test_files_path, caplog):