deepset-ai · ZanSara · Dec 22, 2023 · Dec 21, 2023 · Dec 22, 2023 · Dec 22, 2023
@@ -6,7 +6,7 @@
 from haystack.dataclasses import ByteStream
 from haystack.lazy_imports import LazyImport
 from haystack import Document, component, default_to_dict
-from haystack.components.converters.utils import get_bytestream_from_source
+from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
 
 with LazyImport("Run 'pip install pypdf'") as pypdf_import:
  from pypdf import PdfReader
@@ -52,7 +52,7 @@ class PyPDFToDocument:
  from haystack.components.converters.pypdf import PyPDFToDocument
 
  converter = PyPDFToDocument()
- results = converter.run(sources=["sample.pdf"])
+ results = converter.run(sources=["sample.pdf"], meta={"date_added": datetime.now().isoformat()})
  documents = results["documents"]
  print(documents[0].content)
  # 'This is a text from the PDF file.'
@@ -87,18 +87,17 @@ def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[D
  Converts a list of PDF sources into Document objects using the configured converter.
 
  :param sources: A list of PDF data sources, which can be file paths or ByteStream objects.
- :param meta: Optional list of metadata to attach to the Documents.
- The length of the list must match the number of sources. Defaults to `None`.
+ :param meta: Optional metadata to attach to the Documents.
+ This value can be either a list of dictionaries or a single dictionary.
+ If it's a single dictionary, its content is added to the metadata of all produced Documents.
+ If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
+ Defaults to `None`.
  :return: A dictionary containing a list of Document objects under the 'documents' key.
  """
  documents = []
+ meta_list = normalize_metadata(meta, sources_count=len(sources))
 
- if meta is None:
- meta = [{}] * len(sources)
- elif len(sources) != len(meta):
- raise ValueError("The length of the metadata list must match the number of sources.")
-
- for source, metadata in zip(sources, meta):
+ for source, metadata in zip(sources, meta_list):
  try:
  bytestream = get_bytestream_from_source(source)
  except Exception as e:

@@ -0,0 +1,5 @@
+
+---
+enhancements:
+ - |
+ Adds support for passing a single metadata dictionary to `PyPDFToDocument`; its content is added to the metadata of every produced Document.
@@ -29,17 +29,19 @@ def test_run(self, test_files_path):
  assert len(docs) == 1
  assert "ReAct" in docs[0].content
 
- def test_run_with_meta(self):
+ def test_run_with_meta(self, test_files_path):
  bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})
 
  converter = PyPDFToDocument()
  with patch("haystack.components.converters.pypdf.PdfReader"):
- output = converter.run(sources=[bytestream], meta=[{"language": "it"}])
-
- document = output["documents"][0]
+ output = converter.run(
+ sources=[bytestream, test_files_path / "pdf" / "react_paper.pdf"], meta={"language": "it"}
+  )
 
  # check that the metadata from the bytestream is merged with that from the meta parameter
- assert document.meta == {"author": "test_author", "language": "it"}
+ assert output["documents"][0].meta["author"] == "test_author"
+ assert output["documents"][0].meta["language"] == "it"
+ assert output["documents"][1].meta["language"] == "it"
 
  def test_run_error_handling(self, test_files_path, caplog):
  """