Skip to content

Commit

Permalink
feat: support single metadata dictionary in MarkdownToDocument (#6629)
Browse files Browse the repository at this point in the history
* support single metadata dict in markdown2document

* reno

* unwrap list

* direct key access

* typing

* add explicit test
  • Loading branch information
ZanSara committed Jan 9, 2024
1 parent 9ace6bf commit abd16ab
Show file tree
Hide file tree
Showing 5 changed files with 48 additions and 17 deletions.
6 changes: 5 additions & 1 deletion haystack/components/converters/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,11 @@ def __init__(
self.extractor_type = extractor_type

@component.output_types(documents=List[Document])
def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
def run(
self,
sources: List[Union[str, Path, ByteStream]],
meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
):
"""
Converts a list of HTML files to Documents.
Expand Down
25 changes: 14 additions & 11 deletions haystack/components/converters/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from haystack import Document, component
from haystack.dataclasses import ByteStream
from haystack.lazy_imports import LazyImport
from haystack.components.converters.utils import get_bytestream_from_source
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata

with LazyImport("Run 'pip install markdown-it-py mdit_plain'") as markdown_conversion_imports:
from markdown_it import MarkdownIt
Expand All @@ -27,7 +27,7 @@ class MarkdownToDocument:
from haystack.components.converters.markdown import MarkdownToDocument
converter = MarkdownToDocument()
results = converter.run(sources=["sample.md"])
results = converter.run(sources=["sample.md"], meta={"date_added": datetime.now().isoformat()})
documents = results["documents"]
print(documents[0].content)
# 'This is a text from the markdown file.'
Expand All @@ -45,28 +45,31 @@ def __init__(self, table_to_single_line: bool = False, progress_bar: bool = True
self.progress_bar = progress_bar

@component.output_types(documents=List[Document])
def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
def run(
self,
sources: List[Union[str, Path, ByteStream]],
meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
):
"""
Reads text from a markdown file and executes optional preprocessing steps.
:param sources: A list of markdown data sources (file paths or binary objects)
:param meta: Optional list of metadata to attach to the Documents.
The length of the list must match the number of paths. Defaults to `None`.
:param meta: Optional metadata to attach to the Documents.
This value can be either a list of dictionaries or a single dictionary.
If it's a single dictionary, its content is added to the metadata of all produced Documents.
If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
Defaults to `None`.
:return: A dictionary containing a list of Document objects under the 'documents' key.
"""
parser = MarkdownIt(renderer_cls=RendererPlain)
if self.table_to_single_line:
parser.enable("table")

documents = []

if meta is None:
meta = [{}] * len(sources)
elif len(sources) != len(meta):
raise ValueError("The length of the metadata list must match the number of sources.")
meta_list = normalize_metadata(meta=meta, sources_count=len(sources))

for source, metadata in tqdm(
zip(sources, meta),
zip(sources, meta_list),
total=len(sources),
desc="Converting markdown files to Documents",
disable=not self.progress_bar,
Expand Down
6 changes: 5 additions & 1 deletion haystack/components/converters/pypdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,11 @@ def to_dict(self):
return default_to_dict(self, converter_name=self.converter_name)

@component.output_types(documents=List[Document])
def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
def run(
self,
sources: List[Union[str, Path, ByteStream]],
meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
):
"""
Converts a list of PDF sources into Document objects using the configured converter.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
features:
- |
Adds support for single metadata dictionary input in `MarkdownToDocument`.
24 changes: 20 additions & 4 deletions test/components/converters/test_markdown_to_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,33 @@ def test_run(self, test_files_path):
assert "What to build with Haystack" in doc.content
assert "# git clone https://github.com/deepset-ai/haystack.git" in doc.content

def test_run_with_meta(self):
def test_run_calls_normalize_metadata(self, test_files_path):
bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})

converter = MarkdownToDocument()

with patch("haystack.components.converters.markdown.normalize_metadata") as normalize_metadata, patch(
"haystack.components.converters.markdown.MarkdownIt"
):
converter.run(sources=[bytestream, test_files_path / "markdown" / "sample.md"], meta={"language": "it"})

# check that the metadata normalizer is called properly
normalize_metadata.assert_called_with(meta={"language": "it"}, sources_count=2)

def test_run_with_meta(self, test_files_path):
bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})

converter = MarkdownToDocument()

with patch("haystack.components.converters.markdown.MarkdownIt"):
output = converter.run(sources=[bytestream], meta=[{"language": "it"}])
document = output["documents"][0]
output = converter.run(
sources=[bytestream, test_files_path / "markdown" / "sample.md"], meta={"language": "it"}
)

# check that the metadata from the bytestream is merged with that from the meta parameter
assert document.meta == {"author": "test_author", "language": "it"}
assert output["documents"][0].meta["author"] == "test_author"
assert output["documents"][0].meta["language"] == "it"
assert output["documents"][1].meta["language"] == "it"

@pytest.mark.integration
def test_run_wrong_file_type(self, test_files_path, caplog):
Expand Down

0 comments on commit abd16ab

Please sign in to comment.