Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

docs: Converters - add usage examples #6556

Merged
merged 28 commits into from
Dec 15, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
7555912
standardize converters inputs: first draft
anakin87 Dec 13, 2023
69aa402
fix precommit
anakin87 Dec 13, 2023
5508b36
fix precommit 2
anakin87 Dec 13, 2023
b3b5625
fix precommit 3
anakin87 Dec 13, 2023
4d71ba8
add default for optional param
anakin87 Dec 13, 2023
63752f0
rm leftover
anakin87 Dec 13, 2023
0ba43bd
install boilerpy in linting workflow
anakin87 Dec 14, 2023
50a7ae2
add boilerpy3 to the core dependencies
masci Dec 14, 2023
cc9bbbd
add reno
anakin87 Dec 14, 2023
5d05954
remove boilerpy3 installation from test workflow
julian-risch Dec 14, 2023
ca0a5e1
fix pylint: import order and unused import
julian-risch Dec 14, 2023
879e737
fix import order
julian-risch Dec 14, 2023
3b9da7a
add release note
masci Dec 14, 2023
cabbb11
better Tika docstring
anakin87 Dec 14, 2023
09c60ab
Merge branch 'massi/boilerpy3' into converters-standardize-sources
anakin87 Dec 14, 2023
b381194
rm boilerpy from linting
anakin87 Dec 14, 2023
ba8c1d0
leftover
anakin87 Dec 14, 2023
fd9d17f
Merge branch 'main' into converters-standardize-sources
anakin87 Dec 14, 2023
1c1b235
first impl for html
anakin87 Dec 14, 2023
989e48e
progressing on other components
anakin87 Dec 14, 2023
ab44436
fix test
anakin87 Dec 14, 2023
a212ddc
add tests - run with meta
anakin87 Dec 14, 2023
c5d6d29
release note
anakin87 Dec 14, 2023
6e9a853
reintroduce patches wrongly deleted
anakin87 Dec 14, 2023
22171e9
add patch in test
anakin87 Dec 14, 2023
5c894f0
fix tika test
anakin87 Dec 14, 2023
622f209
add usage examples
anakin87 Dec 15, 2023
b48a9fa
Merge branch 'main' into converters-add-usage-examples
anakin87 Dec 15, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
progressing on other components
  • Loading branch information
anakin87 committed Dec 14, 2023
commit 989e48e55bab4678152f72bee8d08049b89384c1
18 changes: 15 additions & 3 deletions haystack/components/converters/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ def __init__(self, endpoint: str, api_key: Optional[str] = None, model_id: str =
self.endpoint = endpoint
self.model_id = model_id

@component.output_types(documents=List[Document], azure=List[Dict])
def run(self, sources: List[Union[str, Path, ByteStream]]):
@component.output_types(documents=List[Document], raw_azure_response=List[Dict])
def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
"""
Convert files to Documents using Azure's Document Intelligence service.

Expand All @@ -66,10 +66,20 @@ def run(self, sources: List[Union[str, Path, ByteStream]]):
the raw responses from Azure's Document Intelligence service.

:param sources: List of file paths or ByteStream objects.
:param meta: Optional list of metadata to attach to the Documents.
The length of the list must match the number of sources. Defaults to `None`.
:return: A dictionary containing a list of Document objects under the 'documents' key.
It also contains the raw Azure response under the 'raw_azure_response' key.
"""
documents = []
azure_output = []
for source in sources:

if meta is None:
meta = [{}] * len(sources)
elif len(sources) != len(meta):
raise ValueError("The length of the metadata list must match the number of sources.")

for source, metadata in zip(sources, meta):
try:
bytestream = get_bytestream_from_source(source=source)
except Exception as e:
Expand All @@ -87,6 +97,8 @@ def run(self, sources: List[Union[str, Path, ByteStream]]):
file_suffix = Path(bytestream.metadata["file_path"]).suffix

document = AzureOCRDocumentConverter._convert_azure_result_to_document(result, file_suffix)
merged_metadata = {**bytestream.metadata, **metadata}
document.meta = merged_metadata
documents.append(document)

return {"documents": documents, "raw_azure_response": azure_output}
Expand Down
4 changes: 2 additions & 2 deletions haystack/components/converters/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[D

:param sources: List of HTML file paths or ByteStream objects.
:param meta: Optional list of metadata to attach to the Documents.
The length of the list must match the number of sources. Defaults to `None`.
:return: List of converted Documents.
The length of the list must match the number of sources. Defaults to `None`.
:return: A dictionary containing a list of Document objects under the 'documents' key.
"""

documents = []
Expand Down
9 changes: 7 additions & 2 deletions haystack/components/converters/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,15 +51,19 @@ def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[D

:param sources: A list of markdown data sources (file paths or binary objects)
:param meta: Optional list of metadata to attach to the Documents.
The length of the list must match the number of paths. Defaults to `None`.
The length of the list must match the number of sources. Defaults to `None`.
:return: A dictionary containing a list of Document objects under the 'documents' key.
"""
parser = MarkdownIt(renderer_cls=RendererPlain)
if self.table_to_single_line:
parser.enable("table")

documents = []

if meta is None:
meta = [{}] * len(sources)
elif len(sources) != len(meta):
raise ValueError("The length of the metadata list must match the number of sources.")

for source, metadata in tqdm(
zip(sources, meta),
Expand All @@ -79,7 +83,8 @@ def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[D
logger.warning("Failed to extract text from %s. Skipping it. Error: %s", source, conversion_e)
continue

document = Document(content=text, meta=metadata)
merged_metadata = {**bytestream.metadata, **metadata}
document = Document(content=text, meta=merged_metadata)
documents.append(document)

return {"documents": documents}
17 changes: 14 additions & 3 deletions haystack/components/converters/pypdf.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import io
import logging
from typing import List, Union, Protocol, Dict
from typing import List, Union, Protocol, Dict, Any, Optional
from pathlib import Path

from haystack.dataclasses import ByteStream
Expand Down Expand Up @@ -71,15 +71,23 @@ def to_dict(self):
return default_to_dict(self, converter_name=self.converter_name)

@component.output_types(documents=List[Document])
def run(self, sources: List[Union[str, Path, ByteStream]]):
def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
"""
Converts a list of PDF sources into Document objects using the configured converter.

:param sources: A list of PDF data sources, which can be file paths or ByteStream objects.
:param meta: Optional list of metadata to attach to the Documents.
The length of the list must match the number of sources. Defaults to `None`.
:return: A dictionary containing a list of Document objects under the 'documents' key.
"""
documents = []
for source in sources:

if meta is None:
meta = [{}] * len(sources)
elif len(sources) != len(meta):
raise ValueError("The length of the metadata list must match the number of sources.")

for source, metadata in zip(sources, meta):
try:
bytestream = get_bytestream_from_source(source)
except Exception as e:
Expand All @@ -91,6 +99,9 @@ def run(self, sources: List[Union[str, Path, ByteStream]]):
except Exception as e:
logger.warning("Could not read %s and convert it to Document, skipping. %s", source, e)
continue

merged_metadata = {**bytestream.metadata, **metadata}
document.meta = merged_metadata
documents.append(document)

return {"documents": documents}
19 changes: 15 additions & 4 deletions haystack/components/converters/tika.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging
from pathlib import Path
from typing import List, Union
from typing import List, Union, Dict, Any, Optional
import io

from haystack.lazy_imports import LazyImport
Expand Down Expand Up @@ -37,15 +37,24 @@ def __init__(self, tika_url: str = "https://localhost:9998/tika"):
self.tika_url = tika_url

@component.output_types(documents=List[Document])
def run(self, sources: List[Union[str, Path, ByteStream]]):
def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
"""
Convert files to Documents.

:param sources: List of file paths or ByteStream objects.
:param meta: Optional list of metadata to attach to the Documents.
The length of the list must match the number of sources. Defaults to `None`.
:return: A dictionary containing a list of Document objects under the 'documents' key.
"""

documents = []
for source in sources:

if meta is None:
meta = [{}] * len(sources)
elif len(sources) != len(meta):
raise ValueError("The length of the metadata list must match the number of sources.")

for source, metadata in zip(sources, meta):
try:
bytestream = get_bytestream_from_source(source)
except Exception as e:
Expand All @@ -56,6 +65,8 @@ def run(self, sources: List[Union[str, Path, ByteStream]]):
except Exception as conversion_e:
logger.warning("Failed to extract text from %s. Skipping it. Error: %s", source, conversion_e)
continue
document = Document(content=text)

merged_metadata = {**bytestream.metadata, **metadata}
document = Document(content=text, meta=merged_metadata)
documents.append(document)
return {"documents": documents}
31 changes: 21 additions & 10 deletions haystack/components/converters/txt.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging
from pathlib import Path
from typing import List, Union
from typing import List, Union, Dict, Any, Optional

from haystack import Document, component
from haystack.dataclasses import ByteStream
Expand All @@ -27,28 +27,39 @@ def __init__(self, encoding: str = "utf-8"):
self.encoding = encoding

@component.output_types(documents=List[Document])
def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
    """
    Convert text files to Documents.

    :param sources: A list of paths to text files or ByteStream objects.
        Note that if an encoding is specified in the metadata of a ByteStream,
        it will override the component's default.
    :param meta: Optional list of metadata to attach to the Documents.
        The length of the list must match the number of sources. Defaults to `None`.
        On key collisions, values given here take precedence over the
        ByteStream's own metadata.
    :return: A dictionary containing a list of Document objects under the 'documents' key.
        Sources that cannot be read or decoded are logged and skipped, so the
        list may be shorter than `sources`.
    :raises ValueError: If `meta` is provided and its length differs from the number of sources.
    """
    documents = []

    if meta is None:
        # The same empty dict is reused for every source; this is safe because
        # it is only unpacked into a fresh dict below and never mutated.
        meta = [{}] * len(sources)
    elif len(sources) != len(meta):
        raise ValueError("The length of the metadata list must match the number of sources.")

    for source, metadata in zip(sources, meta):
        try:
            bytestream = get_bytestream_from_source(source)
        except Exception as e:
            logger.warning("Could not read %s. Skipping it. Error: %s", source, e)
            continue
        try:
            encoding = bytestream.metadata.get("encoding", self.encoding)
            # Decode to a plain string; the Document is built once, below.
            # Wrapping the decoded text in a Document here would nest a
            # Document inside another Document's `content`.
            text = bytestream.data.decode(encoding)
        except Exception as e:
            logger.warning("Could not convert file %s. Skipping it. Error message: %s", source, e)
            continue

        # User-supplied metadata is unpacked last so it overrides stream metadata.
        merged_metadata = {**bytestream.metadata, **metadata}
        document = Document(content=text, meta=merged_metadata)
        documents.append(document)

    return {"documents": documents}