Skip to content

Commit

Permalink
refactor!: Improve PyPDFToDocument (#7362)
Browse files Browse the repository at this point in the history
* first draft

* rm kwargs from protocol

* Simplify

* no breaking changes

* reno

* one more test of the deprecated registry
  • Loading branch information
anakin87 committed Mar 26, 2024
1 parent 19d3f39 commit 6925e3a
Show file tree
Hide file tree
Showing 3 changed files with 137 additions and 18 deletions.
77 changes: 63 additions & 14 deletions haystack/components/converters/pypdf.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import io
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Protocol, Union

from haystack import Document, component, default_to_dict, logging
from haystack import Document, component, default_from_dict, default_to_dict, logging
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
from haystack.dataclasses import ByteStream
from haystack.lazy_imports import LazyImport
from haystack.utils.type_serialization import deserialize_type

with LazyImport("Run 'pip install pypdf'") as pypdf_import:
from pypdf import PdfReader
Expand All @@ -22,6 +24,13 @@ class PyPDFConverter(Protocol):
def convert(self, reader: "PdfReader") -> Document:
...

def to_dict(self):
...

@classmethod
def from_dict(cls, data):
...


class DefaultConverter:
"""
Expand All @@ -33,9 +42,17 @@ def convert(self, reader: "PdfReader") -> Document:
text = "\f".join(page.extract_text() for page in reader.pages)
return Document(content=text)

def to_dict(self):
return default_to_dict(self)

@classmethod
def from_dict(cls, data):
return default_from_dict(cls, data)


# This registry is used to store converters names and instances.
# It can be used to register custom converters.
# It is now deprecated and will be removed in Haystack 2.3.0.
CONVERTERS_REGISTRY: Dict[str, PyPDFConverter] = {"default": DefaultConverter()}


Expand All @@ -59,24 +76,39 @@ class PyPDFToDocument:
```
"""

def __init__(self, converter_name: str = "default"):
def __init__(self, converter_name: Optional[str] = None, converter: Optional[PyPDFConverter] = None):
"""
Create an PyPDFToDocument component.
:param converter_name:
Name of the registered converter to use.
The name of a PyPDFConverter instance stored in the CONVERTERS_REGISTRY. Deprecated.
:param converter:
An instance of a PyPDFConverter compatible class.
"""
pypdf_import.check()

try:
converter = CONVERTERS_REGISTRY[converter_name]
except KeyError:
msg = (
f"Invalid converter_name: {converter_name}.\n Available converters: {list(CONVERTERS_REGISTRY.keys())}"
)
raise ValueError(msg) from KeyError
self.converter_name = converter_name
self._converter: PyPDFConverter = converter
if converter_name:
warnings.warn(
"The `converter_name` parameter is deprecated and will be removed in Haystack 2.3.0. "
"Please use the `converter` parameter instead.",
DeprecationWarning,
)
try:
converter = CONVERTERS_REGISTRY[converter_name]
except KeyError:
msg = (
f"Invalid converter_name: {converter_name}.\n Available converters: {list(CONVERTERS_REGISTRY.keys())}"
f"To specify a custom converter, it is recommended to use the `converter` parameter."
)
raise ValueError(msg) from KeyError

self.converter = converter

elif converter:
self.converter = converter
else:
self.converter = DefaultConverter()

def to_dict(self):
"""
Expand All @@ -85,8 +117,25 @@ def to_dict(self):
:returns:
Dictionary with serialized data.
"""
# do not serialize the _converter instance
return default_to_dict(self, converter_name=self.converter_name)
return default_to_dict(self, converter_name=self.converter_name, converter=self.converter.to_dict())

@classmethod
def from_dict(cls, data):
"""
Deserializes the component from a dictionary.
:param data:
Dictionary with serialized data.
:returns:
Deserialized component.
"""
if data["init_parameters"].get("converter_name"):
return default_from_dict(cls, data)

converter_class = deserialize_type(data["init_parameters"]["converter"]["type"])
data["init_parameters"]["converter"] = converter_class.from_dict(data["init_parameters"]["converter"])
return default_from_dict(cls, data)

@component.output_types(documents=List[Document])
def run(
Expand Down Expand Up @@ -121,7 +170,7 @@ def run(
continue
try:
pdf_reader = PdfReader(io.BytesIO(bytestream.data))
document = self._converter.convert(pdf_reader)
document = self.converter.convert(pdf_reader)
except Exception as e:
logger.warning(
"Could not read {source} and convert it to Document, skipping. {error}", source=source, error=e
Expand Down
12 changes: 12 additions & 0 deletions releasenotes/notes/pypdf-refactoring-de869c91b42ce5b6.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
---
enhancements:
- |
Refactor `PyPDFToDocument` to simplify support for custom PDF converters.
PDF converters are classes that implement the `PyPDFConverter` protocol and have 3 methods:
`convert`, `to_dict` and `from_dict`.
The `DefaultConverter` class is provided as a default implementation.
deprecations:
- |
Using the `converter_name` parameter in the `PyPDFToDocument` component is deprecated.
It will be removed in the 2.3.0 release.
Use the `converter` parameter instead.
66 changes: 62 additions & 4 deletions test/components/converters/test_pypdf_to_document.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import logging
from unittest.mock import patch

import pytest

from haystack import Document
from haystack.components.converters.pypdf import PyPDFToDocument, CONVERTERS_REGISTRY
from haystack import Document, default_from_dict, default_to_dict
from haystack.components.converters.pypdf import CONVERTERS_REGISTRY, DefaultConverter, PyPDFToDocument
from haystack.dataclasses import ByteStream


Expand All @@ -14,13 +15,46 @@ def pypdf_converter():

class TestPyPDFToDocument:
def test_init(self, pypdf_converter):
assert pypdf_converter.converter_name == "default"
assert hasattr(pypdf_converter, "_converter")
assert isinstance(pypdf_converter.converter, DefaultConverter)
assert pypdf_converter.converter_name is None

def test_init_fail_nonexisting_converter(self):
with pytest.raises(ValueError):
PyPDFToDocument(converter_name="non_existing_converter")

def test_to_dict(self, pypdf_converter):
data = pypdf_converter.to_dict()
assert data == {
"type": "haystack.components.converters.pypdf.PyPDFToDocument",
"init_parameters": {
"converter": {"type": "haystack.components.converters.pypdf.DefaultConverter", "init_parameters": {}},
"converter_name": None,
},
}

def test_from_dict(self):
data = {
"type": "haystack.components.converters.pypdf.PyPDFToDocument",
"init_parameters": {
"converter": {"type": "haystack.components.converters.pypdf.DefaultConverter", "init_parameters": {}}
},
}
instance = PyPDFToDocument.from_dict(data)
assert isinstance(instance, PyPDFToDocument)
assert isinstance(instance.converter, DefaultConverter)
assert instance.converter_name is None

def test_from_dict_with_converter_name(self):
data = {
"type": "haystack.components.converters.pypdf.PyPDFToDocument",
"init_parameters": {"converter_name": "default"},
}

instance = PyPDFToDocument.from_dict(data)
assert isinstance(instance, PyPDFToDocument)
assert isinstance(instance.converter, DefaultConverter)
assert instance.converter_name == "default"

@pytest.mark.integration
def test_run(self, test_files_path, pypdf_converter):
"""
Expand Down Expand Up @@ -86,6 +120,30 @@ def test_custom_converter(self, test_files_path):

paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]

class MyCustomConverter:
def convert(self, reader: PdfReader) -> Document:
return Document(content="I don't care about converting given pdfs, I always return this")

def to_dict(self):
return default_to_dict(self)

@classmethod
def from_dict(cls, data):
return default_from_dict(cls, data)

component = PyPDFToDocument(converter=MyCustomConverter())
output = component.run(sources=paths)
docs = output["documents"]
assert len(docs) == 1
assert "ReAct" not in docs[0].content
assert "I don't care about converting given pdfs, I always return this" in docs[0].content

@pytest.mark.integration
def test_custom_converter_deprecated(self, test_files_path):
from pypdf import PdfReader

paths = [test_files_path / "pdf" / "sample_pdf_1.pdf"]

class MyCustomConverter:
def convert(self, reader: PdfReader) -> Document:
return Document(content="I don't care about converting given pdfs, I always return this")
Expand Down

0 comments on commit 6925e3a

Please sign in to comment.