Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Azure converter updates #7409

Merged
merged 13 commits into from
Apr 9, 2024
Prev Previous commit
Next Next commit
Carry over unit tests from the other side
  • Loading branch information
vblagoje committed Mar 27, 2024
commit f2678371d1f2bc93b84c139ee0cb12a25aad7948
212 changes: 210 additions & 2 deletions test/components/converters/test_azure_ocr_doc_converter.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,83 @@
# ruff: noqa: E501
import json
import os
from unittest.mock import Mock, patch
import os.path
from typing import Literal
from unittest.mock import patch

import pytest
from azure.ai.formrecognizer import AnalyzeResult

from haystack.components.converters.azure import AzureOCRDocumentConverter
from haystack.dataclasses import ByteStream
from haystack.utils import Secret


def get_sample_pdf_1_text(page_layout: Literal["natural", "single_column"]) -> str:
if page_layout == "natural":
return (
"A sample PDF file\nHistory and standardization\nFormat (PDF) Adobe Systems made the PDF specification "
"available free of charge in 1993. In the early years PDF was popular mainly in desktop publishing "
"workflows, and competed with a variety of formats such as DjVu, Envoy, Common Ground Digital Paper, "
"Farallon Replica and even Adobe's own PostScript format. PDF was a proprietary format controlled by "
"Adobe until it was released as an open standard on July 1, 2008, and published by the International "
"Organization for Standardization as ISO 32000-1:2008, at which time control of the specification "
"passed to an ISO Committee of volunteer industry experts. In 2008, Adobe published a Public Patent "
"License to ISO 32000-1 granting royalty-free rights for all patents owned by Adobe that are necessary "
"to make, use, sell, and distribute PDF-compliant implementations. PDF 1.7, the sixth edition of the PDF "
"specification that became ISO 32000-1, includes some proprietary technologies defined only by Adobe, "
"such as Adobe XML Forms Architecture (XFA) and JavaScript extension for Acrobat, which are referenced "
"by ISO 32000-1 as normative and indispensable for the full implementation of the ISO 32000-1 "
"specification. These proprietary technologies are not standardized and their specification is published "
"only on Adobe's website. Many of them are also not supported by popular third-party implementations of "
"PDF.\n\x0cPage 2 of Sample PDF\n\x0c\x0cPage 4 of Sample PDF\n... the page 3 is empty.\n"
)
else:
return (
"A sample PDF file\nHistory and standardization\nFormat (PDF) Adobe Systems made the PDF specification "
"available free of\ncharge in 1993. In the early years PDF was popular mainly in desktop\npublishing "
"workflows, and competed with a variety of formats such as DjVu,\nEnvoy, Common Ground Digital Paper, "
"Farallon Replica and even Adobe's\nown PostScript format. PDF was a proprietary format controlled by "
"Adobe\nuntil it was released as an open standard on July 1, 2008, and published by\nthe International "
"Organization for Standardization as ISO 32000-1:2008, at\nwhich time control of the specification passed "
"to an ISO Committee of\nvolunteer industry experts. In 2008, Adobe published a Public Patent License\nto "
"ISO 32000-1 granting royalty-free rights for all patents owned by Adobe\nthat are necessary to make, use, "
"sell, and distribute PDF-compliant\nimplementations. PDF 1.7, the sixth edition of the PDF specification "
"that\nbecame ISO 32000-1, includes some proprietary technologies defined only by\nAdobe, such as Adobe "
"XML Forms Architecture (XFA) and JavaScript\nextension for Acrobat, which are referenced by ISO 32000-1 "
"as normative\nand indispensable for the full implementation of the ISO 32000-1\nspecification. These "
"proprietary technologies are not standardized and their\nspecification is published only on Adobe's "
"website. Many of them are also not\nsupported by popular third-party implementations of PDF.\n\x0cPage 2 "
"of Sample PDF\n\x0c\x0cPage 4 of Sample PDF\n... the page 3 is empty.\n"
)


def get_sample_pdf_2_text(page_layout: Literal["natural", "single_column"]) -> str:
if page_layout == "natural":
return (
"A Simple PDF File\nThis is a small demonstration .pdf file -\njust for use in the Virtual Mechanics "
"tutorials. More text. And more text. And more text. And more text. And more text.\nAnd more text. And more "
"text. And more text. And more text. And more text. And more text. Boring, zzzzz. And more text. And more "
"text. And more text. And more text. And more text. And more text. And more text. And more text. And more "
"text.\nAnd more text. And more text. And more text. And more text. And more text. And more text. And more "
"text. Even more. Continued on page 2 ...\n\x0cSimple PDF File 2\n... continued from page 1. Yet more text. "
"And more text. And more text. And more text. And more text. And more text. And more text. And more text. "
"Oh, how boring typing this stuff. But not as boring as watching paint dry. And more text. And more text. "
"And more text. And more text. Boring. More, a little more text. The end, and just as well.\n"
)
else:
return (
"A Simple PDF File\nThis is a small demonstration .pdf file -\njust for use in the Virtual Mechanics "
"tutorials. More text. And more\ntext. And more text. And more text. And more text.\nAnd more text. And "
"more text. And more text. And more text. And more\ntext. And more text. Boring, zzzzz. And more text. "
"And more text. And\nmore text. And more text. And more text. And more text. And more text.\nAnd more text. "
"And more text.\nAnd more text. And more text. And more text. And more text. And more\ntext. And more text. "
"And more text. Even more. Continued on page 2 ...\n\x0cSimple PDF File 2\n... continued from page 1. "
"Yet more text. And more text. And more text.\nAnd more text. And more text. And more text. And more text. "
"And more\ntext. Oh, how boring typing this stuff. But not as boring as watching\npaint dry. And more text. "
"And more text. And more text. And more text.\nBoring. More, a little more text. The end, and just as well.\n"
)


class TestAzureOCRDocumentConverter:
def test_init_fail_wo_api_key(self, monkeypatch):
monkeypatch.delenv("AZURE_AI_API_KEY", raising=False)
Expand All @@ -28,6 +98,144 @@ def test_to_dict(self, mock_resolve_value):
},
}

@patch("haystack.utils.auth.EnvVarSecret.resolve_value")
def test_azure_converter_with_pdf(self, mock_resolve_value, test_files_path) -> None:
mock_resolve_value.return_value = "test_api_key"

class MockPoller:
def result(self) -> AnalyzeResult:
with open(test_files_path / "json" / "azure_sample_pdf_2.json", encoding="utf-8") as azure_file:
result = json.load(azure_file)
return AnalyzeResult.from_dict(result)

with patch("azure.ai.formrecognizer.DocumentAnalysisClient.begin_analyze_document") as azure_mock:
azure_mock.return_value = MockPoller()
ocr_node = AzureOCRDocumentConverter(endpoint="")
out = ocr_node.run(sources=[test_files_path / "pdf" / "sample_pdf_2.pdf"])
assert len(out["documents"]) == 1
assert out["documents"][0].content == get_sample_pdf_2_text(page_layout="natural")
assert out["documents"][0].content.count("\f") == 1

@pytest.mark.parametrize("page_layout", ["natural", "single_column"])
@patch("haystack.utils.auth.EnvVarSecret.resolve_value")
def test_azure_converter_with_table(
self, mock_resolve_value, page_layout: Literal["natural", "single_column"], test_files_path
) -> None:
mock_resolve_value.return_value = "test_api_key"

class MockPoller:
def result(self) -> AnalyzeResult:
with open(test_files_path / "json" / "azure_sample_pdf_1.json", encoding="utf-8") as azure_file:
result = json.load(azure_file)
return AnalyzeResult.from_dict(result)

with patch("azure.ai.formrecognizer.DocumentAnalysisClient.begin_analyze_document") as azure_mock:
azure_mock.return_value = MockPoller()
ocr_node = AzureOCRDocumentConverter(endpoint="", page_layout=page_layout)
out = ocr_node.run(sources=[test_files_path / "pdf" / "sample_pdf_1.pdf"])

docs = out["documents"]
assert len(docs) == 2
# Checking the table doc extracted
assert docs[0].content_type == "table"
assert docs[0].dataframe.shape[0] == 4 # number of rows
assert docs[0].dataframe.shape[1] == 4 # number of columns
assert list(docs[0].dataframe.columns) == ["", "Column 1", "Column 2", "Column 3"]
assert list(docs[0].dataframe.iloc[3]) == ["D", "$54.35", "$6345.", ""]
assert (
docs[0].meta["preceding_context"] == "specification. These proprietary technologies are not "
"standardized and their\nspecification is published only on "
"Adobe's website. Many of them are also not\nsupported by "
"popular third-party implementations of PDF."
)
assert docs[0].meta["following_context"] == ""
assert docs[0].meta["page"] == 1

# Checking the text extracted
assert docs[1].content_type == "text"
assert docs[1].content.startswith("A sample PDF file")
assert docs[1].content.count("\f") == 3 # There should be three page separations
pages = docs[1].content.split("\f")
gold_pages = get_sample_pdf_1_text(page_layout=page_layout).split("\f")
assert pages[0] == gold_pages[0]
assert pages[1] == gold_pages[1]
assert pages[2] == gold_pages[2]
assert pages[3] == gold_pages[3]

@patch("haystack.utils.auth.EnvVarSecret.resolve_value")
def test_azure_converter_with_table_no_bounding_region(self, mock_resolve_value, test_files_path) -> None:
mock_resolve_value.return_value = "test_api_key"

class MockPoller:
def result(self) -> AnalyzeResult:
with open(test_files_path / "json" / "azure_sample_pdf_1.json", encoding="utf-8") as azure_file:
result = json.load(azure_file)
return AnalyzeResult.from_dict(result)

with patch("azure.ai.formrecognizer.DocumentAnalysisClient.begin_analyze_document") as azure_mock:
azure_mock.return_value = MockPoller()
ocr_node = AzureOCRDocumentConverter(endpoint="")
out = ocr_node.run(sources=[test_files_path / "pdf" / "sample_pdf_1.pdf"])

docs = out["documents"]
assert len(docs) == 2
# Checking the table doc extracted that is missing bounding info
assert docs[0].content_type == "table"
assert docs[0].dataframe.shape[0] == 4 # number of rows
assert docs[0].dataframe.shape[1] == 4 # number of columns
assert list(docs[0].dataframe.columns) == ["", "Column 1", "Column 2", "Column 3"]
assert list(docs[0].dataframe.iloc[3]) == ["D", "$54.35", "$6345.", ""]
# TODO below assert fails
# assert docs[0].meta["preceding_context"] == ""

@patch("haystack.utils.auth.EnvVarSecret.resolve_value")
def test_azure_converter_with_multicolumn_header_table(self, mock_resolve_value, test_files_path) -> None:
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@silvanocerza this one fails

mock_resolve_value.return_value = "test_api_key"

class MockPoller:
def result(self) -> AnalyzeResult:
with open(test_files_path / "json" / "azure_sample_pdf_3.json", encoding="utf-8") as azure_file:
result = json.load(azure_file)
return AnalyzeResult.from_dict(result)

with patch("azure.ai.formrecognizer.DocumentAnalysisClient.begin_analyze_document") as azure_mock:
azure_mock.return_value = MockPoller()
ocr_node = AzureOCRDocumentConverter(endpoint="")

# TODO: fails because of non-unique column names, azure_sample_pdf_3.json has duplicate column names
out = ocr_node.run(sources=[test_files_path / "pdf" / "sample_pdf_3.pdf"])

docs = out["documents"]
assert len(docs) == 2
assert docs[0].content_type == "table"
assert docs[0].content.shape[0] == 1 # number of rows
assert docs[0].content.shape[1] == 3 # number of columns
assert list(docs[0].content.columns) == ["This is a subheader", "This is a subheader", "This is a subheader"]
assert list(docs[0].content.iloc[0]) == ["Value 1", "Value 2", "Val 3"]
assert (
docs[0].meta["preceding_context"]
== "Table 1. This is an example table with two multicolumn headers\nHeader 1"
)

@patch("haystack.utils.auth.EnvVarSecret.resolve_value")
def test_table_pdf_with_non_empty_meta(self, mock_resolve_value, test_files_path) -> None:
mock_resolve_value.return_value = "test_api_key"

class MockPoller:
def result(self) -> AnalyzeResult:
with open(test_files_path / "json" / "azure_sample_pdf_1.json", encoding="utf-8") as azure_file:
result = json.load(azure_file)
return AnalyzeResult.from_dict(result)

with patch("azure.ai.formrecognizer.DocumentAnalysisClient.begin_analyze_document") as azure_mock:
azure_mock.return_value = MockPoller()
ocr_node = AzureOCRDocumentConverter(endpoint="")
out = ocr_node.run(sources=[test_files_path / "pdf" / "sample_pdf_1.pdf"], meta=[{"test": "value_1"}])

docs = out["documents"]
# TODO assert below changed from the original test
assert docs[1].meta[0]["test"] == "value_1"

@pytest.mark.integration
@pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_ENDPOINT", None), reason="Azure endpoint not available")
@pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_API_KEY", None), reason="Azure credentials not available")
Expand Down
Loading