Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Azure converter updates #7409

Merged
merged 13 commits into from
Apr 9, 2024
Prev Previous commit
Next Next commit
Minor touch ups
  • Loading branch information
vblagoje committed Mar 27, 2024
commit a062fbd6751c4b1c96076a3b52ba5f883ed1c6d6
15 changes: 11 additions & 4 deletions haystack/components/converters/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,14 @@ def _convert_tables_and_text(self, result: "AnalyzeResult", meta: Optional[Dict[
docs = [*tables, text]
return docs

def _convert_tables(self, result: "AnalyzeResult", meta: Optional[Dict[str, Any]]) -> List[Document]:
def _convert_tables(self, result: AnalyzeResult, meta: Optional[Dict[str, Any]]) -> List[Document]:
"""
Converts the tables extracted by Azure's Document Intelligence service into Haystack Documents.
:param result: The AnalyzeResult Azure object
:param meta: Optional dictionary with metadata that shall be attached to all resulting documents.

:returns: List of Documents containing the tables extracted from the AnalyzeResult object.
"""
converted_tables: List[Document] = []

if not result.tables:
Expand Down Expand Up @@ -281,7 +288,7 @@ def _convert_tables(self, result: "AnalyzeResult", meta: Optional[Dict[str, Any]

return converted_tables

def _convert_to_natural_text(self, result: "AnalyzeResult", meta: Optional[Dict[str, Any]]) -> Document:
def _convert_to_natural_text(self, result: AnalyzeResult, meta: Optional[Dict[str, Any]]) -> Document:
"""
This converts the `AnalyzeResult` object into a single Document. We add "\f" separators between pages to
differentiate the text on separate pages. This is the expected format for the PreProcessor.
Expand All @@ -296,7 +303,7 @@ def _convert_to_natural_text(self, result: "AnalyzeResult", meta: Optional[Dict[

texts = []
if result.paragraphs:
paragraphs_to_pages: Dict = defaultdict(str)
paragraphs_to_pages: Dict[int, str] = defaultdict(str)
for paragraph in result.paragraphs:
if paragraph.bounding_regions:
# If paragraph spans multiple pages we group it with the first page number
Expand All @@ -311,7 +318,7 @@ def _convert_to_natural_text(self, result: "AnalyzeResult", meta: Optional[Dict[
continue
paragraphs_to_pages[page_numbers[0]] += paragraph.content + "\n"

max_page_number = max(n for n in paragraphs_to_pages)
max_page_number: int = max(paragraphs_to_pages)
for page_idx in range(1, max_page_number + 1):
# We add empty strings for missing pages so the preprocessor can still extract the correct page number
# from the original PDF.
Expand Down