Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Azure converter updates #7409

Merged
merged 13 commits into from
Apr 9, 2024
Prev Previous commit
Next Next commit
Fix pylint and mypy
  • Loading branch information
vblagoje committed Mar 27, 2024
commit 60f7466cb01ec998898cc0e1e46bcdcf6dee32a0
16 changes: 7 additions & 9 deletions haystack/components/converters/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,7 @@ def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[D
"""
documents = []
azure_output = []
meta_list = normalize_metadata(meta=meta, sources_count=len(sources))

meta_list: List[Dict[str, Any]] = normalize_metadata(meta=meta, sources_count=len(sources))
for source, metadata in zip(sources, meta_list):
try:
bytestream = get_bytestream_from_source(source=source)
Expand All @@ -130,7 +129,7 @@ def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[D
result = poller.result()
azure_output.append(result.to_dict())

docs = self._convert_tables_and_text(result=result, meta=meta)
docs = self._convert_tables_and_text(result=result, meta=metadata)
documents.extend(docs)

return {"documents": documents, "raw_azure_response": azure_output}
sjrl marked this conversation as resolved.
Show resolved Hide resolved
Expand Down Expand Up @@ -282,7 +281,7 @@ def _convert_tables(self, result: "AnalyzeResult", meta: Optional[Dict[str, Any]

return converted_tables

def _convert_to_natural_text(self, result: "AnalyzeResult", meta: Optional[Dict[str, str]]) -> Document:
def _convert_to_natural_text(self, result: "AnalyzeResult", meta: Optional[Dict[str, Any]]) -> Document:
"""
This converts the `AnalyzeResult` object into a single Document. We add "\f" separators between pages to
differentiate the text on separate pages. This is the expected format for the PreProcessor.
Expand Down Expand Up @@ -322,7 +321,7 @@ def _convert_to_natural_text(self, result: "AnalyzeResult", meta: Optional[Dict[
logger.warning("No text paragraphs were detected by the OCR conversion.")

all_text = "\f".join(texts)
return Document(content=all_text, meta=meta)
return Document(content=all_text, meta=meta if meta else {})

def _convert_to_single_column_text(
self, result: "AnalyzeResult", meta: Optional[Dict[str, str]], threshold_y: float = 0.05
Expand Down Expand Up @@ -359,9 +358,8 @@ def _convert_to_single_column_text(
# Default if polygon is not available
else:
logger.info(
"Polygon information for lines on page %s is not available so it is not possible to enforce a "
"single column page layout.",
page_idx,
"Polygon information for lines on page {page_idx} is not available so it is not possible "
"to enforce a single column page layout.".format(page_idx=page_idx)
)
for i in range(len(lines)):
pairs_by_page[page_idx].append([i, i])
Expand Down Expand Up @@ -411,7 +409,7 @@ def _convert_to_single_column_text(
page_text += "\n"
texts.append(page_text)
all_text = "\f".join(texts)
return Document(content=all_text, meta=meta)
return Document(content=all_text, meta=meta if meta else {})

def _collect_table_spans(self, result: "AnalyzeResult") -> Dict:
"""
Expand Down