Skip to content

Commit

Permalink
Upgrade RapidOCR and enable for Python 3.12. Fix PDF OCR test
Browse files: browse the repository at this point in the history
  • Loading branch information
debanjum committed Jun 22, 2024
1 parent 55a23ea commit 22f6db0
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 6 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ dependencies = [
"psycopg2-binary == 2.9.9",
"lxml == 4.9.3",
"tzdata == 2023.3",
"rapidocr-onnxruntime == 1.3.11; python_version<'3.12'",
"rapidocr-onnxruntime == 1.3.22",
"openai-whisper >= 20231117",
"django-phonenumber-field == 7.3.0",
"phonenumbers == 8.13.27",
Expand Down
2 changes: 0 additions & 2 deletions tests/test_docx_to_entries.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import os

from khoj.processor.content.docx.docx_to_entries import DocxToEntries


Expand Down
14 changes: 11 additions & 3 deletions tests/test_pdf_to_entries.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import re

from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries
from khoj.utils.fs_syncer import get_pdf_files
Expand Down Expand Up @@ -38,16 +39,23 @@ def test_multi_page_pdf_to_jsonl():

def test_ocr_page_pdf_to_jsonl():
    "Extract entries from the OCR sample PDF and check the expected phrase is present."
    # Arrange
    # Match the phrase with any amount of whitespace (including none) between
    # words and ignore case, so the check does not depend on exactly how the
    # extracted text happens to be spaced or capitalized.
    expected_str = "playing on a strip of marsh"
    expected_str_with_variable_spaces = re.compile(expected_str.replace(" ", r"\s*"), re.IGNORECASE)

    # Load the sample PDF as raw bytes, keyed by its file path
    with open("tests/data/pdf/ocr_samples.pdf", "rb") as f:
        pdf_bytes = f.read()

    data = {"tests/data/pdf/ocr_samples.pdf": pdf_bytes}

    # Act
    entries = PdfToEntries.extract_pdf_entries(pdf_files=data)

    # Assert
    # Verify the result shape first so a mismatch fails as a clear assertion
    # instead of an IndexError when reaching for the single entry below.
    assert len(entries) == 2
    assert len(entries[1]) == 1
    raw_entry = entries[1][0].raw
    assert re.search(expected_str_with_variable_spaces, raw_entry) is not None


def test_get_pdf_files(tmp_path):
Expand Down

0 comments on commit 22f6db0

Please sign in to comment.