diff --git a/cdp_backend/tests/conftest.py b/cdp_backend/tests/conftest.py index 4b5f0af4..03c9c8f8 100644 --- a/cdp_backend/tests/conftest.py +++ b/cdp_backend/tests/conftest.py @@ -42,6 +42,11 @@ def resources_dir() -> Path: # City of Chicago, Illinois EXAMPLE_VIMEO_SHOWCASE = "https://vimeo.com/showcase/6277394/video/722690793" +EXAMPLE_DOCX = "example_docx.docx" +EXAMPLE_DOC = "example_doc.doc" +EXAMPLE_PDF = "example_pdf.pdf" +EXAMPLE_PPTX = "example_pptx.pptx" + @pytest.fixture def example_video(resources_dir: Path) -> Path: diff --git a/cdp_backend/tests/resources/example_doc.doc b/cdp_backend/tests/resources/example_doc.doc new file mode 100644 index 00000000..bd3838b0 Binary files /dev/null and b/cdp_backend/tests/resources/example_doc.doc differ diff --git a/cdp_backend/tests/resources/example_docx.docx b/cdp_backend/tests/resources/example_docx.docx new file mode 100644 index 00000000..4b86bd82 Binary files /dev/null and b/cdp_backend/tests/resources/example_docx.docx differ diff --git a/cdp_backend/tests/resources/example_pdf.pdf b/cdp_backend/tests/resources/example_pdf.pdf new file mode 100644 index 00000000..80c5ef23 Binary files /dev/null and b/cdp_backend/tests/resources/example_pdf.pdf differ diff --git a/cdp_backend/tests/resources/example_pptx.pptx b/cdp_backend/tests/resources/example_pptx.pptx new file mode 100644 index 00000000..5f08d61b Binary files /dev/null and b/cdp_backend/tests/resources/example_pptx.pptx differ diff --git a/cdp_backend/tests/utils/test_file_utils.py b/cdp_backend/tests/utils/test_file_utils.py index 63188cbf..5391c623 100644 --- a/cdp_backend/tests/utils/test_file_utils.py +++ b/cdp_backend/tests/utils/test_file_utils.py @@ -12,17 +12,23 @@ import imageio import pytest import requests_mock +from requests import HTTPError from cdp_backend.utils import file_utils from cdp_backend.utils.file_utils import ( MAX_THUMBNAIL_HEIGHT, MAX_THUMBNAIL_WIDTH, + parse_document, resource_copy, ) from .. 
@pytest.mark.parametrize(
    "document_uri, expected",
    [
        (
            EXAMPLE_DOCX,
            "Word9 Word10 Word12 Word11 Word14 Word16 "
            + "we Word13 Word15 Word17 Word18 Word19 "
            + "Word1 Word2 Word3 Word4 Word5 "
            + "Word6 OH 007 A 001 Word7 word8",
        ),
        (
            EXAMPLE_DOC,
            " Word1 Word2 Word3 Word4 "
            + "Word5 Word6 OH 007 A 001 Word9 Word10 "
            + "Word12 Word11 Word14 Word16 we Word13 "
            + "Word15 Word17 Word18 Word19 Word7 word8 ",
        ),
        (
            EXAMPLE_PDF,
            "Word1 Word2 Word3 Word4 Word5 Word6 "
            + "OH 007 A 001 Word7 word8 Word9 Word10 Word12 Word11 "
            + "Word14 Word16 we Word13 Word15 Word17 Word18 Word19 ",
        ),
        (
            EXAMPLE_PPTX,
            " Title Word1 word2 word3 word4 Word5 word6 "
            + "1 word7 word8 word9 word10 word11 Word12 word13 word14 "
            + "/docProps/thumbnail.jpeg ",
        ),
        # Unsupported extension: the parser is expected to return an empty string.
        (EXAMPLE_VIDEO_FILENAME, ""),
    ],
)
def test_parse_document(resources_dir: Path, document_uri: str, expected: str) -> None:
    """Check that each supported document type is parsed to the expected text.

    ``requests.get`` is patched so that the "download" returns the bytes of the
    local resource file instead of touching the network.
    """
    resource_path = resources_dir / document_uri
    actual_uri = str(resource_path)

    class MockResponse:
        def __init__(self) -> None:
            # read_bytes() closes the file handle, unlike a bare open().read().
            self.content = resource_path.read_bytes()
            self.status_code = 200

    with mock.patch("requests.get") as mocked_requests_get:
        mocked_requests_get.return_value = MockResponse()
        parsed_doc = parse_document(actual_uri)

    assert parsed_doc == expected


def test_parse_document_bad_uri() -> None:
    """Check that a failed download surfaces as ``requests.HTTPError``."""

    class MockResponse:
        def __init__(self) -> None:
            self.content = None
            self.status_code = 404

        def raise_for_status(self) -> None:
            raise HTTPError

    with mock.patch("requests.get") as mocked_requests_get:
        mocked_requests_get.return_value = MockResponse()
        # The test fails automatically if no HTTPError is raised.
        with pytest.raises(HTTPError):
            parse_document("some/bad/uri")
def parse_document(document_uri: str) -> str:
    """
    Extract text from a .doc, .docx, .pdf, or .pptx matter file.

    The document is always downloaded first; the parser is then chosen from the
    file extension at the end of the URI (matched case-insensitively).

    Parameters
    ----------
    document_uri: str
        The matter file uri.

    Returns
    -------
    str:
        A string of all text in the matter file, or an empty string when the
        document type is unsupported or the response carried a non-200,
        non-error status.

    Raises
    ------
    requests.HTTPError
        If the download responds with an HTTP error status.
    """
    # The body is read in full right away, so streaming is not requested.
    response = requests.get(document_uri)
    if response.status_code != 200:
        # Raises for 4xx / 5xx responses.
        response.raise_for_status()
        # Any other non-200 status (e.g. 204) has no document body to parse.
        log.error(
            "Unexpected status code %s for document: %s",
            response.status_code,
            document_uri,
        )
        return ""

    document_raw = response.content

    # Order matters only for readability; no suffix here is a suffix of another.
    parsers_by_suffix = (
        (".docx", parse_docx_file),
        (".doc", parse_doc_file),
        (".pdf", parse_pdf_file),
        (".pptx", parse_pptx_file),
    )
    lowered_uri = document_uri.lower()
    for suffix, parse in parsers_by_suffix:
        if lowered_uri.endswith(suffix):
            return parse(document_raw)

    log.error("Unsupported document type: %s", document_uri)

    return ""


def parse_docx_file(zip_archive_bytes: bytes) -> str:
    """
    Extract text from a .docx matter file.

    Every ``.xml`` member of the archive is parsed and the text of all
    ``w:t`` elements is collected in archive order.

    Parameters
    ----------
    zip_archive_bytes: bytes
        The raw document to be parsed. Word docx files are zip archives.

    Returns
    -------
    str:
        A str of all text in the .docx file.
    """
    text = []

    # NOTE(review): minidom is not hardened against malicious XML; confirm the
    # documents come from trusted sources.
    with zipfile.ZipFile(io.BytesIO(zip_archive_bytes)) as zip_archive:
        for member in zip_archive.namelist():
            # text found in .xml files not .rels
            if not member.endswith(".xml"):
                continue

            parsed_xml = xml.dom.minidom.parseString(zip_archive.read(member))
            root = parsed_xml.documentElement

            for node in root.getElementsByTagName("w:t"):
                child = node.firstChild
                # An empty <w:t/> element has no child node; skip it.
                if child is not None and child.nodeValue is not None:
                    text.append(child.nodeValue)

    return remove_duplicate_space(" ".join(text))


def parse_doc_file(document_raw: bytes) -> str:
    """
    Extract text from a .doc matter file.

    Parameters
    ----------
    document_raw: bytes
        The raw document.

    Returns
    -------
    str:
        A str of all text in the .doc file.
    """
    # Tika may report no content (None) when nothing could be extracted.
    parsed_content = parser.from_buffer(document_raw)["content"] or ""
    return remove_duplicate_space(parsed_content)


def parse_pdf_file(document_raw: bytes) -> str:
    """
    Extract text from a .pdf matter file.

    Parameters
    ----------
    document_raw: bytes
        The raw document.

    Returns
    -------
    str:
        A str of all text in the .pdf file.
    """
    pdf_reader = pypdf.PdfReader(io.BytesIO(document_raw))
    text = "".join(page.extract_text() for page in pdf_reader.pages)
    return remove_duplicate_space(text)


def parse_pptx_file(document_raw: bytes) -> str:
    """
    Extract text from a .pptx matter file.

    Parameters
    ----------
    document_raw: bytes
        The raw document.

    Returns
    -------
    str:
        A str of all text in the .pptx file.
    """
    # Tika may report no content (None) when nothing could be extracted.
    parsed_pptx = parser.from_buffer(document_raw)["content"] or ""
    return remove_duplicate_space(parsed_pptx)


def remove_duplicate_space(parsed_text: str) -> str:
    """
    Replace every run of whitespace characters with a single space.

    Parameters
    ----------
    parsed_text: str
        The parsed text from the document.

    Returns
    -------
    str:
        A string with no more than one consecutive space.
    """
    return re.sub(r"\s+", " ", parsed_text)