feature/matter-text-extraction (#231)

* .docx text extracted
* .pdf text extracted
* test docx
* working docx unit test
* refactor request out of parse functions
* parse doc & parse pptx
* fixed documentation
* update requirements
* check-manifest
* docx tests
* add example docxs
* test pptx, pdf, doc
* remove print statement
* test docx, doc, pdf with single file
* test pptx w/ single document
* remove additional example documents
* remove print
* test error handling
* update files
CouncilDataProject · Jun 14, 2023 · a603be1 · a603be1
1 parent 703d7f1
commit a603be1
Show file tree

Hide file tree

Showing 8 changed files with 248 additions and 0 deletions.
diff --git a/cdp_backend/tests/conftest.py b/cdp_backend/tests/conftest.py
@@ -42,6 +42,11 @@ def resources_dir() -> Path:
 # City of Chicago, Illinois
 EXAMPLE_VIMEO_SHOWCASE = "https://vimeo.com/showcase/6277394/video/722690793"
 
# Filenames of the sample documents stored in the tests resources directory;
# the parse_document tests resolve them against the `resources_dir` fixture.
+EXAMPLE_DOCX = "example_docx.docx"
+EXAMPLE_DOC = "example_doc.doc"
+EXAMPLE_PDF = "example_pdf.pdf"
+EXAMPLE_PPTX = "example_pptx.pptx"
+
 
 @pytest.fixture
 def example_video(resources_dir: Path) -> Path:

diff --git a/cdp_backend/tests/resources/example_doc.doc b/cdp_backend/tests/resources/example_doc.doc
diff --git a/cdp_backend/tests/resources/example_docx.docx b/cdp_backend/tests/resources/example_docx.docx
diff --git a/cdp_backend/tests/resources/example_pdf.pdf b/cdp_backend/tests/resources/example_pdf.pdf
diff --git a/cdp_backend/tests/resources/example_pptx.pptx b/cdp_backend/tests/resources/example_pptx.pptx
diff --git a/cdp_backend/tests/utils/test_file_utils.py b/cdp_backend/tests/utils/test_file_utils.py
@@ -12,17 +12,23 @@
 import imageio
 import pytest
 import requests_mock
+from requests import HTTPError
 
 from cdp_backend.utils import file_utils
 from cdp_backend.utils.file_utils import (
  MAX_THUMBNAIL_HEIGHT,
  MAX_THUMBNAIL_WIDTH,
+ parse_document,
  resource_copy,
 )
 
 from .. import test_utils
 from ..conftest import (
+ EXAMPLE_DOC,
+ EXAMPLE_DOCX,
  EXAMPLE_MKV_VIDEO_FILENAME,
+ EXAMPLE_PDF,
+ EXAMPLE_PPTX,
  EXAMPLE_VIDEO_FILENAME,
  EXAMPLE_VIDEO_HD_FILENAME,
  EXAMPLE_VIMEO,
@@ -402,6 +408,77 @@ def test_clip_and_reformat_video(
  os.remove(outfile)
 
 
@pytest.mark.parametrize(
    "document_uri, expected",
    [
        (
            EXAMPLE_DOCX,
            "Word9 Word10 Word12 Word11 Word14 Word16 "
            "we Word13 Word15 Word17 Word18 Word19 "
            "Word1 Word2 Word3 Word4 Word5 "
            "Word6 OH 007 A 001 Word7 word8",
        ),
        (
            EXAMPLE_DOC,
            " Word1 Word2 Word3 Word4 "
            "Word5 Word6 OH 007 A 001 Word9 Word10 "
            "Word12 Word11 Word14 Word16 we Word13 "
            "Word15 Word17 Word18 Word19 Word7 word8 ",
        ),
        (
            EXAMPLE_PDF,
            "Word1 Word2 Word3 Word4 Word5 Word6 "
            "OH 007 A 001 Word7 word8 Word9 Word10 Word12 Word11 "
            "Word14 Word16 we Word13 Word15 Word17 Word18 Word19 ",
        ),
        (
            EXAMPLE_PPTX,
            " Title Word1 word2 word3 word4 Word5 word6 "
            "1 word7 word8 word9 word10 word11 Word12 word13 word14 "
            "/docProps/thumbnail.jpeg ",
        ),
        # Unsupported file types are expected to yield an empty string.
        (EXAMPLE_VIDEO_FILENAME, ""),
    ],
)
def test_parse_document(resources_dir: Path, document_uri: str, expected: str) -> None:
    """
    Check the text extracted from each example document.

    ``requests.get`` is patched so that the "download" returns the bytes of the
    local resource file with a 200 status code.
    """
    document_path = resources_dir / document_uri

    # read_bytes() closes the file handle, unlike a bare open(...).read().
    mock_response = mock.Mock(
        status_code=200,
        content=document_path.read_bytes(),
    )

    with mock.patch("requests.get", return_value=mock_response):
        parsed_doc = parse_document(str(document_path))

    assert parsed_doc == expected
+
+
def test_parse_document_bad_uri() -> None:
    """
    Check that a failed download surfaces as an ``HTTPError``.

    ``requests.get`` is patched to return a 404 response whose
    ``raise_for_status`` raises ``HTTPError``.
    """
    mock_response = mock.Mock(status_code=404, content=None)
    mock_response.raise_for_status.side_effect = HTTPError

    with mock.patch("requests.get", return_value=mock_response):
        # pytest.raises fails the test by itself if no HTTPError is raised.
        with pytest.raises(HTTPError):
            parse_document("some/bad/uri")
+
+
 @pytest.mark.parametrize(
  "video_filepath, output_format, codec_type, codec_name, expected",
  [

diff --git a/cdp_backend/utils/file_utils.py b/cdp_backend/utils/file_utils.py
@@ -2,19 +2,24 @@
 
 from __future__ import annotations
 
+import io
 import logging
 import math
 import random
 import re
 import shutil
+import xml.dom.minidom
+import zipfile
 from hashlib import sha256
 from pathlib import Path
 from uuid import uuid4
 
 import fireo
 import fsspec
+import pypdf
 import requests
 from fsspec.core import url_to_fs
+from tika import parser
 
 from ..database import models as db_models
 
@@ -759,6 +764,161 @@ def clip_and_reformat_video(
  return output_path
 
 
def parse_document(document_uri: str) -> str:
    """
    Extract text from a .doc, .docx, .pdf, or .pptx matter file.

    Parameters
    ----------
    document_uri: str
        The matter file uri. The document type is determined from the
        extension at the end of the uri (case-insensitive).

    Returns
    -------
    str:
        A string of all text in the matter file, or an empty string when the
        document type is not supported.

    Raises
    ------
    requests.HTTPError
        If the request for the document did not return a 200 status code.
    """
    response = requests.get(document_uri, stream=True)
    if response.status_code != 200:
        response.raise_for_status()
        # raise_for_status() only raises for 4xx / 5xx responses; any other
        # non-200 status would otherwise fall through with no document bytes.
        raise requests.HTTPError(
            f"Unexpected status code {response.status_code} for: {document_uri}",
            response=response,
        )

    document_raw = response.content

    # Extension -> parser. No extension is a suffix of another, so the
    # iteration order does not affect which parser is selected.
    parsers_by_extension = {
        ".docx": parse_docx_file,
        ".doc": parse_doc_file,
        ".pdf": parse_pdf_file,
        ".pptx": parse_pptx_file,
    }

    normalized_uri = document_uri.lower()
    for extension, parse_func in parsers_by_extension.items():
        if normalized_uri.endswith(extension):
            return parse_func(document_raw)

    log.error("Unsupported document type: " + document_uri)

    return ""
+
+
def parse_docx_file(zip_archive_bytes: bytes) -> str:
    """
    Extract text from a .docx matter file.

    Parameters
    ----------
    zip_archive_bytes: bytes
        The raw document to be parsed. Word docx files are zip archives.

    Returns
    -------
    str:
        A str of all text in the .docx file.
    """
    text = []

    # NOTE: xml.dom.minidom is not hardened against maliciously constructed
    # XML; matter files come from external sources, so consider a hardened
    # parser if that is a concern.
    with zipfile.ZipFile(io.BytesIO(zip_archive_bytes)) as zip_archive:
        for member_name in zip_archive.namelist():
            # text found in .xml files not .rels
            if not member_name.endswith(".xml"):
                continue

            parsed_xml = xml.dom.minidom.parseString(zip_archive.read(member_name))
            text_nodes = parsed_xml.documentElement.getElementsByTagName("w:t")

            for node in text_nodes:
                first_child = node.firstChild
                # An empty <w:t/> element has no child node to read text from.
                if first_child is not None and first_child.nodeValue is not None:
                    text.append(first_child.nodeValue)

    return remove_duplicate_space(" ".join(text))
+
+
def parse_doc_file(document_raw: bytes) -> str:
    """
    Extract text from a .doc matter file.

    Parameters
    ----------
    document_raw: bytes
        The raw document.

    Returns
    -------
    str:
        A str of all text in the .doc file. Empty if no text was extracted.
    """
    # The parsed content may be missing or None when nothing could be
    # extracted; fall back to an empty string so the cleanup does not fail.
    parsed_content = parser.from_buffer(document_raw).get("content") or ""
    return remove_duplicate_space(parsed_content)
+
+
def parse_pdf_file(document_raw: bytes) -> str:
    """
    Extract text from a .pdf matter file.

    Parameters
    ----------
    document_raw: bytes
        The raw document.

    Returns
    -------
    str:
        A str of all text in the .pdf file.
    """
    pdf_reader = pypdf.PdfReader(io.BytesIO(document_raw))
    page_texts = [page.extract_text() for page in pdf_reader.pages]

    # Join pages with whitespace so the last word of one page is not fused
    # with the first word of the next; the separator is collapsed below.
    return remove_duplicate_space("\n".join(page_texts))
+
+
def parse_pptx_file(document_raw: bytes) -> str:
    """
    Extract text from a .pptx matter file.

    Parameters
    ----------
    document_raw: bytes
        The raw document.

    Returns
    -------
    str:
        A str of all text in the .pptx file. Empty if no text was extracted.
    """
    # The parsed content may be missing or None when nothing could be
    # extracted; fall back to an empty string so the cleanup does not fail.
    parsed_pptx = parser.from_buffer(document_raw).get("content") or ""
    return remove_duplicate_space(parsed_pptx)
+
+
def remove_duplicate_space(parsed_text: str) -> str:
    """
    Replace every run of whitespace characters with a single space.

    Leading and trailing whitespace is collapsed to one space, not stripped.

    Parameters
    ----------
    parsed_text: str
        The parsed text from the document.

    Returns
    -------
    str:
        A string with no more than one consecutive space.
    """
    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    return re.sub(r"\s+", " ", parsed_text)
+
+
 def should_copy_video(video_filepath: Path, output_format: str = "mp4") -> bool:
  """
  Check if the video should be copied using ffmpeg StreamCopy codec or if it should

diff --git a/pyproject.toml b/pyproject.toml
@@ -70,9 +70,11 @@ pipeline = [
  "prefect>=1.2,<2",
  "pyarrow>=8.0",
  "pydub>=0.25.1",
+ "pypdf>=2.0",
  "rapidfuzz>=2.0",
  "spacy>=3.4",
  "spacy-transformers>=1.1",
+ "tika==2.6.0",
  "torch>=1.10",
  "tqdm>=4.62",
  "transformers>=4.16",
@@ -222,6 +224,10 @@ ignore = [
  "*generated_*",
  "*.wav",
  "Dockerfile",
+ "*example_*.docx",
+ "*example_*.doc",
+ "*example_*.pptx",
+ "*example_*.pdf",
 ]
 
 [tool.mypy]