Skip to content

Commit

Permalink
feature/matter-text-extraction (#231)
Browse files Browse the repository at this point in the history
* .docx text extracted

* .pdf text extracted

* test docx

* working docx unit test

* refactor request out of parse functions

* parse doc & parse pptx

* fixed documentation

* update requirements

* check-manifest

* docx tests

* add example docxs

* test pptx, pdf, doc

* remove print statement

* test docx, doc, pdf with single file

* test pptx w/ single document

* remove additional example documents

* remove print

* test error handling

* update files
  • Loading branch information
sagarrat7 committed Jun 14, 2023
1 parent 703d7f1 commit a603be1
Show file tree
Hide file tree
Showing 8 changed files with 248 additions and 0 deletions.
5 changes: 5 additions & 0 deletions cdp_backend/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ def resources_dir() -> Path:
# City of Chicago, Illinois
EXAMPLE_VIMEO_SHOWCASE = "https://vimeo.com/showcase/6277394/video/722690793"

# File names of the example matter documents (one per supported extension).
# The document-parsing tests join these onto the test resources directory.
EXAMPLE_DOCX = "example_docx.docx"
EXAMPLE_DOC = "example_doc.doc"
EXAMPLE_PDF = "example_pdf.pdf"
EXAMPLE_PPTX = "example_pptx.pptx"


@pytest.fixture
def example_video(resources_dir: Path) -> Path:
Expand Down
Binary file added cdp_backend/tests/resources/example_doc.doc
Binary file not shown.
Binary file added cdp_backend/tests/resources/example_docx.docx
Binary file not shown.
Binary file added cdp_backend/tests/resources/example_pdf.pdf
Binary file not shown.
Binary file added cdp_backend/tests/resources/example_pptx.pptx
Binary file not shown.
77 changes: 77 additions & 0 deletions cdp_backend/tests/utils/test_file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,23 @@
import imageio
import pytest
import requests_mock
from requests import HTTPError

from cdp_backend.utils import file_utils
from cdp_backend.utils.file_utils import (
MAX_THUMBNAIL_HEIGHT,
MAX_THUMBNAIL_WIDTH,
parse_document,
resource_copy,
)

from .. import test_utils
from ..conftest import (
EXAMPLE_DOC,
EXAMPLE_DOCX,
EXAMPLE_MKV_VIDEO_FILENAME,
EXAMPLE_PDF,
EXAMPLE_PPTX,
EXAMPLE_VIDEO_FILENAME,
EXAMPLE_VIDEO_HD_FILENAME,
EXAMPLE_VIMEO,
Expand Down Expand Up @@ -402,6 +408,77 @@ def test_clip_and_reformat_video(
os.remove(outfile)


@pytest.mark.parametrize(
    "document_uri, expected",
    [
        (
            EXAMPLE_DOCX,
            "Word9 Word10 Word12 Word11 Word14 Word16 "
            + "we Word13 Word15 Word17 Word18 Word19 "
            + "Word1 Word2 Word3 Word4 Word5 "
            + "Word6 OH 007 A 001 Word7 word8",
        ),
        (
            EXAMPLE_DOC,
            " Word1 Word2 Word3 Word4 "
            + "Word5 Word6 OH 007 A 001 Word9 Word10 "
            + "Word12 Word11 Word14 Word16 we Word13 "
            + "Word15 Word17 Word18 Word19 Word7 word8 ",
        ),
        (
            EXAMPLE_PDF,
            "Word1 Word2 Word3 Word4 Word5 Word6 "
            + "OH 007 A 001 Word7 word8 Word9 Word10 Word12 Word11 "
            + "Word14 Word16 we Word13 Word15 Word17 Word18 Word19 ",
        ),
        (
            EXAMPLE_PPTX,
            " Title Word1 word2 word3 word4 Word5 word6 "
            + "1 word7 word8 word9 word10 word11 Word12 word13 word14 "
            + "/docProps/thumbnail.jpeg ",
        ),
        # A file with an unsupported extension yields an empty string.
        (EXAMPLE_VIDEO_FILENAME, ""),
    ],
)
def test_parse_document(resources_dir: Path, document_uri: str, expected: str) -> None:
    """
    Check the text extracted by parse_document for each example resource file.

    The HTTP download is replaced by a mocked ``requests.get`` that serves the
    bytes of the local resource file with a 200 status.
    """
    document_path = resources_dir / document_uri

    # read_bytes() closes the file handle, unlike a bare open(...).read().
    mocked_response = mock.Mock(
        status_code=200,
        content=document_path.read_bytes(),
    )

    with mock.patch("requests.get", return_value=mocked_response):
        parsed_doc = parse_document(str(document_path))

    assert parsed_doc == expected


def test_parse_document_bad_uri() -> None:
    """
    Check that parse_document propagates the HTTPError of a failed download.

    ``requests.get`` is mocked to return a 404 response whose
    ``raise_for_status`` raises ``HTTPError``.
    """
    mocked_response = mock.Mock(status_code=404, content=None)
    mocked_response.raise_for_status.side_effect = HTTPError

    with mock.patch("requests.get", return_value=mocked_response):
        # pytest.raises fails the test if no HTTPError is raised.
        with pytest.raises(HTTPError):
            parse_document("some/bad/uri")


@pytest.mark.parametrize(
"video_filepath, output_format, codec_type, codec_name, expected",
[
Expand Down
160 changes: 160 additions & 0 deletions cdp_backend/utils/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,24 @@

from __future__ import annotations

import io
import logging
import math
import random
import re
import shutil
import xml.dom.minidom
import zipfile
from hashlib import sha256
from pathlib import Path
from uuid import uuid4

import fireo
import fsspec
import pypdf
import requests
from fsspec.core import url_to_fs
from tika import parser

from ..database import models as db_models

Expand Down Expand Up @@ -759,6 +764,161 @@ def clip_and_reformat_video(
return output_path


def parse_document(document_uri: str) -> str:
    """
    Download a matter file and extract its text.

    The parser is selected from the extension at the end of the uri. Supported
    extensions are .docx, .doc, .pdf and .pptx, matched case-insensitively.

    Parameters
    ----------
    document_uri: str
        The matter file uri.

    Returns
    -------
    str:
        A string of all text in the matter file, or an empty string when the
        extension is not supported.

    Raises
    ------
    requests.HTTPError
        If the download responds with an error status.
    """
    response = requests.get(document_uri, stream=True)
    if response.status_code != 200:
        # Raises for error statuses; other non-200 statuses fall through.
        response.raise_for_status()

    # Bound unconditionally: binding this only in an else-branch left the name
    # undefined whenever a non-200 status did not raise above.
    document_raw = response.content

    parsers_by_extension = {
        ".docx": parse_docx_file,
        ".doc": parse_doc_file,
        ".pdf": parse_pdf_file,
        ".pptx": parse_pptx_file,
    }

    # Lower-case only for matching so that e.g. "REPORT.PDF" is recognised.
    lowered_uri = document_uri.lower()
    for extension, parse_file in parsers_by_extension.items():
        if lowered_uri.endswith(extension):
            return parse_file(document_raw)

    log.error("Unsupported document type: %s", document_uri)

    return ""


def parse_docx_file(zip_archive_bytes: bytes) -> str:
    """
    Extract text from a .docx matter file.

    Every ``.xml`` member of the archive is scanned, in archive order, for
    ``w:t`` elements and their text is joined with single spaces.

    Parameters
    ----------
    zip_archive_bytes: bytes
        The raw document to be parsed. Word docx files are zip archives.

    Returns
    -------
    str:
        A str of all text in the .docx file.
    """
    text = []

    # NOTE(review): xml.dom.minidom is not hardened against maliciously
    # crafted XML; confirm that matter files come from trusted sources.
    # The context manager closes the archive once every member has been read.
    with zipfile.ZipFile(io.BytesIO(zip_archive_bytes)) as zip_archive:
        for member_name in zip_archive.namelist():
            # text found in .xml files not .rels
            if not member_name.endswith(".xml"):
                continue

            member_stream = io.BytesIO(zip_archive.read(member_name))
            parsed_xml = xml.dom.minidom.parse(member_stream)

            root = parsed_xml.documentElement
            for node in root.getElementsByTagName("w:t"):
                first_child = node.firstChild
                # An empty <w:t/> element has no child node, and a non-text
                # child has no nodeValue; neither contributes any text.
                if first_child is None or first_child.nodeValue is None:
                    continue
                text.append(first_child.nodeValue)

    parsed_text = " ".join(text)
    return remove_duplicate_space(parsed_text)


def parse_doc_file(document_raw: bytes) -> str:
    """
    Extract text from a .doc matter file.

    Parameters
    ----------
    document_raw: bytes
        The raw document.

    Returns
    -------
    str:
        A str of all text in the .doc file. Empty when the parser reports no
        content.
    """
    parsed_content = parser.from_buffer(document_raw)["content"]

    # The parser can report None instead of a string when it extracts no
    # text; normalising None would raise a TypeError.
    if parsed_content is None:
        return ""

    return remove_duplicate_space(parsed_content)


def parse_pdf_file(document_raw: bytes) -> str:
    """
    Extract text from a .pdf matter file.

    Parameters
    ----------
    document_raw: bytes
        The raw document.

    Returns
    -------
    str:
        A str of all text in the .pdf file, pages in document order.
    """
    pdf_reader = pypdf.PdfReader(io.BytesIO(document_raw))

    extracted_pages = (page.extract_text() for page in pdf_reader.pages)

    # Join the pages with a newline so the last word of one page cannot fuse
    # with the first word of the next; the newline is normalised to a single
    # space below. Pages without text are skipped so they add no whitespace.
    text = "\n".join(page_text for page_text in extracted_pages if page_text)

    return remove_duplicate_space(text)


def parse_pptx_file(document_raw: bytes) -> str:
    """
    Extract text from a .pptx matter file.

    Parameters
    ----------
    document_raw: bytes
        The raw document.

    Returns
    -------
    str:
        A str of all text in the .pptx file. Empty when the parser reports no
        content.
    """
    parsed_pptx = parser.from_buffer(document_raw)["content"]

    # The parser can report None instead of a string when it extracts no
    # text; normalising None would raise a TypeError.
    if parsed_pptx is None:
        return ""

    return remove_duplicate_space(parsed_pptx)


def remove_duplicate_space(parsed_text: str) -> str:
    """
    Replace every run of whitespace characters with a single space.

    A run of length one is replaced as well, so a lone newline or tab becomes
    a space. Leading and trailing whitespace is collapsed to one space, not
    stripped.

    Parameters
    ----------
    parsed_text: str
        The parsed text from the document.

    Returns
    -------
    str:
        A string with no more than one consecutive space.
    """
    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    return re.sub(r"\s+", " ", parsed_text)


def should_copy_video(video_filepath: Path, output_format: str = "mp4") -> bool:
"""
Check if the video should be copied using ffmpeg StreamCopy codec or if it should
Expand Down
6 changes: 6 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,11 @@ pipeline = [
"prefect>=1.2,<2",
"pyarrow>=8.0",
"pydub>=0.25.1",
"pypdf>=2.0",
"rapidfuzz>=2.0",
"spacy>=3.4",
"spacy-transformers>=1.1",
"tika==2.6.0",
"torch>=1.10",
"tqdm>=4.62",
"transformers>=4.16",
Expand Down Expand Up @@ -222,6 +224,10 @@ ignore = [
"*generated_*",
"*.wav",
"Dockerfile",
"*example_*.docx",
"*example_*.doc",
"*example_*.pptx",
"*example_*.pdf",
]

[tool.mypy]
Expand Down

0 comments on commit a603be1

Please sign in to comment.