diff --git a/cdp_backend/tests/conftest.py b/cdp_backend/tests/conftest.py
index 4b5f0af4..03c9c8f8 100644
--- a/cdp_backend/tests/conftest.py
+++ b/cdp_backend/tests/conftest.py
@@ -42,6 +42,11 @@ def resources_dir() -> Path:
 # City of Chicago, Illinois
 EXAMPLE_VIMEO_SHOWCASE = "https://vimeo.com/showcase/6277394/video/722690793"
 
+EXAMPLE_DOCX = "example_docx.docx"  # Word .docx sample in tests/resources
+EXAMPLE_DOC = "example_doc.doc"  # Word .doc sample in tests/resources
+EXAMPLE_PDF = "example_pdf.pdf"  # .pdf sample in tests/resources
+EXAMPLE_PPTX = "example_pptx.pptx"  # .pptx sample in tests/resources
+
 
 @pytest.fixture
 def example_video(resources_dir: Path) -> Path:
diff --git a/cdp_backend/tests/resources/example_doc.doc b/cdp_backend/tests/resources/example_doc.doc
new file mode 100644
index 00000000..bd3838b0
Binary files /dev/null and b/cdp_backend/tests/resources/example_doc.doc differ
diff --git a/cdp_backend/tests/resources/example_docx.docx b/cdp_backend/tests/resources/example_docx.docx
new file mode 100644
index 00000000..4b86bd82
Binary files /dev/null and b/cdp_backend/tests/resources/example_docx.docx differ
diff --git a/cdp_backend/tests/resources/example_pdf.pdf b/cdp_backend/tests/resources/example_pdf.pdf
new file mode 100644
index 00000000..80c5ef23
Binary files /dev/null and b/cdp_backend/tests/resources/example_pdf.pdf differ
diff --git a/cdp_backend/tests/resources/example_pptx.pptx b/cdp_backend/tests/resources/example_pptx.pptx
new file mode 100644
index 00000000..5f08d61b
Binary files /dev/null and b/cdp_backend/tests/resources/example_pptx.pptx differ
diff --git a/cdp_backend/tests/utils/test_file_utils.py b/cdp_backend/tests/utils/test_file_utils.py
index 63188cbf..5391c623 100644
--- a/cdp_backend/tests/utils/test_file_utils.py
+++ b/cdp_backend/tests/utils/test_file_utils.py
@@ -12,17 +12,23 @@
 import imageio
 import pytest
 import requests_mock
+from requests import HTTPError
 
 from cdp_backend.utils import file_utils
 from cdp_backend.utils.file_utils import (
     MAX_THUMBNAIL_HEIGHT,
     MAX_THUMBNAIL_WIDTH,
+    parse_document,
     resource_copy,
 )
 
 from .. import test_utils
 from ..conftest import (
+    EXAMPLE_DOC,
+    EXAMPLE_DOCX,
     EXAMPLE_MKV_VIDEO_FILENAME,
+    EXAMPLE_PDF,
+    EXAMPLE_PPTX,
     EXAMPLE_VIDEO_FILENAME,
     EXAMPLE_VIDEO_HD_FILENAME,
     EXAMPLE_VIMEO,
@@ -402,6 +408,77 @@ def test_clip_and_reformat_video(
     os.remove(outfile)
 
 
+@pytest.mark.parametrize(
+    "document_uri, expected",
+    [
+        (
+            EXAMPLE_DOCX,
+            "Word9 Word10 Word12 Word11 Word14 Word16 "
+            + "we Word13 Word15 Word17 Word18 Word19 "
+            + "Word1 Word2 Word3 Word4 Word5 "
+            + "Word6 OH 007 A 001 Word7 word8",
+        ),
+        (
+            EXAMPLE_DOC,
+            " Word1 Word2 Word3 Word4 "
+            + "Word5 Word6 OH 007 A 001 Word9 Word10 "
+            + "Word12 Word11 Word14 Word16 we Word13 "
+            + "Word15 Word17 Word18 Word19 Word7 word8 ",
+        ),
+        (
+            EXAMPLE_PDF,
+            "Word1 Word2 Word3 Word4 Word5 Word6 "
+            + "OH 007 A 001 Word7 word8 Word9 Word10 Word12 Word11 "
+            + "Word14 Word16 we Word13 Word15 Word17 Word18 Word19 ",
+        ),
+        (
+            EXAMPLE_PPTX,
+            " Title Word1 word2 word3 word4 Word5 word6 "
+            + "1 word7 word8 word9 word10 word11 Word12 word13 word14 "
+            + "/docProps/thumbnail.jpeg ",
+        ),
+        (EXAMPLE_VIDEO_FILENAME, ""),
+    ],
+)
+def test_parse_document(resources_dir: Path, document_uri: str, expected: str) -> None:
+    """Check the text extracted from each sample file in the resources dir."""
+    actual_uri = str(resources_dir / document_uri)
+    # Serve the local file's bytes in place of a real HTTP download.
+    with mock.patch("requests.get") as mocked_requests_get:
+
+        class MockResponse:
+            def __init__(self) -> None:
+                self.content = (resources_dir / document_uri).read_bytes()
+                self.status_code = 200
+
+        mocked_requests_get.return_value = MockResponse()
+        parsed_doc = parse_document(actual_uri)
+        assert parsed_doc == expected
+
+
+def test_parse_document_bad_uri() -> None:
+    """
+    Check that a failed download is reported as an HTTPError.
+
+    The mocked response has status 404 and a raise_for_status that raises,
+    so parse_document is expected to let the HTTPError reach the caller.
+    """
+    with mock.patch("requests.get") as mocked_requests_get:
+
+        class MockResponse:
+            def __init__(self) -> None:
+                self.content = None
+                self.status_code = 404
+
+            def raise_for_status(self) -> None:
+                raise HTTPError
+
+        mocked_requests_get.return_value = MockResponse()
+        # pytest.raises fails the test when no HTTPError is raised.
+        with pytest.raises(HTTPError):
+            parse_document("some/bad/uri")
+
+
 @pytest.mark.parametrize(
     "video_filepath, output_format, codec_type, codec_name, expected",
     [
diff --git a/cdp_backend/utils/file_utils.py b/cdp_backend/utils/file_utils.py
index dbb257ff..53894e3f 100644
--- a/cdp_backend/utils/file_utils.py
+++ b/cdp_backend/utils/file_utils.py
@@ -2,19 +2,24 @@
 
 from __future__ import annotations
 
+import io
 import logging
 import math
 import random
 import re
 import shutil
+import xml.dom.minidom
+import zipfile
 from hashlib import sha256
 from pathlib import Path
 from uuid import uuid4
 
 import fireo
 import fsspec
+import pypdf
 import requests
 from fsspec.core import url_to_fs
+from tika import parser
 
 from ..database import models as db_models
 
@@ -759,6 +764,161 @@ def clip_and_reformat_video(
     return output_path
 
 
+def parse_document(document_uri: str) -> str:
+    """
+    Extract text from a .doc, .docx, .pdf, or .pptx matter file.
+
+    Parameters
+    ----------
+    document_uri: str
+        The matter file uri. The parser is chosen from its file extension.
+
+    Returns
+    -------
+    str:
+        A string of all text in the matter file, or an empty string when the
+        extension is unsupported or a non-200 response does not raise.
+
+    Raises
+    ------
+    requests.HTTPError:
+        If the download fails with a 4xx or 5xx status code.
+    """
+    response = requests.get(document_uri, stream=True)
+    if response.status_code != 200:
+        # raise_for_status only raises for 4xx/5xx; other codes yield "".
+        response.raise_for_status()
+        return ""
+
+    document_raw = response.content
+    if re.search(r"\.docx$", document_uri):
+        return parse_docx_file(document_raw)
+    if re.search(r"\.doc$", document_uri):
+        return parse_doc_file(document_raw)
+    if re.search(r"\.pdf$", document_uri):
+        return parse_pdf_file(document_raw)
+    if re.search(r"\.pptx$", document_uri):
+        return parse_pptx_file(document_raw)
+    log.error("Unsupported document type: %s", document_uri)
+    return ""
+
+
+def parse_docx_file(zip_archive_bytes: bytes) -> str:
+    """
+    Extract text from a .docx matter file.
+
+    Parameters
+    ----------
+    zip_archive_bytes: bytes
+        The raw document to be parsed. Word docx files are zip archives.
+
+    Returns
+    -------
+    str:
+        The text of every "w:t" element in the archive's .xml members, joined
+        with spaces and with each whitespace run collapsed to one space.
+    """
+    text: list[str] = []
+    # Close the archive when done instead of leaving the handle open.
+    with zipfile.ZipFile(io.BytesIO(zip_archive_bytes)) as zip_archive:
+        for member in zip_archive.namelist():
+            # text found in .xml files not .rels
+            if not member.endswith(".xml"):
+                continue
+
+            # NOTE(review): minidom is not hardened against malicious XML.
+            file_stream = io.BytesIO(zip_archive.read(member))
+            parsed_xml = xml.dom.minidom.parse(file_stream)
+            text_nodes = parsed_xml.documentElement.getElementsByTagName("w:t")
+            for node in text_nodes:
+                child = node.firstChild
+                # An empty <w:t/> element has no child text node to read.
+                if child is not None and child.nodeValue is not None:
+                    text.append(child.nodeValue)
+
+    parsed_text = " ".join(text)
+    return remove_duplicate_space(parsed_text)
+
+
+def parse_doc_file(document_raw: bytes) -> str:
+    """
+    Extract text from a .doc matter file.
+
+    Parameters
+    ----------
+    document_raw: bytes
+        The raw document.
+
+    Returns
+    -------
+    str:
+        A str of all text in the .doc file; empty when tika extracts no content.
+    """
+    # tika sets "content" to None when it extracts no text; re.sub needs a str.
+    return remove_duplicate_space(parser.from_buffer(document_raw)["content"] or "")
+
+
+def parse_pdf_file(document_raw: bytes) -> str:
+    """
+    Pull the text out of a .pdf matter file.
+
+    Every page is read in order with pypdf and the page texts are concatenated
+    directly, with no separator inserted between consecutive pages.
+
+    Parameters
+    ----------
+    document_raw: bytes
+        The raw bytes of the .pdf document.
+
+    Returns
+    -------
+    str:
+        The text of all pages, with each run of whitespace collapsed to a
+        single space.
+    """
+    pdf_stream = io.BytesIO(document_raw)
+    pdf_reader = pypdf.PdfReader(pdf_stream)
+
+    page_texts = [page.extract_text() for page in pdf_reader.pages]
+
+    return remove_duplicate_space("".join(page_texts))
+
+
+def parse_pptx_file(document_raw: bytes) -> str:
+    """
+    Extract text from a .pptx matter file.
+
+    Parameters
+    ----------
+    document_raw: bytes
+        The raw document.
+
+    Returns
+    -------
+    str:
+        A str of all text in the .pptx file; empty when tika extracts no content.
+    """
+    # tika sets "content" to None when it extracts no text; re.sub needs a str.
+    return remove_duplicate_space(parser.from_buffer(document_raw)["content"] or "")
+
+
+def remove_duplicate_space(parsed_text: str) -> str:
+    """
+    Replace every run of whitespace characters with a single space.
+
+    Parameters
+    ----------
+    parsed_text: str
+        The parsed text from the document.
+
+    Returns
+    -------
+    str:
+        The text with each whitespace run collapsed to one space; not stripped.
+    """
+    return re.sub(r"\s+", " ", parsed_text)
+
+
 def should_copy_video(video_filepath: Path, output_format: str = "mp4") -> bool:
     """
     Check if the video should be copied using ffmpeg StreamCopy codec or if it should
diff --git a/pyproject.toml b/pyproject.toml
index 0e2bcd29..398c635c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -70,9 +70,11 @@ pipeline = [
   "prefect>=1.2,<2",
   "pyarrow>=8.0",
   "pydub>=0.25.1",
+  "pypdf>=2.0",
   "rapidfuzz>=2.0",
   "spacy>=3.4",
   "spacy-transformers>=1.1",
+  "tika==2.6.0",
   "torch>=1.10",
   "tqdm>=4.62",
   "transformers>=4.16",
@@ -222,6 +224,10 @@ ignore = [
   "*generated_*",
   "*.wav",
   "Dockerfile",
+  "*example_*.docx",
+  "*example_*.doc",
+  "*example_*.pptx",
+  "*example_*.pdf",
 ]
 
 [tool.mypy]