diff --git a/cdp_backend/tests/resources/code_of_conduct.docx b/cdp_backend/tests/resources/code_of_conduct.docx new file mode 100644 index 00000000..7cf681c2 Binary files /dev/null and b/cdp_backend/tests/resources/code_of_conduct.docx differ diff --git a/cdp_backend/tests/resources/code_of_conduct.pdf b/cdp_backend/tests/resources/code_of_conduct.pdf new file mode 100644 index 00000000..afd7a64b Binary files /dev/null and b/cdp_backend/tests/resources/code_of_conduct.pdf differ diff --git a/cdp_backend/tests/resources/code_of_conduct.txt b/cdp_backend/tests/resources/code_of_conduct.txt new file mode 100644 index 00000000..5c537cd2 --- /dev/null +++ b/cdp_backend/tests/resources/code_of_conduct.txt @@ -0,0 +1,92 @@ +# Contributor Covenant Code of Conduct + + +## Our Pledge + + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to making participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, sex characteristics, gender identity and expression, +level of experience, education, socio-economic status, nationality, personal +appearance, race, religion, or sexual identity and orientation. 
+ + +## Our Standards + + +Examples of behavior that contributes to creating a positive environment +include: + + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + + +Examples of unacceptable behavior by participants include: + + +* The use of sexualized language or imagery and unwelcome sexual attention or + advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic + address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + + +## Our Responsibilities + + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. + + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + + +## Scope + + +This Code of Conduct applies both within project spaces and in public spaces +when an individual is representing the project or its community. Examples of +representing a project or community include using an official project e-mail +address, posting via an official social media account, or acting as an appointed +representative at an online or offline event. Representation of a project may be +further defined and clarified by project maintainers. 
+ + +## Enforcement + + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting any of the maintainers of this project and +we will attempt to resolve the issues with respect and dignity. + + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. + + +## Attribution + + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html + + +[homepage]: https://www.contributor-covenant.org + + +For answers to common questions about this code of conduct, see +https://www.contributor-covenant.org/faq \ No newline at end of file diff --git a/cdp_backend/tests/resources/example_video_large.mp4 b/cdp_backend/tests/resources/example_video_large.mp4 index 6d931d1e..72e3db61 100644 Binary files a/cdp_backend/tests/resources/example_video_large.mp4 and b/cdp_backend/tests/resources/example_video_large.mp4 differ diff --git a/cdp_backend/tests/utils/test_matter_extraction.py b/cdp_backend/tests/utils/test_matter_extraction.py new file mode 100644 index 00000000..c7bf2e79 --- /dev/null +++ b/cdp_backend/tests/utils/test_matter_extraction.py @@ -0,0 +1,40 @@ +import sys +from pathlib import Path + +import pytest +from textract.exceptions import ExtensionNotSupported, MissingFileError + +from cdp_backend.utils import matter_extraction + + +@pytest.mark.skipif( + sys.platform == "win32", + reason="File removal for test cleanup sometimes fails on Windows", +) +@pytest.mark.parametrize( + "file_uri, expected_size", + [ + ("fake_creds.json", 382), + ("code_of_conduct.docx", 3169), + ("code_of_conduct.pdf", 4105), + ("code_of_conduct.txt", 3119), + pytest.param( + "fake_fake_creds.json", + 100, + marks=pytest.mark.raises(exception=MissingFileError), + ), + pytest.param( + "example_video.mp4", + 100, + 
def get_matter_text(uri: str) -> str:
    """
    Get the text contained in a document (e.g. docx, pdf, json or txt file).

    Parameters
    ----------
    uri: str
        The file's uri. It is passed unchanged to ``textract.process``.

    Returns
    -------
    matter_text: str
        The text contained within the file, decoded to ``str``.

    Raises
    ------
    textract.exceptions.MissingFileError
        If no file exists at ``uri``.
    textract.exceptions.ExtensionNotSupported
        If textract has no parser for the file's extension (e.g. ``.mp4``).
    """
    # textract.process returns the extracted text as an encoded byte string
    # (UTF-8 by default), not ``str``. Decode it so the declared ``-> str``
    # contract actually holds; note that len() of the result therefore counts
    # characters rather than bytes.
    raw_text = textract.process(uri)
    if isinstance(raw_text, bytes):
        return raw_text.decode("utf-8")
    # Defensive: pass through unchanged if the backend already produced str.
    return raw_text