Merge pull request #65 from meghdadFar/higher-order-mwe
Extract variable-length MWE using a user-defined POS regex pattern.
meghdadFar committed Jun 26, 2023
2 parents 85e1c18 + 49a9c34 commit ca55f44
Showing 6 changed files with 157 additions and 21 deletions.
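
The new extractor introduced by this commit is exercised in tests/mwe/test_mwe.py below. As a minimal usage sketch (assuming the NLTK tokenizer and POS-tagger data are installed; the sentence and pattern here are illustrative only):

    import nltk
    from wordview.mwes.mwe import HigherOrderMWEExtractor

    # Tokenize the input text; POS tagging happens inside the extractor.
    tokens = nltk.word_tokenize("The quick brown fox jumps over the lazy dog.")

    # A labelled POS grammar for nltk.RegexpParser; several newline-separated rules may be given.
    pattern = "NP: {<DT>?<JJ>*<NN>}"

    extractor = HigherOrderMWEExtractor(tokens, pattern)
    counts = extractor.extract_higher_order_mwe_candidates()
    # counts maps each pattern label to matched spans and their frequencies,
    # e.g. {"NP": {"the lazy dog": 1, ...}}
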
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -15,7 +15,7 @@ repos:
# supported by your project here, or alternatively use
# pre-commit's default_language_version, see
# https://pre-commit.com/#top_level-default_language_version
language_version: python3.9
language_version: python3.10
- repo: https://github.com/PyCQA/flake8
rev: 6.0.0
hooks:
5 changes: 5 additions & 0 deletions CHANGES.rst
@@ -1,3 +1,8 @@
Version 0.4.0
-------------
- Support for extracting variable-length MWEs given a user-defined pattern of POS tags.


Version 0.3.7
-------------
- Change newline encoding.
22 changes: 8 additions & 14 deletions README.rst
@@ -1,9 +1,15 @@
Wordview (Work In Progress)
###########################

|PyPI version|

|Python 3.9|
.. image:: https://img.shields.io/pypi/v/wordview
:alt: PyPI

.. image:: https://img.shields.io/pypi/pyversions/wordview
:alt: PyPI - Python Version

.. image:: https://img.shields.io/pypi/dm/wordview
:alt: PyPI - Downloads

Wordview is a Python package for Exploratory Data Analysis (EDA) and Feature Extraction for text.
Wordview's Python API is open-source and available under the `MIT
@@ -93,18 +99,6 @@ Contributing
Thank you for contributing to wordview! We and the users of this repo
appreciate your efforts! You can visit the `contributing page <CONTRIBUTING.rst>`__ for detailed instructions about how you can contribute to Wordview.


.. |PyPI version| image:: https://badge.fury.io/py/wordview.svg
:target: https://badge.fury.io/py/wordview

.. |Python 3.9| image:: https://img.shields.io/badge/python-3.9-blue.svg
:target: https://www.python.org/downloads/release/python-390/
.. |verbs| image:: docs/figs/verbs.png
.. |nouns| image:: docs/figs/nouns.png
.. |adjs| image:: docs/figs/adjectives.png
.. |doclen| image:: docs/figs/doclen.png
.. |wordszipf| image:: docs/figs/wordszipf.png
.. |labels| image:: docs/figs/labels.png
.. |cover| image:: docs/figs/abstract_cover_2.png
.. |clustering_cover| image:: docs/figs/clustering_cover.png
.. |text_analysis_cover| image:: docs/figs/text_analysis.png
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "wordview"
version = "0.3.7"
version = "0.4.0"
description = "Wordview is a Python package for text analysis."
authors = ["meghdadFar <[email protected]>"]
readme = "README.rst"
63 changes: 62 additions & 1 deletion tests/mwe/test_mwe.py
@@ -1,7 +1,8 @@
import pytest
from unittest.mock import patch, MagicMock
import pandas as pd
from wordview.mwes.mwe import MWE
from wordview.mwes.mwe import MWE, HigherOrderMWEExtractor
import nltk


@pytest.fixture
@@ -36,6 +37,13 @@ def dummy_text_pandas_with_no_noun_compund():
dummy_pos_tags_without_noun_compund = [("no", "XXX"),("sequence", "XXX"),("of", "XXX"),("nouns", "XXX"),("in", "XXX"),("this", "XXX"),("one", "XXX")]


@pytest.fixture
def tagged_sentence_fixture():
sentence = "The very quick brown fox swiftly jumps over the lazy dog that is extremely lazy while John Doe attentively watches the lazy dog."
tokens = nltk.word_tokenize(sentence)
return tokens


class TestMweInitialisation:

def test_mwe_does_not_tokenize_text_with_multiple_whitespaces(self, dummy_text_pandas):
@@ -79,6 +87,59 @@ def test_mwe_if_no_nc_returns_empty_mwe_counts(self, dummy_text_pandas_with_no_n
assert counts["NC"] == {}


class TestHigherOrderMWEExtraction:

def test_extract_higher_order_mwes_wrong_type_tokens(self):
tokens = "this is a string"
pattern = "NP: {<DT>?<JJ>*<NN>}"
with pytest.raises(TypeError):
mwe_extractor = HigherOrderMWEExtractor(tokens, pattern)

def test_extract_higher_order_mwes_empty_tokens(self):
tokens = []
pattern = "NP: {<DT>?<JJ>*<NN>}"
with pytest.raises(ValueError):
mwe_extractor = HigherOrderMWEExtractor(tokens, pattern)

def test_extract_higher_order_mwes_wrong_type_pattern(self, tagged_sentence_fixture):
pattern = 1
with pytest.raises(TypeError):
mwe_extractor = HigherOrderMWEExtractor(tagged_sentence_fixture, pattern)

def test_extract_higher_order_mwes_empty_pattern(self, tagged_sentence_fixture):
pattern = ""
with pytest.raises(ValueError):
mwe_extractor = HigherOrderMWEExtractor(tagged_sentence_fixture, pattern)

def test_extract_higher_order_mwes_incorrect_pattern(self, tagged_sentence_fixture):
pattern = "{<DT>?<JJ>*<NN>}"
mwe_extractor = HigherOrderMWEExtractor(tagged_sentence_fixture, pattern)
with pytest.raises(ValueError):
mwe_extractor.extract_higher_order_mwe_candidates()

def test_extract_higher_order_mwes_single_pattern(self, tagged_sentence_fixture):
pattern = "NP: {<DT>?<JJ>+<NN>}"
mwe_extractor = HigherOrderMWEExtractor(tagged_sentence_fixture, pattern)
expected = {'NP': {'quick brown': 1, 'the lazy dog': 2}}
actual = mwe_extractor.extract_higher_order_mwe_candidates()
assert expected == actual

def test_extract_higher_order_mwes_multi_pattern(self, tagged_sentence_fixture):
pattern = """NP: {<DT>?<JJ>+<NN>} # Noun phrase
PROPN: {<NNP>+} # Proper noun
ADJP: {<RB|RBR|RBS>*<JJ>} # Adjective phrase
ADVP: {<RB.*>+<VB.*><RB.*>*} # Adverb phrase"""
expected = {
'NP': {'quick brown': 1, 'the lazy dog': 2},
'PROPN': {'John Doe': 1},
'ADJP': {'extremely lazy': 1},
'ADVP': {'swiftly jumps': 1, 'attentively watches': 1}
}
mwe_extractor = HigherOrderMWEExtractor(tagged_sentence_fixture, pattern)
actual = mwe_extractor.extract_higher_order_mwe_candidates()
assert expected == actual


@pytest.mark.xfail
def test_mwe_build_counts(dummy_text_pandas_with_no_noun_compund):
mwe = MWE(df = dummy_text_pandas_with_no_noun_compund, text_column = "text", tokenize=True, mwe_types = ["NC"])
84 changes: 80 additions & 4 deletions wordview/mwes/mwe.py
@@ -1,12 +1,10 @@
import json
import re
from collections import Counter
from pathlib import Path
from typing import Dict, List, Optional, Union
from typing import Dict, Optional

import pandas
import tqdm
from nltk import word_tokenize
from nltk import RegexpParser, word_tokenize

from wordview import logger
from wordview.mwes.am import calculate_am
@@ -240,3 +238,81 @@ def extract_mwes_from_sent(self, tokens: list[str], mwe_type: str) -> Dict:
mwes.append(w1[0] + " " + w2[0])
mwes_count_dic = Counter(mwes)
return mwes_count_dic


class HigherOrderMWEExtractor:
def __init__(self, tokens: list[str], pattern: str) -> None:
self.tokens = tokens
self.pattern = pattern
self._validate_input()

def _validate_input(self) -> None:
if not isinstance(self.tokens, list):
raise TypeError(
f'Input argument "tokens" must be a list of strings. Currently it is of type {type(self.tokens)} \
with a value of: {self.tokens}.'
)
if len(self.tokens) == 0:
raise ValueError(
'Input argument "tokens" must be a non-empty list of strings.'
)
if not isinstance(self.pattern, str):
raise TypeError(
f'Input argument "pattern" must be a string. Currently it is of type {type(self.pattern)} \
with a value of: {self.pattern}.'
)
if len(self.pattern) == 0:
raise ValueError(
'Input argument "pattern" must be a non-empty string.'
)

def extract_higher_order_mwe_candidates(self) -> dict:
"""
Extract variable-length MWE candidates from the tokenized input, using a user-defined POS regex pattern.

Uses the tokens and pattern provided at construction:
    tokens (list[str]): A list of word tokens; part-of-speech tags are assigned internally.
    pattern (str): A user-defined grammar for nltk.RegexpParser.

Returns:
    match_counter (dict[str, dict[str, int]]): A counter dictionary with counts of matched strings, grouped by pattern label.
    Labels without matches map to an empty dictionary.

Examples of user-defined patterns:
- NP: {<DT>?<JJ>*<NN>} # Noun phrase
- VP: {<MD>?<VB.*><NP|PP|CLAUSE>+$} # Verb phrase
- PP: {<IN><NP>} # Prepositional phrase
You can use multiple and/or nested patterns, separated by a newline character:
pattern = '''
NP: {<DT>?<JJ>*<NN>} # Noun phrase
PROPN: {<NNP>+} # Proper noun
ADJP: {<RB|RBR|RBS>*<JJ>} # Adjective phrase
ADVP: {<RB.*>+<VB.*><RB.*>*} # Adverb phrase
'''
In this case, the clauses of the grammar are applied in order. An earlier
clause may introduce a chunk boundary that prevents a later clause from matching.
"""

tagged_tokens: list[tuple[str, str]] = get_pos_tags(self.tokens)
parser = RegexpParser(self.pattern)
parsed_tokens = parser.parse(tagged_tokens)

labels: list[str] = [
rule.split(":")[0].strip() for rule in self.pattern.split("\n") if rule
]

matches: dict[str, list[str]] = {label: [] for label in labels}

for subtree in parsed_tokens.subtrees():
label = subtree.label()
if label in matches:
matches[label].append(
" ".join(word for (word, tag) in subtree.leaves())
)

matches_counter: dict[str, dict[str, int]] = {
label: dict(Counter(match_list)) for label, match_list in matches.items()
}
return matches_counter
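
For readers unfamiliar with nltk.RegexpParser, the following standalone sketch mirrors what extract_higher_order_mwe_candidates does internally: POS-tag the tokens, parse them against the labelled grammar, then collect and count the leaves of every matching subtree. The sentence and grammar are illustrative only.

    from collections import Counter

    import nltk
    from nltk import RegexpParser

    tokens = nltk.word_tokenize("The quick brown fox jumps over the lazy dog.")
    tagged = nltk.pos_tag(tokens)  # e.g. [('The', 'DT'), ('quick', 'JJ'), ...]
    tree = RegexpParser("NP: {<DT>?<JJ>*<NN>}").parse(tagged)

    # Collect the word spans of every subtree labelled "NP" and count them.
    spans = [
        " ".join(word for word, _ in subtree.leaves())
        for subtree in tree.subtrees()
        if subtree.label() == "NP"
    ]
    print(dict(Counter(spans)))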
