Merge pull request #65 from meghdadFar/higher-order-mwe
Extract variable-length MWE using a user-defined POS regex pattern.
meghdadFar committed Jun 26, 2023
2 parents 85e1c18 + 49a9c34 commit ca55f44
Showing 6 changed files with 157 additions and 21 deletions.
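
The new extractor introduced by this commit is exercised in tests/mwe/test_mwe.py below. As a minimal usage sketch (assuming the NLTK tokenizer and POS-tagger data are installed; the sentence and pattern here are illustrative only):

    import nltk
    from wordview.mwes.mwe import HigherOrderMWEExtractor

    # Tokenize the input text; POS tagging happens inside the extractor.
    tokens = nltk.word_tokenize("The quick brown fox jumps over the lazy dog.")

    # A labelled POS grammar for nltk.RegexpParser; several newline-separated rules may be given.
    pattern = "NP: {<DT>?<JJ>*<NN>}"

    extractor = HigherOrderMWEExtractor(tokens, pattern)
    counts = extractor.extract_higher_order_mwe_candidates()
    # counts maps each pattern label to matched spans and their frequencies,
    # e.g. {"NP": {"the lazy dog": 1, ...}}
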
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -15,7 +15,7 @@ repos:
# supported by your project here, or alternatively use
# pre-commit's default_language_version, see
# https://pre-commit.com/#top_level-default_language_version
language_version: python3.9
language_version: python3.10
- repo: https://github.com/PyCQA/flake8
rev: 6.0.0
hooks:
5 changes: 5 additions & 0 deletions CHANGES.rst
@@ -1,3 +1,8 @@
Version 0.4.0
-------------
- Support for extracting variable-length MWEs given a user-defined pattern of POS tags.


Version 0.3.7
-------------
- Change newline encoding.
22 changes: 8 additions & 14 deletions README.rst
@@ -1,9 +1,15 @@
Wordview (Work In Progress)
###########################

|PyPI version|

|Python 3.9|
.. image:: https://img.shields.io/pypi/v/wordview
:alt: PyPI

.. image:: https://img.shields.io/pypi/pyversions/wordview
:alt: PyPI - Python Version

.. image:: https://img.shields.io/pypi/dm/wordview
:alt: PyPI - Downloads

Wordview is a Python package for Exploratory Data Analysis (EDA) and Feature Extraction for text.
Wordview's Python API is open-source and available under the `MIT
@@ -93,18 +99,6 @@ Contributing
Thank you for contributing to wordview! We and the users of this repo
appreciate your efforts! You can visit the `contributing page <CONTRIBUTING.rst>`__ for detailed instructions about how you can contribute to Wordview.


.. |PyPI version| image:: https://badge.fury.io/py/wordview.svg
:target: https://badge.fury.io/py/wordview

.. |Python 3.9| image:: https://img.shields.io/badge/python-3.9-blue.svg
:target: https://www.python.org/downloads/release/python-390/
.. |verbs| image:: docs/figs/verbs.png
.. |nouns| image:: docs/figs/nouns.png
.. |adjs| image:: docs/figs/adjectives.png
.. |doclen| image:: docs/figs/doclen.png
.. |wordszipf| image:: docs/figs/wordszipf.png
.. |labels| image:: docs/figs/labels.png
.. |cover| image:: docs/figs/abstract_cover_2.png
.. |clustering_cover| image:: docs/figs/clustering_cover.png
.. |text_analysis_cover| image:: docs/figs/text_analysis.png
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "wordview"
version = "0.3.7"
version = "0.4.0"
description = "Wordview is a Python package for text analysis."
authors = ["meghdadFar <[email protected]>"]
readme = "README.rst"
63 changes: 62 additions & 1 deletion tests/mwe/test_mwe.py
@@ -1,7 +1,8 @@
import pytest
from unittest.mock import patch, MagicMock
import pandas as pd
from wordview.mwes.mwe import MWE
from wordview.mwes.mwe import MWE, HigherOrderMWEExtractor
import nltk


@pytest.fixture
@@ -36,6 +37,13 @@ def dummy_text_pandas_with_no_noun_compund():
dummy_pos_tags_without_noun_compund = [("no", "XXX"),("sequence", "XXX"),("of", "XXX"),("nouns", "XXX"),("in", "XXX"),("this", "XXX"),("one", "XXX")]


@pytest.fixture
def tagged_sentence_fixture():
sentence = "The very quick brown fox swiftly jumps over the lazy dog that is extremely lazy while John Doe attentively watches the lazy dog."
tokens = nltk.word_tokenize(sentence)
return tokens


class TestMweInitialisation:

def test_mwe_does_not_tokenize_text_with_multiple_whitespaces(self, dummy_text_pandas):
@@ -79,6 +87,59 @@ def test_mwe_if_no_nc_returns_empty_mwe_counts(self, dummy_text_pandas_with_no_n
assert counts["NC"] == {}


class TestHigherOrderMWEExtraction:

def test_extract_higher_order_mwes_wrong_type_tokens(self):
tokens = "this is a string"
pattern = "NP: {<DT>?<JJ>*<NN>}"
with pytest.raises(TypeError):
mwe_extractor = HigherOrderMWEExtractor(tokens, pattern)

def test_extract_higher_order_mwes_empty_tokens(self):
tokens = []
pattern = "NP: {<DT>?<JJ>*<NN>}"
with pytest.raises(ValueError):
mwe_extractor = HigherOrderMWEExtractor(tokens, pattern)

def test_extract_higher_order_mwes_wrong_type_pattern(self, tagged_sentence_fixture):
pattern = 1
with pytest.raises(TypeError):
mwe_extractor = HigherOrderMWEExtractor(tagged_sentence_fixture, pattern)

def test_extract_higher_order_mwes_empty_pattern(self, tagged_sentence_fixture):
pattern = ""
with pytest.raises(ValueError):
mwe_extractor = HigherOrderMWEExtractor(tagged_sentence_fixture, pattern)

def test_extract_higher_order_mwes_incorrect_pattern(self, tagged_sentence_fixture):
pattern = "{<DT>?<JJ>*<NN>}"
mwe_extractor = HigherOrderMWEExtractor(tagged_sentence_fixture, pattern)
with pytest.raises(ValueError):
mwe_extractor.extract_higher_order_mwe_candidates()

def test_extract_higher_order_mwes_single_pattern(self, tagged_sentence_fixture):
pattern = "NP: {<DT>?<JJ>+<NN>}"
mwe_extractor = HigherOrderMWEExtractor(tagged_sentence_fixture, pattern)
expected = {'NP': {'quick brown': 1, 'the lazy dog': 2}}
actual = mwe_extractor.extract_higher_order_mwe_candidates()
assert expected == actual

def test_extract_higher_order_mwes_multi_pattern(self, tagged_sentence_fixture):
pattern = """NP: {<DT>?<JJ>+<NN>} # Noun phrase
PROPN: {<NNP>+} # Proper noun
ADJP: {<RB|RBR|RBS>*<JJ>} # Adjective phrase
ADVP: {<RB.*>+<VB.*><RB.*>*} # Adverb phrase"""
expected = {
'NP': {'quick brown': 1, 'the lazy dog': 2},
'PROPN': {'John Doe': 1},
'ADJP': {'extremely lazy': 1},
'ADVP': {'swiftly jumps': 1, 'attentively watches': 1}
}
mwe_extractor = HigherOrderMWEExtractor(tagged_sentence_fixture, pattern)
actual = mwe_extractor.extract_higher_order_mwe_candidates()
assert expected == actual


@pytest.mark.xfail
def test_mwe_build_counts(dummy_text_pandas_with_no_noun_compund):
mwe = MWE(df = dummy_text_pandas_with_no_noun_compund, text_column = "text", tokenize=True, mwe_types = ["NC"])
84 changes: 80 additions & 4 deletions wordview/mwes/mwe.py
@@ -1,12 +1,10 @@
import json
import re
from collections import Counter
from pathlib import Path
from typing import Dict, List, Optional, Union
from typing import Dict, Optional

import pandas
import tqdm
from nltk import word_tokenize
from nltk import RegexpParser, word_tokenize

from wordview import logger
from wordview.mwes.am import calculate_am
@@ -240,3 +238,81 @@ def extract_mwes_from_sent(self, tokens: list[str], mwe_type: str) -> Dict:
mwes.append(w1[0] + " " + w2[0])
mwes_count_dic = Counter(mwes)
return mwes_count_dic


class HigherOrderMWEExtractor:
def __init__(self, tokens: list[str], pattern: str) -> None:
self.tokens = tokens
self.pattern = pattern
self._validate_input()

def _validate_input(self) -> None:
if not isinstance(self.tokens, list):
raise TypeError(
f'Input argument "tokens" must be a list of strings. Currently it is of type {type(self.tokens)} \
with a value of: {self.tokens}.'
)
if len(self.tokens) == 0:
raise ValueError(
'Input argument "tokens" must be a non-empty list of strings.'
)
if not isinstance(self.pattern, str):
raise TypeError(
f'Input argument "pattern" must be a string. Currently it is of type {type(self.pattern)} \
with a value of: {self.pattern}.'
)
if len(self.pattern) == 0:
raise ValueError(
'Input argument "pattern" must be a non-empty string.'
)

def extract_higher_order_mwe_candidates(self) -> dict:
"""
Extract variable-length MWE candidates from the tokenized input, using a user-defined POS regex pattern.

Uses the tokens and pattern provided at construction:
    tokens (list[str]): A list of word tokens; part-of-speech tags are assigned internally.
    pattern (str): A user-defined grammar for nltk.RegexpParser.

Returns:
    match_counter (dict[str, dict[str, int]]): A counter dictionary with counts of matched strings, grouped by pattern label.
    Labels without matches map to an empty dictionary.

Examples of user-defined patterns:
- NP: {<DT>?<JJ>*<NN>} # Noun phrase
- VP: {<MD>?<VB.*><NP|PP|CLAUSE>+$} # Verb phrase
- PP: {<IN><NP>} # Prepositional phrase
You can use multiple and/or nested patterns, separated by a newline character:
pattern = '''
NP: {<DT>?<JJ>*<NN>} # Noun phrase
PROPN: {<NNP>+} # Proper noun
ADJP: {<RB|RBR|RBS>*<JJ>} # Adjective phrase
ADVP: {<RB.*>+<VB.*><RB.*>*} # Adverb phrase
'''
In this case, the clauses of the grammar are applied in order. An earlier
clause may introduce a chunk boundary that prevents a later clause from matching.
"""

tagged_tokens: list[tuple[str, str]] = get_pos_tags(self.tokens)
parser = RegexpParser(self.pattern)
parsed_tokens = parser.parse(tagged_tokens)

labels: list[str] = [
rule.split(":")[0].strip() for rule in self.pattern.split("\n") if rule
]

matches: dict[str, list[str]] = {label: [] for label in labels}

for subtree in parsed_tokens.subtrees():
label = subtree.label()
if label in matches:
matches[label].append(
" ".join(word for (word, tag) in subtree.leaves())
)

matches_counter: dict[str, dict[str, int]] = {
label: dict(Counter(match_list)) for label, match_list in matches.items()
}
return matches_counter
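
For readers unfamiliar with nltk.RegexpParser, the following standalone sketch mirrors what extract_higher_order_mwe_candidates does internally: POS-tag the tokens, parse them against the labelled grammar, then collect and count the leaves of every matching subtree. The sentence and grammar are illustrative only.

    from collections import Counter

    import nltk
    from nltk import RegexpParser

    tokens = nltk.word_tokenize("The quick brown fox jumps over the lazy dog.")
    tagged = nltk.pos_tag(tokens)  # e.g. [('The', 'DT'), ('quick', 'JJ'), ...]
    tree = RegexpParser("NP: {<DT>?<JJ>*<NN>}").parse(tagged)

    # Collect the word spans of every subtree labelled "NP" and count them.
    spans = [
        " ".join(word for word, _ in subtree.leaves())
        for subtree in tree.subtrees()
        if subtree.label() == "NP"
    ]
    print(dict(Counter(spans)))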
