Merge pull request #99 from meghdadFar/meghdadFar/fix-nltk-download-issue

Fix nltk download issue
meghdadFar · Aug 11, 2023 · 6d44278 · 6d44278
2 parents cc2c6a4 + 823424e
commit 6d44278
Show file tree

Hide file tree

Showing 14 changed files with 1,049 additions and 990 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -29,7 +29,7 @@ jobs:
  python -m pip install poetry
  - name: Install Dependencies
  run: poetry install
- - name: Download NLTK Resources
- run: poetry run nltk_download_script
+ # - name: Download NLTK Resources
+ #  run: poetry run nltk_download_script
  - name: Run Tests
  run: poetry run pytest --ignore=tests/clustering/
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -1,3 +1,10 @@
+Version 1.1.2
+-------------
+- Automatically check for and download missing NLTK resources.
+- Remove the CI step for downloading NLTK resources.
+- Make Text & Label Analysis plots easier to configure by introducing new, clearer arguments.
+
+
 Version 1.1.1
 -------------
 - Fix minor bugs in bias analysis.

diff --git a/bin/downloads.py b/bin/downloads.py
diff --git a/bin/nltk_resources.py b/bin/nltk_resources.py
@@ -0,0 +1,20 @@
+import nltk
+import os
+from wordview import logger
+
+
+def check_nltk_resources():
+ nltk_data_path = os.path.expanduser('~/nltk_data/')
+
+ resources = {
+ 'tokenizers/punkt': 'punkt',
+ 'corpora/stopwords': 'stopwords',
+ 'taggers/averaged_perceptron_tagger': 'averaged_perceptron_tagger'
+ }
+
+ for path, package in resources.items():
+ if not os.path.exists(os.path.join(nltk_data_path, path)):
+ logger.info(f"Downloading NLTK resource: {package}")
+ nltk.download(package)
+ else:
+ pass
diff --git a/docs/source/mwes.rst b/docs/source/mwes.rst
@@ -18,8 +18,8 @@ the documentation.
  # you can do it as follows:
  from wordview.preprocessing import NgramExtractor
  import pandas as pd
- imdb_train = pd.read_csv("data/IMDB_Dataset_sample.csv")
- extractor = NgramExtractor(imdb_train, "review")
+ imdb_corpus = pd.read_csv("data/IMDB_Dataset_sample.csv")
+ extractor = NgramExtractor(imdb_corpus, "review")
  extractor.extract_ngrams()
  extractor.get_ngram_counts(ngram_count_file_path="data/ngram_counts.json")
  

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "wordview"
-version = "1.1.1"
+version = "1.1.2"
 description = "Wordview is a Python package for text analysis."
 authors = ["meghdadFar <[email protected]>"]
 readme = "README.rst"

diff --git a/tests/mwe/test_mwe.py b/tests/mwe/test_mwe.py
@@ -1,8 +1,7 @@
 import pytest
 from unittest.mock import patch, MagicMock
 import pandas as pd
-from wordview.mwes.mwe import MWE, MWEPatternAssociation
-import nltk
+from wordview.mwes.mwe import MWE
 
 
 @pytest.fixture
@@ -64,11 +63,11 @@ def dummy_text_pandas_with_no_noun_compund():
 dummy_pos_tags_without_noun_compund = [("no", "XXX"),("sequence", "XXX"),("of", "XXX"),("nouns", "XXX"),("in", "XXX"),("this", "XXX"),("one", "XXX")]
 
 
-@pytest.fixture
-def tagged_sentence_fixture():
- sentence = "The very quick brown fox swiftly jumps over the lazy dog that is extremely lazy while John Doe attentively watches the lazy dog."
- tokens = nltk.word_tokenize(sentence)
- return tokens
+# @pytest.fixture
+# def tagged_sentence_fixture():
+#  sentence = "The very quick brown fox swiftly jumps over the lazy dog that is extremely lazy while John Doe attentively watches the lazy dog."
+#  tokens = nltk.word_tokenize(sentence)
+#  return tokens
 
 
 class TestMweInitialisation:

diff --git a/wordview/bias_analysis/bias.py b/wordview/bias_analysis/bias.py
@@ -8,10 +8,13 @@
 from tqdm import tqdm
 from transformers import BertForSequenceClassification, BertTokenizer
 
+from bin.nltk_resources import check_nltk_resources
 from wordview import logger
 from wordview.bias_analysis import bias_terms
 from wordview.io.dataframe_reader import DataFrameReader
 
+check_nltk_resources()
+
 
 class BiasDetector:
  def __init__(self, df, text_column):

diff --git a/wordview/io/dataframe_reader.py b/wordview/io/dataframe_reader.py
@@ -1,7 +1,10 @@
 from nltk.tokenize import sent_tokenize
 
+from bin.nltk_resources import check_nltk_resources
 from wordview import logger
 
+check_nltk_resources()
+
 
 class DataFrameReader:
  """Reads a dataframe column and returns sentences."""

diff --git a/wordview/mwes/mwe.py b/wordview/mwes/mwe.py
@@ -9,6 +9,7 @@
 from tabulate import tabulate # type: ignore
 from tqdm import tqdm
 
+from bin.nltk_resources import check_nltk_resources
 from wordview import logger
 from wordview.io.dataframe_reader import DataFrameReader
 from wordview.mwes.association_measures import PMICalculator
@@ -21,6 +22,9 @@ def is_alphanumeric_latinscript_multigram(word: str) -> Optional[Match[str]]:
  return match
 
 
+check_nltk_resources()
+
+
 class MWEPatternAssociation:
  """Extract MWE candidates from a list of tokens based on a given pattern."""
 

diff --git a/wordview/preprocessing/cleaning.py b/wordview/preprocessing/cleaning.py
@@ -3,6 +3,10 @@
 
 from nltk import word_tokenize
 
+from bin.nltk_resources import check_nltk_resources
+
+check_nltk_resources()
+
 
 def clean_text(
  text: str,

diff --git a/wordview/preprocessing/count.py b/wordview/preprocessing/count.py
@@ -9,9 +9,12 @@
 from nltk.tokenize import word_tokenize
 from nltk.util import ngrams
 
+from bin.nltk_resources import check_nltk_resources
 from wordview import logger
 from wordview.io.dataframe_reader import DataFrameReader
 
+check_nltk_resources()
+
 
 class NgramExtractor:
  """Extracts n-grams from a dataframe.

diff --git a/wordview/text_analysis/core.py b/wordview/text_analysis/core.py
@@ -15,8 +15,11 @@
 from tqdm import tqdm
 from wordcloud import WordCloud, get_single_color_func
 
+from bin.nltk_resources import check_nltk_resources
 from wordview import logger
 
+check_nltk_resources()
+
 
 def plotly_wordcloud(
  token_count_dic: dict, plot_settings: Dict = {}