addressed fred's comments in my PR. fixed a few typos, clarified doc …

…comments, and added more depth to regression tests
CODAIT · Jul 13, 2021 · 7a6c864 · 7a6c864
1 parent 924bf59
commit 7a6c864
Show file tree

Hide file tree

Showing 2 changed files with 35 additions and 13 deletions.
diff --git a/text_extensions_for_pandas/io/conll.py b/text_extensions_for_pandas/io/conll.py
@@ -1299,26 +1299,28 @@ def download_file(url, destination):
 
 
 def maybe_download_dataset_data(
- target_dir: str, document_url: str, alternate_name: str = None
+ target_dir: str, document_url: str, fname: str = None
 ) -> Union[str, List[str]]:
  """
- If the file found at the github url is not found in the target directory,
- downloads it from the github url, and saves it to that plave in downloads.
+ If the file found at the url is not found in the target directory,
+ downloads it, and saves it to that place in downloads.
  Returns the path to the file. If a zip archive is downloaded, only files that are not already in the target
  directory will be fetched, and if an alternate_name is given only that file will be operated on.
  Note if a Zip archive is downloaded it will be unpacked so verify that the url being used is safe.
 
  :param target_dir: Directory where this function should write the document
  :param document_url: url from which to download the docuemnt. If no alternate name is specified,
  it is assumed that the string after the last slash is the name of the file.
- :param alternate_name: if given, the name of the file that is checked in the target directory,
+ :param fname: if given, the name of the file that is checked in the target directory,
  as well as what is used to save the file if no such file is found. If a zip file is downloaded, and a file of this
  name exists in in the archive, only it will be extracted.
 
  :returns: the path to the file, or None if downloading was not successful
+ If the file found at the url is not found in the target directory,
+ downloads it, and saves it to that place in downloads
  """
  file_name = (
- alternate_name if alternate_name is not None else document_url.split("/")[-1]
+ fname if fname is not None else document_url.split("/")[-1]
  )
  full_path = target_dir + "/" + file_name
  # if no directory exists, create one
@@ -1327,7 +1329,7 @@ def maybe_download_dataset_data(
 
  # special logic for zip files
  if document_url.split(".")[-1] == "zip" and (
- alternate_name is None or not os.path.exists(full_path)
+ fname is None or not os.path.exists(full_path)
  ):
  # if we have a zip file already, don't re-download it
  zipPath = target_dir + "/" + document_url.split("/")[-1]
@@ -1338,8 +1340,8 @@ def maybe_download_dataset_data(
  # if need be, extract the zipfile documents
  with ZipFile(zipPath, "r") as zipf:
  fnames = zipf.namelist()
- if alternate_name is not None and alternate_name in fnames:
- zipf.extract(alternate_name, target_dir)
+ if fname is not None and fname in fnames:
+ zipf.extract(fname, target_dir)
  return full_path
  for fname in fnames:
  if not os.path.exists(target_dir + fname):

diff --git a/text_extensions_for_pandas/io/test_conll.py b/text_extensions_for_pandas/io/test_conll.py
@@ -17,6 +17,7 @@
 import unittest
 import textwrap
 import os
+import shutil
 
 from text_extensions_for_pandas.io.conll import *
 from text_extensions_for_pandas.io.spacy import make_tokens_and_features
@@ -734,8 +735,8 @@ def test_compute_accuracy(self):
 
  def test_maybe_download_dataset(self):
  base_dir = "test_data/io/test_conll"
- ewt_dir = base_dir + "/ewt"
- conll9_dir = base_dir + "/conll9"
+ ewt_dir = base_dir + "/ewt_temp"
+ conll9_dir = base_dir + "/conll9_temp"
  ewt_url = "https://github.com/UniversalDependencies/UD_English-EWT/blob/master/en_ewt-ud-dev.conllu"
  conll_09_test_data_url = (
  "https://ufal.mff.cuni.cz/conll2009-st/trial/CoNLL2009-ST-English-trial.zip"
@@ -746,13 +747,28 @@ def test_maybe_download_dataset(self):
  self.assertEqual(val, ewt_dir + "/en_ewt-ud-dev.conllu")
  self.assertTrue(os.path.isdir(ewt_dir))
  self.assertTrue(os.path.isfile(ewt_dir + "/en_ewt-ud-dev.conllu"))
- # test download of
- val = maybe_download_dataset_data(ewt_dir, ewt_url, alternate_name="dev.conllu")
+
+ # test functionality when file already exists
+ val = maybe_download_dataset_data(ewt_dir, ewt_url)
+ self.assertEqual(val, ewt_dir + "/en_ewt-ud-dev.conllu")
+ self.assertTrue(os.path.isdir(ewt_dir))
+ self.assertTrue(os.path.isfile(ewt_dir + "/en_ewt-ud-dev.conllu"))
+
+ # test download of alternately named file
+ val = maybe_download_dataset_data(ewt_dir, ewt_url, fname="dev.conllu")
  self.assertEqual(val, ewt_dir + "/dev.conllu")
  self.assertTrue(os.path.isdir(ewt_dir))
  self.assertTrue(os.path.isfile(ewt_dir + "/dev.conllu"))
  # check we didn't overwrite the last file
  self.assertTrue(os.path.isfile(ewt_dir + "/en_ewt-ud-dev.conllu"))
+ # verify functionality when file already exists
+ val = maybe_download_dataset_data(ewt_dir, ewt_url, fname="dev.conllu")
+ self.assertEqual(val, ewt_dir + "/dev.conllu")
+ self.assertTrue(os.path.isdir(ewt_dir))
+ self.assertTrue(os.path.isfile(ewt_dir + "/dev.conllu"))
+ # check we didn't overwrite the last file
+ self.assertTrue(os.path.isfile(ewt_dir + "/en_ewt-ud-dev.conllu"))
+
 
  # test zip
  conll_9_file = conll9_dir + "/CoNLL2009-ST-English-trial.txt"
@@ -765,10 +781,14 @@ def test_maybe_download_dataset(self):
  maybe_download_dataset_data(
  conll9_dir,
  conll_09_test_data_url,
- alternate_name="/CoNLL2009-ST-English-trial.txt",
+ fname="/CoNLL2009-ST-English-trial.txt",
  )
  self.assertFalse(os.path.exists(conll9_dir + "CoNLL2009-ST-English-trial.zip"))
 
+ #clean up by removing our two temp dirs
+ shutil.rmtree(ewt_dir)
+ shutil.rmtree(conll9_dir)
+
 
 if __name__ == "__main__":
  unittest.main()