Skip to content

Commit

Permalink
addressed fred's comments in my PR. fixed a few typos, clarified doc …
Browse files Browse the repository at this point in the history
…comments, and added more depth to regression tests
  • Loading branch information
ZachEichen committed Jul 13, 2021
1 parent 924bf59 commit 7a6c864
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 13 deletions.
18 changes: 10 additions & 8 deletions text_extensions_for_pandas/io/conll.py
Original file line number Diff line number Diff line change
Expand Up @@ -1299,26 +1299,28 @@ def download_file(url, destination):


def maybe_download_dataset_data(
target_dir: str, document_url: str, alternate_name: str = None
target_dir: str, document_url: str, fname: str = None
) -> Union[str, List[str]]:
"""
If the file found at the github url is not found in the target directory,
downloads it from the github url, and saves it to that plave in downloads.
If the file found at the url is not found in the target directory,
downloads it, and saves it to that place in downloads.
Returns the path to the file. If a zip archive is downloaded, only files that are not already in the target
directory will be fetched, and if an alternate_name is given only that file will be operated on.
Note if a Zip archive is downloaded it will be unpacked so verify that the url being used is safe.
:param target_dir: Directory where this function should write the document
:param document_url: url from which to download the docuemnt. If no alternate name is specified,
it is assumed that the string after the last slash is the name of the file.
:param alternate_name: if given, the name of the file that is checked in the target directory,
:param fname: if given, the name of the file that is checked in the target directory,
as well as what is used to save the file if no such file is found. If a zip file is downloaded, and a file of this
name exists in in the archive, only it will be extracted.
:returns: the path to the file, or None if downloading was not successful
If the file found at the url is not found in the target directory,
downloads it, and saves it to that place in downloads
"""
file_name = (
alternate_name if alternate_name is not None else document_url.split("/")[-1]
fname if fname is not None else document_url.split("/")[-1]
)
full_path = target_dir + "/" + file_name
# if no directory exists, create one
Expand All @@ -1327,7 +1329,7 @@ def maybe_download_dataset_data(

# special logic for zip files
if document_url.split(".")[-1] == "zip" and (
alternate_name is None or not os.path.exists(full_path)
fname is None or not os.path.exists(full_path)
):
# if we have a zip file already, don't re-download it
zipPath = target_dir + "/" + document_url.split("/")[-1]
Expand All @@ -1338,8 +1340,8 @@ def maybe_download_dataset_data(
# if need be, extract the zipfile documents
with ZipFile(zipPath, "r") as zipf:
fnames = zipf.namelist()
if alternate_name is not None and alternate_name in fnames:
zipf.extract(alternate_name, target_dir)
if fname is not None and fname in fnames:
zipf.extract(fname, target_dir)
return full_path
for fname in fnames:
if not os.path.exists(target_dir + fname):
Expand Down
30 changes: 25 additions & 5 deletions text_extensions_for_pandas/io/test_conll.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import unittest
import textwrap
import os
import shutil

from text_extensions_for_pandas.io.conll import *
from text_extensions_for_pandas.io.spacy import make_tokens_and_features
Expand Down Expand Up @@ -734,8 +735,8 @@ def test_compute_accuracy(self):

def test_maybe_download_dataset(self):
base_dir = "test_data/io/test_conll"
ewt_dir = base_dir + "/ewt"
conll9_dir = base_dir + "/conll9"
ewt_dir = base_dir + "/ewt_temp"
conll9_dir = base_dir + "/conll9_temp"
ewt_url = "https://github.com/UniversalDependencies/UD_English-EWT/blob/master/en_ewt-ud-dev.conllu"
conll_09_test_data_url = (
"https://ufal.mff.cuni.cz/conll2009-st/trial/CoNLL2009-ST-English-trial.zip"
Expand All @@ -746,13 +747,28 @@ def test_maybe_download_dataset(self):
self.assertEqual(val, ewt_dir + "/en_ewt-ud-dev.conllu")
self.assertTrue(os.path.isdir(ewt_dir))
self.assertTrue(os.path.isfile(ewt_dir + "/en_ewt-ud-dev.conllu"))
# test download of
val = maybe_download_dataset_data(ewt_dir, ewt_url, alternate_name="dev.conllu")

# test functionality when file already exists
val = maybe_download_dataset_data(ewt_dir, ewt_url)
self.assertEqual(val, ewt_dir + "/en_ewt-ud-dev.conllu")
self.assertTrue(os.path.isdir(ewt_dir))
self.assertTrue(os.path.isfile(ewt_dir + "/en_ewt-ud-dev.conllu"))

# test download of alternately named file
val = maybe_download_dataset_data(ewt_dir, ewt_url, fname="dev.conllu")
self.assertEqual(val, ewt_dir + "/dev.conllu")
self.assertTrue(os.path.isdir(ewt_dir))
self.assertTrue(os.path.isfile(ewt_dir + "/dev.conllu"))
# check we didn't overwrite the last file
self.assertTrue(os.path.isfile(ewt_dir + "/en_ewt-ud-dev.conllu"))
# verify functionality when file already exists
val = maybe_download_dataset_data(ewt_dir, ewt_url, fname="dev.conllu")
self.assertEqual(val, ewt_dir + "/dev.conllu")
self.assertTrue(os.path.isdir(ewt_dir))
self.assertTrue(os.path.isfile(ewt_dir + "/dev.conllu"))
# check we didn't overwrite the last file
self.assertTrue(os.path.isfile(ewt_dir + "/en_ewt-ud-dev.conllu"))


# test zip
conll_9_file = conll9_dir + "/CoNLL2009-ST-English-trial.txt"
Expand All @@ -765,10 +781,14 @@ def test_maybe_download_dataset(self):
maybe_download_dataset_data(
conll9_dir,
conll_09_test_data_url,
alternate_name="/CoNLL2009-ST-English-trial.txt",
fname="/CoNLL2009-ST-English-trial.txt",
)
self.assertFalse(os.path.exists(conll9_dir + "CoNLL2009-ST-English-trial.zip"))

#clean up by removing our two temp dirs
shutil.rmtree(ewt_dir)
shutil.rmtree(conll9_dir)


if __name__ == "__main__":
unittest.main()

0 comments on commit 7a6c864

Please sign in to comment.