Skip to content

Commit

Permalink
cleaned up formatting slightly
Browse files Browse the repository at this point in the history
  • Loading branch information
ZachEichen committed Jul 12, 2021
1 parent 6878381 commit fd52cf2
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 49 deletions.
44 changes: 22 additions & 22 deletions text_extensions_for_pandas/io/conll.py
Original file line number Diff line number Diff line change
Expand Up @@ -769,7 +769,7 @@ def _doc_to_df(
ret["line_num"] = pd.Series(doc_line_nums)
if conll_u and "head" in column_names:
ret = ret.astype({"head": "Int64"}, errors="ignore")
ret.loc[ret['head'] == -1, 'head'] = pd.NA
ret.loc[ret["head"] == -1, "head"] = pd.NA
return ret


Expand Down Expand Up @@ -1302,25 +1302,25 @@ def maybe_download_dataset_data(
target_dir: str, document_url: str, alternate_name: str = None
) -> Union[str, List[str]]:
"""
If the file found at the github url is not found in the target directory,
downloads it from the github url, and saves it to that place in downloads.
Returns the path to the file. If a zip archive is downloaded, only files that are not already in the target
directory will be fetched, and if an alternate_name is given only that file will be operated on.
Note if a Zip archive is downloaded it will be unpacked so verify that the url being used is safe.
:param target_dir: Directory where this function should write the document
:param document_url: url from which to download the document. If no alternate name is specified,
it is assumed that the string after the last slash is the name of the file.
:param alternate_name: if given, the name of the file that is checked in the target directory,
as well as what is used to save the file if no such file is found. If a zip file is downloaded, and a file of this
name exists in the archive, only it will be extracted.
:returns: the path to the file, or None if downloading was not successful
If the file found at the github url is not found in the target directory,
    downloads it from the github url, and saves it to that place in downloads.
Returns the path to the file. If a zip archive is downloaded, only files that are not already in the target
directory will be fetched, and if an alternate_name is given only that file will be operated on.
Note if a Zip archive is downloaded it will be unpacked so verify that the url being used is safe.
:param target_dir: Directory where this function should write the document
    :param document_url: url from which to download the document. If no alternate name is specified,
it is assumed that the string after the last slash is the name of the file.
:param alternate_name: if given, the name of the file that is checked in the target directory,
as well as what is used to save the file if no such file is found. If a zip file is downloaded, and a file of this
    name exists in the archive, only it will be extracted.
:returns: the path to the file, or None if downloading was not successful
"""
file_name = (
alternate_name if alternate_name is not None else document_url.split("/")[-1]
)
full_path = target_dir +'/' + file_name
full_path = target_dir + "/" + file_name
# if no directory exists, create one
if not os.path.exists(target_dir):
os.mkdir(target_dir)
Expand All @@ -1329,13 +1329,13 @@ def maybe_download_dataset_data(
if document_url.split(".")[-1] == "zip" and (
alternate_name is None or not os.path.exists(full_path)
):
# if we have a zip file already, don't re-download it
zipPath = target_dir +'/'+ document_url.split("/")[-1]
# if we have a zip file already, don't re-download it
zipPath = target_dir + "/" + document_url.split("/")[-1]
if not os.path.exists(zipPath):
data = requests.get(document_url)
open(zipPath, "wb").write(data.content)
# if need be, extract the zipfile documents

# if need be, extract the zipfile documents
with ZipFile(zipPath, "r") as zipf:
fnames = zipf.namelist()
if alternate_name is not None and alternate_name in fnames:
Expand All @@ -1345,9 +1345,9 @@ def maybe_download_dataset_data(
if not os.path.exists(target_dir + fname):
zipf.extract(fname, target_dir)
if len(fnames) == 1:
full_path = target_dir +'/'+ fnames[0]
full_path = target_dir + "/" + fnames[0]
else:
return [target_dir + '/'+ fname for fname in fnames]
return [target_dir + "/" + fname for fname in fnames]

# regular logic
elif not os.path.exists(full_path):
Expand Down
54 changes: 27 additions & 27 deletions text_extensions_for_pandas/io/test_conll.py
Original file line number Diff line number Diff line change
Expand Up @@ -736,40 +736,40 @@ def test_compute_accuracy(self):

def test_maybe_download_dataset(self):
base_dir = "test_data/io/test_conll"
ewt_dir = base_dir + '/ewt'
conll9_dir = base_dir + '/conll9'
ewt_dir = base_dir + "/ewt"
conll9_dir = base_dir + "/conll9"
ewt_url = "https://github.com/UniversalDependencies/UD_English-EWT/blob/master/en_ewt-ud-dev.conllu"
conll_09_test_data_url = 'https://ufal.mff.cuni.cz/conll2009-st/trial/CoNLL2009-ST-English-trial.zip'

#test download of file
val = maybe_download_dataset_data(ewt_dir,ewt_url)
self.assertEqual(val,ewt_dir + '/en_ewt-ud-dev.conllu')
conll_09_test_data_url = (
"https://ufal.mff.cuni.cz/conll2009-st/trial/CoNLL2009-ST-English-trial.zip"
)

# test download of file
val = maybe_download_dataset_data(ewt_dir, ewt_url)
self.assertEqual(val, ewt_dir + "/en_ewt-ud-dev.conllu")
self.assertTrue(os.path.isdir(ewt_dir))
self.assertTrue(os.path.isfile(ewt_dir + '/en_ewt-ud-dev.conllu'))
#test download of
val = maybe_download_dataset_data(ewt_dir,ewt_url,alternate_name="dev.conllu")
self.assertEqual(val,ewt_dir + '/dev.conllu')
self.assertTrue(os.path.isfile(ewt_dir + "/en_ewt-ud-dev.conllu"))
# test download of
val = maybe_download_dataset_data(ewt_dir, ewt_url, alternate_name="dev.conllu")
self.assertEqual(val, ewt_dir + "/dev.conllu")
self.assertTrue(os.path.isdir(ewt_dir))
self.assertTrue(os.path.isfile(ewt_dir + '/dev.conllu'))
self.assertTrue(os.path.isfile(ewt_dir + "/dev.conllu"))
# check we didn't overwrite the last file
self.assertTrue(os.path.isfile(ewt_dir + '/en_ewt-ud-dev.conllu'))

self.assertTrue(os.path.isfile(ewt_dir + "/en_ewt-ud-dev.conllu"))

#test zip
conll_9_file = conll9_dir + '/CoNLL2009-ST-English-trial.txt'
val = maybe_download_dataset_data(conll9_dir,conll_09_test_data_url)
self.assertEqual(val,conll_9_file)
# test zip
conll_9_file = conll9_dir + "/CoNLL2009-ST-English-trial.txt"
val = maybe_download_dataset_data(conll9_dir, conll_09_test_data_url)
self.assertEqual(val, conll_9_file)
self.assertTrue(os.path.isdir(conll9_dir))
self.assertTrue(os.path.isfile(conll_9_file))
#verify we don't double download for zips
os.remove(conll9_dir + '/CoNLL2009-ST-English-trial.zip')
maybe_download_dataset_data(conll9_dir,
conll_09_test_data_url,
alternate_name='/CoNLL2009-ST-English-trial.txt')
self.assertFalse(os.path.exists(conll9_dir + 'CoNLL2009-ST-English-trial.zip'))



# verify we don't double download for zips
os.remove(conll9_dir + "/CoNLL2009-ST-English-trial.zip")
maybe_download_dataset_data(
conll9_dir,
conll_09_test_data_url,
alternate_name="/CoNLL2009-ST-English-trial.txt",
)
self.assertFalse(os.path.exists(conll9_dir + "CoNLL2009-ST-English-trial.zip"))


if __name__ == "__main__":
Expand Down

0 comments on commit fd52cf2

Please sign in to comment.