Skip to content

Commit

Permalink
Create script to split large mismatch files and update upload script
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewtavis-wmde committed Apr 5, 2024
1 parent 908cc22 commit 388bfe3
Show file tree
Hide file tree
Showing 2 changed files with 200 additions and 18 deletions.
175 changes: 175 additions & 0 deletions split_mismatch_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
"""
Script that splits a large mismatch file into separate files.
Note: the upload limit for the Mismatch Finder API is 10 MB.
Please see the Mismatch Finder User Guide for more information:
https://github.com/wmde/wikidata-mismatch-finder/blob/development/docs/UserGuide.md
Usage:
python3 split_mismatch_file.py \
--mismatch-file MISMATCH_FILE \
--mismatch-files-dir MISMATCH_FILE_DIR \
--delete-mismatch-file \
--verbose
Abbreviated argument usage:
python3 split_mismatch_file.py \
-mf MISMATCH_FILE \
-mfd MISMATCH_FILE_DIR \
-del \
-v
"""

import argparse
import os

import pandas as pd
import numpy as np


# Section: functions for the script.
def lower(s: str):
    """
    Lowercase only the first character of ``s`` and leave the rest untouched.

    A falsy input (empty string or ``None``) yields an empty string.
    """
    if not s:
        return ""

    head, tail = s[:1], s[1:]
    return head.lower() + tail


# Section: Build the command-line interface for the script.
# NOTE: the validation code further down reads help texts by position from
# ``parser._actions`` (index 2 is --mismatch-file), so the order in which the
# arguments are registered here must not change.
parser = argparse.ArgumentParser()
parser._actions[0].help = "Show this help message and exit."
parser.add_argument(
    "-v",
    "--verbose",
    action="store_true",
    help="Increase output verbosity.",
)
parser.add_argument(
    "-mf",
    "--mismatch-file",
    help=(
        "Path to the CSV file containing mismatches that should be split "
        "into smaller files (<10 MB)."
    ),
)
parser.add_argument(
    "-mfd",
    "--mismatch-files-dir",
    help=(
        "(Optional) Path to a directory where split mismatches should be saved. "
        "The directory will be made if it doesn't already exist."
    ),
)
parser.add_argument(
    "-del",
    "--delete-mismatch-file",
    action="store_true",
    help=(
        "(Optional) Delete the original mismatch file passed via the "
        "--mismatch-file (-mf) argument."
    ),
)

args = parser.parse_args()

# Mirror the parsed options into module-level constants used by the rest of the script.
VERBOSE = args.verbose
MISMATCH_FILE = args.mismatch_file
MISMATCH_FILES_DIR = args.mismatch_files_dir
DELETE_MISMATCH_FILE = args.delete_mismatch_file

# Section: Assertions for passed arguments.
# NOTE: these checks are ``assert`` statements, so they are skipped entirely
# when Python is started with -O.
assert MISMATCH_FILE, f"""Please provide a path via the --mismatch-file (-mf) argument:
--mismatch-file (-mf): a {lower(parser._actions[2].help)}"""

# Assert that the file exists and that it is a CSV that is greater than 10 MB.
if MISMATCH_FILE:
    assert os.path.isfile(
        MISMATCH_FILE
    ), f"Please provide a {lower(parser._actions[2].help)}"

    assert MISMATCH_FILE.endswith(
        ".csv"
    ), f"Please provide a {lower(parser._actions[2].help)}"

    # Size in MiB, rounded UP. Truncating (``>> 20`` on the raw byte count) made
    # a file of e.g. 10.5 MiB look like exactly 10 MiB, so it was rejected below
    # as not needing a split even though it exceeds the limit. Rounding up also
    # means the number of split files derived from ``mf_size`` later in the
    # script is a true ceiling of size / 10 MiB.
    mf_size = (os.path.getsize(MISMATCH_FILE) + (1 << 20) - 1) >> 20

    assert (
        mf_size > 10
    ), "The size of the mismatch file passed via the --mismatch-file (-mf) argument is less than the import file size limit of 10 MB. You do not need to run this script, and are ready to upload your CSV to Mismatch Finder via the upload API! Please use the `upload_mismatches.py` file or see other instructions in the user guide at https://github.com/wmde/wikidata-mismatch-finder/blob/development/docs/UserGuide.md."

# Section: Create the needed directory for the output CSVs.
# ``os.sep`` is "\\" on Windows and "/" elsewhere; the name is kept because the
# saving code later in the script joins paths with it.
dir_path_separator = os.sep

# Split the path with os.path rather than by hand so that every separator the
# platform accepts is recognised (e.g. a Windows path written with forward
# slashes), instead of only ``dir_path_separator``.
path_to_mismatch_file, mismatch_file_name = os.path.split(MISMATCH_FILE)

# Default output directory: same location and name as the mismatch file, minus
# its extension. For a bare file name the directory part is empty and
# os.path.join returns just the directory name.
mismatch_dir_name = os.path.splitext(mismatch_file_name)[0]
mismatch_files_dir_path = os.path.join(path_to_mismatch_file, mismatch_dir_name)

# Decide where the split files go and make sure that directory exists and is empty.
if not MISMATCH_FILES_DIR:
    if VERBOSE:
        print(
            "No output directory has been provided. Creating one based on the mismatch file name."
        )

    # Refuse to reuse an existing path so that files already there are never
    # mixed with the split output.
    assert not os.path.exists(
        mismatch_files_dir_path
    ), "No output directory has been provided, but a directory that matches the mismatch file name passed to the --mismatch-file (-mf) argument exists. Please pass a desired directory name."

    os.makedirs(mismatch_files_dir_path, exist_ok=True)

else:
    if not os.path.exists(MISMATCH_FILES_DIR):
        if VERBOSE:
            print(
                "The output mismatch files directory does not exist and will be created."
            )

        os.makedirs(MISMATCH_FILES_DIR)

    else:
        # An existing regular file would otherwise make os.listdir fail with an
        # unrelated-looking error.
        assert os.path.isdir(
            MISMATCH_FILES_DIR
        ), "The path passed to the --mismatch-files-dir (-mfd) argument exists but is not a directory. Please pass a path to an empty directory or to a directory that does not exist yet."

        # The flag is named --mismatch-files-dir; the message used to cite a
        # non-existent --mismatch-files-directory option.
        assert not os.listdir(
            MISMATCH_FILES_DIR
        ), "The mismatch directory passed to the --mismatch-files-dir (-mfd) argument is not empty. This directory should be empty to assure that directory based uploads to Mismatch Finder with the resulting split CSV files will not send invalid files."

        if VERBOSE:
            print(
                "The output mismatch files directory exists and is empty. Splitting and saving mismatches."
            )

    # An explicitly passed directory overrides the one derived from the file name.
    mismatch_files_dir_path = MISMATCH_FILES_DIR

# Section: Calculate how many CSVs should be made.
# Work from the exact byte size of the file. Deriving the count from a size that
# was already truncated to whole MiB under-counts: a 20.5 MiB file was treated
# as 20 MiB and split into 2 files of more than 10 MiB each.
split_file_size_limit = 10 << 20  # 10 MiB expressed in bytes.

# ``-(-a // b)`` is integer ceiling division; at least one file is always written.
number_of_split_mismatch_files = max(
    1, -(-os.path.getsize(MISMATCH_FILE) // split_file_size_limit)
)

both_or_all = "both" if number_of_split_mismatch_files == 2 else "all"
if VERBOSE:
    print(
        f"The mismatch file {mismatch_file_name} will be split into {number_of_split_mismatch_files} different files that will {both_or_all} be 10 MB or less."
    )

# Section: Split and save the resulting CSVs.
# Read every column as text and keep empty cells as empty strings so values are
# written back exactly as they were read. With pandas' default type inference a
# value such as "007" becomes 7, "1.50" becomes 1.5, an integer column with gaps
# becomes floats ("12" -> "12.0") and strings like "NA" or "null" become empty.
df_mismatch_file = pd.read_csv(MISMATCH_FILE, dtype=str, keep_default_na=False)

# Slice the rows into ``number_of_split_mismatch_files`` consecutive chunks
# whose lengths differ by at most one (the first ``extra_rows`` chunks get one
# more row). This is the same partitioning np.array_split produces, without
# routing the DataFrame through NumPy, which relies on the deprecated
# DataFrame.swapaxes.
base_rows, extra_rows = divmod(
    len(df_mismatch_file), number_of_split_mismatch_files
)
mismatch_file_dfs = []
chunk_start = 0
for chunk_index in range(number_of_split_mismatch_files):
    chunk_stop = chunk_start + base_rows + (1 if chunk_index < extra_rows else 0)
    mismatch_file_dfs.append(df_mismatch_file.iloc[chunk_start:chunk_stop])
    chunk_start = chunk_stop

# Output files are named <original file name without extension>_<n>.csv, n from 1.
mismatch_file_stem = os.path.splitext(os.path.basename(MISMATCH_FILE))[0]
mismatch_file_df_names = [
    f"{mismatch_file_stem}_{file_number}.csv"
    for file_number in range(1, len(mismatch_file_dfs) + 1)
]

for df_name, df in zip(mismatch_file_df_names, mismatch_file_dfs):
    df.to_csv(
        os.path.join(mismatch_files_dir_path, df_name),
        encoding="utf-8",
        index=False,
    )

created_mismatch_files_print_str = "\n".join(mismatch_file_df_names)
print(
    f"The following mismatch files were created in the {mismatch_files_dir_path} directory:\n\n{created_mismatch_files_print_str}"
)

# Honour --delete-mismatch-file (-del). The flag was parsed but never acted on.
# The original is removed only here, after every split file has been written,
# so a failure while saving never loses data.
if DELETE_MISMATCH_FILE:
    os.remove(MISMATCH_FILE)

    if VERBOSE:
        print(f"\nThe original mismatch file {MISMATCH_FILE} has been deleted.")

print(
    "\nYou're now ready to upload your mismatch files to Mismatch Finder via the upload API! Please use the `upload_mismatches.py` file or see other instructions in the user guide at https://github.com/wmde/wikidata-mismatch-finder/blob/development/docs/UserGuide.md."
)
43 changes: 25 additions & 18 deletions upload_mismatches.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
import tqdm


# Section: Helper classes, variables and functions for the script.
# Section: Helper classes and functions for the script.
class terminal_colors:
"""
Class for easily applying the Wikidata brand colors in the terminal and resetting.
Expand Down Expand Up @@ -95,12 +95,12 @@ def lower(s: str):
parser.add_argument(
"-mf",
"--mismatch-file",
help="Path to the CSV file containing mismatches to import to Mismatch Finder.",
help="(Optional) Path to the CSV file containing mismatches to import to Mismatch Finder.",
)
parser.add_argument(
"-mfd",
"--mismatch-files-dir",
help="Path to a directory containing only CSV files with mismatches to import to Mismatch Finder.",
help="(Optional) Path to a directory containing only CSV files with mismatches to import to Mismatch Finder.",
)
parser.add_argument(
"-des",
Expand Down Expand Up @@ -139,7 +139,7 @@ def lower(s: str):

assert (
MISMATCH_FILE or MISMATCH_FILES_DIR and not (MISMATCH_FILE and MISMATCH_FILES_DIR)
), f"""Please provide a path via --mismatch-file (-mf) OR --mismatch-files-dir (-mfd):
), f"""Please provide a path via EITHER the --mismatch-file (-mf) OR --mismatch-files-dir (-mfd) arguments:
--mismatch-file (-mf): a {lower(parser._actions[3].help)}
--mismatch-files-dir (-mfd): a {lower(parser._actions[4].help)}"""

Expand All @@ -149,11 +149,11 @@ def lower(s: str):
if MISMATCH_FILE:
assert os.path.isfile(
MISMATCH_FILE
), f"Please provide a {lower(parser._actions[3].help)}"
), f"Please provide a {lower(parser._actions[3].help.split('(Optional) ')[1])}"

assert (
MISMATCH_FILE[-4:] == ".csv"
), f"Please provide a {lower(parser._actions[3].help)}"
), f"Please provide a {lower(parser._actions[3].help.split('(Optional) ')[1])}"

mf_size = os.path.getsize(MISMATCH_FILE) >> 20

Expand All @@ -165,7 +165,7 @@ def lower(s: str):
if MISMATCH_FILES_DIR:
assert os.path.isdir(
MISMATCH_FILES_DIR
), f"Please provide a {lower(parser._actions[4].help)}"
), f"Please provide a {lower(parser._actions[4].help.split('(Optional) ')[1])}"

mfd_files = [
f
Expand All @@ -175,28 +175,35 @@ def lower(s: str):
mfd_mf_files = [f for f in mfd_files if f[-4:] == ".csv"]
mfd_remaining_files = set(mfd_files) - set(mfd_mf_files)

assert not mfd_remaining_files, f"Please provide a {lower(parser._actions[4].help)}"
assert (
not mfd_remaining_files
), f"Please provide a {lower(parser._actions[4].help.split('(Optional) ')[1])}"

mfd_mf_paths = []
for mf in mfd_mf_files:
if os.name == "nt": # Windows
dir_path_separator = "\\"
else:
dir_path_separator = "/"

# Remove a potential trailing slash or backslash from the end of the directory path.
if MISMATCH_FILES_DIR.endswith("/") or MISMATCH_FILES_DIR.endswith("\\"):
if MISMATCH_FILES_DIR.endswith(dir_path_separator):
mfd_path = MISMATCH_FILES_DIR[:-1]
else:
mfd_path = MISMATCH_FILES_DIR

if os.name == "nt": # Windows
mfd_mf_paths.append(mfd_path + "\\" + mf)

else:
mfd_mf_paths.append(mfd_path + "/" + mf)
mfd_mf_paths.append(mfd_path + dir_path_separator + mf)

too_large_mismatch_files = []
for mf_path in mfd_mf_paths:
mfd_mf_size = os.path.getsize(mf_path) >> 20

assert (
mfd_mf_size < 10
), "The size of one of the passed mismatch files via the --mismatch-files-dir (-mdf) argument is greater than the import file size limit of 10 MB. Please break it down into smaller CSV files and pass a directory containing only these CSVs to the --mismatch-files-dir (-mdf) argument."
if mfd_mf_size > 10:
too_large_mismatch_files.append(mf_path)

too_large_mismatch_files_print_st = "\n".join(too_large_mismatch_files)

assert not too_large_mismatch_files, f"The size of one of the passed mismatch files via the --mismatch-files-dir (-mdf) argument is greater than the import file size limit of 10 MB. Please break it down into smaller CSV files and pass a directory containing only these CSVs to the --mismatch-files-dir (-mdf) argument. Mismatch files that are too large are:\n\n{too_large_mismatch_files_print_st}"

# Section: Prepare components of the request.
MF_API_IMPORT_URL = "https://mismatch-finder.toolforge.org/api/imports"
Expand Down Expand Up @@ -232,7 +239,7 @@ def lower(s: str):
print(
"The following mismatch files will be uploaded to the Wikidata Mismatch Finder:"
)
print({", ".join(p for p in mfd_mf_paths)})
print({", ".join(mfd_mf_paths)})

for mf in tqdm(
mfd_mf_paths, desc="Mismatch files uploaded", unit="file", disable=not VERBOSE
Expand Down

0 comments on commit 388bfe3

Please sign in to comment.