-
Notifications
You must be signed in to change notification settings - Fork 6
/
upload_mismatches.py
313 lines (256 loc) · 10.4 KB
/
upload_mismatches.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
"""
Script to send post requests to the Mismatch Finder API given a filepath or directory of mismatch files.
Please see the Mismatch Finder User Guide for more information:
https://github.com/wmde/wikidata-mismatch-finder/blob/development/docs/UserGuide.md
The example cURL request for Mismatch Finder is:
curl -X POST "https://mismatch-finder.toolforge.org/api/imports" \
-H "Accept: application/json" \
-H "Authorization: Bearer {ACCESS_TOKEN}" \
-F "mismatch_file=@PATH_TO_CSV_FILE" \
-F "description=DESCRIPTION" \
-F "external_source=SOURCE" \
-F "external_source_url=URL" \
-F "expires=YYYY-MM-DD"
Usage:
Note: Please only pass arguments to EITHER --mismatch-file OR --mismatch-files-dir.
Note: --description, --external-source-url and --expires are optional.
python3 upload_mismatches.py \
--access-token ACCESS_TOKEN \
--mismatch-file MISMATCH_FILE \
--mismatch-files-dir MISMATCH_FILE_DIR \
--description DESCRIPTION \
--external-source EXTERNAL_SOURCE \
--external-source-url EXTERNAL_SOURCE_URL \
--expires EXPIRES \
--verbose
Example:
python3 upload_mismatches.py \
--access-token "YOUR_ACCESS_TOKEN" \
--mismatch-file mismatches_test.csv \
--description "Test mismatches upload" \
--external-source "Test Source" \
--external-source-url "https://www.wikidata.org" \
--verbose
Abbreviated arguments usage:
Note: Please only pass arguments to EITHER -mf OR -mfd.
Note: -des, -url and -exp are optional.
python3 upload_mismatches.py \
-pat ACCESS_TOKEN \
-mf MISMATCH_FILE \
-mfd MISMATCH_FILE_DIR \
-des DESCRIPTION \
-src EXTERNAL_SOURCE \
-url EXTERNAL_SOURCE_URL \
-exp EXPIRES \
-v
Abbreviated arguments example:
python3 upload_mismatches.py \
-pat "YOUR_ACCESS_TOKEN" \
-mf mismatches_test.csv \
-des "Test mismatches upload" \
-src "Test Source" \
-url "https://www.wikidata.org" \
-v
"""
import argparse
import os
import requests
from tqdm import tqdm
# Section: Helper classes and functions for the script.
class terminal_colors:
    """
    Namespace of ANSI escape sequences for coloring terminal output.

    The three WD_* values are 24-bit foreground color codes (``38;2;R;G;B``)
    and RESET switches the terminal back to its default formatting.
    """

    # Foreground RGB (153, 0, 0).
    WD_RED = "\033[38;2;153;0;0m"
    # Foreground RGB (51, 153, 102).
    WD_GREEN = "\033[38;2;51;153;102m"
    # Foreground RGB (0, 102, 153).
    WD_BLUE = "\033[38;2;0;102;153m"
    # Clears any color/formatting applied by the sequences above.
    RESET = "\033[0m"
def print_thank_you_message():
    """
    Prints a thank you note followed by four colored hearts to the command line.

    The hearts are colored red, green, blue and green, and the terminal
    formatting is reset at the end of the line.
    """
    heart = "\u2665"
    heart_colors = (
        terminal_colors.WD_RED,
        terminal_colors.WD_GREEN,
        terminal_colors.WD_BLUE,
        terminal_colors.WD_GREEN,
    )
    colored_hearts = "".join(f"{color}{heart}" for color in heart_colors)

    print(
        f"Thank you for helping to improve Wikidata's data! {colored_hearts}{terminal_colors.RESET}"
    )
def lower(s: str):
    """
    Lowercases only the first character of a string.

    Any falsy input (an empty string or None) yields an empty string; the
    rest of the string after the first character is returned unchanged.
    """
    if not s:
        return ""

    first_char, remainder = s[:1], s[1:]
    return first_char.lower() + remainder
# Section: Set arguments for the script.
parser = argparse.ArgumentParser()
# Reword the automatically added -h/--help entry so it reads like the other help texts.
parser._actions[0].help = "Show this help message and exit."

# Flags and options for each argument. The order is significant: the argument
# assertions further down look help texts up by position in parser._actions.
_ARGUMENT_SPECS = (
    (
        ("-v", "--verbose"),
        {"help": "Increase output verbosity.", "action": "store_true"},
    ),
    (
        ("-pat", "--access-token"),
        {"help": "Your Mismatch Finder API access token."},
    ),
    (
        ("-mf", "--mismatch-file"),
        {
            "help": "(Optional) Path to the CSV file containing mismatches to import to Mismatch Finder."
        },
    ),
    (
        ("-mfd", "--mismatch-files-dir"),
        {
            "help": "(Optional) Path to a directory containing only CSV files with mismatches to import to Mismatch Finder."
        },
    ),
    (
        ("-des", "--description"),
        {
            "help": "(Optional) A short text (up to 350 characters) to describe this import."
        },
    ),
    (
        ("-src", "--external-source"),
        {
            "help": "The name of the external source that mismatches are coming from (up to 100 characters)."
        },
    ),
    (
        ("-url", "--external-source-url"),
        {
            "help": "(Optional) A URL to the external source that mismatches are coming from."
        },
    ),
    (
        ("-exp", "--expires"),
        {
            "help": "(Optional) An ISO formatted date to describe the date where the mismatches imported will be no longer relevant. If omitted, mismatches from the import will expire after 6 months by default. A timeframe of a few weeks or months is recommended."
        },
    ),
)

for _flags, _options in _ARGUMENT_SPECS:
    parser.add_argument(*_flags, **_options)

args = parser.parse_args()

# Expose the parsed values as module-level constants for the rest of the script.
VERBOSE = args.verbose
ACCESS_TOKEN = args.access_token
MISMATCH_FILE = args.mismatch_file
MISMATCH_FILES_DIR = args.mismatch_files_dir
DESCRIPTION = args.description
EXTERNAL_SOURCE = args.external_source
EXTERNAL_SOURCE_URL = args.external_source_url
EXPIRES = args.expires
# Section: Assertions for passed arguments.
# NOTE: These checks are `assert` statements, so they are skipped entirely if the
# script is run with Python's -O flag.
assert ACCESS_TOKEN, f"Please provide {lower(parser._actions[2].help)}"

# Exactly one of --mismatch-file and --mismatch-files-dir must be passed. The
# parentheses around the `or` are required: `and` binds more tightly than `or`, so
# without them passing both arguments would be accepted.
assert (MISMATCH_FILE or MISMATCH_FILES_DIR) and not (
    MISMATCH_FILE and MISMATCH_FILES_DIR
), f"""Please provide a path via EITHER the --mismatch-file (-mf) OR --mismatch-files-dir (-mfd) arguments:
--mismatch-file (-mf): {lower(parser._actions[3].help)}
--mismatch-files-dir (-mfd): {lower(parser._actions[4].help)}"""

assert EXTERNAL_SOURCE, f"Please provide {lower(parser._actions[6].help)}"

# Assert that the file exists and that it is a CSV with a filesize less than 10 MB.
if MISMATCH_FILE:
    assert os.path.isfile(
        MISMATCH_FILE
    ), f"Mismatch file not found. Please provide a {lower(parser._actions[3].help.split('(Optional) ')[1])}"

    assert MISMATCH_FILE.endswith(
        ".csv"
    ), f"Mismatch file not a CSV. Please provide a {lower(parser._actions[3].help.split('(Optional) ')[1])}"

    # Shifting right by 20 bits converts bytes to whole mebibytes (rounded down).
    mf_size = os.path.getsize(MISMATCH_FILE) >> 20

    assert (
        mf_size < 10
    ), "The size of the passed mismatch file via the --mismatch-file (-mf) argument is greater than the Mismatch Finder import file size limit of 10 MB. Please break it down into smaller CSV files using `split_mismatch_file.py` and pass a directory containing only these CSVs to the --mismatch-files-dir (-mfd) argument."

# Assert that the directory exists and that the contents of the directory are all CSVs that are less than 10 MB.
if MISMATCH_FILES_DIR:
    assert os.path.isdir(
        MISMATCH_FILES_DIR
    ), f"Mismatch directory not found. Please provide a {lower(parser._actions[4].help.split('(Optional) ')[1])}"

    # Only regular files are considered; subdirectories are ignored.
    mfd_files = [
        f
        for f in os.listdir(MISMATCH_FILES_DIR)
        if os.path.isfile(os.path.join(MISMATCH_FILES_DIR, f))
    ]
    mfd_mf_files = [f for f in mfd_files if f.endswith(".csv")]
    mfd_remaining_files = set(mfd_files) - set(mfd_mf_files)

    assert (
        not mfd_remaining_files
    ), f"Mismatch directory contains files that are not CSVs. Please provide a {lower(parser._actions[4].help.split('(Optional) ')[1])}"

    # Without this check an empty directory would upload nothing and still report success.
    assert (
        mfd_mf_files
    ), f"Mismatch directory contains no CSV files. Please provide a {lower(parser._actions[4].help.split('(Optional) ')[1])}"

    # os.path.join handles the platform separator and a trailing slash or backslash
    # on the directory path. mfd_mf_paths is used again in the upload section.
    mfd_mf_paths = [os.path.join(MISMATCH_FILES_DIR, mf) for mf in mfd_mf_files]

    # Use the same limit as the single file check above: a file must be under 10 MB.
    too_large_mismatch_files = [
        mf_path for mf_path in mfd_mf_paths if (os.path.getsize(mf_path) >> 20) >= 10
    ]
    too_large_mismatch_files_print_st = "\n".join(too_large_mismatch_files)

    assert (
        not too_large_mismatch_files
    ), f"The size of one of the passed mismatch files via the --mismatch-files-dir (-mfd) argument is greater than the import file size limit of 10 MB. Please break it down into smaller CSV files and pass a directory containing only these CSVs to the --mismatch-files-dir (-mfd) argument. Mismatch files that are too large are:\n\n{too_large_mismatch_files_print_st}"
# Section: Prepare components of the request.
MF_API_IMPORT_URL = "https://mismatch-finder.toolforge.org/api/imports"

headers = {
    "Accept": "application/json",
    "Authorization": f"Bearer {ACCESS_TOKEN}",
}

# The external source is always sent; the remaining fields are only included
# when a value was passed for them on the command line.
params = {"external_source": EXTERNAL_SOURCE}
for _param_name, _param_value in (
    ("description", DESCRIPTION),
    ("external_source_url", EXTERNAL_SOURCE_URL),
    ("expires", EXPIRES),
):
    if _param_value:
        params[_param_name] = _param_value
# Section: Make upload request(s).
def _upload_mismatch_file(mismatch_file_path):
    """
    Posts a single mismatch file to the Mismatch Finder import endpoint.

    Any request error is reported on the command line instead of being raised,
    so that a failure for one file does not stop the remaining uploads.

    Parameters
    ----------
    mismatch_file_path : str
        Path to the CSV file to upload.

    Returns
    -------
    bool
        True if the server answered with a successful status code, False otherwise.
    """
    try:
        with open(mismatch_file_path, "rb") as mismatch_file_binary:
            r = requests.post(
                MF_API_IMPORT_URL,
                files={"mismatch_file": mismatch_file_binary},
                headers=headers,
                params=params,
            )

        r.raise_for_status()

    except requests.exceptions.HTTPError as e:
        print(
            f"There was an error in trying to upload the mismatch file {mismatch_file_path}."
        )
        print("Response: " + e.response.text)
        return False

    except requests.exceptions.RequestException as e:
        # Errors such as a failed connection have no server response to show.
        print(
            f"There was an error in trying to upload the mismatch file {mismatch_file_path}."
        )
        print(f"Error: {e}")
        return False

    return True


if MISMATCH_FILE:
    if VERBOSE:
        print(
            f"Uploading the mismatch file {MISMATCH_FILE} to the Wikidata Mismatch Finder..."
        )

    if _upload_mismatch_file(MISMATCH_FILE):
        print(
            f"Mismatch file {MISMATCH_FILE} was successfully uploaded to Mismatch Finder."
        )
        print_thank_you_message()

elif MISMATCH_FILES_DIR:
    sorted_mfd_mf_paths = sorted(mfd_mf_paths)

    if VERBOSE:
        mismatch_files_to_upload_print_str = "\n".join(sorted_mfd_mf_paths)
        print(
            f"The following mismatch files will be uploaded to the Wikidata Mismatch Finder:\n\n{mismatch_files_to_upload_print_str}\n"
        )

    failed_mismatch_files = []
    for mf in tqdm(
        sorted_mfd_mf_paths,
        desc="Mismatch files uploaded",
        unit="file",
        disable=not VERBOSE,
    ):
        if not _upload_mismatch_file(mf):
            failed_mismatch_files.append(mf)

        # To assure some level of logging for if there is an error with one of the uploads.
        elif not VERBOSE:
            print(f"Mismatch file {mf} was successfully uploaded to Mismatch Finder.")

    # Only report overall success when every file in the directory was uploaded.
    if failed_mismatch_files:
        failed_mismatch_files_print_str = "\n".join(failed_mismatch_files)
        print(
            f"The following mismatch files could not be uploaded to Mismatch Finder:\n\n{failed_mismatch_files_print_str}"
        )

    else:
        print("All mismatch files were successfully uploaded to Mismatch Finder.")
        print_thank_you_message()