"""
Content Provider: Wikimedia Commons
ETL Process: Use the API to identify all CC-licensed images.
Output: TSV file containing the image, the respective
meta-data.
Notes: https://commons.wikimedia.org/wiki/API:Main_page
No rate limit specified.
"""
import argparse
import logging
import os
from copy import deepcopy
from datetime import datetime, timedelta, timezone
from urllib.parse import urlparse

import lxml.html as html

import common.requester as requester
import common.storage.image as image

logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s: %(message)s',
    level=logging.INFO
)
logger = logging.getLogger(__name__)

LIMIT = 500
# The 10000 is a bit arbitrary, but it needs to be larger than the mean
# number of uses per file (globally) in the response_json, or we will
# fail without a continuation token.  The largest example seen so far
# had a little over 1000 uses.
MEAN_GLOBAL_USAGE_LIMIT = 10000
DELAY = 1
HOST = 'commons.wikimedia.org'
ENDPOINT = f'https://{HOST}/w/api.php'
PROVIDER = 'wikimedia'
CONTACT_EMAIL = os.getenv('WM_SCRIPT_CONTACT')
UA_STRING = (
    f'CC-Catalog/0.1 (https://creativecommons.org; {CONTACT_EMAIL})'
)
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': UA_STRING
}
DEFAULT_QUERY_PARAMS = {
    'action': 'query',
    'generator': 'allimages',
    'gaisort': 'timestamp',
    'gaidir': 'newer',
    'gailimit': LIMIT,
    'prop': 'imageinfo|globalusage',
    'iiprop': 'url|user|dimensions|extmetadata',
    'gulimit': LIMIT,
    'gunamespace': 0,
    'format': 'json',
}
PAGES_PATH = ['query', 'pages']
delayed_requester = requester.DelayedRequester(DELAY)
image_store = image.ImageStore(provider=PROVIDER)


def main(date):
    """
    This script pulls the data for a given date from the Wikimedia
    Commons API, and writes it into a .TSV file to be eventually read
    into our DB.

    Required Arguments:

    date:  Date string in the form YYYY-MM-DD.  This is the date for
           which running the script will pull data.
    """
    logger.info(f'Processing Wikimedia Commons API for date: {date}')

    continue_token = {}
    total_images = 0
    start_timestamp, end_timestamp = _derive_timestamp_pair(date)

    while True:
        image_batch, continue_token = _get_image_batch(
            start_timestamp,
            end_timestamp,
            continue_token=continue_token
        )
        logger.info(f'Continue Token: {continue_token}')
        image_pages = _get_image_pages(image_batch)
        if image_pages:
            total_images = _process_image_pages(image_pages)
            logger.info(f'Total Images so far: {total_images}')
        if not continue_token:
            break

    total_images = image_store.commit()
    logger.info(f'Total images: {total_images}')
    logger.info('Terminated!')


def _derive_timestamp_pair(date):
    date_obj = datetime.strptime(date, '%Y-%m-%d')
    utc_date = date_obj.replace(tzinfo=timezone.utc)
    start_timestamp = str(int(utc_date.timestamp()))
    end_timestamp = str(int((utc_date + timedelta(days=1)).timestamp()))
    return start_timestamp, end_timestamp
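# For example, since 2020-01-01T00:00:00 UTC is 1577836800 in epoch
# seconds:
#   _derive_timestamp_pair('2020-01-01') == ('1577836800', '1577923200')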


def _get_image_batch(
        start_timestamp,
        end_timestamp,
        continue_token=None,
        retries=5
):
    # Use None rather than a mutable default argument; None means
    # "start from the beginning".
    if continue_token is None:
        continue_token = {}

    query_params = _build_query_params(
        start_timestamp,
        end_timestamp,
        continue_token=continue_token
    )
    image_batch = None

    for _ in range(MEAN_GLOBAL_USAGE_LIMIT):
        response_json = _get_response_json(query_params, retries=retries)

        if response_json is None:
            image_batch = None
            new_continue_token = None
            break
        else:
            new_continue_token = response_json.pop('continue', {})
            logger.debug(f'new_continue_token: {new_continue_token}')
            query_params.update(new_continue_token)
            image_batch = _merge_response_jsons(image_batch, response_json)

        if 'batchcomplete' in response_json:
            logger.debug('Found batchcomplete')
            break

    return image_batch, new_continue_token
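# Continuation sketch (all values hypothetical): a partial response
# carries something like
#   {'continue': {'gaicontinue': '20200101000000|Example.jpg',
#                 'continue': 'gaicontinue||'}}
# and feeding that dict back into the next request's query parameters
# resumes the listing; the API includes 'batchcomplete' once the data
# for the current batch of pages is complete.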


def _get_image_pages(image_batch):
    image_pages = None
    if image_batch is not None:
        image_pages = image_batch.get('query', {}).get('pages')
    # Guard the log line:  the 'pages' key may be missing, in which
    # case image_pages is None and len() would raise a TypeError.
    if image_pages is not None:
        logger.info(f'Got {len(image_pages)} pages')
    return image_pages


def _process_image_pages(image_pages):
    # Initialize so the function is well-defined for an empty dict.
    total_images = 0
    for i in image_pages.values():
        total_images = _process_image_data(i)
    return total_images


def _build_query_params(
        start_date,
        end_date,
        continue_token=None,
        default_query_params=DEFAULT_QUERY_PARAMS,
):
    if continue_token is None:
        continue_token = {}
    query_params = default_query_params.copy()
    query_params.update(
        gaistart=start_date,
        gaiend=end_date,
    )
    query_params.update(continue_token)
    return query_params


def _merge_response_jsons(left_json, right_json):
    # Note that we will keep the continue value from the right json in
    # the merged output!  This is because we assume the right json is
    # the later one in the sequence of responses.
    if left_json is None:
        return right_json

    left_pages = _get_image_pages(left_json)
    right_pages = _get_image_pages(right_json)

    if (
            left_pages is None
            or right_pages is None
            or left_pages.keys() != right_pages.keys()
    ):
        logger.warning('Cannot merge responses with different pages!')
        merged_json = None
    else:
        merged_json = deepcopy(left_json)
        merged_json.update(right_json)
        merged_pages = _get_image_pages(merged_json)
        merged_pages.update({
            k: _merge_image_pages(left_pages[k], right_pages[k])
            for k in left_pages
        })

    return merged_json
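# Merge sketch (hypothetical data):  given two responses with the same
# page keys,
#   left  = {'query': {'pages': {'1': {'title': 'X', 'globalusage': [a]}}}}
#   right = {'query': {'pages': {'1': {'title': 'X', 'globalusage': [b]}}}}
# the merged result keeps right's values for every key except
# 'globalusage', which becomes the concatenation [a, b].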


def _merge_image_pages(left_page, right_page):
    merged_page = deepcopy(left_page)
    # 'globalusage' is requested via DEFAULT_QUERY_PARAMS, but fall
    # back to an empty list in case the API omits it for a page.
    merged_globalusage = (
        left_page.get('globalusage', [])
        + right_page.get('globalusage', [])
    )
    merged_page.update(right_page)
    merged_page['globalusage'] = merged_globalusage
    return merged_page


def _get_response_json(
        query_params,
        endpoint=ENDPOINT,
        request_headers=DEFAULT_REQUEST_HEADERS,
        retries=0,
):
    response_json = None

    if retries < 0:
        logger.error('No retries remaining.  Failure.')
        raise Exception('Retries exceeded')

    response = delayed_requester.get(
        endpoint,
        params=query_params,
        headers=request_headers,
        timeout=60
    )
    if response is not None and response.status_code == 200:
        try:
            response_json = response.json()
        except Exception as e:
            logger.warning(f'Could not get response_json.\n{e}')
            response_json = None

    if (
            response_json is None
            or response_json.get('error') is not None
    ):
        logger.warning(f'Bad response_json: {response_json}')
        logger.warning(
            'Retrying:\n_get_response_json(\n'
            f'    {endpoint},\n'
            f'    {query_params},\n'
            f'    {request_headers},\n'
            f'    retries={retries - 1}\n'
            ')'
        )
        response_json = _get_response_json(
            query_params,
            endpoint=endpoint,
            request_headers=request_headers,
            retries=retries - 1
        )

    return response_json


def _process_image_data(image_data):
    foreign_id = image_data.get('pageid')
    logger.debug(f'Processing page ID: {foreign_id}')
    image_info = _get_image_info_dict(image_data)
    image_url = image_info.get('url')
    creator, creator_url = _extract_creator_info(image_info)

    return image_store.add_item(
        foreign_landing_url=image_info.get('descriptionshorturl'),
        image_url=image_url,
        license_url=_get_license_url(image_info),
        foreign_identifier=foreign_id,
        width=image_info.get('width'),
        height=image_info.get('height'),
        creator=creator,
        creator_url=creator_url,
        title=image_data.get('title'),
        meta_data=_create_meta_data_dict(image_data)
    )


def _get_image_info_dict(image_data):
    image_info_list = image_data.get('imageinfo')
    if image_info_list:
        image_info = image_info_list[0]
    else:
        image_info = {}
    return image_info


def _extract_date_info(image_info):
    date_originally_created = (
        image_info
        .get('extmetadata', {})
        .get('DateTimeOriginal', {})
        .get('value', '')
    )
    last_modified_at_source = (
        image_info
        .get('extmetadata', {})
        .get('DateTime', {})
        .get('value', '')
    )
    return (date_originally_created, last_modified_at_source)


def _extract_creator_info(image_info):
    artist_string = (
        image_info
        .get('extmetadata', {})
        .get('Artist', {})
        .get('value', '')
    )

    if not artist_string:
        return (None, None)

    artist_elem = html.fromstring(artist_string)
    # We take all the text to replicate what is shown on Wikimedia Commons
    artist_text = ''.join(artist_elem.xpath('//text()')).strip()
    url_list = list(artist_elem.iterlinks())
    artist_url = _cleanse_url(url_list[0][2]) if url_list else None

    return (artist_text, artist_url)
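# For example, a (hypothetical) Artist field of
#   '<a href="//commons.wikimedia.org/wiki/User:Foo">Foo Bar</a>'
# yields
#   ('Foo Bar', 'https://commons.wikimedia.org/wiki/User:Foo')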


def _get_license_url(image_info):
    return (
        image_info
        .get('extmetadata', {})
        .get('LicenseUrl', {})
        .get('value', '')
        .strip()
    )


def _create_meta_data_dict(image_data):
    meta_data = {}
    global_usage_length = len(image_data.get('globalusage', []))
    image_info = _get_image_info_dict(image_data)
    date_originally_created, last_modified_at_source = _extract_date_info(
        image_info
    )
    description = (
        image_info
        .get('extmetadata', {})
        .get('ImageDescription', {})
        .get('value')
    )
    if description:
        description_text = ' '.join(
            html.fromstring(description).xpath('//text()')
        ).strip()
        meta_data['description'] = description_text
    meta_data['global_usage_count'] = global_usage_length
    meta_data['date_originally_created'] = date_originally_created
    meta_data['last_modified_at_source'] = last_modified_at_source
    return meta_data
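# A resulting meta_data dict might resemble (all values hypothetical):
#   {
#       'description': 'A view of the harbour at sunset.',
#       'global_usage_count': 3,
#       'date_originally_created': '2019-12-31 12:00:00',
#       'last_modified_at_source': '2020-01-01 08:30:00',
#   }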


def _cleanse_url(url_string):
    """
    Check to make sure that a url is valid, and prepend a protocol if needed
    """
    parse_result = urlparse(url_string)

    if parse_result.netloc == HOST:
        parse_result = urlparse(url_string, scheme='https')
    elif not parse_result.scheme:
        parse_result = urlparse(url_string, scheme='http')

    if parse_result.netloc or parse_result.path:
        return parse_result.geturl()
    # Make the fall-through for invalid URLs explicit.
    return None
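# For example:
#   _cleanse_url('//commons.wikimedia.org/wiki/User:Foo')
#   -> 'https://commons.wikimedia.org/wiki/User:Foo'
#   _cleanse_url('//en.wikipedia.org/wiki/Foo')
#   -> 'http://en.wikipedia.org/wiki/Foo'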


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Wikimedia Commons API Job',
        add_help=True,
    )
    parser.add_argument(
        '--date',
        help='Identify images uploaded on a date (format: YYYY-MM-DD).'
    )
    args = parser.parse_args()

    if args.date:
        date = args.date
    else:
        date_obj = datetime.now() - timedelta(days=2)
        date = datetime.strftime(date_obj, '%Y-%m-%d')

    main(date)