forked from cc-archive/cccatalog
-
Notifications
You must be signed in to change notification settings - Fork 0
/
NYPL.py
222 lines (146 loc) · 7.05 KB
/
NYPL.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
"""
Content Provider: New York Public Library
ETL Process: Use the API to identify all CC0 images.
Output: TSV file containing the images and the respective meta-data.
Notes: http://api.repo.nypl.org/
Rate limit: 10,000 requests per day.
"""
from modules.etlMods import *
# Module configuration. Every log line is tagged with the provider name.
logging.basicConfig(format='%(asctime)s: [%(levelname)s - NYPL API] =======> %(message)s', level=logging.INFO)

DELAY = 3.0  # time delay (in seconds)
FILE = 'nypl_{}.tsv'.format(int(time.time()))  # output file, stamped with the start time
SIZE = 500
TOKEN = os.environ['NYPL_TOKEN']  # raises KeyError when the variable is not set
HEADER = {'Authorization': 'Token token={}'.format(TOKEN)}
TOT_REQ = 0
INIT_TIME = time.time()  # start time
# The scheme was previously doubled ('http:https://'), which is not a valid URL.
BASE_ENDPOINT = 'http://api.repo.nypl.org/api/v1/'
def getRootIDs():
    """Fetch the top-level UUIDs (NYPL API - Method 9: Return Top-Level UUIDS).

    The method returns UUIDs for top-level collection records and for item
    records without parents.

    Returns:
        The UUID payload produced by getUUIDs('items/roots'), or an empty
        list when the request fails or the payload is missing.
    """
    logging.info('Requesting top-level UUIDs (collections and items)')
    result = getUUIDs('items/roots')
    delayProcessing(time.time(), DELAY)  # throttle requests

    # getUUIDs returns None instead of a triple when the request fails, so it
    # must be checked before unpacking.
    if not result:
        logging.warning('Top-level UUIDs not found!')
        return []

    uuidList, total, _pages = result
    logging.info('No. top-level UUIDS: {}'.format(total))
    return uuidList or []
def getUUIDs(_method, _type=None, _limit=None, _offset=None):
    """Call an NYPL API method and extract its payload and paging details.

    Args:
        _method: Path (and optional query string) appended to BASE_ENDPOINT.
        _type: Key of the response section to return as the payload. When
            omitted, the payload is response['uuids']['uuid'].
        _limit: Unused; kept so existing callers keep working.
        _offset: Unused; kept so existing callers keep working.

    Returns:
        [data, total, pages] when the response headers report code '200',
        where total is response['numResults'] and pages is
        request['totalPages']. None when the request fails or the code is
        anything else. Callers must check for None before unpacking.
    """
    endpoint = '{}{}'.format(BASE_ENDPOINT, _method)
    request = requestContent(endpoint, HEADER)
    if not request:
        return None

    # Each section falls back to an empty dict so that an incomplete payload
    # results in None rather than an AttributeError on a missing section.
    nypl = request.get('nyplAPI') or {}
    reqParams = nypl.get('request')
    logging.info('API Request details: {}'.format(reqParams))

    response = nypl.get('response') or {}
    headers = response.get('headers') or {}
    if headers.get('code', '') != '200':
        return None

    if _type:
        data = response.get(_type)
    else:
        data = (response.get('uuids') or {}).get('uuid')

    total = response.get('numResults')
    pages = (reqParams or {}).get('totalPages')
    return [data, total, pages]
def getCaptures(_uuid):
    """Extract the Creative Commons captures of a UUID and write them out.

    Uses NYPL API - Method 4: Return Captures for a UUID, requesting one page
    of up to 500 captures at a time. Exactly one request is made per page;
    no request is issued past the last page.

    Args:
        _uuid: A UUID for an item, a container, or a collection.

    Returns:
        None. Extracted rows are handed to writeToFile(rows, FILE).
    """
    page = 1
    limit = 500
    pages = None  # unknown until the first response has been read

    while pages is None or page <= pages:
        method = 'items/{}?page={}&per_page={}'.format(_uuid, page, limit)
        logging.info('Requesting captures for UUID: {}'.format(_uuid))
        result = getUUIDs(method, 'capture')
        delayProcessing(time.time(), DELAY)  # throttle requests

        # getUUIDs returns None (not a triple) when the request fails, so
        # check before unpacking.
        if not result or result[2] is None:
            logging.warning('Captures not found!')
            return None

        captures, total, totalPages = result
        if pages is None:
            logging.info('Total Captures {}'.format(total))
        pages = int(totalPages)
        if page > pages:
            break

        logging.info('Processing page {} of {}'.format(page, pages))

        # A missing payload or a lone capture (a dict instead of a list) is
        # normalized to a list before filtering.
        if captures is None:
            captures = []
        elif not isinstance(captures, list):
            captures = [captures]

        # Keep the UUIDs of captures that have images and mention
        # creativecommons.org in the rights statement.
        ccContent = [
            item.get('uuid')
            for item in captures
            if item.get('imageLinks') is not None
            and 'creativecommons.org' in (item.get('rightsStatementURI') or '')
        ]

        # getMetaData is a generator, so each call has to be drained;
        # otherwise the list would hold generator objects instead of rows.
        # NOTE(review): assumes writeToFile expects a list of rows - confirm
        # against modules.etlMods.
        extracted = [row for uuid in ccContent for row in getMetaData(uuid)]
        if extracted:
            writeToFile(extracted, FILE)

        page += 1

    return None
def _asList(value):
    """Return value as a list: None becomes [], a single value is wrapped."""
    if value is None:
        return []
    return value if isinstance(value, list) else [value]


def _nodeText(node, *keys):
    """Follow keys through nested dicts and return the final '$' value.

    A list met along the way is replaced by its first element. None is
    returned as soon as the path cannot be followed.
    """
    for key in keys + ('$',):
        if isinstance(node, list):
            node = node[0] if node else None
        if not isinstance(node, dict):
            return None
        node = node.get(key)
    return node


def _firstImageURL(imageLinks, marker):
    """Return the URL of the first link whose description contains marker.

    The '&download=1' flag is removed from the URL. An empty string is
    returned when no link matches.
    """
    for link in imageLinks:
        if not isinstance(link, dict):
            continue
        if marker in (link.get('description') or ''):
            return (link.get('$') or '').replace('&download=1', '')
    return ''


def getMetaData(_uuid):
    """Yield one output row per CC0 image of a capture.

    Uses NYPL API - Method 6: Return Capture Details. Only sibling captures
    whose sanitized title equals the item's sanitized title are considered
    (not mandatory, but an extra validation step), and only those whose
    rights statement ends with the CC0 1.0 URL and that have a full-size
    image link are emitted.

    Args:
        _uuid: Capture ID.

    Yields:
        A 17-field list: foreign ID, landing URL, image URL, thumbnail URL,
        three '\\N' placeholders, license, license version, creator, '\\N',
        title, two '\\N' placeholders, 'f', 'nypl', 'nypl'.
        Nothing is yielded when the request fails or the response code is
        not '200'.
    """
    delayProcessing(time.time(), DELAY)  # throttle requests

    endpoint = '{}items/item_details/{}'.format(BASE_ENDPOINT, _uuid)
    logging.info('Requesting capture details for UUID: {}'.format(_uuid))
    request = requestContent(endpoint, HEADER)
    if not request:
        return

    response = (request.get('nyplAPI') or {}).get('response') or {}
    if _nodeText(response, 'headers', 'code') != '200':
        return

    mods = response.get('mods') or {}
    # NOTE(review): when a MODS node is a list, its first entry is used -
    # confirm this picks the intended title/creator.
    title = sanitizeString(_nodeText(mods, 'titleInfo', 'title'))
    creator = sanitizeString(_nodeText(mods, 'name', 'namePart'))

    licenseName = 'cc0'
    licenseVersion = '1.0'

    # Get all images (and alternative images); a single capture arrives as a
    # dict rather than a list.
    siblings = (response.get('sibling_captures') or {}).get('capture')
    for item in _asList(siblings):
        if not isinstance(item, dict):
            continue

        # Exclude images that do not match the title.
        if sanitizeString(_nodeText(item, 'title')) != title:
            continue

        # rightsStatementURI - get only CC0 images.
        rightsInfo = _nodeText(item, 'rightsStatementURI') or ''
        if not rightsInfo.endswith('creativecommons.org/publicdomain/zero/1.0/'):
            logging.warning('CC0 license not detected in UUID: {}!'.format(_uuid))
            continue

        # Get image information; missing links yield '' instead of raising.
        imageLinks = _asList((item.get('imageLinks') or {}).get('imageLink'))
        imgURL = _firstImageURL(imageLinks, '"full-size"')
        thumbnail = _firstImageURL(imageLinks, '300 pixels')

        if imgURL == '':
            logging.warning('Image not detected in UUID: {}!'.format(_uuid))
            continue

        foreignURL = _nodeText(item, 'itemLink')
        foreignID = _nodeText(item, 'imageID')

        yield [
            str(foreignID), foreignURL, imgURL, thumbnail,
            '\\N', '\\N', '\\N', licenseName, licenseVersion, creator, '\\N',
            title, '\\N', '\\N', 'f', 'nypl', 'nypl'
        ]
def getRecent():
    """Placeholder: currently does nothing and returns None."""
    return None
# Script entry: collect the top-level UUIDs, then process each one in turn.
rootID = getRootIDs()
#sample = rootID[0:10]
#print('Sample UUIDs: {}'.format(sample))

# getCaptures is called only for its side effects, so a plain loop is used
# instead of building a throwaway list. `or []` covers a None result.
for rootUUID in rootID or []:
    getCaptures(rootUUID)