forked from cc-archive/cccatalog
-
Notifications
You must be signed in to change notification settings - Fork 0
/
RawPixel.py
140 lines (100 loc) · 4.02 KB
/
RawPixel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
"""
Content Provider: RawPixel
ETL Process: Use the API to identify all CC0 images.
Output: TSV file containing the images and their respective meta-data.
Notes: https://api.rawpixel.com/api/v1/search?freecc0=1&html=0
No rate limit specified.
"""
from modules.etlMods import *
from urllib.parse import parse_qs
# Time delay (in seconds) handed to delayProcessing by the functions below.
DELAY = 1.0

# Output file name; the Unix timestamp taken at import time keeps each run's
# file distinct.
FILE = 'rawpixel_{}.tsv'.format(int(time.time()))

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s: [%(levelname)s - RawPixel API] =======> %(message)s',
)
def getImageList(_page=1):
    """Fetch one page of CC0 search results from the RawPixel API.

    Parameters:
        _page: page number placed in the 'page' query parameter (default 1).

    Returns:
        A two-item list [total, results] built from the 'total' and 'results'
        fields of the response, or [None, None] when the response is empty or
        carries no results.
    """
    endpoint = 'https://api.rawpixel.com/api/v1/search?freecc0=1&html=0&page={}'.format(_page)
    response = requestContent(endpoint)

    # NOTE(review): requestContent is defined outside this file; this guard
    # assumes it may hand back None (or an empty payload) when the request
    # fails, which would otherwise raise AttributeError on .get() below.
    if not response:
        return [None, None]

    results = response.get('results')
    if not results:
        return [None, None]

    return [response.get('total'), results]
def getMetaData(_image):
    """Build one output row of metadata for a single RawPixel search result.

    Parameters:
        _image: dict describing one image. The keys read here are 'freecc0',
            'id', 'url', 'image_opengraph', 'image_400', 'image_title',
            'artists' and 'keywords_raw'.

    Returns:
        A list of 17 string fields in which a backslash-N marker stands for
        an empty value, or None when the image is not flagged as CC0, has no
        landing page URL, or has no image URL.
    """
    startTime = time.time()

    # Verify the license first. A record that is not flagged CC0 must not be
    # emitted as a row of empty fields labelled 'cc0', so it is skipped.
    if not _image.get('freecc0'):
        return None

    # get the image identifier
    foreignID = _image.get('id', '')

    # get the landing page
    foreignURL = _image.get('url')
    if not foreignURL:
        logging.warning('Landing page not detected for image ID: {}'.format(foreignID))
        return None

    imgURL = _image.get('image_opengraph')
    if not imgURL:
        logging.warning('Image not detected in URL: {}'.format(foreignURL))
        return None

    # Extract the dimensions from the query params because the dimensions in
    # the metadata are at times inconsistent with the rescaled images.
    # The [''] default keeps a URL without 'w'/'h' from raising IndexError;
    # the empty string is later written out as the null marker.
    queryParams = parse_qs(urlparse(imgURL).query)
    width = queryParams.get('w', [''])[0]
    height = queryParams.get('h', [''])[0]
    thumbnail = _image.get('image_400', '')

    title = sanitizeString(_image.get('image_title', ''))
    owner = sanitizeString(_image.get('artists', ''))
    owner = owner.replace('(Source)', '').strip()

    # Keep every keyword except the ones that merely restate the license.
    tags = []
    keywords = _image.get('keywords_raw')
    if keywords:
        licenseWords = ('cc0', 'creative commons', 'creative commons 0')
        tags = [
            {'name': sanitizeString(tag), 'provider': 'rawpixel'}
            for tag in keywords.split(',')
            if tag.strip() not in licenseWords
        ]

    # NOTE(review): delayProcessing comes from modules.etlMods; presumably it
    # sleeps so that this step takes at least DELAY seconds -- confirm.
    delayProcessing(startTime, DELAY)

    null = '\\N'  # marker written for fields that have no value
    licenseName = 'cc0'
    version = '1.0'

    return [
        str(foreignID), foreignURL, imgURL,
        thumbnail if thumbnail else null,
        str(width) if width else null,
        str(height) if height else null, null,
        licenseName, str(version),
        owner if owner else null, null,
        title if title else null,
        null,
        json.dumps(tags, ensure_ascii=False) if tags else null,
        'f', 'rawpixel', 'rawpixel',
    ]
def main():
    """Page through the RawPixel search API and write the metadata to FILE.

    Each page returned by getImageList is converted row by row with
    getMetaData; rows that come back as None are dropped and the rest are
    passed to writeToFile. Paging stops when a request yields no results or
    no total, or when the number of extracted rows reaches the reported total.
    """
    page = 1
    imgCtr = 0

    logging.info('Begin: RawPixel API requests')

    total, result = getImageList(page)

    # Testing result/total before the comparison keeps a failed first request
    # (total is None) from raising TypeError in 'imgCtr < total'.
    while result and total and imgCtr < total:
        logging.info('Processing page: {}'.format(page))
        startTime = time.time()

        # getMetaData returns None for images it rejects; skip those.
        extracted = [row for row in map(getMetaData, result) if row]
        imgCtr += len(extracted)

        # write to file
        if extracted:
            writeToFile(extracted, FILE)

        page += 1
        delayProcessing(startTime, DELAY)  # throttle requests
        total, result = getImageList(page)

    logging.info('Total images: {}'.format(imgCtr))
    logging.info('Terminated!')
# Run the ETL only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()