forked from cc-archive/cccatalog
-
Notifications
You must be signed in to change notification settings - Fork 0
/
SyncImageProviders.py
98 lines (69 loc) · 2.99 KB
/
SyncImageProviders.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import boto3
import re
import os
import sys
import logging
import argparse
from botocore import UNSIGNED
from botocore.client import Config
# Required configuration, read from the environment at import time
# (raises KeyError immediately if any variable is unset).
BUCKET = os.environ['S3_BUCKET']  # S3 bucket holding common-crawl image data
PATH = os.environ['OUTPUT_DIR']  # local output prefix; NOTE(review): concatenated directly before file names, presumably ends with '/' — confirm
ACCESS_KEY = os.environ['AWS_ACCESS_KEY']  # AWS credentials for the private bucket
SECRET_KEY = os.environ['AWS_SECRET_KEY']
# Tag every log line with this script's name.
logging.basicConfig(format='%(asctime)s: [%(levelname)s - Sync Common Crawl Image Providers] =======> %(message)s', level=logging.INFO)
def getCrawlIndex(_param):
    """Return the common crawl index to process.

    If _param is falsy, list the public 'commoncrawl' S3 bucket with
    unsigned requests and return the most recent CC-MAIN-* collection
    found under cc-index/collections/; otherwise return _param as-is.
    """
    if not _param:  # get the most recent index from common crawl
        bucket = 'commoncrawl'
        s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
        contents = []
        prefix = 'cc-index/collections/CC-MAIN-'
        botoArgs = {'Bucket': bucket, 'Prefix': prefix}
        while True:
            objects = s3.list_objects_v2(**botoArgs)
            # 'Contents' is absent from the response when a page has no keys
            for obj in objects.get('Contents', []):
                key = obj['Key']
                if 'indexes' in key:
                    # key looks like cc-index/collections/CC-MAIN-YYYY-WW/indexes/...
                    cIndex = key.split('/indexes/')[0].split('/')[-1]
                    if str(cIndex) not in contents:
                        contents.append(str(cIndex))
            try:
                botoArgs['ContinuationToken'] = objects['NextContinuationToken']
            except KeyError:
                break  # no more pages
        if contents:
            # keys are listed in lexicographic order, so the last entry is newest
            _param = contents[-1]
    return _param
def validateIndexPattern(_index, _pattern=re.compile(r'CC-MAIN-\d{4}-\d{2}')):
    """argparse type-checker for a common crawl index name (CC-MAIN-YYYY-WW).

    Returns _index unchanged on success; raises argparse.ArgumentTypeError
    (with a message, so argparse can show it) on a malformed value.
    The compiled default pattern is intentional: it is immutable and hoists
    the re.compile out of repeated calls.
    """
    # fullmatch rejects trailing garbage (e.g. 'CC-MAIN-2020-16x'),
    # which a bare .match() would silently accept.
    if not _pattern.fullmatch(_index):
        logging.error('Invalid common crawl index format => {}.'.format(_index))
        raise argparse.ArgumentTypeError(
            'Invalid common crawl index format => {}.'.format(_index))
    return _index
def syncS3Objects(_index):
    """Download the result files for the given crawl index from S3.

    Every object under common_crawl_image_data/<_index> (except Spark
    '_SUCCESS' markers) is written under PATH with '/' flattened to '_'
    and '.csv' renamed to '.tsv'.  Objects are deleted from the bucket
    only after the local copy exists and is non-empty; '_SUCCESS'
    markers are deleted without being downloaded.
    """
    s3 = boto3.client('s3', aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY)
    keyPrefix = 'common_crawl_image_data/'
    botoArgs = {'Bucket': BUCKET, 'Prefix': '{}{}'.format(keyPrefix, _index)}
    # Paginate: a single list_objects_v2 call returns at most 1000 keys.
    while True:
        objects = s3.list_objects_v2(**botoArgs)
        for obj in objects.get('Contents', []):
            key = obj['Key']
            if '_SUCCESS' not in key:
                # NB: str.lstrip(prefix) strips a *character set*, not a
                # prefix string — remove the key prefix by slicing instead.
                fileName = key[len(keyPrefix):] if key.startswith(keyPrefix) else key
                fileName = fileName.replace('/', '_')
                fileName = '{}{}'.format(PATH, fileName)
                fileName = fileName.replace('.csv', '.tsv')
                with open(fileName, 'wb') as fh:
                    s3.download_fileobj(BUCKET, key, fh)
                # check the file exists locally before removing it from the s3 bucket
                if os.path.exists(fileName) and os.path.getsize(fileName) > 0:
                    s3.delete_object(Bucket=BUCKET, Key=key)
                    logging.info('Deleted object: {}'.format(key))
            else:
                # '_SUCCESS' markers carry no data; just drop them from the bucket
                s3.delete_object(Bucket=BUCKET, Key=key)
        try:
            botoArgs['ContinuationToken'] = objects['NextContinuationToken']
        except KeyError:
            break  # no more pages
def main():
    """Entry point: resolve the crawl index from the CLI and sync its objects."""
    argParser = argparse.ArgumentParser(
        description='Sync Common Crawl Image Providers', add_help=True)
    argParser.add_argument('--index', type=validateIndexPattern)
    options = argParser.parse_args()
    # Falls back to the newest published index when --index is omitted.
    ccIndex = getCrawlIndex(options.index)
    syncS3Objects(ccIndex)
    logging.info('Terminated!')
# Run the sync only when executed as a script, not on import.
if __name__ == '__main__':
    main()