-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_data.py
116 lines (99 loc) · 3.63 KB
/
get_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
from three import Three
from rake_nltk import Rake
import requests
import urllib
from urllib.request import urlretrieve
from urllib.parse import urlparse
import os
# Base URL of the FixMyStreet Open311 (v2) endpoint.
FMS_BASE_URL = "https://www.fixmystreet.com/open311/v2/"

# Request parameters shared by every call to the endpoint.
FMS_BASE_PARAMETERS = {"jurisdiction_id": "fixmystreet"}

# Root folder (relative to the working directory) that holds the dataset.
DATA_FOLDER = "data"

# Category names for which a folder is created under DATA_FOLDER at start-up.
# NOTE: names containing "/" become nested folders when joined into a path.
CATEGORIES = [
    "Abandoned vehicles",
    "Bus stops",
    "Car parking",
    "Dog fouling",
    "Dumped rubbish",
    "Flyposting",
    "Flytipping",
    "Graffiti",
    "Parks/landscapes",
    "Pavements/footpaths",
    "Potholes",
    "Public toilets",
    "Roads/highways",
    "Road traffic signs",
    "Rubbish (refuse and recycling)",
    "Street cleaning",
    "Street lighting",
    "Street nameplates",
    "Traffic lights",
    "Trees",
]
class FMS(Three):
    """``Three`` client preconfigured for the FixMyStreet endpoint.

    After the base class has been initialised, the instance is pointed at
    ``FMS_BASE_URL``, set to the ``"xml"`` format and given the jurisdiction
    id from ``FMS_BASE_PARAMETERS``.
    """

    def __init__(self):
        super().__init__()
        # Assigned in the same order as before: endpoint, format, jurisdiction.
        self.endpoint, self.format = FMS_BASE_URL, "xml"
        self.jurisdiction = FMS_BASE_PARAMETERS['jurisdiction_id']
def make_a_category_folder(category):
    """Create the folder for ``category`` under ``DATA_FOLDER`` if missing.

    Args:
        category: Category (or service code) name, used as a path relative
            to ``DATA_FOLDER``. A name containing a path separator, such as
            "Parks/landscapes", produces nested folders.

    Raises:
        FileExistsError: If the target path exists but is not a directory.
    """
    folder_path = os.path.join(DATA_FOLDER, category)
    # exist_ok=True makes the call idempotent without a separate existence
    # check, so there is no window between "check" and "create".
    os.makedirs(folder_path, exist_ok=True)
def setup_folders():
    """Initialise the folder structure for training if it doesn't exist.

    Creates ``DATA_FOLDER`` and then one sub-folder for every entry in
    ``CATEGORIES`` by calling ``make_a_category_folder``. Safe to call
    repeatedly.
    """
    # exist_ok=True replaces the separate existence check, so a folder that
    # appears between "check" and "create" no longer causes an error.
    os.makedirs(DATA_FOLDER, exist_ok=True)
    for category in CATEGORIES:
        make_a_category_folder(category)
def extract_labels(text_list):
    """Extract key phrases from a list of strings for use as model labels.

    The non-empty strings are joined with single spaces into one corpus,
    which is passed to the RAKE algorithm (``rake_nltk.Rake``).

    There may be value in running the operation across a whole dataset of
    reports and then ranking the phrases and using those rankings to select
    significant labels from the user supplied text. We also may need to
    reduce this dataset across all labels to remove the phrases that don't
    occur across multiple reports, for example placenames, etc, as this may
    create bias in our model.

    Args:
        text_list: Iterable of strings. ``None`` and empty entries are
            skipped rather than breaking the join.

    Returns:
        The list produced by ``Rake.get_ranked_phrases()``, or an empty list
        when there is no text to analyse.
    """
    corpus = " ".join(text for text in text_list if text)
    if not corpus.strip():
        # Nothing to analyse; skip building a Rake instance altogether.
        return []

    rake = Rake()
    rake.extract_keywords_from_text(corpus)
    return rake.get_ranked_phrases()
def grab_report_data(report):
    """Download the photo of one FMS report and write its label file.

    Given a mapping representing a single FMS report, the attached photo is
    saved to ``DATA_FOLDER/<service_code>/<file name from the URL>`` and the
    labels extracted from the report text are written next to it, in a file
    named after the photo without its extension.

    Reports without a usable ``media_url`` (key missing, value empty, or a
    URL whose path has no file name) are skipped.

    Args:
        report: Mapping with a ``service_code`` entry and, optionally,
            ``media_url``, ``detail``, ``title`` and ``description``.
            Missing or empty text fields are ignored.

    Raises:
        KeyError: If a report with a photo has no ``service_code``.
    """
    media_url = report.get('media_url')
    if not media_url:
        return

    file_name = os.path.basename(urlparse(media_url).path)
    if not file_name:
        # The URL path ends in "/", so there is no name to save the image as.
        return

    service_code = report['service_code']
    texts = [report[key]
             for key in ('detail', 'title', 'description')
             if report.get(key)]
    labels = extract_labels(texts)

    img_file_path = os.path.join(DATA_FOLDER, service_code, file_name)
    print(service_code)
    # Create the folder up front instead of retrying after a failed download,
    # so the URL is only fetched once.
    if not os.path.isdir(os.path.dirname(img_file_path)):
        print("New service code found, because we're special")
        make_a_category_folder(service_code)
    urlretrieve(media_url, img_file_path)

    label_file_name = os.path.splitext(file_name)[0]
    if label_file_name == file_name:
        # No extension to strip: without a suffix the label file would
        # overwrite the image that was just downloaded.
        label_file_name += ".labels"
    labels_file_path = os.path.join(DATA_FOLDER,
                                    service_code,
                                    label_file_name)
    print(labels_file_path)
    # Explicit encoding: labels may contain non-ASCII text.
    with open(labels_file_path, 'w', encoding='utf-8') as file_handler:
        file_handler.write("{}\n".format(labels))
def main():
    """Build the dataset: prepare folders, then process every report.

    The folder structure is set up first; an ``FMS`` client is then created
    and each entry under the ``'request'`` key of its ``requests()`` result
    is handed to ``grab_report_data``.
    """
    setup_folders()
    reports = FMS().requests()['request']
    for entry in reports:
        grab_report_data(entry)


# Run the scraper only when executed as a script, not on import.
if __name__ == '__main__':
    main()