forked from civictechdc/ancfinder
-
Notifications
You must be signed in to change notification settings - Fork 0
/
update_meeting_times.py
113 lines (97 loc) · 4.35 KB
/
update_meeting_times.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
from bs4 import BeautifulSoup
import urllib2, lxml, csv, json, datetime, os, errno, re
# URLs for different pages look like this:
# http://anc.dc.gov/events?field_date_time_rep_value[value]=2013-12-25&field_date_time_rep_value2[value]&keys=& ...
# type=All&field_topic_tid=All&field_audience_tid=All&field_ward_tid=All&field_police_service_area_tid=All&sort_ ...
# by=field_date_time_rep_value&sort_order=ASC&page=1
#-----------------------------------------------------------------------------
# Notice the "page=1" at the end. That's page 2. The real page one has no "page" attribute in the URL, but has everything else
# Link giving the last page number:
# <a title="Go to last page">
#-----------------------------------------------------------------------------
# A row in the table looks like this:
#
# <div class="views-row views-row-1 views-row-odd views-row-first calendar-event-row">
# <div class="views-field views-field-field-date-time-rep">
# <div class="field-content">
# <span class="date-display-single" property="dc:date" datatype="xsd:dateTime" content="2013-12-25T18:30:00-05:00">
# 12/25/2013 - 6:30pm
# </span>
# </div>
# </div>
# <div class="views-field views-field-title">
# <span class="field-content">
# <a href="http://calendar.dc.gov/event/anc-5b-monthly-meeting">
# ANC 5B Monthly Meeting
# </a>
# </span>
# </div>
# </div>
#------------------------------------------------------------------------------
# To break it down into individual dates and their corresponding ANC:
#
# <span class="date-display-single" property="dc:date" datatype="xsd:dateTime" content="2013-12-25T18:30:00-05:00">
# --> date/time in 12/25/2013 - 6:30pm format
# <span class="field-content"> --> name of council: 'ANC 5B Monthly Meeting'
#------------------------------------------------------------------------------
# Path to the JSON file where scraped meeting data is accumulated across runs.
file_name = 'ancbrigadesite/static/meetings.json'
# Ensure the directory for the JSON file exists, then load any prior data.
def mkdir_p(path):
    """Create directory *path* (and any parents) if it does not exist.

    Equivalent to ``mkdir -p``: an already-existing directory is not an
    error, but any other OSError (e.g. permissions) is re-raised.
    """
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        # Only swallow "already exists" when the path really is a directory;
        # anything else (permissions, a file in the way) must propagate.
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise
# BUG FIX: the original called mkdir_p(file_name), creating a *directory*
# named meetings.json, which broke the open() below.  Create the parent
# directory instead.
mkdir_p(os.path.dirname(file_name))
try:
    # Load the existing archive so this run appends to earlier scrapes.
    with open(file_name) as archive_file:
        archive = json.loads(archive_file.read())
except (IOError, ValueError):
    # BUG FIX: a missing file raised an uncaught IOError in the original
    # (only ValueError was handled).  Missing or invalid JSON both mean
    # "start fresh".
    archive = {}
# Open up the main events page and figure out how many result pages there
# are so we can loop through them all.
# BUG FIX: the URL scheme was mangled to "http:https://" in the original.
presoup = BeautifulSoup(urllib2.urlopen('http://anc.dc.gov/events'))
lastpage = presoup.find_all('a', {'title': 'Go to last page'})
lasthref = lastpage[0]['href']
# BUG FIX: the original kept only the final character of the href
# (int(lastnum[len(lastnum)-1])), so any listing with 10 or more pages was
# truncated.  Parse the whole page number out of the query string instead.
page_match = re.search(r'page=(\d+)', lasthref)
lastnum = int(page_match.group(1)) if page_match else 0
# The first page has no "page" parameter; pages 2..N use page=1..N-1.
pagenums = [''] + ['&page=' + str(i) for i in range(1, lastnum + 1)]
# Now loop through the URL for each results page and scrape every meeting row.
for page in pagenums:
    # BUG FIX: the URL scheme was mangled to "http:https://" in the original.
    soup = BeautifulSoup(urllib2.urlopen('http://anc.dc.gov/events?field_date_time_rep_value[value]=2013-12-25&field_date_time_rep_value2[value]&keys=&type=All&field_topic_tid=All&field_audience_tid=All&field_ward_tid=All&field_police_service_area_tid=All&sort_by=field_date_time_rep_value&sort_order=ASC' + page))
    # Renamed from `meetings` to avoid clobbering it inside the loop below.
    rows = soup.find_all('div', 'views-row')
    for row in rows:
        meet = BeautifulSoup(str(row))
        # "12/25/2013 - 6:30pm" -> ISO 8601 string used as the archive key.
        date = datetime.datetime.strptime(meet.find('span', 'date-display-single').text, '%m/%d/%Y - %I:%M%p').isoformat()
        # Title looks like "ANC 5B Monthly Meeting"; chars 4:6 are the ANC id ("5B").
        anc = meet.find('span', 'field-content').text[4:6]
        link = meet.find('a').get('href')
        if link[0] == '/':
            # Relative link: make it absolute (scheme fixed from "http:https://").
            link = 'http://anc.dc.gov' + link
        # Fetch the event detail page for location information.
        link_text = BeautifulSoup(urllib2.urlopen(link))
        address = link_text.find('div', 'field-name-field-location')
        address = address.find('div', 'field-item').text
        try:
            building = link_text.find('div', 'field-name-field-building-name')
            building = building.find('div', 'field-item').text
        except AttributeError:
            # Detail page has no building name div.
            building = None
        try:
            room = link_text.find('div', 'field-name-field-suite-number')
            room = room.find('div', 'field-item').text
        except AttributeError:
            # Detail page has no suite/room number div.
            room = None
        # any output gets emailed to Josh whenever this script is run by cron,
        # so let's not have output
        details = {'address': address, 'building': building, 'room': room, 'link': link}
        # BUG FIX: the original's KeyError handler created the ANC entry but
        # silently dropped the meeting that triggered it, and rebound the
        # outer `meetings` list.  setdefault records every meeting.
        archive.setdefault(anc, {'meetings': {}})['meetings'][date] = details
# Persist the updated archive twice: once as plain JSON, and once as a
# JavaScript-style assignment file ("meetings.jsonp") for direct inclusion.
serialized = json.dumps(archive, sort_keys=True, indent=4)
with open(file_name, 'w') as output:
    output.write(serialized)
with open(file_name + 'p', 'w') as output:
    output.write('anc_meetings = \n')
    output.write(serialized)