forked from civictechdc/ancfinder
-
Notifications
You must be signed in to change notification settings - Fork 0
/
update_meeting_times.py
128 lines (109 loc) · 4.29 KB
/
update_meeting_times.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
from bs4 import BeautifulSoup
import dateutil.parser
import sys, urllib2, lxml, csv, json, datetime, os, errno, re, os.path, urlparse
# Get ANC upcoming meetings from http://anc.dc.gov/events by reading the page
# for meeting details URLs, and following the "next >" link to page through
# the paginated results.
#
# A row in the table looks like this:
#
# <div class="views-row views-row-1 views-row-odd views-row-first calendar-event-row">
# <div class="views-field views-field-field-date-time-rep">
# <div class="field-content">
# <span class="date-display-single" property="dc:date" datatype="xsd:dateTime" content="2013-12-25T18:30:00-05:00">
# 12/25/2013 - 6:30pm
# </span>
# </div>
# </div>
# <div class="views-field views-field-title">
# <span class="field-content">
# <a href="http://calendar.dc.gov/event/anc-5b-monthly-meeting">
# ANC 5B Monthly Meeting
# </a>
# </span>
# </div>
# </div>
#------------------------------------------------------------------------------
# To break it down into individual dates and their corresponding ANC:
#
# <span class="date-display-single" property="dc:date" datatype="xsd:dateTime" content="2013-12-25T18:30:00-05:00">
# --> date/time in 12/25/2013 - 6:30pm format
# <span class="field-content"> --> name of council: 'ANC 5B Monthly Meeting'
#------------------------------------------------------------------------------
# Path of the JSON archive of ANC meeting times: read at startup if present,
# updated in memory, and rewritten in full at the end of the script.
file_name = 'data/meetings.json'
# Ensure JSON file output directory exists and then open.
def mkdir_p(path):
    """Create *path* and any missing parent directories.

    Behaves like ``mkdir -p`` (the pre-Python-3.2 equivalent of
    ``os.makedirs(path, exist_ok=True)``): it is a no-op when the
    directory already exists, and re-raises any other OSError.
    """
    try:
        os.makedirs(path)
    except OSError as exc:
        # Only swallow EEXIST when the path really is a directory;
        # if a regular file sits in the way (or the error is anything
        # else, e.g. permissions), surface it to the caller.
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise
# Ensure the directory that will hold the JSON file exists.
# NOTE: this previously used os.path.basename, which for
# 'data/meetings.json' yields 'meetings.json' and so created a
# directory of that name in the cwd instead of 'data/'; dirname is
# what the final open(file_name, 'w') actually needs.
mkdir_p(os.path.dirname(file_name))

# Load the existing archive, starting from an empty one if the file
# does not exist yet.
try:
    archive = json.loads(open(file_name).read())
except IOError:
    archive = {}

# Remove any future meetings (they are re-added below) in case their
# details have changed on the DC events site since the last run.
now = datetime.datetime.now()
for anc_record in archive.values():
    for mtg in list(anc_record.get("meetings", {})):
        if dateutil.parser.parse(mtg) > now:
            del anc_record["meetings"][mtg]
# Loop through the paginated list of upcoming ANC events, starting at
# the first page of the District's events calendar.
# (Fixed: the URL scheme was a garbled 'http:https://'.)
url = 'http://anc.dc.gov/events'
# Raw HTML of each meeting-detail page, keyed by URL, so repeated
# meetings that share one detail page are fetched only once.
cached_meeting_data = { }
# NOTE(review): meeting_links appears unused in this script; kept for
# compatibility in case an external caller imports it.
meeting_links = []
# Walk the paginated calendar, harvesting one archive entry per meeting.
while True:
    if sys.stdout.isatty(): print url, "..." # don't print when running from a cron job
    soup = BeautifulSoup(urllib2.urlopen(url))
    # Each meeting is a <div class="views-row ..."> (see sample markup above).
    meetings = soup.find_all('div','views-row')
    last_meeting_date = None
    for meet in meetings:
        # get the date, ANC, and link to more information
        date = datetime.datetime.strptime(meet.find('span','date-display-single').text, '%m/%d/%Y - %I:%M%p')
        # Title is of the form "ANC 5B Monthly Meeting", so characters
        # 4-5 are the ANC identifier ("5B").
        anc = meet.find('span','field-content').text[4:6]
        link = urlparse.urljoin(url, meet.find('a').get('href'))
        # scrape the individual meeting page for more details. the same
        # target page is used for each meeting time, so we can cache it
        # to be a little faster.
        if link not in cached_meeting_data:
            if sys.stdout.isatty(): print "\t", link, "..."
            meeting_info = urllib2.urlopen(link).read()
            cached_meeting_data[link] = meeting_info
        else:
            meeting_info = cached_meeting_data[link]
        meeting_info = BeautifulSoup(meeting_info)
        # Each detail field may be missing from the page; fall back to
        # None instead of crashing on the .find(...) returning None.
        try:
            address = meeting_info.find('div', 'field-name-field-location').find('div','field-item').text
        except AttributeError:
            address = None
        try:
            building = meeting_info.find('div','field-name-field-building-name').find('div','field-item').text
        except AttributeError:
            building = None
        try:
            room = meeting_info.find('div','field-name-field-suite-number').find('div','field-item').text
        except AttributeError:
            room = None
        # Record (or overwrite) this meeting, keyed by ISO 8601 start time.
        if anc not in archive: archive[anc] = { "meetings": { } }
        archive[anc]['meetings'][date.isoformat()] = {
            'address': address,
            'building': building,
            'room': room,
            'link': link
        }
        last_meeting_date = date
    # Stop if we are waaaay in the future, since the events page goes years ahead, which
    # is not actually very helpful.  (~6 months = 6 * 30.5 days.)
    if last_meeting_date and last_meeting_date > datetime.datetime.now()+datetime.timedelta(days=6*30.5):
        break
    # Go onto the next page, if there is a next page.
    nextpage = soup.find_all('a',{'title':'Go to next page'})
    if not nextpage:
        break
    # turn a relative URL into an absolute URL for the next iteration
    url = urlparse.urljoin(url, nextpage[0]['href'])
# Persist the updated meeting archive as pretty-printed, key-sorted JSON.
with open(file_name, 'w') as out_file:
    out_file.write(json.dumps(archive, sort_keys=True, indent=4))