-
Notifications
You must be signed in to change notification settings - Fork 0
/
mail_retriever.py
128 lines (109 loc) · 4.87 KB
/
mail_retriever.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import email
import imaplib
import logging
import pickle
import re
from datetime import datetime
import pandas as pd
from bs4 import BeautifulSoup
import constants
class Booking:
    """A single laundry-room booking: a date plus start/end timestamps.

    ``total_time_hours`` is derived from the start/end delta at construction
    time.

    Parameters
    ----------
    date : datetime.date
        Calendar day of the booking (normally ``start_time.date()``).
    start_time, end_time : datetime.datetime
        Booking boundaries parsed from the confirmation mail.
    """

    def __init__(self, date, start_time, end_time):
        self.date = date
        self.start_time = start_time
        self.end_time = end_time
        # BUGFIX: use total_seconds(), not .seconds — .seconds drops the
        # days component of the timedelta, so any span >= 24h (or a
        # negative span) produced a wrong duration.
        self.total_time_hours = (end_time - start_time).total_seconds() / 3600

    def __str__(self):
        return f"{self.date} {self.start_time.time()} to {self.end_time.time()}, total: {self.total_time_hours} hours"

    def __repr__(self):
        return self.__str__()

    def to_dict(self):
        """Return the booking as a flat dict (used to build DataFrame rows)."""
        return {
            'date': self.date,
            'start_time': self.start_time,
            'end_time': self.end_time,
            'total_time_hours': self.total_time_hours
        }
class MailRetriever:
    """Fetch laundry-booking confirmation mails over IMAP, parse them into
    ``Booking`` objects, and consolidate the result into a CSV file.

    Parsed bookings are pickled to ``bookings.pkl`` so later runs can skip
    the slow IMAP round-trip (``use_cache=True``), or fetch only mails newer
    than the latest cached booking (``retrieve_after=True``).
    """

    def __init__(self, mail_login=None, mail_pwd=None, use_cache=False, retrieve_after: bool = False) -> None:
        """Store credentials and options; no I/O happens here.

        Parameters
        ----------
        mail_login, mail_pwd : str | None
            IMAP credentials (Gmail app password expected — TODO confirm).
        use_cache : bool
            When True, ``get_mails`` loads bookings from the pickle cache
            instead of contacting the server.
        retrieve_after : bool
            When True, only mails newer than the latest cached booking date
            are fetched.
        """
        self.mail_login = mail_login
        self.mail_pwd = mail_pwd
        self.bookings = []
        self.use_cache = use_cache
        self.retrieve_after = retrieve_after

    def init_imap(self) -> None:
        """Open an authenticated, read-only IMAP session on the Gmail folder."""
        self.imap = imaplib.IMAP4_SSL(constants.GMAIL_IMAP)
        self.imap.login(self.mail_login, self.mail_pwd)
        self.imap.select(constants.GMAIL_FOLDER_QUERY, readonly=True)

    @staticmethod
    def extract_booking_time(parsed_html):
        """Scan parsed ``<p>`` tags for the booking start/end timestamps.

        Returns a ``Booking`` built from the first tag whose text matches
        ``constants.BOOKING_DATE_REGEX``, or None when no tag matches.
        """
        for tag in parsed_html:
            if (result := re.findall(constants.BOOKING_DATE_REGEX, tag.text)) != []:
                start_time = datetime.strptime(result[0][0], "%Y-%m-%d %H:%M:%S")
                end_time = datetime.strptime(result[1][0], "%Y-%m-%d %H:%M:%S")
                return Booking(start_time.date(), start_time, end_time)
        return None  # explicit: no booking found in this mail body

    def retrieve_cache(self) -> None:
        """Load previously persisted bookings from ``bookings.pkl``, if present.

        NOTE(review): ``pickle.load`` on a local cache file this process wrote
        itself — do not point this at untrusted data.
        """
        try:
            # Context manager fixes the leaked file handle of the original
            # bare open() (the file was never closed).
            with open("bookings.pkl", "rb") as cache_file:
                self.bookings = pickle.load(cache_file)
            logging.info("Retrieved data from cache.")
        except FileNotFoundError:
            # Best-effort: missing cache just means a full retrieval is needed.
            logging.warning("No cache found. Retrieving mails...")

    def get_mails(self) -> None:
        """Populate ``self.bookings`` (from cache or IMAP), then consolidate.

        Raises
        ------
        RuntimeError
            If the IMAP search does not return 'OK'.
        """
        if self.use_cache:
            self.retrieve_cache()
            self.consolidate_bookings()
            return
        self.init_imap()
        # Retrieve all mails received from `retrieve_after` date
        if self.retrieve_after:
            latest_booking = self.find_lates_booking_date()
            logging.info(f"Looking for mails after {latest_booking}")
            imap_query = f'(FROM "{constants.LAUNDRY_CONFIRMATION_EMAIL}" SINCE "{latest_booking.strftime("%d-%b-%Y")}")'
        else:
            imap_query = f'(FROM "{constants.LAUNDRY_CONFIRMATION_EMAIL}")'
        status, messages = self.imap.search(None, imap_query)
        if status != 'OK':
            # RuntimeError is an Exception subclass, so existing
            # `except Exception` callers still catch it.
            raise RuntimeError("Error searching Inbox.")
        mails = messages[0].split()
        logging.info(f"Found {len(mails)} mails.")
        for message_uid in mails:
            # msg_data: avoid shadowing — the original rebound `msg` from the
            # fetch response to the parsed message inside the same loop.
            _, msg_data = self.imap.fetch(message_uid.decode(), "(RFC822)")
            for response in msg_data:
                if not isinstance(response, tuple):
                    continue
                msg = email.message_from_bytes(response[1])
                if msg.is_multipart():
                    # One laundry-session might consist of two bookings
                    for part in msg.walk():
                        content_type = part.get_content_type()
                        content_disposition = str(part.get("Content-Disposition"))
                        if content_type == "text/html" and "attachment" not in content_disposition:
                            body = part.get_payload(decode=True).decode()
                            parsed_html = BeautifulSoup(body, features="lxml")
                            soup = parsed_html.body.find_all('p')
                            self.bookings.append(self.extract_booking_time(soup))
                            # Only the first HTML part per message is parsed.
                            break
        logging.info("Done retrieving mails.")
        self.imap.close()
        self.imap.logout()
        self.consolidate_bookings()
        self.persist_data()

    def persist_data(self) -> None:
        """Pickle the current bookings list to ``bookings.pkl``."""
        # Context manager guarantees the handle is closed even on error.
        with open("bookings.pkl", "wb") as cache_file:
            pickle.dump(self.bookings, cache_file)
        logging.info("Data persisted to cache.")

    def find_lates_booking_date(self) -> datetime:
        """Return the latest booking date found in the cache.

        NOTE(review): the name typo ("lates") is kept for backward
        compatibility with existing callers.
        """
        self.retrieve_cache()
        df = pd.DataFrame.from_records([b.to_dict() for b in self.bookings if b is not None])
        return df['date'].max()

    def consolidate_bookings(self) -> None:
        """De-duplicate bookings, merge same-day sessions, and write a CSV.

        Same-day bookings are aggregated to earliest start, latest end, and
        summed hours.
        """
        records = [b.to_dict() for b in self.bookings if b is not None]
        if not records:
            # Guard: an empty record list yields a column-less DataFrame and
            # the groupby('date') below would raise KeyError.
            logging.warning("No bookings to consolidate; skipping CSV export.")
            return
        df = pd.DataFrame.from_records(records)
        # remove duplicates
        df.drop_duplicates(inplace=True)
        # Aggregate bookings on same day
        df = df.groupby('date').agg({'start_time': 'min', 'end_time': 'max', 'total_time_hours': 'sum'})
        df.to_csv(constants.DATA_PATH)
        logging.info(f"Data consolidated and persisted to {constants.DATA_PATH}.")