-
Notifications
You must be signed in to change notification settings - Fork 0
/
mail_retriever.py
128 lines (109 loc) · 4.87 KB
/
mail_retriever.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import email
import imaplib
import logging
import pickle
import re
from datetime import datetime
import pandas as pd
from bs4 import BeautifulSoup
import constants
class Booking:
    """A single laundry-room booking: a date plus start/end timestamps.

    ``total_time_hours`` is derived from the start/end delta at construction
    time.

    Parameters
    ----------
    date : datetime.date
        Calendar day of the booking (normally ``start_time.date()``).
    start_time, end_time : datetime.datetime
        Booking boundaries parsed from the confirmation mail.
    """

    def __init__(self, date, start_time, end_time):
        self.date = date
        self.start_time = start_time
        self.end_time = end_time
        # BUGFIX: use total_seconds(), not .seconds — .seconds drops the
        # days component of the timedelta, so any span >= 24h (or a
        # negative span) produced a wrong duration.
        self.total_time_hours = (end_time - start_time).total_seconds() / 3600

    def __str__(self):
        return f"{self.date} {self.start_time.time()} to {self.end_time.time()}, total: {self.total_time_hours} hours"

    def __repr__(self):
        return self.__str__()

    def to_dict(self):
        """Return the booking as a flat dict (used to build DataFrame rows)."""
        return {
            'date': self.date,
            'start_time': self.start_time,
            'end_time': self.end_time,
            'total_time_hours': self.total_time_hours
        }
class MailRetriever:
    """Fetch laundry-booking confirmation mails over IMAP, parse them into
    ``Booking`` objects, and consolidate the result into a CSV file.

    Parsed bookings are pickled to ``bookings.pkl`` so later runs can skip
    the slow IMAP round-trip (``use_cache=True``), or fetch only mails newer
    than the latest cached booking (``retrieve_after=True``).
    """

    def __init__(self, mail_login=None, mail_pwd=None, use_cache=False, retrieve_after: bool = False) -> None:
        """Store credentials and options; no I/O happens here.

        Parameters
        ----------
        mail_login, mail_pwd : str | None
            IMAP credentials (Gmail app password expected — TODO confirm).
        use_cache : bool
            When True, ``get_mails`` loads bookings from the pickle cache
            instead of contacting the server.
        retrieve_after : bool
            When True, only mails newer than the latest cached booking date
            are fetched.
        """
        self.mail_login = mail_login
        self.mail_pwd = mail_pwd
        self.bookings = []
        self.use_cache = use_cache
        self.retrieve_after = retrieve_after

    def init_imap(self) -> None:
        """Open an authenticated, read-only IMAP session on the Gmail folder."""
        self.imap = imaplib.IMAP4_SSL(constants.GMAIL_IMAP)
        self.imap.login(self.mail_login, self.mail_pwd)
        self.imap.select(constants.GMAIL_FOLDER_QUERY, readonly=True)

    @staticmethod
    def extract_booking_time(parsed_html):
        """Scan parsed ``<p>`` tags for the booking start/end timestamps.

        Returns a ``Booking`` built from the first tag whose text matches
        ``constants.BOOKING_DATE_REGEX``, or None when no tag matches.
        """
        for tag in parsed_html:
            if (result := re.findall(constants.BOOKING_DATE_REGEX, tag.text)) != []:
                start_time = datetime.strptime(result[0][0], "%Y-%m-%d %H:%M:%S")
                end_time = datetime.strptime(result[1][0], "%Y-%m-%d %H:%M:%S")
                return Booking(start_time.date(), start_time, end_time)
        return None  # explicit: no booking found in this mail body

    def retrieve_cache(self) -> None:
        """Load previously persisted bookings from ``bookings.pkl``, if present.

        NOTE(review): ``pickle.load`` on a local cache file this process wrote
        itself — do not point this at untrusted data.
        """
        try:
            # Context manager fixes the leaked file handle of the original
            # bare open() (the file was never closed).
            with open("bookings.pkl", "rb") as cache_file:
                self.bookings = pickle.load(cache_file)
            logging.info("Retrieved data from cache.")
        except FileNotFoundError:
            # Best-effort: missing cache just means a full retrieval is needed.
            logging.warning("No cache found. Retrieving mails...")

    def get_mails(self) -> None:
        """Populate ``self.bookings`` (from cache or IMAP), then consolidate.

        Raises
        ------
        RuntimeError
            If the IMAP search does not return 'OK'.
        """
        if self.use_cache:
            self.retrieve_cache()
            self.consolidate_bookings()
            return
        self.init_imap()
        # Retrieve all mails received from `retrieve_after` date
        if self.retrieve_after:
            latest_booking = self.find_lates_booking_date()
            logging.info(f"Looking for mails after {latest_booking}")
            imap_query = f'(FROM "{constants.LAUNDRY_CONFIRMATION_EMAIL}" SINCE "{latest_booking.strftime("%d-%b-%Y")}")'
        else:
            imap_query = f'(FROM "{constants.LAUNDRY_CONFIRMATION_EMAIL}")'
        status, messages = self.imap.search(None, imap_query)
        if status != 'OK':
            # RuntimeError is an Exception subclass, so existing
            # `except Exception` callers still catch it.
            raise RuntimeError("Error searching Inbox.")
        mails = messages[0].split()
        logging.info(f"Found {len(mails)} mails.")
        for message_uid in mails:
            # msg_data: avoid shadowing — the original rebound `msg` from the
            # fetch response to the parsed message inside the same loop.
            _, msg_data = self.imap.fetch(message_uid.decode(), "(RFC822)")
            for response in msg_data:
                if not isinstance(response, tuple):
                    continue
                msg = email.message_from_bytes(response[1])
                if msg.is_multipart():
                    # One laundry-session might consist of two bookings
                    for part in msg.walk():
                        content_type = part.get_content_type()
                        content_disposition = str(part.get("Content-Disposition"))
                        if content_type == "text/html" and "attachment" not in content_disposition:
                            body = part.get_payload(decode=True).decode()
                            parsed_html = BeautifulSoup(body, features="lxml")
                            soup = parsed_html.body.find_all('p')
                            self.bookings.append(self.extract_booking_time(soup))
                            # Only the first HTML part per message is parsed.
                            break
        logging.info("Done retrieving mails.")
        self.imap.close()
        self.imap.logout()
        self.consolidate_bookings()
        self.persist_data()

    def persist_data(self) -> None:
        """Pickle the current bookings list to ``bookings.pkl``."""
        # Context manager guarantees the handle is closed even on error.
        with open("bookings.pkl", "wb") as cache_file:
            pickle.dump(self.bookings, cache_file)
        logging.info("Data persisted to cache.")

    def find_lates_booking_date(self) -> datetime:
        """Return the latest booking date found in the cache.

        NOTE(review): the name typo ("lates") is kept for backward
        compatibility with existing callers.
        """
        self.retrieve_cache()
        df = pd.DataFrame.from_records([b.to_dict() for b in self.bookings if b is not None])
        return df['date'].max()

    def consolidate_bookings(self) -> None:
        """De-duplicate bookings, merge same-day sessions, and write a CSV.

        Same-day bookings are aggregated to earliest start, latest end, and
        summed hours.
        """
        records = [b.to_dict() for b in self.bookings if b is not None]
        if not records:
            # Guard: an empty record list yields a column-less DataFrame and
            # the groupby('date') below would raise KeyError.
            logging.warning("No bookings to consolidate; skipping CSV export.")
            return
        df = pd.DataFrame.from_records(records)
        # remove duplicates
        df.drop_duplicates(inplace=True)
        # Aggregate bookings on same day
        df = df.groupby('date').agg({'start_time': 'min', 'end_time': 'max', 'total_time_hours': 'sum'})
        df.to_csv(constants.DATA_PATH)
        logging.info(f"Data consolidated and persisted to {constants.DATA_PATH}.")