Commit 0.12

vardecab committed Mar 2, 2021
1 parent fad2dc2 commit aae247a
Showing 4 changed files with 627 additions and 306 deletions.
7 changes: 6 additions & 1 deletion .gitignore

@@ -25,4 +25,9 @@ scraper-olx.py
 output-olx
 _*
 automate/*
-!automate/.gitkeep
+!automate/.gitkeep
+olx/data
+!olx/data/.gitkeep
+olx/olx_output
+!olx/olx_output/.gitkeep
+!olx/olx_output/diff/.gitkeep
7 changes: 4 additions & 3 deletions README.md

@@ -1,6 +1,6 @@
-# otomoto-scraper
+# otomoto_olx-scraper
 
->Scrape car offers from OTOMOTO․pl and run IFTTT automation (e.g. send an email; add a to-do task) when new cars matching the search criteria are found. With support for native macOS & Windows 10 notifications.
+>Scrape car offers from OTOMOTO․pl & OLX․pl and run IFTTT automation (e.g. send an email; add a to-do task) when new cars matching the search criteria are found. With support for native macOS & Windows 10 notifications.
 <!-- Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. -->
 
@@ -14,6 +14,7 @@
 
 ## Release History
 
+- 0.12: Added OLX․pl support 🎉
 - 0.11.1: Replaced the old `win10toast` module with `win10toast-click`.
 - 0.11: Improved Windows 10 notifications to open a URL on click using [win10toast-click](https://github.com/vardecab/win10toast-click); added a URL-shortening module; renamed a few variables; cleaned up the project structure.
 - 0.10: Pagination support - the script scrapes only the number of pages available for a given search query instead of relying on a hard-coded value. Also: turned off notifications when there are no new cars; fixed a bug that prevented adding more than 32 cars to the file.
@@ -52,7 +53,7 @@ Using [SemVer](https://semver.org/).
 - [click Windows 10 notification to open URL](https://stackoverflow.com/questions/63867448/interactive-notification-windows-10-using-python)
 
 ### Other
-- [Flaticon](https://www.flaticon.com/)
+- [Flaticon / Freepik](https://www.flaticon.com/)
 - [IFTTT](https://ifttt.com/)
 - [Connect a Python Script to IFTTT by Enrico Bergamini](https://medium.com/mai-piu-senza/connect-a-python-script-to-ifttt-8ee0240bb3aa)
 - [Use IFTTT web requests to send email alerts by Anthony Hartup](https://anthscomputercave.com/tutorials/ifttt/using_ifttt_web_request_email.html)
313 changes: 313 additions & 0 deletions olx/olx.py
@@ -0,0 +1,313 @@
# === libs ===

import pickle # store data
import os # create new folders
from urllib.request import urlopen # open URLs
from bs4 import BeautifulSoup # BeautifulSoup; parsing HTML
import re # regex; extract substrings
import time # delay execution; calculate script's run time
from datetime import datetime # add IDs to files/folders' names
from alive_progress import alive_bar # progress bar
import webbrowser # open browser
import ssl # certificate issue fix: https://stackoverflow.com/questions/52805115/certificate-verify-failed-unable-to-get-local-issuer-certificate
import certifi # certificate issue fix: https://stackoverflow.com/questions/52805115/certificate-verify-failed-unable-to-get-local-issuer-certificate
from sys import platform # check platform (Windows/Linux/macOS)
if platform == 'win32':
    from win10toast_click import ToastNotifier # Windows 10 notifications
    toaster = ToastNotifier() # initialize win10toast
    # from termcolor import colored # colored input/output in terminal
elif platform == 'darwin':
    import pync # macOS notifications
import requests # for IFTTT integration to send webhook
from urllib import request # get OLX's page source
import gdshortener # shorten URLs using is.gd

# === start + run time ===

start = time.time() # run time start
print("Starting...")

# === have current date & time in exported files' names ===

# https://www.w3schools.com/python/python_datetime.asp
this_run_datetime = datetime.strftime(datetime.now(), '%y%m%d-%H%M%S') # eg 210120-173112

file_saved_date = './data/date.pk'
try: # might crash on first run
    # load your data back to memory so we can save a new value; NOTE: b = binary
    with open(file_saved_date, 'rb') as file:
        previous_run_datetime = pickle.load(file) # keep previous_run_datetime (last time the script ran) in a file so we can retrieve it later and compare / diff files
    print("Previous run:", previous_run_datetime)
except IOError:
    print("First run - no file exists.") # on the first run the file doesn't exist yet, so we skip

try:
    with open(file_saved_date, 'wb') as file: # open pickle file
        pickle.dump(this_run_datetime, file) # dump this_run_datetime (the time the script is running) into the file so we can use it later to compare / diff files
    print("This run:", this_run_datetime)
except IOError:
    print("File doesn't exist.")

# create new folder
if not os.path.isdir("olx_output/" + this_run_datetime):
    os.mkdir("olx_output/" + this_run_datetime) # eg 210120-173112
    print("Folder created:", this_run_datetime)

# === URL to scrape ===

# BMW, 140+ KM, AT, Pb/On, 2002+, 18.5k PLN, Tarnów + 100 km, sort: newest
page_url = 'https://www.olx.pl/motoryzacja/samochody/bmw/tarnow/?search%5Bfilter_float_price%3Ato%5D=18500&search%5Bfilter_float_year%3Afrom%5D=2002&search%5Bfilter_enum_petrol%5D%5B0%5D=petrol&search%5Bfilter_enum_petrol%5D%5B1%5D=diesel&search%5Bfilter_float_enginepower%3Afrom%5D=140&search%5Bfilter_enum_condition%5D%5B0%5D=notdamaged&search%5Bfilter_enum_transmission%5D%5B0%5D=automatic&search%5Bdist%5D=100'
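# the query string above, URL-decoded (%5B/%5D are [ and ], %3A is :), sets:
# search[filter_float_price:to]=18500, search[filter_float_year:from]=2002,
# search[filter_enum_petrol][0..1]=petrol,diesel, search[filter_float_enginepower:from]=140,
# search[filter_enum_condition][0]=notdamaged, search[filter_enum_transmission][0]=automatic,
# search[dist]=100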

# === shorten the URL ===

isgd = gdshortener.ISGDShortener() # initialize
page_url_shortened = isgd.shorten(page_url) # shorten URL; result is in tuple
print("Page URL:", page_url_shortened[0]) # [0] to get the first element from tuple

# === IFTTT automation ===

file_saved_imk = '../data/imk.pk'
try: # might crash on first run
    # load your data back to memory so we can use the saved value; NOTE: b = binary
    with open(file_saved_imk, 'rb') as file:
        ifttt_maker_key = pickle.load(file)
except IOError:
    print("First run - no file exists.")
    ifttt_maker_key = '' # fallback so the f-string below doesn't crash with NameError; save your key to ../data/imk.pk

event_name = 'new-car-otomoto' # TODO: change the event_name
webhook_url = f'https://maker.ifttt.com/trigger/{event_name}/with/key/{ifttt_maker_key}'

def run_ifttt_automation(url):
    report = {}
    report["value1"] = url
    # report["value2"] = second
    # report["value3"] = third
    requests.post(webhook_url, data=report)
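# a short usage sketch (hypothetical offer URL): IFTTT's Maker webhook exposes the
# posted form fields as the {{Value1}}..{{Value3}} ingredients inside the applet, so
# run_ifttt_automation('https://www.olx.pl/oferta/bmw-example.html') sends
# {'value1': 'https://www.olx.pl/oferta/bmw-example.html'} to webhook_url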

# === pimp Windows 10 notification ===

# https://stackoverflow.com/questions/63867448/interactive-notification-windows-10-using-python
def open_url():
    try:
        webbrowser.open_new(page_url)
        print('Opening search results...')
    except:
        print('Failed to open search results. Unsupported variable type.')

# === function to scrape data ===

def pullData(page_url):

    # *NOTE: no need to wait if it's only 1 page
    # # ? can't crawl too often? works better with Otomoto limits perhaps
    # pause_duration = 3 # seconds to wait
    # print("Waiting for", pause_duration, "seconds before opening URL...")
    # with alive_bar(pause_duration, bar="circles", spinner="dots_waves") as bar:
    #     for second in range(0, pause_duration):
    #         time.sleep(1)
    #         bar()

    print("Opening page...")
    # print(page_url) # debug
    # page = urlopen(page_url, context=ssl.create_default_context(cafile=certifi.where())) # fix certificate issue # *NOTE: original
    page = request.urlopen(page_url) # get page source # *NOTE: v: olx

    print("Scraping page...")
    # soup = BeautifulSoup(page, 'html.parser') # parse the page # *NOTE: original
    soup = BeautifulSoup(page, features="lxml") # parse the page with lxml # *NOTE: v: olx

    # 'a' (append) to add lines to the existing file vs overwriting
    with open(r"olx_output/" + this_run_datetime + "/1-output.txt", "a", encoding="utf-8") as bs_output:
        # print(colored("Creating local file to store URLs...", 'green')) # colored text on Windows
        counter = 0 # counter to get # of URLs/cars
        with alive_bar(bar="classic2", spinner="classic") as bar: # progress bar
            for link in soup.find_all("a", {"class": "thumb"}): # each offer thumbnail links to the offer page
                bs_output.write(link.get('href'))
                counter += 1 # counter ++
                bar() # progress bar ++
                # print("Adding", counter, "URL to file...")
    print("Successfully added", counter, "cars to file.")

# === run URL in function ^ ===

# TODO: ?
# *NOTE 1/2: perhaps no longer needed as of 0.10?
try:
    open(r"olx_output/" + this_run_datetime + "/1-output.txt", "w").close() # clean main file at start
except: # crashes on 1st run when the file is not yet created
    print("Nothing to clean, moving on...")
# *NOTE 2/2: ^
# TODO: ^

# *NOTE: number of search results pages
# page = urlopen(page_url, context=ssl.create_default_context(cafile=certifi.where())) # fix certificate issue; open URL
# soup = BeautifulSoup(page, 'html.parser') # parse the page

# number_of_pages_to_crawl = ([item.get_text(strip=True) for item in soup.select("span.page")]) # get page numbers from the bottom of the page
# number_of_pages_to_crawl = int(number_of_pages_to_crawl[-1]) # get the last element from the list ^ to get the max page # and convert to int
# print('How many pages are there to crawl?', number_of_pages_to_crawl)

# page_prefix = '&page='
# page_number = 1 # begin at page=1
# for page in range(1, number_of_pages_to_crawl+1):
#     print("Page number:", page_number, "/", number_of_pages_to_crawl)
#     full_page_url = f"{page_url}{page_prefix}{page_number}"
#     pullData(full_page_url)
#     page_number += 1 # go to next page
pullData(page_url) # throw URL to function

# === make file more pretty by adding new lines ===

with open(r"olx_output/" + this_run_datetime + "/1-output.txt", "r", encoding="utf-8") as scraping_output_file: # open file...
print("Reading file to clean up...")
read_scraping_output_file = scraping_output_file.read() # ... and read it

urls_line_by_line = re.sub(r"#[a-zA-Z0-9]+(?!https$):https://|https://|#[a-zA-Z0-9]+", "\n", read_scraping_output_file) # add new lines; remove IDs at the end of URL, eg '#e5c6831089'

urls_line_by_line = urls_line_by_line.replace("www", "https://www") # make text clickable again
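# the rough intent of the two steps above, with hypothetical hrefs: the raw file is one
# long run of concatenated URLs like '...bmw-1.html#e5c6831089https://www.olx.pl/oferta/bmw-2.html',
# and after the re.sub() + replace() each offer should sit on its own line as a clickable https://www... URL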

print("Cleaning the file...")

# === switch to a list to remove duplicates & sort ===

carList = urls_line_by_line.split() # remove "\n"; add to list
uniqueCarList = list(set(carList)) # remove duplicates
print(f'There are {len(uniqueCarList)} cars in total.')

print("File cleaned up. New lines added.")

with open(r"olx_output/" + this_run_datetime + "/2-clean.txt", "w", encoding="utf-8") as clean_file:
for element in sorted(uniqueCarList): # sort URLs
clean_file.write("%s\n" % element) # write to file

# === tailor the results by using a keyword: brand, model (possibly also engine size etc) ===
# TODO: mostly broken as of 0.9; core works

# regex_user_input = input("Jak chcesz zawęzić wyniki? Możesz wpisać markę (np. BMW) albo model (np. E39) >>> ") # for now using brand as quesion but user can put any one-word keyword
regex_user_input = ""
if len(regex_user_input) == 0:
print("Keyword wasn't provided - not searching.")
else:
regex_user_input = regex_user_input.strip() # strip front & back
print("Opening file to search for keyword:", regex_user_input)
reg = re.compile(regex_user_input) # matches "KEYWORD" in lines
counter2 = 0 # another counter to get the # of search results
with open(r'olx_output/' + this_run_datetime + '/3-search_keyword.txt', 'w') as output: # open file for writing
print("Searching for keyword...")
with open(r'olx_output/' + this_run_datetime + '/2-clean.txt', 'r', encoding='UTF-8') as clean_no_dupes_file: # look for keyword in the clean file without empty lines and duplicates
with alive_bar(bar="circles", spinner="dots_waves") as bar:
for line in clean_no_dupes_file: # read file line by line
if reg.search(line): # if there is a match anywhere in a line
output.write(line) # write the line into the new file
counter2 += 1 # counter ++
bar() # progress bar ++
# print ("Progress:", counter2)
if counter2 == 1:
print("Found", counter2, "result.")
# if platform == "win32":
# toaster.show_toast("otomoto-scraper", "Found " + str(counter2) +
# " result.", icon_path="icons/www.ico", duration=None)
else:
print("Found", counter2, "results.")
# if platform == "win32":
# toaster.show_toast("otomoto-scraper", "Found " + str(counter2) +
# " results.", icon_path="icons/www.ico", duration=None)

    # === open keyword/search results ^ in browser ===

    if counter2 != 0:
        # user_choice_open_urls = input("Do you want to open the links in a browser? [y/n] >>> ")
        user_choice_open_urls = 'n'
        if user_choice_open_urls == 'y':
            with open("olx_output/" + this_run_datetime + "/3-search_keyword.txt", 'r', encoding='UTF-8') as search_results:
                counter3 = 0
                print("Opening URLs in browser...")
                with alive_bar(bar="circles", spinner="dots_waves") as bar:
                    for line in search_results: # go through the file
                        webbrowser.open(line) # open URL in browser
                        counter3 += 1
                        bar()
                if counter3 != 1: # correct grammar for multiple (URLs; them; they)
                    print("Opened", counter3, "URLs in the browser. Go and check them before they go 404 ;)")
                    # if platform == "win32":
                    #     toaster.show_toast("otomoto-scraper", "Opened " + str(counter3) + " URLs.", icon_path="icons/www.ico", duration=None)
                else: # correct grammar for 1 (URL; it)
                    print("Opened", counter3, "URL in the browser. Go and check it before it goes 404 ;)")
                    # if platform == "win32":
                    #     toaster.show_toast("otomoto-scraper", "Opened " + str(counter3) + " URL.", icon_path="icons/www.ico", duration=None)
        else:
            # print("Ok - URLs saved in 'olx_output/search-output.txt' anyway.")
            print("Ok - URLs saved to a file.")
            # print("Script run time:", datetime.now()-start)
            # sys.exit()
    else:
        print("No search results found.")

# === compare files ===

try: # check if counter2 exists, ie whether a keyword was provided
    counter2
except NameError:
    print("Variable not defined. Keyword wasn't provided.")

    try: # no keyword - diff the full results of this run against the previous one
        file_previous_run = open('olx_output/' + previous_run_datetime + '/2-clean.txt', 'r') # 1st file
        file_current_run = open('olx_output/' + this_run_datetime + '/2-clean.txt', 'r') # 2nd file

        f1 = [x for x in file_previous_run.readlines()] # list of lines from 1st file
        f2 = [x for x in file_current_run.readlines()] # list of lines from 2nd file

        diff = [line for line in f1 if line not in f2] # lines present only in 1st file
        diff1 = [line for line in f2 if line not in f1] # lines present only in 2nd file
        # *NOTE: file2 must be > file1
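        # eg with hypothetical lines f1 = ['carA\n', 'carB\n'] and f2 = ['carB\n', 'carC\n']:
        # diff == ['carA\n'] (gone since last run), diff1 == ['carC\n'] (new this run)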

        if len(diff1) == 0: # if the list of differences is empty, there are no new cars
            print('Files are the same.')
            # if platform == "darwin":
            #     pync.notify('No new cars.', title='OLX', open=page_url, contentImage="https://i.postimg.cc/t4qh2n6V/car.png") # appIcon="" doesn't work, using contentImage instead
            # elif platform == "win32":
            #     toaster.show_toast(title="OLX", msg='No new cars.', icon_path="../icons/car.ico", duration=None, threaded=True, callback_on_click=open_url) # duration=None - leave notification in Notification Center; threaded=True - let the rest of the script run while the notification is still active
        else:
            with open('olx_output/diff/diff-' + this_run_datetime + '.txt', 'w') as w:
                counter4 = 0 # counter
                with alive_bar(bar="circles", spinner="dots_waves") as bar:
                    for url in diff1: # go piece by piece through the differences
                        w.write(url) # write to file
                        run_ifttt_automation(url) # run IFTTT automation with URL
                        # print('Running IFTTT automation...')
                        bar()
                        counter4 += 1 # counter++
                if counter4 <= 0: # should not fire
                    print('No new cars since last run.')
                    # if platform == "darwin":
                    #     pync.notify('No new cars.', title='OLX', open=page_url, contentImage="https://i.postimg.cc/t4qh2n6V/car.png") # appIcon="" doesn't work, using contentImage instead
                    # elif platform == "win32":
                    #     toaster.show_toast(title="OLX", msg='No new cars.', icon_path="../icons/car.ico", duration=None, threaded=True, callback_on_click=open_url) # duration=None - leave notification in Notification Center; threaded=True - let the rest of the script run while the notification is still active
                else:
                    print(counter4, "new cars found since last run! Go check them now!")
                    if platform == "darwin":
                        pync.notify(f'New cars: {counter4}', title='OLX', open=page_url, contentImage="https://i.postimg.cc/t4qh2n6V/car.png", sound="Funk") # appIcon="" doesn't work, using contentImage instead
                    elif platform == "win32":
                        toaster.show_toast(title="OLX", msg=f'New cars: {counter4}', icon_path="../icons/car.ico", duration=None, threaded=True, callback_on_click=open_url) # duration=None - leave notification in Notification Center; threaded=True - let the rest of the script run while the notification is still active
                    time.sleep(5)
                    webbrowser.open(page_url)

    except (IOError, NameError): # NameError: first run - previous_run_datetime doesn't exist yet
        print("No previous data - can't diff.")

else:
    print("Keyword was provided; search was successful.")
    # TODO: same as above but with /[x]-search_keyword.txt

# === run time ===

# run_time = datetime.now()-start
end = time.time() # run time end
run_time = round(end-start,2)
print("Script run time:", run_time, "seconds.")