-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape_upcoming.py
92 lines (69 loc) · 2.91 KB
/
scrape_upcoming.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import os
from database import update_upcoming
# Set up the single module-level Chrome webdriver; the scraping functions in
# this file reach it through the global name `driver`.
# NOTE: these statements run at import time, so importing this module starts a browser.
opt = webdriver.ChromeOptions()
# Ask Chrome to run in headless mode.
opt.add_argument('headless')
# NOTE(review): ChromeDriverManager().install() is expected to return the path of a
# chromedriver binary for the service — confirm against webdriver_manager docs.
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=opt)
def scrape_upcoming():
    """
    Gather every upcoming movie listed by Village Cinemas and store the results.

    Fetches the movie page links from the upcoming-movies listing, scrapes each
    page in turn while printing progress to the terminal, and finally hands the
    collected movie dictionaries to update_upcoming.
    """
    listing_url = 'https://www.villagecinemas.gr/el/tainies/prosehos/?pg=0'
    upcoming_movie_links = get_upcoming_movie_links(listing_url)
    total = len(upcoming_movie_links)

    upcoming_movies = []
    for position, link in enumerate(upcoming_movie_links, start=1):
        # Terminal info: wipe the screen, then show progress and the current link
        clear()
        print(f"Collecting data... {position}/{total}")
        print(link + "\n")

        # Get movie data
        upcoming_movies.append(collect_movie_data(link))

    update_upcoming(upcoming_movies)
def get_upcoming_movie_links(url):
    """
    Collect the movie page links found on a listing page.

    Args:
        url (str): Address of the page that lists the movies.

    Returns:
        List with the "href" attribute of every movie title anchor, in the
        order the elements were returned by the driver.
    """
    driver.get(url)
    title_anchors = driver.find_elements(By.CSS_SELECTOR, "div[class='box_title'] > h2 > a")
    return [anchor.get_attribute("href") for anchor in title_anchors]
def collect_movie_data(link):
    """
    Collect movie data from the given movie page link.

    Args:
        link (str): Movie page link.

    Returns:
        Dictionary with the keys "title", "poster", "premier", "description",
        "genre" and "trailer". "trailer" is an empty string when the page has
        no trailer iframe.

    Raises:
        NoSuchElementException: If any element other than the trailer iframe
            cannot be located on the page.
    """
    # Local import: selenium is already a dependency of this file, and keeping
    # the import here means the block does not rely on a top-of-file change.
    from selenium.common.exceptions import NoSuchElementException

    driver.get(link)
    current_movie = {
        "title": driver.find_element(By.CSS_SELECTOR, "#movie_container > div.title2 > h2").accessible_name,
        "poster": driver.find_element(By.CSS_SELECTOR,"#ContentPlaceHolderDefault_ContentPlaceHolder1_movie_3_MainImage").get_attribute("src"),
        "premier": driver.find_element(By.CSS_SELECTOR,"#movie_container > div.details > div.dtls.FloatLeft > div.info > div.info_txt > table > tbody > tr:nth-child(5) > td:nth-child(2)").accessible_name,
        # Collapse line breaks so the description is stored as a single line.
        "description": driver.find_element(By.CSS_SELECTOR, ".summary > div:nth-child(2)").text.replace('\n', ' '),
        "genre": driver.find_element(By.CSS_SELECTOR,".info_txt > table:nth-child(1) > tbody:nth-child(1) > tr:nth-child(7) > td:nth-child(2)").accessible_name
    }
    # The trailer is optional: only a missing iframe falls back to "".
    # Anything else (Ctrl-C, a dead browser session, ...) must propagate
    # instead of being silently swallowed by a bare except.
    try:
        current_movie["trailer"] = driver.find_element(By.CSS_SELECTOR,"#movie_container > div.video > iframe").get_attribute("src")
    except NoSuchElementException:
        current_movie["trailer"] = ""
    return current_movie
def clear():
    """Clear the terminal screen."""
    # `clear` when os.name is 'posix'; every other os.name gets `cls`.
    command = 'clear' if os.name == 'posix' else 'cls'
    os.system(command)
# Script entry point: run the scrape, then always shut the browser down.
if __name__ == "__main__":
    try:
        scrape_upcoming()
    finally:
        # The module-level driver owns a Chrome/chromedriver process; quit it
        # even when scraping fails so no browser is left running.
        driver.quit()