Commit 5b85942

add content

Benjamin committed Jun 21, 2024
1 parent 6d96d9d commit 5b85942
Showing 29 changed files with 1,092 additions and 5 deletions.
14 changes: 11 additions & 3 deletions .gitignore
@@ -1,3 +1,8 @@
# Ebay Scraper
csv/
log/
keys.json

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
@@ -106,10 +111,8 @@ ipython_config.py
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
@@ -160,3 +163,8 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
mail_keys.json
pg_keys.json
docker.command
venv
links.json
2 changes: 1 addition & 1 deletion LICENSE
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2024 Sutamis
Copyright (c) 2023 Sutamis

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
3 changes: 2 additions & 1 deletion README.md
@@ -1 +1,2 @@
# EbayScrapper
# EbayScrap
Scraping eBay for data
7 changes: 7 additions & 0 deletions common.py
@@ -0,0 +1,7 @@
LOG_DIRECTORY: str = "./log"
LOGNAME_PATTERN: str = "ebay-{subject}-{date}.log"

MAIL_CREDENTIAL_FILENAME: str = "mail_keys.json"
POSTGRESS_CREDENTIAL_FILENAME: str = "pg_keys.json"
LINKS_FILENAME: str = "links.json"
CONFIG_DATA: str = "config.json"
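
For reference, a minimal sketch of what the two credential files might contain; the field names follow the MailCredential and PostgressCredential dataclasses in models.py, and every value below is a placeholder:

pg_keys.json (passed straight to psycopg2.connect as keyword arguments):

    {
        "dbname": "ebay",
        "user": "postgres",
        "password": "change-me",
        "host": "localhost",
        "port": "5432"
    }

mail_keys.json:

    {
        "sender": "sender@example.com",
        "password": "app-password",
        "receiver": "receiver@example.com"
    }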
72 changes: 72 additions & 0 deletions csvtodb.py
@@ -0,0 +1,72 @@

if __name__ == "__main__":
    import os, sys, json, psycopg2, csv
    import offer
    from common import POSTGRESS_CREDENTIAL_FILENAME
    from queries import INSERT_QUERY
    from scrap import notnull, nullable

    # Load the PostgreSQL credentials and open a connection.
    postgress_path: str = os.path.abspath(POSTGRESS_CREDENTIAL_FILENAME)
    with open(postgress_path, 'r') as key_file:
        postgress_data: dict = json.loads(key_file.read())

    connection = psycopg2.connect(**postgress_data)
    cursor = connection.cursor()

    filepath: str = sys.argv[1]
    offer_type: str = sys.argv[2]  # value for the table's type column; avoids shadowing the builtin `type`

    try:
        with open(filepath, "r", encoding="UTF-8") as file:
            reader = csv.reader(file)
            next(reader)  # skip the header row

            for id, date, is_new, title, star, subtitle, price, pseudo, sales_count, satisfaction, bid_count, purchase, shipping, country in reader:
                date = date[:10].replace("/", "-")  # keep the date part only, as YYYY-MM-DD
                title = title.replace("'", "_")     # neutralize single quotes for the query

                if id == "None" or title == "None":
                    continue

                # Map the scraper's sentinel values back to SQL NULLs.
                is_new = None if is_new == "None" else is_new
                star = None if star == "-1.0" else float(star)
                subtitle = subtitle.replace("'", "_")
                bid_count = None if bid_count == "-1" else int(bid_count)
                purchase = None if purchase == "None" else purchase

                cursor.execute(INSERT_QUERY.format(
                    id = notnull(id),
                    title = notnull(title),
                    type = notnull(offer_type),
                    current_date = notnull(date),
                    is_new = nullable(is_new),
                    star = nullable(star),
                    subtitle = nullable(subtitle),
                    price = nullable(price),
                    pseudo = nullable(pseudo),
                    sales_count = nullable(sales_count),
                    satisfaction = nullable(satisfaction),
                    bid_count = nullable(bid_count),
                    purchase = nullable(purchase),
                    shipping = nullable(shipping),
                    country = nullable(country)
                ))

        connection.commit()
    except Exception as error:
        print(error)
    finally:
        # Close the cursor before the connection that owns it.
        if "cursor" in locals():
            cursor.close()
        if "connection" in locals():
            connection.close()
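
A usage sketch (the CSV file name below is made up): the script takes the CSV path and the offer type as positional arguments, and the file must begin with a header row followed by the fourteen columns unpacked above, in that order.

    python csvtodb.py csv/ebay-watches-2024-06-21.csv watches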


18 changes: 18 additions & 0 deletions db.py
@@ -0,0 +1,18 @@
import psycopg2, json, os

from common import POSTGRESS_CREDENTIAL_FILENAME

def create_db():
    # Load the PostgreSQL credentials and open a connection.
    postgress_path: str = os.path.abspath(POSTGRESS_CREDENTIAL_FILENAME)
    with open(postgress_path, 'r') as key_file:
        postgress_data: dict = json.loads(key_file.read())

    connection = psycopg2.connect(**postgress_data)
    cursor = connection.cursor()

    # Run the schema script (drops and recreates the offer table).
    with open("db.sql", "r") as file:
        cursor.execute(file.read())

    connection.commit()
    cursor.close()
    connection.close()

if __name__ == "__main__":
    create_db()
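
Running python db.py executes db.sql against the configured database. Note that it is destructive: db.sql drops any existing offer table before recreating it.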
19 changes: 19 additions & 0 deletions db.sql
@@ -0,0 +1,19 @@
DROP TABLE IF EXISTS offer;
CREATE TABLE offer (
    id            VARCHAR,
    date          DATE,
    title         VARCHAR,
    type          VARCHAR,
    is_new        VARCHAR NULL,
    star          DECIMAL NULL,
    subtitle      VARCHAR NULL,
    price         DECIMAL NULL,
    pseudo        VARCHAR NULL,
    sales_count   INTEGER NULL,
    satisfaction  DECIMAL NULL,
    bid_count     INTEGER NULL,
    purchase      VARCHAR NULL,
    shipping      VARCHAR NULL,
    country       VARCHAR NULL,
    CONSTRAINT pk_offer PRIMARY KEY (id, date, title, type)
);
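
The composite primary key (id, date, title, type) lets the same eBay listing be stored once per day and per search type, so price history accumulates across daily runs while a duplicate insert for the same day is rejected by pk_offer.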
65 changes: 65 additions & 0 deletions models.py
@@ -0,0 +1,65 @@
from dataclasses import dataclass

@dataclass
class GenericDataclass:
    pass

@dataclass
class WebsiteConfig:
    filename: str
    wait_time: int
    update_interval: int
    url: str

@dataclass
class MailCredential:
    sender: str
    password: str
    receiver: str

@dataclass
class PostgressCredential:
    dbname: str
    user: str
    password: str
    host: str
    port: str

@dataclass
class Offer:
    id: str
    date: str
    title: str
    type: str
    is_new: str
    star: float
    subtitle: str
    price: float
    pseudo: str
    sales_count: int
    satisfaction: float
    bid_count: int
    purchase: str
    shipping: str
    country: str

@dataclass
class ScrapStats:
    skipped: int = 0

@dataclass
class RunStats:
    total: int = 0
    exist: int = 0
    added: int = 0


class StatsSingleton:
    _stats = None

    @staticmethod
    def get_instance() -> ScrapStats:
        # Lazily create the shared ScrapStats on first access.
        if StatsSingleton._stats is None:
            StatsSingleton._stats = ScrapStats()
        return StatsSingleton._stats

    @staticmethod
    def reset():
        StatsSingleton._stats = None
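
A minimal usage sketch of the singleton; the call site is hypothetical, since scrap.py is not shown in this commit:

    from models import StatsSingleton

    stats = StatsSingleton.get_instance()  # lazily creates the shared ScrapStats
    stats.skipped += 1                     # every caller sees the same instance
    StatsSingleton.reset()                 # the next get_instance() starts fresh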
113 changes: 113 additions & 0 deletions multiapp.py
@@ -0,0 +1,113 @@
import os
import sys
import time
import json
import logging
from logging import Logger
import datetime as dt
from dataclasses import asdict

import psycopg2
from prometheus_client import start_http_server, Counter

import notify
from scrap import run
from utils import get_json_as
from models import (
    PostgressCredential,
    MailCredential,
)
from common import (
    POSTGRESS_CREDENTIAL_FILENAME,
    MAIL_CREDENTIAL_FILENAME,
    LOG_DIRECTORY,
    LINKS_FILENAME,
    LOGNAME_PATTERN
)

def get_logger():
    now = dt.datetime.now()
    current_date: str = now.strftime("%Y-%m-%d")

    log_filename = LOGNAME_PATTERN.format(subject="log", date=current_date)

    # Note: basicConfig is a no-op after its first call, so the log file
    # keeps the date of the first iteration for the lifetime of the process.
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        filename=os.path.join(LOG_DIRECTORY, log_filename)
    )

    return logging.getLogger("log")

if __name__ == "__main__":
    # Prometheus counters, served by start_http_server below.
    skipped = Counter('ebayscrap_skipped', 'skip offer')
    total = Counter('ebayscrap_total', 'total offer')
    exist = Counter('ebayscrap_exist', 'exist offer')
    added = Counter('ebayscrap_added', 'added offer')

    start_http_server(8000)

    if len(sys.argv) > 1:
        links_file = sys.argv[1]
        if os.path.exists(links_file) and os.path.isfile(links_file):
            if not os.path.exists(LOG_DIRECTORY):
                os.mkdir(LOG_DIRECTORY)

            postgress = get_json_as(POSTGRESS_CREDENTIAL_FILENAME, PostgressCredential)
            mail = get_json_as(MAIL_CREDENTIAL_FILENAME, MailCredential)

            # Read the links file passed on the command line.
            with open(links_file, "r") as file:
                links: dict = json.loads(file.read())

            wait_time: int = 3
            while True:
                for index in range(wait_time):
                    print(f"Time left: {wait_time - index}s")
                    time.sleep(1)

                logger: Logger = get_logger()

                try:
                    logger.info("Connecting to database...")

                    connection = psycopg2.connect(**asdict(postgress))
                    cursor = connection.cursor()

                    logger.info("Connected to database!")
                    logger.info("Scraping begins...")

                    for subject, link in links.items():
                        scrap_stats, run_stats = run(subject, link, cursor)

                        # ScrapStats carries `skipped`; RunStats carries the run totals.
                        skipped.inc(scrap_stats.skipped)

                        total.inc(run_stats.total)
                        exist.inc(run_stats.exist)
                        added.inc(run_stats.added)

                        logger.info(f"Scraping ended with {json.dumps(asdict(run_stats))}")
                        logger.info("Committing to database...")

                        connection.commit()

                    logger.info("Scraping succeeded!")

                except Exception as error:
                    logger.exception(error)

                    notify.send_message(
                        mail.sender,
                        mail.password,
                        mail.receiver,
                        f"Error ebayscrap with {subject}",
                        f"Error with {error} with {json.dumps(asdict(run_stats))}"
                    )
                finally:
                    logger.info("Closing database connection...")

                    if "cursor" in locals():
                        cursor.close()
                    if "connection" in locals():
                        connection.close()

                    logger.info("Database connection closed!")
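
With prometheus_client, start_http_server(8000) serves the four ebayscrap_* counters in Prometheus text format at http://localhost:8000/metrics, ready to be scraped by a Prometheus instance.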
31 changes: 31 additions & 0 deletions notify.py
@@ -0,0 +1,31 @@
import smtplib
from typing import Optional
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase
from email import encoders

SMTP_SERVER = "smtp.gmail.com"
SMTP_PORT = 587

def send_message(_from: str, password: str, to: str, subject: str, message: str, file_path: Optional[str] = None):
    msg = MIMEMultipart()
    msg['From'] = _from
    msg['To'] = to
    msg['Subject'] = subject
    msg.attach(MIMEText(message, 'plain'))

    if file_path is not None:
        # Attach the file as base64-encoded binary content.
        with open(file_path, 'rb') as file:
            part = MIMEBase('application', 'octet-stream')
            part.set_payload(file.read())
        encoders.encode_base64(part)
        part.add_header('Content-Disposition', f'attachment; filename={file_path}')
        msg.attach(part)

    text = msg.as_string()

    # Authenticate over STARTTLS and send.
    server = smtplib.SMTP(SMTP_SERVER, SMTP_PORT)
    server.starttls()
    server.login(_from, password)
    server.sendmail(_from, to, text)
    server.quit()
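
A hypothetical call, mirroring how multiapp.py uses it; with Gmail over STARTTLS an app password is generally required rather than the account password, and all values below are placeholders:

    import notify

    notify.send_message(
        "sender@gmail.com",      # placeholder sender address
        "app-password",          # placeholder app password
        "receiver@example.com",  # placeholder recipient
        "EbayScrap test",
        "Hello from notify.py",
    )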