Skip to content

Commit

Permalink
implement async parser
Browse files Browse the repository at this point in the history
  • Loading branch information
fmorato authored and Felipe Morato committed Dec 18, 2019
1 parent 761074f commit 166772f
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 29 deletions.
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,10 @@ vahti = "vahti:cli.main"

[tool.poetry.dependencies]
python = "^3.6"
requests = "^2.21"
beautifulsoup4 = "^4.7"
rope = "^0.14.0"
aiohttp = "^3.5"
lxml = "^4.4"

[tool.poetry.dev-dependencies]
# pre-commit = "^1.10"
Expand Down
60 changes: 45 additions & 15 deletions vahti/parser.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,19 @@
import logging
import requests
from bs4 import BeautifulSoup
from aiohttp import ClientSession, ClientConnectorError
from vahti.cliargs import arg

logger = logging.getLogger("vahti.parser")

# Choose the BeautifulSoup backend once at import time: use lxml when it can
# be imported, otherwise fall back to the parser bundled with Python.
try:
    import lxml  # pylint: disable=unused-import
except ImportError:
    HTML_PARSER = "html.parser"
else:
    HTML_PARSER = "lxml"



@arg.query
class Parser:
Expand All @@ -14,29 +23,50 @@ def __init__(self, params=None, config=None):
self.url_template = "".format
self.params = params or {}
self.config = config or {}
self.session = requests.Session()
self.session.headers.update({"User-Agent": "Mozilla/5.0 (X11; Linux i586; rv:62.0) Gecko/20100101 Firefox/62.0"})
self.last_url = ""
self.session = None
self.url = None
self.last_url = None

def start(self):
    """Create the HTTP client session and store it on ``self.session``.

    The session is an aiohttp ``ClientSession`` whose default headers carry
    a browser-like User-Agent. Any session previously stored on
    ``self.session`` is replaced by the new one.
    """
    user_agent = "Mozilla/5.0 (X11; Linux i586; rv:62.0) Gecko/20100101 Firefox/62.0"
    default_headers = {"User-Agent": user_agent}
    self.session = ClientSession(headers=default_headers)

async def close(self):
    """Close the HTTP client session, if one has been created.

    ``self.session`` is ``None`` until ``start()`` has run, so calling this
    before ``start()`` is treated as a no-op instead of failing on
    ``None.close()``.
    """
    if self.session is not None:
        await self.session.close()

def set_query(self, query):
    """Receive the query for the upcoming request; does nothing here.

    NOTE(review): presumably an extension point that concrete parsers
    override to translate ``query`` into request parameters -- confirm
    against the subclasses.
    """
    return None

def query(self, url=""):
url = url or self.url_template(**self.config)
logger.debug(f"query {url} with params {self.params}")
r = requests.get(url, params=self.params)
self.last_url = r.url
logger.debug(f"queried {r.url} status {r.status_code}")
async def get(self, url, params=None):
    """Send a GET request through ``self.session`` and return the body text.

    Args:
        url: Address to request.
        params: Optional query parameters passed on to the session.

    Returns:
        The response body as text, or ``None`` when aiohttp raises
        ``ClientConnectorError`` (the error is logged, not re-raised).
    """
    try:
        async with self.session.get(url, params=params) as response:
            logger.debug(f"queried {response.url} status {response.status}")
            body = await response.text()
    except ClientConnectorError as e:
        logger.error(e)
        return None
    return body

return r.text
def update_url(self, url=None):
    """Store the URL for the next request in ``self.url``.

    A truthy ``url`` is stored as given. Otherwise the URL is produced by
    calling ``self.url_template`` with the entries of ``self.config`` as
    keyword arguments.
    """
    if url:
        self.url = url
    else:
        self.url = self.url_template(**self.config)

async def query(self, url=None):
    """Resolve the request URL, fetch it and return the response text.

    Args:
        url: Optional explicit URL. When omitted or falsy, ``update_url``
            builds the URL from ``self.url_template`` and ``self.config``.

    Returns:
        Whatever ``self.get`` returns for the resolved URL and
        ``self.params``.
    """
    self.update_url(url)
    # Log the resolved URL rather than the raw argument: ``url`` is None
    # whenever the caller relies on the template, which made the previous
    # message read "query None ...".
    logger.debug(f"query {self.url} with params {self.params}")
    return await self.get(self.url, self.params)

@staticmethod
def parse(html):
logger.debug("parsing html")
return BeautifulSoup(html, features="html.parser")
return BeautifulSoup(html, features=HTML_PARSER)

def run(self, query, **params):
async def run(self, query, **params):
self.start()
self.set_query(query)
self.params.update(params)
html = self.query()
return self.parse(html)
html = await self.query()
await self.close()
if html:
return self.parse(html)
else:
return []
37 changes: 24 additions & 13 deletions vahti/vahti.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
import asyncio
from vahti.helpers import extend_dict, persist_db

logger = logging.getLogger("vahti")
Expand All @@ -23,22 +24,24 @@ def _update_new(self, new):
if new:
self.new = extend_dict(self.new, new)

def print_result(self, string_format="{date:12} {title:40} {price:>6} {link}"):
if getattr(self.config, "all", False):
def print_result(self, string_format="{date:12} {title:40} {price:>6} {link}".format):
if self.config["all"] or self.config["no_db"]:
to_print = self.result
else:
to_print = self.new

for _, item in to_print.items():
if not item:
logger.info("No results found")
return
print(string_format.format(**item))
if not to_print:
logger.info("No results to print")
return

def run_query(self, query):
result = self.parser.run(query)
string_format = self.parser.item_format or string_format
for _, item in to_print.items():
print(string_format(**item))

no_db = getattr(self.config, "no-db", False)
async def run_query(self, query):
result = await self.parser.run(query)
new = {}
no_db = self.config["no_db"]
if not no_db:
new = persist_db(query, result)

Expand All @@ -50,11 +53,19 @@ def run_query(self, query):
self._update_result(result)
self._update_new(new)

def run_queries(self):
async def run_queries(self):
# https://stackoverflow.com/questions/49118449/python-3-6-async-get-requests-in-with-aiohttp-are-running-synchronously
requests = []
for query in self.queries:
logger.debug(f"running {query}")
self.run_query(query)
requests.append(self.run_query(query))

await asyncio.gather(*requests)

def run(self):
self.run_queries()
loop = asyncio.get_event_loop()
if self.queries:
loop.run_until_complete(self.run_queries())
else:
loop.run_until_complete(self.run_query(""))
return 0

0 comments on commit 166772f

Please sign in to comment.