Skip to content

Commit

Permalink
fix extracting img urls
Browse files Browse the repository at this point in the history
  • Loading branch information
nrenzoni committed May 17, 2020
1 parent d8a22bb commit a017ad7
Showing 1 changed file with 17 additions and 8 deletions.
25 changes: 17 additions & 8 deletions safaribooks.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/usr/bin/env python3
# coding: utf-8
import pathlib
import re
import os
import sys
Expand Down Expand Up @@ -348,6 +349,8 @@ def __init__(self, args):
self.display.info("Retrieving book chapters...")
self.book_chapters = self.get_book_chapters()

self.images = self.extract_image_links(self.book_chapters)

self.chapters_queue = self.book_chapters[:]

if len(self.book_chapters) > sys.getrecursionlimit():
Expand All @@ -373,7 +376,6 @@ def __init__(self, args):
self.filename = ""
self.chapter_stylesheets = []
self.css = []
self.images = []

self.display.info("Downloading book contents... (%s chapters)" % len(self.book_chapters), state=True)
self.BASE_HTML = self.BASE_01_HTML + (self.KINDLE_HTML if not args.no_kindle else "") + self.BASE_02_HTML
Expand Down Expand Up @@ -609,16 +611,15 @@ def get_html(self, url):
def url_is_absolute(url):
    """Report whether *url* includes a network location (host) part."""
    # urlparse leaves ``netloc`` empty for relative links such as
    # "images/fig.png", so its truthiness is the whole test.
    network_location = urlparse(url).netloc
    return bool(network_location)

@staticmethod
def is_image_link(url: str):
    """Return True when *url* ends in a recognised image file extension.

    Only the final suffix of the path (the text after the last dot) is
    examined, and the comparison ignores case, so ``cover.JPG`` and
    ``figure.jpeg`` are both accepted.

    :param url: link or file path to inspect.
    :return: ``True`` for ``jpg``, ``jpeg``, ``png`` and ``gif`` suffixes,
        ``False`` otherwise (including when there is no suffix at all).
    """
    # ``suffix`` is the complete extension (".jpeg"), so "jpeg" has to be
    # listed in full; a three-letter tail such as "peg" can never match it.
    extension = pathlib.Path(url).suffix[1:].lower()
    return extension in ("jpg", "jpeg", "png", "gif")

def link_replace(self, link):
if link and not link.startswith("mailto"):
if not self.url_is_absolute(link):
if "cover" in link or "images" in link or "graphics" in link or \
link[-3:] in ["jpg", "peg", "png", "gif"]:
link = urljoin(self.base_url, link)
if link not in self.images:
self.images.append(link)
self.display.log("Crawler: found a new image at %s" % link)

if any(x in link for x in ["cover", "images", "graphics"]) or \
self.is_image_link(link):
image = link.split("/")[-1]
return "Images/" + image

Expand Down Expand Up @@ -1044,6 +1045,14 @@ def create_epub(self):
shutil.make_archive(zip_file, 'zip', self.BOOK_PATH)
os.rename(zip_file + ".zip", os.path.join(self.BOOK_PATH, self.book_id) + ".epub")

@staticmethod
def extract_image_links(chapters):
    """Collect the absolute URL of every image referenced by *chapters*.

    Each chapter's ``images`` entries are resolved against that chapter's
    ``asset_base_url`` with ``urljoin``. A URL referenced by several
    chapters is returned only once, at the position where it was first
    seen, so callers do not process the same image repeatedly.

    :param chapters: iterable of chapter mappings (assumed to be dicts
        carrying ``images`` and ``asset_base_url`` keys -- confirm against
        the code that builds them).
    :return: list of unique image URLs in first-seen order.
    """
    seen = set()
    imgs = []
    for chapter in chapters:
        # A chapter with a missing or empty ``images`` entry contributes
        # nothing; ``asset_base_url`` is only read when there is an image
        # to resolve.
        for img_url in chapter.get("images") or ():
            link = urljoin(chapter["asset_base_url"], img_url)
            if link not in seen:
                seen.add(link)
                imgs.append(link)
    return imgs


# MAIN
if __name__ == "__main__":
Expand Down

0 comments on commit a017ad7

Please sign in to comment.