Skip to content

Commit

Permalink
add model file + update crawler to download image with bs4
Browse files Browse the repository at this point in the history
  • Loading branch information
Mirtia committed Jun 24, 2023
1 parent e7db1b1 commit e1d4702
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 6 deletions.
Empty file added model.py
Empty file.
31 changes: 25 additions & 6 deletions wikimedia_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import aiohttp
import requests
from bs4 import BeautifulSoup

from crawler import Crawler

Expand Down Expand Up @@ -62,27 +63,45 @@ def read_input_file(self):

async def download_image(self, url, title, output_dir):
    """
    Download the HTML page at ``url``, extract the first ``<img>`` tag from it and save the
    referenced image to ``output_dir``.

    Failures (non-200 responses, a page without a usable ``<img>``, network or file errors) are
    printed and the function returns without raising.

    :param url: The URL of the webpage containing the image to be downloaded
    :param title: The title of the image file that will be saved to the output directory
    :param output_dir: The directory where the downloaded image will be saved
    """
    # Imported locally so this method brings its own dependency into scope.
    from urllib.parse import urljoin

    output_file = os.path.join(output_dir, title)
    async with self.sem:
        async with aiohttp.ClientSession() as aio_session:
            try:
                async with aio_session.get(url) as response:
                    if response.status != 200:
                        print(f"Error: Failed to download HTML page from {url}. Status code: {response.status}")
                        # Without the page there is no image URL to follow.
                        return
                    # Read the whole body as bytes: decoding 1024-byte chunks one by one
                    # fails when a multi-byte character is split across two chunks.
                    html = await response.read()

                # BeautifulSoup accepts bytes and works out the encoding itself.
                img_tag = BeautifulSoup(html, 'html.parser').find("img")
                image_src = img_tag.get("src") if img_tag is not None else None
                if not image_src:
                    print(f"Error: No image found in HTML page {url}.")
                    return
                # Resolve relative and protocol-relative ("//host/...") sources against the page URL.
                image_url = urljoin(url, image_src)

                async with aio_session.get(image_url) as response:
                    if response.status != 200:
                        print(f"Error: Failed to download image page from {image_url}. Status code: {response.status}")
                        return
                    # Stream the image to disk so large files are never held fully in memory.
                    with open(output_file, 'wb') as file:
                        while True:
                            chunk = await response.content.read(1024)
                            if not chunk:
                                break
                            file.write(chunk)
                    # print(f"Log: Image downloaded successfully: {output_file}")

            except Exception as e:
                # Best-effort boundary: one failed download must not abort the other tasks.
                # Either request (page or image) may have raised, so the message stays generic.
                print(f"Error: Failed to download image from {url}. Error: {e}")

0 comments on commit e1d4702

Please sign in to comment.