first commit, wikimedia crawler is fine
Mirtia committed Jun 23, 2023
1 parent 2137f5c commit e7db1b1
Showing 9 changed files with 223 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -0,0 +1,5 @@
.env
__pycache__
output
cx
.vscode
1 change: 1 addition & 0 deletions README.md
@@ -0,0 +1 @@
# Artstyle Detector
44 changes: 44 additions & 0 deletions crawler.py
@@ -0,0 +1,44 @@
import os


# The Crawler class checks that the output directory and the input file exist,
# creating the directory if needed. It is the base class for the specific crawlers.
class Crawler:

    def __init__(self, output_dir, input_file, prefix):
        self.output_dir = self.dir_exists(output_dir, prefix)
        self.input_file = self.file_exists(input_file)
        self.prefix = prefix

    @staticmethod
    def dir_exists(dir, prefix=""):
        """
        Check whether a directory exists and create it if it does not.

        :param dir: path of the directory to create
        :param prefix: optional subdirectory name appended to dir; if empty, only dir itself is created
        :return: the directory path that was created or already existed
        """
        dir_path = os.path.join(dir, prefix)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        return dir_path

    @staticmethod
    def file_exists(file):
        """
        Check whether a file exists and raise an error if it does not.

        :param file: path of the file to check for existence
        :return: the file path if it exists
        """
        if not os.path.isfile(file):
            raise FileNotFoundError("Error: Input file does not exist. Please provide an existing file.")
        return file
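
A minimal usage sketch of the base-class helpers; the paths are the ones main.py passes in, shown here only as an illustration:

from crawler import Crawler

# Creates "output/wikimedia" if it is missing and returns that path.
output_dir = Crawler.dir_exists("output", "wikimedia")

# Returns the path unchanged if the file exists, otherwise raises FileNotFoundError.
input_file = Crawler.file_exists("input/wikimedia_input")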
52 changes: 52 additions & 0 deletions google_crawler.py
@@ -0,0 +1,52 @@
import os
import json

from dotenv import load_dotenv
from google_images_search import GoogleImagesSearch

from crawler import Crawler


# The GoogleCrawler class is a subclass of Crawler that uses the Google Images Search API
# to download images for the input queries. The API limits the number of queries,
# so I ended up crawling Wikimedia Commons instead.
class GoogleCrawler(Crawler):

    def __init__(self, output_dir, input_file, prefix, parameters_file):
        super().__init__(output_dir, input_file, prefix)
        load_dotenv()
        self.api_key = os.getenv("API_KEY")
        self.cx = os.getenv("CX")
        self.gis = GoogleImagesSearch(self.api_key, self.cx)
        self.read_input_file()
        self.read_search_parameters(self.file_exists(parameters_file))

    def crawl_images(self):
        """
        Crawl images for each query, download and resize them, and save them
        to a per-query directory under the output directory.
        """
        for query in self.queries:
            self.search_parameters["q"] = query
            self.gis.search(search_params=self.search_parameters)
            query_dir = self.dir_exists(os.path.join(self.output_dir, query))
            for result in self.gis.results():
                print("Image url: ", result.url)
                result.download(query_dir)
                result.resize(1000, 1000)
                print("Image path: ", result.path)

    def read_input_file(self):
        """
        Read the input file and store each non-empty line as a query.
        """
        with open(self.input_file, mode="r", encoding="utf-8") as f:
            self.queries = [line for line in f.read().splitlines() if line]

    def read_search_parameters(self, parameters_file):
        """
        Read the search parameters from a JSON file and store them in an attribute.

        :param parameters_file: path of the JSON file containing the search parameters
        """
        with open(parameters_file, mode="r", encoding="utf-8") as f:
            self.search_parameters = json.load(f)
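
GoogleCrawler expects its credentials in a .env file loaded by python-dotenv. A minimal sketch of that setup, assuming the variable names the constructor reads (API_KEY and CX); the values themselves are placeholders:

import os
from dotenv import load_dotenv

# The .env file (listed in .gitignore) is assumed to contain lines such as:
#   API_KEY=<Google Custom Search API key>
#   CX=<custom search engine id>
load_dotenv()
assert os.getenv("API_KEY") and os.getenv("CX"), "Set API_KEY and CX in .env"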
6 changes: 6 additions & 0 deletions input/google_input
@@ -0,0 +1,6 @@
surrealism paintings
realism paintings
pointillism paintings
abstract paintings
impressionism paintings
action paintings
6 changes: 6 additions & 0 deletions input/wikimedia_input
@@ -0,0 +1,6 @@
Impressionist_paintings
Abstract_paintings
Romantic_paintings
Pointillism
Realist_paintings
Tingatinga
10 changes: 10 additions & 0 deletions main.py
@@ -0,0 +1,10 @@
from google_crawler import GoogleCrawler
from wikimedia_crawler import WikimediaCrawler


def main():
    crawler = WikimediaCrawler("output", "input/wikimedia_input", "wikimedia")
    # crawler = GoogleCrawler("output", "input/google_input", "google", "parameters.json")
    crawler.crawl_images()


if __name__ == "__main__":
    main()
11 changes: 11 additions & 0 deletions parameters.json
@@ -0,0 +1,11 @@
{
    "q": "",
    "num": 100,
    "fileType": "jpg|png",
    "rights": "cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived",
    "safe": "off",
    "imgType": "imgTypeUndefined",
    "imgSize": "large",
    "imgDominantColor": "imgDominantColorUndefined",
    "imgColorType": "imgColorTypeUndefined"
}
88 changes: 88 additions & 0 deletions wikimedia_crawler.py
@@ -0,0 +1,88 @@
import asyncio
import os
import re

import aiohttp
import requests

from crawler import Crawler


# The WikimediaCrawler class crawls images from Wikimedia Commons, listing category
# members through the MediaWiki API and downloading them concurrently with asyncio and aiohttp.
class WikimediaCrawler(Crawler):

    url = "https://commons.wikimedia.org/w/api.php"

    def __init__(self, output_dir, input_file, prefix):
        super().__init__(output_dir, input_file, prefix)
        self.session = requests.Session()
        self.read_input_file()
        self.sem = asyncio.Semaphore(10)

    def crawl_images(self):
        asyncio.run(self.__crawl_images_async())

    async def __crawl_images_async(self):
        """
        List the files in each Wikimedia Commons category and download them concurrently.
        """
        tasks = []
        for category in self.categories:
            parameters = {
                "action": "query",
                "format": "json",
                "list": "categorymembers",
                "cmtitle": "Category:" + category,
                "cmlimit": "500",
                "cmtype": "file"
            }
            response = self.session.get(url=self.url, params=parameters)
            data = response.json()
            images = data["query"]["categorymembers"]
            category_output_dir = self.dir_exists(self.output_dir, category)

            for image in images:
                image_title = image["title"].replace(" ", "_")
                image_url = "https://commons.wikimedia.org/wiki/" + image_title
                # print("Log: Image url: ", image_url)
                # print("Log: Image title: ", image_title)
                task = self.download_image(image_url, re.sub(
                    r'[<>:"/\\|?*]', "", image_title).replace(" ", "_"), category_output_dir)
                tasks.append(task)
        await asyncio.gather(*tasks)

    def read_input_file(self):
        """
        Read the input file and store each non-empty line as a category name.
        """
        with open(self.input_file, mode="r", encoding="utf-8") as f:
            self.categories = [line for line in f.read().splitlines() if line]

    async def download_image(self, url, title, output_dir):
        """
        Download a single file from the given URL and save it to the output directory.

        :param url: the URL to download
        :param title: the sanitized file name to save under output_dir
        :param output_dir: the directory where the download is saved
        """
        output_file = os.path.join(output_dir, title)
        async with self.sem:
            async with aiohttp.ClientSession() as aio_session:
                try:
                    async with aio_session.get(url) as response:
                        if response.status == 200:
                            with open(output_file, 'wb') as file:
                                while True:
                                    chunk = await response.content.read(1024)
                                    if not chunk:
                                        break
                                    file.write(chunk)
                            print(f"Log: Image downloaded successfully: {output_file}")
                        else:
                            print(f"Error: Failed to download image from {url}. Status code: {response.status}")
                except Exception as e:
                    print(f"Error: Failed to download image from {url}. Error: {e}")
