-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
first commit, wikimedia crawler is fine
- Loading branch information
Showing
9 changed files
with
223 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
.env | ||
__pycache__ | ||
output | ||
cx | ||
.vscode |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
# Artstyle Detector |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
import os | ||
|
||
# Base class for the site-specific crawlers: validates the input file and
# prepares the output directory tree.
class Crawler:

    def __init__(self, output_dir, input_file, prefix):
        """
        :param output_dir: root directory where crawled files are stored
        :param input_file: path to an existing file listing what to crawl
        :param prefix: per-crawler subdirectory created under output_dir
        :raises FileNotFoundError: if input_file does not exist
        """
        self.output_dir = self.dir_exists(output_dir, prefix)
        self.input_file = self.file_exists(input_file)
        self.prefix = prefix

    @staticmethod
    def dir_exists(dir, prefix=""):
        """
        Ensure that the directory ``dir/prefix`` exists, creating it if needed.

        :param dir: the parent directory path
        :param prefix: optional subdirectory appended to ``dir``; when empty,
            ``dir`` itself is created/returned
        :return: the joined directory path that now exists
        """
        dir_path = os.path.join(dir, prefix)
        # exist_ok avoids the check-then-create race of an explicit
        # os.path.exists() guard (the dir could appear between check and create).
        os.makedirs(dir_path, exist_ok=True)
        return dir_path

    @staticmethod
    def file_exists(file):
        """
        Check that ``file`` names an existing regular file and return it.

        :param file: path of the file to validate
        :return: the same path, if it exists
        :raises FileNotFoundError: if the path does not name an existing file
        """
        if not os.path.isfile(file):
            raise FileNotFoundError("Error: Input file does not exist. Please provide an existing file.")
        return file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
import os | ||
import json | ||
from dotenv import load_dotenv | ||
from google_images_search import GoogleImagesSearch | ||
|
||
from crawler import Crawler | ||
|
||
# Subclass of Crawler that uses the Google Custom Search image API to
# download images for each query listed in the input file. The free API tier
# has a strict daily query quota, which is why Wikimedia Commons was crawled
# instead (see WikimediaCrawler).
class GoogleCrawler(Crawler):

    def __init__(self, output_dir, input_file, prefix, parameters_file):
        """
        :param output_dir: root output directory (see Crawler)
        :param input_file: file with one search query per line
        :param prefix: subdirectory for this crawler's output
        :param parameters_file: JSON file with Google image-search parameters
        :raises FileNotFoundError: if input_file or parameters_file is missing
        :raises ValueError: if the API credentials are not configured
        """
        super().__init__(output_dir, input_file, prefix)
        load_dotenv()  # pulls API_KEY / CX from a local .env file
        self.api_key = os.getenv("API_KEY")
        self.cx = os.getenv("CX")
        # Fail fast with a clear message instead of an opaque API error later.
        if not self.api_key or not self.cx:
            raise ValueError("Error: API_KEY and CX must be set (e.g. in .env).")
        self.gis = GoogleImagesSearch(self.api_key, self.cx)
        self.read_input_file()
        self.read_search_parameters(self.file_exists(parameters_file))

    def crawl_images(self):
        """Search each query, then download and resize every result.

        Images for a query are stored under ``<output_dir>/<query>/``.
        """
        for query in self.queries:
            self.search_parameters["q"] = query
            self.gis.search(search_params=self.search_parameters)
            query_dir = self.dir_exists(os.path.join(self.output_dir, query))
            for result in self.gis.results():
                print("Image url: ", result.url)
                result.download(query_dir)
                result.resize(1000, 1000)
                print("Image path: ", result.path)

    def read_input_file(self):
        """Read one query per line into ``self.queries``.

        Blank lines are skipped; a plain newline split would keep a trailing
        empty string as a (useless) search query when the file ends with a
        newline.
        """
        with open(self.input_file, mode="r", encoding="utf-8") as f:
            self.queries = [line for line in f.read().splitlines() if line]

    def read_search_parameters(self, parameters_file):
        """
        Load Google image-search parameters from a JSON file into
        ``self.search_parameters``.

        :param parameters_file: path of the JSON file with the search parameters
        """
        with open(parameters_file, mode="r", encoding="utf-8") as f:
            self.search_parameters = json.load(f)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
surrealism paintings | ||
realism paintings | ||
pointillism paintings
abstract paintings | ||
impressionism paintings | ||
action paintings |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
Impressionist_paintings | ||
Abstract_paintings | ||
Romantic_paintings | ||
Pointillism
Realist_paintings | ||
Tingatinga |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
from google_crawler import GoogleCrawler | ||
from wikimedia_crawler import WikimediaCrawler | ||
|
||
def main():
    """Entry point: crawl Wikimedia Commons for the configured categories."""
    # Swap in the Google crawler instead if API quota allows:
    # crawler = GoogleCrawler("output", "input/google_input", "google", "parameters.json")
    wikimedia = WikimediaCrawler("output", "input/wikimedia_input", "wikimedia")
    wikimedia.crawl_images()


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
{ | ||
"q": "", | ||
"num": 100, | ||
"fileType": "jpg|png", | ||
"rights": "cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived", | ||
"safe": "off", | ||
"imgType": "imgTypeUndefined", | ||
"imgSize": "large", | ||
"imgDominantColor": "imgDominantColorUndefined", | ||
"imgColorType": "imgColorTypeUndefined" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
import asyncio | ||
import os | ||
import re | ||
|
||
import aiohttp | ||
import requests | ||
|
||
from crawler import Crawler | ||
|
||
# Crawls image files from Wikimedia Commons categories: category members are
# listed through the MediaWiki API, then the files are downloaded
# concurrently with asyncio + aiohttp.
class WikimediaCrawler(Crawler):

    # MediaWiki API endpoint for Wikimedia Commons.
    url = "https://commons.wikimedia.org/w/api.php"
    # Special:FilePath redirects to the raw file bytes for a file title
    # (the plain /wiki/<title> page is HTML, not the image).
    file_url = "https://commons.wikimedia.org/wiki/Special:FilePath/"

    def __init__(self, output_dir, input_file, prefix):
        """
        :param output_dir: root output directory (see Crawler)
        :param input_file: file with one Commons category name per line
        :param prefix: subdirectory for this crawler's output
        :raises FileNotFoundError: if input_file is missing
        """
        super().__init__(output_dir, input_file, prefix)
        self.session = requests.Session()
        self.read_input_file()
        # Cap concurrent downloads so we don't hammer the server.
        self.sem = asyncio.Semaphore(10)

    def crawl_images(self):
        """Synchronous entry point; runs the async crawl to completion."""
        asyncio.run(self.__crawl_images_async())

    async def __crawl_images_async(self):
        """
        List every file in each configured category via the MediaWiki API and
        download them all concurrently.
        """
        tasks = []
        # One shared HTTP session for all downloads instead of opening a new
        # aiohttp.ClientSession per image.
        async with aiohttp.ClientSession() as aio_session:
            for category in self.categories:
                parameters = {
                    "action": "query",
                    "format": "json",
                    "list": "categorymembers",
                    "cmtitle": "Category:" + category,
                    "cmlimit": "500",
                    "cmtype": "file"
                }
                request = self.session.get(url=self.url, params=parameters)
                data = request.json()
                images = data["query"]["categorymembers"]
                category_output_dir = self.dir_exists(self.output_dir, category)

                for image in images:
                    image_title = image["title"].replace(" ", "_")
                    # BUG FIX: the previous "/wiki/<title>" URL downloaded the
                    # HTML description page; Special:FilePath serves the file.
                    image_url = self.file_url + image_title
                    # Strip characters that are invalid in file names.
                    safe_name = re.sub(
                        r'[<>:"/\\|?*]', "", image_title).replace(" ", "_")
                    tasks.append(self.download_image(
                        image_url, safe_name, category_output_dir,
                        aio_session=aio_session))
            await asyncio.gather(*tasks)

    def read_input_file(self):
        """
        Read one category name per line into ``self.categories``, skipping
        blank lines. (Slicing off the last element, as before, silently
        dropped the final category whenever the file lacked a trailing
        newline.)
        """
        with open(self.input_file, mode="r", encoding="utf-8") as f:
            self.categories = [line for line in f.read().splitlines() if line]

    async def download_image(self, url, title, output_dir, aio_session=None):
        """
        Download one image and save it as ``output_dir/title``.

        :param url: URL of (or redirecting to) the raw image bytes
        :param title: sanitized file name to save under
        :param output_dir: directory where the image is written
        :param aio_session: optional shared aiohttp.ClientSession; a temporary
            one is created (and closed) when not supplied
        """
        output_file = os.path.join(output_dir, title)
        async with self.sem:
            own_session = aio_session is None
            session = aiohttp.ClientSession() if own_session else aio_session
            try:
                async with session.get(url) as response:
                    if response.status == 200:
                        with open(output_file, 'wb') as file:
                            # Stream to disk in chunks to keep memory bounded.
                            while True:
                                chunk = await response.content.read(1024)
                                if not chunk:
                                    break
                                file.write(chunk)
                        print(f"Log: Image downloaded successfully: {output_file}")
                    else:
                        print(f"Error: Failed to download image from {url}. Status code: {response.status}")
            except Exception as e:
                # Best effort: log and continue with the remaining downloads.
                print(f"Error: Failed to download image from {url}. Error: {e}")
            finally:
                if own_session:
                    await session.close()