first commit, wikimedia crawler is fine
Mirtia committed Jun 23, 2023
1 parent 2137f5c commit e7db1b1
Showing 9 changed files with 223 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -0,0 +1,5 @@
.env
__pycache__
output
cx
.vscode
1 change: 1 addition & 0 deletions README.md
@@ -0,0 +1 @@
# Artstyle Detector
44 changes: 44 additions & 0 deletions crawler.py
@@ -0,0 +1,44 @@
import os


# The Crawler class checks that the output directory and the input file exist,
# creating the directory if needed. It is the base class for the specific crawlers.
class Crawler:

    def __init__(self, output_dir, input_file, prefix):
        self.output_dir = self.dir_exists(output_dir, prefix)
        self.input_file = self.file_exists(input_file)
        self.prefix = prefix

    @staticmethod
    def dir_exists(dir, prefix=""):
        """
        Check whether a directory exists and create it if it does not.

        :param dir: path of the directory to create
        :param prefix: optional subdirectory name appended to dir; if empty, only dir itself is created
        :return: the directory path that was created or already existed
        """
        dir_path = os.path.join(dir, prefix)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        return dir_path

    @staticmethod
    def file_exists(file):
        """
        Check whether a file exists and raise an error if it does not.

        :param file: path of the file to check for existence
        :return: the file path if it exists
        """
        if not os.path.isfile(file):
            raise FileNotFoundError("Error: Input file does not exist. Please provide an existing file.")
        return file
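
A minimal usage sketch of the base-class helpers; the paths are the ones main.py passes in, shown here only as an illustration:

from crawler import Crawler

# Creates "output/wikimedia" if it is missing and returns that path.
output_dir = Crawler.dir_exists("output", "wikimedia")

# Returns the path unchanged if the file exists, otherwise raises FileNotFoundError.
input_file = Crawler.file_exists("input/wikimedia_input")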
52 changes: 52 additions & 0 deletions google_crawler.py
@@ -0,0 +1,52 @@
import os
import json

from dotenv import load_dotenv
from google_images_search import GoogleImagesSearch

from crawler import Crawler


# The GoogleCrawler class is a subclass of Crawler that uses the Google Images Search API
# to download images for the input queries. The API limits the number of queries,
# so I ended up crawling Wikimedia Commons instead.
class GoogleCrawler(Crawler):

    def __init__(self, output_dir, input_file, prefix, parameters_file):
        super().__init__(output_dir, input_file, prefix)
        load_dotenv()
        self.api_key = os.getenv("API_KEY")
        self.cx = os.getenv("CX")
        self.gis = GoogleImagesSearch(self.api_key, self.cx)
        self.read_input_file()
        self.read_search_parameters(self.file_exists(parameters_file))

    def crawl_images(self):
        """
        Crawl images for each query, download and resize them, and save them
        to a per-query directory under the output directory.
        """
        for query in self.queries:
            self.search_parameters["q"] = query
            self.gis.search(search_params=self.search_parameters)
            query_dir = self.dir_exists(os.path.join(self.output_dir, query))
            for result in self.gis.results():
                print("Image url: ", result.url)
                result.download(query_dir)
                result.resize(1000, 1000)
                print("Image path: ", result.path)

    def read_input_file(self):
        """
        Read the input file and store each non-empty line as a query.
        """
        with open(self.input_file, mode="r", encoding="utf-8") as f:
            self.queries = [line for line in f.read().splitlines() if line]

    def read_search_parameters(self, parameters_file):
        """
        Read the search parameters from a JSON file and store them in an attribute.

        :param parameters_file: path of the JSON file containing the search parameters
        """
        with open(parameters_file, mode="r", encoding="utf-8") as f:
            self.search_parameters = json.load(f)
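
GoogleCrawler expects its credentials in a .env file loaded by python-dotenv. A minimal sketch of that setup, assuming the variable names the constructor reads (API_KEY and CX); the values themselves are placeholders:

import os
from dotenv import load_dotenv

# The .env file (listed in .gitignore) is assumed to contain lines such as:
#   API_KEY=<Google Custom Search API key>
#   CX=<custom search engine id>
load_dotenv()
assert os.getenv("API_KEY") and os.getenv("CX"), "Set API_KEY and CX in .env"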
6 changes: 6 additions & 0 deletions input/google_input
@@ -0,0 +1,6 @@
surrealism paintings
realism paintings
pointillism paintings
abstract paintings
impressionism paintings
action paintings
6 changes: 6 additions & 0 deletions input/wikimedia_input
@@ -0,0 +1,6 @@
Impressionist_paintings
Abstract_paintings
Romantic_paintings
Pointillism
Realist_paintings
Tingatinga
10 changes: 10 additions & 0 deletions main.py
@@ -0,0 +1,10 @@
from google_crawler import GoogleCrawler
from wikimedia_crawler import WikimediaCrawler


def main():
    crawler = WikimediaCrawler("output", "input/wikimedia_input", "wikimedia")
    # crawler = GoogleCrawler("output", "input/google_input", "google", "parameters.json")
    crawler.crawl_images()


if __name__ == "__main__":
    main()
11 changes: 11 additions & 0 deletions parameters.json
@@ -0,0 +1,11 @@
{
    "q": "",
    "num": 100,
    "fileType": "jpg|png",
    "rights": "cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived",
    "safe": "off",
    "imgType": "imgTypeUndefined",
    "imgSize": "large",
    "imgDominantColor": "imgDominantColorUndefined",
    "imgColorType": "imgColorTypeUndefined"
}
88 changes: 88 additions & 0 deletions wikimedia_crawler.py
@@ -0,0 +1,88 @@
import asyncio
import os
import re

import aiohttp
import requests

from crawler import Crawler


# The WikimediaCrawler class crawls images from Wikimedia Commons, listing category
# members through the MediaWiki API and downloading them concurrently with asyncio and aiohttp.
class WikimediaCrawler(Crawler):

    url = "https://commons.wikimedia.org/w/api.php"

    def __init__(self, output_dir, input_file, prefix):
        super().__init__(output_dir, input_file, prefix)
        self.session = requests.Session()
        self.read_input_file()
        self.sem = asyncio.Semaphore(10)

    def crawl_images(self):
        asyncio.run(self.__crawl_images_async())

    async def __crawl_images_async(self):
        """
        List the files in each Wikimedia Commons category and download them concurrently.
        """
        tasks = []
        for category in self.categories:
            parameters = {
                "action": "query",
                "format": "json",
                "list": "categorymembers",
                "cmtitle": "Category:" + category,
                "cmlimit": "500",
                "cmtype": "file"
            }
            response = self.session.get(url=self.url, params=parameters)
            data = response.json()
            images = data["query"]["categorymembers"]
            category_output_dir = self.dir_exists(self.output_dir, category)

            for image in images:
                image_title = image["title"].replace(" ", "_")
                image_url = "https://commons.wikimedia.org/wiki/" + image_title
                # print("Log: Image url: ", image_url)
                # print("Log: Image title: ", image_title)
                task = self.download_image(image_url, re.sub(
                    r'[<>:"/\\|?*]', "", image_title).replace(" ", "_"), category_output_dir)
                tasks.append(task)
        await asyncio.gather(*tasks)

    def read_input_file(self):
        """
        Read the input file and store each non-empty line as a category name.
        """
        with open(self.input_file, mode="r", encoding="utf-8") as f:
            self.categories = [line for line in f.read().splitlines() if line]

    async def download_image(self, url, title, output_dir):
        """
        Download a single file from the given URL and save it to the output directory.

        :param url: the URL to download
        :param title: the sanitized file name to save under output_dir
        :param output_dir: the directory where the download is saved
        """
        output_file = os.path.join(output_dir, title)
        async with self.sem:
            async with aiohttp.ClientSession() as aio_session:
                try:
                    async with aio_session.get(url) as response:
                        if response.status == 200:
                            with open(output_file, 'wb') as file:
                                while True:
                                    chunk = await response.content.read(1024)
                                    if not chunk:
                                        break
                                    file.write(chunk)
                            print(f"Log: Image downloaded successfully: {output_file}")
                        else:
                            print(f"Error: Failed to download image from {url}. Status code: {response.status}")
                except Exception as e:
                    print(f"Error: Failed to download image from {url}. Error: {e}")
