diff --git a/src/data_collection/asos_data_collection.ipynb b/src/data_collection/asos_data_collection.ipynb
new file mode 100644
index 0000000..38251b3
--- /dev/null
+++ b/src/data_collection/asos_data_collection.ipynb
@@ -0,0 +1,408 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6dd259a9-9451-470a-81d3-818fe2aacda0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Name: Wenqi Wang\n",
+    "# Github username: acse-ww721"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "42522491-28fd-495c-8e6c-9396b2793df2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "# Resolve the project root relative to the notebook location; `os` is\n",
+    "# imported here so this cell can run before the main import cell below.\n",
+    "notebook_path = os.path.abspath(\"\")\n",
+    "project_root = os.path.abspath(os.path.join(notebook_path, \"../../\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "948a9b69-5c59-4199-9495-829a60cda85a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import sys\n",
+    "import time\n",
+    "import datetime\n",
+    "import concurrent.futures\n",
+    "\n",
+    "import requests\n",
+    "import pandas as pd\n",
+    "from bs4 import BeautifulSoup\n",
+    "\n",
+    "sys.path.append(project_root)\n",
+    "from utils import folder_utils"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "47ec5043-45a7-4a67-b4e1-8b2adfe10086",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ISO 3166-1 alpha-2 codes: the EU member states plus GB\n",
+    "eu_member_codes = [\n",
+    "    'AT',\n",
+    "    'BE',\n",
+    "    'BG',\n",
+    "    'HR',\n",
+    "    'CY',\n",
+    "    'CZ',\n",
+    "    'DK',\n",
+    "    'EE',\n",
+    "    'FI',\n",
+    "    'FR',\n",
+    "    'DE',\n",
+    "    'GR',\n",
+    "    'HU',\n",
+    "    'IE',\n",
+    "    'IT',\n",
+    "    'LV',\n",
+    "    'LT',\n",
+    "    'LU',\n",
+    "    'MT',\n",
+    "    'NL',\n",
+    "    'PL',\n",
+    "    'PT',\n",
+    "    'RO',\n",
+    "    'SK',\n",
+    "    'SI',\n",
+    "    'ES',\n",
+    "    'SE',\n",
+    "    'GB',\n",
+    "]\n",
+    "\n",
+    "# Data extraction time range\n",
+    "startts = datetime.datetime(2022, 1, 1)\n",
+    "endts = datetime.datetime(2023, 8, 2)\n",
+    "\n",
+    "# Maximum number of download attempts per request\n",
+    "MAX_ATTEMPTS = 6"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "e77af530-52c5-440f-b3f9-a722cc4119d1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Superseded by utils.folder_utils.create_folder(); kept for reference.\n",
+    "# def get_current_directory():\n",
+    "#     if \"__file__\" in globals():\n",
+    "#         # Running in a Python file\n",
+    "#         return os.path.abspath(os.path.dirname(__file__))\n",
+    "#     else:\n",
+    "#         # Running in a Jupyter Notebook\n",
+    "#         return os.path.abspath(os.path.dirname(\"\"))\n",
+    "\n",
+    "\n",
+    "# def create_folder(c):\n",
+    "#     current_directory = get_current_directory()\n",
+    "#     folder_name = f\"{c}_ASOS_DATA\"\n",
+    "#     folder_path = os.path.join(current_directory, folder_name)\n",
+    "#\n",
+    "#     try:\n",
+    "#         os.mkdir(folder_path)\n",
+    "#         print(f\"Folder '{folder_path}' created successfully.\")\n",
+    "#     except FileExistsError:\n",
+    "#         print(f\"Folder '{folder_path}' already exists.\")\n",
+    "#\n",
+    "#     return folder_path"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "d26ae44c-a304-458d-8e0c-633178d36ea6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_all_network():\n",
+    "    # List every ASOS network offered on the IEM download page\n",
+    "    url = \"https://mesonet.agron.iastate.edu/request/download.phtml?network=FR__ASOS\"\n",
+    "    response = requests.get(url)\n",
+    "    soup = BeautifulSoup(response.content, \"html.parser\")\n",
+    "    select_element = soup.find(\"select\")\n",
+    "    if select_element:\n",
+    "        option_elements = select_element.find_all(\"option\")\n",
+    "\n",
+    "        option_values = [\n",
+    "            option[\"value\"] for option in option_elements if option.get(\"value\")\n",
+    "        ]\n",
+    "\n",
+    "        print(\"Option values:\")\n",
+    "        for value in option_values:\n",
+    "            print(value)\n",
+    "    else:\n",
+    "        print(\"Select element not found on the page.\")"
+   ]
+  },
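+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b7e4d2f1-3c5a-4e8b-9d21-6f0a8c3b5e72",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hedged sketch, not part of the original pipeline: a variant of\n",
+    "# get_all_network() that returns the <select> option values instead of\n",
+    "# printing them, so the network list could be fed straight into\n",
+    "# get_network_url(). The function name is hypothetical.\n",
+    "def list_networks():\n",
+    "    url = \"https://mesonet.agron.iastate.edu/request/download.phtml?network=FR__ASOS\"\n",
+    "    soup = BeautifulSoup(requests.get(url).content, \"html.parser\")\n",
+    "    select_element = soup.find(\"select\")\n",
+    "    if select_element is None:\n",
+    "        return []\n",
+    "    return [\n",
+    "        option[\"value\"]\n",
+    "        for option in select_element.find_all(\"option\")\n",
+    "        if option.get(\"value\")\n",
+    "    ]"
+   ]
+  },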
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "eee404cd-3982-41a4-bad1-f79ac691565e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_network_url(country_list):\n",
+    "    valid_urls = []\n",
+    "    invalid_urls = []\n",
+    "\n",
+    "    for i in country_list:\n",
+    "        url_network = f\"https://mesonet.agron.iastate.edu/request/download.phtml?network={i}__ASOS\"\n",
+    "        response = requests.head(url_network)\n",
+    "\n",
+    "        if response.status_code == 200:\n",
+    "            valid_urls.append(url_network)\n",
+    "        else:\n",
+    "            invalid_urls.append(f\"Invalid URL: {url_network}\")\n",
+    "\n",
+    "    # Report any networks that did not resolve\n",
+    "    for message in invalid_urls:\n",
+    "        print(message)\n",
+    "\n",
+    "    return valid_urls"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "6ad57800-7b53-4461-8bd7-44d3744d7f43",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_all_station_by_network(country_list):\n",
+    "    # Note: if several countries are passed, only the last country's\n",
+    "    # DataFrame is returned; every country's CSV is still written to disk.\n",
+    "    for i in country_list:\n",
+    "        url_station_geojson = (\n",
+    "            f\"https://mesonet.agron.iastate.edu/geojson/network/{i}__ASOS.geojson\"\n",
+    "        )\n",
+    "        output_directory = folder_utils.create_folder(\n",
+    "            i, data_folder, data_category, output_folder\n",
+    "        )\n",
+    "        output_filename = f\"{i}__asos_station_network.csv\"\n",
+    "        output_filepath = os.path.join(output_directory, output_filename)\n",
+    "\n",
+    "        # Get GeoJSON data\n",
+    "        response = requests.get(url_station_geojson)\n",
+    "        geojson_data = response.json()\n",
+    "        # Collect one row per station\n",
+    "        data = []\n",
+    "\n",
+    "        # Extract the GeoJSON features\n",
+    "        for feature in geojson_data[\"features\"]:\n",
+    "            properties = feature[\"properties\"]\n",
+    "            coordinates = feature[\"geometry\"][\"coordinates\"]\n",
+    "            row = {\n",
+    "                \"Name\": properties.get(\"sname\"),\n",
+    "                \"ID\": feature.get(\"id\"),\n",
+    "                \"Latitude\": coordinates[1],\n",
+    "                \"Longitude\": coordinates[0],\n",
+    "                \"Elevation\": properties.get(\"elevation\"),\n",
+    "                \"Country\": properties.get(\"country\"),\n",
+    "                \"Network\": properties.get(\"network\"),\n",
+    "                \"Archive_Begin\": properties.get(\"archive_begin\"),\n",
+    "                \"Archive_End\": properties.get(\"archive_end\"),\n",
+    "                \"Time_Domain\": properties.get(\"time_domain\"),\n",
+    "            }\n",
+    "            data.append(row)\n",
+    "\n",
+    "        # Convert the list to a pandas DataFrame and save it\n",
+    "        df = pd.DataFrame(data)\n",
+    "        df.to_csv(output_filepath, index=False, encoding=\"utf-8\")\n",
+    "\n",
+    "    return df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "4ff2ddbc-d368-4973-968d-c7b8f0232058",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_data_url(df, startts, endts):\n",
+    "    url_site_list = []\n",
+    "    id_list = []\n",
+    "    url_site_header = \"https://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?\"\n",
+    "    url_site_tail = \"data=all&\"  # request all data variables\n",
+    "    url_site_tail += startts.strftime(\"year1=%Y&month1=%m&day1=%d&\")  # add start date\n",
+    "    url_site_tail += endts.strftime(\"year2=%Y&month2=%m&day2=%d&\")  # add end date\n",
+    "    url_site_tail += \"tz=Etc%2FUTC&format=onlycomma&latlon=yes&elev=yes&missing=null&trace=T&direct=no&report_type=3&report_type=4\"  # output format\n",
+    "    for station_id in df[\"ID\"]:\n",
+    "        url_site = f\"{url_site_header}station={station_id}&{url_site_tail}\"  # one URL per station\n",
+    "        url_site_list.append(url_site)\n",
+    "        id_list.append(station_id)\n",
+    "\n",
+    "    return url_site_list, id_list  # per-station URLs and their station IDs"
+   ]
+  },
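+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c9d8e7f6-1a2b-4c3d-8e5f-7a6b5c4d3e2f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hedged sketch, not used by the original pipeline: drop stations whose\n",
+    "# archive clearly ended before the requested window, so no empty files are\n",
+    "# downloaded. Assumes Archive_End is either null or a parseable date string;\n",
+    "# the function name is hypothetical.\n",
+    "def filter_by_archive(df, startts):\n",
+    "    archive_end = pd.to_datetime(df[\"Archive_End\"], errors=\"coerce\")\n",
+    "    keep = archive_end.isna() | (archive_end >= startts)\n",
+    "    return df[keep]\n",
+    "\n",
+    "\n",
+    "# Example (hypothetical usage):\n",
+    "# url_site_list, id_list = get_data_url(filter_by_archive(df, startts), startts, endts)"
+   ]
+  },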
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "80831c07-8ff6-4671-8f4b-f126301665c8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def download_data(url_site):\n",
+    "    # Retry up to MAX_ATTEMPTS times, pausing 5 s after each failure\n",
+    "    attempt = 0\n",
+    "    while attempt < MAX_ATTEMPTS:\n",
+    "        try:\n",
+    "            response = requests.get(url_site, timeout=300)\n",
+    "            data = response.text\n",
+    "            if data is not None and not data.startswith(\"ERROR\"):\n",
+    "                return data\n",
+    "        except Exception as e:\n",
+    "            print(f\"download_data({url_site}) failed with {e}\")\n",
+    "            time.sleep(5)\n",
+    "        attempt += 1\n",
+    "\n",
+    "    print(\"Exhausted attempts to download, returning empty data\")\n",
+    "    return \"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "f4119991-50a1-42fe-9a88-18efc9521e07",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def save_data(data, country, station, startts, endts):\n",
+    "    # `data` is the CSV text already fetched by download_data(), so the\n",
+    "    # request is not repeated here.\n",
+    "    output_directory = folder_utils.create_folder(\n",
+    "        country, data_folder, data_category, output_folder\n",
+    "    )\n",
+    "    output_filename = f\"{country}_{station}_{startts:%Y%m%d}_{endts:%Y%m%d}.csv\"\n",
+    "    output_filepath = os.path.join(output_directory, output_filename)\n",
+    "    # Split the data into lines\n",
+    "    lines = data.split(\"\\n\")\n",
+    "\n",
+    "    # Extract column names from the first line\n",
+    "    column_names = lines[0].split(\",\")\n",
+    "\n",
+    "    # Collect the remaining, non-empty data rows\n",
+    "    data_rows = []\n",
+    "    for line in lines[1:]:\n",
+    "        if line:  # Skip empty lines\n",
+    "            data_rows.append(line.split(\",\"))\n",
+    "\n",
+    "    # Create a DataFrame from the data rows using the extracted column names\n",
+    "    df = pd.DataFrame(data_rows, columns=column_names)\n",
+    "\n",
+    "    # Save the DataFrame to a CSV file\n",
+    "    df.to_csv(output_filepath, index=False, encoding=\"utf-8\")\n",
+    "    print(f\"{output_filename} done!\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "9b17f5f5-91bb-4608-a909-7e29c1945940",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def download_and_save_data(url_site, country, station, startts, endts):\n",
+    "    start_time = time.time()  # Record start time\n",
+    "    data = download_data(url_site)\n",
+    "    download_time = time.time() - start_time\n",
+    "\n",
+    "    if data:\n",
+    "        # Reuse the downloaded text instead of fetching it a second time\n",
+    "        save_data(data, country, station, startts, endts)\n",
+    "        print(f\"{station} - Download time: {download_time:.3f} s\")\n",
+    "    else:\n",
+    "        print(f\"{station} - Download failed\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "763e913d-8f07-4d7c-aaa7-74474c1b78f4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def download_and_save_data_thread(args):\n",
+    "    # Unpack a single argument tuple so the function can be used with\n",
+    "    # ThreadPoolExecutor.map()\n",
+    "    url_site, country, station_id, startts, endts = args\n",
+    "    download_and_save_data(url_site, country, station_id, startts, endts)"
+   ]
+  },
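+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d4c3b2a1-9e8f-4d7c-b6a5-0f1e2d3c4b5a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hedged alternative, assuming the server returns plain comma-separated\n",
+    "# text as requested above: pandas can parse the response directly, which\n",
+    "# avoids the manual line splitting in save_data() and maps the \"null\"\n",
+    "# placeholders to NaN. save_data_v2 is a hypothetical name, not the\n",
+    "# function used below.\n",
+    "import io\n",
+    "\n",
+    "\n",
+    "def save_data_v2(data, country, station, startts, endts):\n",
+    "    output_directory = folder_utils.create_folder(\n",
+    "        country, data_folder, data_category, output_folder\n",
+    "    )\n",
+    "    output_filename = f\"{country}_{station}_{startts:%Y%m%d}_{endts:%Y%m%d}.csv\"\n",
+    "    df = pd.read_csv(io.StringIO(data), na_values=[\"null\"])\n",
+    "    df.to_csv(\n",
+    "        os.path.join(output_directory, output_filename),\n",
+    "        index=False,\n",
+    "        encoding=\"utf-8\",\n",
+    "    )"
+   ]
+  },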
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "24e47da5-6169-4d89-a69b-f114109e1aa2",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Folder '/Users/ww721/JupyterNotebookPath/IRP_20220602/irp_ww721_bakcup/data/raw_data/ASOS_DATA/FR_ASOS_DATA' created successfully.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# FR example\n",
+    "country = [\n",
+    "    \"FR\",\n",
+    "]\n",
+    "\n",
+    "data_folder = \"data\"\n",
+    "data_category = \"raw_data\"\n",
+    "output_folder = \"ASOS_DATA\"\n",
+    "\n",
+    "start_date = datetime.datetime(2022, 1, 1)\n",
+    "end_date = datetime.datetime(2023, 8, 2)\n",
+    "\n",
+    "fr_df = get_all_station_by_network(country)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "26ae18b5-d428-484f-a0a3-dfff366e02ae",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "url_site_list, id_list = get_data_url(fr_df, start_date, end_date)\n",
+    "args_list = [\n",
+    "    (url_site, country[0], station_id, start_date, end_date)  # match the country code above\n",
+    "    for url_site, station_id in zip(url_site_list, id_list)\n",
+    "]\n",
+    "\n",
+    "# Download the stations in parallel; pass max_workers to ThreadPoolExecutor\n",
+    "# to bound the load on the server.\n",
+    "with concurrent.futures.ThreadPoolExecutor() as executor:\n",
+    "    executor.map(download_and_save_data_thread, args_list)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/src/data_collection/era5_download_T850_v2.ipynb b/src/data_collection/era5_download_T850_v2.ipynb
index 3ebf7ff..b228497 100644
--- a/src/data_collection/era5_download_T850_v2.ipynb
+++ b/src/data_collection/era5_download_T850_v2.ipynb
@@ -1,5 +1,16 @@
 {
  "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7fa5130b-73d6-41fc-8fbf-1117f82439f4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Name: Wenqi Wang\n",
+    "# Github username: acse-ww721"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 24,
@@ -471,8 +482,7 @@
     "end_time": "2023-08-26T11:41:51.362023Z",
     "start_time": "2023-08-26T11:41:51.311558Z"
    },
-   "code_folding": [],
-   "scrolled": false
+   "code_folding": []
   },
   "outputs": [],
   "source": [
@@ -12221,7 +12231,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.5"
+   "version": "3.11.5"
   },
   "toc": {
    "base_numbering": 1,
diff --git a/src/data_collection/era5_download_t2m_v2.ipynb b/src/data_collection/era5_download_t2m_v2.ipynb
index 7b65d33..4bf6b83 100644
--- a/src/data_collection/era5_download_t2m_v2.ipynb
+++ b/src/data_collection/era5_download_t2m_v2.ipynb
@@ -1,5 +1,16 @@
 {
  "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "20b1318a-bf6b-45e9-930e-75d0bf01d8ed",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Name: Wenqi Wang\n",
+    "# Github username: acse-ww721"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 13,
@@ -452,7 +463,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.5"
+   "version": "3.11.5"
   },
   "toc": {
    "base_numbering": 1,