-
Notifications
You must be signed in to change notification settings - Fork 0
/
nepal-public-bodies.py
124 lines (94 loc) · 2.53 KB
/
nepal-public-bodies.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# -*- coding: utf-8 -*-
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:percent
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.13.0
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# %% [markdown]
# # Scraping public bodies info for Nepal
# %%
import pandas as pd
# %%
import requests
# %%
from bs4 import BeautifulSoup
# %%
url = 'https://mofaga.gov.np/local-contact'
# %%
# A timeout keeps this cell from hanging forever on an unresponsive server,
# and raise_for_status() surfaces HTTP errors here rather than as a confusing
# parsing failure in a later cell.
response = requests.get(url, timeout=30)
response.raise_for_status()
# %%
# Name the parser explicitly so the parsed tree does not depend on which
# optional parser libraries happen to be installed.
soup = BeautifulSoup(response.content, 'html.parser')
# %%
# First <table> element on the page.
table = soup.find('table')
# %%
# Header labels, taken from every <th> cell of the table.
columns = [tag.text for tag in table.find_all('th')]
columns
# %%
# Data rows: every <tr> inside the table body.
rows = table.find('tbody').find_all('tr')
# %% [markdown]
# ## Exploring the first row
# %%
# Take the first body row as a sample to see what one record looks like.
first_row = rows[0]
first_row
# %%
# Text content of every data cell in the sample row.
[td.get_text() for td in first_row.find_all('td')]
# %%
# Number of header labels, for comparison with the cell list above.
len(columns)
# %% [markdown]
# ## Building the data frame
# %%
# Empty frame with just the header labels, to preview the target layout.
df = pd.DataFrame(columns=columns)
# %%
df
# %%
# Collect one dict per table row and build the frame once at the end.
# Appending row by row with `df.loc[len(df.index)] = ...` gets slower as the
# frame grows, and this cell can now be re-run without duplicating rows.
website_column = 'वेवसाईट'
records = []
for row in rows:
    cells = [cell.text for cell in row.find_all('td')]
    if not cells:
        # Rows without data cells carry no record.
        continue
    entry = dict(zip(columns, cells))
    # Store the link target instead of the link text in the website column;
    # rows without a link keep whatever text the cell contained.
    link = row.find('a')
    if link is not None and 'href' in link.attrs:
        entry[website_column] = link.attrs['href']
    records.append(entry)

frame_columns = list(columns)
if website_column not in frame_columns and any(website_column in r for r in records):
    # Keep the extracted links even when the header row has no such label.
    frame_columns.append(website_column)
df = pd.DataFrame(records, columns=frame_columns)
# %%
df
# %% [markdown]
# ### Scraping function
# %%
def scrape_table(url: str, *, timeout: float = 30.0) -> pd.DataFrame:
    """Scrape the first HTML table found at *url* into a data frame.

    Written for the government directory at
    https://mofaga.gov.np/local-contact. The column labels come from the
    table's ``<th>`` cells and each ``<tr>`` with data cells becomes one row.
    For rows that contain a link, the website column holds the link target
    (``href``) instead of the cell text.

    Args:
        url: Address of the page holding the table.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A data frame with one row per table row. It is empty (header labels
        only) when the table has no data rows.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an HTTP error status.
        ValueError: If the page contains no ``<table>`` element.
    """
    website_column = 'वेवसाईट'

    # Without a timeout requests may wait forever on a stalled server.
    response = requests.get(url, timeout=timeout)
    # Fail on HTTP errors here rather than while parsing an error page.
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(response.content, 'html.parser')

    table = soup.find('table')
    if table is None:
        raise ValueError(f'No <table> element found at {url}')

    columns = [tag.text for tag in table.find_all('th')]

    # Some pages omit <tbody>; fall back to scanning the whole table.
    body = table.find('tbody')
    if body is None:
        body = table

    # Build all records first and create the frame once; appending rows one
    # at a time to a data frame gets slower as the frame grows.
    records = []
    for row in body.find_all('tr'):
        cells = [cell.text for cell in row.find_all('td')]
        if not cells:
            # Header-only or empty rows carry no record.
            continue
        entry = dict(zip(columns, cells))
        # Rows without a link keep whatever text the cell contained.
        link = row.find('a')
        if link is not None and 'href' in link.attrs:
            entry[website_column] = link.attrs['href']
        records.append(entry)

    if website_column not in columns and any(website_column in r for r in records):
        # Keep the extracted links even when the header row has no such label.
        columns = [*columns, website_column]
    return pd.DataFrame(records, columns=columns)
# %% [markdown]
# ### Data extraction
# %% [markdown]
# Scrape the district government organizations.
# %%
# District-level listing of the directory.
# NOTE(review): the `_token` query value is hard-coded — confirm the site
# still accepts it before relying on this export.
district_url = 'https://mofaga.gov.np/local-contact/dcc-prov-1?_token=29tedmjpqp7lgXXFG7hP0rliFO5kijkX36jLRsFF&province_id=0&dist_id=&visible=500'
df = scrape_table(district_url)
df
# %%
# Write the scraped rows to CSV without the data frame index.
district_csv = 'data/nepal-district-govorgs.csv'
df.to_csv(district_csv, index=False)
# %% [markdown]
# Scrape the local government organizations.
# %%
# Local-level listing of the directory.
# NOTE(review): the `_token` query value is hard-coded — confirm the site
# still accepts it before relying on this export.
local_url = 'https://mofaga.gov.np/local-contact?_token=29tedmjpqp7lgXXFG7hP0rliFO5kijkX36jLRsFF&province_id=&dist_id=&local_tpe=&visible=10000&q='
df = scrape_table(local_url)
df
# %%
# Write the scraped rows to CSV without the data frame index.
local_csv = 'data/nepal-local-govorgs.csv'
df.to_csv(local_csv, index=False)