-
Notifications
You must be signed in to change notification settings - Fork 0
/
wrangle_data.py
128 lines (89 loc) · 5.44 KB
/
wrangle_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#import libraries
import requests
import matplotlib.pyplot as plt
import plotly.express as px
import warnings
import pandas as pd
from collections import defaultdict
warnings.filterwarnings("ignore")
def create_dataset(indicator: str, *, records=None):
    """Build a tidy DataFrame of yearly values for one World Bank indicator.

    Unless ``records`` is supplied, the observations are downloaded from the
    World Bank v2 API for the country codes ``fin;dnk;che;ca;us`` and the
    date range ``2000:2020``.

    Args:
        indicator: Indicator code that is appended to the API URL.
        records: Optional list of observation dicts in the API's format (each
            providing ``['country']['value']``, ``['date']`` and ``['value']``).
            When given, no HTTP request is made.

    Returns:
        DataFrame with the columns ``Country``, ``Year`` and ``Value``. Rows
        are grouped by country in order of first appearance and keep the
        order of the records within each country. Observations whose value
        is ``None`` are left out.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
        ValueError: If the API reply is not a ``[metadata, observations]`` list.
    """
    if records is None:
        get_req_param = {'format': 'json', 'per_page': '5000', 'date': '2000:2020'}
        response = requests.get(
            'https://api.worldbank.org/v2/countries/fin;dnk;che;ca;us/indicators/' + str(indicator),
            params=get_req_param,
            timeout=30,  # requests waits indefinitely unless a timeout is given
        )
        response.raise_for_status()
        payload = response.json()
        # The observations are read from the second element of the reply;
        # anything shorter cannot contain them.
        if not isinstance(payload, list) or len(payload) < 2:
            raise ValueError(
                'Unexpected World Bank API response for indicator {!r}: {!r}'.format(indicator, payload)
            )
        # A missing observation list is treated as "no data".
        records = payload[1] or []

    # Group (year, value) pairs per country; every record is stored, including
    # the first one seen for a country.
    per_country = {}
    for record in records:
        if record['value'] is None:
            # float(None) would raise, so observations without a value are skipped.
            continue
        per_country.setdefault(record['country']['value'], []).append(
            (int(record['date']), float(record['value']))
        )

    rows = [
        (country, year, value)
        for country, points in per_country.items()
        for year, value in points
    ]
    return pd.DataFrame(rows, columns=['Country', 'Year', 'Value'])
# Download every indicator used by the dashboard and give the generic
# 'Value' column a descriptive name.
def _load_indicator(indicator_code, value_label):
    """Fetch one indicator and rename its 'Value' column to ``value_label``."""
    frame = create_dataset(indicator_code)
    return frame.rename(columns={'Value': value_label})


# Mortality caused by road traffic injury
df_mortality_road_traffic = _load_indicator('SH.STA.TRAF.P5', 'Road Traffic Mortality')
# Unemployment
df_unemployed = _load_indicator('SL.UEM.TOTL.ZS', 'Unemployment %')
# Inflation
df_inflation = _load_indicator('FP.CPI.TOTL.ZG', 'Inflation %')
# Employment in agriculture: women (FE) and men (MA) share the same column label
df_agriculture_women = _load_indicator('SL.AGR.EMPL.FE.ZS', 'Employment in Agriculture in the past 20 years')
df_agriculture_men = _load_indicator('SL.AGR.EMPL.MA.ZS', 'Employment in Agriculture in the past 20 years')
# Column holding the employment-in-agriculture values in both gender frames
AGRICULTURE_COLUMN = 'Employment in Agriculture in the past 20 years'

# Sum the yearly values per country for men and tag the rows with the gender
df_male_emp_agr_grouped = df_agriculture_men.groupby('Country', as_index=False)[AGRICULTURE_COLUMN].sum()
df_male_emp_agr_grouped['Gender'] = 'Men'

# Same aggregation for women
df_female_emp_agr_grouped = df_agriculture_women.groupby('Country', as_index=False)[AGRICULTURE_COLUMN].sum()
df_female_emp_agr_grouped['Gender'] = 'Women'

# Stack both frames. DataFrame.append was removed in pandas 2.0; pd.concat is
# the supported equivalent and keeps the original row labels as append did.
df_agriculture = pd.concat([df_female_emp_agr_grouped, df_male_emp_agr_grouped])

# Round the summed value to 3 decimals
df_agriculture[AGRICULTURE_COLUMN] = df_agriculture[AGRICULTURE_COLUMN].round(3)

# Share of each gender within its country, as a percentage with 1 decimal
gender_share = df_agriculture.groupby('Country')[AGRICULTURE_COLUMN].transform(lambda x: x / x.sum())
df_agriculture['Percentage'] = (gender_share * 100).round(1)

# Text version of the percentage that carries the '%' sign
df_agriculture['Percentage_'] = df_agriculture['Percentage'].astype(float).map('{:.1f}%'.format)

# Order by Gender for visualization purposes
df_agriculture.sort_values(by='Gender', ascending=True, inplace=True)
# Build the four Plotly Express figures. Each title is set once, in
# update_layout, together with its horizontal position.

# Graph 1: unemployment per country over time (line chart)
graph_one = px.line(df_unemployed, x="Year", y="Unemployment %", color='Country')
graph_one.update_layout(title_text='Unemployment (% total of labor force)', title_x=0.3)

# Graph 2: women vs men share of employment in agriculture (grouped bar chart)
graph_two = px.bar(df_agriculture, x="Country", y="Percentage", color='Gender', barmode='group', text="Percentage", height=400)
# The data is requested for the date range 2000:2020, so the title states that range
graph_two.update_layout(title_text='Women vs Men employment in agriculture (2000 - 2020)', title_x=0.45)

# Graph 3: inflation per country over time (line chart)
graph_three = px.line(df_inflation, x="Year", y="Inflation %", color='Country')
graph_three.update_layout(title_text='Inflation, consumer prices (annual %)', title_x=0.45)

# Graph 4: road traffic mortality per country over time (line chart)
graph_four = px.line(df_mortality_road_traffic, x="Year", y="Road Traffic Mortality", color='Country')
graph_four.update_layout(title_text="Mortality caused by road traffic injury (per 100,000 people), 2000 - 2020", title_x=0.45)
#Create a function which will be used in other files for retrieving graphs and their layouts
def return_figures():
    """Return the module's four figures for use by other files.

    Returns:
        A list of four dicts, in the order graph_one to graph_four, each of
        the form ``{'data': figure}``.
    """
    charts = (graph_one, graph_two, graph_three, graph_four)
    return [dict(data=chart) for chart in charts]