{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# For this second Excel dataset, I took advantage of the fact that we were allowed to use external programes.\n",
"# I created a copy of the excel file and changed it into something that was more managable.\n",
"\n",
"# 1. I unmerged all of the cells that had been merged together, to do this I used the keyboard shortcut 'ALT + H + M + U'.\n",
"# 2. I filled in all the cells that were previously merged with the value that they were previously merged to.\n",
"# 3. I filled in the headings of the first three columns with 'Age', 'Sex' and 'Participant' respectively.\n",
"# 4. I took the '2010' that was on top of the table and brought it down so that it was just under the heading row and \n",
"# was in a new column.\n",
"# 5. I added a header to this column which was 'Year'.\n",
"# 6. I coppied the value '2010' all the way down the table untill the row 145.\n",
"# 7. I deleted the top row which was empty to put the table in the top left corner.\n",
"# 8. I also noticed from looking through the data that one of the rows had the value '1.3' in the 'Injured Cassualties (Number)'\n",
"# column.\n",
"# 9. I knew this value was wrong as this column must only contain whole values so I made the assumption that this value was \n",
"# supposed to be '13' insead and I deleted the decimal point within the number.\n",
"# 10. I also thought about changing this value to 1, however I decided upon the value 13 as I felt it was more likely to put \n",
"# in an accidental decimal point as opposed to an accidental decimal point along with an accidental number three.\n",
"# 11. I then read the data into pandas to form a dataframe from this copied file."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"#READ IN THE FILES\n",
"\n",
"#read the first excel file into 2 different dataframes of its 2 sheets seperate\n",
"df_excel1_sheet_1 = pd.read_excel('../data/raw/rsa_2005-2006.xlsx', sheet_name=0)\n",
"df_excel1_sheet_2 = pd.read_excel('../data/raw/rsa_2005-2006.xlsx', sheet_name=1)\n",
"\n",
"#read in the json file\n",
"df_json = pd.read_json('../data/raw/rsa_2007.json')\n",
"\n",
"#read in the txt file as 2 seperate dataframes of their seperate years\n",
"df_txt_2008 = pd.read_csv('../data/raw/rsa_2008-2009.txt', sep='\\t', usecols=['Age','Sex','Participant','Casualty','2008'])\n",
"df_txt_2009 = pd.read_csv('../data/raw/rsa_2008-2009.txt', sep='\\t', usecols=['Age','Sex','Participant','Casualty','2009'])\n",
"\n",
"#read in the second Excel file, one for where people were killed and one for where people were injured\n",
"df_excel2_killed = pd.read_excel('../data/processed/copy_rsa_2010.xlsx', usecols=[0,1,2,3,5])\n",
"df_excel2_injured = pd.read_excel('../data/processed/copy_rsa_2010.xlsx', usecols=[0,1,2,4,5])\n",
"\n",
"#read in the csv file as 2 seperate dataframes of their seperate years\n",
"df_csv_2011 = pd.read_csv('../data/raw/rsa_2011-2012.csv', usecols=['Age', 'Sex', 'Participant', 'Casualty', '2011'])\n",
"df_csv_2012 = pd.read_csv('../data/raw/rsa_2011-2012.csv', usecols=['Age', 'Sex', 'Participant', 'Casualty', '2012'])\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# CREATE A 'YEAR' ATTIBUTE IN EACH OF THE DATAFRAMES"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Year | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [Year]\n",
"Index: []"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Create a dataframe containing only the column 'Year'\n",
"\n",
"df_year = pd.DataFrame(columns = ['Year'])\n",
"df_year.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"#Add this 'Year' dataframe to all of the other Dataframes\n",
"\n",
"#Excel 1 - sheet 1 & sheet 2 dataframes\n",
"df_excel1_sheet_1 = pd.concat([df_excel1_sheet_1, df_year], axis=1)\n",
"df_excel1_sheet_2 = pd.concat([df_excel1_sheet_2, df_year], axis=1)\n",
"\n",
"#Json dataframe\n",
"df_json = pd.concat([df_json, df_year], axis=1)\n",
"\n",
"#Text File - 2008 Year & 2009 Year dataframes\n",
"df_txt_2008 = pd.concat([df_txt_2008, df_year], axis = 1)\n",
"df_txt_2009 = pd.concat([df_txt_2009, df_year], axis = 1)\n",
"\n",
"#Excel 2 - Killed & Injured dataframes\n",
"# I had already added this 'Year' column to the data when I copied the excel data and re-worked it a little bit\n",
"# This could have been done the same was as it was done with the other data files\n",
"\n",
"#CSV File - 2011 Year & 2012 Year dataframes\n",
"df_csv_2011 = pd.concat([df_csv_2011, df_year], axis = 1)\n",
"df_csv_2012 = pd.concat([df_csv_2012, df_year], axis = 1)\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"#PUT THE RESPECTIVE YEAR INTO THE COLUMN 'Year' IN EACH ROW OF EACH OF THE DATAFRAMES\n",
"\n",
"#Excel 1 - sheet 1 & sheet 2 dataframes\n",
"df_excel1_sheet_1['Year'] = 2005\n",
"df_excel1_sheet_2['Year'] = 2006\n",
"\n",
"#Json dataframe\n",
"df_json['Year'] = 2007\n",
"\n",
"#Text File - 2008 Year & 2009 Year dataframes\n",
"df_txt_2008['Year'] = 2008\n",
"df_txt_2009['Year'] = 2009\n",
"\n",
"#Excel 2 - Killed & Injured dataframes\n",
"# I had already copied the year attribute '2010' along this column when I copied the excel data and re-worked it a little bit\n",
"# I could have done this the same way as the I did with the other dataframes though\n",
"\n",
"#CSV File - 2011 Year & 2012 Year dataframes\n",
"df_csv_2011['Year'] = 2011\n",
"df_csv_2012['Year'] = 2012\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Casualty | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [Casualty]\n",
"Index: []"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Create a dataframe containing only the column 'Casualty'\n",
"\n",
"df_Casualty = pd.DataFrame(columns = ['Casualty'])\n",
"df_Casualty.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"#Add the 'Casualty' Column to the two dataframes that came from the second Excle file\n",
"\n",
"df_excel2_killed = pd.concat([df_excel2_killed, df_Casualty], axis=1)\n",
"df_excel2_injured = pd.concat([df_excel2_injured, df_Casualty], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"#Insert the Casualty value into the 'Casualty' Column in the two dataframes that came from the second Excle file\n",
"\n",
"df_excel2_killed['Casualty'] = 'Killed'\n",
"df_excel2_injured['Casualty'] = 'Injured'"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# RENAME THE COLUMN THAT CONTAINS THE NUMBER OF CASUALTIES FROM THE YEAR TO 'Total'\n",
"\n",
"#Excel 1 - sheet 1 & sheet 2 dataframes\n",
"df_excel1_sheet_1 = df_excel1_sheet_1.rename(columns={2005:'Total'})\n",
"df_excel1_sheet_2 = df_excel1_sheet_2.rename(columns={2006:'Total'})\n",
"\n",
"#Json dataframe\n",
"df_json = df_json.rename(columns={'2007':'Total'})\n",
"\n",
"#Text File - 2008 Year & 2009 Year dataframes\n",
"df_txt_2008 = df_txt_2008.rename(columns={'2008': 'Total'})\n",
"df_txt_2009.rename(columns={'2009': 'Total'},inplace=True)\n",
"\n",
"#Excel 2 - Killed & Injured dataframes\n",
"df_excel2_killed.rename(columns={'Killed Casualties (Number)': 'Total'}, inplace=True)\n",
"df_excel2_injured.rename(columns={'Injured Cassualties (Number)': 'Total'}, inplace=True) #(I did notice spelling mistake)\n",
"\n",
"#CSV File - 2011 Year & 2012 Year dataframes\n",
"df_csv_2011.rename(columns={'2011': 'Total'},inplace=True)\n",
"df_csv_2012.rename(columns={'2012': 'Total'},inplace=True)\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Change the type of the 'Total' column in the Excel2 - injured dataframe to integer type\n",
"\n",
"df_excel2_injured['Total'] = df_excel2_injured['Total'].astype(int)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# MERGE THE DATAFRAMES TOGETHER\n",
"\n",
"#Excel 1 - sheet 1 & sheet 2 dataframes\n",
"both_sheets = [df_excel1_sheet_1, df_excel1_sheet_2]\n",
"df_excel1 = pd.concat(both_sheets, axis=0, ignore_index=True)\n",
"\n",
"#Text File - 2008 Year & 2009 Year\n",
"df_txt = pd.concat([df_txt_2008, df_txt_2009], axis=0, ignore_index=True)\n",
"\n",
"#Excel 2 - Killed & Injured Datframes\n",
"df_excel2 = pd.concat([df_excel2_injured, df_excel2_killed], axis=0, ignore_index=True)\n",
"\n",
"#CSV File - 2011 Year & 2012 Year\n",
"df_csv = pd.concat([df_csv_2011, df_csv_2012], axis=0, ignore_index=True)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# SORT THE COLUMNS TO A UNIFORM ORDER\n",
"\n",
"# First excel dataframe\n",
"df_excel1 = df_excel1.reindex(['Year','Age', 'Sex', 'Participant', 'Casualty', 'Total'], axis=1)\n",
"\n",
"# Json dataframe\n",
"df_json = df_json.reindex(['Year','Age', 'Sex', 'Participant', 'Casualty', 'Total'], axis=1)\n",
"\n",
"# Text file dataframe\n",
"df_txt = df_txt.reindex(['Year','Age', 'Sex', 'Participant', 'Casualty', 'Total'], axis=1)\n",
"\n",
"# Second excel Dataframe\n",
"df_excel2 = df_excel2.reindex(['Year','Age', 'Sex', 'Participant', 'Casualty', 'Total'], axis=1)\n",
"\n",
"# Csv Dataframe\n",
"df_csv = df_csv.reindex(['Year','Age', 'Sex', 'Participant', 'Casualty', 'Total'], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# MERGE ALL DATAFRAMES INTO ONE\n",
"\n",
"df1 = pd.concat([df_excel1, df_txt], axis=0, ignore_index=True)\n",
"\n",
"df2 = pd.concat([df_excel2, df_csv], axis=0, ignore_index=True)\n",
"\n",
"df3 = pd.concat([df2, df_json], axis=0, ignore_index=True)\n",
"\n",
"df = pd.concat([df1, df3], axis=0, ignore_index=True)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"#turn single digit numbers in the 'Age' column into double digits to help sorting\n",
"df = df.replace(to_replace='0 - 5 years', value='00 - 05 years')\n",
"df = df.replace(to_replace='6 - 9 years', value='06 - 09 years')"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# Sort Dataframe by its Columns\n",
"df = df.sort_values(by=['Year', 'Age', 'Sex', 'Participant', 'Casualty', 'Total'], ascending=[True,True,False,False,True,False])\n",
"df = df.reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# turn the dataframe into a CSV to open in 'Open Refine'\n",
"df.to_csv('../data/processed/Processed_dataframe-csv.csv')"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 2304 entries, 0 to 2303\n",
"Data columns (total 6 columns):\n",
"Year 2304 non-null int64\n",
"Age 2304 non-null object\n",
"Sex 2304 non-null object\n",
"Participant 2304 non-null object\n",
"Casualty 2304 non-null object\n",
"Total 2304 non-null object\n",
"dtypes: int64(1), object(5)\n",
"memory usage: 108.1+ KB\n"
]
}
],
"source": [
"# get the info of the dataframe\n",
"df.info()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}