{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# For this second Excel dataset, I took advantage of the fact that we were allowed to use external programes.\n", "# I created a copy of the excel file and changed it into something that was more managable.\n", "\n", "# 1. I unmerged all of the cells that had been merged together, to do this I used the keyboard shortcut 'ALT + H + M + U'.\n", "# 2. I filled in all the cells that were previously merged with the value that they were previously merged to.\n", "# 3. I filled in the headings of the first three columns with 'Age', 'Sex' and 'Participant' respectively.\n", "# 4. I took the '2010' that was on top of the table and brought it down so that it was just under the heading row and \n", "# was in a new column.\n", "# 5. I added a header to this column which was 'Year'.\n", "# 6. I coppied the value '2010' all the way down the table untill the row 145.\n", "# 7. I deleted the top row which was empty to put the table in the top left corner.\n", "# 8. I also noticed from looking through the data that one of the rows had the value '1.3' in the 'Injured Cassualties (Number)'\n", "# column.\n", "# 9. I knew this value was wrong as this column must only contain whole values so I made the assumption that this value was \n", "# supposed to be '13' insead and I deleted the decimal point within the number.\n", "# 10. I also thought about changing this value to 1, however I decided upon the value 13 as I felt it was more likely to put \n", "# in an accidental decimal point as opposed to an accidental decimal point along with an accidental number three.\n", "# 11. I then read the data into pandas to form a dataframe from this copied file." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "#READ IN THE FILES\n", "\n", "#read the first excel file into 2 different dataframes of its 2 sheets seperate\n", "df_excel1_sheet_1 = pd.read_excel('../data/raw/rsa_2005-2006.xlsx', sheet_name=0)\n", "df_excel1_sheet_2 = pd.read_excel('../data/raw/rsa_2005-2006.xlsx', sheet_name=1)\n", "\n", "#read in the json file\n", "df_json = pd.read_json('../data/raw/rsa_2007.json')\n", "\n", "#read in the txt file as 2 seperate dataframes of their seperate years\n", "df_txt_2008 = pd.read_csv('../data/raw/rsa_2008-2009.txt', sep='\\t', usecols=['Age','Sex','Participant','Casualty','2008'])\n", "df_txt_2009 = pd.read_csv('../data/raw/rsa_2008-2009.txt', sep='\\t', usecols=['Age','Sex','Participant','Casualty','2009'])\n", "\n", "#read in the second Excel file, one for where people were killed and one for where people were injured\n", "df_excel2_killed = pd.read_excel('../data/processed/copy_rsa_2010.xlsx', usecols=[0,1,2,3,5])\n", "df_excel2_injured = pd.read_excel('../data/processed/copy_rsa_2010.xlsx', usecols=[0,1,2,4,5])\n", "\n", "#read in the csv file as 2 seperate dataframes of their seperate years\n", "df_csv_2011 = pd.read_csv('../data/raw/rsa_2011-2012.csv', usecols=['Age', 'Sex', 'Participant', 'Casualty', '2011'])\n", "df_csv_2012 = pd.read_csv('../data/raw/rsa_2011-2012.csv', usecols=['Age', 'Sex', 'Participant', 'Casualty', '2012'])\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# CREATE A 'YEAR' ATTIBUTE IN EACH OF THE DATAFRAMES" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Year
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [Year]\n", "Index: []" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Create a dataframe containing only the column 'Year'\n", "\n", "df_year = pd.DataFrame(columns = ['Year'])\n", "df_year.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "#Add this 'Year' dataframe to all of the other Dataframes\n", "\n", "#Excel 1 - sheet 1 & sheet 2 dataframes\n", "df_excel1_sheet_1 = pd.concat([df_excel1_sheet_1, df_year], axis=1)\n", "df_excel1_sheet_2 = pd.concat([df_excel1_sheet_2, df_year], axis=1)\n", "\n", "#Json dataframe\n", "df_json = pd.concat([df_json, df_year], axis=1)\n", "\n", "#Text File - 2008 Year & 2009 Year dataframes\n", "df_txt_2008 = pd.concat([df_txt_2008, df_year], axis = 1)\n", "df_txt_2009 = pd.concat([df_txt_2009, df_year], axis = 1)\n", "\n", "#Excel 2 - Killed & Injured dataframes\n", "# I had already added this 'Year' column to the data when I copied the excel data and re-worked it a little bit\n", "# This could have been done the same was as it was done with the other data files\n", "\n", "#CSV File - 2011 Year & 2012 Year dataframes\n", "df_csv_2011 = pd.concat([df_csv_2011, df_year], axis = 1)\n", "df_csv_2012 = pd.concat([df_csv_2012, df_year], axis = 1)\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "#PUT THE RESPECTIVE YEAR INTO THE COLUMN 'Year' IN EACH ROW OF EACH OF THE DATAFRAMES\n", "\n", "#Excel 1 - sheet 1 & sheet 2 dataframes\n", "df_excel1_sheet_1['Year'] = 2005\n", "df_excel1_sheet_2['Year'] = 2006\n", "\n", "#Json dataframe\n", "df_json['Year'] = 2007\n", "\n", "#Text File - 2008 Year & 2009 Year dataframes\n", "df_txt_2008['Year'] = 2008\n", "df_txt_2009['Year'] = 2009\n", "\n", "#Excel 2 - Killed & Injured dataframes\n", "# I had already copied the year attribute '2010' along this column when I copied the excel data and re-worked it a little bit\n", "# I could have done this the same way as the I did with the other dataframes though\n", "\n", "#CSV File - 2011 Year & 2012 Year dataframes\n", "df_csv_2011['Year'] = 2011\n", "df_csv_2012['Year'] = 2012\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Casualty
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [Casualty]\n", "Index: []" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Create a dataframe containing only the column 'Casualty'\n", "\n", "df_Casualty = pd.DataFrame(columns = ['Casualty'])\n", "df_Casualty.head()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "#Add the 'Casualty' Column to the two dataframes that came from the second Excle file\n", "\n", "df_excel2_killed = pd.concat([df_excel2_killed, df_Casualty], axis=1)\n", "df_excel2_injured = pd.concat([df_excel2_injured, df_Casualty], axis=1)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "#Insert the Casualty value into the 'Casualty' Column in the two dataframes that came from the second Excle file\n", "\n", "df_excel2_killed['Casualty'] = 'Killed'\n", "df_excel2_injured['Casualty'] = 'Injured'" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# RENAME THE COLUMN THAT CONTAINS THE NUMBER OF CASUALTIES FROM THE YEAR TO 'Total'\n", "\n", "#Excel 1 - sheet 1 & sheet 2 dataframes\n", "df_excel1_sheet_1 = df_excel1_sheet_1.rename(columns={2005:'Total'})\n", "df_excel1_sheet_2 = df_excel1_sheet_2.rename(columns={2006:'Total'})\n", "\n", "#Json dataframe\n", "df_json = df_json.rename(columns={'2007':'Total'})\n", "\n", "#Text File - 2008 Year & 2009 Year dataframes\n", "df_txt_2008 = df_txt_2008.rename(columns={'2008': 'Total'})\n", "df_txt_2009.rename(columns={'2009': 'Total'},inplace=True)\n", "\n", "#Excel 2 - Killed & Injured dataframes\n", "df_excel2_killed.rename(columns={'Killed Casualties (Number)': 'Total'}, inplace=True)\n", "df_excel2_injured.rename(columns={'Injured Cassualties (Number)': 'Total'}, inplace=True) #(I did notice spelling mistake)\n", "\n", "#CSV File - 2011 Year & 2012 Year dataframes\n", "df_csv_2011.rename(columns={'2011': 'Total'},inplace=True)\n", "df_csv_2012.rename(columns={'2012': 'Total'},inplace=True)\n" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# Change the type of the 'Total' column in the Excel2 - injured dataframe to integer type\n", "\n", "df_excel2_injured['Total'] = df_excel2_injured['Total'].astype(int)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# MERGE THE DATAFRAMES TOGETHER\n", "\n", "#Excel 1 - sheet 1 & sheet 2 dataframes\n", "both_sheets = [df_excel1_sheet_1, df_excel1_sheet_2]\n", "df_excel1 = pd.concat(both_sheets, axis=0, ignore_index=True)\n", "\n", "#Text File - 2008 Year & 2009 Year\n", "df_txt = pd.concat([df_txt_2008, df_txt_2009], axis=0, ignore_index=True)\n", "\n", "#Excel 2 - Killed & Injured Datframes\n", "df_excel2 = pd.concat([df_excel2_injured, df_excel2_killed], axis=0, ignore_index=True)\n", "\n", "#CSV File - 2011 Year & 2012 Year\n", "df_csv = pd.concat([df_csv_2011, df_csv_2012], axis=0, ignore_index=True)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# SORT THE COLUMNS TO A UNIFORM ORDER\n", "\n", "# First excel dataframe\n", "df_excel1 = df_excel1.reindex(['Year','Age', 'Sex', 'Participant', 'Casualty', 'Total'], axis=1)\n", "\n", "# Json dataframe\n", "df_json = df_json.reindex(['Year','Age', 'Sex', 'Participant', 'Casualty', 'Total'], axis=1)\n", "\n", "# Text file dataframe\n", "df_txt = df_txt.reindex(['Year','Age', 'Sex', 'Participant', 'Casualty', 'Total'], axis=1)\n", "\n", "# Second excel Dataframe\n", "df_excel2 = df_excel2.reindex(['Year','Age', 'Sex', 'Participant', 'Casualty', 'Total'], axis=1)\n", "\n", "# Csv Dataframe\n", "df_csv = df_csv.reindex(['Year','Age', 'Sex', 'Participant', 'Casualty', 'Total'], axis=1)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# MERGE ALL DATAFRAMES INTO ONE\n", "\n", "df1 = pd.concat([df_excel1, df_txt], axis=0, ignore_index=True)\n", "\n", "df2 = pd.concat([df_excel2, df_csv], axis=0, ignore_index=True)\n", "\n", "df3 = pd.concat([df2, df_json], axis=0, ignore_index=True)\n", "\n", "df = pd.concat([df1, df3], axis=0, ignore_index=True)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "#turn single digit numbers in the 'Age' column into double digits to help sorting\n", "df = df.replace(to_replace='0 - 5 years', value='00 - 05 years')\n", "df = df.replace(to_replace='6 - 9 years', value='06 - 09 years')" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# Sort Dataframe by its Columns\n", "df = df.sort_values(by=['Year', 'Age', 'Sex', 'Participant', 'Casualty', 'Total'], ascending=[True,True,False,False,True,False])\n", "df = df.reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# turn the dataframe into a CSV to open in 'Open Refine'\n", "df.to_csv('../data/processed/Processed_dataframe-csv.csv')" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 2304 entries, 0 to 2303\n", "Data columns (total 6 columns):\n", "Year 2304 non-null int64\n", "Age 2304 non-null object\n", "Sex 2304 non-null object\n", "Participant 2304 non-null object\n", "Casualty 2304 non-null object\n", "Total 2304 non-null object\n", "dtypes: int64(1), object(5)\n", "memory usage: 108.1+ KB\n" ] } ], "source": [ "# get the info of the dataframe\n", "df.info()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }