{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# For this second Excel dataset, I took advantage of the fact that we were allowed to use external programs.\n",
    "# I created a copy of the Excel file and changed it into something that was more manageable.\n",
    "\n",
    "# 1.  I unmerged all of the cells that had been merged together; to do this I used the keyboard shortcut 'ALT + H + M + U'.\n",
    "# 2.  I filled in all the cells that were previously merged with the value of the merged cell they had belonged to.\n",
    "# 3.  I filled in the headings of the first three columns with 'Age', 'Sex' and 'Participant' respectively.\n",
    "# 4.  I took the '2010' that was on top of the table and brought it down so that it was just under the heading row and\n",
    "#     was in a new column.\n",
    "# 5.  I added a header to this column, which was 'Year'.\n",
    "# 6.  I copied the value '2010' all the way down the table until row 145.\n",
    "# 7.  I deleted the top row, which was empty, to put the table in the top left corner.\n",
    "# 8.  I also noticed from looking through the data that one of the rows had the value '1.3' in the 'Injured Cassualties (Number)'\n",
    "#     column (the header is misspelled like this in the source file).\n",
    "# 9.  I knew this value was wrong, as this column must only contain whole numbers, so I made the assumption that this value was\n",
    "#     supposed to be '13' instead and I deleted the decimal point within the number.\n",
    "# 10. I also thought about changing this value to 1; however, I decided upon the value 13 as I felt it was more likely to put\n",
    "#     in an accidental decimal point as opposed to an accidental decimal point along with an accidental number three.\n",
    "# 11. I then read the data into pandas to form a dataframe from this copied file."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# READ IN THE FILES\n",
    "\n",
    "# Locations of the source files (the 2010 workbook is the hand-edited copy, not the raw file)\n",
    "excel1_path = '../data/raw/rsa_2005-2006.xlsx'\n",
    "json_path = '../data/raw/rsa_2007.json'\n",
    "txt_path = '../data/raw/rsa_2008-2009.txt'\n",
    "excel2_path = '../data/processed/copy_rsa_2010.xlsx'\n",
    "csv_path = '../data/raw/rsa_2011-2012.csv'\n",
    "\n",
    "# Descriptive columns requested from the text and CSV files alongside a single year column\n",
    "shared_cols = ['Age', 'Sex', 'Participant', 'Casualty']\n",
    "\n",
    "# First Excel file: one dataframe per sheet\n",
    "df_excel1_sheet_1 = pd.read_excel(excel1_path, sheet_name=0)\n",
    "df_excel1_sheet_2 = pd.read_excel(excel1_path, sheet_name=1)\n",
    "\n",
    "# JSON file\n",
    "df_json = pd.read_json(json_path)\n",
    "\n",
    "# Tab-separated text file: one dataframe per year column\n",
    "df_txt_2008 = pd.read_csv(txt_path, sep='\\t', usecols=shared_cols + ['2008'])\n",
    "df_txt_2009 = pd.read_csv(txt_path, sep='\\t', usecols=shared_cols + ['2009'])\n",
    "\n",
    "# Second Excel file: column 3 goes to the 'killed' frame and column 4 to the 'injured' frame;\n",
    "# columns 0-2 and 5 are loaded into both\n",
    "df_excel2_killed = pd.read_excel(excel2_path, usecols=[0, 1, 2, 3, 5])\n",
    "df_excel2_injured = pd.read_excel(excel2_path, usecols=[0, 1, 2, 4, 5])\n",
    "\n",
    "# CSV file: one dataframe per year column\n",
    "df_csv_2011 = pd.read_csv(csv_path, usecols=shared_cols + ['2011'])\n",
    "df_csv_2012 = pd.read_csv(csv_path, usecols=shared_cols + ['2012'])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# CREATE A 'YEAR' ATTRIBUTE IN EACH OF THE DATAFRAMES"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [Year]\n",
       "Index: []"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build an empty dataframe (no rows) whose only column is 'Year'\n",
    "\n",
    "year_header = ['Year']\n",
    "df_year = pd.DataFrame(columns=year_header)\n",
    "df_year.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Append the (still empty) 'Year' column to each dataframe by concatenating df_year column-wise\n",
    "\n",
    "# First Excel file - sheet 1 and sheet 2\n",
    "df_excel1_sheet_1 = pd.concat([df_excel1_sheet_1, df_year], axis='columns')\n",
    "df_excel1_sheet_2 = pd.concat([df_excel1_sheet_2, df_year], axis='columns')\n",
    "\n",
    "# JSON file\n",
    "df_json = pd.concat([df_json, df_year], axis='columns')\n",
    "\n",
    "# Text file - 2008 and 2009\n",
    "df_txt_2008 = pd.concat([df_txt_2008, df_year], axis='columns')\n",
    "df_txt_2009 = pd.concat([df_txt_2009, df_year], axis='columns')\n",
    "\n",
    "# Second Excel file - killed and injured\n",
    "# Nothing to do here: the 'Year' column was already added by hand when the Excel file was copied and re-worked.\n",
    "# It could have been done the same way as for the other data files.\n",
    "\n",
    "# CSV file - 2011 and 2012\n",
    "df_csv_2011 = pd.concat([df_csv_2011, df_year], axis='columns')\n",
    "df_csv_2012 = pd.concat([df_csv_2012, df_year], axis='columns')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# FILL THE 'Year' COLUMN OF EACH DATAFRAME WITH THE YEAR ITS ROWS BELONG TO\n",
    "\n",
    "# First Excel file - sheet 1 and sheet 2\n",
    "df_excel1_sheet_1 = df_excel1_sheet_1.assign(Year=2005)\n",
    "df_excel1_sheet_2 = df_excel1_sheet_2.assign(Year=2006)\n",
    "\n",
    "# JSON file\n",
    "df_json = df_json.assign(Year=2007)\n",
    "\n",
    "# Text file - 2008 and 2009\n",
    "df_txt_2008 = df_txt_2008.assign(Year=2008)\n",
    "df_txt_2009 = df_txt_2009.assign(Year=2009)\n",
    "\n",
    "# Second Excel file - killed and injured\n",
    "# Nothing to do here: the value '2010' was already copied down this column by hand when the Excel file was re-worked.\n",
    "# It could have been done the same way as for the other dataframes.\n",
    "\n",
    "# CSV file - 2011 and 2012\n",
    "df_csv_2011 = df_csv_2011.assign(Year=2011)\n",
    "df_csv_2012 = df_csv_2012.assign(Year=2012)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Casualty</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [Casualty]\n",
       "Index: []"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build an empty dataframe (no rows) whose only column is 'Casualty'\n",
    "\n",
    "casualty_header = ['Casualty']\n",
    "df_Casualty = pd.DataFrame(columns=casualty_header)\n",
    "df_Casualty.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Append the (still empty) 'Casualty' column to the two dataframes read from the second Excel file\n",
    "\n",
    "df_excel2_killed = pd.concat([df_excel2_killed, df_Casualty], axis='columns')\n",
    "df_excel2_injured = pd.concat([df_excel2_injured, df_Casualty], axis='columns')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Label every row of the two second-Excel-file dataframes with its casualty type\n",
    "\n",
    "df_excel2_killed = df_excel2_killed.assign(Casualty='Killed')\n",
    "df_excel2_injured = df_excel2_injured.assign(Casualty='Injured')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# RENAME THE COLUMN THAT HOLDS THE NUMBER OF CASUALTIES TO 'Total'\n",
    "# Every rename returns a new dataframe that is assigned back to the same name (no inplace=True),\n",
    "# so all dataframes in this cell are handled the same way.\n",
    "\n",
    "# First Excel file - sheet 1 and sheet 2 (these two renames look up integer column labels)\n",
    "df_excel1_sheet_1 = df_excel1_sheet_1.rename(columns={2005: 'Total'})\n",
    "df_excel1_sheet_2 = df_excel1_sheet_2.rename(columns={2006: 'Total'})\n",
    "\n",
    "# JSON file (string column label)\n",
    "df_json = df_json.rename(columns={'2007': 'Total'})\n",
    "\n",
    "# Text file - 2008 and 2009\n",
    "df_txt_2008 = df_txt_2008.rename(columns={'2008': 'Total'})\n",
    "df_txt_2009 = df_txt_2009.rename(columns={'2009': 'Total'})\n",
    "\n",
    "# Second Excel file - killed and injured\n",
    "# 'Cassualties' is misspelled in the source file's header, so the key must keep that spelling\n",
    "df_excel2_killed = df_excel2_killed.rename(columns={'Killed Casualties (Number)': 'Total'})\n",
    "df_excel2_injured = df_excel2_injured.rename(columns={'Injured Cassualties (Number)': 'Total'})\n",
    "\n",
    "# CSV file - 2011 and 2012\n",
    "df_csv_2011 = df_csv_2011.rename(columns={'2011': 'Total'})\n",
    "df_csv_2012 = df_csv_2012.rename(columns={'2012': 'Total'})\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the 'Total' column of the second-Excel-file 'injured' dataframe to integer type\n",
    "\n",
    "df_excel2_injured = df_excel2_injured.astype({'Total': int})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# STACK THE DATAFRAMES THAT CAME FROM THE SAME SOURCE FILE (rows appended, index renumbered)\n",
    "\n",
    "# First Excel file - sheet 1 followed by sheet 2\n",
    "excel1_parts = [df_excel1_sheet_1, df_excel1_sheet_2]\n",
    "df_excel1 = pd.concat(excel1_parts, axis='index', ignore_index=True)\n",
    "\n",
    "# Text file - 2008 followed by 2009\n",
    "txt_parts = [df_txt_2008, df_txt_2009]\n",
    "df_txt = pd.concat(txt_parts, axis='index', ignore_index=True)\n",
    "\n",
    "# Second Excel file - injured followed by killed\n",
    "excel2_parts = [df_excel2_injured, df_excel2_killed]\n",
    "df_excel2 = pd.concat(excel2_parts, axis='index', ignore_index=True)\n",
    "\n",
    "# CSV file - 2011 followed by 2012\n",
    "csv_parts = [df_csv_2011, df_csv_2012]\n",
    "df_csv = pd.concat(csv_parts, axis='index', ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# PUT THE COLUMNS OF EVERY DATAFRAME INTO THE SAME ORDER\n",
    "\n",
    "# Target column layout shared by all five dataframes\n",
    "column_order = ['Year', 'Age', 'Sex', 'Participant', 'Casualty', 'Total']\n",
    "\n",
    "df_excel1 = df_excel1.reindex(columns=column_order)  # first Excel file\n",
    "df_json = df_json.reindex(columns=column_order)  # JSON file\n",
    "df_txt = df_txt.reindex(columns=column_order)  # text file\n",
    "df_excel2 = df_excel2.reindex(columns=column_order)  # second Excel file\n",
    "df_csv = df_csv.reindex(columns=column_order)  # CSV file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# STACK ALL DATAFRAMES INTO ONE\n",
    "# The pieces are combined step by step; the final row order is:\n",
    "# first Excel file, text file, second Excel file, CSV file, JSON file.\n",
    "\n",
    "excel1_and_txt = pd.concat([df_excel1, df_txt], axis='index', ignore_index=True)\n",
    "\n",
    "excel2_and_csv = pd.concat([df_excel2, df_csv], axis='index', ignore_index=True)\n",
    "\n",
    "excel2_csv_and_json = pd.concat([excel2_and_csv, df_json], axis='index', ignore_index=True)\n",
    "\n",
    "df = pd.concat([excel1_and_txt, excel2_csv_and_json], axis='index', ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Zero-pad the single-digit age bands so that they sort correctly as text\n",
    "# (the replacement is applied to every cell of the dataframe that equals the unpadded label)\n",
    "age_band_padding = [\n",
    "    ('0 - 5 years', '00 - 05 years'),\n",
    "    ('6 - 9 years', '06 - 09 years'),\n",
    "]\n",
    "for unpadded, padded in age_band_padding:\n",
    "    df = df.replace(to_replace=unpadded, value=padded)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sort the rows on every column (direction given per column), then renumber the index from 0\n",
    "sort_columns = ['Year', 'Age', 'Sex', 'Participant', 'Casualty', 'Total']\n",
    "sort_ascending = [True, True, False, False, True, False]\n",
    "df = df.sort_values(by=sort_columns, ascending=sort_ascending).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export the dataframe as a CSV file to open in 'OpenRefine'\n",
    "# (index=True is the to_csv default: the row index is written as the first column of the file)\n",
    "processed_csv_path = '../data/processed/Processed_dataframe-csv.csv'\n",
    "df.to_csv(processed_csv_path, index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 2304 entries, 0 to 2303\n",
      "Data columns (total 6 columns):\n",
      "Year           2304 non-null int64\n",
      "Age            2304 non-null object\n",
      "Sex            2304 non-null object\n",
      "Participant    2304 non-null object\n",
      "Casualty       2304 non-null object\n",
      "Total          2304 non-null object\n",
      "dtypes: int64(1), object(5)\n",
      "memory usage: 108.1+ KB\n"
     ]
    }
   ],
   "source": [
    "# Summarise the final dataframe: number of rows, column dtypes and non-null counts\n",
    "df.info()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}