{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Marketing Campaign Study\n", "The following notebook contains a synthetic toy example of a marketing campaign to show the efficacy of treatment effect estimation." ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "import numpy as np \n", "import pandas as pd \n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "from sklearn.preprocessing import StandardScaler, minmax_scale\n", "\n", "plt.style.use(\"seaborn\")\n", "plt.rcParams[\"axes.labelsize\"] = 10\n", "plt.rcParams[\"figure.facecolor\"] = \"1\"\n", "plt.rcParams[\"grid.color\"] = \"black\"\n", "plt.rcParams[\"grid.linestyle\"] = \":\"\n", "plt.rcParams[\"grid.linewidth\"] = 0.5\n", "plt.rcParams[\"savefig.dpi\"] = 300\n", "plt.rcParams[\"figure.figsize\"] = (10,10)\n", "\n", "sns.set_style(\"whitegrid\")\n", "sns.set_context(\"poster\")\n", "\n", "\n", "NUM_BINS = 200\n", "CYAN = \"#4ECDC4\"\n", "BLUE = \"#59D2FE\"\n", "RED = \"#FF6B6B\"\n", "YELLOW = \"#FAA916\"\n", "GREY = \"#4A6670\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Cleaning Data - Run Once\n", "Before we can use the banking dataset we clean and prepare the data. That is, we transform categorical features to one-hot encoding and then remove everything we don't use for the sake of simplicity. \n", "The preprocessing functions are taken from [Kaggle](https://www.kaggle.com/aleksandradeis/bank-marketing-analysis)." ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [], "source": [ "def get_dummy_from_bool(row, column_name):\n", " \"\"\"Returns 0 if value in column_name is no, returns 1 if value in\n", " column_name is yes\"\"\"\n", " return 1 if row[column_name] == \"yes\" else 0\n", "\n", "\n", "def get_correct_values(row, column_name, threshold, df):\n", " \"\"\" Returns mean value if value in column_name is above threshold\"\"\"\n", " if row[column_name] <= threshold:\n", " return row[column_name]\n", " else:\n", " mean = df[df[column_name] <= threshold][column_name].mean()\n", " return mean\n", "\n", "\n", "def clean_data(df):\n", " \"\"\"\n", " INPUT\n", " df - pandas dataframe containing bank marketing campaign dataset\n", "\n", " OUTPUT\n", " df - cleaned dataset:\n", " 1. columns with 'yes' and 'no' values are converted into boolean variables;\n", " 2. categorical columns are converted into dummy variables;\n", " 3. drop irrelevant columns.\n", " 4. impute incorrect values\n", " \"\"\"\n", "\n", " cleaned_df = df.copy()\n", "\n", " # convert columns containing 'yes' and 'no' values to boolean variables\n", " # and drop original columns\n", " bool_columns = [\"default\", \"housing\", \"loan\", \"y\"]\n", " for bool_col in bool_columns:\n", " cleaned_df[bool_col + \"_bool\"] = df.apply(\n", " lambda row: get_dummy_from_bool(row, bool_col), axis=1\n", " )\n", "\n", " cleaned_df = cleaned_df.drop(columns=bool_columns)\n", "\n", " # convert categorical columns to dummies\n", " cat_columns = [\"job\", \"marital\", \"education\", \"contact\", \"month\", \"poutcome\"]\n", "\n", " for col in cat_columns:\n", " cleaned_df = pd.concat(\n", " [\n", " cleaned_df.drop(col, axis=1),\n", " pd.get_dummies(\n", " cleaned_df[col],\n", " prefix=col,\n", " prefix_sep=\"_\",\n", " drop_first=True,\n", " dummy_na=False,\n", " ),\n", " ],\n", " axis=1,\n", " )\n", "\n", " # drop irrelevant columns\n", " cleaned_df = cleaned_df.drop(columns=[\"pdays\"])\n", "\n", " # impute incorrect values and drop original columns\n", " cleaned_df[\"campaign_cleaned\"] = df.apply(\n", " lambda row: get_correct_values(row, \"campaign\", 34, cleaned_df), axis=1\n", " )\n", " cleaned_df[\"previous_cleaned\"] = df.apply(\n", " lambda row: get_correct_values(row, \"previous\", 34, cleaned_df), axis=1\n", " )\n", "\n", " cleaned_df = cleaned_df.drop(columns=[\"campaign\", \"previous\"])\n", "\n", " return cleaned_df\n", "\n", "# Clean data and write to clean file \n", "del_col = ['y_bool', 'default_bool', 'loan_bool', 'housing_bool', 'day','job_housemaid', 'job_retired', 'job_self-employed', 'job_student', 'job_technician', 'job_unknown', 'marital_single', 'contact_telephone']\n", "\n", "data = pd.read_csv(\"../datasets/banking/bank.csv\", sep=';')\n", "cleaned_data = clean_data(data)\n", "data = cleaned_data.iloc[:, :25].drop(columns=del_col)\n", "data.to_csv(\"../datasets/banking/clean.csv\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Import cleaned data \n", "We've preprocessed the dataset. We: 1) transform boolean features into true booleans, 2) transform categorical features into indicators, 3) drop irrelevant features and 4) imputed missing or incorrect values\n" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['age', 'balance', 'duration', 'job_blue-collar', 'job_entrepreneur',\n", " 'job_management', 'job_services', 'job_unemployed', 'marital_married',\n", " 'education_secondary', 'education_tertiary', 'education_unknown'],\n", " dtype='object')" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.read_csv(\"../datasets/banking/clean.csv\").drop(columns=['Unnamed: 0'])\n", "data.columns" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | age | \n", "balance | \n", "duration | \n", "job_blue-collar | \n", "job_entrepreneur | \n", "job_management | \n", "job_services | \n", "job_unemployed | \n", "marital_married | \n", "education_secondary | \n", "education_tertiary | \n", "education_unknown | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "58 | \n", "2143 | \n", "261 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "1 | \n", "0 | \n", "
1 | \n", "44 | \n", "29 | \n", "151 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "
2 | \n", "33 | \n", "2 | \n", "76 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "
3 | \n", "47 | \n", "1506 | \n", "92 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "
4 | \n", "33 | \n", "1 | \n", "198 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "