{ "cells": [ { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import feats\n", "import utils\n", "import constants\n", "import transactions\n", "\n", "import os\n", "import pickle\n", "import operator\n", "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns\n", "from imp import reload\n", "from matplotlib import pyplot as plt\n", "from statsmodels.tsa.api import VAR\n", "from scipy.spatial.distance import euclidean\n", "from sklearn.utils.extmath import cartesian\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "from pandas.tools.plotting import lag_plot, autocorrelation_plot" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "uo = tle.get_users_orders('prior')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "up_pair = uo[['user_id', 'product_id']].drop_duplicates()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "order_products_train = tle.get_orders_items('train')" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": true }, "outputs": [], "source": [ "order_products_prior = tle.get_orders_items('prior')\n", "orders = tle.get_orders()\n", "products = tle.get_items('products')\n", "aisles = tle.get_items('aisles')\n", "departments = tle.get_items('departments')" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": true }, "outputs": [], "source": [ "products_details = pd.merge(products, tle.craft_feat_product(), on = ['product_id'], how = 'right')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1 None订单 " ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "order_is_None = order_products_train.groupby(['order_id'])['reordered'].sum().reset_index()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.07015912631415824" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(order_is_None[order_is_None.reordered == 0]) / len(order_is_None[order_is_None.reordered > 0])" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": true }, "outputs": [], "source": [ "a = pd.merge(order_is_None, orders, how = 'left', on = ['order_id'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### prior、train订单" ] }, { "cell_type": "code", "execution_count": 65, "metadata": { "collapsed": true }, "outputs": [], "source": [ "order_products_all = pd.concat([order_products_prior, order_products_train], axis = 0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2 How many products do users buy each time\n", "- 每张订单的商品数目" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": true }, "outputs": [], "source": [ "grouped = order_products_prior.groupby(\"order_id\")[\"add_to_cart_order\"].aggregate(\"max\").reset_index()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 3.214874e+06\n", "mean 1.008888e+01\n", "std 7.525398e+00\n", "min 1.000000e+00\n", "25% 5.000000e+00\n", "50% 8.000000e+00\n", "75% 1.400000e+01\n", "max 1.450000e+02\n", "Name: add_to_cart_order, dtype: float64" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "grouped.add_to_cart_order.describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3 Do users purchase different numbers of products each time?\n", "- 用户每次购买的商品数目一样麽" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": true }, "outputs": [], "source": [ "grouped = pd.merge(grouped,\n", " orders,\n", " on = ['order_id'],\n", " how = 'left')[['user_id', 'add_to_cart_order', 'order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order']]" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": true }, "outputs": [], "source": [ "grouped = grouped.sort_values(['user_id', 'order_number'])" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": true }, "outputs": [], "source": [ "grouped.columns = ['user_id', \n", " 'num_products', \n", " 'order_number', \n", " 'order_dow',\n", " 'order_hour_of_day', \n", " 'days_since_prior_order']" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": true }, "outputs": [], "source": [ "user_num_product = grouped.groupby(['user_id'])['num_products'].agg({'mean':'mean', 'std':'std'})" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true, "scrolled": true }, "outputs": [], "source": [ "with open(DATA_DIR + 'user_num_product_stat.pkl', 'wb') as f:\n", " pickle.dump(user_num_product, f, pic)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "with open(constants.FEAT_DATA_DIR + 'user_num_product_stat.pkl', 'rb') as f:\n", " user_num_product = pickle.load(f)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 206209.000000\n", "mean 4.266349\n", "std 2.675061\n", "min 0.000000\n", "25% 2.345208\n", "50% 3.781534\n", "75% 5.609516\n", "max 44.747439\n", "Name: std, dtype: float64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "user_num_product['std'].describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4 Reorder Rate \n", "- 每张订单中重复购买商品比例" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": true }, "outputs": [], "source": [ "grouped = order_products_all.groupby(\"product_id\")[\"reordered\"].aggregate({'reorder_sum': sum,'reorder_total': 'count'}).reset_index()\n", "grouped['reorder_probability'] = grouped['reorder_sum'] / grouped['reorder_total']\n", "grouped = pd.merge(grouped, products[['product_id', 'product_name']], how='left', on=['product_id'])\n", "grouped = grouped[grouped.reorder_total > 75].sort_values(['reorder_probability'], ascending=False)[:10]" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": true }, "outputs": [], "source": [ "prior_reorder_rate = order_products_prior.groupby(['order_id'])['reordered'] \\\n", " .aggregate({'reorder_pnum':'sum', 'pnum':'count'})" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": true }, "outputs": [], "source": [ "prior_reorder_rate['reorder_rate'] = prior_reorder_rate['reorder_pnum'] / prior_reorder_rate['pnum']" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": true }, "outputs": [], "source": [ "prior_reorder_rate.reset_index(inplace=True)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": true }, "outputs": [], "source": [ "prior_orders = orders[orders.eval_set == 'prior']" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": true }, "outputs": [], "source": [ "prior_orders = pd.merge(prior_orders, prior_reorder_rate,\n", " on = ['order_id'], how = 'left')" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", " | order_id | \n", "user_id | \n", "eval_set | \n", "order_number | \n", "order_dow | \n", "order_hour_of_day | \n", "days_since_prior_order | \n", "days_up_to_last | \n", "pnum | \n", "reorder_pnum | \n", "reorder_rate | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "2539329 | \n", "1 | \n", "prior | \n", "1 | \n", "2 | \n", "8 | \n", "0.0 | \n", "190.0 | \n", "5 | \n", "0 | \n", "0.000 | \n", "
1 | \n", "2398795 | \n", "1 | \n", "prior | \n", "2 | \n", "3 | \n", "7 | \n", "15.0 | \n", "175.0 | \n", "6 | \n", "3 | \n", "0.500 | \n", "
2 | \n", "473747 | \n", "1 | \n", "prior | \n", "3 | \n", "3 | \n", "12 | \n", "21.0 | \n", "154.0 | \n", "5 | \n", "3 | \n", "0.600 | \n", "
3 | \n", "2254736 | \n", "1 | \n", "prior | \n", "4 | \n", "4 | \n", "7 | \n", "29.0 | \n", "125.0 | \n", "5 | \n", "5 | \n", "1.000 | \n", "
4 | \n", "431534 | \n", "1 | \n", "prior | \n", "5 | \n", "4 | \n", "15 | \n", "28.0 | \n", "97.0 | \n", "8 | \n", "5 | \n", "0.625 | \n", "
\n", " | user_id | \n", "order_number | \n", "product_id | \n", "days_up_to_last | \n", "p_purchase_interval | \n", "up_interval | \n", "
---|---|---|---|---|---|---|
24181266 | \n", "1 | \n", "10 | \n", "196 | \n", "14.0 | \n", "-1.0 | \n", "NaN | \n", "
21760446 | \n", "1 | \n", "9 | \n", "196 | \n", "44.0 | \n", "30.0 | \n", "NaN | \n", "
29474806 | \n", "1 | \n", "8 | \n", "196 | \n", "44.0 | \n", "0.0 | \n", "NaN | \n", "
5212927 | \n", "1 | \n", "7 | \n", "196 | \n", "58.0 | \n", "14.0 | \n", "NaN | \n", "
31927070 | \n", "1 | \n", "6 | \n", "196 | \n", "78.0 | \n", "20.0 | \n", "NaN | \n", "
4089398 | \n", "1 | \n", "5 | \n", "196 | \n", "97.0 | \n", "19.0 | \n", "NaN | \n", "
21376074 | \n", "1 | \n", "4 | \n", "196 | \n", "125.0 | \n", "28.0 | \n", "NaN | \n", "
4488095 | \n", "1 | \n", "3 | \n", "196 | \n", "154.0 | \n", "29.0 | \n", "NaN | \n", "
22742744 | \n", "1 | \n", "2 | \n", "196 | \n", "175.0 | \n", "21.0 | \n", "NaN | \n", "
24076664 | \n", "1 | \n", "1 | \n", "196 | \n", "190.0 | \n", "15.0 | \n", "NaN | \n", "
24181271 | \n", "1 | \n", "10 | \n", "10258 | \n", "14.0 | \n", "-1.0 | \n", "NaN | \n", "
21760447 | \n", "1 | \n", "9 | \n", "10258 | \n", "44.0 | \n", "30.0 | \n", "NaN | \n", "
29474807 | \n", "1 | \n", "8 | \n", "10258 | \n", "44.0 | \n", "0.0 | \n", "NaN | \n", "
5212928 | \n", "1 | \n", "7 | \n", "10258 | \n", "58.0 | \n", "14.0 | \n", "NaN | \n", "
31927072 | \n", "1 | \n", "6 | \n", "10258 | \n", "78.0 | \n", "20.0 | \n", "NaN | \n", "
4089400 | \n", "1 | \n", "5 | \n", "10258 | \n", "97.0 | \n", "19.0 | \n", "NaN | \n", "
21376076 | \n", "1 | \n", "4 | \n", "10258 | \n", "125.0 | \n", "28.0 | \n", "NaN | \n", "
4488097 | \n", "1 | \n", "3 | \n", "10258 | \n", "154.0 | \n", "29.0 | \n", "NaN | \n", "
22742745 | \n", "1 | \n", "2 | \n", "10258 | \n", "175.0 | \n", "21.0 | \n", "NaN | \n", "
4089402 | \n", "1 | \n", "5 | \n", "10326 | \n", "97.0 | \n", "-1.0 | \n", "NaN | \n", "
24181274 | \n", "1 | \n", "10 | \n", "12427 | \n", "14.0 | \n", "-1.0 | \n", "NaN | \n", "
21760448 | \n", "1 | \n", "9 | \n", "12427 | \n", "44.0 | \n", "30.0 | \n", "NaN | \n", "
29474805 | \n", "1 | \n", "8 | \n", "12427 | \n", "44.0 | \n", "0.0 | \n", "NaN | \n", "
5212929 | \n", "1 | \n", "7 | \n", "12427 | \n", "58.0 | \n", "14.0 | \n", "NaN | \n", "
31927071 | \n", "1 | \n", "6 | \n", "12427 | \n", "78.0 | \n", "20.0 | \n", "NaN | \n", "
4089399 | \n", "1 | \n", "5 | \n", "12427 | \n", "97.0 | \n", "19.0 | \n", "NaN | \n", "
21376075 | \n", "1 | \n", "4 | \n", "12427 | \n", "125.0 | \n", "28.0 | \n", "NaN | \n", "
4488096 | \n", "1 | \n", "3 | \n", "12427 | \n", "154.0 | \n", "29.0 | \n", "NaN | \n", "
22742746 | \n", "1 | \n", "2 | \n", "12427 | \n", "175.0 | \n", "21.0 | \n", "NaN | \n", "
24076666 | \n", "1 | \n", "1 | \n", "12427 | \n", "190.0 | \n", "15.0 | \n", "NaN | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
5934984 | \n", "206209 | \n", "12 | \n", "38167 | \n", "37.0 | \n", "-1.0 | \n", "NaN | \n", "
24260103 | \n", "206209 | \n", "9 | \n", "38167 | \n", "114.0 | \n", "77.0 | \n", "77.0 | \n", "
20186696 | \n", "206209 | \n", "8 | \n", "38167 | \n", "136.0 | \n", "22.0 | \n", "22.0 | \n", "
14617226 | \n", "206209 | \n", "3 | \n", "38167 | \n", "203.0 | \n", "67.0 | \n", "67.0 | \n", "
28236066 | \n", "206209 | \n", "13 | \n", "38730 | \n", "30.0 | \n", "-1.0 | \n", "NaN | \n", "
28236064 | \n", "206209 | \n", "13 | \n", "39216 | \n", "30.0 | \n", "-1.0 | \n", "NaN | \n", "
17584595 | \n", "206209 | \n", "11 | \n", "40310 | \n", "55.0 | \n", "-1.0 | \n", "NaN | \n", "
14617216 | \n", "206209 | \n", "3 | \n", "40396 | \n", "203.0 | \n", "-1.0 | \n", "NaN | \n", "
29908312 | \n", "206209 | \n", "1 | \n", "40396 | \n", "240.0 | \n", "37.0 | \n", "37.0 | \n", "
14617215 | \n", "206209 | \n", "3 | \n", "40534 | \n", "203.0 | \n", "-1.0 | \n", "NaN | \n", "
29908314 | \n", "206209 | \n", "1 | \n", "40534 | \n", "240.0 | \n", "37.0 | \n", "37.0 | \n", "
20186704 | \n", "206209 | \n", "8 | \n", "40992 | \n", "136.0 | \n", "-1.0 | \n", "NaN | \n", "
5219651 | \n", "206209 | \n", "7 | \n", "40992 | \n", "158.0 | \n", "22.0 | \n", "22.0 | \n", "
6521430 | \n", "206209 | \n", "4 | \n", "40992 | \n", "173.0 | \n", "15.0 | \n", "15.0 | \n", "
5934982 | \n", "206209 | \n", "12 | \n", "41213 | \n", "37.0 | \n", "-1.0 | \n", "NaN | \n", "
17584591 | \n", "206209 | \n", "11 | \n", "41213 | \n", "55.0 | \n", "18.0 | \n", "18.0 | \n", "
21489865 | \n", "206209 | \n", "10 | \n", "41213 | \n", "85.0 | \n", "30.0 | \n", "30.0 | \n", "
6521424 | \n", "206209 | \n", "4 | \n", "41213 | \n", "173.0 | \n", "88.0 | \n", "88.0 | \n", "
14617213 | \n", "206209 | \n", "3 | \n", "41213 | \n", "203.0 | \n", "30.0 | \n", "30.0 | \n", "
17910756 | \n", "206209 | \n", "2 | \n", "41213 | \n", "233.0 | \n", "30.0 | \n", "30.0 | \n", "
29908311 | \n", "206209 | \n", "1 | \n", "41213 | \n", "240.0 | \n", "7.0 | \n", "7.0 | \n", "
6521428 | \n", "206209 | \n", "4 | \n", "41665 | \n", "173.0 | \n", "-1.0 | \n", "NaN | \n", "
5934987 | \n", "206209 | \n", "12 | \n", "43961 | \n", "37.0 | \n", "-1.0 | \n", "NaN | \n", "
20186697 | \n", "206209 | \n", "8 | \n", "43961 | \n", "136.0 | \n", "99.0 | \n", "99.0 | \n", "
6521435 | \n", "206209 | \n", "4 | \n", "43961 | \n", "173.0 | \n", "37.0 | \n", "37.0 | \n", "
5219655 | \n", "206209 | \n", "7 | \n", "44325 | \n", "158.0 | \n", "-1.0 | \n", "NaN | \n", "
17584596 | \n", "206209 | \n", "11 | \n", "48370 | \n", "55.0 | \n", "-1.0 | \n", "NaN | \n", "
5219653 | \n", "206209 | \n", "7 | \n", "48697 | \n", "158.0 | \n", "-1.0 | \n", "NaN | \n", "
5934990 | \n", "206209 | \n", "12 | \n", "48742 | \n", "37.0 | \n", "-1.0 | \n", "NaN | \n", "
5219652 | \n", "206209 | \n", "7 | \n", "48742 | \n", "158.0 | \n", "121.0 | \n", "121.0 | \n", "
32434489 rows × 6 columns
\n", "