{ "cells": [ { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Standard library\n", "import os\n", "import pickle\n", "import operator\n", "from importlib import reload  # `imp` is deprecated and was removed in Python 3.12\n", "\n", "# Third-party\n", "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns\n", "from matplotlib import pyplot as plt\n", "from statsmodels.tsa.api import VAR\n", "from scipy.spatial.distance import euclidean\n", "from sklearn.utils.extmath import cartesian\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "try:\n", "    from pandas.plotting import lag_plot, autocorrelation_plot\n", "except ImportError:  # older pandas only provides pandas.tools.plotting\n", "    from pandas.tools.plotting import lag_plot, autocorrelation_plot\n", "\n", "# Project-local modules\n", "import feats\n", "import utils\n", "import constants\n", "import transactions" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Create the extractor before its first use so the notebook runs top to bottom on a fresh kernel.\n", "tle = transactions.TransLogExtractor(constants.RAW_DATA_DIR, constants.FEAT_DATA_DIR)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "uo = tle.get_users_orders('prior')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Distinct (user, product) pairs seen in the prior orders\n", "up_pair = uo[['user_id', 'product_id']].drop_duplicates()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "order_products_train = tle.get_orders_items('train')" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": true }, "outputs": [], "source": [ "order_products_prior = tle.get_orders_items('prior')\n", "orders = tle.get_orders()\n", "products = tle.get_items('products')\n", "aisles = tle.get_items('aisles')\n", "departments = tle.get_items('departments')" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": true }, "outputs": [], "source": [ "products_details = pd.merge(products, tle.craft_feat_product(), on = ['product_id'], how = 'right')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1 None订单 " ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Number of reordered items per train order; 0 means the order has no reordered item\n", "order_is_None = order_products_train.groupby(['order_id'])['reordered'].sum().reset_index()" ] 
}, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.07015912631415824" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Ratio of train orders without any reordered item to orders with at least one\n", "none_orders = order_is_None[order_is_None.reordered == 0]\n", "reorder_orders = order_is_None[order_is_None.reordered > 0]\n", "len(none_orders) / len(reorder_orders)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Attach the order attributes to the per-order reorder counts\n", "a = order_is_None.merge(orders, on = ['order_id'], how = 'left')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### prior、train订单" ] }, { "cell_type": "code", "execution_count": 65, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Stack prior and train order lines row-wise\n", "order_products_all = pd.concat([order_products_prior, order_products_train])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2 How many products do users buy each time\n", "- 每张订单的商品数目" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# The highest add_to_cart_order within an order is used as its basket size\n", "grouped = order_products_prior.groupby(\"order_id\")[\"add_to_cart_order\"].max().reset_index()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 3.214874e+06\n", "mean 1.008888e+01\n", "std 7.525398e+00\n", "min 1.000000e+00\n", "25% 5.000000e+00\n", "50% 8.000000e+00\n", "75% 1.400000e+01\n", "max 1.450000e+02\n", "Name: add_to_cart_order, dtype: float64" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "grouped[\"add_to_cart_order\"].describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3 Do users purchase different numbers of products each time?\n", "- 用户每次购买的商品数目一样麽" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": true }, "outputs": [], "source": [ "grouped = pd.merge(grouped,\n", " orders,\n", " on = ['order_id'],\n", " how = 'left')[['user_id', 'add_to_cart_order', 'order_number', 
'order_dow', 'order_hour_of_day', 'days_since_prior_order']]" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": true }, "outputs": [], "source": [ "grouped = grouped.sort_values(['user_id', 'order_number'])" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Rename by column name rather than by position so the cell can be re-run safely.\n", "grouped = grouped.rename(columns={'add_to_cart_order': 'num_products'})" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Per-user mean and std of basket size. A list of functions yields columns named\n", "# 'mean' and 'std'; the dict-renaming form of agg was removed in pandas 1.0.\n", "user_num_product = grouped.groupby(['user_id'])['num_products'].agg(['mean', 'std'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true, "scrolled": true }, "outputs": [], "source": [ "# Write to the same directory the next cell loads from.\n", "with open(constants.FEAT_DATA_DIR + 'user_num_product_stat.pkl', 'wb') as f:\n", "    pickle.dump(user_num_product, f, pickle.HIGHEST_PROTOCOL)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "with open(constants.FEAT_DATA_DIR + 'user_num_product_stat.pkl', 'rb') as f:\n", " user_num_product = pickle.load(f)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 206209.000000\n", "mean 4.266349\n", "std 2.675061\n", "min 0.000000\n", "25% 2.345208\n", "50% 3.781534\n", "75% 5.609516\n", "max 44.747439\n", "Name: std, dtype: float64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "user_num_product['std'].describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4 Reorder Rate \n", "- 每张订单中重复购买商品比例" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": true }, "outputs": [], "source": [ "grouped = order_products_all.groupby(\"product_id\")[\"reordered\"].agg(['sum', 'count']).rename(columns={'sum': 'reorder_sum', 'count': 'reorder_total'}).reset_index()\n", "grouped['reorder_probability'] 
= grouped['reorder_sum'] / grouped['reorder_total']\n", "grouped = pd.merge(grouped, products[['product_id', 'product_name']], how='left', on=['product_id'])\n", "grouped = grouped[grouped.reorder_total > 75].sort_values(['reorder_probability'], ascending=False)[:10]" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Per prior order: pnum = number of order lines, reorder_pnum = number of reordered lines.\n", "# List-agg plus rename replaces the dict-renaming form of aggregate removed in pandas 1.0;\n", "# reset_index is applied here so that no later cell has to mutate the frame in place.\n", "prior_reorder_rate = order_products_prior.groupby(['order_id'])['reordered'] \\\n", "    .agg(['count', 'sum']) \\\n", "    .rename(columns={'count': 'pnum', 'sum': 'reorder_pnum'}) \\\n", "    .reset_index()" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": true }, "outputs": [], "source": [ "prior_reorder_rate['reorder_rate'] = prior_reorder_rate['reorder_pnum'] / prior_reorder_rate['pnum']" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# order_id is already a regular column at this point; just check the merge key is unique.\n", "assert prior_reorder_rate['order_id'].is_unique" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": true }, "outputs": [], "source": [ "prior_orders = orders[orders.eval_set == 'prior']" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Start from `orders` again rather than from `prior_orders`, so re-running this cell\n", "# does not merge the reorder columns a second time.\n", "prior_orders = pd.merge(orders[orders.eval_set == 'prior'], prior_reorder_rate,\n", "                        on = ['order_id'], how = 'left')" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
order_iduser_ideval_setorder_numberorder_doworder_hour_of_daydays_since_prior_orderdays_up_to_lastpnumreorder_pnumreorder_rate
025393291prior1280.0190.0500.000
123987951prior23715.0175.0630.500
24737471prior331221.0154.0530.600
322547361prior44729.0125.0551.000
44315341prior541528.097.0850.625
\n", "
" ], "text/plain": [ " order_id user_id eval_set order_number order_dow order_hour_of_day \\\n", "0 2539329 1 prior 1 2 8 \n", "1 2398795 1 prior 2 3 7 \n", "2 473747 1 prior 3 3 12 \n", "3 2254736 1 prior 4 4 7 \n", "4 431534 1 prior 5 4 15 \n", "\n", " days_since_prior_order days_up_to_last pnum reorder_pnum reorder_rate \n", "0 0.0 190.0 5 0 0.000 \n", "1 15.0 175.0 6 3 0.500 \n", "2 21.0 154.0 5 3 0.600 \n", "3 29.0 125.0 5 5 1.000 \n", "4 28.0 97.0 8 5 0.625 " ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "prior_orders.head(5)" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": true }, "outputs": [], "source": [ "user_reorder_est = prior_orders.groupby(['user_id'])['reorder_pnum']\\\n", " .aggregate({'reorder_pnum_mean':'mean', \n", " 'reorder_pnum_std':'std'}).reset_index()" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": true }, "outputs": [], "source": [ "user_reorder_est = user_reorder_est[['user_id', 'reorder_pnum_mean', 'reorder_pnum_std']]" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": true }, "outputs": [], "source": [ "with open(constants.FEAT_DATA_DIR + 'user_reorder_est.pkl', 'wb') as f:\n", " pickle.dump(user_reorder_est, f, pickle.HIGHEST_PROTOCOL)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "with open(constants.FEAT_DATA_DIR + 'user_reorder_est.pkl', 'rb') as f:\n", " user_reorder_est = pickle.load(f)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 206209.000000\n", "mean 3.018932\n", "std 2.104826\n", "min 0.000000\n", "25% 1.511858\n", "50% 2.563480\n", "75% 4.029652\n", "max 31.210495\n", "Name: reorder_pnum_std, dtype: float64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "user_reorder_est.reorder_pnum_std.describe()" ] }, { 
"cell_type": "markdown", "metadata": {}, "source": [ "## 5 Products User Bought Previously" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "users_products = pd.merge(prior_orders, order_products_prior, on = ['order_id'], how = 'left')" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": true }, "outputs": [], "source": [ "users_products = users_products.groupby(['user_id'])['product_id'].apply(list).reset_index()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": true, "scrolled": true }, "outputs": [], "source": [ "with open(DATA_DIR + 'user_product.pkl', 'wb') as f:\n", " pickle.dump(users_products, f, pickle.HIGHEST_PROTOCOL)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "with open(constants.FEAT_DATA_DIR + 'user_product.pkl', 'rb') as f:\n", " users_products = pickle.load(f)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "l = users_products.product_id.apply(len)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 206209.000000\n", "mean 157.289396\n", "std 204.208233\n", "min 3.000000\n", "25% 39.000000\n", "50% 83.000000\n", "75% 188.000000\n", "max 3725.000000\n", "Name: product_id, dtype: float64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "l.describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 6 Candidate Products\n", "- last purchase\n", "- reorder items\n", "- all items that has high reorder rate\n", "- items that are added to cart first" ] }, { "cell_type": "code", "execution_count": 394, "metadata": { "collapsed": true }, "outputs": [], "source": [ "grouped = order_products_all.groupby(\"product_id\")[\"reordered\"].aggregate({'reorder_sum': sum,'reorder_total': 
'count'}).reset_index()\n", "grouped['reorder_probability'] = grouped['reorder_sum'] / grouped['reorder_total']" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "## 7 Time of orders" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/lib/python3.5/dist-packages/matplotlib/font_manager.py:1297: UserWarning: findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans\n", " (prop.get_family(), self.defaultFamily[fontext]))\n" ] }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAfkAAAFLCAYAAADVtPWaAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3X9wVGWe7/FPk0wsxpAfYNIRNsUMLI6U8qOm5CIQk7WZ\nTpCmTcBkXRy9EuW6AiMTM4tFoAYQENSihEGqXDJZld0FSogkmSWOBJo1PxRk1GEzOOAO43RNuEM6\nbAwJECAknvsHRV8YEjo5aSU8eb+qrApPzvM9345P+tPn9OkTh2VZlgAAgHEG3OwGAADAN4OQBwDA\nUIQ8AACGIuQBADAUIQ8AgKEIeQAADBUy5C9evKjs7Gw9/PDD8ng82rhxoySprq5OOTk5crvdysvL\nU1tbmySpra1NeXl5crvdysnJ0YkTJ4K1Nm/eLLfbrYyMDFVXVwfHq6qqlJGRIbfbrcLCwuB4V/sA\nAAChOUJ9Tt6yLLW2tur222/XpUuX9Nhjj2np0qV66623lJ6eLo/Ho2XLlunuu+/WY489pq1bt+qL\nL77QypUrVV5err1792rDhg06fvy48vPzVVxcrEAgoNzcXO3Zs0eSlJGRobfeektOp1PZ2dl67bXX\n9Ld/+7f66U9/2uk+buTUqTPh++kAANDHJSQM6vJ7IY/kHQ6Hbr/9dklSe3u72tvb5XA4dPDgQWVk\nZEiSZs6cKZ/PJ0nav3+/Zs6cKelyeB84cECWZcnn88nj8SgqKkrJyckaPny4amtrVVtbq+HDhys5\nOVlRUVHyeDzy+XyyLKvLfQAAgNC69Z58R0eHMjMzNXnyZE2ePFnJycmKiYlRZGSkJCkpKUmBQECS\nFAgEdOedd0qSIiMjNWjQIDU1NSkQCCgpKSlY0+l0KhAIdDne1NTU5T4AAEBokd3ZKCIiQmVlZWpp\nadGCBQv05ZdfftN92RYf/11FRkbc7DYAALjpuhXyV8TExGjixIk6fPiwWlpa1N7ersjISNXX18vp\ndEq6fCR+8uRJJSUlqb29XWfOnFF8fLycTqfq6+uDtQKBQHBOZ+Px8fFd7uNGmppae/KQAAC4pfXq\nPfmvvvpKLS0tkqQLFy7oo48+0siRIzVx4sTghXMlJSVyuVySJJfLpZKSEknSnj17dP/998vhcMjl\ncqm8vFxtbW2qq6uT3+/X2LFjNWbMGPn9ftXV1amtrU3l5eVyuVxyOBxd7gMAAIQW8ur6Y8eOafHi\nxero6JBlWZo2bZp+8pOfqK6uTs8//7yam5s1evRorVu3TlFRUbp48aIWLVqko0ePKjY2VuvXr1dy\ncrIk6Y033tC7776riIgILVmyRGlpaZKkyspKrVmzRh0dHXrkkUc0b94
8SepyHzfC1fUAgP7kRkfy\nIUP+VkPIAwD6k16drgcAALcmQh4AAEMR8gAAGIqQBwDAUIQ8AACGIuQBADBUj+54B6BvmVPzz7bn\nvp3ybBg7AdAXcSQPAIChCHkAAAxFyAMAYCjekwegOVXbbc99O3V2GDsBEE6EPADj/Z/KQ7bn/jLt\nf13z73lVf7Bd643UUbbnAnZwuh4AAEMR8gAAGIqQBwDAUIQ8AACG4sI7AH3WU5U+W/PeTJsa5k6A\nWxNH8gAAGIqQBwDAUIQ8AACGIuQBADAUF94BCKvcyl/ZnvtW2sNh7AQAIQ8AN8GqmtO25/48JS6M\nncBknK4HAMBQhDwAAIYi5AEAMBQhDwCAoQh5AAAMRcgDAGAoQh4AAEMR8gAAGIqQBwDAUIQ8AACG\nIuQBADAUIQ8AgKEIeQAADMVfoQO+ZU9+uML23C1T7M8F0P9wJA8AgKEIeQAADEXIAwBgKEIeAABD\nhQz5kydP6oknntD06dPl8Xi0ZcsWSdLrr7+uBx54QJmZmcrMzFRlZWVwzubNm+V2u5WRkaHq6urg\neFVVlTIyMuR2u1VYWBgcr6urU05Ojtxut/Ly8tTW1iZJamtrU15entxut3JycnTixImwPXAAAEwX\nMuQjIiK0ePFivffee3rnnXe0bds2HT9+XJI0Z84clZWVqaysTGlpaZKk48ePq7y8XOXl5SoqKtKL\nL76ojo4OdXR0aOXKlSoqKlJ5ebl2794drLNu3TrNmTNHe/fuVUxMjIqLiyVJO3fuVExMjPbu3as5\nc+Zo3bp139TPAQAA44QM+cTERN1zzz2SpOjoaI0YMUKBQKDL7X0+nzwej6KiopScnKzhw4ertrZW\ntbW1Gj58uJKTkxUVFSWPxyOfzyfLsnTw4EFlZGRIkmbOnCmfzydJ2r9/v2bOnClJysjI0IEDB2RZ\nVq8fNAAA/UGPPid/4sQJHT16VOPGjdNnn32mrVu3qrS0VPfee68WL16s2NhYBQIBjRs3LjjH6XQG\nXxQkJSVdM15bW6umpibFxMQoMjIyuM2V7QOBgO68887LjUZGatCgQWpqatLgwYO77DE+/ruKjIzo\nycMCbhkJCYP6XC16slvrdJjqAF3rdsifO3dOCxcu1JIlSxQdHa3Zs2dr/vz5cjgc+sUvfqGXX35Z\na9eu/SZ77Zamptab3QLwjTl16kyfq0VP336tcPaEW9+NXvR16+r6S5cuaeHChfJ6vUpPT5ck3XHH\nHYqIiNCAAQOUk5Oj3/3ud5IuH6HX19cH5wYCATmdzi7H4+Pj1dLSovb2dklSfX29nE5nsNbJkycl\nSe3t7Tpz5ozi4+N78tgBAOi3Qoa8ZVlaunSpRowYodzc3OB4Q0ND8Ot9+/Zp1KhRkiSXy6Xy8nK1\ntbWprq5Ofr9fY8eO1ZgxY+T3+1VXV6e2tjaVl5fL5XLJ4XBo4sSJ2rNnjySppKRELpcrWKukpESS\ntGfPHt1///1yOBzhe/QAABgs5On6Tz/9VGVlZbrrrruUmZkpScrPz9fu3bt17NgxSdKwYcO0cuVK\nSdKoUaP00EMPafr06YqIiNCyZcsUEXH5PfJly5Zp7ty56ujo0COPPBJ8YbBo0SI9//zz2rBhg0aP\nHq2cnBxJUnZ2thYtWiS3263Y2FitX78+/D8BAAAMFTLk77vvPn3xxRfXjV/5yFxn5s2bp3nz5nU6\np7N5ycnJwY/NXe22227Txo0bQ7UIAAA6wR3vAAAwFCEPAIChCHkAAAxFyAMAYChCHgAAQxHyAAAY\nipAHAMBQhDwAAIYi5AEAMBQhDwCAoQh5AAAMRcgDAGAoQh4AAEMR8gAAGIqQBwDAUCH/njwAoG8r\nqz5ve27mAwPD2An6Go7kAQAwFCEPAIChCHkAAAxFyAMAYChCHgAAQxHyAAAYipAHAMBQhDwAAIYi\n5AEAMBQhDwCAoQh5AAAMRcgDAGA
oQh4AAEMR8gAAGIqQBwDAUIQ8AACGIuQBADAUIQ8AgKEIeQAA\nDEXIAwBgKEIeAABDEfIAABiKkAcAwFCEPAAAhgoZ8idPntQTTzyh6dOny+PxaMuWLZKk06dPKzc3\nV+np6crNzVVzc7MkybIsrV69Wm63W16vV59//nmwVklJidLT05Wenq6SkpLg+JEjR+T1euV2u7V6\n9WpZlnXDfQAAgNBChnxERIQWL16s9957T++88462bdum48ePq7CwUJMmTVJFRYUmTZqkwsJCSVJV\nVZX8fr8qKiq0atUqrVixQtLlwN60aZN27NihnTt3atOmTcHQXrFihVatWqWKigr5/X5VVVVJUpf7\nAAAAoYUM+cTERN1zzz2SpOjoaI0YMUKBQEA+n09ZWVmSpKysLO3bt0+SguMOh0Pjx49XS0uLGhoa\nVFNToylTpiguLk6xsbGaMmWKqqur1dDQoLNnz2r8+PFyOBzKysqSz+e7ptZf7wMAAIQW2ZONT5w4\noaNHj2rcuHFqbGxUYmKiJCkhIUGNjY2SpEAgoKSkpOCcpKQkBQKB68adTmen41e2l9TlPm4kPv67\nioyM6MnDAm4ZCQmD+lwterJb63SY6kjS+TDWgkm6HfLnzp3TwoULtWTJEkVHR1/zPYfDIYfDEfbm\n7Oyjqan1G+0DuJlOnTrT52rR07dfqy/2hJvnRi/UunV1/aVLl7Rw4UJ5vV6lp6dLkoYMGaKGhgZJ\nUkNDgwYPHizp8hF6fX19cG59fb2cTud144FAoNPxK9vfaB8AACC0kCFvWZaWLl2qESNGKDc3Nzju\ncrlUWloqSSotLdXUqVOvGbcsS4cPH9agQYOUmJiolJQU1dTUqLm5Wc3NzaqpqVFKSooSExMVHR2t\nw4cPy7KsTmv99T4AAEBoIU/Xf/rppyorK9Ndd92lzMxMSVJ+fr6eeeYZ5eXlqbi4WEOHDtWGDRsk\nSWlpaaqsrJTb7dbAgQO1Zs0aSVJcXJzmz5+v7OxsSdKCBQsUFxcnSVq+fLkKCgp04cIFpaamKjU1\nVZK63AcAAAgtZMjfd999+uKLLzr93pXPzF/N4XBo+fLlnW6fnZ0dDPmrjRkzRrt3775uPD4+vtN9\nAACA0LjjHQAAhiLkAQAwFCEPAIChCHkAAAxFyAMAYChCHgAAQxHyAAAYipAHAMBQhDwAAIYi5AEA\nMBQhDwCAobr99+QBAOb7ZP9FW/Puc90W5k4QDhzJAwBgKEIeAABDEfIAABiKkAcAwFBceAd00ws1\nObbmvZqyM8ydAED3cCQPAIChCHkAAAxFyAMAYChCHgAAQxHyAAAYipAHAMBQhDwAAIYi5AEAMBQh\nDwCAoQh5AAAMRcgDAGAoQh4AAEMR8gAAGIqQBwDAUIQ8AACGIuQBADAUIQ8AgKEIeQAADEXIAwBg\nKEIeAABDEfIAABiKkAcAwFCEPAAAhgoZ8gUFBZo0aZJmzJgRHHv99df1wAMPKDMzU5mZmaqsrAx+\nb/PmzXK73crIyFB1dXVwvKqqShkZGXK73SosLAyO19XVKScnR263W3l5eWpra5MktbW1KS8vT263\nWzk5OTpx4kRYHjAAAP1FyJCfNWuWioqKrhufM2eOysrKVFZWprS0NEnS8ePHVV5ervLychUVFenF\nF19UR0eHOjo6tHLlShUVFam8vFy7d+/W8ePHJUnr1q3TnDlztHfvXsXExKi4uFiStHPnTsXExGjv\n3r2aM2eO1q1bF87HDQCA8UKG/IQJExQbG9utYj6fTx6PR1FRUUpOTtbw4cNVW1ur2tpaDR8+XMnJ\nyYqKipLH45HP55NlWTp48KAyMjIkSTNnzpTP55Mk7d+/XzNnzpQkZWRk6MCBA7Isy+7jBACg37H9\nnvzWrVvl9XpVUFCg5uZmSVIgEFBSUlJwG6fTqUAg0OV4U1OTYmJiFBkZKUlKSkpSIBAI1rrzzjsl\
nSZGRkRo0aJCamprstgsAQL8TaWfS7NmzNX/+fDkcDv3iF7/Qyy+/rLVr14a7N1vi47+ryMiIm90G\nEJSQMMjoWvRkt9bpMNWRpPNhrHUxTHXQF9gK+TvuuCP4dU5Ojp599llJl4/Q6+vrg98LBAJyOp2S\n1Ol4fHy8Wlpa1N7ersjISNXX1we3dzqdOnnypJKSktTe3q4zZ84oPj4+ZG9NTa12HhLwjTl16ozR\ntejp269lek/omRu9wLJ1ur6hoSH49b59+zRq1ChJksvlUnl5udra2lRXVye/36+xY8dqzJgx8vv9\nqqurU1tbm8rLy+VyueRwODRx4kTt2bNHklRSUiKXyxWsVVJSIknas2eP7r//fjkcDjvtAgDQL4U8\nks/Pz9ehQ4fU1NSk1NRUPffcczp06JCOHTsmSRo2bJhWrlwpSRo1apQeeughTZ8+XREREVq2bJki\nIi6fOl+2bJnmzp2rjo4OPfLII8EXBosWLdLzzz+vDRs2aPTo0crJyZEkZWdna9GiRXK73YqNjdX6\n9eu/kR8AAACmChnyr7322nVjV4K4M/PmzdO8efOuG09LSwt+1O5qycnJwY/NXe22227Txo0bQ7UH\nAAC6wB3vAAAwFCEPAIChCHkAAAxFyAMAYChbn5MHAOBG/Lvt36DnezMGhrGT/o0jeQAADEXIAwBg\nKEIeAABDEfIAABiKkAcAwFCEPAAAhiLkAQAwFCEPAIChCHkAAAxFyAMAYChCHgAAQxHyAAAYipAH\nAMBQhDwAAIYi5AEAMBQhDwCAoQh5AAAMRcgDAGAoQh4AAEMR8gAAGIqQBwDAUIQ8AACGIuQBADAU\nIQ8AgKEIeQAADEXIAwBgKEIeAABDEfIAABiKkAcAwFCEPAAAhiLkAQAwFCEPAIChCHkAAAxFyAMA\nYChCHgAAQ0Xe7AYAALiRc1vP2Z57+49vD2Mnt56QR/IFBQWaNGmSZsyYERw7ffq0cnNzlZ6ertzc\nXDU3N0uSLMvS6tWr5Xa75fV69fnnnwfnlJSUKD09Xenp6SopKQmOHzlyRF6vV263W6tXr5ZlWTfc\nBwAA6J6QIT9r1iwVFRVdM1ZYWKhJkyapoqJCkyZNUmFhoSSpqqpKfr9fFRUVWrVqlVasWCHpcmBv\n2rRJO3bs0M6dO7Vp06ZgaK9YsUKrVq1SRUWF/H6/qqqqbrgPAADQPSFDfsKECYqNjb1mzOfzKSsr\nS5KUlZWlffv2XTPucDg0fvx4tbS0qKGhQTU1NZoyZYri4uIUGxurKVOmqLq6Wg0NDTp79qzGjx8v\nh8OhrKws+Xy+G+4DAAB0j6335BsbG5WYmChJSkhIUGNjoyQpEAgoKSkpuF1SUpICgcB1406ns9Px\nK9vfaB+hxMd/V5GREXYeFvCNSEgYZHQterJb63SY6kjS+TDWuhiWOv4w9nRO9t+TD+f/v1tRry+8\nczgccjgc4eglLPtoamr9RnsBeurUqTNG16Knb78WPd2cWn3VjV7I2PoI3ZAhQ9TQ0CBJamho0ODB\ngyVdPkKvr68PbldfXy+n03ndeCAQ6HT8yvY32gcAAOgeWyHvcrlUWloqSSotLdXUqVOvGbcsS4cP\nH9agQYOUmJiolJQU1dTUqLm5Wc3NzaqpqVFKSooSExMVHR2tw4cPy7KsTmv99T4AAED3hDxdn5+f\nr0OHDqmpqUmpqal67rnn9MwzzygvL0/FxcUaOnSoNmzYIElKS0tTZWWl3G63Bg4cqDVr1kiS4uLi\nNH/+fGVnZ0uSFixYoLi4OEnS8uXLVVBQoAsXLig1NVWpqamS1OU+AABA94QM+ddee63T8S1btlw3\n5nA4tHz58k63z87ODob81caMGaPdu3dfNx4fH9/pPgAAQPdwW1sAAAxFyAMAYCjuXQ+jFe6//i2i\n7nrGVRzGTgDg28eRPAAAhiLkAQAwFCEPAIChCHkAAAxFyAMAY
ChCHgAAQxHyAAAYipAHAMBQhDwA\nAIYi5AEAMBQhDwCAoQh5AAAMRcgDAGAo/godAKBf6PjX/2t7bsT/HhbGTr49HMkDAGAoQh4AAEMR\n8gAAGIqQBwDAUIQ8AACGIuQBADAUIQ8AgKEIeQAADEXIAwBgKEIeAABDEfIAABiKkAcAwFCEPAAA\nhiLkAQAwFCEPAICh+HvyAAD00NfbjtieO+Cxe8PYSYh9fWt7AgAA3ypCHgAAQxHyAAAYipAHAMBQ\nhDwAAIYi5AEAMBQhDwCAoXoV8i6XS16vV5mZmZo1a5Yk6fTp08rNzVV6erpyc3PV3NwsSbIsS6tX\nr5bb7ZbX69Xnn38erFNSUqL09HSlp6erpKQkOH7kyBF5vV653W6tXr1almX1pl0AAPqVXh/Jb9my\nRWVlZdq1a5ckqbCwUJMmTVJFRYUmTZqkwsJCSVJVVZX8fr8qKiq0atUqrVixQtLlFwWbNm3Sjh07\ntHPnTm3atCn4wmDFihVatWqVKioq5Pf7VVVV1dt2AQDoN8J+ut7n8ykrK0uSlJWVpX379l0z7nA4\nNH78eLW0tKihoUE1NTWaMmWK4uLiFBsbqylTpqi6uloNDQ06e/asxo8fL4fDoaysLPl8vnC3CwCA\nsXp9W9unn35aDodDjz76qB599FE1NjYqMTFRkpSQkKDGxkZJUiAQUFJSUnBeUlKSAoHAdeNOp7PT\n8SvbhxIf/11FRkb09mEBSkgY1Kfq9NVa9GS31ukw1ZGk82GsdTEsdfxh7OmczoWlVr3tKtf3FDqN\nul/rm9SrkN++fbucTqcaGxuVm5urESNGXPN9h8Mhh8PRqwZ7qqmp9VvdH8x16tSZPlWnr9aip2+/\nFj19+7X6Yk9X3OhFQ69O1zudTknSkCFD5Ha7VVtbqyFDhqihoUGS1NDQoMGDBwe3ra///6+j6uvr\n5XQ6rxsPBAKdjl/ZHgAAdI/tkG9tbdXZs2eDX3/44YcaNWqUXC6XSktLJUmlpaWaOnWqJAXHLcvS\n4cOHNWjQICUmJiolJUU1NTVqbm5Wc3OzampqlJKSosTEREVHR+vw4cOyLOuaWgAAIDTbp+sbGxu1\nYMECSVJHR4dmzJih1NRUjRkzRnl5eSouLtbQoUO1YcMGSVJaWpoqKyvldrs1cOBArVmzRpIUFxen\n+fPnKzs7W5K0YMECxcXFSZKWL1+ugoICXbhwQampqUpNTe3VgwUAoD+xHfLJycn61a9+dd14fHy8\ntmzZct24w+HQ8uXLO62VnZ0dDPmrjRkzRrt377bbIgAA/Rp3vAMAwFCEPAAAhiLkAQAwFCEPAICh\nen3HOwAAYJ/1To2teY5HU0Juw5E8AACGIuQBADAUIQ8AgKEIeQAADEXIAwBgKEIeAABDEfIAABiK\nkAcAwFCEPAAAhuKOd+hzdr1//Z8d7q5Z04rD2AkA3No4kgcAwFCEPAAAhiLkAQAwFCEPAIChCHkA\nAAxFyAMAYChzP0JXXGZ/bnZm+PoAAOAm4UgeAABDEfIAABiKkAcAwFCEPAAAhiLkAQAwFCEPAICh\nCHkAAAxFyAMAYChCHgAAQxHyAAAYytzb2obJ18X/bHvugOxnw9gJAAA9w5E8AACG4kgeYbN/d47t\nua4ZO8PYCQBA4kgeAABjcST/LTq/Pdf23IGz3wpjJwCA/oAjeQAADMWR/C3Kv+sxW/O+N2vbNf/+\n7FeP2u7hhw+/Y3suAOCbx5E8AACG6vMhX1VVpYyMDLndbhUWFt7sdgAAuGX06ZDv6OjQypUrVVRU\npPLycu3evVvHjx+/2W0BAHBL6NMhX1tbq+HDhys5OVlRUVHyeDzy+Xw3uy0AAG4JfTrkA4GAkpKS\ngv92Op0KBAI3sSMAAG4dDsuyrJvdRFfef/99VVdX66WXXpIklZaWqra2VsuWLbvJnQEA0Pf16SN5\np9Op+vr64L8DgYCcTudN7
AgAgFtHnw75MWPGyO/3q66uTm1tbSovL5fL5brZbQEAcEvo0zfDiYyM\n1LJlyzR37lx1dHTokUce0ahRo252WwAA3BL69HvyAADAvj59uh4AANhHyAMAYKh+F/Lhuk1uQUGB\nJk2apBkzZvSqn5MnT+qJJ57Q9OnT5fF4tGXLFtu1Ll68qOzsbD388MPyeDzauHFjr3rr6OhQVlaW\n/vEf/7FXdVwul7xerzIzMzVr1izbdVpaWrRw4UJNmzZNDz30kH7729/aqvPll18qMzMz+N8Pf/hD\nvf3227b7evvtt+XxeDRjxgzl5+fr4sWLtups2bJFM2bMkMfj6XE/na3H06dPKzc3V+np6crNzVVz\nc7PtWr/+9a/l8Xh0991363e/+53tOq+88oqmTZsmr9erBQsWqKWlxXatDRs2BNfVU0891a17aNzo\n9/bNN9/UD37wA3311Ve2e3r99df1wAMPBNdWZWWl7VqS9G//9m+aNm2aPB6PXn31VVt18vLygv24\nXC5lZmba7uno0aP6+7//++Dvcm1tra06x44d06OPPiqv16tnn31WZ8+e7VZPXT1f9nStd1XHzjrv\nqlZP13pXdeys82tY/Uh7e7s1depU689//rN18eJFy+v1Wn/4wx9s1Tp06JB15MgRy+Px9KqnQCBg\nHTlyxLIsyzpz5oyVnp5uu6evv/7aOnv2rGVZltXW1mZlZ2dbv/3tb2339uabb1r5+fnWM888Y7uG\nZVnWgw8+aDU2NvaqhmVZ1gsvvGDt2LHDsizLunjxotXc3Nzrmu3t7dbkyZOtEydO2JpfX19vPfjg\ng9b58+cty7KshQsXWu+++26P63zxxReWx+OxWltbrUuXLllPPvmk5ff7uz2/s/X4yiuvWJs3b7Ys\ny7I2b95svfrqq7ZrHT9+3PrjH/9oPf7441Ztba3tOtXV1dalS5csy7KsV199tVc9nTlzJvj1li1b\nrJ///Oe26liWZf3lL3+xnnrqKevv/u7vur1WO6u1ceNGq6ioqFvzQ9U6cOCA9eSTT1oXL160LMuy\n/ud//sdWnautXbvWev311233lJuba33wwQeWZVnWBx98YD3++OO26syaNcv6+OOPLcuyrJ07d1rr\n16/vVk9dPV/2dK13VcfOOu+qVk/Xeld17Kzzq/WrI/lw3iZ3woQJio2N7XVPiYmJuueeeyRJ0dHR\nGjFihO27+jkcDt1+++2SpPb2drW3t8vhcNiqVV9frw8++EDZ2dm25ofbmTNn9Jvf/CbYT1RUlGJi\nYnpd98CBA0pOTtawYcNs1+jo6NCFCxfU3t6uCxcuKDExscc1/vjHP2rs2LEaOHCgIiMjNWHCBFVU\nVHR7fmfr0efzKSsrS5KUlZWlffv22a41cuRIjRgxotv9dFUnJSVFkZGXP9Qzfvz4a+6D0dNa0dHR\nwa/Pnz/frbXe1e/t2rVrtWjRoh79voTrOaCrWtu3b9czzzyjqKgoSdKQIUN61ZNlWfr1r3/d7bOP\nndVyOBw6d+6cpMu/k91Z653V8fv9mjBhgiRpypQp3V7rXT1f9nStd1XHzjrvqlZP13pXdeys86v1\nq5Dv67fJPXHihI4ePapx48bZrtHR0aHMzExNnjxZkydPtl1rzZo1WrRokQYMCM8SefrppzVr1iy9\n8469v0F/4sQJDR48WAUFBcrKytLSpUvV2tra677Ky8t79ZaL0+nUU089pQcffFApKSmKjo5WSkpK\nj+vcdddd+vTTT9XU1KTz58+rqqqq2wHYlcbGxuCTcEJCghobG3tVL9zeffddpaam9qrG+vXrlZaW\npv/4j//QT3/6U1s19u3bp8TERN1999296uWKrVu3yuv1qqCgoNtvkXTG7/frk08+UU5Ojh5//PFu\nnRq/kU/lKJnIAAAGB0lEQVQ++URDhgzR9773Pds1lixZoldffVVpaWl65ZVXlJ+fb6vOqFG
jggdY\n77//vk6ePNnjGlc/X/ZmrYfjeTdUrZ6u9b+u05t13q9Cvi87d+6cFi5cqCVLllzzyq2nIiIiVFZW\npsrKStXW1uq///u/e1zjP//zPzV48GDde++9tvu42vbt21VSUqJf/vKX2rp1q37zm9/0uEZ7e7t+\n//vfa/bs2SotLdXAgQN7/aeH29ratH//fk2bNs12jebmZvl8Pvl8PlVXV+v8+fMqKyvrcZ2RI0dq\n7ty5evrppzV37lzdfffdYXuBJV0+ArN7Vueb8MYbbygiIkIPP/xwr+o8//zzqqyslNfr1b//+7/3\neP758+e1efNm2y8Q/trs2bO1d+9elZWVKTExUS+//LLtWh0dHWpubtaOHTv0wgsvKC8vT1YvPvG8\ne/fuXl9DtH37dhUUFKiyslIFBQVaunSprTovvfSStm3bplmzZuncuXPBsxXddaPny56s9XA9796o\nVk/Xemd1erPO+1XI99Xb5F66dEkLFy6U1+tVenp6WGrGxMRo4sSJqq6u7vHczz77TPv375fL5VJ+\nfr4OHjyof/qnf7Ldy5Wf8ZAhQ+R2u20dkSQlJSkpKSn4ynbatGn6/e9/b7sn6fJFmPfcc4/uuOMO\n2zU++ugj/c3f/I0GDx6s73znO0pPT7d9QWBOTo527dqlrVu3KjY2tldHXNLln3dDQ4MkqaGhQYMH\nD+5VvXDZtWuXPvjgA61bty5sLzy8Xm+P3t644s9//rNOnDgRvCitvr5es2bN0qlTp2z1cccddygi\nIkIDBgxQTk5Oty/e6ozT6ZTb7ZbD4dDYsWM1YMAANTU12arV3t6uvXv3avr06bb7kaSSkpLgc9RD\nDz1k++zCyJEj9eabb2rXrl3yeDxKTk7u9tzOni/trPVwPu92Vaunaz1UT3bWeb8K+b54m1zLsrR0\n6VKNGDFCubm5var11VdfBa/gvHDhgj766KMev78kST/72c9UVVWl/fv367XXXtP999+vdevW2eqp\ntbU1eOVsa2urPvzwQ1t3LUxISFBSUpK+/PJLSZffSx85cqStnq4oLy+Xx+PpVY2hQ4fqv/7rv3T+\n/HlZltWrvq6cYvzLX/6iiooKeb3eXvXmcrlUWloq6fIfd5o6dWqv6oVDVVWVioqK9MYbb2jgwIG9\nquX3+4Nf+3w+W2v9Bz/4gQ4cOKD9+/dr//79SkpK0q5du5SQkGCrpytBI11+G6A3d+j80Y9+pI8/\n/liS9Kc//UmXLl1SfHy8rVpXnguufrvSjsTERB06dEiSdPDgQdsvRK+s9a+//lpvvPGG/uEf/qFb\n87p6vuzpWg/n825XtXq61ruq09t13u/ueFdZWak1a9YEb5M7b948W3Xy8/N16NAhNTU1aciQIXru\nueeUk5PT4zqffPKJfvzjH+uuu+4Knp7Nz89XWlpaj2sdO3ZMixcvVkdHhyzL0rRp0/STn/ykx3Wu\n9vHHH+vNN9/U5s2bbc2vq6vTggULJF0+/ThjxgzbP/OjR49q6dKlunTpkpKTk7V27VrbFz61trbq\nwQcf1L59+zRo0CBbNa7YuHGj3nvvPUVGRmr06NF66aWXenz6UZIee+wxnT59WpGRkcGPHXVXZ+vx\nRz/6kfLy8nTy5EkNHTpUGzZsUFxcnK1acXFxWrVqlb766ivFxMRo9OjR+pd/+Zce1yksLFRbW1uw\nj3HjxmnlypW2eqqqqtKf/vQnORwODRs2TC+++GLIM3Ohfm9dLpeKi4u7dSTYWa1Dhw7p2LFjkqRh\nw4Zp5cqV3bo4rbNamZmZWrJkiY4dO6bvfOc7euGFF0Kuia4e3+LFizVu3DjNnj07ZC83qvX9739f\na9asUXt7u2677TYtX7485Nt6ndVpbW3Vtm3bJElut1s/+9nPunWk29Xz5dixY3u01ruq09bW1uN1\n3lWt1atX92itd1WnuLi4x+v8av0u5AEA6C/61el6AAD
6E0IeAABDEfIAABiKkAcAwFCEPAAAhiLk\nAQAwFCEPAIChCHkAAAz1/wBvBrAMU6OQwgAAAABJRU5ErkJggg==\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Number of orders placed in each hour of the day\n", "grouped = orders.order_hour_of_day.value_counts()\n", "sns.set_style('darkgrid')\n", "# Pass x and y by keyword; positional data arguments are not accepted by newer seaborn.\n", "sns.barplot(x=grouped.index, y=grouped.values)\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 8 Topic Distance\n", "- user VS product prior中的所有(u,p)对\n", "- latest order VS product 通过LDA-transform来构造" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# term-frequency matrix construct\n", "# Reuse the `orders` frame loaded earlier instead of re-reading orders.csv: the re-read\n", "# overwrote `orders` and depended on DATA_DIR, which no visible cell defines.\n", "users_orders = pd.merge(order_products_prior, orders[['user_id', 'order_id']], \n", "                        on = ['order_id'], how = 'left')\n", "\n", "# NOTE(review): series_to_str is not defined in the visible cells -- confirm where it comes from.\n", "users_products_matrix = users_orders.groupby(['user_id'])['product_id'].apply(series_to_str)\n", "\n", "tf = CountVectorizer(analyzer = 'word', lowercase = False, max_df=0.95, min_df=2,)\n", "tf_matrix = tf.fit_transform(users_products_matrix.values)\n", "# get_feature_names() was removed in scikit-learn 1.2; use the replacement when it exists.\n", "tf_feature_names = tf.get_feature_names_out() if hasattr(tf, 'get_feature_names_out') else tf.get_feature_names()\n", "\n", "# NOTE(review): DATA_DIR is not defined in the visible cells -- define it before running.\n", "with open(DATA_DIR + 'tf.model', 'wb') as f:\n", "    pickle.dump(tf, f, pickle.HIGHEST_PROTOCOL)" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Order-level topics: tf is the fitted CountVectorizer that turns documents into a term-frequency matrix.\n", "# NOTE(review): `lda` is not defined in the visible cells -- presumably a fitted LDA model; confirm.\n", "op = order_products_prior.groupby(['order_id'])['product_id'].apply(series_to_str)\n", "order_topic_cols = [\"topic_%d\"%x for x in range(10)]\n", "topic_order = pd.DataFrame(lda.transform(tf.transform(op.values)), columns= order_topic_cols)\n", "topic_order['order_id'] = op.index.values\n", "# topic_order_norm was dumped below without ever being defined; build it explicitly by\n", "# scaling each order's topic weights to sum to 1 (a no-op if they are already normalised).\n", "topic_order_norm = topic_order.copy()\n", "topic_order_norm[order_topic_cols] = topic_order[order_topic_cols].div(topic_order[order_topic_cols].sum(axis = 1), axis = 0)\n", "with open(DATA_DIR + 'order_topic_norm.pkl', 'wb') as f:\n", "    pickle.dump(topic_order_norm, f, pickle.HIGHEST_PROTOCOL)" ] }, { "cell_type": "code", "execution_count": 65, "metadata": { "collapsed": true, "scrolled": true }, "outputs": [], "source": [ "up_distance = pd.merge(users_orders[['user_id', 
'product_id']].drop_duplicates(),\n", " user_topic, \n", " on = ['user_id'],\n", " how = 'left')\n", "up_distance.columns = ['user_id', 'product_id'] + [\"u_topic_%d\"%x for x in range(10)] \n", "up_distance = pd.merge(up_distance,\n", " topic_product, \n", " on = ['product_id'],\n", " how = 'left')\n", "up_distance.columns = ['user_id', 'product_id'] + [\"u_topic_%d\"%x for x in range(10)] + [\"p_topic_%d\"%x for x in range(10)] " ] }, { "cell_type": "code", "execution_count": 87, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def cal_up_distance(subf):\n", "    '''\n", "    Euclidean distance between the user and product topic vectors of one row.\n", "\n", "    subf : row-like object indexable by the labels u_topic_0..u_topic_9 and\n", "           p_topic_0..p_topic_9.\n", "    Returns the value of scipy's euclidean() for the two 10-element selections.\n", "    '''\n", "    u_topic = subf[[\"u_topic_%d\"%x for x in range(10)]]\n", "    p_topic = subf[[\"p_topic_%d\"%x for x in range(10)]]\n", "    upd = euclidean(u_topic, p_topic)\n", "    return upd" ] }, { "cell_type": "code", "execution_count": 92, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 3h 40min 56s, sys: 2min 4s, total: 3h 43min\n", "Wall time: 3h 49min 41s\n" ] } ], "source": [ "# Vectorised equivalent of up_distance.apply(cal_up_distance, axis = 1).\n", "# The timing recorded with this cell comes from that row-wise version (about 3h50m wall time).\n", "u_topics = up_distance[[\"u_topic_%d\"%x for x in range(10)]].values\n", "p_topics = up_distance[[\"p_topic_%d\"%x for x in range(10)]].values\n", "up_distance['up_dis'] = np.linalg.norm(u_topics - p_topics, axis = 1)" ] }, { "cell_type": "code", "execution_count": 94, "metadata": { "collapsed": true }, "outputs": [], "source": [ "up_distance = up_distance[['user_id', 'product_id', 'up_dis']]\n", "# NOTE(review): DATA_DIR is not defined in the visible cells -- define it before running.\n", "with open(DATA_DIR + 'upd_feat.pkl', 'wb') as f:\n", "    pickle.dump(up_distance, f, pickle.HIGHEST_PROTOCOL)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 9 Order Topic Construct\n", "- countvector, lda transform\n", "- 由商品的Topic构造订单的Topic表达\n", "- 商品加入购物车的次序??? 先忽视次序\n", "- 每个用户学习:加购物车次序 VS 重购? VS下张订单的Topic??" 
] }, { "cell_type": "code", "execution_count": 309, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# The inner join drops order lines whose product is missing from topic_product\n", "# (the original note calls this throwing away stop words).\n", "# Kept under its own name so the aggregation in the next cell can be re-run.\n", "order_product_topics = pd.merge(order_products_prior[['order_id', 'product_id']],\n", "                                topic_product,\n", "                                on = ['product_id'],\n", "                                how = 'inner')" ] }, { "cell_type": "code", "execution_count": 312, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Sum the topic weights of all products in each order\n", "order_topic = order_product_topics.groupby(['order_id'])[[\"topic_%d\"%x for x in range(10)]].sum().reset_index()" ] }, { "cell_type": "code", "execution_count": 314, "metadata": { "collapsed": true }, "outputs": [], "source": [ "unorm = order_topic[[\"topic_%d\"%x for x in range(10)]].values" ] }, { "cell_type": "code", "execution_count": 315, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Scale each order's topic weights so that they sum to 1\n", "order_topic[[\"topic_%d\"%x for x in range(10)]] = unorm / unorm.sum(axis = 1)[:,np.newaxis]" ] }, { "cell_type": "code", "execution_count": 301, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "49677" ] }, "execution_count": 301, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(order_products_prior.product_id.unique())" ] }, { "cell_type": "code", "execution_count": 302, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "49502" ] }, "execution_count": 302, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(topic_product.product_id.unique())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 10 XGBoost Feature Preparation\n", "- 正负样本10:1" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import constants, utils, transactions, feats\n", "from importlib import reload  # `imp` is deprecated and was removed in Python 3.12" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true, "scrolled": true }, "outputs": [], "source": [ "tle = transactions.TransLogExtractor(constants.RAW_DATA_DIR, constants.FEAT_DATA_DIR)" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { 
"name": "stdout", "output_type": "stream", "text": [ "13307953\n" ] } ], "source": [ "train_none = feats.make_train_or_test_none(tle, 'train')" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "13307953\n" ] } ], "source": [ "test_none = feats.make_train_or_test_none(tle, 'test')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "13307953\n" ] } ], "source": [ "train = feats.make_train_or_test(tle, 'train')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "utils.check_inf_nan(train[up_cols])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "utils.check_inf_nan(train[ua_cols])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "utils.check_inf_nan(train[ud_cols])" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Checking inf ...\n", "Series([], dtype: float64)\n", "Checking NAN ...\n", "Index([], dtype='object')\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "utils.check_inf_nan(train[p_cols])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "utils.check_inf_nan(train[a_cols])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "utils.check_inf_nan(train[d_cols])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "utils.check_inf_nan(train[ctx_cols])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { 
"collapsed": true, "scrolled": true }, "outputs": [], "source": [ "utils.check_inf_nan(train[topic_cols])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 11 LSTM Feature Preparation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- (u,p,t)\n", "- 间隔、加购物车次序作为Symbol\n", " \n", " - 次序\n", " - 1\n", " - 2\n", " - 3\n", " - 4-6\n", " - 7-11\n", " - 12 ——\n", " \n", " - 间隔\n", " - 1 - 7\n", " - 8 - 16\n", " - 17 - 33\n", " - 34\n", " - 100 NAN\n", " \n", "- 实现\n", " - Encoder两个列, 总共30种符号\n", " - Cartesian查表\n", " - 直接数值" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "users_orders = tle.get_users_orders('prior')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true, "scrolled": true }, "outputs": [], "source": [ "product_feat = tle.craft_feat_item('products')" ] }, { "cell_type": "code", "execution_count": 255, "metadata": { "collapsed": true }, "outputs": [], "source": [ "user_feat = tle.craft_feat_user()" ] }, { "cell_type": "code", "execution_count": 256, "metadata": { "collapsed": true }, "outputs": [], "source": [ "users_orders = pd.merge(users_orders, product_feat[['product_id', 'p_reorder_probability']], on=['product_id'], how='left')" ] }, { "cell_type": "code", "execution_count": 257, "metadata": { "collapsed": true }, "outputs": [], "source": [ "users_orders = pd.merge(users_orders, user_feat[['user_id', 'u_total_reorders']], on=['user_id'], how='left')" ] }, { "cell_type": "code", "execution_count": 258, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def encode_numeric(row, bins):\n", "    '''\n", "    Map a numeric value onto the lower edge of the bin that contains it.\n", "\n", "    row  : scalar value to encode\n", "    bins : bin lower edges in ascending order, e.g. [b1, b2, b3, b4];\n", "           a list or a NumPy array\n", "\n", "    Returns a one-element list holding the last edge that is not greater\n", "    than row, so the per-feature results can be concatenated with + into\n", "    a single key. A value below the first edge is clamped to the first\n", "    edge. NaN compares False against every edge and therefore maps to the\n", "    last edge.\n", "    '''\n", "    bins = np.asarray(bins)\n", "    reached = bins[~(row < bins)]  # edges that are not greater than row\n", "    if reached.size == 0:\n", "        # row lies below every edge; indexing [-1] here used to raise IndexError\n", "        return [bins[0]]\n", "    return [reached[-1]]" ] }, { "cell_type": "code", "execution_count": 321, "metadata": { "collapsed": true }, "outputs": [], "source": [ "add2cart_bins = np.array([1, 2, 3, 4, 7, 12], dtype=float) # 6\n",
"interval_bins = np.array([-1, 4, 8, 17, 34], dtype=float)# 5\n", "p_reorder_bins = np.array([0.0, 0.20, 0.38, 0.53], dtype=float)# 4\n", "u_reorder_bins = np.array([0, 10, 33, 101], dtype=float)# 4" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true, "scrolled": false }, "outputs": [], "source": [ "%%time\n", "# Newest order first within each (user, product) pair, so diff() gives the\n", "# change in days_up_to_last relative to the next later order of that pair.\n", "users_orders = users_orders.sort_values(['user_id', 'product_id', 'order_number'], ascending = False)\n", "users_orders['up_interval'] = users_orders.groupby(['user_id', 'product_id'])['days_up_to_last'].diff()\n", "# The newest row of each pair has no predecessor (NaN); fill it with -1, the\n", "# first edge of interval_bins. Assign the result back instead of calling\n", "# fillna(inplace=True) on the attribute accessor, which is not guaranteed to\n", "# write through to the frame.\n", "users_orders['up_interval'] = users_orders['up_interval'].fillna(-1)\n", "users_orders['up_interval_sym'] = users_orders.up_interval.apply(lambda x: encode_numeric(x, interval_bins))\n", "users_orders['up_add2cart_order_sym'] = users_orders.add_to_cart_order.apply(lambda x: encode_numeric(x, add2cart_bins))" ] }, { "cell_type": "code", "execution_count": 265, "metadata": { "collapsed": true }, "outputs": [], "source": [ "users_orders['p_reorder_prob_sym'] = users_orders.p_reorder_probability.apply(lambda x: encode_numeric(x, p_reorder_bins))\n", "users_orders['u_reorder_sym'] = users_orders.u_total_reorders.apply(lambda x:encode_numeric(x, u_reorder_bins))" ] }, { "cell_type": "code", "execution_count": 322, "metadata": { "collapsed": true }, "outputs": [], "source": [ "feat_card = [add2cart_bins, interval_bins, p_reorder_bins, u_reorder_bins]" ] }, { "cell_type": "code", "execution_count": 323, "metadata": { "collapsed": true }, "outputs": [], "source": [ "feat_cartesian = cartesian(feat_card)" ] }, { "cell_type": "code", "execution_count": 327, "metadata": { "collapsed": true }, "outputs": [], "source": [ "users_orders['up_card'] = users_orders.up_add2cart_order_sym + users_orders.up_interval_sym + users_orders.p_reorder_prob_sym + users_orders.u_reorder_sym" ] }, { "cell_type": "code", "execution_count": 337, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def encode_cartesian(row, feat_cartesian):\n", "    '''\n", "    Look up a combination of bin edges in the cartesian table.\n", "\n", "    row            : sequence of values, one per column of feat_cartesian\n", "    feat_cartesian : 2-D array with one candidate combination per row\n", "\n", "    Returns the 1-based position of the first row of feat_cartesian that\n", "    equals row element-wise. Raises IndexError when no row matches.\n", "    '''\n", "    matches = np.where(np.all(row == feat_cartesian, axis=1))[0]\n", "    if matches.size == 0:\n", "        raise IndexError('no row of feat_cartesian matches %r' % (row,))\n", "    return matches[0] + 1" ] }, { "cell_type": "code", "execution_count": 340, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 5min 54s, sys: 6.16 s, total: 6min\n", "Wall time: 5min 59s\n" ] } ], "source": [ "%%time\n", "users_orders['up_airr_sym'] = users_orders.up_card.apply(lambda x: encode_cartesian(x, feat_cartesian))" ] }, { "cell_type": "code", "execution_count": 352, "metadata": { "collapsed": true }, "outputs": [], "source": [ "up_airr_sym = users_orders[['user_id', 'product_id', 'order_number', 'up_airr_sym']]" ] }, { "cell_type": "code", "execution_count": 354, "metadata": {}, "outputs": [], "source": [ "# Rebind to the sorted result: an in-place sort on this column subset of\n", "# users_orders triggers pandas' SettingWithCopyWarning.\n", "up_airr_sym = up_airr_sym.sort_values(['user_id', 'product_id', 'order_number'])" ] }, { "cell_type": "code", "execution_count": 356, "metadata": { "collapsed": true }, "outputs": [], "source": [ "up_airr_sym_list = up_airr_sym.groupby(['user_id', 'product_id'])['up_airr_sym'].apply(list).reset_index()" ] }, { "cell_type": "code", "execution_count": 358, "metadata": { "collapsed": true }, "outputs": [], "source": [ "with open(constants.FEAT_DATA_DIR + 'up_airr_sym.pkl', 'wb') as f:\n", " pickle.dump(up_airr_sym_list, f, pickle.HIGHEST_PROTOCOL)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## (u,p)对时间间隔预测\n", "\n", "Time Series Forecasting 问题\n", "\n", "- 方案1:用之前的Timestep对当前值进行回归预测\n", "- 方案2:LSTM 仅仅包含购买间隔信息\n", " -
样本(sample):(u,p,oid)\n", " - 特征(feature):两次购买之间的间隔\n", " \n", "- 预处理\n", " - 只出现一次的(u,p)无法计算间隔,NAN 丢弃\n", " - p_purchase_interval:距离下次购买的时间\n", " - 间隔为0的删除,同一天内购买两次视为一次\n", " - 为了training,间隔序列的长度 >=2 即(u,p)在prior里至少出现3次" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "users_orders = tle.get_users_orders(prior_or_train='prior')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Working copy with only the columns needed to inspect purchase intervals.\n", "# users_orders is not deleted here: later cells in this section still read it.\n", "a = users_orders[['user_id', 'order_number', 'product_id', 'days_up_to_last', 'p_purchase_interval']].sort_values(['user_id', 'order_number', 'p_purchase_interval'])" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Newest order first within each (user, product) pair.\n", "a = a.sort_values(['user_id', 'product_id', 'order_number'], ascending=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%time\n", "# Compute the gap over the whole frame; restricting it to head(1000) left\n", "# every other row NaN.\n", "a['up_interval'] = a.groupby(['user_id', 'product_id'])['days_up_to_last'].diff()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_idorder_numberproduct_iddays_up_to_lastp_purchase_intervalup_interval
2418126611019614.0-1.0NaN
217604461919644.030.0NaN
294748061819644.00.0NaN
52129271719658.014.0NaN
319270701619678.020.0NaN
40893981519697.019.0NaN
2137607414196125.028.0NaN
448809513196154.029.0NaN
2274274412196175.021.0NaN
2407666411196190.015.0NaN
241812711101025814.0-1.0NaN
21760447191025844.030.0NaN
29474807181025844.00.0NaN
5212928171025858.014.0NaN
31927072161025878.020.0NaN
4089400151025897.019.0NaN
213760761410258125.028.0NaN
44880971310258154.029.0NaN
227427451210258175.021.0NaN
4089402151032697.0-1.0NaN
241812741101242714.0-1.0NaN
21760448191242744.030.0NaN
29474805181242744.00.0NaN
5212929171242758.014.0NaN
31927071161242778.020.0NaN
4089399151242797.019.0NaN
213760751412427125.028.0NaN
44880961312427154.029.0NaN
227427461212427175.021.0NaN
240766661112427190.015.0NaN
.....................
5934984206209123816737.0-1.0NaN
24260103206209938167114.077.077.0
20186696206209838167136.022.022.0
14617226206209338167203.067.067.0
28236066206209133873030.0-1.0NaN
28236064206209133921630.0-1.0NaN
17584595206209114031055.0-1.0NaN
14617216206209340396203.0-1.0NaN
29908312206209140396240.037.037.0
14617215206209340534203.0-1.0NaN
29908314206209140534240.037.037.0
20186704206209840992136.0-1.0NaN
5219651206209740992158.022.022.0
6521430206209440992173.015.015.0
5934982206209124121337.0-1.0NaN
17584591206209114121355.018.018.0
21489865206209104121385.030.030.0
6521424206209441213173.088.088.0
14617213206209341213203.030.030.0
17910756206209241213233.030.030.0
29908311206209141213240.07.07.0
6521428206209441665173.0-1.0NaN
5934987206209124396137.0-1.0NaN
20186697206209843961136.099.099.0
6521435206209443961173.037.037.0
5219655206209744325158.0-1.0NaN
17584596206209114837055.0-1.0NaN
5219653206209748697158.0-1.0NaN
5934990206209124874237.0-1.0NaN
5219652206209748742158.0121.0121.0
\n", "

32434489 rows × 6 columns

\n", "
" ], "text/plain": [ " user_id order_number product_id days_up_to_last \\\n", "24181266 1 10 196 14.0 \n", "21760446 1 9 196 44.0 \n", "29474806 1 8 196 44.0 \n", "5212927 1 7 196 58.0 \n", "31927070 1 6 196 78.0 \n", "4089398 1 5 196 97.0 \n", "21376074 1 4 196 125.0 \n", "4488095 1 3 196 154.0 \n", "22742744 1 2 196 175.0 \n", "24076664 1 1 196 190.0 \n", "24181271 1 10 10258 14.0 \n", "21760447 1 9 10258 44.0 \n", "29474807 1 8 10258 44.0 \n", "5212928 1 7 10258 58.0 \n", "31927072 1 6 10258 78.0 \n", "4089400 1 5 10258 97.0 \n", "21376076 1 4 10258 125.0 \n", "4488097 1 3 10258 154.0 \n", "22742745 1 2 10258 175.0 \n", "4089402 1 5 10326 97.0 \n", "24181274 1 10 12427 14.0 \n", "21760448 1 9 12427 44.0 \n", "29474805 1 8 12427 44.0 \n", "5212929 1 7 12427 58.0 \n", "31927071 1 6 12427 78.0 \n", "4089399 1 5 12427 97.0 \n", "21376075 1 4 12427 125.0 \n", "4488096 1 3 12427 154.0 \n", "22742746 1 2 12427 175.0 \n", "24076666 1 1 12427 190.0 \n", "... ... ... ... ... \n", "5934984 206209 12 38167 37.0 \n", "24260103 206209 9 38167 114.0 \n", "20186696 206209 8 38167 136.0 \n", "14617226 206209 3 38167 203.0 \n", "28236066 206209 13 38730 30.0 \n", "28236064 206209 13 39216 30.0 \n", "17584595 206209 11 40310 55.0 \n", "14617216 206209 3 40396 203.0 \n", "29908312 206209 1 40396 240.0 \n", "14617215 206209 3 40534 203.0 \n", "29908314 206209 1 40534 240.0 \n", "20186704 206209 8 40992 136.0 \n", "5219651 206209 7 40992 158.0 \n", "6521430 206209 4 40992 173.0 \n", "5934982 206209 12 41213 37.0 \n", "17584591 206209 11 41213 55.0 \n", "21489865 206209 10 41213 85.0 \n", "6521424 206209 4 41213 173.0 \n", "14617213 206209 3 41213 203.0 \n", "17910756 206209 2 41213 233.0 \n", "29908311 206209 1 41213 240.0 \n", "6521428 206209 4 41665 173.0 \n", "5934987 206209 12 43961 37.0 \n", "20186697 206209 8 43961 136.0 \n", "6521435 206209 4 43961 173.0 \n", "5219655 206209 7 44325 158.0 \n", "17584596 206209 11 48370 55.0 \n", "5219653 206209 7 48697 158.0 \n", "5934990 
206209 12 48742 37.0 \n", "5219652 206209 7 48742 158.0 \n", "\n", " p_purchase_interval up_interval \n", "24181266 -1.0 NaN \n", "21760446 30.0 NaN \n", "29474806 0.0 NaN \n", "5212927 14.0 NaN \n", "31927070 20.0 NaN \n", "4089398 19.0 NaN \n", "21376074 28.0 NaN \n", "4488095 29.0 NaN \n", "22742744 21.0 NaN \n", "24076664 15.0 NaN \n", "24181271 -1.0 NaN \n", "21760447 30.0 NaN \n", "29474807 0.0 NaN \n", "5212928 14.0 NaN \n", "31927072 20.0 NaN \n", "4089400 19.0 NaN \n", "21376076 28.0 NaN \n", "4488097 29.0 NaN \n", "22742745 21.0 NaN \n", "4089402 -1.0 NaN \n", "24181274 -1.0 NaN \n", "21760448 30.0 NaN \n", "29474805 0.0 NaN \n", "5212929 14.0 NaN \n", "31927071 20.0 NaN \n", "4089399 19.0 NaN \n", "21376075 28.0 NaN \n", "4488096 29.0 NaN \n", "22742746 21.0 NaN \n", "24076666 15.0 NaN \n", "... ... ... \n", "5934984 -1.0 NaN \n", "24260103 77.0 77.0 \n", "20186696 22.0 22.0 \n", "14617226 67.0 67.0 \n", "28236066 -1.0 NaN \n", "28236064 -1.0 NaN \n", "17584595 -1.0 NaN \n", "14617216 -1.0 NaN \n", "29908312 37.0 37.0 \n", "14617215 -1.0 NaN \n", "29908314 37.0 37.0 \n", "20186704 -1.0 NaN \n", "5219651 22.0 22.0 \n", "6521430 15.0 15.0 \n", "5934982 -1.0 NaN \n", "17584591 18.0 18.0 \n", "21489865 30.0 30.0 \n", "6521424 88.0 88.0 \n", "14617213 30.0 30.0 \n", "17910756 30.0 30.0 \n", "29908311 7.0 7.0 \n", "6521428 -1.0 NaN \n", "5934987 -1.0 NaN \n", "20186697 99.0 99.0 \n", "6521435 37.0 37.0 \n", "5219655 -1.0 NaN \n", "17584596 -1.0 NaN \n", "5219653 -1.0 NaN \n", "5934990 -1.0 NaN \n", "5219652 121.0 121.0 \n", "\n", "[32434489 rows x 6 columns]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "a.sort_values(['user_id', 'product_id'])" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "number of (u,p,t) tuples: 32434489\n" ] } ], "source": [ "print(\"number of (u,p,t) tuples: %d\"%len(users_orders))" ] }, { "cell_type": 
"code", "execution_count": 12, "metadata": { "collapsed": true }, "outputs": [], "source": [ "users_orders_intervals = users_orders.dropna() #throw away product_id bought only once" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Release the full frame only now that the filtered frame has been derived from it.\n", "del users_orders # free memory usage" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": true }, "outputs": [], "source": [ "users_orders_intervals = users_orders_intervals[users_orders_intervals.p_purchase_interval > 0] # throw away record buy in the same day" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": true }, "outputs": [], "source": [ "users_orders_intervals = users_orders_intervals.sort_values(['user_id', 'product_id', 'order_number'])" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 3min 48s, sys: 12.5 s, total: 4min\n", "Wall time: 4min 1s\n" ] } ], "source": [ "%%time\n", "# One row per (user, product) pair holding its intervals in order_number order.\n", "up_interval_list = users_orders_intervals.groupby(['user_id', 'product_id'])['p_purchase_interval'].apply(list).reset_index()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "5279850" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(up_interval_list)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": true }, "outputs": [], "source": [ "del users_orders_intervals # free memory usage" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Number of intervals recorded for each pair.\n", "up_interval_list['len'] = up_interval_list['p_purchase_interval'].apply(len)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": true }, "outputs": [], "source": [ "up_interval_list = up_interval_list[up_interval_list['len'] >= 2] # for train/test split " ] }, { "cell_type":
"code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "with open(constants.FEAT_DATA_DIR + 'up_interval_feat.pkl', 'wb') as f:\n", " pickle.dump(up_interval_list, f, pickle.HIGHEST_PROTOCOL)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true, "scrolled": true }, "outputs": [], "source": [ "len(up_interval_list)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "up_interval_list['len'].describe()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 2 }