Update utils.py
Fix: clean & modify flatten multiindex
Yihong Chen committed Sep 18, 2017
1 parent db02b93 commit a7cee08
Showing 1 changed file with 20 additions and 10 deletions.
utils.py: 30 changes (20 additions & 10 deletions)
@@ -5,37 +5,47 @@
import itertools
import multiprocessing
from joblib import Parallel, delayed

from math import sqrt
from scipy.stats import entropy
from sklearn.utils import shuffle
from mp_generic import mp_groupby
from f1optim import F1Optimizer # disable numba acceleration
from numpy.random import binomial, beta
from sklearn.metrics import f1_score, recall_score, precision_score, mean_squared_error

import lightgbm as lgb
import xgboost as xgb

import constants, inference

###### Log Extraction

def flatten_multiidx(df):
    '''
    Given a df whose columns are a MultiIndex (>= 2 levels), flatten them into a one-level index.
    Useful for dealing with groupby-agg results.
    Args:
        df: pandas DataFrame
    Return:
        pandas DataFrame with a flattened, one-level column index
    '''
    def sel_level(col):
        '''
        Join the non-empty levels of a column tuple into a single new column name.
        Args:
            col: tuple, (col_name_level_0, col_name_level_1, col_name_level_2, ...)
        Return:
            string, new column name
        Example:
            col = ('price', 'max') --> 'price_max'
        '''
        col = [level for level in col if level != '']
        return '_'.join(col)

    df.columns = [sel_level(col) for col in df.columns.values]
    return df
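
# Illustrative usage sketch (not part of the original commit): flatten_multiidx on a
# groupby-agg result. The DataFrame argument and column names below are hypothetical.
def _flatten_multiidx_example(orders):
    # orders: pandas DataFrame assumed to have 'user_id', 'order_id' and 'price' columns
    agg = orders.groupby('user_id').agg({'price': ['max', 'mean'], 'order_id': 'count'})
    # agg.columns is a MultiIndex: [('price', 'max'), ('price', 'mean'), ('order_id', 'count')]
    agg = flatten_multiidx(agg)
    # agg.columns is now ['price_max', 'price_mean', 'order_id_count']
    return agg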

###### BEFORE TRAIN UTILS

def train_test_split_users(train, test_size, seed = 1993):
@@ -382,4 +392,4 @@ def submission_format(subf):

def applyParallel(dfGrouped, func, func_args = None):
    # apply func(group, func_args) to each group in parallel, using half of the available cores
    retLst = Parallel(n_jobs=multiprocessing.cpu_count() // 2)(delayed(func)(group, func_args) for name, group in dfGrouped)
    return pd.concat(retLst)
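
# Illustrative usage sketch (not part of the original commit): applyParallel expects a
# per-group function taking (group, func_args) and returning a DataFrame. The helper and
# column name below are hypothetical.
def _orders_per_user_example(group, func_args = None):
    # one summary row per group; 'user_id' is an assumed column name
    return group[['user_id']].drop_duplicates().assign(n_rows = len(group))

# e.g. features = applyParallel(train.groupby('user_id'), _orders_per_user_example)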
