Update utils.py
Fix: clean & modify flatten multiindex
Yihong Chen committed Sep 18, 2017
1 parent db02b93 commit a7cee08
Showing 1 changed file with 20 additions and 10 deletions.
utils.py: 30 changes (20 additions & 10 deletions)
@@ -5,37 +5,47 @@
import itertools
import multiprocessing
from joblib import Parallel, delayed

from math import sqrt
from scipy.stats import entropy
from sklearn.utils import shuffle
from mp_generic import mp_groupby
from f1optim import F1Optimizer # disable numba acceleration
from numpy.random import binomial, beta
from sklearn.metrics import f1_score, recall_score, precision_score, mean_squared_error

import lightgbm as lgb
import xgboost as xgb

import constants, inference

###### Log Extraction

def flatten_multiidx(df):
    '''
    Given a df whose columns are a MultiIndex (>= 2 levels), flatten them into a one-level index.
    Useful for dealing with groupby-agg results.
    Args:
        df: pandas DataFrame
    Return:
        pandas DataFrame with a flattened, one-level column index
    '''
    def sel_level(col):
        '''
        Join the non-empty levels of a column tuple into a single new column name.
        Args:
            col: tuple, (col_name_level_0, col_name_level_1, col_name_level_2, ...)
        Return:
            string, new column name
        Example:
            col = ('price', 'max') --> 'price_max'
        '''
        col = [level for level in col if level != '']
        return '_'.join(col)

    df.columns = [sel_level(col) for col in df.columns.values]
    return df
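
# Illustrative usage sketch (not part of the original commit): flatten_multiidx on a
# groupby-agg result. The DataFrame argument and column names below are hypothetical.
def _flatten_multiidx_example(orders):
    # orders: pandas DataFrame assumed to have 'user_id', 'order_id' and 'price' columns
    agg = orders.groupby('user_id').agg({'price': ['max', 'mean'], 'order_id': 'count'})
    # agg.columns is a MultiIndex: [('price', 'max'), ('price', 'mean'), ('order_id', 'count')]
    agg = flatten_multiidx(agg)
    # agg.columns is now ['price_max', 'price_mean', 'order_id_count']
    return agg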

###### BEFORE TRAIN UTILS

def train_test_split_users(train, test_size, seed = 1993):
@@ -382,4 +392,4 @@ def submission_format(subf):

def applyParallel(dfGrouped, func, func_args = None):
    # apply func(group, func_args) to each group in parallel, using half of the available cores
    retLst = Parallel(n_jobs=multiprocessing.cpu_count() // 2)(delayed(func)(group, func_args) for name, group in dfGrouped)
    return pd.concat(retLst)
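
# Illustrative usage sketch (not part of the original commit): applyParallel expects a
# per-group function taking (group, func_args) and returning a DataFrame. The helper and
# column name below are hypothetical.
def _orders_per_user_example(group, func_args = None):
    # one summary row per group; 'user_id' is an assumed column name
    return group[['user_id']].drop_duplicates().assign(n_rows = len(group))

# e.g. features = applyParallel(train.groupby('user_id'), _orders_per_user_example)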
