Commit

final: codes
orbxball committed Jun 29, 2017
1 parent 9c345b5 commit 0ecb180
Showing 4 changed files with 173 additions and 0 deletions.
10 changes: 10 additions & 0 deletions final/src/final.sh
@@ -0,0 +1,10 @@
#!/usr/bin/env bash

if [ "$#" -ne 3 ]; then
    echo "Usage: bash final.sh [training set values] [training set labels] [testing set values]"
    exit 1
fi

# Fetch and unpack the pre-trained xgboost models (the depth23/ directory used below)
wget www.csie.ntu.edu.tw/~b03502040/8275.zip
unzip 8275.zip

python3 train.py --train "$1" --label "$2" --test "$3" --model depth23/*
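
A minimal usage sketch (the CSV file names below are hypothetical placeholders for the three expected arguments):

bash final.sh train_values.csv train_labels.csv test_values.csv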
4 changes: 4 additions & 0 deletions final/src/requirement.txt
@@ -0,0 +1,4 @@
numpy
scipy
pandas
xgboost
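
The dependencies can be installed in one step with pip; a typical setup command, assuming pip3 points at the same Python 3 interpreter that final.sh invokes:

pip3 install -r requirement.txt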
97 changes: 97 additions & 0 deletions final/src/train.py
@@ -0,0 +1,97 @@
import os
import logging
import argparse
import numpy as np
import pandas as pd
import xgboost as xgb
from scipy.stats import mode
from utils import DataProcessor


def parse_args():
    parser = argparse.ArgumentParser(description='Pump it up.')
    parser.add_argument('--train', required=True)
    parser.add_argument('--label', required=True)
    parser.add_argument('--test', required=True)
    parser.add_argument('--cv', type=int, default=4)
    parser.add_argument('--eta', type=float, default=0.025)
    parser.add_argument('--depth', type=int, default=23)
    parser.add_argument('--seed', nargs=2, type=int, default=[60, 73])
    parser.add_argument('--model', nargs='*')
    return parser.parse_args()


def main(args):

    logger = logging.getLogger()
    handler = logging.StreamHandler()
    formatter = logging.Formatter('%(asctime)s %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)

    data = DataProcessor()

    logger.info('Read csvs')
    data.read_data(args.train, args.test, args.label)

    logger.info('Preprocess data')
    data.preprocess()

    train_dmatrix = xgb.DMatrix(data=data.train, label=data.labels, missing=np.nan)
    test_dmatrix = xgb.DMatrix(data=data.test, missing=np.nan)

    if args.model is None:
        # Training mode: fit one booster per random seed and save each to disk
        depth_dir = 'depth{}'.format(args.depth)
        if not os.path.exists(depth_dir):
            os.mkdir(depth_dir)

        param = {
            'booster': 'gbtree',
            'objective': 'multi:softmax',
            'eta': args.eta,
            'max_depth': args.depth,
            'colsample_bytree': 0.4,
            'silent': 1,
            'eval_metric': 'merror',
            'num_class': 4
        }

        logger.info('Start training from seed {} to {}'.format(args.seed[0], args.seed[1] - 1))
        for i in range(args.seed[0], args.seed[1]):
            logger.info('Cross validate with seed {}, depth {}, {}-fold'.format(i, args.depth, args.cv))

            param['seed'] = i
            #res = xgb.cv(param, dtrain=train_dmatrix, seed=i, num_boost_round=500,
            #             nfold=args.cv, early_stopping_rounds=30, maximize=False, verbose_eval=True)
            #num_boost_round = res['test-merror-mean'].argmin()
            # Round count fixed in advance; the commented-out cv search above shows how it was picked
            num_boost_round = 210
            logger.info('Train xgboost tree with seed {}, depth {}, num_boost_round {}'.format(i, args.depth, num_boost_round))

            clf = xgb.train(param, dtrain=train_dmatrix, num_boost_round=num_boost_round, maximize=False)

            save_path = os.path.join(depth_dir, 'xgb-model-seed-{}'.format(i))

            clf.save_model(save_path)

            logger.info('Save xgboost tree at {}'.format(save_path))

        logger.info('End of training. All models are saved at {}'.format(depth_dir))

    else:
        # Prediction mode: load every saved model and majority-vote their predictions
        pred_overall = []
        for mfile in args.model:
            logger.info('Load xgboost tree model {}'.format(mfile))
            clf = xgb.Booster()
            clf.load_model(mfile)

            pred = clf.predict(data=test_dmatrix).astype(int)
            pred_overall.append(pred)

        pred_overall = mode(pred_overall, axis=0)[0].squeeze()
        data.write_data('output.csv', pred_overall)


if __name__ == '__main__':
    args = parse_args()
    main(args)
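
The prediction branch ensembles the per-seed boosters with a per-sample majority vote through scipy.stats.mode. A minimal, self-contained sketch of just that voting step, using made-up toy predictions:

import numpy as np
from scipy.stats import mode

# Three models' class predictions over the same three samples (toy values)
preds = np.array([[2, 0, 1],
                  [2, 1, 1],
                  [0, 1, 1]])

# mode along axis=0 takes the most common class per column, i.e. per sample
voted = mode(preds, axis=0)[0].squeeze()
print(voted)  # [2 1 1]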
62 changes: 62 additions & 0 deletions final/src/utils.py
@@ -0,0 +1,62 @@
import numpy as np
import pandas as pd

class DataProcessor():

    def __init__(self):
        pass

    def read_data(self, train_f, test_f, label_f):
        self.raw_train_features = pd.read_csv(train_f)
        self.raw_test_features = pd.read_csv(test_f)
        self.raw_labels = pd.read_csv(label_f)

    def preprocess(self):
        train = self.raw_train_features
        test = self.raw_test_features
        labels = self.raw_labels

        # Flag the two sets so they can be split apart again after joint encoding
        train['test'] = 0
        test['test'] = 1

        data = pd.concat([train, test])

        # Turn the record date into days elapsed since the earliest record
        data['date_recorded'] = pd.to_datetime(data['date_recorded'])
        data['date_recorded'] = (data['date_recorded'] - data['date_recorded'].min()) / np.timedelta64(1, 'D')

        # Rebase construction_year at 1960; impute missing (originally zero) years with the median
        data['construction_year'] = data['construction_year'] - 1960
        data.loc[data['construction_year'] < 0, 'construction_year'] = data['construction_year'][data['construction_year'] >= 0].median()

        data.loc[data['gps_height'] == 0, 'gps_height'] = data['gps_height'][data['gps_height'] > 0].median()

        # Drop columns that are redundant with, or near-duplicates of, the kept ones
        data.drop(['num_private', 'recorded_by', 'wpt_name', 'extraction_type_group', 'extraction_type', 'payment_type',
                   'water_quality', 'scheme_management', 'district_code', 'region', 'region_code', 'subvillage', 'ward',
                   'waterpoint_type_group', 'quantity_group', 'installer'], axis=1, inplace=True)

        # One-hot encode every remaining categorical (object-dtype) column
        columns = list(data.select_dtypes(include=['object']).columns)

        data = pd.get_dummies(data, columns=columns)

        train = data.loc[data['test'] == 0].copy()
        test = data.loc[data['test'] == 1].copy()

        self.id = test['id']

        train.drop(['id', 'test'], axis=1, inplace=True)
        test.drop(['id', 'test'], axis=1, inplace=True)

        labels.drop(['id'], axis=1, inplace=True)
        labels = labels['status_group'].astype('category')

        labels.cat.reorder_categories(['non functional', 'functional needs repair', 'functional'], inplace=True)

        self.train = train.values
        self.test = test.values
        self.labels = labels.cat.codes.values
        self.classes = labels.cat.categories

    def write_data(self, filename, pred):
        # Map integer class codes back to their category names
        pred = [self.classes[i] for i in pred]

        output = pd.DataFrame({'id': self.id, 'status_group': pred}, columns=['id', 'status_group'])
        output.to_csv(filename, index=False, columns=('id', 'status_group'))
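
For completeness, a hedged sketch of driving DataProcessor outside train.py; the CSV names are hypothetical placeholders matching read_data's (train, test, label) argument order:

from utils import DataProcessor

data = DataProcessor()
data.read_data('train_values.csv', 'test_values.csv', 'train_labels.csv')
data.preprocess()

# data.train / data.test are numpy feature matrices,
# data.labels holds integer class codes, data.classes the category names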
