diff --git a/final/src/final.sh b/final/src/final.sh
new file mode 100644
index 0000000..b94a3e8
--- /dev/null
+++ b/final/src/final.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+#
+# Fetch the model archive and run train.py in prediction mode on the given CSVs.
+# The archive is expected to unpack into depth23/ (not verifiable from this script).
+
+# Abort on the first failing command so a failed download is not followed by
+# an unzip/predict run on missing files.
+set -e
+
+if [ "$#" -ne 3 ]; then
+    echo "Usage: bash final.sh [training set values] [training set labels] [testing set values]" >&2
+    exit 1
+fi
+
+wget www.csie.ntu.edu.tw/~b03502040/8275.zip
+unzip 8275.zip
+
+# Quoted so that paths containing spaces stay single arguments.
+python3 train.py --train "$1" --label "$2" --test "$3" --model depth23/*
diff --git a/final/src/requirement.txt b/final/src/requirement.txt
new file mode 100644
index 0000000..bfe8fba
--- /dev/null
+++ b/final/src/requirement.txt
@@ -0,0 +1,4 @@
+numpy
+scipy
+pandas
+xgboost
diff --git a/final/src/train.py b/final/src/train.py
new file mode 100644
index 0000000..fb3cd43
--- /dev/null
+++ b/final/src/train.py
@@ -0,0 +1,127 @@
+"""Train seeded XGBoost models for the pump data, or predict with saved ones.
+
+Without --model, one booster per seed is trained and saved in a depth directory.
+With --model, every listed booster predicts the test set and the most common
+label per row is written to output.csv.
+"""
+import os
+import logging
+import argparse
+import numpy as np
+import pandas as pd
+import xgboost as xgb
+from scipy.stats import mode
+from utils import DataProcessor
+
+
+def parse_args():
+    """Parse the command-line options.
+
+    Returns:
+        argparse.Namespace with the CSV paths (train, label, test), the
+        training settings (cv, eta, depth, seed) and the optional model list.
+    """
+    parser = argparse.ArgumentParser('Pump it up.')
+    parser.add_argument('--train', required=True)
+    parser.add_argument('--label', required=True)
+    parser.add_argument('--test', required=True)
+    parser.add_argument('--cv', type=int, default=4)
+    parser.add_argument('--eta', type=float, default=0.025)
+    parser.add_argument('--depth', type=int, default=23)
+    # Half-open range: seeds seed[0] .. seed[1] - 1 are trained.
+    parser.add_argument('--seed', nargs=2, type=int, default=[60, 73])
+    parser.add_argument('--model', nargs='*')
+    return parser.parse_args()
+
+
+def main(args):
+    """Train models or predict with saved ones, depending on args.model.
+
+    Args:
+        args: Namespace produced by parse_args().
+
+    Raises:
+        SystemExit: If --model was passed without any model file.
+    """
+    logger = logging.getLogger()
+    handler = logging.StreamHandler()
+    formatter = logging.Formatter('%(asctime)s %(message)s')
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+    logger.setLevel(logging.INFO)
+
+    data = DataProcessor()
+
+    logger.info('Read csvs')
+    data.read_data(args.train, args.test, args.label)
+
+    logger.info('Preprocess data')
+    data.preprocess()
+
+    train_dmatrix = xgb.DMatrix(data=data.train, label=data.labels, missing=np.nan)
+    test_dmatrix = xgb.DMatrix(data=data.test, missing=np.nan)
+
+    if args.model is None:
+        depth_dir = 'depth{}'.format(args.depth)
+        # exist_ok replaces the separate exists() check before mkdir().
+        os.makedirs(depth_dir, exist_ok=True)
+
+        param = {
+            'booster': 'gbtree',
+            # Was spelled 'obective', so no 'objective' entry reached xgboost.
+            'objective': 'multi:softmax',
+            'eta': args.eta,
+            'max_depth': args.depth,
+            'colsample_bytree': 0.4,
+            'silent': 1,
+            'eval_metric': 'merror',
+            # NOTE(review): DataProcessor encodes three status groups; 4 is
+            # kept from the original -- confirm whether 3 was intended.
+            'num_class': 4
+        }
+
+        logger.info('Start training from seed {} to {}'.format(args.seed[0], args.seed[1]-1))
+        for i in range(args.seed[0], args.seed[1]):
+            logger.info('Cross validate with seed {}, depth {}, {}-fold'.format(i, args.depth, args.cv))
+
+            param['seed'] = i
+            #res = xgb.cv(param, dtrain=train_dmatrix, seed=i, num_boost_round=500,
+            #        nfold=args.cv, early_stopping_rounds=30, maximize=False, verbose_eval=True)
+            #num_boost_round = res['test-merror-mean'].argmin()
+            num_boost_round = 210
+            logger.info('Train xgboost tree with seed {}, depth {}, num_boost_round {}'.format(i, args.depth, num_boost_round))
+
+            clf = xgb.train(param, dtrain=train_dmatrix, num_boost_round=num_boost_round, maximize=False)
+
+            save_path = os.path.join(depth_dir, 'xgb-model-seed-{}'.format(i))
+
+            clf.save_model(save_path)
+
+            logger.info('Save xgboost tree at {}'.format(save_path))
+
+        logger.info('End of training. All models are saved at {}'.format(depth_dir))
+
+    else:
+        if not args.model:
+            # '--model' with no paths parses to []; there is nothing to vote over.
+            raise SystemExit('--model was given without any model file')
+
+        pred_overall = []
+        for mfile in args.model:
+            logger.info('Load xgboost tree model {}'.format(mfile))
+            clf = xgb.Booster()
+            clf.load_model(mfile)
+
+            pred = clf.predict(data=test_dmatrix).astype(int)
+            pred_overall.append(pred)
+
+        # Most common prediction per test row. ravel() gives a 1-D array
+        # whether or not scipy keeps the reduced axis, and also for a single
+        # test row, where squeeze() would leave an un-iterable 0-d array.
+        pred_overall = np.ravel(mode(pred_overall, axis=0)[0])
+        data.write_data('output.csv', pred_overall)
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    main(args)
diff --git a/final/src/utils.py b/final/src/utils.py
new file mode 100644
index 0000000..0cfbb69
--- /dev/null
+++ b/final/src/utils.py
@@ -0,0 +1,98 @@
+"""CSV loading, feature preprocessing and submission writing for the pump data."""
+import numpy as np
+import pandas as pd
+
+class DataProcessor():
+    """Turn the raw train/test/label CSVs into arrays and write predictions.
+
+    After read_data() and preprocess() the instance exposes:
+      train   -- feature matrix of the training rows
+      test    -- feature matrix of the test rows
+      labels  -- integer class codes of the training labels
+      classes -- class names, indexed by class code
+      id      -- 'id' column of the test rows
+    """
+
+    def __init__(self):
+        pass
+
+    def read_data(self, train_f, test_f, label_f):
+        """Read the three CSV files into the raw_* attributes.
+
+        Args:
+            train_f: Path of the training feature CSV.
+            test_f: Path of the test feature CSV.
+            label_f: Path of the training label CSV.
+        """
+        self.raw_train_features = pd.read_csv(train_f)
+        self.raw_test_features = pd.read_csv(test_f)
+        self.raw_labels = pd.read_csv(label_f)
+
+    def preprocess(self):
+        """Build the train/test feature matrices and the encoded labels.
+
+        Train and test rows are transformed together, so the date offset, the
+        median imputations and the one-hot columns are the same for both.
+        """
+        # assign() works on copies, so the raw frames gain no 'test' column.
+        train = self.raw_train_features.assign(test=0)
+        test = self.raw_test_features.assign(test=1)
+
+        data = pd.concat([train, test])
+
+        # Days elapsed since the earliest recording date in either set.
+        data['date_recorded'] = pd.to_datetime(data['date_recorded'])
+        data['date_recorded'] = (data['date_recorded'] - data['date_recorded'].min()) / np.timedelta64(1, 'D')
+
+        # Series.mask() replaces the chained `data[col][cond] = value` form,
+        # which pandas does not guarantee to write back into `data`.
+        year = data['construction_year'] - 1960
+        data['construction_year'] = year.mask(year < 0, year[year >= 0].median())
+
+        height = data['gps_height']
+        data['gps_height'] = height.mask(height == 0, height[height > 0].median())
+
+        data = data.drop(['num_private', 'recorded_by', 'wpt_name', 'extraction_type_group', 'extraction_type', 'payment_type',
+                          'water_quality', 'scheme_management', 'district_code', 'region', 'region_code', 'subvillage', 'ward',
+                          'waterpoint_type_group', 'quantity_group', 'installer'], axis=1)
+
+        columns = list(data.select_dtypes(include=['object']).columns)
+
+        data = pd.get_dummies(data, columns=columns)
+
+        train = data.loc[data['test'] == 0]
+        test = data.loc[data['test'] == 1]
+
+        self.id = test['id']
+
+        # drop() without inplace returns new frames instead of writing into
+        # the slices taken from `data` above.
+        train = train.drop(['id', 'test'], axis=1)
+        test = test.drop(['id', 'test'], axis=1)
+
+        # NOTE(review): labels are paired with training rows by position, not
+        # by 'id' -- assumes both CSVs list the same rows in the same order.
+        labels = self.raw_labels['status_group'].astype('category')
+
+        # The reordered Series is taken from the return value; the `inplace`
+        # option is no longer available in current pandas releases.
+        labels = labels.cat.reorder_categories(['non functional', 'functional needs repair', 'functional'])
+
+        # One-hot columns are bool or uint8 depending on the pandas version;
+        # the float64 cast yields a purely numeric matrix in both cases.
+        self.train = train.values.astype(np.float64)
+        self.test = test.values.astype(np.float64)
+        self.labels = labels.cat.codes.values
+        self.classes = labels.cat.categories
+
+    def write_data(self, filename, pred):
+        """Write the predictions to a CSV with 'id' and 'status_group' columns.
+
+        Args:
+            filename: Path of the CSV file to write.
+            pred: Integer class codes, one per test row, in test-row order.
+        """
+        pred = [self.classes[i] for i in pred]
+
+        output = pd.DataFrame({'id': self.id, 'status_group': pred}, columns=['id', 'status_group'])
+        output.to_csv(filename, index=False, columns=('id', 'status_group'))