
Commit

added k-fold validations
berylgithub committed Dec 16, 2019
1 parent 04f5df9 commit 62e9daa
Showing 1 changed file with 95 additions and 51 deletions.
146 changes: 95 additions & 51 deletions trainer.py
@@ -37,7 +37,7 @@ def dataset_loader(filepath):
'''
load and split the dataset
'''
dataset = dataset_loader('dataset.pkl')
dataset = dataset_loader(os.getcwd()+'/Data/dataset_alpha_121019.pkl')

features = np.array([data['x_vector'] for data in dataset])
labels = np.array([data['y'] for data in dataset])
@@ -46,7 +46,8 @@ def dataset_loader(filepath):
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', x_test.shape)
print('Testing Labels Shape:', y_test.shape)
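The x_train/x_test and y_train/y_test arrays whose shapes are printed here come from a split performed earlier in trainer.py, above this hunk. A minimal sketch of what that split presumably looks like; train_test_split, the test_size, and the random_state are assumptions, not shown in this diff:

# Hypothetical reconstruction of the split above this hunk -- not part of the commit.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    features, labels,   # arrays built from the loaded dataset above
    test_size=0.25,     # assumed split ratio
    random_state=13,    # assumed seed
)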
#


# '''
# data regression
# '''
@@ -58,56 +59,99 @@ def dataset_loader(filepath):
# '''
# with open(os.getcwd()+"/Model/rf_pp_alpha.pkl", "wb") as f:
# pickle.dump(rf, f)

'''
model loader
'''
with open(os.getcwd()+"/Model/rf_pp_alpha.pkl", "rb") as f:
    rf = pickle.load(f)

'''
train set analysis
'''
#Mean Absolute Error
preds = rf.predict(x_train)
errors = abs(preds - y_train)
print('Mean Absolute Error:', round(np.mean(errors), 2))

#Mean Absolute Percentage Error & Accuracy
mape = 100 * (errors / y_train)
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

#Root Mean Squared Error
rmse = np.sqrt(mean_squared_error(y_train, preds))
print('Root Mean Squared Error :', round(rmse, 2))

#Pearson Correlation Coefficient (PCC) score
pcc = pearsonr(y_train, preds)
print('Pearson Correlation Coefficient :', round(pcc[0],2))
print(preds, y_train)
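The same four metrics (MAE, accuracy from MAPE, RMSE, PCC) are recomputed for the test split further down; a small helper could compute them in one place. This is a sketch only, and report_metrics is a hypothetical name, not something defined in trainer.py:

# Hypothetical helper consolidating the metrics printed above -- not part of the commit.
# Relies on np, mean_squared_error, and pearsonr already imported in this file.
def report_metrics(y_true, y_pred, label=""):
    errors = np.abs(y_pred - y_true)
    mape = 100 * (errors / y_true)
    print(label, 'Mean Absolute Error:', round(np.mean(errors), 2))
    print(label, 'Accuracy:', round(100 - np.mean(mape), 2), '%.')
    print(label, 'Root Mean Squared Error:', round(np.sqrt(mean_squared_error(y_true, y_pred)), 2))
    print(label, 'Pearson Correlation Coefficient:', round(pearsonr(y_true, y_pred)[0], 2))

# report_metrics(y_train, rf.predict(x_train), label="train")
# report_metrics(y_test, rf.predict(x_test), label="test")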
#
#
#
# '''
# train set analysis
# '''
# #Mean Absolute Error
# preds = rf.predict(x_train)
# errors = abs(preds - y_train)
# print('Mean Absolute Error:', round(np.mean(errors), 2))
#
# #Mean Absolute Percentage Error & Accuracy
# mape = 100 * (errors / y_train)
# accuracy = 100 - np.mean(mape)
# print('Accuracy:', round(accuracy, 2), '%.')
#
# #Root Mean Squared Error
# rmse = np.sqrt(mean_squared_error(y_train, preds))
# print('Root Mean Squared Error :', round(rmse, 2))
#
# #Pearson Correlation Coefficient (PCC) score
# pcc = pearsonr(y_train, preds)
# print('Pearson Correlation Coefficient :', round(pcc[0],2))
# print(preds, y_train)
#
# '''
# test set analysis
# '''
# #Mean Absolute Error
# preds = rf.predict(x_test)
# errors = abs(preds - y_test)
# print('Mean Absolute Error:', round(np.mean(errors), 2))
#
# #Mean Absolute Percentage Error & Accuracy
# mape = 100 * (errors / y_test)
# accuracy = 100 - np.mean(mape)
# print('Accuracy:', round(accuracy, 2), '%.')
#
# #Root Mean Squared Error
# rmse = np.sqrt(mean_squared_error(y_test, preds))
# print('Root Mean Squared Error :', round(rmse, 2))
#
# #Pearson Correlation Coefficient (PCC) score
# pcc = pearsonr(y_test, preds)
# print('Pearson Correlation Coefficient :', round(pcc[0],2))


'''
test set analysis
k-fold cross validation
'''
#Mean Absolute Error
preds = rf.predict(x_test)
errors = abs(preds - y_test)
print('Mean Absolute Error:', round(np.mean(errors), 2))

#Mean Absolute Percentage Error & Accuracy
mape = 100 * (errors / y_test)
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

#Root Mean Squared Error
rmse = np.sqrt(mean_squared_error(y_test, preds))
print('Root Mean Squared Error :', round(rmse, 2))

#Pearson Correlation Coefficient (PCC) score
pcc = pearsonr(y_test, preds)
print('Pearson Correlation Coefficient :', round(pcc[0],2))

folds = [3, 4, 5, 7, 10]
for fold in folds:
    kfolds = []
    n = fold
    idx = 0
    # KFold is assumed to be imported from sklearn.model_selection above this hunk
    kf = KFold(n_splits=n)
    for train_index, test_index in kf.split(features):
        kfold = {}
        print("index training :", idx)
        print("TRAIN:", len(train_index), "TEST:", len(test_index))
        x_train, x_test = features[train_index], features[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        rf = RandomForestRegressor(n_estimators=1000, random_state=13, verbose=0)
        rf.fit(x_train, y_train)

        # for i in range(len(preds)):
        #     print(preds[i], y_test[i])
        idx += 1

        # Pearson Correlation Coefficient (PCC) score on the training and test folds
        preds = rf.predict(x_train)
        pcc = pearsonr(y_train, preds)
        kfold["pcc_train"] = pcc[0]
        print('PCC train :', round(pcc[0], 2))

        preds = rf.predict(x_test)
        pcc = pearsonr(y_test, preds)
        kfold["pcc_test"] = pcc[0]
        print('PCC test :', round(pcc[0], 2))
        print('===================')

        kfold["train_idx"] = train_index
        kfold["test_idx"] = test_index
        kfold["k"] = n
        kfold["idx"] = idx
        kfold["model"] = rf
        kfolds.append(kfold)

    # save the best model: the fold with the highest test-set PCC for this value of n
    kfolds = sorted(kfolds, key=lambda k: k['pcc_test'], reverse=True)
    print(kfolds[0]['k'], kfolds[0]['pcc_test'])
    with open(os.getcwd() + "/Model/rf_pp_a_" + str(n) + "fold_best.pkl", "wb") as f:
        pickle.dump(kfolds[0], f)
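For comparison, scikit-learn's cross_val_score could report the per-fold test PCC with less bookkeeping, at the cost of not keeping each fold's fitted model or indices; a hedged alternative sketch, not what this commit does:

# Alternative sketch using cross_val_score -- not part of this commit.
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

pcc_scorer = make_scorer(lambda y_true, y_pred: pearsonr(y_true, y_pred)[0])
for n in [3, 4, 5, 7, 10]:
    scores = cross_val_score(
        RandomForestRegressor(n_estimators=1000, random_state=13),
        features, labels,
        scoring=pcc_scorer,
        cv=KFold(n_splits=n),
    )
    print(n, 'folds, mean test PCC:', round(scores.mean(), 2))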

# '''
# model loader
# '''
# with open(os.getcwd()+"/Model/rf_pp_alpha.pkl", "rb") as f:
# rf = pickle.load(f)
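Because the k-fold loop pickles a dict per fold count rather than a bare regressor, reloading one of the new files differs slightly from the commented-out loader above; a sketch, with the filename assuming n=5:

# Sketch of loading one of the new per-fold pickles -- filename assumes n=5.
with open(os.getcwd() + "/Model/rf_pp_a_5fold_best.pkl", "rb") as f:
    best_fold = pickle.load(f)

rf = best_fold["model"]                          # the fitted RandomForestRegressor
print(best_fold["k"], best_fold["pcc_test"])     # fold count and its test PCC
preds = rf.predict(features[best_fold["test_idx"]])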
