-
Notifications
You must be signed in to change notification settings - Fork 0
/
randomforest_sklearn_qm9test.py
89 lines (71 loc) · 2.71 KB
/
randomforest_sklearn_qm9test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# -*- coding: utf-8 -*-
"""xgbReg.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1q54pxnQMtaffz7JGaYf2jxL1C9XOFoMo
"""
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer, mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
#import graphlab as gl
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
import time
import random
start = time.time()
data = pd.read_csv('qm9_feature_data.csv')
X_train, X_test, y_train, y_test = train_test_split(
data.iloc[:,:-1].values,
data.iloc[:,-1].values
)
print (data.head())
"""
egr = RandomForestRegressor(bootstrap=True, max_samples = 1000, max_depth=2000,
random_state=0,min_samples_split = 100, min_samples_leaf = 10,
max_features = 2, n_jobs = -1, verbose = 1)
"""
#best model
#regr = RandomForestRegressor(max_depth=5000, random_state=0)
#max_depths = [3500,4100,4200,4300,4500,4800,5000]
max_depth = 5000
def MAPE(y_true, y_pred):
return np.mean(np.abs((y_true - y_pred) / y_true))
def mAPE(y_true, y_pred):
return np.median(np.abs((y_true - y_pred) / y_true))
start = time.time()
#max_depth = random.choice(max_depths)
regr = RandomForestRegressor(n_jobs = -1,max_depth = 3500,
random_state=0,verbose = 1,
warm_start = False
)
regr.fit(X_train,y_train)
print (regr.get_params())
y_pred = regr.predict(X_test)
print ("feature_importances: ", regr.feature_importances_)
features = list(data.columns.values)
features.pop(-1)
print (features)
df = pd.DataFrame(regr.feature_importances_,features)
df.plot(kind='barh')
plt.savefig('features.png')
print (df)
print(y_pred)
fig, ax = plt.subplots()
plt.figure(figsize=(15,6))
plt.barh(features, regr.feature_importances_)
plt.yticks(rotation=30, ha = 'right')
plt.savefig("last.png")
print ("r-square from the model: ",regr.score (X_test, y_test))
print ("Mean of Cv_test: ",np.mean(y_test))
print ("Mean Squared Error: ",mean_squared_error(y_test, y_pred))
print ("MSE/Mean_Cv_test: ", mean_squared_error(y_test, y_pred)/np.mean(y_test))
print ("Mean absolute error: ",mean_absolute_error(y_test, y_pred))
print ("Mean_absolute_error/Mean_Cv_test: ",mean_absolute_error(y_test, y_pred)/np.mean(y_test))
print ("MeanAbsolutePercentage error (MAPE): ",MAPE(y_test, y_pred))
print ("MedianAbsolutePercentage error (MdAPE): ",mAPE(y_test, y_pred))
end = time.time()
print ("how long it takes: ",end-start)