-
Notifications
You must be signed in to change notification settings - Fork 0
/
forester.R
156 lines (140 loc) · 7.12 KB
/
forester.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#' Automated Machine Learning Model Solver
#'
#' Different tree-based models such as: XGBoost, ranger, CatBoost, LightGBM, etc. require different syntaxes and
#' different specific data objects. This function provides a simple and unified formula to create those models
#' with options to automatically cover the whole process of creating Machine Learning Model: Preprocessing Data,
#' Feature Engineering, Creating Models, Optimizing Hyperparameters, Model Explanation and Evaluating Models.
#'
#'
#' @param data data.frame, matrix, data.table or dgCMatrix - training data set to create model, if data_test = NULL, then data will be
#' automatically divided into training and testing dataset. NOTE: data has to contain the target column.
#' @param target character: name of the target column; must be a character string and must be a column name in data.
#' @param type character: defining the task. Two options are: "regression" and "classification", particularly, binary classification.
#' @param metric character, name of metric used for evaluating best model. For regression, options are: "mse", "rmse", "mad" and "r2".
#' For classification, options are: "auc", "recall", "precision", "f1" and "accuracy".
#' @param data_test optional argument, class of data.frame, matrix, data.table or dgCMatrix - test data set used for evaluating model performance.
#' @param train_ratio numeric, ranging between 0 and 1, indicating the proportion of the original dataset used for training; the remaining data are used as the test set for measuring model performance.
#' @param fill_na logical, default is TRUE. If TRUE, missing values in the target column are removed, missing values in categorical columns are replaced by the mode and
#' missing values in numeric columns are substituted by the median of the corresponding columns.
#' @param num_features numeric, default is NULL. Parameter indicates number of most important features, which are chosen from the train dataset. Automatically, those important
#' features will be kept in the train and test datasets.
#' @param tune logical. If TRUE, function will perform the hyperparameter tuning steps for each model inside.
#' @param tune_iter number (default: 20) - total number of times the optimization step is to be repeated. This argument is used when tune = TRUE.
#' @param refclass character, default is NULL. For classification, the name of the reference (positive) class of the target column; passed on to the ranger model.
#'
#'
#' @return An object of the class \code{forester_model} which is the best model with respect to the
#' chosen metric. It's also an object of the class \code{explainer} from DALEX family inherited the
#' explanation for the best chosen model.
#'
#' @export
#' @importFrom stats predict
#' @examples
#' \donttest{
#' # regression
#' library(DALEX)
#' data(apartments, package="DALEX")
#'
#' exp <- forester(apartments, "m2.price", "regression")
#' # plot(model_parts(exp))
#'
#' # binary classification
#' library(DALEX)
#' data(titanic_imputed, package="DALEX")
#'
#' exp <- forester(titanic_imputed, "survived", "classification")
#' # plot(model_parts(exp))
#'}
##
#' @noRd
# Main entry point: validates the data, splits (or checks) the test set,
# fits up to four tree-based models, and returns the evaluated results.
# Interface is unchanged from the original.
forester <- function(data, target, type, metric = NULL, data_test = NULL, train_ratio = 0.8, fill_na = TRUE, num_features = NULL, tune = FALSE, tune_iter = 20, refclass = NULL){
  message("__________________________")
  message("FORESTER")
  data <- check_conditions(data, target, type)

  # If no test set was supplied, split `data` into train/test by `train_ratio`;
  # otherwise validate the user-supplied test set and require matching columns.
  if (is.null(data_test)) {
    splited_data <- split_data(data, target, type, ratio = train_ratio)
    data_train <- splited_data[[1]]
    data_test <- splited_data[[2]]
  } else {
    data_test <- check_conditions(data_test, target, type)
    data_train <- data
    if (!setequal(colnames(data_train), colnames(data_test))) {
      stop("Column names in train data set and test data set are not identical.")
    }
  }

  # NOTE(review): the return value is unused; `prepare_data()` appears to be
  # called for its side-effect messages describing preprocessing — confirm.
  data_for_messages <- prepare_data(data_train = data_train, target = target, type = type,
                                    fill_na = fill_na, num_features = num_features)

  message("__________________________")
  message("CREATING MODELS")

  # Fit one model via the `make_*` function named by `maker`. Returns the
  # fitted explainer, or NULL (with an "Omitting" message) when fitting fails.
  # NOTE(review): any error — not only a missing package — triggers the
  # "package is not available" message; this mirrors the original behavior.
  fit_model <- function(maker, label, pkg, extra_args = list()) {
    args <- c(list(data = data_train, target = target, type = type,
                   tune = tune, tune_metric = metric,
                   tune_iter = tune_iter, fill_na = fill_na,
                   num_features = num_features),
              extra_args)
    fit <- try(suppressMessages(do.call(maker, args)), silent = TRUE)
    # inherits() is the robust way to test for a try() failure;
    # class(x) == "try-error" breaks when class(x) has length > 1.
    if (inherits(fit, "try-error")) {
      message("--- Omitting `", maker, "()` because the `", pkg,
              "` package is not available ---")
      return(NULL)
    }
    message("--- ", label, " model has been created ---")
    fit
  }

  # Only make_ranger() accepts `refclass`; the other backends do not.
  ranger_exp   <- fit_model("make_ranger", "Ranger", "ranger",
                            extra_args = list(refclass = refclass))
  catboost_exp <- fit_model("make_catboost", "Catboost", "catboost")
  xgboost_exp  <- fit_model("make_xgboost", "Xgboost", "xgboost")
  lightgbm_exp <- fit_model("make_lightgbm", "LightGBM", "lightgbm")

  message("__________________________")
  message("COMPARISON")
  # evaluate() ranks the (possibly NULL) models on `data_test` by `metric`.
  result <- evaluate(
    catboost_exp,
    xgboost_exp,
    ranger_exp,
    lightgbm_exp,
    data_test = data_test,
    target = target,
    metric = metric
  )
  list(best_model = result$best_model,
       model1 = result$model1,
       model2 = result$model2,
       model3 = result$model3,
       model4 = result$model4,
       test_data = data_test)
}