This repository has been archived by the owner on Sep 11, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 19
/
trainByCrossValid.Rd
149 lines (126 loc) · 10.2 KB
/
trainByCrossValid.Rd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/trainByCrossValid.r
\name{trainByCrossValid}
\alias{trainByCrossValid}
\title{Calibrate a distribution/niche model using cross-validation}
\usage{
trainByCrossValid(
data,
resp = names(data)[1],
preds = names(data)[2:ncol(data)],
folds = dismo::kfold(data),
trainFx = enmSdm::trainGlm,
...,
metrics = c("logLoss", "cbi", "auc", "fpb", "tss", "msss", "mdss", "minTrainPres",
"trainSe95", "trainSe90"),
weightEvalTrain = TRUE,
weightEvalTest = TRUE,
na.rm = FALSE,
out = c("models", "tuning"),
verbose = 1
)
}
\arguments{
\item{data}{Data frame or matrix. Environmental predictors (and no other fields) for presences and background sites.}
\item{resp}{Character or integer. Name or column index of response variable. Default is to use the first column in \code{data}.}
\item{preds}{Character list or integer list. Names of columns or column indices of predictors. Default is to use the second and subsequent columns in \code{data}.}
\item{folds}{Either a numeric vector, or matrix or data frame:
\itemize{
\item If a vector, there must be one value per row in \code{data}. If there are \emph{K} unique values in the vector, then \emph{K} unique models will be trained. Each model will use all of the data except for rows that match a particular value in the \code{folds} vector. For example, if \code{folds = c(1, 1, 1, 2, 2, 2, 3, 3, 3)}, then three models will be trained, one with all rows that match the 2s and 3s, one with all rows matching 1s and 3s, and one with all rows matching 1s and 2s. The models will be evaluated against the withheld data and against the training data. Use \code{NA} to exclude rows from all testing/training. The default is to construct 5 folds of roughly equal size.
\item If a matrix or data frame, there must be one row per row in \code{data}. Each column corresponds to a different model to be trained. For a given column there should be only two unique values, plus possibly \code{NA}s. Of the two values, the lesser value will be used to identify the calibration data and the greater value the evaluation data. Rows with \code{NA}s will be ignored. For example, a particular column could contain 1s, 2s, and \code{NA}s. Data rows corresponding to 1s will be used as training data, data rows corresponding to 2s as test data, and rows with \code{NA} are dropped. The \code{NA} flag is useful for creating spatially-structured cross-validation folds where training and test sites are separated (spatially) by censored (ignored) data.
}}
\item{trainFx}{Function, name of the "trainXYZ" function to use. Currently the functions/algorithms supported are \code{\link[enmSdm]{trainGlm}}, \code{\link[enmSdm]{trainBrt}}, \code{\link[enmSdm]{trainMaxEnt}}, \code{\link[enmSdm]{trainMaxNet}}, and \code{\link[enmSdm]{trainNs}}.}
\item{...}{Arguments to pass to the "trainXYZ" function.}
\item{metrics}{Character vector, names of evaluation metrics to calculate. If \code{weightEvalTrain} and/or \code{weightEvalTest} is \code{TRUE}, then the "train" and "test" version of each metric will be weighted versions of each. The default is to calculate all of:
\itemize{
\item \code{'logLoss'}: Log loss. Higher (less negative) values imply better fit.
\item \code{'cbi'}: Continuous Boyce Index (CBI). Calculated with \code{\link[enmSdm]{contBoyce}}.
\item \code{'auc'}: Area under the receiver-operator characteristic curve (AUC). Calculated with \code{\link[enmSdm]{aucWeighted}}.
\item \code{'fpb'}: Fpb, an F-measure adapted for presence-background data. Calculated with \code{\link[enmSdm]{fpb}}.
\item \code{'tss'}: Maximum value of the True Skill Statistic. Calculated with \code{\link[enmSdm]{tssWeighted}}.
\item \code{'msss'}: Sensitivity and specificity calculated at the threshold that maximizes sensitivity (true presence prediction rate) plus specificity (true absence prediction rate).
\item \code{'mdss'}: Sensitivity (se) and specificity (sp) calculated at the threshold that minimizes the difference between sensitivity and specificity.
\item \code{'minTrainPres'}: Sensitivity and specificity at the greatest threshold at which all training presences are classified as "present".
\item \code{'trainSe95'} and/or \code{'trainSe90'}: Sensitivity at the threshold that ensures either 95 or 90 percent of all training presences are classified as "present" (training sensitivity = 0.95 or 0.9).
}}
\item{weightEvalTrain}{Logical, if \code{TRUE} (default) and an argument named \code{w} is specified in \code{...}, then evaluation statistics that support weighting will use the weights specified by \code{w} \emph{for the "train" version of evaluation statistics}. If \code{FALSE}, there will be no weighting of training sites. Note that this applies \emph{only} to the calculation of evaluation statistics. If \code{w} is supplied then weights will be used for model calibration.}
\item{weightEvalTest}{Logical, if \code{TRUE} (default) and an argument named \code{w} is specified in \code{...}, then evaluation statistics that support weighting will use the weights specified by \code{w} \emph{for the "test" version of evaluation statistics}. If \code{FALSE}, there will be no weighting of test sites. Note that this applies \emph{only} to the calculation of evaluation statistics. If \code{w} is supplied then weights will be used for model calibration.}
\item{na.rm}{Logical, if \code{TRUE} then remove \code{NA} predictions before calculating evaluation statistics. If \code{FALSE} (default), propagate \code{NA}s (meaning if predictions contain \code{NA}s, then the evaluation statistic will most likely also be \code{NA}).}
\item{out}{Character. Indicates type of value returned. If \code{'models'} then returns a list of a list of candidate models (one sublist per fold). If \code{'tuning'} then just return the evaluation table for candidate models of each fold. If both then return a 2-item list with all candidate models and tuning tables. \emph{WARNING}: Depending on the type of model, using \code{'models'} may produce objects that are very large in memory.}
\item{verbose}{Numeric. If 0 show no progress updates. If > 0 then show minimal progress updates for this function only. If > 1 show detailed progress for this function. If > 2 show detailed progress plus detailed progress for the "trainXYZ" function.}
}
\value{
A list object with several named elements:
\itemize{
\item \code{meta}: Meta-data on the model call.
\item \code{folds}: The \code{folds} object.
\item \code{models} (if \code{'models'} is in argument \code{out}): A list of model objects, one per data fold.
\item \code{tuning} (if \code{'tuning'} is in argument \code{out}): One data frame per k-fold, each containing evaluation statistics for all candidate models in the fold.
}
}
\description{
This function is an extension of any of the "trainXYZ" functions for calibrating species distribution and ecological niche models. This function uses the "trainXYZ" function to calibrate and evaluate a suite of models using cross-validation. The models are evaluated against withheld data to determine the optimal settings for a "final" model using all available data.
}
\details{
In some cases models do not converge (e.g., boosted regression trees and generalized additive models sometimes suffer from this issue). In this case the model will be skipped, but a data frame with the k-fold and model number in the fold will be returned in the \code{$meta} element in the output. If all models converged, then this data frame will be empty.
}
\examples{
\dontrun{
set.seed(123)
### contrived example
# generate training/testing data
n <- 10000
x1 <- seq(-1, 1, length.out=n) + rnorm(n)
x2 <- seq(10, 0, length.out=n) + rnorm(n)
x3 <- rnorm(n)
y <- 2 * x1 + x1^2 - 10 * x2 - x1 * x2
y <- statisfactory::invLogitAdj(y, 0.001)
presAbs <- as.integer(runif(10000) > (1 - y))
data <- data.frame(presAbs=presAbs, x1=x1, x2=x2, x3=x3)
model <- trainGlm(data, verbose=TRUE)
summary(model) # most parsimonious model
folds <- dismo::kfold(data, 3)
out <- trainByCrossValid(data, folds=folds, verbose=1)
str(out, 1)
summaryByCrossValid(out)
str(out, 1)
head(out$tuning[[1]])
head(out$tuning[[2]])
head(out$tuning[[3]])
# can do following for each fold (3 of them)
lapply(out$models[[1]], coefficients)
sapply(out$models[[1]], logLik)
sapply(out$models[[1]], AIC)
# select model for k = 1 with greatest CBI
top <- which.max(out$tuning[[1]]$cbiTest)
summary(out$models[[1]][[top]])
# in fold k = 1, which models perform well but are not overfit?
plot(out$tuning[[1]]$cbiTrain, out$tuning[[1]]$cbiTest, col='white',
main='Model Numbers for k = 1')
abline(0, 1, col='red')
numModels <- nrow(out$tuning[[1]])
text(out$tuning[[1]]$cbiTrain, out$tuning[[1]]$cbiTest, labels=1:numModels)
usr <- par('usr')
x <- usr[1] + 0.9 * (usr[4] - usr[3])
y <- usr[3] + 0.1 * (usr[4] - usr[3])
text(x, y, labels='overfit', col='red', xpd=NA)
x <- usr[1] + 0.1 * (usr[4] - usr[3])
y <- usr[3] + 0.9 * (usr[4] - usr[3])
text(x, y, labels='suspicious', col='red', xpd=NA)
# other algorithms
# boosted regression trees (with "fast" set of parameters... not recommended
# for normal use)
brt <- trainByCrossValid(data, folds=folds, verbose=2, trainFx=trainBrt,
maxTrees=2000, treeComplexity=2, learningRate=c(0.01, 0.001))
# MaxEnt with "fast" set of settings (not recommended for normal use)
mx <- trainByCrossValid(data, folds=folds, verbose=2, trainFx=trainMaxEnt,
regMult=c(1, 2), classes='lp')
}
}
\references{
Fielding, A.H. and J.F. Bell. 1997. A review of methods for the assessment of prediction errors in conservation presence/absence models. \emph{Environmental Conservation} 24:38-49.
Le Rest, K., Pinaud, D., Monestiez, P., Chadoeuf, J., and Bretagnolle, V. 2014. Spatial leave-one-out cross-validation for variable selection in the presence of spatial autocorrelation. \emph{Global Ecology and Biogeography} 23:811-820.
Wunderlich, R.F., Lin, P-Y., Anthony, J., and Petway, J.R. 2019. Two alternative evaluation metrics to replace the true skill statistic in the assessment of species distribution models. \emph{Nature Conservation} 35:97-116.
}
\seealso{
\code{\link[enmSdm]{trainBrt}}, \code{\link[enmSdm]{trainCrf}}, \code{\link[enmSdm]{trainGam}}, \code{\link[enmSdm]{trainGlm}}, \code{\link[enmSdm]{trainLars}}, \code{\link[enmSdm]{trainMaxEnt}}, \code{\link[enmSdm]{trainMaxNet}}, \code{\link[enmSdm]{trainNs}}, \code{\link[enmSdm]{trainRf}}
}