-
Notifications
You must be signed in to change notification settings - Fork 76
/
api_cluster.R
161 lines (154 loc) · 6.07 KB
/
api_cluster.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#' @title Cluster validity indices
#' @name .cluster_validity
#' @keywords internal
#' @noRd
#' @author Rolf Simoes, \email{rolf.simoes@@inpe.br}
#'
#' @description Compute different cluster validity indices. This function needs
#' as input a sits tibble with `cluster` column.
#' It is a front-end to `dtwclust::cvi` function.
#' That function computes five indices:
#' 1) adjusted Rand index; 2) Rand index; 3) Jaccard index;
#' 4) Fowlkes-Mallows; and 5) Variation of Information index
#' Please refer to the documentation in that package for more details.
#'
#' @references "dtwclust" package (https://CRAN.R-project.org/package=dtwclust)
#'
#' @param samples A tibble with `cluster` column.
#'
#' @return A vector with four validity indices.
#'
.cluster_validity <- function(samples) {
# set caller to show in errors
.check_set_caller(".cluster_validity")
# verifies if dtwclust package is installed
.check_require_packages("dtwclust")
# is the input data the result of a cluster function?
.check_samples_cluster(samples)
# compute CVIs and return
result <- dtwclust::cvi(
a = factor(samples[["cluster"]]),
b = factor(samples[["label"]]),
type = "external",
log.base = 10
)
return(result)
}
#' @title Compute a dendrogram using hierarchical clustering
#' @name .cluster_dendrogram
#' @keywords internal
#' @noRd
#' @author Rolf Simoes, \email{rolf.simoes@@inpe.br}
#' @author Gilberto Camara, \email{gilberto.camara@@inpe.br}
#'
#' @description Cluster time series in hierarchical mode.
#' Hierarchical clustering, as its name suggests,
#' is an algorithm that tries to create a hierarchy of groups in which,
#' as the level in the hierarchy increases, clusters are created by merging
#' the clusters from the next lower level, producing
#' an ordered sequence of groupings. The similarity measure used to
#' group time series in a cluster is the dtw metric.
#' The procedure is deterministic, so it will always give the same
#' result for a chosen set of similarity measures
#' (see \code{\link[dtwclust]{tsclust}}).
#'
#' @references `dtwclust` package (https://CRAN.R-project.org/package=dtwclust)
#'
#' @param samples Time series data and metadata
#' to be used to generate the dendrogram.
#' @param bands Vector of bands to be clustered.
#' @param dist_method One of the supported distances (single char vector)
#' "dtw": DTW with a Sakoe-Chiba constraint.
#' "dtw2": DTW with L2 norm and Sakoe-Chiba constraint.
#' "dtw_basic": A faster DTW with less functionality.
#' "lbk": Keogh's lower bound for DTW.
#' "lbi": Lemire's lower bound for DTW.
#' @param linkage Agglomeration method to be used (single char vector)
#' One of "ward.D", "ward.D2", "single", "complete",
#' "average", "mcquitty", "median" or "centroid".
#' @param ... Any additional parameters to be passed
#' to dtwclust::tsclust() function.
#' @return Full dendrogram tree for data analysis
#' (class "dendrogram")
#'
.cluster_dendrogram <- function(samples,
bands,
dist_method = "dtw_basic",
linkage = "ward.D2", ...) {
# verifies if dtwclust package is installed
.check_require_packages("dtwclust")
# get the values of the time series
values <- .values_ts(samples, bands, format = "cases_dates_bands")
# call dtwclust and get the resulting dendrogram
dendro <- dtwclust::tsclust(
values,
type = "hierarchical",
k = max(nrow(samples) - 1, 2),
distance = dist_method,
control = dtwclust::hierarchical_control(method = linkage),
...
)
# return the dendrogram
return(dendro)
}
#' @title Compute validity indexes to a range of cut height
#' @name .cluster_dendro_bestcut
#' @keywords internal
#' @noRd
#' @author Rolf Simoes, \email{rolf.simoes@@inpe.br}
#'
#' @description Reads a dendrogram object and its corresponding sits tibble and
#' computes the best number of clusters that maximizes the adjusted Rand index.
#'
#' @references
#' Lawrence Hubert and Phipps Arabie. Comparing partitions.
#' Journal of Classification, 2, p.193--218, 1985.
#'
#'
#' @param samples Input set of time series.
#' @param dendro Dendrogram object returned from
#' \code{\link[sits]{.cluster_dendrogram}}.
#' @return Vector with best number of clusters (k)
#' and its respective height.
#'
.cluster_dendro_bestcut <- function(samples, dendro) {
# compute range
k_range <- seq(2, max(length(dendro[["height"]]) - 1, 2))
# compute ARI for each k (vector)
ari <-
k_range |>
purrr::map(function(k) {
x <- stats::cutree(dendro, k = k)
y <- factor(samples[["label"]])
.cluster_rand_index(table(x, y))
}) |>
unlist()
# get the best ARI result
k_result <- k_range[which.max(ari)]
# compute each height corresponding to `k_result`
h_index <- length(dendro[["height"]]) - k_result + 2
h_result <- c(0, dendro[["height"]])[h_index]
# create a named vector and return
best_cut <- structure(c(k_result, h_result), .Names = c("k", "height"))
return(best_cut)
}
#' @title Compute Rand index for cluster table
#' @name .cluster_rand_index
#' @noRd
#' @param x a cluster produced by dtwclust::tsclust
#' @param correct use best calculation
#' @return Rand index for cluster
.cluster_rand_index <- function(x) {
.check_set_caller(".cluster_rand_index")
.check_that(length(dim(x)) == 2)
n <- sum(x)
ni <- rowSums(x)
nj <- colSums(x)
n2 <- choose(n, 2)
nis2 <- sum(choose(ni[ni > 1], 2))
njs2 <- sum(choose(nj[nj > 1], 2))
factor_1 <- (nis2 * njs2) / n2
factor_2 <- (nis2 + njs2) / 2
rand <- (sum(choose(x[x > 1], 2)) - factor_1) / (factor_2 - factor_1)
return(rand)
}