-
Notifications
You must be signed in to change notification settings - Fork 76
/
sits_cluster.R
146 lines (122 loc) · 6.07 KB
/
sits_cluster.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#------------------------------------------------------------------
#' @title Cuts a cluster tree produced by sits_dendrogram
#' @name sits_cluster
#' @author Rolf Simoes, \email{rolf.simoes@@inpe.br}
#'
#' @description reads a list of clusters provided by the dtwclust
#' package and produces a sits table.
#' @references "dtwclust" package (https://CRAN.R-project.org/package=dtwclust)
#'
#' @param data.tb a tibble with input data of dtwclust.
#' @param clusters a cluster structure returned from dtwclust.
#' @param k the desired number of clusters
#' @return result.tb a SITS table with the clusters or clusters' members
#' @export
sits_cluster <- function (data.tb, clusters, k) {
# create a table to store the results
result.tb <- data.tb
# cut the tree
result.tb$cluster <- stats::cutree(clusters, k)
return (result.tb)
}
#' @title Cluster validity indices
#' @name sits_cluster_validity
#' @author Rolf Simoes, \email{rolf.simoes@@inpe.br}
#'
#' @description Compute different cluster validity indices. This function needs
#' as input a SITS tibble with `cluster` column. It is a front-end to
#' `dtwclust::cvi` function. Please refer to the documentation in that package for more details.
#' @references "dtwclust" package (https://CRAN.R-project.org/package=dtwclust)
#'
#' @param data.tb a SITS tibble with `cluster` column.
#' @param type character vector indicating which indices are to be computed. (Default "valid")
#' @return result.vec vectors with chosen CVIs
#' @export
sits_cluster_validity <- function (data.tb, type = "valid") {
# is the input data the result of a cluster function?
ensurer::ensure_that(data.tb, "cluster" %in% names (.), err_desc = "sits_cluster_cleaner: input data does not contain cluster column")
# compute CVIs and return
result.vec <- dtwclust::cvi(a = data.tb$cluster, b = factor(data.tb$label), type = type, log.base = 10)
return (result.vec)
}
#' @title Cluster frequency
#' @name sits_cluster_frequency
#' @author Rolf Simoes, \email{rolf.simoes@@inpe.br}
#'
#' @description Computes the frequency of labels in each cluster.
#' This function needs as input a SITS tibble with `cluster` column.
#'
#' @param data.tb a SITS tibble with `cluster` column.
#' @param relative (boolean) return relative frequency?
#' @return result.mtx matrix containing all frequencies of labels in clusters
#' @export
sits_cluster_frequency <- function (data.tb, relative = FALSE) {
# is the input data the result of a cluster function?
ensurer::ensure_that(data.tb, "cluster" %in% names (.), err_desc = "sits_cluster_cleaner: input data does not contain cluster column")
# compute frequency table
result.mtx <- table(data.tb$label, data.tb$cluster)
# compute relative frequency
if (relative)
result.mtx <- prop.table(result.mtx, margin = 2)
# compute total row and col
result.mtx <- stats::addmargins(result.mtx, FUN = list(Total = sum), quiet = TRUE)
return (result.mtx)
}
#' @title Cluster cleaner
#' @name sits_cluster_cleaner
#' @author Rolf Simoes, \email{rolf.simoes@@inpe.br}
#'
#' @description Removes SITS tibble samples of labels that are minority in each cluster.
#' This function needs as input a SITS tibble with `cluster` column.
#'
#' @param data.tb a SITS tibble with `cluster` column.
#' @param min_clu_perc minimum percentage of representativeness inside a cluster to remain in cluster.
#' @return result.tb a SITS tibble with all selected samples
#' @export
sits_cluster_cleaner <- function (data.tb, min_clu_perc) {
# is the input data the result of a cluster function?
ensurer::ensure_that(data.tb, "cluster" %in% names (.), err_desc = "sits_cluster_cleaner: input data does not contain cluster column")
# compute frequency in each cluster
freq.mtx <- sits_cluster_frequency(data.tb, relative = TRUE)
# get those indexes whose labels represents more than `min_clu_perc`
index.mtx <- which(freq.mtx[1:NROW(freq.mtx) - 1,1:NCOL(freq.mtx) - 1] > min_clu_perc, arr.ind = TRUE, useNames = TRUE)
# return only those samples that satisfies the `min_clu_perc` condition
filter_condition <- paste0(purrr::map2(rownames(index.mtx), index.mtx[,2],
function(lbl, clu) paste0("label=='", lbl, "' & cluster==", clu)),
collapse = " | ")
# if no index selescted, return none
filter_condition <- ifelse(filter_condition != "", filter_condition, "FALSE")
# filter result and return
result.tb <- dplyr::filter_(data.tb, filter_condition)
return (result.tb)
}
#' @title Cluster cleaner
#' @name sits_cluster_relabel
#' @author Rolf Simoes, \email{rolf.simoes@@inpe.br}
#'
#' @description Renames the labels of SITS tibble samples according to its respective cluster.
#' This function needs as input a SITS tibble with `cluster` column.
#'
#' @param data.tb a SITS tibble with `cluster` column.
#' @param cluster_names character vector informing all cluster names. If unnamed vector is informed,
#' the index of each name will be treated as cluster code
#' @return result.tb SITS tibble with relabeled samples
#' @export
sits_cluster_relabel <- function (data.tb, cluster_names) {
# is the input data the result of a cluster function?
ensurer::ensure_that(data.tb, "cluster" %in% names (.), err_desc = "sits_cluster_cleaner: input data does not contain cluster column")
# compute new clusters names
# if an unnamed vector is given
if (is.null(names(cluster_names)))
data_cluster_names.vec <- cluster_names[data.tb$cluster]
# if a named vector is given
else
data_cluster_names.vec <- cluster_names[as.character(data.tb$cluster)]
# fill not renamed entries with original value
data_cluster_names.vec <- ifelse(is.na(data_cluster_names.vec), data.tb$cluster, data_cluster_names.vec)
names(data_cluster_names.vec) <- NULL
# relabel result and return
result.tb <- data.tb
result.tb$label <- data_cluster_names.vec
return (result.tb)
}