#' @title Cluster validity indices #' @name .cluster_validity #' @keywords internal #' @noRd #' @author Rolf Simoes, \email{rolf.simoes@@inpe.br} #' #' @description Compute different cluster validity indices. This function needs #' as input a sits tibble with `cluster` column. #' It is a front-end to `dtwclust::cvi` function. #' That function computes five indices: #' 1) adjusted Rand index; 2) Rand index; 3) Jaccard index; #' 4) Fowlkes-Mallows; and 5) Variation of Information index #' Please refer to the documentation in that package for more details. #' #' @references "dtwclust" package (https://CRAN.R-project.org/package=dtwclust) #' #' @param samples A tibble with `cluster` column. #' #' @return A vector with four validity indices. #' .cluster_validity <- function(samples) { # set caller to show in errors .check_set_caller(".cluster_validity") # verifies if dtwclust package is installed .check_require_packages("dtwclust") # is the input data the result of a cluster function? .check_samples_cluster(samples) # compute CVIs and return result <- dtwclust::cvi( a = factor(samples[["cluster"]]), b = factor(samples[["label"]]), type = "external", log.base = 10 ) return(result) } #' @title Compute a dendrogram using hierarchical clustering #' @name .cluster_dendrogram #' @keywords internal #' @noRd #' @author Rolf Simoes, \email{rolf.simoes@@inpe.br} #' @author Gilberto Camara, \email{gilberto.camara@@inpe.br} #' #' @description Cluster time series in hierarchical mode. #' Hierarchical clustering, as its name suggests, #' is an algorithm that tries to create a hierarchy of groups in which, #' as the level in the hierarchy increases, clusters are created by merging #' the clusters from the next lower level, producing #' an ordered sequence of groupings. The similarity measure used to #' group time series in a cluster is the dtw metric. #' The procedure is deterministic, so it will always give the same #' result for a chosen set of similarity measures #' (see \code{\link[dtwclust]{tsclust}}). #' #' @references `dtwclust` package (https://CRAN.R-project.org/package=dtwclust) #' #' @param samples Time series data and metadata #' to be used to generate the dendrogram. #' @param bands Vector of bands to be clustered. #' @param dist_method One of the supported distances (single char vector) #' "dtw": DTW with a Sakoe-Chiba constraint. #' "dtw2": DTW with L2 norm and Sakoe-Chiba constraint. #' "dtw_basic": A faster DTW with less functionality. #' "lbk": Keogh's lower bound for DTW. #' "lbi": Lemire's lower bound for DTW. #' @param linkage Agglomeration method to be used (single char vector) #' One of "ward.D", "ward.D2", "single", "complete", #' "average", "mcquitty", "median" or "centroid". #' @param ... Any additional parameters to be passed #' to dtwclust::tsclust() function. #' @return Full dendrogram tree for data analysis #' (class "dendrogram") #' .cluster_dendrogram <- function(samples, bands, dist_method = "dtw_basic", linkage = "ward.D2", ...) { # verifies if dtwclust package is installed .check_require_packages("dtwclust") # get the values of the time series values <- .values_ts(samples, bands, format = "cases_dates_bands") # call dtwclust and get the resulting dendrogram dendro <- dtwclust::tsclust( values, type = "hierarchical", k = max(nrow(samples) - 1, 2), distance = dist_method, control = dtwclust::hierarchical_control(method = linkage), ... ) # return the dendrogram return(dendro) } #' @title Compute validity indexes to a range of cut height #' @name .cluster_dendro_bestcut #' @keywords internal #' @noRd #' @author Rolf Simoes, \email{rolf.simoes@@inpe.br} #' #' @description Reads a dendrogram object and its corresponding sits tibble and #' computes the best number of clusters that maximizes the adjusted Rand index. #' #' @references #' Lawrence Hubert and Phipps Arabie. Comparing partitions. #' Journal of Classification, 2, p.193--218, 1985. #' #' #' @param samples Input set of time series. #' @param dendro Dendrogram object returned from #' \code{\link[sits]{.cluster_dendrogram}}. #' @return Vector with best number of clusters (k) #' and its respective height. #' .cluster_dendro_bestcut <- function(samples, dendro) { # compute range k_range <- seq(2, max(length(dendro[["height"]]) - 1, 2)) # compute ARI for each k (vector) ari <- k_range |> purrr::map(function(k) { x <- stats::cutree(dendro, k = k) y <- factor(samples[["label"]]) .cluster_rand_index(table(x, y)) }) |> unlist() # get the best ARI result k_result <- k_range[which.max(ari)] # compute each height corresponding to `k_result` h_index <- length(dendro[["height"]]) - k_result + 2 h_result <- c(0, dendro[["height"]])[h_index] # create a named vector and return best_cut <- structure(c(k_result, h_result), .Names = c("k", "height")) return(best_cut) } #' @title Compute Rand index for cluster table #' @name .cluster_rand_index #' @noRd #' @param x a cluster produced by dtwclust::tsclust #' @param correct use best calculation #' @return Rand index for cluster .cluster_rand_index <- function(x) { .check_set_caller(".cluster_rand_index") .check_that(length(dim(x)) == 2) n <- sum(x) ni <- rowSums(x) nj <- colSums(x) n2 <- choose(n, 2) nis2 <- sum(choose(ni[ni > 1], 2)) njs2 <- sum(choose(nj[nj > 1], 2)) factor_1 <- (nis2 * njs2) / n2 factor_2 <- (nis2 + njs2) / 2 rand <- (sum(choose(x[x > 1], 2)) - factor_1) / (factor_2 - factor_1) return(rand) }