Skip to content

Commit

Permalink
Update automated tests & controls/inputs/outputs
Browse files Browse the repository at this point in the history
  • Loading branch information
maximelenormand committed Apr 4, 2024
1 parent a00e363 commit 6295f8f
Show file tree
Hide file tree
Showing 17 changed files with 1,481 additions and 594 deletions.
4 changes: 2 additions & 2 deletions R/cut_tree.R
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ cut_tree <- function(tree,

arg_added <- list(...)
if(inherits(tree, "bioregion.clusters")){
if (tree$name == "hierarchical_clustering") {
if (tree$name == "hclu_hierarclust") {
cur.tree <- tree$algorithm$final.tree
# Update args
tree$args[c("n_clust", "cut_height", "find_h", "h_max", "h_min",
Expand Down Expand Up @@ -321,7 +321,7 @@ cut_tree <- function(tree,
output_cut_height <- cut_height
}

clusters <- knbclu(clusters, reorder = FALSE, method = "length")
clusters <- knbclu(clusters, reorder = TRUE, method = "length")

if(inherits(tree, "bioregion.clusters")) {
cur.tree$args$cut_height <- cut_height
Expand Down
10 changes: 5 additions & 5 deletions R/generic_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,15 @@ print.bioregion.clusters <- function(x, ...)
# algorithm name -----
cat("Clustering results for algorithm : ")
cat(x$name, "\n")
if(x$name == "hierarchical_clustering") {
if(x$name == "hclu_hierarclust") {
cat("\t(hierarchical clustering based on a dissimilarity matrix)\n")
}

# dataset characteristics -----
cat(" - Number of sites: ", x$inputs$nb_sites, "\n")

# methodological details -----
if(x$name == "hierarchical_clustering") {
if(x$name == "hclu_hierarclust") {
cat(" - Name of dissimilarity metric: ",
ifelse(is.null(x$args$index),
"Undefined",
Expand All @@ -44,7 +44,7 @@ print.bioregion.clusters <- function(x, ...)
if (inherits(x$clusters, "data.frame")) {

# Further methodological details if hclust
if(x$name == "hierarchical_clustering") {
if(x$name == "hclu_hierarclust") {
if(!is.null(x$args$n_clust))
{
cat(" - Number of clusters requested by the user: ",
Expand Down Expand Up @@ -97,7 +97,7 @@ print.bioregion.clusters <- function(x, ...)
paste(nclust, collapse = " ")),
"\n")

if(x$name == "hierarchical_clustering") {
if(x$name == "hclu_hierarclust") {
if(x$args$find_h)
{
cat(" - Height of cut of the hierarchical tree:",
Expand All @@ -124,7 +124,7 @@ print.bioregion.clusters <- function(x, ...)
#' @method plot bioregion.clusters
plot.bioregion.clusters <- function(x, ...)
{
if(x$name == ("hierarchical_clustering"))
if(x$name == ("hclu_hierarclust"))
{
args <- list(...)
# Changing default arguments for hclust plot
Expand Down
126 changes: 50 additions & 76 deletions R/hclu_diana.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,24 +18,25 @@
#' @param index name or number of the dissimilarity column to use. By default,
#' the third column name of `dissimilarity` is used.
#'
#' @param n_clust an integer or a vector of integers indicating the number of
#' @param n_clust an `integer` or an `integer` vector indicating the number of
#' clusters to be obtained from the hierarchical tree, or the output from
#' [partition_metrics]. Should not be used at the same time as
#' `cut_height`.
#'
#' @param cut_height a numeric vector indicating the height(s) at which the
#' @param cut_height a `numeric` vector indicating the height(s) at which the
#' tree should be cut. Should not be used at the same time as `n_clust`.
#'
#' @param find_h a boolean indicating if the height of cut should be found for
#' @param find_h a `boolean` indicating if the height of cut should be found for
#' the requested `n_clust`.
#'
#' @param h_max a numeric indicating the maximum possible tree height for
#' @param h_max a `numeric` indicating the maximum possible tree height for
#' the chosen `index`.
#'
#' @param h_min a numeric indicating the minimum possible height in the tree
#' @param h_min a `numeric` indicating the minimum possible height in the tree
#' for the chosen `index`.
#'
#' @details
#' The function is based on [diana][cluster::diana].
#' Chapter 6 of Kaufman and Rousseeuw (1990) fully details the functioning of
#' the diana algorithm.
#'
Expand All @@ -44,12 +45,12 @@
#' @return
#' A `list` of class `bioregion.clusters` with five slots:
#' \enumerate{
#' \item{**name**: `character string` containing the name of the algorithm}
#' \item{**name**: `character` containing the name of the algorithm}
#' \item{**args**: `list` of input arguments as provided by the user}
#' \item{**inputs**: `list` of characteristics of the clustering process}
#' \item{**algorithm**: `list` of all objects associated with the
#' clustering procedure, such as original cluster objects}
#' \item{**clusters**: `data.frame` containing the clustering results}}#'
#' \item{**clusters**: `data.frame` containing the clustering results}}
#'
#' @references
#' \insertRef{Kaufman2009}{bioregion}
Expand Down Expand Up @@ -88,118 +89,91 @@ hclu_diana <- function(dissimilarity,
h_min = 0){

# 1. Controls ---------------------------------------------------------------
if(inherits(dissimilarity, "bioregion.pairwise.metric")){
if(attr(dissimilarity, "type") == "similarity") {
stop("dissimilarity seems to be a similarity object.
hclu_hierarclust() should be applied on dissimilarity, not
similarities.
Use similarity_to_dissimilarity() before using hclu_hierarclust()")
}
if(is.numeric(index)){
index <- names(dissimilarity)[index]
}
if(!(index %in% colnames(dissimilarity))) {
stop("Argument index should be one of the column names of dissimilarity")
}

} else if(!any(inherits(dissimilarity, "bioregion.pairwise.metric"),
inherits(dissimilarity, "dist"))){
if(is.numeric(index)) {
index <- names(dissimilarity)[index]
}
if(is.null(index) || !(index %in% colnames(dissimilarity))) {
stop("dissimilarity is not a bioregion.pairwise.metric object, a
dissimilarity matrix (class dist) or a data.frame with at least 3
columns (site1, site2, and your dissimilarity index).")
controls(args = NULL, data = dissimilarity, type = "input_nhandhclu")
if(!inherits(dissimilarity, "dist")){
controls(args = NULL, data = dissimilarity, type = "input_dissimilarity")
controls(args = NULL, data = dissimilarity,
type = "input_data_frame_nhandhclu")
controls(args = index, data = dissimilarity, type = "input_net_index")
net <- dissimilarity
net[, 3] <- net[, index]
net <- net[, 1:3]
controls(args = NULL, data = net, type = "input_net_index_value")
dist.obj <- stats::as.dist(
net_to_mat(net,
weight = TRUE, squared = TRUE, symmetrical = TRUE))
} else {
controls(args = NULL, data = dissimilarity, type = "input_dist")
dist.obj <- dissimilarity
if(is.null(labels(dist.obj))){
attr(dist.obj, "Labels") <- paste0(1:attr(dist.obj, "Size"))
message("No labels detected, they have been assigned automatically.")
}
}

if(!is.null(n_clust)) {
if(is.numeric(n_clust)) {
if(any(!(n_clust %% 1 == 0))) {
stop("n_clust must an integer or a vector of integers determining the
number of clusters.")
}
controls(args = n_clust, data = NULL,
type = "strict_positive_integer_vector")
} else if(inherits(n_clust, "bioregion.partition.metrics")){
if(!is.null(n_clust$algorithm$optimal_nb_clusters)) {
n_clust <- n_clust$algorithm$optimal_nb_clusters
} else {
stop("n_clust does not have an optimal number of clusters. Did you
specify partition_optimisation = TRUE in partition_metrics()?")
specify partition_optimisation = TRUE in partition_metrics()?",
call. = FALSE)
}
} else{
stop("n_clust must be one of those:
* an integer determining the number of clusters
* a vector of integers determining the numbers of clusters for each cut
* the output from partition_metrics()")
* the output from partition_metrics()",
call. = FALSE)
}
if(!is.null(cut_height)){
stop("Please provide either n_clust or cut_height, but not both at the
same time.")
same time.",
call. = FALSE)
}
}

if(!is.null(cut_height)){
if(!is.numeric(cut_height) || any(cut_height < 0)){
stop("cut_height must be a positive integer.")
}
}

if(!is.logical(find_h)){
stop("find_h must be a Boolean.")
controls(args = cut_height, data = NULL, type = "positive_numeric_vector")
}

if(!is.numeric(h_max) || h_max < 0){
stop("h_max must be a positive integer.")
}

if(!is.numeric(h_min) || h_min < 0){
stop("h_min must be a positive integer.")
}

controls(args = find_h, data = NULL, type = "boolean")
controls(args = h_min, data = NULL, type = "positive_numeric")
controls(args = h_max, data = NULL, type = "positive_numeric")
if(h_min > h_max){
stop("h_min must be inferior to h_max.")
}

# 2. Function ---------------------------------------------------------------
# Convert dissimilarity into a dist object
if(!inherits(dissimilarity, "dist")){
# dist.obj <- .dfToDist(dissimilarity, metric = index)
dist.obj <- stats::as.dist(
net_to_mat(dissimilarity[, c(colnames(dissimilarity)[1:2], index)],
weight = TRUE, squared = TRUE, symmetrical = TRUE))

} else {
dist.obj <- dissimilarity
}

# Output of the function
outputs <- list(name = "divisive_hierarchical_clustering")
outputs <- list(name = "hclu_diana")

# Adding dynamic_tree_cut = FALSE for compatibility with generic functions
dynamic_tree_cut <- FALSE
outputs$args <- list(index = index,
# method = method,
# randomize = randomize,
# n_runs = n_runs,
# optimal_tree_method = optimal_tree_method,
n_clust = n_clust,
cut_height = cut_height,
find_h = find_h,
h_max = h_max,
h_min = h_min,
dynamic_tree_cut = dynamic_tree_cut)

outputs$inputs <- list(nb_sites = attr(dist.obj, "Size"))
outputs$inputs <- list(bipartite = FALSE,
weight = TRUE,
pairwise = TRUE,
pairwise_metric = ifelse(!inherits(dissimilarity,
"dist"),
ifelse(is.numeric(index),
names(net)[3], index),
NA),
dissimilarity = TRUE,
nb_sites = attr(dist.obj, "Size"))

# DIANA clustering
diana_clust <- cluster::diana(dist.obj,
diss = inherits(dist.obj, "dist"),
# metric = "euclidean",
# stand = FALSE,
# stop.at.k = FALSE,
# keep.diss = n < 100,
# keep.data = !diss,
trace.lev = 0)

outputs$algorithm$final.tree <- diana_clust
Expand Down
Loading

0 comments on commit 6295f8f

Please sign in to comment.