Skip to content

Commit

Permalink
add support for dendrogram sub-clustering
Browse files Browse the repository at this point in the history
  • Loading branch information
Yunuuuu committed Jul 17, 2024
1 parent 078302d commit 3b66e03
Show file tree
Hide file tree
Showing 9 changed files with 179 additions and 63 deletions.
4 changes: 4 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ Collate:
'htanno-.R'
'htanno-dendrogram.R'
'htanno-group.R'
'htanno-kmeans.R'
'import-standalone-assert.R'
'import-standalone-cli.R'
'import-standalone-obj-type.R'
Expand All @@ -48,3 +49,6 @@ Collate:
ByteCompile: true
URL: https://github.com/Yunuuuu/ggheat
BugReports: https://github.com/Yunuuuu/ggheat/issues
Suggests:
testthat (>= 3.0.0)
Config/testthat/edition: 3
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ S3method(htanno_add,facetted_pos_scales)
S3method(htanno_dendro_add,CoordFlip)
S3method(htanno_dendro_add,gg)
S3method(htanno_dendro_add,labels)
S3method(order2,dendrogram)
S3method(order2,hclust)
S3method(plot,anno)
S3method(print,ggheatmap)
S3method(set_context,annotations)
Expand Down
5 changes: 4 additions & 1 deletion R/anno-initialize.R
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,10 @@ anno_initialize.htanno <- function(object, plot, object_name) {

# compute statistics ---------------------------------
statistics <- rlang::inject(
htanno$compute(data, position, !!!htanno$compute_params)
htanno$compute(
data, old_panels, old_index, position,
!!!htanno$compute_params
)
)
slot(object, "statistics") <- statistics

Expand Down
113 changes: 77 additions & 36 deletions R/dendrogram.R
Original file line number Diff line number Diff line change
Expand Up @@ -78,28 +78,15 @@ hclust2 <- function(matrix,
ans
}

cutree_k_to_h <- function(tree, k) {
if (is.null(n1 <- nrow(tree$merge)) || n1 < 1) {
cli::cli_abort("invalid {.arg tree} ({.field merge} component)")
}
n <- n1 + 1
if (is.unsorted(tree$height)) {
cli::cli_abort(
"the 'height' component of 'tree' is not sorted (increasingly)"
)
}
mean(tree$height[c(n - k, n - k + 1L)])
}

#' Dengrogram x and y coordinates
#'
#' @param tree A [hclust][stats::hclust] or a [dendrogram][stats::as.dendrogram]
#' object.
#' @param priority A string of "left" or "right". if we draw from right to left,
#' the left will override the right, so we take the `"left"` as the priority. If
#' we draw from `left` to `right`, the right will override the left, so we take
#' the `"right"` as priority. This is used by the [htanno_dendro] to provide
#' support of facet operation in ggplot2.
#' the `"right"` as priority. This is used by [htanno_dendro] to provide support
#' of facet operation in ggplot2.
#' @param center A boolean value. if `TRUE`, nodes are plotted centered with
#' respect to the leaves in the branch. Otherwise (default), plot them in the
#' middle of all direct child nodes.
Expand All @@ -108,8 +95,8 @@ cutree_k_to_h <- function(tree, k) {
#' of the number of observations in `tree`.
#' @param leaf_braches Branches of the leaf node. Must be the same length of the
#' number of observations in `tree`. Usually come from [cutree][stats::cutree].
#' @param branch_gap A numeric value indicates the gap between different
#' branches. If a [unit] object is provided, it'll convert into `native` values.
#' @param branch_gap A single numeric value indicates the gap between different
#' branches.
#' @param root A length one string or numeric indicates the root branch.
#' @return A list of 2 data.frame. One for node coordinates, another for edge
#' coordinates.
Expand All @@ -124,7 +111,7 @@ cutree_k_to_h <- function(tree, k) {
#' different groups.
#' - `panel1` and `panel2`: The panel1 and panel2 variables have the same
#' functionality as panel, but they are specifically for the `edge` data and
#' correspond to both nodes of each edge.
#' correspond to both nodes of each edge.
#' - `panel`: which panel current node is, if we split the plot into panels
#' using [facet_grid][ggplot2::facet_grid], this column will show
#' which panel current node or edge is from. Note: some nodes
Expand All @@ -145,7 +132,7 @@ dendrogram_data <- function(tree,
leaf_braches = NULL,
branch_gap = NULL,
root = NULL) {
dend <- check_tree(tree)
dend <- check_dendrogram(tree)
assert_bool(center)
type <- match.arg(type, c("rectangle", "triangle"))
priority <- match.arg(priority, c("left", "right"))
Expand Down Expand Up @@ -178,23 +165,17 @@ dendrogram_data <- function(tree,
}
# branch_gap must be a numeric value
# and the length must be equal to `length(unique(leaf_braches)) - 1L`
if (is.unit(branch_gap)) {
branch_gap <- grid::convertUnit(branch_gap, "native", valueOnly = TRUE)
} else if (is.numeric(branch_gap)) {
if (!is_scalar(branch_gap) &&
!is.null(leaf_braches) &&
length(branch_gap) != (length(unique(leaf_braches)) - 1L)) {
cli::cli_abort(paste(
"{.arg branch_gap} must be of length",
"{.code length(unique(leaf_braches)) - 1}"
))
if (is.numeric(branch_gap)) {
if (!is_scalar(branch_gap)) {
cli::cli_abort("{.arg branch_gap} must be of length 1")
}
} else if (is.null(branch_gap)) {
branch_gap <- 0
} else {
cli::cli_abort("{.arg branch_gap} must be numeric value.")
}

# the root value shouldn't be the same of leaf branches.
if (!is_scalar(root)) {
cli::cli_abort("{.arg root} must be of length 1")
} else if (anyNA(root)) {
Expand All @@ -204,10 +185,12 @@ dendrogram_data <- function(tree,
"{.arg root} cannot contain value in {.arg leaf_braches}"
)
}

# initialize values
i <- 0L # leaf index
branch_levels <- root
last_branch <- root
total_gap <- 0 # will made x always be numeric
total_gap <- 0
.dendrogram_data <- function(dend, from_root = TRUE) {
if (stats::is.leaf(dend)) { # base version
index <- as.integer(dend) # the column index of the original data
Expand Down Expand Up @@ -348,9 +331,8 @@ dendrogram_data <- function(tree,
ggpanel = direct_leaves_ggpanel
)
# 2 horizontal lines
# we double the left line and the right line
# when there is a node is not in a panel, or the node
# is not
# we double the left line and the right line when a node is not
# in a panel, or the edge spaned across different panels.
if (anyNA(direct_leaves_panel) ||
.subset(direct_leaves_panel, 1L) !=
.subset(direct_leaves_panel, 2L)
Expand Down Expand Up @@ -472,8 +454,59 @@ dendrogram_data <- function(tree,
)
}

check_tree <- function(tree, arg = rlang::caller_arg(tree),
call = caller_call()) {
cutree_k_to_h <- function(tree, k) {
if (is.null(n1 <- nrow(tree$merge)) || n1 < 1) {
cli::cli_abort("invalid {.arg tree} ({.field merge} component)")
}
n <- n1 + 1
if (is.unsorted(tree$height)) {
cli::cli_abort(
"the 'height' component of 'tree' is not sorted (increasingly)"
)
}
mean(tree$height[c(n - k, n - k + 1L)])
}

# this function won't set the right `midpoint`, but out plot function won't use
# it, so, it has no hurt to use.
merge_dendrogram <- function(parent, children) {
children_heights <- vapply(children, attr, numeric(1L), "height")
parent_branch_heights <- tree_branch_heights(parent)
cutoff_height <- max(children_heights) + min(parent_branch_heights) * 0.5
.merge_dendrogram <- function(dend) {
if (stats::is.leaf(dend)) { # base version
.subset2(children, dend)
} else { # for a branch, we should update the members, height
attrs <- attributes(dend)
# we recursively run for each node of current branch
dend <- lapply(dend, .merge_dendrogram)
heights <- vapply(dend, attr, numeric(1L), "height")
n_members <- vapply(dend, attr, integer(1L), "members")
# we update height and members, in addition, midpoint
attrs$height <- .subset2(attrs, "height") + max(heights)
attrs$members <- sum(n_members)
attributes(dend) <- attrs
dend
}
}
ans <- .merge_dendrogram(parent)
attr(ans, "cutoff_height") <- cutoff_height
ans
}

tree_branch_heights <- function(dend) {
if (stats::is.leaf(dend)) {
return(NULL)
} else {
c(
attr(dend, "height"),
unlist(lapply(dend, tree_branch_heights), FALSE, FALSE)
)
}
}

check_dendrogram <- function(tree, arg = rlang::caller_arg(tree),
call = caller_call()) {
if (inherits(tree, "hclust")) {
stats::as.dendrogram(tree)
} else if (inherits(tree, "dendrogram")) {
Expand All @@ -482,6 +515,14 @@ check_tree <- function(tree, arg = rlang::caller_arg(tree),
cli::cli_abort(paste(
"{.arg {arg}} must be a {.cls hclust}",
"or a {.cls dendrogram} object."
))
), call = call)
}
}

order2 <- function(x) UseMethod("order2")

#' @export
order2.hclust <- function(x) x$order

#' @export
order2.dendrogram <- function(x) stats::order.dendrogram(x)
1 change: 1 addition & 0 deletions R/htanno-.R
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ methods::setClass(
#' @rdname htanno
HtannoProto <- ggplot2::ggproto("HtannoProto",
call = NULL,
# each function can modify these parameters
compute_params = NULL,
layout_params = NULL,
draw_params = NULL,
Expand Down
101 changes: 83 additions & 18 deletions R/htanno-dendrogram.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@ htanno_dendro <- function(mapping = aes(), ...,
k = k, h = h, plot_cut_height = plot_cut_height,
plot = ggplot2::ggplot(mapping = mapping),
segment_params = rlang::list2(...),
center = center, type = type, root = root
center = center, type = type, root = root,
# initialize height value
height = NULL
),
labels = labels, labels_nudge = labels_nudge,
set_context = set_context, name = name, order = NULL,
Expand Down Expand Up @@ -85,30 +87,93 @@ HtannoDendro <- ggplot2::ggproto("HtannoDendro", HtannoProto,
)
self
},
compute = function(data, panels, index, position,
distance, method, use_missing) {
compute = function(self, data, panels, index, position,
distance, method, use_missing, k, h) {
# what if the old panels exist, we should do sub-clustering
if (!is.null(panels)) {
# if the heatmap has established groups,
# `k` and `h` must be NULL, and we'll do sub-clustering
if (!is.null(k) || !is.null(h)) {
cli::cli_abort(sprintf(
"Cannot cut tree since the heatmap %s %s",
to_matrix_axis(position), "has been splitted"
))
}
children <- vector("list", nlevels(panels))
names(children) <- levels(panels)
labels <- rownames(data)
# we do clustering within each group
for (g in levels(panels)) {
index <- which(panels == g)
gdata <- data[index, , drop = FALSE]
if (nrow(gdata) == 1L) {
children[[g]] <- structure(
index,
class = "dendrogram",
leaf = TRUE,
height = 0,
label = .subset(labels, index),
members = 1L
)
} else {
child <- stats::as.dendrogram(self$compute(
data = gdata,
panels = NULL,
distance = distance,
method = method,
use_missing = use_missing
))
# we restore the actual index of the original matrix
child <- stats::dendrapply(child, function(x) {
if (stats::is.leaf(x)) {
ans <- .subset(index, x)
attributes(ans) <- attributes(x)
ans
} else {
x
}
})
children[[g]] <- child
}
}
if (nlevels(panels) == 1L) { # only one parent
ans <- .subset2(children, 1L)
} else {
parent_data <- t(sapply(levels(panels), function(g) {
colMeans(data[panels == g, , drop = FALSE])
}))
rownames(parent_data) <- levels(panels)
parent <- stats::as.dendrogram(self$compute(
data = parent_data,
panels = NULL,
distance = distance,
method = method,
use_missing = use_missing
))
ans <- merge_dendrogram(parent, children)
self$draw_params$height <- attr(ans, "cutoff_height")
}
return(ans)
}
hclust2(data, distance, method, use_missing)
},
layout = function(self, data, statistics, panels, index, position, k, h) {
if (!is.null(index)) {
cli::cli_warn(sprintf(
"{.fn {snake_class(self)}} will disrupt the previously %s %s",
"established order of the heatmap",
layout = function(self, data, statistics, panels, old_index,
position, k, h) {
# we'll always reorder the dendrogram
index <- order2(statistics)
if (!is.null(old_index) && !identical(index, old_index)) {
cli::cli_abort(sprintf(
"You cannot reorder the heatmap %s twice",
to_matrix_axis(position)
))
), call = self$call)
}
if (!is.null(k)) {
panels <- stats::cutree(statistics, k = k)
height <- cutree_k_to_h(statistics, k)
self$draw_params$height <- cutree_k_to_h(statistics, k)
} else if (!is.null(h)) {
panels <- stats::cutree(statistics, h = height <- h)
} else {
panels <- NULL
height <- NULL
panels <- stats::cutree(statistics, h = h)
self$draw_params$height <- h
}
# fix error height not supplied to `$draw()` method
self$draw_params["height"] <- list(height)
index <- statistics$order
# reorder panel factor levels to following the dendrogram order
if (!is.null(panels)) panels <- factor(panels, unique(panels[index]))
list(panels, index)
Expand All @@ -117,7 +182,7 @@ HtannoDendro <- ggplot2::ggproto("HtannoDendro", HtannoProto,
# other argumentds
plot, height, plot_cut_height, center, type,
root, segment_params) {
if (!identical(statistics$order, index)) {
if (!identical(order2(statistics), index)) {
cli::cli_abort(c(
"Cannot draw the dendrogram",
i = "the node order has been changed"
Expand Down
4 changes: 2 additions & 2 deletions R/plot-build.R
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,8 @@ ggheat_build.ggheatmap <- function(x, ...) {
.subset2(default_xscales, i),
"ggheat"
)
# we copy the expand from user input
# into the default for usage of annotation
# we copy the `expand` from user input into the default for usage of
# annotation
default_xscales[[i]]$expand <- .subset2(user_xscales, i)$expand
}

Expand Down
8 changes: 4 additions & 4 deletions man/dendrogram_data.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 3b66e03

Please sign in to comment.