From 3cf424332d3ecea618ec92c80f684ac6de7eb2a8 Mon Sep 17 00:00:00 2001
From: Teun van den Brand <tahvdbrand@gmail.com>
Date: Wed, 28 Aug 2024 14:44:38 +0200
Subject: [PATCH] draft summarise stat

---
 DESCRIPTION            |   1 +
 NAMESPACE              |   2 +
 R/stat-summarise.R     | 129 +++++++++++++++++++++++++++++++
 man/ggplot2-ggproto.Rd |   5 +-
 man/stat_summarise.Rd  | 171 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 306 insertions(+), 2 deletions(-)
 create mode 100644 R/stat-summarise.R
 create mode 100644 man/stat_summarise.Rd

diff --git a/DESCRIPTION b/DESCRIPTION
index b4cd9ec950..ab1ffe9fd1 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -262,6 +262,7 @@ Collate:
     'stat-smooth-methods.R'
     'stat-smooth.R'
     'stat-sum.R'
+    'stat-summarise.R'
     'stat-summary-2d.R'
     'stat-summary-bin.R'
     'stat-summary-hex.R'
diff --git a/NAMESPACE b/NAMESPACE
index f0ccf3bec1..31c870a5aa 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -274,6 +274,7 @@ export(StatSf)
 export(StatSfCoordinates)
 export(StatSmooth)
 export(StatSum)
+export(StatSummarise)
 export(StatSummary)
 export(StatSummary2d)
 export(StatSummaryBin)
@@ -686,6 +687,7 @@ export(stat_sf_coordinates)
 export(stat_smooth)
 export(stat_spoke)
 export(stat_sum)
+export(stat_summarise)
 export(stat_summary)
 export(stat_summary2d)
 export(stat_summary_2d)
diff --git a/R/stat-summarise.R b/R/stat-summarise.R
new file mode 100644
index 0000000000..183097b742
--- /dev/null
+++ b/R/stat-summarise.R
@@ -0,0 +1,129 @@
+#' Arbitrary summaries by group
+#'
+#' `stat_summarise()` handles arbitrary summaries by group. Unlike
+#' [`stat_summary()`], it does not automatically summarise by `x`.
+#'
+#' @inheritParams stat_identity
+#' @param fun A function that is given the complete data per group and should
+#'   return a data.frame with variables compatible with the layer's `geom`
+#'   argument. Defaults to `express_group()` which will evaluate
+#'   [data-masked][rlang::args_data_masking] expressions passed to `fun.args`.
+#' @param fun.args Optional additional arguments to pass to `fun`. When
+#'   `fun = express_group` (default), can be a list of expressions created by
+#'   `vars()`.
+#' @eval rd_aesthetics("stat", "summarise")
+#' @export
+#'
+#' @examples
+#' # Using a helper summary function
+#' my_summary_fun <- function(data) {
+#'   data.frame(
+#'     x    = min(data$x) - 0.5,
+#'     xend = max(data$x) + 0.5,
+#'     y    = mean(data$y)
+#'   )
+#' }
+#'
+#' ggplot(mpg, aes(drv, displ)) +
+#'   geom_jitter(aes(colour = drv)) +
+#'   stat_summarise(
+#'     geom = "segment",
+#'     fun = my_summary_fun
+#'   )
+#'
+#' # Making the same summary using the `exprs` argument
+#' ggplot(mpg, aes(drv, displ)) +
+#'   geom_jitter(aes(colour = drv)) +
+#'   stat_summarise(
+#'     geom = "segment",
+#'     fun.args = vars(
+#'       # Note: we use `xend` first to not mutate `x` too soon
+#'       xend = max(x) + 0.5,
+#'       x    = min(x) - 0.5,
+#'       y    = mean(y)
+#'     )
+#'   )
+#'
+#' # You can use a temporary variable that needn't conform to data.frame columns
+#' # as long as it is removed afterwards
+#' ggplot(mpg, aes(displ, colour = drv)) +
+#'   stat_summarise(
+#'     geom = "line",
+#'     fun.args = vars(
+#'       dens = density(x), # non-vector temporary variable
+#'       x    = dens$x,
+#'       y    = dens$y,
+#'       dens = NULL # deleting the temporary variable
+#'     )
+#'   )
+stat_summarise <- function(
+    mapping = NULL,
+    data = NULL,
+    geom = "point",
+    position = "identity",
+    ...,
+    fun = express_group,
+    fun.args = list(),
+    na.rm = FALSE,
+    show.legend = NA,
+    inherit.aes = TRUE
+) {
+
+  layer(
+    data = data,
+    mapping = mapping,
+    stat = StatSummarise,
+    geom = geom,
+    position = position,
+    show.legend = show.legend,
+    inherit.aes = inherit.aes,
+    params = list2(
+      na.rm = na.rm,
+      fun = fun,
+      fun.args = fun.args,
+      ...
+    )
+  )
+}
+
+#' @rdname ggplot2-ggproto
+#' @format NULL
+#' @usage NULL
+#' @export
+StatSummarise <- ggproto(
+  "StatSummarise", Stat,
+
+  setup_params = function(data, params) {
+    params$fun <- allow_lambda(params$fun)
+    check_function(params$fun, arg = "fun")
+    if (!is_named2(params$fun.args)) {
+      cli::cli_abort("Every element in {.arg fun.args} must be named.")
+    }
+    if (identical(params$fun, express_group) && length(params$fun.args) > 0) {
+      check_object(
+        params$fun.args, is_quosures, arg = "fun.args",
+        "generated by `vars()`", allow_null = TRUE
+      )
+    }
+    params
+  },
+
+  compute_group = function(data, scales, fun = express_group, fun.args = list()) {
+    inject(fun(data, !!!fun.args))
+  }
+)
+
+express_group <- function(data, ...) {
+  exprs <- list2(...)
+  if (length(exprs) < 1) {
+    return(data)
+  }
+  data <- unclass(data)
+  # We append each evaluated expression to `data`, so that the next expression
+  # can use the result of all previously evaluated expressions
+  nms <- names(exprs)
+  for (i in seq_along(exprs)) {
+    data[[nms[[i]]]] <- eval_tidy(exprs[[i]], data)
+  }
+  data_frame0(!!!data[unique0(names(exprs))])
+}
diff --git a/man/ggplot2-ggproto.Rd b/man/ggplot2-ggproto.Rd
index c3384f1e45..ae36223102 100644
--- a/man/ggplot2-ggproto.Rd
+++ b/man/ggplot2-ggproto.Rd
@@ -25,8 +25,8 @@
 %   R/stat-count.R, R/stat-density-2d.R, R/stat-density.R, R/stat-ecdf.R,
 %   R/stat-ellipse.R, R/stat-function.R, R/stat-identity.R, R/stat-qq-line.R,
 %   R/stat-qq.R, R/stat-quantilemethods.R, R/stat-smooth.R, R/stat-sum.R,
-%   R/stat-summary-2d.R, R/stat-summary-bin.R, R/stat-summary-hex.R,
-%   R/stat-summary.R, R/stat-unique.R, R/stat-ydensity.R
+%   R/stat-summarise.R, R/stat-summary-2d.R, R/stat-summary-bin.R,
+%   R/stat-summary-hex.R, R/stat-summary.R, R/stat-unique.R, R/stat-ydensity.R
 \docType{data}
 \name{ggplot2-ggproto}
 \alias{ggplot2-ggproto}
@@ -144,6 +144,7 @@
 \alias{StatQuantile}
 \alias{StatSmooth}
 \alias{StatSum}
+\alias{StatSummarise}
 \alias{StatSummary2d}
 \alias{StatSummaryBin}
 \alias{StatSummaryHex}
diff --git a/man/stat_summarise.Rd b/man/stat_summarise.Rd
new file mode 100644
index 0000000000..7002194126
--- /dev/null
+++ b/man/stat_summarise.Rd
@@ -0,0 +1,171 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/stat-summarise.R
+\name{stat_summarise}
+\alias{stat_summarise}
+\title{Arbitrary summaries}
+\usage{
+stat_summarise(
+  mapping = NULL,
+  data = NULL,
+  geom = "point",
+  position = "identity",
+  ...,
+  fun = express_group,
+  fun.args = list(),
+  na.rm = FALSE,
+  show.legend = NA,
+  inherit.aes = TRUE
+)
+}
+\arguments{
+\item{mapping}{Set of aesthetic mappings created by \code{\link[=aes]{aes()}}. If specified and
+\code{inherit.aes = TRUE} (the default), it is combined with the default mapping
+at the top level of the plot. You must supply \code{mapping} if there is no plot
+mapping.}
+
+\item{data}{The data to be displayed in this layer. There are three
+options:
+
+If \code{NULL}, the default, the data is inherited from the plot
+data as specified in the call to \code{\link[=ggplot]{ggplot()}}.
+
+A \code{data.frame}, or other object, will override the plot
+data. All objects will be fortified to produce a data frame. See
+\code{\link[=fortify]{fortify()}} for which variables will be created.
+
+A \code{function} will be called with a single argument,
+the plot data. The return value must be a \code{data.frame}, and
+will be used as the layer data. A \code{function} can be created
+from a \code{formula} (e.g. \code{~ head(.x, 10)}).}
+
+\item{geom}{The geometric object to use to display the data for this layer.
+When using a \verb{stat_*()} function to construct a layer, the \code{geom} argument
+can be used to override the default coupling between stats and geoms. The
+\code{geom} argument accepts the following:
+\itemize{
+\item A \code{Geom} ggproto subclass, for example \code{GeomPoint}.
+\item A string naming the geom. To give the geom as a string, strip the
+function name of the \code{geom_} prefix. For example, to use \code{geom_point()},
+give the geom as \code{"point"}.
+\item For more information and other ways to specify the geom, see the
+\link[=layer_geoms]{layer geom} documentation.
+}}
+
+\item{position}{A position adjustment to use on the data for this layer. This
+can be used in various ways, including to prevent overplotting and
+improving the display. The \code{position} argument accepts the following:
+\itemize{
+\item The result of calling a position function, such as \code{position_jitter()}.
+This method allows for passing extra arguments to the position.
+\item A string naming the position adjustment. To give the position as a
+string, strip the function name of the \code{position_} prefix. For example,
+to use \code{position_jitter()}, give the position as \code{"jitter"}.
+\item For more information and other ways to specify the position, see the
+\link[=layer_positions]{layer position} documentation.
+}}
+
+\item{...}{Other arguments passed on to \code{\link[=layer]{layer()}}'s \code{params} argument. These
+arguments broadly fall into one of 4 categories below. Notably, further
+arguments to the \code{position} argument, or aesthetics that are required
+can \emph{not} be passed through \code{...}. Unknown arguments that are not part
+of the 4 categories below are ignored.
+\itemize{
+\item Static aesthetics that are not mapped to a scale, but are at a fixed
+value and apply to the layer as a whole. For example, \code{colour = "red"}
+or \code{linewidth = 3}. The geom's documentation has an \strong{Aesthetics}
+section that lists the available options. The 'required' aesthetics
+cannot be passed on to the \code{params}. Please note that while passing
+unmapped aesthetics as vectors is technically possible, the order and
+required length is not guaranteed to be parallel to the input data.
+\item When constructing a layer using
+a \verb{stat_*()} function, the \code{...} argument can be used to pass on
+parameters to the \code{geom} part of the layer. An example of this is
+\code{stat_density(geom = "area", outline.type = "both")}. The geom's
+documentation lists which parameters it can accept.
+\item Inversely, when constructing a layer using a
+\verb{geom_*()} function, the \code{...} argument can be used to pass on parameters
+to the \code{stat} part of the layer. An example of this is
+\code{geom_area(stat = "density", adjust = 0.5)}. The stat's documentation
+lists which parameters it can accept.
+\item The \code{key_glyph} argument of \code{\link[=layer]{layer()}} may also be passed on through
+\code{...}. This can be one of the functions described as
+\link[=draw_key]{key glyphs}, to change the display of the layer in the legend.
+}}
+
+\item{fun}{A function that is given the complete data per group and should
+return a data.frame with variables compatible with the layer's \code{geom}
+argument. Defaults to \code{express_group()} which will evaluate
+\link[rlang:args_data_masking]{data-masked} expressions passed to \code{fun.args}.}
+
+\item{fun.args}{Optional additional arguments to pass to \code{fun}. When
+\code{fun = express_group} (default), can be a list of expressions created by
+\code{vars()}.}
+
+\item{show.legend}{logical. Should this layer be included in the legends?
+\code{NA}, the default, includes if any aesthetics are mapped.
+\code{FALSE} never includes, and \code{TRUE} always includes.
+It can also be a named logical vector to finely select the aesthetics to
+display. To include legend keys for all levels, even
+when no data exists, use \code{TRUE}.  If \code{NA}, all levels are shown in legend,
+but unobserved levels are omitted.}
+
+\item{inherit.aes}{If \code{FALSE}, overrides the default aesthetics,
+rather than combining with them. This is most useful for helper functions
+that define both data and aesthetics and shouldn't inherit behaviour from
+the default plot specification, e.g. \code{\link[=borders]{borders()}}.}
+}
+\description{
+Arbitrary summaries
+}
+\section{Aesthetics}{
+
+\code{stat_summarise()} understands the following aesthetics (required aesthetics are in bold):
+\itemize{
+\item \code{\link[=aes_group_order]{group}}
+}
+Learn more about setting these aesthetics in \code{vignette("ggplot2-specs")}.
+}
+
+\examples{
+# Using a helper summary function
+my_summary_fun <- function(data) {
+  data.frame(
+    x    = min(data$x) - 0.5,
+    xend = max(data$x) + 0.5,
+    y    = mean(data$y)
+  )
+}
+
+ggplot(mpg, aes(drv, displ)) +
+  geom_jitter(aes(colour = drv)) +
+  stat_summarise(
+    geom = "segment",
+    fun = my_summary_fun
+  )
+
+# Making the same summary using the `exprs` argument
+ggplot(mpg, aes(drv, displ)) +
+  geom_jitter(aes(colour = drv)) +
+  stat_summarise(
+    geom = "segment",
+    fun.args = vars(
+      # Note: we use `xend` first to not mutate `x` too soon
+      xend = max(x) + 0.5,
+      x    = min(x) - 0.5,
+      y    = mean(y)
+    )
+  )
+
+# You can use a temporary variable that needn't conform to data.frame columns
+# as long as it is removed afterwards
+ggplot(mpg, aes(displ, colour = drv)) +
+  stat_summarise(
+    geom = "line",
+    fun.args = vars(
+      dens = density(x), # non-vector temporary variable
+      x    = dens$x,
+      y    = dens$y,
+      dens = NULL # deleting the temporary variable
+    )
+  )
+}