33 add optional arg .by for grouping (#34)

* add basic aggregate/filter/group functionality * address tidyselect issue * bump dev version * prettify * add tests and update handling * rm group_id expectation and change group argument name to .by * rm group_id from tests where not required * aggregate_alignment_loanbook_exposure outputs results at comp and lbk level * add test for .by variables in matched data * documentation * correct documentation * do not allow grouping by crucial variables
RMI-PACTA · Apr 24, 2024 · 2a04b63 · 2a04b63
1 parent dda0a1a
commit 2a04b63
Show file tree

Hide file tree

Showing 7 changed files with 515 additions and 193 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pacta.multi.loanbook.analysis
 Title: Tools to Calculate Climate Metrics for Multiple Loanbooks
-Version: 0.0.0.9002
+Version: 0.0.0.9003
 Authors@R: 
     c(person(given = "Jacob",
              family = "Kastl",

diff --git a/R/aggregate_alignment_loanbook_exposure.R b/R/aggregate_alignment_loanbook_exposure.R
@@ -1,50 +1,136 @@
 #' Return loan book level aggregation of company alignment metrics by exposure
 #'
-#' @param data data.frame. Holds output pf company indicators
-#' @param matched data.frame. Holds matched and prioritised loan book
+#' @param data data.frame. Holds output of company indicators. Contains columns
+#'   `"name_abcd"`,	`"sector"`,	`"activity_unit"`,	`"region"`,
+#'   `"scenario_source"`,	`"scenario"`,	`"year"`,	`"direction"`,
+#'   `"total_deviation"` and	`"alignment_metric"`.
+#' @param matched data.frame. Holds matched and prioritized loan book data.
 #' @param level Character. Vector that indicates if the aggregate alignment
 #'   metric should be returned based on the net technology deviations (`net`) or
 #'   disaggregated into buildout and phaseout technologies (`bo_po`).
+#' @param .by Character. Optionally, a selection of columns to group by. All
+#'   columns indicated must be available variables in the `matched` data set.
+#'   The intended use case is to allow analyzing the loan books by additional
+#'   traits of interest, such as types of financial institutions. Default is
+#'   `NULL`, which means the aggregation is done at the meta level. It is not
+#'   possible to group by the critical columns of the `data` and `matched`
+#'   inputs.
 #'
 #' @return NULL
 #' @export
 aggregate_alignment_loanbook_exposure <- function(data,
                                                   matched,
-                                                  level = c("net", "bo_po")) {
-  group_vars <- c("group_id", "scenario", "region", "sector", "year", "direction")
+                                                  level = c("net", "bo_po"),
+                                                  .by = NULL) {
+  group_vars <- c(
+    "scenario",
+    "region",
+    "sector",
+    "year",
+    "direction"
+  )
   level <- rlang::arg_match(level)
 
   # validate input data sets
   validate_input_data_aggregate_alignment_loanbook_exposure(
     data = data,
     matched = matched,
-    group_vars = group_vars
+    group_vars = group_vars,
+    .by = .by
   )
 
+  if (!is.null(.by)) {
+    if (!inherits(.by, "character")) {
+      stop(glue::glue("`.by` must be a character vector. Your input is {class(.by)}."))
+    }
+    group_vars <- c(.by, group_vars)
+  }
+
   matched <- matched %>%
     dplyr::select(
       dplyr::all_of(
-        c("group_id", "id_loan", "loan_size_outstanding", "loan_size_outstanding_currency", "name_abcd", "sector")
+        c(
+          .by,
+          "id_loan",
+          "loan_size_outstanding",
+          "loan_size_outstanding_currency",
+          "name_abcd",
+          "sector"
+        )
       )
     ) %>%
     dplyr::summarise(
       loan_size_outstanding = sum(.data$loan_size_outstanding, na.rm = TRUE),
-      .by = c("group_id", "loan_size_outstanding_currency", "name_abcd", "sector")
+      .by = dplyr::all_of(
+        c(
+          .env$.by,
+          "loan_size_outstanding_currency",
+          "name_abcd",
+          "sector"
+        )
+      )
     ) %>%
     dplyr::mutate(
       exposure_weight = .data$loan_size_outstanding / sum(.data$loan_size_outstanding, na.rm = TRUE),
-      .by = c("group_id", "loan_size_outstanding_currency")
+      .by = dplyr::all_of(c(.env$.by, "loan_size_outstanding_currency"))
     )
 
   aggregate_exposure_company <- data %>%
     dplyr::inner_join(
       matched,
-      by = c("group_id", "name_abcd", "sector")
+      by = c("name_abcd", "sector"),
+      relationship = "many-to-many"
+    )
+
+  # if a company only has technologies going in one direction in a sector with
+  # high carbon and low carbon technologies, add an empty entry for the other
+  # direction to ensure the aggregation is correct
+  if (level == "bo_po") {
+    aggregate_exposure_company <- aggregate_exposure_company %>%
+      dplyr::mutate(
+        n_directions = dplyr::n_distinct(.data$direction, na.rm = TRUE),
+        .by = dplyr::all_of(
+          c(
+            group_vars[!group_vars == "direction"],
+            "name_abcd",
+            "sector",
+            "activity_unit",
+            "loan_size_outstanding_currency"
+          )
+        )
+      )
+
+    single_direction <- aggregate_exposure_company %>%
+      dplyr::filter(
+        .data$n_directions == 1,
+        .data$direction %in% c("buildout", "phaseout"),
+        .data$sector %in% c("automotive", "hdv", "power")
+      )
+
+    opposite_direction <- single_direction %>%
+      dplyr::mutate(
+        direction = dplyr::if_else(
+          .data$direction == "buildout",
+          "phaseout",
+          "buildout"
+        ),
+        total_deviation = 0,
+        alignment_metric = 0
+      )
+
+    aggregate_exposure_company <- aggregate_exposure_company %>%
+      dplyr::bind_rows(opposite_direction) %>%
+      dplyr::select(-"n_directions")
+  }
+
+  out_company <- aggregate_exposure_company %>%
+    dplyr::relocate(
+      dplyr::all_of(c(.by))
     )
 
   sector_aggregate_exposure_loanbook_summary <- aggregate_exposure_company %>%
     dplyr::mutate(
-      n_companies = dplyr::n(),
+      n_companies = dplyr::n_distinct(.data$name_abcd),
       .by = dplyr::all_of(group_vars)
     ) %>%
     dplyr::mutate(
@@ -85,83 +171,98 @@ aggregate_alignment_loanbook_exposure <- function(data,
       )
   }
 
-  # if a company only has technologies going in one direction in a sector with
-  # high carbon and low carbon technologies, add an empty entry for the other
-  # direction to ensure the aggregation is correct
-  if (level == "bo_po") {
-    aggregate_exposure_company <- aggregate_exposure_company %>%
-      dplyr::mutate(
-        n_directions = dplyr::n(),
-        .by = dplyr::all_of(
-          c(
-            group_vars[!group_vars == "direction"], "name_abcd", "sector",
-            "activity_unit", "loan_size_outstanding_currency"
-          )
-        )
-      )
-
-    single_direction <- aggregate_exposure_company %>%
-      dplyr::filter(
-        .data$n_directions == 1,
-        .data$direction %in% c("buildout", "phaseout"),
-        .data$sector %in% c("automotive", "hdv", "power")
-      )
-
-    opposite_direction <- single_direction %>%
-      dplyr::mutate(
-        direction = dplyr::if_else(
-          .data$direction == "buildout",
-          "phaseout",
-          "buildout"
-        ),
-        total_deviation = 0,
-        alignment_metric = 0
-      )
-
-    aggregate_exposure_company <- aggregate_exposure_company %>%
-      bind_rows(opposite_direction) %>%
-      dplyr::select(-"n_directions")
-  }
-
   sector_aggregate_exposure_loanbook_alignment <- aggregate_exposure_company %>%
     dplyr::summarise(
-      exposure_weighted_net_alignment = stats::weighted.mean(.data$alignment_metric, w = .data$exposure_weight, na.rm = TRUE),
+      exposure_weighted_net_alignment = stats::weighted.mean(
+        x = .data$alignment_metric,
+        w = .data$exposure_weight,
+        na.rm = TRUE
+      ),
       .by = dplyr::all_of(group_vars)
     )
 
-  out <- sector_aggregate_exposure_loanbook_summary %>%
+  out_aggregate <- sector_aggregate_exposure_loanbook_summary %>%
     dplyr::inner_join(
       sector_aggregate_exposure_loanbook_alignment,
       by = group_vars
     ) %>%
     dplyr::relocate(
-      c(
-        group_vars, "n_companies", "n_companies_aligned",
-        "share_companies_aligned", "exposure_weighted_net_alignment"
+      dplyr::all_of(
+        c(
+          group_vars,
+          "n_companies",
+          "n_companies_aligned",
+          "share_companies_aligned",
+          "exposure_weighted_net_alignment"
+        )
       )
     ) %>%
-    dplyr::arrange(.data$group_id, .data$scenario, .data$region, .data$sector, .data$year)
+    dplyr::arrange(
+      !!!rlang::syms(.by),
+      .data$scenario,
+      .data$region,
+      .data$sector,
+      .data$year
+    )
+
+  out <- list(
+    company = out_company,
+    aggregate = out_aggregate
+  )
 
   return(out)
 }
 
 validate_input_data_aggregate_alignment_loanbook_exposure <- function(data,
                                                                       matched,
-                                                                      group_vars) {
+                                                                      group_vars,
+                                                                      .by = NULL) {
   validate_data_has_expected_cols(
     data = data,
     expected_columns = c(
-      group_vars, "name_abcd", "activity_unit", "scenario_source", "alignment_metric"
+      group_vars,
+      "name_abcd",
+      "activity_unit",
+      "scenario_source",
+      "alignment_metric"
     )
   )
 
   validate_data_has_expected_cols(
     data = matched,
     expected_columns = c(
-      "group_id", "id_loan", "loan_size_outstanding",
-      "loan_size_outstanding_currency", "name_abcd", "sector"
+      "id_loan",
+      "loan_size_outstanding",
+      "loan_size_outstanding_currency",
+      "name_abcd",
+      "sector",
+      .by
     )
   )
 
+  if (!is.null(.by)) {
+    if (
+      any(
+        .by %in% c(
+          group_vars,
+          "name_abcd",
+          "activity_unit",
+          "scenario_source",
+          "alignment_metric",
+          "id_loan",
+          "loan_size_outstanding",
+          "loan_size_outstanding_currency",
+          "name_abcd",
+          "sector"
+        )
+      )
+    ) {
+      stop(
+        "It is not possible to group by the critical columns of the `data` and
+        `matched` inputs. Please check your .by argument!"
+      )
+    }
+  }
+
   invisible()
 }