Skip to content

Commit

Permalink
fix bugs in package data regeneration for ManyAnalysts manuscript (#148)
Browse files Browse the repository at this point in the history
* Increment version number to 2.7.5.9000

* bug: fix threshold checking for multivariate model fitting #147

only count `mixed_model` after calling `distinct()` on analysis identifier column and `mixed_model` first.

Update argument checking to fail if any id columns not present.

docs: update `@details` section about required id column and add `any_of()` to `@importFrom`

* docs: rearrange headings in NEWS.md

* bug: ensure outlier subset creation occurs on all `exclusion_set` values in Zr #144

removed filter for `exclusion_set` == "complete"

* bug: exclude analysis with non-count-based dependent variable from `yi` analysis #145

* feat: #146 add function for excluding extreme estimates based on a multiplier threshold for population parameter estimates

* fix typo #146

* docs!: #146 `devtools::document()`

* bug: #146 export function

* build!: #146 apply exclusion function to Eucalyptus dataset in targets pipeline

Note that this functionality was previously included in the manuscript

* #146 increment dev version and news before rebuilding package and targets pipeline

* - build!: don't forget to filter the corresponding diversity data after exclusions!

* - build!: fix #146 regenerate yi data after excluding extreme values

* Increment version number to 2.7.6
  • Loading branch information
egouldo authored Sep 5, 2024
1 parent 8029b0d commit 967852b
Show file tree
Hide file tree
Showing 13 changed files with 1,041 additions and 790 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: ManyEcoEvo
Title: Meta-analyse data from 'Many-Analysts' style studies
Version: 2.7.5
Version: 2.7.6
Authors@R: c(
person("Elliot", "Gould", , "elliot.gould@unimelb.edu.au", role = c("aut", "cre"),
comment = c(ORCID = "https://orcid.org/0000-0002-6585-538X")),
Expand Down
8 changes: 8 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ export(cube_back)
export(divide_back)
export(est_to_zr)
export(exclude_extreme_VZ)
export(exclude_extreme_estimates)
export(fit_MA_mv)
export(fit_boxcox_ratings_cat)
export(fit_boxcox_ratings_cont)
Expand Down Expand Up @@ -131,6 +132,7 @@ importFrom(cli,cli_alert_warning)
importFrom(cli,cli_bullets)
importFrom(cli,cli_h1)
importFrom(cli,cli_h2)
importFrom(cli,cli_h3)
importFrom(cli,cli_ol)
importFrom(cli,cli_warn)
importFrom(cli,style_italic)
Expand Down Expand Up @@ -214,10 +216,14 @@ importFrom(recipes,update_role)
importFrom(rlang,"!!")
importFrom(rlang,":=")
importFrom(rlang,as_function)
importFrom(rlang,as_quosures)
importFrom(rlang,as_string)
importFrom(rlang,caller_env)
importFrom(rlang,current_env)
importFrom(rlang,enquo)
importFrom(rlang,enquos)
importFrom(rlang,ensym)
importFrom(rlang,env)
importFrom(rlang,exec)
importFrom(rlang,expr)
importFrom(rlang,exprs)
Expand All @@ -229,6 +235,7 @@ importFrom(rlang,is_na)
importFrom(rlang,is_null)
importFrom(rlang,na_chr)
importFrom(rlang,new_formula)
importFrom(rlang,quo_set_env)
importFrom(sae,bxcx)
importFrom(see,geom_jitter2)
importFrom(see,scale_fill_material_d)
Expand All @@ -255,6 +262,7 @@ importFrom(tidyr,unite)
importFrom(tidyr,unnest)
importFrom(tidyr,unnest_longer)
importFrom(tidyselect,all_of)
importFrom(tidyselect,any_of)
importFrom(tidyselect,where)
importFrom(timetk,step_box_cox)
importFrom(workflows,add_model)
Expand Down
22 changes: 19 additions & 3 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,12 +1,28 @@
# ManyEcoEvo 2.7.6

<!-- NEWS.md is maintained by https://cynkra.github.io/fledge, do not edit -->

- build!: `usethis::use_data()` update results of `make_viz()`
* - build!: force `tar_make()` for #140
* - build!: force `tar_make()` for #140, add targets meta
- build!: fix #146 regenerate yi data after excluding extreme values
- build!: don't forget to filter the corresponding diversity data after exclusions!

* Increment version number to 2.7.5
* docs: update changelog
* bug: #146 export function
* docs!: #146 `devtools::document()`
* feat: #146 add function for excluding extreme estimates based on a multiplier threshold for population parameter estimates
* bug: exclude analysis with non-count-based dependent variable from `yi` analysis #145
* bug: ensure outlier subset creation occurs on all `exclusion_set` values in Zr #144
* bug: fix threshold checking for multivariate model fitting #147
- build!: `usethis::use_data()` update results of `make_viz()`

# ManyEcoEvo 2.7.5

<!-- NEWS.md is maintained by https://cynkra.github.io/fledge, do not edit -->

- build!: `usethis::use_data()` update results of `make_viz()`
* - build!: force `tar_make()` for #140
* - build!: force `tar_make()` for #140, add targets meta

# ManyEcoEvo 2.7.4

- docs: Update function documentation #140
Expand Down
98 changes: 98 additions & 0 deletions R/exclude_extreme_estimates.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
#' Exclude extreme estimates above a threshold parameter sd
#'
#' @param data A dataframe of analyst estimates. Must contain the column
#'   `response_variable_name`, which is matched against `variable` in
#'   `param_table`.
#' @param outcome_variable A string naming the column in `data` containing the
#'   analyst estimates. When `.fn` is supplied, it is also the name of the
#'   element in the output of `.fn` that holds the transformed parameter mean.
#' @param outcome_SE A string naming the variable containing analyst SE
#'   estimates. It is only used when `.fn` is supplied, as the name of the
#'   element in the output of `.fn` that holds the transformed parameter sd.
#' @param sd_threshold A numeric threshold multiplier, see details. The default
#'   `numeric(1L)` is `0`, i.e. the threshold equals the parameter mean.
#' @param param_table A dataframe containing population parameters `mean` and `sd` for each `variable` in a given `dataset`,
#'   in long format with columns `variable`, `parameter` and `value`.
#' @param .fn An optional function that will transform parameter estimates to the same scale as `outcome_variable` in `data`.
#'   If `NULL` (the default), the `mean` and `sd` in `param_table` are used untransformed.
#' @param ... Arguments supplied to `.fn`. They are evaluated row-wise against
#'   the widened `param_table`, so they may refer to its columns (e.g. `mean`, `sd`).
#' @import dplyr
#' @importFrom rlang enquo env as_quosures enquos current_env quo_set_env is_null
#' @importFrom cli cli_h3 cli_alert_success
#' @importFrom purrr map list_c
#' @importFrom tidyr pivot_wider hoist
#' @details
#' This function is used to exclude extreme estimates from a dataset. The function
#' calculates a threshold for exclusion based on the mean and standard deviation of
#' the population parameter estimates in `param_table`. The threshold is calculated
#' as the mean of the population parameter plus `sd_threshold` times the standard
#' deviation of the population parameter. Estimates in `data` that are greater than
#' this threshold are excluded from the output.
#'
#' Rows of `data` for which no threshold can be computed (no matching `variable`
#' in `param_table`, or a missing estimate) are also dropped, because the
#' comparison against the threshold evaluates to `NA`.
#'
#' If the user chooses to supply `.fn` and `...` arguments, the function will transform
#' the population parameter estimates in `param_table` to the same scale as the
#' `outcome_variable` in `data` using `.fn`, before calculating the threshold for exclusion.
#' @export
#' @return A dataframe of analyst estimates with extreme estimates excluded. The
#'   computed `exclusion_threshold` column is retained in the output.
#' @examples
#' # example code
#' data <- ManyEcoEvo_yi %>%
#'   mutate(data =
#'            map_if(data,
#'                   ~ filter(.x,
#'                            stringr::str_detect(response_variable_name,
#'                                                "average.proportion.of.plots.containing",
#'                                                negate = TRUE)),
#'                   .p = dataset == "eucalyptus")) %>%
#'   mutate(
#'     diversity_data =
#'       map2(
#'         .x = diversity_data,
#'         .y = data,
#'         .f = ~ semi_join(.x, .y, join_by(id_col)) %>%
#'           distinct()
#'       )
#'   ) %>%
#'   prepare_response_variables(
#'     estimate_type = "yi",
#'     param_table =
#'       ManyEcoEvo:::analysis_data_param_tables,
#'     dataset_standardise = "blue tit",
#'     dataset_log_transform = "eucalyptus") %>%
#'   generate_yi_subsets() %>% #TODO: must be run after prepare_response_variables??
#'   apply_VZ_exclusions(
#'     VZ_colname = list("eucalyptus" = "se_log",
#'                       "blue tit" = "VZ"),
#'     VZ_cutoff = 3) %>%
#'   filter(dataset == "eucalyptus", estimate_type == "y25") %>%
#'   pluck("data", 1)
#' sd_threshold <- 3
#' param_table <- ManyEcoEvo:::analysis_data_param_tables
#' exclude_extreme_estimates(data, "mean_log", "se_log", sd_threshold, param_table, log_transform, estimate = mean, std.error = sd)
exclude_extreme_estimates <- function(data,
                                      outcome_variable,
                                      outcome_SE,
                                      sd_threshold = numeric(1L),
                                      param_table,
                                      .fn = NULL,
                                      ...) {
  # FOR NOW: allow transformation here, but in future, we make sure that
  # `prepare_response_variables()` returns both `back_transformed_data` and the
  # transformed / standardised data to separate list-columns to retain this data
  # Then downstream functions operate off the list-column `analysis_data` or
  # some other named list-col like `transformed_data` etc.

  stopifnot(
    is.data.frame(data),
    is.data.frame(param_table),
    is.numeric(sd_threshold)
  )

  # `enquos()` already returns a list of quosures, so no further coercion is
  # needed before splicing them into the `.fn` call below.
  dots <- rlang::enquos(...)

  # One row per variable, with one column per population parameter (`mean`, `sd`)
  param_table <- pivot_wider(
    param_table,
    names_from = parameter,
    values_from = value
  )

  if (!is_null(.fn)) {
    cli::cli_h3("Transforming {.arg param_table} using {.arg .fn}:")

    # Columns consumed by `.fn` (e.g. `mean`, `sd`); they are replaced by the
    # transformed `param_mean` / `param_sd` and dropped afterwards.
    consumed_cols <- unname(purrr::map_chr(dots, rlang::as_name))

    param_table <- param_table %>%
      rowwise() %>%
      mutate(transformed_values = list(.fn(!!!dots))) %>%
      ungroup() %>%
      hoist(
        transformed_values,
        param_mean = outcome_variable,
        param_sd = outcome_SE
      ) %>%
      # `hoist()` removes `transformed_values` entirely when every component
      # has been extracted, hence `any_of()` rather than a strict selection.
      select(-any_of(c("transformed_values", consumed_cols)))
  } else {
    param_table <- param_table %>%
      rename_with(.cols = contains(c("mean", "sd")), ~ paste0("param_", .x))
  }

  cli::cli_h3("Excluding extreme estimates from data:")

  out <- data %>%
    left_join(
      param_table,
      by = join_by(response_variable_name == variable)
    ) %>%
    mutate(exclusion_threshold = param_mean + sd_threshold * param_sd) %>%
    # Keep estimates at or below the threshold; rows where the comparison is
    # `NA` (unmatched variable or missing estimate) are dropped by `filter()`.
    filter(if_any(all_of(outcome_variable), ~ .x <= exclusion_threshold)) %>%
    select(-starts_with("param_"))

  cli::cli_alert_success("Removed {.val {nrow(data) - nrow(out)}} rows from data with {.arg sd_threshold} = {.val {sd_threshold}}")

  out
}

6 changes: 5 additions & 1 deletion R/filt_multivar_MA.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#' @importFrom tidyr unite
#' @importFrom cli cli_alert_info cli_bullets cli_h2 style_italic
#' @importFrom glue glue
#' @importFrom tidyselect any_of
#' @details
#' Depending on whether enough analyses in `data_tbl` have been conducted with the `mixed_model` variable, the function will fit a model with or without the predictor `mixed_model`.
#'
Expand All @@ -30,6 +31,7 @@
#' - `box_cox_abs_deviation_score_estimate`: response variable, Box-Cox transformed deviation from the meta-analytic mean effect-size for each analysis
#' - `mixed_model`: binary variable indicating whether the analysis used a mixed effects model or not
#' - `ReviewerId`: reviewer identifier
#' - one of `study_id` or `id_col` to uniquely identify each analysis for checking that the threshold `N` is met.
#' @family Model fitting and meta-analysis
fit_multivar_MA <- function(data_tbl, N = 5, ..., env = rlang::caller_env()) {

Expand All @@ -43,7 +45,8 @@ fit_multivar_MA <- function(data_tbl, N = 5, ..., env = rlang::caller_env()) {
PublishableAsIs,
mean_diversity_index,
ReviewerId,
mixed_model
mixed_model,
any_of(c("id_col", "study_id"))
))

# ----- Define Models -----
Expand All @@ -69,6 +72,7 @@ fit_multivar_MA <- function(data_tbl, N = 5, ..., env = rlang::caller_env()) {

pass_threshold <-
data_tbl %>%
distinct(pick(any_of(c("study_id", "id_col"))), mixed_model) %>%
count(mixed_model) %>%
pointblank::test_col_vals_gte(n, N)

Expand Down
42 changes: 40 additions & 2 deletions _targets.R
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,7 @@ list(tarchetypes::tar_file_read(name = euc_reviews,
rlang::exprs(
collinearity_subset != "collinearity_removed",
expertise_subset != "expert",
publishable_subset == "All",
exclusion_set == "complete")) |>
publishable_subset == "All")) |>
compute_MA_inputs(estimate_type = "Zr") |>
meta_analyse_datasets(
outcome_variable = "Zr",
Expand Down Expand Up @@ -264,13 +263,52 @@ list(tarchetypes::tar_file_read(name = euc_reviews,
all_prediction_data)),
targets::tar_target(name = ManyEcoEvo_yi_results,
command = ManyEcoEvo_yi %>%
mutate(
data =
map_if(data,
~ filter(.x,
stringr::str_detect(
response_variable_name,
"average.proportion.of.plots.containing",
negate = TRUE)),
.p = dataset == "eucalyptus")) %>%
mutate(
diversity_data =
map2(
.x = diversity_data,
.y = data,
.f = ~ semi_join(.x, .y, join_by(id_col)) %>%
distinct()
)
) %>%
prepare_response_variables(
estimate_type = "yi",
param_table =
ManyEcoEvo:::analysis_data_param_tables,
dataset_standardise = "blue tit",
dataset_log_transform = "eucalyptus") %>%
generate_yi_subsets() %>% #TODO: must be run after prepare_response_variables??
rowwise() %>%
mutate(data = if (dataset == "eucalyptus") {
list(
exclude_extreme_estimates(
data,
outcome_variable = "mean_log",
outcome_SE = "se_log",
param_table = ManyEcoEvo:::analysis_data_param_tables,
sd_threshold = 3,
.fn = log_transform,
estimate = mean,
std.error = sd))
} else {list(data)},
diversity_data = if (dataset == "eucalyptus") {
list(
semi_join(diversity_data,
data,
by = "id_col") %>%
distinct())
} else {list(diversity_data)}) %>%
ungroup %>%
apply_VZ_exclusions(
VZ_colname = list("eucalyptus" = "se_log",
"blue tit" = "VZ"),
Expand Down
Loading

0 comments on commit 967852b

Please sign in to comment.