diff --git a/NAMESPACE b/NAMESPACE index 239040b4..64652bfc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -30,6 +30,7 @@ export(orsf_vi_anova) export(orsf_vi_negate) export(orsf_vi_permute) export(orsf_vs) +export(pred_spec_auto) import(R6) import(data.table) importFrom(Rcpp,sourceCpp) diff --git a/R/check.R b/R/check.R index 5ec5c42c..46055e7e 100644 --- a/R/check.R +++ b/R/check.R @@ -264,7 +264,8 @@ check_arg_lteq <- function(arg_value, arg_name, bound, append_to_msg = NULL){ #' so hopefully nothing is returned. #' #' @noRd -check_arg_is_valid <- function(arg_value, arg_name, valid_options) { +check_arg_is_valid <- function(arg_value, arg_name, valid_options, + context = NULL) { valid_arg <- arg_value %in% valid_options @@ -278,8 +279,11 @@ check_arg_is_valid <- function(arg_value, arg_name, valid_options) { sep = ', ', last = ' or ') + # context needs an extra bit of text if it isn't null. + if(!is.null(context)) context <- paste0(" for ", context) + error_msg <- paste0( - arg_name, " should be <", expected_values, ">", + arg_name, " should be <", expected_values, ">", context, " but is instead <", arg_values, ">" ) diff --git a/R/orsf_R6.R b/R/orsf_R6.R index c111e622..29d2b70c 100644 --- a/R/orsf_R6.R +++ b/R/orsf_R6.R @@ -1,7 +1,6 @@ # TODO: # - add nocov to cpp -# - compute_pd re-write # - automatic bounds for pd (better interface) # - tests for check_oobag_eval_function # - tests for survival forest w/no censored @@ -423,6 +422,7 @@ ObliqueForest <- R6::R6Class( private$data_bounds <- NULL }, + predict = function(new_data, pred_horizon, pred_type, @@ -563,648 +563,74 @@ ObliqueForest <- R6::R6Class( public_state <- list(data = self$data, na_action = self$na_action, - pred_horizon = self$pred_horizon) + pred_horizon = self$pred_horizon, + n_thread = self$n_thread) private_state <- list(data_rows_complete = private$data_rows_complete) - self$check_boundary_checks(boundary_checks) - self$check_pred_spec(pred_spec, boundary_checks) - self$check_n_thread(n_thread) - self$check_expand_grid(expand_grid) - self$check_oobag_pred_mode(oobag, label = 'oobag') - - prob_values <- prob_values %||% c(0.025, 0.50, 0.975) - prob_labels <- prob_labels %||% c('lwr', 'medn', 'upr') - - self$check_prob_values(prob_values) - self$check_prob_labels(prob_labels) - - if(length(prob_values) != length(prob_labels)){ - stop("prob_values and prob_labels must have the same length.", - call. = FALSE) - } - - # oobag=FALSE to match the format of arg in orsf_pd(). - self$check_pred_type(pred_type, oobag = FALSE) - - pred_type <- pred_type %||% self$pred_type - - self$check_pred_horizon(pred_horizon, boundary_checks, pred_type) - - pred_horizon <- pred_horizon %||% self$pred_horizon %||% 1 - - pred_horizon_order <- order(pred_horizon) - pred_horizon_ordered <- pred_horizon[pred_horizon_order] - # run checks before you assign new values to object. - # otherwise, if a check throws an error, the object will - # not be restored to its normal state. - - - if(!oobag){ - self$check_data(new = TRUE, data = pd_data) - # say new = FALSE to prevent na_action = 'pass' - self$check_na_action(new = FALSE, na_action = na_action) - self$check_var_missing(new = TRUE, data = pd_data, na_action) - self$check_units(data = pd_data) - self$data <- pd_data - } - - self$pred_horizon <- pred_horizon - self$na_action <- na_action - - # make a visible binding for CRAN - id_variable = NULL - - private$init_data_rows_complete() - private$prep_x() - # y and w do not need to be prepped for prediction, - # but they need to match orsf_cpp()'s expectations - private$prep_y(placeholder = TRUE) - private$w <- rep(1, nrow(private$x)) - - - if(oobag){ private$sort_inputs(sort_y = FALSE) } - - # the values in pred_spec need to be centered & scaled to match x, - # which is also centered and scaled - means <- private$data_means - stdev <- private$data_stdev - - for(i in intersect(names(means), names(pred_spec))){ - pred_spec[[i]] <- (pred_spec[[i]] - means[i]) / stdev[i] - } - - fi <- private$data_fctrs - - if(expand_grid){ - - if(!is.data.frame(pred_spec)) - pred_spec <- expand.grid(pred_spec, stringsAsFactors = TRUE) - - for(i in seq_along(fi$cols)){ - - ii <- fi$cols[i] - - if(is.character(pred_spec[[ii]]) && !fi$ordr[i]){ - - pred_spec[[ii]] <- factor(pred_spec[[ii]], levels = fi$lvls[[ii]]) - - } - - } - - check_new_data_fctrs(new_data = pred_spec, - names_x = private$data_names$x_original, - fi_ref = fi, - label_new = "pred_spec") - - pred_spec_new <- ref_code(x_data = pred_spec, fi = fi, - names_x_data = names(pred_spec)) - - x_cols <- list(match(names(pred_spec_new), colnames(private$x))) - - pred_spec_new <- list(as.matrix(pred_spec_new)) - - pd_bind <- list(pred_spec) - - } else { - - pred_spec_new <- pd_bind <- x_cols <- list() - - for(i in seq_along(pred_spec)){ - - pred_spec_new[[i]] <- as.data.frame(pred_spec[i]) - pd_name <- names(pred_spec)[i] - - pd_bind[[i]] <- data.frame( - variable = pd_name, - value = rep(NA_real_, length(pred_spec[[i]])), - level = rep(NA_character_, length(pred_spec[[i]])) - ) - - if(pd_name %in% fi$cols) { - - pd_bind[[i]]$level <- as.character(pred_spec[[i]]) - - pred_spec_new[[i]] <- ref_code(pred_spec_new[[i]], - fi = fi, - names_x_data = pd_name) - - } else { - - pd_bind[[i]]$value <- pred_spec[[i]] - - } - - x_cols[[i]] <- match(names(pred_spec_new[[i]]), colnames(private$x)) - pred_spec_new[[i]] <- as.matrix(pred_spec_new[[i]]) - - } - - } - - - - cpp_args <- private$prep_cpp_args(x = private$x, - y = private$y, - w = private$w, - importance_type = 'none', - pred_type = pred_type, - pred_mode = TRUE, - pred_aggregate = TRUE, - pred_horizon = pred_horizon_ordered, - oobag = oobag, - oobag_eval_type = 'none', - n_thread = n_thread, - write_forest = FALSE, - run_forest = TRUE, - verbosity = 0) - - - pd_vals <- list() - - for(i in seq_along(pred_spec_new)){ - - pd_vals_i <- list() - - x_pd <- private$x - - for(j in seq(nrow(pred_spec_new[[i]]))){ - - x_pd[, x_cols[[i]]] <- pred_spec_new[[i]][j, ] - - cpp_args$x <- x_pd - - pd_vals_i[[j]] <- do.call(orsf_cpp, cpp_args)$pred_new - - } - - if(type_output == 'smry'){ - pd_vals_i <- lapply( - pd_vals_i, - function(x) { - apply(x, 2, function(x_col){ - as.numeric( - c(mean(x_col, na.rm = TRUE), - quantile(x_col, probs = prob_values, na.rm = TRUE)) - ) - }) - } - ) - } - - - pd_vals[[i]] <- pd_vals_i - - } - - for(i in seq_along(pd_vals)){ - - pd_bind[[i]]$id_variable <- seq(nrow(pd_bind[[i]])) - - for(j in seq_along(pd_vals[[i]])){ - - - pd_vals[[i]][[j]] - - if(self$tree_type == 'survival'){ - - pd_vals[[i]][[j]] <- matrix(pd_vals[[i]][[j]], - nrow=length(pred_horizon), - byrow = T) - - rownames(pd_vals[[i]][[j]]) <- pred_horizon - - } else { - - pd_vals[[i]][[j]] <- t(pd_vals[[i]][[j]]) - - if(self$tree_type == 'classification'){ - rownames(pd_vals[[i]][[j]]) <- self$class_levels - } - - } - - if(type_output=='smry') - colnames(pd_vals[[i]][[j]]) <- c('mean', prob_labels) - else - colnames(pd_vals[[i]][[j]]) <- c(paste(1:nrow(private$x))) - - # this will be null for non-survival objects - ph <- rownames(pd_vals[[i]][[j]]) - - pd_vals[[i]][[j]] <- as.data.frame(pd_vals[[i]][[j]]) - - rownames(pd_vals[[i]][[j]]) <- NULL - - pd_vals[[i]][[j]][['pred_horizon']] <- ph - - if(type_output == 'ice'){ - pd_vals[[i]][[j]] <- melt_aorsf( - data = pd_vals[[i]][[j]], - id.vars = 'pred_horizon', - variable.name = 'id_row', - value.name = 'pred', - measure.vars = setdiff(names(pd_vals[[i]][[j]]), 'pred_horizon')) + if(inherits(pred_spec, 'pspec_auto')){ - } + self$check_var_names(.names = pred_spec, + data = private$data_names$x_original, + location = "pred_spec") + pred_spec <- list_init(pred_spec) + for(i in names(pred_spec)){ + pred_spec[[i]] <- self$get_var_bounds(i) } - pd_vals[[i]] <- rbindlist(pd_vals[[i]], idcol = 'id_variable') - - # this seems awkward but the reason I convert back to data.frame - # here is to avoid a potential memory leak from forder & bmerge. - # I have no idea why this memory leak may be occurring but it does - # not if I apply merge.data.frame instead of merge.data.table - pd_vals[[i]] <- merge(as.data.frame(pd_vals[[i]]), - as.data.frame(pd_bind[[i]]), - by = 'id_variable') - - } - - out <- rbindlist(pd_vals) - - # # missings may occur when oobag=TRUE and n_tree is small - # if(type_output == 'ice') { - # out <- collapse::na_omit(out, cols = 'pred') - # } - - ids <- c('id_variable') - - if(type_output == 'ice') ids <- c(ids, 'id_row') - - mid <- setdiff(names(out), c(ids, 'mean', prob_labels, 'pred')) - - end <- setdiff(names(out), c(ids, mid)) - - setcolorder(out, neworder = c(ids, mid, end)) - - if(self$tree_type == 'classification'){ - setnames(out, old = 'pred_horizon', new = 'class') - out[, class := factor(class, levels = self$class_levels)] - setkey(out, class) - } - - if(self$tree_type == 'survival' && pred_type != 'mort') - out[, pred_horizon := as.numeric(pred_horizon)] - - if(pred_type == 'mort'){ - out[, pred_horizon := NULL] } - # not needed for summary - if(type_output == 'smry') - out[, id_variable := NULL] - - # put data back into original scale - for(j in intersect(names(means), names(pred_spec))){ - - if(j %in% names(out)){ - - var_index <- collapse::seq_row(out) - var_value <- (out[[j]] * stdev[j]) + means[j] - var_name <- j - - } else { - - var_index <- out$variable %==% j - var_value <- (out$value[var_index] * stdev[j]) + means[j] - var_name <- 'value' + self$check_boundary_checks(boundary_checks) + self$check_pred_spec(pred_spec, boundary_checks) + self$check_n_thread(n_thread) + self$check_expand_grid(expand_grid) + self$check_oobag_pred_mode(oobag, label = 'oobag') - } + prob_values <- prob_values %||% c(0.025, 0.50, 0.975) + prob_labels <- prob_labels %||% c('lwr', 'medn', 'upr') - set(out, i = var_index, j = var_name, value = var_value) + self$check_prob_values(prob_values) + self$check_prob_labels(prob_labels) - } - - # silent print after modify in place - out[] - - private$restore_state(public_state, private_state) - - # free up space - private$x <- NULL - private$y <- NULL - private$w <- NULL - - out - - - }, - - compute_dependence_cpp = function(pd_data, - pred_spec, - pred_horizon, - pred_type, - na_action, - expand_grid, - prob_values, - prob_labels, - boundary_checks, - n_thread, - oobag, - type_output){ - - public_state <- list(data = self$data, - na_action = self$na_action, - pred_horizon = self$pred_horizon) - - private_state <- list(data_rows_complete = private$data_rows_complete) - - self$check_boundary_checks(boundary_checks) - self$check_pred_spec(pred_spec, boundary_checks) - self$check_n_thread(n_thread) - self$check_expand_grid(expand_grid) - self$check_oobag_pred_mode(oobag, label = 'oobag') - - prob_values <- prob_values %||% c(0.025, 0.50, 0.975) - prob_labels <- prob_labels %||% c('lwr', 'medn', 'upr') - - self$check_prob_values(prob_values) - self$check_prob_labels(prob_labels) - - if(length(prob_values) != length(prob_labels)){ - stop("prob_values and prob_labels must have the same length.", - call. = FALSE) - } - - # oobag=FALSE to match the format of arg in orsf_pd(). - self$check_pred_type(pred_type, oobag = FALSE) - - pred_type <- pred_type %||% self$pred_type - - self$check_pred_horizon(pred_horizon, boundary_checks, pred_type) - - pred_horizon <- pred_horizon %||% self$pred_horizon %||% 1 - - pred_horizon_order <- order(pred_horizon) - pred_horizon_ordered <- pred_horizon[pred_horizon_order] - - # run checks before you assign new values to object. - # otherwise, if a check throws an error, the object will - # not be restored to its normal state. - - - if(!oobag){ - self$check_data(new = TRUE, data = pd_data) - # say new = FALSE to prevent na_action = 'pass' - self$check_na_action(new = FALSE, na_action = na_action) - self$check_var_missing(new = TRUE, data = pd_data, na_action) - self$check_units(data = pd_data) - self$data <- pd_data - } - - self$pred_horizon <- pred_horizon - self$na_action <- na_action - - # make a visible binding for CRAN - id_variable = NULL - - private$init_data_rows_complete() - private$prep_x() - # y and w do not need to be prepped for prediction, - # but they need to match orsf_cpp()'s expectations - private$prep_y(placeholder = TRUE) - private$w <- rep(1, nrow(private$x)) - - if(oobag){ private$sort_inputs(sort_y = FALSE) } - - # the values in pred_spec need to be centered & scaled to match x, - # which is also centered and scaled - means <- private$data_means - stdev <- private$data_stdev - - for(i in intersect(names(means), names(pred_spec))){ - pred_spec[[i]] <- (pred_spec[[i]] - means[i]) / stdev[i] - } - - fi <- private$data_fctrs - - if(expand_grid){ - - if(!is.data.frame(pred_spec)) - pred_spec <- expand.grid(pred_spec, stringsAsFactors = TRUE) - - for(i in seq_along(fi$cols)){ - - ii <- fi$cols[i] - - if(is.character(pred_spec[[ii]]) && !fi$ordr[i]){ - - pred_spec[[ii]] <- factor(pred_spec[[ii]], levels = fi$lvls[[ii]]) - - } - - } - - check_new_data_fctrs(new_data = pred_spec, - names_x = private$data_names$x_original, - fi_ref = fi, - label_new = "pred_spec") - - pred_spec_new <- ref_code(x_data = pred_spec, fi = fi, - names_x_data = names(pred_spec)) - - x_cols <- list(match(names(pred_spec_new), colnames(private$x))-1) - - pred_spec_new <- list(as.matrix(pred_spec_new)) - - pd_bind <- list(pred_spec) - - } else { - - pred_spec_new <- pd_bind <- x_cols <- list() - - for(i in seq_along(pred_spec)){ - - pred_spec_new[[i]] <- as.data.frame(pred_spec[i]) - pd_name <- names(pred_spec)[i] - - pd_bind[[i]] <- data.frame( - variable = pd_name, - value = rep(NA_real_, length(pred_spec[[i]])), - level = rep(NA_character_, length(pred_spec[[i]])) - ) - - if(pd_name %in% fi$cols) { - - pd_bind[[i]]$level <- as.character(pred_spec[[i]]) - - pred_spec_new[[i]] <- ref_code(pred_spec_new[[i]], - fi = fi, - names_x_data = pd_name) - - } else { - - pd_bind[[i]]$value <- pred_spec[[i]] - - } - - x_cols[[i]] <- match(names(pred_spec_new[[i]]), colnames(private$x)) - 1 - pred_spec_new[[i]] <- as.matrix(pred_spec_new[[i]]) - - } - - } - - cpp_args <- private$prep_cpp_args(x = private$x, - y = private$y, - w = private$w, - importance_type = 'none', - pred_type = pred_type, - pred_mode = FALSE, - pred_aggregate = TRUE, - pred_horizon = pred_horizon_ordered, - oobag = oobag, - oobag_eval_type = 'none', - n_thread = n_thread, - pd_type_R = switch(type_output, - "smry" = 1L, - "ice" = 2L), - pd_x_vals = pred_spec_new, - pd_x_cols = x_cols, - pd_probs = prob_values, - write_forest = FALSE, - run_forest = TRUE) - - pd_vals <- do.call(orsf_cpp, cpp_args)$pd_values - - row_delim <- switch(self$tree_type, - "survival" = pred_horizon_ordered, - "regression" = 1, - "classification" = self$class_levels) - - row_delim_label <- switch(self$tree_type, - "survival" = "pred_horizon", - "regression" = "pred_row", - "classification" = "class") - - for(i in seq_along(pd_vals)){ - - pd_bind[[i]]$id_variable <- seq(nrow(pd_bind[[i]])) - - for(j in seq_along(pd_vals[[i]])){ - - pd_vals[[i]][[j]] <- matrix(pd_vals[[i]][[j]], - nrow=length(row_delim), - byrow = T) - - rownames(pd_vals[[i]][[j]]) <- row_delim - - - if(type_output=='smry'){ - - pd_vals[[i]][[j]] <- t( - apply( - X = pd_vals[[i]][[j]], - MARGIN = 1, - function(x, p){ - c(collapse::fmean(x), stats::quantile(x, p, na.rm=TRUE)) - }, - prob_values - ) - ) - - colnames(pd_vals[[i]][[j]]) <- c('mean', prob_labels) - - } else { - - colnames(pd_vals[[i]][[j]]) <- c(paste(1:nrow(private$x))) - - } - - pd_vals[[i]][[j]] <- as.data.table(pd_vals[[i]][[j]], - keep.rownames = row_delim_label) - - if(type_output == 'ice'){ - - measure.vars <- setdiff(names(pd_vals[[i]][[j]]), row_delim_label) - - pd_vals[[i]][[j]] <- melt_aorsf(data = pd_vals[[i]][[j]], - id.vars = row_delim_label, - variable.name = 'id_row', - value.name = 'pred', - measure.vars = measure.vars) - - } - - } - - pd_vals[[i]] <- rbindlist(pd_vals[[i]], idcol = 'id_variable') - - # this seems awkward but the reason I convert back to data.frame - # here is to avoid a potential memory leak from forder & bmerge. - # I have no idea why this memory leak may be occurring but it does - # not if I apply merge.data.frame instead of merge.data.table - pd_vals[[i]] <- merge(as.data.frame(pd_vals[[i]]), - as.data.frame(pd_bind[[i]]), - by = 'id_variable') - - } - - out <- rbindlist(pd_vals) - - # missings may occur when oobag=TRUE and n_tree is small - if(type_output == 'ice') { - out <- collapse::na_omit(out, cols = 'pred') - } - - ids <- c('id_variable') - - if(type_output == 'ice') ids <- c(ids, 'id_row') - - mid <- setdiff(names(out), c(ids, 'mean', prob_labels, 'pred')) - - end <- setdiff(names(out), c(ids, mid)) - - setcolorder(out, neworder = c(ids, mid, end)) - - if(self$tree_type == 'classification'){ - out[, class := factor(class, levels = self$class_levels)] - setkey(out, class) - } - - if(self$tree_type == 'survival' && pred_type != 'mort') - out[, pred_horizon := as.numeric(pred_horizon)] - - if(self$tree_type == 'regression'){ - out[, pred_row := NULL] - } - - if(pred_type == 'mort') - out[, pred_horizon := NULL] - - # not needed for summary - if(type_output == 'smry') - out[, id_variable := NULL] - - # put data back into original scale - for(j in intersect(names(means), names(pred_spec))){ - - if(j %in% names(out)){ - - var_index <- collapse::seq_row(out) - var_value <- (out[[j]] * stdev[j]) + means[j] - var_name <- j - - } else { - - var_index <- out$variable %==% j - var_value <- (out$value[var_index] * stdev[j]) + means[j] - var_name <- 'value' + if(length(prob_values) != length(prob_labels)){ + stop("prob_values and prob_labels must have the same length.", + call. = FALSE) + } - } + # oobag=FALSE to match the format of arg in orsf_pd(). + self$check_pred_type(pred_type, oobag = FALSE, + context = 'partial dependence') + pred_type <- pred_type %||% self$pred_type - set(out, i = var_index, j = var_name, value = var_value) + self$check_pred_horizon(pred_horizon, boundary_checks, pred_type) + if(!oobag){ + self$check_data(new = TRUE, data = pd_data) + # say new = FALSE to prevent na_action = 'pass' + self$check_na_action(new = FALSE, na_action = na_action) + self$check_var_missing(new = TRUE, data = pd_data, na_action) + self$check_units(data = pd_data) + self$data <- pd_data } - # silent print after modify in place - out[] + self$na_action <- na_action + self$n_thread <- n_thread + + out <- try( + private$compute_dependence_internal(pred_spec = pred_spec, + pred_type = pred_type, + pred_horizon = pred_horizon, + type_output = type_output, + expand_grid = expand_grid, + prob_labels = prob_labels, + prob_values = prob_values, + oobag = oobag), + silent = FALSE + ) private$restore_state(public_state, private_state) @@ -1306,7 +732,8 @@ ObliqueForest <- R6::R6Class( self$check_pred_horizon(pred_horizon, boundary_checks = TRUE) } - self$check_pred_type(pred_type, oobag = FALSE) + self$check_pred_type(pred_type, oobag = FALSE, + context = 'partial dependence') self$check_importance_type(importance_type) names_x <- private$data_names$x_original @@ -1426,7 +853,7 @@ ObliqueForest <- R6::R6Class( get_var_bounds = function(.name){ if(.name %in% private$data_names$x_numeric) - return(private$data_bounds[, .name]) + return(as.numeric(private$data_bounds[, .name])) else return(private$data_fctrs$lvls[[.name]]) @@ -1542,15 +969,23 @@ ObliqueForest <- R6::R6Class( }, - check_var_names = function(.names, data = NULL){ + check_var_names = function(.names, + data = NULL, + location = "formula"){ data <- data %||% self$data - names_not_found <- setdiff(c(.names), names(data)) + if(is.character(data)){ + data_names <- data + } else { + data_names <- names(data) + } + + names_not_found <- setdiff(c(.names), data_names) if(!is_empty(names_not_found)){ msg <- paste0( - "variables in formula were not found in data: ", + "variables in ", location, " were not found in data: ", paste_collapse(names_not_found, last = ' and ') ) stop(msg, call. = FALSE) @@ -1565,7 +1000,9 @@ ObliqueForest <- R6::R6Class( check_new_data_names(new_data, ref_names = private$data_names$x_original, label_new = "new_data", - label_ref = 'training data') + label_ref = 'training data', + check_new_in_ref = check_new_in_ref, + check_ref_in_new = check_ref_in_new) }, @@ -2083,7 +1520,7 @@ ObliqueForest <- R6::R6Class( }, # must specify oobag when you call this to make sure it isn't forgotten - check_pred_type = function(pred_type = NULL, oobag){ + check_pred_type = function(pred_type = NULL, oobag, context = NULL){ input <- pred_type %||% self$pred_type @@ -2100,7 +1537,9 @@ ObliqueForest <- R6::R6Class( arg_name = arg_name, expected_length = 1) - self$check_pred_type_internal(oobag, pred_type) + self$check_pred_type_internal(oobag = oobag, + pred_type = pred_type, + context = context) } @@ -2812,181 +2251,443 @@ ObliqueForest <- R6::R6Class( }, init_numeric_names = function(){ - pattern <- "^integer$|^numeric$|^units$" + pattern <- "^integer$|^numeric$|^units$" + + index <- grep(pattern = pattern, x = private$data_types$x) + + private$data_names[["x_numeric"]] = private$data_names$x_original[index] + + }, + init_ref_code_names = function(){ + + xref <- xnames <- private$data_names$x_original + fi <- private$data_fctrs + + for (i in seq_along(fi$cols)){ + + if(fi$cols[i] %in% xnames){ + + if(!fi$ordr[i]){ + + xref <- insert_vals( + vec = xref, + where = xref %==% fi$cols[i], + what = fi$keys[[i]][-1] + ) + + } + } + + } + + private$data_names$x_ref_code = xref + + }, + init_oobag_pred_mode = function(){ + + # if pred_type is null when this is run, it means + # the user did not specify pred_type, which means the + # family-specific default will be used, which means + # pred_type will not be 'none', so it is safe to assume + # oobag_pred_mode is TRUE if pred_type is currently null + + if(is.null(self$pred_type)){ + self$oobag_pred_mode <- TRUE + } else { + self$oobag_pred_mode <- self$pred_type != "none" + } + + }, + init_mtry = function(){ + + n_col_x <- length(private$data_names$x_ref_code) + + self$mtry <- ceiling(sqrt(n_col_x)) + + }, + + + + + init_lincomb_df_target = function(mtry = NULL){ + + mtry <- mtry %||% self$mtry + + self$control$lincomb_df_target <- mtry + + }, + + init_weights = function(){ + + # set weights as 1 if user did not supply them. + # length of weights depends on how missing are handled. + self$weights <- rep(1, self$n_obs) + + }, + + + init_oobag_eval_function = function(){ + + self$oobag_eval_function <- function(y_mat, w_vec, s_vec){ + return(1) + } + + }, + + init_oobag_eval_every = function(n_tree = NULL){ + self$oobag_eval_every = n_tree %||% self$n_tree + }, + + init_lincomb_R_function = function(){ + + self$control$lincomb_R_function <- function(x) x + + }, + + # use a starter seed to create n_tree seeds + plant_tree_seeds = function(start_seed){ + + self$forest_seed <- start_seed + set.seed(start_seed) + self$tree_seeds <- sample(1e5, size = self$n_tree) + + }, + + # computers + + compute_means = function(){ + + numeric_data <- select_cols(self$data, private$data_names$x_numeric) + + if(self$na_action == 'omit'){ + numeric_data <- collapse::fsubset(numeric_data, private$data_rows_complete) + } + + private$data_means <- collapse::fmean(numeric_data, w = self$weights) + + }, + + + compute_modes = function(){ + + nominal_data <- select_cols(self$data, private$data_fctrs$cols) + + if(self$na_action == 'omit'){ + nominal_data <- collapse::fsubset(nominal_data, private$data_rows_complete) + } + + private$data_modes <- vapply(nominal_data, + collapse::fmode, + FUN.VALUE = integer(1), + w = self$weights) + + }, + + compute_stdev = function(){ + + numeric_data <- select_cols(self$data, private$data_names$x_numeric) + + if(self$na_action == 'omit'){ + numeric_data <- collapse::fsubset(numeric_data, private$data_rows_complete) + } + + private$data_stdev <- collapse::fsd(numeric_data, w = self$weights) + + }, + + compute_bounds = function(){ + + numeric_data <- select_cols(self$data, private$data_names$x_numeric) + + if(self$na_action == 'omit'){ + numeric_data <- collapse::fsubset(numeric_data, private$data_rows_complete) + } + + private$data_bounds <- matrix( + data = c( + collapse::fnth(numeric_data, 0.10, w = self$weights), + collapse::fnth(numeric_data, 0.25, w = self$weights), + collapse::fnth(numeric_data, 0.50, w = self$weights), + collapse::fnth(numeric_data, 0.75, w = self$weights), + collapse::fnth(numeric_data, 0.90, w = self$weights) + ), + nrow = 5, + byrow = TRUE, + dimnames = list(c('10%', '25%', '50%', '75%', '90%'), + names(numeric_data)) + ) + + }, + + compute_mean_leaves = function(){ + + leaf_counts <- vapply(X = self$forest$leaf_summary, + FUN = function(x) sum(x != 0), + FUN.VALUE = integer(1)) + + private$mean_leaves <- collapse::fmean(leaf_counts) + + }, + + compute_dependence_internal = function(pred_spec, + pred_type, + pred_horizon = NULL, + type_output, + prob_labels, + prob_values, + expand_grid, + oobag){ + + # make a visible binding for CRAN + id_variable = NULL + + pred_horizon <- pred_horizon %||% self$pred_horizon %||% 1 + pred_horizon_order <- order(pred_horizon) + pred_horizon_ordered <- pred_horizon[pred_horizon_order] + + + private$init_data_rows_complete() + private$prep_x() + # y and w do not need to be prepped for prediction, + # but they need to match orsf_cpp()'s expectations + private$prep_y(placeholder = TRUE) + private$w <- rep(1, nrow(private$x)) + + if(oobag){ private$sort_inputs(sort_y = FALSE) } + + # the values in pred_spec need to be centered & scaled to match x, + # which is also centered and scaled + means <- private$data_means + stdev <- private$data_stdev + + for(i in intersect(names(means), names(pred_spec))){ + pred_spec[[i]] <- (pred_spec[[i]] - means[i]) / stdev[i] + } + + fi <- private$data_fctrs + + if(expand_grid){ + + if(!is.data.frame(pred_spec)) + pred_spec <- expand.grid(pred_spec, stringsAsFactors = TRUE) + + for(i in seq_along(fi$cols)){ + + ii <- fi$cols[i] + + if(is.character(pred_spec[[ii]]) && !fi$ordr[i]){ + + pred_spec[[ii]] <- factor(pred_spec[[ii]], levels = fi$lvls[[ii]]) + + } + + } + + check_new_data_fctrs(new_data = pred_spec, + names_x = private$data_names$x_original, + fi_ref = fi, + label_new = "pred_spec") + + pred_spec_new <- ref_code(x_data = pred_spec, fi = fi, + names_x_data = names(pred_spec)) + + x_cols <- list(match(names(pred_spec_new), colnames(private$x))-1) - index <- grep(pattern = pattern, x = private$data_types$x) + pred_spec_new <- list(as.matrix(pred_spec_new)) - private$data_names[["x_numeric"]] = private$data_names$x_original[index] + pd_bind <- list(pred_spec) - }, - init_ref_code_names = function(){ + } else { - xref <- xnames <- private$data_names$x_original - fi <- private$data_fctrs + pred_spec_new <- pd_bind <- x_cols <- list() - for (i in seq_along(fi$cols)){ + for(i in seq_along(pred_spec)){ - if(fi$cols[i] %in% xnames){ + pred_spec_new[[i]] <- as.data.frame(pred_spec[i]) + pd_name <- names(pred_spec)[i] - if(!fi$ordr[i]){ + pd_bind[[i]] <- data.frame( + variable = pd_name, + value = rep(NA_real_, length(pred_spec[[i]])), + level = rep(NA_character_, length(pred_spec[[i]])) + ) - xref <- insert_vals( - vec = xref, - where = xref %==% fi$cols[i], - what = fi$keys[[i]][-1] - ) + if(pd_name %in% fi$cols) { - } - } + pd_bind[[i]]$level <- as.character(pred_spec[[i]]) - } + pred_spec_new[[i]] <- ref_code(pred_spec_new[[i]], + fi = fi, + names_x_data = pd_name) - private$data_names$x_ref_code = xref + } else { - }, - init_oobag_pred_mode = function(){ + pd_bind[[i]]$value <- pred_spec[[i]] - # if pred_type is null when this is run, it means - # the user did not specify pred_type, which means the - # family-specific default will be used, which means - # pred_type will not be 'none', so it is safe to assume - # oobag_pred_mode is TRUE if pred_type is currently null + } - if(is.null(self$pred_type)){ - self$oobag_pred_mode <- TRUE - } else { - self$oobag_pred_mode <- self$pred_type != "none" - } + x_cols[[i]] <- match(names(pred_spec_new[[i]]), colnames(private$x)) - 1 + pred_spec_new[[i]] <- as.matrix(pred_spec_new[[i]]) - }, - init_mtry = function(){ + } - n_col_x <- length(private$data_names$x_ref_code) + } - self$mtry <- ceiling(sqrt(n_col_x)) + cpp_args <- private$prep_cpp_args(x = private$x, + y = private$y, + w = private$w, + importance_type = 'none', + pred_type = pred_type, + pred_mode = FALSE, + pred_aggregate = TRUE, + pred_horizon = pred_horizon_ordered, + oobag = oobag, + oobag_eval_type = 'none', + pd_type_R = switch(type_output, + "smry" = 1L, + "ice" = 2L), + pd_x_vals = pred_spec_new, + pd_x_cols = x_cols, + pd_probs = prob_values, + write_forest = FALSE, + run_forest = TRUE) - }, + pd_vals <- do.call(orsf_cpp, cpp_args)$pd_values + row_delim <- switch(self$tree_type, + "survival" = pred_horizon_ordered, + "regression" = 1, + "classification" = self$class_levels) + row_delim_label <- switch(self$tree_type, + "survival" = "pred_horizon", + "regression" = "pred_row", + "classification" = "class") + for(i in seq_along(pd_vals)){ - init_lincomb_df_target = function(mtry = NULL){ + pd_bind[[i]]$id_variable <- seq(nrow(pd_bind[[i]])) - mtry <- mtry %||% self$mtry + for(j in seq_along(pd_vals[[i]])){ - self$control$lincomb_df_target <- mtry + pd_vals[[i]][[j]] <- matrix(pd_vals[[i]][[j]], + nrow=length(row_delim), + byrow = T) - }, + rownames(pd_vals[[i]][[j]]) <- row_delim - init_weights = function(){ - # set weights as 1 if user did not supply them. - # length of weights depends on how missing are handled. - self$weights <- rep(1, self$n_obs) + if(type_output=='smry'){ - }, + pd_vals[[i]][[j]] <- t( + apply( + X = pd_vals[[i]][[j]], + MARGIN = 1, + function(x, p){ + c(collapse::fmean(x), stats::quantile(x, p, na.rm=TRUE)) + }, + prob_values + ) + ) + colnames(pd_vals[[i]][[j]]) <- c('mean', prob_labels) - init_oobag_eval_function = function(){ + } else { - self$oobag_eval_function <- function(y_mat, w_vec, s_vec){ - return(1) - } + colnames(pd_vals[[i]][[j]]) <- c(paste(1:nrow(private$x))) - }, + } - init_oobag_eval_every = function(n_tree = NULL){ - self$oobag_eval_every = n_tree %||% self$n_tree - }, + pd_vals[[i]][[j]] <- as.data.table(pd_vals[[i]][[j]], + keep.rownames = row_delim_label) - init_lincomb_R_function = function(){ + if(type_output == 'ice'){ - self$control$lincomb_R_function <- function(x) x + measure.vars <- setdiff(names(pd_vals[[i]][[j]]), row_delim_label) - }, + pd_vals[[i]][[j]] <- melt_aorsf(data = pd_vals[[i]][[j]], + id.vars = row_delim_label, + variable.name = 'id_row', + value.name = 'pred', + measure.vars = measure.vars) - # use a starter seed to create n_tree seeds - plant_tree_seeds = function(start_seed){ + } - self$forest_seed <- start_seed - set.seed(start_seed) - self$tree_seeds <- sample(1e5, size = self$n_tree) + } - }, + pd_vals[[i]] <- rbindlist(pd_vals[[i]], idcol = 'id_variable') - # computers + # this seems awkward but the reason I convert back to data.frame + # here is to avoid a potential memory leak from forder & bmerge. + # I have no idea why this memory leak may be occurring but it does + # not if I apply merge.data.frame instead of merge.data.table + pd_vals[[i]] <- merge(as.data.frame(pd_vals[[i]]), + as.data.frame(pd_bind[[i]]), + by = 'id_variable') - compute_means = function(){ + } - numeric_data <- select_cols(self$data, private$data_names$x_numeric) + out <- rbindlist(pd_vals) - if(self$na_action == 'omit'){ - numeric_data <- collapse::fsubset(numeric_data, private$data_rows_complete) + # missings may occur when oobag=TRUE and n_tree is small + if(type_output == 'ice') { + out <- collapse::na_omit(out, cols = 'pred') } - private$data_means <- collapse::fmean(numeric_data, w = self$weights) + ids <- c('id_variable') - }, + if(type_output == 'ice') ids <- c(ids, 'id_row') + mid <- setdiff(names(out), c(ids, 'mean', prob_labels, 'pred')) - compute_modes = function(){ + end <- setdiff(names(out), c(ids, mid)) - nominal_data <- select_cols(self$data, private$data_fctrs$cols) + setcolorder(out, neworder = c(ids, mid, end)) - if(self$na_action == 'omit'){ - nominal_data <- collapse::fsubset(nominal_data, private$data_rows_complete) + if(self$tree_type == 'classification'){ + out[, class := factor(class, levels = self$class_levels)] + setkey(out, class) } - private$data_modes <- vapply(nominal_data, - collapse::fmode, - FUN.VALUE = integer(1), - w = self$weights) - - }, + if(self$tree_type == 'survival' && pred_type != 'mort') + out[, pred_horizon := as.numeric(pred_horizon)] - compute_stdev = function(){ + if(self$tree_type == 'regression'){ + out[, pred_row := NULL] + } - numeric_data <- select_cols(self$data, private$data_names$x_numeric) + if(pred_type == 'mort') + out[, pred_horizon := NULL] - if(self$na_action == 'omit'){ - numeric_data <- collapse::fsubset(numeric_data, private$data_rows_complete) - } + # not needed for summary + if(type_output == 'smry') + out[, id_variable := NULL] - private$data_stdev <- collapse::fsd(numeric_data, w = self$weights) + # put data back into original scale + for(j in intersect(names(means), names(pred_spec))){ - }, + if(j %in% names(out)){ - compute_bounds = function(){ + var_index <- collapse::seq_row(out) + var_value <- (out[[j]] * stdev[j]) + means[j] + var_name <- j - numeric_data <- select_cols(self$data, private$data_names$x_numeric) + } else { - if(self$na_action == 'omit'){ - numeric_data <- collapse::fsubset(numeric_data, private$data_rows_complete) - } + var_index <- out$variable %==% j + var_value <- (out$value[var_index] * stdev[j]) + means[j] + var_name <- 'value' - private$data_bounds <- matrix( - data = c( - collapse::fnth(numeric_data, 0.10, w = self$weights), - collapse::fnth(numeric_data, 0.25, w = self$weights), - collapse::fnth(numeric_data, 0.50, w = self$weights), - collapse::fnth(numeric_data, 0.75, w = self$weights), - collapse::fnth(numeric_data, 0.90, w = self$weights) - ), - nrow = 5, - byrow = TRUE, - dimnames = list(c('10%', '25%', '50%', '75%', '90%'), - names(numeric_data)) - ) + } - }, + set(out, i = var_index, j = var_name, value = var_value) - compute_mean_leaves = function(){ + } - leaf_counts <- vapply(X = self$forest$leaf_summary, - FUN = function(x) sum(x != 0), - FUN.VALUE = integer(1)) + # silent print after modify in place + out[] - private$mean_leaves <- collapse::fmean(leaf_counts) + out }, @@ -3297,16 +2998,29 @@ ObliqueForestSurvival <- R6::R6Class( valid_options = c("logrank", "cstat")) }, - check_pred_type_internal = function(oobag, pred_type = NULL){ + check_pred_type_internal = function(oobag, + pred_type = NULL, + context = NULL){ input <- pred_type %||% self$pred_type arg_name <- if(oobag) 'oobag_pred_type' else 'pred_type' + if(is.null(context)){ + valid_options <- c("none", "surv", "risk", "chf", "mort", "leaf") + } else { + valid_options <- switch( + context, + 'partial dependence' = c("surv", "risk", "chf", "mort"), + 'prediction' = c("surv", "risk", "chf", "mort", "leaf") + ) + context <- paste(context, 'with survival forests') + } + check_arg_is_valid(arg_value = input, arg_name = arg_name, - valid_options = c("none", "surv", "risk", - "chf", "mort", "leaf")) + valid_options = valid_options, + context = context) }, @@ -3845,15 +3559,29 @@ ObliqueForestClassification <- R6::R6Class( valid_options = c("gini", "cstat")) }, - check_pred_type_internal = function(oobag, pred_type = NULL){ + check_pred_type_internal = function(oobag, + pred_type = NULL, + context = NULL){ input <- pred_type %||% self$pred_type arg_name <- if(oobag) 'oobag_pred_type' else 'pred_type' + if(is.null(context)){ + valid_options <- c("none", "prob", "class", "leaf") + } else { + valid_options <- switch( + context, + 'partial dependence' = c("prob"), + 'prediction' = c("prob", "class", "leaf") + ) + context <- paste(context, 'with classification forests') + } + check_arg_is_valid(arg_value = input, arg_name = arg_name, - valid_options = c("none", "prob", "class", "leaf")) + valid_options = valid_options, + context = context) }, @@ -4089,15 +3817,29 @@ ObliqueForestRegression <- R6::R6Class( }, - check_pred_type_internal = function(oobag, pred_type = NULL){ + check_pred_type_internal = function(oobag, + pred_type = NULL, + context = NULL){ input <- pred_type %||% self$pred_type arg_name <- if(oobag) 'oobag_pred_type' else 'pred_type' + if(is.null(context)){ + valid_options <- c("none", "mean", "leaf") + } else { + valid_options <- switch( + context, + 'partial dependence' = c("mean"), + 'prediction' = c("mean", "leaf") + ) + context <- paste(context, 'with regression forests') + } + check_arg_is_valid(arg_value = input, arg_name = arg_name, - valid_options = c("none", "mean", "leaf")) + valid_options = valid_options, + context = context) }, diff --git a/R/orsf_control.R b/R/orsf_control.R index 8aa5cd4d..2bb901d8 100644 --- a/R/orsf_control.R +++ b/R/orsf_control.R @@ -51,7 +51,7 @@ orsf_control_fast <- function(method = 'efron', method <- tolower(method) - check_control_cph(method = method, do_scale = do_scale) + # check_control_cph(method = method, do_scale = do_scale) ties_method <- method @@ -126,9 +126,9 @@ orsf_control_cph <- function(method = 'efron', check_dots(list(...), orsf_control_cph) - check_control_cph(method = method, - eps = eps, - iter_max = iter_max) + # check_control_cph(method = method, + # eps = eps, + # iter_max = iter_max) ties_method <- method @@ -188,7 +188,7 @@ orsf_control_net <- function(alpha = 1/2, ) check_dots(list(...), orsf_control_net) - check_control_net(alpha, df_target) + # check_control_net(alpha, df_target) orsf_control(tree_type = 'unknown', method = 'net', diff --git a/R/orsf_pd.R b/R/orsf_pd.R index eab2eea1..1b0ec523 100644 --- a/R/orsf_pd.R +++ b/R/orsf_pd.R @@ -1,5 +1,40 @@ +#' Automatic variable values for dependence +#' +#' For partial dependence and individual conditional expectations, +#' this function allows a variable to be considered without having +#' to specify what values to set the variable at. The values used +#' are based on quantiles for continuous variables (10th, 25th, 50th, +#' 75th, and 90th) and unique categories for categorical variables. +#' +#' @param ... names of the variables to use. These can be in quotes +#' or not in quotes (see examples). +#' +#' @return a character vector with the names +#' +#' @details This function should only be used in the context of +#' `orsf_pd` or `orsf_ice` functions. +#' +#' +#' @export +#' +#' @examples +#' +#' fit <- orsf(penguins_orsf, species ~., n_tree = 5) +#' +#' orsf_pd_oob(fit, pred_spec_auto(flipper_length_mm)) +#' +pred_spec_auto <- function(...){ + + input_string <- gsub(".*\\((.*)\\).*", "\\1", match.call())[-1] + result <- trimws(unlist(strsplit(input_string, ","))) + class(result) <- c("pspec_auto", class(result)) + + result + +} + #' ORSF partial dependence #' @@ -10,7 +45,7 @@ #' @inheritParams predict.ObliqueForest #' #' -#' @param pred_spec (*named list* or _data.frame_). +#' @param pred_spec (*named list* or *data.frame*). #' #' - If `pred_spec` is a named list, #' Each item in the list should be a vector of values that will be used as @@ -85,7 +120,7 @@ orsf_pd_oob <- function(object, prob_values = c(0.025, 0.50, 0.975), prob_labels = c('lwr', 'medn', 'upr'), boundary_checks = TRUE, - n_thread = 1, + n_thread = 0, ...){ check_dots(list(...), orsf_pd_oob) @@ -115,7 +150,7 @@ orsf_pd_inb <- function(object, prob_values = c(0.025, 0.50, 0.975), prob_labels = c('lwr', 'medn', 'upr'), boundary_checks = TRUE, - n_thread = 1, + n_thread = 0, ...){ check_dots(list(...), orsf_pd_inb) @@ -152,7 +187,7 @@ orsf_pd_new <- function(object, prob_values = c(0.025, 0.50, 0.975), prob_labels = c('lwr', 'medn', 'upr'), boundary_checks = TRUE, - n_thread = 1, + n_thread = 0, ...){ check_dots(list(...), orsf_pd_new) @@ -197,7 +232,7 @@ orsf_ice_oob <- function(object, pred_type = NULL, expand_grid = TRUE, boundary_checks = TRUE, - n_thread = 1, + n_thread = 0, ...){ check_dots(list(...), orsf_ice_oob) @@ -223,7 +258,7 @@ orsf_ice_inb <- function(object, pred_type = NULL, expand_grid = TRUE, boundary_checks = TRUE, - n_thread = 1, + n_thread = 0, ...){ check_dots(list(...), orsf_ice_oob) @@ -256,7 +291,7 @@ orsf_ice_new <- function(object, na_action = 'fail', expand_grid = TRUE, boundary_checks = TRUE, - n_thread = 1, + n_thread = 0, ...){ check_dots(list(...), orsf_ice_new) @@ -313,18 +348,18 @@ orsf_pred_dependence <- function(object, "did you use attach_data = FALSE when ", "running orsf()?", call. = FALSE) - object$compute_dependence_cpp(pd_data = pd_data, - pred_spec = pred_spec, - pred_horizon = pred_horizon, - pred_type = pred_type, - na_action = na_action, - expand_grid = expand_grid, - prob_values = prob_values, - prob_labels = prob_labels, - boundary_checks = boundary_checks, - n_thread = n_thread, - oobag = oobag, - type_output = type_output) + object$compute_dependence(pd_data = pd_data, + pred_spec = pred_spec, + pred_horizon = pred_horizon, + pred_type = pred_type, + na_action = na_action, + expand_grid = expand_grid, + prob_values = prob_values, + prob_labels = prob_labels, + boundary_checks = boundary_checks, + n_thread = n_thread, + oobag = oobag, + type_output = type_output) } diff --git a/man/orsf.Rd b/man/orsf.Rd index cb837a7b..35114378 100644 --- a/man/orsf.Rd +++ b/man/orsf.Rd @@ -423,7 +423,7 @@ printing \code{fit} provides quick descriptive summaries: ## N trees: 500 ## N predictors total: 17 ## N predictors per node: 5 -## Average leaves per tree: 20.884 +## Average leaves per tree: 20.98 ## Min observations in leaf: 5 ## Min events in leaf: 1 ## OOB stat value: 0.84 @@ -599,14 +599,13 @@ The AUC values, from highest to lowest: \if{html}{\out{
}}\preformatted{sc$AUC$score[order(-AUC)] }\if{html}{\out{
}} -\if{html}{\out{
}}\preformatted{## model times AUC se lower upper -## -## 1: net 1788 0.9151649 0.02027237 0.8754318 0.9548980 -## 2: rlt 1788 0.9119200 0.02092197 0.8709137 0.9529263 -## 3: accel 1788 0.9095628 0.02145474 0.8675122 0.9516133 -## 4: cph 1788 0.9095628 0.02145474 0.8675122 0.9516133 -## 5: rando 1788 0.9062197 0.02150724 0.8640663 0.9483731 -## 6: pca 1788 0.8983266 0.02305627 0.8531372 0.9435161 +\if{html}{\out{
}}\preformatted{## model times AUC se lower upper +## 1: net 1788 0.9134593 0.02079935 0.8726933 0.9542253 +## 2: rlt 1788 0.9129537 0.01979016 0.8741657 0.9517417 +## 3: accel 1788 0.9112315 0.02098077 0.8701099 0.9523530 +## 4: cph 1788 0.9063871 0.02165434 0.8639453 0.9488288 +## 5: rando 1788 0.9023489 0.02218936 0.8588586 0.9458393 +## 6: pca 1788 0.8994220 0.02201713 0.8562692 0.9425748 }\if{html}{\out{
}} And the indices of prediction accuracy: @@ -615,13 +614,12 @@ And the indices of prediction accuracy: }\if{html}{\out{
}} \if{html}{\out{
}}\preformatted{## model times IPA -## -## 1: net 1788 0.4905777 -## 2: accel 1788 0.4806649 -## 3: cph 1788 0.4806649 -## 4: rlt 1788 0.4675228 -## 5: pca 1788 0.4369636 -## 6: rando 1788 0.4302814 +## 1: net 1788 0.4916038 +## 2: accel 1788 0.4879683 +## 3: cph 1788 0.4751883 +## 4: rlt 1788 0.4640282 +## 5: pca 1788 0.4370592 +## 6: rando 1788 0.4258344 ## 7: Null model 1788 0.0000000 }\if{html}{\out{
}} @@ -666,16 +664,16 @@ analyses ## # A tibble: 10 x 5 ## splits id recipe train test ## -## 1 Fold01 -## 2 Fold02 -## 3 Fold03 -## 4 Fold04 -## 5 Fold05 -## 6 Fold06 -## 7 Fold07 -## 8 Fold08 -## 9 Fold09 -## 10 Fold10 +## 1 Fold01 +## 2 Fold02 +## 3 Fold03 +## 4 Fold04 +## 5 Fold05 +## 6 Fold06 +## 7 Fold07 +## 8 Fold08 +## 9 Fold09 +## 10 Fold10 }\if{html}{\out{}} Define functions for a ‘workflow’ with \code{randomForestSRC}, \code{ranger}, and @@ -735,29 +733,30 @@ glimpse(results) }\if{html}{\out{}} \if{html}{\out{
}}\preformatted{## Rows: 276 -## Columns: 22 -## $ trt d_penicill_main, placebo, d_penicill_main, d_penicill_main~ -## $ age 54.74059, 53.05681, 53.93018, 49.56057, 59.95346, 64.18891~ -## $ sex f, f, f, f, f, m, f, f, f, f, f, f, f, f, f, f, m, f, f, m~ -## $ ascites 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~ -## $ hepato 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1~ -## $ spiders 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0~ -## $ edema 0.5, 0, 1, 0.5, 0, 0, 0, 0, 0, 0, 0.5, 0.5, 0, 0, 0, 0, 1,~ -## $ bili 1.8, 0.3, 11.4, 0.7, 5.1, 0.6, 0.7, 0.7, 5.7, 0.5, 20.0, 1~ -## $ chol 244, 280, 178, 235, 374, 252, 298, 370, 482, 316, 652, 394~ -## $ albumin 2.54, 4.00, 2.80, 3.56, 3.51, 3.83, 4.10, 3.78, 2.84, 3.65~ -## $ copper 64, 52, 588, 39, 140, 41, 40, 24, 161, 68, 159, 111, 38, 1~ -## $ alk.phos 6121.8, 4651.2, 961.0, 1881.0, 1919.0, 843.0, 661.0, 5833.~ -## $ ast 60.63, 28.38, 280.55, 93.00, 122.45, 65.10, 106.95, 73.53,~ -## $ trig 92, 189, 200, 123, 135, 83, 66, 86, 165, 71, 184, 243, 102~ -## $ platelet 183, 373, 283, 209, 322, 336, 324, 390, 518, 356, 227, 165~ -## $ protime 10.3, 11.0, 12.4, 11.0, 13.0, 11.4, 11.3, 10.6, 12.7, 9.8,~ -## $ stage 4, 3, 4, 3, 4, 4, 2, 2, 3, 3, 3, 4, 3, 1, 2, 4, 3, 4, 2, 2~ -## $ time 1925, 2466, 131, 4232, 1356, 3445, 4127, 4509, 2256, 2576,~ -## $ status 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0~ -## $ pred_aorsf 0.45745610, 0.05728935, 0.92993003, 0.12073227, 0.75726804~ -## $ pred_rfsrc 0.33022046, 0.05464229, 0.83958477, 0.08137431, 0.62883225~ -## $ pred_ranger 0.43532894, 0.03749792, 0.77546182, 0.06246389, 0.63844391~ +## Columns: 23 +## $ id 1, 8, 15, 17, 23, 35, 56, 68, 92, 94, 97, 109, 116, 130, 1~ +## $ trt d_penicill_main, placebo, d_penicill_main, placebo, placeb~ +## $ age 58.76523, 53.05681, 64.64613, 52.18344, 55.96715, 48.61875~ +## $ sex f, f, f, f, f, f, f, f, f, f, m, f, f, f, f, f, f, f, f, f~ +## $ ascites 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~ +## $ hepato 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0~ +## $ spiders 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0~ +## $ edema 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0.5, 0, 0.5, 0, 0, 0, 0.5, 0~ +## $ bili 14.5, 0.3, 0.8, 2.7, 17.4, 1.2, 1.1, 0.7, 1.4, 3.2, 2.0, 0~ +## $ chol 261, 280, 231, 274, 395, 314, 498, 174, 206, 201, 420, 120~ +## $ albumin 2.60, 4.00, 3.87, 3.15, 2.94, 3.20, 3.80, 4.09, 3.13, 3.11~ +## $ copper 156, 52, 173, 159, 558, 201, 88, 58, 36, 178, 62, 53, 74, ~ +## $ alk.phos 1718.0, 4651.2, 9009.8, 1533.0, 6064.8, 12258.8, 13862.4, ~ +## $ ast 137.95, 28.38, 127.71, 117.80, 227.04, 72.24, 95.46, 71.30~ +## $ trig 172, 189, 96, 128, 191, 151, 319, 46, 70, 69, 91, 52, 382,~ +## $ platelet 190, 373, 295, 224, 214, 431, 365, 203, 145, 188, 344, 271~ +## $ protime 12.2, 11.0, 11.0, 10.5, 11.7, 10.6, 10.6, 10.6, 12.2, 11.8~ +## $ stage 4, 3, 3, 4, 4, 3, 2, 3, 4, 4, 3, 3, 3, 3, 2, 2, 3, 2, 3, 2~ +## $ time 400, 2466, 3584, 769, 264, 2847, 1847, 4039, 388, 750, 611~ +## $ status 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0~ +## $ pred_aorsf 0.95858574, 0.05070837, 0.22924369, 0.47048899, 0.94442624~ +## $ pred_rfsrc 0.85474291, 0.05480510, 0.31751746, 0.45156052, 0.85784657~ +## $ pred_ranger 0.77897322, 0.04599438, 0.30267242, 0.43467381, 0.77914735~ }\if{html}{\out{
}} And finish by aggregating the predictions and computing performance in @@ -782,19 +781,17 @@ counts. ## ## Results by model: ## -## model times AUC lower upper -## -## 1: aorsf 1826 91.5 87.3 95.6 -## 2: rfsrc 1826 89.7 85.3 94.2 -## 3: ranger 1826 90.2 86.0 94.5 +## model times AUC lower upper +## 1: aorsf 1826 90.7 86.4 95.0 +## 2: rfsrc 1826 89.9 85.6 94.2 +## 3: ranger 1826 89.7 85.5 94.0 ## ## Results of model comparisons: ## -## times model reference delta.AUC lower upper p -## -## 1: 1826 rfsrc aorsf -1.7 -3.1 -0.4 0.01 -## 2: 1826 ranger aorsf -1.3 -2.6 0.1 0.07 -## 3: 1826 ranger rfsrc 0.5 -0.9 1.9 0.50 +## times model reference delta.AUC lower upper p +## 1: 1826 rfsrc aorsf -0.8 -2.2 0.5 0.2 +## 2: 1826 ranger aorsf -1.0 -2.5 0.5 0.2 +## 3: 1826 ranger rfsrc -0.1 -1.3 1.0 0.8 ## ## NOTE: Values are multiplied by 100 and given in \%. @@ -806,23 +803,21 @@ counts. ## ## Results by model: ## -## model times Brier lower upper IPA -## -## 1: Null model 1826.25 20.5 18.1 22.9 0.0 -## 2: aorsf 1826.25 10.2 8.1 12.4 50.0 -## 3: rfsrc 1826.25 11.6 9.4 13.8 43.4 -## 4: ranger 1826.25 11.6 9.4 13.7 43.5 +## model times Brier lower upper IPA +## 1: Null model 1826.25 20.5 18.1 22.9 0.0 +## 2: aorsf 1826.25 10.8 8.6 13.0 47.3 +## 3: rfsrc 1826.25 11.8 9.7 13.9 42.5 +## 4: ranger 1826.25 11.9 9.8 14.0 41.9 ## ## Results of model comparisons: ## -## times model reference delta.Brier lower upper p -## -## 1: 1826.25 aorsf Null model -10.2 -12.8 -7.7 2.306634e-15 -## 2: 1826.25 rfsrc Null model -8.9 -11.2 -6.6 4.786107e-14 -## 3: 1826.25 ranger Null model -8.9 -11.2 -6.6 5.814537e-14 -## 4: 1826.25 rfsrc aorsf 1.4 0.7 2.0 8.893051e-05 -## 5: 1826.25 ranger aorsf 1.3 0.7 2.0 5.262910e-05 -## 6: 1826.25 ranger rfsrc -0.0 -0.6 0.5 9.088458e-01 +## times model reference delta.Brier lower upper p +## 1: 1826.25 aorsf Null model -9.7 -12.3 -7.1 2.179169e-13 +## 2: 1826.25 rfsrc Null model -8.7 -11.0 -6.4 3.915661e-14 +## 3: 1826.25 ranger Null model -8.6 -10.9 -6.2 8.869452e-13 +## 4: 1826.25 rfsrc aorsf 1.0 0.3 1.7 5.383491e-03 +## 5: 1826.25 ranger aorsf 1.1 0.5 1.7 7.890574e-04 +## 6: 1826.25 ranger rfsrc 0.1 -0.4 0.6 6.310608e-01 ## ## NOTE: Values are multiplied by 100 and given in \%. diff --git a/man/orsf_control_custom.Rd b/man/orsf_control_custom.Rd index bb4a16f9..f79381b1 100644 --- a/man/orsf_control_custom.Rd +++ b/man/orsf_control_custom.Rd @@ -67,7 +67,7 @@ fit_rando ## N trees: 500 ## N predictors total: 17 ## N predictors per node: 5 -## Average leaves per tree: 19.882 +## Average leaves per tree: 19.758 ## Min observations in leaf: 5 ## Min events in leaf: 1 ## OOB stat value: 0.83 @@ -108,7 +108,12 @@ How well do our two customized ORSFs do? Let’s compute their indices of prediction accuracy based on out-of-bag predictions: \if{html}{\out{
}}\preformatted{library(riskRegression) -library(survival) +}\if{html}{\out{
}} + +\if{html}{\out{
}}\preformatted{## riskRegression version 2023.09.08 +}\if{html}{\out{
}} + +\if{html}{\out{
}}\preformatted{library(survival) risk_preds <- list(rando = 1 - fit_rando$pred_oobag, pca = 1 - fit_pca$pred_oobag) @@ -129,18 +134,16 @@ The PCA ORSF does quite well! (higher IPA is better) ## Results by model: ## ## model times Brier lower upper IPA -## -## 1: Null model 1788 20.479 18.089 22.869 0.000 -## 2: rando 1788 57.487 53.803 61.172 -180.713 -## 3: pca 1788 52.444 49.190 55.698 -156.087 +## 1: Null model 1788 20.479 18.090 22.868 0.000 +## 2: rando 1788 57.628 53.951 61.305 -181.400 +## 3: pca 1788 52.770 49.473 56.067 -157.679 ## ## Results of model comparisons: ## -## times model reference delta.Brier lower upper p -## -## 1: 1788 rando Null model 37.008 31.647 42.370 1.061084e-41 -## 2: 1788 pca Null model 31.965 26.774 37.157 1.560856e-33 -## 3: 1788 pca rando -5.043 -6.405 -3.681 4.008989e-13 +## times model reference delta.Brier lower upper p +## 1: 1788 rando Null model 37.149 31.810 42.488 2.428416e-42 +## 2: 1788 pca Null model 32.291 27.056 37.526 1.205627e-33 +## 3: 1788 pca rando -4.858 -6.248 -3.468 7.365419e-12 ## ## NOTE: Values are multiplied by 100 and given in \%. diff --git a/man/orsf_ice_oob.Rd b/man/orsf_ice_oob.Rd index 971394d3..50248e78 100644 --- a/man/orsf_ice_oob.Rd +++ b/man/orsf_ice_oob.Rd @@ -13,7 +13,7 @@ orsf_ice_oob( pred_type = NULL, expand_grid = TRUE, boundary_checks = TRUE, - n_thread = 1, + n_thread = 0, ... ) @@ -24,7 +24,7 @@ orsf_ice_inb( pred_type = NULL, expand_grid = TRUE, boundary_checks = TRUE, - n_thread = 1, + n_thread = 0, ... ) @@ -37,7 +37,7 @@ orsf_ice_new( na_action = "fail", expand_grid = TRUE, boundary_checks = TRUE, - n_thread = 1, + n_thread = 0, ... ) } @@ -150,19 +150,18 @@ ice_oob <- orsf_ice_oob(fit, pred_spec, boundary_checks = FALSE) ice_oob }\if{html}{\out{
}} -\if{html}{\out{
}}\preformatted{## id_variable id_row pred_horizon bili pred -## -## 1: 1 1 1788 1 0.9080183 -## 2: 1 2 1788 1 0.8503112 -## 3: 1 3 1788 1 0.5990256 -## 4: 1 4 1788 1 0.7468067 -## 5: 1 5 1788 1 0.5814322 -## --- -## 6896: 25 272 1788 10 0.7262513 -## 6897: 25 273 1788 10 0.5027661 -## 6898: 25 274 1788 10 0.3203826 -## 6899: 25 275 1788 10 0.5647718 -## 6900: 25 276 1788 10 0.3255800 +\if{html}{\out{
}}\preformatted{## id_variable id_row pred_horizon bili pred +## 1: 1 1 1788 1 0.9080183 +## 2: 1 2 1788 1 0.8503112 +## 3: 1 3 1788 1 0.5990256 +## 4: 1 4 1788 1 0.7468067 +## 5: 1 5 1788 1 0.5814322 +## --- +## 6896: 25 272 1788 10 0.7262513 +## 6897: 25 273 1788 10 0.5027661 +## 6898: 25 274 1788 10 0.3203826 +## 6899: 25 275 1788 10 0.5647718 +## 6900: 25 276 1788 10 0.3255800 }\if{html}{\out{
}} Much more detailed examples are given in the diff --git a/man/orsf_pd_oob.Rd b/man/orsf_pd_oob.Rd index 627bda92..ba882b65 100644 --- a/man/orsf_pd_oob.Rd +++ b/man/orsf_pd_oob.Rd @@ -15,7 +15,7 @@ orsf_pd_oob( prob_values = c(0.025, 0.5, 0.975), prob_labels = c("lwr", "medn", "upr"), boundary_checks = TRUE, - n_thread = 1, + n_thread = 0, ... ) @@ -28,7 +28,7 @@ orsf_pd_inb( prob_values = c(0.025, 0.5, 0.975), prob_labels = c("lwr", "medn", "upr"), boundary_checks = TRUE, - n_thread = 1, + n_thread = 0, ... ) @@ -43,7 +43,7 @@ orsf_pd_new( prob_values = c(0.025, 0.5, 0.975), prob_labels = c("lwr", "medn", "upr"), boundary_checks = TRUE, - n_thread = 1, + n_thread = 0, ... ) } @@ -159,13 +159,12 @@ You can compute partial dependence and ICE three ways with \code{aorsf}: pd_train }\if{html}{\out{
}} -\if{html}{\out{
}}\preformatted{## pred_horizon bili mean lwr medn upr -## -## 1: 1826.25 1 0.2034441 0.01958467 0.0978637 0.7771746 -## 2: 1826.25 2 0.2335524 0.03614317 0.1322952 0.7986061 -## 3: 1826.25 3 0.2679313 0.05276137 0.1745943 0.8177905 -## 4: 1826.25 4 0.3158140 0.07801520 0.2314825 0.8317336 -## 5: 1826.25 5 0.3578420 0.12202134 0.2752057 0.8408508 +\if{html}{\out{
}}\preformatted{## pred_horizon bili mean lwr medn upr +## 1: 1826.25 1 0.2034441 0.01958467 0.0978637 0.7771746 +## 2: 1826.25 2 0.2335524 0.03614317 0.1322952 0.7986061 +## 3: 1826.25 3 0.2679313 0.05276137 0.1745943 0.8177905 +## 4: 1826.25 4 0.3158140 0.07801520 0.2314825 0.8317336 +## 5: 1826.25 5 0.3578420 0.12202134 0.2752057 0.8408508 }\if{html}{\out{
}} \item using out-of-bag predictions for the training data @@ -174,13 +173,12 @@ pd_train pd_train }\if{html}{\out{
}} -\if{html}{\out{
}}\preformatted{## pred_horizon bili mean lwr medn upr -## -## 1: 1826.25 1 0.2112509 0.01973124 0.1354011 0.7171789 -## 2: 1826.25 2 0.2413456 0.03451762 0.1715079 0.7509849 -## 3: 1826.25 3 0.2750807 0.05181733 0.2127899 0.7609403 -## 4: 1826.25 4 0.3223209 0.08084008 0.2678339 0.7719659 -## 5: 1826.25 5 0.3652664 0.12709393 0.3136925 0.7760334 +\if{html}{\out{
}}\preformatted{## pred_horizon bili mean lwr medn upr +## 1: 1826.25 1 0.2112509 0.01973124 0.1354011 0.7171789 +## 2: 1826.25 2 0.2413456 0.03451762 0.1715079 0.7509849 +## 3: 1826.25 3 0.2750807 0.05181733 0.2127899 0.7609403 +## 4: 1826.25 4 0.3223209 0.08084008 0.2678339 0.7719659 +## 5: 1826.25 5 0.3652664 0.12709393 0.3136925 0.7760334 }\if{html}{\out{
}} \item using predictions for a new set of data @@ -191,13 +189,12 @@ pd_train pd_test }\if{html}{\out{
}} -\if{html}{\out{
}}\preformatted{## pred_horizon bili mean lwr medn upr -## -## 1: 1826.25 1 0.2417344 0.02209149 0.1895773 0.7819458 -## 2: 1826.25 2 0.2718357 0.03915587 0.2252919 0.8042602 -## 3: 1826.25 3 0.3066068 0.05829350 0.2694077 0.8171481 -## 4: 1826.25 4 0.3558594 0.08951223 0.3205839 0.8238288 -## 5: 1826.25 5 0.3982408 0.13810590 0.3683546 0.8300253 +\if{html}{\out{
}}\preformatted{## pred_horizon bili mean lwr medn upr +## 1: 1826.25 1 0.2417344 0.02209149 0.1895773 0.7819458 +## 2: 1826.25 2 0.2718357 0.03915587 0.2252919 0.8042602 +## 3: 1826.25 3 0.3066068 0.05829350 0.2694077 0.8171481 +## 4: 1826.25 4 0.3558594 0.08951223 0.3205839 0.8238288 +## 5: 1826.25 5 0.3982408 0.13810590 0.3683546 0.8300253 }\if{html}{\out{
}} \item in-bag partial dependence indicates relationships that the model has learned during training. This is helpful if your goal is to interpret diff --git a/man/orsf_update.Rd b/man/orsf_update.Rd index e3643114..411a3473 100644 --- a/man/orsf_update.Rd +++ b/man/orsf_update.Rd @@ -7,27 +7,51 @@ orsf_update(object, ..., modify_in_place = FALSE, no_fit = NULL) } \arguments{ -\item{object}{(\emph{ObliqueForest}) an oblique random forest object (see \link{orsf})} +\item{object}{(\emph{ObliqueForest}) an oblique random forest object (see \link{orsf}).} -\item{...}{arguments to plug into \link{orsf} that will be used to define the update.} +\item{...}{arguments to plug into \link{orsf} that will be used to define the +update (see examples).} -\item{modify_in_place}{(\emph{logical}) if \code{TRUE}, \code{object} will be modified. -Be cautious, as modification in place will overwrite existing data. If -\code{FALSE} (the default), \code{object} will be copied and then the modifications -will be applied to the copy, leaving \code{object} unmodified.} +\item{modify_in_place}{(\emph{logical}) if \code{TRUE}, \code{object} will be modified +by the inputs specified in \code{...}. Be cautious, as modification in place +will overwrite existing data. If \code{FALSE} (the default), \code{object} will +be copied and then the modifications will be applied to the copy, +leaving the original \code{object} unmodified.} \item{no_fit}{(\emph{logical}) if \code{TRUE}, model fitting steps are defined and saved, but training is not initiated. The object returned can be directly submitted to \code{orsf_train()} so long as \code{attach_data} is \code{TRUE}.} } +\value{ +an \code{ObliqueForest} object. +} \description{ Update orsf objects } +\details{ +There are several dynamic inputs in \code{orsf} with default values of \code{NULL}. +Specifically, these inputs are \code{control}, \code{weights}, \code{mtry}, \code{split_rule}, +\code{split_min_stat}, \code{pred_type}, \code{pred_horizon}, \code{oobag_eval_function}, +\code{tree_seeds}, and \code{oobag_eval_every}. If no explicit value is given for +these inputs in the call, they \emph{will be re-formed}. For example, if +an initial forest includes 17 predictors, the default \code{mtry} is the +smallest integer that is greater than or equal to the square root of 17, +i.e., 5. Then, if you make an updated forest with 1 less predictor and +you do not explicitly say \code{mtry = 5}, then \code{mtry} will be re-initialized +in the update based on the available 16 predictors, and the resulting +value of \code{mtry} will be 4. This is done to avoid many potential errors +that would occur if the dynamic outputs were not re-initialized. +} \examples{ +# initial fit has mtry of 5 fit <- orsf(pbc_orsf, time + status ~ . -id) +# note that mtry is now 4 (see details) fit_new <- orsf_update(fit, formula = . ~ . - edema, n_tree = 100) +# prevent dynamic updates by specifying inputs you want to freeze. +fit_newer <- orsf_update(fit_new, mtry = 2) + } diff --git a/man/pred_spec_auto.Rd b/man/pred_spec_auto.Rd new file mode 100644 index 00000000..e56883c3 --- /dev/null +++ b/man/pred_spec_auto.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/orsf_pd.R +\name{pred_spec_auto} +\alias{pred_spec_auto} +\title{Automatic variable values for dependence} +\usage{ +pred_spec_auto(...) +} +\arguments{ +\item{...}{names of the variables to use. These can be in quotes +or not in quotes (see examples).} +} +\value{ +a character vector with the names +} +\description{ +For partial dependence and individual conditional expectations, +this function allows a variable to be considered without having +to specify what values to set the variable at. The values used +are based on quantiles for continuous variables (10th, 25th, 50th, +75th, and 90th) and unique categories for categorical variables. +} +\details{ +This function should only be used in the context of +\code{orsf_pd} or \code{orsf_ice} functions. +} +\examples{ + +fit <- orsf(penguins_orsf, species ~., n_tree = 5) + +orsf_pd_oob(fit, pred_spec_auto(flipper_length_mm)) + +} diff --git a/src/Forest.cpp b/src/Forest.cpp index f9b0db4e..1a1a1ba2 100644 --- a/src/Forest.cpp +++ b/src/Forest.cpp @@ -971,11 +971,7 @@ void Forest::resize_pd_mats(std::vector>& mat_list){ for(uword j = 0; j < n_items; ++j){ mat result_k_j; - // if(pd_type == PD_SUMMARY){ - // resize_pred_mat(result_k_j, 1); - // } else { resize_pred_mat(result_k_j, data->n_rows); - // } result_k.push_back(result_k_j); } diff --git a/tests/testthat/test-orsf_pd.R b/tests/testthat/test-orsf_pd.R index 2647f5f3..2735600d 100644 --- a/tests/testthat/test-orsf_pd.R +++ b/tests/testthat/test-orsf_pd.R @@ -1,4 +1,238 @@ +funs <- list( + pd_new = orsf_pd_new, + pd_inb = orsf_pd_inb, + pd_oob = orsf_pd_oob, + ice_new = orsf_ice_new, + ice_inb = orsf_ice_inb, + ice_oob = orsf_ice_oob +) + +# survival ---- + +args_loop <- args_grid <- list( + object = fit_standard_pbc$fast, + pred_spec = pred_spec_auto(bili, sex), + new_data = pbc_test, + pred_horizon = 1000, + pred_type = 'risk', + expand_grid = TRUE +) + +args_loop$expand_grid <- FALSE + +for(i in seq_along(funs)){ + + f_name <- names(funs)[i] + + formals <- intersect(setdiff(names(formals(funs[[i]])), '...'), + names(args_grid)) + + for(pred_type in setdiff(pred_types_surv, c('leaf'))){ + + args_grid$pred_type = pred_type + args_loop$pred_type = pred_type + + pd_object_grid <- do.call(funs[[i]], args = args_grid[formals]) + pd_object_loop <- do.call(funs[[i]], args = args_loop[formals]) + + test_that( + desc = paste('pred_spec data are returned on the original scale', + ' for orsf_', f_name, sep = ''), + code = { + + expect_equal(unique(pd_object_grid$bili), + args_grid$object$get_var_bounds('bili')) + + expect_equal(unique(na.omit(pd_object_loop$value)), + args_grid$object$get_var_bounds('bili')) + + } + ) + + test_that( + desc = paste(f_name, 'returns a data.table'), + code = { + expect_s3_class(pd_object_grid, 'data.table') + expect_s3_class(pd_object_loop, 'data.table') + } + ) + + test_that( + desc = 'output is named consistently', + code = { + + if(f_name %in% c("ice_new", "ice_inb", "ice_oob")){ + expect_true('id_variable' %in% names(pd_object_grid)) + expect_true('id_variable' %in% names(pd_object_loop)) + expect_true('id_row' %in% names(pd_object_grid)) + expect_true('id_row' %in% names(pd_object_loop)) + } + + expect_true('variable' %in% names(pd_object_loop)) + expect_true('value' %in% names(pd_object_loop)) + + vars <- as.character(args_loop$pred_spec) + expect_true(all(vars %in% names(pd_object_grid))) + expect_true(all(vars %in% unique(pd_object_loop$variable))) + + if(pred_type == 'mort'){ + expect_false('pred_horizon' %in% names(pd_object_grid)) + expect_false('pred_horizon' %in% names(pd_object_loop)) + } + + } + ) + + } + +} + +# classification ---- + +args_loop <- args_grid <- list( + object = fit_standard_penguin_species$fast, + pred_spec = pred_spec_auto(bill_length_mm, + bill_depth_mm), + new_data = penguins_test, + pred_type = 'prob', + expand_grid = TRUE +) + +args_loop$expand_grid <- FALSE + +for(i in seq_along(funs)){ + + f_name <- names(funs)[i] + + formals <- intersect(setdiff(names(formals(funs[[i]])), '...'), + names(args_grid)) + + pd_object_grid <- do.call(funs[[i]], args = args_grid[formals]) + pd_object_loop <- do.call(funs[[i]], args = args_loop[formals]) + + test_that( + desc = paste('pred_spec data are returned on the original scale', + ' for orsf_', f_name, sep = ''), + code = { + + expect_equal(unique(pd_object_grid$bill_length_mm), + args_grid$object$get_var_bounds('bill_length_mm')) + + expect_equal( + unique(na.omit( + pd_object_loop[pd_object_loop$variable == 'bill_depth_mm', value] + )), + args_grid$object$get_var_bounds('bill_depth_mm') + ) + + } + ) + + test_that( + desc = paste(f_name, 'returns a data.table'), + code = { + expect_s3_class(pd_object_grid, 'data.table') + expect_s3_class(pd_object_loop, 'data.table') + } + ) + + test_that( + desc = 'output is named consistently', + code = { + + if(f_name %in% c("ice_new", "ice_inb", "ice_oob")){ + expect_true('id_variable' %in% names(pd_object_grid)) + expect_true('id_variable' %in% names(pd_object_loop)) + expect_true('id_row' %in% names(pd_object_grid)) + expect_true('id_row' %in% names(pd_object_loop)) + } + + expect_true('variable' %in% names(pd_object_loop)) + expect_true('value' %in% names(pd_object_loop)) + + vars <- as.character(args_loop$pred_spec) + expect_true(all(vars %in% names(pd_object_grid))) + expect_true(all(vars %in% unique(pd_object_loop$variable))) + + } + ) + +} + + +# regression ---- + +args_loop <- args_grid <- list( + object = fit_standard_penguin_bills$fast, + pred_spec = pred_spec_auto(species, island), + new_data = penguins_test, + pred_type = 'mean', + expand_grid = TRUE +) + +args_loop$expand_grid <- FALSE + +for(i in seq_along(funs)){ + + f_name <- names(funs)[i] + + formals <- intersect(setdiff(names(formals(funs[[i]])), '...'), + names(args_grid)) + + pd_object_grid <- do.call(funs[[i]], args = args_grid[formals]) + pd_object_loop <- do.call(funs[[i]], args = args_loop[formals]) + + test_that( + desc = paste('pred_spec data are returned on the original scale', + ' for orsf_', f_name, sep = ''), + code = { + + expect_equal(levels(pd_object_grid$species), + levels(penguins_orsf$species)) + + loop_vals <- unique(pd_object_loop$level) + data_vals <- c(unique(penguins_orsf$species), + unique(penguins_orsf$island)) + + expect_true(all(loop_vals %in% data_vals)) + expect_true(all(data_vals %in% loop_vals)) + + } + ) + + test_that( + desc = paste(f_name, 'returns a data.table'), + code = { + expect_s3_class(pd_object_grid, 'data.table') + expect_s3_class(pd_object_loop, 'data.table') + } + ) + + test_that( + desc = 'output is named consistently', + code = { + + if(f_name %in% c("ice_new", "ice_inb", "ice_oob")){ + expect_true('id_variable' %in% names(pd_object_grid)) + expect_true('id_variable' %in% names(pd_object_loop)) + expect_true('id_row' %in% names(pd_object_grid)) + expect_true('id_row' %in% names(pd_object_loop)) + } + + expect_true('variable' %in% names(pd_object_loop)) + expect_true('value' %in% names(pd_object_loop)) + + vars <- as.character(args_loop$pred_spec) + expect_true(all(vars %in% names(pd_object_grid))) + expect_true(all(vars %in% unique(pd_object_loop$variable))) + + } + ) + +} + +# general ---- test_that( desc = "oob stops if there are no data", @@ -39,6 +273,13 @@ test_that( pred_horizon = 1000), regexp = 'nope and no_sir' ) + + expect_error( + orsf_ice_oob(fit, + pred_spec = pred_spec_auto(bili, nope, no_sir), + pred_horizon = 1000), + regexp = 'nope and no_sir' + ) } ) diff --git a/vignettes/pd.Rmd b/vignettes/pd.Rmd index f977339a..cf1f7909 100644 --- a/vignettes/pd.Rmd +++ b/vignettes/pd.Rmd @@ -89,10 +89,22 @@ You can compute PD three ways with `aorsf`: - in-bag PD indicates relationships that the model has learned during training. This is helpful if your goal is to interpret the model. -- out-of-bag PD indicates relationships that the model has learned during training but using the out-of-bag data simulates application of the model to new data. if you want to test your model's reliability or fairness in new data but you don't have access to a large testing set. +- out-of-bag PD indicates relationships that the model has learned during training but using the out-of-bag data simulates application of the model to new data. This is helpful if you want to test your model's reliability or fairness in new data but you don't have access to a large testing set. - new data PD shows how the model predicts outcomes for observations it has not seen. This is helpful if you want to test your model's reliability or fairness. +## Automatic variable values + +Use `pred_spec_auto()` if you know the variables you want to check out but you don't have a specific set of values in mind: + +```{r} + +orsf_pd_inb(fit, pred_spec_auto(bili)) + +``` + +`pred_spec_auto()` lets you specify a variable in your model with or without quotes, and then assign values for that variable on your behalf. For continuous variables, it uses quantiles (10, 25, 50, 75, and 90). For nominal variables, it uses unique categories. + Let's re-fit our ORSF model to all available data before proceeding to the next sections. ```{r} @@ -112,7 +124,7 @@ Computing PD for a single variable is straightforward: ```{r} -pd_sex <- orsf_pd_oob(fit, pred_spec = list(sex = c("m", "f"))) +pd_sex <- orsf_pd_oob(fit, pred_spec = pred_spec_auto(sex)) pd_sex @@ -126,7 +138,7 @@ What if the effect of a predictor varies over time? PD can show this. ```{r} -pd_sex_tv <- orsf_pd_oob(fit, pred_spec = list(sex = c("m", "f")), +pd_sex_tv <- orsf_pd_oob(fit, pred_spec = pred_spec_auto(sex), pred_horizon = seq(365, 365*5)) ggplot(pd_sex_tv, aes(x = pred_horizon, y = mean, color = sex)) + @@ -162,14 +174,14 @@ If you want to compute PD marginally for multiple variables, just list the varia pd_two_vars <- orsf_pd_oob(fit, - pred_spec = list(sex = c("m", "f"), bili = 1:5), + pred_spec = pred_spec_auto(sex, bili), expand_grid = FALSE) pd_two_vars ``` -Now would it be tedious if you wanted to do this for all the variables? You bet. That's why we made a function for that. As a bonus, the printed output is sorted from most to least important variables. +To get a view of partial dependence for any number of variables in the training data, use `orsf_summarize_uni()`. This function computes out-of-bag PD for the most important `n_variables` and returns a nicely formatted view of the output: ```{r} @@ -179,7 +191,7 @@ pd_smry ``` -It's easy enough to turn this 'summary' object into a `data.table` for downstream plotting and tables. +This 'summary' object can be converted into a `data.table` for downstream plotting and tables. ```{r} @@ -194,9 +206,7 @@ PD can show the expected value of a model's predictions as a function of a speci ```{r} -pred_spec = list(bili = seq(1, 5, length.out = 20), - edema = levels(pbc_orsf_train$edema), - trt = levels(pbc_orsf$trt)) +pred_spec = pred_spec_auto(bili, edema, trt) pd_bili_edema <- orsf_pd_oob(fit, pred_spec)