Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Partial dependence fixes to plug memory leaks #28

Merged
merged 4 commits into from
Oct 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions R/melt_aorsf.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@

# need to make this to avoid possible memory leak in data.table melt

melt_aorsf <-
function(data,
id.vars,
measure.vars,
variable.name = "variable",
value.name = "value") {
if (!is.data.frame(data)) {
stop("Input 'data' must be a data frame.")
}

if (!is.character(id.vars)) {
stop("Input 'id.vars' must be a character vector.")
}

if (!is.character(measure.vars)) {
stop("Input 'measure.vars' must be a character vector.")
}

# Select id variables and measure variables
id_data <- data[id.vars]
measure_data <- data[measure.vars]

# Create a sequence variable to represent the variable names
variable_data <- rep(names(measure_data), each = nrow(data))

# Reshape the data
long_data <- data.frame(id_data,
variable = variable_data,
value = unlist(measure_data, use.names = FALSE))

names(long_data)[names(long_data) == 'variable'] <- variable.name
names(long_data)[names(long_data) == 'value'] <- value.name

return(long_data)
}

150 changes: 142 additions & 8 deletions R/orsf_pd.R
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@ orsf_pred_dependence <- function(object,

}

x_cols[[i]] <- match(names(pred_spec_new[[i]]), colnames(x_new))-1
x_cols[[i]] <- match(names(pred_spec_new[[i]]), colnames(x_new)) - 1
pred_spec_new[[i]] <- as.matrix(pred_spec_new[[i]])

}
Expand All @@ -439,6 +439,99 @@ orsf_pred_dependence <- function(object,

control <- get_control(object)

pred_horizon_order <- order(pred_horizon)
pred_horizon_ordered <- pred_horizon[pred_horizon_order]

# results <- list()
#
# for(i in seq_along(pred_spec_new)){
#
# results_i <- list()
#
# x_pd <- x_new
#
# for(j in seq(nrow(pred_spec_new[[i]]))){
#
# x_pd[, x_cols[[i]]] <- pred_spec_new[[i]][j, ]
#
# results_i[[j]] <- orsf_cpp(
# x = x_pd,
# y = matrix(1, ncol=2),
# w = rep(1, nrow(x_new)),
# tree_type_R = get_tree_type(object),
# tree_seeds = get_tree_seeds(object),
# loaded_forest = object$forest,
# n_tree = get_n_tree(object),
# mtry = get_mtry(object),
# sample_with_replacement = get_sample_with_replacement(object),
# sample_fraction = get_sample_fraction(object),
# vi_type_R = 0,
# vi_max_pvalue = get_vi_max_pvalue(object),
# oobag_R_function = get_f_oobag_eval(object),
# leaf_min_events = get_leaf_min_events(object),
# leaf_min_obs = get_leaf_min_obs(object),
# split_rule_R = switch(get_split_rule(object),
# "logrank" = 1,
# "cstat" = 2),
# split_min_events = get_split_min_events(object),
# split_min_obs = get_split_min_obs(object),
# split_min_stat = get_split_min_stat(object),
# split_max_cuts = get_n_split(object),
# split_max_retry = get_n_retry(object),
# lincomb_R_function = control$lincomb_R_function,
# lincomb_type_R = switch(control$lincomb_type,
# 'glm' = 1,
# 'random' = 2,
# 'net' = 3,
# 'custom' = 4),
# lincomb_eps = control$lincomb_eps,
# lincomb_iter_max = control$lincomb_iter_max,
# lincomb_scale = control$lincomb_scale,
# lincomb_alpha = control$lincomb_alpha,
# lincomb_df_target = control$lincomb_df_target,
# lincomb_ties_method = switch(tolower(control$lincomb_ties_method),
# 'breslow' = 0,
# 'efron' = 1),
# pred_type_R = pred_type_R,
# pred_mode = TRUE,
# pred_aggregate = TRUE,
# pred_horizon = pred_horizon_ordered,
# oobag = oobag,
# oobag_eval_type_R = 0,
# oobag_eval_every = get_n_tree(object),
# pd_type_R = 0,
# pd_x_vals = list(matrix(0, ncol=1, nrow=1)),
# pd_x_cols = list(matrix(1L, ncol=1, nrow=1)),
# pd_probs = c(0),
# n_thread = n_thread,
# write_forest = FALSE,
# run_forest = TRUE,
# verbosity = 0)$pred_new
#
# }
#
# if(type_output == 'smry'){
# results_i <- lapply(
# results_i,
# function(x) {
# apply(x, 2, function(x_col){
# as.numeric(
# c(mean(x_col, na.rm = TRUE),
# quantile(x_col, probs = prob_values, na.rm = TRUE))
# )
# })
# }
# )
# }
#
#
# results[[i]] <- results_i
#
# }
#
# pd_vals <- results
# browser()

orsf_out <- orsf_cpp(x = x_new,
y = matrix(1, ncol=2),
w = rep(1, nrow(x_new)),
Expand Down Expand Up @@ -513,14 +606,25 @@ orsf_pred_dependence <- function(object,
else
colnames(pd_vals[[i]][[j]]) <- c(paste(1:nrow(x_new)))

pd_vals[[i]][[j]] <- as.data.table(pd_vals[[i]][[j]],
keep.rownames = 'pred_horizon')
ph <- rownames(pd_vals[[i]][[j]])

pd_vals[[i]][[j]] <- as.data.frame(pd_vals[[i]][[j]])

rownames(pd_vals[[i]][[j]]) <- NULL

pd_vals[[i]][[j]][['pred_horizon']] <- ph

if(type_output == 'ice'){

pd_vals[[i]][[j]] <- melt_aorsf(
data = pd_vals[[i]][[j]],
id.vars = 'pred_horizon',
variable.name = 'id_row',
value.name = 'pred',
measure.vars = setdiff(names(pd_vals[[i]][[j]]), 'pred_horizon'))

}

if(type_output == 'ice')
pd_vals[[i]][[j]] <- melt(data = pd_vals[[i]][[j]],
id.vars = 'pred_horizon',
variable.name = 'id_row',
value.name = 'pred')

}

Expand Down Expand Up @@ -592,3 +696,33 @@ orsf_pred_dependence <- function(object,
}


pd_list_split <- function(x_vals, x_cols){

x_vals_out <- x_cols_out <- vector(mode = 'list')
counter <- 1

for(i in seq_along(x_vals)){

x_vals_split <- split(x_vals[[i]], row(x_vals[[i]]))

for(j in seq_along(x_vals_split)){

x_vals_out[[counter]] <- matrix(x_vals_split[[j]],
ncol = ncol(x_vals[[i]]),
nrow = 1)
colnames(x_vals_out[[counter]]) <- colnames(x_vals[[i]])

x_cols_out[[counter]] <- x_cols[[i]]

counter <- counter + 1

}

}

list(
x_vals = x_vals_out,
x_cols = x_cols_out
)

}
2 changes: 2 additions & 0 deletions cran-comments.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
## Version 0.1.1

Update, October 25: Thank you for helping me with this. I have tidied up threads and avoided usage of the `data.table` functions that were creating possible memory leaks. I have checked this submission locally with valgrind and on rhub, with both indicating 0 memory leaks. However, if this submission does not pass on your end, I would like to request an extension on the October 28th deadline.

Update, October 21: I have updated the submission to fix memory leaks. Many of the leaks were caused by my omission of a virtual de-constructor for derived classes or by omission of a delete statement for dynamically allocated memory. I apologize for these oversights. After reviewing, you may still see a possible memory leak from `orsf_ice` functions. From what I can tell, this possible leak could either be measurement error or could be attributed to `data.table`. I do not think it's from `aorsf`.

Initial submission: This version is being submitted to CRAN early due to a memory error that was identified in version 1.0.0. I apologize for the oversight. As `aorsf` would be removed from CRAN if the issue is not fixed before October 28, I would like to request an expedited submission. I have run the current submission's tests and examples with valgrind to ensure the memory error has been fixed.
Expand Down
66 changes: 30 additions & 36 deletions tests/testthat/test-orsf_pd.R
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,6 @@ test_that(
)

funs <- list(
# ice_new = orsf_ice_new,
# ice_inb = orsf_ice_inb,
# ice_oob = orsf_ice_oob,
pd_new = orsf_pd_new,
pd_inb = orsf_pd_inb,
pd_oob = orsf_pd_oob
Expand Down Expand Up @@ -87,8 +84,7 @@ for(i in seq_along(funs)){

formals <- setdiff(names(formals(funs[[i]])), '...')

for(pred_type in c('mort')){
# for(pred_type in setdiff(pred_types_surv, c('leaf', 'mort'))){
for(pred_type in setdiff(pred_types_surv, c('leaf'))){

args_grid$pred_type = pred_type
args_loop$pred_type = pred_type
Expand Down Expand Up @@ -147,43 +143,40 @@ for(i in seq_along(funs)){
}


# pd_vals_ice <- orsf_ice_new(
# fit,
# new_data = pbc_orsf,
# pred_spec = list(bili = 1:4),
# pred_horizon = 1000
# )
#
pd_vals_smry <- orsf_pd_new(
pd_vals_ice <- orsf_ice_new(
fit,
new_data = pbc_orsf,
new_data = pbc_test,
pred_spec = list(bili = 1:4),
pred_horizon = 1000
)
#
# test_that(
# 'ice values summarized are the same as pd values',
# code = {
#
# pd_vals_check <- pd_vals_ice[, .(medn = median(pred)), by = id_variable]
#
# expect_equal(
# pd_vals_check$medn,
# pd_vals_smry$medn
# )
#
# }
# )


pd_vals_smry <- orsf_pd_new(
fit,
new_data = pbc_test,
pred_spec = list(bili = 1:4),
pred_horizon = 1000
)

test_that(
'No missing values in output',
'ice values summarized are the same as pd values',
code = {

# expect_false(any(is.na(pd_vals_ice)))
# expect_false(any(is.nan(as.matrix(pd_vals_ice))))
# expect_false(any(is.infinite(as.matrix(pd_vals_ice))))
grps <- split(pd_vals_ice, pd_vals_ice$id_variable)
pd_vals_check <- sapply(grps, function(x) median(x$pred))

expect_equal(
as.numeric(pd_vals_check),
pd_vals_smry$medn
)

}
)


test_that(
'No missing values in summary output',
code = {
expect_false(any(is.na(pd_vals_smry)))
expect_false(any(is.nan(as.matrix(pd_vals_smry))))
expect_false(any(is.infinite(as.matrix(pd_vals_smry))))
Expand All @@ -200,10 +193,9 @@ test_that(
pred_horizon = c(1000, 2000, 3000)
)

# risk must increase or remain steady over time
# risk monotonically increases
expect_lte(pd_smry_multi_horiz$mean[1], pd_smry_multi_horiz$mean[2])
expect_lte(pd_smry_multi_horiz$mean[2], pd_smry_multi_horiz$mean[3])

expect_lte(pd_smry_multi_horiz$medn[1], pd_smry_multi_horiz$medn[2])
expect_lte(pd_smry_multi_horiz$medn[2], pd_smry_multi_horiz$medn[3])

Expand All @@ -213,9 +205,11 @@ test_that(
pred_horizon = c(1000, 2000, 3000)
)

ice_check <- pd_ice_multi_horiz[, .(m = mean(pred, na.rm=TRUE)), by = pred_horizon]
grps <- split(pd_ice_multi_horiz, pd_ice_multi_horiz$pred_horizon)

ice_check <- sapply(grps, function(x) mean(x$pred, na.rm=TRUE))

expect_equal(ice_check$m, pd_smry_multi_horiz$mean)
expect_equal(as.numeric(ice_check), pd_smry_multi_horiz$mean)

}

Expand Down
Loading