From 6d349145a3663dfad3c48d82590171017063c481 Mon Sep 17 00:00:00 2001 From: byron jaeger Date: Mon, 16 Oct 2023 16:05:26 -0400 Subject: [PATCH 1/3] merging lincomb regression methods make these return column for coefs and pvalues since the pvalues are computed in different ways for different methods. --- R/RcppExports.R | 12 ++- src/Coxph.cpp | 8 +- src/RcppExports.cpp | 44 ++++++++- src/Tree.cpp | 4 +- src/orsf_oop.cpp | 34 ++++++- src/utility.cpp | 93 +++++++++++++++++++ src/utility.h | 14 +++ tests/testthat/test-cp_find_bounds.R | 0 .../{test-coxph.R => test-lincomb_coxph.R} | 31 +++---- tests/testthat/test-lincomb_logreg.R | 46 +++++++++ 10 files changed, 254 insertions(+), 32 deletions(-) delete mode 100644 tests/testthat/test-cp_find_bounds.R rename tests/testthat/{test-coxph.R => test-lincomb_coxph.R} (64%) create mode 100644 tests/testthat/test-lincomb_logreg.R diff --git a/R/RcppExports.R b/R/RcppExports.R index f99dbe0e..9484de66 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,8 +1,16 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -coxph_fit_exported <- function(x_node, y_node, w_node, method, cph_eps, cph_iter_max) { - .Call(`_aorsf_coxph_fit_exported`, x_node, y_node, w_node, method, cph_eps, cph_iter_max) +coxph_fit_exported <- function(x_node, y_node, w_node, method, epsilon, iter_max) { + .Call(`_aorsf_coxph_fit_exported`, x_node, y_node, w_node, method, epsilon, iter_max) +} + +linreg_fit_exported <- function(x_node, y_node, w_node, do_scale, epsilon, iter_max) { + .Call(`_aorsf_linreg_fit_exported`, x_node, y_node, w_node, do_scale, epsilon, iter_max) +} + +logreg_fit_exported <- function(x_node, y_node, w_node, do_scale, epsilon, iter_max) { + .Call(`_aorsf_logreg_fit_exported`, x_node, y_node, w_node, do_scale, epsilon, iter_max) } compute_cstat_exported_vec <- function(y, w, p, pred_is_risklike) { diff --git a/src/Coxph.cpp b/src/Coxph.cpp index 3058f3af..3ce2be37 100644 --- a/src/Coxph.cpp +++ b/src/Coxph.cpp @@ -650,6 +650,8 @@ // invert vmat cholesky_invert(vmat); + vec pvalues(beta_current.size()); + for (i=0; i < n_vars; i++) { beta_current[i] = beta_new[i]; @@ -662,6 +664,10 @@ vmat.at(i, i) = 1.0; } + pvalues[i] = R::pchisq( + pow(beta_current[i], 2) / vmat.at(i, i), 1, false, false + ); + if(do_scale){ // return beta and variance to original scales beta_current.at(i) *= scales[i]; @@ -673,7 +679,7 @@ } - return(join_horiz(beta_current, vmat.diag())); + return(join_horiz(beta_current, pvalues)); } diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index cf64be34..357e234f 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -12,8 +12,8 @@ Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif // coxph_fit_exported -List coxph_fit_exported(arma::mat& x_node, arma::mat& y_node, arma::vec& w_node, int method, double cph_eps, arma::uword cph_iter_max); -RcppExport SEXP _aorsf_coxph_fit_exported(SEXP x_nodeSEXP, SEXP y_nodeSEXP, SEXP w_nodeSEXP, SEXP methodSEXP, SEXP cph_epsSEXP, SEXP cph_iter_maxSEXP) { +List coxph_fit_exported(arma::mat& x_node, arma::mat& y_node, arma::vec& w_node, int method, double epsilon, arma::uword iter_max); +RcppExport SEXP _aorsf_coxph_fit_exported(SEXP x_nodeSEXP, SEXP y_nodeSEXP, SEXP w_nodeSEXP, SEXP methodSEXP, SEXP epsilonSEXP, SEXP iter_maxSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -21,9 +21,41 @@ BEGIN_RCPP Rcpp::traits::input_parameter< arma::mat& >::type y_node(y_nodeSEXP); Rcpp::traits::input_parameter< arma::vec& >::type w_node(w_nodeSEXP); Rcpp::traits::input_parameter< int >::type method(methodSEXP); - Rcpp::traits::input_parameter< double >::type cph_eps(cph_epsSEXP); - Rcpp::traits::input_parameter< arma::uword >::type cph_iter_max(cph_iter_maxSEXP); - rcpp_result_gen = Rcpp::wrap(coxph_fit_exported(x_node, y_node, w_node, method, cph_eps, cph_iter_max)); + Rcpp::traits::input_parameter< double >::type epsilon(epsilonSEXP); + Rcpp::traits::input_parameter< arma::uword >::type iter_max(iter_maxSEXP); + rcpp_result_gen = Rcpp::wrap(coxph_fit_exported(x_node, y_node, w_node, method, epsilon, iter_max)); + return rcpp_result_gen; +END_RCPP +} +// linreg_fit_exported +arma::mat linreg_fit_exported(arma::mat& x_node, arma::mat& y_node, arma::vec& w_node, bool do_scale, double epsilon, arma::uword iter_max); +RcppExport SEXP _aorsf_linreg_fit_exported(SEXP x_nodeSEXP, SEXP y_nodeSEXP, SEXP w_nodeSEXP, SEXP do_scaleSEXP, SEXP epsilonSEXP, SEXP iter_maxSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< arma::mat& >::type x_node(x_nodeSEXP); + Rcpp::traits::input_parameter< arma::mat& >::type y_node(y_nodeSEXP); + Rcpp::traits::input_parameter< arma::vec& >::type w_node(w_nodeSEXP); + Rcpp::traits::input_parameter< bool >::type do_scale(do_scaleSEXP); + Rcpp::traits::input_parameter< double >::type epsilon(epsilonSEXP); + Rcpp::traits::input_parameter< arma::uword >::type iter_max(iter_maxSEXP); + rcpp_result_gen = Rcpp::wrap(linreg_fit_exported(x_node, y_node, w_node, do_scale, epsilon, iter_max)); + return rcpp_result_gen; +END_RCPP +} +// logreg_fit_exported +arma::mat logreg_fit_exported(arma::mat& x_node, arma::mat& y_node, arma::vec& w_node, bool do_scale, double epsilon, arma::uword iter_max); +RcppExport SEXP _aorsf_logreg_fit_exported(SEXP x_nodeSEXP, SEXP y_nodeSEXP, SEXP w_nodeSEXP, SEXP do_scaleSEXP, SEXP epsilonSEXP, SEXP iter_maxSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< arma::mat& >::type x_node(x_nodeSEXP); + Rcpp::traits::input_parameter< arma::mat& >::type y_node(y_nodeSEXP); + Rcpp::traits::input_parameter< arma::vec& >::type w_node(w_nodeSEXP); + Rcpp::traits::input_parameter< bool >::type do_scale(do_scaleSEXP); + Rcpp::traits::input_parameter< double >::type epsilon(epsilonSEXP); + Rcpp::traits::input_parameter< arma::uword >::type iter_max(iter_maxSEXP); + rcpp_result_gen = Rcpp::wrap(logreg_fit_exported(x_node, y_node, w_node, do_scale, epsilon, iter_max)); return rcpp_result_gen; END_RCPP } @@ -207,6 +239,8 @@ END_RCPP static const R_CallMethodDef CallEntries[] = { {"_aorsf_coxph_fit_exported", (DL_FUNC) &_aorsf_coxph_fit_exported, 6}, + {"_aorsf_linreg_fit_exported", (DL_FUNC) &_aorsf_linreg_fit_exported, 6}, + {"_aorsf_logreg_fit_exported", (DL_FUNC) &_aorsf_logreg_fit_exported, 6}, {"_aorsf_compute_cstat_exported_vec", (DL_FUNC) &_aorsf_compute_cstat_exported_vec, 4}, {"_aorsf_compute_cstat_exported_uvec", (DL_FUNC) &_aorsf_compute_cstat_exported_uvec, 4}, {"_aorsf_compute_logrank_exported", (DL_FUNC) &_aorsf_compute_logrank_exported, 3}, diff --git a/src/Tree.cpp b/src/Tree.cpp index a111db3b..f5e6b642 100644 --- a/src/Tree.cpp +++ b/src/Tree.cpp @@ -992,7 +992,7 @@ // # nocov end } - vec beta_var = beta.unsafe_col(1); + vec pvalues = beta.unsafe_col(1); double pvalue; @@ -1002,7 +1002,7 @@ if(beta_est[i] != 0){ - pvalue = R::pchisq(pow(beta_est[i],2)/beta_var[i], 1, false, false); + pvalue = pvalues[i]; if(verbosity > 3){ // # nocov start diff --git a/src/orsf_oop.cpp b/src/orsf_oop.cpp index cf90e38a..33b7edff 100644 --- a/src/orsf_oop.cpp +++ b/src/orsf_oop.cpp @@ -32,8 +32,8 @@ arma::mat& y_node, arma::vec& w_node, int method, - double cph_eps, - arma::uword cph_iter_max){ + double epsilon, + arma::uword iter_max){ arma::uvec cols_node=regspace(0, x_node.n_cols-1); @@ -42,17 +42,41 @@ w_node, true, method, - cph_eps, - cph_iter_max); + epsilon, + iter_max); List result; result.push_back(out.col(0), "beta"); - result.push_back(out.col(1), "var"); + result.push_back(out.col(1), "pvalues"); return(result); } + // [[Rcpp::export]] + arma::mat linreg_fit_exported(arma::mat& x_node, + arma::mat& y_node, + arma::vec& w_node, + bool do_scale, + double epsilon, + arma::uword iter_max){ + return( + linreg_fit(x_node, y_node, w_node, do_scale, epsilon, iter_max) + ); + } + + // [[Rcpp::export]] + arma::mat logreg_fit_exported(arma::mat& x_node, + arma::mat& y_node, + arma::vec& w_node, + bool do_scale, + double epsilon, + arma::uword iter_max){ + return( + logreg_fit(x_node, y_node, w_node, do_scale, epsilon, iter_max) + ); + } + // [[Rcpp::export]] double compute_cstat_exported_vec( arma::mat& y, diff --git a/src/utility.cpp b/src/utility.cpp index 5db7d0f9..5aa8361d 100644 --- a/src/utility.cpp +++ b/src/utility.cpp @@ -353,4 +353,97 @@ } + arma::mat linreg_fit(arma::mat& x_node, + arma::mat& y_node, + arma::vec& w_node, + bool do_scale, + double epsilon, + arma::uword iter_max){ + + // Add an intercept column to the design matrix + vec intercept(x_node.n_rows, fill::ones); + mat X = join_horiz(intercept, x_node); + + // for later steps we don't care about the intercept term b/c we don't + // need it, but in this step including the intercept is important + // for computing p-values of other regression coefficients. + + uword resid_df = X.n_rows - X.n_cols; + + vec beta = solve(X.t() * diagmat(w_node) * X, X.t() * (w_node % y_node)); + + vec resid = y_node - X * beta; + + double s2 = as_scalar(trans(resid) * (w_node % resid) / (resid_df)); + + mat beta_cov = s2 * inv(X.t() * diagmat(w_node) * X); + + vec se = sqrt(diagvec(beta_cov)); + + vec tscores = beta / se; + + // Calculate two-tailed p-values + vec pvalues(X.n_cols); + + for (uword i = 0; i < X.n_cols; ++i) { + + double tstat = std::abs(tscores[i]); + + pvalues[i] = 2 * (1 - R::pt(tstat, resid_df, 1, 0)); + + } + + + mat result = join_horiz(beta, pvalues); + + return(result.rows(1, result.n_rows-1)); + + } + + arma::mat logreg_fit(arma::mat& x_node, + arma::mat& y_node, + arma::vec& w_node, + bool do_scale, + double epsilon, + arma::uword iter_max){ + + // Add an intercept column to the design matrix + vec intercept(x_node.n_rows, fill::ones); + mat X = join_horiz(intercept, x_node); + + // for later steps we don't care about the intercept term b/c we don't + // need it, but in this step including the intercept is important + // for computing p-values of other regression coefficients. + + vec beta(X.n_cols, fill::zeros); + + mat hessian; + + for (uword iter = 0; iter < iter_max; ++iter) { + + vec eta = X * beta; + vec pi = 1 / (1 + exp(-eta)); + vec w = w_node % pi % (1 - pi); + + vec gradient = X.t() * ((y_node - pi) % w_node); + hessian = -X.t() * diagmat(w) * X; + + beta -= solve(hessian, gradient); + + if (norm(gradient) < epsilon) { + break; + } + } + + // Compute standard errors, z-scores, and p-values + + vec se = sqrt(diagvec(inv(-hessian))); + vec zscores = beta / se; + vec pvalues = 2 * (1 - normcdf(abs(zscores))); + mat result = join_horiz(beta, pvalues); + + return(result.rows(1, result.n_rows-1)); + + } + } diff --git a/src/utility.h b/src/utility.h index dde257a8..270ed5a2 100644 --- a/src/utility.h +++ b/src/utility.h @@ -77,6 +77,20 @@ aorsf may be modified and distributed under the terms of the MIT license. arma::uvec& g, bool pred_is_risklike); + arma::mat linreg_fit(arma::mat& x_node, + arma::mat& y_node, + arma::vec& w_node, + bool do_scale, + double epsilon, + arma::uword iter_max); + + arma::mat logreg_fit(arma::mat& x_node, + arma::mat& y_node, + arma::vec& w_node, + bool do_scale, + double epsilon, + arma::uword iter_max); + } #endif /* UTILITY_H */ diff --git a/tests/testthat/test-cp_find_bounds.R b/tests/testthat/test-cp_find_bounds.R deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/testthat/test-coxph.R b/tests/testthat/test-lincomb_coxph.R similarity index 64% rename from tests/testthat/test-coxph.R rename to tests/testthat/test-lincomb_coxph.R index 8e8f4c04..d8dd7718 100644 --- a/tests/testthat/test-coxph.R +++ b/tests/testthat/test-lincomb_coxph.R @@ -3,8 +3,6 @@ run_cph_test <- function(x, y, w, method){ control <- coxph.control(iter.max = 20, eps = 1e-8) - start <- Sys.time() - tt = survival::coxph.fit(x = x, y = y, strata = NULL, @@ -17,30 +15,29 @@ run_cph_test <- function(x, y, w, method){ resid = FALSE, nocenter = c(0)) - stop <- Sys.time() + fit <- coxph(y ~ x, + weights = w, + control = control, + method = if(method == 0) 'breslow' else 'efron') - tt_time <- stop-start + fit_stats <- as.data.frame( + summary(fit)$coefficients[,c("coef", "Pr(>|z|)")] + ) - xx <- x[, , drop = FALSE] + fit_coefs <- fit_stats$coef + fit_pvalues <- fit_stats$`Pr(>|z|)` - start <- Sys.time() + xx <- x[, , drop = FALSE] bcj = coxph_fit_exported(xx, y, w, method = method, - cph_eps = control$eps, - cph_iter_max = control$iter.max) - - stop <- Sys.time() - - bcj_time <- stop-start - - expect_equal(as.numeric(tt$coefficients), bcj$beta, tolerance = control$eps) - - expect_equal(diag(tt$var), bcj$var, tolerance = control$eps) + epsilon = control$eps, + iter_max = control$iter.max) - # list(bcj_time = bcj_time, tt_time = tt_time) + expect_equal(as.numeric(fit_coefs), bcj$beta, tolerance = 1e-5) + expect_equal(as.numeric(fit_pvalues), bcj$pvalues, tolerance = 1e-5) } diff --git a/tests/testthat/test-lincomb_logreg.R b/tests/testthat/test-lincomb_logreg.R new file mode 100644 index 00000000..abd89c9a --- /dev/null +++ b/tests/testthat/test-lincomb_logreg.R @@ -0,0 +1,46 @@ + +set.seed(123) + +test_that( + desc = "logreg_fit with weights approximately equal to glm()", + code = { + + nrows <- 1000 + ncols <- 20 + + X <- matrix(data = rnorm(nrows*ncols), nrow = nrows, ncol = ncols) + + # X <- cbind(1, X) + + colnames(X) <- c( + # "intercept", + paste0("x", seq(ncols)) + ) + + Y <- matrix(rbinom(nrows, size = 1, prob = 0.7), ncol = 1) + + glm_data <- as.data.frame(cbind(y=as.numeric(Y), X)) + + # Fit logistic regression using the custom function + + W <- sample(1:3, nrow(X), replace=TRUE) + + cpp = logreg_fit_exported(X, Y, W, do_scale = T, epsilon = 1e-10, iter_max = 20) + + R = glm(y ~ ., + weights = as.integer(W), + data = glm_data, + family = 'binomial') + + R_summary <- summary(R) + + R_beta_est <- as.numeric(R_summary$coefficients[-1, 'Estimate']) + R_beta_pvalues <- as.numeric(R_summary$coefficients[-1, 'Pr(>|z|)']) + + expect_equal(cpp[,1], R_beta_est, tolerance = 1e-3) + expect_equal(cpp[,2], R_beta_pvalues, tolerance = 1e-3) + + + } +) + From 4e4c9f83b19b430fde318a7870512898bd5b493a Mon Sep 17 00:00:00 2001 From: byron jaeger Date: Tue, 17 Oct 2023 00:09:58 -0400 Subject: [PATCH 2/3] scaling funs added --- R/RcppExports.R | 4 +++ R/orsf.R | 37 -------------------- src/RcppExports.cpp | 13 +++++++ src/orsf_oop.cpp | 14 ++++++++ src/utility.cpp | 52 ++++++++++++++++++++++++++++ src/utility.h | 6 ++++ tests/testthat/test-lincomb_linreg.R | 43 +++++++++++++++++++++++ tests/testthat/test-lincomb_logreg.R | 7 +++- tests/testthat/test-scale_x.R | 22 ++++++++++++ 9 files changed, 160 insertions(+), 38 deletions(-) create mode 100644 tests/testthat/test-lincomb_linreg.R create mode 100644 tests/testthat/test-scale_x.R diff --git a/R/RcppExports.R b/R/RcppExports.R index 9484de66..64d0177f 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -45,6 +45,10 @@ x_submat_mult_beta_exported <- function(x, y, w, x_rows, x_cols, beta) { .Call(`_aorsf_x_submat_mult_beta_exported`, x, y, w, x_rows, x_cols, beta) } +scale_x_exported <- function(x, w) { + .Call(`_aorsf_scale_x_exported`, x, w) +} + cph_scale <- function(x, w) { .Call(`_aorsf_cph_scale`, x, w) } diff --git a/R/orsf.R b/R/orsf.R index c4d3a05a..1786ae6d 100644 --- a/R/orsf.R +++ b/R/orsf.R @@ -3,43 +3,6 @@ #' #' Fit an oblique random survival forest #' -#' @srrstats {G1.4} *documented with Roxygen* -#' @srrstats {G1.1} *aorsf is an improvement of the ORSF algorithm implemented in obliqueRSF, which was an extension of Hemant Ishwaran's random survival forest.* -#' @srrstats {G1.3} *linear combinations of inputs defined.* -#' @srrstats {G1.5} *orsf() will be used in publications to benchmark performance of the aorsf package in computation speed and prediction accuracy.* -#' @srrstats {G1.6} *orsf() should be used to compare performance claims with other packages.* -#' @srrstats {G2.1} *Inputs have indication of type in parentheticals. This format is used in all exported functions.* -#' @srrstats {G5.2a} *messages produced here (e.g., with `stop()`, `warning()`, `message()`) are unique and make effort to highlight the specific data elements that cause the error* -#' @srrstats {G2.0a} *secondary documentation of arg lengths. When an input has length 1, a parenthetical gives the specific type of value it should be and uses a singular description (e.g., an integer). When inputs have length > 1, a vector description is used (e.g., integer vector)* -#' @srrstats {ML1.0} *Documentation includes a subsection that makes clear conceptual distinction between train and test data* -#' @srrstats {ML3.3} *Properties and behaviours of aorsf models are explicitly compared with objects produced by other ML software in the "Introduction to aorsf" vignette.* -#' @srrstats {ML4.0} *orsf() is a unified single-function interface to model training. orsf_train() is able to receive as input an untrained model specified by orsf() when no_fit = TRUE. Models with categorically different specifications are able to be submitted to the same model training function.* -#' @srrstats {ML5.2, ML5.2a} *The structure and functionality of trained aorsf objects is documented through vignettes. In particular, basic functionality extending from the aorsf class is explicitly described in the "Introduction to aorsf" vignette, and additional functionality is documented in the "Out-of-bag predictions and evaluation" and "Compute partial dependence with ORSF" vignettes. Each vignettes demonstrates functionality clearly with example code.* -#' @srrstats {ML5.3} *Assessment of model performance is implemented through out-of-bag error, which is finalized after a model is trained* -#' @srrstats {ML5.4} *The "Out-of-bag predictions and evaluation" vignette shows how to implement built-in or user-specified functions for this functionality.* -#' @srrstats {ML1.1} *Training data are labelled as "train".* -#' @srrstats {G2.5} *factors used as predictors can be ordered and un-ordered.* -#' @srrstats {ML4.1b} *The value of out-of-bag error can be returned for every oobag_eval_every step.* -#' @srrstats {ML4.2} *The extraction of out-of-bag error is explicitly documented with example code in the "Out-of-bag predictions and evaluation" vignette.* -#' @srrstats {ML3.5b} *Users can specify the kind of loss function to assess distance between model estimates and desired output. This is discussed in detail in the "Out-of-bag predictions and evaluation" vignette.* -#' @srrstats {ML5.4a} *Harrell's C-statistic, an internally utilized metric for model performance, is clearly and distinctly documented and cited.* -#' @srrstats {ML5.4b} *It is possible to submit custom metrics to a model assessment function, and the ability to do so is clearly documented. The "Out-of-bag predictions and evaluation" vignette provides example code.* -#' @srrstats {ML2.0, ML2.0b} *orsf() enables pre-processing steps to be defined and parametrized without fitting a model when no_fit is TRUE, returning an object with a defined class minimally intended to implement a default `print` method which summarizes the model specifications.* -#' @srrstats {ML3.0} *Model specification can be implemented prior to actual model fitting or training* -#' @srrstats {ML3.0a} *As pre-processing, model specification, and training are controlled by the orsf() function, an input parameter (no_fit) enables models to be specified yet not fitted.* -#' @srrstats {ML3.0c} *when no_fit=TRUE, orsf() will return an object that can be directly trained using orsf_train().* -#' @srrstats {ML1.6a} *Explain why missing values are not admitted.* -#' @srrstats {G1.0} *Jaeger et al describes the ORSF algorithm that aorsf is based on. Note: aorsf uses a different approach to create linear combinations of inputs for speed reasons, but orsf_control_net() allows users to make ensembles that are very similar to obliqueRSF::ORSF().* -#' @srrstats {ML1.6b} *Explicit example showing how missing values may be imputed rather than discarded.* -#' @srrstats {ML6.0} *Reference section explicitly links to aorsf-bench, which includes training and testing stages, and which clearly indicates a need for distinct training and test data sets.* -#' @srrstats {ML6.1} *clearly document how aorsf can be embedded within a typical full ML workflow.* -#' @srrstats {ML6.1a} *Embed aorsf within a full workflow using tidymodels and tidyverse* -#' @srrstats {ML5.2b} *Documentation includes examples of how to save and re-load trained model objects for their re-use.* -#' @srrstats {ML2.3} *Values associated with transformations are recorded in the object returned by orsf()* -#' @srrstats {ML1.3} *Input data are partitioned as training (in-bag) and test (out-of-bag) data within orsf_fit().* -#' @srrstats {ML4.1} *orsf_fit() retains information on model-internal parameters.* -#' @srrstats {ML4.1a} *orsf_fit() output includes all model-internal parameters, specifically the linear combination coefficients.* -#' #' @param data a `r roxy_data_allowed()` that contains the #' relevant variables. #' diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 357e234f..c4cd334c 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -170,6 +170,18 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// scale_x_exported +List scale_x_exported(arma::mat& x, arma::vec& w); +RcppExport SEXP _aorsf_scale_x_exported(SEXP xSEXP, SEXP wSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< arma::mat& >::type x(xSEXP); + Rcpp::traits::input_parameter< arma::vec& >::type w(wSEXP); + rcpp_result_gen = Rcpp::wrap(scale_x_exported(x, w)); + return rcpp_result_gen; +END_RCPP +} // cph_scale List cph_scale(arma::mat& x, arma::vec& w); RcppExport SEXP _aorsf_cph_scale(SEXP xSEXP, SEXP wSEXP) { @@ -249,6 +261,7 @@ static const R_CallMethodDef CallEntries[] = { {"_aorsf_sprout_node_survival_exported", (DL_FUNC) &_aorsf_sprout_node_survival_exported, 2}, {"_aorsf_find_rows_inbag_exported", (DL_FUNC) &_aorsf_find_rows_inbag_exported, 2}, {"_aorsf_x_submat_mult_beta_exported", (DL_FUNC) &_aorsf_x_submat_mult_beta_exported, 6}, + {"_aorsf_scale_x_exported", (DL_FUNC) &_aorsf_scale_x_exported, 2}, {"_aorsf_cph_scale", (DL_FUNC) &_aorsf_cph_scale, 2}, {"_aorsf_orsf_cpp", (DL_FUNC) &_aorsf_orsf_cpp, 44}, {NULL, NULL, 0} diff --git a/src/orsf_oop.cpp b/src/orsf_oop.cpp index 33b7edff..4118a635 100644 --- a/src/orsf_oop.cpp +++ b/src/orsf_oop.cpp @@ -210,6 +210,20 @@ } + // [[Rcpp::export]] + List scale_x_exported(arma::mat& x, + arma::vec& w){ + + mat transforms = scale_x(x, w); + + List result; + result.push_back(x, "scaled_x"); + result.push_back(transforms, "transforms"); + + return(result); + + } + // [[Rcpp::export]] List cph_scale(arma::mat& x, arma::vec& w){ diff --git a/src/utility.cpp b/src/utility.cpp index 5aa8361d..4dfe72fa 100644 --- a/src/utility.cpp +++ b/src/utility.cpp @@ -446,4 +446,56 @@ } + arma::mat scale_x(arma::mat& x, + arma::vec& w){ + + uword n_vars = x.n_cols; + + mat x_transforms(n_vars, 2, fill::zeros); + + vec means = x_transforms.unsafe_col(0); // Reference to column 1 + vec scales = x_transforms.unsafe_col(1); // Reference to column 2 + + double w_sum = sum(w); + double m = w.size(); + + for(uword i = 0; i < n_vars; i++) { + + means.at(i) = sum(w % x.col(i) ) / w_sum; + + // subtract the mean now so you don't have to do the subtraction + // when computing standard deviation. + x.col(i) -= means.at(i); + + scales.at(i) = sqrt( + sum(w % pow(x.col(i), 2)) / ( (m-1) * w_sum / m ) + ); + + if(scales(i) <= 0) scales.at(i) = 1.0; // constant covariate; + + x.col(i) /= scales.at(i); + + } + + return(x_transforms); + + } + + void unscale_x(arma::mat& x, + arma::mat& x_transforms){ + + uword n_vars = x.n_cols; + + vec means = x_transforms.unsafe_col(0); // Reference to column 1 + vec scales = x_transforms.unsafe_col(1); // Reference to column 2 + + for(uword i = 0; i < n_vars; i++){ + + x.col(i) /= scales.at(i); + x.col(i) += means.at(i); + + } + + } + } diff --git a/src/utility.h b/src/utility.h index 270ed5a2..3ed08a1d 100644 --- a/src/utility.h +++ b/src/utility.h @@ -91,6 +91,12 @@ aorsf may be modified and distributed under the terms of the MIT license. double epsilon, arma::uword iter_max); + arma::mat scale_x(arma::mat& x, + arma::vec& w); + + void unscale_x(arma::mat& x, + arma::mat& x_transforms); + } #endif /* UTILITY_H */ diff --git a/tests/testthat/test-lincomb_linreg.R b/tests/testthat/test-lincomb_linreg.R new file mode 100644 index 00000000..3a25a971 --- /dev/null +++ b/tests/testthat/test-lincomb_linreg.R @@ -0,0 +1,43 @@ + +set.seed(123) + +test_that( + desc = "linreg_fit with weights approximately equal to glm()", + code = { + + nrows <- 1000 + ncols <- 20 + + X <- matrix(data = rnorm(nrows*ncols), nrow = nrows, ncol = ncols) + + # X <- cbind(1, X) + + colnames(X) <- c( + # "intercept", + paste0("x", seq(ncols)) + ) + + Y <- matrix(rnorm(nrows), ncol = 1) + + glm_data <- as.data.frame(cbind(y=as.numeric(Y), X)) + + # Fit logistic regression using the custom function + + W <- sample(1:3, nrow(X), replace=TRUE) + + cpp = linreg_fit_exported(X, Y, W, do_scale = TRUE, + epsilon = 1e-9, iter_max = 20) + + R = lm(y ~ ., weights = as.integer(W), data = glm_data) + + R_summary <- summary(R) + + R_beta_est <- as.numeric(R_summary$coefficients[-1, 'Estimate']) + R_beta_pvalues <- as.numeric(R_summary$coefficients[-1, 'Pr(>|t|)']) + + expect_equal(cpp[,1], R_beta_est, tolerance = 1e-9) + expect_equal(cpp[,2], R_beta_pvalues, tolerance = 1e-9) + + + } +) diff --git a/tests/testthat/test-lincomb_logreg.R b/tests/testthat/test-lincomb_logreg.R index abd89c9a..3b0d4f53 100644 --- a/tests/testthat/test-lincomb_logreg.R +++ b/tests/testthat/test-lincomb_logreg.R @@ -25,10 +25,15 @@ test_that( W <- sample(1:3, nrow(X), replace=TRUE) - cpp = logreg_fit_exported(X, Y, W, do_scale = T, epsilon = 1e-10, iter_max = 20) + control <- glm.control() + + cpp = logreg_fit_exported(X, Y, W, do_scale = T, + epsilon = control$epsilon, + iter_max = control$maxit) R = glm(y ~ ., weights = as.integer(W), + control = control, data = glm_data, family = 'binomial') diff --git a/tests/testthat/test-scale_x.R b/tests/testthat/test-scale_x.R new file mode 100644 index 00000000..b8cfbfa2 --- /dev/null +++ b/tests/testthat/test-scale_x.R @@ -0,0 +1,22 @@ +weighted_sd <- function(x, w){ + mu <- weighted.mean(x, w) + w_sum <- sum(w) + m <- length(w) + sqrt( sum(w * (x-mu)^2) / ( (m-1) * w_sum / m ) ) +} + +nrows <- 1000 +ncols <- 20 + +X <- matrix(data = rnorm(nrows*ncols), nrow = nrows, ncol = ncols) + +colnames(X) <- paste0("x", seq(ncols)) + +weights <- sample(1:3, nrow(X), replace = TRUE) +x_means <- apply(X, 2, weighted.mean, w = weights) +x_sds <- apply(X, 2, weighted_sd, w = weights) + +res <- scale_x_exported(X, w = weights) + +expect_equal(res$transforms[, 1], as.numeric(x_means), tolerance = 1e-9) +expect_equal(res$transforms[, 2], as.numeric(x_sds), tolerance = 1e-9) From 30048696a8c6b9e17f5a28fc8542382994b570f0 Mon Sep 17 00:00:00 2001 From: byron jaeger Date: Wed, 18 Oct 2023 21:23:31 -0400 Subject: [PATCH 3/3] drop rcpp namespace --- src/Data.h | 1 - src/Forest.cpp | 69 +++++++++++++++++++++++++------------------------- src/Forest.h | 6 ++--- 3 files changed, 37 insertions(+), 39 deletions(-) diff --git a/src/Data.h b/src/Data.h index e3804c60..1e17d260 100644 --- a/src/Data.h +++ b/src/Data.h @@ -11,7 +11,6 @@ #include "globals.h" using namespace arma; - using namespace Rcpp; namespace aorsf { diff --git a/src/Forest.cpp b/src/Forest.cpp index 9a3e08b8..24f58308 100644 --- a/src/Forest.cpp +++ b/src/Forest.cpp @@ -5,7 +5,6 @@ #include "Tree.h" using namespace arma; -using namespace Rcpp; namespace aorsf { @@ -36,7 +35,7 @@ void Forest::init(std::unique_ptr input_data, double lincomb_alpha, arma::uword lincomb_df_target, arma::uword lincomb_ties_method, - RObject lincomb_R_function, + Rcpp::RObject lincomb_R_function, // predictions PredType pred_type, bool pred_mode, @@ -101,12 +100,12 @@ void Forest::init(std::unique_ptr input_data, // # nocov start if(verbosity > 1){ - Rcout << "------------ input data dimensions ------------" << std::endl; - Rcout << "N observations total: " << data->get_n_rows() << std::endl; - Rcout << "N columns total: " << data->get_n_cols() << std::endl; - Rcout << "-----------------------------------------------"; - Rcout << std::endl; - Rcout << std::endl; + Rcpp::Rcout << "------------ input data dimensions ------------" << std::endl; + Rcpp::Rcout << "N observations total: " << data->get_n_rows() << std::endl; + Rcpp::Rcout << "N columns total: " << data->get_n_cols() << std::endl; + Rcpp::Rcout << "-----------------------------------------------"; + Rcpp::Rcout << std::endl; + Rcpp::Rcout << std::endl; } // # nocov end @@ -262,9 +261,9 @@ void Forest::grow_single_thread(vec* vi_numer_ptr, for (uint i = 0; i < n_tree; ++i) { if(verbosity > 1){ - Rcout << "------------ Growing tree " << i << " --------------"; - Rcout << std::endl; - Rcout << std::endl; + Rcpp::Rcout << "------------ Growing tree " << i << " --------------"; + Rcpp::Rcout << std::endl; + Rcpp::Rcout << std::endl; } trees[i]->grow(vi_numer_ptr, vi_denom_ptr); @@ -282,15 +281,15 @@ void Forest::grow_single_thread(vec* vi_numer_ptr, seconds time_from_start = duration_cast(steady_clock::now() - start_time); uint remaining_time = (1 / relative_progress - 1) * time_from_start.count(); - Rcout << "Growing trees: "; - Rcout << round(100 * relative_progress) << "%. "; + Rcpp::Rcout << "Growing trees: "; + Rcpp::Rcout << round(100 * relative_progress) << "%. "; if(progress < max_progress){ - Rcout << "~ time remaining: "; - Rcout << beautifyTime(remaining_time) << "."; + Rcpp::Rcout << "~ time remaining: "; + Rcpp::Rcout << beautifyTime(remaining_time) << "."; } - Rcout << std::endl; + Rcpp::Rcout << std::endl; last_time = steady_clock::now(); @@ -408,15 +407,15 @@ void Forest::compute_oobag_vi_single_thread(vec* vi_numer_ptr) { seconds time_from_start = duration_cast(steady_clock::now() - start_time); uint remaining_time = (1 / relative_progress - 1) * time_from_start.count(); - Rcout << "Computing importance: "; - Rcout << round(100 * relative_progress) << "%. "; + Rcpp::Rcout << "Computing importance: "; + Rcpp::Rcout << round(100 * relative_progress) << "%. "; if(progress < max_progress){ - Rcout << "~ time remaining: "; - Rcout << beautifyTime(remaining_time) << "."; + Rcpp::Rcout << "~ time remaining: "; + Rcpp::Rcout << beautifyTime(remaining_time) << "."; } - Rcout << std::endl; + Rcpp::Rcout << std::endl; last_time = steady_clock::now(); @@ -660,12 +659,12 @@ void Forest::predict_single_thread(Data* prediction_data, if(verbosity > 1){ if(oobag){ - Rcout << "--- Computing oobag predictions: tree " << i << " ---"; + Rcpp::Rcout << "--- Computing oobag predictions: tree " << i << " ---"; } else { - Rcout << "------ Computing predictions: tree " << i << " -----"; + Rcpp::Rcout << "------ Computing predictions: tree " << i << " -----"; } - Rcout << std::endl; - Rcout << std::endl; + Rcpp::Rcout << std::endl; + Rcpp::Rcout << std::endl; } trees[i]->predict_leaf(prediction_data, oobag); @@ -698,15 +697,15 @@ void Forest::predict_single_thread(Data* prediction_data, seconds time_from_start = duration_cast(steady_clock::now() - start_time); uint remaining_time = (1 / relative_progress - 1) * time_from_start.count(); - Rcout << "Computing predictions: "; - Rcout << round(100 * relative_progress) << "%. "; + Rcpp::Rcout << "Computing predictions: "; + Rcpp::Rcout << round(100 * relative_progress) << "%. "; if(progress < max_progress){ - Rcout << "~ time remaining: "; - Rcout << beautifyTime(remaining_time) << "."; + Rcpp::Rcout << "~ time remaining: "; + Rcpp::Rcout << beautifyTime(remaining_time) << "."; } - Rcout << std::endl; + Rcpp::Rcout << std::endl; last_time = steady_clock::now(); @@ -842,15 +841,15 @@ void Forest::show_progress(std::string operation, size_t max_progress) { seconds time_from_start = duration_cast(steady_clock::now() - start_time); uint remaining_time = (1 / relative_progress - 1) * time_from_start.count(); - Rcout << operation << ": "; - Rcout << round(100 * relative_progress) << "%. "; + Rcpp::Rcout << operation << ": "; + Rcpp::Rcout << round(100 * relative_progress) << "%. "; if(progress < max_progress){ - Rcout << "~ time remaining: "; - Rcout << beautifyTime(remaining_time) << "."; + Rcpp::Rcout << "~ time remaining: "; + Rcpp::Rcout << beautifyTime(remaining_time) << "."; } - Rcout << std::endl; + Rcpp::Rcout << std::endl; last_time = steady_clock::now(); diff --git a/src/Forest.h b/src/Forest.h index 94734ea5..31d83c6d 100644 --- a/src/Forest.h +++ b/src/Forest.h @@ -58,7 +58,7 @@ class Forest { double lincomb_alpha, arma::uword lincomb_df_target, arma::uword lincomb_ties_method, - RObject lincomb_R_function, + Rcpp::RObject lincomb_R_function, // predictions PredType pred_type, bool pred_mode, @@ -288,7 +288,7 @@ class Forest { arma::uword lincomb_iter_max; arma::uword lincomb_df_target; arma::uword lincomb_ties_method; - RObject lincomb_R_function; + Rcpp::RObject lincomb_R_function; bool grow_mode; @@ -311,7 +311,7 @@ class Forest { arma::mat oobag_eval; EvalType oobag_eval_type; arma::uword oobag_eval_every; - RObject oobag_R_function; + Rcpp::RObject oobag_R_function; // multi-threading