diff --git a/DESCRIPTION b/DESCRIPTION index 997b15f..4c368dc 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -18,7 +18,13 @@ LinkingTo: cpp11 Suggests: knitr, - rmarkdown + rmarkdown, + Matrix, + tgp, + MASS, + mvtnorm, + ggplot2, + latex2exp VignetteBuilder: knitr Imports: R6 diff --git a/NAMESPACE b/NAMESPACE index 4ded939..6ecb479 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,8 +6,11 @@ S3method(predict,bartmodel) S3method(predict,bcf) export(bart) export(bcf) +export(computeForestKernels) +export(computeForestLeafIndices) export(createForestContainer) export(createForestDataset) +export(createForestKernel) export(createForestModel) export(createOutcome) export(createRNG) diff --git a/R/cpp11.R b/R/cpp11.R index e2e5aa2..ea70526 100644 --- a/R/cpp11.R +++ b/R/cpp11.R @@ -76,6 +76,34 @@ rfx_dataset_add_weights_cpp <- function(dataset_ptr, weights) { invisible(.Call(`_stochtree_rfx_dataset_add_weights_cpp`, dataset_ptr, weights)) } +forest_kernel_cpp <- function() { + .Call(`_stochtree_forest_kernel_cpp`) +} + +forest_kernel_compute_leaf_indices_train_cpp <- function(forest_kernel, covariates_train, forest_container, forest_num) { + invisible(.Call(`_stochtree_forest_kernel_compute_leaf_indices_train_cpp`, forest_kernel, covariates_train, forest_container, forest_num)) +} + +forest_kernel_compute_leaf_indices_train_test_cpp <- function(forest_kernel, covariates_train, covariates_test, forest_container, forest_num) { + invisible(.Call(`_stochtree_forest_kernel_compute_leaf_indices_train_test_cpp`, forest_kernel, covariates_train, covariates_test, forest_container, forest_num)) +} + +forest_kernel_get_train_leaf_indices_cpp <- function(forest_kernel) { + .Call(`_stochtree_forest_kernel_get_train_leaf_indices_cpp`, forest_kernel) +} + +forest_kernel_get_test_leaf_indices_cpp <- function(forest_kernel) { + .Call(`_stochtree_forest_kernel_get_test_leaf_indices_cpp`, forest_kernel) +} + +forest_kernel_compute_kernel_train_cpp <- function(forest_kernel, covariates_train, forest_container, forest_num) { + .Call(`_stochtree_forest_kernel_compute_kernel_train_cpp`, forest_kernel, covariates_train, forest_container, forest_num) +} + +forest_kernel_compute_kernel_train_test_cpp <- function(forest_kernel, covariates_train, covariates_test, forest_container, forest_num) { + .Call(`_stochtree_forest_kernel_compute_kernel_train_test_cpp`, forest_kernel, covariates_train, covariates_test, forest_container, forest_num) +} + predict_forest_cpp <- function(forest_samples, dataset) { .Call(`_stochtree_predict_forest_cpp`, forest_samples, dataset) } @@ -200,6 +228,10 @@ num_samples_forest_container_cpp <- function(forest_samples) { .Call(`_stochtree_num_samples_forest_container_cpp`, forest_samples) } +num_trees_forest_container_cpp <- function(forest_samples) { + .Call(`_stochtree_num_trees_forest_container_cpp`, forest_samples) +} + json_save_forest_container_cpp <- function(forest_samples, json_filename) { invisible(.Call(`_stochtree_json_save_forest_container_cpp`, forest_samples, json_filename)) } diff --git a/R/forest.R b/R/forest.R index 55c9698..e7eb385 100644 --- a/R/forest.R +++ b/R/forest.R @@ -141,6 +141,13 @@ ForestSamples <- R6::R6Class( return(num_samples_forest_container_cpp(self$forest_container_ptr)) }, + #' @description + #' Return number of trees in each ensemble of a `ForestContainer` object + #' @return Tree count + num_trees = function() { + return(num_trees_forest_container_cpp(self$forest_container_ptr)) + }, + #' @description #' Return output dimension of trees in a `ForestContainer` 
object #' @return Leaf node parameter size diff --git a/R/kernel.R b/R/kernel.R new file mode 100644 index 0000000..5dff0d5 --- /dev/null +++ b/R/kernel.R @@ -0,0 +1,195 @@ +#' Class that provides functionality for statistical kernel definition and +#' computation based on shared leaf membership of observations in a tree ensemble. +#' +#' @description +#' Computes leaf membership internally as a sparse matrix and also calculates a +#' (dense) kernel based on the sparse matrix all in C++. + +ForestKernel <- R6::R6Class( + classname = "ForestKernel", + cloneable = FALSE, + public = list( + + #' @field forest_kernel_ptr External pointer to a C++ StochTree::ForestKernel class + forest_kernel_ptr = NULL, + + #' @description + #' Create a new ForestKernel object. + #' @return A new `ForestKernel` object. + initialize = function() { + # Initialize forest kernel and return external pointer to the class + self$forest_kernel_ptr <- forest_kernel_cpp() + }, + + #' @description + #' Compute the leaf indices of each tree in the ensemble for every observation in a dataset. + #' Stores the result internally, which can be extracted from the class via a call to `get_leaf_indices`. + #' @param covariates_train Matrix of training set covariates at which to compute leaf indices + #' @param covariates_test (Optional) Matrix of test set covariates at which to compute leaf indices + #' @param forest_container Object of type `ForestSamples` + #' @param forest_num Index of the forest in forest_container to be assessed + #' @return List of vectors. If `covariates_test = NULL` the list has one element (train set leaf indices), and + #' otherwise the list has two elements (train and test set leaf indices). + compute_leaf_indices = function(covariates_train, covariates_test = NULL, forest_container, forest_num) { + # Convert to matrix format if not provided as such + if ((is.null(dim(covariates_train))) && (!is.null(covariates_train))) { + covariates_train <- as.matrix(covariates_train) + } + if ((is.null(dim(covariates_test))) && (!is.null(covariates_test))) { + covariates_test <- as.matrix(covariates_test) + } + + # Compute the leaf indices + result = list() + if (is.null(covariates_test)) { + forest_kernel_compute_leaf_indices_train_cpp( + self$forest_kernel_ptr, covariates_train, + forest_container$forest_container_ptr, forest_num + ) + result[["leaf_indices_train"]] = forest_kernel_get_train_leaf_indices_cpp(self$forest_kernel_ptr) + } else { + forest_kernel_compute_leaf_indices_train_test_cpp( + self$forest_kernel_ptr, covariates_train, covariates_test, + forest_container$forest_container_ptr, forest_num + ) + result[["leaf_indices_train"]] = forest_kernel_get_train_leaf_indices_cpp(self$forest_kernel_ptr) + result[["leaf_indices_test"]] = forest_kernel_get_test_leaf_indices_cpp(self$forest_kernel_ptr) + } + return(result) + }, + + #' @description + #' Compute the kernel implied by a tree ensemble. This function calls `compute_leaf_indices`, + #' so it is not necessary to call both. `compute_leaf_indices` is exposed at the class level + #' to allow for extracting the vector of leaf indices for an ensemble directly in R. + #' @param covariates_train Matrix of training set covariates at which to assess ensemble kernel + #' @param covariates_test (Optional) Matrix of test set covariates at which to assess ensemble kernel + #' @param forest_container Object of type `ForestSamples` + #' @param forest_num Index of the forest in forest_container to be assessed + #' @return List of matrices. 
If `covariates_test = NULL`, the list contains + #' one `n_train` x `n_train` matrix, where `n_train = nrow(covariates_train)`. + #' This matrix is the kernel defined by `W_train %*% t(W_train)` where `W_train` + #' is a matrix with `n_train` rows and as many columns as there are total leaves in an ensemble. + #' If `covariates_test` is not `NULL`, the list contains two more matrices defined by + #' `W_test %*% t(W_train)` and `W_test %*% t(W_test)`. + compute_kernel = function(covariates_train, covariates_test = NULL, forest_container, forest_num) { + # Convert to matrix format if not provided as such + if ((is.null(dim(covariates_train))) && (!is.null(covariates_train))) { + covariates_train <- as.matrix(covariates_train) + } + if ((is.null(dim(covariates_test))) && (!is.null(covariates_test))) { + covariates_test <- as.matrix(covariates_test) + } + + # Compute the kernels + num_trees <- forest_container$num_trees() + if (is.null(covariates_test)) { + result = forest_kernel_compute_kernel_train_cpp( + self$forest_kernel_ptr, covariates_train, + forest_container$forest_container_ptr, forest_num + ) + names(result) <- c("kernel_train") + } else { + result = forest_kernel_compute_kernel_train_test_cpp( + self$forest_kernel_ptr, covariates_train, covariates_test, + forest_container$forest_container_ptr, forest_num + ) + names(result) <- c("kernel_train", "kernel_test_train", "kernel_test") + } + + # Divide each matrix by num_trees + for (i in 1:length(result)) {result[[i]] <- result[[i]] / num_trees} + + return(result) + } + ) +) + +#' Create a `ForestKernel` object +#' +#' @return `ForestKernel` object +#' @export +createForestKernel <- function() { + return(invisible(( + ForestKernel$new() + ))) +} + +#' Compute a kernel from a tree ensemble, defined by the fraction +#' of trees of an ensemble in which two observations fall into the +#' same leaf. +#' +#' @param bart_model Object of type `bartmodel` corresponding to a BART model with at least one sample +#' @param X_train Matrix of "training" data. In a traditional Gaussian process kriging context, this +#' corresponds to the observations for which outcomes are observed. +#' @param X_test (Optional) Matrix of "test" data. In a traditional Gaussian process kriging context, this +#' corresponds to the observations for which outcomes are unobserved and must be estimated +#' based on the kernels k(X_test,X_test), k(X_test,X_train), and k(X_train,X_train). If not provided, +#' this function will only compute k(X_train, X_train). +#' @param forest_num (Optional) Index of the forest sample to use for kernel computation. If not provided, +#' this function will use the last forest. +#' @return List of kernel matrices. If `X_test = NULL`, the list contains +#' one `n_train` x `n_train` matrix, where `n_train = nrow(X_train)`. +#' This matrix is the kernel defined by `W_train %*% t(W_train)` where `W_train` +#' is a matrix with `n_train` rows and as many columns as there are total leaves in an ensemble. +#' If `X_test` is not `NULL`, the list contains two more matrices defined by +#' `W_test %*% t(W_train)` and `W_test %*% t(W_test)`.
+#' @export +computeForestKernels <- function(bart_model, X_train, X_test=NULL, forest_num=NULL) { + stopifnot(class(bart_model)=="bartmodel") + forest_kernel <- createForestKernel() + num_samples <- bart_model$model_params$num_samples + stopifnot(forest_num <= num_samples) + sample_index <- ifelse(is.null(forest_num), num_samples-1, forest_num-1) + return(forest_kernel$compute_kernel( + covariates_train = X_train, covariates_test = X_test, + forest_container = bart_model$forests, forest_num = sample_index + )) +} + +#' Compute and return a vector representation of a forest's leaf predictions for +#' every observation in a dataset. +#' The vector has a "column-major" format that can be easily re-represented +#' as a CSC sparse matrix: elements are organized so that the first `n` elements +#' correspond to leaf predictions for all `n` observations in a dataset for the +#' first tree in an ensemble, the next `n` elements correspond to predictions for +#' the second tree and so on. The "data" for each element is a uniquely +#' mapped column index corresponding to a single leaf of a single tree (i.e. +#' if tree 1 has 3 leaves, its column indices range from 0 to 2, and then tree 2's +#' leaf indices begin at 3, etc...). +#' Users may pass a single dataset (which we refer to here as a "training set") +#' or two datasets (which we refer to as "training and test sets"). This terminology +#' reflects that one potential use-case for a matrix of leaf indices is to define an +#' ensemble-based kernel for kriging. +#' +#' @param bart_model Object of type `bartmodel` corresponding to a BART model with at least one sample +#' @param X_train Matrix of "training" data. In a traditional Gaussian process kriging context, this +#' corresponds to the observations for which outcomes are observed. +#' @param X_test (Optional) Matrix of "test" data. In a traditional Gaussian process kriging context, this +#' corresponds to the observations for which outcomes are unobserved and must be estimated +#' based on the kernels k(X_test,X_test), k(X_test,X_train), and k(X_train,X_train). If not provided, +#' this function will only compute k(X_train, X_train). +#' @param forest_num (Optional) Index of the forest sample to use for kernel computation. If not provided, +#' this function will use the last forest. +#' @return List of vectors. If `X_test = NULL`, the list contains +#' one vector of length `n_train * num_trees`, where `n_train = nrow(X_train)` +#' and `num_trees` is the number of trees in `bart_model`. If `X_test` is not `NULL`, +#' the list contains another vector of length `n_test * num_trees`.
+#' @export +computeForestLeafIndices <- function(bart_model, X_train, X_test=NULL, forest_num=NULL) { + stopifnot(class(bart_model)=="bartmodel") + forest_kernel <- createForestKernel() + num_samples <- bart_model$model_params$num_samples + stopifnot(forest_num <= num_samples) + sample_index <- ifelse(is.null(forest_num), num_samples-1, forest_num-1) + return(forest_kernel$compute_leaf_indices( + covariates_train = X_train, covariates_test = X_test, + forest_container = bart_model$forests, forest_num = sample_index + )) +} diff --git a/man/ForestKernel.Rd b/man/ForestKernel.Rd new file mode 100644 index 0000000..a721b36 --- /dev/null +++ b/man/ForestKernel.Rd @@ -0,0 +1,110 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/kernel.R +\name{ForestKernel} +\alias{ForestKernel} +\title{Class that provides functionality for statistical kernel definition and +computation based on shared leaf membership of observations in a tree ensemble.} +\description{ +Computes leaf membership internally as a sparse matrix and also calculates a +(dense) kernel based on the sparse matrix all in C++. +} +\section{Public fields}{ +\if{html}{\out{
}} +\describe{ +\item{\code{forest_kernel_ptr}}{External pointer to a C++ StochTree::ForestKernel class} +} +\if{html}{\out{
}} +} +\section{Methods}{ +\subsection{Public methods}{ +\itemize{ +\item \href{#method-ForestKernel-new}{\code{ForestKernel$new()}} +\item \href{#method-ForestKernel-compute_leaf_indices}{\code{ForestKernel$compute_leaf_indices()}} +\item \href{#method-ForestKernel-compute_kernel}{\code{ForestKernel$compute_kernel()}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ForestKernel-new}{}}} +\subsection{Method \code{new()}}{ +Create a new ForestKernel object. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{ForestKernel$new()}\if{html}{\out{
}} +} + +\subsection{Returns}{ +A new \code{ForestKernel} object. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ForestKernel-compute_leaf_indices}{}}} +\subsection{Method \code{compute_leaf_indices()}}{ +Compute the leaf indices of each tree in the ensemble for every observation in a dataset. +Stores the result internally, which can be extracted from the class via a call to \code{get_leaf_indices}. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{ForestKernel$compute_leaf_indices( + covariates_train, + covariates_test = NULL, + forest_container, + forest_num +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{covariates_train}}{Matrix of training set covariates at which to compute leaf indices} + +\item{\code{covariates_test}}{(Optional) Matrix of test set covariates at which to compute leaf indices} + +\item{\code{forest_container}}{Object of type \code{ForestSamples}} + +\item{\code{forest_num}}{Index of the forest in forest_container to be assessed} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +List of vectors. If \code{covariates_test = NULL} the list has one element (train set leaf indices), and +otherwise the list has two elements (train and test set leaf indices). +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ForestKernel-compute_kernel}{}}} +\subsection{Method \code{compute_kernel()}}{ +Compute the kernel implied by a tree ensemble. This function calls \code{compute_leaf_indices}, +so it is not necessary to call both. \code{compute_leaf_indices} is exposed at the class level +to allow for extracting the vector of leaf indices for an ensemble directly in R. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{ForestKernel$compute_kernel( + covariates_train, + covariates_test = NULL, + forest_container, + forest_num +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{covariates_train}}{Matrix of training set covariates at which to assess ensemble kernel} + +\item{\code{covariates_test}}{(Optional) Matrix of test set covariates at which to assess ensemble kernel} + +\item{\code{forest_container}}{Object of type \code{ForestSamples}} + +\item{\code{forest_num}}{Index of the forest in forest_container to be assessed} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +List of matrices. If \code{covariates_test = NULL}, the list contains +one \code{n_train} x \code{n_train} matrix, where \code{n_train = nrow(covariates_train)}. +This matrix is the kernel defined by \code{W_train \%*\% t(W_train)} where \code{W_train} +is a matrix with \code{n_train} rows and as many columns as there are total leaves in an ensemble. +If \code{covariates_test} is not \code{NULL}, the list contains two more matrices defined by +\code{W_test \%*\% t(W_train)} and \code{W_test \%*\% t(W_test)}. +} +} +} diff --git a/man/ForestSamples.Rd b/man/ForestSamples.Rd index c351821..a3cb0e2 100644 --- a/man/ForestSamples.Rd +++ b/man/ForestSamples.Rd @@ -25,6 +25,7 @@ Wrapper around a C++ container of tree ensembles \item \href{#method-ForestSamples-save_json}{\code{ForestSamples$save_json()}} \item \href{#method-ForestSamples-load_json}{\code{ForestSamples$load_json()}} \item \href{#method-ForestSamples-num_samples}{\code{ForestSamples$num_samples()}} +\item \href{#method-ForestSamples-num_trees}{\code{ForestSamples$num_trees()}} \item \href{#method-ForestSamples-output_dimension}{\code{ForestSamples$output_dimension()}} } } @@ -226,6 +227,19 @@ Sample count } } \if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ForestSamples-num_trees}{}}} +\subsection{Method \code{num_trees()}}{ +Return number of trees in each ensemble of a \code{ForestContainer} object +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{ForestSamples$num_trees()}\if{html}{\out{
}} +} + +\subsection{Returns}{ +Tree count +} +} +\if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-ForestSamples-output_dimension}{}}} \subsection{Method \code{output_dimension()}}{ diff --git a/man/computeForestKernels.Rd b/man/computeForestKernels.Rd new file mode 100644 index 0000000..7421019 --- /dev/null +++ b/man/computeForestKernels.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/kernel.R +\name{computeForestKernels} +\alias{computeForestKernels} +\title{Compute a kernel from a tree ensemble, defined by the fraction +of trees of an ensemble in which two observations fall into the +same leaf.} +\usage{ +computeForestKernels(bart_model, X_train, X_test = NULL, forest_num = NULL) +} +\arguments{ +\item{bart_model}{Object of type \code{bartmodel} corresponding to a BART model with at least one sample} + +\item{X_train}{Matrix of "training" data. In a traditional Gaussian process kriging context, this +corresponds to the observations for which outcomes are observed.} + +\item{X_test}{(Optional) Matrix of "test" data. In a traditional Gaussian process kriging context, this +corresponds to the observations for which outcomes are unobserved and must be estimated +based on the kernels k(X_test,X_test), k(X_test,X_train), and k(X_train,X_train). If not provided, +this function will only compute k(X_train, X_train).} + +\item{forest_num}{(Option) Index of the forest sample to use for kernel computation. If not provided, +this function will use the last forest.} +} +\value{ +List of kernel matrices. If \code{X_test = NULL}, the list contains +one \code{n_train} x \code{n_train} matrix, where \code{n_train = nrow(X_train)}. +This matrix is the kernel defined by \code{W_train \%*\% t(W_train)} where \code{W_train} +is a matrix with \code{n_train} rows and as many columns as there are total leaves in an ensemble. +If \code{X_test} is not \code{NULL}, the list contains two more matrices defined by +\code{W_test \%*\% t(W_train)} and \code{W_test \%*\% t(W_test)}. +} +\description{ +Compute a kernel from a tree ensemble, defined by the fraction +of trees of an ensemble in which two observations fall into the +same leaf. +} diff --git a/man/computeForestLeafIndices.Rd b/man/computeForestLeafIndices.Rd new file mode 100644 index 0000000..0f5dd89 --- /dev/null +++ b/man/computeForestLeafIndices.Rd @@ -0,0 +1,57 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/kernel.R +\name{computeForestLeafIndices} +\alias{computeForestLeafIndices} +\title{Compute and return a vector representation of a forest's leaf predictions for +every observation in a dataset. +The vector has a "column-major" format that can be easily re-represented as +as a CSC sparse matrix: elements are organized so that the first \code{n} elements +correspond to leaf predictions for all \code{n} observations in a dataset for the +first tree in an ensemble, the next \code{n} elements correspond to predictions for +the second tree and so on. The "data" for each element corresponds to a uniquely +mapped column index that corresponds to a single leaf of a single tree (i.e. +if tree 1 has 3 leaves, its column indices range from 0 to 2, and then tree 2's +leaf indices begin at 3, etc...). +Users may pass a single dataset (which we refer to here as a "training set") +or two datasets (which we refer to as "training and test sets"). 
This verbiage +hints that one potential use-case for a matrix of leaf indices is to define a +ensemble-based kernel for kriging.} +\usage{ +computeForestLeafIndices(bart_model, X_train, X_test = NULL, forest_num = NULL) +} +\arguments{ +\item{bart_model}{Object of type \code{bartmodel} corresponding to a BART model with at least one sample} + +\item{X_train}{Matrix of "training" data. In a traditional Gaussian process kriging context, this +corresponds to the observations for which outcomes are observed.} + +\item{X_test}{(Optional) Matrix of "test" data. In a traditional Gaussian process kriging context, this +corresponds to the observations for which outcomes are unobserved and must be estimated +based on the kernels k(X_test,X_test), k(X_test,X_train), and k(X_train,X_train). If not provided, +this function will only compute k(X_train, X_train).} + +\item{forest_num}{(Option) Index of the forest sample to use for kernel computation. If not provided, +this function will use the last forest.} +} +\value{ +List of vectors. If \code{X_test = NULL}, the list contains +one vector of length \code{n_train * num_trees}, where \code{n_train = nrow(X_train)} +and \code{num_trees} is the number of trees in \code{bart_model}. If \code{X_test} is not \code{NULL}, +the list contains another vector of length \code{n_test * num_trees}. +} +\description{ +Compute and return a vector representation of a forest's leaf predictions for +every observation in a dataset. +The vector has a "column-major" format that can be easily re-represented as +as a CSC sparse matrix: elements are organized so that the first \code{n} elements +correspond to leaf predictions for all \code{n} observations in a dataset for the +first tree in an ensemble, the next \code{n} elements correspond to predictions for +the second tree and so on. The "data" for each element corresponds to a uniquely +mapped column index that corresponds to a single leaf of a single tree (i.e. +if tree 1 has 3 leaves, its column indices range from 0 to 2, and then tree 2's +leaf indices begin at 3, etc...). +Users may pass a single dataset (which we refer to here as a "training set") +or two datasets (which we refer to as "training and test sets"). This verbiage +hints that one potential use-case for a matrix of leaf indices is to define a +ensemble-based kernel for kriging. 
+} diff --git a/man/createForestKernel.Rd b/man/createForestKernel.Rd new file mode 100644 index 0000000..36cf73c --- /dev/null +++ b/man/createForestKernel.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/kernel.R +\name{createForestKernel} +\alias{createForestKernel} +\title{Create a \code{ForestKernel} object} +\usage{ +createForestKernel() +} +\value{ +\code{ForestKernel} object +} +\description{ +Create a \code{ForestKernel} object +} diff --git a/src/Makevars b/src/Makevars index fd1ae1a..b8b3131 100644 --- a/src/Makevars +++ b/src/Makevars @@ -5,6 +5,7 @@ PKG_CPPFLAGS= -I$(CPP_PKGROOT)/include -I$(CPP_PKGROOT)/dependencies/boost_math/ OBJECTS = \ data.o \ + kernel.o \ predictor.o \ random_effects.o \ sampler.o \ diff --git a/src/cpp11.cpp b/src/cpp11.cpp index 2608a6f..fa9dcab 100644 --- a/src/cpp11.cpp +++ b/src/cpp11.cpp @@ -145,6 +145,57 @@ extern "C" SEXP _stochtree_rfx_dataset_add_weights_cpp(SEXP dataset_ptr, SEXP we return R_NilValue; END_CPP11 } +// kernel.cpp +cpp11::external_pointer forest_kernel_cpp(); +extern "C" SEXP _stochtree_forest_kernel_cpp() { + BEGIN_CPP11 + return cpp11::as_sexp(forest_kernel_cpp()); + END_CPP11 +} +// kernel.cpp +void forest_kernel_compute_leaf_indices_train_cpp(cpp11::external_pointer forest_kernel, cpp11::doubles_matrix<> covariates_train, cpp11::external_pointer forest_container, int forest_num); +extern "C" SEXP _stochtree_forest_kernel_compute_leaf_indices_train_cpp(SEXP forest_kernel, SEXP covariates_train, SEXP forest_container, SEXP forest_num) { + BEGIN_CPP11 + forest_kernel_compute_leaf_indices_train_cpp(cpp11::as_cpp>>(forest_kernel), cpp11::as_cpp>>(covariates_train), cpp11::as_cpp>>(forest_container), cpp11::as_cpp>(forest_num)); + return R_NilValue; + END_CPP11 +} +// kernel.cpp +void forest_kernel_compute_leaf_indices_train_test_cpp(cpp11::external_pointer forest_kernel, cpp11::doubles_matrix<> covariates_train, cpp11::doubles_matrix<> covariates_test, cpp11::external_pointer forest_container, int forest_num); +extern "C" SEXP _stochtree_forest_kernel_compute_leaf_indices_train_test_cpp(SEXP forest_kernel, SEXP covariates_train, SEXP covariates_test, SEXP forest_container, SEXP forest_num) { + BEGIN_CPP11 + forest_kernel_compute_leaf_indices_train_test_cpp(cpp11::as_cpp>>(forest_kernel), cpp11::as_cpp>>(covariates_train), cpp11::as_cpp>>(covariates_test), cpp11::as_cpp>>(forest_container), cpp11::as_cpp>(forest_num)); + return R_NilValue; + END_CPP11 +} +// kernel.cpp +cpp11::writable::integers forest_kernel_get_train_leaf_indices_cpp(cpp11::external_pointer forest_kernel); +extern "C" SEXP _stochtree_forest_kernel_get_train_leaf_indices_cpp(SEXP forest_kernel) { + BEGIN_CPP11 + return cpp11::as_sexp(forest_kernel_get_train_leaf_indices_cpp(cpp11::as_cpp>>(forest_kernel))); + END_CPP11 +} +// kernel.cpp +cpp11::writable::integers forest_kernel_get_test_leaf_indices_cpp(cpp11::external_pointer forest_kernel); +extern "C" SEXP _stochtree_forest_kernel_get_test_leaf_indices_cpp(SEXP forest_kernel) { + BEGIN_CPP11 + return cpp11::as_sexp(forest_kernel_get_test_leaf_indices_cpp(cpp11::as_cpp>>(forest_kernel))); + END_CPP11 +} +// kernel.cpp +cpp11::list forest_kernel_compute_kernel_train_cpp(cpp11::external_pointer forest_kernel, cpp11::doubles_matrix<> covariates_train, cpp11::external_pointer forest_container, int forest_num); +extern "C" SEXP _stochtree_forest_kernel_compute_kernel_train_cpp(SEXP forest_kernel, SEXP covariates_train, SEXP forest_container, SEXP forest_num) { + 
BEGIN_CPP11 + return cpp11::as_sexp(forest_kernel_compute_kernel_train_cpp(cpp11::as_cpp>>(forest_kernel), cpp11::as_cpp>>(covariates_train), cpp11::as_cpp>>(forest_container), cpp11::as_cpp>(forest_num))); + END_CPP11 +} +// kernel.cpp +cpp11::list forest_kernel_compute_kernel_train_test_cpp(cpp11::external_pointer forest_kernel, cpp11::doubles_matrix<> covariates_train, cpp11::doubles_matrix<> covariates_test, cpp11::external_pointer forest_container, int forest_num); +extern "C" SEXP _stochtree_forest_kernel_compute_kernel_train_test_cpp(SEXP forest_kernel, SEXP covariates_train, SEXP covariates_test, SEXP forest_container, SEXP forest_num) { + BEGIN_CPP11 + return cpp11::as_sexp(forest_kernel_compute_kernel_train_test_cpp(cpp11::as_cpp>>(forest_kernel), cpp11::as_cpp>>(covariates_train), cpp11::as_cpp>>(covariates_test), cpp11::as_cpp>>(forest_container), cpp11::as_cpp>(forest_num))); + END_CPP11 +} // predictor.cpp cpp11::writable::doubles_matrix<> predict_forest_cpp(cpp11::external_pointer forest_samples, cpp11::external_pointer dataset); extern "C" SEXP _stochtree_predict_forest_cpp(SEXP forest_samples, SEXP dataset) { @@ -372,6 +423,13 @@ extern "C" SEXP _stochtree_num_samples_forest_container_cpp(SEXP forest_samples) END_CPP11 } // sampler.cpp +int num_trees_forest_container_cpp(cpp11::external_pointer forest_samples); +extern "C" SEXP _stochtree_num_trees_forest_container_cpp(SEXP forest_samples) { + BEGIN_CPP11 + return cpp11::as_sexp(num_trees_forest_container_cpp(cpp11::as_cpp>>(forest_samples))); + END_CPP11 +} +// sampler.cpp void json_save_forest_container_cpp(cpp11::external_pointer forest_samples, std::string json_filename); extern "C" SEXP _stochtree_json_save_forest_container_cpp(SEXP forest_samples, SEXP json_filename) { BEGIN_CPP11 @@ -457,67 +515,75 @@ extern "C" SEXP _stochtree_forest_tracker_cpp(SEXP data, SEXP feature_types, SEX extern "C" { static const R_CallMethodDef CallEntries[] = { - {"_stochtree_add_sample_forest_container_cpp", (DL_FUNC) &_stochtree_add_sample_forest_container_cpp, 1}, - {"_stochtree_all_roots_forest_container_cpp", (DL_FUNC) &_stochtree_all_roots_forest_container_cpp, 2}, - {"_stochtree_create_column_vector_cpp", (DL_FUNC) &_stochtree_create_column_vector_cpp, 1}, - {"_stochtree_create_forest_dataset_cpp", (DL_FUNC) &_stochtree_create_forest_dataset_cpp, 0}, - {"_stochtree_create_rfx_dataset_cpp", (DL_FUNC) &_stochtree_create_rfx_dataset_cpp, 0}, - {"_stochtree_dataset_has_basis_cpp", (DL_FUNC) &_stochtree_dataset_has_basis_cpp, 1}, - {"_stochtree_dataset_has_variance_weights_cpp", (DL_FUNC) &_stochtree_dataset_has_variance_weights_cpp, 1}, - {"_stochtree_dataset_num_basis_cpp", (DL_FUNC) &_stochtree_dataset_num_basis_cpp, 1}, - {"_stochtree_dataset_num_covariates_cpp", (DL_FUNC) &_stochtree_dataset_num_covariates_cpp, 1}, - {"_stochtree_dataset_num_rows_cpp", (DL_FUNC) &_stochtree_dataset_num_rows_cpp, 1}, - {"_stochtree_forest_container_cpp", (DL_FUNC) &_stochtree_forest_container_cpp, 3}, - {"_stochtree_forest_dataset_add_basis_cpp", (DL_FUNC) &_stochtree_forest_dataset_add_basis_cpp, 2}, - {"_stochtree_forest_dataset_add_covariates_cpp", (DL_FUNC) &_stochtree_forest_dataset_add_covariates_cpp, 2}, - {"_stochtree_forest_dataset_add_weights_cpp", (DL_FUNC) &_stochtree_forest_dataset_add_weights_cpp, 2}, - {"_stochtree_forest_dataset_update_basis_cpp", (DL_FUNC) &_stochtree_forest_dataset_update_basis_cpp, 2}, - {"_stochtree_forest_tracker_cpp", (DL_FUNC) &_stochtree_forest_tracker_cpp, 4}, - 
{"_stochtree_is_leaf_constant_forest_container_cpp", (DL_FUNC) &_stochtree_is_leaf_constant_forest_container_cpp, 1}, - {"_stochtree_json_load_forest_container_cpp", (DL_FUNC) &_stochtree_json_load_forest_container_cpp, 2}, - {"_stochtree_json_save_forest_container_cpp", (DL_FUNC) &_stochtree_json_save_forest_container_cpp, 2}, - {"_stochtree_num_samples_forest_container_cpp", (DL_FUNC) &_stochtree_num_samples_forest_container_cpp, 1}, - {"_stochtree_output_dimension_forest_container_cpp", (DL_FUNC) &_stochtree_output_dimension_forest_container_cpp, 1}, - {"_stochtree_predict_forest_cpp", (DL_FUNC) &_stochtree_predict_forest_cpp, 2}, - {"_stochtree_predict_forest_raw_cpp", (DL_FUNC) &_stochtree_predict_forest_raw_cpp, 2}, - {"_stochtree_predict_forest_raw_single_forest_cpp", (DL_FUNC) &_stochtree_predict_forest_raw_single_forest_cpp, 3}, - {"_stochtree_rfx_container_cpp", (DL_FUNC) &_stochtree_rfx_container_cpp, 2}, - {"_stochtree_rfx_container_get_alpha_cpp", (DL_FUNC) &_stochtree_rfx_container_get_alpha_cpp, 1}, - {"_stochtree_rfx_container_get_beta_cpp", (DL_FUNC) &_stochtree_rfx_container_get_beta_cpp, 1}, - {"_stochtree_rfx_container_get_sigma_cpp", (DL_FUNC) &_stochtree_rfx_container_get_sigma_cpp, 1}, - {"_stochtree_rfx_container_get_xi_cpp", (DL_FUNC) &_stochtree_rfx_container_get_xi_cpp, 1}, - {"_stochtree_rfx_container_num_components_cpp", (DL_FUNC) &_stochtree_rfx_container_num_components_cpp, 1}, - {"_stochtree_rfx_container_num_groups_cpp", (DL_FUNC) &_stochtree_rfx_container_num_groups_cpp, 1}, - {"_stochtree_rfx_container_num_samples_cpp", (DL_FUNC) &_stochtree_rfx_container_num_samples_cpp, 1}, - {"_stochtree_rfx_container_predict_cpp", (DL_FUNC) &_stochtree_rfx_container_predict_cpp, 3}, - {"_stochtree_rfx_dataset_add_basis_cpp", (DL_FUNC) &_stochtree_rfx_dataset_add_basis_cpp, 2}, - {"_stochtree_rfx_dataset_add_group_labels_cpp", (DL_FUNC) &_stochtree_rfx_dataset_add_group_labels_cpp, 2}, - {"_stochtree_rfx_dataset_add_weights_cpp", (DL_FUNC) &_stochtree_rfx_dataset_add_weights_cpp, 2}, - {"_stochtree_rfx_dataset_has_basis_cpp", (DL_FUNC) &_stochtree_rfx_dataset_has_basis_cpp, 1}, - {"_stochtree_rfx_dataset_has_group_labels_cpp", (DL_FUNC) &_stochtree_rfx_dataset_has_group_labels_cpp, 1}, - {"_stochtree_rfx_dataset_has_variance_weights_cpp", (DL_FUNC) &_stochtree_rfx_dataset_has_variance_weights_cpp, 1}, - {"_stochtree_rfx_dataset_num_rows_cpp", (DL_FUNC) &_stochtree_rfx_dataset_num_rows_cpp, 1}, - {"_stochtree_rfx_label_mapper_cpp", (DL_FUNC) &_stochtree_rfx_label_mapper_cpp, 1}, - {"_stochtree_rfx_label_mapper_to_list_cpp", (DL_FUNC) &_stochtree_rfx_label_mapper_to_list_cpp, 1}, - {"_stochtree_rfx_model_cpp", (DL_FUNC) &_stochtree_rfx_model_cpp, 2}, - {"_stochtree_rfx_model_sample_random_effects_cpp", (DL_FUNC) &_stochtree_rfx_model_sample_random_effects_cpp, 7}, - {"_stochtree_rfx_model_set_group_parameter_covariance_cpp", (DL_FUNC) &_stochtree_rfx_model_set_group_parameter_covariance_cpp, 2}, - {"_stochtree_rfx_model_set_group_parameters_cpp", (DL_FUNC) &_stochtree_rfx_model_set_group_parameters_cpp, 2}, - {"_stochtree_rfx_model_set_variance_prior_scale_cpp", (DL_FUNC) &_stochtree_rfx_model_set_variance_prior_scale_cpp, 2}, - {"_stochtree_rfx_model_set_variance_prior_shape_cpp", (DL_FUNC) &_stochtree_rfx_model_set_variance_prior_shape_cpp, 2}, - {"_stochtree_rfx_model_set_working_parameter_covariance_cpp", (DL_FUNC) &_stochtree_rfx_model_set_working_parameter_covariance_cpp, 2}, - {"_stochtree_rfx_model_set_working_parameter_cpp", (DL_FUNC) 
&_stochtree_rfx_model_set_working_parameter_cpp, 2}, - {"_stochtree_rfx_tracker_cpp", (DL_FUNC) &_stochtree_rfx_tracker_cpp, 1}, - {"_stochtree_rfx_tracker_get_unique_group_ids_cpp", (DL_FUNC) &_stochtree_rfx_tracker_get_unique_group_ids_cpp, 1}, - {"_stochtree_rng_cpp", (DL_FUNC) &_stochtree_rng_cpp, 1}, - {"_stochtree_sample_gfr_one_iteration_cpp", (DL_FUNC) &_stochtree_sample_gfr_one_iteration_cpp, 13}, - {"_stochtree_sample_mcmc_one_iteration_cpp", (DL_FUNC) &_stochtree_sample_mcmc_one_iteration_cpp, 13}, - {"_stochtree_sample_sigma2_one_iteration_cpp", (DL_FUNC) &_stochtree_sample_sigma2_one_iteration_cpp, 4}, - {"_stochtree_sample_tau_one_iteration_cpp", (DL_FUNC) &_stochtree_sample_tau_one_iteration_cpp, 5}, - {"_stochtree_set_leaf_value_forest_container_cpp", (DL_FUNC) &_stochtree_set_leaf_value_forest_container_cpp, 2}, - {"_stochtree_set_leaf_vector_forest_container_cpp", (DL_FUNC) &_stochtree_set_leaf_vector_forest_container_cpp, 2}, - {"_stochtree_tree_prior_cpp", (DL_FUNC) &_stochtree_tree_prior_cpp, 3}, - {"_stochtree_update_residual_forest_container_cpp", (DL_FUNC) &_stochtree_update_residual_forest_container_cpp, 7}, + {"_stochtree_add_sample_forest_container_cpp", (DL_FUNC) &_stochtree_add_sample_forest_container_cpp, 1}, + {"_stochtree_all_roots_forest_container_cpp", (DL_FUNC) &_stochtree_all_roots_forest_container_cpp, 2}, + {"_stochtree_create_column_vector_cpp", (DL_FUNC) &_stochtree_create_column_vector_cpp, 1}, + {"_stochtree_create_forest_dataset_cpp", (DL_FUNC) &_stochtree_create_forest_dataset_cpp, 0}, + {"_stochtree_create_rfx_dataset_cpp", (DL_FUNC) &_stochtree_create_rfx_dataset_cpp, 0}, + {"_stochtree_dataset_has_basis_cpp", (DL_FUNC) &_stochtree_dataset_has_basis_cpp, 1}, + {"_stochtree_dataset_has_variance_weights_cpp", (DL_FUNC) &_stochtree_dataset_has_variance_weights_cpp, 1}, + {"_stochtree_dataset_num_basis_cpp", (DL_FUNC) &_stochtree_dataset_num_basis_cpp, 1}, + {"_stochtree_dataset_num_covariates_cpp", (DL_FUNC) &_stochtree_dataset_num_covariates_cpp, 1}, + {"_stochtree_dataset_num_rows_cpp", (DL_FUNC) &_stochtree_dataset_num_rows_cpp, 1}, + {"_stochtree_forest_container_cpp", (DL_FUNC) &_stochtree_forest_container_cpp, 3}, + {"_stochtree_forest_dataset_add_basis_cpp", (DL_FUNC) &_stochtree_forest_dataset_add_basis_cpp, 2}, + {"_stochtree_forest_dataset_add_covariates_cpp", (DL_FUNC) &_stochtree_forest_dataset_add_covariates_cpp, 2}, + {"_stochtree_forest_dataset_add_weights_cpp", (DL_FUNC) &_stochtree_forest_dataset_add_weights_cpp, 2}, + {"_stochtree_forest_dataset_update_basis_cpp", (DL_FUNC) &_stochtree_forest_dataset_update_basis_cpp, 2}, + {"_stochtree_forest_kernel_compute_kernel_train_cpp", (DL_FUNC) &_stochtree_forest_kernel_compute_kernel_train_cpp, 4}, + {"_stochtree_forest_kernel_compute_kernel_train_test_cpp", (DL_FUNC) &_stochtree_forest_kernel_compute_kernel_train_test_cpp, 5}, + {"_stochtree_forest_kernel_compute_leaf_indices_train_cpp", (DL_FUNC) &_stochtree_forest_kernel_compute_leaf_indices_train_cpp, 4}, + {"_stochtree_forest_kernel_compute_leaf_indices_train_test_cpp", (DL_FUNC) &_stochtree_forest_kernel_compute_leaf_indices_train_test_cpp, 5}, + {"_stochtree_forest_kernel_cpp", (DL_FUNC) &_stochtree_forest_kernel_cpp, 0}, + {"_stochtree_forest_kernel_get_test_leaf_indices_cpp", (DL_FUNC) &_stochtree_forest_kernel_get_test_leaf_indices_cpp, 1}, + {"_stochtree_forest_kernel_get_train_leaf_indices_cpp", (DL_FUNC) &_stochtree_forest_kernel_get_train_leaf_indices_cpp, 1}, + {"_stochtree_forest_tracker_cpp", (DL_FUNC) 
&_stochtree_forest_tracker_cpp, 4}, + {"_stochtree_is_leaf_constant_forest_container_cpp", (DL_FUNC) &_stochtree_is_leaf_constant_forest_container_cpp, 1}, + {"_stochtree_json_load_forest_container_cpp", (DL_FUNC) &_stochtree_json_load_forest_container_cpp, 2}, + {"_stochtree_json_save_forest_container_cpp", (DL_FUNC) &_stochtree_json_save_forest_container_cpp, 2}, + {"_stochtree_num_samples_forest_container_cpp", (DL_FUNC) &_stochtree_num_samples_forest_container_cpp, 1}, + {"_stochtree_num_trees_forest_container_cpp", (DL_FUNC) &_stochtree_num_trees_forest_container_cpp, 1}, + {"_stochtree_output_dimension_forest_container_cpp", (DL_FUNC) &_stochtree_output_dimension_forest_container_cpp, 1}, + {"_stochtree_predict_forest_cpp", (DL_FUNC) &_stochtree_predict_forest_cpp, 2}, + {"_stochtree_predict_forest_raw_cpp", (DL_FUNC) &_stochtree_predict_forest_raw_cpp, 2}, + {"_stochtree_predict_forest_raw_single_forest_cpp", (DL_FUNC) &_stochtree_predict_forest_raw_single_forest_cpp, 3}, + {"_stochtree_rfx_container_cpp", (DL_FUNC) &_stochtree_rfx_container_cpp, 2}, + {"_stochtree_rfx_container_get_alpha_cpp", (DL_FUNC) &_stochtree_rfx_container_get_alpha_cpp, 1}, + {"_stochtree_rfx_container_get_beta_cpp", (DL_FUNC) &_stochtree_rfx_container_get_beta_cpp, 1}, + {"_stochtree_rfx_container_get_sigma_cpp", (DL_FUNC) &_stochtree_rfx_container_get_sigma_cpp, 1}, + {"_stochtree_rfx_container_get_xi_cpp", (DL_FUNC) &_stochtree_rfx_container_get_xi_cpp, 1}, + {"_stochtree_rfx_container_num_components_cpp", (DL_FUNC) &_stochtree_rfx_container_num_components_cpp, 1}, + {"_stochtree_rfx_container_num_groups_cpp", (DL_FUNC) &_stochtree_rfx_container_num_groups_cpp, 1}, + {"_stochtree_rfx_container_num_samples_cpp", (DL_FUNC) &_stochtree_rfx_container_num_samples_cpp, 1}, + {"_stochtree_rfx_container_predict_cpp", (DL_FUNC) &_stochtree_rfx_container_predict_cpp, 3}, + {"_stochtree_rfx_dataset_add_basis_cpp", (DL_FUNC) &_stochtree_rfx_dataset_add_basis_cpp, 2}, + {"_stochtree_rfx_dataset_add_group_labels_cpp", (DL_FUNC) &_stochtree_rfx_dataset_add_group_labels_cpp, 2}, + {"_stochtree_rfx_dataset_add_weights_cpp", (DL_FUNC) &_stochtree_rfx_dataset_add_weights_cpp, 2}, + {"_stochtree_rfx_dataset_has_basis_cpp", (DL_FUNC) &_stochtree_rfx_dataset_has_basis_cpp, 1}, + {"_stochtree_rfx_dataset_has_group_labels_cpp", (DL_FUNC) &_stochtree_rfx_dataset_has_group_labels_cpp, 1}, + {"_stochtree_rfx_dataset_has_variance_weights_cpp", (DL_FUNC) &_stochtree_rfx_dataset_has_variance_weights_cpp, 1}, + {"_stochtree_rfx_dataset_num_rows_cpp", (DL_FUNC) &_stochtree_rfx_dataset_num_rows_cpp, 1}, + {"_stochtree_rfx_label_mapper_cpp", (DL_FUNC) &_stochtree_rfx_label_mapper_cpp, 1}, + {"_stochtree_rfx_label_mapper_to_list_cpp", (DL_FUNC) &_stochtree_rfx_label_mapper_to_list_cpp, 1}, + {"_stochtree_rfx_model_cpp", (DL_FUNC) &_stochtree_rfx_model_cpp, 2}, + {"_stochtree_rfx_model_sample_random_effects_cpp", (DL_FUNC) &_stochtree_rfx_model_sample_random_effects_cpp, 7}, + {"_stochtree_rfx_model_set_group_parameter_covariance_cpp", (DL_FUNC) &_stochtree_rfx_model_set_group_parameter_covariance_cpp, 2}, + {"_stochtree_rfx_model_set_group_parameters_cpp", (DL_FUNC) &_stochtree_rfx_model_set_group_parameters_cpp, 2}, + {"_stochtree_rfx_model_set_variance_prior_scale_cpp", (DL_FUNC) &_stochtree_rfx_model_set_variance_prior_scale_cpp, 2}, + {"_stochtree_rfx_model_set_variance_prior_shape_cpp", (DL_FUNC) &_stochtree_rfx_model_set_variance_prior_shape_cpp, 2}, + {"_stochtree_rfx_model_set_working_parameter_covariance_cpp", (DL_FUNC) 
&_stochtree_rfx_model_set_working_parameter_covariance_cpp, 2}, + {"_stochtree_rfx_model_set_working_parameter_cpp", (DL_FUNC) &_stochtree_rfx_model_set_working_parameter_cpp, 2}, + {"_stochtree_rfx_tracker_cpp", (DL_FUNC) &_stochtree_rfx_tracker_cpp, 1}, + {"_stochtree_rfx_tracker_get_unique_group_ids_cpp", (DL_FUNC) &_stochtree_rfx_tracker_get_unique_group_ids_cpp, 1}, + {"_stochtree_rng_cpp", (DL_FUNC) &_stochtree_rng_cpp, 1}, + {"_stochtree_sample_gfr_one_iteration_cpp", (DL_FUNC) &_stochtree_sample_gfr_one_iteration_cpp, 13}, + {"_stochtree_sample_mcmc_one_iteration_cpp", (DL_FUNC) &_stochtree_sample_mcmc_one_iteration_cpp, 13}, + {"_stochtree_sample_sigma2_one_iteration_cpp", (DL_FUNC) &_stochtree_sample_sigma2_one_iteration_cpp, 4}, + {"_stochtree_sample_tau_one_iteration_cpp", (DL_FUNC) &_stochtree_sample_tau_one_iteration_cpp, 5}, + {"_stochtree_set_leaf_value_forest_container_cpp", (DL_FUNC) &_stochtree_set_leaf_value_forest_container_cpp, 2}, + {"_stochtree_set_leaf_vector_forest_container_cpp", (DL_FUNC) &_stochtree_set_leaf_vector_forest_container_cpp, 2}, + {"_stochtree_tree_prior_cpp", (DL_FUNC) &_stochtree_tree_prior_cpp, 3}, + {"_stochtree_update_residual_forest_container_cpp", (DL_FUNC) &_stochtree_update_residual_forest_container_cpp, 7}, {NULL, NULL, 0} }; } diff --git a/src/kernel.cpp b/src/kernel.cpp new file mode 100644 index 0000000..a301cbd --- /dev/null +++ b/src/kernel.cpp @@ -0,0 +1,153 @@ +#include +#include "stochtree_types.h" +#include +#include +#include +#include +#include +#include + +typedef Eigen::Map> KernelMatrixType; + +[[cpp11::register]] +cpp11::external_pointer forest_kernel_cpp() { + // Create smart pointer to newly allocated object + std::unique_ptr forest_kernel_ptr_ = std::make_unique(); + + // Release management of the pointer to R session + return cpp11::external_pointer(forest_kernel_ptr_.release()); +} + +[[cpp11::register]] +void forest_kernel_compute_leaf_indices_train_cpp(cpp11::external_pointer forest_kernel, cpp11::doubles_matrix<> covariates_train, + cpp11::external_pointer forest_container, int forest_num) { + // Wrap an Eigen Map around the raw data of the covariate_train matrix + StochTree::data_size_t n_train = covariates_train.nrow(); + int num_covariates_train = covariates_train.ncol(); + double* covariate_train_data_ptr = REAL(PROTECT(covariates_train)); + KernelMatrixType covariates_train_eigen(covariate_train_data_ptr, n_train, num_covariates_train); + + // Compute leaf indices + forest_kernel->ComputeLeafIndices(covariates_train_eigen, *(forest_container->GetEnsemble(forest_num))); + + // Unprotect pointers to R data + UNPROTECT(1); +} + +[[cpp11::register]] +void forest_kernel_compute_leaf_indices_train_test_cpp(cpp11::external_pointer forest_kernel, cpp11::doubles_matrix<> covariates_train, cpp11::doubles_matrix<> covariates_test, + cpp11::external_pointer forest_container, int forest_num) { + // Wrap an Eigen Map around the raw data of the covariate_train matrix + StochTree::data_size_t n_train = covariates_train.nrow(); + int num_covariates_train = covariates_train.ncol(); + double* covariate_train_data_ptr = REAL(PROTECT(covariates_train)); + KernelMatrixType covariates_train_eigen(covariate_train_data_ptr, n_train, num_covariates_train); + + // Wrap an Eigen Map around the raw data of the covariate_test matrix + StochTree::data_size_t n_test = covariates_test.nrow(); + int num_covariates_test = covariates_test.ncol(); + double* covariate_test_data_ptr = REAL(PROTECT(covariates_test)); + KernelMatrixType 
covariates_test_eigen(covariate_test_data_ptr, n_test, num_covariates_test); + + // Compute leaf indices + forest_kernel->ComputeLeafIndices(covariates_train_eigen, covariates_test_eigen, *(forest_container->GetEnsemble(forest_num))); + + // Unprotect pointers to R data + UNPROTECT(2); +} + +[[cpp11::register]] +cpp11::writable::integers forest_kernel_get_train_leaf_indices_cpp(cpp11::external_pointer forest_kernel) { + if (!forest_kernel->HasTrainLeafIndices()) { + cpp11::writable::integers output; + return output; + } + std::vector train_indices = forest_kernel->GetTrainLeafIndices(); + cpp11::writable::integers output(train_indices.begin(), train_indices.end()); + return output; +} + +[[cpp11::register]] +cpp11::writable::integers forest_kernel_get_test_leaf_indices_cpp(cpp11::external_pointer forest_kernel) { + if (!forest_kernel->HasTestLeafIndices()) { + cpp11::writable::integers output; + return output; + } + std::vector test_indices = forest_kernel->GetTestLeafIndices(); + cpp11::writable::integers output(test_indices.begin(), test_indices.end()); + return output; +} + +[[cpp11::register]] +cpp11::list forest_kernel_compute_kernel_train_cpp( + cpp11::external_pointer forest_kernel, cpp11::doubles_matrix<> covariates_train, + cpp11::external_pointer forest_container, int forest_num +) { + // Wrap an Eigen Map around the raw data of the covariate_train matrix + StochTree::data_size_t n_train = covariates_train.nrow(); + int num_covariates_train = covariates_train.ncol(); + double* covariate_train_data_ptr = REAL(PROTECT(covariates_train)); + KernelMatrixType covariates_train_eigen(covariate_train_data_ptr, n_train, num_covariates_train); + + // Declare outputs + cpp11::writable::doubles_matrix<> kernel_train(n_train, n_train); + + // Wrap Eigen Maps around kernel and kernel inverse matrices + double* kernel_data_ptr = REAL(PROTECT(kernel_train)); + KernelMatrixType kernel_eigen(kernel_data_ptr, n_train, n_train); + + // Compute kernel terms + forest_kernel->ComputeKernelExternal(covariates_train_eigen, *(forest_container->GetEnsemble(forest_num)), kernel_eigen); + + // Unprotect pointers to R data + UNPROTECT(2); + + // Return list of vectors + cpp11::writable::list result; + result.push_back(kernel_train); + return result; +} + +[[cpp11::register]] +cpp11::list forest_kernel_compute_kernel_train_test_cpp( + cpp11::external_pointer forest_kernel, cpp11::doubles_matrix<> covariates_train, + cpp11::doubles_matrix<> covariates_test, cpp11::external_pointer forest_container, int forest_num +) { + // Wrap an Eigen Map around the raw data of the covariate_train matrix + StochTree::data_size_t n_train = covariates_train.nrow(); + int num_covariates_train = covariates_train.ncol(); + double* covariate_train_data_ptr = REAL(PROTECT(covariates_train)); + KernelMatrixType covariates_train_eigen(covariate_train_data_ptr, n_train, num_covariates_train); + + // Wrap an Eigen Map around the raw data of the covariate_test matrix + StochTree::data_size_t n_test = covariates_test.nrow(); + int num_covariates_test = covariates_test.ncol(); + double* covariate_test_data_ptr = REAL(PROTECT(covariates_test)); + KernelMatrixType covariates_test_eigen(covariate_test_data_ptr, n_test, num_covariates_test); + + // Declare outputs + cpp11::writable::doubles_matrix<> kernel_train(n_train, n_train); + cpp11::writable::doubles_matrix<> kernel_test_train(n_test, n_train); + cpp11::writable::doubles_matrix<> kernel_test(n_test, n_test); + + // Wrap Eigen Maps around kernel and kernel inverse matrices + double* 
kernel_data_ptr = REAL(PROTECT(kernel_train)); + double* kernel_test_train_data_ptr = REAL(PROTECT(kernel_test_train)); + double* kernel_test_data_ptr = REAL(PROTECT(kernel_test)); + KernelMatrixType kernel_train_eigen(kernel_data_ptr, n_train, n_train); + KernelMatrixType kernel_test_train_eigen(kernel_test_train_data_ptr, n_test, n_train); + KernelMatrixType kernel_test_eigen(kernel_test_data_ptr, n_test, n_test); + + // Compute kernel terms + forest_kernel->ComputeKernelExternal(covariates_train_eigen, covariates_test_eigen, *(forest_container->GetEnsemble(forest_num)), kernel_train_eigen, kernel_test_train_eigen, kernel_test_eigen); + + // Unprotect pointers to R data + UNPROTECT(5); + + // Return list of vectors + cpp11::writable::list result; + result.push_back(kernel_train); + result.push_back(kernel_test_train); + result.push_back(kernel_test); + return result; +} diff --git a/src/sampler.cpp b/src/sampler.cpp index cf34fa6..57ac15b 100644 --- a/src/sampler.cpp +++ b/src/sampler.cpp @@ -186,6 +186,11 @@ int num_samples_forest_container_cpp(cpp11::external_pointerNumSamples(); } +[[cpp11::register]] +int num_trees_forest_container_cpp(cpp11::external_pointer forest_samples) { + return forest_samples->NumTrees(); +} + [[cpp11::register]] void json_save_forest_container_cpp(cpp11::external_pointer forest_samples, std::string json_filename) { forest_samples->SaveToJsonFile(json_filename); diff --git a/src/stochtree-cpp b/src/stochtree-cpp index 44e3be1..40608da 160000 --- a/src/stochtree-cpp +++ b/src/stochtree-cpp @@ -1 +1 @@ -Subproject commit 44e3be19ffdef00db3ca5180099bfba5ebbda5c9 +Subproject commit 40608da37812ae92ee45f85c2e2314198690041f diff --git a/src/stochtree_types.h b/src/stochtree_types.h index 33a8fb6..096dc58 100644 --- a/src/stochtree_types.h +++ b/src/stochtree_types.h @@ -1,5 +1,6 @@ #include #include +#include #include #include #include diff --git a/tools/debug/kernel_debug.R b/tools/debug/kernel_debug.R new file mode 100644 index 0000000..756a239 --- /dev/null +++ b/tools/debug/kernel_debug.R @@ -0,0 +1,46 @@ +library(stochtree) + +# Generate the data +X_train <- seq(0,20,length=100) +X_test <- seq(0,20,length=99) +y_train <- (sin(pi*X_train/5) + 0.2*cos(4*pi*X_train/5)) * (X_train <= 9.6) +lin_train <- X_train>9.6; +y_train[lin_train] <- -1 + X_train[lin_train]/10 +y_train <- y_train + rnorm(length(y_train), sd=0.1) +y_test <- (sin(pi*X_test/5) + 0.2*cos(4*pi*X_test/5)) * (X_test <= 9.6) +lin_test <- X_test>9.6; +y_test[lin_test] <- -1 + X_test[lin_test]/10 + +bart_model <- bart(X_train=X_train, y_train=y_train, X_test=X_test) +forest_kernel <- createForestKernel() +result_inds <- forest_kernel$compute_leaf_indices( + covariates_train = X_train, covariates_test = X_test, + forest_container = bart_model$forests, + forest_num = bart_model$model_params$num_samples - 1 +) +result_kernels <- forest_kernel$compute_kernel( + covariates_train = X_train, covariates_test = X_test, + forest_container = bart_model$forests, + forest_num = bart_model$model_params$num_samples - 1 +) + +# GP +mu_tilde <- result_kernels$kernel_test_train %*% MASS::ginv(result_kernels$kernel_train) %*% y_train +Sigma_tilde <- result_kernels$kernel_test - result_kernels$kernel_test_train %*% MASS::ginv(result_kernels$kernel_train) %*% t(result_kernels$kernel_test_train) + +B <- mvtnorm::rmvnorm(1000, mean = mu_tilde, sigma = Sigma_tilde) +yhat_mean_test <- colMeans(B) +plot(yhat_mean_test, y_test, xlab = "predicted", ylab = "actual", main = "Gaussian process") +abline(0,1,lwd=2.5,lty=3,col="red") + 
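+# Note on the commented-out block below: it is a manual check of the kernel computation.
+# It rebuilds the sparse basis matrix W (called leaf_train) from the train-set leaf indices
+# returned by compute_leaf_indices() -- `result` there refers to that returned list, i.e.
+# result_inds above -- and forms leaf_train %*% t(leaf_train), which should match
+# result_kernels$kernel_train up to the 1/num_trees normalization applied inside
+# compute_kernel(). The value of m is assumed to equal the number of trees in the fitted ensemble.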
+# m <- 200 +# n <- length(X_train) +# dummies <- result[["leaf_indices_train"]] +# max_ix <- max(dummies) +# leaf_train <- Matrix::sparseMatrix(i=rep(1:n, m), +# j = dummies+1, +# x = rep(1, n*m), +# dims = c(n, max_ix+1), +# ) +# leaf_train <- as.matrix(leaf_train) +# check <- leaf_train %*% t(leaf_train) diff --git a/vignettes/Ensemble-Kernel.Rmd b/vignettes/Ensemble-Kernel.Rmd new file mode 100644 index 0000000..f1f0550 --- /dev/null +++ b/vignettes/Ensemble-Kernel.Rmd @@ -0,0 +1,198 @@ +--- +title: "Kernel Methods from Tree Ensembles in StochTree" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Kernel Methods from Tree Ensembles in StochTree} + %\VignetteEncoding{UTF-8} + %\VignetteEngine{knitr::rmarkdown} +bibliography: vignettes.bib +editor_options: + markdown: + wrap: 72 +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +# Motivation + +A trained tree ensemble with strong out-of-sample performance admits a natural +notion of "distance" between two samples: shared leaf membership. +We number the leaves in an ensemble from 1 to $s$ (that is, if tree 1 has 3 leaves, +it reserves the numbers 1 - 3, and in turn if tree 2 has 5 leaves, it reserves the numbers 4 - 8 to label its leaves, and so on). For a dataset with $n$ observations, +we construct the matrix $W$ as follows: + +| Initialize $W$ as a matrix of all zeroes with $n$ rows and as many columns as leaves in the ensemble +| Let `s` = 0 +| **FOR** $j$ **IN** $\left\{1,\dots,m\right\}$: +| Let `num_leaves` be the number of leaves in tree $j$ +| **FOR** $i$ **IN** $\left\{1,\dots,n\right\}$: +| Let `k` be the leaf to which tree $j$ maps observation $i$ +| Set element $W_{i,k+s} = 1$ +| Let `s` = `s + num_leaves` + +This sparse matrix $W$ is a matrix representation of the basis predictions of an ensemble +(i.e. integrating out the leaf parameters and just analyzing the leaf indices). +For an ensemble with $m$ trees, we can determine the proportion of trees that +map each observation to the same leaf by computing $W W^T / m$. +This can form the basis for a kernel function used in a Gaussian process +regression, as we demonstrate below. + +To begin, load the `stochtree` package and the `tgp` package, which will serve as a point of reference, along with the `MASS` and `mvtnorm` packages used below. + +```{r setup} +library(stochtree) +library(tgp) +library(MASS) +library(mvtnorm) +``` + +# Demo 1: Univariate Supervised Learning + +We begin with a simulated example from the `tgp` package (@gramacy2010categorical). +This data generating process (DGP) is non-stationary with a single numeric +covariate. We define a training set and test set and evaluate various approaches +to modeling the out-of-sample outcome data. + +## Traditional Gaussian Process + +We can use the `tgp` package to model this data with a classical Gaussian Process.
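+For reference, Gaussian process prediction at new points rests on the standard kriging
+identities. Writing $\Sigma_{11} = k(X_{test}, X_{test})$, $\Sigma_{12} = k(X_{test}, X_{train})$,
+and $\Sigma_{22} = k(X_{train}, X_{train})$ for a generic kernel $k$, the distribution of
+$f(X_{test})$ conditional on $f(X_{train}) = y_{train}$ is Gaussian with
+
+$$\tilde{\mu} = \Sigma_{12} \Sigma_{22}^{-1} y_{train}, \qquad \tilde{\Sigma} = \Sigma_{11} - \Sigma_{12} \Sigma_{22}^{-1} \Sigma_{12}^T.$$
+
+The `mu_tilde` and `Sigma_tilde` calculations in the BART-based sections below apply exactly
+this form to the ensemble kernel, with a generalized inverse standing in for $\Sigma_{22}^{-1}$
+and a leaf-scale factor (`sigma_leaf`) applied to the covariance.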
+ +```{r, results='hide'} +# Generate the data +X_train <- seq(0,20,length=100) +X_test <- seq(0,20,length=99) +y_train <- (sin(pi*X_train/5) + 0.2*cos(4*pi*X_train/5)) * (X_train <= 9.6) +lin_train <- X_train>9.6; +y_train[lin_train] <- -1 + X_train[lin_train]/10 +y_train <- y_train + rnorm(length(y_train), sd=0.1) +y_test <- (sin(pi*X_test/5) + 0.2*cos(4*pi*X_test/5)) * (X_test <= 9.6) +lin_test <- X_test>9.6; +y_test[lin_test] <- -1 + X_test[lin_test]/10 + +# Fit the GP +model_gp <- bgp(X=X_train, Z=y_train, XX=X_test) +plot(model_gp$ZZ.mean, y_test, xlab = "predicted", ylab = "actual", main = "Gaussian process") +abline(0,1,lwd=2.5,lty=3,col="red") +sqrt(mean((model_gp$ZZ.mean - y_test)^2)) +``` + +Assess the RMSE + +```{r} +sqrt(mean((model_gp$ZZ.mean - y_test)^2)) +``` + +## BART-based Gaussian process + +```{r} +# Run BART on the data +num_trees <- 200 +sigma_leaf <- 1/num_trees +bart_model <- bart(X_train=X_train, y_train=y_train, X_test=X_test, num_trees=num_trees) + +# Extract kernels needed for kriging +result_kernels <- computeForestKernels(bart_model=bart_model, X_train=X_train, X_test=X_test) +Sigma_11 <- result_kernels$kernel_test +Sigma_12 <- result_kernels$kernel_test_train +Sigma_22 <- result_kernels$kernel_train +Sigma_22_inv <- ginv(Sigma_22) +Sigma_21 <- t(Sigma_12) + +# Compute mean and covariance for the test set posterior +mu_tilde <- Sigma_12 %*% Sigma_22_inv %*% y_train +Sigma_tilde <- (sigma_leaf)*(Sigma_11 - Sigma_12 %*% Sigma_22_inv %*% Sigma_21) + +# Sample from f(X_test) | X_test, X_train, f(X_train) +gp_samples <- mvtnorm::rmvnorm(1000, mean = mu_tilde, sigma = Sigma_tilde) + +# Compute posterior mean predictions for f(X_test) +yhat_mean_test <- colMeans(gp_samples) +plot(yhat_mean_test, y_test, xlab = "predicted", ylab = "actual", main = "BART Gaussian process") +abline(0,1,lwd=2.5,lty=3,col="red") +``` + +Assess the RMSE + +```{r} +sqrt(mean((yhat_mean_test - y_test)^2)) +``` + +# Demo 2: Multivariate Supervised Learning + +We proceed to the simulated "Friedman" dataset, as implemented in `tgp`. + +## Traditional Gaussian Process + +We can use the `tgp` package to model this data with a classical Gaussian Process. 
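+For reference, the response surface simulated by `friedman.1.data` in the chunk below is, up
+to noise, the well-known Friedman #1 function
+
+$$f(x) = 10 \sin(\pi x_1 x_2) + 20 (x_3 - 0.5)^2 + 10 x_4 + 5 x_5,$$
+
+so only the first five covariates carry signal; the extra uniform covariates appended to the
+design matrix act purely as noise features.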
+ +```{r, results='hide'} +# Generate the data, add many "noise variables" +n <- 100 +friedman.df <- friedman.1.data(n=n) +train_inds <- sort(sample(1:n, floor(0.8*n), replace = F)) +test_inds <- (1:n)[!((1:n) %in% train_inds)] +X <- as.matrix(friedman.df)[,1:10] +X <- cbind(X, matrix(runif(n*10), ncol = 10)) +y <- as.matrix(friedman.df)[,12] + rnorm(n,0,1)*(sd(as.matrix(friedman.df)[,11])/2) +X_train <- X[train_inds,] +X_test <- X[test_inds,] +y_train <- y[train_inds] +y_test <- y[test_inds] + +# Fit the GP +model_gp <- bgp(X=X_train, Z=y_train, XX=X_test) +plot(model_gp$ZZ.mean, y_test, xlab = "predicted", ylab = "actual", main = "Gaussian process") +abline(0,1,lwd=2.5,lty=3,col="red") +``` + +Assess the RMSE + +```{r} +sqrt(mean((model_gp$ZZ.mean - y_test)^2)) +``` + +## BART-based Gaussian process + +```{r} +# Run BART on the data +num_trees <- 200 +sigma_leaf <- 1/num_trees +bart_model <- bart(X_train=X_train, y_train=y_train, X_test=X_test, num_trees=num_trees) + +# Extract kernels needed for kriging +result_kernels <- computeForestKernels(bart_model=bart_model, X_train=X_train, X_test=X_test) +Sigma_11 <- result_kernels$kernel_test +Sigma_12 <- result_kernels$kernel_test_train +Sigma_22 <- result_kernels$kernel_train +Sigma_22_inv <- ginv(Sigma_22) +Sigma_21 <- t(Sigma_12) + +# Compute mean and covariance for the test set posterior +mu_tilde <- Sigma_12 %*% Sigma_22_inv %*% y_train +Sigma_tilde <- (sigma_leaf)*(Sigma_11 - Sigma_12 %*% Sigma_22_inv %*% Sigma_21) + +# Sample from f(X_test) | X_test, X_train, f(X_train) +gp_samples <- mvtnorm::rmvnorm(1000, mean = mu_tilde, sigma = Sigma_tilde) + +# Compute posterior mean predictions for f(X_test) +yhat_mean_test <- colMeans(gp_samples) +plot(yhat_mean_test, y_test, xlab = "predicted", ylab = "actual", main = "BART Gaussian process") +abline(0,1,lwd=2.5,lty=3,col="red") +``` + +Assess the RMSE + +```{r} +sqrt(mean((yhat_mean_test - y_test)^2)) +``` + +While the use case of a BART kernel for classical kriging is perhaps unclear without +more empirical investigation, we will see in a later vignette that the kernel +approach can be very beneficial for causal inference applications. + +# References diff --git a/vignettes/vignettes.bib b/vignettes/vignettes.bib index 31bbb60..2f4abbb 100644 --- a/vignettes/vignettes.bib +++ b/vignettes/vignettes.bib @@ -63,4 +63,33 @@ @inproceedings{krantsevich2023stochastic pages={6120--6131}, year={2023}, organization={PMLR} +} + +@Article{gramacy2010categorical, + title = {Categorical Inputs, Sensitivity Analysis, Optimization and Importance Tempering with {tgp} Version 2, an {R} Package for Treed Gaussian Process Models}, + author = {Robert B. Gramacy and Matthew Taddy}, + journal = {Journal of Statistical Software}, + year = {2010}, + volume = {33}, + number = {6}, + pages = {1--48}, + url = {https://www.jstatsoft.org/v33/i06/}, + doi = {10.18637/jss.v033.i06}, +} + +@book{gramacy2020surrogates, + title = {Surrogates: {G}aussian Process Modeling, Design and \ + Optimization for the Applied Sciences}, + author = {Robert B. Gramacy}, + publisher = {Chapman Hall/CRC}, + address = {Boca Raton, Florida}, + note = {\url{http://bobby.gramacy.com/surrogates/}}, + year = {2020} +} + +@book{scholkopf2002learning, + title={Learning with kernels: support vector machines, regularization, optimization, and beyond}, + author={Sch{\"o}lkopf, Bernhard and Smola, Alexander J}, + year={2002}, + publisher={MIT press} } \ No newline at end of file