diff --git a/DESCRIPTION b/DESCRIPTION
index 997b15f..4c368dc 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -18,7 +18,13 @@ LinkingTo:
cpp11
Suggests:
knitr,
- rmarkdown
+ rmarkdown,
+ Matrix,
+ tgp,
+ MASS,
+ mvtnorm,
+ ggplot2,
+ latex2exp
VignetteBuilder: knitr
Imports:
R6
diff --git a/NAMESPACE b/NAMESPACE
index 4ded939..6ecb479 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -6,8 +6,11 @@ S3method(predict,bartmodel)
S3method(predict,bcf)
export(bart)
export(bcf)
+export(computeForestKernels)
+export(computeForestLeafIndices)
export(createForestContainer)
export(createForestDataset)
+export(createForestKernel)
export(createForestModel)
export(createOutcome)
export(createRNG)
diff --git a/R/cpp11.R b/R/cpp11.R
index e2e5aa2..ea70526 100644
--- a/R/cpp11.R
+++ b/R/cpp11.R
@@ -76,6 +76,34 @@ rfx_dataset_add_weights_cpp <- function(dataset_ptr, weights) {
invisible(.Call(`_stochtree_rfx_dataset_add_weights_cpp`, dataset_ptr, weights))
}
+forest_kernel_cpp <- function() {
+ .Call(`_stochtree_forest_kernel_cpp`)
+}
+
+forest_kernel_compute_leaf_indices_train_cpp <- function(forest_kernel, covariates_train, forest_container, forest_num) {
+ invisible(.Call(`_stochtree_forest_kernel_compute_leaf_indices_train_cpp`, forest_kernel, covariates_train, forest_container, forest_num))
+}
+
+forest_kernel_compute_leaf_indices_train_test_cpp <- function(forest_kernel, covariates_train, covariates_test, forest_container, forest_num) {
+ invisible(.Call(`_stochtree_forest_kernel_compute_leaf_indices_train_test_cpp`, forest_kernel, covariates_train, covariates_test, forest_container, forest_num))
+}
+
+forest_kernel_get_train_leaf_indices_cpp <- function(forest_kernel) {
+ .Call(`_stochtree_forest_kernel_get_train_leaf_indices_cpp`, forest_kernel)
+}
+
+forest_kernel_get_test_leaf_indices_cpp <- function(forest_kernel) {
+ .Call(`_stochtree_forest_kernel_get_test_leaf_indices_cpp`, forest_kernel)
+}
+
+forest_kernel_compute_kernel_train_cpp <- function(forest_kernel, covariates_train, forest_container, forest_num) {
+ .Call(`_stochtree_forest_kernel_compute_kernel_train_cpp`, forest_kernel, covariates_train, forest_container, forest_num)
+}
+
+forest_kernel_compute_kernel_train_test_cpp <- function(forest_kernel, covariates_train, covariates_test, forest_container, forest_num) {
+ .Call(`_stochtree_forest_kernel_compute_kernel_train_test_cpp`, forest_kernel, covariates_train, covariates_test, forest_container, forest_num)
+}
+
predict_forest_cpp <- function(forest_samples, dataset) {
.Call(`_stochtree_predict_forest_cpp`, forest_samples, dataset)
}
@@ -200,6 +228,10 @@ num_samples_forest_container_cpp <- function(forest_samples) {
.Call(`_stochtree_num_samples_forest_container_cpp`, forest_samples)
}
+num_trees_forest_container_cpp <- function(forest_samples) {
+ .Call(`_stochtree_num_trees_forest_container_cpp`, forest_samples)
+}
+
json_save_forest_container_cpp <- function(forest_samples, json_filename) {
invisible(.Call(`_stochtree_json_save_forest_container_cpp`, forest_samples, json_filename))
}
diff --git a/R/forest.R b/R/forest.R
index 55c9698..e7eb385 100644
--- a/R/forest.R
+++ b/R/forest.R
@@ -141,6 +141,13 @@ ForestSamples <- R6::R6Class(
return(num_samples_forest_container_cpp(self$forest_container_ptr))
},
+ #' @description
+ #' Return number of trees in each ensemble of a `ForestContainer` object
+ #' @return Tree count
+ num_trees = function() {
+ return(num_trees_forest_container_cpp(self$forest_container_ptr))
+ },
+
#' @description
#' Return output dimension of trees in a `ForestContainer` object
#' @return Leaf node parameter size
diff --git a/R/kernel.R b/R/kernel.R
new file mode 100644
index 0000000..5dff0d5
--- /dev/null
+++ b/R/kernel.R
@@ -0,0 +1,195 @@
+#' Class that provides functionality for statistical kernel definition and
+#' computation based on shared leaf membership of observations in a tree ensemble.
+#'
+#' @description
+#' Computes leaf membership internally as a sparse matrix and also calculates a
+#' (dense) kernel based on the sparse matrix all in C++.
+
+ForestKernel <- R6::R6Class(
+ classname = "ForestKernel",
+ cloneable = FALSE,
+ public = list(
+
+ #' @field forest_kernel_ptr External pointer to a C++ StochTree::ForestKernel class
+ forest_kernel_ptr = NULL,
+
+ #' @description
+ #' Create a new ForestKernel object.
+ #' @return A new `ForestKernel` object.
+ initialize = function() {
+ # Initialize forest kernel and return external pointer to the class
+ self$forest_kernel_ptr <- forest_kernel_cpp()
+ },
+
+ #' @description
+ #' Compute the leaf indices of each tree in the ensemble for every observation in a dataset.
+ #' Stores the result internally, which can be extracted from the class via a call to `get_leaf_indices`.
+ #' @param covariates_train Matrix of training set covariates at which to compute leaf indices
+ #' @param covariates_test (Optional) Matrix of test set covariates at which to compute leaf indices
+ #' @param forest_container Object of type `ForestSamples`
+ #' @param forest_num Index of the forest in forest_container to be assessed
+ #' @return List of vectors. If `covariates_test = NULL` the list has one element (train set leaf indices), and
+ #' otherwise the list has two elements (train and test set leaf indices).
+ compute_leaf_indices = function(covariates_train, covariates_test = NULL, forest_container, forest_num) {
+ # Convert to matrix format if not provided as such
+ if ((is.null(dim(covariates_train))) && (!is.null(covariates_train))) {
+ covariates_train <- as.matrix(covariates_train)
+ }
+ if ((is.null(dim(covariates_test))) && (!is.null(covariates_test))) {
+ covariates_test <- as.matrix(covariates_test)
+ }
+
+ # Compute the leaf indices
+ result = list()
+ if (is.null(covariates_test)) {
+ forest_kernel_compute_leaf_indices_train_cpp(
+ self$forest_kernel_ptr, covariates_train,
+ forest_container$forest_container_ptr, forest_num
+ )
+ result[["leaf_indices_train"]] = forest_kernel_get_train_leaf_indices_cpp(self$forest_kernel_ptr)
+ } else {
+ forest_kernel_compute_leaf_indices_train_test_cpp(
+ self$forest_kernel_ptr, covariates_train, covariates_test,
+ forest_container$forest_container_ptr, forest_num
+ )
+ result[["leaf_indices_train"]] = forest_kernel_get_train_leaf_indices_cpp(self$forest_kernel_ptr)
+ result[["leaf_indices_test"]] = forest_kernel_get_test_leaf_indices_cpp(self$forest_kernel_ptr)
+ }
+ return(result)
+ },
+
+ #' @description
+ #' Compute the kernel implied by a tree ensemble. This function calls `compute_leaf_indices`,
+ #' so it is not necessary to call both. `compute_leaf_indices` is exposed at the class level
+ #' to allow for extracting the vector of leaf indices for an ensemble directly in R.
+ #' @param covariates_train Matrix of training set covariates at which to assess ensemble kernel
+ #' @param covariates_test (Optional) Matrix of test set covariates at which to assess ensemble kernel
+ #' @param forest_container Object of type `ForestSamples`
+ #' @param forest_num Index of the forest in forest_container to be assessed
+ #' @return List of matrices. If `covariates_test = NULL`, the list contains
+ #' one `n_train` x `n_train` matrix, where `n_train = nrow(covariates_train)`.
+ #' This matrix is the kernel defined by `W_train %*% t(W_train)` where `W_train`
+ #' is a matrix with `n_train` rows and as many columns as there are total leaves in an ensemble.
+ #' If `covariates_test` is not `NULL`, the list contains two more matrices defined by
+ #' `W_test %*% t(W_train)` and `W_test %*% t(W_test)`.
+ compute_kernel = function(covariates_train, covariates_test = NULL, forest_container, forest_num) {
+ # Convert to matrix format if not provided as such
+ if ((is.null(dim(covariates_train))) && (!is.null(covariates_train))) {
+ covariates_train <- as.matrix(covariates_train)
+ }
+ if ((is.null(dim(covariates_test))) && (!is.null(covariates_test))) {
+ covariates_test <- as.matrix(covariates_test)
+ }
+
+ # Compute the kernels
+ num_trees <- forest_container$num_trees()
+ n_train = nrow(covariates_train)
+ kernel_train = matrix(0., nrow = n_train, ncol = n_train)
+ inverse_kernel_train = matrix(0., nrow = n_train, ncol = n_train)
+ if (is.null(covariates_test)) {
+ result = forest_kernel_compute_kernel_train_cpp(
+ self$forest_kernel_ptr, covariates_train,
+ forest_container$forest_container_ptr, forest_num
+ )
+ names(result) <- c("kernel_train")
+ } else {
+ n_test = nrow(covariates_test)
+ kernel_test_train = matrix(0., nrow = n_test, ncol = n_train)
+ kernel_test = matrix(0., nrow = n_test, ncol = n_test)
+ result = forest_kernel_compute_kernel_train_test_cpp(
+ self$forest_kernel_ptr, covariates_train, covariates_test,
+ forest_container$forest_container_ptr, forest_num
+ )
+ names(result) <- c("kernel_train", "kernel_test_train", "kernel_test")
+ }
+
+ # Divide each matrix by num_trees
+ for (i in 1:length(result)) {result[[i]] <- result[[i]] / num_trees}
+
+ return(result)
+ }
+ )
+)
+
+#' Create a `ForestKernel` object
+#'
+#' @return `ForestKernel` object
+#' @export
+createForestKernel <- function() {
+ return(invisible((
+ ForestKernel$new()
+ )))
+}
+
+#' Compute a kernel from a tree ensemble, defined by the fraction
+#' of trees of an ensemble in which two observations fall into the
+#' same leaf.
+#'
+#' @param bart_model Object of type `bartmodel` corresponding to a BART model with at least one sample
+#' @param X_train Matrix of "training" data. In a traditional Gaussian process kriging context, this
+#' corresponds to the observations for which outcomes are observed.
+#' @param X_test (Optional) Matrix of "test" data. In a traditional Gaussian process kriging context, this
+#' corresponds to the observations for which outcomes are unobserved and must be estimated
+#' based on the kernels k(X_test,X_test), k(X_test,X_train), and k(X_train,X_train). If not provided,
+#' this function will only compute k(X_train, X_train).
+#' @param forest_num (Option) Index of the forest sample to use for kernel computation. If not provided,
+#' this function will use the last forest.
+#' @return List of kernel matrices. If `X_test = NULL`, the list contains
+#' one `n_train` x `n_train` matrix, where `n_train = nrow(X_train)`.
+#' This matrix is the kernel defined by `W_train %*% t(W_train)` where `W_train`
+#' is a matrix with `n_train` rows and as many columns as there are total leaves in an ensemble.
+#' If `X_test` is not `NULL`, the list contains two more matrices defined by
+#' `W_test %*% t(W_train)` and `W_test %*% t(W_test)`.
+#' @export
+computeForestKernels <- function(bart_model, X_train, X_test=NULL, forest_num=NULL) {
+ stopifnot(class(bart_model)=="bartmodel")
+ forest_kernel <- createForestKernel()
+ num_samples <- bart_model$model_params$num_samples
+ stopifnot(forest_num <= num_samples)
+ sample_index <- ifelse(is.null(forest_num), num_samples-1, forest_num-1)
+ return(forest_kernel$compute_kernel(
+ covariates_train = X_train, covariates_test = X_test,
+ forest_container = bart_model$forests, forest_num = sample_index
+ ))
+}
+
+#' Compute and return a vector representation of a forest's leaf predictions for
+#' every observation in a dataset.
+#' The vector has a "column-major" format that can be easily re-represented as
+#' as a CSC sparse matrix: elements are organized so that the first `n` elements
+#' correspond to leaf predictions for all `n` observations in a dataset for the
+#' first tree in an ensemble, the next `n` elements correspond to predictions for
+#' the second tree and so on. The "data" for each element corresponds to a uniquely
+#' mapped column index that corresponds to a single leaf of a single tree (i.e.
+#' if tree 1 has 3 leaves, its column indices range from 0 to 2, and then tree 2's
+#' leaf indices begin at 3, etc...).
+#' Users may pass a single dataset (which we refer to here as a "training set")
+#' or two datasets (which we refer to as "training and test sets"). This verbiage
+#' hints that one potential use-case for a matrix of leaf indices is to define a
+#' ensemble-based kernel for kriging.
+#'
+#' @param bart_model Object of type `bartmodel` corresponding to a BART model with at least one sample
+#' @param X_train Matrix of "training" data. In a traditional Gaussian process kriging context, this
+#' corresponds to the observations for which outcomes are observed.
+#' @param X_test (Optional) Matrix of "test" data. In a traditional Gaussian process kriging context, this
+#' corresponds to the observations for which outcomes are unobserved and must be estimated
+#' based on the kernels k(X_test,X_test), k(X_test,X_train), and k(X_train,X_train). If not provided,
+#' this function will only compute k(X_train, X_train).
+#' @param forest_num (Option) Index of the forest sample to use for kernel computation. If not provided,
+#' this function will use the last forest.
+#' @return List of vectors. If `X_test = NULL`, the list contains
+#' one vector of length `n_train * num_trees`, where `n_train = nrow(X_train)`
+#' and `num_trees` is the number of trees in `bart_model`. If `X_test` is not `NULL`,
+#' the list contains another vector of length `n_test * num_trees`.
+#' @export
+computeForestLeafIndices <- function(bart_model, X_train, X_test=NULL, forest_num=NULL) {
+ stopifnot(class(bart_model)=="bartmodel")
+ forest_kernel <- createForestKernel()
+ num_samples <- bart_model$model_params$num_samples
+ stopifnot(forest_num <= num_samples)
+ sample_index <- ifelse(is.null(forest_num), num_samples-1, forest_num-1)
+ return(forest_kernel$compute_leaf_indices(
+ covariates_train = X_train, covariates_test = X_test,
+ forest_container = bart_model$forests, forest_num = sample_index
+ ))
+}
diff --git a/man/ForestKernel.Rd b/man/ForestKernel.Rd
new file mode 100644
index 0000000..a721b36
--- /dev/null
+++ b/man/ForestKernel.Rd
@@ -0,0 +1,110 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/kernel.R
+\name{ForestKernel}
+\alias{ForestKernel}
+\title{Class that provides functionality for statistical kernel definition and
+computation based on shared leaf membership of observations in a tree ensemble.}
+\description{
+Computes leaf membership internally as a sparse matrix and also calculates a
+(dense) kernel based on the sparse matrix all in C++.
+}
+\section{Public fields}{
+\if{html}{\out{
}}
+\describe{
+\item{\code{forest_kernel_ptr}}{External pointer to a C++ StochTree::ForestKernel class}
+}
+\if{html}{\out{
}}
+}
+\section{Methods}{
+\subsection{Public methods}{
+\itemize{
+\item \href{#method-ForestKernel-new}{\code{ForestKernel$new()}}
+\item \href{#method-ForestKernel-compute_leaf_indices}{\code{ForestKernel$compute_leaf_indices()}}
+\item \href{#method-ForestKernel-compute_kernel}{\code{ForestKernel$compute_kernel()}}
+}
+}
+\if{html}{\out{
}}
+\if{html}{\out{}}
+\if{latex}{\out{\hypertarget{method-ForestKernel-new}{}}}
+\subsection{Method \code{new()}}{
+Create a new ForestKernel object.
+\subsection{Usage}{
+\if{html}{\out{}}\preformatted{ForestKernel$new()}\if{html}{\out{
}}
+}
+
+\subsection{Returns}{
+A new \code{ForestKernel} object.
+}
+}
+\if{html}{\out{
}}
+\if{html}{\out{}}
+\if{latex}{\out{\hypertarget{method-ForestKernel-compute_leaf_indices}{}}}
+\subsection{Method \code{compute_leaf_indices()}}{
+Compute the leaf indices of each tree in the ensemble for every observation in a dataset.
+Stores the result internally, which can be extracted from the class via a call to \code{get_leaf_indices}.
+\subsection{Usage}{
+\if{html}{\out{}}\preformatted{ForestKernel$compute_leaf_indices(
+ covariates_train,
+ covariates_test = NULL,
+ forest_container,
+ forest_num
+)}\if{html}{\out{
}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{}}
+\describe{
+\item{\code{covariates_train}}{Matrix of training set covariates at which to compute leaf indices}
+
+\item{\code{covariates_test}}{(Optional) Matrix of test set covariates at which to compute leaf indices}
+
+\item{\code{forest_container}}{Object of type \code{ForestSamples}}
+
+\item{\code{forest_num}}{Index of the forest in forest_container to be assessed}
+}
+\if{html}{\out{
}}
+}
+\subsection{Returns}{
+List of vectors. If \code{covariates_test = NULL} the list has one element (train set leaf indices), and
+otherwise the list has two elements (train and test set leaf indices).
+}
+}
+\if{html}{\out{
}}
+\if{html}{\out{}}
+\if{latex}{\out{\hypertarget{method-ForestKernel-compute_kernel}{}}}
+\subsection{Method \code{compute_kernel()}}{
+Compute the kernel implied by a tree ensemble. This function calls \code{compute_leaf_indices},
+so it is not necessary to call both. \code{compute_leaf_indices} is exposed at the class level
+to allow for extracting the vector of leaf indices for an ensemble directly in R.
+\subsection{Usage}{
+\if{html}{\out{}}\preformatted{ForestKernel$compute_kernel(
+ covariates_train,
+ covariates_test = NULL,
+ forest_container,
+ forest_num
+)}\if{html}{\out{
}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{}}
+\describe{
+\item{\code{covariates_train}}{Matrix of training set covariates at which to assess ensemble kernel}
+
+\item{\code{covariates_test}}{(Optional) Matrix of test set covariates at which to assess ensemble kernel}
+
+\item{\code{forest_container}}{Object of type \code{ForestSamples}}
+
+\item{\code{forest_num}}{Index of the forest in forest_container to be assessed}
+}
+\if{html}{\out{
}}
+}
+\subsection{Returns}{
+List of matrices. If \code{covariates_test = NULL}, the list contains
+one \code{n_train} x \code{n_train} matrix, where \code{n_train = nrow(covariates_train)}.
+This matrix is the kernel defined by \code{W_train \%*\% t(W_train)} where \code{W_train}
+is a matrix with \code{n_train} rows and as many columns as there are total leaves in an ensemble.
+If \code{covariates_test} is not \code{NULL}, the list contains two more matrices defined by
+\code{W_test \%*\% t(W_train)} and \code{W_test \%*\% t(W_test)}.
+}
+}
+}
diff --git a/man/ForestSamples.Rd b/man/ForestSamples.Rd
index c351821..a3cb0e2 100644
--- a/man/ForestSamples.Rd
+++ b/man/ForestSamples.Rd
@@ -25,6 +25,7 @@ Wrapper around a C++ container of tree ensembles
\item \href{#method-ForestSamples-save_json}{\code{ForestSamples$save_json()}}
\item \href{#method-ForestSamples-load_json}{\code{ForestSamples$load_json()}}
\item \href{#method-ForestSamples-num_samples}{\code{ForestSamples$num_samples()}}
+\item \href{#method-ForestSamples-num_trees}{\code{ForestSamples$num_trees()}}
\item \href{#method-ForestSamples-output_dimension}{\code{ForestSamples$output_dimension()}}
}
}
@@ -226,6 +227,19 @@ Sample count
}
}
\if{html}{\out{
}}
+\if{html}{\out{}}
+\if{latex}{\out{\hypertarget{method-ForestSamples-num_trees}{}}}
+\subsection{Method \code{num_trees()}}{
+Return number of trees in each ensemble of a \code{ForestContainer} object
+\subsection{Usage}{
+\if{html}{\out{}}\preformatted{ForestSamples$num_trees()}\if{html}{\out{
}}
+}
+
+\subsection{Returns}{
+Tree count
+}
+}
+\if{html}{\out{
}}
\if{html}{\out{}}
\if{latex}{\out{\hypertarget{method-ForestSamples-output_dimension}{}}}
\subsection{Method \code{output_dimension()}}{
diff --git a/man/computeForestKernels.Rd b/man/computeForestKernels.Rd
new file mode 100644
index 0000000..7421019
--- /dev/null
+++ b/man/computeForestKernels.Rd
@@ -0,0 +1,37 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/kernel.R
+\name{computeForestKernels}
+\alias{computeForestKernels}
+\title{Compute a kernel from a tree ensemble, defined by the fraction
+of trees of an ensemble in which two observations fall into the
+same leaf.}
+\usage{
+computeForestKernels(bart_model, X_train, X_test = NULL, forest_num = NULL)
+}
+\arguments{
+\item{bart_model}{Object of type \code{bartmodel} corresponding to a BART model with at least one sample}
+
+\item{X_train}{Matrix of "training" data. In a traditional Gaussian process kriging context, this
+corresponds to the observations for which outcomes are observed.}
+
+\item{X_test}{(Optional) Matrix of "test" data. In a traditional Gaussian process kriging context, this
+corresponds to the observations for which outcomes are unobserved and must be estimated
+based on the kernels k(X_test,X_test), k(X_test,X_train), and k(X_train,X_train). If not provided,
+this function will only compute k(X_train, X_train).}
+
+\item{forest_num}{(Option) Index of the forest sample to use for kernel computation. If not provided,
+this function will use the last forest.}
+}
+\value{
+List of kernel matrices. If \code{X_test = NULL}, the list contains
+one \code{n_train} x \code{n_train} matrix, where \code{n_train = nrow(X_train)}.
+This matrix is the kernel defined by \code{W_train \%*\% t(W_train)} where \code{W_train}
+is a matrix with \code{n_train} rows and as many columns as there are total leaves in an ensemble.
+If \code{X_test} is not \code{NULL}, the list contains two more matrices defined by
+\code{W_test \%*\% t(W_train)} and \code{W_test \%*\% t(W_test)}.
+}
+\description{
+Compute a kernel from a tree ensemble, defined by the fraction
+of trees of an ensemble in which two observations fall into the
+same leaf.
+}
diff --git a/man/computeForestLeafIndices.Rd b/man/computeForestLeafIndices.Rd
new file mode 100644
index 0000000..0f5dd89
--- /dev/null
+++ b/man/computeForestLeafIndices.Rd
@@ -0,0 +1,57 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/kernel.R
+\name{computeForestLeafIndices}
+\alias{computeForestLeafIndices}
+\title{Compute and return a vector representation of a forest's leaf predictions for
+every observation in a dataset.
+The vector has a "column-major" format that can be easily re-represented as
+as a CSC sparse matrix: elements are organized so that the first \code{n} elements
+correspond to leaf predictions for all \code{n} observations in a dataset for the
+first tree in an ensemble, the next \code{n} elements correspond to predictions for
+the second tree and so on. The "data" for each element corresponds to a uniquely
+mapped column index that corresponds to a single leaf of a single tree (i.e.
+if tree 1 has 3 leaves, its column indices range from 0 to 2, and then tree 2's
+leaf indices begin at 3, etc...).
+Users may pass a single dataset (which we refer to here as a "training set")
+or two datasets (which we refer to as "training and test sets"). This verbiage
+hints that one potential use-case for a matrix of leaf indices is to define a
+ensemble-based kernel for kriging.}
+\usage{
+computeForestLeafIndices(bart_model, X_train, X_test = NULL, forest_num = NULL)
+}
+\arguments{
+\item{bart_model}{Object of type \code{bartmodel} corresponding to a BART model with at least one sample}
+
+\item{X_train}{Matrix of "training" data. In a traditional Gaussian process kriging context, this
+corresponds to the observations for which outcomes are observed.}
+
+\item{X_test}{(Optional) Matrix of "test" data. In a traditional Gaussian process kriging context, this
+corresponds to the observations for which outcomes are unobserved and must be estimated
+based on the kernels k(X_test,X_test), k(X_test,X_train), and k(X_train,X_train). If not provided,
+this function will only compute k(X_train, X_train).}
+
+\item{forest_num}{(Option) Index of the forest sample to use for kernel computation. If not provided,
+this function will use the last forest.}
+}
+\value{
+List of vectors. If \code{X_test = NULL}, the list contains
+one vector of length \code{n_train * num_trees}, where \code{n_train = nrow(X_train)}
+and \code{num_trees} is the number of trees in \code{bart_model}. If \code{X_test} is not \code{NULL},
+the list contains another vector of length \code{n_test * num_trees}.
+}
+\description{
+Compute and return a vector representation of a forest's leaf predictions for
+every observation in a dataset.
+The vector has a "column-major" format that can be easily re-represented as
+as a CSC sparse matrix: elements are organized so that the first \code{n} elements
+correspond to leaf predictions for all \code{n} observations in a dataset for the
+first tree in an ensemble, the next \code{n} elements correspond to predictions for
+the second tree and so on. The "data" for each element corresponds to a uniquely
+mapped column index that corresponds to a single leaf of a single tree (i.e.
+if tree 1 has 3 leaves, its column indices range from 0 to 2, and then tree 2's
+leaf indices begin at 3, etc...).
+Users may pass a single dataset (which we refer to here as a "training set")
+or two datasets (which we refer to as "training and test sets"). This verbiage
+hints that one potential use-case for a matrix of leaf indices is to define a
+ensemble-based kernel for kriging.
+}
diff --git a/man/createForestKernel.Rd b/man/createForestKernel.Rd
new file mode 100644
index 0000000..36cf73c
--- /dev/null
+++ b/man/createForestKernel.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/kernel.R
+\name{createForestKernel}
+\alias{createForestKernel}
+\title{Create a \code{ForestKernel} object}
+\usage{
+createForestKernel()
+}
+\value{
+\code{ForestKernel} object
+}
+\description{
+Create a \code{ForestKernel} object
+}
diff --git a/src/Makevars b/src/Makevars
index fd1ae1a..b8b3131 100644
--- a/src/Makevars
+++ b/src/Makevars
@@ -5,6 +5,7 @@ PKG_CPPFLAGS= -I$(CPP_PKGROOT)/include -I$(CPP_PKGROOT)/dependencies/boost_math/
OBJECTS = \
data.o \
+ kernel.o \
predictor.o \
random_effects.o \
sampler.o \
diff --git a/src/cpp11.cpp b/src/cpp11.cpp
index 2608a6f..fa9dcab 100644
--- a/src/cpp11.cpp
+++ b/src/cpp11.cpp
@@ -145,6 +145,57 @@ extern "C" SEXP _stochtree_rfx_dataset_add_weights_cpp(SEXP dataset_ptr, SEXP we
return R_NilValue;
END_CPP11
}
+// kernel.cpp
+cpp11::external_pointer forest_kernel_cpp();
+extern "C" SEXP _stochtree_forest_kernel_cpp() {
+ BEGIN_CPP11
+ return cpp11::as_sexp(forest_kernel_cpp());
+ END_CPP11
+}
+// kernel.cpp
+void forest_kernel_compute_leaf_indices_train_cpp(cpp11::external_pointer forest_kernel, cpp11::doubles_matrix<> covariates_train, cpp11::external_pointer forest_container, int forest_num);
+extern "C" SEXP _stochtree_forest_kernel_compute_leaf_indices_train_cpp(SEXP forest_kernel, SEXP covariates_train, SEXP forest_container, SEXP forest_num) {
+ BEGIN_CPP11
+ forest_kernel_compute_leaf_indices_train_cpp(cpp11::as_cpp>>(forest_kernel), cpp11::as_cpp>>(covariates_train), cpp11::as_cpp>>(forest_container), cpp11::as_cpp>(forest_num));
+ return R_NilValue;
+ END_CPP11
+}
+// kernel.cpp
+void forest_kernel_compute_leaf_indices_train_test_cpp(cpp11::external_pointer forest_kernel, cpp11::doubles_matrix<> covariates_train, cpp11::doubles_matrix<> covariates_test, cpp11::external_pointer forest_container, int forest_num);
+extern "C" SEXP _stochtree_forest_kernel_compute_leaf_indices_train_test_cpp(SEXP forest_kernel, SEXP covariates_train, SEXP covariates_test, SEXP forest_container, SEXP forest_num) {
+ BEGIN_CPP11
+ forest_kernel_compute_leaf_indices_train_test_cpp(cpp11::as_cpp>>(forest_kernel), cpp11::as_cpp>>(covariates_train), cpp11::as_cpp>>(covariates_test), cpp11::as_cpp>>(forest_container), cpp11::as_cpp>(forest_num));
+ return R_NilValue;
+ END_CPP11
+}
+// kernel.cpp
+cpp11::writable::integers forest_kernel_get_train_leaf_indices_cpp(cpp11::external_pointer forest_kernel);
+extern "C" SEXP _stochtree_forest_kernel_get_train_leaf_indices_cpp(SEXP forest_kernel) {
+ BEGIN_CPP11
+ return cpp11::as_sexp(forest_kernel_get_train_leaf_indices_cpp(cpp11::as_cpp>>(forest_kernel)));
+ END_CPP11
+}
+// kernel.cpp
+cpp11::writable::integers forest_kernel_get_test_leaf_indices_cpp(cpp11::external_pointer forest_kernel);
+extern "C" SEXP _stochtree_forest_kernel_get_test_leaf_indices_cpp(SEXP forest_kernel) {
+ BEGIN_CPP11
+ return cpp11::as_sexp(forest_kernel_get_test_leaf_indices_cpp(cpp11::as_cpp>>(forest_kernel)));
+ END_CPP11
+}
+// kernel.cpp
+cpp11::list forest_kernel_compute_kernel_train_cpp(cpp11::external_pointer forest_kernel, cpp11::doubles_matrix<> covariates_train, cpp11::external_pointer forest_container, int forest_num);
+extern "C" SEXP _stochtree_forest_kernel_compute_kernel_train_cpp(SEXP forest_kernel, SEXP covariates_train, SEXP forest_container, SEXP forest_num) {
+ BEGIN_CPP11
+ return cpp11::as_sexp(forest_kernel_compute_kernel_train_cpp(cpp11::as_cpp>>(forest_kernel), cpp11::as_cpp>>(covariates_train), cpp11::as_cpp>>(forest_container), cpp11::as_cpp>(forest_num)));
+ END_CPP11
+}
+// kernel.cpp
+cpp11::list forest_kernel_compute_kernel_train_test_cpp(cpp11::external_pointer forest_kernel, cpp11::doubles_matrix<> covariates_train, cpp11::doubles_matrix<> covariates_test, cpp11::external_pointer forest_container, int forest_num);
+extern "C" SEXP _stochtree_forest_kernel_compute_kernel_train_test_cpp(SEXP forest_kernel, SEXP covariates_train, SEXP covariates_test, SEXP forest_container, SEXP forest_num) {
+ BEGIN_CPP11
+ return cpp11::as_sexp(forest_kernel_compute_kernel_train_test_cpp(cpp11::as_cpp>>(forest_kernel), cpp11::as_cpp>>(covariates_train), cpp11::as_cpp>>(covariates_test), cpp11::as_cpp>>(forest_container), cpp11::as_cpp>(forest_num)));
+ END_CPP11
+}
// predictor.cpp
cpp11::writable::doubles_matrix<> predict_forest_cpp(cpp11::external_pointer forest_samples, cpp11::external_pointer dataset);
extern "C" SEXP _stochtree_predict_forest_cpp(SEXP forest_samples, SEXP dataset) {
@@ -372,6 +423,13 @@ extern "C" SEXP _stochtree_num_samples_forest_container_cpp(SEXP forest_samples)
END_CPP11
}
// sampler.cpp
+int num_trees_forest_container_cpp(cpp11::external_pointer forest_samples);
+extern "C" SEXP _stochtree_num_trees_forest_container_cpp(SEXP forest_samples) {
+ BEGIN_CPP11
+ return cpp11::as_sexp(num_trees_forest_container_cpp(cpp11::as_cpp>>(forest_samples)));
+ END_CPP11
+}
+// sampler.cpp
void json_save_forest_container_cpp(cpp11::external_pointer forest_samples, std::string json_filename);
extern "C" SEXP _stochtree_json_save_forest_container_cpp(SEXP forest_samples, SEXP json_filename) {
BEGIN_CPP11
@@ -457,67 +515,75 @@ extern "C" SEXP _stochtree_forest_tracker_cpp(SEXP data, SEXP feature_types, SEX
extern "C" {
static const R_CallMethodDef CallEntries[] = {
- {"_stochtree_add_sample_forest_container_cpp", (DL_FUNC) &_stochtree_add_sample_forest_container_cpp, 1},
- {"_stochtree_all_roots_forest_container_cpp", (DL_FUNC) &_stochtree_all_roots_forest_container_cpp, 2},
- {"_stochtree_create_column_vector_cpp", (DL_FUNC) &_stochtree_create_column_vector_cpp, 1},
- {"_stochtree_create_forest_dataset_cpp", (DL_FUNC) &_stochtree_create_forest_dataset_cpp, 0},
- {"_stochtree_create_rfx_dataset_cpp", (DL_FUNC) &_stochtree_create_rfx_dataset_cpp, 0},
- {"_stochtree_dataset_has_basis_cpp", (DL_FUNC) &_stochtree_dataset_has_basis_cpp, 1},
- {"_stochtree_dataset_has_variance_weights_cpp", (DL_FUNC) &_stochtree_dataset_has_variance_weights_cpp, 1},
- {"_stochtree_dataset_num_basis_cpp", (DL_FUNC) &_stochtree_dataset_num_basis_cpp, 1},
- {"_stochtree_dataset_num_covariates_cpp", (DL_FUNC) &_stochtree_dataset_num_covariates_cpp, 1},
- {"_stochtree_dataset_num_rows_cpp", (DL_FUNC) &_stochtree_dataset_num_rows_cpp, 1},
- {"_stochtree_forest_container_cpp", (DL_FUNC) &_stochtree_forest_container_cpp, 3},
- {"_stochtree_forest_dataset_add_basis_cpp", (DL_FUNC) &_stochtree_forest_dataset_add_basis_cpp, 2},
- {"_stochtree_forest_dataset_add_covariates_cpp", (DL_FUNC) &_stochtree_forest_dataset_add_covariates_cpp, 2},
- {"_stochtree_forest_dataset_add_weights_cpp", (DL_FUNC) &_stochtree_forest_dataset_add_weights_cpp, 2},
- {"_stochtree_forest_dataset_update_basis_cpp", (DL_FUNC) &_stochtree_forest_dataset_update_basis_cpp, 2},
- {"_stochtree_forest_tracker_cpp", (DL_FUNC) &_stochtree_forest_tracker_cpp, 4},
- {"_stochtree_is_leaf_constant_forest_container_cpp", (DL_FUNC) &_stochtree_is_leaf_constant_forest_container_cpp, 1},
- {"_stochtree_json_load_forest_container_cpp", (DL_FUNC) &_stochtree_json_load_forest_container_cpp, 2},
- {"_stochtree_json_save_forest_container_cpp", (DL_FUNC) &_stochtree_json_save_forest_container_cpp, 2},
- {"_stochtree_num_samples_forest_container_cpp", (DL_FUNC) &_stochtree_num_samples_forest_container_cpp, 1},
- {"_stochtree_output_dimension_forest_container_cpp", (DL_FUNC) &_stochtree_output_dimension_forest_container_cpp, 1},
- {"_stochtree_predict_forest_cpp", (DL_FUNC) &_stochtree_predict_forest_cpp, 2},
- {"_stochtree_predict_forest_raw_cpp", (DL_FUNC) &_stochtree_predict_forest_raw_cpp, 2},
- {"_stochtree_predict_forest_raw_single_forest_cpp", (DL_FUNC) &_stochtree_predict_forest_raw_single_forest_cpp, 3},
- {"_stochtree_rfx_container_cpp", (DL_FUNC) &_stochtree_rfx_container_cpp, 2},
- {"_stochtree_rfx_container_get_alpha_cpp", (DL_FUNC) &_stochtree_rfx_container_get_alpha_cpp, 1},
- {"_stochtree_rfx_container_get_beta_cpp", (DL_FUNC) &_stochtree_rfx_container_get_beta_cpp, 1},
- {"_stochtree_rfx_container_get_sigma_cpp", (DL_FUNC) &_stochtree_rfx_container_get_sigma_cpp, 1},
- {"_stochtree_rfx_container_get_xi_cpp", (DL_FUNC) &_stochtree_rfx_container_get_xi_cpp, 1},
- {"_stochtree_rfx_container_num_components_cpp", (DL_FUNC) &_stochtree_rfx_container_num_components_cpp, 1},
- {"_stochtree_rfx_container_num_groups_cpp", (DL_FUNC) &_stochtree_rfx_container_num_groups_cpp, 1},
- {"_stochtree_rfx_container_num_samples_cpp", (DL_FUNC) &_stochtree_rfx_container_num_samples_cpp, 1},
- {"_stochtree_rfx_container_predict_cpp", (DL_FUNC) &_stochtree_rfx_container_predict_cpp, 3},
- {"_stochtree_rfx_dataset_add_basis_cpp", (DL_FUNC) &_stochtree_rfx_dataset_add_basis_cpp, 2},
- {"_stochtree_rfx_dataset_add_group_labels_cpp", (DL_FUNC) &_stochtree_rfx_dataset_add_group_labels_cpp, 2},
- {"_stochtree_rfx_dataset_add_weights_cpp", (DL_FUNC) &_stochtree_rfx_dataset_add_weights_cpp, 2},
- {"_stochtree_rfx_dataset_has_basis_cpp", (DL_FUNC) &_stochtree_rfx_dataset_has_basis_cpp, 1},
- {"_stochtree_rfx_dataset_has_group_labels_cpp", (DL_FUNC) &_stochtree_rfx_dataset_has_group_labels_cpp, 1},
- {"_stochtree_rfx_dataset_has_variance_weights_cpp", (DL_FUNC) &_stochtree_rfx_dataset_has_variance_weights_cpp, 1},
- {"_stochtree_rfx_dataset_num_rows_cpp", (DL_FUNC) &_stochtree_rfx_dataset_num_rows_cpp, 1},
- {"_stochtree_rfx_label_mapper_cpp", (DL_FUNC) &_stochtree_rfx_label_mapper_cpp, 1},
- {"_stochtree_rfx_label_mapper_to_list_cpp", (DL_FUNC) &_stochtree_rfx_label_mapper_to_list_cpp, 1},
- {"_stochtree_rfx_model_cpp", (DL_FUNC) &_stochtree_rfx_model_cpp, 2},
- {"_stochtree_rfx_model_sample_random_effects_cpp", (DL_FUNC) &_stochtree_rfx_model_sample_random_effects_cpp, 7},
- {"_stochtree_rfx_model_set_group_parameter_covariance_cpp", (DL_FUNC) &_stochtree_rfx_model_set_group_parameter_covariance_cpp, 2},
- {"_stochtree_rfx_model_set_group_parameters_cpp", (DL_FUNC) &_stochtree_rfx_model_set_group_parameters_cpp, 2},
- {"_stochtree_rfx_model_set_variance_prior_scale_cpp", (DL_FUNC) &_stochtree_rfx_model_set_variance_prior_scale_cpp, 2},
- {"_stochtree_rfx_model_set_variance_prior_shape_cpp", (DL_FUNC) &_stochtree_rfx_model_set_variance_prior_shape_cpp, 2},
- {"_stochtree_rfx_model_set_working_parameter_covariance_cpp", (DL_FUNC) &_stochtree_rfx_model_set_working_parameter_covariance_cpp, 2},
- {"_stochtree_rfx_model_set_working_parameter_cpp", (DL_FUNC) &_stochtree_rfx_model_set_working_parameter_cpp, 2},
- {"_stochtree_rfx_tracker_cpp", (DL_FUNC) &_stochtree_rfx_tracker_cpp, 1},
- {"_stochtree_rfx_tracker_get_unique_group_ids_cpp", (DL_FUNC) &_stochtree_rfx_tracker_get_unique_group_ids_cpp, 1},
- {"_stochtree_rng_cpp", (DL_FUNC) &_stochtree_rng_cpp, 1},
- {"_stochtree_sample_gfr_one_iteration_cpp", (DL_FUNC) &_stochtree_sample_gfr_one_iteration_cpp, 13},
- {"_stochtree_sample_mcmc_one_iteration_cpp", (DL_FUNC) &_stochtree_sample_mcmc_one_iteration_cpp, 13},
- {"_stochtree_sample_sigma2_one_iteration_cpp", (DL_FUNC) &_stochtree_sample_sigma2_one_iteration_cpp, 4},
- {"_stochtree_sample_tau_one_iteration_cpp", (DL_FUNC) &_stochtree_sample_tau_one_iteration_cpp, 5},
- {"_stochtree_set_leaf_value_forest_container_cpp", (DL_FUNC) &_stochtree_set_leaf_value_forest_container_cpp, 2},
- {"_stochtree_set_leaf_vector_forest_container_cpp", (DL_FUNC) &_stochtree_set_leaf_vector_forest_container_cpp, 2},
- {"_stochtree_tree_prior_cpp", (DL_FUNC) &_stochtree_tree_prior_cpp, 3},
- {"_stochtree_update_residual_forest_container_cpp", (DL_FUNC) &_stochtree_update_residual_forest_container_cpp, 7},
+ {"_stochtree_add_sample_forest_container_cpp", (DL_FUNC) &_stochtree_add_sample_forest_container_cpp, 1},
+ {"_stochtree_all_roots_forest_container_cpp", (DL_FUNC) &_stochtree_all_roots_forest_container_cpp, 2},
+ {"_stochtree_create_column_vector_cpp", (DL_FUNC) &_stochtree_create_column_vector_cpp, 1},
+ {"_stochtree_create_forest_dataset_cpp", (DL_FUNC) &_stochtree_create_forest_dataset_cpp, 0},
+ {"_stochtree_create_rfx_dataset_cpp", (DL_FUNC) &_stochtree_create_rfx_dataset_cpp, 0},
+ {"_stochtree_dataset_has_basis_cpp", (DL_FUNC) &_stochtree_dataset_has_basis_cpp, 1},
+ {"_stochtree_dataset_has_variance_weights_cpp", (DL_FUNC) &_stochtree_dataset_has_variance_weights_cpp, 1},
+ {"_stochtree_dataset_num_basis_cpp", (DL_FUNC) &_stochtree_dataset_num_basis_cpp, 1},
+ {"_stochtree_dataset_num_covariates_cpp", (DL_FUNC) &_stochtree_dataset_num_covariates_cpp, 1},
+ {"_stochtree_dataset_num_rows_cpp", (DL_FUNC) &_stochtree_dataset_num_rows_cpp, 1},
+ {"_stochtree_forest_container_cpp", (DL_FUNC) &_stochtree_forest_container_cpp, 3},
+ {"_stochtree_forest_dataset_add_basis_cpp", (DL_FUNC) &_stochtree_forest_dataset_add_basis_cpp, 2},
+ {"_stochtree_forest_dataset_add_covariates_cpp", (DL_FUNC) &_stochtree_forest_dataset_add_covariates_cpp, 2},
+ {"_stochtree_forest_dataset_add_weights_cpp", (DL_FUNC) &_stochtree_forest_dataset_add_weights_cpp, 2},
+ {"_stochtree_forest_dataset_update_basis_cpp", (DL_FUNC) &_stochtree_forest_dataset_update_basis_cpp, 2},
+ {"_stochtree_forest_kernel_compute_kernel_train_cpp", (DL_FUNC) &_stochtree_forest_kernel_compute_kernel_train_cpp, 4},
+ {"_stochtree_forest_kernel_compute_kernel_train_test_cpp", (DL_FUNC) &_stochtree_forest_kernel_compute_kernel_train_test_cpp, 5},
+ {"_stochtree_forest_kernel_compute_leaf_indices_train_cpp", (DL_FUNC) &_stochtree_forest_kernel_compute_leaf_indices_train_cpp, 4},
+ {"_stochtree_forest_kernel_compute_leaf_indices_train_test_cpp", (DL_FUNC) &_stochtree_forest_kernel_compute_leaf_indices_train_test_cpp, 5},
+ {"_stochtree_forest_kernel_cpp", (DL_FUNC) &_stochtree_forest_kernel_cpp, 0},
+ {"_stochtree_forest_kernel_get_test_leaf_indices_cpp", (DL_FUNC) &_stochtree_forest_kernel_get_test_leaf_indices_cpp, 1},
+ {"_stochtree_forest_kernel_get_train_leaf_indices_cpp", (DL_FUNC) &_stochtree_forest_kernel_get_train_leaf_indices_cpp, 1},
+ {"_stochtree_forest_tracker_cpp", (DL_FUNC) &_stochtree_forest_tracker_cpp, 4},
+ {"_stochtree_is_leaf_constant_forest_container_cpp", (DL_FUNC) &_stochtree_is_leaf_constant_forest_container_cpp, 1},
+ {"_stochtree_json_load_forest_container_cpp", (DL_FUNC) &_stochtree_json_load_forest_container_cpp, 2},
+ {"_stochtree_json_save_forest_container_cpp", (DL_FUNC) &_stochtree_json_save_forest_container_cpp, 2},
+ {"_stochtree_num_samples_forest_container_cpp", (DL_FUNC) &_stochtree_num_samples_forest_container_cpp, 1},
+ {"_stochtree_num_trees_forest_container_cpp", (DL_FUNC) &_stochtree_num_trees_forest_container_cpp, 1},
+ {"_stochtree_output_dimension_forest_container_cpp", (DL_FUNC) &_stochtree_output_dimension_forest_container_cpp, 1},
+ {"_stochtree_predict_forest_cpp", (DL_FUNC) &_stochtree_predict_forest_cpp, 2},
+ {"_stochtree_predict_forest_raw_cpp", (DL_FUNC) &_stochtree_predict_forest_raw_cpp, 2},
+ {"_stochtree_predict_forest_raw_single_forest_cpp", (DL_FUNC) &_stochtree_predict_forest_raw_single_forest_cpp, 3},
+ {"_stochtree_rfx_container_cpp", (DL_FUNC) &_stochtree_rfx_container_cpp, 2},
+ {"_stochtree_rfx_container_get_alpha_cpp", (DL_FUNC) &_stochtree_rfx_container_get_alpha_cpp, 1},
+ {"_stochtree_rfx_container_get_beta_cpp", (DL_FUNC) &_stochtree_rfx_container_get_beta_cpp, 1},
+ {"_stochtree_rfx_container_get_sigma_cpp", (DL_FUNC) &_stochtree_rfx_container_get_sigma_cpp, 1},
+ {"_stochtree_rfx_container_get_xi_cpp", (DL_FUNC) &_stochtree_rfx_container_get_xi_cpp, 1},
+ {"_stochtree_rfx_container_num_components_cpp", (DL_FUNC) &_stochtree_rfx_container_num_components_cpp, 1},
+ {"_stochtree_rfx_container_num_groups_cpp", (DL_FUNC) &_stochtree_rfx_container_num_groups_cpp, 1},
+ {"_stochtree_rfx_container_num_samples_cpp", (DL_FUNC) &_stochtree_rfx_container_num_samples_cpp, 1},
+ {"_stochtree_rfx_container_predict_cpp", (DL_FUNC) &_stochtree_rfx_container_predict_cpp, 3},
+ {"_stochtree_rfx_dataset_add_basis_cpp", (DL_FUNC) &_stochtree_rfx_dataset_add_basis_cpp, 2},
+ {"_stochtree_rfx_dataset_add_group_labels_cpp", (DL_FUNC) &_stochtree_rfx_dataset_add_group_labels_cpp, 2},
+ {"_stochtree_rfx_dataset_add_weights_cpp", (DL_FUNC) &_stochtree_rfx_dataset_add_weights_cpp, 2},
+ {"_stochtree_rfx_dataset_has_basis_cpp", (DL_FUNC) &_stochtree_rfx_dataset_has_basis_cpp, 1},
+ {"_stochtree_rfx_dataset_has_group_labels_cpp", (DL_FUNC) &_stochtree_rfx_dataset_has_group_labels_cpp, 1},
+ {"_stochtree_rfx_dataset_has_variance_weights_cpp", (DL_FUNC) &_stochtree_rfx_dataset_has_variance_weights_cpp, 1},
+ {"_stochtree_rfx_dataset_num_rows_cpp", (DL_FUNC) &_stochtree_rfx_dataset_num_rows_cpp, 1},
+ {"_stochtree_rfx_label_mapper_cpp", (DL_FUNC) &_stochtree_rfx_label_mapper_cpp, 1},
+ {"_stochtree_rfx_label_mapper_to_list_cpp", (DL_FUNC) &_stochtree_rfx_label_mapper_to_list_cpp, 1},
+ {"_stochtree_rfx_model_cpp", (DL_FUNC) &_stochtree_rfx_model_cpp, 2},
+ {"_stochtree_rfx_model_sample_random_effects_cpp", (DL_FUNC) &_stochtree_rfx_model_sample_random_effects_cpp, 7},
+ {"_stochtree_rfx_model_set_group_parameter_covariance_cpp", (DL_FUNC) &_stochtree_rfx_model_set_group_parameter_covariance_cpp, 2},
+ {"_stochtree_rfx_model_set_group_parameters_cpp", (DL_FUNC) &_stochtree_rfx_model_set_group_parameters_cpp, 2},
+ {"_stochtree_rfx_model_set_variance_prior_scale_cpp", (DL_FUNC) &_stochtree_rfx_model_set_variance_prior_scale_cpp, 2},
+ {"_stochtree_rfx_model_set_variance_prior_shape_cpp", (DL_FUNC) &_stochtree_rfx_model_set_variance_prior_shape_cpp, 2},
+ {"_stochtree_rfx_model_set_working_parameter_covariance_cpp", (DL_FUNC) &_stochtree_rfx_model_set_working_parameter_covariance_cpp, 2},
+ {"_stochtree_rfx_model_set_working_parameter_cpp", (DL_FUNC) &_stochtree_rfx_model_set_working_parameter_cpp, 2},
+ {"_stochtree_rfx_tracker_cpp", (DL_FUNC) &_stochtree_rfx_tracker_cpp, 1},
+ {"_stochtree_rfx_tracker_get_unique_group_ids_cpp", (DL_FUNC) &_stochtree_rfx_tracker_get_unique_group_ids_cpp, 1},
+ {"_stochtree_rng_cpp", (DL_FUNC) &_stochtree_rng_cpp, 1},
+ {"_stochtree_sample_gfr_one_iteration_cpp", (DL_FUNC) &_stochtree_sample_gfr_one_iteration_cpp, 13},
+ {"_stochtree_sample_mcmc_one_iteration_cpp", (DL_FUNC) &_stochtree_sample_mcmc_one_iteration_cpp, 13},
+ {"_stochtree_sample_sigma2_one_iteration_cpp", (DL_FUNC) &_stochtree_sample_sigma2_one_iteration_cpp, 4},
+ {"_stochtree_sample_tau_one_iteration_cpp", (DL_FUNC) &_stochtree_sample_tau_one_iteration_cpp, 5},
+ {"_stochtree_set_leaf_value_forest_container_cpp", (DL_FUNC) &_stochtree_set_leaf_value_forest_container_cpp, 2},
+ {"_stochtree_set_leaf_vector_forest_container_cpp", (DL_FUNC) &_stochtree_set_leaf_vector_forest_container_cpp, 2},
+ {"_stochtree_tree_prior_cpp", (DL_FUNC) &_stochtree_tree_prior_cpp, 3},
+ {"_stochtree_update_residual_forest_container_cpp", (DL_FUNC) &_stochtree_update_residual_forest_container_cpp, 7},
{NULL, NULL, 0}
};
}
diff --git a/src/kernel.cpp b/src/kernel.cpp
new file mode 100644
index 0000000..a301cbd
--- /dev/null
+++ b/src/kernel.cpp
@@ -0,0 +1,153 @@
+#include
+#include "stochtree_types.h"
+#include
+#include
+#include
+#include
+#include
+#include
+
+typedef Eigen::Map> KernelMatrixType;
+
+[[cpp11::register]]
+cpp11::external_pointer forest_kernel_cpp() {
+ // Create smart pointer to newly allocated object
+ std::unique_ptr forest_kernel_ptr_ = std::make_unique();
+
+ // Release management of the pointer to R session
+ return cpp11::external_pointer(forest_kernel_ptr_.release());
+}
+
+[[cpp11::register]]
+void forest_kernel_compute_leaf_indices_train_cpp(cpp11::external_pointer forest_kernel, cpp11::doubles_matrix<> covariates_train,
+ cpp11::external_pointer forest_container, int forest_num) {
+ // Wrap an Eigen Map around the raw data of the covariate_train matrix
+ StochTree::data_size_t n_train = covariates_train.nrow();
+ int num_covariates_train = covariates_train.ncol();
+ double* covariate_train_data_ptr = REAL(PROTECT(covariates_train));
+ KernelMatrixType covariates_train_eigen(covariate_train_data_ptr, n_train, num_covariates_train);
+
+ // Compute leaf indices
+ forest_kernel->ComputeLeafIndices(covariates_train_eigen, *(forest_container->GetEnsemble(forest_num)));
+
+ // Unprotect pointers to R data
+ UNPROTECT(1);
+}
+
+[[cpp11::register]]
+void forest_kernel_compute_leaf_indices_train_test_cpp(cpp11::external_pointer forest_kernel, cpp11::doubles_matrix<> covariates_train, cpp11::doubles_matrix<> covariates_test,
+ cpp11::external_pointer forest_container, int forest_num) {
+ // Wrap an Eigen Map around the raw data of the covariate_train matrix
+ StochTree::data_size_t n_train = covariates_train.nrow();
+ int num_covariates_train = covariates_train.ncol();
+ double* covariate_train_data_ptr = REAL(PROTECT(covariates_train));
+ KernelMatrixType covariates_train_eigen(covariate_train_data_ptr, n_train, num_covariates_train);
+
+ // Wrap an Eigen Map around the raw data of the covariate_test matrix
+ StochTree::data_size_t n_test = covariates_test.nrow();
+ int num_covariates_test = covariates_test.ncol();
+ double* covariate_test_data_ptr = REAL(PROTECT(covariates_test));
+ KernelMatrixType covariates_test_eigen(covariate_test_data_ptr, n_test, num_covariates_test);
+
+ // Compute leaf indices
+ forest_kernel->ComputeLeafIndices(covariates_train_eigen, covariates_test_eigen, *(forest_container->GetEnsemble(forest_num)));
+
+ // Unprotect pointers to R data
+ UNPROTECT(2);
+}
+
+[[cpp11::register]]
+cpp11::writable::integers forest_kernel_get_train_leaf_indices_cpp(cpp11::external_pointer forest_kernel) {
+ if (!forest_kernel->HasTrainLeafIndices()) {
+ cpp11::writable::integers output;
+ return output;
+ }
+ std::vector train_indices = forest_kernel->GetTrainLeafIndices();
+ cpp11::writable::integers output(train_indices.begin(), train_indices.end());
+ return output;
+}
+
+[[cpp11::register]]
+cpp11::writable::integers forest_kernel_get_test_leaf_indices_cpp(cpp11::external_pointer forest_kernel) {
+ if (!forest_kernel->HasTestLeafIndices()) {
+ cpp11::writable::integers output;
+ return output;
+ }
+ std::vector test_indices = forest_kernel->GetTestLeafIndices();
+ cpp11::writable::integers output(test_indices.begin(), test_indices.end());
+ return output;
+}
+
+[[cpp11::register]]
+cpp11::list forest_kernel_compute_kernel_train_cpp(
+ cpp11::external_pointer forest_kernel, cpp11::doubles_matrix<> covariates_train,
+ cpp11::external_pointer forest_container, int forest_num
+) {
+ // Wrap an Eigen Map around the raw data of the covariate_train matrix
+ StochTree::data_size_t n_train = covariates_train.nrow();
+ int num_covariates_train = covariates_train.ncol();
+ double* covariate_train_data_ptr = REAL(PROTECT(covariates_train));
+ KernelMatrixType covariates_train_eigen(covariate_train_data_ptr, n_train, num_covariates_train);
+
+ // Declare outputs
+ cpp11::writable::doubles_matrix<> kernel_train(n_train, n_train);
+
+ // Wrap Eigen Maps around kernel and kernel inverse matrices
+ double* kernel_data_ptr = REAL(PROTECT(kernel_train));
+ KernelMatrixType kernel_eigen(kernel_data_ptr, n_train, n_train);
+
+ // Compute kernel terms
+ forest_kernel->ComputeKernelExternal(covariates_train_eigen, *(forest_container->GetEnsemble(forest_num)), kernel_eigen);
+
+ // Unprotect pointers to R data
+ UNPROTECT(2);
+
+ // Return list of vectors
+ cpp11::writable::list result;
+ result.push_back(kernel_train);
+ return result;
+}
+
+[[cpp11::register]]
+cpp11::list forest_kernel_compute_kernel_train_test_cpp(
+ cpp11::external_pointer forest_kernel, cpp11::doubles_matrix<> covariates_train,
+ cpp11::doubles_matrix<> covariates_test, cpp11::external_pointer forest_container, int forest_num
+) {
+ // Wrap an Eigen Map around the raw data of the covariate_train matrix
+ StochTree::data_size_t n_train = covariates_train.nrow();
+ int num_covariates_train = covariates_train.ncol();
+ double* covariate_train_data_ptr = REAL(PROTECT(covariates_train));
+ KernelMatrixType covariates_train_eigen(covariate_train_data_ptr, n_train, num_covariates_train);
+
+ // Wrap an Eigen Map around the raw data of the covariate_test matrix
+ StochTree::data_size_t n_test = covariates_test.nrow();
+ int num_covariates_test = covariates_test.ncol();
+ double* covariate_test_data_ptr = REAL(PROTECT(covariates_test));
+ KernelMatrixType covariates_test_eigen(covariate_test_data_ptr, n_test, num_covariates_test);
+
+ // Declare outputs
+ cpp11::writable::doubles_matrix<> kernel_train(n_train, n_train);
+ cpp11::writable::doubles_matrix<> kernel_test_train(n_test, n_train);
+ cpp11::writable::doubles_matrix<> kernel_test(n_test, n_test);
+
+ // Wrap Eigen Maps around kernel and kernel inverse matrices
+ double* kernel_data_ptr = REAL(PROTECT(kernel_train));
+ double* kernel_test_train_data_ptr = REAL(PROTECT(kernel_test_train));
+ double* kernel_test_data_ptr = REAL(PROTECT(kernel_test));
+ KernelMatrixType kernel_train_eigen(kernel_data_ptr, n_train, n_train);
+ KernelMatrixType kernel_test_train_eigen(kernel_test_train_data_ptr, n_test, n_train);
+ KernelMatrixType kernel_test_eigen(kernel_test_data_ptr, n_test, n_test);
+
+ // Compute kernel terms
+ forest_kernel->ComputeKernelExternal(covariates_train_eigen, covariates_test_eigen, *(forest_container->GetEnsemble(forest_num)), kernel_train_eigen, kernel_test_train_eigen, kernel_test_eigen);
+
+ // Unprotect pointers to R data
+ UNPROTECT(5);
+
+ // Return list of vectors
+ cpp11::writable::list result;
+ result.push_back(kernel_train);
+ result.push_back(kernel_test_train);
+ result.push_back(kernel_test);
+ return result;
+}
diff --git a/src/sampler.cpp b/src/sampler.cpp
index cf34fa6..57ac15b 100644
--- a/src/sampler.cpp
+++ b/src/sampler.cpp
@@ -186,6 +186,11 @@ int num_samples_forest_container_cpp(cpp11::external_pointerNumSamples();
}
+[[cpp11::register]]
+int num_trees_forest_container_cpp(cpp11::external_pointer forest_samples) {
+ return forest_samples->NumTrees();
+}
+
[[cpp11::register]]
void json_save_forest_container_cpp(cpp11::external_pointer forest_samples, std::string json_filename) {
forest_samples->SaveToJsonFile(json_filename);
diff --git a/src/stochtree-cpp b/src/stochtree-cpp
index 44e3be1..40608da 160000
--- a/src/stochtree-cpp
+++ b/src/stochtree-cpp
@@ -1 +1 @@
-Subproject commit 44e3be19ffdef00db3ca5180099bfba5ebbda5c9
+Subproject commit 40608da37812ae92ee45f85c2e2314198690041f
diff --git a/src/stochtree_types.h b/src/stochtree_types.h
index 33a8fb6..096dc58 100644
--- a/src/stochtree_types.h
+++ b/src/stochtree_types.h
@@ -1,5 +1,6 @@
#include
#include
+#include
#include
#include
#include
diff --git a/tools/debug/kernel_debug.R b/tools/debug/kernel_debug.R
new file mode 100644
index 0000000..756a239
--- /dev/null
+++ b/tools/debug/kernel_debug.R
@@ -0,0 +1,46 @@
+library(stochtree)
+
+# Generate the data
+X_train <- seq(0,20,length=100)
+X_test <- seq(0,20,length=99)
+y_train <- (sin(pi*X_train/5) + 0.2*cos(4*pi*X_train/5)) * (X_train <= 9.6)
+lin_train <- X_train>9.6;
+y_train[lin_train] <- -1 + X_train[lin_train]/10
+y_train <- y_train + rnorm(length(y_train), sd=0.1)
+y_test <- (sin(pi*X_test/5) + 0.2*cos(4*pi*X_test/5)) * (X_test <= 9.6)
+lin_test <- X_test>9.6;
+y_test[lin_test] <- -1 + X_test[lin_test]/10
+
+bart_model <- bart(X_train=X_train, y_train=y_train, X_test=X_test)
+forest_kernel <- createForestKernel()
+result_inds <- forest_kernel$compute_leaf_indices(
+ covariates_train = X_train, covariates_test = X_test,
+ forest_container = bart_model$forests,
+ forest_num = bart_model$model_params$num_samples - 1
+)
+result_kernels <- forest_kernel$compute_kernel(
+ covariates_train = X_train, covariates_test = X_test,
+ forest_container = bart_model$forests,
+ forest_num = bart_model$model_params$num_samples - 1
+)
+
+# GP
+mu_tilde <- result_kernels$kernel_test_train %*% MASS::ginv(result_kernels$kernel_train) %*% y_train
+Sigma_tilde <- result_kernels$kernel_test - result_kernels$kernel_test_train %*% MASS::ginv(result_kernels$kernel_train) %*% t(result_kernels$kernel_test_train)
+
+B <- mvtnorm::rmvnorm(1000, mean = mu_tilde, sigma = Sigma_tilde)
+yhat_mean_test <- colMeans(B)
+plot(yhat_mean_test, y_test, xlab = "predicted", ylab = "actual", main = "Gaussian process")
+abline(0,1,lwd=2.5,lty=3,col="red")
+
+# m <- 200
+# n <- length(X_train)
+# dummies <- result[["leaf_indices_train"]]
+# max_ix <- max(dummies)
+# leaf_train <- Matrix::sparseMatrix(i=rep(1:n, m),
+# j = dummies+1,
+# x = rep(1, n*m),
+# dims = c(n, max_ix+1),
+# )
+# leaf_train <- as.matrix(leaf_train)
+# check <- leaf_train %*% t(leaf_train)
diff --git a/vignettes/Ensemble-Kernel.Rmd b/vignettes/Ensemble-Kernel.Rmd
new file mode 100644
index 0000000..f1f0550
--- /dev/null
+++ b/vignettes/Ensemble-Kernel.Rmd
@@ -0,0 +1,198 @@
+---
+title: "Kernel Methods from Tree Ensembles in StochTree"
+output: rmarkdown::html_vignette
+vignette: >
+ %\VignetteIndexEntry{Prototype-Interface}
+ %\VignetteEncoding{UTF-8}
+ %\VignetteEngine{knitr::rmarkdown}
+bibliography: vignettes.bib
+editor_options:
+ markdown:
+ wrap: 72
+---
+
+```{r, include = FALSE}
+knitr::opts_chunk$set(
+ collapse = TRUE,
+ comment = "#>"
+)
+```
+
+# Motivation
+
+A trained tree ensemble with strong out-of-sample performance admits a natural
+motivation for the "distance" between two samples: shared leaf membership.
+We number the leaves in an ensemble from 1 to $s$ (that is, if tree 1 has 3 leaves,
+it reserves the numbers 1 - 3, and in turn if tree 2 has 5 leaves, it reserves the numbers 4 - 8 to label its leaves, and so on). For a dataset with $n$ observations,
+we construct the matrix $W$ as follows:
+
+| Initialize $W$ as a matrix of all zeroes with $n$ rows and as many columns as leaves in the ensemble
+| Let `s` = 0
+| **FOR** $j$ **IN** $\left\{1,\dots,m\right\}$:
+| Let `num_leaves` be the number of leaves in tree $j$
+| **FOR** $i$ **IN** $\left\{1,\dots,n\right\}$:
+| Let `k` be the leaf to which tree $j$ maps observation $i$
+| Set element $W_{i,k+s} = 1$
+| Let `s` = `s + num_leaves`
+
+This sparse matrix $W$ is a matrix representation of the basis predictions of an ensemble
+(i.e. integrating out the leaf parameters and just analyzing the leaf indices).
+For an ensemble with $m$ trees, we can determine the proportion of trees that
+map each observation to the same leaf by computing $W W^T / m$.
+This can form the basis for a kernel function used in a Gaussian process
+regression, as we demonstrate below.
+
+To begin, load the `stochtree` package and the `tgp` package which will serve as a point of reference.
+
+```{r setup}
+library(stochtree)
+library(tgp)
+library(MASS)
+library(mvtnorm)
+```
+
+# Demo 1: Univariate Supervised Learning
+
+We begin with a simulated example from the `tgp` package (@gramacy2010categorical).
+This data generating process (DGP) is non-stationary with a single numeric
+covariate. We define a training set and test set and evaluate various approaches
+to modeling the out of sample outcome data.
+
+## Traditional Gaussian Process
+
+We can use the `tgp` package to model this data with a classical Gaussian Process.
+
+```{r, results='hide'}
+# Generate the data
+X_train <- seq(0,20,length=100)
+X_test <- seq(0,20,length=99)
+y_train <- (sin(pi*X_train/5) + 0.2*cos(4*pi*X_train/5)) * (X_train <= 9.6)
+lin_train <- X_train>9.6;
+y_train[lin_train] <- -1 + X_train[lin_train]/10
+y_train <- y_train + rnorm(length(y_train), sd=0.1)
+y_test <- (sin(pi*X_test/5) + 0.2*cos(4*pi*X_test/5)) * (X_test <= 9.6)
+lin_test <- X_test>9.6;
+y_test[lin_test] <- -1 + X_test[lin_test]/10
+
+# Fit the GP
+model_gp <- bgp(X=X_train, Z=y_train, XX=X_test)
+plot(model_gp$ZZ.mean, y_test, xlab = "predicted", ylab = "actual", main = "Gaussian process")
+abline(0,1,lwd=2.5,lty=3,col="red")
+sqrt(mean((model_gp$ZZ.mean - y_test)^2))
+```
+
+Assess the RMSE
+
+```{r}
+sqrt(mean((model_gp$ZZ.mean - y_test)^2))
+```
+
+## BART-based Gaussian process
+
+```{r}
+# Run BART on the data
+num_trees <- 200
+sigma_leaf <- 1/num_trees
+bart_model <- bart(X_train=X_train, y_train=y_train, X_test=X_test, num_trees=num_trees)
+
+# Extract kernels needed for kriging
+result_kernels <- computeForestKernels(bart_model=bart_model, X_train=X_train, X_test=X_test)
+Sigma_11 <- result_kernels$kernel_test
+Sigma_12 <- result_kernels$kernel_test_train
+Sigma_22 <- result_kernels$kernel_train
+Sigma_22_inv <- ginv(Sigma_22)
+Sigma_21 <- t(Sigma_12)
+
+# Compute mean and covariance for the test set posterior
+mu_tilde <- Sigma_12 %*% Sigma_22_inv %*% y_train
+Sigma_tilde <- (sigma_leaf)*(Sigma_11 - Sigma_12 %*% Sigma_22_inv %*% Sigma_21)
+
+# Sample from f(X_test) | X_test, X_train, f(X_train)
+gp_samples <- mvtnorm::rmvnorm(1000, mean = mu_tilde, sigma = Sigma_tilde)
+
+# Compute posterior mean predictions for f(X_test)
+yhat_mean_test <- colMeans(gp_samples)
+plot(yhat_mean_test, y_test, xlab = "predicted", ylab = "actual", main = "BART Gaussian process")
+abline(0,1,lwd=2.5,lty=3,col="red")
+```
+
+Assess the RMSE
+
+```{r}
+sqrt(mean((yhat_mean_test - y_test)^2))
+```
+
+# Demo 2: Multivariate Supervised Learning
+
+We proceed to the simulated "Friedman" dataset, as implemented in `tgp`.
+
+## Traditional Gaussian Process
+
+We can use the `tgp` package to model this data with a classical Gaussian Process.
+
+```{r, results='hide'}
+# Generate the data, add many "noise variables"
+n <- 100
+friedman.df <- friedman.1.data(n=n)
+train_inds <- sort(sample(1:n, floor(0.8*n), replace = F))
+test_inds <- (1:n)[!((1:n) %in% train_inds)]
+X <- as.matrix(friedman.df)[,1:10]
+X <- cbind(X, matrix(runif(n*10), ncol = 10))
+y <- as.matrix(friedman.df)[,12] + rnorm(n,0,1)*(sd(as.matrix(friedman.df)[,11])/2)
+X_train <- X[train_inds,]
+X_test <- X[test_inds,]
+y_train <- y[train_inds]
+y_test <- y[test_inds]
+
+# Fit the GP
+model_gp <- bgp(X=X_train, Z=y_train, XX=X_test)
+plot(model_gp$ZZ.mean, y_test, xlab = "predicted", ylab = "actual", main = "Gaussian process")
+abline(0,1,lwd=2.5,lty=3,col="red")
+```
+
+Assess the RMSE
+
+```{r}
+sqrt(mean((model_gp$ZZ.mean - y_test)^2))
+```
+
+## BART-based Gaussian process
+
+```{r}
+# Run BART on the data
+num_trees <- 200
+sigma_leaf <- 1/num_trees
+bart_model <- bart(X_train=X_train, y_train=y_train, X_test=X_test, num_trees=num_trees)
+
+# Extract kernels needed for kriging
+result_kernels <- computeForestKernels(bart_model=bart_model, X_train=X_train, X_test=X_test)
+Sigma_11 <- result_kernels$kernel_test
+Sigma_12 <- result_kernels$kernel_test_train
+Sigma_22 <- result_kernels$kernel_train
+Sigma_22_inv <- ginv(Sigma_22)
+Sigma_21 <- t(Sigma_12)
+
+# Compute mean and covariance for the test set posterior
+mu_tilde <- Sigma_12 %*% Sigma_22_inv %*% y_train
+Sigma_tilde <- (sigma_leaf)*(Sigma_11 - Sigma_12 %*% Sigma_22_inv %*% Sigma_21)
+
+# Sample from f(X_test) | X_test, X_train, f(X_train)
+gp_samples <- mvtnorm::rmvnorm(1000, mean = mu_tilde, sigma = Sigma_tilde)
+
+# Compute posterior mean predictions for f(X_test)
+yhat_mean_test <- colMeans(gp_samples)
+plot(yhat_mean_test, y_test, xlab = "predicted", ylab = "actual", main = "BART Gaussian process")
+abline(0,1,lwd=2.5,lty=3,col="red")
+```
+
+Assess the RMSE
+
+```{r}
+sqrt(mean((yhat_mean_test - y_test)^2))
+```
+
+While the use case of a BART kernel for classical kriging is perhaps unclear without
+more empirical investigation, we will see in a later vignette that the kernel
+approach can be very beneficial for causal inference applications.
+
+# References
diff --git a/vignettes/vignettes.bib b/vignettes/vignettes.bib
index 31bbb60..2f4abbb 100644
--- a/vignettes/vignettes.bib
+++ b/vignettes/vignettes.bib
@@ -63,4 +63,33 @@ @inproceedings{krantsevich2023stochastic
pages={6120--6131},
year={2023},
organization={PMLR}
+}
+
+@Article{gramacy2010categorical,
+ title = {Categorical Inputs, Sensitivity Analysis, Optimization and Importance Tempering with {tgp} Version 2, an {R} Package for Treed Gaussian Process Models},
+ author = {Robert B. Gramacy and Matthew Taddy},
+ journal = {Journal of Statistical Software},
+ year = {2010},
+ volume = {33},
+ number = {6},
+ pages = {1--48},
+ url = {https://www.jstatsoft.org/v33/i06/},
+ doi = {10.18637/jss.v033.i06},
+}
+
+@book{gramacy2020surrogates,
+ title = {Surrogates: {G}aussian Process Modeling, Design and \
+ Optimization for the Applied Sciences},
+ author = {Robert B. Gramacy},
+ publisher = {Chapman Hall/CRC},
+ address = {Boca Raton, Florida},
+ note = {\url{http://bobby.gramacy.com/surrogates/}},
+ year = {2020}
+}
+
+@book{scholkopf2002learning,
+ title={Learning with kernels: support vector machines, regularization, optimization, and beyond},
+ author={Sch{\"o}lkopf, Bernhard and Smola, Alexander J},
+ year={2002},
+ publisher={MIT press}
}
\ No newline at end of file