From 7a0565d6b8f74c63c1d4a44866e55fb20c4d92ab Mon Sep 17 00:00:00 2001 From: Tony Greenberg Date: Thu, 16 Jan 2020 21:14:26 -0500 Subject: [PATCH] first pass at the R interface --- R/interfaceFunctions.R | 50 ++++++++++++++++++++++++++++++++++++++++++ README.md | 9 ++++++++ src/functions4R.cpp | 9 ++++++-- src/mumimo.cpp | 2 -- 4 files changed, 66 insertions(+), 4 deletions(-) create mode 100644 R/interfaceFunctions.R diff --git a/R/interfaceFunctions.R b/R/interfaceFunctions.R new file mode 100644 index 0000000..fa5c5b7 --- /dev/null +++ b/R/interfaceFunctions.R @@ -0,0 +1,50 @@ +# +# Copyright (c) 2019 Anthony J. Greenberg +# +# Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS +# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# + +#' Fit a Gaussian mixture model +#' +#' Fits a mixture model to the provided data, taking into account replication structure. Takes data on multiple traits and generates samples from posterior distributions of parameters, as well as probabilities that a line belongs to a given population. The fit is performed using a No-U-Turn Sampler (NUTS). The recommended burn-in, sampling, number of chains, and thinning are set as defaults. +#' +#' @note Currently exactly one level of replication is supported, with no missing data. +#' +#' @param data data frame with the data +#' @param trait.columns list of columns in \code{data} that contain trait values +#' @param factor.column name of the column that contains the factor connecting replicates to lines +#' @param n.pop number of populations to fit, must be two ro greater +#' @param n.burnin number of iterations of butnin (adaptation) +#' @param n.sampling number of sampling steps +#' @param n.thin thinning number (if, e.g., set to five then every fifth chain sample is saved) +#' @param n.chains number of chains +#' @return list that contains matrix of parameter chains (named \code{parChains}, each chain a column) and a matrix population assignments (named \code{popChains}) +#' +#' @export +fitModel(data, trait.colums, factor.column, n.pop, n.burnin = 5000, n.sampling = 10000, n.thin = 5, n.chains = 5){ + yVec <- as.double(unlist(data[, trait.colums])) + if (sum(is.na(yVec))) { + stop("No missing trait data allowed at present") + } + lnFac <- as.integer(as.factor(data[, factor.column])) + if (n.pop < 2) { + stop("Must specify more than one population") + } + res <- runSampler(yVec, lnFac, n.pop, n.burnin, n.sampling, n.thin, n.chains) + res$thetaChain <- matrix(res$thetaChain, ncol=n.chains) + res$piChain <- matrix(res$piChain, ncol=n.chains) +} + diff --git a/README.md b/README.md index efcac80..dc2e1b0 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,13 @@ This is an R package to assign individuals to populations based on phenotypic data from multiple traits. It uses Bayesian inference and Hamiltonian Monte Carlo to fit the model. Goodness of fit is assessed using Bayes factors which allow to choose the most likely number of populations. +The package is still in development. Basic model fitting with one level of replication, no missing phenotype data, and no covariates, is functional. The following functionality is in development: + + - Bayes factors to assess goodness of fit and reasonable population number + - Functions to summarize results and provide convergence diagnostics + - Disambiguation of population labels + - Support for missing phenotype data + - Support for covariates + - Arbitrary replication levels (including no replication) + To install, make sure you have the `devtools` package on your system, and then run `install_github("tonymugen/MuGaMix")`. It should work under any Unix-like system, but has not been tested under Windows. diff --git a/src/functions4R.cpp b/src/functions4R.cpp index 531b145..9157ffd 100644 --- a/src/functions4R.cpp +++ b/src/functions4R.cpp @@ -48,7 +48,7 @@ //' @param Nthin thinning number //' //[[Rcpp::export(name="runSampler")]] -Rcpp::List runSampler(const std::vector &yVec, const std::vector &lnFac, const int32_t &Npop, const int32_t &Nadapt, const int32_t &Nsamp, const int32_t &Nthin){ +Rcpp::List runSampler(const std::vector &yVec, const std::vector &lnFac, const int32_t &Npop, const int32_t &Nadapt, const int32_t &Nsamp, const int32_t &Nthin, const int32_t &Nchains){ if (yVec.size()%lnFac.size()) { Rcpp::stop("ERROR: line factor length implies a non-integer number of traits in the data vector"); } @@ -61,6 +61,9 @@ Rcpp::List runSampler(const std::vector &yVec, const std::vector l1; for (auto &lf : lnFac) { @@ -78,7 +81,9 @@ Rcpp::List runSampler(const std::vector &yVec, const std::vectorupdate();