Skip to content

Commit

Permalink
started on missing data k-means
Browse files Browse the repository at this point in the history
  • Loading branch information
tonymugen committed Sep 29, 2020
1 parent b7aa36d commit cab5a88
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 30 deletions.
97 changes: 75 additions & 22 deletions src/functions4R.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,32 +35,10 @@

#include <Rcpp.h>

#include "matrixView.hpp"
#include "mumimo.hpp"
#include "gmmvb.hpp"
#include "index.hpp"

//[[Rcpp::export(name="tstMiss")]]
Rcpp::List tstMiss(std::vector<double> &inVec, const int32_t &nRow, const int32_t &nCol, const std::vector<int32_t> &fac, const std::vector<int32_t> &missInd){
std::vector<size_t> stFac;
for (auto &f : fac){
stFac.push_back( static_cast<size_t>(f) );
}
for (auto &m : missInd){
inVec[static_cast<size_t>(m)] = std::nan("");
}
BayesicSpace::Index tstInd(stFac);
BayesicSpace::MatrixViewConst test(&inVec, 0, nRow, nCol);
std::vector<double> vOut(nCol * tstInd.groupNumber(), 0.0);
BayesicSpace::MatrixView Out( &vOut, 0, tstInd.groupNumber(), nCol );
try {
test.colMeansMiss(stFac, Out);
} catch (std::string problem) {
Rcpp::stop(problem);
}
return Rcpp::List::create(Rcpp::Named("out", vOut));
}

//[[Rcpp::export(name="testLpostNR")]]
Rcpp::List testLpostNR(const std::vector<double> &yVec, const int32_t &d, const int32_t &Npop, std::vector<double> &theta, const std::vector<double> &P, const int32_t &ind, const double &limit, const double &incr){
BayesicSpace::MumiNR test(&yVec, &P, d, Npop, 1e-8, 2.5, 1e-8);
Expand Down Expand Up @@ -408,6 +386,81 @@ Rcpp::List vbFit(const std::vector<double> &yVec, const int32_t &d, const int32_
return Rcpp::List::create(Rcpp::Named("popMeans", vPopMn), Rcpp::Named("covariances", vSm), Rcpp::Named("effNm", Nm), Rcpp::Named("p", r), Rcpp::Named("DIC", dic));
}

//' Variational Bayes model fit with missing data
//'
//' Fits a Gaussian mixture model using variational Bayes. Allows missing data. Missing values are marked with an integer vector that stores base-0 indexes of the messing values in the vectorized data matrix.
//'
//' @param yVec vectorized data matrix
//' @param missInd missing values indexes (base-0)
//' @param d number of traits
//' @param nPop number of populations
//' @param alphaPr prior population size
//' @param sigSqPr prior variance
//' @param ppRatio population to error covariance ratio
//' @param nReps number of model fit attempts before picking the best fit
//' @return list containing population means (\code{popMeans}), covariances (\code{covariances}), effective population sizes (\code{effNm}), population assignment probabilities (\code{p}), and the deviance information criterion (DIC, \code{DIC}).
//'
//' @keywords internal
//'
//[[Rcpp::export(name="vbFit")]]
Rcpp::List vbFitMiss(std::vector<double> &yVec, const std::vector<int32_t> &missInd, const int32_t &d, const int32_t &nPop, const double &alphaPr, const double &sigSqPr, const double &ppRatio, const int32_t nReps){
if (nPop <= 1) {
Rcpp::stop("Number of populations must be greater than 1");
}
if (d <= 0) {
Rcpp::stop("Number of traits must be non-negative");
}
if (nReps <= 0) {
Rcpp::stop("Number of replicate runs must be positive");
}
if (alphaPr <= 0.0) {
Rcpp::stop("Prior population size must be positive");
}
if (sigSqPr <= 0.0) {
Rcpp::stop("Prior variance must be positive");
}
if (ppRatio <= 0.0) {
Rcpp::stop("Variance ratio must be positive");
}
for (auto &i : missInd){
if (i < 0) {
Rcpp::stop("Negative value in the missing data index");
}
yVec[i] = std::nan("");
}
std::vector<double> vPopMn;
std::vector<double> vSm;
std::vector<double> Nm;
std::vector<double> r;
std::vector<double> lPost;
double dic = 0.0;
try {
BayesicSpace::GmmVB vbModel(&yVec, ppRatio, sigSqPr, alphaPr, static_cast<size_t>(nPop), static_cast<size_t>(d), &vPopMn, &vSm, &r, &Nm);
vbModel.fitModel(lPost, dic);
for (int iRep = 1; iRep < nReps; iRep++) {
std::vector<double> vPopMnLoc;
std::vector<double> vSmLoc;
std::vector<double> NmLoc;
std::vector<double> rLoc;
std::vector<double> lPostLoc;
double dicLoc = 0.0;

BayesicSpace::GmmVB vbModel(&yVec, ppRatio, sigSqPr, alphaPr, static_cast<size_t>(nPop), static_cast<size_t>(d), &vPopMnLoc, &vSmLoc, &rLoc, &NmLoc);
vbModel.fitModel(lPostLoc, dicLoc);
if (dicLoc < dic) { // if we found a better DIC
vPopMn = vPopMnLoc;
vSm = vSmLoc;
Nm = NmLoc;
r = rLoc;
dic = dicLoc;
}
}
} catch (std::string problem) {
Rcpp::stop(problem);
}
return Rcpp::List::create(Rcpp::Named("popMeans", vPopMn), Rcpp::Named("covariances", vSm), Rcpp::Named("effNm", Nm), Rcpp::Named("p", r), Rcpp::Named("DIC", dic));
}

//' Run the sampler with no replication
//'
//' Runs the sampler on the data assuming no fixed effects, missing trait data, or replication.
Expand Down
24 changes: 17 additions & 7 deletions src/gmmvb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,12 +107,6 @@ GmmVB::GmmVB(GmmVB &&in) : yVec_{in.yVec_}, N_{in.N_}, lambda0_{in.lambda0_}, nu

void GmmVB::fitModel(vector<double> &logPost, double &dic) {
// Initialize values with k-means
vector<size_t> ind;
for (size_t m = 0; m < M_.getNrows(); m++) {
for (size_t iLn = 0; iLn < Y_.getNrows() / M_.getNrows(); iLn++) {
ind.push_back(m);
}
}
Index popInd( M_.getNrows() );
const double smallVal = 0.01 / static_cast<double>(M_.getNrows() - 1);
kMeans_(Y_, M_.getNrows(), 50, popInd, M_);
Expand Down Expand Up @@ -429,6 +423,22 @@ void GmmVB::kMeans_(const MatrixViewConst &X, const size_t &Kclust, const uint32
}

// GmmVBmiss methods
void GmmVBmiss::fitModel(vector<double> &logPost, double &dic){
// Initialize values with k-means
Index popInd( M_.getNrows() );
const double smallVal = 0.01 / static_cast<double>(M_.getNrows() - 1);
kMeans_(Y_, M_.getNrows(), 50, popInd, M_);
for (size_t m = 0; m < M_.getNrows(); m++) {
for (size_t iRow = 0; iRow < Y_.getNrows(); iRow++) {
if (popInd.groupID(iRow) == m){
R_.setElem(iRow, m, 0.99);
} else {
R_.setElem(iRow, m, smallVal);
}
}
}
}

double GmmVBmiss::rowDistance_(const MatrixViewConst &m1, const size_t &row1, const MatrixView &m2, const size_t &row2){
#ifndef PKG_DEBUG_OFF
if ( m1.getNcols() != m2.getNcols() ) {
Expand Down Expand Up @@ -499,7 +509,7 @@ void GmmVBmiss::kMeans_(const MatrixViewConst &X, const size_t &Kclust, const ui
}
x2m.update(sNew);
// recalculate cluster means
X.colMeans(x2m, M);
X.colMeansMiss(x2m, M);
// calculate the magnitude of cluster assignment change
double nDiff = 0.0;
for (size_t i = 0; i < sNew.size(); i++) {
Expand Down
10 changes: 9 additions & 1 deletion src/gmmvb.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ namespace BayesicSpace {
* \param[out] logPost vector of lower bounds
* \param[out] dic the DIC value
*/
void fitModel(vector<double> &logPost, double &dic);
virtual void fitModel(vector<double> &logPost, double &dic);
protected:
/** \brief Pointer to vectorized data matrix */
const vector<double> *yVec_;
Expand Down Expand Up @@ -218,6 +218,14 @@ namespace BayesicSpace {
* \return target object
*/
GmmVBmiss& operator=(GmmVBmiss &&in) = delete;
/** \brief Fit model
*
* Fits the model, returning the log-posterior for each step and the end-result deviance information criterion (DIC).
*
* \param[out] logPost vector of lower bounds
* \param[out] dic the DIC value
*/
void fitModel(vector<double> &logPost, double &dic) override;
protected:
// Private functions
/** \brief The E-step */
Expand Down

0 comments on commit cab5a88

Please sign in to comment.