diff --git a/.github/workflows/test-package.yml b/.github/workflows/test-package.yml index a65e9af..b0542b2 100644 --- a/.github/workflows/test-package.yml +++ b/.github/workflows/test-package.yml @@ -33,11 +33,11 @@ jobs: steps: - uses: actions/checkout@v2 - - uses: r-lib/actions/setup-r@v1 + - uses: r-lib/actions/setup-r@v2 with: r-version: ${{ matrix.config.r }} - - uses: r-lib/actions/setup-pandoc@v1 + - uses: r-lib/actions/setup-pandoc@v2 - name: Query dependencies run: | diff --git a/DESCRIPTION b/DESCRIPTION index 9013bcb..2af3c12 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: mdatools Title: Multivariate Data Analysis for Chemometrics -Version: 0.12.0 -Date: 2021-09-12 +Version: 0.13.0 +Date: 2022-07-09 Author: Sergey Kucheryavskiy () Maintainer: Sergey Kucheryavskiy Description: Projection based methods for preprocessing, @@ -10,7 +10,7 @@ Description: Projection based methods for preprocessing, Encoding: UTF-8 License: MIT + file LICENSE Imports: methods, graphics, grDevices, stats, Matrix -RoxygenNote: 7.1.1 +RoxygenNote: 7.2.0 Suggests: testthat NeedsCompilation: no Packaged: 2019-05-24 11:03:33 UTC; svkucheryavski diff --git a/LICENSE.md b/LICENSE.md index 7cf9f2d..1dfac4a 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,4 +1,4 @@ -Copyright (c) 2016-2021 Sergey Kucheryavskiy +Copyright (c) 2016-2022 Sergey Kucheryavskiy Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the diff --git a/NAMESPACE b/NAMESPACE index 0215c11..0905b0f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -62,6 +62,7 @@ S3method(plotPuritySpectra,mcrpure) S3method(plotRMSE,ipls) S3method(plotRMSE,regmodel) S3method(plotRMSE,regres) +S3method(plotRMSERatio,regmodel) S3method(plotRegcoeffs,regmodel) S3method(plotResiduals,ldecomp) S3method(plotResiduals,pca) @@ -264,6 +265,7 @@ export(plotPurity) export(plotPuritySpectra) export(plotQDoF) export(plotRMSE) +export(plotRMSERatio) export(plotRegcoeffs) export(plotResiduals) export(plotScatter) @@ -351,6 +353,7 @@ importFrom(graphics,rasterImage) importFrom(graphics,rect) importFrom(graphics,segments) importFrom(graphics,text) +importFrom(methods,as) importFrom(methods,show) importFrom(stats,convolve) importFrom(stats,cor) diff --git a/NEWS.md b/NEWS.md index bc35708..2b651e0 100755 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,25 @@ -v.0.12.0 -======== +v. 0.13.0 +========== +This release brings an updated implementation of PLS algorithm (SIMPLS) which is more numerically stable and gives sufficiently less warnings about using too many components in case when you work with small y-values. The speed of `pls()` method in general has been also improved. + +Another important thing is that cross-validation of regression and classification models has been re-written towards more simple solution and now you can also use your own custom splits by providing a vector with segment indices associated with each measurement. For example if you run PLS with parameter `cv = c(1, 2, 3, 4, 1, 2, 3, 4, 1, 2)` it is assumed that you want to use venetian blinds split with four segments and your dataset has 10 measurements. See more details in the tutorial, where description of cross-validation procedure has been moved to a separate section. + +Other changes and improvements: + +* Refactoring and improvements of `prep.savgol()` code made the method much faster (up to 50-60 times faster for datasets with many measurements). + +* Refactoring and improvements of `prep.alsbasecorr()` code made the method 2-3 times faster especially for large datasets. + +* added new plotting method `plotRMSERatio()` for regression models (inspired by [this post](https://eigenvector.com/%EF%BF%BCevaluating-models-hating-on-r-squared/) by Barry M. Wise) +* added [PQN](https://doi.org/10.1021/ac051632c) normalization method to `prep.norm()` function. + +* fixed a bug in `vipscores()` which could lead to a bit higher values for PLS2 models. + +* fixes to several small bugs and general improvements. + + +v. 0.12.0 +========== This release is mostly about preprocessing - added some new methods, improved the existent once and implemented a possibility to combine preprocessing methods together (including parameter values) and apply them all together in a correct sequence. See [preprocessing section](https://mda.tools/docs/preprocessing.html) in the tutorials for details ## New features and improvements @@ -30,13 +50,13 @@ This release is mostly about preprocessing - added some new methods, improved th * the user guides have been revised and improved. -v.0.11.5 -======== +v. 0.11.5 +========== * fix for an issue in PLS SIMPLS implementation (incorrect use of `Machine$longdouble.eps`), which lead to an error when the package is tested on Apple M1. -v.0.11.4 -======== +v. 0.11.4 +========== * added possibility for providing partially known contributions (parameter `cont.forced`) or spectral values (parameter `spec.forced`) to `mcrals()`. See more in help text and user guide for the package. @@ -49,8 +69,8 @@ v.0.11.4 * fixed bug [#99](https://github.com/svkucheryavski/mdatools/issues/99), which did not allow to use user defined indices of pure variables in `mcrpure()`. -v.0.11.3 -======== +v. 0.11.3 +========== * added [Procrustes Cross-Validation method](https://doi.org/10.1021/acs.analchem.0c02175), `pcv()` (it is also available as a [separate project](https://github.com/svkucheryavski/pcv)). @@ -65,21 +85,21 @@ v.0.11.3 * added link to [YouTube channel](https://www.youtube.com/channel/UCox0H4utfMq4FIu2kymuyTA) with Chemometric course based on *mdatools* package. -v.0.11.2 -======== +v. 0.11.2 +========== * fixed an issue, which lead to a bug in `simcam.getPerformanceStats`, returning implausible and asymmetrical results (thanks to @svonallmen). * fixed a small issue sometimes giving warning when running tests on CRAN (did not influence the user experience though). -v.0.11.1 -======== +v. 0.11.1 +========== * the algorithm for `mcrpure()` method has been modified to avoid potential issues with original patented version. -v.0.11.0 -======== +v. 0.11.0 +========== ## New features @@ -98,14 +118,14 @@ v.0.11.0 * main model methods (`pls()`, `pca()`, etc.), now do additional check for the consistency of provided datasets. -v.0.10.4 -======== +v. 0.10.4 +========== * fixed a bug, which led to ignoring `opacity` option in plots. -v.0.10.3 -======== +v. 0.10.3 +========== * Fixed bug `#85` when using y-values as data frame gave an error in PLS regression @@ -113,13 +133,13 @@ v.0.10.3 * Code refactoring and tests for preprocessing methods -v.0.10.2 -======== +v. 0.10.2 +========== * Fixed a bug in `categorize.pls()` method, which could give wrong results for test set measurements (see issue #82). -v.0.10.1 -======== +v. 0.10.1 +========== * Small improvements to `plotExtreme.pca()` so user can specify additional parameters, such as, for example `cex`. If plot is made for several components, you can now specify just one value for all points (e.g. color of points or marker symbol). @@ -129,8 +149,8 @@ v.0.10.1 * Fixed a bug in `summary()` method for PLS, which worked incorrectly in case of several response variables (PLS2). -v.0.10.0 -======== +v. 0.10.0 +========== Many changes have been made in this version, but most of them are under the hood. Code has been refactored significantly in order to improve its efficiency and make future support easier. Some functionality has been re-written from the scratch. **Most** of the code is backward compatible, which means your old scripts should have no problem to run with this version. However, some changes are incompatible and this can lead to occasional errors and warning messages. All details are shown below, pay a special attention to **breaking changes** part. @@ -232,29 +252,29 @@ Other changes are listed below: * New method `categorize()` allowing to categorize data rows based on PLS results and critical limits computed for X- and Y-distance. -v.0.9.6 -======= +v. 0.9.6 +========= * fixed a bug related to wrong calculation of R2 in case of multiple response variables (PLS2) * refactoring of `regres` methods * added tests for some of the `regres` methods -v.0.9.5 -======= +v. 0.9.5 +========= * better description of cross-validation settings in help text (parameter `cv`) * added column R2 (coefficient of determination) to PLS summary as it is not always identical to `Y cumexpvar` * better use of `cex` parameter for group plots (can be specified differently for each group) * if `cex` is specified it will be also applied for legend items -v.0.9.4 -======= +v. 0.9.4 +========= * fixed a bug leading to wrong results when using parameter `max.cov` in `prep.autoscale()` (#59) -v.0.9.3 -======= +v. 0.9.3 +========= * fixed a bug leading to wrong results in multiclass PLS-DA if class labels in reference class variable (factor) were not in alphabetical order -v.0.9.2 -======= +v. 0.9.2 +========= * improvements to `ipls()` method plus fixed a bug preventing breaking the selection loop (#56) * fixed a bug in `selectCompNum()` related to use of Wold criterion (#57) * fixed a bug with using of `max.cov` parameter in `prep.autoscale()` (#58) @@ -262,15 +282,15 @@ v.0.9.2 * code refactoring and small improvements * added tests for `prep.autoscale()` -v.0.9.1 -======= +v. 0.9.1 +========= * all plot functions have new `opacity` parameter for semi-transparent colors * several improvements to PLS-DA method for one-class discrimination * fixed a bug with wrong estimation of maximum number of components for PCA/SIMCA with cross-validation * added chapter on PLS-DA to the tutorial (including last improvements) -v.0.9.0 -======= +v. 0.9.0 +========= * added randomized PCA algorithm (efficient for datasets with large number of rows) * added option to inherit and show critical limits on residuals plot for PCA/SIMCA results * added support for data driven approach to PCA/SIMCA (DD-SIMCA) @@ -283,8 +303,8 @@ v.0.9.0 * added option to use equal axes limits in `plotPrediction()` for PLS results * the tutorial has been amended and extended correspondingly -v.0.8.4 -======= +v. 0.8.4 +========= * small improvements to calculation of statistics for regression coefficients * `pls.getRegCoeffs()` now also returns standard error and confidence intervals calculated for unstandardized variables * new method `summary()` for object with regression coefficients (`regcoeffs`) @@ -292,24 +312,24 @@ v.0.8.4 * fixed a bug in some PLS plots where labels for cross-validated results forced to be numbers * when using `mdaplot` for data frame with one or more factor columns, the factors are now transofrmed to dummy variables (before it led to an error) -v.0.8.3 -======= +v. 0.8.3 +========= * fixed a bug in `mdaplots` when using factor with more than 8 levels for color grouping led to an error * fixed a bug in `pca` with wrong calculation of eigenvalues in NIPALS algorithm * bars on a bar plot now can be color grouped -v.0.8.2 -======= +v. 0.8.2 +========= * parameters `lab.cex` and `lab.col` now are also applied to colorbar labels -v.0.8.1 -======= +v. 0.8.1 +========= * fixed a bug in PCA when explained variance was calculated incorrectly for data with excluded rows * fixed several issues with SIMCA (cross-validation) and SIMCAM (Cooman's plot) * added a chapter about SIMCA to the tutorial -v.0.8.0 -======= +v. 0.8.0 +========= * tutorial has been moved from GitBook to Bookdown and fully rewritten * GitHub repo for the package has the tutorial as a static html site in `docs` folder * the `mdaplot()` and `mdaplotg()` were rewritten completely and now are more easy to use (check tutorial) @@ -326,21 +346,21 @@ v.0.8.0 * added a posibility to exclude selected rows and columns from calculations * added support for images (check tutorial) -v.0.7.2 -======= +v. 0.7.2 +========= * corrected a typo in title of selectivity ratio plot * `prep.autoscale()` now do not scale columns with coefficient of variation below given threshold -v.0.7.1 -======= +v. 0.7.1 +========= * fixed an issue lead to plot.new() error in some cases * documentation was regenerated with new version of Roxygen * file People.RData was renamed to people.RData * NIPALS method for PCA has been added * code optimization to speed calculations up -v.0.7.0 -======= +v. 0.7.0 +========= * interval PLS variable selection (iPLS) is implemented * normalization was added to preprocessing methods (`prep.norm`) * method `getRegcoeffs` was added to PLS model @@ -350,41 +370,41 @@ v.0.7.0 * NAMESPACE file is generated by roxygen2 * fixed several small bugs and typos -v.0.6.2 -======== +v. 0.6.2 +========== * Q2 residuals renamed to Q (Squared residual distance) * All plots have parameters `lab.col` and `lab.cex` for changing color and font size for data point labels -v.0.6.1 -======== +v. 0.6.1 +========== * fixed a bug led to incorrect calculation of specificity * jack-knife confidence intervals now also calculated for PLS-DA models * confidence intervals for regression coefficients are shown by default if calculated -v.0.6.0 -======== +v. 0.6.0 +========== * randomization test for PLS has been added, see `?randtest` * systematic and repeated random cross-validation are available, see `?crossval` * fixed bug with labels on bar plot with confidence intervals * fixed bug in PLS when using maximum number of components lead to NA values in weights v. 0.5.3 -======== +========== * fixed several small bugs * improvemed documentation for basic methods v. 0.5.2 -======== +========== * fixed bug for computing classification performance for numeric class names * improvements to SIMCA implementation v. 0.5.1 -======== +========== * added more details to documentation * bug fixes for variable selection methods v. 0.5.0 -======== +========== * all documentation has been rewritten using `roxygen2` package * added extra preprocessing methods * added VIP scores calculation and plot for PLS and PLS-DA models @@ -394,7 +414,7 @@ v. 0.5.0 * the first release available in CRAN v. 0.4.0 -======== +========== * New `classres` class for representation and visualisation of classification results * in PCA model, limits for T2 and Q2 now are calculated for all available components * in PCA results, limits for T2 and Q2 calculated for a model are kept and shown on residuals plot @@ -405,16 +425,16 @@ v. 0.4.0 * bug fixes and improvements v. 0.3.2 -======== +========== * Enhancements in group bar plot * Fixed bugs with wrong labels of bar plot with negative values v. 0.3.1 -======== +========== * Corrected errors and typos in README.md and small bg fixes v. 0.3.0 -======== +========== * PLS and all related methods were rewritten from the scratch to make them faster, more efficient and also to follow the same code conventions as previously rewritten PCA. Here are main changes you need to do in your code if you used mdatools PLS before: `selectNumComp(model, ncomp)` instead diff --git a/R/classres.R b/R/classres.R index 064382a..8581837 100755 --- a/R/classres.R +++ b/R/classres.R @@ -324,7 +324,7 @@ classres.getPerformance <- function(c.ref, c.pred) { dim(c.ref) <- NULL attrs <- mda.getattr(c.pred) if (length(attrs$exclrows) > 0) { - c.pred <- c.pred[-attrs$exclrows, , , drop = F] + c.pred <- c.pred[-attrs$exclrows, , , drop = FALSE] c.ref <- c.ref[-attrs$exclrows] } @@ -339,10 +339,10 @@ classres.getPerformance <- function(c.ref, c.pred) { # compute main performance indicators classnames <- dimnames(c.pred)[[3]] for (i in seq_len(nclasses)) { - fn[i, ] <- colSums((c.ref == classnames[i]) & (c.pred[, , i, drop = F] == -1)) - fp[i, ] <- colSums((c.ref != classnames[i]) & (c.pred[, , i, drop = F] == 1)) - tp[i, ] <- colSums((c.ref == classnames[i]) & (c.pred[, , i, drop = F] == 1)) - tn[i, ] <- colSums((c.ref != classnames[i]) & (c.pred[, , i, drop = F] == -1)) + fn[i, ] <- colSums((c.ref == classnames[i]) & (c.pred[, , i, drop = FALSE] == -1)) + fp[i, ] <- colSums((c.ref != classnames[i]) & (c.pred[, , i, drop = FALSE] == 1)) + tp[i, ] <- colSums((c.ref == classnames[i]) & (c.pred[, , i, drop = FALSE] == 1)) + tn[i, ] <- colSums((c.ref != classnames[i]) & (c.pred[, , i, drop = FALSE] == -1)) } # compute main statistics diff --git a/R/crossval.R b/R/crossval.R index 976828d..6debe23 100755 --- a/R/crossval.R +++ b/R/crossval.R @@ -25,8 +25,8 @@ crossval.getParams <- function(cv, nobj) { if (type == "loo") { return( list( - type = "rand", - nrep = nrep, + type = "loo", + nrep = 1, nseg = nobj ) ) @@ -80,6 +80,9 @@ crossval <- function(cv = 1, nobj = NULL, resp = NULL) { # get cross-validation parameters if (is.null(nobj)) nobj <- length(resp) + # if user already provided matrix with values - return it + if (is.numeric(cv) && length(cv) == nobj) return(as.matrix(cv)) + p <- crossval.getParams(cv = cv, nobj = nobj) if (!(p$type %in% c("rand", "ven", "loo"))) { stop("Wrong name for cross-validation method.") @@ -95,22 +98,17 @@ crossval <- function(cv = 1, nobj = NULL, resp = NULL) { stop("Wrong value for number of segments (should be between 2 and number of objects).") } - seglen <- ceiling(nobj / p$nseg) - fulllen <- seglen * p$nseg - ind <- array(0, dim = c(p$nseg, seglen, p$nrep)) + if (p$type == "loo") { + return(matrix(seq_len(nobj), ncol = 1)) + } if (p$type == "rand") { - for (i in seq_len(p$nrep)) { - v <- c(sample(nobj), rep(NA, fulllen - nobj)) - ind[, , i] <- matrix(v, nrow = p$nseg, byrow = TRUE) - } - return(ind) + return(sapply(seq_len(p$nrep), function(i) rep(seq_len(p$nseg), length.out = nobj)[sample(nobj)])) } if (p$type == "ven") { - v <- c(order(resp), rep(NA, fulllen - nobj)) - ind[, , 1] <- matrix(v, nrow = p$nseg, byrow = FALSE) - return(ind) + ind <- if (is.null(resp)) seq_len(nobj) else order(resp) + return(matrix(rep(seq_len(p$nseg), length.out = nobj)[ind], ncol = 1)) } stop("Something went wrong.") diff --git a/R/defaults.R b/R/defaults.R index 594fae8..b6fddbd 100755 --- a/R/defaults.R +++ b/R/defaults.R @@ -1,4 +1,15 @@ +#' Plot for ratio RMSEC/RMSECV vs RMSECV +#' @param obj +#' object with any regression model +#' @param ... +#' other parameters +#' +#' @export +plotRMSERatio <- function(obj, ...) { + UseMethod("plotRMSERatio") +} + #' Plot purity spectra #' @param obj #' object with mcr pure case diff --git a/R/ipls.R b/R/ipls.R index 2c8b22a..5e32b80 100644 --- a/R/ipls.R +++ b/R/ipls.R @@ -165,7 +165,7 @@ ipls <- function(x, y, glob.ncomp = 10, center = TRUE, scale = FALSE, cv = list( if (ncol(y) > 1) { warning("iPLS can work with one y-variable at time, selecting first column.") - y <- y[, 1, drop = F] + y <- y[, 1, drop = FALSE] } # add name to the single column @@ -244,7 +244,7 @@ ipls <- function(x, y, glob.ncomp = 10, center = TRUE, scale = FALSE, cv = list( "n" = 0, "start" = 1, "end" = ncol(x), - "selected" = F, + "selected" = FALSE, "nComp" = obj$gm$ncomp.selected, "RMSE" = gmres$rmse[1, obj$gm$ncomp.selected], "R2" = gmres$r2[1, obj$gm$ncomp.selected] @@ -349,7 +349,7 @@ ipls.forward <- function(x, y, obj, int.stat, glob.stat) { # combine already selected intervals with the current ind <- obj$int.limits[l, 1]:obj$int.limits[l, 2] xc <- x[, c(selind, ind), drop = FALSE] - xt <- if(!is.null(obj$x.test)) obj$x.test[, c(selind, ind), drop = FALSE] else NULL + xt <- if (!is.null(obj$x.test)) obj$x.test[, c(selind, ind), drop = FALSE] else NULL # build a model m <- pls(xc, y, ncomp = obj$int.ncomp, center = obj$center, scale = obj$scale, cv = obj$cv, @@ -453,7 +453,7 @@ ipls.backward <- function(x, y, obj, int.stat, glob.stat) { # combine already selected intervals with the current xc <- x[, -c(unselind, ind), drop = FALSE] - xt <- if(!is.null(obj$x.test)) obj$x.test[, -c(unselind, ind), drop = FALSE] else NULL + xt <- if (!is.null(obj$x.test)) obj$x.test[, -c(unselind, ind), drop = FALSE] else NULL # build a model m <- pls(xc, y, ncomp = obj$int.ncomp, center = obj$center, scale = obj$scale, cv = obj$cv, @@ -479,7 +479,7 @@ ipls.backward <- function(x, y, obj, int.stat, glob.stat) { "n" = l, "start" = obj$int.limits[l, 1], "end" = obj$int.limits[l, 2], - "selected" = F, + "selected" = FALSE, "nComp" = m$ncomp.selected, "RMSE" = lres$rmse[1, m$ncomp.selected], "R2" = lres$r2[1, m$ncomp.selected] diff --git a/R/ldecomp.R b/R/ldecomp.R index 3492ebe..8b9777b 100755 --- a/R/ldecomp.R +++ b/R/ldecomp.R @@ -142,9 +142,13 @@ plotVariance.ldecomp <- function(obj, type = "b", variance = "expvar", labels = #' most of graphical parameters from \code{\link{mdaplot}} function can be used. #' #' @export -plotScores.ldecomp <- function(obj, comp = c(1, 2), type = "p", show.axes = TRUE, +plotScores.ldecomp <- function(obj, comp = if (obj$ncomp > 1) c(1, 2) else 1, type = "p", show.axes = TRUE, show.plot = TRUE, ...) { + if (min(comp) < 1 || max(comp) > ncol(obj$scores)) { + stop("Wrong values for 'comp' parameter.") + } + # get scores for given components and generate column names with explained variance plot_data <- mda.subset(obj$scores, select = comp) colnames(plot_data) <- paste0("Comp ", comp, " (", round(obj$expvar[comp], 2), "%)") @@ -418,7 +422,6 @@ ldecomp.getVariances <- function(scores, loadings, residuals, Q) { ldecomp.getDistances <- function(scores, loadings, residuals, eigenvals) { # get names and attributes - rows_excluded <- attr(scores, "exclrows") cols_excluded <- attr(loadings, "exclrows") # get sizes @@ -431,36 +434,24 @@ ldecomp.getDistances <- function(scores, loadings, residuals, eigenvals) { residuals <- residuals[, -cols_excluded, drop = FALSE] } - # get rid of hidden scores and residuals (needed for some calculations) - scores_visible <- scores - residuals_visible <- residuals - if (length(rows_excluded) > 0) { - scores_visible <- scores_visible[-rows_excluded, , drop = FALSE] - residuals_visible <- residuals_visible[-rows_excluded, , drop = FALSE] - } + # helper function to compute orthogonal distances for given number of components in a model + getResiduals <- function(scores, loadings, residuals, a) { + ncomp <- ncol(scores) + if (a == ncomp) return(residuals) - # normalize the scores - scoresn <- scale(scores, center = FALSE, scale = sqrt(eigenvals)) - - # prepare zero matrices for the and model power - T2 <- matrix(0, nrow = nobj, ncol = ncomp) - Q <- matrix(0, nrow = nobj, ncol = ncomp) - - # calculate distances and model power for each possible number of components in model - for (i in seq_len(ncomp)) { - res <- residuals - if (i < ncomp) { - res <- res + - tcrossprod( - scores[, (i + 1):ncomp, drop = F], - loadings[, (i + 1):ncomp, drop = F] - ) - } - - Q[, i] <- rowSums(res^2) - T2[, i] <- rowSums(scoresn[, seq_len(i), drop = F]^2) + residuals + tcrossprod( + scores[, (a + 1):ncomp, drop = FALSE], + loadings[, (a + 1):ncomp, drop = FALSE] + ) } + # compute matrices with orthogonal and score distances + Q <- sapply(seq_len(ncomp), function(a) rowSums(getResiduals(scores, loadings, residuals, a)^2)) + dim(Q) <- c(nobj, ncomp) + + T2 <- t(apply(scale(scores^2, center = FALSE, scale = eigenvals), 1, cumsum)) + dim(T2) <- c(nobj, ncomp) + # set attributes for Q Q <- mda.setattr(Q, mda.getattr(scores), type = "row") attr(Q, "name") <- "Squared residual distance (q)" @@ -512,7 +503,7 @@ jm.crit <- function(residuals, eigenvals, alpha = 0.05, gamma = 0.01) { t2 <- rev(cumsum(rev(eigenvals)^2))[seq_len(ncomp)] t3 <- rev(cumsum(rev(eigenvals)^3))[seq_len(ncomp)] - h0 <- 1 - 2 * t1 * t3 / 3 / (t2^2); + h0 <- 1 - 2 * t1 * t3 / 3 / (t2^2) ifelse(h0 < 0.001, h0 <- 0.001, h0) # inverse error function @@ -556,7 +547,7 @@ jm.prob <- function(u, eigenvals, ncomp) { t2 <- rev(cumsum(rev(eigenvals)^2))[ncomp] t3 <- rev(cumsum(rev(eigenvals)^3))[ncomp] - h0 <- 1 - 2 * t1 * t3 / 3 / (t2^2); + h0 <- 1 - 2 * t1 * t3 / 3 / (t2^2) ifelse(h0 < 0.001, h0 <- 0.001, h0) h1 <- (u / t1)^h0 diff --git a/R/mcrpure.R b/R/mcrpure.R index f7103ad..19a44cd 100644 --- a/R/mcrpure.R +++ b/R/mcrpure.R @@ -180,14 +180,16 @@ unmix.mcrpure <- function(obj, x) { # resolve spectra and concentrations St <- tryCatch( t(x) %*% Dr %*% solve(crossprod(Dr)), - error = function(e) + error = function(e) { stop("Unable to resolve the components, perhaps 'ncomp' is too large.", call. = FALSE) + } ) Ct <- tryCatch( x %*% St %*% solve(crossprod(St)), - error = function(e) + error = function(e) { stop("Unable to resolve the components, perhaps 'ncomp' is too large.", call. = FALSE) + } ) # scale @@ -293,8 +295,6 @@ summary.mcrpure <- function(object, ...) { fprintf("\nInfo:\n%s\n", object$info) } - drvstr <- c("no", "only for estimation of pure variables", "for pure variables and unmixing") - if (length(object$exclrows) > 0) { fprintf("Excluded rows: %d\n", length(object$exclrows)) } @@ -357,7 +357,6 @@ getPureVariables <- function(D, ncomp, purevars, offset) { nspec <- nrow(D) nvar <- ncol(D) nspecvis <- nspec - length(exclrows) - nvarvis <- nvar - length(exclcols) # get indices for excluded columns if provided colind <- seq_len(nvar) diff --git a/R/mdaplot.R b/R/mdaplot.R index b11cb17..2c2c221 100755 --- a/R/mdaplot.R +++ b/R/mdaplot.R @@ -28,7 +28,7 @@ mdaplot.areColors <- function(palette) { #' @return #' matrix with formatted values #' -mdaplot.formatValues <- function(data, round.only = F, digits = 3) { +mdaplot.formatValues <- function(data, round.only = FALSE, digits = 3) { # if values are not numeric - return as is if (!is.numeric(data[1])) return(data) @@ -152,7 +152,7 @@ mdaplot.showColorbar <- function(cgroup, colmap = "default", lab.col = "darkgray shift <- shift * w * 0.02 # 2 percent of segment width x <- lim[1] + dx * 0.1 - y <- lim[4] - (h + 0.1 * h); + y <- lim[4] - (h + 0.1 * h) # show colorbar and define coordinates for labels for (i in seq_len(ncol)) { @@ -168,8 +168,7 @@ mdaplot.showColorbar <- function(cgroup, colmap = "default", lab.col = "darkgray } # show labels for colorbar regions - text(labels[, 1], labels[, 2], labels = rownames(labels), pos = 1, col = lab.col, - cex = lab.cex) + text(labels[, 1], labels[, 2], labels = rownames(labels), pos = 1, col = lab.col, cex = lab.cex) } #' Plot lines diff --git a/R/mdaplotg.R b/R/mdaplotg.R index f19cb85..750c907 100755 --- a/R/mdaplotg.R +++ b/R/mdaplotg.R @@ -476,7 +476,7 @@ mdaplotg <- function( # use mdaplot with show.axes = FALSE to create the plot mdaplot(ps = ps[[i]], type = type[i], col = col[i], pch = pch[i], lty = lty[i], lwd = lwd[i], cex = cex[i], force.x.values = force.x.values, bwd = bwd, - show.grid = F, show.labels = show.labels, opacity = opacity[i], + show.grid = FALSE, show.labels = show.labels, opacity = opacity[i], lab.col = lab.col[i], lab.cex = lab.cex, show.axes = FALSE, show.excluded = show.excluded, ... ) diff --git a/R/misc.R b/R/misc.R index a976aa6..d50ee40 100755 --- a/R/misc.R +++ b/R/misc.R @@ -201,12 +201,12 @@ mda.subset <- function(x, subset = NULL, select = NULL) { attrs <- mda.getattr(x) if (!is.null(subset)) { - if (is.logical(subset) & !is.null(attrs$exclrows)) + if (is.logical(subset) && !is.null(attrs$exclrows)) subset <- subset[-attrs$exclrows] # remove excluded rows first if (!is.null(attrs$exclrows)) - x <- x[-attrs$exclrows, , drop = F] + x <- x[-attrs$exclrows, , drop = FALSE] # get numeric indices for the rows and subset them subset <- mda.getexclind(subset, rownames(x), nrow(x)) @@ -227,11 +227,11 @@ mda.subset <- function(x, subset = NULL, select = NULL) { # remove excluded rows first if (!is.null(attrs$exclcols)) - x <- x[, -attrs$exclcols, drop = F] + x <- x[, -attrs$exclcols, drop = FALSE] # get numeric indices for the rows and subset them select <- mda.getexclind(select, colnames(x), ncol(x)) - x <- x[, select, drop = F] + x <- x[, select, drop = FALSE] # correct attributes if (!is.null(attrs$xaxis.values)) { diff --git a/R/pca.R b/R/pca.R index 555335d..6deb79e 100755 --- a/R/pca.R +++ b/R/pca.R @@ -666,7 +666,7 @@ pca.mvreplace <- function(x, center = TRUE, scale = FALSE, maxncomp = 10, expvar } if (center) { - gmean <- attr(xo, "scaled:center"); + gmean <- attr(xo, "scaled:center") } n <- 1 @@ -697,7 +697,7 @@ pca.mvreplace <- function(x, center = TRUE, scale = FALSE, maxncomp = 10, expvar x_new <- tcrossprod(scores, loadings) # remove centering - x_new <- sweep(x_new, 2L, lmean, "+", check.margin = F) + x_new <- sweep(x_new, 2L, lmean, "+", check.margin = FALSE) # replace missing values by the calculated x <- xo @@ -831,7 +831,7 @@ pca.nipals <- function(x, ncomp = min(ncol(x), nrow(x) - 1), tol = 10^-10) { E <- x for (i in seq_len(ncomp)) { ind <- which.max(apply(E, 2, sd)) - t <- E[, ind, drop = F] + t <- E[, ind, drop = FALSE] tau <- th <- 99999999 while (th > tol * tau) { p <- crossprod(E, t) / as.vector(crossprod(t)) @@ -1091,9 +1091,13 @@ plotCumVariance.pca <- function(obj, legend.position = "bottomright", ...) { #' See examples in help for \code{\link{pca}} function. #' #' @export -plotScores.pca <- function(obj, comp = c(1, 2), type = "p", show.axes = TRUE, +plotScores.pca <- function(obj, comp = if (obj$ncomp > 1) c(1, 2) else 1, type = "p", show.axes = TRUE, show.legend = TRUE, res = obj$res, ...) { + if (min(comp) < 1 || max(comp) > ncol(obj$loadings)) { + stop("Wrong values for 'comp' parameter.") + } + res <- getRes(res, "ldecomp") if (length(res) == 1) { return(plotScores(res[[1]], comp = comp, type = type, ...)) @@ -1207,9 +1211,14 @@ plotResiduals.pca <- function(obj, ncomp = obj$ncomp.selected, log = FALSE, #' See examples in help for \code{\link{pca}} function. #' #' @export -plotLoadings.pca <- function(obj, comp = c(1, 2), type = (if (length(comp == 2)) "p" else "l"), +plotLoadings.pca <- function(obj, comp = if (obj$ncomp > 1) c(1, 2) else 1, + type = (if (length(comp == 2)) "p" else "l"), show.legend = TRUE, show.axes = TRUE, ...) { + if (min(comp) < 1 || max(comp) > ncol(obj$loadings)) { + stop("Wrong values for 'comp' parameter.") + } + plot_data <- mda.subset(obj$loadings, select = comp) colnames(plot_data) <- paste0("Comp ", comp, " (", round(obj$res[["cal"]]$expvar[comp], 2), "%)") attr(plot_data, "name") <- "Loadings" @@ -1284,7 +1293,7 @@ plotBiplot.pca <- function(obj, comp = c(1, 2), pch = c(16, NA), col = mdaplot.g main = main, colmap = col, show.lines = show.lines, show.excluded = show.excluded, ...) if (show.excluded && length(attr(loadings, "exclrows")) > 0) { - loadings <- loadings[-attr(loadings, "exclrows"), , drop = F] + loadings <- loadings[-attr(loadings, "exclrows"), , drop = FALSE] } segments(0, 0, loadings[, 1], loadings[, 2], col = col[2], lty = lty, lwd = lwd) diff --git a/R/pcv.R b/R/pcv.R index 96c33ab..b064be1 100644 --- a/R/pcv.R +++ b/R/pcv.R @@ -25,7 +25,7 @@ #' Pseudo-validation matrix (IxJ) #' #' @export -pcv <- function(x, ncomp = min(round(nrow(x)/nseg) - 1, col(x), 20), nseg = 4, scale = FALSE) { +pcv <- function(x, ncomp = min(round(nrow(x) / nseg) - 1, col(x), 20), nseg = 4, scale = FALSE) { # keep names if any attrs <- attributes(x) @@ -56,7 +56,7 @@ pcv <- function(x, ncomp = min(round(nrow(x)/nseg) - 1, col(x), 20), nseg = 4, s # split data to calibration and validation x.c <- x[-ind[, k], , drop = FALSE] - x.k <- x[ ind[, k], , drop = FALSE] + x.k <- x[ind[, k], , drop = FALSE] # get loadings for local model and rotation matrix between global and local models P.k <- svd(x.c, nv = ncomp)$v[, seq_len(ncomp), drop = FALSE] @@ -93,8 +93,8 @@ pcv <- function(x, ncomp = min(round(nrow(x)/nseg) - 1, col(x), 20), nseg = 4, s #' @return #' Rotation matrix (JxJ) getR <- function(base1, base2) { - base1 <- as.matrix(base1); - base2 <- as.matrix(base2); + base1 <- as.matrix(base1) + base2 <- as.matrix(base2) R1 <- rotationMatrixToX1(base1[, 1]) R2 <- rotationMatrixToX1(base2[, 1]) @@ -122,7 +122,7 @@ getR <- function(base1, base2) { R <- crossprod(R2, (M %*% R1)) } - return(R); + return(R) } #' Creates a rotation matrix to map a vector x to [1 0 0 ... 0] @@ -174,4 +174,3 @@ eye <- function(n) { diag(X) <- 1 return(X) } - diff --git a/R/plotseries.R b/R/plotseries.R index c77b112..4fba984 100644 --- a/R/plotseries.R +++ b/R/plotseries.R @@ -210,7 +210,7 @@ splitPlotData <- function(data, type) { # 0.12.0: yaxis.name must not be used as axis label in line and bar plots if (type %in% c("b", "l", "e", "h")) { - attrs$yaxis.name = NULL + attrs$yaxis.name <- NULL } # prepare x-axis values for other types of plots @@ -315,7 +315,7 @@ getPlotColors <- function(ps, col, opacity, cgroup, colmap) { getConfidenceEllipse <- function(points, conf.level = 0.95, n = 100) { # compute igen vectors and values - e <- eigen(cov(points)); + e <- eigen(cov(points)) # get angle between the x-axis and the largest eigenvector phi <- atan2(e$vectors[[2]], e$vectors[[1]]) @@ -647,11 +647,6 @@ plotBars <- function(ps, col = ps$col, bwd = 0.8, border = NA, force.x.values = y <- ps$y_values[1, ] if (length(x) > 1) { - # this gives variable width for bars and does not work well - #bwd_left <- c(x[seq(2, length(x))] - x[seq(1, length(x) - 1)]) - #bwd_right <- -c(x[seq(1, length(x) - 1)] - x[seq(2, length(x))]) - #bwd_left <- c(bwd_left[1], bwd_left) * bwd / 2 - #bwd_right <- c(bwd_right, bwd_right[length(bwd_right)]) * bwd / 2 dx <- min(diff(x)) bwd_right <- bwd_left <- dx * bwd / 2 } else { diff --git a/R/pls.R b/R/pls.R index e750caf..9cb9cc7 100755 --- a/R/pls.R +++ b/R/pls.R @@ -125,6 +125,7 @@ #' \tabular{ll}{ #' \code{\link{plotPredictions.regmodel}} \tab shows predicted vs. measured plot.\cr #' \code{\link{plotRMSE.regmodel}} \tab shows RMSE plot.\cr +#' \code{\link{plotRMSERatio.regmodel}} \tab shows plot for ratio RMSECV/RMSEC values.\cr #' \code{\link{plotYResiduals.regmodel}} \tab shows residuals plot for y values.\cr #' \code{\link{getRegcoeffs.regmodel}} \tab returns matrix with regression coefficients.\cr #' } @@ -505,6 +506,185 @@ setDistanceLimits.pls <- function(obj, lim.type = obj$lim.type, alpha = obj$alph return(obj) } +#' Compute predictions for response values +#' +#' @param x +#' matrix with predictors, already preprocessed (e.g. mean centered) and cleaned +#' @param coeffs +#' array with regression coefficients +#' @param ycenter +#' `ycenter` property of PLS model +#' @param yscale +#' `yscale` property of PLS model +#' @param ynames +#' vector with names of the responses +#' @param y.attrs +#' list with response attributes (e.g. from reference values if any) +#' @param objnames +#' vector with names of objects (rows of x) +#' @param compnames +#' vector with names used for components +#' +#' @return array with predicted y-values +pls.getpredictions <- function(x, coeffs, ycenter, yscale, ynames = NULL, y.attrs = NULL, objnames = NULL, + compnames = NULL) { + + yp <- apply(coeffs, 3, function(x, y) (y %*% x), x) + dim(yp) <- c(nrow(x), dim(coeffs)[2], dim(coeffs)[3]) + + # unscale predicted y values + yp <- if (is.numeric(yscale)) sweep(yp, 3, yscale, "*") else yp + + # uncenter predicted y values + yp <- if (is.numeric(ycenter)) sweep(yp, 3, ycenter, "+") else yp + + # set up all attributes and names + yp <- mda.setattr(yp, y.attrs, "row") + attr(yp, "name") <- "Response values, predicted" + dimnames(yp) <- list(objnames, compnames, ynames) + + return(yp) +} + +#' Compute object with decomposition of y-values +#' +#' @param y +#' matrix with responses, already preprocessed (e.g. mean centered) and cleaned +#' @param yscores +#' matrix with Y-scores +#' @param xscores +#' matrix with X-scores +#' @param yloadings +#' matrix with Y-loadings +#' @param yeigenvals +#' matrix with eigenvalues for Y +#' @param ynames +#' vector with names of the responses +#' @param y.attrs +#' list with response attributes (e.g. from reference values if any) +#' @param x.attrs +#' list with preditors attributes +#' @param objnames +#' vector with names of objects (rows of x) +#' @param compnames +#' vector with names used for components +#' +#' @return array `ldecomp` object for y-values (or NULL if y is not provided) +pls.getydecomp <- function(y, yscores, xscores, yloadings, yeigenvals, ynames = NULL, y.attrs = NULL, + x.attrs = NULL, objnames = NULL, compnames = NULL) { + + # if reference y-values are not provided, no ydecomp can be computed + if (is.null(y)) return(NULL) + + # compute resuduals + yresiduals <- y - tcrossprod(xscores, yloadings) + + # set names + rownames(yscores) <- rownames(yresiduals) <- objnames + colnames(yscores) <- compnames + colnames(yresiduals) <- ynames + + # set attributes + yscores <- mda.setattr(yscores, y.attrs, "row") + yresiduals <- mda.setattr(yresiduals, y.attrs) + + # set attributes + yscores <- mda.setattr(yscores, x.attrs, "row") + yresiduals <- mda.setattr(yresiduals, y.attrs) + + attr(yscores, "name") <- "Y-scores" + attr(yscores, "xaxis.name") <- "Components" + attr(yresiduals, "name") <- "Residuals" + + # create ydecomp object (we use xscores as residuals for different components are computed + # as xscores %*% t(yloadings)), but then we assign correct residuals + ydecomp <- ldecomp(scores = xscores, loadings = yloadings, residuals = yresiduals, eigenvals = yeigenvals) + ydecomp$scores <- yscores + + return(ydecomp) +} + +#' Compute object with decomposition of x-values +#' +#' @param x +#' matrix with predictors, already preprocessed (e.g. mean centered) and cleaned +#' @param xscores +#' matrix with X-scores +#' @param xloadings +#' matrix with X-loadings +#' @param xeigenvals +#' matrix with eigenvalues for X +#' @param xnames +#' vector with names of the predictors +#' @param x.attrs +#' list with preditors attributes +#' @param objnames +#' vector with names of objects (rows of x) +#' @param compnames +#' vector with names used for components +#' +#' @return array `ldecomp` object for x-values +pls.getxdecomp <- function(x, xscores, xloadings, xeigenvals, xnames = NULL, x.attrs = NULL, objnames = NULL, + compnames = NULL) { + + # compute x-residuals + xresiduals <- x - tcrossprod(xscores, xloadings) + + # set attributes + xscores <- mda.setattr(xscores, x.attrs, "row") + xresiduals <- mda.setattr(xresiduals, x.attrs) + attr(xscores, "name") <- "X-scores" + attr(xscores, "xaxis.name") <- "Components" + attr(xresiduals, "name") <- "Residuals" + + # set names + rownames(xscores) <- rownames(xresiduals) <- objnames + colnames(xscores) <- compnames + colnames(xresiduals) <- xnames + + # create and return xdecomp object + xdecomp <- ldecomp(scores = xscores, residuals = xresiduals, loadings = xloadings, eigenvals = xeigenvals) + return(xdecomp) +} + +#' Compute matrix with X-scores +#' +#' @param x +#' matrix with predictors, already preprocessed and cleaned +#' @param weights +#' matrix with PLS weights +#' @param xloadings +#' matrix with X-loadings +#' +#' @return matrix with X-scores +pls.getxscores <- function(x, weights, xloadings) { + return(x %*% (weights %*% solve(crossprod(xloadings, weights)))) +} + +#' Compute and orthogonalize matrix with Y-scores +#' +#' @param y +#' matrix with response values, already preprocessed and cleaned +#' @param yloadings +#' matrix with Y-loadings +#' @param xscores +#' matrix with X-scores (needed for orthogonalization) +#' +#' @return matrix with Y-scores +pls.getyscores <- function(y, yloadings, xscores) { + + ncomp <- ncol(yloadings) + yscores <- as.matrix(y) %*% yloadings + if (ncomp < 2) return(yscores) + + # orthogonalize + for (a in 2:ncomp) { + yscores[, a] <- yscores[, a] - xscores[, 1:(a - 1), drop = FALSE] %*% + crossprod(xscores[, 1:(a - 1), drop = FALSE], yscores[, a]) + } + + return(yscores) +} #' PLS predictions #' #' @description @@ -533,11 +713,11 @@ predict.pls <- function(object, x, y = NULL, cv = FALSE, ...) { # get names prednames <- rownames(object$xloadings) respnames <- rownames(object$yloadings) + compnames <- colnames(object$xloadings) objnames <- rownames(x) # preprocess x and calculate scores, total and full variance x.attrs <- mda.getattr(x) - y.attrs <- mda.getattr(y) # set names for y-axis (rows if it is empty) if (is.null(x.attrs$yaxis.name)) { @@ -549,7 +729,6 @@ predict.pls <- function(object, x, y = NULL, cv = FALSE, ...) { # get dimensions nresp <- dim(object$coeffs$values)[3] - ncomp <- dim(object$coeffs$values)[2] # check dimensions of predictors if (ncol(x) != dim(object$coeffs$values)[1]) { @@ -559,29 +738,31 @@ predict.pls <- function(object, x, y = NULL, cv = FALSE, ...) { # autoscale x x <- prep.autoscale(x, center = object$xcenter, scale = object$xscale) - # compute x scores and residuals - xscores <- x %*% (object$weights %*% solve(crossprod(object$xloadings, object$weights))) - xresiduals <- x - tcrossprod(xscores, object$xloadings) + # get predicted y-values + yp <- pls.getpredictions(x, object$coeffs$values, object$ycenter, object$yscale, respnames, x.attrs, + objnames, compnames) - # set attributes - xscores <- mda.setattr(xscores, x.attrs, "row") - xresiduals <- mda.setattr(xresiduals, x.attrs) - attr(xscores, "name") <- "X-scores" - attr(xscores, "xaxis.name") <- "Components" - attr(xresiduals, "name") <- "Residuals" + # if predictions for cross-validation - return + if (cv) { + return(list(y.pred = yp)) + } - # set names - rownames(xscores) <- rownames(xresiduals) <- objnames - colnames(xscores) <- colnames(object$xloadings) - colnames(xresiduals) <- prednames + # compute xdecomp + xscores <- pls.getxscores(x, object$weights, object$xloadings) + xdecomp <- pls.getxdecomp(x, xscores, object$xloadings, object$xeigenvals, prednames, x.attrs, objnames, compnames) + xdecomp$ncomp.selected <- object$ncomp.selected + + # add u0 and Nu parameters as arguments, so the orthogonal distance values can be normalized + attr(xdecomp$Q, "u0") <- object$Qlim[3, ] + attr(xdecomp$Q, "Nu") <- object$Qlim[4, ] - # make predictions - yp <- apply(object$coeffs$values, 3, function(x, y)(y %*% x), x) - dim(yp) <- c(nrow(x), ncomp, dim(object$coeffs$values)[3]) + # add u0 and Nu parameters as arguments, so the score distance values can be normalized + attr(xdecomp$T2, "u0") <- object$T2lim[3, ] + attr(xdecomp$T2, "Nu") <- object$T2lim[4, ] - # if reference values are provided calculate and set up names for ydecomp - y.ref <- NULL + # compute ydecomp if y-values are provided ydecomp <- NULL + y.ref <- y if (!is.null(y)) { if (is.null(dim(y))) dim(y) <- c(length(y), 1) @@ -594,80 +775,24 @@ predict.pls <- function(object, x, y = NULL, cv = FALSE, ...) { stop("Wrong number of columns in matrix with response values (y).") } - # keep the original y values as reference - y.ref <- y + y.attrs <- mda.getattr(y) + y.attrs$exclrows <- x.attrs$exclrows # autoscale y-values y <- prep.autoscale(y, center = object$ycenter, scale = object$yscale) + yscores <- pls.getyscores(as.matrix(y), object$yloadings, xscores) - # compute and orthogonalize y-scores - yscores <- as.matrix(y) %*% object$yloadings - for (a in seq_len(ncomp)) { - for (n in 1:2) { - for (j in seq_len(a - 1)) { - yscores[, a] <- yscores[, a] - - tcrossprod(xscores[, j], yscores[, a]) %*% xscores[, j] - } - } - } + # below we use xdecomp$scores instead of xscores to provide all names and attributes + ydecomp <- pls.getydecomp(y, yscores, xdecomp$scores, object$yloadings, object$yeigenvals, + respnames, y.attrs, x.attrs, objnames, compnames) + ydecomp$ncomp.selected <- object$ncomp.selected - # compute y-residuals - yresiduals <- y - yp[, ncomp, ] - - # set names - rownames(yscores) <- rownames(yresiduals) <- objnames - colnames(yscores) <- colnames(object$yloadings) - colnames(yresiduals) <- respnames - - # set attributes - yscores <- mda.setattr(yscores, x.attrs, "row") - yresiduals <- mda.setattr(yresiduals, y.attrs) - attr(yscores, "exclrows") <- attr(yresiduals, "exclrows") <- x.attrs$exclrows - attr(yscores, "name") <- "Y-scores" - attr(yscores, "xaxis.name") <- "Components" - attr(yresiduals, "name") <- "Residuals" - - # create ydecomp object (we use xscores as residuals for different components are computed - # as xscores %*% t(yloadings)), but then we assign correct residuals - ydecomp <- ldecomp(scores = xscores, loadings = object$yloadings, residuals = yresiduals, - eigenvals = object$yeigenvals, ncomp.selected = object$ncomp.selected) - ydecomp$scores <- yscores + # add u0 and Nu parameters as arguments, so the z-distance values can be normalized attr(ydecomp$Q, "u0") <- object$Zlim[3, ] attr(ydecomp$Q, "Nu") <- object$Zlim[4, ] } - # unscale predicted y values - yp <- if (is.numeric(object$yscale)) sweep(yp, 3, object$yscale, "*") else yp - - # uncenter predicted y values - yp <- if (is.numeric(object$ycenter)) sweep(yp, 3, object$ycenter, "+") else yp - - # if predictions for cross-validation - return - if (cv) { - return(list(y.pred = yp)) - } - - # set up all attributes and names - yp <- mda.setattr(yp, x.attrs, "row") - attr(yp, "exclrows") <- x.attrs$exclrows - attr(yp, "name") <- "Response values, predicted" - dimnames(yp) <- c(list(rownames(x)), dimnames(object$coeffs$values)[2:3]) - - # create xdecomp object - xdecomp <- ldecomp(scores = xscores, residuals = xresiduals, loadings = object$xloadings, - eigenvals = object$xeigenvals, ncomp.selected = object$ncomp.selected) - - # add u0 and Nu parameters as arguments, so the residuals can be normalized - attr(xdecomp$Q, "u0") <- object$Qlim[3, ] - attr(xdecomp$Q, "Nu") <- object$Qlim[4, ] - - attr(xdecomp$T2, "u0") <- object$T2lim[3, ] - attr(xdecomp$T2, "Nu") <- object$T2lim[4, ] - - return( - plsres(yp, y.ref = y.ref, ncomp.selected = object$ncomp.selected, - xdecomp = xdecomp, ydecomp = ydecomp) - ) + return(plsres(yp, y.ref = y.ref, ncomp.selected = object$ncomp.selected, xdecomp = xdecomp, ydecomp = ydecomp)) } #' Categorize data rows based on PLS results and critical limits for total distance. @@ -797,7 +922,7 @@ summary.pls <- function(object, ncomp = object$ncomp.selected, if (!any(is.na(out[, 1:4]))) out[, 1:4] <- round(out[, 1:4], 3) out[, 5] <- round(out[, 5], 3) - out[, 6] <- mdaplot.formatValues(out[, 6], round.only = T) + out[, 6] <- mdaplot.formatValues(out[, 6], round.only = TRUE) out[, 7] <- round(out[, 7], 3) out[, 8] <- round(out[, 8], 4) out[, 9] <- round(out[, 9], 2) @@ -984,9 +1109,12 @@ plotVariance.pls <- function(obj, decomp = "xdecomp", variance = "expvar", type #' See examples in help for \code{\link{pls}} function. #' #' @export -plotXScores.pls <- function(obj, comp = c(1, 2), show.axes = T, main = "Scores (X)", +plotXScores.pls <- function(obj, comp = if (obj$ncomp > 1) c(1, 2) else 1, show.axes = TRUE, main = "Scores (X)", res = obj$res, ...) { + if (min(comp) < 1 || max(comp) > ncol(obj$weights)) { + stop("Wrong values for 'comp' parameter.") + } # set up values for showing axes lines show.lines <- FALSE @@ -1018,7 +1146,7 @@ plotXScores.pls <- function(obj, comp = c(1, 2), show.axes = T, main = "Scores #' See examples in help for \code{\link{pls}} function. #' #' @export -plotXYScores.pls <- function(obj, ncomp = 1, show.axes = T, res = obj$res, ...) { +plotXYScores.pls <- function(obj, ncomp = 1, show.axes = TRUE, res = obj$res, ...) { show.lines <- if (show.axes) c(0, 0) else FALSE plot_data <- lapply(res, plotXYScores, ncomp = ncomp, type = "p", show.plot = FALSE) @@ -1220,9 +1348,13 @@ plotXYResiduals.pls <- function(obj, ncomp = obj$ncomp.selected, norm = TRUE, lo #' See examples in help for \code{\link{pls}} function. #' #' @export -plotXLoadings.pls <- function(obj, comp = c(1, 2), type = "p", show.axes = TRUE, +plotXLoadings.pls <- function(obj, comp = if (obj$ncomp > 1) c(1, 2) else 1, type = "p", show.axes = TRUE, show.legend = TRUE, ...) { + if (min(comp) < 1 || max(comp) > ncol(obj$weights)) { + stop("Wrong values for 'comp' parameter.") + } + plot_data <- mda.subset(obj$xloadings, select = comp) colnames(plot_data) <- sprintf("Comp %d (%.2f%%)", comp, obj$res[["cal"]]$xdecomp$expvar[comp]) attr(plot_data, "name") <- "Loadings (X)" @@ -1474,6 +1606,87 @@ pls.simpls <- function(x, y, ncomp, cv = FALSE) { x <- as.matrix(x) y <- as.matrix(y) + nobj <- nrow(x) + npred <- ncol(x) + nresp <- ncol(y) + + # initial estimation + S <- crossprod(x, y) + M <- crossprod(x) + + # prepare space for results + B <- array(0, dim = c(npred, ncomp, nresp)) + Q <- matrix(0, nrow = nresp, ncol = ncomp) + R <- V <- P <- matrix(0, nrow = npred, ncol = ncomp) + TT <- U <- matrix(0, nrow = nobj, ncol = ncomp) + + + # loop for each components + for (a in seq_len(ncomp)) { + + r <- svd(S, nu = 1, nv = 0)$u + t <- x %*% r + + tnorm <- sqrt(sum(t * t)) + t <- t / tnorm + r <- r / tnorm + + p <- crossprod(x, t) + q <- crossprod(y, t) + u <- y %*% q + v <- p + + if (a > 1) { + v <- v - V %*% crossprod(V, p) + u <- u - TT %*% crossprod(TT, u) + } + + v <- v / sqrt(sum(v * v)) + + R[, a] <- r + V[, a] <- v + P[, a] <- p + TT[, a] <- t + U[, a] <- u + Q[, a] <- q + + # coefficients are computed for each a from 1 to A + B[, a, ] <- tcrossprod(R[, seq_len(a), drop = FALSE], Q[, seq_len(a), drop = FALSE]) + + M <- M - tcrossprod(p) + S <- S - v %*% crossprod(v, S) + } + + return(list(coeffs = B, weights = R, xloadings = P, xscores = TT, yloadings = Q, yscores = U, ncomp = a)) +} + +#' SIMPLS algorithm (old implementation) +#' +#' @description +#' SIMPLS algorithm for calibration of PLS model (old version) +#' +#' @param x +#' a matrix with x values (predictors) +#' @param y +#' a matrix with y values (responses) +#' @param ncomp +#' number of components to calculate +#' @param cv +#' logical, is model calibrated during cross-validation or not +#' +#' @return +#' a list with computed regression coefficients, loadings and scores for x and y matrices, +#' and weights. +#' +#' @references +#' [1]. S. de Jong. SIMPLS: An Alternative approach to partial least squares regression. +#' Chemometrics and Intelligent Laboratory Systems, 18, 1993 (251-263). +#' +pls.simplsold <- function(x, y, ncomp, cv = FALSE) { + + x <- as.matrix(x) + y <- as.matrix(y) + npred <- ncol(x) nresp <- ncol(y) @@ -1653,7 +1866,7 @@ pls.cal <- function(x, y, ncomp, center, scale, method = "simpls", cv = FALSE) { # find maximum number of objects in a segment nobj.cv <- 0 if (!is.logical(cv) && !is.null(cv)) { - nseg <- if (is.numeric(cv)) cv else cv[[2]] + nseg <- max(crossval(cv, xc.nrows)) nobj.cv <- if (nseg == 1) 1 else ceiling(xc.nrows / nseg) # we set cv to FALSE so fitting knows that it is not a part of cross-validation @@ -1666,7 +1879,7 @@ pls.cal <- function(x, y, ncomp, center, scale, method = "simpls", cv = FALSE) { # correct maximum number of components ncomp <- min(xc.ncols, xc.nrows - 1 - nobj.cv, ncomp) - # fit model + # fit the model fit <- pls.run(x, y, method = method, ncomp = ncomp, cv = cv) model$ncomp <- ncomp <- fit$ncomp @@ -1678,14 +1891,11 @@ pls.cal <- function(x, y, ncomp, center, scale, method = "simpls", cv = FALSE) { return(model) } - # compute x-scores and residuals - xscores <- x %*% (fit$weights %*% solve(crossprod(fit$xloadings, fit$weights))) - yscores <- as.matrix(y) %*% fit$yloadings # compute eigenvalues - xeigenvals <- colSums(xscores^2) / (xc.nrows - 1) + xeigenvals <- colSums(fit$xscores^2) / (xc.nrows - 1) attr(xeigenvals, "DoF") <- (xc.nrows - 1) - yeigenvals <- colSums(yscores^2) / (xc.nrows - 1) + yeigenvals <- colSums(fit$yscores^2) / (xc.nrows - 1) attr(yeigenvals, "DoF") <- (xc.nrows - 1) # correct results related to predictors for missing columns in x @@ -1790,12 +2000,12 @@ vipscores <- function(obj, ncomp = obj$ncomp.selected) { # subset needed model parameters comp <- seq_len(ncomp) weights <- obj$weights[, comp, drop = FALSE] - yloads <- obj$yloadings[, comp, drop = FALSE]; + yloads <- obj$yloadings[, comp, drop = FALSE] - # get eigenvalues and multiply them to degrees of freedom + # get eigenvalues xeigenvals <- obj$xeigenvals[comp] - # get number and indices of variables and adjust dimension for regcoeffs + # get number and indices of variables nvar <- nrow(weights) var_ind <- seq_len(nvar) @@ -1808,12 +2018,12 @@ vipscores <- function(obj, ncomp = obj$ncomp.selected) { # prepare matrix for vipscores vipscores <- matrix(0, nrow = nvar, ncol = nrow(yloads)) - # normalize scores - wnorm <- weights %*% diag(1 / sqrt(colSums(weights^2)), nrow = ncomp, ncol = ncomp) + # normalize weights + wnorm <- sweep(weights, 2, sqrt(colSums(weights^2)), "/") # compute sum of squares for explained y variance and normalize it ssq <- yloads^2 %*% diag(xeigenvals, nrow = ncomp, ncol = ncomp) - ssq <- ssq %*% diag(1 / rowSums(ssq), nrow = ncomp, ncol = ncomp) + ssq <- sweep(ssq, 1, rowSums(ssq), "/") # compute VIP scores vipscores[var_ind, ] <- sqrt(nvar * wnorm^2 %*% t(ssq)) @@ -1908,17 +2118,18 @@ selratio <- function(obj, ncomp = obj$ncomp.selected) { # prepare matrix for vipscores selratio <- matrix(0, nrow = nvar, ncol = nresp) - # get norm value for regression coefficients - bnorm <- sqrt(colSums(coeffs^2)) - - # compute target projections - ttp <- x %*% (coeffs %*% diag(1 / bnorm, nrow = nresp, ncol = nresp)) - ptp <- t(crossprod(x, ttp) %*% diag(1 / colSums(ttp^2), nrow = nresp, ncol = nresp)) - # compute selectivity ratio for (y in seq_len(nresp)) { - expvar <- ttp[, y, drop = FALSE] %*% ptp[y, , drop = FALSE] - selratio[var_ind, y] <- colSums(expvar^2) / colSums((x - expvar)^2) + b <- coeffs[, y, drop = FALSE] / sqrt(sum(coeffs[, y]^2)) + t <- x %*% b + p <- crossprod(t, x) / sum(t * t) + + exp <- t %*% p + res <- x - exp + expvar <- colSums(exp^2) + resvar <- colSums(res^2) + resvar[resvar < .Machine$double.eps] <- 1 + selratio[var_ind, y] <- expvar / resvar } rownames(selratio) <- rownames(obj$xloadings) diff --git a/R/plsdares.R b/R/plsdares.R index 1548f06..e999aeb 100755 --- a/R/plsdares.R +++ b/R/plsdares.R @@ -213,7 +213,7 @@ as.matrix.plsdares <- function(x, ncomp = NULL, nc = 1, ...) { #' #' @export summary.plsdares <- function(object, nc = seq_len(object$nclasses), ...) { - cat("\nPLS-DA results (class plsdares) summary:\n"); + cat("\nPLS-DA results (class plsdares) summary:\n") fprintf("Number of selected components: %.0f\n", object$ncomp.selected) if (is.null(object$c.ref)) { diff --git a/R/plsres.R b/R/plsres.R index 12a4993..2063871 100755 --- a/R/plsres.R +++ b/R/plsres.R @@ -226,7 +226,7 @@ summary.plsres <- function(object, ny = seq_len(object$nresp), ncomp = NULL, ... out <- as.matrix.plsres(object, ny = y, ncomp = ncomp) if (!any(is.na(out[, 1:4]))) out[, 1:4] <- round(out[, 1:4], 3) out[, 5] <- round(out[, 5], 3) - out[, 6] <- mdaplot.formatValues(out[, 6], round.only = T) + out[, 6] <- mdaplot.formatValues(out[, 6], round.only = TRUE) out[, 7] <- round(out[, 7], 3) out[, 8] <- round(out[, 8], 4) out[, 9] <- round(out[, 9], 2) diff --git a/R/prep.R b/R/prep.R index ef43e85..2d31fff 100755 --- a/R/prep.R +++ b/R/prep.R @@ -108,10 +108,12 @@ prep.snv <- function(data) { #' @param data #' a matrix with data values #' @param type -#' type of normalization \code{"area"}, \code{"length"}, \code{"sum"}, \code{"snv"}, or \code{"is"}. +#' type of normalization \code{"area"}, \code{"length"}, \code{"sum"}, \code{"snv"}, \code{"is"}, or \code{"pqn"}. #' @param col.ind #' indices of columns (can be either integer or logical valuws) for normalization to internal #' standard peak. +#' @param ref.spectrum +#' reference spectrum for PQN normalization, if not provided a mean spectrum for data is used #' #' @details #' The \code{"area"}, \code{"length"}, \code{"sum"} types do preprocessing to unit area (sum of @@ -122,15 +124,25 @@ prep.snv <- function(data) { #' of this peak. If `col.ind` points on several adjucent vales, the rows are normalized to the area #' under the peak - sum of the intensities. #' +#' The \code{"pqn"} is Probabilistic Quotient Normalization as described in [1]. In this case you also +#' need to provide a reference spectrum (e.g. mean or median of spectra for some reference samples). If +#' reference spectrum is not provided it will be computed as mean of the spectra to be +#' preprocessed (parameter \code{data}). +#' +#' @references +#' 1. F. Dieterle, A. Ross, H. Senn. Probabilistic Quotient Normalization as Robust Method to +#' Account for Dilution of Complex Biological Mixtures. Application in 1 H NMR Metabonomics. +#' Anal. Chem. 2006, 78, 4281–4290. +#' #' @return #' data matrix with normalized values #' #' @export -prep.norm <- function(data, type = "area", col.ind = NULL) { +prep.norm <- function(data, type = "area", col.ind = NULL, ref.spectrum = NULL) { if (type == "snv") return(prep.snv(data)) - if (type == "is" && is.null(col.ind) ) { + if (type == "is" && is.null(col.ind)) { stop("For 'is' normalization you need to provide indices for IS peak.") } @@ -142,21 +154,45 @@ prep.norm <- function(data, type = "area", col.ind = NULL) { stop("Values for 'col.ind' seem to be wrong.") } - f <- function(data, type, col.ind) { + if (type == "pqn" && is.null(ref.spectrum)) { + ref.spectrum <- apply(data, 2, mean) + } + + pqn <- function(data, ref.spectrum) { + + if (length(ref.spectrum) != ncol(data)) { + stop("prep.norm: 'ref.spectrum' should have the same number of values as the number of columns in 'data'.") + } + + # 1. unit area normalization + ref.spectrum <- as.numeric(ref.spectrum) + ref.spectrum <- ref.spectrum / sum(abs(ref.spectrum)) + + # 2. compute and return median quotients for each spectrum + return(apply(sweep(data, 2, ref.spectrum, "/"), 1, median)) + } + + f <- function(data, type, col.ind, ref.spectrum) { + + # preliminary normalize the dataset to unit sum + if (type == "pqn") { + data <- prep.norm(data, type = "area") + } w <- switch( type, "area" = apply(abs(data), 1, sum), "length" = sqrt(apply(data^2, 1, sum)), "sum" = apply(data, 1, sum), - "is" = apply(data[, col.ind, drop = FALSE], 1, sum) + "is" = apply(data[, col.ind, drop = FALSE], 1, sum), + "pqn" = pqn(data, ref.spectrum) ) if (is.null(w)) stop("Wrong value for argument 'type'.") return(sweep(data, 1, w, "/")) } - return(prep.generic(data, f, type = type, col.ind = col.ind)) + return(prep.generic(data, f, type = type, col.ind = col.ind, ref.spectrum = ref.spectrum)) } #' Savytzky-Golay filter @@ -193,8 +229,8 @@ prep.savgol <- function(data, width = 3, porder = 1, dorder = 0) { # compute grams polynomials gram <- function(i, m, k, s) { if (k > 0) { - return( (4 * k - 2) / (k * (2 * m - k + 1)) * (i * gram(i, m, k - 1, s) + s * gram(i, m, k - 1, s - 1)) - - ((k - 1) * (2 * m + k)) / (k * (2 * m - k + 1)) * gram(i, m, k - 2, s) ) + return((4 * k - 2) / (k * (2 * m - k + 1)) * (i * gram(i, m, k - 1, s) + s * gram(i, m, k - 1, s - 1)) - + ((k - 1) * (2 * m + k)) / (k * (2 * m - k + 1)) * gram(i, m, k - 2, s)) } if (k == 0 && s == 0) return(1) return(0) @@ -202,11 +238,11 @@ prep.savgol <- function(data, width = 3, porder = 1, dorder = 0) { # compute generalized factorial genfact <- function(a, b) { - f <- 1; + f <- 1 if ((a - b + 1) > a) return(f) for (i in (a - b + 1):a) { - f <- f * i; + f <- f * i } return(f) @@ -216,33 +252,29 @@ prep.savgol <- function(data, width = 3, porder = 1, dorder = 0) { weight <- function(i, t, m, n, s) { sum <- 0 for (k in 0:n) { - sum <- sum + (2 * k + 1) * (genfact(2 * m, k) / genfact(2 * m + k + 1, k + 1)) * gram(i, m, k, 0) * gram(t, m, k, s) + sum <- sum + (2 * k + 1) * (genfact(2 * m, k) / genfact(2 * m + k + 1, k + 1)) * + gram(i, m, k, 0) * gram(t, m, k, s) } return(sum) } - f <- function(data, width, porder, dorder) { + f <- function(x, width, porder, dorder) { - nobj <- nrow(data) - nvar <- ncol(data) - pdata <- matrix(0, ncol = nvar, nrow = nobj) - - m <- (width - 1) / 2 + nvar <- ncol(x) + px <- matrix(0.0, nrow(x), ncol(x)) + m <- round((width - 1) / 2) w <- outer(-m:m, -m:m, function(x, y) weight(x, y, m, porder, dorder)) - n <- ncol(data) - - for (j in seq_len(nobj)) { - for (i in 1:m) { - pdata[j, i] <- convolve(data[j, 1:(2 * m + 1)], w[, i], type = "filter") - pdata[j, n - i + 1] <- convolve(data[j, (n - 2 * m):n], w[, width - i + 1], type = "filter") - } - for (i in (m+1):(n-m)) { - pdata[j, i] <- convolve(data[j, (i - m):(i + m)], w[, m + 1], type = "filter") - } + for (i in seq_len(m)) { + px[, i] <- apply(x[, seq_len(2 * m + 1), drop = FALSE], 1, + function(xx) convolve(xx, w[, i], type = "filter")[1]) + px[, nvar - i + 1] <- apply(x[, (nvar - 2 * m):nvar, drop = FALSE], 1, + function(xx) convolve(xx, w[, width - i + 1], type = "filter")[1]) } - return(pdata) + px[, (m + 1):(nvar - m)] <- t(apply(x, 1, function(xx) convolve(xx, w[, m + 1], type = "filter"))) + + return(px) } return(prep.generic(data, f, width = width, porder = porder, dorder = dorder)) @@ -333,7 +365,7 @@ prep.ref2km <- function(data) { return(prep.generic(data, f)) } -#' Baseline correction using assymetric least squares +#' Baseline correction using asymetric least squares #' #' @param data #' matrix with spectra (rows correspond to individual spectra) @@ -372,34 +404,34 @@ prep.ref2km <- function(data) { #' } #' #' @importFrom Matrix Matrix Diagonal +#' @importFrom methods as #' #' @export prep.alsbasecorr <- function(data, plambda = 5, p = 0.1, max.niter = 10) { attrs <- mda.getattr(data) dimnames <- dimnames(data) - baseline <- function(y) { + m <- ncol(data) + baseline <- matrix(0, nrow(data), ncol(data)) + LDD <- Matrix::Matrix((10^plambda) * crossprod(diff(diag(m), difference = 2)), sparse = TRUE) + w.ini <- matrix(rep(1, m)) - m <- length(y) - D <- Matrix::Matrix(diff(diag(m), difference = 2), sparse = TRUE) - LDD <- (10^plambda) * Matrix::crossprod(D) - w <- Matrix::Matrix(1, nrow = m, ncol = 1, sparse = TRUE) - - for (i in seq_len(max.niter)) { + for (i in seq_len(nrow(data))) { + y <- data[i, ] + w <- w.ini + for (j in seq_len(max.niter)) { W <- Matrix::Diagonal(x = as.numeric(w)) - z <- Matrix::solve(W + LDD, w * y) + z <- Matrix::solve(as(W + LDD, "dgCMatrix"), w * y, sparse = TRUE) w.old <- w w <- p * (y > z) + (1 - p) * (y < z) - if (sum(abs(w - w.old)) < 10^-12) { - break - } + if (sum(abs(w - w.old)) < 10^-10) break } - return(as.numeric(z)) + baseline[i, ] <- as.numeric(z) } - pspectra <- t(apply(data, 1, function(x) x - baseline(x))) + pspectra <- data - baseline pspectra <- mda.setattr(pspectra, attrs) dimnames(pspectra) <- dimnames @@ -558,7 +590,7 @@ getImplementedPrepMethods <- function() { method = prep.norm, params = list(type = "area", col.ind = NULL), params.info = list( - type = "type of normalization ('area', 'sum', 'length', 'is', 'snv').", + type = "type of normalization ('area', 'sum', 'length', 'is', 'snv', 'pqn').", col.ind = "indices of columns (variables) for normalization to internal standard peak." ), info = "Normalization." @@ -709,7 +741,8 @@ prep <- function(name, params = NULL, method = NULL) { #' @export employ.prep <- function(obj, x, ...) { - stopifnot("employ.prep: the first argument must be a list with preprocessing methods" = is.list(obj) && class(obj[[1]]) == "prep") + stopifnot("employ.prep: the first argument must be a list with preprocessing methods" = + is.list(obj) && class(obj[[1]]) == "prep") for (p in obj) { x <- do.call(p$method, c(list(data = x), p$params)) } diff --git a/R/randtest.R b/R/randtest.R index c15a651..7c16f7a 100755 --- a/R/randtest.R +++ b/R/randtest.R @@ -92,7 +92,7 @@ #' par( mfrow = c(1, 1)) #' #' @export -randtest <- function(x, y, ncomp = 15, center = T, scale = F, nperm = 1000, sig.level = 0.05, +randtest <- function(x, y, ncomp = 15, center = TRUE, scale = FALSE, nperm = 1000, sig.level = 0.05, silent = TRUE, exclcols = NULL, exclrows = NULL) { x <- as.matrix(x) y <- as.matrix(y) @@ -100,14 +100,14 @@ randtest <- function(x, y, ncomp = 15, center = T, scale = F, nperm = 1000, sig. # remove excluded columns and rows if (length(exclcols) > 0) { x <- mda.exclcols(x, exclcols) - x <- x[, -attr(x, "exclcols"), drop = F] + x <- x[, -attr(x, "exclcols"), drop = FALSE] } if (length(exclrows) > 0) { x <- mda.exclrows(x, exclrows) exclrows <- attr(x, "exclrows") - x <- x[-exclrows, , drop = F] - y <- y[-exclrows, , drop = F] + x <- x[-exclrows, , drop = FALSE] + y <- y[-exclrows, , drop = FALSE] } nobj <- nrow(x) diff --git a/R/regcoeffs.R b/R/regcoeffs.R index 2fbad97..6dfd49b 100755 --- a/R/regcoeffs.R +++ b/R/regcoeffs.R @@ -165,7 +165,7 @@ summary.regcoeffs <- function(object, ncomp = 1, ny = 1, alpha = 0.05, ...) { } attrs <- mda.getattr(object$values) - coeffs <- object$values[, ncomp, ny, drop = F] + coeffs <- object$values[, ncomp, ny, drop = FALSE] dim(coeffs) <- c(dim(object$values)[1], 1) colnames(coeffs)[1] <- "Coeffs" if (!is.null(object$se)) { @@ -382,5 +382,5 @@ plot.regcoeffs <- function(x, ncomp = 1, ny = 1, type = (if (x$nvar > 30) "l" el err <- (ci[, 2] - ci[, 1]) / 2 mdaplotg(list(plot_data, mda.rbind(plot_data, err)), type = c(type, "e"), show.legend = FALSE, - col = col[c(2, 1)], show.grid = T, show.lines = show.lines, ylab = ylab, ...) + col = col[c(2, 1)], show.grid = TRUE, show.lines = show.lines, ylab = ylab, ...) } diff --git a/R/regmodel.R b/R/regmodel.R index cc33657..32dc541 100755 --- a/R/regmodel.R +++ b/R/regmodel.R @@ -56,8 +56,8 @@ crossval.regmodel <- function(obj, x, y, cv, cal.fun) { # get matrix with indices for cv segments cv_ind <- crossval(cv, nobj = nobj, resp = y[, 1]) - nseg <- nrow(cv_ind); - nrep <- dim(cv_ind)[3] + nseg <- max(cv_ind) + nrep <- ncol(cv_ind) # prepare arrays for results yp.cv <- array(0, dim = c(nobj, ncomp, nresp)) @@ -66,7 +66,7 @@ crossval.regmodel <- function(obj, x, y, cv, cal.fun) { # loop over segments and repetitions for (ir in seq_len(nrep)) { for (is in seq_len(nseg)) { - ind <- na.exclude(cv_ind[is, , ir]) + ind <- which(cv_ind[, ir] == is) if (length(ind) == 0) next xc <- x[-ind, , drop = FALSE] @@ -231,7 +231,7 @@ summary.regmodel <- function(object, ncomp = object$ncomp.selected, rownames(sum_data) <- capitalize(names(res)) sum_data[, "R2"] <- round(sum_data[, "R2"], 3) - sum_data[, "RMSE"] <- mdaplot.formatValues(sum_data[, "RMSE"], round.only = T) + sum_data[, "RMSE"] <- mdaplot.formatValues(sum_data[, "RMSE"], round.only = TRUE) sum_data[, "Slope"] <- round(sum_data[, "Slope"], 3) sum_data[, "Bias"] <- round(sum_data[, "Bias"], 4) sum_data[, "RPD"] <- round(sum_data[, "RPD"], 1) @@ -301,6 +301,49 @@ plotRMSE.regmodel <- function(obj, ny = 1, type = "b", labels = "values", mdaplotg(plot_data, type = type, xticks = xticks, labels = labels, ylab = ylab, ...) } + +#' RMSECV/RMSEC ratio plot for regression model +#' +#' @description +#' Shows plot with RMSECV/RMSEC values vs. RMSECV for each component. +#' +#' @param obj +#' a regression model (object of class \code{regmodel}) +#' @param ny +#' number of response variable to make the plot for (if y is multivariate) +#' @param type +#' type of the plot (use only "b" or "l") +#' @param show.labels +#' logical, show or not labels for plot points +#' @param labels +#' vector with point labels (by default number of components) +#' @param main +#' main plot title +#' @param xlab +#' label for x-axis +#' @param ylab +#' label for y-axis +#' @param ... +#' other plot parameters (see \code{mdaplot} for details) +#' +#' @export +plotRMSERatio.regmodel <- function(obj, ny = 1, type = "b", show.labels = TRUE, labels = seq_len(obj$ncomp), + main = paste0("RMSECV/RMSEC ratio (", obj$res$cal$respnames[ny], ")"), + ylab = "RMSECV/RMSEC ratio", + xlab = "RMSECV", ...) { + + stopifnot("Cross-validation results are not found." = !is.null(obj$res$cv)) + stopifnot("Parameter 'ny' has a wrong value." = (length(ny) == 1 && ny >= 1 && ny <= nrow(obj$res$cal$rmse))) + + plot_data <- matrix(obj$res$cv$rmse[ny, ] / obj$res$cal$rmse[ny, ], nrow = 1) + attr(plot_data, "xaxis.values") <- obj$res$cv$rmse[ny, ] + attr(plot_data, "xaxis.name") <- xlab + + mdaplot(plot_data, type = type, xlab = xlab, ylab = ylab, main = main, show.labels = show.labels, + labels = labels, ...) +} + + #' Predictions plot for regression model #' #' @description diff --git a/R/regres.R b/R/regres.R index f2b882f..5071a28 100755 --- a/R/regres.R +++ b/R/regres.R @@ -161,8 +161,8 @@ regres.getPerformanceStats <- function(y.pred, y.ref) { # remove excluded rows so they are not counted # when calculating statistics if (length(attrs$exclrows) > 0) { - y.pred <- y.pred[-attrs$exclrows, , , drop = F] - y.ref <- y.ref[-attrs$exclrows, , drop = F] + y.pred <- y.pred[-attrs$exclrows, , , drop = FALSE] + y.ref <- y.ref[-attrs$exclrows, , drop = FALSE] } # residuals (errors) based statistics @@ -234,7 +234,7 @@ regres.err <- function(y.pred, y.ref) { #' #' @export regres.r2 <- function(err, ytot) { - r2 <- t(1 - scale(colSums(err^2), center = F, scale = ytot)) + r2 <- t(1 - scale(colSums(err^2), center = FALSE, scale = ytot)) return(regress.addattrs(r2, attributes(err), "Coefficient of determination")) } diff --git a/R/simca.R b/R/simca.R index 44812d5..91b241c 100755 --- a/R/simca.R +++ b/R/simca.R @@ -299,44 +299,40 @@ crossval.simca <- function(obj, x, cv) { # remove excluded rows if (length(attrs$exclrows) > 0) { - x <- x[-attrs$exclrows, , drop = F] + x <- x[-attrs$exclrows, , drop = FALSE] } # remove excluded columns if (length(attrs$exclcols) > 0) { - x <- x[, -attrs$exclcols, drop = F] + x <- x[, -attrs$exclcols, drop = FALSE] } # get matrix with indices for cv segments nobj <- nrow(x) idx <- crossval(cv, nobj) - nseg <- nrow(idx); - nrep <- dim(idx)[3] + nseg <- max(idx) + nrep <- ncol(idx) p.pred <- array(0, dim = c(nobj, ncomp, 1)) - # loop over segments for (iRep in seq_len(nrep)) { for (iSeg in seq_len(nseg)) { - ind <- na.exclude(idx[iSeg, , iRep]) - - if (length(ind) > 0) { - x.cal <- x[-ind, , drop = F] - x.val <- x[ind, , drop = F] - - # calibrate PCA model and set distance limits - m.loc <- pca(x.cal, obj$ncomp, center = obj$center, scale = obj$scale, - method = obj$method, rand = obj$rand, lim.type = obj$lim.type, alpha = obj$alpha, - gamma = obj$gamma) - - # make prediction for validation subset - res.loc <- predict.pca(m.loc, x.val) - - # compute and save probabilities - for (i in seq_len(obj$ncomp)) { - p.pred[ind, i, 1] <- p.pred[ind, i, 1] + - getProbabilities.simca(m.loc, i, res.loc$Q[, i], res.loc$T2[, i]) - } + ind <- which(idx[, iRep] == iSeg) + x.cal <- x[-ind, , drop = FALSE] + x.val <- x[ind, , drop = FALSE] + + # calibrate PCA model and set distance limits + m.loc <- pca(x.cal, obj$ncomp, center = obj$center, scale = obj$scale, + method = obj$method, rand = obj$rand, lim.type = obj$lim.type, alpha = obj$alpha, + gamma = obj$gamma) + + # make prediction for validation subset + res.loc <- predict.pca(m.loc, x.val) + + # compute and save probabilities + for (i in seq_len(obj$ncomp)) { + p.pred[ind, i, 1] <- p.pred[ind, i, 1] + + getProbabilities.simca(m.loc, i, res.loc$Q[, i], res.loc$T2[, i]) } } } diff --git a/R/simcam.R b/R/simcam.R index af7153a..5365d55 100755 --- a/R/simcam.R +++ b/R/simcam.R @@ -432,7 +432,7 @@ plotModelDistance.simcam <- function(obj, nc = 1, type = "h", xticks = seq_len(o stop("Wrong values for 'nc' parameter.") } - mdaplot(mda.t(obj$moddist[, nc, drop = F]), type = type, xticks = xticks, + mdaplot(mda.t(obj$moddist[, nc, drop = FALSE]), type = type, xticks = xticks, xticklabels = xticklabels, main = main, xlab = xlab, ylab = ylab, ...) } diff --git a/README.md b/README.md index 4b0f0f2..9221516 100755 --- a/README.md +++ b/README.md @@ -3,7 +3,6 @@ Multivariate Data Analysis Tools ![GitHub build status](https://github.com/svkucheryavski/mdatools/workflows/R-CMD-check/badge.svg) ![GitHub All Releases](https://img.shields.io/github/downloads/svkucheryavski/mdatools/total?color=blue&logo=Github "Downloads from GitHub") ![Downloads (CRAN)](https://cranlogs.r-pkg.org/badges/grand-total/mdatools?color=blue&logo=R&style=flat-square "Downloads from CRAN") -[![codecov](https://codecov.io/gh/svkucheryavski/mdatools/branch/0.10.0/graph/badge.svg?style=flat-square)](https://codecov.io/gh/svkucheryavski/mdatools) @@ -19,16 +18,16 @@ If you want to cite the package, please use the following: Sergey Kucheryavskiy, What is new ----------- -Latest release (0.12.0) is available both from GitHub and CRAN. You can see the full list of changes [here](NEWS.md). The Bookdown tutorial has been also updated and contains the description of new methods added in the last release. +Latest release (0.13.0) is available both from GitHub and CRAN. You can see the full list of changes [here](NEWS.md). The Bookdown tutorial has been also updated and contains the description of new methods added in the last release. How to install -------------- -The package is available from CRAN by usual installing procedure. However, due to restrictions in CRAN politics regarding number of submissions (one in 3-4 month), mostly major releases will be published there (with 2-3 weeks delay after GitHub release as more thorough testing is needed). You can [download](https://github.com/svkucheryavski/mdatools/releases) a zip-file with source package and install it using the `install.packages` command, e.g. if the downloaded file is `mdatools_0.12.0.tar.gz` and it is located in a current working directory, just run the following: +The package is available from CRAN by usual installing procedure. However, due to restrictions in CRAN politics regarding number of submissions (one in 3-4 month), mostly major releases will be published there (with 2-3 weeks delay after GitHub release as more thorough testing is needed). You can [download](https://github.com/svkucheryavski/mdatools/releases) a zip-file with source package and install it using the `install.packages` command, e.g. if the downloaded file is `mdatools_0.13.0.tar.gz` and it is located in a current working directory, just run the following: ``` -install.packages("mdatools_0.12.0.tar.gz") +install.packages("mdatools_0.13.0.tar.gz") ``` If you have `devtools` package installed, the following command will install the current developer version from the master branch of GitHub repository (do not forget to load the `devtools` package first): diff --git a/man/mdaplot.formatValues.Rd b/man/mdaplot.formatValues.Rd index eeb6a23..d481191 100644 --- a/man/mdaplot.formatValues.Rd +++ b/man/mdaplot.formatValues.Rd @@ -4,7 +4,7 @@ \alias{mdaplot.formatValues} \title{Format vector with numeric values} \usage{ -mdaplot.formatValues(data, round.only = F, digits = 3) +mdaplot.formatValues(data, round.only = FALSE, digits = 3) } \arguments{ \item{data}{vector or matrix with values} diff --git a/man/plotLoadings.pca.Rd b/man/plotLoadings.pca.Rd index 923056b..c4a306f 100644 --- a/man/plotLoadings.pca.Rd +++ b/man/plotLoadings.pca.Rd @@ -6,7 +6,7 @@ \usage{ \method{plotLoadings}{pca}( obj, - comp = c(1, 2), + comp = if (obj$ncomp > 1) c(1, 2) else 1, type = (if (length(comp == 2)) "p" else "l"), show.legend = TRUE, show.axes = TRUE, diff --git a/man/plotRMSERatio.Rd b/man/plotRMSERatio.Rd new file mode 100644 index 0000000..45a1d2b --- /dev/null +++ b/man/plotRMSERatio.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/defaults.R +\name{plotRMSERatio} +\alias{plotRMSERatio} +\title{Plot for ratio RMSEC/RMSECV vs RMSECV} +\usage{ +plotRMSERatio(obj, ...) +} +\arguments{ +\item{obj}{object with any regression model} + +\item{...}{other parameters} +} +\description{ +Plot for ratio RMSEC/RMSECV vs RMSECV +} diff --git a/man/plotRMSERatio.regmodel.Rd b/man/plotRMSERatio.regmodel.Rd new file mode 100644 index 0000000..73d0293 --- /dev/null +++ b/man/plotRMSERatio.regmodel.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/regmodel.R +\name{plotRMSERatio.regmodel} +\alias{plotRMSERatio.regmodel} +\title{RMSECV/RMSEC ratio plot for regression model} +\usage{ +\method{plotRMSERatio}{regmodel}( + obj, + ny = 1, + type = "b", + show.labels = TRUE, + labels = seq_len(obj$ncomp), + main = paste0("RMSECV/RMSEC ratio (", obj$res$cal$respnames[ny], ")"), + ylab = "RMSECV/RMSEC ratio", + xlab = "RMSECV", + ... +) +} +\arguments{ +\item{obj}{a regression model (object of class \code{regmodel})} + +\item{ny}{number of response variable to make the plot for (if y is multivariate)} + +\item{type}{type of the plot (use only "b" or "l")} + +\item{show.labels}{logical, show or not labels for plot points} + +\item{labels}{vector with point labels (by default number of components)} + +\item{main}{main plot title} + +\item{ylab}{label for y-axis} + +\item{xlab}{label for x-axis} + +\item{...}{other plot parameters (see \code{mdaplot} for details)} +} +\description{ +Shows plot with RMSECV/RMSEC values vs. RMSECV for each component. +} diff --git a/man/plotScores.ldecomp.Rd b/man/plotScores.ldecomp.Rd index 8a31d26..60f6fed 100644 --- a/man/plotScores.ldecomp.Rd +++ b/man/plotScores.ldecomp.Rd @@ -6,7 +6,7 @@ \usage{ \method{plotScores}{ldecomp}( obj, - comp = c(1, 2), + comp = if (obj$ncomp > 1) c(1, 2) else 1, type = "p", show.axes = TRUE, show.plot = TRUE, diff --git a/man/plotScores.pca.Rd b/man/plotScores.pca.Rd index 194a3b6..a23193d 100644 --- a/man/plotScores.pca.Rd +++ b/man/plotScores.pca.Rd @@ -6,7 +6,7 @@ \usage{ \method{plotScores}{pca}( obj, - comp = c(1, 2), + comp = if (obj$ncomp > 1) c(1, 2) else 1, type = "p", show.axes = TRUE, show.legend = TRUE, diff --git a/man/plotXLoadings.pls.Rd b/man/plotXLoadings.pls.Rd index a9ab0ae..d5288d6 100644 --- a/man/plotXLoadings.pls.Rd +++ b/man/plotXLoadings.pls.Rd @@ -6,7 +6,7 @@ \usage{ \method{plotXLoadings}{pls}( obj, - comp = c(1, 2), + comp = if (obj$ncomp > 1) c(1, 2) else 1, type = "p", show.axes = TRUE, show.legend = TRUE, diff --git a/man/plotXScores.pls.Rd b/man/plotXScores.pls.Rd index 98d3905..9e587c3 100644 --- a/man/plotXScores.pls.Rd +++ b/man/plotXScores.pls.Rd @@ -6,8 +6,8 @@ \usage{ \method{plotXScores}{pls}( obj, - comp = c(1, 2), - show.axes = T, + comp = if (obj$ncomp > 1) c(1, 2) else 1, + show.axes = TRUE, main = "Scores (X)", res = obj$res, ... diff --git a/man/plotXYScores.pls.Rd b/man/plotXYScores.pls.Rd index 557bbc1..8c6620f 100644 --- a/man/plotXYScores.pls.Rd +++ b/man/plotXYScores.pls.Rd @@ -4,7 +4,7 @@ \alias{plotXYScores.pls} \title{XY scores plot for PLS} \usage{ -\method{plotXYScores}{pls}(obj, ncomp = 1, show.axes = T, res = obj$res, ...) +\method{plotXYScores}{pls}(obj, ncomp = 1, show.axes = TRUE, res = obj$res, ...) } \arguments{ \item{obj}{a PLS model (object of class \code{pls})} diff --git a/man/pls.Rd b/man/pls.Rd index 791e7ca..6da9be7 100644 --- a/man/pls.Rd +++ b/man/pls.Rd @@ -257,6 +257,7 @@ Methods inherited from \code{regmodel} object (parent class for \code{pls}): \tabular{ll}{ \code{\link{plotPredictions.regmodel}} \tab shows predicted vs. measured plot.\cr \code{\link{plotRMSE.regmodel}} \tab shows RMSE plot.\cr + \code{\link{plotRMSERatio.regmodel}} \tab shows plot for ratio RMSECV/RMSEC values.\cr \code{\link{plotYResiduals.regmodel}} \tab shows residuals plot for y values.\cr \code{\link{getRegcoeffs.regmodel}} \tab returns matrix with regression coefficients.\cr } diff --git a/man/pls.getpredictions.Rd b/man/pls.getpredictions.Rd new file mode 100644 index 0000000..31f07bf --- /dev/null +++ b/man/pls.getpredictions.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pls.R +\name{pls.getpredictions} +\alias{pls.getpredictions} +\title{Compute predictions for response values} +\usage{ +pls.getpredictions( + x, + coeffs, + ycenter, + yscale, + ynames = NULL, + y.attrs = NULL, + objnames = NULL, + compnames = NULL +) +} +\arguments{ +\item{x}{matrix with predictors, already preprocessed (e.g. mean centered) and cleaned} + +\item{coeffs}{array with regression coefficients} + +\item{ycenter}{`ycenter` property of PLS model} + +\item{yscale}{`yscale` property of PLS model} + +\item{ynames}{vector with names of the responses} + +\item{y.attrs}{list with response attributes (e.g. from reference values if any)} + +\item{objnames}{vector with names of objects (rows of x)} + +\item{compnames}{vector with names used for components} +} +\value{ +array with predicted y-values +} +\description{ +Compute predictions for response values +} diff --git a/man/pls.getxdecomp.Rd b/man/pls.getxdecomp.Rd new file mode 100644 index 0000000..d330cb2 --- /dev/null +++ b/man/pls.getxdecomp.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pls.R +\name{pls.getxdecomp} +\alias{pls.getxdecomp} +\title{Compute object with decomposition of x-values} +\usage{ +pls.getxdecomp( + x, + xscores, + xloadings, + xeigenvals, + xnames = NULL, + x.attrs = NULL, + objnames = NULL, + compnames = NULL +) +} +\arguments{ +\item{x}{matrix with predictors, already preprocessed (e.g. mean centered) and cleaned} + +\item{xscores}{matrix with X-scores} + +\item{xloadings}{matrix with X-loadings} + +\item{xeigenvals}{matrix with eigenvalues for X} + +\item{xnames}{vector with names of the predictors} + +\item{x.attrs}{list with preditors attributes} + +\item{objnames}{vector with names of objects (rows of x)} + +\item{compnames}{vector with names used for components} +} +\value{ +array `ldecomp` object for x-values +} +\description{ +Compute object with decomposition of x-values +} diff --git a/man/pls.getxscores.Rd b/man/pls.getxscores.Rd new file mode 100644 index 0000000..0fe457f --- /dev/null +++ b/man/pls.getxscores.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pls.R +\name{pls.getxscores} +\alias{pls.getxscores} +\title{Compute matrix with X-scores} +\usage{ +pls.getxscores(x, weights, xloadings) +} +\arguments{ +\item{x}{matrix with predictors, already preprocessed and cleaned} + +\item{weights}{matrix with PLS weights} + +\item{xloadings}{matrix with X-loadings} +} +\value{ +matrix with X-scores +} +\description{ +Compute matrix with X-scores +} diff --git a/man/pls.getydecomp.Rd b/man/pls.getydecomp.Rd new file mode 100644 index 0000000..e629db9 --- /dev/null +++ b/man/pls.getydecomp.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pls.R +\name{pls.getydecomp} +\alias{pls.getydecomp} +\title{Compute object with decomposition of y-values} +\usage{ +pls.getydecomp( + y, + yscores, + xscores, + yloadings, + yeigenvals, + ynames = NULL, + y.attrs = NULL, + x.attrs = NULL, + objnames = NULL, + compnames = NULL +) +} +\arguments{ +\item{y}{matrix with responses, already preprocessed (e.g. mean centered) and cleaned} + +\item{yscores}{matrix with Y-scores} + +\item{xscores}{matrix with X-scores} + +\item{yloadings}{matrix with Y-loadings} + +\item{yeigenvals}{matrix with eigenvalues for Y} + +\item{ynames}{vector with names of the responses} + +\item{y.attrs}{list with response attributes (e.g. from reference values if any)} + +\item{x.attrs}{list with preditors attributes} + +\item{objnames}{vector with names of objects (rows of x)} + +\item{compnames}{vector with names used for components} +} +\value{ +array `ldecomp` object for y-values (or NULL if y is not provided) +} +\description{ +Compute object with decomposition of y-values +} diff --git a/man/pls.getyscores.Rd b/man/pls.getyscores.Rd new file mode 100644 index 0000000..7075ab4 --- /dev/null +++ b/man/pls.getyscores.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pls.R +\name{pls.getyscores} +\alias{pls.getyscores} +\title{Compute and orthogonalize matrix with Y-scores} +\usage{ +pls.getyscores(y, yloadings, xscores) +} +\arguments{ +\item{y}{matrix with response values, already preprocessed and cleaned} + +\item{yloadings}{matrix with Y-loadings} + +\item{xscores}{matrix with X-scores (needed for orthogonalization)} +} +\value{ +matrix with Y-scores +} +\description{ +Compute and orthogonalize matrix with Y-scores +} diff --git a/man/pls.simplsold.Rd b/man/pls.simplsold.Rd new file mode 100644 index 0000000..214ab38 --- /dev/null +++ b/man/pls.simplsold.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pls.R +\name{pls.simplsold} +\alias{pls.simplsold} +\title{SIMPLS algorithm (old implementation)} +\usage{ +pls.simplsold(x, y, ncomp, cv = FALSE) +} +\arguments{ +\item{x}{a matrix with x values (predictors)} + +\item{y}{a matrix with y values (responses)} + +\item{ncomp}{number of components to calculate} + +\item{cv}{logical, is model calibrated during cross-validation or not} +} +\value{ +a list with computed regression coefficients, loadings and scores for x and y matrices, +and weights. +} +\description{ +SIMPLS algorithm for calibration of PLS model (old version) +} +\references{ +[1]. S. de Jong. SIMPLS: An Alternative approach to partial least squares regression. +Chemometrics and Intelligent Laboratory Systems, 18, 1993 (251-263). +} diff --git a/man/prep.alsbasecorr.Rd b/man/prep.alsbasecorr.Rd index b34d0f0..c78f1e3 100644 --- a/man/prep.alsbasecorr.Rd +++ b/man/prep.alsbasecorr.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/prep.R \name{prep.alsbasecorr} \alias{prep.alsbasecorr} -\title{Baseline correction using assymetric least squares} +\title{Baseline correction using asymetric least squares} \usage{ prep.alsbasecorr(data, plambda = 5, p = 0.1, max.niter = 10) } @@ -19,7 +19,7 @@ prep.alsbasecorr(data, plambda = 5, p = 0.1, max.niter = 10) preprocessed spectra. } \description{ -Baseline correction using assymetric least squares +Baseline correction using asymetric least squares } \details{ The function implements baseline correction algorithm based on Whittaker smoother. The method diff --git a/man/prep.norm.Rd b/man/prep.norm.Rd index 5e93c5e..7990453 100644 --- a/man/prep.norm.Rd +++ b/man/prep.norm.Rd @@ -4,15 +4,17 @@ \alias{prep.norm} \title{Normalization} \usage{ -prep.norm(data, type = "area", col.ind = NULL) +prep.norm(data, type = "area", col.ind = NULL, ref.spectrum = NULL) } \arguments{ \item{data}{a matrix with data values} -\item{type}{type of normalization \code{"area"}, \code{"length"}, \code{"sum"}, \code{"snv"}, or \code{"is"}.} +\item{type}{type of normalization \code{"area"}, \code{"length"}, \code{"sum"}, \code{"snv"}, \code{"is"}, or \code{"pqn"}.} \item{col.ind}{indices of columns (can be either integer or logical valuws) for normalization to internal standard peak.} + +\item{ref.spectrum}{reference spectrum for PQN normalization, if not provided a mean spectrum for data is used} } \value{ data matrix with normalized values @@ -28,4 +30,14 @@ does the Standard Normal Variate normalization, similar to \code{\link{prep.snv} parameter `col.ind`. If the position is a single value, the rows are normalized to the height of this peak. If `col.ind` points on several adjucent vales, the rows are normalized to the area under the peak - sum of the intensities. + +The \code{"pqn"} is Probabilistic Quotient Normalization as described in [1]. In this case you also +need to provide a reference spectrum (e.g. mean or median of spectra for some reference samples). If +reference spectrum is not provided it will be computed as mean of the spectra to be +preprocessed (parameter \code{data}). +} +\references{ +1. F. Dieterle, A. Ross, H. Senn. Probabilistic Quotient Normalization as Robust Method to +Account for Dilution of Complex Biological Mixtures. Application in 1 H NMR Metabonomics. +Anal. Chem. 2006, 78, 4281–4290. } diff --git a/man/randtest.Rd b/man/randtest.Rd index d6814d9..9126c1d 100644 --- a/man/randtest.Rd +++ b/man/randtest.Rd @@ -8,8 +8,8 @@ randtest( x, y, ncomp = 15, - center = T, - scale = F, + center = TRUE, + scale = FALSE, nperm = 1000, sig.level = 0.05, silent = TRUE, diff --git a/inst/testdata/People.xlsx b/tests/testthat/People.xlsx similarity index 100% rename from inst/testdata/People.xlsx rename to tests/testthat/People.xlsx diff --git a/inst/testdata/pls-coeffs.csv b/tests/testthat/pls-coeffs.csv similarity index 100% rename from inst/testdata/pls-coeffs.csv rename to tests/testthat/pls-coeffs.csv diff --git a/inst/testdata/pls-expvar.csv b/tests/testthat/pls-expvar.csv similarity index 100% rename from inst/testdata/pls-expvar.csv rename to tests/testthat/pls-expvar.csv diff --git a/inst/testdata/pls-vipscores.csv b/tests/testthat/pls-vipscores.csv similarity index 98% rename from inst/testdata/pls-vipscores.csv rename to tests/testthat/pls-vipscores.csv index 27d3bc3..90606c7 100644 --- a/inst/testdata/pls-vipscores.csv +++ b/tests/testthat/pls-vipscores.csv @@ -9,3 +9,4 @@ 1.3532 0.41075 0.24069 + diff --git a/inst/testdata/pls-weight.csv b/tests/testthat/pls-weight.csv similarity index 100% rename from inst/testdata/pls-weight.csv rename to tests/testthat/pls-weight.csv diff --git a/inst/testdata/pls-weights.csv b/tests/testthat/pls-weights.csv similarity index 100% rename from inst/testdata/pls-weights.csv rename to tests/testthat/pls-weights.csv diff --git a/inst/testdata/pls-xloadings.csv b/tests/testthat/pls-xloadings.csv similarity index 100% rename from inst/testdata/pls-xloadings.csv rename to tests/testthat/pls-xloadings.csv diff --git a/inst/testdata/pls-xres.csv b/tests/testthat/pls-xres.csv similarity index 100% rename from inst/testdata/pls-xres.csv rename to tests/testthat/pls-xres.csv diff --git a/inst/testdata/pls-xscores.csv b/tests/testthat/pls-xscores.csv similarity index 100% rename from inst/testdata/pls-xscores.csv rename to tests/testthat/pls-xscores.csv diff --git a/inst/testdata/pls-yloadings.csv b/tests/testthat/pls-yloadings.csv similarity index 100% rename from inst/testdata/pls-yloadings.csv rename to tests/testthat/pls-yloadings.csv diff --git a/inst/testdata/pls-yres.csv b/tests/testthat/pls-yres.csv similarity index 100% rename from inst/testdata/pls-yres.csv rename to tests/testthat/pls-yres.csv diff --git a/inst/testdata/pls-yscores.csv b/tests/testthat/pls-yscores.csv similarity index 100% rename from inst/testdata/pls-yscores.csv rename to tests/testthat/pls-yscores.csv diff --git a/tests/testthat/pls2-vipscores.csv b/tests/testthat/pls2-vipscores.csv new file mode 100644 index 0000000..ae69f50 --- /dev/null +++ b/tests/testthat/pls2-vipscores.csv @@ -0,0 +1,20 @@ +1.2671 +1.2949 +1.0898 +1.0064 +1.0368 +0.3689 +1.2343 +1.2086 +0.4838 +0.2964 +0.7180 +0.6928 +0.6460 +2.3307 +0.6128 +0.7728 +0.6622 +0.7537 +1.0538 +0.2556 \ No newline at end of file diff --git a/tests/testthat/plstlbx-people-weights.csv b/tests/testthat/plstlbx-people-weights.csv new file mode 100644 index 0000000..626e85e --- /dev/null +++ b/tests/testthat/plstlbx-people-weights.csv @@ -0,0 +1,11 @@ +0.4319 0.1854 0.1149 -0.0445 +0.4356 0.1764 0.2654 0.4317 +-0.3699 0.0237 0.8413 0.8718 +0.1452 -0.0604 0.0389 0.1838 +0.1593 -0.3207 -0.1333 -0.1604 +0.3133 -0.2264 0.3859 0.5192 +-0.0399 0.6602 0.4212 0.4030 +-0.4139 -0.2195 0.0848 -0.0953 +0.4193 0.1815 0.1986 0.0160 +-0.0696 0.5662 -0.1668 -0.1324 +-0.0540 -0.0880 -0.5830 0.5314 diff --git a/tests/testthat/plstlbx-people-xhdist.csv b/tests/testthat/plstlbx-people-xhdist.csv new file mode 100644 index 0000000..5dbaf8d --- /dev/null +++ b/tests/testthat/plstlbx-people-xhdist.csv @@ -0,0 +1,32 @@ +7.4875 +3.2397 +2.5583 +10.8514 +3.2828 +4.0721 +7.1071 +3.9232 +3.8897 +3.3473 +3.8491 +4.6580 +3.1512 +2.5963 +2.0801 +2.2521 +4.2200 +4.5492 +3.5500 +5.5432 +3.1816 +3.5329 +2.3602 +3.5737 +4.0456 +3.4025 +5.0792 +3.6392 +1.8355 +1.5958 +2.7383 +2.807 \ No newline at end of file diff --git a/tests/testthat/plstlbx-people-xloadings.csv b/tests/testthat/plstlbx-people-xloadings.csv new file mode 100644 index 0000000..6972cd3 --- /dev/null +++ b/tests/testthat/plstlbx-people-xloadings.csv @@ -0,0 +1,11 @@ +0.4106 0.1141 0.1036 0.0457 +0.4153 0.0766 0.0066 0.1391 +-0.3726 -0.0907 0.3806 0.3537 +0.1521 -0.0844 -0.0460 0.0869 +0.1963 -0.3142 0.0105 0.0939 +0.3394 -0.3346 0.1565 0.1682 +-0.1160 0.5861 0.0871 0.1951 +-0.3886 -0.1876 0.1756 0.0410 +0.3983 0.0960 0.1564 0.0921 +-0.1349 0.6053 -0.2032 -0.0492 +-0.0439 0.0270 -0.8447 0.8707 diff --git a/tests/testthat/plstlbx-people-xqdist.csv b/tests/testthat/plstlbx-people-xqdist.csv new file mode 100644 index 0000000..971cce6 --- /dev/null +++ b/tests/testthat/plstlbx-people-xqdist.csv @@ -0,0 +1,32 @@ +1.9041 +0.8735 +0.4510 +1.1024 +2.7907 +2.2340 +1.2173 +0.9854 +1.2716 +0.7961 +0.4860 +0.3754 +1.6481 +1.3018 +0.3733 +1.2522 +2.3064 +2.6794 +4.7591 +1.7569 +0.6085 +1.4004 +0.4096 +1.4456 +2.6432 +2.4861 +5.3929 +9.5875 +1.2365 +3.1687 +0.8946 +2.6442 \ No newline at end of file diff --git a/tests/testthat/plstlbx-people-xscores.csv b/tests/testthat/plstlbx-people-xscores.csv new file mode 100644 index 0000000..59763c8 --- /dev/null +++ b/tests/testthat/plstlbx-people-xscores.csv @@ -0,0 +1,32 @@ +4.8335 -0.4321 1.5689 0.2021 +2.8513 -0.6212 -0.6223 0.7530 +2.7141 -0.6940 -0.6950 0.4506 +-1.0533 -2.0401 -1.3808 -1.8132 +-1.0359 -1.0859 1.2470 0.5618 +-0.6989 -1.0948 1.6394 0.2147 +2.3781 -1.4688 -1.4418 1.1361 +2.2660 -1.4589 -1.0201 0.6079 +-1.5931 -1.2889 1.1821 -0.6916 +-1.5246 -1.4122 1.1142 -0.4895 +2.8442 -1.3881 -0.0623 -0.8758 +-2.5820 -2.2613 -0.7718 0.4286 +-1.5279 -1.3792 1.2190 -0.0454 +-1.7034 -1.6552 0.7695 0.1952 +2.7436 -1.1517 0.0271 -0.2252 +2.7353 -1.3513 -0.2008 -0.0098 +2.1300 2.4733 0.6869 0.0039 +2.4403 2.4744 0.3435 0.5122 +-1.8386 0.0980 -0.7841 1.0571 +-2.8772 2.1646 0.5288 0.8908 +-3.3698 0.0943 -0.7736 -0.4325 +0.6607 1.7764 -0.9579 -0.6680 +1.1611 1.9766 -0.3325 -0.3089 +1.4512 1.4728 -0.2671 -1.0428 +-2.9197 0.9634 0.5934 -0.9167 +-3.1564 0.8307 -0.8396 0.4499 +1.2063 1.3302 0.0391 -1.4371 +0.3213 1.2869 1.2137 0.7285 +-2.3826 0.1716 -0.7929 0.0487 +-2.5680 0.7585 -0.1434 0.2315 +0.8214 2.2157 -0.5020 -0.1606 +-2.7271 0.6961 -0.5844 0.6443 \ No newline at end of file diff --git a/tests/testthat/plstlbx-people-ypred.csv b/tests/testthat/plstlbx-people-ypred.csv new file mode 100644 index 0000000..826d4ce --- /dev/null +++ b/tests/testthat/plstlbx-people-ypred.csv @@ -0,0 +1,32 @@ +48.0862 +44.2076 +43.8202 +36.0841 +38.3637 +38.9222 +42.8676 +42.6631 +36.9118 +37.0067 +43.4588 +34.5260 +37.2101 +36.7130 +43.6720 +43.5525 +44.7849 +45.3343 +37.0879 +36.8568 +34.1175 +41.2542 +42.5075 +42.4945 +35.5957 +35.1031 +42.0069 +41.7480 +35.8982 +36.1891 +42.0698 +35.8860 \ No newline at end of file diff --git a/tests/testthat/plstlbx-people-yqdist.csv b/tests/testthat/plstlbx-people-yqdist.csv new file mode 100644 index 0000000..1185db6 --- /dev/null +++ b/tests/testthat/plstlbx-people-yqdist.csv @@ -0,0 +1,32 @@ +0.0005 +0.0028 +0.0021 +0.0005 +0.0087 +0.0004 +0.0496 +0.0075 +0.0548 +0.0000 +0.1401 +0.1431 +0.0411 +0.0054 +0.0071 +0.0132 +0.0030 +0.0292 +0.0005 +0.0483 +0.0009 +0.0043 +0.0160 +0.0168 +0.0108 +0.0530 +0.0000 +0.0042 +0.0007 +0.0931 +0.0003 +0.0517 \ No newline at end of file diff --git a/tests/testthat/plstlbx-people-yscores.csv b/tests/testthat/plstlbx-people-yscores.csv new file mode 100644 index 0000000..7fb603e --- /dev/null +++ b/tests/testthat/plstlbx-people-yscores.csv @@ -0,0 +1,32 @@ +11.1420 0.0915 0.0668 -0.0015 +5.6355 -0.1266 -0.0208 0.0050 +5.6355 -0.0683 0.0099 0.0308 +-5.3774 -0.5884 -0.1461 -0.0654 +-2.6242 -0.0651 0.0344 -0.0154 +-1.2476 0.0568 0.0901 0.0140 +2.8823 -0.4560 -0.1198 -0.0433 +4.2589 -0.1430 0.0211 0.0502 +-5.3774 -0.3587 -0.0865 -0.1055 +-4.0008 -0.1226 0.0276 -0.0162 +2.8823 -0.6542 -0.2142 -0.1614 +-5.3774 0.0619 0.1609 0.1486 +-2.6242 0.1441 0.1462 0.0709 +-4.0008 -0.0465 0.0763 0.0325 +5.6355 -0.0808 0.0311 0.0229 +5.6355 -0.0773 0.0445 0.0407 +7.0122 0.4454 0.0560 0.0198 +8.3888 0.5787 0.1163 0.0772 +-4.0008 0.0109 -0.0008 0.0256 +-5.3774 0.1874 -0.0425 -0.0501 +-8.1307 -0.1337 -0.0660 -0.0245 +1.5057 0.0092 -0.1003 -0.0445 +4.2589 0.3269 0.0316 0.0353 +4.2589 0.2036 0.0055 0.0131 +-5.3774 0.2055 0.0363 0.0079 +-5.3774 0.3061 0.0896 0.0964 +2.8823 0.0424 -0.0590 -0.0463 +2.8823 0.4188 0.1137 0.0462 +-5.3774 -0.0230 -0.0205 0.0109 +-6.7540 -0.2094 -0.1393 -0.1015 +2.8823 0.2061 -0.0371 -0.0115 +-6.7540 -0.1417 -0.1050 -0.0606 diff --git a/tests/testthat/test-bugs-220710.R b/tests/testthat/test-bugs-220710.R new file mode 100644 index 0000000..6f144de --- /dev/null +++ b/tests/testthat/test-bugs-220710.R @@ -0,0 +1,49 @@ + +setup({ + pdf(file = tempfile("mdatools-test-classmodel-", fileext = ".pdf")) + sink(tempfile("mdatools-test-test-classmodel-", fileext = ".txt"), append = FALSE, split = FALSE) +}) + +teardown({ + dev.off() + sink() +}) + +context("test for bug #107:") +data(people) + +test_that("bug is fixed in PCA", { + m <- pca(people, 1) + expect_silent(plotScores(m)) + expect_silent(plotLoadings(m)) + expect_silent(plotScores(m, 1)) + expect_silent(plotLoadings(m, 1)) + + expect_error(plotScores(m, 0)) + expect_error(plotScores(m, c(1, 3))) + expect_error(plotScores(m$res$cal, 0)) + expect_error(plotScores(m$res$cal, c(1, 3))) + + expect_error(plotLoadings(m, 0)) + expect_error(plotLoadings(m, c(1, 3))) + expect_error(plotLoadings(m$res$cal, 0)) + expect_error(plotLoadings(m$res$cal, c(1, 3))) + +}) + +test_that("bug is fixed in PLS", { + m <- pls(people[, -4], people[, 4], 1) + expect_silent(plotXScores(m)) + expect_silent(plotXLoadings(m)) + + expect_error(plotXScores(m, 0)) + expect_error(plotXScores(m, c(1, 3))) + expect_error(plotXScores(m$res$cal, 0)) + expect_error(plotXScores(m$res$cal, c(1, 3))) + + expect_error(plotXLoadings(m, 0)) + expect_error(plotXLoadings(m, c(1, 3))) + expect_error(plotXLoadings(m$res$cal, 0)) + expect_error(plotXLoadings(m$res$cal, c(1, 3))) + +}) \ No newline at end of file diff --git a/tests/testthat/test-crossval.R b/tests/testthat/test-crossval.R index 343cabd..e797a4f 100644 --- a/tests/testthat/test-crossval.R +++ b/tests/testthat/test-crossval.R @@ -8,25 +8,36 @@ nobj <- 24 context(sprintf("crossval: cross-validation")) test_that("leave one out works correctly", { + cv.ind <- matrix(seq_len(nobj), ncol = 1) + expect_equivalent(cv.ind, crossval(cv = "loo", nobj)) +}) + + +test_that("random cross-validation with one repetition works correctly", { + + # random full cross-validation + nseg = nobj set.seed(42) - cv.ind <- array(sample(nobj), dim = c(nobj, 1, 1)) + cv.ind <- matrix(rep(seq_len(nseg), length.out = nobj)[sample(nobj)], ncol = 1) set.seed(42) expect_equivalent(cv.ind, crossval(cv = 1, nobj)) -}) + set.seed(42) + expect_equivalent(cv.ind, crossval(cv = list("rand", 24), nobj)) -test_that("random cross-validation with one repetition works correctly", { # number of objets is a multuply of number segments + nseg = 4 set.seed(42) - cv.ind <- array(matrix(sample(nobj), nrow = 4, byrow = TRUE), dim = c(4, nobj/4, 1)) + cv.ind <- matrix(rep(seq_len(nseg), length.out = nobj)[sample(nobj)], ncol = 1) set.seed(42) expect_equivalent(cv.ind, crossval(cv = 4, nobj)) set.seed(42) expect_equivalent(cv.ind, crossval(cv = list("rand", 4), nobj)) # number of objets is not a multuply of number segments + nseg = 5 set.seed(42) - cv.ind <- array(matrix(c(sample(nobj), NA), nrow = 5, byrow = TRUE), dim = c(5, ceiling(nobj/5), 1)) + cv.ind <- matrix(rep(seq_len(nseg), length.out = nobj)[sample(nobj)], ncol = 1) set.seed(42) expect_equivalent(cv.ind, crossval(cv = 5, nobj)) set.seed(42) @@ -37,47 +48,41 @@ test_that("random cross-validation with one repetition works correctly", { test_that("random cross-validation with several repetition works correctly", { # number of objets is a multuply of number segments set.seed(42) - cv.ind <- array( - cbind( - matrix(sample(nobj), nrow = 4, byrow = TRUE), - matrix(sample(nobj), nrow = 4, byrow = TRUE), - matrix(sample(nobj), nrow = 4, byrow = TRUE) - ), dim = c(4, nobj/4, 3) - ) + nseg <- 4 + nrep <- 3 + cv.ind <- sapply(seq_len(nrep), function(i) rep(seq_len(nseg), length.out = nobj)[sample(nobj)]) set.seed(42) expect_equivalent(cv.ind, crossval(cv = list("rand", 4, 3), nobj)) - expect_equivalent(as.numeric(table(crossval(cv = list("rand", 4, 3), nobj))), rep(3, nobj)) # number of objets is not a multuply of number segments set.seed(42) - cv.ind <- array( - cbind( - matrix(c(sample(nobj), NA), nrow = 5, byrow = TRUE), - matrix(c(sample(nobj), NA), nrow = 5, byrow = TRUE), - matrix(c(sample(nobj), NA), nrow = 5, byrow = TRUE), - matrix(c(sample(nobj), NA), nrow = 5, byrow = TRUE) - ), dim = c(5, ceiling(nobj/5), 4) - ) + nseg <- 5 + nrep <- 4 + cv.ind <- sapply(seq_len(nrep), function(i) rep(seq_len(nseg), length.out = nobj)[sample(nobj)]) set.seed(42) expect_equivalent(cv.ind, crossval(cv = list("rand", 5, 4), nobj)) - expect_equivalent(as.numeric(table(crossval(cv = list("rand", 5, 4), nobj), useNA = "ifany")), rep(4, nobj + 1)) }) -test_that("random cross-validation with several repetition works correctly", { +test_that("systematic cross-validation works correctly", { # number of objets is a multuply of number segments + nseg <- 4 set.seed(42) resp <- rnorm(nobj) - cv.ind <- array(matrix(order(resp), nrow = 4, byrow = FALSE), dim = c(4, nobj/4, 1)) + cv.ind <- matrix(rep(seq_len(nseg), length.out = nobj)[order(resp)], ncol = 1) set.seed(42) expect_equivalent(cv.ind, crossval(cv = list("ven", 4), resp = resp)) # number of objets is not a multuply of number segments + nseg <- 5 set.seed(42) resp <- rnorm(nobj) - cv.ind <- array(matrix(c(order(resp), NA), nrow = 5, byrow = FALSE), dim = c(5, ceiling(nobj/5), 1)) + cv.ind <- matrix(rep(seq_len(nseg), length.out = nobj)[order(resp)], ncol = 1) set.seed(42) expect_equivalent(cv.ind, crossval(cv = list("ven", 5), resp = resp)) + + # it also works well without any response + expect_equivalent(crossval(list("ven", 4), 10), matrix(c(1, 2, 3, 4, 1, 2, 3, 4, 1, 2), ncol = 1)) }) diff --git a/tests/testthat/test-pls.R b/tests/testthat/test-pls.R index f9c3dd9..0788661 100644 --- a/tests/testthat/test-pls.R +++ b/tests/testthat/test-pls.R @@ -253,17 +253,10 @@ context("pls: compare pedict.pls() outcome with known values") test_that("predictions for people data are correct", { # model - f <- system.file("testdata", "pls-xloadings.csv", package = "mdatools") - xloadings <- read.csv(f, header = FALSE) - - f <- system.file("testdata", "pls-yloadings.csv", package = "mdatools") - yloadings <- read.csv(f, header = FALSE) - - f <- system.file("testdata", "pls-weights.csv", package = "mdatools") - weights <- read.csv(f, header = FALSE) - - f <- system.file("testdata", "pls-coeffs.csv", package = "mdatools") - coeffs <- read.csv(f, header = FALSE) + xloadings <- read.csv("pls-xloadings.csv", header = FALSE) + yloadings <- read.csv("pls-yloadings.csv", header = FALSE) + weights <- read.csv("pls-weights.csv", header = FALSE) + coeffs <- read.csv("pls-coeffs.csv", header = FALSE) xc <- people[, -4] yc <- people[, 4, drop = F] @@ -274,25 +267,11 @@ test_that("predictions for people data are correct", { expect_equivalent(obj$coeffs$values[, 4, ], as.matrix(coeffs), tolerance = 10^-5) # predictions - f <- system.file("testdata", "pls-xscores.csv", package = "mdatools") - xscores <- read.csv(f, header = FALSE) - print(f) - - f <- system.file("testdata", "pls-yscores.csv", package = "mdatools") - yscores <- read.csv(f, header = FALSE) - print(f) - - f <- system.file("testdata", "pls-xres.csv", package = "mdatools") - xresid <- read.csv(f, header = FALSE) - print(f) - - f <- system.file("testdata", "pls-yres.csv", package = "mdatools") - yresid <- read.csv(f, header = FALSE) - print(f) - - f <- system.file("testdata", "pls-expvar.csv", package = "mdatools") - expvar <- read.csv(f, header = FALSE) - print(f) + xscores <- read.csv("pls-xscores.csv", header = FALSE) + yscores <- read.csv("pls-yscores.csv", header = FALSE) + xresid <- read.csv("pls-xres.csv", header = FALSE) + yresid <- read.csv("pls-yres.csv", header = FALSE) + expvar <- read.csv("pls-expvar.csv", header = FALSE) res <- predict(obj, xc, yc) expect_equivalent(res$xdecomp$scores, as.matrix(xscores), tolerance = 10^-4) @@ -474,11 +453,22 @@ test_that("vipscores for people data (A = 4) identical to once computed in MATLA d <- datasets[[1]] m <- pls(d$xc, d$yc, d$ncomp, center = d$center, scale = d$scale, cv = 10) - f <- system.file("testdata", "pls-vipscores.csv", package = "mdatools") - vip <- read.csv(f, header = FALSE) + vip <- read.csv("pls-vipscores.csv", header = FALSE) expect_equivalent(vipscores(m, ncomp = 4), as.matrix(vip), tolerance = 10^-4) }) +test_that("vipscores for people data (A = 4) identical to once computed in MATLAB for PLS2", { + data(people) + X <- people[, -c(4, 6)] + Y <- people[, c(4, 6)] + m <- pls(X, Y, 8, center = TRUE, scale = TRUE, cv = 1) + + vip <- read.csv("pls2-vipscores.csv", header = FALSE)[[1]] + #dim(vip) <- c(ncol(X), 2) + + expect_equivalent(vipscores(m, ncomp = 4), matrix(vip, ncol = 2), tolerance = 10^-4) +}) + ###################################### # Block 6. Tests selratio() method # @@ -523,13 +513,6 @@ for (i in seq_along(datasets)) { print(v) } -#test_that("vipscores for people data (A = 4) identical to once computed in MATLAB", { -# d <- datasets[[1]] -# m <- pls(d$xc, d$yc, d$ncomp, center = d$center, scale = d$scale, cv = 10) -# -# vip <- as.matrix(read.csv("../matlab/pls-vipscores.csv", header = FALSE)) -# expect_equivalent(vipscores(m, ncomp = 4), vip, tolerance = 10^-4) -#}) ######################################### # Block 7. Tests for outlier detection # @@ -584,4 +567,64 @@ test_that("XY-residual limits are computed correctly", { expect_equal(sum(c2 == "extreme"), 2) expect_equal(sum(c2 == "outlier"), 2) -}) \ No newline at end of file +}) + +test_that("PLS gives results comparable to other software", { + + + # read model parameters and calibration scores made in PLS_Toolbox + xscores <- as.matrix(read.delim("plstlbx-people-xscores.csv", sep = " ", header = FALSE)) + yscores <- as.matrix(read.delim("plstlbx-people-yscores.csv", sep = " ", header = FALSE)) + weights <- as.matrix(read.delim("plstlbx-people-weights.csv", sep = " ", header = FALSE)) + xloadings <- as.matrix(read.delim("plstlbx-people-xloadings.csv", sep = " ", header = FALSE)) + yloadings <- c(5.3643, 1.0338, 0.4675, 0.3567) + coeffs <- c(0.2078, 0.2647, 0.0073, 0.0722, -0.0016, 0.1829, 0.1420, -0.1984, 0.2153, 0.0151, -0.0405) + + # make a model with manual cv splits + data(people) + X <- people[, -4] + y <- people[, 4, drop = FALSE] + m <- pls(X, y, 4, scale = TRUE, cv = rep(1:10, length.out = nrow(X))) + + # compare main model parameters + # here we re-normalize results from PLS_Toolbox + xnorm <- sqrt(colSums(xscores^2)) + expect_equivalent(m$xloadings, xloadings %*% diag(xnorm), tolerance = 10^-3) + expect_equivalent(m$res$cal$xdecomp$scores, xscores %*% diag(1/xnorm), tolerance = 10^-3) + expect_equivalent(m$weights, weights %*% diag(1/xnorm), tolerance = 10^-3) + + expect_equivalent(m$yloadings, yloadings, tolerance = 10^-4) + expect_equivalent(m$coeffs$values[, 4, 1], coeffs, tolerance = 10^-4) + + # check selectivity ratio + # here we change the result from PLS toolbox a bit for large values as they add + # given portion of x-variance when compute SR + sr <- c(24.8, 21.7, 2.2359, 0.1179, 0.1552, 0.9740, 0.0076, 5.9018, 10.0, 0.0256, 0.0138) + expect_equivalent(selratio(m, 4), sr, tolerance = 10^-1) + + # compare calibration results + ypred <- as.matrix(read.delim("plstlbx-people-ypred.csv", sep = " ", header = FALSE)) + xqdist <- as.matrix(read.delim("plstlbx-people-xqdist.csv", sep = " ", header = FALSE)) + xhdist <- as.matrix(read.delim("plstlbx-people-xhdist.csv", sep = " ", header = FALSE)) + yqdist <- as.matrix(read.delim("plstlbx-people-yqdist.csv", sep = " ", header = FALSE)) + rmsec <- c(1.0273, 0.7404, 0.6668, 0.6198) + r2c <- c(0.9283, 0.9627, 0.9698, 0.9739) + + expect_equivalent(m$res$cal$xdecomp$T2[, 4], xhdist, tolerance = 10^-3) + expect_equivalent(m$res$cal$xdecomp$Q[, 4], xqdist, tolerance = 10^-3) + + expect_equivalent(m$res$cal$ydecomp$Q[, 4], yqdist, tolerance = 10^-3) + expect_equivalent(m$res$cal$ydecomp$scores, yscores, tolerance = 10^-4) + expect_equivalent(m$res$cal$y.pred[, 4, 1], ypred, tolerance = 10^-4) + + expect_equivalent(m$res$cal$rmse, rmsec, tolerance = 10^-3) + expect_equivalent(m$res$cal$r2, r2c, tolerance = 10^-3) + + + # compare cross-validation results + rmsecv <- c(1.1044, 0.8673, 0.8627, 0.8186) + r2cv <- c(0.9173, 0.9489, 0.9498, 0.9545) + + expect_equivalent(m$res$cv$rmse, rmsecv, tolerance = 10^-3) + expect_equivalent(m$res$cv$r2, r2cv, tolerance = 10^-3) +}) diff --git a/tests/testthat/test-prep.R b/tests/testthat/test-prep.R index c0d7531..50a86c0 100644 --- a/tests/testthat/test-prep.R +++ b/tests/testthat/test-prep.R @@ -150,6 +150,41 @@ test_that("Normalization to IS peak works (one value)", { expect_equivalent(pspectra1, pspectra2) }) +test_that("PQN normalization works correctly", { + data(simdata) + spectra <- simdata$spectra.c + ref.spectrum1 <- apply(simdata$spectra.c[c(1, 20, 30, 80, 80, 100), ], 2, mean) + ref.spectrum2 <- apply(spectra, 2, mean) + + expect_error(prep.norm(spectra, "pqn", ref.spectrum = 1)) + expect_error(prep.norm(spectra, "pqn", ref.spectrum = ref.spectrum1[, 1:10, drop = FALSE])) + + # manual preprocessing of spectra + ref.spectrum1 <- ref.spectrum1 / sum(abs(ref.spectrum1)) + ref.spectrum2 <- ref.spectrum2 / sum(abs(ref.spectrum2)) + pspectra2 <- pspectra1 <- matrix(0, nrow(spectra), ncol(spectra)) + + for (i in seq_len(nrow(spectra))) { + s <- spectra[i, ] / sum(abs(spectra[i, ])) + + q1 <- s / ref.spectrum1 + pspectra1[i, ] <- s / median(q1) + + q2 <- s / ref.spectrum2 + pspectra2[i, ] <- s / median(q2) + } + + # par(mfrow = c(2, 2)) + # mdaplot(prep.norm(spectra, type = "sum"), type = "l") + # mdaplot(pspectra1, type = "l") + # mdaplot(pspectra2, type = "l") + # mdaplot(prep.norm(spectra, type = "pqn"), type = "l") + + expect_equivalent(prep.norm(spectra, type = "pqn", ref.spectrum = ref.spectrum1), pspectra1) + expect_equivalent(prep.norm(spectra, type = "pqn", ref.spectrum = ref.spectrum2), pspectra2) + expect_equivalent(prep.norm(spectra, type = "pqn"), pspectra2) +}) + test_that("Kubelka-Munk works correctly", { spectra <- simdata$spectra.c spectra <- spectra - min(spectra) + 0.01 * max(spectra) @@ -205,6 +240,40 @@ test_that("SavGol smoothing works correctly", { expect_equal(dim(dy3), dim(y)) expect_equivalent(which((abs(dy3) < 10^-12)), zeros) + # check for small numeric sets + x = matrix(c(1, 1, 1, 3, 4, 7, 4, 3, 1, 1, 1.0), nrow = 1) + y = matrix(c(5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5.0), nrow = 1) + z = matrix(c(4, 6, 4, 6, 4, 6, 4, 6, 4, 6.0), nrow = 1) + + # no derivartive + expect_equivalent(prep.savgol(x, width = 5, porder = 1, dorder = 0), c(0.4, 1.2, 2.0, 3.2, 3.8, 4.2, 3.8, 3.2, 2.0, 1.2, 0.4)) + expect_equivalent(prep.savgol(y, width = 3, porder = 1, dorder = 0), c(5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0)) + expect_equivalent(prep.savgol(z, width = 5, porder = 1, dorder = 0), c(4.8, 4.8, 4.8, 5.2, 4.8, 5.2, 4.8, 5.2, 5.2, 5.2)) + + # first derivartive + expect_equivalent(prep.savgol(x, width = 3, porder = 1, dorder = 1), c(0.0, 0.0, 1.0, 1.5, 2.0, 0.0, -2.0, -1.5, -1.0, 0.0, 0.0)) + expect_equivalent(prep.savgol(y, width = 3, porder = 1, dorder = 1), c(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)) + expect_equivalent(prep.savgol(z, width = 3, porder = 1, dorder = 1), c(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)) + + # second derivartive + expect_equivalent(prep.savgol(x, width = 3, porder = 2, dorder = 2), c(0.0, 0.0, 2.0, -1.0, 2.0, -6.0, 2.0, -1.0, 2.0, 0.0, 0.0)) + expect_equivalent(prep.savgol(y, width = 3, porder = 2, dorder = 2), c(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)) + expect_equivalent(prep.savgol(z, width = 3, porder = 2, dorder = 2), c(-4.0, -4.0, 4.0, -4.0, 4.0, -4.0, 4.0, -4.0, 4.0, 4.0)) + + + y1 <- prep.savgol( + rbind( + matrix(c(1, 1, 1, 3, 4, 7, 4, 3, 1, 1, 1.0), nrow = 1), + matrix(c(5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5.0), nrow = 1) + ), width = 5, porder = 1, dorder = 0 + ) + + y2 <- rbind( + prep.savgol(matrix(c(1, 1, 1, 3, 4, 7, 4, 3, 1, 1, 1.0), nrow = 1), width = 5, porder = 1, dorder = 0), + prep.savgol(matrix(c(5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5.0), nrow = 1), width = 5, porder = 1, dorder = 0) + ) + + expect_equivalent(y1, y2) }) context("prep: alsbasecorr") diff --git a/tests/testthat/test-regmodel.R b/tests/testthat/test-regmodel.R index 3706636..cec5352 100644 --- a/tests/testthat/test-regmodel.R +++ b/tests/testthat/test-regmodel.R @@ -196,3 +196,21 @@ test_that("text outcome works fine", { expect_output(summary(m, ny = 1, ncomp = 1, res = list("xres" = m$res$cal))) expect_output(print(m)) }) + +test_that("RMSE ratio plot works fine", { + data(simdata) + X <- simdata$spectra.c + Y <- simdata$conc.c + m1 <- pls(X, Y, 11, cv = 1) + + par(mfrow = c(2, 2)) + expect_silent(plotRMSERatio(m1)) + expect_silent(plotRMSERatio(m1, ny = 2)) + expect_silent(plotRMSERatio(m1, ny = 3)) + expect_silent(plotRMSERatio(m1, ny = 3, col = "red", pch = 3, lty = 2, lwd = 2)) + + expect_error(plotRMSERatio(m1, 0)) + expect_error(plotRMSERatio(m1, 4)) + expect_error(plotRMSERatio(m1, c(1, 2))) + +}) \ No newline at end of file diff --git a/tests/testthat/test-simpls.R b/tests/testthat/test-simpls.R new file mode 100644 index 0000000..888a8d8 --- /dev/null +++ b/tests/testthat/test-simpls.R @@ -0,0 +1,112 @@ +###################################### +# Tests for SIMPLS algorithms # +###################################### + +setup({ + pdf(file = tempfile("mdatools-test-simpls-", fileext = ".pdf")) + sink(tempfile("mdatools-test-simpls-", fileext = ".txt"), append = FALSE, split = FALSE) +}) + +teardown({ + dev.off() + sink() +}) + +################################################# +# Block 1. Tests the old algorithm # +################################################# + +context("simpls: PLS2 example from the paper") + +# convert results produced by SIMPLS algorithm to form +# suitable for testing +simpls2res <- function (m, X, Y, A) { + R <- m$weights + P <- m$xloadings + Q <- m$yloadings + + T <- X %*% R + Xexp <- rep(0, A) + Yexp <- rep(0, A) + + for (a in 1:A) { + B <- tcrossprod(R[, a, drop = FALSE], Q[, a, drop = FALSE]) + Xhat <- tcrossprod(T[, a, drop = FALSE], P[, a, drop = FALSE]) + Yhat <- X %*% B + Xexp[a] <- sum(Xhat^2)/sum(X^2) * 100 + Yexp[a] <- sum(Yhat^2)/sum(Y^2) * 100 + } + + rnorm = sqrt(colSums(R^2)) + R = R %*% diag(1 / rnorm) + + return(list(R = R, T = T, P = P, Q = Q, Xexp = Xexp, Yexp = Yexp)) +} + +# deterministic small data +A <- 3 +X <- matrix(c(-4, -4, 4, 4, 2, -2, 2, -2, 1, -1, -1, 1), nrow = 4) +Y <- matrix(c(430, -436, -361, 367, -94, 12, -22, 104), nrow = 4) + +# expected values +expected = list( + R = matrix(c(0.0164, 0.1798, 0.9836, 0.4562, -0.7719, 0.4428, 0.3392, 0.7141, -0.6124), ncol = A), + T = matrix(c(0.6088, -0.6712, -0.2662, 0.3286, -0.6018, -0.1489, -0.0333, 0.7840, -0.1311, -0.5266, 0.8234, -0.1657), ncol = A), + Xexp = c(6.72, 50.77, 42.51), + Yexp = c(90.15, 4.54, 5.31) +) + +test_that("new algorithm works correctly", { + res <- simpls2res(pls.simpls(X, Y, A), X, Y, A) + + expect_equivalent(abs(res$R), abs(expected$R), tolerance = 10^-4) + expect_equivalent(abs(res$T), abs(expected$T), tolerance = 10^-4) + expect_equivalent(res$Xexp, expected$Xexp, tolerance = 10^-4) + expect_equivalent(res$Yexp, expected$Yexp, tolerance = 10^-4) +}) + +test_that("old algorithm works correctly", { + res <- simpls2res(pls.simplsold(X, Y, A), X, Y, A) + + expect_equivalent(abs(res$R), abs(expected$R), tolerance = 10^-4) + expect_equivalent(abs(res$T), abs(expected$T), tolerance = 10^-4) + expect_equivalent(res$Xexp, expected$Xexp, tolerance = 10^-4) + expect_equivalent(res$Yexp, expected$Yexp, tolerance = 10^-4) +}) + + +test_that("new algorithm is more numerically stable", { + Xr <- matrix(rnorm(30000 * 1000), 30000, 1000) + Yr <- matrix(rnorm(30000 * 2), 30000, 2) + + expect_warning(pls.simplsold(Xr, Yr / 100000000, 50)) + expect_silent(pls.simpls(Xr, Yr / 100000000, 50)) +}) + + +test_that("new algorithm gives results comparable to other software", { + + # read model parameters made in PLS_Toolbox + weights <- as.matrix(read.delim("plstlbx-people-weights.csv", sep = " ", header = FALSE)) + xloadings <- as.matrix(read.delim("plstlbx-people-xloadings.csv", sep = " ", header = FALSE)) + xscores <- as.matrix(read.delim("plstlbx-people-xscores.csv", sep = " ", header = FALSE)) + yscores <- as.matrix(read.delim("plstlbx-people-yscores.csv", sep = " ", header = FALSE)) + yloadings <- c(5.3643, 1.0338, 0.4675, 0.3567) + coeffs <- c(0.2078, 0.2647, 0.0073, 0.0722, -0.0016, 0.1829, 0.1420, -0.1984, 0.2153, 0.0151, -0.0405) + + # make a model + data(people) + X <- scale(people[, -4], center = TRUE, scale = TRUE) + y <- scale(people[, 4, drop = FALSE], center = TRUE, scale = TRUE) + m <- pls.simpls(X, y, 4) + + # here we re-normalize results from PLS_Toolbox + xnorm <- sqrt(colSums(xscores^2)) + expect_equivalent(m$xloadings, xloadings %*% diag(xnorm), tolerance = 10^-3) + expect_equivalent(m$xscores, xscores %*% diag(1/xnorm), tolerance = 10^-3) + expect_equivalent(m$weights, weights %*% diag(1/xnorm), tolerance = 10^-3) + + expect_equivalent(m$yscores, yscores, tolerance = 10^-4) + expect_equivalent(m$yloadings, yloadings, tolerance = 10^-4) + expect_equivalent(m$coeffs[, 4, 1], coeffs, tolerance = 10^-4) +}) diff --git a/inst/testdata/test_ldecomp.m b/tests/testthat/test_ldecomp.m similarity index 100% rename from inst/testdata/test_ldecomp.m rename to tests/testthat/test_ldecomp.m diff --git a/inst/testdata/test_pls.m b/tests/testthat/test_pls.m similarity index 100% rename from inst/testdata/test_pls.m rename to tests/testthat/test_pls.m