diff --git a/.Rbuildignore b/.Rbuildignore index 739748a..7d9e69f 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -2,5 +2,6 @@ ^\.Rproj\.user$ README.md ^\.github$ +.vscode src/.vscode CITATION.bib diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 9d387a3..8702342 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -29,7 +29,7 @@ jobs: R_KEEP_PKG_SOURCE: yes steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: r-lib/actions/setup-pandoc@v1 diff --git a/.gitignore b/.gitignore index eeb844f..7f6c18e 100644 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,4 @@ src/*.a *.spl *.synctex.gz *.tex -src/.vscode \ No newline at end of file +.vscode diff --git a/DESCRIPTION b/DESCRIPTION index 9ec2d6b..d77c470 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,11 +1,11 @@ Package: Rankcluster Type: Package Title: Model-Based Clustering for Multivariate Partial Ranking Data -Version: 0.94.5 -Date: 2021-01-26 +Version: 0.98.0 +Date: 2022-11-11 Authors@R: c(person("Quentin", "Grimonprez", role = c("aut", "cre"), email = "quentingrim@yahoo.fr"), - person("Julien", "Jacques", role = "aut"), - person("Christophe", "Biernacki", role = "aut")) + person("Julien", "Jacques", role = "aut", email = "julien.jacques@univ-lyon2.fr"), + person("Christophe", "Biernacki", role = "aut", email = "christophe.biernacki@inria.fr")) Description: Implementation of a model-based clustering algorithm for ranking data (C. Biernacki, J. Jacques (2013) ). Multivariate rankings as well as partial rankings are taken @@ -24,4 +24,4 @@ LinkingTo: Rcpp, RcppEigen Suggests: knitr, rmarkdown, testthat VignetteBuilder: knitr Encoding: UTF-8 -RoxygenNote: 7.1.2 +RoxygenNote: 7.2.1 diff --git a/NEWS b/NEWS index 6e4c010..e7a53b1 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,9 @@ -# +# v0.98.0 - 2022-11-11 - add unit tests +- improve numerical stability of probability computation (WIP) +- refactor C++ code (WIP) +- fix c++ warnings +- lint R code # v0.94.5 - 2021-01-26 - replace random_shuffle in C++ @@ -18,7 +22,49 @@ - factorization - correct roxygen S4 tag +# v0.94.1 - 2019-08-28 +- beautify R code +- update Makevars +- add src/Rankcluster_init.c file + +# v0.94 - 2016-07-29 +- fix bug in gibbs for ties and partial ranks + +# v0.93.1 - 2016-01-12 +- update documentation url +- use r random number generator + +# v0.92.9 - 2014-07-25 +- data and package documentation in their own files +- parallelization with OPENMP +- update doc + # v0.92 - 2014-02-12 -- ranks have to be given to the package in the ranking notation (see convertRank function), with the following convention: - - missing positions are replaced by 0 - - tied are replaced by the lowest position they share \ No newline at end of file +- remove Eigen and use RcppEigen +- add probability function +- ranks have to be given to the package in the ranking notation (to manage ties) +- update doc + +# v0.91.6 - 2013-12-19 +- update doc +- eigen licence/copyright + +# v0.91.5 - 2013-12-05 +- update eigen +- add eigen copyright + +# v0.91 - 2013-11-03 +- correct c++ warnings: comparison between unsigned and signed int +- correction in khi2 for partial data + +# v0.90.3 - 2013-09-13 +- add vignettes + +# v0.90.2 - 2013-09-05 +- fix error in examples due to randomness + +# v0.90.1 - 2013-08-30 +- correct limited development formula + +# v0.89 - 2013-08-28 +- on CRAN, now diff --git a/R/ISRdistribution.R b/R/ISRdistribution.R index 89f21e5..3b68cb8 100644 --- a/R/ISRdistribution.R +++ 
b/R/ISRdistribution.R @@ -1,9 +1,9 @@ #' @title Simulate a sample of ISR(pi,mu) -#' +#' #' @description This function simulates univariate rankings data (ordering representation) according to the ISR(pi,mu). #' #' @param n size of the sample. -#' @param pi dispersion parameter: probability of correct paired comparaison according to mu. +#' @param pi dispersion parameter: probability of correct paired comparison according to mu. #' @param mu position parameter: modal ranking in ordering representation. #' @return a matrix with simulated ranks. #' @@ -23,45 +23,54 @@ #' result of the judge is o = (3, 1, 2) whereas the ranking #' result is r = (2, 3, 1). #' -#' You can see the \link{convertRank} function to convert the simualted ranking drom ordering to ranking representation. +#' You can see the \link{convertRank} function to convert the simulated ranking from ordering to ranking representation. #' #' @references -#' [1] C.Biernacki and J.Jacques (2013), A generative model for rank data based on sorting algorithm, Computational Statistics and Data Analysis, 58, 162-176. -#' +#' [1] C.Biernacki and J.Jacques (2013), A generative model for rank data based on sorting algorithm, +#' Computational Statistics and Data Analysis, 58, 162-176. +#' #' @examples #' x <- simulISR(30, 0.8, 1:4) -#' +#' #' @author Julien Jacques -#' +#' #' @export -simulISR <- function(n, pi, mu) -{ - if (missing(n)) +simulISR <- function(n, pi, mu) { + if (missing(n)) { stop("n is missing") - if (missing(mu)) + } + if (missing(mu)) { stop("mu is missing") - if (missing(pi)) + } + if (missing(pi)) { stop("pi is missing") - - if (!is.numeric(n) || (length(n) > 1)) + } + + if (!is.numeric(n) || (length(n) > 1)) { stop("n must be a strictly positive integer") - if ((n != round(n)) || (n <= 0)) + } + if ((n != round(n)) || (n <= 0)) { stop("n must be a strictly positive integer") - - if (!is.numeric(pi) || (length(pi) > 1)) + } + + if (!is.numeric(pi) || (length(pi) > 1)) { stop("pi must be a real between 0 and 1") - if ((pi > 1) || (pi < 0)) + } + if ((pi > 1) || (pi < 0)) { stop("pi must be a real between 0 and 1") - - if (!is.vector(mu, mode = "numeric")) + } + + if (!is.vector(mu, mode = "numeric")) { stop("mu must be a complete rank") - if (!checkRank(mu)) + } + if (!checkRank(mu)) { stop("mu must be a complete rank") - - - + } + + + res <- .Call("simulISRR", n, length(mu), mu, pi, PACKAGE = "Rankcluster") - + return(res) } @@ -70,17 +79,20 @@ simulISR <- function(n, pi, mu) #' #' @description It computes the probability of a (multivariate) rank x according to a ISR(mu, pi). #' -#' @param x a vector or a matrix containing the rankings in ranking notation (see Details or \link{convertRank} function). +#' @param x a vector or a matrix containing the rankings in ranking notation (see Details or \link{convertRank} function). #' The rankings of each dimension are placed end to end. \code{x} must contain only full ranking (no partial or tied). -#' @param pi a vector of size \code{p=length(m)}, where \code{p} is the number of dimension, containing the probabilities of a good comparison of the model (dispersion parameters). -#' @param mu a vector of length \code{sum(m)} containing the modal ranks in ranking notation (see Details or \link{convertRank} function). +#' @param pi a vector of size \code{p=length(m)}, where \code{p} is the number of dimension, containing the probabilities of +#' a good comparison of the model (dispersion parameters). 
+#' @param mu a vector of length \code{sum(m)} containing the modal ranks in ranking notation (see Details or +#' \link{convertRank} function). #' The rankings of each dimension are placed end to end. \code{mu} must contain only full ranking (no partial or tied). #' @param m a vector containing the size of ranks for each dimension. -#' +#' #' @return the probability of \code{x} according to a ISR(mu, pi). #' #' @details -#' The ranks have to be given to the package in the ranking notation (see \link{convertRank} function), with the following convention: +#' The ranks have to be given to the package in the ranking notation (see \link{convertRank} function), +#' with the following convention: #' #' - missing positions are replaced by 0 #' @@ -105,101 +117,121 @@ simulISR <- function(n, pi, mu) #' #' @examples #' m <- c(4, 5) -#' x = mu <- matrix(nrow = 1, ncol = 9) -#' x[1:4] = c(1, 4, 2, 3) -#' x[5:9] = c(3, 5, 2, 4, 1) -#' mu[1:4] = 1:4 -#' mu[5:9] = c(3, 5, 4, 2, 1) +#' x <- mu <- matrix(nrow = 1, ncol = 9) +#' x[1:4] <- c(1, 4, 2, 3) +#' x[5:9] <- c(3, 5, 2, 4, 1) +#' mu[1:4] <- 1:4 +#' mu[5:9] <- c(3, 5, 4, 2, 1) #' pi <- c(0.75, 0.82) #' #' prob <- probability(x, mu, pi, m) #' prob -#' +#' #' @author Quentin Grimonprez -#' +#' #' @export #' -probability <- function(x, mu, pi, m = length(mu)) -{ +probability <- function(x, mu, pi, m = length(mu)) { x <- matrix(x, ncol = sum(m)) mu <- matrix(mu, ncol = sum(m)) - + checkProbability(x, mu, pi, m) - + # convert to ordering - for (i in 1:length(m)) - { - mu[1, (1 + cumsum(c(0, m))[i]):(cumsum(c(0, m))[i + 1])] = convertRank(mu[1, (1 + cumsum(c(0, m))[i]):(cumsum(c(0, m))[i + 1])]) - for (j in 1:nrow(x)) - { - x[j, (1 + cumsum(c(0, m))[i]):(cumsum(c(0, m))[i + 1])] = convertRank(x[j, (1 + cumsum(c(0, m))[i]):(cumsum(c(0, m))[i + 1])]) + for (i in seq_along(m)) { + mu[1, (1 + cumsum(c(0, m))[i]):(cumsum(c(0, m))[i + 1])] <- convertRank( + mu[1, (1 + cumsum(c(0, m))[i]):(cumsum(c(0, m))[i + 1])] + ) + for (j in seq_len(nrow(x))) { + x[j, (1 + cumsum(c(0, m))[i]):(cumsum(c(0, m))[i + 1])] <- convertRank( + x[j, (1 + cumsum(c(0, m))[i]):(cumsum(c(0, m))[i + 1])] + ) } - } - + proba <- rep(NA, nrow(x)) - for (j in seq_along(proba)) + for (j in seq_along(proba)) { proba[j] <- .Call("computeProba", x[j, , drop = FALSE], mu, pi, m, PACKAGE = "Rankcluster") - + } + return(proba) } -checkProbability <- function(x, mu, pi, m) -{ +checkProbability <- function(x, mu, pi, m) { ### check parameters - if (missing(x)) + if (missing(x)) { stop("x is missing.") - if (missing(mu)) + } + if (missing(mu)) { stop("mu is missing.") - if (missing(pi)) + } + if (missing(pi)) { stop("pi is missing.") - + } + # x - if (!(is.vector(x) || is.matrix(x))) + if (!(is.vector(x) || is.matrix(x))) { stop("x must be either a matrix or a vector.") - if (is.vector(x)) + } + if (is.vector(x)) { x <- t(as.matrix(x)) - if (!is.numeric(x)) + } + if (!is.numeric(x)) { stop("x must be either a matrix or a vector of integer.") - + } + # mu - if (!(is.vector(mu) || is.matrix(mu))) + if (!(is.vector(mu) || is.matrix(mu))) { stop("mu must be either a matrix or a vector.") - if (is.vector(mu)) + } + if (is.vector(mu)) { mu <- t(as.matrix(mu)) - if (!is.numeric(mu)) + } + if (!is.numeric(mu)) { stop("mu must be either a matrix or a vector of integer.") - + } + # pi - if (!is.numeric(pi)) + if (!is.numeric(pi)) { stop("pi must be a vector of probabilities.") - if (!is.vector(pi)) + } + if (!is.vector(pi)) { stop("pi must be a vector of probabilities.") - if ((min(pi) < 0) || max(pi) > 1) + } + 
if ((min(pi) < 0) || max(pi) > 1) { stop("pi must be a vector of probabilities.") - + } + # m - if (!is.numeric(m)) + if (!is.numeric(m)) { stop("m must be a vector of integer.") - if (!is.vector(m)) + } + if (!is.vector(m)) { stop("m must be a vector of integer.") - if (sum(unlist(lapply(m, is.wholenumber))) != length(m)) + } + if (sum(unlist(lapply(m, is.wholenumber))) != length(m)) { stop("m contains non integer.") - if (sum(m) != ncol(x)) + } + if (sum(m) != ncol(x)) { stop("sum(m) and the length of x do not match.") - if (sum(m) != length(mu)) + } + if (sum(m) != length(mu)) { stop("sum(m) and the length of mu do not match.") - if (length(m) != length(pi)) + } + if (length(m) != length(pi)) { stop("the length of pi and m do not match.") - + } + # check if mu contains ranks - for (i in 1:length(m)) - { - if (!checkRank(mu[, (1 + cumsum(c(0, m))[i]):(cumsum(c(0, m))[i + 1])], m[i])) + for (i in seq_along(m)) { + if (!checkRank(mu[, (1 + cumsum(c(0, m))[i]):(cumsum(c(0, m))[i + 1])], m[i])) { stop("mu is not correct.") - for (j in 1:nrow(x)) - if (!checkRank(x[j, (1 + cumsum(c(0, m))[i]):(cumsum(c(0, m))[i + 1])], m[i])) + } + for (j in seq_len(nrow(x))) { + if (!checkRank(x[j, (1 + cumsum(c(0, m))[i]):(cumsum(c(0, m))[i + 1])], m[i])) { stop("x is not correct.") + } + } } } \ No newline at end of file diff --git a/R/Rankcluster-package.R b/R/Rankcluster-package.R index 17a13b5..d40cc7f 100644 --- a/R/Rankcluster-package.R +++ b/R/Rankcluster-package.R @@ -5,23 +5,26 @@ #' @docType package #' @aliases Rankcluster-package #' @name Rankcluster-package -#' +#' #' @description This package proposes a model-based clustering algorithm for ranking data. #' Multivariate rankings as well as partial rankings are taken into account. #' This algorithm is based on an extension of the Insertion Sorting Rank (ISR) model for ranking data, which is a meaningful -#' and effective model parametrized by a position parameter (the modal ranking, quoted by mu) and a dispersion parameter (quoted by pi). -#' The heterogeneity of the rank population is modelled by a mixture of ISR, whereas conditional independence assumption is considered for multivariate rankings. +#' and effective model parametrized by a position parameter (the modal ranking, quoted by mu) and a dispersion parameter +#' (quoted by pi). The heterogeneity of the rank population is modeled by a mixture of ISR, whereas conditional independence +#' assumption is considered for multivariate rankings. #' #' @details #' The main function is \link{rankclust}. -#' See vignettes for detailled examples: \code{RShowDoc("dataFormat", package = "Rankcluster")} and \code{RShowDoc("Rankcluster", package = "Rankcluster")} -#' +#' See vignettes for detailed examples: \code{RShowDoc("dataFormat", package = "Rankcluster")} and +#' \code{RShowDoc("Rankcluster", package = "Rankcluster")} +#' #' -#' @references [1] C.Biernacki and J.Jacques (2013), A generative model for rank data based on sorting algorithm, Computational Statistics and Data Analysis, 58, 162-176. +#' @references [1] C.Biernacki and J.Jacques (2013), A generative model for rank data based on sorting algorithm, +#' Computational Statistics and Data Analysis, 58, 162-176. #' -#' [2] J.Jacques and C.Biernacki (2012), Model-based clustering for multivariate partial ranking data, Inria Research Report n 8113. +#' [2] J.Jacques and C.Biernacki (2012), Model-based clustering for multivariate partial ranking data, +#' Inria Research Report n 8113. 
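The probability() and checkProbability() code above repeatedly slices the concatenated multivariate ranks with cumsum(c(0, m)); a minimal base-R sketch of that indexing may help follow those loops. The values mirror the probability() example (m = c(4, 5)); the name bounds is introduced here only for illustration.

m <- c(4, 5)                        # sizes of the ranks in each dimension
bounds <- cumsum(c(0, m))           # 0 4 9: column offsets of each dimension
x <- c(1, 4, 2, 3, 3, 5, 2, 4, 1)   # one multivariate rank, dimensions placed end to end

for (i in seq_along(m)) {
  cols <- (bounds[i] + 1):bounds[i + 1]  # columns belonging to dimension i
  print(x[cols])                         # 1 4 2 3, then 3 5 2 4 1
}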
#' -#' @author Maintainer: Quentin Grimonprez #' #' @examples #' # see vignettes @@ -31,14 +34,13 @@ #' # main function of the package for run the algorithm #' data(big4) #' result <- rankclust(big4$data, K = 2, m = big4$m, Ql = 200, Bl = 100, maxTry = 2) -#' -#' if(result@@convergence) -#' { +#' +#' if(result@@convergence) { #' summary(result) -#' +#' #' partition <- result[2]@@partition #' tik <- result[2]@@tik #' } -#' +#' #' @keywords package NULL diff --git a/R/checkFunctions.R b/R/checkFunctions.R index 9e4d00e..3589362 100644 --- a/R/checkFunctions.R +++ b/R/checkFunctions.R @@ -4,78 +4,92 @@ # # @ return TRUE if the number is an integer, FALSE else # -is.wholenumber <- function(x, tol = .Machine$double.eps^0.5) -{ +is.wholenumber <- function(x, tol = .Machine$double.eps^0.5) { # if(!is.double(x)) # return(FALSE) - + abs(x - round(x)) < tol } -checkProportion <- function(proportion, paramName = "proportion", eps = 1e-10) -{ - if (missing(proportion)) +checkProportion <- function(proportion, paramName = "proportion", eps = 1e-10) { + if (missing(proportion)) { stop(paste0(paramName, " is missing")) - if (!is.vector(proportion, mode = "numeric")) + } + if (!is.vector(proportion, mode = "numeric")) { stop(paste0(paramName, " must be a vector of positive real whose sum equal 1")) - if (min(proportion) < 0) + } + if (min(proportion) < 0) { stop(paste0(paramName, " must be a vector of positive real whose sum equal 1")) - if (abs(1 - sum(proportion)) > eps) + } + if (abs(1 - sum(proportion)) > eps) { stop(paste0(paramName, " must be a vector of positive real whose sum equal 1")) + } } -checkPi <- function(pi, paramName = "pi") -{ - if (missing(pi)) +checkPi <- function(pi, paramName = "pi") { + if (missing(pi)) { stop(paste0(paramName, " is missing")) - if (!is.numeric(pi) || !is.matrix(pi)) + } + if (!is.numeric(pi) || !is.matrix(pi)) { stop(paste0(paramName, " must be a matrix of probabilities")) - if ((min(pi) < 0) && (max(pi) > 1)) + } + if ((min(pi) < 0) && (max(pi) > 1)) { stop(paste0(paramName, " must be a matrix of probabilities")) + } } -checkM <- function(m) -{ - if(missing(m)) +checkM <- function(m) { + if (missing(m)) { stop("m is missing") - if (!is.vector(m, mode = "numeric")) + } + if (!is.vector(m, mode = "numeric")) { stop("m must be a (vector of) integer strictly greater than 1") - if (length(m) != length(m[m > 1])) + } + if (length(m) != length(m[m > 1])) { stop("m must be a (vector of) integer strictly greater than 1") - if (!min(m == round(m))) + } + if (!min(m == round(m))) { stop("m must be a (vector of) integer strictly greater than 1") + } } -checkM2 <- function(m, pi, mu, piName = "pi", muName = "mu") -{ - if (length(m) != ncol(pi)) - stop(paste0("The number of column of ", piName," and m do not match.")) - if (sum(m) != ncol(mu)) - stop(paste0("The number of column of ", muName," and sum(m) do not match.")) +checkM2 <- function(m, pi, mu, piName = "pi", muName = "mu") { + if (length(m) != ncol(pi)) { + stop(paste0("The number of column of ", piName, " and m do not match.")) + } + if (sum(m) != ncol(mu)) { + stop(paste0("The number of column of ", muName, " and sum(m) do not match.")) + } } -checkMu <- function(mu, proportion, pi, muName = "mu", proportionName = "proportion", piName = "pi") -{ - if (missing(mu)) +checkMu <- function(mu, proportion, pi, muName = "mu", proportionName = "proportion", piName = "pi") { + if (missing(mu)) { stop(paste0(muName, " is missing")) - if (!is.numeric(mu) || !is.matrix(mu)) + } + if (!is.numeric(mu) || !is.matrix(mu)) { 
stop(paste0(muName, " must be a matrix of positive integer")) - if (min(mu) < 1) + } + if (min(mu) < 1) { stop(paste0(muName, " must be a matrix of positive integer")) - if (nrow(mu) != length(proportion)) - stop(paste0("The number of rows of ", muName, " and the length of ", proportionName , " do not match.")) - if (nrow(mu) != nrow(pi)) + } + if (nrow(mu) != length(proportion)) { + stop(paste0("The number of rows of ", muName, " and the length of ", proportionName, " do not match.")) + } + if (nrow(mu) != nrow(pi)) { stop(paste0("The number of rows of ", muName, " and ", piName, " do not match.")) + } } -checkData <- function(data) -{ - if (missing(data)) +checkData <- function(data) { + if (missing(data)) { stop("data is missing") - if (!is.numeric(data) || !is.matrix(data)) + } + if (!is.numeric(data) || !is.matrix(data)) { stop("X must be a matrix of positive integer") - if (length(data[data >= 0]) != length(data)) + } + if (length(data[data >= 0]) != length(data)) { stop("data must be a matrix of positive integer") -} \ No newline at end of file + } +} diff --git a/R/conversion.R b/R/conversion.R index c0f402e..c407bda 100644 --- a/R/conversion.R +++ b/R/conversion.R @@ -1,199 +1,173 @@ # convertir les mu (multi dim, plusieurs groupes) en une matrice -liste3d2mat <- function(liste) -{ - g = length(liste) - d = length(liste[[1]]) - m = rep(0, d) - - for (i in 1:d) - m[i] = length(liste[[1]][[i]]) - - mu = matrix(0, g, sum(m)) - rownom = rep(0, g) - colnom = rep("", sum(m)) - - for (j in 1:g) - { - compteur = 1 - rownom[j] = paste0("cl", j) - for (i in 1:d) - { - colnom[compteur] = paste0("dim", i) - for (k in 1:m[i]) - { - mu[j, compteur] = liste[[j]][[i]][k] - compteur = compteur + 1 +liste3d2mat <- function(liste) { + g <- length(liste) + d <- length(liste[[1]]) + m <- rep(0, d) + + for (i in 1:d) { + m[i] <- length(liste[[1]][[i]]) + } + + mu <- matrix(0, g, sum(m)) + rownom <- rep(0, g) + colnom <- rep("", sum(m)) + + for (j in 1:g) { + compteur <- 1 + rownom[j] <- paste0("cl", j) + for (i in 1:d) { + colnom[compteur] <- paste0("dim", i) + for (k in 1:m[i]) { + mu[j, compteur] <- liste[[j]][[i]][k] + compteur <- compteur + 1 } } } - colnames(mu) = colnom - rownames(mu) = rownom + colnames(mu) <- colnom + rownames(mu) <- rownom return(mu) } +liste2d2matD <- function(liste) { + d <- length(liste) + m <- rep(0, d) + for (i in 1:d) { + m[i] <- length(liste[[i]]) + } -liste2d2matD = function(liste) -{ - d = length(liste) - m = rep(0, d) - for (i in 1:d) - m[i] = length(liste[[i]]) - - - mu = matrix(0, 1, sum(m)) - colnom = rep("", sum(m)) + mu <- matrix(0, 1, sum(m)) + colnom <- rep("", sum(m)) - compteur = 1 - for (i in 1:d) - { - colnom[compteur] = paste0("dim", i) - for (k in 1:m[i]) - { - mu[1, compteur] = liste[[i]][k] - compteur = compteur + 1 + compteur <- 1 + for (i in 1:d) { + colnom[compteur] <- paste0("dim", i) + for (k in 1:m[i]) { + mu[1, compteur] <- liste[[i]][k] + compteur <- compteur + 1 } } - colnames(mu) = colnom - + colnames(mu) <- colnom return(mu) - } + # convertir les mu (1 dim, plusieurs groupes) en une matrice -liste2d2matG = function(liste) -{ - g = length(liste) - m = length(liste[[1]]) - - - mu = matrix(0, g, m) - rownom = rep(0, g) - colnames(mu) = c("dim1", rep("", m - 1)) - for (i in 1:g) - { - rownom[i] = paste0("cl", i) - - for (k in 1:m) - { - mu[i, k] = liste[[i]][k] +liste2d2matG <- function(liste) { + g <- length(liste) + m <- length(liste[[1]]) + + mu <- matrix(0, g, m) + rownom <- rep(0, g) + colnames(mu) <- c("dim1", rep("", m - 1)) + for (i in 
1:g) { + rownom[i] <- paste0("cl", i) + + for (k in 1:m) { + mu[i, k] <- liste[[i]][k] } } - rownames(mu) = rownom + rownames(mu) <- rownom return(mu) - } - # convertir les mu (1 dim, plusieurs groupes) en une matrice -listedistPartiel = function(liste) # ,index) -{ - - n = length(liste) - d = length(liste[[1]]) - mu = matrix(ncol = d, nrow = n) +listedistPartiel <- function(liste) { # ,index) + n <- length(liste) + d <- length(liste[[1]]) + mu <- matrix(ncol = d, nrow = n) # rownames(mu)=index - colnames(mu) = paste0("dim", c(1:d)) - for (i in 1:n) - mu[i, ] = liste[[i]] - + colnames(mu) <- paste0("dim", c(1:d)) + for (i in 1:n) { + mu[i, ] <- liste[[i]] + } return(mu) - } -partition <- function(row) -{ + +partition <- function(row) { return(which(row == max(row)) - 1) } ##### convertir liste en array -liste3d2array = function(liste) -{ - - iter = length(liste) - d = nrow(liste[[1]]) - g = ncol(liste[[1]]) - tab = array(dim = c(iter, g, d)) - for (i in 1:iter) - { - tab[i, , ] = t(liste[[i]]) +liste3d2array <- function(liste) { + iter <- length(liste) + d <- nrow(liste[[1]]) + g <- ncol(liste[[1]]) + tab <- array(dim = c(iter, g, d)) + for (i in 1:iter) { + tab[i, , ] <- t(liste[[i]]) } return(tab) - } -tliste3d2mat = function(liste) -{ - d = length(liste) - g = length(liste[[1]]) - m = rep(0, d) - - for (i in 1:d) - m[i] = length(liste[[i]][[1]]) - - mu = matrix(0, g, sum(m)) - rownom = rep(0, g) - colnom = rep("", sum(m)) - - for (j in 1:g) - { - compteur = 1 - rownom[j] = paste0("cl", j) - for (i in 1:d) - { - colnom[compteur] = paste0("dim", i) - for (k in 1:m[i]) - { - mu[j, compteur] = liste[[i]][[j]][k] - compteur = compteur + 1 +tliste3d2mat <- function(liste) { + d <- length(liste) + g <- length(liste[[1]]) + m <- rep(0, d) + + for (i in 1:d) { + m[i] <- length(liste[[i]][[1]]) + } + + mu <- matrix(0, g, sum(m)) + rownom <- rep(0, g) + colnom <- rep("", sum(m)) + + for (j in 1:g) { + compteur <- 1 + rownom[j] <- paste0("cl", j) + for (i in 1:d) { + colnom[compteur] <- paste0("dim", i) + for (k in 1:m[i]) { + mu[j, compteur] <- liste[[i]][[j]][k] + compteur <- compteur + 1 } } } - colnames(mu) = colnom - rownames(mu) = rownom + colnames(mu) <- colnom + rownames(mu) <- rownom return(mu) } -liste2d2matgd = function(liste) -{ - d = length(liste) - g = length(liste[[1]]) +liste2d2matgd <- function(liste) { + d <- length(liste) + g <- length(liste[[1]]) - mat = matrix(ncol = d, nrow = g) - colnames(mat) = paste0("dim", 1:d) - rownames(mat) = paste0("cl", 1:g) - for (i in 1:d) - mat[, i] = liste[[i]] + mat <- matrix(ncol = d, nrow = g) + colnames(mat) <- paste0("dim", 1:d) + rownames(mat) <- paste0("cl", 1:g) + for (i in 1:d) { + mat[, i] <- liste[[i]] + } return(mat) } -liste3d2listematgd = function(liste) -{ - d = length(liste[[1]]) - g = length(liste[[1]][[1]]) - coln = paste0("dim", 1:d) - rown = paste0("cl", 1:g) - outliste = lapply(liste, FUN = function(x) - { - mat = matrix(ncol = d, nrow = g) - colnames(mat) = coln - rownames(mat) = rown - for (i in 1:d) - mat[, i] = x[[i]] +liste3d2listematgd <- function(liste) { + d <- length(liste[[1]]) + g <- length(liste[[1]][[1]]) + coln <- paste0("dim", 1:d) + rown <- paste0("cl", 1:g) + outliste <- lapply(liste, FUN = function(x) { + mat <- matrix(ncol = d, nrow = g) + colnames(mat) <- coln + rownames(mat) <- rown + for (i in 1:d) { + mat[, i] <- x[[i]] + } return(mat) }) return(outliste) - } diff --git a/R/criteria.R b/R/criteria.R index 136e96f..cf1f565 100644 --- a/R/criteria.R +++ b/R/criteria.R @@ -1,101 +1,116 @@ #' @title Criteria 
estimation -#' -#' @description This function estimates the loglikelihood of a mixture of multidimensional ISR model, as well as the BIC and ICL model selection criteria. -#' -#' @param data a matrix in which each row is a rank (partial or not; for partial rank, missing elements of a rank are put to 0 ). +#' +#' @description This function estimates the loglikelihood of a mixture of multidimensional ISR model, as well as the +#' BIC and ICL model selection criteria. +#' +#' @param data a matrix in which each row is a rank (partial or not; for partial rank, +#' missing elements of a rank are put to 0). #' @param proportion a vector (which sums to 1) containing the K mixture proportions. -#' @param pi a matrix of size K*p, where K is the number of clusters and p the number of dimension, containing the probabilities of a good comparaison of the model (dispersion parameters). -#' @param mu a matrix of size K*sum(m), containing the modal ranks. Each row contains the modal rank for a cluster. In the case of multivariate ranks, the reference rank for each dimension are set successively on the same row. +#' @param pi a matrix of size K*p, where K is the number of clusters and p the number of dimension, containing the +#' probabilities of a good comparison of the model (dispersion parameters). +#' @param mu a matrix of size K*sum(m), containing the modal ranks. Each row contains the modal rank for a cluster. +#' In the case of multivariate ranks, the reference rank for each dimension are set successively on the same row. #' @param m a vector containing the size of ranks for each dimension. #' @param Ql number of iterations of the Gibbs sampler used for the estimation of the log-likelihood. #' @param Bl burn-in period of the Gibbs sampler. #' @param IC number of run of the computation of the loglikelihood. 
#' @param nb_cpus number of cpus for parallel computation -#' +#' #' @return a list containing: #' \item{ll}{the estimated log-likelihood.} #' \item{bic}{the estimated BIC criterion.} #' \item{icl}{the estimated ICL criterion.} -#' +#' #' @examples #' data(big4) -#' res = rankclust(big4$data, m = big4$m, K = 2, Ql = 100, Bl = 50, maxTry = 2) -#' if (res@@convergence) -#' crit = criteria(big4$data, res[2]@@proportion, res[2]@@pi, res[2]@@mu, +#' res <- rankclust(big4$data, m = big4$m, K = 2, Ql = 100, Bl = 50, maxTry = 2) +#' if (res@@convergence) { +#' crit <- criteria(big4$data, res[2]@@proportion, res[2]@@pi, res[2]@@mu, #' big4$m, Ql = 200, Bl = 100) -#' +#' } +#' #' @author Quentin Grimonprez -#' +#' #' @export -criteria <- function(data, proportion, pi, mu, m, Ql = 500, Bl = 100, IC = 1, nb_cpus = 1) -{ +criteria <- function(data, proportion, pi, mu, m, Ql = 500, Bl = 100, IC = 1, nb_cpus = 1) { checkCriteria(data, proportion, pi, mu, m, Ql, Bl, IC, nb_cpus) - a = t(pi) + a <- t(pi) - LL = .Call("loglikelihood", data, mu, a, proportion, m, Ql, Bl, IC, nb_cpus, PACKAGE = "Rankcluster") + LL <- .Call("loglikelihood", data, mu, a, proportion, m, Ql, Bl, IC, nb_cpus, PACKAGE = "Rankcluster") - if (LL$ll[1] == "pb") + if (LL$ll[1] == "pb") { stop("Data are not correct.") + } return(LL) } -checkCriteria <- function(data, proportion, pi, mu, m, Ql, Bl, IC, nb_cpus) -{ +checkCriteria <- function(data, proportion, pi, mu, m, Ql, Bl, IC, nb_cpus) { # data checkData(data) - - #proportion + + # proportion checkProportion(proportion, paramName = "proportion", eps = 1e-10) - + # m checkM(m) checkM2(m, pi, mu, piName = "pi", muName = "mu") # p checkPi(pi, paramName = "pi") - if ((nrow(pi) != length(proportion)) || (nrow(pi) != nrow(mu))) + if ((nrow(pi) != length(proportion)) || (nrow(pi) != nrow(mu))) { stop("The number of rows of pi doesn't match with the others parameters.") - + } + # Ql - if (!is.numeric(Ql) || (length(Ql) > 1)) + if (!is.numeric(Ql) || (length(Ql) > 1)) { stop("Ql must be a strictly positive integer") - if ((Ql != round(Ql)) || (Ql <= 0)) + } + if ((Ql != round(Ql)) || (Ql <= 0)) { stop("Ql must be a strictly positive integer") - + } + # IC - if (!is.numeric(IC) || (length(IC) > 1)) + if (!is.numeric(IC) || (length(IC) > 1)) { stop("IC must be a strictly positive integer") - if ((IC != round(IC)) || (IC <= 0)) + } + if ((IC != round(IC)) || (IC <= 0)) { stop("IC must be a strictly positive integer") - + } + # nb_cpus - if (!is.numeric(nb_cpus) || (length(nb_cpus) > 1)) + if (!is.numeric(nb_cpus) || (length(nb_cpus) > 1)) { stop("nb_cpus must be a strictly positive integer") - if ((nb_cpus != round(nb_cpus)) || (nb_cpus <= 0)) + } + if ((nb_cpus != round(nb_cpus)) || (nb_cpus <= 0)) { stop("nb_cpus must be a strictly positive integer") - + } + # Bl - if (!is.numeric(Bl) || (length(Bl) > 1)) + if (!is.numeric(Bl) || (length(Bl) > 1)) { stop("Bl must be a strictly positive integer lower than Ql") - if ((Bl != round(Bl)) || (Bl <= 0) || (Bl >= Ql)) + } + if ((Bl != round(Bl)) || (Bl <= 0) || (Bl >= Ql)) { stop("Bl must be a strictly positive integer lower than Ql") - + } + # mu checkMu(mu, proportion, pi, muName = "mu", proportionName = "proportion", piName = "pi") - - + + # check if mu contains ranks for (i in 1:length(m)) { - if (sum(apply(mu[, (1 + cumsum(c(0, m))[i]):(cumsum(c(0, m))[i + 1]), drop = FALSE], 1, checkRank, m[i])) != nrow(mu)) + if (sum(apply(mu[, (1 + cumsum(c(0, m))[i]):(cumsum(c(0, m))[i + 1]), drop = FALSE], 1, checkRank, m[i])) != nrow(mu)) { stop("mu is not 
correct") + } } - + # check data for (i in 1:length(m)) { - if (sum(apply(data[, (1 + cumsum(c(0, m))[i]):(cumsum(c(0, m))[i + 1])], 1, checkTiePartialRank, m[i])) != nrow(data)) + if (sum(apply(data[, (1 + cumsum(c(0, m))[i]):(cumsum(c(0, m))[i + 1])], 1, checkTiePartialRank, m[i])) != nrow(data)) { stop("Data are not correct") + } } } \ No newline at end of file diff --git a/R/data.R b/R/data.R index 420d90d..8a558fc 100644 --- a/R/data.R +++ b/R/data.R @@ -6,31 +6,35 @@ #' @name APA #' @format A list containing: #' \describe{ -#' \item{data}{ matrix of size 5738x5 containing the 5738 observed full ranks in ranking representation. -#' The ranking representation r=(r_1,...,r_m) contains the ranks assigned to the objects, and means that the ith object is in r_ith position. +#' \item{data}{A matrix of size 5738x5 containing the 5738 observed full ranks in ranking representation. +#' The ranking representation r=(r_1,...,r_m) contains the ranks assigned to the objects, and means that the ith object is +#' in r_ith position. #' -#' For example, if the ranking representation of a rank is (4,3,1,2,5), it means that judge ranks the first object in 4th position, second object in 3rd position, ...} +#' For example, if the ranking representation of a rank is (4,3,1,2,5), it means that judge ranks +#' the first object in 4th position, second object in 3rd position, ...} #' #' \item{frequency}{matrix of size 120x6. Each row corresponds to one of the different observed rank. #' The first fifth columns contains the observed ranks (ordering representation) and the sixth column #' contains the frequency of observation.} #' -#' \item{m}{ vector with the size of the ranks (5 here).} +#' \item{m}{vector with the size of the ranks (5 here).} #' } #' -#' @description This dataset contains the 5738 full rankings resulting from the American Psychological Association (APA) presidential election of 1980. -#' For this election, members of APA had to rank five candidates in order of preference. +#' @description This dataset contains the 5738 full rankings resulting from the American Psychological Association (APA) +#' presidential election of 1980. For this election, members of APA had to rank five candidates in order of preference. #' -#' For information, a total of 15449 votes have been registred for this election, but only the 5738 full rankings are reported in the APA dataset. Candidates A and C were research psychologists, candidates D and E were clinical psychologists and candidate B was a community psychologist. +#' For information, a total of 15449 votes have been registered for this election, but only the 5738 full rankings are +#' reported in the APA dataset. Candidates A and C were research psychologists, candidates D and E were clinical +#' psychologists and candidate B was a community psychologist. #' #' #' @source "Group representations in probability and statistics", P. Diaconis, 1988. #' #' @examples #' data(APA) -#' +#' #' @family datasets -#' +#' #' @keywords datasets NULL @@ -41,106 +45,123 @@ NULL #' @name big4 #' @format A list containing: #' \describe{ -#' \item{data}{A matrix of size 21*8 containing the 21 Premier League seasons. Each row corresponding to one ranking (ranking representation). +#' \item{data}{A matrix of size 21*8 containing the 21 Premier League seasons. Each row corresponding to one ranking +#' (ranking representation). #' -#' The ranking representation r=(r_1,...,r_m) contains the ranks assigned to the objects, and means that the ith object is in r_ith position. 
+#' The ranking representation r=(r_1,...,r_m) contains the ranks assigned to the objects, and means that the +#' ith object is in r_ith position. #' -#' For example, if the ranking representation of a rank is (4,3,1,2,5), it means that judge ranks the first object in 4th position, second object in 3rd position, ...} -#' \item{frequency}{matrix of size 21*9. Each row corresponds to one of the 21 different observed rankings, and the last column contains the observation frequency.} +#' For example, if the ranking representation of a rank is (4,3,1,2,5), it means that judge ranks the first +#' object in 4th position, second object in 3rd position, ...} +#' \item{frequency}{matrix of size 21*9. Each row corresponds to one of the 21 different observed rankings, and the +#' last column contains the observation frequency.} #' \item{m}{the size of the rankings (m=c(4,4) ).} #' } -#' -#' @description This dataset is composed of the rankings (in ranking notation) of the "Big Four" English football teams (A: Manchester, B: Liverpool, C: Arsenal, D: Chelsea) to the English Championship (Premier League) and according to the UEFA coefficients (statistics used in Europe for ranking and seeding teams in international competitions), from 1993 to 2013. #' -#' In 2000-2001, Arsenal and Chelsea had the same UEFA coefficient and then are tied. UEFA ranking is (1, 4, 2, 2) for 2000-2001, what means that Manchester United is the first, Liverpool is the last, and the two intermediate positions are for Arsenal and Chelsea in an unknown order. +#' @description This dataset is composed of the rankings (in ranking notation) of the "Big Four" English football teams +#' (A: Manchester, B: Liverpool, C: Arsenal, D: Chelsea) to the English Championship (Premier League) and according to the UEFA +#' coefficients (statistics used in Europe for ranking and seeding teams in international competitions), from 1993 to 2013. +#' +#' In 2000-2001, Arsenal and Chelsea had the same UEFA coefficient and then are tied. UEFA ranking is (1, 4, 2, 2) for +#' 2000-2001, what means that Manchester United is the first, Liverpool is the last, and the two intermediate positions are +#' for Arsenal and Chelsea in an unknown order. #' #' In 2009-2010, Liverpool and Arsenal have also the same UEFA coefficient, the ranking is (1, 2, 2, 4). #' #' @source \url{https://en.wikipedia.org/wiki/Premier_League} #' -#' \url{https://www.uefa.com/memberassociations/uefarankings/club/} +#' \url{https://fr.uefa.com/nationalassociations/uefarankings/club/} #' #' @examples #' data(big4) -#' +#' #' @family datasets -#' +#' #' @keywords datasets NULL -#' @title Multidimensionnal partial rank data: eurovision +#' @title Multidimensional partial rank data: eurovision #' @docType data #' @aliases eurovision #' @name eurovision #' @format A list containing: #' \describe{ -#' \item{data}{ A matrix of size 34*48. Each row corresponds to the ranking representation of a multidimensionnal ranking. +#' \item{data}{ A matrix of size 34*48. Each row corresponds to the ranking representation of a multidimensional ranking. #' Columns 1 to 8 correspond to the 2007 contest, columns 9 to 18 to the 2008 contest, etc... #' -#' The ranking representation r=(r_1,...,r_m) contains the ranks assigned to the objects, and means that the ith object is in r_ith position. +#' The ranking representation r=(r_1,...,r_m) contains the ranks assigned to the objects, and means that +#' the ith object is in r_ith position. 
#' -#' For example, if the ranking representation of a rank is (4,3,1,2,5), it means that judge ranks the first object in 4th position, second object in 3rd position, ... +#' For example, if the ranking representation of a rank is (4,3,1,2,5), it means that judge ranks the first +#' object in 4th position, second object in 3rd position, ... #' #' } #' -#' \item{frequency}{A matrix of size 34*49 containing the differents multidimensionnal rankings. The 48 first columns are the same as in data, and the last column contains the frequency (1 for all ranks).} +#' \item{frequency}{A matrix of size 34*49 containing the different multidimensional rankings. The 48 first columns are +#' the same as in data, and the last column contains the frequency (1 for all ranks).} #' #' \item{m}{ a vector with the sizes of ranks for each dimension.} #' } -#' -#' @description This dataset contains the ranking of the 8 common finalists of the Eurovision song contest from 2007 to 2012: +#' +#' @description This dataset contains the ranking of the 8 common finalists of the Eurovision song contest from 2007 to 2012: #' #' A: France, B:Germany, C:Greece, D:Romania, E:Russia, F:Spain, G:Ukraine, H:United Kingdom. #' -#' The number of rankings is 33, corresponding to the 33 European countries having participated to this six editions of the contest. +#' The number of rankings is 33, corresponding to the 33 European countries having participated to those +#' six editions of the contest. #' -#' All the rankings are partial since none country has ranked this 8 countries in its 10 preferences. Missing ranking elements are zeros. +#' All the rankings are partial since none country has ranked this 8 countries in its 10 preferences. Missing ranking +#' elements are zeros. #' #' @source https://eurovision.tv #' #' @examples #' data(eurovision) -#' +#' #' @family datasets -#' +#' #' @keywords datasets NULL -#' @title Multidimensionnal rank data: quiz +#' @title Multidimensional rank data: quiz #' @docType data #' @aliases quiz #' @name quiz #' @format A list containing: #' \describe{ -#' \item{data}{a matrix of size 70*16. The student's answers are in row and the 16 columns correspond to the 4 rankings (for the 4 quizzes) of size 4 (ranking representation). +#' \item{data}{a matrix of size 70*16. The student's answers are in row and the 16 columns correspond to the 4 rankings +#' (for the 4 quizzes) of size 4 (ranking representation). #' -#' The ranking representation r=(r_1,...,r_m) contains the ranks assigned to the objects, and means that the ith object is in r_ith position. +#' The ranking representation r=(r_1,...,r_m) contains the ranks assigned to the objects, and means that the ith object is +#' in r_ith position. #' -#' For example, if the ranking representation of a rank is (4,3,1,2,5), it means that judge ranks the first object in 4th position, second object in 3rd position, ...} -#' \item{frequency}{a matrix of size 63*17. Each row corresponds to one of the 63 differents observed -#' rankings (ranking representation). Each row contains 4 ranks of size 4 and a last column for the frequency.} +#' For example, if the ranking representation of a rank is (4,3,1,2,5), it means that judge ranks the first object in 4th +#' position, second object in 3rd position, ...} +#' \item{frequency}{a matrix of size 63*17. Each row corresponds to one of the 63 different observed rankings +#' (ranking representation). 
Each row contains 4 ranks of size 4 and a last column for the frequency.} #' \item{m}{a vector with the sizes of the ranks for each dimension.} #' #' } -#' -#' @description This dataset contains the answers of 70 students (40 of third year and 30 of fourth year) from Polytech'Lille (statistics engineering school, France) to the four following quizzes: #' -#' \describe{ +#' @description This dataset contains the answers of 70 students (40 of third year and 30 of fourth year) +#' from Polytech'Lille (statistics engineering school, France) to the four following quizzes: #' +#' \describe{#' #' \item{Literature Quiz}{ -#' This quiz consists of ranking four french writers according to chronological order: -#' A=Victor Hugo, B=Moliere, C=Albert Camus, D=Jean-Jacques Rousseau.} +#' This quiz consists of ranking four French writers according to chronological order: +#' A=Victor Hugo, B=Molière, C=Albert Camus, D=Jean-Jacques Rousseau.} #' #' \item{Football Quiz}{ -#' This quiz consists of ranking four national football teams according to increasing number of wins in the football World Cup: A=France, B=Germany, C=Brazil, D=Italy.} +#' This quiz consists of ranking four national football teams according to increasing number of wins in the football +#' World Cup: A=France, B=Germany, C=Brazil, D=Italy.} #' #' \item{Mathematics Quiz}{ #' This quiz consists of ranking four numbers according to increasing order: #' A=pi/3, B=log(1), C=exp(2), D=(1+sqrt(5))/2.} #' #' \item{Cinema Quiz}{ -#' This quiz consists of ranking four Tarentino's movies according to chronological order: +#' This quiz consists of ranking four Tarantino's movies according to chronological order: #' A=Inglourious Basterds, B=Pulp Fiction, C=Reservoir Dogs, D=Jackie Brown.} #' #' } @@ -149,9 +170,9 @@ NULL #' #' @examples #' data(quiz) -#' +#' #' @family datasets -#' +#' #' @keywords datasets NULL @@ -164,14 +185,17 @@ NULL #' \describe{ #' \item{data}{a matrix containing 130 ranks of size 7 in ranking representation. #' -#' The ranking representation r=(r_1,...,r_m) contains the ranks assigned to the objects, and means that the ith object is in r_ith position. +#' The ranking representation r=(r_1,...,r_m) contains the ranks assigned to the objects, and means that +#' the ith object is in r_ith position. #' -#' For example, if the ranking representation of a rank is (4,3,1,2,5), it means that judge ranks the first object in 4th position, second object in 3rd position, ...} +#' For example, if the ranking representation of a rank is (4,3,1,2,5), it means that judge ranks the first +#' object in 4th position, second object in 3rd position, ...} #' -#' \item{frequency}{a matrix with 123 differents ranks of size 7. In each row the first 7 columns correspond to one observed ranking and the last column contains the observation frequency.} +#' \item{frequency}{a matrix with 123 different ranks of size 7. 
In each row the first 7 columns correspond to one +#' observed ranking and the last column contains the observation frequency.} #' \item{m}{ the size of the rankings (m=7).} #' } -#' +#' #' @description This data set is due to Louis Roussos who asked 130 students at the #' University of Illinois to rank seven sports according to their preference in participating: #' A = Baseball, B = Football, C = Basketball, D = Tennis, E = Cycling, F = @@ -181,9 +205,9 @@ NULL #' #' @examples #' data(sports) -#' +#' #' @family datasets -#' +#' #' @keywords datasets NULL @@ -193,25 +217,30 @@ NULL #' @name words #' @format A list containing: #' \describe{ -#' \item{data}{A matrix of size 98*5 containing the 98 answers. Each row corresponding to one ranking (ranking representation). +#' \item{data}{A matrix of size 98*5 containing the 98 answers. Each row corresponding to one ranking +#' (ranking representation). #' -#' The ranking representation r=(r_1,...,r_m) contains the ranks assigned to the objects, and means that the ith object is in r_ith position. +#' The ranking representation r=(r_1,...,r_m) contains the ranks assigned to the objects, and means that +#' the ith object is in r_ith position. #' -#' For example, if the ranking representation of a rank is (4,3,1,2,5), it means that judge ranks the first object in 4th position, second object in 3rd position, ...} -#' \item{frequency}{matrix of size 15*6. Each row corresponds to one of the 15 different observed rankings, and the last column contains the observation frequency.} +#' For example, if the ranking representation of a rank is (4,3,1,2,5), it means that judge ranks the +#' first object in 4th position, second object in 3rd position, ...} +#' \item{frequency}{matrix of size 15*6. Each row corresponds to one of the 15 different observed rankings, and the +#' last column contains the observation frequency.} #' \item{m}{the size of the rankings (m=5).} #' } -#' -#' @description The data was collected under the auspices of the Graduate Record -#' Examination Board. A sample of 98 college students were asked to rank five words according to strength of association (least to most associated) with the target word "Idea": +#' +#' @description The data was collected under the auspices of the Graduate Record +#' Examination Board. A sample of 98 college students were asked to rank five words according to strength of association +#' (least to most associated) with the target word "Idea": #' A = Thought, B = Play, C = Theory, D = Dream and E = Attention. #' #' @source M.A. Fligner and J.S. Verducci. "Distance based ranking models". J. Roy. Statist. Soc. Ser. B, 48(3):359-369, 1986. 
#' #' @examples #' data(sports) -#' +#' #' @family datasets -#' +#' #' @keywords datasets NULL diff --git a/R/mixtureSEM.R b/R/mixtureSEM.R index aa1fc33..26d09cc 100644 --- a/R/mixtureSEM.R +++ b/R/mixtureSEM.R @@ -7,194 +7,189 @@ # @param m a vector with the size of ranks for each dimension # @param maxIt the maximum number of iteration of the algorithm -# @param Qsem the total number of iterations for the SEM algorithm (defaut value=40) +# @param Qsem the total number of iterations for the SEM algorithm (default value=40) # @param Bsem burn-in period for SEM algorithm (default value=10) -# @param RjSE a vector containing the number of iteration for each dimension of the Gibbs algorithm in the SE step for generate partial ranks and orders of presentation(only for SEM algorithm, default value=m(m-1)/2) -# @param RjM a vector containing the number of iterations for each dimension for the Gibbs Sampler in the M step(only for SEM algorithm, default value=m(m-1)/2) -# @param Ql number of iterations of the Gibbs sampler for estimation of log-likelihood (only for SEM algorithm, default value=100) +# @param RjSE a vector containing the number of iteration for each dimension of the Gibbs algorithm in the SE step for +# generating partial ranks and orders of presentation(only for SEM algorithm, default value=m(m-1)/2) +# @param RjM a vector containing the number of iterations for each dimension for the Gibbs Sampler in the M step +# (only for SEM algorithm, default value=m(m-1)/2) +# @param Ql number of iterations of the Gibbs sampler for estimation of log-likelihood +# (only for SEM algorithm, default value=100) # @param Bl burn-in period for estimation of log-likelihood (only for SEM algorithm, default value=50) -# @param detail boolean, if TRUE, time and others informations will be print during the process (default value FALSE) +# @param detail boolean, if TRUE, time and others information will be print during the process (default value FALSE) -# @return an object containing the refererence rank mu, the probability pi of a correct comparaison, proportion, conditionnal probability of belonging to each cluster (tik), the loglikelihood, the partition, the BIC and the ICL +# @return an object containing the reference rank mu, the probability pi of a correct comparison, proportion, +# conditional probability of belonging to each cluster (tik), the loglikelihood, the partition, the BIC and the ICL # @references "Model-based clustering for multivariate partial ranking data", J. Jacques, C. 
Biernacki # @examples # data(APA)#m=5 # mixtureSEM(APA,2) # @export -mixtureSEM <- function(X, g, m, Qsem, Bsem, Ql, Bl, RjSE, RjM, maxTry, run, detail) -{ - - n = nrow(X) - d = length(m) - if (ncol(X) != sum(m)) +mixtureSEM <- function(X, g, m, Qsem, Bsem, Ql, Bl, RjSE, RjM, maxTry, run, detail) { + n <- nrow(X) + d <- length(m) + if (ncol(X) != sum(m)) { stop(paste0("the number of column of X (", ncol(X), ") does not match to the sum of vector m (", sum(m), ").")) - - + } + + # Verification des donnees - for (i in 1:d) - { - check = apply(X[, (1 + cumsum(c(0, m))[i]):(cumsum(c(0, m))[i + 1])], 1, checkTiePartialRank, m[i]) - if (sum(check) != n) - { - indfalse = which(check == 0) + for (i in 1:d) { + check <- apply(X[, (1 + cumsum(c(0, m))[i]):(cumsum(c(0, m))[i + 1])], 1, checkTiePartialRank, m[i]) + if (sum(check) != n) { + indfalse <- which(check == 0) stop(paste("Data are not correct.\n", "For dimension", i, ", ranks at row", indfalse, "are not correct.")) } - } - - res = .Call("semR", X, m, g, Qsem, Bsem, Ql, Bl, RjSE, RjM, maxTry, run, detail, PACKAGE = "Rankcluster") - if (res$stock[1] == 2) - { - res$indexPb = lapply(res$indexPb, unique) - for (i in 1:d) - { - if (length(res$indexPb) != 0) - { + + res <- .Call("semR", X, m, g, Qsem, Bsem, Ql, Bl, RjSE, RjM, maxTry, run, detail, PACKAGE = "Rankcluster") + if (res$stock[1] == 2) { + res$indexPb <- lapply(res$indexPb, unique) + for (i in 1:d) { + if (length(res$indexPb) != 0) { cat(paste0("For dimension ", i, ", rankings at the following index have format problem:\n")) cat(res$indexPb[[i]]) } } - stop("Problem with your data.\n The ranks have to be given in the ranking notation (see convertRank function), with the following convention: + stop( + "Problem with your data.\n The ranks have to be given in the ranking notation (see convertRank function), with the following convention: - missing positions are replaced by 0 - tied are replaced by the lowest position they share\n") } - - + + # recuperation des resultats - if (res$stock[1] == 1) # si convergence - { - res$referenceRank = tliste3d2mat(res$referenceRank) - res$initMu = tliste3d2mat(res$initMu) - res$p = liste2d2matgd(res$p) - res$initPi = liste2d2matgd(res$initPi) - res$cluster = res$cluster + 1 - - res$entropy = cbind(res$entropy, res$cluster) - res$probability = cbind(res$probability, res$cluster) - colnames(res$entropy) = c("entropy", "cluster") - colnames(res$probability) = c("probability", "cluster") - - res$distMu = liste3d2listematgd(res$distMu) - res$distP = liste3d2listematgd(res$distP) - - ### rank conversion from ordering to ranking - indM = c(0, cumsum(m)) - - for (i in 1:length(m)) - { - # res$initMu - res$initMu[, (indM[i] + 1):indM[i + 1]] = t(apply(res$initMu[, (indM[i] + 1):indM[i + 1], drop = FALSE], 1, convertRank)) - - # res$referenceRank - res$referenceRank[, (indM[i] + 1):indM[i + 1]] = t(apply(res$referenceRank[, (indM[i] + 1):indM[i + 1], drop = FALSE], 1, convertRank)) - - } - - - - if (res$stock[2] == 1) # si il y a des donnees partielles - { - res$partialRank = tliste3d2mat(res$partialRank) ## proba a rajoute - - rownames(res$partialRank) = rep("", nrow(res$partialRank)) # enlever les cl1... 
- # colnames(res$partialRank)[1]="Index" - # colnames(res$partialRank)[ncol(res$rangPartial)]="Probability" - res$initPartialRank = tliste3d2mat(res$initPartialRank) - res$scorePartial = tliste3d2mat(res$scorePartial) - # colnames(res$initPartialRank)[1]="Index" - rownames(res$initPartialRank) = rep("", nrow(res$initPartialRank)) - rownames(res$scorePartial) = rep("", nrow(res$scorePartial)) - + if (res$stock[1] == 1) { # si convergence + res$referenceRank <- tliste3d2mat(res$referenceRank) + res$initMu <- tliste3d2mat(res$initMu) + res$p <- liste2d2matgd(res$p) + res$initPi <- liste2d2matgd(res$initPi) + res$cluster <- res$cluster + 1 + + res$entropy <- cbind(res$entropy, res$cluster) + res$probability <- cbind(res$probability, res$cluster) + colnames(res$entropy) <- c("entropy", "cluster") + colnames(res$probability) <- c("probability", "cluster") + + res$distMu <- liste3d2listematgd(res$distMu) + res$distP <- liste3d2listematgd(res$distP) + ### rank conversion from ordering to ranking - for (i in 1:length(m)) - { - # res$initPartialRank - res$initPartialRank[, (indM[i] + 1):indM[i + 1]] = t(apply(res$initPartialRank[, (indM[i] + 1):indM[i + 1], drop = FALSE], 1, convertRank)) - - # res$partialRank - # res$partialRank[,(indM[i]+1):indM[i+1]]=t(apply(res$partialRank[,(indM[i]+1):indM[i+1],drop=FALSE],1,convertRank)) - for (j in 1:n) - { - ordtemp = order(res$partialRank[j, (indM[i] + 1):indM[i + 1]]) - res$partialRank[j, (indM[i] + 1):indM[i + 1]] = ordtemp - res$scorePartial[j, (indM[i] + 1):indM[i + 1]] = res$scorePartial[j, ((indM[i] + 1):indM[i + 1])][ordtemp] - - } + indM <- c(0, cumsum(m)) + + for (i in seq_along(m)){ + # res$initMu + res$initMu[, (indM[i] + 1):indM[i + 1]] <- t( + apply(res$initMu[, (indM[i] + 1):indM[i + 1], drop = FALSE], 1, convertRank) + ) + + # res$referenceRank + res$referenceRank[, (indM[i] + 1):indM[i + 1]] <- t( + apply(res$referenceRank[, (indM[i] + 1):indM[i + 1], drop = FALSE], 1, convertRank) + ) } - - - res$distPartialRank = lapply(res$distPartialRank, FUN = function(x) {listedistPartiel(x)}) - - result = new(Class = "Output", - bic = res$stock[4], - icl = res$stock[5], - ll = res$stock[3], - proportion = res$proportion, - pi = res$p, - mu = res$referenceRank, - tik = res$tik, - partition = res$cluster, - entropy = res$entropy, - probability = res$probability, - convergence = TRUE, - partial = TRUE, - partialRank = res$partialRank, - distanceZ = res$distZ, - distanceMu = res$distMu, - distanceProp = res$distProp, - distancePi = res$distP, - distancePartialRank = res$distPartialRank, - piInitial = res$initPi, - muInitial = res$initMu, - partialRankInitial = res$initPartialRank, - proportionInitial = res$initProportion, - partialRankScore = res$scorePartial) - } - else - { - - result = new(Class = "Output", - bic = res$stock[4], - icl = res$stock[5], - ll = res$stock[3], - proportion = res$proportion, - pi = res$p, - mu = res$referenceRank, - tik = res$tik, - partition = res$cluster, - entropy = res$entropy, - probability = res$probability, - convergence = TRUE, - partial = FALSE, - distanceZ = res$distZ, - distanceMu = res$distMu, - distanceProp = res$distProp, - distancePi = res$distP, - piInitial = res$initPi, - muInitial = res$initMu, - proportionInitial = res$initProportion) - } - - if (detail) - { - cat("RESULTS:\n") - cat("NUMBER OF CLUSTERS: ", g) - cat("\nLoglikelihood =", res$stock[3]) - cat("\nBIC=", res$stock[4]) - cat("\nICL=", res$stock[5]) - cat("\nProportion:", res$proportion) - cat("\nProbabilities pi:\n") - print(res$p) - 
cat("\nReference ranks mu:\n") - print(res$referenceRank) - } - } - else - { - - result = new(Class = "Output", convergence = FALSE) + + + if (res$stock[2] == 1) { # si il y a des donnees partielles + res$partialRank <- tliste3d2mat(res$partialRank) ## proba a rajoute + + rownames(res$partialRank) <- rep("", nrow(res$partialRank)) # enlever les cl1... + # colnames(res$partialRank)[1]="Index" + # colnames(res$partialRank)[ncol(res$rangPartial)]="Probability" + res$initPartialRank <- tliste3d2mat(res$initPartialRank) + res$scorePartial <- tliste3d2mat(res$scorePartial) + # colnames(res$initPartialRank)[1]="Index" + rownames(res$initPartialRank) <- rep("", nrow(res$initPartialRank)) + rownames(res$scorePartial) <- rep("", nrow(res$scorePartial)) + + ### rank conversion from ordering to ranking + for (i in seq_along(m)) { + # res$initPartialRank + res$initPartialRank[, (indM[i] + 1):indM[i + 1]] <- t( + apply(res$initPartialRank[, (indM[i] + 1):indM[i + 1], drop = FALSE], 1, convertRank) + ) + + # res$partialRank + #res$partialRank[,(indM[i]+1):indM[i+1]]=t(apply(res$partialRank[,(indM[i]+1):indM[i+1],drop=FALSE],1,convertRank)) + for (j in 1:n) { + ordtemp <- order(res$partialRank[j, (indM[i] + 1):indM[i + 1]]) + res$partialRank[j, (indM[i] + 1):indM[i + 1]] <- ordtemp + res$scorePartial[j, (indM[i] + 1):indM[i + 1]] <- res$scorePartial[j, ((indM[i] + 1):indM[i + 1])][ordtemp] + } + } + + + res$distPartialRank <- lapply(res$distPartialRank, FUN = function(x) { + listedistPartiel(x) + }) + + result <- new( + Class = "Output", + bic = res$stock[4], + icl = res$stock[5], + ll = res$stock[3], + proportion = res$proportion, + pi = res$p, + mu = res$referenceRank, + tik = res$tik, + partition = res$cluster, + entropy = res$entropy, + probability = res$probability, + convergence = TRUE, + partial = TRUE, + partialRank = res$partialRank, + distanceZ = res$distZ, + distanceMu = res$distMu, + distanceProp = res$distProp, + distancePi = res$distP, + distancePartialRank = res$distPartialRank, + piInitial = res$initPi, + muInitial = res$initMu, + partialRankInitial = res$initPartialRank, + proportionInitial = res$initProportion, + partialRankScore = res$scorePartial + ) + } else { + result <- new( + Class = "Output", + bic = res$stock[4], + icl = res$stock[5], + ll = res$stock[3], + proportion = res$proportion, + pi = res$p, + mu = res$referenceRank, + tik = res$tik, + partition = res$cluster, + entropy = res$entropy, + probability = res$probability, + convergence = TRUE, + partial = FALSE, + distanceZ = res$distZ, + distanceMu = res$distMu, + distanceProp = res$distProp, + distancePi = res$distP, + piInitial = res$initPi, + muInitial = res$initMu, + proportionInitial = res$initProportion + ) + } + + if (detail) { + cat("RESULTS:\n") + cat("NUMBER OF CLUSTERS: ", g) + cat("\nLoglikelihood =", res$stock[3]) + cat("\nBIC=", res$stock[4]) + cat("\nICL=", res$stock[5]) + cat("\nProportion:", res$proportion) + cat("\nProbabilities pi:\n") + print(res$p) + cat("\nReference ranks mu:\n") + print(res$referenceRank) + } + } else { + result <- new(Class = "Output", convergence = FALSE) } - + return(result) } diff --git a/R/rankDistance.R b/R/rankDistance.R index b8cfe45..50889d2 100644 --- a/R/rankDistance.R +++ b/R/rankDistance.R @@ -1,45 +1,41 @@ #' @title Kendall distance between two ranks -#' +#' #' @description The Kendall distance between two ranks is the number of pairs that are in different order in the two ranks. -#' +#' #' @param x,y two ranks of size m. 
#' @param type type of the rank representation ("ordering" ou "ranking"). -#' +#' #' @return an integer, the Kendall distance between x and y. -#' +#' #' @references A New Measure of Rank Correlation, M. G. Kendall -#' +#' #' @examples #' x <- 1:5 #' y <- c(2, 3, 1, 4, 5) #' distKendall(x, y, type = "ordering") -#' +#' #' @author Julien Jacques -#' +#' #' @family distance -#' +#' #' @export -distKendall <- function(x, y, type = "ordering") -{ - if (type == "ordering") - dist = distKendall_ordering(x, y) - else - dist = distKendall_ranking(x, y) +distKendall <- function(x, y, type = "ordering") { + if (type == "ordering") { + dist <- distKendall_ordering(x, y) + } else { + dist <- distKendall_ranking(x, y) + } return(dist) } -distKendall_ranking <- function(x, y) -{ - m = length(x) - distKendall_ranking = 0 - for (i in 1:(m - 1)) - { - for (j in (i + 1):m) - { - if (((x[i] - x[j]) * (y[i] - y[j])) < 0) - { - distKendall_ranking = distKendall_ranking + 1 +distKendall_ranking <- function(x, y) { + m <- length(x) + distKendall_ranking <- 0 + for (i in 1:(m - 1)) { + for (j in (i + 1):m) { + if (((x[i] - x[j]) * (y[i] - y[j])) < 0) { + distKendall_ranking <- distKendall_ranking + 1 } } } @@ -48,42 +44,39 @@ distKendall_ranking <- function(x, y) -distKendall_ordering <- function(x, y) -{ - m = length(x) - distKendall_ordering = 0 - x_ordering = x - y_ordering = y - for (i in 1:m) - { - x_ordering[i] = which(x == i) - y_ordering[i] = which(y == i) +distKendall_ordering <- function(x, y) { + m <- length(x) + distKendall_ordering <- 0 + x_ordering <- x + y_ordering <- y + for (i in 1:m) { + x_ordering[i] <- which(x == i) + y_ordering[i] <- which(y == i) } - distKendall_ordering = distKendall_ranking(x_ordering, y_ordering) + distKendall_ordering <- distKendall_ranking(x_ordering, y_ordering) return(distKendall_ordering) } #' @title Spearman distance between two ranks -#' +#' #' @description The Spearman distance is the square of Euclidean distance between two rank vector. #' #' @param x,y two ranks of size m. -#' +#' #' @return an integer, the Spearman distance between x and y. -#' +#' #' @examples #' x <- 1:5 #' y <- c(2, 3, 1, 4, 5) #' distSpearman(x, y) -#' +#' #' @author Julien Jacques -#' +#' #' @family distance -#' +#' #' @export -distSpearman <- function(x, y) -{ +distSpearman <- function(x, y) { distSpearman <- sum((x - y)^2) return(distSpearman) } @@ -98,43 +91,40 @@ distSpearman <- function(x, y) # y=c(2,3,1,4,5) # CorrelSpearman(x, y) # @export -correlSpearman <- function(x, y) -{ +correlSpearman <- function(x, y) { correlSpearman <- sqrt(sum((x - y)^2)) return(correlSpearman) } #' @title Cayley distance between two ranks -#' -#' @description The Cayley distance between two ranks x and y is the minimum number of transpositions required to transform the ranking x into y. -#' -#' +#' +#' @description The Cayley distance between two ranks x and y is the minimum number of transpositions required to +#' transform the ranking x into y. +#' +#' #' @param x,y two ranks of size m. -#' +#' #' @return the Cayley distance between x and y. 
-#' +#' #' @examples #' x <- 1:5 #' y <- c(2, 3, 1, 4, 5) #' distCayley(x, y) -#' +#' #' @author Julien Jacques -#' +#' #' @family distance -#' +#' #' @export -distCayley <- function(x, y) -{ +distCayley <- function(x, y) { m <- length(x) distCayley <- 0 - for (i in 1:(m - 1)) - { - if (!x[i] == y[i]) - { - distCayley = distCayley + 1 + for (i in 1:(m - 1)) { + if (!x[i] == y[i]) { + distCayley <- distCayley + 1 tmp <- x[i] - x[i] = y[i] - x[which(y[i] == x)] = tmp + x[i] <- y[i] + x[which(y[i] == x)] <- tmp } } return(distCayley) @@ -143,24 +133,24 @@ distCayley <- function(x, y) #' @title Hamming distance between two ranks #' #' @description The Hamming distance between two ranks x and y is the number of difference between the two ranks. -#' For example, the Hamming's distance between x=(1,4,2,5,3) and y=(1,3,4,5,2) is 3 because, only 1 and 5 have the same place in both ranks. -#' +#' For example, the Hamming's distance between x=(1,4,2,5,3) and y=(1,3,4,5,2) is 3 because, only 1 and 5 have the same +#' place in both ranks. +#' #' @param x,y two ranks of size m. -#' +#' #' @return an integer, the Hamming distance between x and y. -#' +#' #' @examples #' x <- 1:5 #' y <- c(2, 3, 1, 4, 5) #' distHamming(x, y) -#' +#' #' @author Julien Jacques -#' +#' #' @family distance -#' +#' #' @export -distHamming <- function(x, y) -{ +distHamming <- function(x, y) { distHamming <- sum(x != y) return(distHamming) diff --git a/R/rankManipulation.R b/R/rankManipulation.R index 2490185..957f1b4 100644 --- a/R/rankManipulation.R +++ b/R/rankManipulation.R @@ -1,47 +1,50 @@ # convert a single rank -convertSingleRank <- function(x) -{ +convertSingleRank <- function(x) { xb <- rep(0, length(x)) ind <- x[x > 0] - for (i in ind) - xb[i] = which(x == i) + for (i in ind) { + xb[i] <- which(x == i) + } return(xb) } #' @title Change the representation of a rank #' -#' @description convertRank converts a rank from its ranking representation to its ordering representation, and vice-versa. The function does not work with partial ranking. -#' The transformation to convert a rank from ordering to ranking representation is the same that from ranking to ordering representation, there is no need to precise the representation of rank x. +#' @description convertRank converts a rank from its ranking representation to its ordering representation, and vice-versa. +#' The function does not work with partial ranking. The transformation to convert a rank from ordering to ranking +#' representation is the same that from ranking to ordering representation, there is no need to precise the representation +#' of rank x. #' #' @param x a rank (vector) datum either in its ranking or ordering representation. -#' -#' @return a rank (vector) in its ordering representation if its ranking representation has been given in input of convertRank, and vice-versa. -#' -#' +#' +#' @return a rank (vector) in its ordering representation if its ranking representation has been given in input of +#' convertRank, and vice-versa. +#' +#' #' @details The ranking representation r=(r_1,...,r_m) contains the ranks assigned to the objects, #' and means that the ith object is in r_ith position. #' #' The ordering representation o=(o_1,...,o_m) means that object o_i is in the ith position. #' -#' Let us consider the following example to illustrate both notations: a judge, which has to rank three holidays destinations according to its preferences, -#' O1 = Countryside, O2 =Mountain and O3 = Sea, ranks first Sea, second Countryside, and last Mountain. 
The ordering result of the judge is o = (3, 1, 2) -#' whereas the ranking result is r = (2, 3, 1). +#' Let us consider the following example to illustrate both notations: a judge, which has to rank three holidays destinations +#' according to its preferences, O1 = Countryside, O2 =Mountain and O3 = Sea, ranks first Sea, second Countryside, +#' and last Mountain. The ordering result of the judge is o = (3, 1, 2) whereas the ranking result is r = (2, 3, 1). +#' #' -#' #' @examples #' x <- c(2, 3, 1, 4, 5) #' convertRank(x) -#' +#' #' @author Julien Jacques -#' +#' #' @export -convertRank <- function(x) -{ - if (is.matrix(x)) +convertRank <- function(x) { + if (is.matrix(x)) { return(t(apply(x, 1, convertSingleRank))) - else + } else { return(convertSingleRank(x)) + } } # ' @@ -50,7 +53,8 @@ convertRank <- function(x) # ' # ' @title Check the data # ' @param X a matrix containing ranks -# ' @param m a vector composed of the sizes of the rankings of each dimension (default value is the number of column of the matrix data). +# ' @param m a vector composed of the sizes of the rankings of each dimension +# ' (default value is the number of column of the matrix data). # ' # ' @return a list containing for each dimension, numbers of rows with problem. # ' @@ -90,77 +94,76 @@ convertRank <- function(x) # } # checkRank check if a vector is a rank -checkRank <- function(x, m = length(x)) -{ +checkRank <- function(x, m = length(x)) { return(sum(sort(x) == (1:m)) == m) } # checkPartialRank check if a vector is a partial rank # missing element : 0 -checkPartialRank <- function(x, m = length(x)) -{ +checkPartialRank <- function(x, m = length(x)) { return((length(x[x <= m]) == m) && (length(x[x >= 0]) == m) && (length(unique(x[x != 0])) == length(x[x != 0]))) } # checkPartialRank check if a vector is a partial rank -checkTiePartialRank <- function(x, m = length(x)) -{ - return ((length(x[x <= m]) == m) && (length(x[x >= 0]) == m)) +checkTiePartialRank <- function(x, m = length(x)) { + return((length(x[x <= m]) == m) && (length(x[x >= 0]) == m)) } # completeRank complete partial that have only one missing element -completeRank <- function(x) -{ - if (length(x[x == 0]) == 1) - { +completeRank <- function(x) { + if (length(x[x == 0]) == 1) { m <- length(x) a <- 1:m - a[x[x != 0]] = 0 - x[x == 0] = a[a != 0] + a[x[x != 0]] <- 0 + x[x == 0] <- a[a != 0] } return(x) } #' @title Convert data storage -#' -#' @description This function takes in input a matrix containing all the observed ranks (a rank can be repeated) and returns a matrix containing all -#' the different observed ranks with their observation frequencies (in the last column). +#' +#' @description This function takes in input a matrix containing all the observed ranks (a rank can be repeated) +#' and returns a matrix containing all the different observed ranks with their observation frequencies (in the last column). #' #' @param X a matrix containing ranks. #' @param m a vector with the size of ranks of each dimension. -#' +#' #' @return A matrix containing each different observed ranks with its observation frequencies in the last column. 
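# A minimal illustration of the behaviour described above, using a small made-up sample of
# ranks of size 4 (the matrix X is hypothetical; frequence() and unfrequence() are the
# package functions documented here):
library(Rankcluster)
X <- matrix(c(1, 2, 3, 4,
              1, 2, 3, 4,
              4, 3, 2, 1), ncol = 4, byrow = TRUE)
Y <- frequence(X)    # one row per distinct rank, with its frequency appended as the last column
Z <- unfrequence(Y)  # expands the frequency column back to one row per observation
nrow(Z) == nrow(X)   # TRUE: the same observations are recovered (row order may differ)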
-#' +#' #' @examples #' X <- matrix(1:4, ncol = 4, nrow = 5, byrow = TRUE) #' Y <- frequence(X) #' Y -#' +#' #' @author Quentin Grimonprez -#' +#' #' @seealso \link{unfrequence} -#' +#' #' @export -frequence <- function(X, m = ncol(X)) -{ - if (missing(X)) +frequence <- function(X, m = ncol(X)) { + if (missing(X)) { stop("X is missing") - if (!is.numeric(X) || !is.matrix(X)) + } + if (!is.numeric(X) || !is.matrix(X)) { stop("X must be a matrix of positive integer") - if (length(X[X >= 0]) != length(X)) + } + if (length(X[X >= 0]) != length(X)) { stop("X must be a matrix of positive integer") - if (!is.vector(m, mode = "numeric")) + } + if (!is.vector(m, mode = "numeric")) { stop("m must be a (vector of) integer strictly greater than 1") - if (length(m) != length(m[m > 1])) + } + if (length(m) != length(m[m > 1])) { stop("m must be a (vector of) integer strictly greater than 1") + } - if (length(m) == 1) - { - if (m != ncol(X)) - { - print(paste0("You put m=", m, ", but X has ", ncol(X), " columns(rank of size ", ncol(X) - 1, " and 1 for the frequence).")) + if (length(m) == 1) { + if (m != ncol(X)) { + print(paste0( + "You put m=", m, ", but X has ", ncol(X), " columns(rank of size ", ncol(X) - 1, " and 1 for the frequence)." + )) print(paste0("The algorithm will continue with m=", ncol(X) - 1)) } } @@ -168,43 +171,43 @@ frequence <- function(X, m = ncol(X)) res <- .Call("freqMultiR", X, m, PACKAGE = "Rankcluster") data <- matrix(0, ncol = length(res$data[[1]]) + 1, nrow = length(res$data)) - for (i in 1:nrow(data)) - data[i, ] = c(res$data[[i]], res$freq[[i]]) + for (i in seq_len(nrow(data))) { + data[i, ] <- c(res$data[[i]], res$freq[[i]]) + } return(data) - } #' @title Convert data -#' -#' @description This function takes in input a matrix in which the m first columns are the different observed ranks and the last column contains the observation frequency, -#' and returns a matrix containing all the ranks (ranks with frequency>1 are repeated). +#' +#' @description This function takes in input a matrix in which the m first columns are the different observed ranks and +#' the last column contains the observation frequency, and returns a matrix containing all the ranks (ranks with frequency>1 +#' are repeated). #' #' @param data a matrix containing rankings and observation frequency. -#' +#' #' @return a matrix containing all the rankings. -#' +#' #' @examples #' data(quiz) #' Y <- unfrequence(quiz$frequency) #' Y -#' +#' #' @seealso \link{frequence} -#' +#' #' @export -unfrequence <- function(data) -{ +unfrequence <- function(data) { X <- matrix(ncol = ncol(data) - 1, nrow = sum(data[, ncol(data)])) - colnames(X) = colnames(data)[-ncol(data)] + colnames(X) <- colnames(data)[-ncol(data)] compteur <- 1 - for (i in 1:nrow(data)) - for (j in 1:data[i, ncol(data)]) - { - X[compteur, ] = data[i, -ncol(data)] - compteur = compteur + 1 + for (i in seq_len(nrow(data))) { + for (j in seq_len(data[i, ncol(data)])) { + X[compteur, ] <- data[i, -ncol(data)] + compteur <- compteur + 1 } - + } + return(X) } diff --git a/R/rankclust.R b/R/rankclust.R index 4a6ba4f..7d82b62 100644 --- a/R/rankclust.R +++ b/R/rankclust.R @@ -1,35 +1,40 @@ #' @title Model-based clustering for multivariate partial ranking -#' -#' @description This functions estimates a clustering of ranking data, potentially multivariate, partial and containing tied, based on a mixture of multivariate ISR model [2]. 
+#' +#' @description This function estimates a clustering of ranking data, potentially multivariate, partial and containing ties, +#' based on a mixture of multivariate ISR model [2]. #' By specifying only one cluster, the function performs a modelling of the ranking data using the multivariate ISR model. #' The estimation is performed thanks to a SEM-Gibbs algorithm. #' -#' +#' @param data a matrix in which each row is a ranking (partial or not; for partial ranking, -#' missing elements must be 0 or NA. Tied are replaced by the lowest position they share). For multivariate rankings, the rankings of each dimension are -#' placed end to end in each row. The data must be in ranking notation (see Details or +#' missing elements must be 0 or NA. Ties are replaced by the lowest position they share). For multivariate rankings, +#' the rankings of each dimension are placed end to end in each row. The data must be in ranking notation (see Details or #' \link{convertRank} functions). -#' @param m a vector composed of the sizes of the rankings of each dimension (default value is the number of column of the matrix data). +#' @param m a vector composed of the sizes of the rankings of each dimension (default value is the number of columns of the +#' matrix data). #' @param K an integer or a vector of integer with the number of clusters. #' @param criterion criterion "bic" or "icl", criterion to minimize for selecting the number of clusters. #' @param Qsem the total number of iterations for the SEM algorithm (default value=40). #' @param Bsem burn-in period for SEM algorithm (default value=10). #' @param RjSE a vector containing, for each dimension, the number of iterations of the Gibbs sampler #' used both in the SE step for partial rankings and for the presentation orders generation (default value=mj(mj-1)/2). -#' @param RjM a vector containing, for each dimension, the number of iterations of the Gibbs sampler used in the M step (default value=mj(mj-1)/2) +#' @param RjM a vector containing, for each dimension, the number of iterations of the Gibbs sampler used in the M step +#' (default value=mj(mj-1)/2) #' @param Ql number of iterations of the Gibbs sampler #' for estimation of log-likelihood (default value=100). #' @param Bl burn-in period for estimation of log-likelihood (default value=50). #' @param maxTry maximum number of restarts of the SEM-Gibbs algorithm in the case of non convergence (default value=3). #' @param run number of runs of the algorithm for each value of K. -#' @param detail boolean, if TRUE, time and others informations will be print during the process (default value FALSE). +#' @param detail boolean, if TRUE, time and other information will be printed during the process (default value FALSE). #' #' @return An object of class Rankclust (See \code{\link{Output-class}} and \code{\link{Rankclust-class}}). -#' If the output object is named \code{res}. You can access the result by res[number of groups]@@slotName where \code{slotName} is an element of the class Output. +#' If the output object is named \code{res}, you can access the result by res[number of groups]@@slotName, where +#' \code{slotName} is an element of the class Output.
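# A minimal sketch of the accessors described above, patterned on the big4 example used in
# this documentation; the SEM-Gibbs algorithm is stochastic, so exact values vary between runs:
library(Rankcluster)
data(big4)
res <- rankclust(big4$data, m = big4$m, K = 2, Ql = 200, Bl = 100, maxTry = 2)
if (res@convergence) {
  res["bic"]         # vector of BIC values, one entry per fitted number of clusters
  res[2]@proportion  # mixture proportions of the 2-cluster solution
  res[2]@mu          # modal rankings (ranking notation) of the 2-cluster solution
}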
+#' #' -#' #' @details -#' The ranks have to be given to the package in the ranking notation (see \link{convertRank} function), with the following convention: +#' The ranks have to be given to the package in the ranking notation (see \link{convertRank} function), with the following +#' convention: #' #' - missing positions are replaced by 0 #' @@ -55,18 +60,19 @@ #' #' #' @references -#' [1] C.Biernacki and J.Jacques (2013), A generative model for rank data based on sorting algorithm, Computational Statistics and Data Analysis, 58, 162-176. +#' [1] C.Biernacki and J.Jacques (2013), A generative model for rank data based on sorting algorithm, +#' Computational Statistics and Data Analysis, 58, 162-176. #' -#' [2] J.Jacques and C.Biernacki (2012), Model-based clustering for multivariate partial ranking data, Inria Research Report n 8113. +#' [2] J.Jacques and C.Biernacki (2012), Model-based clustering for multivariate partial ranking data, +#' Inria Research Report n 8113. #' #' @examples #' data(big4) #' result <- rankclust(big4$data, K = 2, m = big4$m, Ql = 200, Bl = 100, maxTry = 2) -#' -#' if(result@@convergence) -#' { +#' +#' if(result@@convergence) { #' summary(result) -#' +#' #' partition <- result[2]@@partition #' tik <- result[2]@@tik #' } @@ -77,55 +83,48 @@ #' #' @export #' -rankclust <- function(data, m = ncol(data), K = 1, criterion = "bic", Qsem = 100, Bsem = 20, +rankclust <- function(data, m = ncol(data), K = 1, criterion = "bic", Qsem = 100, Bsem = 20, RjSE = m * (m - 1) / 2, RjM = m * (m - 1) / 2, Ql = 500, Bl = 100, - maxTry = 3, run = 1, detail = FALSE) -{ - + maxTry = 3, run = 1, detail = FALSE) { .checkArgRankclust(data, m, K, criterion, Qsem, Bsem, RjSE, RjM, Ql, Bl, detail, maxTry, run) # change NA in data - data[is.na(data)] = 0 + data[is.na(data)] <- 0 # output container - result = c() + result <- c() # number of clusters for which the algorithm converges - G = c() + G <- c() # loop over the different number of cluster - for (k in K) - { - if (detail) + for (k in K) { + if (detail) { cat("K=", k, "\n") - ## first run - res = mixtureSEM(data, k, m, Qsem, Bsem, Ql, Bl, RjSE, RjM, maxTry, run, detail) - if (res@convergence) - { - G = c(G, k) - result = c(result, list(res)) } - else - { + ## first run + res <- mixtureSEM(data, k, m, Qsem, Bsem, Ql, Bl, RjSE, RjM, maxTry, run, detail) + if (res@convergence) { + G <- c(G, k) + result <- c(result, list(res)) + } else { cat("\n for K=", k, "clusters, the algorithm has not converged (a proportion was equal to 0 during the process), please retry\n") } } # end for K number of cluster - if (length(G) == 0) - { - resultat = new("Rankclust", convergence = FALSE) + if (length(G) == 0) { + resultat <- new("Rankclust", convergence = FALSE) cat("No convergence for all values of K (a proportion was equal to 0 during the process). 
Please retry\n") - } - else - { - colnom = c() - for (i in 1:length(m)) - colnom = c(colnom, paste0("dim", i), rep("", m[i] - 1)) + } else { + colnom <- c() + for (i in seq_along(m)) { + colnom <- c(colnom, paste0("dim", i), rep("", m[i] - 1)) + } - colnames(data) = colnom + colnames(data) <- colnom - resultat = new("Rankclust", K = G, criterion = criterion, results = result, data = data, convergence = TRUE) + resultat <- new("Rankclust", K = G, criterion = criterion, results = result, data = data, convergence = TRUE) } @@ -133,80 +132,101 @@ rankclust <- function(data, m = ncol(data), K = 1, criterion = "bic", Qsem = 100 } -.checkArgRankclust = function(data, m, K, criterion, Qsem, Bsem, RjSE, RjM, Ql, Bl, detail, maxTry, run) -{ +.checkArgRankclust <- function(data, m, K, criterion, Qsem, Bsem, RjSE, RjM, Ql, Bl, detail, maxTry, run) { ################## check the arguments # data checkData(data) # m checkM(m) - if (sum(m) != ncol(data)) + if (sum(m) != ncol(data)) { stop("The number of column of data and m don't match.") + } # K - if (!is.vector(K, mode = "numeric")) + if (!is.vector(K, mode = "numeric")) { stop("K must be a (vector of) integer strictly greater than 0") - if (length(K) != length(K[K > 0])) + } + if (length(K) != length(K[K > 0])) { stop("K must be a (vector of) integer strictly greater than 0") - if (!min(K == round(K))) + } + if (!min(K == round(K))) { stop("K must be a (vector of) integer strictly greater than 0") + } # criterion - if (criterion != "bic" && criterion != "icl") + if (criterion != "bic" && criterion != "icl") { stop("criterion must be \"bic\" or \"icl\" ") + } # Qsem - if (!is.numeric(Qsem) || (length(Qsem) > 1)) + if (!is.numeric(Qsem) || (length(Qsem) > 1)) { stop("Qsem must be a strictly positive integer") - if ((Qsem != round(Qsem)) || (Qsem <= 0)) + } + if ((Qsem != round(Qsem)) || (Qsem <= 0)) { stop("Qsem must be a strictly positive integer") + } # Bsem - if (!is.numeric(Bsem) || (length(Bsem) > 1)) + if (!is.numeric(Bsem) || (length(Bsem) > 1)) { stop("Bsem must be a strictly positive integer lower than Qsem") - if ((Bsem != round(Bsem)) || (Bsem <= 0) || (Bsem >= Qsem)) + } + if ((Bsem != round(Bsem)) || (Bsem <= 0) || (Bsem >= Qsem)) { stop("Bsem must be a strictly positive integer lower than Qsem") + } # RjM - if (!is.numeric(RjM) || (length(RjM) != length(m))) + if (!is.numeric(RjM) || (length(RjM) != length(m))) { stop("RjM must be a vector of strictly positive integer") - if (any((RjM != round(RjM)) | (RjM <= 0))) + } + if (any((RjM != round(RjM)) | (RjM <= 0))) { stop("RjM must be a vector of strictly positive integer") + } # RjSE - if (!is.numeric(RjSE) || (length(RjSE) != length(m))) + if (!is.numeric(RjSE) || (length(RjSE) != length(m))) { stop("RjSE must be a vector of strictly positive integer") - if (any((RjSE != round(RjSE)) | (RjSE <= 0))) + } + if (any((RjSE != round(RjSE)) | (RjSE <= 0))) { stop("RjSE must be a vector of strictly positive integer") + } # Ql - if (!is.numeric(Ql) || (length(Ql) > 1)) + if (!is.numeric(Ql) || (length(Ql) > 1)) { stop("Ql must be a strictly positive integer") - if ((Ql != round(Ql)) || (Ql <= 0)) + } + if ((Ql != round(Ql)) || (Ql <= 0)) { stop("Ql must be a strictly positive integer") + } # Bl - if (!is.numeric(Bl) || (length(Bl) > 1)) + if (!is.numeric(Bl) || (length(Bl) > 1)) { stop("Bl must be a strictly positive integer lower than Ql") - if ((Bl != round(Bl)) || (Bl <= 0) || (Bl >= Ql)) + } + if ((Bl != round(Bl)) || (Bl <= 0) || (Bl >= Ql)) { stop("Bl must be a strictly positive integer lower 
than Ql") + } # maxTry - if (!is.numeric(maxTry) || (length(maxTry) != 1)) + if (!is.numeric(maxTry) || (length(maxTry) != 1)) { stop("maxTry must be a positive integer") - if ((maxTry != round(maxTry)) || (maxTry <= 0)) + } + if ((maxTry != round(maxTry)) || (maxTry <= 0)) { stop("maxTry must be a positive integer") + } # run - if (!is.numeric(run) || (length(run) != 1)) + if (!is.numeric(run) || (length(run) != 1)) { stop("run must be a positive integer") - if ((run != round(run)) || (run <= 0)) + } + if ((run != round(run)) || (run <= 0)) { stop("run must be a positive integer") + } # detail - if (!is.logical(detail)) + if (!is.logical(detail)) { stop("detail must be a logical.") + } } diff --git a/R/resultClass.R b/R/resultClass.R index add1434..f2a07bf 100644 --- a/R/resultClass.R +++ b/R/resultClass.R @@ -29,13 +29,13 @@ #' @slot partialRankScore confidence score in estimated partial rank #' @slot distanceProp Distances (MSE) between the final estimation and the current #' value at each iteration of the SEM-Gibbs algorithm (except the burn-in phase) for proportions. A list of Qsem-Bsem elements, -#' each element being a K*p-matrix. -#' @slot distancePi Distances (MSE) between the final estimation and the current -#' value at each iteration of the SEM-Gibbs algorithm (except the burn-in phase) for scale parameters. A list of Qsem-Bsem elements, #' each element being a K*p-matrix. -#' @slot distanceMu Distances (Kendall distance) between the final estimation and the current -#' value at each iteration of the SEM-Gibbs algorithm (except the burn-in phase) for proportions. A list of Qsem-Bsem elements, +#' @slot distancePi Distances (MSE) between the final estimation and the current value at each iteration of the +#' SEM-Gibbs algorithm (except the burn-in phase) for scale parameters. A list of Qsem-Bsem elements, #' each element being a K*p-matrix. +#' @slot distanceMu Distances (Kendall distance) between the final estimation and the current value at each iteration of the +#' SEM-Gibbs algorithm (except the burn-in phase) for proportions. A list of Qsem-Bsem elements, +#' each element being a K*p-matrix. #' @slot distanceZ a vector of size Qsem-Bsem containing the rand index between the final #' estimated partition and the current value at each iteration of the SEM-Gibbs algorithm (except #' the burn-in phase). Let precise that the rand index is not affected by label switching. @@ -48,7 +48,6 @@ #' @slot piInitial a matrix containing the initialization of the probabilities of good paired comparison in the algorithm. #' @slot muInitial a matrix containing the initialization of modal rankings in the algorithm. #' @slot partialRankInitial a matrix containing the initialization of the partial rankings in the algorithm. -#' #' #' #' @name Output-class @@ -123,14 +122,15 @@ setClass( #' any number of clusters). #' @slot results a list of \link{Output-class}, containing the results for each number of clusters (one #' element of the list is associated to one number of clusters). -#' +#' #' #' #' @details -#' If \code{res} is the result of \link{rankclust} function, each slot of results can be reached by \code{res[k]@@slotname}, where -#' \code{k} is the number of clusters and \code{slotname} is the name of the slot we want to reach (see \link{Output-class}). -#' For the slots, \code{ll}, \code{bic}, \code{icl}, \code{res["slotname"]} returns a vector of size \code{k} containing the values of the -#' slot for each number of clusters. 
+#' If \code{res} is the result of \link{rankclust} function, each slot of results can be reached by \code{res[k]@@slotname}, +#' where \code{k} is the number of clusters and \code{slotname} is the name of the slot we want to reach +#' (see \link{Output-class}). +#' For the slots, \code{ll}, \code{bic}, \code{icl}, \code{res["slotname"]} returns a vector of size \code{k} containing +#' the values of the slot for each number of clusters. #' #' @name Rankclust-class #' @rdname Rankclust-class @@ -152,11 +152,10 @@ setClass( criterion = "bic", convergence = logical(0) ) - ) #' @name [,Rankclust-method -#' +#' #' @title Getter method for rankclust output #' #' @description Extract values of various @@ -164,62 +163,48 @@ setClass( #' #' @param x object from which to extract element(s) or in which to replace element(s). #' @param i the number of cluster of the element we want to extract or "bic", "icl", "ll" -#' @param j,drop not used -#' +#' @param j,drop not used +#' @usage \S4method{[}{Rankclust}(x, i, j, drop) #' @export setMethod( f = "[", signature = signature(x = "Rankclust"), definition = function(x, i, j, drop) { - if (x@convergence) - { - if (is.numeric(i)) - { - if (i %in% x@K) - { + if (x@convergence) { + if (is.numeric(i)) { + if (i %in% x@K) { return(x@results[[which(x@K == i)]]) - } - else + } else { stop("Invalid number of cluster.") - } - else - { - if (i == "bic") - { - bic = rep(NA, length(x@K)) - for (iter in 1:length(x@K)) - { - if (x@results[[iter]]@convergence) - bic[iter] = x@results[[iter]]@bic + } + } else { + if (i == "bic") { + bic <- rep(NA, length(x@K)) + for (iter in seq_along(bic)) { + if (x@results[[iter]]@convergence) { + bic[iter] <- x@results[[iter]]@bic + } } return(bic) - } - else - { - if (i == "icl") - { - icl = rep(NA, length(x@K)) - for (iter in 1:length(x@K)) - { - if (x@results[[iter]]@convergence) - icl[iter] = x@results[[iter]]@icl + } else { + if (i == "icl") { + icl <- rep(NA, length(x@K)) + for (iter in seq_along(x@K)) { + if (x@results[[iter]]@convergence) { + icl[iter] <- x@results[[iter]]@icl + } } return(icl) - } - else - { - if (i == "ll") - { - ll = rep(NA, length(x@K)) - for (iter in 1:length(x@K)) - { - if (x@results[[iter]]@convergence) - ll[iter] = x@results[[iter]]@ll + } else { + if (i == "ll") { + ll <- rep(NA, length(x@K)) + for (iter in seq_along(x@K)) { + if (x@results[[iter]]@convergence) { + ll[iter] <- x@results[[iter]]@ll + } } return(ll) - } - else - { + } else { stop("Invalid Name.") } } @@ -240,34 +225,28 @@ setMethod( f = "summary", signature = "Rankclust", definition = function(object, ...) 
{ - if (object@convergence) - { - if (object@criterion == "bic") - { - BIC = c() - for (i in object@K) - { - BIC = c(BIC, object@results[[which(object@K == i)]]@bic) + if (object@convergence) { + if (object@criterion == "bic") { + BIC <- c() + for (i in object@K) { + BIC <- c(BIC, object@results[[which(object@K == i)]]@bic) } - index = which(BIC == min(BIC)) - } - else - { - ICL = c() - for (i in object@K) - { - ICL = c(ICL, object@results[[which(object@K == i)]]@icl) + index <- which(BIC == min(BIC)) + } else { + ICL <- c() + for (i in object@K) { + ICL <- c(ICL, object@results[[which(object@K == i)]]@icl) } - index = which(ICL == min(ICL)) - + index <- which(ICL == min(ICL)) } cat("******************************************************************\n") cat("NUMBER OF CLUSTERS: ", object@K[index], "\n") - if (object@criterion == "bic") + if (object@criterion == "bic") { cat(object@criterion, "=", object[object@K[index]]@bic) - else + } else { cat(object@criterion, "=", object[object@K[index]]@icl) + } cat("\nLoglikelihood =", object[object@K[index]]@ll) cat("\n\n************************ PARAMETERS ******************************\n") cat("Proportion:", object[object@K[index]]@proportion) @@ -277,13 +256,12 @@ setMethod( print(object[object@K[index]]@pi) cat("\n************************ CLUSTERING ******************************\n") cat("Ranks with the highest entropy for each cluster:\n") - for (i in 1:object@K[index]) - { + for (i in 1:object@K[index]) { # classe=object[object@K[index]]@entropy[object[object@K[index]]@entropy[,2]==i,] - classe = which(object[object@K[index]]@entropy[, 2] == i) - if (length(classe) != 0) - { - classe = classe[order(object[object@K[index]]@entropy[classe, 1], decreasing = TRUE)][1:min(5, length(classe))] + classe <- which(object[object@K[index]]@entropy[, 2] == i) + if (length(classe) != 0) { + classe <- classe[order(object[object@K[index]]@entropy[classe, 1], decreasing = TRUE)][ + seq_len(min(5, length(classe)))] # if(object@algorithm=="SEM") print(cbind(object@data[classe, ], object[object@K[index]]@entropy[classe, ])) # else @@ -299,17 +277,15 @@ setMethod( # print(cbind(object@data[best5[,1],-ncol(object@data)],best5[,2:3])) # } } - } # rm(best5) cat("Ranks with the highest probability for each cluster:\n") - for (i in 1:object@K[index]) - { + for (i in 1:object@K[index]) { # classe=object[object@K[index]]@probability[object[object@K[index]]@probability[,2]==i,] - classe = which(object[object@K[index]]@probability[, 2] == i) - if (length(classe) != 0) - { - classe = classe[order(object[object@K[index]]@probability[classe, 1], decreasing = TRUE)][1:min(5, length(classe))] + classe <- which(object[object@K[index]]@probability[, 2] == i) + if (length(classe) != 0) { + classe <- classe[order(object[object@K[index]]@probability[classe, 1], decreasing = TRUE)][ + seq_len(min(5, length(classe)))] # if(object@algorithm=="SEM") print(cbind(object@data[classe, ], object[object@K[index]]@probability[classe, ])) # else @@ -324,24 +300,22 @@ setMethod( # best5=classe[order(classe[,2],decreasing=TRUE),][1:min(5,nrow(classe)),] # print(cbind(object@data[best5[,1],-ncol(object@data)],best5[,2:3])) # } - } } rm(classe) # rm(best5) - if (object[object@K[index]]@partial) - { + if (object[object@K[index]]@partial) { cat("\n************************ PARTIAL RANK ****************************\n") - if (min(50, nrow(object[object@K[index]]@partialRank)) == 50) + if (min(50, nrow(object[object@K[index]]@partialRank)) == 50) { cat("\nOnly the first 50 are printed, total length:", 
nrow(object[object@K[index]]@partialRank), "\n") - print(object[object@K[index]]@partialRank[1:min(50, nrow(object[object@K[index]]@partialRank)), ]) + } + print(object[object@K[index]]@partialRank[seq_len(min(50, nrow(object[object@K[index]]@partialRank))), ]) } cat("\n******************************************************************\n") - } - else + } else { cat("No convergence. Please retry\n") - + } } ) @@ -368,13 +342,15 @@ setMethod( cat("\npi:\n") print(object@pi) cat("\npartition:\n") - print(object@partition[1:min(50, length(object@partition))]) - if (min(50, length(object@partition)) == 50) + print(object@partition[seq_len(min(50, length(object@partition)))]) + if (min(50, length(object@partition)) == 50) { cat("\nOnly the first 50 are printed, total length:", length(object@partition)) + } cat("\ntik:\n") - print(object@tik[1:min(50, nrow(object@tik)), ]) - if (min(50, nrow(object@tik)) == 50) + print(object@tik[seq_len(min(50, nrow(object@tik))), ]) + if (min(50, nrow(object@tik)) == 50) { cat("\nOnly the first 50 rows are printed, total rows:", nrow(object@tik)) + } } ) @@ -386,20 +362,16 @@ setMethod( f = "show", signature = "Rankclust", definition = function(object) { - if(object@convergence) - { - for (i in object@K) - { + if (object@convergence) { + for (i in object@K) { cat("\n******************************************************************\n") cat("Number of clusters:", i) cat("\n******************************************************************\n") show(object@results[[which(object@K == i)]]) cat("\n******************************************************************\n") - } - }else{ + } else { cat("Algorithm did not converge.\n") } - } ) diff --git a/R/test.R b/R/test.R index 8cf1697..1da56a0 100644 --- a/R/test.R +++ b/R/test.R @@ -1,55 +1,58 @@ #' @title Kullback-Leibler divergence #' -#' @description This function computes the Kullback-Leibler divergence between two mixtures of multidimensional ISR distributions. -#' +#' @description It computes the Kullback-Leibler divergence between two mixtures of multidimensional ISR distributions. +#' #' @param proportion1,proportion2 vectors (which sums to 1) containing the K mixture proportions. -#' @param pi1,pi2 matrices of size K*p, where K is the number of clusters and p the number of dimension, containing the probabilities of a good comparaison of the model (dispersion parameters). -#' @param mu1,mu2 matrices of size K*sum(m), containing the modal ranks. Each row contains the modal rank for a cluster. In the case of multivariate ranks, the reference rank for each dimension are set successively on the same row. +#' @param pi1,pi2 matrices of size K*p, where K is the number of clusters and p the number of dimension, +#' containing the probabilities of a good comparison of the model (dispersion parameters). +#' @param mu1,mu2 matrices of size K*sum(m), containing the modal ranks. Each row contains the modal rank for a cluster. +#' In the case of multivariate ranks, the reference rank for each dimension are set successively on the same row. #' @param m a vector containing the size of ranks for each dimension. -#' +#' #' @return the Kullback-Leibler divergence. 
-#' +#' #' @references #' http://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence -#' +#' #' @examples #' proportion1 <- c(0.4, 0.6) #' pi1 <- matrix(c(0.8, 0.75), nrow = 2) #' mu1 <- matrix(c(1, 2, 3, 4, 4, 2, 1, 3), nrow = 2, byrow = TRUE) -#' +#' #' proportion2 <- c(0.43, 0.57) #' pi2 <- matrix(c(0.82, 0.7), nrow = 2) #' mu2 <- matrix(c(1, 2, 3, 4, 4, 2, 1, 3), nrow = 2, byrow = TRUE) -#' +#' #' dK <- kullback(proportion1, pi1, mu1, proportion2, pi2, mu2, 4) -#' +#' #' @author Quentin Grimonprez -#' +#' #' @export -kullback <- function(proportion1, pi1, mu1, proportion2, pi2, mu2, m) -{ +kullback <- function(proportion1, pi1, mu1, proportion2, pi2, mu2, m) { checkKullback(proportion1, pi1, mu1, proportion2, pi2, mu2, m) - a = t(pi1) - b = t(pi2) - dKL = .Call("kullback", m, mu1, mu2, a, b, proportion1, proportion2, PACKAGE = "Rankcluster") + a <- t(pi1) + b <- t(pi2) + dKL <- .Call("kullback", m, mu1, mu2, a, b, proportion1, proportion2, PACKAGE = "Rankcluster") return(dKL) } -checkKullback <- function(proportion1, pi1, mu1, proportion2, pi2, mu2, m) -{ - if (missing(mu1)) +checkKullback <- function(proportion1, pi1, mu1, proportion2, pi2, mu2, m) { + if (missing(mu1)) { stop("mu1 is missing") - if (missing(mu2)) + } + if (missing(mu2)) { stop("mu2 is missing") - + } + checkProportion(proportion1, paramName = "proportion1", eps = 1e-10) checkProportion(proportion2, paramName = "proportion2", eps = 1e-10) - if (length(proportion1) != length(proportion2)) + if (length(proportion1) != length(proportion2)) { stop("proportion1 and proportion2 must have the same length.") - + } + # m checkM(m) checkM2(m, pi1, mu1, piName = "pi1", muName = "mu1") @@ -58,105 +61,125 @@ checkKullback <- function(proportion1, pi1, mu1, proportion2, pi2, mu2, m) # pi checkPi(pi1, paramName = "pi1") checkPi(pi2, paramName = "pi2") - if (length(pi1) != length(pi2)) + if (length(pi1) != length(pi2)) { stop("pi1 and pi2 must have the same size.") - if ((nrow(pi1) != length(proportion1)) || (nrow(pi1) != nrow(mu1))) + } + if ((nrow(pi1) != length(proportion1)) || (nrow(pi1) != nrow(mu1))) { stop("The number of rows of pi1 doesn't match with the others parameters.") - if ((nrow(pi2) != length(proportion2)) || (nrow(pi2) != nrow(mu2))) + } + if ((nrow(pi2) != length(proportion2)) || (nrow(pi2) != nrow(mu2))) { stop("The number of rows of pi2 doesn't match with the others parameters.") - + } + # mu1 mu2 checkMu(mu1, proportion1, pi1, muName = "mu1", proportionName = "proportion1", piName = "pi1") checkMu(mu2, proportion2, pi2, muName = "mu2", proportionName = "proportion2", piName = "pi2") # check if mu contains ranks - for (i in 1:length(m)) + for (i in seq_along(m)) { - if (sum(apply(mu1[, (1 + cumsum(c(0, m))[i]):(cumsum(c(0, m))[i + 1])], 1, checkRank, m[i])) != nrow(mu1)) + if (sum(apply(mu1[, (1 + cumsum(c(0, m))[i]):(cumsum(c(0, m))[i + 1])], 1, checkRank, m[i])) != nrow(mu1)) { stop("mu1 is not correct") - if (sum(apply(mu2[, (1 + cumsum(c(0, m))[i]):(cumsum(c(0, m))[i + 1])], 1, checkRank, m[i])) != nrow(mu2)) + } + if (sum(apply(mu2[, (1 + cumsum(c(0, m))[i]):(cumsum(c(0, m))[i + 1])], 1, checkRank, m[i])) != nrow(mu2)) { stop("mu2 is not correct") + } } } #' @title Khi2 test #' -#' @description This function computes the p-value of the khi2 adequation test (only for univariate data). -#' +#' @description This function computes the p-value of the khi2 goodness-of-fit test (only for univariate data). +#' #' @param data a matrix in which each row is a rank of size m. 
#' @param proportion a vector (which sums to 1) containing the K mixture proportion. -#' @param pi a vector of size K, where K is the number of clusters, containing the probabilities of a good paired comparison of the model (dispersion parameters). -#' @param mu a matrix of size K*m, where m is the size of a rank, containing the modal rankings of the model (position parameters). -#' @param nBoot number of bootstrap iterations used to estimate the khi2 adequation test p-value. -#' -#' @return a real, the p-value of the khi2 adequation test. -#' +#' @param pi a vector of size K, where K is the number of clusters, containing the probabilities of a good paired comparison +#' of the model (dispersion parameters). +#' @param mu a matrix of size K*m, where m is the size of a rank, containing the modal rankings of the model +#' (position parameters). +#' @param nBoot number of bootstrap iterations used to estimate the p-value. +#' +#' @return the p-value of the test. +#' #' @examples #' proportion <- c(0.4, 0.6) #' pi <- c(0.8, 0.75) #' mu <- matrix(c(1, 2, 3, 4, 4, 2, 1, 3), nrow = 2, byrow = TRUE) #' # simulate a data set with declared parameters. -#' data <- rbind(simulISR(proportion[1] * 100, pi[1], mu[1, ]), -#' simulISR(proportion[2] * 100, pi[2], mu[2, ])) +#' data <- rbind( +#' simulISR(proportion[1] * 100, pi[1], mu[1, ]), +#' simulISR(proportion[2] * 100, pi[2], mu[2, ]) +#' ) #' pval <- khi2(data, proportion, mu, pi) -#' +#' #' @author Quentin Grimonprez -#' +#' #' @export -khi2 <- function(data, proportion, mu, pi, nBoot = 1000) -{ +khi2 <- function(data, proportion, mu, pi, nBoot = 1000) { checkKhi2(data, proportion, mu, pi, nBoot) - - pval = .Call("adkhi2partial", data, pi, proportion, mu, nBoot, PACKAGE = "Rankcluster") + + pval <- .Call("adkhi2partial", data, pi, proportion, mu, nBoot, PACKAGE = "Rankcluster") return(pval) } -checkKhi2 <- function(data, proportion, mu, pi, nBoot = 1000) -{ - if (missing(mu)) +checkKhi2 <- function(data, proportion, mu, pi, nBoot = 1000) { + if (missing(mu)) { stop("mu is missing") - if (missing(pi)) + } + if (missing(pi)) { stop("pi is missing") - + } + # proportion checkProportion(proportion, paramName = "proportion", eps = 1e-10) - + # pi - if (!is.vector(pi, mode = "numeric")) + if (!is.vector(pi, mode = "numeric")) { stop("pi must be a vector of probabilities") - if ((min(pi) < 0) && (max(pi) > 1)) + } + if ((min(pi) < 0) && (max(pi) > 1)) { stop("pi must be a vector of probabilities") - + } + # mu - if (!is.numeric(mu) || !is.matrix(mu)) + if (!is.numeric(mu) || !is.matrix(mu)) { stop("mu must be a matrix of positive integer") - if (min(mu) < 1) + } + if (min(mu) < 1) { stop("mu must be a matrix of positive integer") - if (nrow(mu) != length(proportion)) + } + if (nrow(mu) != length(proportion)) { stop("The number of rows of mu and the length of proportion don't match.") - if (nrow(mu) != length(pi)) + } + if (nrow(mu) != length(pi)) { stop("The number of rows of mu and the length of pi don't match.") - + } + # data checkData(data) - if (ncol(data) != ncol(mu)) + if (ncol(data) != ncol(mu)) { stop("mu and data must have the same number of columns.") - - + } + + # nBoot - if (!is.numeric(nBoot)) + if (!is.numeric(nBoot)) { stop("nBoot must be a positive integer.") - if (length(nBoot) != 1) + } + if (length(nBoot) != 1) { stop("nBoot must be a positive integer.") - if ((nBoot < 0) || (nBoot != round(nBoot))) + } + if ((nBoot < 0) || (nBoot != round(nBoot))) { stop("nBoot must be a positive integer.") - + } + # check if mu and data are rank - if 
(sum(apply(data, 1, checkPartialRank)) != nrow(data)) + if (sum(apply(data, 1, checkPartialRank)) != nrow(data)) { stop("Data are not correct") - if (sum(apply(mu, 1, checkPartialRank)) != nrow(mu)) + } + if (sum(apply(mu, 1, checkPartialRank)) != nrow(mu)) { stop("mu is not correct") - + } } \ No newline at end of file diff --git a/R/zzz.R b/R/zzz.R index a3f0743..b3e18e5 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -1,7 +1,7 @@ #' @export -.onAttach <- function(lib, pkg) -{ - packageStartupMessage("WARNING: Since Rancluster 0.92, the ranks have to be given to the package in the ranking notation (see convertRank function), with the following convention: +.onAttach <- function(lib, pkg) { + packageStartupMessage( + "WARNING: Since Rankcluster 0.92, ranks have to be given in the ranking notation (see convertRank function), with the following convention: - missing positions are replaced by 0 - tied are replaced by the lowest position they share") } diff --git a/Rankcluster.Rproj b/Rankcluster.Rproj index eaa6b81..7801328 100644 --- a/Rankcluster.Rproj +++ b/Rankcluster.Rproj @@ -15,4 +15,5 @@ LaTeX: pdfLaTeX BuildType: Package PackageUseDevtools: Yes PackageInstallArgs: --no-multiarch --with-keep.source +PackageBuildArgs: --compact-vignettes=both PackageRoxygenize: rd,collate,namespace diff --git a/man/APA.Rd b/man/APA.Rd index 5bac0d4..6fffc95 100644 --- a/man/APA.Rd +++ b/man/APA.Rd @@ -7,26 +7,30 @@ \format{ A list containing: \describe{ - \item{data}{ matrix of size 5738x5 containing the 5738 observed full ranks in ranking representation. - The ranking representation r=(r_1,...,r_m) contains the ranks assigned to the objects, and means that the ith object is in r_ith position. + \item{data}{A matrix of size 5738x5 containing the 5738 observed full ranks in ranking representation. + The ranking representation r=(r_1,...,r_m) contains the ranks assigned to the objects, and means that the ith object is +in r_ith position. - For example, if the ranking representation of a rank is (4,3,1,2,5), it means that judge ranks the first object in 4th position, second object in 3rd position, ...} +For example, if the ranking representation of a rank is (4,3,1,2,5), it means that the judge ranks +the first object in 4th position, second object in 3rd position, ...} \item{frequency}{matrix of size 120x6. Each row corresponds to one of the different observed rank. The first fifth columns contains the observed ranks (ordering representation) and the sixth column contains the frequency of observation.} - \item{m}{ vector with the size of the ranks (5 here).} + \item{m}{vector with the size of the ranks (5 here).} } } \source{ "Group representations in probability and statistics", P. Diaconis, 1988. } \description{ -This dataset contains the 5738 full rankings resulting from the American Psychological Association (APA) presidential election of 1980. -For this election, members of APA had to rank five candidates in order of preference. +This dataset contains the 5738 full rankings resulting from the American Psychological Association (APA) +presidential election of 1980. For this election, members of APA had to rank five candidates in order of preference. -For information, a total of 15449 votes have been registred for this election, but only the 5738 full rankings are reported in the APA dataset. Candidates A and C were research psychologists, candidates D and E were clinical psychologists and candidate B was a community psychologist. 
+For information, a total of 15449 votes have been registered for this election, but only the 5738 full rankings are +reported in the APA dataset. Candidates A and C were research psychologists, candidates D and E were clinical +psychologists and candidate B was a community psychologist. } \examples{ data(APA) diff --git a/man/Output-class.Rd b/man/Output-class.Rd index 098856c..3cc96bb 100644 --- a/man/Output-class.Rd +++ b/man/Output-class.Rd @@ -53,13 +53,13 @@ observed ranking. Available only in presence of at least one partial ranking.} value at each iteration of the SEM-Gibbs algorithm (except the burn-in phase) for proportions. A list of Qsem-Bsem elements, each element being a K*p-matrix.} -\item{\code{distancePi}}{Distances (MSE) between the final estimation and the current -value at each iteration of the SEM-Gibbs algorithm (except the burn-in phase) for scale parameters. A list of Qsem-Bsem elements, +\item{\code{distancePi}}{Distances (MSE) between the final estimation and the current value at each iteration of the +SEM-Gibbs algorithm (except the burn-in phase) for scale parameters. A list of Qsem-Bsem elements, each element being a K*p-matrix.} -\item{\code{distanceMu}}{Distances (Kendall distance) between the final estimation and the current -value at each iteration of the SEM-Gibbs algorithm (except the burn-in phase) for proportions. A list of Qsem-Bsem elements, -each element being a K*p-matrix.} +\item{\code{distanceMu}}{Distances (Kendall distance) between the final estimation and the current value at each iteration of the +SEM-Gibbs algorithm (except the burn-in phase) for proportions. A list of Qsem-Bsem elements, + each element being a K*p-matrix.} \item{\code{distanceZ}}{a vector of size Qsem-Bsem containing the rand index between the final estimated partition and the current value at each iteration of the SEM-Gibbs algorithm (except diff --git a/man/Rankclust-class.Rd b/man/Rankclust-class.Rd index 15d7f0e..c7848fc 100644 --- a/man/Rankclust-class.Rd +++ b/man/Rankclust-class.Rd @@ -8,10 +8,11 @@ This class contains results of \link{rankclust} function. } \details{ -If \code{res} is the result of \link{rankclust} function, each slot of results can be reached by \code{res[k]@slotname}, where -\code{k} is the number of clusters and \code{slotname} is the name of the slot we want to reach (see \link{Output-class}). -For the slots, \code{ll}, \code{bic}, \code{icl}, \code{res["slotname"]} returns a vector of size \code{k} containing the values of the -slot for each number of clusters. +If \code{res} is the result of \link{rankclust} function, each slot of results can be reached by \code{res[k]@slotname}, +where \code{k} is the number of clusters and \code{slotname} is the name of the slot we want to reach +(see \link{Output-class}). +For the slots, \code{ll}, \code{bic}, \code{icl}, \code{res["slotname"]} returns a vector of size \code{k} containing +the values of the slot for each number of clusters. } \section{Slots}{ diff --git a/man/Rankcluster-package.Rd b/man/Rankcluster-package.Rd index 90cac8a..04135d6 100644 --- a/man/Rankcluster-package.Rd +++ b/man/Rankcluster-package.Rd @@ -8,12 +8,14 @@ This package proposes a model-based clustering algorithm for ranking data. Multivariate rankings as well as partial rankings are taken into account. 
This algorithm is based on an extension of the Insertion Sorting Rank (ISR) model for ranking data, which is a meaningful -and effective model parametrized by a position parameter (the modal ranking, quoted by mu) and a dispersion parameter (quoted by pi). -The heterogeneity of the rank population is modelled by a mixture of ISR, whereas conditional independence assumption is considered for multivariate rankings. +and effective model parametrized by a position parameter (the modal ranking, quoted by mu) and a dispersion parameter +(quoted by pi). The heterogeneity of the rank population is modeled by a mixture of ISR, whereas conditional independence +assumption is considered for multivariate rankings. } \details{ The main function is \link{rankclust}. -See vignettes for detailled examples: \code{RShowDoc("dataFormat", package = "Rankcluster")} and \code{RShowDoc("Rankcluster", package = "Rankcluster")} +See vignettes for detailed examples: \code{RShowDoc("dataFormat", package = "Rankcluster")} and +\code{RShowDoc("Rankcluster", package = "Rankcluster")} } \examples{ # see vignettes @@ -24,8 +26,7 @@ See vignettes for detailled examples: \code{RShowDoc("dataFormat", package = "Ra data(big4) result <- rankclust(big4$data, K = 2, m = big4$m, Ql = 200, Bl = 100, maxTry = 2) -if(result@convergence) -{ +if(result@convergence) { summary(result) partition <- result[2]@partition @@ -34,11 +35,10 @@ if(result@convergence) } \references{ -[1] C.Biernacki and J.Jacques (2013), A generative model for rank data based on sorting algorithm, Computational Statistics and Data Analysis, 58, 162-176. +[1] C.Biernacki and J.Jacques (2013), A generative model for rank data based on sorting algorithm, +Computational Statistics and Data Analysis, 58, 162-176. -[2] J.Jacques and C.Biernacki (2012), Model-based clustering for multivariate partial ranking data, Inria Research Report n 8113. -} -\author{ -Maintainer: Quentin Grimonprez +[2] J.Jacques and C.Biernacki (2012), Model-based clustering for multivariate partial ranking data, +Inria Research Report n 8113. } \keyword{package} diff --git a/man/big4.Rd b/man/big4.Rd index e139643..e5bd425 100644 --- a/man/big4.Rd +++ b/man/big4.Rd @@ -7,24 +7,32 @@ \format{ A list containing: \describe{ - \item{data}{A matrix of size 21*8 containing the 21 Premier League seasons. Each row corresponding to one ranking (ranking representation). + \item{data}{A matrix of size 21*8 containing the 21 Premier League seasons. Each row corresponds to one ranking +(ranking representation). - The ranking representation r=(r_1,...,r_m) contains the ranks assigned to the objects, and means that the ith object is in r_ith position. + The ranking representation r=(r_1,...,r_m) contains the ranks assigned to the objects, and means that the +ith object is in r_ith position. - For example, if the ranking representation of a rank is (4,3,1,2,5), it means that judge ranks the first object in 4th position, second object in 3rd position, ...} - \item{frequency}{matrix of size 21*9. Each row corresponds to one of the 21 different observed rankings, and the last column contains the observation frequency.} + For example, if the ranking representation of a rank is (4,3,1,2,5), it means that the judge ranks the first +object in 4th position, second object in 3rd position, ...} + \item{frequency}{matrix of size 21*9. Each row corresponds to one of the 21 different observed rankings, and the +last column contains the observation frequency.} \item{m}{the size of the rankings (m=c(4,4) ).} } } \source{ \url{https://en.wikipedia.org/wiki/Premier_League} -\url{https://www.uefa.com/memberassociations/uefarankings/club/} +\url{https://fr.uefa.com/nationalassociations/uefarankings/club/} } \description{ -This dataset is composed of the rankings (in ranking notation) of the "Big Four" English football teams (A: Manchester, B: Liverpool, C: Arsenal, D: Chelsea) to the English Championship (Premier League) and according to the UEFA coefficients (statistics used in Europe for ranking and seeding teams in international competitions), from 1993 to 2013. +This dataset is composed of the rankings (in ranking notation) of the "Big Four" English football teams +(A: Manchester, B: Liverpool, C: Arsenal, D: Chelsea) in the English Championship (Premier League) and according to the UEFA +coefficients (statistics used in Europe for ranking and seeding teams in international competitions), from 1993 to 2013. -In 2000-2001, Arsenal and Chelsea had the same UEFA coefficient and then are tied. UEFA ranking is (1, 4, 2, 2) for 2000-2001, what means that Manchester United is the first, Liverpool is the last, and the two intermediate positions are for Arsenal and Chelsea in an unknown order. +In 2000-2001, Arsenal and Chelsea had the same UEFA coefficient and are therefore tied. The UEFA ranking is (1, 4, 2, 2) for +2000-2001, which means that Manchester United is first, Liverpool is last, and the two intermediate positions are +for Arsenal and Chelsea in an unknown order. In 2009-2010, Liverpool and Arsenal have also the same UEFA coefficient, the ranking is (1, 2, 2, 4). } \examples{ data(big4) } diff --git a/man/convertRank.Rd b/man/convertRank.Rd index 7d98bd1..21a5051 100644 --- a/man/convertRank.Rd +++ b/man/convertRank.Rd @@ -10,11 +10,14 @@ convertRank(x) \item{x}{a rank (vector) datum either in its ranking or ordering representation.} } \value{ -a rank (vector) in its ordering representation if its ranking representation has been given in input of convertRank, and vice-versa. +a rank (vector) in its ordering representation if its ranking representation has been given in input of +convertRank, and vice-versa. } \description{ -convertRank converts a rank from its ranking representation to its ordering representation, and vice-versa. The function does not work with partial ranking. -The transformation to convert a rank from ordering to ranking representation is the same that from ranking to ordering representation, there is no need to precise the representation of rank x. +convertRank converts a rank from its ranking representation to its ordering representation, and vice-versa. +The function does not work with partial ranking. The transformation to convert a rank from ordering to ranking +representation is the same that from ranking to ordering representation, there is no need to precise the representation +of rank x. } \details{ The ranking representation r=(r_1,...,r_m) contains the ranks assigned to the objects, @@ -22,9 +25,9 @@ and means that the ith object is in r_ith position. The ordering representation o=(o_1,...,o_m) means that object o_i is in the ith position. -Let us consider the following example to illustrate both notations: a judge, which has to rank three holidays destinations according to its preferences, -O1 = Countryside, O2 =Mountain and O3 = Sea, ranks first Sea, second Countryside, and last Mountain. 
The ordering result of the judge is o = (3, 1, 2) -whereas the ranking result is r = (2, 3, 1). +Let us consider the following example to illustrate both notations: a judge, which has to rank three holidays destinations +according to its preferences, O1 = Countryside, O2 =Mountain and O3 = Sea, ranks first Sea, second Countryside, +and last Mountain. The ordering result of the judge is o = (3, 1, 2) whereas the ranking result is r = (2, 3, 1). } \examples{ x <- c(2, 3, 1, 4, 5) diff --git a/man/criteria.Rd b/man/criteria.Rd index 45ac397..9749ac6 100644 --- a/man/criteria.Rd +++ b/man/criteria.Rd @@ -7,13 +7,16 @@ criteria(data, proportion, pi, mu, m, Ql = 500, Bl = 100, IC = 1, nb_cpus = 1) } \arguments{ -\item{data}{a matrix in which each row is a rank (partial or not; for partial rank, missing elements of a rank are put to 0 ).} +\item{data}{a matrix in which each row is a rank (partial or not; for partial rank, +missing elements of a rank are put to 0).} \item{proportion}{a vector (which sums to 1) containing the K mixture proportions.} -\item{pi}{a matrix of size K*p, where K is the number of clusters and p the number of dimension, containing the probabilities of a good comparaison of the model (dispersion parameters).} +\item{pi}{a matrix of size K*p, where K is the number of clusters and p the number of dimension, containing the +probabilities of a good comparison of the model (dispersion parameters).} -\item{mu}{a matrix of size K*sum(m), containing the modal ranks. Each row contains the modal rank for a cluster. In the case of multivariate ranks, the reference rank for each dimension are set successively on the same row.} +\item{mu}{a matrix of size K*sum(m), containing the modal ranks. Each row contains the modal rank for a cluster. +In the case of multivariate ranks, the reference rank for each dimension are set successively on the same row.} \item{m}{a vector containing the size of ranks for each dimension.} @@ -32,15 +35,17 @@ a list containing: \item{icl}{the estimated ICL criterion.} } \description{ -This function estimates the loglikelihood of a mixture of multidimensional ISR model, as well as the BIC and ICL model selection criteria. +This function estimates the loglikelihood of a mixture of multidimensional ISR model, as well as the +BIC and ICL model selection criteria. } \examples{ data(big4) -res = rankclust(big4$data, m = big4$m, K = 2, Ql = 100, Bl = 50, maxTry = 2) -if (res@convergence) - crit = criteria(big4$data, res[2]@proportion, res[2]@pi, res[2]@mu, +res <- rankclust(big4$data, m = big4$m, K = 2, Ql = 100, Bl = 50, maxTry = 2) +if (res@convergence) { + crit <- criteria(big4$data, res[2]@proportion, res[2]@pi, res[2]@mu, big4$m, Ql = 200, Bl = 100) - +} + } \author{ Quentin Grimonprez diff --git a/man/distCayley.Rd b/man/distCayley.Rd index bc8fd81..742fe65 100644 --- a/man/distCayley.Rd +++ b/man/distCayley.Rd @@ -13,7 +13,8 @@ distCayley(x, y) the Cayley distance between x and y. } \description{ -The Cayley distance between two ranks x and y is the minimum number of transpositions required to transform the ranking x into y. +The Cayley distance between two ranks x and y is the minimum number of transpositions required to +transform the ranking x into y. } \examples{ x <- 1:5 diff --git a/man/distHamming.Rd b/man/distHamming.Rd index 6443a1a..fc0cd41 100644 --- a/man/distHamming.Rd +++ b/man/distHamming.Rd @@ -14,7 +14,8 @@ an integer, the Hamming distance between x and y. 
} \description{ The Hamming distance between two ranks x and y is the number of difference between the two ranks. -For example, the Hamming's distance between x=(1,4,2,5,3) and y=(1,3,4,5,2) is 3 because, only 1 and 5 have the same place in both ranks. +For example, the Hamming distance between x=(1,4,2,5,3) and y=(1,3,4,5,2) is 3 because only 1 and 5 have the same +place in both ranks. } \examples{ x <- 1:5 diff --git a/man/eurovision.Rd b/man/eurovision.Rd index e7b6c2f..02d61d4 100644 --- a/man/eurovision.Rd +++ b/man/eurovision.Rd @@ -3,20 +3,23 @@ \docType{data} \name{eurovision} \alias{eurovision} -\title{Multidimensionnal partial rank data: eurovision} +\title{Multidimensional partial rank data: eurovision} \format{ A list containing: \describe{ - \item{data}{ A matrix of size 34*48. Each row corresponds to the ranking representation of a multidimensionnal ranking. + \item{data}{ A matrix of size 34*48. Each row corresponds to the ranking representation of a multidimensional ranking. Columns 1 to 8 correspond to the 2007 contest, columns 9 to 18 to the 2008 contest, etc... - The ranking representation r=(r_1,...,r_m) contains the ranks assigned to the objects, and means that the ith object is in r_ith position. + The ranking representation r=(r_1,...,r_m) contains the ranks assigned to the objects, and means that +the ith object is in r_ith position. - For example, if the ranking representation of a rank is (4,3,1,2,5), it means that judge ranks the first object in 4th position, second object in 3rd position, ... + For example, if the ranking representation of a rank is (4,3,1,2,5), it means that the judge ranks the first +object in 4th position, the second object in 3rd position, ... } - \item{frequency}{A matrix of size 34*49 containing the differents multidimensionnal rankings. The 48 first columns are the same as in data, and the last column contains the frequency (1 for all ranks).} + \item{frequency}{A matrix of size 34*49 containing the different multidimensional rankings. The first 48 columns are +the same as in data, and the last column contains the frequency (1 for all ranks).} \item{m}{ a vector with the sizes of ranks for each dimension.} } @@ -29,9 +32,11 @@ This dataset contains the ranking of the 8 common finalists of the Eurovision so A: France, B:Germany, C:Greece, D:Romania, E:Russia, F:Spain, G:Ukraine, H:United Kingdom. -The number of rankings is 33, corresponding to the 33 European countries having participated to this six editions of the contest. +The number of rankings is 33, corresponding to the 33 European countries that participated in those +six editions of the contest. -All the rankings are partial since none country has ranked this 8 countries in its 10 preferences. Missing ranking elements are zeros. +All the rankings are partial since no country has ranked these 8 countries among its 10 preferences. Missing ranking +elements are zeros. } \examples{ data(eurovision) diff --git a/man/frequence.Rd b/man/frequence.Rd index 58dadc1..f98ca73 100644 --- a/man/frequence.Rd +++ b/man/frequence.Rd @@ -15,8 +15,8 @@ frequence(X, m = ncol(X)) A matrix containing each different observed ranks with its observation frequencies in the last column. } \description{ -This function takes in input a matrix containing all the observed ranks (a rank can be repeated) and returns a matrix containing all -the different observed ranks with their observation frequencies (in the last column).
+This function takes in input a matrix containing all the observed ranks (a rank can be repeated) +and returns a matrix containing all the different observed ranks with their observation frequencies (in the last column). } \examples{ X <- matrix(1:4, ncol = 4, nrow = 5, byrow = TRUE) diff --git a/man/khi2.Rd b/man/khi2.Rd index 4789a50..c0a140a 100644 --- a/man/khi2.Rd +++ b/man/khi2.Rd @@ -11,25 +11,29 @@ khi2(data, proportion, mu, pi, nBoot = 1000) \item{proportion}{a vector (which sums to 1) containing the K mixture proportion.} -\item{mu}{a matrix of size K*m, where m is the size of a rank, containing the modal rankings of the model (position parameters).} +\item{mu}{a matrix of size K*m, where m is the size of a rank, containing the modal rankings of the model +(position parameters).} -\item{pi}{a vector of size K, where K is the number of clusters, containing the probabilities of a good paired comparison of the model (dispersion parameters).} +\item{pi}{a vector of size K, where K is the number of clusters, containing the probabilities of a good paired comparison +of the model (dispersion parameters).} -\item{nBoot}{number of bootstrap iterations used to estimate the khi2 adequation test p-value.} +\item{nBoot}{number of bootstrap iterations used to estimate the p-value.} } \value{ -a real, the p-value of the khi2 adequation test. +the p-value of the test. } \description{ -This function computes the p-value of the khi2 adequation test (only for univariate data). +This function computes the p-value of the khi2 goodness-of-fit test (only for univariate data). } \examples{ proportion <- c(0.4, 0.6) pi <- c(0.8, 0.75) mu <- matrix(c(1, 2, 3, 4, 4, 2, 1, 3), nrow = 2, byrow = TRUE) # simulate a data set with declared parameters. -data <- rbind(simulISR(proportion[1] * 100, pi[1], mu[1, ]), - simulISR(proportion[2] * 100, pi[2], mu[2, ])) +data <- rbind( + simulISR(proportion[1] * 100, pi[1], mu[1, ]), + simulISR(proportion[2] * 100, pi[2], mu[2, ]) +) pval <- khi2(data, proportion, mu, pi) } diff --git a/man/kullback.Rd b/man/kullback.Rd index 962c5af..4842797 100644 --- a/man/kullback.Rd +++ b/man/kullback.Rd @@ -9,9 +9,11 @@ kullback(proportion1, pi1, mu1, proportion2, pi2, mu2, m) \arguments{ \item{proportion1, proportion2}{vectors (which sums to 1) containing the K mixture proportions.} -\item{pi1, pi2}{matrices of size K*p, where K is the number of clusters and p the number of dimension, containing the probabilities of a good comparaison of the model (dispersion parameters).} +\item{pi1, pi2}{matrices of size K*p, where K is the number of clusters and p the number of dimension, +containing the probabilities of a good comparison of the model (dispersion parameters).} -\item{mu1, mu2}{matrices of size K*sum(m), containing the modal ranks. Each row contains the modal rank for a cluster. In the case of multivariate ranks, the reference rank for each dimension are set successively on the same row.} +\item{mu1, mu2}{matrices of size K*sum(m), containing the modal ranks. Each row contains the modal rank for a cluster. +In the case of multivariate ranks, the reference rank for each dimension are set successively on the same row.} \item{m}{a vector containing the size of ranks for each dimension.} } @@ -19,7 +21,7 @@ kullback(proportion1, pi1, mu1, proportion2, pi2, mu2, m) the Kullback-Leibler divergence. } \description{ -This function computes the Kullback-Leibler divergence between two mixtures of multidimensional ISR distributions. 
+It computes the Kullback-Leibler divergence between two mixtures of multidimensional ISR distributions. } \examples{ proportion1 <- c(0.4, 0.6) diff --git a/man/probability.Rd b/man/probability.Rd index 8eef392..74e64a6 100644 --- a/man/probability.Rd +++ b/man/probability.Rd @@ -7,13 +7,15 @@ probability(x, mu, pi, m = length(mu)) } \arguments{ -\item{x}{a vector or a matrix containing the rankings in ranking notation (see Details or \link{convertRank} function). +\item{x}{a vector or a matrix containing the rankings in ranking notation (see Details or \link{convertRank} function). The rankings of each dimension are placed end to end. \code{x} must contain only full ranking (no partial or tied).} -\item{mu}{a vector of length \code{sum(m)} containing the modal ranks in ranking notation (see Details or \link{convertRank} function). +\item{mu}{a vector of length \code{sum(m)} containing the modal ranks in ranking notation (see Details or +\link{convertRank} function). The rankings of each dimension are placed end to end. \code{mu} must contain only full ranking (no partial or tied).} -\item{pi}{a vector of size \code{p=length(m)}, where \code{p} is the number of dimension, containing the probabilities of a good comparison of the model (dispersion parameters).} +\item{pi}{a vector of size \code{p=length(m)}, where \code{p} is the number of dimension, containing the probabilities of +a good comparison of the model (dispersion parameters).} \item{m}{a vector containing the size of ranks for each dimension.} } @@ -24,7 +26,8 @@ the probability of \code{x} according to a ISR(mu, pi). It computes the probability of a (multivariate) rank x according to a ISR(mu, pi). } \details{ -The ranks have to be given to the package in the ranking notation (see \link{convertRank} function), with the following convention: +The ranks have to be given to the package in the ranking notation (see \link{convertRank} function), +with the following convention: - missing positions are replaced by 0 @@ -48,11 +51,11 @@ result is r = (2, 3, 1). } \examples{ m <- c(4, 5) -x = mu <- matrix(nrow = 1, ncol = 9) -x[1:4] = c(1, 4, 2, 3) -x[5:9] = c(3, 5, 2, 4, 1) -mu[1:4] = 1:4 -mu[5:9] = c(3, 5, 4, 2, 1) +x <- mu <- matrix(nrow = 1, ncol = 9) +x[1:4] <- c(1, 4, 2, 3) +x[5:9] <- c(3, 5, 2, 4, 1) +mu[1:4] <- 1:4 +mu[5:9] <- c(3, 5, 4, 2, 1) pi <- c(0.75, 0.82) prob <- probability(x, mu, pi, m) diff --git a/man/quiz.Rd b/man/quiz.Rd index f598279..f88a936 100644 --- a/man/quiz.Rd +++ b/man/quiz.Rd @@ -3,17 +3,20 @@ \docType{data} \name{quiz} \alias{quiz} -\title{Multidimensionnal rank data: quiz} +\title{Multidimensional rank data: quiz} \format{ A list containing: \describe{ - \item{data}{a matrix of size 70*16. The student's answers are in row and the 16 columns correspond to the 4 rankings (for the 4 quizzes) of size 4 (ranking representation). + \item{data}{a matrix of size 70*16. The student's answers are in row and the 16 columns correspond to the 4 rankings +(for the 4 quizzes) of size 4 (ranking representation). - The ranking representation r=(r_1,...,r_m) contains the ranks assigned to the objects, and means that the ith object is in r_ith position. + The ranking representation r=(r_1,...,r_m) contains the ranks assigned to the objects, and means that the ith object is +in r_ith position. -For example, if the ranking representation of a rank is (4,3,1,2,5), it means that judge ranks the first object in 4th position, second object in 3rd position, ...} -\item{frequency}{a matrix of size 63*17. 
Each row corresponds to one of the 63 differents observed -rankings (ranking representation). Each row contains 4 ranks of size 4 and a last column for the frequency.} +For example, if the ranking representation of a rank is (4,3,1,2,5), it means that the judge ranks the first object in 4th +position, the second object in 3rd position, ...} +\item{frequency}{a matrix of size 63*17. Each row corresponds to one of the 63 different observed rankings +(ranking representation). Each row contains 4 ranks of size 4 and a last column for the frequency.} \item{m}{a vector with the sizes of the ranks for each dimension.} } @@ -22,23 +25,24 @@ rankings (ranking representation). Each row contains 4 ranks of size 4 and a las Julien Jacques } \description{ -This dataset contains the answers of 70 students (40 of third year and 30 of fourth year) from Polytech'Lille (statistics engineering school, France) to the four following quizzes: - -\describe{ +This dataset contains the answers of 70 students (40 of third year and 30 of fourth year) +from Polytech'Lille (statistics engineering school, France) to the four following quizzes: +\describe{ \item{Literature Quiz}{ - This quiz consists of ranking four french writers according to chronological order: - A=Victor Hugo, B=Moliere, C=Albert Camus, D=Jean-Jacques Rousseau.} + This quiz consists of ranking four French writers according to chronological order: + A=Victor Hugo, B=Molière, C=Albert Camus, D=Jean-Jacques Rousseau.} \item{Football Quiz}{ - This quiz consists of ranking four national football teams according to increasing number of wins in the football World Cup: A=France, B=Germany, C=Brazil, D=Italy.} + This quiz consists of ranking four national football teams according to increasing number of wins in the football +World Cup: A=France, B=Germany, C=Brazil, D=Italy.} \item{Mathematics Quiz}{ This quiz consists of ranking four numbers according to increasing order: A=pi/3, B=log(1), C=exp(2), D=(1+sqrt(5))/2.} \item{Cinema Quiz}{ - This quiz consists of ranking four Tarentino's movies according to chronological order: + This quiz consists of ranking four Tarantino movies according to chronological order: A=Inglourious Basterds, B=Pulp Fiction, C=Reservoir Dogs, D=Jackie Brown.} } diff --git a/man/rankclust.Rd b/man/rankclust.Rd index 2ab0722..ad5bf0a 100644 --- a/man/rankclust.Rd +++ b/man/rankclust.Rd @@ -22,11 +22,12 @@ rankclust( } \arguments{ \item{data}{a matrix in which each row is a ranking (partial or not; for partial ranking, -missing elements must be 0 or NA. Tied are replaced by the lowest position they share). For multivariate rankings, the rankings of each dimension are -placed end to end in each row. The data must be in ranking notation (see Details or +missing elements must be 0 or NA. Ties are replaced by the lowest position they share). For multivariate rankings, +the rankings of each dimension are placed end to end in each row. The data must be in ranking notation (see Details or +\link{convertRank} functions).} -\item{m}{a vector composed of the sizes of the rankings of each dimension (default value is the number of column of the matrix data).} +\item{m}{a vector composed of the sizes of the rankings of each dimension (default value is the number of columns of the +matrix data).} \item{K}{an integer or a vector of integer with the number of clusters.} @@ -39,7 +40,8 @@ placed end to end in each row.
The data must be in ranking notation (see Details \item{RjSE}{a vector containing, for each dimension, the number of iterations of the Gibbs sampler used both in the SE step for partial rankings and for the presentation orders generation (default value=mj(mj-1)/2).} -\item{RjM}{a vector containing, for each dimension, the number of iterations of the Gibbs sampler used in the M step (default value=mj(mj-1)/2)} +\item{RjM}{a vector containing, for each dimension, the number of iterations of the Gibbs sampler used in the M step +(default value=mj(mj-1)/2)} \item{Ql}{number of iterations of the Gibbs sampler for estimation of log-likelihood (default value=100).} @@ -50,19 +52,22 @@ for estimation of log-likelihood (default value=100).} \item{run}{number of runs of the algorithm for each value of K.} -\item{detail}{boolean, if TRUE, time and others informations will be print during the process (default value FALSE).} +\item{detail}{boolean, if TRUE, time and other information will be printed during the process (default value FALSE).} \value{ An object of class Rankclust (See \code{\link{Output-class}} and \code{\link{Rankclust-class}}). -If the output object is named \code{res}. You can access the result by res[number of groups]@slotName where \code{slotName} is an element of the class Output. +If the output object is named \code{res}, you can access the result with res[number of groups]@slotName, where +\code{slotName} is an element of the class Output. } \description{ -This functions estimates a clustering of ranking data, potentially multivariate, partial and containing tied, based on a mixture of multivariate ISR model [2]. +This function estimates a clustering of ranking data, potentially multivariate, partial and containing ties, +based on a mixture of multivariate ISR models [2]. By specifying only one cluster, the function performs a modelling of the ranking data using the multivariate ISR model. The estimation is performed thanks to a SEM-Gibbs algorithm. } \details{ -The ranks have to be given to the package in the ranking notation (see \link{convertRank} function), with the following convention: +The ranks have to be given to the package in the ranking notation (see \link{convertRank} function), with the following +convention: - missing positions are replaced by 0 @@ -89,8 +94,7 @@ result is r = (2, 3, 1). data(big4) result <- rankclust(big4$data, K = 2, m = big4$m, Ql = 200, Bl = 100, maxTry = 2) -if(result@convergence) -{ +if(result@convergence) { summary(result) partition <- result[2]@partition @@ -99,9 +103,11 @@ if(result@convergence) } \references{ -[1] C.Biernacki and J.Jacques (2013), A generative model for rank data based on sorting algorithm, Computational Statistics and Data Analysis, 58, 162-176. +[1] C.Biernacki and J.Jacques (2013), A generative model for rank data based on sorting algorithm, +Computational Statistics and Data Analysis, 58, 162-176. -[2] J.Jacques and C.Biernacki (2012), Model-based clustering for multivariate partial ranking data, Inria Research Report n 8113. +[2] J.Jacques and C.Biernacki (2012), Model-based clustering for multivariate partial ranking data, +Inria Research Report no. 8113. } \seealso{ See \code{\link{Output-class}} and \code{\link{Rankclust-class}} for available output.
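As a minimal sketch of the res[number of groups]@slotName accessor described in the rankclust value section above (only slots already used in the examples of this patch are shown; the meaning of each slot is inferred from the argument descriptions, and other slots of the Output class may exist):

data(big4)
res <- rankclust(big4$data, m = big4$m, K = 2, Ql = 100, Bl = 50, maxTry = 2)
if (res@convergence) {
  res[2]@proportion  # mixture proportions of the K = 2 solution
  res[2]@pi          # dispersion parameters per cluster and dimension
  res[2]@mu          # modal rankings per cluster and dimension
  res[2]@partition   # estimated cluster membership of each observation
}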
diff --git a/man/simulISR.Rd b/man/simulISR.Rd index 1f1dc5b..e45cb86 100644 --- a/man/simulISR.Rd +++ b/man/simulISR.Rd @@ -9,7 +9,7 @@ simulISR(n, pi, mu) \arguments{ \item{n}{size of the sample.} -\item{pi}{dispersion parameter: probability of correct paired comparaison according to mu.} +\item{pi}{dispersion parameter: probability of correct paired comparison according to mu.} \item{mu}{position parameter: modal ranking in ordering representation.} } @@ -35,14 +35,15 @@ second Countryside, and last Mountain. The ordering result of the judge is o = (3, 1, 2) whereas the ranking result is r = (2, 3, 1). -You can see the \link{convertRank} function to convert the simualted ranking drom ordering to ranking representation. +You can see the \link{convertRank} function to convert the simulated ranking from ordering to ranking representation. } \examples{ x <- simulISR(30, 0.8, 1:4) } \references{ -[1] C.Biernacki and J.Jacques (2013), A generative model for rank data based on sorting algorithm, Computational Statistics and Data Analysis, 58, 162-176. +[1] C.Biernacki and J.Jacques (2013), A generative model for rank data based on sorting algorithm, +Computational Statistics and Data Analysis, 58, 162-176. } \author{ Julien Jacques diff --git a/man/sports.Rd b/man/sports.Rd index 31bbd4e..4d255b5 100644 --- a/man/sports.Rd +++ b/man/sports.Rd @@ -9,11 +9,14 @@ A list containing: \describe{ \item{data}{a matrix containing 130 ranks of size 7 in ranking representation. - The ranking representation r=(r_1,...,r_m) contains the ranks assigned to the objects, and means that the ith object is in r_ith position. + The ranking representation r=(r_1,...,r_m) contains the ranks assigned to the objects, and means that +the ith object is in r_ith position. - For example, if the ranking representation of a rank is (4,3,1,2,5), it means that judge ranks the first object in 4th position, second object in 3rd position, ...} + For example, if the ranking representation of a rank is (4,3,1,2,5), it means that judge ranks the first +object in 4th position, second object in 3rd position, ...} - \item{frequency}{a matrix with 123 differents ranks of size 7. In each row the first 7 columns correspond to one observed ranking and the last column contains the observation frequency.} + \item{frequency}{a matrix with 123 different ranks of size 7. In each row the first 7 columns correspond to one +observed ranking and the last column contains the observation frequency.} \item{m}{ the size of the rankings (m=7).} } } diff --git a/man/unfrequence.Rd b/man/unfrequence.Rd index aa627e3..7637fe9 100644 --- a/man/unfrequence.Rd +++ b/man/unfrequence.Rd @@ -13,8 +13,9 @@ unfrequence(data) a matrix containing all the rankings. } \description{ -This function takes in input a matrix in which the m first columns are the different observed ranks and the last column contains the observation frequency, -and returns a matrix containing all the ranks (ranks with frequency>1 are repeated). +This function takes in input a matrix in which the m first columns are the different observed ranks and +the last column contains the observation frequency, and returns a matrix containing all the ranks (ranks with frequency>1 +are repeated). } \examples{ data(quiz) diff --git a/man/words.Rd b/man/words.Rd index 77a2821..43149be 100644 --- a/man/words.Rd +++ b/man/words.Rd @@ -7,12 +7,16 @@ \format{ A list containing: \describe{ - \item{data}{A matrix of size 98*5 containing the 98 answers. 
Each row corresponding to one ranking (ranking representation). + \item{data}{A matrix of size 98*5 containing the 98 answers. Each row corresponding to one ranking +(ranking representation). - The ranking representation r=(r_1,...,r_m) contains the ranks assigned to the objects, and means that the ith object is in r_ith position. + The ranking representation r=(r_1,...,r_m) contains the ranks assigned to the objects, and means that +the ith object is in r_ith position. - For example, if the ranking representation of a rank is (4,3,1,2,5), it means that judge ranks the first object in 4th position, second object in 3rd position, ...} - \item{frequency}{matrix of size 15*6. Each row corresponds to one of the 15 different observed rankings, and the last column contains the observation frequency.} + For example, if the ranking representation of a rank is (4,3,1,2,5), it means that judge ranks the +first object in 4th position, second object in 3rd position, ...} + \item{frequency}{matrix of size 15*6. Each row corresponds to one of the 15 different observed rankings, and the +last column contains the observation frequency.} \item{m}{the size of the rankings (m=5).} } } @@ -21,7 +25,8 @@ M.A. Fligner and J.S. Verducci. "Distance based ranking models". J. Roy. Statist } \description{ The data was collected under the auspices of the Graduate Record -Examination Board. A sample of 98 college students were asked to rank five words according to strength of association (least to most associated) with the target word "Idea": +Examination Board. A sample of 98 college students were asked to rank five words according to strength of association +(least to most associated) with the target word "Idea": A = Thought, B = Play, C = Theory, D = Dream and E = Attention. } \examples{ diff --git a/src/ISRfunctions.cpp b/src/ISRfunctions.cpp new file mode 100644 index 0000000..2ae1a84 --- /dev/null +++ b/src/ISRfunctions.cpp @@ -0,0 +1,219 @@ +/* + * File containing functions associated with the ISR distribution + */ +#include + +#include "ISRfunctions.h" +#include "functions.h" + +using namespace std; + +/* + * Computation of A(x,y) (number of comparisons) and G(x,y,mu) (number of good comparisons) + * + * These values are useful to compute the probability according to an ISR. 
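+ * Given a presentation order y, the conditional probability is + * p(x | y; mu, pi) = pi^G(x,y,mu) * (1 - pi)^(A(x,y) - G(x,y,mu)), + * which lnProbaCond below evaluates on the log scale.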
+ * See appendix A and B: https://hal.archives-ouvertes.fr/hal-00441209v3/document + */ +vector<int> comparaison(Rank const &x, Rank const &y, Rank const &mu) +{ + int const m(mu.size()); + int gplus(0), gmoins(0), gjmoinsb(0), gjplusb(0), index(0); + vector<int> ajmoins, ajplus, ajplusb, ajmoinsb, ajplusbIndex; + ajplusb.reserve(m); // the current Aj+ + ajmoinsb.reserve(m); // the current Aj- + ajplusbIndex.reserve(m); // the indices of the current Aj+ + ajplus.reserve(m * (m - 1)); // the union of all the Aj+ + ajmoins.reserve(m * (m - 1)); // the union of all the Aj- + + for (int j(1); j < m; j++) + { + gjmoinsb = 0; + gjplusb = 0; + for (int i(0); i < j; i++) + { + // compute Aj- + if (positionRank(x, y[i]) < positionRank(x, y[j])) + { + ajmoins.push_back(i); + ajmoinsb.push_back(i); + } + else // compute Aj+ // if (positionRank(x,y[i])>positionRank(x,y[j])) + { + ajplusb.push_back(positionRank(x, y[i])); + ajplusbIndex.push_back(i); + } + } + + if (ajplusb.size() > 0) // if the current Aj+ is not empty, add the index of its minimum to Aj+ + { + index = min_element(ajplusb.begin(), ajplusb.end()) - ajplusb.begin(); + ajplus.push_back(ajplusbIndex[index]); + + // compute G+ + if (positionRank(mu, y[j]) < positionRank(mu, y[ajplus[ajplus.size() - 1]])) + { + gplus++; + gjplusb++; + } + ajplusb.erase(ajplusb.begin(), ajplusb.end()); + ajplusbIndex.erase(ajplusbIndex.begin(), ajplusbIndex.end()); + } + if (ajmoinsb.size() > 0) // if the current Aj- is not empty, compute G- + { + // compute G- + for (unsigned int i(0); i < ajmoinsb.size(); i++) + { + if (positionRank(mu, y[ajmoinsb[i]]) < positionRank(mu, y[j])) + { + gmoins++; + gjmoinsb++; + } + } + ajmoinsb.erase(ajmoinsb.begin(), ajmoinsb.end()); + } + } + + vector<int> comparaison(2, 0); + comparaison[0] = ajmoins.size() + ajplus.size(); + comparaison[1] = gmoins + gplus; + + return comparaison; +} + +/** Compute log p(x|y;mu,pi) according to an ISR model */ +double lnProbaCond(Rank const &x, Rank const &y, Rank const &mu, double const &p) +{ + vector<int> comp(2); + comp = comparaison(x, y, mu); + + // manage special case, otherwise it returns NaN + if (p == 1) + { + if (comp[0] == comp[1]) + return 0.; + } + + if (p == 0) + { + if (comp[1] == 0) + return 0.; + } + + return comp[1] * log(p) + (comp[0] - comp[1]) * log(1.
- p); +} + +double probaCond(Rank const &x, Rank const &y, Rank const &mu, double const &p) +{ + return exp(lnProbaCond(x, y, mu, p)); +} + + +// Simulate n samples from an ISR(mu, p) +vector simulISR(int const &n, int const &m, Rank const &mu, double const &p) +{ + vector simul(n, Rank(m, 0)); + Rank s(m), rgTemp(m); + int l; + double correct; + bool compar, avance; + + initializeRank(rgTemp); + + for (int i(0); i < n; i++) + { + //simulation d'un rang aléatoire: permutation du vecteur 1 2..m + s = rgTemp; + Rshuffle(s.begin(), s.end()); + simul[i][0] = s[0]; + for (int j(1); j < m; j++) + { + l = 0; + avance = true; + while (avance && l < j) + { + correct = (double)runif(0., 1.); + compar = (positionRank(mu, s[j]) < positionRank(mu, simul[i][l])); + if ((compar && correct < p) || (!compar && correct > p)) + { + for (int k(j - 1); k >= l; k--) + simul[i][k + 1] = simul[i][k]; + + simul[i][l] = s[j]; + avance = false; + } + else + l++; + } + if (l == j) + simul[i][l] = s[j]; + } + } + return simul; +} + + +// Simulate n samples from a mixture of ISR(mu, p) +void simulMixtureISR(vector &simul, vector const &mu, + vector const &p, vector const &prop) +{ + int classe = 0; + int n = simul.size(); + int m = mu[0].size(); + + Eigen::ArrayXd probaEig(prop.size()); + for (size_t k = 0; k < prop.size(); k++) + probaEig(k) = prop[k]; + + for (int i(0); i < n; i++) + { + // class number sample according to proportion + classe = sampleMultinomial(probaEig); + + // sample a rank according to the parameters of the sampled class + simul[i] = simulISR(1, m, mu[classe], p[classe])[0]; + } +} + +double proba(vector const &x, vector const &mu, vector const &pi) +{ + int d = pi.size(); + vector probaDim(d, 0); + double finalProba; + vector tabFact, listeY; + Rank y; + vector m(mu.size()); + map> diffDim; + + for (int dim = 0; dim < d; dim++) + { + m[dim] = mu[dim].size(); + diffDim[m[dim]].push_back(dim); + } + + // we iterate over the different values of m (size of ranks) to limit the computation of y + for (map>::iterator it = diffDim.begin(); it != diffDim.end(); it++) + { + int m = it->first; + tabFact = tab_factorial(m); + listeY = listIndexOrderOfPresentation(m, tabFact); + double mult = 2. 
/ (double)tabFact[m - 1]; + + for (int i = 0; i < tabFact[m - 1] / 2; i++) + { + y = index2rankNoCheck(listeY[i], m, tabFact); + + // for each dimension that have a rank of size m + for (int j = 0; j < (int)(it->second).size(); j++) + probaDim[(it->second)[j]] += probaCond(x[(it->second)[j]], y, mu[(it->second)[j]], pi[(it->second)[j]]); + } + + for (int j = 0; j < (int)(it->second).size(); j++) + probaDim[(it->second)[j]] *= mult; + } + + finalProba = probaDim[0]; + for (int dim = 1; dim < d; dim++) + finalProba *= probaDim[dim]; + + return finalProba; +} diff --git a/src/ISRfunctions.h b/src/ISRfunctions.h new file mode 100644 index 0000000..a989422 --- /dev/null +++ b/src/ISRfunctions.h @@ -0,0 +1,68 @@ +/* + * File containing functions associated with the ISR distribution + */ + +#ifndef ISR_FUNCTIONS_H_ +#define ISR_FUNCTIONS_H_ + +#include +#include "Typedef.h" + + +/** + * compute A(x,y) and G(x,y,mu) + * @param x rank + * @param y order of presentation of x + * @param mu order of reference + * @return a vector of 2 elements (A(x,y),G(x,y,mu)) + * + * A(x,y)=total number of comparison in the insertion sorting algorithm + * G(x,y,mu)= total number of good comparison according to mu in the insertion sorting algorithm + * + */ +std::vector comparaison(Rank const &x, Rank const &y, Rank const &mu); + +/** + * compute the conditional probability p(x|y;mu,p) + * @param x rank + * @param y order of presentation of x + * @param mu order of reference + * @param p probability of making a good comparison + * @return p(x|y;mu,p) + */ +double probaCond(Rank const &x, Rank const &y, Rank const &mu, double const &p); +double lnProbaCond(Rank const &x, Rank const &y, Rank const &mu, double const &p); + + +/** + * compute probability of x according to multivariate ISR + * @param x rank for each dimension for compute probability + * @param mu reference rank for each dimension + * @param pi dispersion parameter for each dimension + * @return p(x;mu,pi) + */ +double proba(std::vector const &x, std::vector const &mu, std::vector const &pi); + + +/** + * simulation of a n-sample of ISR(mu,p) + * @param n size of the sample + * @param m size of the rank + * @param mu rank + * @param p probability of a good comparison + * @return a n-sample of ISR(mu,p) + */ +std::vector simulISR(int const &n, int const &m, Rank const &mu, double const &p); + +/** + * Simulate a sample of mixture of ISR + * @param simul sample will be modified + * @param mu reference rank + * @param p dispersion parameter: probability of a good comparison + * @param prop proportion of the mixture + */ +void simulMixtureISR(std::vector &simul, std::vector const &mu, std::vector const &p, + std::vector const &prop); + + +#endif /* ISR_FUNCTIONS_H_ */ \ No newline at end of file diff --git a/src/RankCluster.cpp b/src/RankCluster.cpp index d8165cd..527017d 100644 --- a/src/RankCluster.cpp +++ b/src/RankCluster.cpp @@ -1,9 +1,9 @@ /* @file RankCluster.cpp * @brief Implementation of methods of the class @c RankCluster + * See https://hal.archives-ouvertes.fr/hal-00441209v3/document and + * https://hal.inria.fr/hal-00743384/document for mathematical background */ -#include "RankCluster.h" - #include #include #include @@ -14,852 +14,875 @@ #include #include +#include "RankCluster.h" + using namespace std; using namespace Eigen; //using namespace Rcpp; -//constructor + +/********************************************************************************************** + * + * CONSTRUCTORS, DESTRUCTOR + * + 
**********************************************************************************************/ + RankCluster::RankCluster() { } -//constructor -RankCluster::RankCluster(std::vector> const &X, int g, vector const &m, SEMparameters const ¶m) - : m_(m), n_(X.size()), d_(m.size()), g_(g), - data_(d_, vector(n_)), - z_(n_), - mu_(d_, vector>(g_)), - p_(d_, vector(g_)), - proportion_(g), - parameter_(param), - partial_(false), - dataOk_(true), - indexPb_(m.size()) +RankCluster::RankCluster(std::vector> const &X, int g, + vector const &m, SEMparameters const ¶m) + : m_(m), n_(X.size()), d_(m.size()), g_(g), + data_(d_, vector(n_)), + z_(n_), + mu_(d_, vector(g_)), + p_(d_, vector(g_)), + proportion_(g), + parameter_(param), + partial_(false), + dataOk_(true), + indexPb_(m.size()) { - // try - // { - // convert data in the good notation and create information about missing and partial - conversion2data(X); - // } - // catch(string const& chaine) - // {dataOk_=false;} + // try + // { + // convert data in the good notation and create information about missing and partial + conversion2data(X); + // } + // catch(string const& errorString) + // {dataOk_=false;} } -//constructor RankCluster::RankCluster(vector> const &X, vector const &m, SEMparameters const ¶m, - vector const &proportion, vector> const &p, vector>> const &mu) - : m_(m), n_(X.size()), d_(m.size()), g_(proportion.size()), - data_(d_, vector(n_)), - z_(n_), mu_(mu), p_(p), - proportion_(proportion), - parameter_(param), - partial_(false), - dataOk_(true), - indexPb_(m.size()) + vector const &proportion, vector> const &p, + vector> const &mu) + : m_(m), n_(X.size()), d_(m.size()), g_(proportion.size()), + data_(d_, vector(n_)), + z_(n_), mu_(mu), p_(p), + proportion_(proportion), + parameter_(param), + partial_(false), + dataOk_(true), + indexPb_(m.size()) { - // try - // { - // convert data in the good notation and create information about missing and partial - conversion2data(X); - // } - // catch(string const& chaine) - // {dataOk_=false;} + // try + // { + // convert data in the good notation and create information about missing and partial + conversion2data(X); + // } + // catch(string const& errorString) + // {dataOk_=false;} } -//copy constructor +// copy constructor RankCluster::RankCluster(RankCluster &rankClusterObject) - : m_(rankClusterObject.m()), - n_(rankClusterObject.n()), - d_(rankClusterObject.d()), - g_(rankClusterObject.g()), - data_(rankClusterObject.data()), - mu_(rankClusterObject.mu()), - p_(rankClusterObject.p()), - proportion_(rankClusterObject.proportion()), - parameter_(rankClusterObject.parameter()), - partial_(rankClusterObject.partial()), - dataOk_(rankClusterObject.dataOk()) + : m_(rankClusterObject.m()), + n_(rankClusterObject.n()), + d_(rankClusterObject.d()), + g_(rankClusterObject.g()), + data_(rankClusterObject.data()), + mu_(rankClusterObject.mu()), + p_(rankClusterObject.p()), + proportion_(rankClusterObject.proportion()), + parameter_(rankClusterObject.parameter()), + partial_(rankClusterObject.partial()), + dataOk_(rankClusterObject.dataOk()) { - //nothing to do + // nothing to do } -//destructor +// destructor RankCluster::~RankCluster() { - // nothing + // nothing } -void RankCluster::readRankingRank(vector> const &X, int const &dim, int const &j, vector const &indM) -{ - //initialization - int indiceElement = 0; - data_[dim][j].isNotFull = false; - - //multi dim rank temporary - vector> temp(m_[dim] + 1); - - for (int i = indM[dim]; i < indM[dim + 1]; i++) - { - 
temp[X[j][i]].push_back(indiceElement + 1); - indiceElement++; - } - - //vector containing index of partial element - vector partialIndex; - - int skip = 0; - //index 0 is for missing, we don't manage in this loop - for (int i = 1; i < (int)temp.size(); i++) - { - if (skip) - { - if (temp[i].size() != 0) - { - dataOk_ = false; - indexPb_[dim].push_back(j + 1); - //throw string("Problem with data."); - } - - skip--; - } - else - { - //tied case - if (temp[i].size() > 1) - { - data_[dim][j].isNotFull = true; - partial_ = true; - skip = temp[i].size() - 1; - data_[dim][j].missingData.push_back(temp[i]); - vector missingIndex(temp[i].size()); +/********************************************************************************************** + * + * READ DATA + * + **********************************************************************************************/ - for (int ii = 0; ii < (int)temp[i].size(); ii++) - missingIndex[ii] = i + ii - 1; - - data_[dim][j].missingIndex.push_back(missingIndex); - } - else - { - //normal case - if (temp[i].size() == 1) - data_[dim][j].rank[i - 1] = temp[i][0]; - else //temp[i].size=0//partial case - partialIndex.push_back(i - 1); - } - } - } - - //problem with the data : index of 0 et element at missing position don't match - if (temp[0].size() != partialIndex.size()) +void RankCluster::readRankingRank(vector> const &X, int const &dim, int const &j, vector const &indM) +{ + //initialization + int indiceElement = 0; + data_[dim][j].isPartial = false; + + //multi dim rank temporary + vector> temp(m_[dim] + 1); + + for (int i = indM[dim]; i < indM[dim + 1]; i++) + { + temp[X[j][i]].push_back(indiceElement + 1); + indiceElement++; + } + + //vector containing index of partial element + vector partialIndex; + + int skip = 0; + //index 0 is for missing, we don't manage in this loop + for (int i = 1; i < (int)temp.size(); i++) + { + if (skip) { + if (temp[i].size() != 0) + { dataOk_ = false; indexPb_[dim].push_back(j + 1); //throw string("Problem with data."); - } + } - //add partial - if (temp[0].size() != 0) + skip--; + } + else { - data_[dim][j].isNotFull = true; + //tied case + if (temp[i].size() > 1) + { + data_[dim][j].isPartial = true; partial_ = true; - data_[dim][j].missingData.push_back(temp[0]); - data_[dim][j].missingIndex.push_back(partialIndex); + skip = temp[i].size() - 1; + data_[dim][j].missingData.push_back(temp[i]); + + vector missingIndex(temp[i].size()); + + for (int ii = 0; ii < (int)temp[i].size(); ii++) + missingIndex[ii] = i + ii - 1; + + data_[dim][j].missingIndex.push_back(missingIndex); + } + else + { + //normal case + if (temp[i].size() == 1) + data_[dim][j].x[i - 1] = temp[i][0]; + else //temp[i].size=0//partial case + partialIndex.push_back(i - 1); + } } + } + + //problem with the data : index of 0 et element at missing position don't match + if (temp[0].size() != partialIndex.size()) + { + dataOk_ = false; + indexPb_[dim].push_back(j + 1); + //throw string("Problem with data."); + } + + //add partial + if (temp[0].size() != 0) + { + data_[dim][j].isPartial = true; + partial_ = true; + data_[dim][j].missingData.push_back(temp[0]); + data_[dim][j].missingIndex.push_back(partialIndex); + } } void RankCluster::conversion2data(vector> const &X) { - //size of a row of X - vector indM(d_ + 1, 0); - for (int i = 0; i < d_; i++) - indM[i + 1] = indM[i] + m_[i]; + //size of a row of X + vector indM(d_ + 1, 0); + for (int i = 0; i < d_; i++) + indM[i + 1] = indM[i] + m_[i]; - //resize data - for (int i = 0; i < d_; i++) - for (int j = 0; j < n_; j++) 
- data_[i][j].rank.resize(m_[i]); - - //begin the read of the data row by row + //resize data + for (int i = 0; i < d_; i++) for (int j = 0; j < n_; j++) + data_[i][j].x.resize(m_[i]); + + //begin the read of the data row by row + for (int j = 0; j < n_; j++) + { + //dim by dim + for (int dim(0); dim < d_; dim++) { - //dim by dim - for (int dim(0); dim < d_; dim++) - { - //read rank j of dim dim - readRankingRank(X, dim, j, indM); - } + //read rank j of dim dim + readRankingRank(X, dim, j, indM); } + } } + +/********************************************************************************************** + * + * INITIALIZATION METHODS + * + **********************************************************************************************/ void RankCluster::initialization() { - double alea; + // t0 = clock(); + initializeZ(); + initializeP(); + initializeMu(); + estimateProportion(); + initializePartialRank(); + fillIndexPartialData(); + saveInitialization(); + // t1 = clock(); + + // if(parameter_.verbose) + // cout << "Initialization completed in " <<(double) (t1-t0)/CLOCKS_PER_SEC << "s." << endl; +} - //zik initialization with multinomial of equal proba - if (g_ != 1) - { - for (int i = 0; i < n_; i++) - { - alea = runif(0., 1.); - for (int j = 0; j < g_; j++) - if ((alea > (double)j / g_) & (alea < (double)(j + 1) / g_)) - { - z_[i] = j; - break; - } - } - } - else +// random initialization of z_: multinomial law of size g_ +void RankCluster::initializeZ() +{ + for (int i = 0; i < n_; i++) + z_[i] = randWrapper(g_); +} + +// initialization of p_ with double between 0.5 and 1 +void RankCluster::initializeP() +{ + for (int k = 0; k < d_; k++) + { + for (int i = 0; i < g_; i++) { - for (int i(0); i < n_; i++) - z_[i] = 0; + p_[k][i] = (double) runif(0.5, 1.); } + } +} - //mu & p initialization - for (int k = 0; k < d_; k++) +// random initialization of mu_ with random rank of size m_ +void RankCluster::initializeMu() +{ + for (int k = 0; k < d_; k++) + { + for (int i = 0; i < g_; i++) { - for (int i = 0; i < g_; i++) - { - //initialization of p_ with double between 0.5 and 1 - alea = (double)runif(0.5, 1.); - p_[k][i] = alea; - //initialization of mu_ with alea rank of size m_ - mu_[k][i].resize(m_[k]); - for (int j = 0; j < m_[k]; j++) - mu_[k][i][j] = j + 1; - Rshuffle(mu_[k][i].begin(), mu_[k][i].end()); - } + mu_[k][i].resize(m_[k]); + randomRank(mu_[k][i]); } + } +} - //proportion initialization - for (int i = 0; i < n_; i++) - proportion_[z_[i]]++; +// estimate proportion using z_: proportion of each class among z_ +void RankCluster::estimateProportion() +{ + for (int k = 0; k < g_; k++) + proportion_[k] = 0; - for (int i = 0; i < g_; i++) - proportion_[i] /= (double)n_; + for (int i = 0; i < n_; i++) + proportion_[z_[i]]++; - //partial data and order of presentation initialization - for (int dim = 0; dim < d_; dim++) + for (int k = 0; k < g_; k++) + proportion_[k] /= (double)n_; +} + +// order of presentation random initialization +void RankCluster::initializeY() +{ + for (int dim = 0; dim < d_; dim++) + { + vector rankTemp(m_[dim]); + initializeRank(rankTemp); + for (int ind = 0; ind < n_; ind++) { - vector rankTemp(m_[dim]); - for (int i = 0; i < m_[dim]; i++) - rankTemp[i] = i + 1; - for (int ind = 0; ind < n_; ind++) - { - //initialization of y - Rshuffle(rankTemp.begin(), rankTemp.end()); - data_[dim][ind].y = rankTemp; + Rshuffle(rankTemp.begin(), rankTemp.end()); + data_[dim][ind].y = rankTemp; + } + } +} - if (data_[dim][ind].isNotFull) - { - for (int ii = 0; ii < 
(int)data_[dim][ind].missingIndex.size(); ii++) - { - //initialization of Partial Rank - vector rankTemp2(data_[dim][ind].missingIndex[ii]); - Rshuffle(rankTemp2.begin(), rankTemp2.end()); +// order of partial rank: random initialization with missing index +void RankCluster::initializePartialRank() +{ + for (int dim = 0; dim < d_; dim++) + { + for (int ind = 0; ind < n_; ind++) + { + if (data_[dim][ind].isPartial) + { + for (int ii = 0; ii < (int)data_[dim][ind].missingIndex.size(); ii++) + { + Rank rankTemp(data_[dim][ind].missingIndex[ii]); + Rshuffle(rankTemp.begin(), rankTemp.end()); - for (int iii = 0; iii < (int)data_[dim][ind].missingData[ii].size(); iii++) - data_[dim][ind].rank[rankTemp2[iii]] = data_[dim][ind].missingData[ii][iii]; - } - } + for (int iii = 0; iii < (int)data_[dim][ind].missingData[ii].size(); iii++) + data_[dim][ind].x[rankTemp[iii]] = data_[dim][ind].missingData[ii][iii]; } + } } + } +} + - indexPartialData_ = vector>(d_); - for (int dim = 0; dim < d_; dim++) +void RankCluster::fillIndexPartialData() +{ + indexPartialData_ = vector>(d_); + for (int dim = 0; dim < d_; dim++) + { + for (int ind = 0; ind < n_; ind++) { - for (int ind = 0; ind < n_; ind++) - { - if (data_[dim][ind].isNotFull) - indexPartialData_[dim].push_back(ind); - } + if (data_[dim][ind].isPartial) + indexPartialData_[dim].push_back(ind); } + } +} - vector>> donneesPartiel(d_); - for (int dim = 0; dim < d_; dim++) - for (vector::iterator it = indexPartialData_[dim].begin(); it != indexPartialData_[dim].end(); it++) - donneesPartiel[dim].push_back(data_[dim][*it].rank); - - //sauvegarde initialisation - output_.initialPartialRank = donneesPartiel; - output_.initialP = p_; - output_.initialZ = z_; - output_.initialMu = mu_; - output_.initialProportion = proportion_; +// save initialization in output_ member +void RankCluster::saveInitialization() +{ + vector> partialRankData(d_); + for (int dim = 0; dim < d_; dim++) + for (vector::iterator it = indexPartialData_[dim].begin(); it != indexPartialData_[dim].end(); it++) + partialRankData[dim].push_back(data_[dim][*it].x); + + output_.initialPartialRank = partialRankData; + output_.initialP = p_; + output_.initialZ = z_; + output_.initialMu = mu_; + output_.initialProportion = proportion_; } + + +/********************************************************************************************** + * + * SE STEP METHODS + * + **********************************************************************************************/ + void RankCluster::SEstep() { - //simulation of order of presentation for each dimension - for (int dim = 0; dim < d_; dim++) - gibbsY(dim); + // simulation of order of presentation for each dimension + for (int dim = 0; dim < d_; dim++) + gibbsY(dim); - //simulation of z - zSimulation(); + sampleZ(); - //simulation of partial rank for each dimension - for (int dim = 0; dim < d_; dim++) - gibbsX(dim); + // simulation of partial rank for each dimension + for (int dim = 0; dim < d_; dim++) + gibbsX(dim); } void RankCluster::gibbsY(int indexDim) { - double p1(0), p2(0), alea(0); - set::iterator itset; + double logPcurrent(0), logPcandidate(0); + set::iterator itset; - //rank 1 2..m - vector yTemp(m_[indexDim]); - for (int j = 0; j < m_[indexDim]; j++) - yTemp[j] = j + 1; + //rank 1 2..m + // Rank yTemp(m_[indexDim]); + // initializeRank(yTemp); - for (int ind = 0; ind < n_; ind++) - { - //Gibbs sampling - vector y(m_[indexDim]), y2(m_[indexDim]), y1(m_[indexDim]); + for (int ind = 0; ind < n_; ind++) + { + //Gibbs sampling + Rank 
y(m_[indexDim]), yCandidate(m_[indexDim]), yCurrent(m_[indexDim]); - //initialization of p1 and y1 - y = yTemp; - Rshuffle(y.begin(), y.end()); //simulation of alea rank - y1 = y; - p1 = probaCond(data_[indexDim][ind].rank, y1, mu_[indexDim][z_[ind]], p_[indexDim][z_[ind]]); + //initialization of p1 and y1 + randomRank(y); + yCurrent = y; + logPcurrent = lnProbaCond(data_[indexDim][ind].x, yCurrent, mu_[indexDim][z_[ind]], p_[indexDim][z_[ind]]); - //start of iteration - for (int iter(0); iter < parameter_.nGibbsSE[indexDim]; iter++) + //start of iteration + for (int iter(0); iter < parameter_.nGibbsSE[indexDim]; iter++) + { + for (int i(0); i < m_[indexDim] - 1; i++) + { + // new y to test (old y with inversion of 2 adjacent elements) + yCandidate = y; + yCandidate[i] = y[i + 1]; + yCandidate[i + 1] = y[i]; + + // compute the probability of accept the change of y + logPcandidate = lnProbaCond(data_[indexDim][ind].x, yCandidate, + mu_[indexDim][z_[ind]], p_[indexDim][z_[ind]]); + + bool changeAccepted = acceptChange(logPcurrent, logPcandidate); + if (changeAccepted) //change acceptation { - for (int k(0); k < m_[indexDim] - 1; k++) - { - //new y to test (old y with inversion of 2 adjacents elements) - y2 = y; - y2[k] = y[k + 1]; - y2[k + 1] = y[k]; - - //compute the probability of accept the changement of y - p2 = probaCond(data_[indexDim][ind].rank, y2, mu_[indexDim][z_[ind]], p_[indexDim][z_[ind]]); - - alea = (double)runif(0, (p1 + p2)); - - if (alea < p2) //changement acceptation - { - y = y2; - p1 = p2; - y1 = y; - } - else - y = y1; - } + y = yCandidate; + logPcurrent = logPcandidate; + yCurrent = y; } - data_[indexDim][ind].y = y; + else + y = yCurrent; + } } + data_[indexDim][ind].y = y; + } } -void RankCluster::zSimulation() -{ - if (g_ != 1) - { - double alea(0), sumTik(0); - vector lim(g_ + 1, 0), tik(g_); - - for (int ind(0); ind < n_; ind++) - { - //computation of the probability to belong to each cluster - for (int k(0); k < g_; k++) - tik[k] = 1; +Eigen::ArrayXd RankCluster::computeTik(int const ind) +{ + Eigen::ArrayXd tik = ArrayXd::Zero(g_); - sumTik = 0; - for (int k(0); k < g_; k++) - { - for (int l(0); l < d_; l++) - tik[k] *= probaCond(data_[l][ind].rank, data_[l][ind].y, mu_[l][k], p_[l][k]); - tik[k] *= proportion_[k]; - sumTik += tik[k]; - } + for (int k(0); k < g_; k++) + { + for (int l(0); l < d_; l++) + tik(k) += lnProbaCond(data_[l][ind].x, data_[l][ind].y, mu_[l][k], p_[l][k]); - for (int i(0); i < g_; i++) - tik[i] /= sumTik; + tik[k] += log(proportion_[k]); + } + normalizeLogProbaInPlace(tik); - //z follow a multinomial law of parameter tik - for (int k(1); k < g_ + 1; k++) - lim[k] = lim[k - 1] + tik[k - 1]; + return tik; +} - alea = (double)runif(0., 1.); +/* z follow a multinomial law of parameter tik */ +void RankCluster::sampleZ() +{ + if (g_ != 1) + { + Eigen::ArrayXd tik(g_); - for (int j(0); j < g_; j++) - if ((lim[j] <= alea) && (alea <= lim[j + 1])) - { - z_[ind] = j; - break; - } - } - } - else + for (int ind(0); ind < n_; ind++) { - for (int i = 0; i < (int)z_.size(); i++) - z_[i] = 0; + // computation of the probability to belong to each cluster + tik = computeTik(ind); + + z_[ind] = sampleMultinomial(tik); } + } + else + { + for (int ind(0); ind < n_; ind++) + z_[ind] = 0; + } } void RankCluster::gibbsX(int indexDim) { - double p1(0), p2(0), alea(0); + double logPcurrent(0), logPcandidate(0); - for (int ind = 0; ind < n_; ind++) + for (int ind = 0; ind < n_; ind++) + { + if (data_[indexDim][ind].isPartial) { - if 
(data_[indexDim][ind].isNotFull) - { - //Algorithme de Gibbs - vector x(m_[indexDim]), x2(m_[indexDim]), x1(m_[indexDim]); + // Gibbs algorithm + Rank x(m_[indexDim]), xCurrent(m_[indexDim]), xCandidate(m_[indexDim]); - //initialisation de mu et p pour Gibbs - x = data_[indexDim][ind].rank; - x1 = x; - p1 = probaCond(x1, data_[indexDim][ind].y, mu_[indexDim][z_[ind]], p_[indexDim][z_[ind]]); + // mu and log-probability initialization + x = data_[indexDim][ind].x; + xCurrent = x; + logPcurrent = lnProbaCond(xCurrent, data_[indexDim][ind].y, mu_[indexDim][z_[ind]], p_[indexDim][z_[ind]]); - for (int iter = 0; iter < parameter_.nGibbsSE[indexDim]; iter++) + for (int iter = 0; iter < parameter_.nGibbsSE[indexDim]; iter++) + { + for (int ii = 0; ii < (int)data_[indexDim][ind].missingIndex.size(); ii++) + { + for (int i = 0; i < (int)data_[indexDim][ind].missingIndex[ii].size() - 1; i++) + { + // new rank to test: old rank where 2 partial elements are exchanged + xCandidate = x; + xCandidate[data_[indexDim][ind].missingIndex[ii][i]] = x[data_[indexDim][ind].missingIndex[ii][i + 1]]; + xCandidate[data_[indexDim][ind].missingIndex[ii][i + 1]] = x[data_[indexDim][ind].missingIndex[ii][i]]; + + logPcandidate = lnProbaCond(xCandidate, data_[indexDim][ind].y, mu_[indexDim][z_[ind]], + p_[indexDim][z_[ind]]); + + bool changeAccepted = acceptChange(logPcurrent, logPcandidate); + if (changeAccepted) { - for (int ii = 0; ii < (int)data_[indexDim][ind].missingIndex.size(); ii++) - { - for (int i = 0; i < (int)data_[indexDim][ind].missingIndex[ii].size() - 1; i++) - { - //nouveau x à tester, ancien x auquel on inverse 2 éléments partiels - x2 = x; - x2[data_[indexDim][ind].missingIndex[ii][i]] = x[data_[indexDim][ind].missingIndex[ii][i + 1]]; - x2[data_[indexDim][ind].missingIndex[ii][i + 1]] = x[data_[indexDim][ind].missingIndex[ii][i]]; - - p2 = probaCond(x2, data_[indexDim][ind].y, mu_[indexDim][z_[ind]], p_[indexDim][z_[ind]]); - - alea = (double)runif(0., p1 + p2); - - if (alea < p2) //acceptation du changement - { - x = x2; - p1 = p2; - x1 = x; - } - else - x = x1; - } - } + x = xCandidate; + logPcurrent = logPcandidate; + xCurrent = x; } - data_[indexDim][ind].rank = x; + else + x = xCurrent; + } } + } + data_[indexDim][ind].x = x; } + } } + +/********************************************************************************************** + * + * M STEP METHODS + * + **********************************************************************************************/ + void RankCluster::Mstep() { - //MAJ proportion - for (int k(0); k < g_; k++) - { - proportion_[k] = 0; - for (int ind(0); ind < n_; ind++) - { - if (z_[ind] == k) - proportion_[k]++; - } - proportion_[k] /= (double)n_; - if (proportion_[k] == 0) - throw string("NON CONVERGENCE DE L'ALGORITHME : a proportion is equal to 0"); - } - - //simulation of mu for ezach dim and cluster - for (int dim(0); dim < d_; dim++) - { - for (int numCl(0); numCl < g_; numCl++) - simuM(dim, numCl); - } + // update proportion + estimateProportion(); + + for (int k(0); k < g_; k++) + { + if (proportion_[k] == 0) + throw string("Algorithm did not converge: a proportion is equal to 0"); + } + + // simulation of mu for each dim and cluster + for (int dim(0); dim < d_; dim++) + { + for (int classNumber(0); classNumber < g_; classNumber++) + estimateMuP(dim, classNumber); + } } -void RankCluster::simuM(int indexDim, int indCl) -{ - long double p1(0), p2(0), alea(0), lnp1(0), lnp2(0), lnp1Plusp2(0); - double s1(0), lim(-numeric_limits::max()); - vector 
tabFact(m_[indexDim]), comp(2); - tabFact = tab_factorial(m_[indexDim]); - vector> MU(parameter_.nGibbsM[indexDim]); - vector P(parameter_.nGibbsM[indexDim], 0), L(parameter_.nGibbsM[indexDim], 0); - int indMu, compteur(0); - int tailleComposante(0); +void RankCluster::estimateMuP(int indexDim, int classNumber) +{ + vector tabFact = tab_factorial(m_[indexDim]); + + // vector to store results of each iteration + vector MU(parameter_.nGibbsM[indexDim]); + vector P(parameter_.nGibbsM[indexDim], 0.), L(parameter_.nGibbsM[indexDim], 0.); + + // elements will be grouped per mu. If the new mu is the same, we do not recompute P and likelihood + map> resPerMu; + map>::iterator iteratorResPerMu; + int indMu; + + // initialization of mu + Rank mu = mu_[indexDim][classNumber]; + + // initial completed log-likelihood + double logPcurrent = 0.; + int sizeCluster = 0.; + for (int ind(0); ind < n_; ind++) + { + if (z_[ind] == classNumber) + { + logPcurrent += lnProbaCond(data_[indexDim][ind].x, data_[indexDim][ind].y, mu, p_[indexDim][classNumber]); + sizeCluster++; + } + } - for (int i(0); i < n_; i++) - if (z_[i] == indCl) - tailleComposante++; + // Gibbs algorithm + for (int iter(0); iter < parameter_.nGibbsM[indexDim]; iter++) + { + simulateCandidateMuKJ(indexDim, classNumber, mu, logPcurrent); + MU[iter] = mu; - vector G(tailleComposante), A_G(tailleComposante); + indMu = rank2index(MU[iter], tabFact); + iteratorResPerMu = resPerMu.find(indMu); - map> muTeste; - map>::iterator itmapmu; + if (iteratorResPerMu == resPerMu.end()) //if we have already tested this mu, we do not redo computation + { + double sumG(0.), sumA_G(0.); + vector vecTemp(2); - //Algorithme de Gibbs - vector mu(m_[indexDim]), mu2(m_[indexDim]), mu1(m_[indexDim]); + P[iter] = updatePKJ(indexDim, classNumber, sizeCluster, mu, sumG, sumA_G); - //initialization of mu and p - mu = mu_[indexDim][indCl]; - mu1 = mu; - lnp1 = 0; + L[iter] = computeCompletedLoglikehoodKJ(P[iter], sumG, sumA_G); - //initial proba - for (int ind(0); ind < n_; ind++) + vecTemp[0] = P[iter]; + vecTemp[1] = L[iter]; + resPerMu[indMu] = vecTemp; + } + else { - if (z_[ind] == indCl) - { - p1 = probaCond(data_[indexDim][ind].rank, data_[indexDim][ind].y, mu1, p_[indexDim][indCl]); - lnp1 += log(p1); - } + P[iter] = (iteratorResPerMu->second)[0]; + L[iter] = (iteratorResPerMu->second)[1]; } - for (int iter(0); iter < parameter_.nGibbsM[indexDim]; iter++) - { - //new mu - for (int k(0); k < m_[indexDim] - 1; k++) - { - //new mu to test - mu2 = mu; - mu2[k] = mu[k + 1]; - mu2[k + 1] = mu[k]; + p_[indexDim][classNumber] = P[iter]; + } - lnp2 = 0; + // find the best (mu, p) according to completed log-likelihood + int indice(max_element(L.begin(), L.end()) - L.begin()); + p_[indexDim][classNumber] = P[indice]; + mu_[indexDim][classNumber] = MU[indice]; +} - //new proba - for (int ind(0); ind < n_; ind++) - { - if (z_[ind] == indCl) - { - p2 = probaCond(data_[indexDim][ind].rank, data_[indexDim][ind].y, mu2, p_[indexDim][indCl]); - lnp2 += log(p2); - } - } - //p1+p2 - //ln(p1+p2)~ln( - long double diffln; - if (lnp1 > lnp2) - { - diffln = lnp2 - lnp1; - lnp1Plusp2 = lnp1; - } - else - { - diffln = lnp1 - lnp2; - lnp1Plusp2 = lnp2; - } +void RankCluster::simulateCandidateMuKJ(int indexDim, int classNumber, Rank &mu, double &logPcurrent) +{ + Rank muCandidate, muCurrent(mu); // ? is muCurrent necessary ? 
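+  // Sketch of the move performed below: each candidate mu swaps two adjacent elements of the current mu, +  // and acceptChange() (not shown in this patch) decides on the two completed log-probabilities; +  // in the removed simuM code this amounted to accepting the candidate with probability +  // p_candidate / (p_current + p_candidate).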
+ double logPcandidate; - //taylor development of order for estiamting lnp1Plusp2 - for (int ordre(1); ordre < 6; ordre++) - { - //* (long double) std::pow((int) -1,(int) ordre-1) - int sign = 1; - if (ordre % 2 == 1) - sign = -1; - lnp1Plusp2 += (long double)sign / ordre * exp(diffln * ordre); - } + // new mu + for (int k(0); k < m_[indexDim] - 1; k++) + { + // new mu to test + muCandidate = mu; + muCandidate[k] = mu[k + 1]; + muCandidate[k + 1] = mu[k]; - alea = (long double)runif(0., 1.); + logPcandidate = 0.; - // acceptaion of change or not - if (alea < exp(lnp2 - lnp1Plusp2)) //accept the changement - { - mu = mu2; - lnp1 = lnp2; - mu1 = mu; - } - else - mu = mu1; + // new probability + for (int ind(0); ind < n_; ind++) + { + if (z_[ind] == classNumber) + { + logPcandidate += lnProbaCond(data_[indexDim][ind].x, data_[indexDim][ind].y, + muCandidate, p_[indexDim][classNumber]); + } + } - } //fin parcours mu - MU[iter] = mu; + // acceptation of change or not + bool changeAccepted = acceptChange(logPcurrent, logPcandidate); + if (changeAccepted) + { + mu = muCandidate; + logPcurrent = logPcandidate; + muCurrent = mu; + } + else + mu = muCurrent; + } - //MAJ p - indMu = rank2index(MU[iter], tabFact); - itmapmu = muTeste.find(indMu); +} - if (itmapmu == muTeste.end()) //if we have already tested this mu, we do not redo computation - { - s1 = 0; - compteur = 0; - //computation of G and A-G (number of good and bad comparison) for loglikelihood - double somG(0), somA_G(0); - for (int ind(0); ind < n_; ind++) - { - if (z_[ind] == indCl) - { - comp = comparaison(data_[indexDim][ind].rank, data_[indexDim][ind].y, MU[iter]); - G[compteur] = comp[1]; - A_G[compteur] = comp[0] - comp[1]; - s1 += comp[0]; - somG += G[compteur]; - somA_G += A_G[compteur]; - compteur++; - } - } +/* + * sumG and sumA_G are modified to be used in computeCompletedLoglikehoodKJ function + */ +double RankCluster::updatePKJ(int indexDim, int classNumber, int nObsClass, Rank const& mu, + double &sumG, double &sumA_G) +{ + double s1 = 0.; + vector comp(2); - P[iter] = somG / s1; + // computation of G and A-G (number of good and bad comparison) for log-likelihood + sumG = 0.; + sumA_G = 0.; - if ((P[iter] != 0) & (P[iter] != 1)) - { - //L[iter]+=(G[i]*log(P[iter])+A_G[i]*log(1-P[iter])); - L[iter] = somG * log(P[iter]) + somA_G * log(1 - P[iter]); - } - else - { - if ((P[iter] == 0) & (somG == 0)) - L[iter] = 0; - else - { - if ((P[iter] == 1) & (somA_G == 0)) - L[iter] = 0; - else - L[iter] = lim; - } - } - /*vector stock(3); - stock[0]=somG; - stock[1]=somA_G; - stock[2]=s1;*/ - vector stock(2); - stock[0] = P[iter]; - stock[1] = L[iter]; - muTeste[indMu] = stock; //MAJ des mu test� - } - else - { - L[iter] = (itmapmu->second)[1]; - P[iter] = (itmapmu->second)[0]; - } + for (int ind(0); ind < n_; ind++) + { + if (z_[ind] == classNumber) + { + comp = comparaison(data_[indexDim][ind].x, data_[indexDim][ind].y, mu); - p_[indexDim][indCl] = P[iter]; + s1 += comp[0]; + sumG += comp[1]; + sumA_G += (comp[0] - comp[1]); } + } - int indice(max_element(L.begin(), L.end()) - L.begin()); - p_[indexDim][indCl] = P[indice]; - mu_[indexDim][indCl] = MU[indice]; + return sumG / s1; } -typedef struct ListeMu ListeMu; -void RankCluster::likelihood(vector>>> &listeMu, vector>> &resP, vector> &resProp) +/* + * Compute the completed log-likelihood to choose the best couple (mu_k^j, p_k^j). 
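+ * (Here p_k^j is the estimate returned by updatePKJ above, i.e. sum_i G_i^j / sum_i A_i^j over the individuals i + * currently assigned to class k.)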
+ * + * Maximizing the completed log-likelihood for a new couple (mu_k^j, p_k^j) + * (dimension j = indexDim, cluster k = classNumber) + * is equivalent to maximizing + * \sum_{individuals i of class k} P(x_i^k | y_i^k; mu_k^j, p_k^j) = + * \sum_{individuals i of class k} G_i^j log(p_k^j) + \sum_{individuals i of class k} A_i^j - G_i^j) log(1 - p_k^j) + */ +double RankCluster::computeCompletedLoglikehoodKJ(double p, double sumG, double sumA_G) { - //we put the same mu together and make the mean of their parameters - //double t1,t2,tL(0); - - //t1=clock(); - struct ListeMu; - struct ListeMu - { - double compteur; //number of same mu - std::vector>> rangComplet; - std::vector> p; - std::vector prop; - ListeMu *suivant; - }; - - bool continuer(true), egaliteRang; - - ListeMu *headMu = new ListeMu; - ListeMu *currMu = headMu; - ListeMu *next = 0; - currMu->compteur = 1; - currMu->suivant = 0; - currMu->rangComplet = listeMu[0]; - currMu->p = resP[0]; - currMu->prop = resProp[0]; - - int nbMu(1); - - for (int j(1); j < parameter_.maxIt - parameter_.burnAlgo; j++) //we see all the mu + double loglikelihood = 0.; + if ((p != 0) && (p != 1)) + { + loglikelihood = sumG * log(p) + sumA_G * log(1 - p); + } + else + { + if ((p == 0) && (sumG == 0)) + loglikelihood = 0.; + else { - continuer = true; - currMu = headMu; - while (continuer) - { - egaliteRang = true; - //look if the j-th mu is the same that the current mu - for (int J(0); J < d_; J++) - for (int k(0); k < g_; k++) - { - for (int i(0); i < m_[J]; i++) - if (currMu->rangComplet[J][k][i] != listeMu[j][J][k][i]) - { - egaliteRang = false; - break; - } - } - - if (egaliteRang) - { - //same mu - currMu->compteur++; - //we sum the proportion and p - for (int compt1(0); compt1 < g_; compt1++) - { - currMu->prop[compt1] += resProp[j][compt1]; - for (int compt2(0); compt2 < d_; compt2++) - currMu->p[compt2][compt1] += resP[j][compt2][compt1]; - } - continuer = false; //no need to check other mu of the struct - } - else - { - //not the same mu - if (currMu->suivant == 0) - { //if the current mu is the last, we add thz j-th mu in the struct - nbMu++; - continuer = false; - next = new ListeMu; - currMu->suivant = next; - currMu = next; - currMu->compteur = 1; - currMu->suivant = 0; - currMu->rangComplet = listeMu[j]; - currMu->prop = resProp[j]; - currMu->p = resP[j]; - } - else - currMu = currMu->suivant; //we will test the next mu - } - } + if ((p == 1) && (sumA_G == 0)) + loglikelihood = 0.; + else + loglikelihood = -numeric_limits::max(); } + } - //t2=clock(); - //cout<<"Temps regroupement mu: "<<(double) (t2-t1)/CLOCKS_PER_SEC<<"s"<>> &resMu, + vector>> &resP, + vector> &resProp) +{ + // double t1, t2, tL(0); + + // t1 = clock(); + + MuList *headMu = findDifferentMu(resMu, resP, resProp); + MuList *currMu = headMu; + MuList *next = 0; + + // t2 = clock(); + // cout << "Temps regroupement mu: " << (double) (t2-t1)/CLOCKS_PER_SEC << "s" << endl; - //calcul logvraisemblance - //if(parameter_.detail) - //cout<<"Number of reference rank which must compute the log-likelihood: "<::max()), L; + double Llast(-numeric_limits::max()), L; - vector>> Y(d_, vector>(n_)), xPartialTemp(output_.initialPartialRank); - vector>> scoreTemp(output_.initialPartialRank.size()); - for (int ii = 0; ii < (int)scoreTemp.size(); ii++) + vector> Y(d_, vector(n_)), xPartialTemp(output_.initialPartialRank); + + vector>> scoreTemp(output_.initialPartialRank.size()); + for (int ii = 0; ii < (int)scoreTemp.size(); ii++) + { + 
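    // scoreTemp mirrors the shape of output_.initialPartialRank (dimension x partially
    // observed individual x rank element); it receives the confidence scores of the imputed
    // partial ranks and is saved into output_.partialRankScore when a mu improves the log-likelihood.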
scoreTemp[ii].resize(output_.initialPartialRank[ii].size()); + for (int iii = 0; iii < (int)scoreTemp[ii].size(); iii++) { - scoreTemp[ii].resize(output_.initialPartialRank[ii].size()); - for (int iii = 0; iii < (int)scoreTemp[ii].size(); iii++) - { - scoreTemp[ii][iii].resize(output_.initialPartialRank[ii][iii].size()); - } + scoreTemp[ii][iii].resize(output_.initialPartialRank[ii][iii].size()); } + } - //Now, we have the list of all the different Mu - currMu = headMu; - ArrayXXd tik(n_, g_); - ArrayXXd probabilities(n_, g_); //estimate probability for an individual to belong to each cluster + // Now, we have the list of all the different Mu + currMu = headMu; + ArrayXXd tik(n_, g_); + ArrayXXd probabilities(n_, g_); // estimate probability for an individual to belong to each cluster + bool hasNextElement; + + // for each mu, we will compute the associated log likelihood + do + { + // if(parameter_.verbose) + // cout<<"*"; + + meanParameters(currMu); + + // compute the log likelihood + // t1 = clock(); + L = computeLogLikelihood(currMu->fullRank, currMu->p, currMu->prop, tik, Y, xPartialTemp, probabilities, scoreTemp); + // t2 = clock(); + // tL += t2-t1; - //for each mu, we will compute the associate log likelihood - while (currMu->suivant != 0) + if (L > Llast) { - //if(parameter_.detail) - //cout<<"*"; + // the current mu has a better log-likelihood, we save the parameter + Llast = L; + mu_ = currMu->fullRank; + p_ = currMu->p; + proportion_ = currMu->prop; + output_.tik = tik; + output_.L = L; + output_.probabilities = probabilities; + output_.partialRankScore = scoreTemp; + for (int dim = 0; dim < d_; dim++) + { + for (int ind = 0; ind < n_; ind++) + data_[dim][ind].y = Y[dim][ind]; - //mean of the parameter - for (int compt1(0); compt1 < g_; compt1++) + int compteur(0); + for (vector::iterator it = indexPartialData_[dim].begin(); it != indexPartialData_[dim].end(); it++) { - currMu->prop[compt1] /= currMu->compteur; - for (int compt2(0); compt2 < d_; compt2++) - currMu->p[compt2][compt1] /= currMu->compteur; + data_[dim][*it].x = xPartialTemp[dim][compteur]; + compteur++; } + } + } - //compute the log likelihood - //t1=clock(); - L = computeLikelihood(currMu->rangComplet, currMu->p, currMu->prop, tik, Y, xPartialTemp, probabilities, scoreTemp); - //t2=clock(); - //tL+=t2-t1; + next = currMu->nextMu; + hasNextElement = (currMu->nextMu != 0); + delete currMu; // delete the mu + currMu = next; + } + while(hasNextElement); - if (L > Llast) - { - //the current mu has a better loglikelihood, we save the parameter - Llast = L; - mu_ = currMu->rangComplet; - p_ = currMu->p; - proportion_ = currMu->prop; - output_.tik = tik; - output_.L = L; - output_.probabilities = probabilities; - output_.partialRankScore = scoreTemp; - for (int dim = 0; dim < d_; dim++) - { - for (int ind = 0; ind < n_; ind++) - data_[dim][ind].y = Y[dim][ind]; - int compteur(0); - for (vector::iterator it = indexPartialData_[dim].begin(); it != indexPartialData_[dim].end(); it++) - { - data_[dim][*it].rank = xPartialTemp[dim][compteur]; - compteur++; - } - } - } + // if(parameter_.verbose) + // { + // cout << "Computing time for log-likelihood approximation: " << (double)tL / CLOCKS_PER_SEC + // << "s (" << (double)tL / CLOCKS_PER_SEC / compteur << "s per mu)." 
<< endl; + // } - next = currMu->suivant; - delete currMu; //delete the mu - currMu = next; - compteur++; - } +} - //the last mu of the struct - //mean of the parameter - for (int compt1 = 0; compt1 < g_; compt1++) - { - currMu->prop[compt1] /= currMu->compteur; - for (int compt2 = 0; compt2 < d_; compt2++) - currMu->p[compt2][compt1] /= currMu->compteur; - } - //if(parameter_.detail) - //cout<<"*"<>> &resMu, vector>> &resP, + vector> &resProp) +{ + bool notFound(true), equalRank; - //compute log likelihood - //t1=clock(); - L = computeLikelihood(currMu->rangComplet, currMu->p, currMu->prop, tik, Y, xPartialTemp, probabilities, scoreTemp); + MuList *headMu = new MuList; + MuList *currMu = headMu; + currMu->freq = 1; + currMu->nextMu = 0; + currMu->fullRank = resMu[0]; + currMu->p = resP[0]; + currMu->prop = resProp[0]; - //t2=clock(); - //tL+=t2-t1; - compteur++; + int nbMu(1); - if (L > Llast) + for (int it(1); it < (parameter_.maxIt - parameter_.burnAlgo); it++) //we see all the mu + { + notFound = true; + currMu = headMu; + while (notFound) { - //the current mu has a better loglikelihood, we save the parameter - Llast = L; - mu_ = currMu->rangComplet; - p_ = currMu->p; - proportion_ = currMu->prop; - output_.tik = tik; - output_.L = L; - output_.probabilities = probabilities; - output_.partialRankScore = scoreTemp; - vector compteurPartiel(d_, 0); - - for (int dim = 0; dim < d_; dim++) + equalRank = true; + // look if the j-th mu is the same that the current mu + for (int j(0); j < d_; j++) + { + for (int k(0); k < g_; k++) { - for (int ind = 0; ind < n_; ind++) - data_[dim][ind].y = Y[dim][ind]; - - int compteur(0); - for (vector::iterator it = indexPartialData_[dim].begin(); it != indexPartialData_[dim].end(); it++) + for (int i(0); i < m_[j]; i++) + if (currMu->fullRank[j][k][i] != resMu[it][j][k][i]) { - data_[dim][*it].rank = xPartialTemp[dim][compteur]; - compteur++; + equalRank = false; + break; } } + } + + if (equalRank) + { + currMu->freq++; + // we sum the proportion and p + for (int k(0); k < g_; k++) + { + currMu->prop[k] += resProp[it][k]; + for (int j(0); j < d_; j++) + currMu->p[j][k] += resP[it][j][k]; + } + notFound = false; // no need to check other mu of the struct + } + else + { + // if the current mu is the last, we add the j-th mu in the struct + if (currMu->nextMu == 0) + { + nbMu++; + notFound = false; + currMu->nextMu = new MuList; + currMu = currMu->nextMu; + currMu->freq = 1; + currMu->nextMu = 0; + currMu->fullRank = resMu[it]; + currMu->prop = resProp[it]; + currMu->p = resP[it]; + } + else + currMu = currMu->nextMu; // we will test the next mu + } } - delete currMu; //delete the last mu + } - //if(parameter_.detail) - //cout<<"Computing time for log-likelihood approximation: "<<(double) tL/CLOCKS_PER_SEC<<"s ("<<(double) tL/CLOCKS_PER_SEC/compteur<<"s per mu)."<>> const &mu, vector> const &p, - vector const &proportion, ArrayXXd &tik, vector>> &Y, vector>> &xTemp, ArrayXXd &probabilities, - vector>> &score) + +void RankCluster::meanParameters(MuList * currMu) +{ + for (int k(0); k < g_; k++) + { + currMu->prop[k] /= (double) currMu->freq; + for (int j(0); j < d_; j++) + currMu->p[j][k] /= (double) currMu->freq; + } +} + + +// LL gibbs +double RankCluster::computeLogLikelihood(vector> const &mu, vector> const &p, + vector const &proportion, ArrayXXd &tik, + vector> &Y, vector> &xTemp, + ArrayXXd &probabilities, + vector>> &score) { long double p1(0), p2(0), p1x(0), p2x(0), alea(0), l(0), li(0); double div((double)(parameter_.nGibbsL - parameter_.burnL)); 
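  // div is the number of Gibbs draws kept after burn-in (nGibbsL - burnL); for each individual,
  // the presentation orders y (and, for partial ranks, x) are sampled with a Gibbs sampler and
  // the contributions accumulated after burn-in (likelihood terms and tik row) are averaged over
  // these div draws.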
vector compteur(d_, 0); - vector x1, x2; + Rank x1, x2; //objet pour stockage de calcul pour éviter répétition ArrayXXd proba1(d_, g_), proba2(d_, g_); @@ -872,7 +895,7 @@ double RankCluster::computeLikelihood(vector>> const &mu, vec ArrayXXd propb(1, g_); propb = prop.transpose(); - vector> yTemp(d_), x(d_); + vector yTemp(d_), x(d_); vector>> scoreCount(d_); for (int j = 0; j < d_; j++) { @@ -889,9 +912,9 @@ double RankCluster::computeLikelihood(vector>> const &mu, vec for (int ind = 0; ind < n_; ind++) { //cout<<"ind "<> y(d_), y2(d_), y1(d_); + vector y(d_), y2(d_), y1(d_); li = 0; - //algorithme de Gibbs pour simuler yi + //Gibbs Algorithm to sample yi //initialisation de y et p pour Gibbs y = yTemp; @@ -902,7 +925,7 @@ double RankCluster::computeLikelihood(vector>> const &mu, vec scoreCount[j][jj][jjj] = 0; Rshuffle(y[j].begin(), y[j].end()); //permutation de 1 2 3 ..m - x[j] = data_[j][ind].rank; + x[j] = data_[j][ind].x; } y1 = y; @@ -931,7 +954,8 @@ double RankCluster::computeLikelihood(vector>> const &mu, vec y2[J][K] = y[J][K + 1]; y2[J][K + 1] = y[J][K]; - for (int k = 0; k < g_; k++) //tester un stockage des proba calcul� pour �viter r�p�tition de calculs dans la boucle + //tester un stockage des proba calculées pour éviter répétition de calculs dans la boucle + for (int k = 0; k < g_; k++) proba2(J, k) = probaCond(x[J], y2[J], mu[J][k], p[J][k]); p2 = (long double)(propb * proba2.colwise().prod()).sum(); @@ -957,7 +981,7 @@ double RankCluster::computeLikelihood(vector>> const &mu, vec /*simulation des x_i^j qui sont partiels*/ for (int J = 0; J < d_; J++) { - if (data_[J][ind].isNotFull) //simulation de xi si partiel + if (data_[J][ind].isPartial) //simulation de xi si partiel { x1 = x[J]; proba1X = proba1.row(J); @@ -1015,7 +1039,7 @@ double RankCluster::computeLikelihood(vector>> const &mu, vec //compute score partial rank for (int dim = 0; dim < d_; dim++) { - if (data_[dim][ind].isNotFull) + if (data_[dim][ind].isPartial) { for (int indElem = 0; indElem < m_[dim]; indElem++) scoreCount[dim][indElem][x[dim][indElem] - 1]++; @@ -1035,7 +1059,7 @@ double RankCluster::computeLikelihood(vector>> const &mu, vec for (int j(0); j < d_; j++) { Y[j][ind] = y[j]; - if (data_[j][ind].isNotFull) + if (data_[j][ind].isPartial) { xTemp[j][compteur[j]] = x[j]; for (int elem = 0; elem < m_[j]; elem++) @@ -1050,481 +1074,494 @@ double RankCluster::computeLikelihood(vector>> const &mu, vec return l; } -//compute the final partition +/** compute the final partition + * The final partition is the argmax per row of tik + */ void RankCluster::computePartition() { - if (g_ > 1) - { //calcul partition - double max; - for (int ind(0); ind < n_; ind++) + // TODO can be done directly with Eigen + // see https://stackoverflow.com/questions/11430588/find-rowwise-maxcoeff-and-index-of-maxcoeff-in-eigen + + if (g_ > 1) + { + double max; + for (int ind(0); ind < n_; ind++) + { + max = output_.tik(ind, 0); + z_[ind] = 0; + for (int k(1); k < g_; k++) + { + if (output_.tik(ind, k) > max) { - max = output_.tik(ind, 0); - z_[ind] = 0; - for (int k(1); k < g_; k++) - { - if (output_.tik(ind, k) > max) - { - max = output_.tik(ind, k); - z_[ind] = k; - } - } - //cout<> const &resProp, vector>> const &resP, - vector>>> const &resMu, vector> const &resZ, - vector>>> const &resDonneesPartiel) + vector>> const &resMu, vector> const &resZ, + vector>> const &resDonneesPartiel) { - int const iterTotal(parameter_.maxIt - parameter_.burnAlgo); + int const iterTotal(parameter_.maxIt - parameter_.burnAlgo); - 
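  // For every post-burn-in SEM iteration, this method measures how far the stored parameters are
  // from the final estimates: a Rand index between partitions (proportion of pairs of individuals
  // classified consistently in both partitions), squared differences for the proportions and the
  // dispersion parameters p, and Kendall distances for the mu.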
//initialization of container - output_.distProp = vector>(iterTotal, vector(g_)); - output_.distP = vector>>(iterTotal, vector>(d_, vector(g_))); - output_.distMu = vector>>(iterTotal, vector>(d_, vector(g_))); - output_.distZ = vector(iterTotal); + //initialization of container + output_.distProp = vector>(iterTotal, vector(g_)); + output_.distP = vector>>(iterTotal, vector>(d_, vector(g_))); + output_.distMu = vector>>(iterTotal, vector>(d_, vector(g_))); + output_.distZ = vector(iterTotal); - //compute the distance between the output parameters and parameters from each iteration - for (int i(0); i < iterTotal; i++) - { - //distance between partition - output_.distZ[i] = indiceRand(z_, resZ[i]); + //compute the distance between the output parameters and parameters from each iteration + for (int i(0); i < iterTotal; i++) + { + //distance between partition + output_.distZ[i] = computeRandIndex(z_, resZ[i]); - for (int cl(0); cl < g_; cl++) - { - //distance between proportion - output_.distProp[i][cl] = pow(resProp[i][cl] - proportion_[cl], 2); + for (int cl(0); cl < g_; cl++) + { + //distance between proportion + output_.distProp[i][cl] = pow(resProp[i][cl] - proportion_[cl], 2); - for (int dim(0); dim < d_; dim++) - { - //distance between p - output_.distP[i][dim][cl] = pow(resP[i][dim][cl] - p_[dim][cl], 2); + for (int dim(0); dim < d_; dim++) + { + //distance between p + output_.distP[i][dim][cl] = pow(resP[i][dim][cl] - p_[dim][cl], 2); - //distance between mu - output_.distMu[i][dim][cl] = distanceKendall(mu_[dim][cl], resMu[i][dim][cl]); - } - } + //distance between mu + output_.distMu[i][dim][cl] = distanceKendall(mu_[dim][cl], resMu[i][dim][cl]); + } } + } - //distance between partial rank - vector>> distRangPartiel(iterTotal, vector>(d_)); - if (partial_) + //distance between partial rank + vector>> distRangPartiel(iterTotal, vector>(d_)); + if (partial_) + { + for (int i(0); i < iterTotal; i++) { - for (int i(0); i < iterTotal; i++) + for (int dim(0); dim < d_; dim++) + { + int compteur(0); + //for(int k(0);k::iterator it = indexPartialData_[dim].begin(); it != indexPartialData_[dim].end(); it++) { - for (int dim(0); dim < d_; dim++) - { - int compteur(0); - //for(int k(0);k::iterator it = indexPartialData_[dim].begin(); it != indexPartialData_[dim].end(); it++) - { - distRangPartiel[i][dim].push_back(distanceKendall(data_[dim][*it].rank, resDonneesPartiel[i][dim][compteur])); - compteur++; - } - } + distRangPartiel[i][dim].push_back(distanceKendall(data_[dim][*it].x, resDonneesPartiel[i][dim][compteur])); + compteur++; } + } } + } - //changement de format - vector compteurElemPartiel(d_, 0); - output_.distPartialRank = vector>>(resDonneesPartiel.size()); - vector rangTemp(d_); + //changement de format + vector compteurElemPartiel(d_, 0); + output_.distPartialRank = vector>>(resDonneesPartiel.size()); + Rank rangTemp(d_); - for (int iter(0); iter < (int)distRangPartiel.size(); iter++) - { - for (int dim(0); dim < d_; dim++) - compteurElemPartiel[dim] = 0; + for (int iter(0); iter < (int)distRangPartiel.size(); iter++) + { + for (int dim(0); dim < d_; dim++) + compteurElemPartiel[dim] = 0; - for (int ind(0); ind < n_; ind++) + for (int ind(0); ind < n_; ind++) + { + for (int dim(0); dim < d_; dim++) + { + if (data_[dim][ind].isPartial) { - for (int dim(0); dim < d_; dim++) - { - if (data_[dim][ind].isNotFull) - { - rangTemp[dim] = distRangPartiel[iter][dim][compteurElemPartiel[dim]]; - compteurElemPartiel[dim]++; - } - else - rangTemp[dim] = 0; - } - 
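        // rangTemp holds, for each dimension, the Kendall distance between the final imputed
        // partial rank and the one sampled at this iteration (0 when the rank is fully observed),
        // before being appended to output_.distPartialRank.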
output_.distPartialRank[iter].push_back(rangTemp); + rangTemp[dim] = distRangPartiel[iter][dim][compteurElemPartiel[dim]]; + compteurElemPartiel[dim]++; } + else + rangTemp[dim] = 0; + } + output_.distPartialRank[iter].push_back(rangTemp); } + } } -void RankCluster::run() +void RankCluster::storeParameters(int const iterNumber, vector> &resProp, + vector>> &resP, vector>> &resMu, + vector> &resZ, vector>> &resPartialData) { - convergence_ = false; - int nbTry(0); - while (!convergence_ && nbTry < parameter_.maxTry) + // for identifiability, we must have p >= 0.5. If not, we invert the rank. + for (int l(0); l < d_; l++) + { + for (int k(0); k < g_; k++) { - try - { - //double t0,t1,t2,t3,tM(0),tSE(0); - - //if(parameter_.detail) - //{ - //cout<<"##########################################################"< indrang(g_); - vector>> resP(parameter_.maxIt - parameter_.burnAlgo, vector>(d_, vector(g_))); - vector> resProp(parameter_.maxIt - parameter_.burnAlgo, (vector(g_))); - vector> resZ(parameter_.maxIt - parameter_.burnAlgo, vector(n_)); - vector>>> resMu(parameter_.maxIt - parameter_.burnAlgo, mu_); - vector>>> resDonneesPartiel(parameter_.maxIt - parameter_.burnAlgo, output_.initialPartialRank); - - //algorithm - //if(parameter_.detail) - for (int iter(0); iter < parameter_.maxIt; iter++) - { - //if(parameter_.detail) - //cout<<"*"; + if (p_[l][k] < 0.5) + { + p_[l][k] = 1 - p_[l][k]; + invertRank(mu_[l][k]); + } + } + } - //t2=clock(); - SEstep(); - //t3=clock(); - //tSE+=t3-t2; + // the first cluster must be the cluster with the fewer index of mu + vector indRank(g_); + for (int k(0); k < g_; k++) + indRank[k] = rank2index(mu_[0][k], tab_factorial(m_[0])); - //t2=clock(); - Mstep(); - //t3=clock(); - //tM+=t3-t2; + tri_insertionMulti(indRank, proportion_, p_, mu_, z_, g_, d_, n_); - //we store the estimated parameters - if (iter >= parameter_.burnAlgo) - { + resP[iterNumber - parameter_.burnAlgo] = p_; + resProp[iterNumber - parameter_.burnAlgo] = proportion_; + resMu[iterNumber - parameter_.burnAlgo] = mu_; + resZ[iterNumber - parameter_.burnAlgo] = z_; - for (int l(0); l < d_; l++) - { - for (int k(0); k < g_; k++) - { - if (p_[l][k] < 0.5) - { - p_[l][k] = 1 - p_[l][k]; - inverseRang(mu_[l][k]); - } - } - } + for (int dim(0); dim < d_; dim++) + { + int compteur(0); + for (vector::iterator it = indexPartialData_[dim].begin(); it != indexPartialData_[dim].end(); it++) + { + resPartialData[iterNumber - parameter_.burnAlgo][dim][compteur] = data_[dim][*it].x; + compteur++; + } + } - for (int k(0); k < g_; k++) - indrang[k] = rank2index(mu_[0][k], tab_factorial(m_[0])); +} - //the first cluster must be the cluster with the more little index of mu - tri_insertionMulti(indrang, proportion_, p_, mu_, z_, g_, d_, n_); //tri selon les mu pour que 2 3=3 2 +void RankCluster::run() +{ + convergence_ = false; + int nbTry(0); + while (!convergence_ && nbTry < parameter_.maxTry) + { + try + { + // double t0, t1, t2, t3, tM(0), tSE(0); + + // if (parameter_.verbose) + // { + // cout << "##########################################################" << endl; + // cout << "# SEM-Gibbs Algorithm for multivariate partial ranking #" << endl; + // cout << "##########################################################" << endl; + // } + // t0 = clock(); + initialization(); + // t1 = clock(); + + // if (parameter_.verbose) + // cout << "Initialization: " << (double)(t1 - t0) / CLOCKS_PER_SEC << "s." 
<< endl; + + // objects for storing the estimated parameters at each iteration + vector>> resP(parameter_.maxIt - parameter_.burnAlgo, + vector>(d_, vector(g_))); + vector> resProp(parameter_.maxIt - parameter_.burnAlgo, (vector(g_))); + vector> resZ(parameter_.maxIt - parameter_.burnAlgo, vector(n_)); + vector>> resMu(parameter_.maxIt - parameter_.burnAlgo, mu_); + vector>> resDonneesPartiel(parameter_.maxIt - parameter_.burnAlgo, output_.initialPartialRank); + + for (int iter(0); iter < parameter_.maxIt; iter++) + { + // if(parameter_.verbose) + // cout << "*"; + + // t2 = clock(); + SEstep(); + // t3 = clock(); + // tSE += t3-t2; + + // t2 = clock(); + Mstep(); + // t3 = clock(); + // tM += t3-t2; + + // we store the estimated parameters + if (iter >= parameter_.burnAlgo) + { + storeParameters(iter, resProp, resP, resMu, resZ, resDonneesPartiel); + } + } - //store parameters - resP[iter - parameter_.burnAlgo] = p_; - resProp[iter - parameter_.burnAlgo] = proportion_; - resMu[iter - parameter_.burnAlgo] = mu_; - resZ[iter - parameter_.burnAlgo] = z_; + // t2 = clock(); + // if (parameter_.verbose) + // { + // cout << endl << endl << "Log-likelihood estimation" << endl; - for (int dim(0); dim < d_; dim++) - { - int compteur(0); - for (vector::iterator it = indexPartialData_[dim].begin(); it != indexPartialData_[dim].end(); it++) - { - resDonneesPartiel[iter - parameter_.burnAlgo][dim][compteur] = data_[dim][*it].rank; - compteur++; - } - } + // cout << "Computing time for SE step: " << (double)tSE / CLOCKS_PER_SEC << "s ( " + // << (double)tSE / CLOCKS_PER_SEC / parameter_.maxIt << "s per step)." << endl; + // cout << "Computing time for M step: " << (double)tM / CLOCKS_PER_SEC << "s ( " + // << (double)tM / CLOCKS_PER_SEC / parameter_.maxIt << "s per step )." 
<< endl; + // } - } //end storage - } //end SEM + // compute log-likelihood and choice of the best parameters + selectBestParameters(resMu, resP, resProp); + //t3=clock(); - //if(parameter_.detail) - //cout< rankTemp(m_[dim]); - for (int i(0); i < m_[dim]; i++) - rankTemp[i] = i + 1; - for (int ind(0); ind < n_; ind++) - { - //initialization of y - Rshuffle(rankTemp.begin(), rankTemp.end()); - data_[dim][ind].y = rankTemp; + //initialization of y + Rshuffle(rankTemp.begin(), rankTemp.end()); + data_[dim][ind].y = rankTemp; - if (data_[dim][ind].isNotFull) - { - for (int i = 0; i < (int)data_[dim][ind].missingData.size(); i++) - { - //initialization of Partial Rank - vector rankTemp2(data_[dim][ind].missingIndex[i]); - Rshuffle(rankTemp2.begin(), rankTemp2.end()); + if (data_[dim][ind].isPartial) + { + for (int i = 0; i < (int)data_[dim][ind].missingData.size(); i++) + { + //initialization of Partial Rank + Rank rankTemp2(data_[dim][ind].missingIndex[i]); + Rshuffle(rankTemp2.begin(), rankTemp2.end()); - for (int ii = 0; ii < (int)data_[dim][ind].missingData[i].size(); ii++) - data_[dim][ind].rank[rankTemp2[ii]] = data_[dim][ind].missingData[i][ii]; - } - } + for (int ii = 0; ii < (int)data_[dim][ind].missingData[i].size(); ii++) + data_[dim][ind].x[rankTemp2[ii]] = data_[dim][ind].missingData[i][ii]; } + } } - - /*log likelihood computation*/ - ArrayXXd tik(n_, g_); - long double p1(0), p2(0), p1x(0), p2x(0), alea(0), li(0); - double div((double)(parameter_.nGibbsL - parameter_.burnL)); - vector compteur(d_, 0); - vector x1, x2; - - //objet pour stockage de calcul pour éviter répétition - ArrayXXd proba1(d_, g_), proba2(d_, g_); - ArrayXd proba1X(g_), proba2X(g_); - - ArrayXd prop(g_); - for (int i(0); i < g_; i++) - prop(i) = proportion_[i]; - ArrayXXd propb(1, g_); - propb = prop.transpose(); - - //génération rang 1 2 3 ..m par dimension - vector> yTemp(d_), x(d_); + } + + /*log likelihood computation*/ + ArrayXXd tik(n_, g_); + long double p1(0), p2(0), p1x(0), p2x(0), alea(0), li(0); + double div((double)(parameter_.nGibbsL - parameter_.burnL)); + vector compteur(d_, 0); + Rank x1, x2; + + //objet pour stockage de calcul pour éviter répétition + ArrayXXd proba1(d_, g_), proba2(d_, g_); + ArrayXd proba1X(g_), proba2X(g_); + + ArrayXd prop(g_); + for (int i(0); i < g_; i++) + prop(i) = proportion_[i]; + ArrayXXd propb(1, g_); + propb = prop.transpose(); + + //génération rang 1 2 3 ..m par dimension + vector yTemp(d_), x(d_); + for (int j(0); j < d_; j++) + { + yTemp[j].resize(m_[j]); + initializeRank(yTemp[j]); + } + + vector logL(parameter_.nGibbsL - parameter_.burnL, 0); + + //simulation de y multi dimensionnel + for (int ind(0); ind < n_; ind++) + { + + vector y(d_), y2(d_), y1(d_); + li = 0; + //algorithme de Gibbs pour simuler yi + + //initialisation de y et p pour Gibbs + y = yTemp; for (int j(0); j < d_; j++) { - yTemp[j].resize(m_[j]); - for (int i(0); i < m_[j]; i++) - yTemp[j][i] = i + 1; + Rshuffle(y[j].begin(), y[j].end()); //permutation de 1 2 3 ..m + x[j] = data_[j][ind].x; } - vector logL(parameter_.nGibbsL - parameter_.burnL, 0); + y1 = y; - //simulation de y multi dimensionnel - for (int ind(0); ind < n_; ind++) + for (int k(0); k < g_; k++) { + tik(ind, k) = 0; + for (int j(0); j < d_; j++) + proba1(j, k) = probaCond(x[j], y1[j], mu_[j][k], p_[j][k]); + } - vector> y(d_), y2(d_), y1(d_); - li = 0; - //algorithme de Gibbs pour simuler yi - - //initialisation de y et p pour Gibbs - y = yTemp; - for (int j(0); j < d_; j++) - { - Rshuffle(y[j].begin(), y[j].end()); 
//permutation de 1 2 3 ..m - x[j] = data_[j][ind].rank; - } + p1 = (long double)(propb * proba1.colwise().prod()).sum(); + proba2 = proba1; - y1 = y; + for (int iter(0); iter < parameter_.nGibbsL; iter++) + { - for (int k(0); k < g_; k++) + /*simulation des y*/ + for (int J(0); J < d_; J++) + { + for (int K(0); K < m_[J] - 1; K++) { - tik(ind, k) = 0; - for (int j(0); j < d_; j++) - proba1(j, k) = probaCond(x[j], y1[j], mu_[j][k], p_[j][k]); + //"état" à tester (inversion de 2 éléments adjacents) + y2 = y; + y2[J][K] = y[J][K + 1]; + y2[J][K + 1] = y[J][K]; + + //tester un stockage des proba calculées pour éviter répétition de calculs dans la boucle + for (int k(0); k < g_; k++) + proba2(J, k) = probaCond(x[J], y2[J], mu_[J][k], p_[J][k]); + + p2 = (long double)(propb * proba2.colwise().prod()).sum(); + + alea = (long double)runif(0., p1 + p2); //unif(0,p1+p2) + + if (alea < p2) //accept changement + { + y[J] = y2[J]; + p1 = p2; + proba1.row(J) = proba2.row(J); + y1[J] = y[J]; + } + else //do not change y + { + y[J] = y1[J]; //rajout J + proba2.row(J) = proba1.row(J); + } } + } + //y_i is updated - p1 = (long double)(propb * proba1.colwise().prod()).sum(); - proba2 = proba1; - - for (int iter(0); iter < parameter_.nGibbsL; iter++) + /*simulation of partial rank with a gibbs sampler*/ + for (int J = 0; J < d_; J++) + { + if (data_[J][ind].isPartial) //simulation of xi if it is a partial rank { + x1 = x[J]; + proba1X = proba1.row(J); + p1x = (proba1X * prop).sum(); - /*simulation des y*/ - for (int J(0); J < d_; J++) - { - for (int K(0); K < m_[J] - 1; K++) - { - //"état" à tester (inversion de 2 éléments adjacents) - y2 = y; - y2[J][K] = y[J][K + 1]; - y2[J][K + 1] = y[J][K]; - - for (int k(0); k < g_; k++) //tester un stockage des proba calcul� pour �viter r�p�tition de calculs dans la boucle - proba2(J, k) = probaCond(x[J], y2[J], mu_[J][k], p_[J][k]); - - p2 = (long double)(propb * proba2.colwise().prod()).sum(); - - alea = (long double)runif(0., p1 + p2); //unif(0,p1+p2) - - if (alea < p2) //accept changement - { - y[J] = y2[J]; - p1 = p2; - proba1.row(J) = proba2.row(J); - y1[J] = y[J]; - } - else //do not change y - { - y[J] = y1[J]; //rajout J - proba2.row(J) = proba1.row(J); - } - } - } - //y_i is updated - - /*simulation of partial rank with a gibbs sampler*/ - for (int J = 0; J < d_; J++) + for (int kk = 0; kk < (int)(data_[J][ind].missingIndex).size() - 1; kk++) //Gibbs sur les x + { + for (int k = 0; k < (int)(data_[J][ind].missingIndex[kk]).size() - 1; k++) { - if (data_[J][ind].isNotFull) //simulation of xi if it is a partial rank - { - x1 = x[J]; - proba1X = proba1.row(J); - p1x = (proba1X * prop).sum(); - - for (int kk = 0; kk < (int)(data_[J][ind].missingIndex).size() - 1; kk++) //Gibbs sur les x - { - for (int k = 0; k < (int)(data_[J][ind].missingIndex[kk]).size() - 1; k++) - { - //new x to test - x2 = x[J]; - x2[data_[J][ind].missingIndex[kk][k]] = x[J][data_[J][ind].missingIndex[kk][k + 1]]; - x2[data_[J][ind].missingIndex[kk][k + 1]] = x[J][data_[J][ind].missingIndex[kk][k]]; - - for (int l = 0; l < g_; l++) - proba2X(l) = probaCond(x2, y[J], mu_[J][l], p_[J][l]); - - p2x = (proba2X * prop).sum(); - - alea = (double)runif(0., p1x + p2x); - - if (alea < p2) //we accept the changement - { - x[J] = x2; - p1x = p2x; - proba1X = proba2X; - x1 = x[J]; - } - else - x[J] = x1; - } - } - proba1.row(J) = proba1X; - } + //new x to test + x2 = x[J]; + x2[data_[J][ind].missingIndex[kk][k]] = x[J][data_[J][ind].missingIndex[kk][k + 1]]; + x2[data_[J][ind].missingIndex[kk][k + 1]] = 
x[J][data_[J][ind].missingIndex[kk][k]]; + + for (int l = 0; l < g_; l++) + proba2X(l) = probaCond(x2, y[J], mu_[J][l], p_[J][l]); + + p2x = (proba2X * prop).sum(); + + alea = (double)runif(0., p1x + p2x); + + if (alea < p2) //we accept the changement + { + x[J] = x2; + p1x = p2x; + proba1X = proba2X; + x1 = x[J]; + } + else + x[J] = x1; } + } + proba1.row(J) = proba1X; + } + } - if (iter >= parameter_.burnL) - { - ArrayXd calculInter(g_); - for (int cl = 0; cl < g_; cl++) - { - calculInter(cl) = 1; - for (int dim = 0; dim < d_; dim++) - calculInter(cl) *= proba1(dim, cl); - calculInter(cl) *= propb(cl); - } + if (iter >= parameter_.burnL) + { + ArrayXd calculInter(g_); + for (int cl = 0; cl < g_; cl++) + { + calculInter(cl) = 1; + for (int dim = 0; dim < d_; dim++) + calculInter(cl) *= proba1(dim, cl); + calculInter(cl) *= propb(cl); + } - double den = calculInter.sum(); - tik.row(ind) += (calculInter / den); + double den = calculInter.sum(); + tik.row(ind) += (calculInter / den); - li += (long double)1 / den; - } + li += (long double)1 / den; + } - } //end gibbs sampling for sample ind + } //end gibbs sampling for sample ind - //L -= log(li*div); - L -= log(li); + //L -= log(li*div); + L -= log(li); - tik.row(ind) /= div; + tik.row(ind) /= div; - } //end loop on sample + } //end loop on sample - L += (double)n_ * log(div); + L += (double)n_ * log(div); - output_.L = L; + output_.L = L; - output_.bic = BIC(output_.L, n_, 2 * g_ * d_ + g_ - 1); + output_.bic = BIC(output_.L, n_, 2 * g_ * d_ + g_ - 1); - output_.icl = output_.bic; + output_.icl = output_.bic; - ArrayXd entropy(n_); - for (int i = 0; i < n_; i++) + ArrayXd entropy(n_); + for (int i = 0; i < n_; i++) + { + entropy(i) = 0; + for (int j = 0; j < g_; j++) { - entropy(i) = 0; - for (int j = 0; j < g_; j++) - { - if (tik(i, j) != 0) - entropy(i) -= tik(i, j) * std::log(tik(i, j)); - } - output_.icl += 2 * entropy(i); + if (tik(i, j) != 0) + entropy(i) -= tik(i, j) * std::log(tik(i, j)); } + output_.icl += 2 * entropy(i); + } - bic = output_.bic; - icl = output_.icl; + bic = output_.bic; + icl = output_.icl; } diff --git a/src/RankCluster.h b/src/RankCluster.h index 6f4896a..eda1363 100644 --- a/src/RankCluster.h +++ b/src/RankCluster.h @@ -2,7 +2,9 @@ #define RANKCLUSTER_H_ /* @file RankCluster.h - * @brief Definition of the class @c RankCluster and struct PartialRank, SEMparameters and OutParameters + * @brief Implementation of methods of the class @c RankCluster + * See https://hal.archives-ouvertes.fr/hal-00441209v3/document and + * https://hal.inria.fr/hal-00743384/document for mathematical background */ //#include #include @@ -10,238 +12,249 @@ #include #include "Eigen/Dense" +#include "ISRfunctions.h" #include "functions.h" +#include "Typedef.h" -// create your own data structures -struct PartialRank -{ - ///rank - std::vector rank; - ///order of presentation - std::vector y; - ///if true, the rank contains partial or ties - bool isNotFull; - /// missing element of the rank or ties - std::vector> missingData; - ///index of the 0 or ties - std::vector> missingIndex; -}; - -struct SEMparameters -{ - ///number of iteration in the gibbs of SE step for each dimension of rank - std::vector nGibbsSE; - ///number of iteration in the gibbs of M step for each dimension of rank - std::vector nGibbsM; - ///maximum number of iteration of the SEM algorithm - int maxIt; - ///burn-in period of SEM algorithm - int burnAlgo; - /// number of iteration in the gibbs of the likelihood computation - int nGibbsL; - ///burn-in period of the 
likelihood computation - int burnL; - ///maximum number of try of the SEM - int maxTry; - ///if true print details - bool detail; -}; - -struct OutParameters -{ - ///loglikelihood - double L; - ///bic criterion - double bic; - ///icl criterion - double icl; - /// - Eigen::ArrayXXd tik; - /// - Eigen::ArrayXd entropy; - /// - Eigen::ArrayXXd probabilities; - ///percentage of confidence in final estimation of missing data - std::vector>> partialRankScore; - - //algorithm initialization - std::vector>> initialPartialRank; - std::vector> initialP; - std::vector initialZ; - std::vector initialProportion; - std::vector>> initialMu; - - //distance between parameters - std::vector> distProp; - std::vector>> distP; - std::vector>> distMu; - std::vector distZ; - std::vector>> distPartialRank; -}; class RankCluster { public: - ///defaut constructor - RankCluster(); - /** - * @param X data one row= a multi dimensionnal rank - * @param g number of clusters - * @param m size of rank of each dimension - * @param param parameters of SEM algorithm - */ - RankCluster(std::vector> const &X, int g, std::vector const &m, SEMparameters const ¶m); - //constructor with given initialization of parameters - RankCluster(std::vector> const &X, std::vector const &m, SEMparameters const ¶m, - std::vector const &proportion, std::vector> const &p, - std::vector>> const &mu); - - ///copy constructor - RankCluster(RankCluster &rankClusterObject); - - ///destructor - virtual ~RankCluster(); - - /// run the SEM algorithm - void run(); - - //getters - inline int d() const { return d_; } - inline int n() const { return n_; } - inline int g() const { return g_; } - inline std::vector m() const { return m_; } - inline std::vector z() const { return z_; } - inline std::vector> p() const { return p_; } - inline std::vector>> mu() const { return mu_; } - inline std::vector proportion() const { return proportion_; } - inline std::vector> indexPartialData() const { return indexPartialData_; } - inline std::vector> data() const { return data_; } - inline std::vector rank(int dim, int index) const { return data_[dim][index].rank; } - inline bool dataOk() const { return dataOk_; } - inline bool convergence() const { return convergence_; } - inline bool partial() const { return partial_; } - inline std::vector> indexPb() const { return indexPb_; } - inline SEMparameters parameter() const { return parameter_; } - - //output getters - inline Eigen::ArrayXXd tik() const { return output_.tik; } - inline Eigen::ArrayXd entropy() const { return output_.entropy; } - inline Eigen::ArrayXXd probabilities() const { return output_.probabilities; } - Eigen::ArrayXd probability() const; - inline double bic() const { return output_.bic; } - inline double icl() const { return output_.icl; } - inline double L() const { return output_.L; } - inline std::vector>> initialPartialRank() const { return output_.initialPartialRank; } - inline std::vector> initialP() const { return output_.initialP; } - inline std::vector initialZ() const { return output_.initialZ; } - inline std::vector>> initialMu() const { return output_.initialMu; } - inline std::vector initialProportion() const { return output_.initialProportion; } - inline std::vector> distProp() const { return output_.distProp; } - inline std::vector>> distP() const { return output_.distP; } - inline std::vector>> distMu() const { return output_.distMu; } - inline std::vector distZ() const { return output_.distZ; } - inline std::vector>> distPartialRank() const { return output_.distPartialRank; } - inline 
std::vector>> partialRankScore() const { return output_.partialRankScore; } - - ///reestimation of criterion - void estimateCriterion(double &L, double &bic, double &icl); + // default constructor + RankCluster(); + /** + * @param X data one row= a multivariate rank + * @param g number of clusters + * @param m size of rank of each dimension + * @param param parameters of SEM algorithm + */ + RankCluster(std::vector> const &X, int g, std::vector const &m, SEMparameters const ¶m); + // constructor with given initialization of parameters + RankCluster(std::vector> const &X, std::vector const &m, SEMparameters const ¶m, + std::vector const &proportion, std::vector> const &p, + std::vector>> const &mu); + + // copy constructor + RankCluster(RankCluster &rankClusterObject); + + // destructor + virtual ~RankCluster(); + + // run the SEM algorithm + void run(); + + // getters + inline int d() const { return d_; } + inline int n() const { return n_; } + inline int g() const { return g_; } + inline std::vector m() const { return m_; } + inline std::vector z() const { return z_; } + inline std::vector> p() const { return p_; } + inline std::vector> mu() const { return mu_; } + inline std::vector proportion() const { return proportion_; } + inline std::vector> indexPartialData() const { return indexPartialData_; } + inline std::vector> data() const { return data_; } + inline std::vector rank(int dim, int index) const { return data_[dim][index].x; } + inline bool dataOk() const { return dataOk_; } + inline bool convergence() const { return convergence_; } + inline bool partial() const { return partial_; } + inline std::vector> indexPb() const { return indexPb_; } + inline SEMparameters parameter() const { return parameter_; } + + // output getters + inline Eigen::ArrayXXd tik() const { return output_.tik; } + inline Eigen::ArrayXd entropy() const { return output_.entropy; } + inline Eigen::ArrayXXd probabilities() const { return output_.probabilities; } + Eigen::ArrayXd probability() const; + inline double bic() const { return output_.bic; } + inline double icl() const { return output_.icl; } + inline double L() const { return output_.L; } + inline std::vector> initialPartialRank() const { return output_.initialPartialRank; } + inline std::vector> initialP() const { return output_.initialP; } + inline std::vector initialZ() const { return output_.initialZ; } + inline std::vector> initialMu() const { return output_.initialMu; } + inline std::vector initialProportion() const { return output_.initialProportion; } + inline std::vector> distProp() const { return output_.distProp; } + inline std::vector>> distP() const { return output_.distP; } + inline std::vector>> distMu() const { return output_.distMu; } + inline std::vector distZ() const { return output_.distZ; } + inline std::vector>> distPartialRank() const { return output_.distPartialRank; } + inline std::vector>> partialRankScore() const { return output_.partialRankScore; } + + /** re-estimation of criterion */ + void estimateCriterion(double &L, double &bic, double &icl); protected: - /**convert X in vector> - * @param X raw data one row= a multi dimensionnal rank - */ - void conversion2data(std::vector> const &X); - /** read rank. 
used in conversion2data - * @param X raw data one row= a multi dimensionnal rank - * @param dim actual dimension - * @param j actual index of the sample - * @param indM transformation of m_ - */ - void readRankingRank(std::vector> const &X, int const &dim, int const &j, std::vector const &indM); - - ///initialization of parameters - void initialization(); - /** SE step */ - void SEstep(); - /**unidimensionnal gibbs sampler for y estimation - * @param indexDim index of the dimension (>>> &listeMu, std::vector>> &resP, - std::vector> &resProp); - - /** compute the log likelihood for a set of parameter - * @param mu estimated central rank - * @param p estimated p (dispersion parameter) - * @param proportion estimated proportion - * @param tik used for store tik - * @param Y used for store estimated y - * @param xTemp used for store estimated partial rank - * @param score used for confidence in estimated partial rank - * @param iterproba probability of each individuals at each iteration - */ - double computeLikelihood(std::vector>> const &mu, std::vector> const &p, - std::vector const &proportion, Eigen::ArrayXXd &tik, std::vector>> &Y, - std::vector>> &xTemp, Eigen::ArrayXXd &probabilities, - std::vector>> &score); - ///compute the final z_ - void computePartition(); - ///compute distance between final parameters and each iteration parameters - void computeDistance(std::vector> const &resProp, std::vector>> const &resP, - std::vector>>> const &resMu, std::vector> const &resZ, - std::vector>>> const &resDonneesPartiel); + /** convert X in vector> + * @param X raw data one row= a multi dimensional rank + */ + void conversion2data(std::vector> const &X); + + /** read rank. used in conversion2data + * @param X raw data one row= a multi dimensional rank + * @param dim actual dimension + * @param j actual index of the sample + * @param indM transformation of m_ + */ + void readRankingRank(std::vector> const &X, int const &dim, int const &j, std::vector const &indM); + + // initialization of parameters + void initialization(); + void initializeZ(); + void initializeP(); + void initializeMu(); + void initializeY(); + void initializePartialRank(); + void fillIndexPartialData(); + void saveInitialization(); + + void estimateProportion(); + + + /** SE step */ + void SEstep(); + + /** univariate gibbs sampler for y estimation + * @param indexDim index of the dimension (>> &resMu, + std::vector>> &resP, + std::vector> &resProp); + + /** List the different found mu + * + * @param resMu mu for every iteration of the SEM + * @param resP pi for every iteration of the SEM + * @param resProp proportion for every iteration of the SEM + */ + MuList * findDifferentMu(std::vector>> &resMu, + std::vector>> &resP, + std::vector> &resProp); + + /** Mean the parameters for a mu */ + void meanParameters(MuList * currMu); + + /** compute the log likelihood for a set of parameter + * @param mu estimated central rank + * @param p estimated p (dispersion parameter) + * @param proportion estimated proportion + * @param tik used for store tik + * @param Y used for store estimated y + * @param xTemp used for store estimated partial ranks + * @param probabilities probability of each individual at each iteration + * @param score used for confidence in estimated partial rank + */ + double computeLogLikelihood(std::vector> const &mu, + std::vector> const &p, + std::vector const &proportion, Eigen::ArrayXXd &tik, + std::vector> &Y, + std::vector> &xTemp, Eigen::ArrayXXd &probabilities, + std::vector>> &score); + + /** compute the 
final partition (z_) */ + void computePartition(); + + /** Store the parameters */ + void storeParameters(int const iterNumber, std::vector> &resProp, + std::vector>> &resP, + std::vector>> &resMu, + std::vector> &resZ, + std::vector>> &resPartialData); + + /** compute distance between final parameters and each iteration parameters */ + void computeDistance(std::vector> const &resProp, + std::vector>> const &resP, + std::vector>> const &resMu, + std::vector> const &resZ, + std::vector>> const &resDonneesPartiel); private: - ///contains the size of rank for each dim - std::vector m_; - ///number of individuals - int n_; - ///number of dimension - int d_; - ///number of cluster - int g_; - ///data of teh form data[dimension[index]] - std::vector> data_; - ///estimated cluster of each individual - std::vector z_; - /// estimated rank parameter of each cluster : mu_[dimension][cluster][indice] - std::vector>> mu_; - /// estimated probability parameter of each cluster : p_[dimension][cluster] - std::vector> p_; - /// estimated proportion of the mixture model - std::vector proportion_; - ///algorithm parameters - SEMparameters parameter_; - ///distance and initialization of the algorithm - OutParameters output_; - ///true if there is partial rank in the data - bool partial_; - ///index of partial data - std::vector> indexPartialData_; - /// if true, SEM has converged - bool convergence_; - /// if true, good data - bool dataOk_; - ///index of rank with problem for each dimension - std::vector> indexPb_; + // contains the size of rank for each dim + std::vector m_; + // number of individuals + int n_; + // number of dimension + int d_; + // number of cluster + int g_; + // data of the form data[dimension][index] + std::vector> data_; + // estimated cluster of each individual + std::vector z_; + // estimated rank parameter of each cluster: mu_[dimension][cluster] + std::vector> mu_; + // estimated probability parameter of each cluster : p_[dimension][cluster] + std::vector> p_; + // estimated proportion of the mixture model + std::vector proportion_; + // algorithm parameters + SEMparameters parameter_; + // distance and initialization of the algorithm + OutParameters output_; + // true if there is partial rank in the data + bool partial_; + // index of partial data + std::vector> indexPartialData_; + // if true, SEM has converged + bool convergence_; + // if true, good data + bool dataOk_; + // index of rank with problem for each dimension + std::vector> indexPb_; }; #endif /* RANKCLUSTER_H_ */ diff --git a/src/runFunctions.cpp b/src/RinterfaceFunctions.cpp similarity index 98% rename from src/runFunctions.cpp rename to src/RinterfaceFunctions.cpp index d2af398..dcb2725 100644 --- a/src/runFunctions.cpp +++ b/src/RinterfaceFunctions.cpp @@ -1,6 +1,6 @@ #include "functions.h" -#include "runFunctions.h" -#include "runTest.h" +#include "RinterfaceFunctions.h" +#include "RinterfaceTest.h" #include "RankCluster.h" #ifdef _OPENMP #include @@ -128,7 +128,7 @@ RcppExport SEXP loglikelihood(SEXP X, SEXP mu, SEXP p, SEXP proportion, SEXP m, param.nGibbsL = as(iterL); param.burnL = as(burnL); param.maxTry = 1; - param.detail = false; + param.verbose = false; RankCluster estimLog(data, mC, param, prop, pC, muC); if (!estimLog.dataOk()) diff --git a/src/runFunctions.h b/src/RinterfaceFunctions.h similarity index 100% rename from src/runFunctions.h rename to src/RinterfaceFunctions.h diff --git a/src/run.cpp b/src/RinterfaceSEM.cpp similarity index 98% rename from src/run.cpp rename to src/RinterfaceSEM.cpp 
index 9082cf7..dfde1fe 100644 --- a/src/run.cpp +++ b/src/RinterfaceSEM.cpp @@ -1,10 +1,10 @@ -#include "run.h" +#include "RinterfaceSEM.h" using namespace Rcpp; using namespace std; using namespace Eigen; -RcppExport SEXP semR(SEXP X, SEXP m, SEXP K, SEXP Qsem, SEXP Bsem, SEXP Ql, SEXP Bl, SEXP RjSE, SEXP RjM, SEXP maxTry, SEXP run, SEXP detail) +RcppExport SEXP semR(SEXP X, SEXP m, SEXP K, SEXP Qsem, SEXP Bsem, SEXP Ql, SEXP Bl, SEXP RjSE, SEXP RjM, SEXP maxTry, SEXP run, SEXP verbose) { int g = as(K), runC(as(run)); vector mC = as>(m); @@ -16,7 +16,7 @@ RcppExport SEXP semR(SEXP X, SEXP m, SEXP K, SEXP Qsem, SEXP Bsem, SEXP Ql, SEXP param.nGibbsL = as(Ql); param.burnL = as(Bl); param.maxTry = as(maxTry); - param.detail = as(detail); + param.verbose = as(verbose); NumericMatrix XR(X); int n(XR.nrow()), col(XR.ncol()); diff --git a/src/run.h b/src/RinterfaceSEM.h similarity index 98% rename from src/run.h rename to src/RinterfaceSEM.h index ab1e1cc..f55adce 100644 --- a/src/run.h +++ b/src/RinterfaceSEM.h @@ -4,7 +4,7 @@ #include "RankCluster.h" #include -RcppExport SEXP semR(SEXP X, SEXP m, SEXP K, SEXP Qsem, SEXP Bsem, SEXP Ql, SEXP Bl, SEXP RjSE, SEXP RjM, SEXP maxTry, SEXP run, SEXP detail); +RcppExport SEXP semR(SEXP X, SEXP m, SEXP K, SEXP Qsem, SEXP Bsem, SEXP Ql, SEXP Bl, SEXP RjSE, SEXP RjM, SEXP maxTry, SEXP run, SEXP verbose); template inline out convertMatrix(const inp &matrixinput) diff --git a/src/runTest.cpp b/src/RinterfaceTest.cpp similarity index 93% rename from src/runTest.cpp rename to src/RinterfaceTest.cpp index e33faba..df68100 100644 --- a/src/runTest.cpp +++ b/src/RinterfaceTest.cpp @@ -1,5 +1,5 @@ -#include "runTest.h" -#include "runFunctions.h" +#include "RinterfaceTest.h" +#include "RinterfaceFunctions.h" using namespace Rcpp; using namespace std; @@ -28,11 +28,11 @@ vector> convertToVVi(SEXP const &rMatrix) return output; } -vector downUniVariateRank(NumericMatrix XR) +vector downUniVariateRank(NumericMatrix XR) { int const n(XR.nrow()); int const m(XR.ncol()); - vector donnees(n); + vector donnees(n); set element; for (int i(1); i < m + 1; i++) @@ -110,7 +110,7 @@ RcppExport SEXP adkhi2partial(SEXP donnees, SEXP p, SEXP proportion, SEXP mu, SE vector pC = as>(p); vector> muC = convertToVVi(mu); NumericMatrix donneesR(donnees); - vector data = downUniVariateRank(donneesR); + vector data = downUniVariateRank(donneesR); // double pval(0); diff --git a/src/runTest.h b/src/RinterfaceTest.h similarity index 89% rename from src/runTest.h rename to src/RinterfaceTest.h index 765ee6b..dc03299 100644 --- a/src/runTest.h +++ b/src/RinterfaceTest.h @@ -8,7 +8,7 @@ std::vector> convertToVVd(SEXP const &rMatrix); std::vector> convertToVVi(SEXP const &rMatrix); -std::vector downUniVariateRank(Rcpp::NumericMatrix XR); +std::vector downUniVariateRank(Rcpp::NumericMatrix XR); RcppExport SEXP kullback(SEXP m, SEXP mu1, SEXP mu2, SEXP p1, SEXP p2, SEXP proportion1, SEXP proportion2); RcppExport SEXP adkhi2(SEXP donnees, SEXP p, SEXP proportion, SEXP mu, SEXP nBootstrap); diff --git a/src/Typedef.h b/src/Typedef.h new file mode 100644 index 0000000..8a5074c --- /dev/null +++ b/src/Typedef.h @@ -0,0 +1,88 @@ +#ifndef TYPEDEF_H +#define TYPEDEF_H + +#include +#include "Eigen/Dense" + +typedef std::vector Rank; +typedef std::vector> MultivariateRank; + + +struct PartialRank +{ + // observed rank + Rank x; + // order of presentation + Rank y; + // if true, the rank contains partial or ties + bool isPartial; + // missing element of the rank or ties + std::vector> missingData; + // 
index of the 0 or ties + std::vector> missingIndex; +}; + +typedef std::vector MultivariatePartialRank; + +struct MuList +{ + int freq; + std::vector> fullRank; + std::vector> p; + std::vector prop; + MuList *nextMu; +}; + + +struct SEMparameters +{ + // number of iteration in the gibbs of SE step for each dimension of rank + std::vector nGibbsSE; + // number of iteration in the gibbs of M step for each dimension of rank + std::vector nGibbsM; + // maximum number of iteration of the SEM algorithm + int maxIt; + // burn-in period of SEM algorithm + int burnAlgo; + // number of iteration in the gibbs of the likelihood computation + int nGibbsL; + // burn-in period of the likelihood computation + int burnL; + // maximum number of try of the SEM + int maxTry; + // if true print information + bool verbose; +}; + +struct OutParameters +{ + // log-likelihood + double L; //TODO rename into ll + // bic criterion + double bic; + // icl criterion + double icl; + // + Eigen::ArrayXXd tik; //TODO rename into posteriorProbabilities + Eigen::ArrayXd entropy; + // + Eigen::ArrayXXd probabilities; //TODO rename into conditionalProbabilities + // percentage of confidence in final estimation of missing data + std::vector>> partialRankScore; + + // algorithm initialization + std::vector> initialPartialRank; + std::vector> initialP; + std::vector initialZ; + std::vector initialProportion; + std::vector> initialMu; + + // distance between parameters + std::vector> distProp; + std::vector>> distP; + std::vector>> distMu; + std::vector distZ; + std::vector>> distPartialRank; +}; + +#endif // TYPEDEF_H \ No newline at end of file diff --git a/src/functions.cpp b/src/functions.cpp index 0585642..b67b6a1 100644 --- a/src/functions.cpp +++ b/src/functions.cpp @@ -1,14 +1,13 @@ /* -* functions.cpp -* -* Created on: 1 mars 2013 -* Author: grimonprez -*/ + * functions.cpp + * + * Created on: 1 mars 2013 + * Author: Quentin Grimonprez + */ #include #include #include -#include #include #include "functions.h" @@ -16,608 +15,454 @@ using namespace std; // generate an integer between 0 and n -1 -int randWrapper(const int n) +int randWrapper(const int n) { - return floor(unif_rand() * n); + return floor(unif_rand() * n); } -// retourne le rang de l'objet i dans la liste x (ordering representation) equivalent rank.gamma//----FAIRE CAS i PAS DEDANS? 
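// e.g. for x = (3, 1, 2) in ordering notation, positionRank(x, 1) returns 1 (0-based):
// object 1 is in second position.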
-int positionRank(vector const &x, int const &i) +// generate a rank containing 1, 2, .., m +void initializeRank(Rank &rank) { - int j(0); - while (x[j] != i) - j++; - - return j; //entre 0 et m-1 + for (size_t j = 0; j < rank.size(); j++) + rank[j] = j + 1; } -// calcul de A(x,y) et G(x,y,mu) -vector comparaison(vector const &x, vector const &y, vector const &mu) +// generate a random Rank +void randomRank(Rank &rank) { - int const m(mu.size()); - int gplus(0), gmoins(0), gjmoinsb(0), gjplusb(0), index(0); - vector ajmoins, ajplus, ajplusb, ajmoinsb, ajplusbIndex; - ajplusb.reserve(m); //le Aj+ en cours - ajmoinsb.reserve(m); //le Aj- en cours - ajplusbIndex.reserve(m); //les index du Aj+ en cours - ajplus.reserve(m * (m - 1)); //l'union de tt les Aj+ - ajmoins.reserve(m * (m - 1)); //l'union de tt les Aj- - - for (int j(1); j < m; j++) - { - gjmoinsb = 0; - gjplusb = 0; - for (int i(0); i < j; i++) - { - //calcul Aj- - if (positionRank(x, y[i]) < positionRank(x, y[j])) - { - ajmoins.push_back(i); - ajmoinsb.push_back(i); - } - else //calcul Aj+//if (positionRank(x,y[i])>positionRank(x,y[j])) - { - ajplusb.push_back(positionRank(x, y[i])); - ajplusbIndex.push_back(i); - } - } - - if (ajplusb.size() > 0) //si le Aj+ en cours est non vide, on rajoute l'index du min � Aj+ - { - index = min_element(ajplusb.begin(), ajplusb.end()) - ajplusb.begin(); - ajplus.push_back(ajplusbIndex[index]); - - //calcul de G+ - if (positionRank(mu, y[j]) < positionRank(mu, y[ajplus[ajplus.size() - 1]])) - { - gplus++; - gjplusb++; - } - ajplusb.erase(ajplusb.begin(), ajplusb.end()); - ajplusbIndex.erase(ajplusbIndex.begin(), ajplusbIndex.end()); - } - if (ajmoinsb.size() > 0) //si le Aj- en cours est non vide on calcule G- - { - //calcul de G- - for (unsigned int i(0); i < ajmoinsb.size(); i++) - { - if (positionRank(mu, y[ajmoinsb[i]]) < positionRank(mu, y[j])) - { - gmoins++; - gjmoinsb++; - } - } - ajmoinsb.erase(ajmoinsb.begin(), ajmoinsb.end()); - } - } - - vector comparaison(2, 0); - comparaison[0] = ajmoins.size() + ajplus.size(); - comparaison[1] = gmoins + gplus; - - return comparaison; + initializeRank(rank); + Rshuffle(rank.begin(), rank.end()); } -//Calcul proba conditionnelle p(x|y;mu,pi) -double probaCond(vector const &x, vector const &y, vector const &mu, double const &p) +// return the position of object i in x (ordering representation). Equivalent rank.gamma +// TODO manage the case where is is not in x ? +int positionRank(Rank const &x, int const &i) { - vector comp(2); - comp = comparaison(x, y, mu); - return pow(p, comp[1]) * pow(1 - p, comp[0] - comp[1]); + int j(0); + while (x[j] != i) + j++; + + return j; //entre 0 et m-1 } -//factorial function -int factorial(int const &nombre) -{ - int temp; - - if (nombre <= 1) - return 1; - temp = nombre * factorial(nombre - 1); - return temp; +// factorial function +int factorial(int const &m) +{ + int temp; + + if (m <= 1) + return 1; + + temp = m * factorial(m - 1); + return temp; } -//calcul des factorielles de 1! to m! +// compute all factorial values from 1! to to m! 
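// e.g. tab_factorial(4) returns {1, 2, 6, 24}, that is 1!, 2!, 3!, 4!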
vector tab_factorial(int const &m) { - vector tab(m); - tab[0] = 1; - for (int i(1); i < m; i++) - tab[i] = (i + 1) * tab[i - 1]; - - return tab; + vector tab(m); + tab[0] = 1; + for (int i(1); i < m; i++) + tab[i] = (i + 1) * tab[i - 1]; + + return tab; } -//conversion d'un rang en index -int rank2index(vector const &rang, vector const &tabFactorial) +// convert a single rank into an index (int) +int rank2index(Rank const &rank, vector const &tabFactorial) { - int const m(rang.size()); - int index(0); - index = (rang[0] - 1) * tabFactorial[m - 2]; - vector::iterator it; - vector liste(m); - - for (int i(0); i < m; i++) - liste[i] = i + 1; - - liste.erase(remove(liste.begin(), liste.end(), rang[0]), liste.end()); - - for (int j(1); j < m - 1; j++) - { - it = search_n(liste.begin(), liste.end(), 1, rang[j]); - index += (int(it - liste.begin()) * tabFactorial[m - j - 2]); - liste.erase(remove(liste.begin(), liste.end(), rang[j]), liste.end()); - } - - return index + 1; + int const m(rank.size()); + int index(0); + index = (rank[0] - 1) * tabFactorial[m - 2]; + vector::iterator it; + vector liste(m); + + initializeRank(liste); + + liste.erase(remove(liste.begin(), liste.end(), rank[0]), liste.end()); + + for (int j = 1; j < m - 1; j++) + { + it = search_n(liste.begin(), liste.end(), 1, rank[j]); + index += (int(it - liste.begin()) * tabFactorial[m - j - 2]); + liste.erase(remove(liste.begin(), liste.end(), rank[j]), liste.end()); + } + + return index + 1; } -vector rank2index(vector> const &listeRang, vector const &tabFact) +// convert several ranks into index +vector rank2index(vector const &rankList, vector const &tabFact) { - const unsigned int n(listeRang.size()); - vector listeIndex(n); - for (unsigned int i(0); i < n; i++) - listeIndex[i] = rank2index(listeRang[i], tabFact); - - return listeIndex; + int n(rankList.size()); + vector listeIndex(n); + + for (size_t i = 0; i < n; i++) + listeIndex[i] = rank2index(rankList[i], tabFact); + + return listeIndex; } -//conversion d'un index en rang -vector index2rank(int index, int const &m, vector const &tabFactorial) +// Convert an index to a rank. This one does not check that the index exists (lesser than m!). 
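// e.g. with m = 3 and tabFactorial = {1, 2, 6}: rank2index maps (1,2,3) to 1 and (2,3,1) to 4,
// and index2rankNoCheck(4, 3, {1, 2, 6}) returns (2,3,1).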
+Rank index2rankNoCheck(int index, int const &m, vector const &tabFactorial) { - if (index > factorial(m)) - { - vector temp(m, 0); - return temp; - //cout<<"ERREUR "< liste(m), r(m, 0); - int temp(0), temp2(0); - - index--; - r[0] = index / tabFactorial[m - 2] + 1; - - for (int i(0); i < m; i++) - liste[i] = i + 1; - - //on supprime l'élément égale à r[0] - liste.erase(remove(liste.begin(), liste.end(), r[0]), liste.end()); - - for (int j(1); j < m - 1; j++) - { - temp = index; - for (int k(1); k < j + 1; k++) - temp %= tabFactorial[m - k - 1]; - - temp2 = temp / tabFactorial[m - j - 2]; - r[j] = liste[temp2]; - - //replace (liste.begin(), liste.end(), r[j], 0); - liste.erase(remove(liste.begin(), liste.end(), r[j]), liste.end()); - } - r[m - 1] = liste[0]; - - return r; - } + Rank r(m, 0); + vector liste(m); + int temp(0), temp2(0); + + index--; + r[0] = index / tabFactorial[m - 2] + 1; + + initializeRank(liste); + + // we remove the element equal to r[0] + liste.erase(remove(liste.begin(), liste.end(), r[0]), liste.end()); + + for (int j(1); j < m - 1; j++) + { + temp = index; + for (int k(1); k < j + 1; k++) + temp %= tabFactorial[m - k - 1]; + + temp2 = temp / tabFactorial[m - j - 2]; + r[j] = liste[temp2]; + + liste.erase(remove(liste.begin(), liste.end(), r[j]), liste.end()); + } + r[m - 1] = liste[0]; + + return r; } -vector index2rank(int index, int const &m) +Rank index2rank(int index, int const &m, vector const &tabFactorial) { - vector r(m, 0); - - if (index > factorial(m)) - { - vector temp(m, 0); - return temp; - //cout<<"ERREUR "< liste(m); - int temp(0), temp2(0); - - index--; - r[0] = index / factorial(m - 1) + 1; - - for (int i(0); i < m; i++) - liste[i] = i + 1; - - //on supprime l'élément égale à r[0] - liste.erase(remove(liste.begin(), liste.end(), r[0]), liste.end()); - - for (int j(1); j < m - 1; j++) - { - temp = index; - for (int k(1); k < j + 1; k++) - temp %= factorial(m - k); - - temp2 = temp / factorial(m - j - 1); - r[j] = liste[temp2]; - - //replace (liste.begin(), liste.end(), r[j], 0); - liste.erase(remove(liste.begin(), liste.end(), r[j]), liste.end()); - } - r[m - 1] = liste[0]; - - return r; - } + if (index > factorial(m)) + { + Rank temp(m, 0); + return temp; + //cout << "ERROR " << index << "<" << m << "!" 
<< endl; + } + else + { + return index2rankNoCheck(index, m, tabFactorial); + } } -vector index2rankb(int index, int const &m, vector const &tabFactorial) +Rank index2rank(int index, int const &m) { - vector liste(m), r(m, 0); - int temp(0), temp2(0); - - index--; - r[0] = index / tabFactorial[m - 2] + 1; - - for (int i(0); i < m; i++) - liste[i] = i + 1; - - //on supprime l'élément égale à r[0] - liste.erase(remove(liste.begin(), liste.end(), r[0]), liste.end()); - - for (int j(1); j < m - 1; j++) - { - temp = index; - for (int k(1); k < j + 1; k++) - temp %= tabFactorial[m - k - 1]; - - temp2 = temp / tabFactorial[m - j - 2]; - r[j] = liste[temp2]; - - //replace (liste.begin(), liste.end(), r[j], 0); - liste.erase(remove(liste.begin(), liste.end(), r[j]), liste.end()); - } - r[m - 1] = liste[0]; - - return r; + vector tabFactorial; + tabFactorial = tab_factorial(m); + + return index2rank(index, m, tabFactorial); } -vector listeSigma(int const &m, vector const &tabFactorial) +vector listIndexOrderOfPresentation(int const &m, vector const &tabFactorial) { - vector liste(tabFactorial[m - 1] / 2); - int const1(0), const2(0), ind(0); - - for (int i(1); i <= m - 1; i++) - { - const1 = (i - 1) * (tabFactorial[m - 2] + tabFactorial[m - 3]) + 1; - const2 = i * tabFactorial[m - 2]; - int nb(const2 - const1 + 1); - for (int j(0); j < nb; j++) - liste[ind + j] = const1 + j; - ind += nb; - } - - return liste; + vector liste(tabFactorial[m - 1] / 2); + int const1(0), const2(0), ind(0); + + for (int i(1); i <= m - 1; i++) + { + const1 = (i - 1) * (tabFactorial[m - 2] + tabFactorial[m - 3]) + 1; + const2 = i * tabFactorial[m - 2]; + int nb(const2 - const1 + 1); + for (int j(0); j < nb; j++) + liste[ind + j] = const1 + j; + ind += nb; + } + + return liste; } -//inversion d'un rang -//ex: transforme 1 2 3 4 en 4 3 2 1 -void inverseRang(vector &rang) +// Invert a rank +// ex: transform 1 2 3 4 into 4 3 2 1 +void invertRank(Rank &rank) { - for (unsigned int j(0); j < rang.size() / 2; j++) - swap(rang[j], rang[rang.size() - j - 1]); + for (size_t j(0); j < rank.size() / 2; j++) + swap(rank[j], rank[rank.size() - j - 1]); } -void inverseRang(vector &rang, int const &m) -{ - for (int j(0); j < m / 2; j++) - swap(rang[j], rang[m - j - 1]); -} -//calcul BIC -double BIC(double loglikelihood, int nbDonnees, int nbParam) -{ - double bic(0); - bic = -2 * loglikelihood + nbParam * log(nbDonnees); - return bic; -} -double indiceRand(vector const &z1, vector const &z2) +double computeRandIndex(Rank const &z1, Rank const &z2) { - const int N(z1.size()); - double a(0), b(0), c(0), d(0); - for (int i(0); i < N; i++) + const int N(z1.size()); + double a(0), b(0), c(0), d(0); + for (int i(0); i < N; i++) + { + for (int j(0); j < N; j++) { - for (int j(0); j < N; j++) + if ((z1[i] == z1[j]) && (z2[i] == z2[j])) + a++; + else + { + if ((z1[i] != z1[j]) && (z2[i] != z2[j])) + b++; + else { - if ((z1[i] == z1[j]) && (z2[i] == z2[j])) - a++; - else - { - if ((z1[i] != z1[j]) && (z2[i] != z2[j])) - b++; - else - { - if ((z1[i] == z1[j]) && (z2[i] != z2[j])) - c++; - else - d++; - } - } + if ((z1[i] == z1[j]) && (z2[i] != z2[j])) + c++; + else + d++; } + } } - - return (a + b) / (a + b + c + d); + } + + return (a + b) / (a + b + c + d); } -/*vector order2rank(vector &x,int const& m) -{ -vector y(x); -for(int i(0);i order2rank(vector const &x, int const &m) +Rank ordering2ranking(Rank const &x) { - vector y(x); - for (int i(0); i < m; i++) - y[i] = positionRank(x, i + 1) + 1; - - return y; + Rank y(x); + for (size_t i(0); i < 
x.size(); i++) + y[i] = positionRank(x, i + 1) + 1; + + return y; } -int distanceKendall(vector const &x, vector const &y) +int distanceKendall(Rank const &x, Rank const &y) { - const int m(x.size()); - vector xr(m), yr(m); - xr = order2rank(x, m); - yr = order2rank(y, m); - int distK(0); - - for (int i(0); i < m - 1; i++) - for (int j(i + 1); j < m; j++) - if ((xr[i] - xr[j]) * (yr[i] - yr[j]) < 0) - distK++; - - return distK; + const int m(x.size()); + Rank xr(m), yr(m); + xr = ordering2ranking(x); + yr = ordering2ranking(y); + int distK(0); + + for (int i(0); i < m - 1; i++) + for (int j(i + 1); j < m; j++) + if ((xr[i] - xr[j]) * (yr[i] - yr[j]) < 0) + distK++; + + return distK; } // mu: index des elements de la 1ere dim -//listeMu: listMu[dim][composante][elem] -void tri_insertionMulti(vector &mu, vector &prop, vector> &p, vector>> &listeMu, vector &z, int const &g, int const &d, int const &n) +// listeMu: listMu[dim][composante][elem] +void tri_insertionMulti(Rank &mu, vector &prop, vector> &p, + vector> &listeMu, vector &z, int const &g, int const &d, int const &n) { - int i, j, elem; - double elemprop; - vector elemp(d); - vector> elemmu(d); - vector order(g); - for (int l = 0; l < g; l++) - order[l] = l; - int elemorder; - - //sort algorithm - //we sort the cluster of the first dim and make changement for all dim - for (i = 1; i < g; ++i) + int i, j, elem; + double elemprop; + vector elemp(d); + vector> elemmu(d); + vector order(g); + for (int l = 0; l < g; l++) + order[l] = l; + int elemorder; + + // sort algorithm + // we sort the cluster of the first dimension and replicate the change for all other dimensions + for (i = 1; i < g; ++i) + { + elem = mu[i]; + for (int l(0); l < d; l++) + elemp[l] = p[l][i]; + elemprop = prop[i]; + for (int l(0); l < d; l++) + elemmu[l] = listeMu[l][i]; + elemorder = order[i]; + + for (j = i; j > 0 && mu[j - 1] > elem; j--) { - elem = mu[i]; - for (int l(0); l < d; l++) - elemp[l] = p[l][i]; - elemprop = prop[i]; - for (int l(0); l < d; l++) - elemmu[l] = listeMu[l][i]; - elemorder = order[i]; - - for (j = i; j > 0 && mu[j - 1] > elem; j--) - { - order[j] = order[j - 1]; - mu[j] = mu[j - 1]; - prop[j] = prop[j - 1]; - for (int l(0); l < d; l++) - p[l][j] = p[l][j - 1]; - for (int l(0); l < d; l++) - listeMu[l][j] = listeMu[l][j - 1]; - } - order[j] = elemorder; - mu[j] = elem; - for (int l(0); l < d; l++) - p[l][j] = elemp[l]; - prop[j] = elemprop; - - for (int l(0); l < d; l++) - listeMu[l][j] = elemmu[l]; + order[j] = order[j - 1]; + mu[j] = mu[j - 1]; + prop[j] = prop[j - 1]; + for (int l(0); l < d; l++) + p[l][j] = p[l][j - 1]; + for (int l(0); l < d; l++) + listeMu[l][j] = listeMu[l][j - 1]; } - - //re order the z - for (int l = 0; l < n; l++) + order[j] = elemorder; + mu[j] = elem; + for (int l(0); l < d; l++) + p[l][j] = elemp[l]; + prop[j] = elemprop; + + for (int l(0); l < d; l++) + listeMu[l][j] = elemmu[l]; + } + + //re order the partition z + for (int l = 0; l < n; l++) + { + for (int k = 0; k < g; k++) { - for (int k = 0; k < g; k++) - { - if (z[l] == order[k]) - { - z[l] = k; - break; - } - } + if (z[l] == order[k]) + { + z[l] = k; + break; + } } + } } -//fonction pour simuler un N-échantillon d'ISR(mu,p) -vector> simulISR(int const &n, int const &m, vector const &mu, double const &p) -{ - vector> simul(n, vector(m, 0)); - vector s(m), rgTemp(m); - int l; - double correct; - bool compar, avance; - - for (int i(0); i < m; i++) - rgTemp[i] = i + 1; - for (int i(0); i < n; i++) - { - //simulation d'un rang aléatoire: permutation du 
vecteur 1 2..m - s = rgTemp; - Rshuffle(s.begin(), s.end()); - simul[i][0] = s[0]; - for (int j(1); j < m; j++) - { - l = 0; - avance = true; - while (avance && l < j) - { - correct = (double)runif(0., 1.); - compar = (positionRank(mu, s[j]) < positionRank(mu, simul[i][l])); - if ((compar && correct < p) || (!compar && correct > p)) - { - for (int k(j - 1); k >= l; k--) - simul[i][k + 1] = simul[i][k]; - - simul[i][l] = s[j]; - avance = false; - } - else - l++; - } - if (l == j) - simul[i][l] = s[j]; - } - } - return simul; -} - -//frequence d'un jeu de données multi varié +// frequency of a multivariate dataset typedef struct TableauRang TableauRang; -pair>>, vector> freqMulti(vector>> const &listeRang) +pair>, vector> freqMulti(vector> const &rankList) { - //rassemblement des mu identiques, proba->moyenne des proba,p->moyenne des p - struct TableauRang; - struct TableauRang - { - int compteur; - std::vector> rang; - TableauRang *suivant; - }; - - bool continuer(true), egaliteRang; - const int d(listeRang.size()), N(listeRang[0].size()); - vector> rang(d); - vector M(d); - for (int i(0); i < d; i++) - M[i] = listeRang[i][0].size(); - - TableauRang *headRang = new TableauRang; - TableauRang *currRang = headRang; - TableauRang *next = 0; - currRang->compteur = 1; - currRang->suivant = 0; - for (int i(0); i < d; i++) - rang[i] = listeRang[i][0]; - currRang->rang = rang; - - for (int j(1); j < N; j++) + // we group the same mu values. + // For a group of mu, the new p is the mean of the different p values in the group (same for proba) + struct TableauRang; + struct TableauRang + { + int compteur; + std::vector rang; + TableauRang *suivant; + }; + + bool continuer(true), egaliteRang; + const int d(rankList.size()), N(rankList[0].size()); + vector rang(d); + vector M(d); + for (int i(0); i < d; i++) + M[i] = rankList[i][0].size(); + + TableauRang *headRang = new TableauRang; + TableauRang *currRang = headRang; + TableauRang *next = 0; + currRang->compteur = 1; + currRang->suivant = 0; + for (int i(0); i < d; i++) + rang[i] = rankList[i][0]; + currRang->rang = rang; + + for (int j(1); j < N; j++) + { + continuer = true; + currRang = headRang; + while (continuer) { - continuer = true; - currRang = headRang; - while (continuer) + egaliteRang = true; + for (int J(0); J < d; J++) + { + for (int k(0); k < M[J]; k++) { - egaliteRang = true; - for (int J(0); J < d; J++) - { - for (int k(0); k < M[J]; k++) - { - if (currRang->rang[J][k] != listeRang[J][j][k]) - { - egaliteRang = false; - break; - } - } - } - - if (egaliteRang) - { - currRang->compteur++; - continuer = false; - } - else - { - if (currRang->suivant == 0) - { - continuer = false; - next = new TableauRang; - currRang->suivant = next; - currRang = next; - currRang->compteur = 1; - currRang->suivant = 0; - for (int i(0); i < d; i++) - rang[i] = listeRang[i][j]; - currRang->rang = rang; - } - else - currRang = currRang->suivant; - } + if (currRang->rang[J][k] != rankList[J][j][k]) + { + egaliteRang = false; + break; + } } + } + + if (egaliteRang) + { + currRang->compteur++; + continuer = false; + } + else + { + if (currRang->suivant == 0) + { + continuer = false; + next = new TableauRang; + currRang->suivant = next; + currRang = next; + currRang->compteur = 1; + currRang->suivant = 0; + for (int i(0); i < d; i++) + rang[i] = rankList[i][j]; + currRang->rang = rang; + } + else + currRang = currRang->suivant; + } } - - //stockage des infos - vector>> donnees(d); - vector freq; - - currRang = headRang; - while (currRang->suivant != 0) - { - for 
(int i(0); i < d; i++) - donnees[i].push_back(currRang->rang[i]); - - freq.push_back(currRang->compteur); - next = currRang->suivant; - delete currRang; - currRang = next; - } - + } + + // storage + vector> donnees(d); + vector freq; + + currRang = headRang; + while (currRang->suivant != 0) + { for (int i(0); i < d; i++) - donnees[i].push_back(currRang->rang[i]); - + donnees[i].push_back(currRang->rang[i]); + freq.push_back(currRang->compteur); + next = currRang->suivant; delete currRang; - - return make_pair(donnees, freq); + currRang = next; + } + + for (int i(0); i < d; i++) + donnees[i].push_back(currRang->rang[i]); + + freq.push_back(currRang->compteur); + delete currRang; + + return make_pair(donnees, freq); } -double proba(vector> const &x, vector> const &mu, vector const &pi) + +bool acceptChange(double const logP1, double const logP2) { - int d = pi.size(); - vector probaDim(d, 0); - double finalProba; - vector tabFact, listeY, y; - vector m(mu.size()); - map> diffDim; - - for (int dim = 0; dim < d; dim++) - { - m[dim] = mu[dim].size(); - diffDim[m[dim]].push_back(dim); - } + double logP1PlusP2; + if (logP1 > logP2) + { + logP1PlusP2 = logP1 + log(exp(logP2 - logP1) + 1); + } + else + { + logP1PlusP2 = logP2 + log(exp(logP1 - logP2) + 1); + } + + double proba1 = exp(logP1 - logP1PlusP2); + double proba2 = exp(logP2 - logP1PlusP2); + + double alea = runif(0., proba1 + proba2); + + return (alea < proba2); +} - for (map>::iterator it = diffDim.begin(); it != diffDim.end(); it++) - { - int m = it->first; - tabFact = tab_factorial(m); - listeY = listeSigma(m, tabFact); - double div = 2. / (double)tabFact[m - 1]; - vector aa = index2rankb(1, m, tabFact); - for (int i = 0; i < tabFact[m - 1] / 2; i++) - { - y = index2rankb(listeY[i], m, tabFact); +double LSE(Eigen::ArrayXd &logProba) +{ + double m = logProba.maxCoeff(); + + return m + std::log((logProba - m).exp().sum()); +} - for (int j = 0; j < (int)(it->second).size(); j++) - probaDim[(it->second)[j]] += probaCond(x[(it->second)[j]], y, mu[(it->second)[j]], pi[(it->second)[j]]); - } +Eigen::ArrayXd normalizeLogProba(Eigen::ArrayXd &logProba) +{ + double lse = LSE(logProba); + + return (logProba - lse).exp(); +} - for (int j = 0; j < (int)(it->second).size(); j++) - probaDim[(it->second)[j]] *= div; - } +void normalizeLogProbaInPlace(Eigen::ArrayXd &logProba) +{ + double lse = LSE(logProba); + + logProba = (logProba - lse).exp(); +} - finalProba = probaDim[0]; - for (int dim = 1; dim < d; dim++) - finalProba *= probaDim[dim]; +int sampleMultinomial(Eigen::ArrayXd const &proba) +{ + int g = proba.size(); + vector lim(g + 1, 0.); + + for (int k = 0; k < g; k++) + lim[k + 1] = lim[k] + proba(k); + + double alea = (double) runif(0., 1.); + + for (int j = 0; j < g; j++) + if ((lim[j] <= alea) && (alea <= lim[j + 1])) + return j; + + return g - 1; +} - return finalProba; +// compute BIC +double BIC(double loglikelihood, int nbDonnees, int nbParam) +{ + double bic(0); + bic = -2 * loglikelihood + nbParam * log(nbDonnees); + return bic; } diff --git a/src/functions.h b/src/functions.h index 9e16ad3..60a9d38 100644 --- a/src/functions.h +++ b/src/functions.h @@ -2,61 +2,48 @@ * functions.h * * Created on: 1 mars 2013 - * Author: grimonprez + * Author: Quentin Grimonprez */ #ifndef FUNCTIONS_H_ #define FUNCTIONS_H_ +#include #include #include #include +#include "Typedef.h" + + /**r random number generator, code from : http://gallery.rcpp.org/articles/r-function-from-c++/ * generate an integer between 0 and n - 1 */ int randWrapper(const int n); 
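The log-domain helpers added above (LSE, normalizeLogProba, normalizeLogProbaInPlace, acceptChange) all rest on the log-sum-exp trick, which keeps the probability computations stable when individual probabilities underflow. A minimal sketch of the same normalisation in plain R (illustration only, not part of the patch):

logp <- c(-1000, -1001, -1005)                       # exp(logp) underflows to 0 in double precision
lse  <- max(logp) + log(sum(exp(logp - max(logp))))  # log(p1 + p2 + p3), computed safely
tik  <- exp(logp - lse)                              # normalized probabilities, sum to 1
round(tik, 3)                                        # 0.727 0.268 0.005

normalizeLogProba() performs the same computation on an Eigen array, and acceptChange() uses the two-term version of the identity to compare the current and candidate log-probabilities of a Gibbs move without leaving the log scale.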
-/** equivalent of std::shuffle with an R generator - */ +/** equivalent of std::shuffle with an R generator */ template void Rshuffle(RandomAccessIterator first, RandomAccessIterator last) { - for (auto i = (last-first) - 1; i > 0; --i) { - std::swap (first[i], first[randWrapper(i + 1)]); + for (auto i = (last - first) - 1; i > 0; --i) + { + std::swap(first[i], first[randWrapper(i + 1)]); } } +/** Initialize a rank as 1 2 ... m */ +void initializeRank(Rank &rank); + +/** Initialize a rank as a random rank */ +void randomRank(Rank &rank); + /** * search the position of i in x * @param x rank * @param i integer which we want the position in x * @return the position of i in x */ -int positionRank(std::vector const &x, int const &i); - -/** - * compute A(x,y) and G(x,y,mu) - * @param x rank - * @param y order of presentation of x - * @param mu order of reference - * @return a vector of 2 elements (A(x,y),G(x,y,mu)) - * - * A(x,y)=total number of comparaison in the insertion sorting algorithm - * G(x,y,mu)= total number of good comparaison according to mu in the insertion sorting algorithm - * - */ -std::vector comparaison(std::vector const &x, std::vector const &y, std::vector const &mu); - -/** - * compute the conditional probability p(x|y;mu,p) - * @param x rank - * @param y order of presentation of x - * @param mu order of reference - * @param p probability of making a good comparaison - * @return p(x|y;mu,p) - */ -double probaCond(std::vector const &x, std::vector const &y, std::vector const &mu, double const &p); +int positionRank(Rank const &x, int const &i); /** * factorial function (recursive) @@ -73,37 +60,37 @@ int factorial(int const &nombre); std::vector tab_factorial(int const &m); /** - *-----------rang->index------------ + *-----------rank->index------------ * convert a rank into an integer * @param rang rank * @param tabFactorial \see tab_factorial * @return index */ -int rank2index(std::vector const &rang, std::vector const &tabFactorial); +int rank2index(Rank const &rank, std::vector const &tabFactorial); /** - *-----------rang->index------------ + *-----------rank->index------------ * @see rank2index for a vector of rank - * @param listeRang vector of rank + * @param rankList vector of rank * @param tabFactorial @see tab_factorial * @return vector of index */ -std::vector rank2index(std::vector> const &listeRang, std::vector const &tabFact); +std::vector rank2index(std::vector const &rankList, std::vector const &tabFact); /** * conversion index->rank * @param index index of the rank * @param m size of the rank * @param tabFactorial vector containing 1! to m! (@see tab_factorial)(optional) - * @return le rang correspondant à l'index - * index=rank - * 0=1 2 ... m + * @return the rank corresponding to the index + * index | rank + * 0 | 1 2 ... m * - * m!= m m-1 .. 1 + * m! | m m-1 ... 
1 */ -std::vector index2rank(int index, int const &m, std::vector const &tabFactorial); -std::vector index2rankb(int index, int const &m, std::vector const &tabFactorial); -std::vector index2rank(int index, int const &m); +Rank index2rank(int index, int const &m, std::vector const &tabFactorial); +Rank index2rankNoCheck(int index, int const &m, std::vector const &tabFactorial); +Rank index2rank(int index, int const &m); /** * Return a vector containing the index of order of presentation y for which we have to compute probability @@ -114,50 +101,51 @@ std::vector index2rank(int index, int const &m); * p(x|y;mu,p) doesn't change if the 2 first element of y are inverted * So we compute probabilities for the half of y */ -std::vector listeSigma(int const &m, std::vector const &tabFactorial); +std::vector listIndexOrderOfPresentation(int const &m, std::vector const &tabFactorial); /** * invert a rank (1 2 3 become 3 2 1) - * @param rang rank to invert - * @param m size of the rank + * @param rank rank to invert */ -void inverseRang(std::vector &rang); -void inverseRang(std::vector &rang, int const &m); +void invertRank(Rank &rank); /** * Compute the BIC - * @param loglikelihood the loglikelihhod of the data + * @param loglikelihood the loglikelihood of the data * @param nbDonnees total number of data * @return BIC */ double BIC(double loglikelihood, int nbDonnees, int nbParam); /** - * compute the Rand index of 2 partitions + * Compute the Rand index between 2 partitions * @param z1 partition * @param z2 partition * @return a double, the Rand index */ -double indiceRand(std::vector const &z1, std::vector const &z2); +double computeRandIndex(std::vector const &z1, std::vector const &z2); /** - * conversion from ordering representation to ranking representation + * Conversion from ordering representation to ranking representation + * * @param x a rank - * @param m size of the rank - * @return a rank : the ranking representation of x + * @return a rank: the ranking representation of x */ -std::vector order2rank(std::vector const &x, int const &m); +Rank ordering2ranking(Rank const &x); /** * Compute the Kendall's distance between 2 ranks (in ordering representation) + * + * https://en.wikipedia.org/wiki/Kendall_tau_distance + * * @param x a rank in ordering representation * @param y a rank in ordering representation of the same size than x * @return an integer, the kendall distance between x and y */ -int distanceKendall(std::vector const &x, std::vector const &y); +int distanceKendall(Rank const &x, Rank const &y); /** - * sort the parameters in order that the first cluster is the cluster with the more little ndex of mu + * Sort the parameters such that the first cluster is the cluster with the more little index of mu * @param mu index of the rank of the first dimension of listeMu * @param p parameter of the ISR * @param prop proportion of the mixture model @@ -166,37 +154,57 @@ int distanceKendall(std::vector const &x, std::vector const &y); * @param g number of cluster * @param d number of dimension * @param n number of individual - * listeMu, p, prop and z are modify if necessary + * + * listeMu, p, prop and z are modified if necessary * */ -void tri_insertionMulti(std::vector &mu, std::vector &prop, std::vector> &p, - std::vector>> &listeMu, std::vector &z, int const &g, int const &d, int const &n); +void tri_insertionMulti(Rank &mu, std::vector &prop, std::vector> &p, + std::vector> &listeMu, std::vector &z, int const &g, + int const &d, int const &n); /** - * simulation of a n-sample of 
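Since the header now exposes ordering2ranking and documents distanceKendall, a short base-R illustration may help keep the two representations straight (these C++ helpers are internal, so the snippet only mirrors what they compute): converting an ordering into a ranking is simply order(), applying order() twice returns to the ordering, and the Kendall distance counts discordant pairs between the two rankings.

o  <- c(2, 4, 1, 3)        # ordering representation
r  <- order(o)             # ranking representation: 3 1 4 2, what ordering2ranking() returns
all(order(r) == o)         # TRUE: the conversion is an involution
xr <- order(c(2, 4, 1, 3)); yr <- order(1:4)
sum(outer(xr, xr, "-") * outer(yr, yr, "-") < 0) / 2   # 3 discordant pairs, i.e. a Kendall distance of 3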
ISR(mu,p) - * @param n size of the sample - * @param m size of the rank - * @param mu rank - * @param p probability of a good comapraison - * @return a n-sample of ISR(mu,p) + * compute frequency of a data set + * @param rankList data + * @return a pair with the unique rank and the frequency of each unique rank */ -std::vector> simulISR(int const &n, int const &m, std::vector const &mu, double const &p); +std::pair>, std::vector> freqMulti(std::vector> const &rankList); + /** - * compute frequence of a data set - * @param listeRang data - * @return a pair with the unique rank and the frequence of each unique rank + * accept or not a change in a gibbs sampler + * + * Sample x~U(0, 1), accept the change if x < p2/(p1 + p2) + * + * @param logP1 log-probability of the current element + * @param logP2 log-probability of the candidate + * @return true if the change is accepted */ -std::pair>>, std::vector> -freqMulti(std::vector>> const &listeRang); +bool acceptChange(double const logP1, double const logP2); + +/** LogSumExp function + * https://en.wikipedia.org/wiki/LogSumExp + * + * LSE(x1, x2, ..., xn) = (max xi) + log(exp(x1 - (max xi)) + ... + exp(xn - (max xi))) + * LSE(x1, x2, ..., xn) = log(exp(x1) + ... + exp(xn)) + * + * Application for proba: LSE(log(p1), ..., log(pn)) = log(p1 + ... + pn) + */ +double LSE(Eigen::ArrayXd &logProba); + +/** + * LSE trick to normalize a vector of log probabilities + * Given (log(p1), ..., log(pn)), it returns (p1/sum(pi), ..., pn/sum(pi)) + * + * Useful to compute tik + */ +Eigen::ArrayXd normalizeLogProba(Eigen::ArrayXd &logProba); +void normalizeLogProbaInPlace(Eigen::ArrayXd &logProba); /** - * compute probability of x according to multivariate ISR - * @param x rank for each dimension for compute probability - * @param mu reference rank for each dimension - * @param pi dispersion parameter for each dimension - * @return p(x;mu,pi) + * Sample according to a multinomial with given probabilities + * + * @param proba vector of probabilities of each element */ -double proba(std::vector> const &x, std::vector> const &mu, std::vector const &pi); +int sampleMultinomial(Eigen::ArrayXd const &proba); #endif /* FUNCTIONS_H_ */ diff --git a/src/test.cpp b/src/test.cpp index 2775f5a..6ee024e 100644 --- a/src/test.cpp +++ b/src/test.cpp @@ -6,81 +6,26 @@ #include #include "test.h" +#include "functions.h" +#include "ISRfunctions.h" using namespace std; -//fonction pour simuler un N-échantillon d'ISR(mu,p) -void simulMixtureISR(vector> &simul, int const &n, int const &m, vector> const &mu, - vector const &p, vector const &prop) -{ - vector s(m), rgTemp(m); - int l, classe(0); - double correct, alea(0); - bool compar, avance; - - vector limite(prop.size() + 1, 0); - for (int i(1); i < (int)limite.size(); i++) - limite[i] = limite[i - 1] + prop[i - 1]; - - for (int i(0); i < m; i++) - rgTemp[i] = i + 1; - - for (int i(0); i < n; i++) - { - //tirage aléatoire de la classe - alea = (double)runif(0., 1.); - for (int j(0); j < (int)prop.size(); j++) - { - if ((alea > limite[j]) & (alea < limite[j + 1])) - { - classe = j; - break; - } - } - - //simulation d'un rang aléatoire: permutation du vecteur 1 2..m - s = rgTemp; - Rshuffle(s.begin(), s.end()); - simul[i][0] = s[0]; - for (int j(1); j < m; j++) - { - l = 0; - avance = true; - while (avance && l < j) - { - correct = (double)runif(0., 1.); - compar = (positionRank(mu[classe], s[j]) < positionRank(mu[classe], simul[i][l])); - if ((compar && correct < p[classe]) || (!compar && correct > p[classe])) - { - for (int k(j - 
1); k >= l; k--) - simul[i][k + 1] = simul[i][k]; - - simul[i][l] = s[j]; - avance = false; - } - else - l++; - } - if (l == j) - simul[i][l] = s[j]; - } - } -} - -double khi2(vector> const &data, vector const &p, vector const &prop, vector> const &mu, int const &nBoot) +double khi2(vector> const &data, vector const &p, vector const &prop, + vector> const &mu, int const &nBoot) { int const g(prop.size()), m(data[0].size()), n(data.size()); int factM(factorial(m)); double dkhi2(0), pvalue(0), mult((double)2 / factM), prob(0); - //************** cacul des effectifs theoriques + //************** calcul des effectifs theoriques vector effTheo(factM, 0); vector> x(factM, vector(m)); //generation of the index of y vector tabFact(tab_factorial(m)), listeY; - listeY = listeSigma(m, tabFact); + listeY = listIndexOrderOfPresentation(m, tabFact); //generation of all the rank for the dim for (int i(0); i < factM; i++) @@ -120,7 +65,7 @@ double khi2(vector> const &data, vector const &p, vector effSim(factM, 0); - simulMixtureISR(simulation, n, m, mu, p, prop); + simulMixtureISR(simulation, mu, p, prop); for (int i(0); i < n; i++) { indexData = rank2index(simulation[i], tabFact); @@ -140,19 +85,20 @@ double khi2(vector> const &data, vector const &p, vector &data, vector const &p, vector const &prop, vector> const &mu, int const &nBoot) +double khi2partial(vector &data, vector const &p, vector const &prop, + vector> const &mu, int const &nBoot) { int const g(prop.size()), m(data[0].rank.size()), n(data.size()); int factM(factorial(m)); double dkhi2(0), pvalue(0), mult((double)2 / factM), prob(0); - //************** cacul des effectifs theoriques + //************** calcul des effectifs theoriques vector effTheo(factM, 0); vector> x(factM, vector(m)); //generation of the index of y vector tabFact(tab_factorial(m)), listeY; - listeY = listeSigma(m, tabFact); + listeY = listIndexOrderOfPresentation(m, tabFact); //generation of all the rank for the dim for (int i(0); i < factM; i++) @@ -173,7 +119,7 @@ double khi2partial(vector &data, vector const &p, vector c //cout< effEmp(factM, 0); vector> freq; int indexData; @@ -248,7 +194,7 @@ double khi2partial(vector &data, vector const &p, vector c for (int iter(0); iter < nBoot; iter++) { vector effSim(factM, 0); - simulMixtureISR(simulation, n, m, mu, p, prop); + simulMixtureISR(simulation, mu, p, prop); for (int i(0); i < n; i++) { //on réintroduit des 0 dans la simulation au même endroit @@ -313,7 +259,8 @@ double khi2partial(vector &data, vector const &p, vector c } //---------------------------- divergence de kullback -void updateD(double &divKL, vector &index, vector>> const &p1, vector>> const &p2, int const &d, int const &g, +void updateD(double &divKL, vector &index, vector>> const &p1, + vector>> const &p2, int const &d, int const &g, vector const &proportion1, vector const &proportion2) { double p1b(0), p2b(0); @@ -349,8 +296,9 @@ void updateIndex(vector &index, int i, vector const &factm, bool &stop } } -void computePQ(vector>> &p, vector>> &q, vector>> const &mu1, - vector>> const &mu2, vector> const &p1, vector> const &p2, vector const &m, int d, int g) +void computePQ(vector>> &p, vector>> &q, + vector>> const &mu1, vector>> const &mu2, + vector> const &p1, vector> const &p2, vector const &m, int d, int g) { bool isDimDiff(true); int n = factorial(m[0]); @@ -365,7 +313,7 @@ void computePQ(vector>> &p, vector>> x = vector>(n, vector(m[0])); //generation of the index of y vector tabFact(tab_factorial(m[dim])); - listeY = listeSigma(m[dim], tabFact); + 
listeY = listIndexOrderOfPresentation(m[dim], tabFact); //generation of all the rank for the dim for (int i(0); i < n; i++) @@ -400,7 +348,8 @@ void computePQ(vector>> &p, vector>> } double divKL(vector const &m, vector>> const &mu1, vector>> const &mu2, - vector> const &p1, vector> const &p2, vector const &proportion1, vector const &proportion2) + vector> const &p1, vector> const &p2, + vector const &proportion1, vector const &proportion2) { double divKL(0); int const d = m.size(); diff --git a/src/test.h b/src/test.h index f90e385..6ecadbb 100644 --- a/src/test.h +++ b/src/test.h @@ -6,9 +6,8 @@ #include #include -#include "functions.h" -struct Rank +struct RankStruct { std::vector rank; bool isPartial; @@ -16,42 +15,34 @@ struct Rank std::set missingNumber; }; -/** - * Simulate a sample of mixture of ISR - * @param simul sample will be modified - * @param n size of sample - * @param m size of rank - * @param mu reference rank - * @param p dispersion parameter: probability of a godd comparaison - * @param prop proportion of the mixture - */ -void simulMixtureISR(std::vector> &simul, int const &n, int const &m, std::vector> const &mu, std::vector const &p, std::vector const &prop); /** * khi2 adequation test * @param data univariate rank - * @param p dispersion parameter: probability of a godd comparaison + * @param p dispersion parameter: probability of a good comparaison * @param prop proportion of the mixture * @param mu reference rank * @param nBoot number of iteration for estimation * @return estimated pvalue of khi2 adequation test */ -double khi2(std::vector> const &data, std::vector const &p, std::vector const &prop, std::vector> const &mu, int const &nBoot); +double khi2(std::vector> const &data, std::vector const &p, std::vector const &prop, + std::vector> const &mu, int const &nBoot); /** * khi2 adequation test for partial rank * @param data univariate partial rank - * @param p dispersion parameter: probability of a godd comparaison + * @param p dispersion parameter: probability of a good comparaison * @param prop proportion of the mixture * @param mu reference rank * @param nBoot number of iteration for estimation * @return estimated pvalue of khi2 adequation test */ -double khi2partial(std::vector &data, std::vector const &p, std::vector const &prop, std::vector> const &mu, int const &nBoot); +double khi2partial(std::vector &data, std::vector const &p, std::vector const &prop, + std::vector> const &mu, int const &nBoot); /** - * Updating the kullback leibler divergence - * @param divKL actual kullback leibler divergence + * Updating the Kullback-Leibler divergence + * @param divKL actual Kullback-Leibler divergence * @param p1 probabilities of each rank for the first set of parameters * @param p2 probabilities of each rank for the second set of parameters * @param d dimension @@ -59,10 +50,11 @@ double khi2partial(std::vector &data, std::vector const &p, std::v * @param proportion1 proportion of the mixture of the first set of parameters * @param proportion2 proportion of the mixture of the second set of parameters */ -void updateD(double &divKL, std::vector &index, std::vector>> const &p1, std::vector>> const &p2, int const &d, int const &g, +void updateD(double &divKL, std::vector &index, std::vector>> const &p1, + std::vector>> const &p2, int const &d, int const &g, std::vector const &proportion1, std::vector const &proportion2); /** - * Recursive function for updating the index for compute probabilities in kullback + * Recursive function for updating the index for compute 
probabilities in kullback * @param index current index * @param i actual dimension * @param factm factorial of the size of rank for each dimension @@ -72,7 +64,7 @@ void updateIndex(std::vector &index, int i, std::vector const &factm, /** * Compute probabilities of each rank for each set of parameters - * @param divKL actual kullback leibler divergence + * @param divKL actual Kullback-Leibler divergence * @param p probabilities of each rank for the first set of parameters * @param q probabilities of each rank for the second set of parameters * @param mu1 reference rank for the first set of parameters @@ -83,11 +75,13 @@ void updateIndex(std::vector &index, int i, std::vector const &factm, * @param d dimension * @param g number of cluster */ -void computePQ(std::vector>> &p, std::vector>> &q, std::vector>> const &mu1, - std::vector>> const &mu2, std::vector> const &p1, std::vector> const &p2, std::vector const &m, int d, int g); +void computePQ(std::vector>> &p, std::vector>> &q, + std::vector>> const &mu1, + std::vector>> const &mu2, std::vector> const &p1, + std::vector> const &p2, std::vector const &m, int d, int g); /** - * Compute the kullback leibler divergence between the first and the second set of parameters + * Compute the Kullback-Leibler divergence between the first and the second set of parameters * @param m size of rank for each dimension * @param mu1 reference rank for the first set of parameters * @param mu2 reference rank for the second set of parameters @@ -95,9 +89,11 @@ void computePQ(std::vector>> &p, std::vector const &m, std::vector>> const &mu1, std::vector>> const &mu2, - std::vector> const &p1, std::vector> const &p2, std::vector const &proportion1, std::vector const &proportion2); +double divKL(std::vector const &m, std::vector>> const &mu1, + std::vector>> const &mu2, std::vector> const &p1, + std::vector> const &p2, std::vector const &proportion1, + std::vector const &proportion2); #endif /* TEST_H_ */ diff --git a/tests/testthat/test.ISRdistribution.R b/tests/testthat/test.ISRdistribution.R index 45ac4b3..92a6c04 100644 --- a/tests/testthat/test.ISRdistribution.R +++ b/tests/testthat/test.ISRdistribution.R @@ -4,7 +4,7 @@ context("ISR distribution") test_that("simulISR samples mu when pi=1", { out <- simulISR(10, 1, 1:4) - + expect_equal(out, matrix(1:4, nrow = 10, ncol = 4, byrow = TRUE)) }) @@ -12,8 +12,8 @@ test_that("simulISR samples mu when pi=1", { set.seed(42) out <- simulISR(100000, 0.5, 1:3) x <- frequence(out) - - expect_lte(max(abs(x[, 4]/100000 - rep(1/6, 6))/rep(1/6, 6)), 2e-2) + + expect_lte(max(abs(x[, 4] / 100000 - rep(1 / 6, 6)) / rep(1 / 6, 6)), 2e-2) }) @@ -21,16 +21,22 @@ test_that("probability works with a vector: univariate case", { # if pi == 1, the probability is 1 if x == mu, 0 otherwise p <- probability(1:4, 1:4, 1) expect_equal(p, 1) - + p <- probability(4:1, 1:4, 1) expect_equal(p, 0) + + p <- probability(1:4, 1:4, 0) + expect_equal(p, 0) + + p <- probability(4:1, 1:4, 0) + expect_equal(p, 1) }) test_that("probability works with a vector: multivariate case", { # if pi == 1, the probability is 1 if x == mu, 0 otherwise p <- probability(c(1:4, 1:3), c(1:4, 1:3), c(1, 1), c(4, 3)) expect_equal(p, 1) - + p <- probability(c(4:1, 1:3), c(1:4, 1:3), c(1, 1), c(4, 3)) expect_equal(p, 0) }) @@ -39,30 +45,34 @@ test_that("probability works with a matrix: univariate case", { # if pi == 1, the probability is 1 if x == mu, 0 otherwise p <- probability(matrix(1:4, nrow = 10, ncol = 4, byrow = TRUE), 1:4, 1) expect_equal(p, rep(1, 10)) - + p <- 
probability(matrix(4:1, nrow = 10, ncol = 4, byrow = TRUE), 1:4, 1) expect_equal(p, rep(0, 10)) - + # pi == 0.5 : uniform case - x <- matrix(c(1, 2, 3, - 1, 3, 2, - 2, 1, 3, - 2, 3, 1, - 3, 1, 2, - 3, 2, 1), ncol = 3, byrow = TRUE) - + x <- matrix(c( + 1, 2, 3, + 1, 3, 2, + 2, 1, 3, + 2, 3, 1, + 3, 1, 2, + 3, 2, 1 + ), ncol = 3, byrow = TRUE) + mu <- 1:3 p <- probability(x, mu, 0.5) - expect_equal(p, rep(1/nrow(x), nrow(x))) - + expect_equal(p, rep(1 / nrow(x), nrow(x))) + # pi = 0.9 - x <- matrix(c(1, 2, 3, - 1, 3, 2, - 2, 1, 3, - 2, 3, 1, - 3, 1, 2, - 3, 2, 1), ncol = 3, byrow = TRUE) - + x <- matrix(c( + 1, 2, 3, + 1, 3, 2, + 2, 1, 3, + 2, 3, 1, + 3, 1, 2, + 3, 2, 1 + ), ncol = 3, byrow = TRUE) + mu <- 1:3 p <- probability(x, mu, 0.9) expect_equal(p, c(0.756, 0.084, 0.084, 0.036, 0.036, 0.004)) @@ -71,24 +81,30 @@ test_that("probability works with a matrix: univariate case", { test_that("probability works with a matrix: multivariate case", { # if pi == 1, the probability is 1 if x == mu, 0 otherwise - p <- probability(cbind(matrix(1:4, nrow = 10, ncol = 4, byrow = TRUE), - matrix(1:3, nrow = 10, ncol = 3, byrow = TRUE)), c(1:4, 1:3), c(1, 1), c(4, 3)) + p <- probability(cbind( + matrix(1:4, nrow = 10, ncol = 4, byrow = TRUE), + matrix(1:3, nrow = 10, ncol = 3, byrow = TRUE) + ), c(1:4, 1:3), c(1, 1), c(4, 3)) expect_equal(p, rep(1, 10)) - - p <- probability(cbind(matrix(4:1, nrow = 10, ncol = 4, byrow = TRUE), - matrix(1:3, nrow = 10, ncol = 3, byrow = TRUE)), c(1:4, 1:3), c(1, 1), c(4, 3)) + + p <- probability(cbind( + matrix(4:1, nrow = 10, ncol = 4, byrow = TRUE), + matrix(1:3, nrow = 10, ncol = 3, byrow = TRUE) + ), c(1:4, 1:3), c(1, 1), c(4, 3)) expect_equal(p, rep(0, 10)) - + # pi == 0.5 : uniform case - x <- matrix(c(1, 2, 3, - 1, 3, 2, - 2, 1, 3, - 2, 3, 1, - 3, 1, 2, - 3, 2, 1), ncol = 3, byrow = TRUE) + x <- matrix(c( + 1, 2, 3, + 1, 3, 2, + 2, 1, 3, + 2, 3, 1, + 3, 1, 2, + 3, 2, 1 + ), ncol = 3, byrow = TRUE) x <- cbind(x, x) - + mu <- 1:3 p <- probability(x, c(mu, mu), c(0.5, 0.5), c(3, 3)) - expect_equal(p, rep(1/(nrow(x) * nrow(x)), nrow(x))) + expect_equal(p, rep(1 / (nrow(x) * nrow(x)), nrow(x))) }) diff --git a/tests/testthat/test.rankManipulation.R b/tests/testthat/test.rankManipulation.R index 1a0e2f1..151d8ee 100644 --- a/tests/testthat/test.rankManipulation.R +++ b/tests/testthat/test.rankManipulation.R @@ -5,13 +5,13 @@ context("Rank Manipulation") test_that("convertSingleRank converts rank from ordering to ranking and vice-versa", { out <- convertSingleRank(1:4) expect_equal(out, 1:4) - + out <- convertSingleRank(c(2, 1, 4, 3)) expect_equal(out, c(2, 1, 4, 3)) - + out <- convertSingleRank(c(3, 1, 4, 2)) expect_equal(out, c(2, 4, 1, 3)) - + out <- convertSingleRank(c(2, 4, 1, 3)) expect_equal(out, c(3, 1, 4, 2)) }) @@ -19,27 +19,31 @@ test_that("convertSingleRank converts rank from ordering to ranking and vice-ver test_that("convertRank converts rank in vector", { out <- convertRank(1:4) expect_equal(out, 1:4) - + out <- convertRank(c(2, 1, 4, 3)) expect_equal(out, c(2, 1, 4, 3)) - + out <- convertRank(c(3, 1, 4, 2)) expect_equal(out, c(2, 4, 1, 3)) - + out <- convertRank(c(2, 4, 1, 3)) expect_equal(out, c(3, 1, 4, 2)) }) test_that("convertRank converts rank in matrix", { - x <- matrix(c(1, 2, 3, 4, - 2, 1, 4, 3, - 3, 1, 4, 2, - 2, 4, 1, 3), ncol = 4, byrow = TRUE) + x <- matrix(c( + 1, 2, 3, 4, + 2, 1, 4, 3, + 3, 1, 4, 2, + 2, 4, 1, 3 + ), ncol = 4, byrow = TRUE) out <- convertRank(x) - expectedOut <- matrix(c(1:4, c(2, 1, 4, 3), - c(2, 4, 1, 3), c(3, 1, 4, 2)), 
ncol = 4, byrow = TRUE) + expectedOut <- matrix(c( + 1:4, c(2, 1, 4, 3), + c(2, 4, 1, 3), c(3, 1, 4, 2) + ), ncol = 4, byrow = TRUE) expect_equal(out, expectedOut) }) @@ -84,23 +88,31 @@ test_that("completeRank compeltes the rank when only 1 element is missing", { test_that("frequence works", { - X <- matrix(c(rep(1:4, 5), - rep(4:1, 3)), ncol = 4, byrow = TRUE) + X <- matrix(c( + rep(1:4, 5), + rep(4:1, 3) + ), ncol = 4, byrow = TRUE) out <- frequence(X) - - expectedOut <- matrix(c(1:4, 5, - 4:1, 3), ncol = 5, byrow = TRUE) + + expectedOut <- matrix(c( + 1:4, 5, + 4:1, 3 + ), ncol = 5, byrow = TRUE) expect_equal(out, expectedOut) }) test_that("unfrequence works", { - X <- matrix(c(1:4, 5, - 4:1, 3), ncol = 5, byrow = TRUE) + X <- matrix(c( + 1:4, 5, + 4:1, 3 + ), ncol = 5, byrow = TRUE) out <- unfrequence(X) - - expectedOut <- matrix(c(rep(1:4, 5), - rep(4:1, 3)), ncol = 4, byrow = TRUE) - + + expectedOut <- matrix(c( + rep(1:4, 5), + rep(4:1, 3) + ), ncol = 4, byrow = TRUE) + expect_equal(out, expectedOut) -}) \ No newline at end of file +}) diff --git a/tests/testthat/test.rankclust.R b/tests/testthat/test.rankclust.R index 3ecd50f..554f73b 100644 --- a/tests/testthat/test.rankclust.R +++ b/tests/testthat/test.rankclust.R @@ -4,10 +4,10 @@ context("rankclust") test_that("rankclust works on simulated data K=1", { set.seed(42) - + x <- simulISR(200, 0.8, 1:4) res <- rankclust(x) - + expect_s4_class(res, "Rankclust") expect_equal(res@K, 1) expect_equal(res@criterion, "bic") @@ -24,10 +24,10 @@ test_that("rankclust works on simulated data K=1", { test_that("rankclust works on simulated data K>1", { set.seed(42) - + x <- rbind(simulISR(200, 0.9, 1:4), simulISR(150, 0.9, c(2, 4, 3, 1))) res <- rankclust(x, K = 1:3) - + expect_s4_class(res, "Rankclust") expect_equal(res@K, 1:3) expect_equal(res@criterion, "bic") @@ -40,10 +40,10 @@ test_that("rankclust works on simulated data K>1", { test_that("rankclust finds the right parameters on a simple case", { set.seed(42) - + x <- rbind(simulISR(200, 0.95, 1:4), simulISR(150, 0.95, 4:1)) res <- rankclust(x, K = 2) - + expect_s4_class(res, "Rankclust") expect_equal(res@K, 2) expect_equal(res@criterion, "bic") @@ -52,7 +52,7 @@ test_that("rankclust finds the right parameters on a simple case", { expect_true(res@results[[1]]@convergence) expect_false(res@results[[1]]@partial) expect_equivalent(res@results[[1]]@mu, matrix(c(1:4, 4:1), nrow = 2, byrow = TRUE)) - expect_equal(res@results[[1]]@proportion, c(200, 150)/350, tolerance = 2e-2) + expect_equal(res@results[[1]]@proportion, c(200, 150) / 350, tolerance = 2e-2) expect_equivalent(res@results[[1]]@pi, c(0.95, 0.95), tolerance = 2e-2) expect_gte(mean(res@results[[1]]@tik > 0.95) * 2, 0.9) }) @@ -60,14 +60,14 @@ test_that("rankclust finds the right parameters on a simple case", { test_that("rankclust works on simulated data with tied", { set.seed(42) - + x <- simulISR(200, 0.9, 1:4) x[1, ] <- c(2, 1, 3, 3) x[2, ] <- c(1, 2, 2, 2) x[3, ] <- c(1, 1, 1, 1) x[4, ] <- c(1, 1, 3, 3) res <- rankclust(x, K = 1, criterion = "icl") - + expect_s4_class(res, "Rankclust") expect_equal(res@K, 1) expect_equal(res@criterion, "icl") @@ -85,10 +85,10 @@ test_that("rankclust works on simulated data with tied", { test_that("rankclust works on simulated data K>1", { set.seed(42) - + x <- rbind(simulISR(200, 0.9, 1:4), simulISR(150, 0.9, c(2, 4, 3, 1))) res <- rankclust(x, K = 1:3) - + expect_s4_class(res, "Rankclust") expect_equal(res@K, 1:3) expect_equal(res@criterion, "bic") diff --git a/tests/testthat/test.test.R 
b/tests/testthat/test.test.R index c7fc5e4..a37a487 100644 --- a/tests/testthat/test.test.R +++ b/tests/testthat/test.test.R @@ -16,14 +16,18 @@ test_that("kullback-leibler divergence is large for different parameters", { set.seed(42) proportion1 <- c(0.4, 0.6) pi1 <- matrix(c(0.8, 0.75), nrow = 2) - mu1 <- matrix(c(1, 2, 3, 4, - 4, 2, 1, 3), nrow = 2, byrow = TRUE) - + mu1 <- matrix(c( + 1, 2, 3, 4, + 4, 2, 1, 3 + ), nrow = 2, byrow = TRUE) + proportion2 <- c(0.8, 0.2) pi2 <- matrix(c(0.9, 0.9), nrow = 2) - mu2 <- matrix(c(4, 3, 2, 1, - 1, 3, 4, 2), nrow = 2, byrow = TRUE) - + mu2 <- matrix(c( + 4, 3, 2, 1, + 1, 3, 4, 2 + ), nrow = 2, byrow = TRUE) + dK <- kullback(proportion1, pi1, mu1, proportion2, pi2, mu2, 4) expect_gt(dK, 1.5) }) @@ -33,14 +37,18 @@ test_that("kullback-leibler divergence is close to 0 for close parameters", { set.seed(42) proportion1 <- c(0.4, 0.6) pi1 <- matrix(c(0.8, 0.75), nrow = 2) - mu1 <- matrix(c(1, 2, 3, 4, - 4, 2, 1, 3), nrow = 2, byrow = TRUE) - + mu1 <- matrix(c( + 1, 2, 3, 4, + 4, 2, 1, 3 + ), nrow = 2, byrow = TRUE) + proportion2 <- c(0.42, 0.58) pi2 <- matrix(c(0.85, 0.7), nrow = 2) - mu2 <- matrix(c(1, 2, 3, 4, - 4, 2, 1, 3), nrow = 2, byrow = TRUE) - + mu2 <- matrix(c( + 1, 2, 3, 4, + 4, 2, 1, 3 + ), nrow = 2, byrow = TRUE) + dK <- kullback(proportion1, pi1, mu1, proportion2, pi2, mu2, 4) expect_lt(dK, 0.3) }) @@ -49,11 +57,15 @@ test_that("khi2 test has a p-value greater than 0.05 for data from given paramet set.seed(42) proportion <- c(0.4, 0.6) pi <- c(0.8, 0.75) - mu <- matrix(c(1, 2, 3, 4, - 4, 2, 1, 3), nrow = 2, byrow = TRUE) - - data <- rbind(simulISR(proportion[1] * 500, pi[1], mu[1, ]), - simulISR(proportion[2] * 500, pi[2], mu[2, ])) + mu <- matrix(c( + 1, 2, 3, 4, + 4, 2, 1, 3 + ), nrow = 2, byrow = TRUE) + + data <- rbind( + simulISR(proportion[1] * 500, pi[1], mu[1, ]), + simulISR(proportion[2] * 500, pi[2], mu[2, ]) + ) pval <- khi2(data, proportion, mu, pi) expect_gt(pval, 0.05) @@ -63,12 +75,16 @@ test_that("khi2 test has a p-value lower than 0.05 for data from other parameter set.seed(42) proportion <- c(0.4, 0.6) pi <- c(0.8, 0.75) - mu <- matrix(c(1, 2, 3, 4, - 4, 2, 1, 3), nrow = 2, byrow = TRUE) - - data <- rbind(simulISR(0.8 * 500, 0.9, c(4, 3, 2, 1)), - simulISR(0.2 * 500, 0.9, c(3, 1, 4, 2))) + mu <- matrix(c( + 1, 2, 3, 4, + 4, 2, 1, 3 + ), nrow = 2, byrow = TRUE) + + data <- rbind( + simulISR(0.8 * 500, 0.9, c(4, 3, 2, 1)), + simulISR(0.2 * 500, 0.9, c(3, 1, 4, 2)) + ) pval <- khi2(data, proportion, mu, pi) - + expect_lt(pval, 0.05) })
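For readers skimming the new test files, here is a minimal usage sketch of the two R-level functions exercised above, khi2() and kullback(). The argument order mirrors the calls in the tests; khi2() estimates its p-value by simulation, so exact values vary with the seed.

set.seed(1)
mu   <- matrix(c(1, 2, 3, 4,
                 4, 2, 1, 3), nrow = 2, byrow = TRUE)
pi   <- c(0.8, 0.75)
prop <- c(0.4, 0.6)
data <- rbind(simulISR(prop[1] * 500, pi[1], mu[1, ]),
              simulISR(prop[2] * 500, pi[2], mu[2, ]))
khi2(data, prop, mu, pi)                        # goodness-of-fit p-value, typically above 0.05 here
kullback(prop, matrix(pi, nrow = 2), mu,
         prop, matrix(pi, nrow = 2), mu, 4)     # essentially 0 for two identical parameter sets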