test: add tests for KL divergence and khi2 test (#12)

* tests: test estimation in simple case * test: add tests for kublack leibler and khi2 test
modal-inria · Oct 15, 2021 · d3449b3 · d3449b3
1 parent 72a30df
commit d3449b3
Show file tree

Hide file tree

Showing 9 changed files with 116 additions and 19 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -24,4 +24,4 @@ LinkingTo: Rcpp, RcppEigen
 Suggests: knitr, rmarkdown, testthat
 VignetteBuilder: knitr
 Encoding: UTF-8
-RoxygenNote: 7.1.1
+RoxygenNote: 7.1.2
diff --git a/R/data.R b/R/data.R
@@ -1,6 +1,5 @@
 # DOCUMENTATION FOR DATA SET
 
-
 #' @title Rank data: APA
 #' @docType data
 #' @aliases APA

diff --git a/R/rankclust.R b/R/rankclust.R
@@ -2,7 +2,7 @@
 #' 
 #' @description This functions estimates a clustering of ranking data, potentially multivariate, partial and containing tied, based on a mixture of multivariate ISR model [2].
 #' By specifying only one cluster, the function performs a modelling of the ranking data using the multivariate ISR model.
-#' The estimation is performed thanks to a SEM-Gibbs algorithm in the general case.
+#' The estimation is performed thanks to a SEM-Gibbs algorithm.
 #'
 #' 
 #' @param data a matrix in which each row is a ranking (partial or not; for partial ranking,
@@ -12,7 +12,7 @@
 #' @param m a vector composed of the sizes of the rankings of each dimension (default value is the number of column of the matrix data).
 #' @param K an integer or a vector of integer with the number of clusters.
 #' @param criterion criterion "bic" or "icl", criterion to minimize for selecting the number of clusters.
-#' @param Qsem the total number of iterations for the SEM algorithm (defaut value=40).
+#' @param Qsem the total number of iterations for the SEM algorithm (default value=40).
 #' @param Bsem burn-in period for SEM algorithm (default value=10).
 #' @param RjSE a vector containing, for each dimension, the number of iterations of the Gibbs sampler
 #' used both in the SE step for partial rankings and for the presentation orders generation (default value=mj(mj-1)/2).
@@ -24,7 +24,7 @@
 #' @param run number of runs of the algorithm for each value of K.
 #' @param detail boolean, if TRUE, time and others informations will be print during the process (default value FALSE).
 #'
-#' @return An object of class rankclust (See \code{\link{Output-class}} and \code{\link{Rankclust-class}}).
+#' @return An object of class Rankclust (See \code{\link{Output-class}} and \code{\link{Rankclust-class}}).
 #' If the output object is named \code{res}. You can access the result by res[number of groups]@@slotName where \code{slotName} is an element of the class Output.
 #'
 #' 
@@ -33,10 +33,11 @@
 #'
 #' - missing positions are replaced by 0
 #'
-#' - tied are replaced by the lowest position they share"
+#' - tied are replaced by the lowest position they share
 #'
+#' See the vignette dataFormat for mode details (\code{RShowDoc("dataFormat", package = "Rankcluster")}).
 #'
-#'   The ranking representation r=(r_1,...,r_m) contains the
+#' The ranking representation r=(r_1,...,r_m) contains the
 #' ranks assigned to the objects, and means that the ith
 #' object is in r_ith position.
 #'

diff --git a/R/test.R b/R/test.R
@@ -7,7 +7,7 @@
 #' @param mu1,mu2 matrices of size K*sum(m), containing the modal ranks. Each row contains the modal rank for a cluster. In the case of multivariate ranks, the reference rank for each dimension are set successively on the same row.
 #' @param m a vector containing the size of ranks for each dimension.
 #' 
-#' @return a real, the Kullback-Leibler divergence.
+#' @return the Kullback-Leibler divergence.
 #' 
 #' @references
 #' http://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
@@ -16,9 +16,11 @@
 #' proportion1 <- c(0.4, 0.6)
 #' pi1 <- matrix(c(0.8, 0.75), nrow = 2)
 #' mu1 <- matrix(c(1, 2, 3, 4, 4, 2, 1, 3), nrow = 2, byrow = TRUE)
+#' 
 #' proportion2 <- c(0.43, 0.57)
 #' pi2 <- matrix(c(0.82, 0.7), nrow = 2)
 #' mu2 <- matrix(c(1, 2, 3, 4, 4, 2, 1, 3), nrow = 2, byrow = TRUE)
+#' 
 #' dK <- kullback(proportion1, pi1, mu1, proportion2, pi2, mu2, 4)
 #' 
 #' @author Quentin Grimonprez
@@ -83,7 +85,7 @@ checkKullback <- function(proportion1, pi1, mu1, proportion2, pi2, mu2, m)
 #' 
 #' @param data a matrix in which each row is a rank of size m.
 #' @param proportion a vector (which sums to 1) containing the K mixture proportion.
-#' @param pi a vector of size K, where K is the number of clusters, containing the probabilities of a good paired comparaison of the model (dispersion parameters).
+#' @param pi a vector of size K, where K is the number of clusters, containing the probabilities of a good paired comparison of the model (dispersion parameters).
 #' @param mu a matrix of size K*m, where m is the size of a rank, containing the modal rankings of the model (position parameters).
 #' @param nBoot number of bootstrap iterations used to estimate the khi2 adequation test p-value.
 #' 
@@ -95,15 +97,14 @@ checkKullback <- function(proportion1, pi1, mu1, proportion2, pi2, mu2, m)
 #' mu <- matrix(c(1, 2, 3, 4, 4, 2, 1, 3), nrow = 2, byrow = TRUE)
 #' # simulate a data set with declared parameters.
 #' data <- rbind(simulISR(proportion[1] * 100, pi[1], mu[1, ]),
-#' simulISR(proportion[2] * 100, pi[2], mu[2, ]))
+#'               simulISR(proportion[2] * 100, pi[2], mu[2, ]))
 #' pval <- khi2(data, proportion, mu, pi)
 #' 
 #' @author Quentin Grimonprez
 #' 
 #' @export
 khi2 <- function(data, proportion, mu, pi, nBoot = 1000)
 {
-
   checkKhi2(data, proportion, mu, pi, nBoot)
 
   pval = .Call("adkhi2partial", data, pi, proportion, mu, nBoot, PACKAGE = "Rankcluster")

diff --git a/man/khi2.Rd b/man/khi2.Rd
diff --git a/man/kullback.Rd b/man/kullback.Rd
diff --git a/man/rankclust.Rd b/man/rankclust.Rd
diff --git a/tests/testthat/test.rankclust.R b/tests/testthat/test.rankclust.R
@@ -38,6 +38,25 @@ test_that("rankclust works on simulated data K>1", {
   expect_equivalent(res@results[[2]]@mu, matrix(c(1:4, 2, 4, 3, 1), nrow = 2, byrow = TRUE))
 })
 
+test_that("rankclust finds the right parameters on a simple case", {
+  set.seed(42)
+
+  x <- rbind(simulISR(200, 0.95, 1:4), simulISR(150, 0.95, 4:1))
+  res <- rankclust(x, K = 2)
+
+  expect_s4_class(res, "Rankclust")
+  expect_equal(res@K, 2)
+  expect_equal(res@criterion, "bic")
+  expect_true(res@convergence)
+  expect_length(res@results, 1)
+  expect_true(res@results[[1]]@convergence)
+  expect_false(res@results[[1]]@partial)
+  expect_equivalent(res@results[[1]]@mu, matrix(c(1:4, 4:1), nrow = 2, byrow = TRUE))
+  expect_equal(res@results[[1]]@proportion, c(200, 150)/350, tolerance = 2e-2)
+  expect_equivalent(res@results[[1]]@pi, c(0.95, 0.95), tolerance = 2e-2)
+  expect_gte(mean(res@results[[1]]@tik > 0.95) * 2, 0.9)
+})
+
 
 test_that("rankclust works on simulated data with tied", {
   set.seed(42)

diff --git a/tests/testthat/test.test.R b/tests/testthat/test.test.R
@@ -0,0 +1,74 @@
+# Author: Quentin Grimonprez
+
+context("Test functions")
+
+test_that("kullback-leibler divergence is close to 0 for same parameters", {
+  set.seed(42)
+  proportion1 <- c(0.4, 0.6)
+  pi1 <- matrix(c(0.8, 0.75), nrow = 2)
+  mu1 <- matrix(c(1, 2, 3, 4, 4, 2, 1, 3), nrow = 2, byrow = TRUE)
+
+  dK <- kullback(proportion1, pi1, mu1, proportion1, pi1, mu1, 4)
+  expect_equal(dK, 0, tolerance = 1e-2)
+})
+
+test_that("kullback-leibler divergence is large for different parameters", {
+  set.seed(42)
+  proportion1 <- c(0.4, 0.6)
+  pi1 <- matrix(c(0.8, 0.75), nrow = 2)
+  mu1 <- matrix(c(1, 2, 3, 4, 
+                  4, 2, 1, 3), nrow = 2, byrow = TRUE)
+
+  proportion2 <- c(0.8, 0.2)
+  pi2 <- matrix(c(0.9, 0.9), nrow = 2)
+  mu2 <- matrix(c(4, 3, 2, 1, 
+                  1, 3, 4, 2), nrow = 2, byrow = TRUE)
+
+  dK <- kullback(proportion1, pi1, mu1, proportion2, pi2, mu2, 4)
+  expect_gt(dK, 1.5)
+})
+
+
+test_that("kullback-leibler divergence is close to 0 for close parameters", {
+  set.seed(42)
+  proportion1 <- c(0.4, 0.6)
+  pi1 <- matrix(c(0.8, 0.75), nrow = 2)
+  mu1 <- matrix(c(1, 2, 3, 4, 
+                  4, 2, 1, 3), nrow = 2, byrow = TRUE)
+
+  proportion2 <- c(0.42, 0.58)
+  pi2 <- matrix(c(0.85, 0.7), nrow = 2)
+  mu2 <- matrix(c(1, 2, 3, 4, 
+                  4, 2, 1, 3), nrow = 2, byrow = TRUE)
+
+  dK <- kullback(proportion1, pi1, mu1, proportion2, pi2, mu2, 4)
+  expect_lt(dK, 0.3)
+})
+
+test_that("khi2 test has a p-value greater than 0.05 for data from given parameters", {
+  set.seed(42)
+  proportion <- c(0.4, 0.6)
+  pi <- c(0.8, 0.75)
+  mu <- matrix(c(1, 2, 3, 4, 
+                 4, 2, 1, 3), nrow = 2, byrow = TRUE)
+
+  data <- rbind(simulISR(proportion[1] * 500, pi[1], mu[1, ]),
+                simulISR(proportion[2] * 500, pi[2], mu[2, ]))
+  pval <- khi2(data, proportion, mu, pi)
+
+  expect_gt(pval, 0.05)
+})
+
+test_that("khi2 test has a p-value lower than 0.05 for data from other parameters", {
+  set.seed(42)
+  proportion <- c(0.4, 0.6)
+  pi <- c(0.8, 0.75)
+  mu <- matrix(c(1, 2, 3, 4, 
+                 4, 2, 1, 3), nrow = 2, byrow = TRUE)
+
+  data <- rbind(simulISR(0.8 * 500, 0.9, c(4, 3, 2, 1)),
+                simulISR(0.2 * 500, 0.9, c(3, 1, 4, 2)))
+  pval <- khi2(data, proportion, mu, pi)
+
+  expect_lt(pval, 0.05)
+})