Skip to content

Commit

Permalink
test: add tests for KL divergence and khi2 test (#12)
Browse files Browse the repository at this point in the history
* tests: test estimation in simple case

* test: add tests for kublack leibler and khi2 test
  • Loading branch information
Quentin62 authored Oct 15, 2021
1 parent 72a30df commit d3449b3
Show file tree
Hide file tree
Showing 9 changed files with 116 additions and 19 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,4 @@ LinkingTo: Rcpp, RcppEigen
Suggests: knitr, rmarkdown, testthat
VignetteBuilder: knitr
Encoding: UTF-8
RoxygenNote: 7.1.1
RoxygenNote: 7.1.2
1 change: 0 additions & 1 deletion R/data.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# DOCUMENTATION FOR DATA SET


#' @title Rank data: APA
#' @docType data
#' @aliases APA
Expand Down
11 changes: 6 additions & 5 deletions R/rankclust.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#'
#' @description This functions estimates a clustering of ranking data, potentially multivariate, partial and containing tied, based on a mixture of multivariate ISR model [2].
#' By specifying only one cluster, the function performs a modelling of the ranking data using the multivariate ISR model.
#' The estimation is performed thanks to a SEM-Gibbs algorithm in the general case.
#' The estimation is performed thanks to a SEM-Gibbs algorithm.
#'
#'
#' @param data a matrix in which each row is a ranking (partial or not; for partial ranking,
Expand All @@ -12,7 +12,7 @@
#' @param m a vector composed of the sizes of the rankings of each dimension (default value is the number of column of the matrix data).
#' @param K an integer or a vector of integer with the number of clusters.
#' @param criterion criterion "bic" or "icl", criterion to minimize for selecting the number of clusters.
#' @param Qsem the total number of iterations for the SEM algorithm (defaut value=40).
#' @param Qsem the total number of iterations for the SEM algorithm (default value=40).
#' @param Bsem burn-in period for SEM algorithm (default value=10).
#' @param RjSE a vector containing, for each dimension, the number of iterations of the Gibbs sampler
#' used both in the SE step for partial rankings and for the presentation orders generation (default value=mj(mj-1)/2).
Expand All @@ -24,7 +24,7 @@
#' @param run number of runs of the algorithm for each value of K.
#' @param detail boolean, if TRUE, time and others informations will be print during the process (default value FALSE).
#'
#' @return An object of class rankclust (See \code{\link{Output-class}} and \code{\link{Rankclust-class}}).
#' @return An object of class Rankclust (See \code{\link{Output-class}} and \code{\link{Rankclust-class}}).
#' If the output object is named \code{res}. You can access the result by res[number of groups]@@slotName where \code{slotName} is an element of the class Output.
#'
#'
Expand All @@ -33,10 +33,11 @@
#'
#' - missing positions are replaced by 0
#'
#' - tied are replaced by the lowest position they share"
#' - tied are replaced by the lowest position they share
#'
#' See the vignette dataFormat for mode details (\code{RShowDoc("dataFormat", package = "Rankcluster")}).
#'
#' The ranking representation r=(r_1,...,r_m) contains the
#' The ranking representation r=(r_1,...,r_m) contains the
#' ranks assigned to the objects, and means that the ith
#' object is in r_ith position.
#'
Expand Down
9 changes: 5 additions & 4 deletions R/test.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#' @param mu1,mu2 matrices of size K*sum(m), containing the modal ranks. Each row contains the modal rank for a cluster. In the case of multivariate ranks, the reference rank for each dimension are set successively on the same row.
#' @param m a vector containing the size of ranks for each dimension.
#'
#' @return a real, the Kullback-Leibler divergence.
#' @return the Kullback-Leibler divergence.
#'
#' @references
#' http://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
Expand All @@ -16,9 +16,11 @@
#' proportion1 <- c(0.4, 0.6)
#' pi1 <- matrix(c(0.8, 0.75), nrow = 2)
#' mu1 <- matrix(c(1, 2, 3, 4, 4, 2, 1, 3), nrow = 2, byrow = TRUE)
#'
#' proportion2 <- c(0.43, 0.57)
#' pi2 <- matrix(c(0.82, 0.7), nrow = 2)
#' mu2 <- matrix(c(1, 2, 3, 4, 4, 2, 1, 3), nrow = 2, byrow = TRUE)
#'
#' dK <- kullback(proportion1, pi1, mu1, proportion2, pi2, mu2, 4)
#'
#' @author Quentin Grimonprez
Expand Down Expand Up @@ -83,7 +85,7 @@ checkKullback <- function(proportion1, pi1, mu1, proportion2, pi2, mu2, m)
#'
#' @param data a matrix in which each row is a rank of size m.
#' @param proportion a vector (which sums to 1) containing the K mixture proportion.
#' @param pi a vector of size K, where K is the number of clusters, containing the probabilities of a good paired comparaison of the model (dispersion parameters).
#' @param pi a vector of size K, where K is the number of clusters, containing the probabilities of a good paired comparison of the model (dispersion parameters).
#' @param mu a matrix of size K*m, where m is the size of a rank, containing the modal rankings of the model (position parameters).
#' @param nBoot number of bootstrap iterations used to estimate the khi2 adequation test p-value.
#'
Expand All @@ -95,15 +97,14 @@ checkKullback <- function(proportion1, pi1, mu1, proportion2, pi2, mu2, m)
#' mu <- matrix(c(1, 2, 3, 4, 4, 2, 1, 3), nrow = 2, byrow = TRUE)
#' # simulate a data set with declared parameters.
#' data <- rbind(simulISR(proportion[1] * 100, pi[1], mu[1, ]),
#' simulISR(proportion[2] * 100, pi[2], mu[2, ]))
#' simulISR(proportion[2] * 100, pi[2], mu[2, ]))
#' pval <- khi2(data, proportion, mu, pi)
#'
#' @author Quentin Grimonprez
#'
#' @export
khi2 <- function(data, proportion, mu, pi, nBoot = 1000)
{

checkKhi2(data, proportion, mu, pi, nBoot)

pval = .Call("adkhi2partial", data, pi, proportion, mu, nBoot, PACKAGE = "Rankcluster")
Expand Down
4 changes: 2 additions & 2 deletions man/khi2.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion man/kullback.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 6 additions & 5 deletions man/rankclust.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 19 additions & 0 deletions tests/testthat/test.rankclust.R
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,25 @@ test_that("rankclust works on simulated data K>1", {
expect_equivalent(res@results[[2]]@mu, matrix(c(1:4, 2, 4, 3, 1), nrow = 2, byrow = TRUE))
})

test_that("rankclust finds the right parameters on a simple case", {
set.seed(42)

x <- rbind(simulISR(200, 0.95, 1:4), simulISR(150, 0.95, 4:1))
res <- rankclust(x, K = 2)

expect_s4_class(res, "Rankclust")
expect_equal(res@K, 2)
expect_equal(res@criterion, "bic")
expect_true(res@convergence)
expect_length(res@results, 1)
expect_true(res@results[[1]]@convergence)
expect_false(res@results[[1]]@partial)
expect_equivalent(res@results[[1]]@mu, matrix(c(1:4, 4:1), nrow = 2, byrow = TRUE))
expect_equal(res@results[[1]]@proportion, c(200, 150)/350, tolerance = 2e-2)
expect_equivalent(res@results[[1]]@pi, c(0.95, 0.95), tolerance = 2e-2)
expect_gte(mean(res@results[[1]]@tik > 0.95) * 2, 0.9)
})


test_that("rankclust works on simulated data with tied", {
set.seed(42)
Expand Down
74 changes: 74 additions & 0 deletions tests/testthat/test.test.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Author: Quentin Grimonprez

context("Test functions")

test_that("kullback-leibler divergence is close to 0 for same parameters", {
set.seed(42)
proportion1 <- c(0.4, 0.6)
pi1 <- matrix(c(0.8, 0.75), nrow = 2)
mu1 <- matrix(c(1, 2, 3, 4, 4, 2, 1, 3), nrow = 2, byrow = TRUE)

dK <- kullback(proportion1, pi1, mu1, proportion1, pi1, mu1, 4)
expect_equal(dK, 0, tolerance = 1e-2)
})

test_that("kullback-leibler divergence is large for different parameters", {
set.seed(42)
proportion1 <- c(0.4, 0.6)
pi1 <- matrix(c(0.8, 0.75), nrow = 2)
mu1 <- matrix(c(1, 2, 3, 4,
4, 2, 1, 3), nrow = 2, byrow = TRUE)

proportion2 <- c(0.8, 0.2)
pi2 <- matrix(c(0.9, 0.9), nrow = 2)
mu2 <- matrix(c(4, 3, 2, 1,
1, 3, 4, 2), nrow = 2, byrow = TRUE)

dK <- kullback(proportion1, pi1, mu1, proportion2, pi2, mu2, 4)
expect_gt(dK, 1.5)
})


test_that("kullback-leibler divergence is close to 0 for close parameters", {
set.seed(42)
proportion1 <- c(0.4, 0.6)
pi1 <- matrix(c(0.8, 0.75), nrow = 2)
mu1 <- matrix(c(1, 2, 3, 4,
4, 2, 1, 3), nrow = 2, byrow = TRUE)

proportion2 <- c(0.42, 0.58)
pi2 <- matrix(c(0.85, 0.7), nrow = 2)
mu2 <- matrix(c(1, 2, 3, 4,
4, 2, 1, 3), nrow = 2, byrow = TRUE)

dK <- kullback(proportion1, pi1, mu1, proportion2, pi2, mu2, 4)
expect_lt(dK, 0.3)
})

test_that("khi2 test has a p-value greater than 0.05 for data from given parameters", {
set.seed(42)
proportion <- c(0.4, 0.6)
pi <- c(0.8, 0.75)
mu <- matrix(c(1, 2, 3, 4,
4, 2, 1, 3), nrow = 2, byrow = TRUE)

data <- rbind(simulISR(proportion[1] * 500, pi[1], mu[1, ]),
simulISR(proportion[2] * 500, pi[2], mu[2, ]))
pval <- khi2(data, proportion, mu, pi)

expect_gt(pval, 0.05)
})

test_that("khi2 test has a p-value lower than 0.05 for data from other parameters", {
set.seed(42)
proportion <- c(0.4, 0.6)
pi <- c(0.8, 0.75)
mu <- matrix(c(1, 2, 3, 4,
4, 2, 1, 3), nrow = 2, byrow = TRUE)

data <- rbind(simulISR(0.8 * 500, 0.9, c(4, 3, 2, 1)),
simulISR(0.2 * 500, 0.9, c(3, 1, 4, 2)))
pval <- khi2(data, proportion, mu, pi)

expect_lt(pval, 0.05)
})

0 comments on commit d3449b3

Please sign in to comment.