diff --git a/DESCRIPTION b/DESCRIPTION index c483e41c..9c40c6a4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,8 +2,8 @@ Package: sjstats Type: Package Encoding: UTF-8 Title: Collection of Convenient Functions for Common Statistical Computations -Version: 0.16.0.9000 -Date: 2018-08-02 +Version: 0.17.0.9000 +Date: 2018-08-20 Authors@R: person("Daniel", "Lüdecke", role = c("aut", "cre"), email = "d.luedecke@uke.de", comment = c(ORCID = "0000-0002-8895-3206")) Maintainer: Daniel Lüdecke Description: Collection of convenient functions for common statistical computations, diff --git a/R/overdisp.R b/R/overdisp.R index c87a890e..a448bc1c 100644 --- a/R/overdisp.R +++ b/R/overdisp.R @@ -14,6 +14,7 @@ #' zeros to considered as over- or underfitting zero-counts. A ratio #' between 1 +/- \code{tolerance} are considered as OK, while a ratio #' beyond or below this treshold would indicate over- or underfitting. +#' @param ... Currently not used. #' #' @return For \code{overdisp()}, information on the overdispersion test; for #' \code{zero_count()}, the amount of predicted and observed zeros in diff --git a/R/robust.R b/R/robust.R index 0559a975..8b71d46a 100644 --- a/R/robust.R +++ b/R/robust.R @@ -1,9 +1,9 @@ #' @title Robust standard errors for regression models #' @name robust #' @description \code{robust()} computes robust standard error for regression models. -#' This method calls one of the \code{vcov*()}-functions from the \pkg{sandwich} -#' package for robust covariance matrix estimators. Results are returned -#' as tidy data frame. +#' This method calls one of the \code{vcov*()}-functions from the +#' \pkg{sandwich}-package for robust covariance matrix estimators. Results are +#' returned as tidy data frame. #' \cr \cr #' \code{svy()} is intended to compute standard errors for survey #' designs (complex samples) fitted with regular \code{lm} or @@ -16,10 +16,10 @@ #' from the \pkg{sandwich} package. For \code{svy()}, \code{x} must be #' \code{lm} object, fitted with weights. #' @param vcov.fun String, indicating the name of the \code{vcov*()}-function -#' from the \pkg{sandwich} package, e.g. \code{vcov.fun = "vcovCL"}. +#' from the \pkg{sandwich}-package, e.g. \code{vcov.fun = "vcovCL"}. #' @param vcov.type Character vector, specifying the estimation type for the -#' heteroskedasticity-consistent covariance matrix estimation -#' (see \code{\link[sandwich]{vcovHC}} for details). +#' robust covariance matrix estimation (see \code{\link[sandwich]{vcovHC}} +#' for details). #' @param vcov.args List of named vectors, used as additional arguments that #' are passed down to \code{vcov.fun}. 
#' @param conf.int Logical, \code{TRUE} if confidence intervals based on robust @@ -54,7 +54,7 @@ #' #' confint(fit) #' robust(fit, conf.int = TRUE) -#' robust(fit, vcov = "HC1", conf.int = TRUE) # "HC1" should be Stata default +#' robust(fit, vcov.type = "HC1", conf.int = TRUE) # "HC1" should be Stata default #' #' library(sjmisc) #' # dichtomozize service usage by "service usage yes/no" @@ -65,7 +65,7 @@ #' robust(fit) #' robust(fit, conf.int = TRUE, exponentiate = TRUE) #' -#' @importFrom stats qt +#' @importFrom stats qt pt df.residual qnorm pnorm nobs coef #' @export robust <- function(x, vcov.fun = "vcovHC", vcov.type = c("HC3", "const", "HC", "HC0", "HC1", "HC2", "HC4", "HC4m", "HC5"), vcov.args = NULL, conf.int = FALSE, exponentiate = FALSE) { @@ -85,8 +85,43 @@ robust <- function(x, vcov.fun = "vcovHC", vcov.type = c("HC3", "const", "HC", " se <- sqrt(diag(.vcov)) + dendf <- tryCatch( + stats::df.residual(x), + error = function(x) { NULL }, + warning = function(x) { NULL }, + finally = function(x) { NULL } + ) + + # 2nd try + if (is.null(dendf)) { + dendf <- tryCatch( + summary(x)$df[2], + error = function(x) { NULL }, + warning = function(x) { NULL }, + finally = function(x) { NULL } + ) + } + + # 3rd try + if (is.null(dendf)) { + dendf <- tryCatch( + stats::nobs(x) - length(est), + error = function(x) { NULL }, + warning = function(x) { NULL }, + finally = function(x) { NULL } + ) + } + + t.stat <- est / se - p.value <- 2 * pt(abs(t.stat), df = 3, lower.tail = F) + + if (is.null(dendf)) { + p.value <- 2 * stats::pnorm(abs(t.stat), lower.tail = FALSE) + se.factor <- stats::qnorm(.975) + } else { + p.value <- 2 * stats::pt(abs(t.stat), df = dendf, lower.tail = FALSE) + se.factor <- stats::qt(.975, df = dendf) + } # create tidy data frame @@ -100,13 +135,12 @@ robust <- function(x, vcov.fun = "vcovHC", vcov.type = c("HC3", "const", "HC", " # add CI if (conf.int) { - # denominator df - dendf <- summary(x)$df[2] # add columns with CI result <- add_cols( result, - conf.low = result$estimate - (stats::qt(.975, df = dendf) * result$std.error), - conf.high = result$estimate + (stats::qt(.975, df = dendf) * result$std.error) + conf.low = result$estimate - se.factor * result$std.error, + conf.high = result$estimate + se.factor * result$std.error, + .after = "std.error" ) } diff --git a/R/select_helpers.R b/R/select_helpers.R index e472a4cd..f982fbb1 100644 --- a/R/select_helpers.R +++ b/R/select_helpers.R @@ -44,6 +44,8 @@ add_cols <- function(data, ..., .after = 1) { if (.after < 1) { cbind(dat, data) + } else if (is.infinite(.after)) { + cbind(data, dat) } else { c1 <- 1:.after c2 <- (.after + 1):ncol(data) diff --git a/R/typical.R b/R/typical.R index 3fa79c9e..062f78f1 100644 --- a/R/typical.R +++ b/R/typical.R @@ -17,9 +17,10 @@ #' vector to apply other different functions to numeric and categorical #' \code{x}, where factors are first converted to numeric vectors, e.g. #' \code{fun = c(numeric = "median", factor = "mean")}. See 'Examples'. -#' @param weight.by Name of variable in \code{x} that indicated the vector of +#' @param weights Name of variable in \code{x} that indicated the vector of #' weights that will be applied to weight all observations. Default is #' \code{NULL}, so no weights are used. +#' @param weight.by Deprecated. #' @param ... Further arguments, passed down to \code{fun}. #' #' @inheritParams grpmean @@ -65,14 +66,12 @@ #' #' #' @export -typical_value <- function(x, fun = "mean", weight.by = NULL, ...) 
{ +typical_value <- function(x, fun = "mean", weights = NULL, weight.by, ...) { - ## TODO activate later - - # if (!missing(weight.by)) { - # # message("Argument `weight.by` is deprecated. Please use `weights`.") - # weights <- weight.by - # } + if (!missing(weight.by)) { + message("Argument `weight.by` is deprecated. Please use `weights`.") + weights <- weight.by + } # check if we have named vectors and find the requested function # for special functions for factors, convert to numeric first diff --git a/docs/articles/anova-statistics.html b/docs/articles/anova-statistics.html index 3013498f..32a78e5a 100644 --- a/docs/articles/anova-statistics.html +++ b/docs/articles/anova-statistics.html @@ -30,7 +30,7 @@ sjstats - 0.16.0 + 0.17.0 @@ -89,7 +89,7 @@

Statistics for Anova Tables

Daniel Lüdecke

-

2018-07-10

+

2018-08-20

Source: vignettes/anova-statistics.Rmd @@ -127,12 +127,10 @@

For variables with 1 degree of freedom (in the numerator), the square root of eta-squared is equal to the correlation coefficient r. For variables with more than 1 degree of freedom, eta-squared equals R2. This makes eta-squared easily interpretable. Furthermore, these effect sizes can easily be converted into effect size measures that can be, for instance, further processed in meta-analyses.

Eta-squared can be computed simply with:

+#> term etasq +#> 1 e42dep 0.266 +#> 2 c172code 0.005 +#> 3 c160age 0.048
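The output above corresponds to this call from the vignette's R source (a minimal, self-contained sketch; the model setup is taken from vignettes/anova-statistics.R later in this diff):

library(sjstats)
# load sample data
data(efc)

# fit linear model, as in the vignette setup
fit <- aov(
  c12hour ~ as.factor(e42dep) + as.factor(c172code) + c160age,
  data = efc
)

# eta-squared for all model terms
eta_sq(fit)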

@@ -140,12 +138,10 @@

The partial eta-squared value is the ratio of the sum of squares for each group level to the sum of squares for each group level plus the residual sum of squares. It is more difficult to interpret, because its value strongly depends on the variability of the residuals. Partial eta-squared values should be reported with caution, and Levine and Hullett (2002) recommend reporting eta- or omega-squared rather than partial eta-squared.

Use the partial-argument to compute partial eta-squared values:

+#> term partial.etasq +#> 1 e42dep 0.281 +#> 2 c172code 0.008 +#> 3 c160age 0.066
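The corresponding call from the vignette's R source (`fit` as fitted above):

# partial eta-squared
eta_sq(fit, partial = TRUE)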

@@ -153,60 +149,50 @@

While eta-squared estimates tend to be biased in certain situations, e.g. when the sample size is small or the independent variables have many group levels, omega-squared estimates are corrected for this bias.

Omega-squared can be simply computed with:

+#> term omegasq +#> 1 e42dep 0.263 +#> 2 c172code 0.004 +#> 3 c160age 0.048
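The corresponding call from the vignette's R source:

# omega-squared
omega_sq(fit)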

Partial Omega-Squared

omega_sq() also has a partial-argument to compute partial omega-squared values. Computing the partial omega-squared statistics is based on bootstrapping. In this case, use n to define the number of samples (1000 by default).

+#> term partial.omegasq +#> 1 e42dep 0.278 +#> 2 c172code 0.005 +#> 3 c160age 0.065
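The corresponding call from the vignette's R source:

# partial omega-squared, based on 100 bootstrap samples
omega_sq(fit, partial = TRUE, n = 100)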

Like the other functions, the input may also be an object of class anova, so you can also use model fits from the car package, which allows fitting Anovas with different types of sum of squares:

+#> term power sumsq meansq df statistic p.value etasq partial.etasq omegasq partial.omegasq cohens.f +#> 1 (Intercept) 0.973 26851.070 26851.070 1 15.167 0.000 0.013 0.018 0.012 0.017 0.135 +#> 2 e42dep 1.000 426461.571 142153.857 3 80.299 0.000 0.209 0.224 0.206 0.220 0.537 +#> 3 c172code 0.429 7352.049 3676.025 2 2.076 0.126 0.004 0.005 0.002 0.003 0.071 +#> 4 c160age 1.000 105169.595 105169.595 1 59.408 0.000 0.051 0.066 0.051 0.065 0.267 +#> 5 Residuals NA 1476436.343 1770.307 834 NA NA NA NA NA NA NA
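The table above is produced by anova_stats() applied to a type-3 Anova from the car package, as in the vignette's R source:

# complete effect-size table for a car::Anova() result
anova_stats(car::Anova(fit, type = 3))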

@@ -214,21 +200,17 @@

eta_sq() and omega_sq() have a ci.lvl-argument, which - if not NULL - also computes a confidence interval.

For eta-squared, i.e. eta_sq() with partial = FALSE, due to non-symmetry, confidence intervals are based on bootstrap-methods. Confidence intervals for partial omega-squared, i.e. omega_sq() with partial = TRUE, are also based on bootstrapping. In these cases, n indicates the number of bootstrap samples to be drawn to compute the confidence intervals.

+#> term partial.etasq conf.low conf.high +#> 1 e42dep 0.281 0.247 0.310 +#> 2 c172code 0.008 0.001 0.016 +#> 3 c160age 0.066 0.047 0.089 + +# uses bootstrapping - here, for speed, just 100 samples +omega_sq(fit, partial = TRUE, ci.lvl = .9, n = 100) +#> term partial.omegasq conf.low conf.high +#> 1 e42dep 0.278 0.226 0.331 +#> 2 c172code 0.005 -0.006 0.023 +#> 3 c160age 0.065 0.032 0.101
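The first output block above comes from eta_sq() with a confidence level; the call, as it appears in the vignette's R source, is:

# bootstrapped confidence intervals for partial eta-squared
eta_sq(fit, partial = TRUE, ci.lvl = .8)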

diff --git a/docs/articles/bayesian-statistics.html b/docs/articles/bayesian-statistics.html index 15ca6798..69af59aa 100644 --- a/docs/articles/bayesian-statistics.html +++ b/docs/articles/bayesian-statistics.html @@ -30,7 +30,7 @@ sjstats - 0.16.0 + 0.17.0

@@ -89,7 +89,7 @@

Statistics for Bayesian Models

Daniel Lüdecke

-

2018-07-10

+

2018-08-20

Source: vignettes/bayesian-statistics.Rmd @@ -170,15 +170,15 @@

#> # Highest Density Interval #> #> HDI(90%) -#> b_jobseek_Intercept [ 3.48 3.89] -#> b_depress2_Intercept [ 1.99 2.46] -#> b_jobseek_treat [-0.02 0.15] -#> b_jobseek_econ_hard [ 0.02 0.10] -#> b_jobseek_sex [-0.09 0.08] +#> b_jobseek_Intercept [ 3.47 3.89] +#> b_depress2_Intercept [ 1.94 2.45] +#> b_jobseek_treat [-0.01 0.16] +#> b_jobseek_econ_hard [ 0.01 0.09] +#> b_jobseek_sex [-0.09 0.07] #> b_jobseek_age [ 0.00 0.01] #> b_depress2_treat [-0.11 0.03] #> b_depress2_job_seek [-0.29 -0.19] -#> b_depress2_econ_hard [ 0.12 0.18] +#> b_depress2_econ_hard [ 0.11 0.18] #> b_depress2_sex [ 0.04 0.17] #> b_depress2_age [-0.00 0.00] #> sigma_jobseek [ 0.70 0.76] @@ -189,18 +189,18 @@

#> # Highest Density Interval #> #> HDI(50%) HDI(89%) -#> b_jobseek_Intercept [ 3.58 3.75] [ 3.48 3.88] -#> b_depress2_Intercept [ 2.12 2.33] [ 1.98 2.44] -#> b_jobseek_treat [ 0.04 0.11] [-0.02 0.15] -#> b_jobseek_econ_hard [ 0.03 0.07] [ 0.02 0.09] -#> b_jobseek_sex [-0.04 0.02] [-0.08 0.08] +#> b_jobseek_Intercept [ 3.59 3.76] [ 3.47 3.87] +#> b_depress2_Intercept [ 2.12 2.32] [ 1.96 2.45] +#> b_jobseek_treat [ 0.03 0.10] [-0.01 0.15] +#> b_jobseek_econ_hard [ 0.04 0.07] [ 0.01 0.09] +#> b_jobseek_sex [-0.04 0.02] [-0.09 0.07] #> b_jobseek_age [ 0.00 0.01] [ 0.00 0.01] -#> b_depress2_treat [-0.07 -0.01] [-0.11 0.03] -#> b_depress2_job_seek [-0.26 -0.22] [-0.28 -0.19] -#> b_depress2_econ_hard [ 0.14 0.16] [ 0.12 0.18] +#> b_depress2_treat [-0.07 -0.01] [-0.11 0.02] +#> b_depress2_job_seek [-0.26 -0.22] [-0.29 -0.19] +#> b_depress2_econ_hard [ 0.13 0.16] [ 0.11 0.18] #> b_depress2_sex [ 0.08 0.13] [ 0.04 0.17] #> b_depress2_age [-0.00 0.00] [-0.00 0.00] -#> sigma_jobseek [ 0.72 0.74] [ 0.70 0.75] +#> sigma_jobseek [ 0.71 0.74] [ 0.70 0.76] #> sigma_depress2 [ 0.60 0.62] [ 0.59 0.64]

For multilevel models, the type-argument defines whether the HDI of fixed, random or all effects are shown.

+#> r_e15relat.1.Intercept. [-0.15 1.31] +#> r_e15relat.2.Intercept. [-0.15 1.02] +#> r_e15relat.3.Intercept. [-0.88 0.74] +#> r_e15relat.4.Intercept. [-0.60 0.74] +#> r_e15relat.5.Intercept. [-0.89 0.80] +#> r_e15relat.6.Intercept. [-1.63 0.25] +#> r_e15relat.7.Intercept. [-1.13 0.81] +#> r_e15relat.8.Intercept. [-0.85 0.46]
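A sketch of the call this output corresponds to, assuming m5 is the multilevel brms model from the vignette setup; the argument name type is taken from the paragraph above, all other arguments are left at their defaults:

# HDI of the random effects only
hdi(m5, type = "random")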

The computation for the HDI is based on the code from Kruschke 2015, pp. 727f. For default sampling in Stan (4000 samples), the 90% intervals for HDI are more stable than, for instance, 95% intervals. An effective sample size of at least 10,000 is recommended if 95% intervals are to be computed (see Kruschke 2015, p. 183ff).

rope() does not suggest limits for the region of practical equivalence and does not tell you how large a value still counts as practically equivalent to the null value. However, there are suggestions on how to choose reasonable limits (see Kruschke 2018), which are implemented in the equi_test() function.

@@ -253,14 +253,14 @@

#> Samples: 4000 #> #> H0 %inROPE HDI(95%) -#> b_Intercept (*) reject 0.00 [ 7.64 9.92] -#> b_e42dep2 (*) undecided 8.03 [ 0.11 2.11] -#> b_e42dep3 (*) reject 0.00 [ 1.29 3.26] -#> b_e42dep4 (*) reject 0.00 [ 2.79 4.92] +#> b_Intercept (*) reject 0.00 [ 7.58 9.94] +#> b_e42dep2 (*) undecided 8.47 [ 0.12 2.11] +#> b_e42dep3 (*) reject 0.02 [ 1.24 3.25] +#> b_e42dep4 (*) reject 0.00 [ 2.79 4.91] #> b_c12hour accept 100.00 [ 0.00 0.01] -#> b_c172code2 undecided 73.60 [-0.45 0.78] -#> b_c172code3 undecided 22.95 [-0.09 1.48] -#> sigma reject 0.00 [ 3.40 3.75] +#> b_c172code2 (*) undecided 72.42 [-0.45 0.75] +#> b_c172code3 undecided 23.82 [-0.08 1.50] +#> sigma reject 0.00 [ 3.41 3.77] #> #> (*) the number of effective samples may be insufficient for some parameters

For models with a binary outcome, there is no concrete way to derive the effect size that defines the ROPE limits. Two examples from Kruschke suggest that a negligible change is about .05 on the logit-scale. In these cases, it is recommended to specify the rope argument; however, if it is not specified, the ROPE limits are calculated as 0 +/- .1 * sd(intercept) / 4. For all other models, 0 +/- .1 * sd(intercept) is used to determine the ROPE limits. These formulas are based on experience that has worked well in real-life situations, but they are most likely not generally the best approach.
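As a purely illustrative sketch of the recommendation above: for a hypothetical brms model with a binary outcome (m_logit is a placeholder, not an object from this vignette), the ROPE limits could be set explicitly to the ±.05 change on the logit-scale mentioned in the text:

# explicit ROPE limits on the logit scale (illustrative values only)
equi_test(m_logit, rope = c(-0.05, 0.05))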

@@ -285,16 +285,16 @@

#> ## Conditional Model: #> #> estimate std.error HDI(89%) ratio rhat mcse -#> Intercept 1.18 0.73 [-0.34 2.50] 0.16 1 0.04 -#> child -1.15 0.09 [-1.32 -1.00] 0.80 1 0.00 -#> camper 0.73 0.09 [ 0.58 0.89] 0.90 1 0.00 +#> Intercept 1.25 0.73 [-0.20 2.61] 0.19 1.01 0.03 +#> child -1.15 0.09 [-1.29 -0.99] 0.91 1.00 0.00 +#> camper 0.73 0.09 [ 0.57 0.88] 0.87 1.00 0.00 #> #> ## Zero-Inflated Model: #> #> estimate std.error HDI(89%) ratio rhat mcse -#> Intercept -0.65 0.72 [-1.97 0.56] 0.35 1 0.02 -#> child 1.88 0.34 [ 1.33 2.42] 0.83 1 0.01 -#> camper -0.84 0.35 [-1.36 -0.22] 0.78 1 0.01 +#> Intercept -0.69 0.70 [-1.98 0.49] 0.44 1 0.02 +#> child 1.88 0.35 [ 1.34 2.42] 0.72 1 0.01 +#> camper -0.85 0.38 [-1.41 -0.25] 0.84 1 0.01

Additional statistics in the output are:

-
tidy_stan(m3)
-#> 
-#> # Summary Statistics of Stan-Model
-#> 
-#> ## Conditional Model:
-#> 
-#>            estimate std.error      HDI(89%) ratio rhat mcse
-#>  Intercept     1.24      0.72 [-0.02  2.64]  0.18    1 0.03
-#>  child        -1.14      0.09 [-1.30 -1.00]  0.94    1 0.00
-#>  camper        0.73      0.10 [ 0.57  0.88]  0.61    1 0.00
-#> 
-#> ## Zero-Inflated Model:
-#> 
-#>            estimate std.error      HDI(89%) ratio rhat mcse
-#>  Intercept    -0.66      0.70 [-1.92  0.63]   0.4    1 0.02
-#>  child         1.87      0.33 [ 1.33  2.38]   1.0    1 0.01
-#>  camper       -0.84      0.37 [-1.47 -0.28]   1.0    1 0.01
+

Additional statistics in the output are:

By default, the “estimate” is the median of the posterior distribution, but this can be changed with the typical-argument.

-
tidy_stan(m3, typical = "mean")
-#> 
-#> # Summary Statistics of Stan-Model
-#> 
-#> ## Conditional Model:
-#> 
-#>            estimate std.error      HDI(89%) ratio rhat mcse
-#>  Intercept     1.23      0.72 [-0.02  2.64]  0.18    1 0.03
-#>  child        -1.15      0.09 [-1.30 -1.00]  0.94    1 0.00
-#>  camper        0.73      0.10 [ 0.57  0.88]  0.61    1 0.00
-#> 
-#> ## Zero-Inflated Model:
-#> 
-#>            estimate std.error      HDI(89%) ratio rhat mcse
-#>  Intercept    -0.69      0.70 [-1.92  0.63]   0.4    1 0.02
-#>  child         1.88      0.33 [ 1.33  2.38]   1.0    1 0.01
-#>  camper       -0.85      0.37 [-1.47 -0.28]   1.0    1 0.01
+

To also show random effects of multilevel models, use the type-argument.

-
# printing fixed and random effects of multilevel model
-tidy_stan(m3, type = "all")
-#> 
-#> # Summary Statistics of Stan-Model
-#> 
-#> ## Conditional Model: Fixed effects
-#> 
-#>            estimate std.error      HDI(89%) ratio rhat mcse
-#>  Intercept     1.24      0.72 [-0.02  2.64]  0.18    1 0.03
-#>  child        -1.14      0.09 [-1.30 -1.00]  0.94    1 0.00
-#>  camper        0.73      0.10 [ 0.57  0.88]  0.61    1 0.00
-#> 
-#> ## Conditional Model: Random effect (Intercept: persons)
-#> 
-#>            estimate std.error      HDI(89%) ratio rhat mcse
-#>  persons.1    -1.30      0.73 [-2.68  0.03]  0.20    1 0.03
-#>  persons.2    -0.33      0.71 [-1.65  0.98]  0.18    1 0.03
-#>  persons.3     0.39      0.70 [-1.01  1.67]  0.18    1 0.03
-#>  persons.4     1.28      0.71 [-0.06  2.57]  0.18    1 0.03
-#> 
-#> ## Zero-Inflated Model: Fixed effects
-#> 
-#>            estimate std.error      HDI(89%) ratio rhat mcse
-#>  Intercept    -0.66      0.70 [-1.92  0.63]   0.4    1 0.02
-#>  child         1.87      0.33 [ 1.33  2.38]   1.0    1 0.01
-#>  camper       -0.84      0.37 [-1.47 -0.28]   1.0    1 0.01
-#> 
-#> ## Zero-Inflated Model: Random effect (Intercept: persons)
-#> 
-#>            estimate std.error      HDI(89%) ratio rhat mcse
-#>  persons.1     1.22      0.79 [-0.07  2.64]  0.34    1 0.02
-#>  persons.2     0.32      0.70 [-0.98  1.52]  0.35    1 0.02
-#>  persons.3    -0.15      0.68 [-1.41  1.09]  0.37    1 0.02
-#>  persons.4    -1.25      0.74 [-2.58  0.00]  0.42    1 0.02
+

By default, 89%-HDI are computed (a convention following McElreath 2015), but other or even multiple HDI can be computed using the prob argument.

-
# two different HDI for multivariate response model
-tidy_stan(m2, prob = c(.5, .95))
-#> 
-#> # Summary Statistics of Stan-Model
-#> 
-#> ## Response: jobseek
-#> 
-#>            estimate std.error      HDI(50%)      HDI(95%) ratio rhat mcse
-#>  Intercept     3.67      0.12 [ 3.59  3.75] [ 3.42  3.91]     1    1    0
-#>  treat         0.07      0.05 [ 0.03  0.10] [-0.03  0.17]     1    1    0
-#>  econ_hard     0.05      0.02 [ 0.04  0.07] [ 0.01  0.10]     1    1    0
-#>  sex          -0.01      0.05 [-0.04  0.03] [-0.10  0.08]     1    1    0
-#>  age           0.00      0.00 [ 0.00  0.01] [ 0.00  0.01]     1    1    0
-#> 
-#> ## Response: depress2
-#> 
-#>            estimate std.error      HDI(50%)      HDI(95%) ratio rhat mcse
-#>  Intercept     2.21      0.15 [ 2.13  2.34] [ 1.92  2.50]     1    1    0
-#>  treat        -0.04      0.04 [-0.07 -0.01] [-0.13  0.05]     1    1    0
-#>  joseek       -0.24      0.03 [-0.26 -0.22] [-0.30 -0.18]     1    1    0
-#>  econ_hard     0.15      0.02 [ 0.14  0.16] [ 0.11  0.19]     1    1    0
-#>  sex           0.11      0.04 [ 0.08  0.13] [ 0.03  0.19]     1    1    0
-#>  age           0.00      0.00 [-0.00  0.00] [-0.00  0.00]     1    1    0
+

Summary of Mediation Analysis

mediation() is another summary function, especially for mediation analysis, i.e. for multivariate response models with causal mediation effects.

Let us recall the models:

-
f1 <- bf(job_seek ~ treat + econ_hard + sex + age)
-f2 <- bf(depress2 ~ treat + job_seek + econ_hard + sex + age)
-
-m2 <- brm(f1 + f2 + set_rescor(FALSE), data = jobs, cores = 4)
+

Here, treat is the treatment effect, job_seek is the mediator effect, f1 describes the mediator model and f2 describes the outcome model.

mediation() returns a data frame with information on the direct effect (median value of posterior samples from treatment of the outcome model), mediator effect (median value of posterior samples from mediator of the outcome model), indirect effect (median value of the multiplication of the posterior samples from mediator of the outcome model and the posterior samples from treatment of the mediation model) and the total effect (median value of sums of posterior samples used for the direct and indirect effect). The proportion mediated is the indirect effect divided by the total effect.

The simplest call just needs the model-object.

-
mediation(m2)
-#> 
-#> # Causal Mediation Analysis for Stan Model
-#> 
-#>   Treatment: treat
-#>    Mediator: job_seek
-#>    Response: depress2
-#> 
-#>                  Estimate    HDI (90%)
-#>   Direct effect:    -0.04 [-0.11 0.04]
-#> Indirect effect:    -0.02 [-0.04 0.00]
-#>    Total effect:    -0.06 [-0.13 0.02]
-#> 
-#> Proportion mediated: 27.63% [-68.95% 124.22%]
+

Typically, mediation() finds the treatment and mediator variables automatically. If this does not work, use the treatment and mediator arguments to specify the related variable names. For all values, the 90% HDIs are calculated by default. Use prob to calculate a different interval.

Here is a comparison with the mediation package. Note that the summary()-output of the mediation package shows the indirect effect first, followed by the direct effect.

-
summary(m1)
-#> 
-#> Causal Mediation Analysis 
-#> 
-#> Quasi-Bayesian Confidence Intervals
-#> 
-#>                Estimate 95% CI Lower 95% CI Upper p-value
-#> ACME            -0.0159      -0.0402         0.01    0.17
-#> ADE             -0.0413      -0.1276         0.04    0.37
-#> Total Effect    -0.0572      -0.1487         0.03    0.20
-#> Prop. Mediated   0.2234      -1.4474         3.38    0.33
-#> 
-#> Sample Size Used: 899 
-#> 
-#> 
-#> Simulations: 1000
-
-mediation(m2, prob = .95)
-#> 
-#> # Causal Mediation Analysis for Stan Model
-#> 
-#>   Treatment: treat
-#>    Mediator: job_seek
-#>    Response: depress2
-#> 
-#>                  Estimate    HDI (95%)
-#>   Direct effect:    -0.04 [-0.13 0.05]
-#> Indirect effect:    -0.02 [-0.04 0.01]
-#>    Total effect:    -0.06 [-0.15 0.03]
-#> 
-#> Proportion mediated: 27.63% [-166.36% 221.63%]
+

If you want to calculate mean instead of median values from the posterior samples, use the typical-argument. Furthermore, there is a print()-method, which allows printing more digits.

-
mediation(m2, typical = "mean", prob = .95) %>% print(digits = 4)
-#> 
-#> # Causal Mediation Analysis for Stan Model
-#> 
-#>   Treatment: treat
-#>    Mediator: job_seek
-#>    Response: depress2
-#> 
-#>                  Estimate        HDI (95%)
-#>   Direct effect:  -0.0405 [-0.1313 0.0457]
-#> Indirect effect:  -0.0161 [-0.0412 0.0084]
-#>    Total effect:  -0.0566 [-0.1515 0.0336]
-#> 
-#> Proportion mediated: 28.4526% [-165.542% 222.4472%]
+

As you can see, the results are similar to what the mediation package produces for non-Bayesian models.

ICC for multilevel models

Similar to frequentist multilevel models, icc() computes the intraclass correlation coefficient for Bayesian multilevel models. One advantage of Bayesian regression models is that you can compute the ICC for each sample of the posterior distribution, which allows you to easily calculate uncertainty intervals.

-
icc(m4)
-#> 
-#> # Random Effect Variances and ICC
-#> 
-#> Family: gaussian (identity)
-#> 
-#> ## cyl
-#>           ICC:  0.26  HDI 89%: [0.00  0.70]
-#> Between-group: 17.29  HDI 89%: [0.00 42.24]
-#> 
-#> ## gear
-#>           ICC:  0.51  HDI 89%: [0.00   0.93]
-#> Between-group: 43.84  HDI 89%: [0.00 118.68]
-#> 
-#> ## Residuals
-#> Within-group: 6.32  HDI 89%: [3.32 8.64]
-#> 
-#> ## Random-slope-variance
-#> gear: 5.48  HDI 89%: [0.00 13.60]
-
-icc(m5)
-#> 
-#> # Random Effect Variances and ICC
-#> 
-#> Family: gaussian (identity)
-#> 
-#> ## e15relat
-#>           ICC: 0.04  HDI 89%: [0.00 0.09]
-#> Between-group: 0.62  HDI 89%: [0.00 1.30]
-#> 
-#> ## Residuals
-#> Within-group: 12.83  HDI 89%: [11.82 13.87]
+

For non-Gaussian models, there is no clean variance decomposition and hence the ICC can’t be calculated exactly. The general Bayesian way to analyse the random-effect variances is then to draw samples from the posterior predictive distribution, calculate the variances and compare how the variance across models changes when group-specific terms are included or dropped.

You can achieve this with the ppd-argument. In this case, draws from the posterior predictive distribution not conditioned on group-level terms (using posterior_predict(..., re.form = NA)) as well as draws from this distribution conditioned on all random effects (by default, unless specified otherwise in the re.form-argument) are taken. Then, the variances for each of these draws are calculated. The “ICC” is then the ratio between these two variances.

-
icc(m4, ppd = TRUE, re.form = ~(1 | cyl), prob = .5)
-#> 
-#> # Random Effect Variances and ICC
-#> 
-#> Family: gaussian (identity)
-#> Conditioned on: ~(1 | cyl)
-#> 
-#> ## Variance Ratio (comparable to ICC)
-#> Ratio: 0.09  HDI 50%: [-0.07 0.28]
-#> 
-#> ## Variances of Posterior Predicted Distribution
-#> Conditioned on fixed effects: 36.38  HDI 50%: [22.30 40.72]
-#> Conditioned on rand. effects: 40.85  HDI 50%: [25.31 42.49]
-#> 
-#> ## Difference in Variances
-#> Difference: 4.47  HDI 50%: [-3.04 10.65]
+

Sometimes, when the variance of the posterior predictive distribution is very large, the variance ratio in the output makes no sense, e.g. because it is negative. In such cases, it might help to use a more robust measure to calculate the central tendency of the variances. This can be done with the typical-argument.

-
# the "classical" ICC, not recommended for non-Gaussian
-icc(m3)
-#> 
-#> # Random Effect Variances and ICC
-#> 
-#> Family: zero_inflated_poisson (log)
-#> 
-#> ## persons
-#>           ICC: 0.67  HDI 89%: [0.41 0.94]
-#> Between-group: 3.55  HDI 89%: [0.26 8.06]
-#> 
-#> ## Residuals
-#> Within-group: 1.00  HDI 89%: [1.00 1.00]
-
-# variance ratio based on posterior predictive distributions,
-# which is negative and hence obviously nonsense
-icc(m3, ppd = TRUE)
-#> 
-#> # Random Effect Variances and ICC
-#> 
-#> Family: zero_inflated_poisson (log)
-#> Conditioned on: all random effects
-#> 
-#> ## Variance Ratio (comparable to ICC)
-#> Ratio: -0.23  HDI 89%: [-0.64 1.00]
-#> 
-#> ## Variances of Posterior Predicted Distribution
-#> Conditioned on fixed effects: 48.07  HDI 89%: [ 0.10 63.82]
-#> Conditioned on rand. effects: 39.31  HDI 89%: [30.21 47.27]
-#> 
-#> ## Difference in Variances
-#> Difference: -8.76  HDI 89%: [-26.53 49.70]
-
-# use median instead of mean
-icc(m3, ppd = TRUE, typical = "median")
-#> 
-#> # Random Effect Variances and ICC
-#> 
-#> Family: zero_inflated_poisson (log)
-#> Conditioned on: all random effects
-#> 
-#> ## Variance Ratio (comparable to ICC)
-#> Ratio: 0.72  HDI 89%: [-0.64 1.00]
-#> 
-#> ## Variances of Posterior Predicted Distribution
-#> Conditioned on fixed effects: 10.98  HDI 89%: [ 0.07 64.13]
-#> Conditioned on rand. effects: 39.18  HDI 89%: [31.12 48.07]
-#> 
-#> ## Difference in Variances
-#> Difference: 27.04  HDI 89%: [-26.92 49.25]
+

Bayes r-squared and LOO-adjusted r-squared

r2() computes either the Bayes r-squared value, or - if loo = TRUE - a LOO-adjusted r-squared value (which comes conceptually closer to an adjusted r-squared measure).

For the Bayes r-squared, the standard error is also reported. Note that r2() uses the median as measure of central tendency and the median absolute deviation as measure for variability.

-
r2(m5)
-#> 
-#> R-Squared for Generalized Linear Mixed Model
-#> 
-#>       Bayes R2: 0.168
-#> Standard Error: 0.022
-
-r2(m5, loo = TRUE)
-#> 
-#> R-Squared for Generalized Linear Mixed Model
-#> 
-#> LOO-adjusted R2: 0.146
+

References

diff --git a/inst/doc/mixedmodels-statistics.html b/inst/doc/mixedmodels-statistics.html index 9b59c9c7..07a73d9c 100644 --- a/inst/doc/mixedmodels-statistics.html +++ b/inst/doc/mixedmodels-statistics.html @@ -12,7 +12,7 @@ - + Statistics for Mixed Effects Models @@ -20,46 +20,254 @@ - + @@ -70,7 +278,7 @@

Statistics for Mixed Effects Models

Daniel Lüdecke

-

2018-08-16

+

2018-08-20

@@ -87,55 +295,55 @@

Statistics and Measures for Mixed Effects Models

  • r2()
• Before we start, we fit a simple linear mixed model:

    -
    library(sjstats)
    -library(lme4)
    -# load sample data
    -data(sleepstudy)
    -
    -# fit linear mixed model
    -m <- lmer(Reaction ~ Days + (Days | Subject), data = sleepstudy)
    -
    -sleepstudy$mygrp <- sample(1:45, size = 180, replace = TRUE)
    -m2 <- lmer(Reaction ~ Days + (1 | mygrp) + (1 | Subject), sleepstudy)
    +

    Sample Size Calculation for Mixed Models

The first two functions, deff() and smpsize_lmm(), can be used to approximately calculate the sample size in the context of power calculation. Calculating the sample size for simple linear models is pretty straightforward; for (linear) mixed models, however, statistical power is affected by the change in the variance of the test statistics. This is what Hsieh et al. (2003) call a design effect (or variance inflation factor, VIF). Once this design effect is calculated, the sample size calculated for a standard design can be adjusted accordingly.

    Design Effect for Two-Level Mixed Models

deff() computes this design effect for linear mixed models with a two-level design. It requires the approximate average number of observations per grouping cluster (i.e. level-2 unit) and the assumed intraclass correlation coefficient (ICC) for the multilevel-model. Typically, the minimum assumed value for the ICC is 0.05.

    -
    # Design effect for two-level model with 30 observations per
    -# cluster group (level-2 unit) and an assumed intraclass
    -# correlation coefficient of 0.05.
    -deff(n = 30)
    -#> [1] 2.45
    -
    -# Design effect for two-level model with 24 observation per cluster
    -# group and an assumed intraclass correlation coefficient of 0.2.
    -deff(n = 24, icc = 0.2)
    -#> [1] 5.6
    +

    Calculating the Sample Size for Linear Mixed Models

smpsize_lmm() combines the power-calculation functions from the pwr-package with the design effect from deff(). It computes an approximate sample size for linear mixed models (two-level designs), based on the power calculation for a standard design, adjusted for the design effect of the two-level design.

    -
    # Sample size for multilevel model with 30 cluster groups and a small to
    -# medium effect size (Cohen's d) of 0.3. 27 subjects per cluster and
    -# hence a total sample size of about 802 observations is needed.
    -smpsize_lmm(eff.size = .3, k = 30)
    -#> $`Subjects per Cluster`
    -#> [1] 27
    -#> 
    -#> $`Total Sample Size`
    -#> [1] 802
    -
    -# Sample size for multilevel model with 20 cluster groups and a medium
    -# to large effect size for linear models of 0.2. Five subjects per cluster and
    -# hence a total sample size of about 107 observations is needed.
    -smpsize_lmm(eff.size = .2, df.n = 5, k = 20, power = .9)
    -#> $`Subjects per Cluster`
    -#> [1] 5
    -#> 
    -#> $`Total Sample Size`
    -#> [1] 107
    +

There are more ways to perform power calculations for multilevel models; however, most of these require very detailed knowledge about the sample characteristics and the running of simulation studies. smpsize_lmm() is a more pragmatic alternative to these approaches.

    @@ -143,12 +351,12 @@

    Calculating the Sample Size for Linear Mixed Models

    Trouble Shooting

Sometimes, when fitting mixed models, convergence warnings or warnings about singularity may come up (see details on these issues in this FAQ). These warnings may arise due to the strict thresholds and/or may be safely ignored. converge_ok() and is_singular() may help to check whether such a warning is problematic or not.

converge_ok() provides an alternative convergence test for merMod-objects (with a less strict threshold, as suggested by one of the lme4-package authors), while is_singular() can be used in case of post-fitting convergence warnings, such as warnings about negative eigenvalues of the Hessian. In both cases, if the function returns TRUE, these warnings can most likely be ignored.

    -
    converge_ok(m)
    -#> 1.79669205387819e-07 
    -#>                 TRUE
    -
    -is_singular(m)
    -#> [1] FALSE
    +

    Rescale model weights for complex samples

    @@ -156,45 +364,45 @@

    Rescale model weights for complex samples

scale_weights() implements an algorithm proposed by Asparouhov (2006) and Carle (2009) to rescale design weights in survey data to account for the grouping structure of multilevel models, which can then be used for multilevel modelling.

To calculate a weight-vector that can be used in multilevel models, scale_weights() needs the data frame with survey data as x-argument. This data frame should contain 1) a cluster ID (argument cluster.id), which represents the strata of the survey data (the level-2-cluster variable) and 2) the probability weights (argument pweight), which represent the design or sampling weights of the survey data (level-1-weight).

scale_weights() then returns the original data frame, including two new variables: svywght_a, where the sample weights pweight are adjusted by a factor that represents the proportion of cluster size divided by the sum of sampling weights within each cluster. The adjustment factor for svywght_b is the sum of sample weights within each cluster divided by the sum of squared sample weights within each cluster (see Carle (2009), Appendix B, for details).

    -
    data(nhanes_sample)
    -scale_weights(nhanes_sample, SDMVSTRA, WTINT2YR)
    -#> # A tibble: 2,992 x 9
    -#>    total   age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA WTINT2YR svywght_a svywght_b
    -#>    <dbl> <dbl>    <dbl>    <dbl>   <dbl>    <dbl>    <dbl>     <dbl>     <dbl>
    -#>  1     1  2.2         1        3       2       31   97594.     1.57      1.20 
    -#>  2     7  2.08        2        3       1       29   39599.     0.623     0.525
    -#>  3     3  1.48        2        1       2       42   26620.     0.898     0.544
    -#>  4     4  1.32        2        4       2       33   34999.     0.708     0.550
    -#>  5     1  2           2        1       1       41   14746.     0.422     0.312
    -#>  6     6  2.2         2        4       1       38   28232.     0.688     0.516
    -#>  7   350  1.6         1        3       2       33   93162.     1.89      1.46 
    -#>  8    NA  1.48        2        3       1       29   82276.     1.29      1.09 
    -#>  9     3  2.28        2        4       1       41   24726.     0.707     0.523
    -#> 10    30  0.84        1        3       2       35   39895.     0.760     0.594
    -#> # ... with 2,982 more rows
    +

    P-Values

    For linear mixed models, the summary() in lme4 does not report p-values. Other objects from regression functions are not consistent in their output structure when reporting p-values. p_value() aims at returning a standardized (“tidy”) output for any regression model. The return value is always a data frame with three columns: term, p.value and std.error, which represent the name, p-value and standard error for each term.

For linear mixed models, the approximation of p-values is more precise when p.kr = TRUE, based on conditional F-tests with Kenward-Roger approximation for the degrees of freedom (calling pbkrtest::get_Lb_ddf()).

    -
    # Using the t-statistics for calculating the p-value
    -p_value(m2)
    -#>          term p.value std.error
    -#> 1 (Intercept)       0     9.747
    -#> 2        Days       0     0.804
    -
    -# p-values based on conditional F-tests with 
    -# Kenward-Roger approximation for the degrees of freedom
    -p_value(m2, p.kr = TRUE)
    -#>          term p.value std.error
    -#> 1 (Intercept)       0     9.766
    -#> 2        Days       0     0.813
    +

    To see more details on the degrees of freedom when using Kenward-Roger approximation, use the summary()-method:

    -
    pv <- p_value(m2, p.kr = TRUE)
    -summary(pv)
    -#>          term p.value std.error      df statistic
    -#> 1 (Intercept)       0     9.766  22.762    25.742
    -#> 2        Days       0     0.813 160.943    12.876
    +

    Random Effect Variances

    @@ -207,70 +415,76 @@

    Random Effect Variances

  • rho.01: Random-Intercept-Slope-correlation
• You can access one of these values with get_re_var(), or all of them with re_var():

    -
    # get residual variance
    -get_re_var(m, "sigma_2")
    -#> [1] 654.941
    -
    -# get all random effect variances
    -re_var(m)
    -#>       Within-group-variance:  654.941
    -#>      Between-group-variance:  612.090 (Subject)
    -#>       Random-slope-variance:   35.072 (Subject.Days)
    -#>  Slope-Intercept-covariance:    9.604 (Subject.(Intercept))
    -#> Slope-Intercept-correlation:    0.066 (Subject)
    +

    R-squared

    Nakagawa et al. (2017) proposed a method to compute marginal and conditional r-squared values, which is implemented in the r2()-function. For mixed models, the marginal r-squared considers only the variance of the fixed effects, while the conditional r-squared takes both the fixed and random effects into account. r2() can be used with models fitted with the functions of the lme4 and glmmTMB packages.

    -
    r2(m)
    -#> 
    -#> R-Squared for Generalized Linear Mixed Model
    -#> 
    -#> Family : gaussian (identity)
    -#> Formula: Reaction ~ Days + (Days | Subject)
    -#> 
    -#>    Marginal R2: 0.279
    -#> Conditional R2: 0.799
    +

    Intraclass-Correlation Coefficient

    The components of the random effect variances are of interest when calculating the intraclass-correlation coefficient, ICC. The ICC is calculated by dividing the between-group-variance (random intercept variance) by the total variance (i.e. sum of between-group-variance and within-group (residual) variance). The ICC can be interpreted as “the proportion of the variance explained by the grouping structure in the population” (Hox 2002: 15).

    Usually, the ICC is calculated for the null model (“unconditional model”). However, according to Raudenbush and Bryk (2002) or Rabe-Hesketh and Skrondal (2012) it is also feasible to compute the ICC for full models with covariates (“conditional models”) and compare how much a level-2 variable explains the portion of variation in the grouping structure (random intercept).

    -

    The ICC for mixed models can be computed with icc(). Caution: For random-slope-intercept models, the ICC would differ at each unit of the predictors. Hence, the ICC for these kind of models cannot be understood simply as proportion of variance (see Goldstein et al. 2010). For convenience reasons, as the icc() function is also used to extract the different random effects variances (see re_var() above), the ICC for random-slope-intercept-models is reported nonetheless, but it is usually no meaningful summary of the proportion of variances.

    +

The ICC for mixed models can be computed with icc(). Caution: For random-slope-intercept models, the ICC would differ at each unit of the predictors. Hence, the ICC for these kinds of models cannot be understood simply as a proportion of variance (see Goldstein et al. 2010). For convenience, as the icc() function is also used to extract the different random effect variances (see re_var() above), the ICC for random-slope-intercept models is reported nonetheless, but it is usually not a meaningful summary of the proportion of variance.

    By default, for three-level-models, depending on the nested structure of the model, or for cross-classified models, icc() only reports the proportion of variance explained for each grouping level. Use adjusted = TRUE to calculate the adjusted and conditional ICC.

    -
    icc(m)
    -#> Caution! ICC for random-slope-intercept models usually not meaningful. See 'Note' in `?icc`.
    -#> 
    -#> Linear mixed model
    -#> 
    -#> Family : gaussian (identity)
    -#> Formula: Reaction ~ Days + (Days | Subject)
    -#> 
    -#>   ICC (Subject): 0.4831
    -
    -icc(m2)
    -#> 
    -#> Linear mixed model
    -#> 
    -#> Family : gaussian (identity)
    -#> Formula: Reaction ~ Days + (1 | mygrp) + (1 | Subject)
    -#> 
    -#>     ICC (mygrp): 0.0000
    -#>   ICC (Subject): 0.5893
    -

    If adjusted = TRUE, an adjusted and conditional ICC are calculated, which take all sources of uncertainty (of all random effects) into account to report an “adjusted” ICC, as well as the conditional ICC. The latter also takes the fixed effects variances into account (see Nakagawa et al. 2017). If random effects are not nested and not cross-classified, the adjusted (adjusted = TRUE) and unadjusted (adjusted = FALSE) ICC are identical.

    -
    icc(m, adjusted = TRUE)
    -#> 
    -#> Intra-Class Correlation Coefficient for Generalized Linear Mixed Model
    -#> 
    -#> Family : gaussian (identity)
    -#> Formula: Reaction ~ Days + (Days | Subject)
    -#> 
    -#>    Adjusted ICC: 0.7217
    -#> Conditional ICC: 0.5206
    -
    -icc(m2, adjusted = TRUE)
    -#> Some variance components equal zero. Respecify random structure!
    -#> NULL
    + +

    If adjusted = TRUE, an adjusted and conditional ICC are calculated, which take all sources of uncertainty (of all random effects) into account to report an “adjusted” ICC, as well as the conditional ICC. The latter also takes the fixed effects variances into account (see Nakagawa et al. 2017). If random effects are not nested and not cross-classified, the adjusted (adjusted = TRUE) and unadjusted (adjusted = FALSE) ICC are identical.

    +
    diff --git a/man/overdisp.Rd b/man/overdisp.Rd index e02a8456..073aadc2 100644 --- a/man/overdisp.Rd +++ b/man/overdisp.Rd @@ -15,6 +15,8 @@ zero_count(x, tolerance = 0.05) \arguments{ \item{x}{Fitted GLMM (of class \code{merMod} or \code{glmmTMB}) or \code{glm} model.} +\item{...}{Currently not used.} + \item{trafo}{A specification of the alternative, can be numeric or a (positive) function or \code{NULL} (the default). See 'Details' in \code{\link[AER]{dispersiontest}} in package \CRANpkg{AER}. Does not diff --git a/man/robust.Rd b/man/robust.Rd index 5bb6dc87..af5b5d8d 100644 --- a/man/robust.Rd +++ b/man/robust.Rd @@ -19,11 +19,11 @@ from the \pkg{sandwich} package. For \code{svy()}, \code{x} must be \code{lm} object, fitted with weights.} \item{vcov.fun}{String, indicating the name of the \code{vcov*()}-function -from the \pkg{sandwich} package, e.g. \code{vcov.fun = "vcovCL"}.} +from the \pkg{sandwich}-package, e.g. \code{vcov.fun = "vcovCL"}.} \item{vcov.type}{Character vector, specifying the estimation type for the -heteroskedasticity-consistent covariance matrix estimation -(see \code{\link[sandwich]{vcovHC}} for details).} +robust covariance matrix estimation (see \code{\link[sandwich]{vcovHC}} +for details).} \item{vcov.args}{List of named vectors, used as additional arguments that are passed down to \code{vcov.fun}.} @@ -40,9 +40,9 @@ A summary of the model, including estimates, robust standard error, } \description{ \code{robust()} computes robust standard error for regression models. - This method calls one of the \code{vcov*()}-functions from the \pkg{sandwich} - package for robust covariance matrix estimators. Results are returned - as tidy data frame. + This method calls one of the \code{vcov*()}-functions from the + \pkg{sandwich}-package for robust covariance matrix estimators. Results are + returned as tidy data frame. \cr \cr \code{svy()} is intended to compute standard errors for survey designs (complex samples) fitted with regular \code{lm} or @@ -76,7 +76,7 @@ robust(fit) confint(fit) robust(fit, conf.int = TRUE) -robust(fit, vcov = "HC1", conf.int = TRUE) # "HC1" should be Stata default +robust(fit, vcov.type = "HC1", conf.int = TRUE) # "HC1" should be Stata default library(sjmisc) # dichtomozize service usage by "service usage yes/no" diff --git a/man/typical_value.Rd b/man/typical_value.Rd index ae328dfb..00219938 100644 --- a/man/typical_value.Rd +++ b/man/typical_value.Rd @@ -4,7 +4,7 @@ \alias{typical_value} \title{Return the typical value of a vector} \usage{ -typical_value(x, fun = "mean", weight.by = NULL, ...) +typical_value(x, fun = "mean", weights = NULL, weight.by, ...) } \arguments{ \item{x}{A variable.} @@ -22,10 +22,12 @@ vector to apply other different functions to numeric and categorical \code{x}, where factors are first converted to numeric vectors, e.g. \code{fun = c(numeric = "median", factor = "mean")}. See 'Examples'.} -\item{weight.by}{Name of variable in \code{x} that indicated the vector of +\item{weights}{Name of variable in \code{x} that indicated the vector of weights that will be applied to weight all observations. 
Default is \code{NULL}, so no weights are used.} +\item{weight.by}{Deprecated.} + \item{...}{Further arguments, passed down to \code{fun}.} } \value{ diff --git a/vignettes/anova-statistics.R b/vignettes/anova-statistics.R new file mode 100644 index 00000000..eaecf508 --- /dev/null +++ b/vignettes/anova-statistics.R @@ -0,0 +1,42 @@ +## ----set-options, echo = FALSE------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +knitr::opts_chunk$set(collapse = TRUE, comment = "#>", dev = "png", fig.width = 7, fig.height = 3.5, message = FALSE, warning = FALSE) +options(width = 800) + +## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +library(sjstats) +# load sample data +data(efc) + +# fit linear model +fit <- aov( + c12hour ~ as.factor(e42dep) + as.factor(c172code) + c160age, + data = efc +) + +## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +eta_sq(fit) + +## 
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +eta_sq(fit, partial = TRUE) + +## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +omega_sq(fit) + +## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +omega_sq(fit, partial = TRUE, n = 100) + +## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +cohens_f(fit) + +## 
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +anova_stats(fit) + +## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +anova_stats(car::Anova(fit, type = 3)) + +## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +eta_sq(fit, partial = TRUE, ci.lvl = .8) + +# uses bootstrapping - here, for speed, just 100 samples +omega_sq(fit, partial = TRUE, ci.lvl = .9, n = 100) + diff --git a/vignettes/anova-statistics.html b/vignettes/anova-statistics.html new file mode 100644 index 00000000..ae871fbb --- /dev/null +++ b/vignettes/anova-statistics.html @@ -0,0 +1,409 @@ + + + + + + + + + + + + + + + + +Statistics for Anova Tables + + + + + + + + + + + + + + + + + +

    Statistics for Anova Tables

    +

    Daniel Lüdecke

    +

    2018-08-20

    + + + +
    +

    Effect Size Statistics for Anova Tables

    +

This vignette demonstrates those functions of the sjstats-package that deal with Anova tables. These functions report different effect size measures, which are useful beyond significance tests (p-values), because they estimate the magnitude of effects, independent of sample size. sjstats provides the following functions:

    +
      +
    • eta_sq()
    • +
    • omega_sq()
    • +
    • cohens_f()
    • +
    • anova_stats()
    • +
    +

Before we start, we fit a simple model:

    + +
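The setup code for this vignette, taken from vignettes/anova-statistics.R earlier in this diff:

library(sjstats)
# load sample data
data(efc)

# fit linear model
fit <- aov(
  c12hour ~ as.factor(e42dep) + as.factor(c172code) + c160age,
  data = efc
)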

All functions accept objects of class aov or anova, so you can also use model fits from the car package, which allows fitting Anovas with different types of sum of squares. Other objects, like lm, will be coerced to anova internally.

    +

The following functions return the effect size statistic as a named numeric vector, using the model’s term names.

    +
    +

    Eta-Squared

    +

Eta-squared is the proportion of the total variability in the dependent variable that is accounted for by the variation in the independent variable. It is the ratio of the sum of squares for each group level to the total sum of squares. It can be interpreted as the percentage of variance accounted for by a variable.

    +

For variables with 1 degree of freedom (in the numerator), the square root of eta-squared is equal to the correlation coefficient r. For variables with more than 1 degree of freedom, eta-squared equals R2. This makes eta-squared easily interpretable. Furthermore, these effect sizes can easily be converted into effect size measures that can be, for instance, further processed in meta-analyses.

    +

    Eta-squared can be computed simply with:

    + +
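    Using the fit object from above, a minimal call is:

        eta_sq(fit)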
    +
    +

    Partial Eta-Squared

    +

    The partial eta-squared value is the ratio of the sum of squares for each group level to the sum of squares for each group level plus the residual sum of squares. It is more difficult to interpret, because its value strongly depends on the variability of the residuals. Partial eta-squared values should be reported with caution, and Levine and Hullett (2002) recommend reporting eta- or omega-squared rather than partial eta-squared.

    +

    Use the partial-argument to compute partial eta-squared values:

    + +
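    For example, with the model from above:

        eta_sq(fit, partial = TRUE)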
    +
    +

    Omega-Squared

    +

    While eta-squared estimates tend to be biased in certain situations, e.g. when the sample size is small or the independent variables have many group levels, omega-squared estimates are corrected for this bias.

    +

    Omega-squared can be simply computed with:

    + +
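    Again with the model from above:

        omega_sq(fit)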
    +
    +

    Partial Omega-Squared

    +

    omega_sq() also has a partial-argument to compute partial omega-squared values. Computing the partial omega-squared statistic is based on bootstrapping. In this case, use n to define the number of samples (1000 by default).

    + +
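    For example, with a reduced number of bootstrap samples to keep the computation fast:

        omega_sq(fit, partial = TRUE, n = 100)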
    +
    +

    Cohen’s F

    +

    Finally, cohens_f() computes Cohen’s F effect size for all independent variables in the model:

    + +
    +
    + +
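    As the vignette's R code at the top of this diff shows, anova_stats() additionally wraps all of these statistics into one tidy summary and also accepts car::Anova() objects:

        cohens_f(fit)

        # complete effect-size summary, also for type-3 sums of squares
        anova_stats(fit)
        anova_stats(car::Anova(fit, type = 3))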
    +

    Confidence Intervals

    +

    eta_sq() and omega_sq() have a ci.lvl-argument, which - if not NULL - also computes a confidence interval.

    +

    For eta-squared, i.e. eta_sq() with partial = FALSE, due to non-symmetry, confidence intervals are based on bootstrap-methods. Confidence intervals for partial omega-squared, i.e. omega_sq() with partial = TRUE, are also based on bootstrapping. In these cases, n indicates the number of bootstrap samples to be drawn to compute the confidence intervals.

    + +
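    These calls mirror the ones in the vignette's R code shown earlier in this diff:

        eta_sq(fit, partial = TRUE, ci.lvl = .8)

        # uses bootstrapping - here, for speed, just 100 samples
        omega_sq(fit, partial = TRUE, ci.lvl = .9, n = 100)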
    +
    +

    References

    +

    Levine TR, Hullett CR. Eta Squared, Partial Eta Squared, and Misreporting of Effect Size in Communication Research. Human Communication Research 28(4); 2002: 612-625

    +
    + + + + + + + + diff --git a/vignettes/bayesian-statistics.R b/vignettes/bayesian-statistics.R new file mode 100644 index 00000000..c86d53ea --- /dev/null +++ b/vignettes/bayesian-statistics.R @@ -0,0 +1,138 @@ +params <- +list(EVAL = TRUE) + +## ---- SETTINGS-knitr, echo = FALSE, warning = FALSE, message = FALSE---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + dev = "png", + fig.width = 7, + fig.height = 5, + message = FALSE, + warning = FALSE, + eval = if (isTRUE(exists("params"))) params$EVAL else FALSE +) + +options(width = 800) + +if (!requireNamespace("mediation", quietly = TRUE)) { + warning("Package 'mediation' required for this vignette.", call. = FALSE) +} + +## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +library(sjstats) +library(sjmisc) +library(mediation) +library(brms) + +# load sample data +data(jobs) +data(efc) +efc <- to_factor(efc, e42dep, c172code, c161sex, e15relat) +zinb <- read.csv("http://stats.idre.ucla.edu/stat/data/fish.csv") + +# linear models, for mediation analysis +b1 <- lm(job_seek ~ treat + econ_hard + sex + age, data = jobs) +b2 <- lm(depress2 ~ treat + job_seek + econ_hard + sex + age, data = jobs) + +# mediation analysis, for comparison with brms +m1 <- mediate(b1, b2, sims = 1000, treat = "treat", mediator = "job_seek") + +# fit Bayesian models +f1 <- bf(job_seek ~ treat + econ_hard + sex + age) +f2 <- bf(depress2 ~ treat + job_seek + econ_hard + sex + age) + +m2 <- brm(f1 + f2 + set_rescor(FALSE), data = jobs, cores = 4) + +m3 <- brm( + bf(count ~ child + camper + (1 | persons), + zi ~ child + camper + (1 | persons)), + data = zinb, + family = zero_inflated_poisson(), + cores = 4 +) + +m4 <- brm( + mpg ~ wt + hp + (1 | cyl) + (1 + wt | gear), + data = mtcars, + cores = 4 +) + +m5 <- brm( + neg_c_7 ~ e42dep + c12hour + c172code + (1 | e15relat), + data = efc, + cores = 4 +) + +## 
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +hdi(m2) + +hdi(m2, prob = c(.5, .89)) + +## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +hdi(m5, type = "random") + +## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +rope(m5, rope = c(-1, 1)) + +## ---- message=TRUE------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +equi_test(m5) + +## 
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +equi_test(m5, out = "plot") + +## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +tidy_stan(m3) + +## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +tidy_stan(m3, typical = "mean") + +## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +# printing fixed and random effects of multilevel model +tidy_stan(m3, type = "all") + +## 
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +# two different HDI for multivariate response model +tidy_stan(m2, prob = c(.5, .95)) + +## ----eval=FALSE--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +# f1 <- bf(job_seek ~ treat + econ_hard + sex + age) +# f2 <- bf(depress2 ~ treat + job_seek + econ_hard + sex + age) +# +# m2 <- brm(f1 + f2 + set_rescor(FALSE), data = jobs, cores = 4) + +## ---- message=TRUE------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +mediation(m2) + +## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +summary(m1) + +mediation(m2, prob = .95) + +## ---- 
message=TRUE------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +mediation(m2, typical = "mean", prob = .95) %>% print(digits = 4) + +## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +icc(m4) + +icc(m5) + +## ---- message=TRUE------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +icc(m4, ppd = TRUE, re.form = ~(1 | cyl), prob = .5) + +## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +# the "classical" ICC, not recommended for non-Gaussian +icc(m3) + +# variance ratio based on posterior predictive distributions, +# which is negative and hence obviously nonsense +icc(m3, ppd = TRUE) + +# 
use median instead of mean +icc(m3, ppd = TRUE, typical = "median") + +## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +r2(m5) + +r2(m5, loo = TRUE) + diff --git a/vignettes/bayesian-statistics.html b/vignettes/bayesian-statistics.html new file mode 100644 index 00000000..f60aee41 --- /dev/null +++ b/vignettes/bayesian-statistics.html @@ -0,0 +1,781 @@ + + + + + + + + + + + + + + + + +Statistics for Bayesian Models + + + + + + + + + + + + + + + + + +

    Statistics for Bayesian Models

    Daniel Lüdecke

    2018-08-20

    This vignette demonstrates those functions of the sjstats-package that deal especially with Bayesian models. sjstats provides the following functions:

    +
    • hdi()
    • rope()
    • mcse()
    • n_eff()
    • tidy_stan()
    • equi_test()
    • mediation()
    • icc()
    • r2()

    Before we start, we fit some models, including a mediation-object from the mediation-package, which we use for comparison with brms. The functions work with brmsfit, stanreg and stanfit-objects.

    + +
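    A condensed version of the setup chunk from the accompanying R file (bayesian-statistics.R in this diff); note that fitting these brms models takes some time:

        library(sjstats)
        library(sjmisc)
        library(mediation)
        library(brms)

        # load sample data
        data(jobs)
        data(efc)
        efc <- to_factor(efc, e42dep, c172code, c161sex, e15relat)
        zinb <- read.csv("http://stats.idre.ucla.edu/stat/data/fish.csv")

        # linear models and classical mediation analysis, for comparison with brms
        b1 <- lm(job_seek ~ treat + econ_hard + sex + age, data = jobs)
        b2 <- lm(depress2 ~ treat + job_seek + econ_hard + sex + age, data = jobs)
        m1 <- mediate(b1, b2, sims = 1000, treat = "treat", mediator = "job_seek")

        # Bayesian models: multivariate response (m2), zero-inflated (m3),
        # and two further multilevel models (m4, m5)
        f1 <- bf(job_seek ~ treat + econ_hard + sex + age)
        f2 <- bf(depress2 ~ treat + job_seek + econ_hard + sex + age)
        m2 <- brm(f1 + f2 + set_rescor(FALSE), data = jobs, cores = 4)

        m3 <- brm(
          bf(count ~ child + camper + (1 | persons),
             zi ~ child + camper + (1 | persons)),
          data = zinb, family = zero_inflated_poisson(), cores = 4
        )

        m4 <- brm(mpg ~ wt + hp + (1 | cyl) + (1 + wt | gear), data = mtcars, cores = 4)

        m5 <- brm(
          neg_c_7 ~ e42dep + c12hour + c172code + (1 | e15relat),
          data = efc, cores = 4
        )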
    +

    Highest Density Interval

    +

    hdi() computes the highest density interval for posterior samples. Unlike equal-tailed intervals that exclude 2.5% from each tail of the distribution, the HDI is not equal-tailed and therefore always includes the mode(s) of posterior distributions.

    +

    By default, hdi() prints the 90% intervals, however, the prob-argument can be used to calculate different or even multiple intervals.

    + +

    For multilevel models, the type-argument defines whether the HDI of fixed, random or all effects are shown.

    + +
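    The corresponding calls from the vignette's R code, using the models from the setup above:

        hdi(m2)

        # multiple intervals
        hdi(m2, prob = c(.5, .89))

        # HDI of the random effects only
        hdi(m5, type = "random")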

    The computation for the HDI is based on the code from Kruschke 2015, pp. 727f. For default sampling in Stan (4000 samples), the 90% intervals for the HDI are more stable than, for instance, 95% intervals. An effective sample size of at least 10,000 is recommended if 95% intervals are to be computed (see Kruschke 2015, p. 183ff).

    +
    +
    +

    Region of Practical Equivalence (ROPE)

    +

    Unlike a frequentist approach, Bayesian inference is not based on statistical significance, where effects need to be different from “zero”. Rather, the magnitude of a model’s parameter value and its uncertainty should not be ignored; hence, an effect is not considered present simply because it differs from zero, but only if it lies outside a specific range that can be considered as “practically no effect”. This range is called the region of practical equivalence (ROPE).

    +

    rope() requires the rope-argument, which defines this region, and then summarizes, for each parameter, the proportion of posterior values that lies inside and outside this ROPE.

    + +
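    For example, with model m5 from the setup:

        rope(m5, rope = c(-1, 1))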

    rope() does not suggest limits for the region of practical equivalence and does not tell you how large a value must be before it is no longer practically equivalent to the null value. However, there are suggestions on how to choose reasonable limits (see Kruschke 2018), which are implemented in the equi_test() function.

    +
    +
    +

    Test for Practical Equivalence

    +

    equi_test() combines the two functions hdi() and rope() and performs a “HDI+ROPE decision rule” (Test for Practical Equivalence) (Kruschke 2018) to check whether parameter values should be accepted or rejected against the background of a formulated null hypothesis.

    +

    equi_test() computes the 95%-HDI and checks if a model predictor’s HDI lies completely outside, completely inside or partially inside the ROPE. If the HDI is completely outside the ROPE, the “null hypothesis” for this parameter is “rejected”. If the ROPE completely covers the HDI, i.e. all most credible values of a parameter are inside the region of practical equivalence, the null hypothesis is accepted. Otherwise, it is undecided whether to accept or reject the null hypothesis. In short, desirable results are low proportions inside the ROPE (the closer to zero the better) and a rejected H0.

    +

    If neither the rope nor the eff_size argument is specified, the effect size will be set to 0.1 (half of a small effect according to Cohen) and the ROPE is then 0 +/- .1 * sd(y) for linear models. This is the suggested way to specify the ROPE limits according to Kruschke (2018).

    + +
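    The default call only needs the model:

        equi_test(m5)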

    For models with binary outcome, there is no concrete way to derive the effect size that defines the ROPE limits. Two examples from Kruschke suggest that a negligible change is about .05 on the logit-scale. In these cases, it is recommended to specify the rope argument; if it is not specified, the ROPE limits are calculated as 0 +/- .1 * sd(intercept) / 4. For all other models, 0 +/- .1 * sd(intercept) is used to determine the ROPE limits. These formulas are based on experience that worked well in real-life situations, but are most likely not generally the best approach.

    +

    Besides the numerical output, the results can also be printed as an HTML-table or plotted, using the out-argument. For plots, the 95% distributions of the posterior samples are shown, the ROPE is a light-blue shaded region in the plot, and the distributions are colored depending on whether the parameter values are accepted, rejected or undecided.

    + +
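    For example:

        equi_test(m5, out = "plot")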

    +
    +
    +

    Tidy Summary of Bayesian Models

    +

    tidy_stan() is not a substitute for, but rather a convenient alternative to, summary(). The major differences are: tidy_stan()

    +
    • focusses on the parameter values (estimates) and gives no information on samples, data, or formula
    • calculates the HDI rather than equi-tailed intervals
    • separates different model parts, e.g. random from fixed effects, or conditional from zero-inflated models
    • and prints everything nicely
    + +
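    A basic call, here for the zero-inflated multilevel model m3 from the setup:

        tidy_stan(m3)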

    Additional statistics in the output are:

    +
    • standard errors (which are actually median absolute deviations)
    • the ratio of effective numbers of samples, neff_ratio (i.e. effective number of samples divided by total number of samples); this ratio ranges from 0 to 1 and should be close to 1; the closer it gets to zero, the more likely the chains are inefficient, though they may still be acceptable
    • the Rhat statistic; a value above 1 usually indicates that the chain has not yet converged and that the drawn samples might not be trustworthy; drawing more iterations may solve this issue
    • the Monte Carlo standard error (mcse)

    By default, the “estimate” is the median of the posterior distribution, but this can be changed with the typical-argument.

    + +

    To also show random effects of multilevel models, use the type-argument.

    + +

    By default, 89%-HDI are computed (a convention following McElreath 2015), but other or even multiple HDI can be computed using the prob argument.

    + +
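    The corresponding calls from the vignette's R code:

        # use the mean instead of the median as point estimate
        tidy_stan(m3, typical = "mean")

        # print fixed and random effects of the multilevel model
        tidy_stan(m3, type = "all")

        # two different HDIs for the multivariate response model
        tidy_stan(m2, prob = c(.5, .95))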
    +
    +

    Summary of Mediation Analysis

    +

    mediation() is another summary function, especially for mediation analysis, i.e. for multivariate response models with causal mediation effects.

    +

    Let us recall the models:

    + +
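    These are the formulas from the setup above (not evaluated again here):

        f1 <- bf(job_seek ~ treat + econ_hard + sex + age)
        f2 <- bf(depress2 ~ treat + job_seek + econ_hard + sex + age)

        m2 <- brm(f1 + f2 + set_rescor(FALSE), data = jobs, cores = 4)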

    Here, treat is the treatment effect, job_seek is the mediator effect, f1 describes the mediator model and f2 describes the outcome model.

    +

    mediation() returns a data frame with information on the direct effect (median value of posterior samples from treatment of the outcome model), mediator effect (median value of posterior samples from mediator of the outcome model), indirect effect (median value of the multiplication of the posterior samples from mediator of the outcome model and the posterior samples from treatment of the mediation model) and the total effect (median value of sums of posterior samples used for the direct and indirect effect). The proportion mediated is the indirect effect divided by the total effect.

    +

    The simplest call just needs the model-object.

    + +
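    For example:

        mediation(m2)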

    Typically, mediation() finds the treatment and mediator variables automatically. If this does not work, use the treatment and mediator arguments to specify the related variable names. For all values, the 90% HDIs are calculated by default. Use prob to calculate a different interval.

    +

    Here is a comparison with the mediation package. Note that the summary()-output of the mediation package shows the indirect effect first, followed by the direct effect.

    + +
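    Here, m1 is the classical mediation model fitted with mediate() in the setup above:

        summary(m1)

        mediation(m2, prob = .95)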

    If you want to calculate mean instead of median values from the posterior samples, use the typical-argument. Furthermore, there is a print()-method, which allows printing more digits.

    + +
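    As in the vignette's R code (the %>% pipe is assumed to be attached, e.g. via magrittr or dplyr):

        mediation(m2, typical = "mean", prob = .95) %>% print(digits = 4)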

    As you can see, the results are similar to what the mediation package produces for non-Bayesian models.

    +
    +
    +

    ICC for multilevel models

    +

    Similar to frequentist multilevel models, icc() computes the intraclass correlation coefficient for Bayesian multilevel models. One advantage of Bayesian regression models is that you can compute the ICC for each sample of the posterior distribution, which allows you to easily calculate uncertainty intervals.

    + +
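    For the two Gaussian multilevel models from the setup:

        icc(m4)

        icc(m5)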

    For non-Gaussian models, there is no clean variance decomposition and hence the ICC can’t be calculated exactly. The general Bayesian way to analyse the random-effect variances is then to draw samples from the posterior predictive distribution, calculate the variances and compare how the variance across models changes when group-specific terms are included or dropped.

    +

    You can achieve this with the ppd-argument. In this case, draws from the posterior predictive distribution not conditioned on group-level terms (using posterior_predict(..., re.form = NA)) as well as draws from this distribution conditioned on all random effects (by default, unless specified otherwise in the re.form-argument) are taken. Then, the variances for each of these draws are calculated. The “ICC” is then the ratio between these two variances.

    + +
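    From the vignette's R code:

        icc(m4, ppd = TRUE, re.form = ~(1 | cyl), prob = .5)

        # for the non-Gaussian model m3, the "classical" ICC is not recommended,
        # and the plain variance ratio based on the posterior predictive
        # distribution turns out negative and hence obviously nonsense
        icc(m3)
        icc(m3, ppd = TRUE)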

    Sometimes, when the variance of the posterior predictive distribution is very large, the variance ratio in the output makes no sense, e.g. because it is negative. In such cases, it might help to use a more robust measure to calculate the central tendency of the variances. This can be done with the typical-argument.

    + +
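    For example:

        # use the median instead of the mean
        icc(m3, ppd = TRUE, typical = "median")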
    +
    +

    Bayes r-squared and LOO-adjusted r-squared

    +

    r2() computes either the Bayes r-squared value, or - if loo = TRUE - a LOO-adjusted r-squared value (which comes conceptually closer to an adjusted r-squared measure).

    +

    For the Bayes r-squared, the standard error is also reported. Note that r2() uses the median as measure of central tendency and the median absolute deviation as measure for variability.

    + +
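    Both variants, for model m5:

        r2(m5)

        r2(m5, loo = TRUE)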
    +
    +

    References

    +

    Kruschke JK. Doing Bayesian Data Analysis: A Tutorial with R, JAGS, and Stan. 2nd edition. Academic Press, 2015

    +

    Kruschke JK. Rejecting or Accepting Parameter Values in Bayesian Estimation. Advances in Methods and Practices in Psychological Science. 2018; doi: 10.1177/2515245918771304

    +

    McElreath R. Statistical Rethinking. A Bayesian Course with Examples in R and Stan. Chapman and Hall, 2015

    +

    Norman GR, Sloan JA, Wyrwich KW. Interpretation of Changes in Health-related Quality of Life: The Remarkable Universality of Half a Standard Deviation. Medical Care. 2003;41: 582–592. doi: 10.1097/01.MLR.0000062554.74615.4C

    +
    + + + + + + + + diff --git a/vignettes/mixedmodels-statistics.R b/vignettes/mixedmodels-statistics.R new file mode 100644 index 00000000..2372cc45 --- /dev/null +++ b/vignettes/mixedmodels-statistics.R @@ -0,0 +1,78 @@ +## ----set-options, echo = FALSE------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +knitr::opts_chunk$set(collapse = TRUE, comment = "#>", dev = "png", fig.width = 7, fig.height = 3.5, message = FALSE, warning = FALSE) +options(width = 800) + +## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +library(sjstats) +library(lme4) +# load sample data +data(sleepstudy) + +# fit linear mixed model +m <- lmer(Reaction ~ Days + (Days | Subject), data = sleepstudy) + +sleepstudy$mygrp <- sample(1:45, size = 180, replace = TRUE) +m2 <- lmer(Reaction ~ Days + (1 | mygrp) + (1 | Subject), sleepstudy) + +## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +# Design effect for two-level model with 30 observations per +# cluster group (level-2 unit) and an assumed intraclass +# correlation coefficient of 0.05. +deff(n = 30) + +# Design effect for two-level model with 24 observation per cluster +# group and an assumed intraclass correlation coefficient of 0.2. 
+deff(n = 24, icc = 0.2) + +## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +# Sample size for multilevel model with 30 cluster groups and a small to +# medium effect size (Cohen's d) of 0.3. 27 subjects per cluster and +# hence a total sample size of about 802 observations is needed. +smpsize_lmm(eff.size = .3, k = 30) + +# Sample size for multilevel model with 20 cluster groups and a medium +# to large effect size for linear models of 0.2. Five subjects per cluster and +# hence a total sample size of about 107 observations is needed. +smpsize_lmm(eff.size = .2, df.n = 5, k = 20, power = .9) + +## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +converge_ok(m) + +is_singular(m) + +## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +data(nhanes_sample) +scale_weights(nhanes_sample, SDMVSTRA, WTINT2YR) + +## 
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +# Using the t-statistics for calculating the p-value +p_value(m2) + +# p-values based on conditional F-tests with +# Kenward-Roger approximation for the degrees of freedom +p_value(m2, p.kr = TRUE) + +## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +pv <- p_value(m2, p.kr = TRUE) +summary(pv) + +## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +# get residual variance +get_re_var(m, "sigma_2") + +# get all random effect variances +re_var(m) + +## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
+r2(m) + +## ----message = TRUE----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +icc(m) + +icc(m2) + +## ----message = TRUE----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +icc(m, adjusted = TRUE) + +icc(m2, adjusted = TRUE) + diff --git a/vignettes/mixedmodels-statistics.html b/vignettes/mixedmodels-statistics.html new file mode 100644 index 00000000..3cef5183 --- /dev/null +++ b/vignettes/mixedmodels-statistics.html @@ -0,0 +1,509 @@ + + + + + + + + + + + + + + + + +Statistics for Mixed Effects Models + + + + + + + + + + + + + + + + + +

    Statistics for Mixed Effects Models

    Daniel Lüdecke

    2018-08-20

    Statistics and Measures for Mixed Effects Models

    +

    This vignette demonstrates those functions of the sjstats-package that deal especially with mixed effects models. sjstats provides the following functions:

    +
    • deff() and smpsize_lmm()
    • converge_ok() and is_singular()
    • p_value()
    • scale_weights()
    • get_re_var() and re_var()
    • icc()
    • r2()

    Before we start, we fit a simple linear mixed model:

    + +
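    The setup from the accompanying R file (mixedmodels-statistics.R in this diff):

        library(sjstats)
        library(lme4)

        # load sample data
        data(sleepstudy)

        # fit linear mixed model
        m <- lmer(Reaction ~ Days + (Days | Subject), data = sleepstudy)

        # a second model with two random intercepts
        sleepstudy$mygrp <- sample(1:45, size = 180, replace = TRUE)
        m2 <- lmer(Reaction ~ Days + (1 | mygrp) + (1 | Subject), sleepstudy)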
    +

    Sample Size Calculation for Mixed Models

    +

    The first two functions, deff() and smpsize_lmm(), can be used to approximately calculate the sample size in the context of power calculation. Calculating the sample size for simple linear models is pretty straightforward, however, for (linear) mixed models, statistical power is affected through the change of the variance of test statistics. This is what Hsieh et al. (2003) call a design effect (or variance inflation factor, VIF). Once this design effect is calculated, the sample size calculated for a standard design can be adjusted accordingly.

    +
    +

    Design Effect for Two-Level Mixed Models

    +

    deff() computes this design effect for linear mixed models with two-level design. It requires the approximated average number of observations per grouping cluster (i.e. level-2 unit) and the assumed intraclass correlation coefficient (ICC) for the multilevel-model. Typically, the minimum assumed value for the ICC is 0.05.

    + +
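    The examples from the vignette's R code:

        # design effect for a two-level model with 30 observations per
        # cluster group (level-2 unit) and an assumed ICC of 0.05 (the default)
        deff(n = 30)

        # design effect for 24 observations per cluster group and an
        # assumed ICC of 0.2
        deff(n = 24, icc = 0.2)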
    +
    +

    Calculating the Sample Size for Linear Mixed Models

    +

    smpsize_lmm() combines the power-calculation functions from the pwr-package with the design effect from deff(). It computes an approximate sample size for linear mixed models (two-level designs), based on the power calculation for a standard design, adjusted for the design effect of the two-level design.

    + +
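    The examples from the vignette's R code:

        # 30 cluster groups and a small to medium effect size (Cohen's d) of 0.3
        smpsize_lmm(eff.size = .3, k = 30)

        # 20 cluster groups, an effect size of 0.2, five subjects per cluster
        # and a power of 0.9
        smpsize_lmm(eff.size = .2, df.n = 5, k = 20, power = .9)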

    There are more ways to perform power calculations for multilevel models; however, most of these require very detailed knowledge about the sample characteristics and involve performing simulation studies. smpsize_lmm() is a more pragmatic alternative to these approaches.

    +
    +
    +
    +

    Trouble Shooting

    +

    Sometimes, when fitting mixed models, convergence warnings or warnings about singularity may come up (see details on these issues in this FAQ). These warnings may arise due to the strict thresholds and / or may be safely ignored. converge_ok() and is_singular() may help to check whether such a warning is problematic or not.

    +

    converge_ok() provides an alternative convergence test for merMod-objects (with a less strict threshold, as suggested by one of the lme4-package authors), while is_singular() can be used in case of post-fitting convergence warnings, such as warnings about negative eigenvalues of the Hessian. In both cases, if the function returns TRUE, these warnings can most likely be ignored.

    + +
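    Both checks applied to the model m from the setup:

        converge_ok(m)

        is_singular(m)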
    +
    +

    Rescale model weights for complex samples

    +

    Most functions for fitting multilevel and mixed effects models only allow the specification of frequency weights, but not design (i.e. sampling or probability) weights, which should be used when analyzing complex samples and survey data.

    +

    scale_weights() implements an algorithm proposed by Aaparouhov (2006) and Carle (2009) to rescale design weights in survey data to account for the grouping structure of multilevel models, which then can be used for multilevel modelling.

    +

    To calculate a weight-vector that can be used in multilevel models, scale_weights() needs the data frame with survey data as x-argument. This data frame should contain 1) a cluster ID (argument cluster.id), which represents the strata of the survey data (the level-2-cluster variable) and 2) the probability weights (argument pweight), which represents the design or sampling weights of the survey data (level-1-weight).

    +

    scale_weights() then returns the original data frame, including two new variables: svywght_a, where the sample weights pweight are adjusted by a factor that represents the proportion of cluster size divided by the sum of sampling weights within each cluster. The adjustment factor for svywght_b is the sum of sample weights within each cluster divided by the sum of squared sample weights within each cluster (see Carle (2009), Appendix B, for details).

    + +
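    Using the NHANES sample data shipped with sjstats, as in the vignette's R code:

        data(nhanes_sample)
        scale_weights(nhanes_sample, SDMVSTRA, WTINT2YR)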
    +
    +

    P-Values

    +

    For linear mixed models, the summary() in lme4 does not report p-values. Other objects from regression functions are not consistent in their output structure when reporting p-values. p_value() aims at returning a standardized (“tidy”) output for any regression model. The return value is always a data frame with three columns: term, p.value and std.error, which represent the name, p-value and standard error for each term.

    +

    For linear mixed models, the approximation of p-values is more precise when p.kr = TRUE, based on conditional F-tests with Kenward-Roger approximation for the degrees of freedom (calling pbkrtest::get_Lb_ddf()).

    + +
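    Both variants, for the model m2 from the setup:

        # p-values based on the t-statistics
        p_value(m2)

        # p-values based on conditional F-tests with
        # Kenward-Roger approximation for the degrees of freedom
        p_value(m2, p.kr = TRUE)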

    To see more details on the degrees of freedom when using Kenward-Roger approximation, use the summary()-method:

    + +
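    For example:

        pv <- p_value(m2, p.kr = TRUE)
        summary(pv)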
    +
    +

    Random Effect Variances

    +

    In mixed effects models, several random effect variances (depending on the model specification) are calculated:

    +
    • sigma_2: Within-group (residual) variance
    • tau.00: Between-group-variance (variation between individual intercepts and average intercept)
    • tau.11: Random-slope-variance (variation between individual slopes and average slope)
    • tau.01: Random-Intercept-Slope-covariance
    • rho.01: Random-Intercept-Slope-correlation

    You can access one of these values with get_re_var(), or all of them with re_var():

    + +
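    The calls from the vignette's R code:

        # get the residual variance
        get_re_var(m, "sigma_2")

        # get all random effect variances
        re_var(m)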
    +
    +

    R-squared

    +

    Nakagawa et al. (2017) proposed a method to compute marginal and conditional r-squared values, which is implemented in the r2()-function. For mixed models, the marginal r-squared considers only the variance of the fixed effects, while the conditional r-squared takes both the fixed and random effects into account. r2() can be used with models fitted with the functions of the lme4 and glmmTMB packages.

    + +
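    For example:

        r2(m)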
    +
    +

    Intraclass-Correlation Coefficient

    +

    The components of the random effect variances are of interest when calculating the intraclass-correlation coefficient, ICC. The ICC is calculated by dividing the between-group-variance (random intercept variance) by the total variance (i.e. sum of between-group-variance and within-group (residual) variance). The ICC can be interpreted as “the proportion of the variance explained by the grouping structure in the population” (Hox 2002: 15).

    +

    Usually, the ICC is calculated for the null model (“unconditional model”). However, according to Raudenbush and Bryk (2002) or Rabe-Hesketh and Skrondal (2012) it is also feasible to compute the ICC for full models with covariates (“conditional models”) and compare how much a level-2 variable explains the portion of variation in the grouping structure (random intercept).

    +

    The ICC for mixed models can be computed with icc(). Caution: For random-slope-intercept models, the ICC would differ at each unit of the predictors. Hence, the ICC for these kinds of models cannot be understood simply as proportion of variance (see Goldstein et al. 2010). For convenience reasons, as the icc() function is also used to extract the different random effects variances (see re_var() above), the ICC for random-slope-intercept-models is reported nonetheless, but it is usually not a meaningful summary of the proportion of variances.

    +

    By default, for three-level-models, depending on the nested structure of the model, or for cross-classified models, icc() only reports the proportion of variance explained for each grouping level. Use adjusted = TRUE to calculate the adjusted and conditional ICC.

    + +
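    The unadjusted ICCs for both example models:

        icc(m)

        icc(m2)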

    If adjusted = TRUE, an adjusted and conditional ICC are calculated, which take all sources of uncertainty (of all random effects) into account to report an “adjusted” ICC, as well as the conditional ICC. The latter also takes the fixed effects variances into account (see Nakagawa et al. 2017). If random effects are not nested and not cross-classified, the adjusted (adjusted = TRUE) and unadjusted (adjusted = FALSE) ICC are identical.

    + +
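    And the adjusted variants:

        icc(m, adjusted = TRUE)

        icc(m2, adjusted = TRUE)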
    +
    +
    +

    References

    +

    Aaparouhov T. 2006. General Multi-Level Modeling with Sampling Weights. Communications in Statistics—Theory and Methods (35): 439–460

    +

    Carle AC. 2009. Fitting multilevel models in complex survey data with design weights: Recommendations. BMC Medical Research Methodology 9(49): 1-13

    +

    Goldstein H, Browne W, Rasbash J. 2010. Partitioning Variation in Multilevel Models. Understanding Statistics, 1:4, 223-231, doi: 10.1207/S15328031US0104_02

    +

    Hox J. 2002. Multilevel analysis: techniques and applications. Mahwah, NJ: Erlbaum

    +

    Hsieh FY, Lavori PW, Cohen HJ, Feussner JR. 2003. An Overview of Variance Inflation Factors for Sample-Size Calculation. Evaluation & the Health Professions 26: 239–257. doi: 10.1177/0163278703255230

    +

    Nakagawa S, Johnson P, Schielzeth H. 2017. The coefficient of determination R2 and intra-class correlation coefficient from generalized linear mixed-effects models revisited and expanded. J. R. Soc. Interface 14. doi: 10.1098/rsif.2017.0213

    +

    Rabe-Hesketh S, Skrondal A. 2012. Multilevel and longitudinal modeling using Stata. 3rd ed. College Station, Tex: Stata Press Publication

    +

    Raudenbush SW, Bryk AS. 2002. Hierarchical linear models: applications and data analysis methods. 2nd ed. Thousand Oaks: Sage Publications

    +
    + + + + + + + +