Skip to content

tutorial with code to use splines for prevalence ratios

Notifications You must be signed in to change notification settings

jhs-hwg/spline-pr

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

9 Commits
 
 
 
 
 
 
 
 
 
 
 
 

Repository files navigation

spline-pr

The goal of spline-pr is to fit poisson regression models with generalized estimating equations (i.e., robust variance estimators for prevalence ratios) that include spline terms. The code in this repo also generates plots that show the estimated spline curve over a domain specified by the user.

Example

This example uses NHANES data to create a spline figure showing the prevalence ratio for blood pressure control as a function of age. Since we are dealing with blood pressure control, we filter the data to include only participants with self-reported antihypertensive medication use. Also, since this is not a ‘survey’ tutorial, we do not engage with the NHANES survey weights.

library(table.glue)
library(tidyverse)
library(splines)
library(geepack)
library(mice)

source("R/functions.R")

data_nhanes <- table.glue::nhanes %>% 
 as_tibble() %>% 
 select(age, bp_sys_mmhg, bp_dia_mmhg, meds_bp) %>% 
 filter(meds_bp == 'yes', age >= 20)

data_nhanes
#> # A tibble: 4,796 x 4
#>      age bp_sys_mmhg bp_dia_mmhg meds_bp
#>    <dbl>       <dbl>       <dbl> <chr>  
#>  1    72        142         82   yes    
#>  2    73        137.        86.7 yes    
#>  3    56        157.        82   yes    
#>  4    76        127.        66.7 yes    
#>  5    60        127.        74   yes    
#>  6    65        141.        57.3 yes    
#>  7    72        160         71.3 yes    
#>  8    80        109.        53.3 yes    
#>  9    43        140         82.7 yes    
#> 10    65         NA         NA   yes    
#> # ... with 4,786 more rows

Imputation of blood pressure values

As with almost all data, we have missing values. Thus, we will use multiple imputation and pool our spline estimates from each imputed dataset.

n_imputes <- 5

data_nhanes_impute <- data_nhanes %>% 
 # remove constant column (all meds_bp == 'yes')
 select(-meds_bp) %>% 
 # run the imputation models
 mice(method = 'pmm', m = n_imputes, printFlag = FALSE) %>% 
 # get the imputed data, action = 'all' means return every imputed data set
 complete(action = 'all') %>% 
 # convert each imputed dataset to a tibble for better printing
 map(as_tibble)

# print the first imputed dataset 
data_nhanes_impute[[1]]
#> # A tibble: 4,796 x 3
#>      age bp_sys_mmhg bp_dia_mmhg
#>    <dbl>       <dbl>       <dbl>
#>  1    72        142         82  
#>  2    73        137.        86.7
#>  3    56        157.        82  
#>  4    76        127.        66.7
#>  5    60        127.        74  
#>  6    65        141.        57.3
#>  7    72        160         71.3
#>  8    80        109.        53.3
#>  9    43        140         82.7
#> 10    65        145.        66.7
#> # ... with 4,786 more rows

Post-processing of imputed values

Let’s not forget that our outcome is dependent on the two blood pressure variables; i.e., blood pressure control is 1 if systolic blood pressure is < 140 mm Hg and diastolic blood pressure is < 90 mm Hg. .

data_nhanes_impute <- data_nhanes_impute %>% 
 map(
  ~ .x %>% 
   mutate(
    bp_control = if_else(
     condition = bp_sys_mmhg < 140 & bp_dia_mmhg < 90,
     true = 1,
     false = 0
    )
   )
 )

data_nhanes_impute[[1]]
#> # A tibble: 4,796 x 4
#>      age bp_sys_mmhg bp_dia_mmhg bp_control
#>    <dbl>       <dbl>       <dbl>      <dbl>
#>  1    72        142         82            0
#>  2    73        137.        86.7          1
#>  3    56        157.        82            0
#>  4    76        127.        66.7          1
#>  5    60        127.        74            1
#>  6    65        141.        57.3          0
#>  7    72        160         71.3          0
#>  8    80        109.        53.3          1
#>  9    43        140         82.7          0
#> 10    65        145.        66.7          0
#> # ... with 4,786 more rows

Model fitting

Next we fit a geeglm model to each imputed dataset. Note that

fits <- map(
  .x = data_nhanes_impute,
  .f = ~ geeglm(bp_control ~ ns(age, df = 4), 
                data = .x, 
                family = 'poisson',
                id = seq(nrow(.x)))
)

summary(fits[[1]])
#> 
#> Call:
#> geeglm(formula = bp_control ~ ns(age, df = 4), family = "poisson", 
#>     data = .x, id = seq(nrow(.x)))
#> 
#>  Coefficients:
#>                  Estimate  Std.err   Wald Pr(>|W|)    
#> (Intercept)      -0.36379  0.09249 15.473 8.37e-05 ***
#> ns(age, df = 4)1 -0.06298  0.08449  0.556    0.456    
#> ns(age, df = 4)2 -0.13777  0.08429  2.671    0.102    
#> ns(age, df = 4)3 -0.25347  0.21064  1.448    0.229    
#> ns(age, df = 4)4 -0.40986  0.04653 77.582  < 2e-16 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Correlation structure = independence 
#> Estimated Scale Parameters:
#> 
#>             Estimate  Std.err
#> (Intercept)    0.372 0.008383
#> Number of clusters:   4796  Maximum cluster size: 1

Spline pooling

# determine a spline basis function
# the terms used here must match those in geeglm()
bases <- map(
  .x = data_nhanes_impute,
  .f = ~ ns(.x$age, df = 4)
)

# using the fit and basis function,
# get a predicted spline estimate + SE
spline_preds <- map2(
  .x = fits, 
  .y = bases, 
  .f = get_spline_preds,
  pattern = '^ns\\(',
  x_min = 20, 
  x_max = 80,
  x_ref = 50
)

# pool results using Rubin's rules

# V_w = mean of the variance estimates
variance_within <- map_dfc(spline_preds, "se") %>% 
  apply(MARGIN = 1, function(x) mean(x^2))

# V_b = variance of the predictions
variance_between <- map_dfc(spline_preds, "pred") %>% 
  apply(MARGIN = 1, var)

# V_total = V_w + V_b * (n_imputes+1) / n_imputes
variance_total <- 
  variance_within + variance_between + variance_between/n_imputes

se_pooled <- sqrt(variance_total)

spline_pool <- tibble(
  x = spline_preds[[1]]$x,
  pred = apply(map_dfc(spline_preds, "pred"), 1, mean),
  se = se_pooled,
  ci_lwr = pred + qnorm(0.025) * se,
  ci_upr = pred + qnorm(0.975) * se
)

spline_pool
#> # A tibble: 1,000 x 5
#>        x    pred    se ci_lwr ci_upr
#>    <dbl>   <dbl> <dbl>  <dbl>  <dbl>
#>  1  20   -0.0293 0.113 -0.251  0.192
#>  2  20.1 -0.0291 0.113 -0.250  0.192
#>  3  20.1 -0.0290 0.112 -0.249  0.191
#>  4  20.2 -0.0288 0.112 -0.248  0.191
#>  5  20.2 -0.0286 0.112 -0.247  0.190
#>  6  20.3 -0.0284 0.111 -0.247  0.190
#>  7  20.4 -0.0283 0.111 -0.246  0.189
#>  8  20.4 -0.0281 0.111 -0.245  0.189
#>  9  20.5 -0.0279 0.110 -0.244  0.188
#> 10  20.5 -0.0278 0.110 -0.243  0.188
#> # ... with 990 more rows

Spline visualization

data_imputed_stack <- bind_rows(data_nhanes_impute)

data_segment <- bin_segments(x = data_imputed_stack$age,
                             y = data_imputed_stack$bp_control,
                             x_min = 20,
                             x_max = 80,
                             by_y = TRUE,
                             bin_length = 2/3,
                             bin_count = 15,
                             bin_yintercept = 0.45) %>%
  mutate(event_status = factor(event_status,
                               levels = c(1, 0),
                               labels = c("Yes", "No")))

# you might need to tune this based on the range of your figure.
nudge_y <- (as.numeric(data_segment$event_status=='Yes')-1/2)*0.01

fig <- ggplot(spline_pool) + 
  aes(x = x, 
      y = exp(pred), 
      ymin = exp(ci_lwr), 
      ymax = exp(ci_upr)) + 
  labs(x = 'age, years',
       y = 'Prevalence ratio',
       color = 'Blood pressure control') + 
  geom_line() + 
  geom_ribbon(alpha = 0.2) + 
  scale_y_log10(limits = c(0.35, 2.5)) + 
  geom_segment(data = data_segment, 
               inherit.aes = FALSE,
               size = 12,
               mapping = aes(x = x, 
                             y = y,
                             color = event_status,
                             xend = xend, 
                             yend = yend)) + 
  geom_text(data = data_segment,
             inherit.aes = FALSE,
             show.legend = FALSE,
             nudge_y = nudge_y,
             mapping = aes(x = x, 
                           y = yend,
                           vjust = as.numeric(event_status=='No'),
                           color = event_status,
                           label = count)) +
  theme_bw() + 
  geom_hline(yintercept = 1, linetype = 2, color = 'grey') + 
  theme(panel.grid = element_blank(),
        legend.position = c(.25, 0.80)) + 
  scale_color_manual(values = c("grey", "black"))

fig

About

tutorial with code to use splines for prevalence ratios

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published

Languages