-
Notifications
You must be signed in to change notification settings - Fork 4
/
assignment5.R
1244 lines (1057 loc) · 47.4 KB
/
assignment5.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#' ---
#' title: "Homework 5"
#' author: "Allan Kimaina"
#' header-includes:
#' - \usepackage{pdflscape}
#' - \newcommand{\blandscape}{\begin{landscape}}
#' - \newcommand{\elandscape}{\end{landscape}}
#' output:
#' pdf_document: default
#' html_document: default
#' ---
#'
#'
knitr::opts_chunk$set(echo = F)
# loadl lm package
library(plyr)
library(dplyr)
library("tibble")
library(car)
library(sjPlot)
library(sjmisc)
library(sjlabelled)
library(ggpubr)
library(ggpmisc)
library(gridExtra)
library(stargazer)
library(e1071)
library(jtools)
library(effects)
library(multcompView)
library(ggplot2)
library(ggrepel)
library(MASS)
library(broom)
library(ggcorrplot)
library(leaps)
library(relaimpo)
library(olsrr)
# aestetics
library(xtable)# latex table
library(texreg) # for latex table
# load GLM packages
library(ROCR)
library(arm)
library(foreign)
library(nnet)
library(VGAM)
library(ordinal)
library(ModelGood)
library(InformationValue)
library(rms)
library(AER) # for overdispersion
library(metRology)
library(hett)
library(lme4)
# load required data
riskyBehaviours.df=read.csv("data/risky_behaviors.csv")
# Data wrangling
riskyBehaviours.df$couples <- factor(riskyBehaviours.df$couples)
riskyBehaviours.df$women_alone <- factor(riskyBehaviours.df$women_alone)
# https://stackoverflow.com/questions/14510277/scale-back-linear-regression-coefficients-in-r-from-scaled-and-centered-data
# scaling
riskyBehaviours.df$cBupacts <- (riskyBehaviours.df$bupacts - mean(riskyBehaviours.df$bupacts)) / (2 * sd(riskyBehaviours.df$bupacts))
#hist(riskyBehaviours.df$fupacts)
#'
#'
#' \onecolumn
#' # Chapter 7 Question 2:
#'
#' Continuous probability simulation: the logarithms of weights (in pounds) of men in the United States are approximately normally distributed with mean 5.13 and standard deviation 0.17; women with mean 4.96 and standard deviation 0.20. Suppose 10 adults selected at random step on an elevator with a capacity of 1750 pounds.
#'
#' ``What is the probability that the elevator cable breaks?``
#'
#'
set.seed(111)
men_weight <- 5.13
men_sd <- .17
prop_female <- .52
women_weight <- 4.96
women_sd <- .20
n <- 10
num_sims <- 1000
capacity <- 1750
elevatorCableBreaks <- rep(NA, num_sims)
for (j in 1:num_sims) {
weights <- rep(NA, n)
is_female <- rbinom(n=n, size=1, prob=prop_female)
for (i in 1:n) {
log_weight <- ifelse(is_female[i]==1,
rnorm(n=1, mean=women_weight, sd=women_sd),
rnorm(n=1, mean=men_weight, sd=men_sd)
)
weights[i]<- exp(log_weight)
}
elevatorCableBreaks[j] <- ifelse(capacity<sum(weights), 1, 0)
}
sum(elevatorCableBreaks) / num_sims
#'
#'
#' We assumed the proportion of women to men is .52 (provided in the chapter but not in this question)
#' \onecolumn
#'
#' # Chapter 7 Question 8:
#' Inference for the ratio of parameters: a (hypothetical) study compares the costs and effectiveness of two different medical treatments.
#'
#' * In the first part of the study, the difference in costs between treatments A and B is estimated at $600 per patient, with a standard error of $400, based on a regression with 50 degrees of freedom.
#'
#' * In the second part of the study, the difference in effectiveness is estimated at 3.0 (on some relevant measure), with a standard error of 1.0, based on a regression with 100 degrees of freedom.
#'
#' * For simplicity, assume that the data from the two parts of the study were collected independently.
#'
#' Inference is desired for the incremental cost-effectiveness ratio: the difference between the average costs of the two treatments, divided by the difference between their average effectiveness. (This problem is discussed further by heitjan, Moskowitz, and Whang, 1999.)
#'
#' ## Part A:
#' `` Create 1000 simulation draws of the cost difference and the effectiveness difference, and make a scatterplot of these draws. ``
#'
#'
set.seed(111)
num_sims <- 1000
# initialize with NA
cost_diff = rep(NA, num_sims)
effect_diff = rep(NA, num_sims)
n_cost = 52
n_effect = 102
for (i in 1:num_sims) {
# cost difference
treatment = rbinom(n = n_cost, size = 1, prob = 0.5)
X = cbind(1, treatment)
sigma = sqrt(400 ^ 2 / solve((t(X) %*% X))[2, 2])
error = rnorm(n_cost, mean = 0, sd = sigma)
cost = 100000 + 600 * treatment + error
Cost_model = lm(cost ~ treatment)
cost_diff[i] = coef(Cost_model)[2]
# effect difference
treatment = rbinom(n = n_effect, size = 1, prob = 0.5)
X = cbind(1, treatment)
sigma = sqrt(1 / solve((t(X) %*% X))[2, 2])
error = rnorm(n_effect, mean = 0, sd = sigma)
effect = 100000 + 3 * treatment + error
effect_model = lm(effect ~ treatment)
effect_diff[i] = coef(effect_model)[2]
}
costEffect.df <- data.frame(cost=cost_diff, effect=effect_diff)
ggplot(data=costEffect.df, aes(x=cost, y=effect)) +
geom_point() +
labs(x="Cost Difference", y="Effectiveness Difference") +
theme_classic()+
guides(fill=FALSE)
#'
#'
#'
#' ## Part B:
#' ``Use simulation to come up with an estimate, 50% interval, and 95% interval for the incremental cost-effectiveness ratio. ``
#'
#'
costEffect.df$ratio <- costEffect.df$cost/costEffect.df$effect
hist(costEffect.df$ratio,
col="blue",
main = "Cost Effectiveness", xlim = c(-2000,3000), xlab="ratio",
breaks=300)
#'
#'
#'
quantile(costEffect.df$ratio, c(.25, .75))
quantile(costEffect.df$ratio, c(.025, .975))
#stargazer(summary(costEffect.df))
#'
#'
#' ## Part C:
#' ``Repeat this problem, changing the standard error on the difference in effectiveness to 2.0.``
#'
#'
num_sims <- 1000
effect_diff = rep(NA, num_sims)
n_cost = 52
n_effect = 102
for (i in 1:num_sims) {
# effect difference
treatment = rbinom(n = n_effect, size = 1, prob = 0.5)
X = cbind(1, treatment)
sigma = sqrt(1 / solve((t(X) %*% X))[2, 2])
error = rnorm(n_effect, mean = 0, sd = sigma)
effect = 100000 + 2 * treatment + error
effect_model = lm(effect ~ treatment)
effect_diff[i] = coef(effect_model)[2]
}
costEffect.df$ratio <- costEffect.df$cost/costEffect.df$effect
hist(costEffect.df$ratio,
col="blue",
main = "Cost Effectiveness", xlim = c(-2000,3000), xlab="ratio",
breaks=300)
#'
#'
#'
#'
#'
quantile(costEffect.df$ratio, c(.25, .75))
quantile(costEffect.df$ratio, c(.025, .975))
#'
#'
#'
#' \onecolumn
#'
#' # Chapter 8 Question 1:
#' Fitting the wrong model: suppose you have 100 data points that arose from the following model: y =3+0 .1x1+0.5x2+error, with errors having a t distribution with mean 0, scale 5, and 4 degrees of freedom.We shall explore the implications of fitting a standard linear regression to these data.
#'
#'
#' ## Part A:
#' ``Simulate data from this model. For simplicity, suppose the values of x1 are simply the integers from 1 to 100, and that the values of x2 are random and equally likely to be 0 or 1.3 Fit a linear regression (with normal errors) to these data and see if the 68% confidence intervals for the regression coefficients (for each, the estimates ±1 standard error) cover the true values. ``
#'
#'
set.seed(111)
x1 <- 1:100
x2 <- rbinom(100, 1, 0.5)
error <- rnorm(100, 0, 1)
y = 3 + .1*x1 + .5*x2 + error
model_8.1.a <- lm(y ~ x1 + x2)
texreg(list(model_8.1.a),
custom.model.names = c("Model 8A"),
single.row=TRUE, float.pos = "h")
#'
#'
#'
coverage_test = c(3, 0.1, 0.5)
regression_coef = as.data.frame(summary(model_8.1.a)$coefficients)
int_coverage = cbind(regression_coef$Estimate-regression_coef$`Std. Error`,
regression_coef$Estimate+regression_coef$`Std. Error`)
int_coverage_test = cbind(coverage_test>=int_coverage[,1]) & (coverage_test<=int_coverage[,2])
rownames(int_coverage) <- c("Intercept","X1","X2")
rownames(int_coverage_test) <- c("Intercept","X1","X2")
test_matrix <- merge(int_coverage, int_coverage_test, by = "row.names", all = TRUE)
colnames(test_matrix) <- c("Coef","Lower","Upper","Coverage")
xtable(test_matrix, comment=FALSE)
#'
#' After generating 68% confidence intervals for the regression coefficient, all the point estimates except Intercept were not contained in the 68% confidence intervals. However the CI band was too wide making high uncertainty in the point estimate. Repeating this simulation without setting seed would definitely result in difference coverage conclusion due to this wide CI
#'
#' ## Part B:
#' `` Put the above step in a loop and repeat 1000 times. Calculate the confidence coverage for the 68% intervals for each of the three coefficients in the model. ``
#'
#'
set.seed(111)
coefs <- array(NA, c(3, 1000))
se <- array(NA, c(3, 1000))
for (i in 1:ncol(coefs)) {
x1 <- 1:100
x2 <- rbinom(100, 1, 0.5)
error <-rnorm(100, 0, 1)
y = 3 + 0.1*x1 + 0.5*x2 + error
lm.model <- summary(lm(y ~ x1 + x2))
coefs[1,i] <- tidy(lm.model)[1,2]
coefs[2,i] <- tidy(lm.model)[2,2]
coefs[3,i] <- tidy(lm.model)[3,2]
se[1,i] <- tidy(lm.model)[1,3]
se[2,i] <- tidy(lm.model)[2,3]
se[3,i] <- tidy(lm.model)[3,3]
}
mean_coef <- rowMeans(coefs)
mean_se <- rowMeans(se)
int_coverage<- cbind(mean_coef + (-1 * mean_se),
mean_coef + (1 * mean_se))
int_coverage_test = cbind(coverage_test>=int_coverage[,1]) & (coverage_test<=int_coverage[,2])
rownames(int_coverage) <- c("Intercept","X1","X2")
rownames(int_coverage_test) <- c("Intercept","X1","X2")
test_matrix <- merge(int_coverage, int_coverage_test, by = "row.names", all = TRUE)
colnames(test_matrix) <- c("Coef","Lower","Upper","Coverage")
xtable(test_matrix, comment=FALSE)
#'
#'
#' After simulating the previous step 1000 times, we calculated the mean of all the point estimates and standard error. All the 3 estimates were contained in the 68% confidence intervals.
#'
#' ## Part C:
#' ``Repeat this simulation, but instead fit the model using t errors (see Exercise 6.6).``
#'
#'
set.seed(111)
coefs <- array(NA, c(3, 1000))
se <- array(NA, c(3, 1000))
for (i in 1:ncol(coefs)) {
x1 <- 1:100
x2 <- rbinom(100, 1, 0.5)
error <- rt.scaled(100, df = 4, mean = 0, sd = 5)
y = 3 + 0.1*x1 + 0.5*x2 + error
lm.model <- summary(tlm(y ~ x1 + x2))
coefs[1,i] <- lm.model$loc.summary$coefficients[1,1]
coefs[2,i] <- lm.model$loc.summary$coefficients[2,1]
coefs[3,i] <- lm.model$loc.summary$coefficients[3,1]
se[1,i] <- lm.model$loc.summary$coefficients[1,2]
se[2,i] <- lm.model$loc.summary$coefficients[2,2]
se[3,i] <- lm.model$loc.summary$coefficients[3,2]
}
mean_coef <- rowMeans(coefs)
mean_se <- rowMeans(se)
int_coverage<- cbind(mean_coef + (-1 * mean_se),
mean_coef + (1 * mean_se))
int_coverage_test = cbind(coverage_test>=int_coverage[,1]) & (coverage_test<=int_coverage[,2])
rownames(int_coverage) <- c("Intercept","X1","X2")
rownames(int_coverage_test) <- c("Intercept","X1","X2")
test_matrix <- merge(int_coverage, int_coverage_test, by = "row.names", all = TRUE)
colnames(test_matrix) <- c("Coef","Lower","Upper","Coverage")
xtable(test_matrix, comment=FALSE)
#'
#' After the previous step 1000 times using t errors, we calculated the mean of all the point estimates and standard error. All the 3 estimates were contained in the 68% confidence intervals.
#' However the standard error were inflated making the CI band too wide compared to the previous model
#' \onecolumn
#'
#' # Chapter 8 Question 4:
#' Model checking for count data: the folder risky.behavior contains data from a study of behavior of couples at risk for HIV; see Exercise 6.1.
#'
#' ## Part A:
#' ``Fit a Poisson regression model predicting number of unprotected sex acts from baseline hIV status. Perform predictive simulation to generate 1000 datasets and record both the percent of observations that are equal to 0 and the percent that are greater than 10 (the third quartile in the observed data) for each. Compare these values to the observed value in the original data. ``
#'
#' ### Fit a Poisson regression model
#'
riskyBehaviours.glm1 <- glm(fupacts ~ bs_hiv, family=poisson, data=riskyBehaviours.df)
texreg(list(riskyBehaviours.glm1),
custom.model.names = c("Model 1: Poisson"),
single.row=TRUE, float.pos = "h")
#'
#' This model does not fit well, the rule of thumb dictates that the ratio of residual deviance to degree of freedom should be close to 1, however we get a value which is way far from 1.
#'
#' ### 1000 Simulations
#'
# data wrangling and cleaning
riskyBehaviours.df$bs_hiv_bin <- ifelse(riskyBehaviours.df$bs_hiv=="negative",0,1)
X = cbind(1,as.numeric(riskyBehaviours.df$bs_hiv_bin))
# simulate 1000
n.sims <- 1000
riskyBehaviours.sims1 <- arm::sim(riskyBehaviours.glm1, n.sims)
n<- length(riskyBehaviours.df$fupacts)
y.rep <- array(NA, c(n.sims,n))
beta <- coef(riskyBehaviours.sims1)
# do 1000 simulations
for(i in 1:n.sims){
y.hat <- exp(X%*%beta[i,])
y.rep[i,]<-rpois(n,y.hat)
}
# test statistics
test.rep <- rep(NA, n.sims)
test.rep.gt10 <- rep(NA, n.sims)
for (i in 1:n.sims){
test.rep[i]<- mean(y.rep[i,]==0)
test.rep.gt10[i]<-mean(y.rep[i,]>10)
}
#summary(rowSums(y.hat == 0)/ncol(y.hat))
#summary(rowSums(y.hat > 10)/ncol(y.hat))
real.gt.0= mean(riskyBehaviours.df$fupacts == 0)
real.gt.10= mean(riskyBehaviours.df$fupacts > 10)
# summary(test.rep)
# summary(test.rep.gt10)
par(mfrow = c(1, 2))
hist(test.rep, main="# Plot A ( 1K Simulations)", xlab="proportion of sex acts = 0", col="blue")
hist(test.rep.gt10, main="# Plot B (1K Simulations)", xlab="proportion of of sex acts > 10", col="blue")
#'
#'
#' ### Plot A: Frequency of number of unprotected sex acts at followup = 0
#'
#' ```
#' ## Min. 1st Qu. Median Mean 3rd Qu. Max.
#' ## 0.000e+00 0.000e+00 0.000e+00 1.843e-05 0.000e+00 2.304e-03
#' ```
#'
#' The actual dataset had 29% of the observation having 0 number of unprotected sex acts at followup. However, after performing predictive simulation to generate 1000 datasets, we found out that the percentage of hypothesized observations with number of unprotected sex acts at followup equal to 0 ranged from 0% to 0.2% with most of replication at 0% - all of which were much much lower than the observed test statistic of 29%.
#'
#' ### Plot B: Frequency of number of unprotected sex acts at followup > 10
#'
#' ```
#' ## Min. 1st Qu. Median Mean 3rd Qu. Max.
#' ## 0.7995 0.8364 0.8479 0.8485 0.8594 0.9101
#' ```
#'
#' The actual dataset had 36% of the observations having number of unprotected sex acts at followup that is greater than 10. However, after performing predictive simulation to generate 1000 datasets, we found out that the percentage of hypothesized observations with number of unprotected sex acts at followup greater than 10 ranged from 84% `(1stIQR)` to 86% `(3rdIQR)` with an average of 85% - all of which were much much higher than the observed test statistic of 36%.
#'
#' ## Part B:
#' ``Repeat (a) using an overdispersed Poisson regression model.``
#'
#' ### Fit an overdispersed Poisson regression model
#'
riskyBehaviours.glm2 <- glm(fupacts ~ bs_hiv, family=quasipoisson, data=riskyBehaviours.df)
texreg(list(riskyBehaviours.glm2),
custom.model.names = c("Model 2: overdispersed Poisson"),
single.row=TRUE, float.pos = "h")
#'
#' Much better model but it does not fit as well, the ratio of residual deviance to degree of freedom is still far from 1.
#'
#' ### 1000 Simulations
#'
# simulate 1000
n.sims <- 1000
riskyBehaviours.sims2 <- arm::sim(riskyBehaviours.glm2, n.sims)
n<- length(riskyBehaviours.df$fupacts)
y.rep <- array(NA, c(n.sims,n))
beta <- coef(riskyBehaviours.sims2)
# do 1000 simulations
overdisp <- summary(riskyBehaviours.glm2)$dispersion
for(i in 1:n.sims){
y.hat <- exp(X%*%beta[i,])
a <- y.hat/(overdisp-1) # dispersion param
y.rep[i,]<-rnegbin(n,y.hat, a)
}
# test statistics
test.rep <- rep(NA, n.sims)
test.rep.gt10 <- rep(NA, n.sims)
for (i in 1:n.sims){
test.rep[i]<- mean(y.rep[i,]==0)
test.rep.gt10[i]<-mean(y.rep[i,]>10)
}
#summary(rowSums(y.hat == 0)/ncol(y.hat))
#summary(rowSums(y.hat > 10)/ncol(y.hat))
real.gt.0= mean(riskyBehaviours.df$fupacts == 0)
real.gt.10= mean(riskyBehaviours.df$fupacts > 10)
# summary(test.rep)
# summary(test.rep.gt10)
par(mfrow = c(1, 2))
hist(test.rep, main="# Plot C ( 1K Simulations)", xlab="proportion of sex acts = 0", col="blue")
hist(test.rep.gt10, main="# Plot D (1K Simulations)", xlab="proportion of of sex acts > 10", col="blue")
#'
#'
#' ### Plot C: Frequency of number of unprotected sex acts at followup = 0
#'
#' ```
#' ## Min. 1st Qu. Median Mean 3rd Qu. Max.
#' ## 0.1290 0.2166 0.2373 0.2388 0.2604 0.3364
#' ```
#'
#' After performing predictive simulation based on overdispersed model, we found out that the percentage of hypothesized observations with number of unprotected sex acts at followup equal to 0 varied from 21% to 26%, with most of replication averaging at 24% - all of which were reasonably much closer to the observed test statistic of 29% compared to the previous model. In summary, this tells us that this model reasonably fits this aspect of data well, however other aspects of this data might not be well fit by this model.
#'
#' ### Plot D: Frequency of number of unprotected sex acts at followup > 10
#'
#' ```
#' ## Min. 1st Qu. Median Mean 3rd Qu. Max.
#' ## 0.2834 0.3664 0.3871 0.3881 0.4124 0.4862
#' ```
#'
#' The actual dataset had 36% of the observations having number of unprotected sex acts at followup that is greater than 10. After the predictive simulation, we found out that the percentage of hypothesized observations with number of unprotected sex acts at followup greater than 10 varied from 37% `(1stIQR)` to 41% `(3rdIQR)` with an average of 39% - all of which were reasonably much closer to the observed test statistic of 36% than the previous model. This tells us that this model reasonably fits this aspect of data well, however other aspects of this data might not be well fit by this model.
#'
#'
#' ## Part C:
#' ``Repeat (b), also including ethnicity and baseline number of unprotected sex acts as input variables.``
#'
#' ### Fit an overdispersed Poisson regression model
#'
riskyBehaviours.glm3 <- glm(fupacts ~ bs_hiv + bupacts, family=quasipoisson, data=riskyBehaviours.df)
texreg(list(riskyBehaviours.glm3),
custom.model.names = c("Model 3: overdispersed Poisson"),
single.row=TRUE, float.pos = "h")
#'
#' Much better model than the previous model but it does not meet the standard, the ratio of residual deviance to degree of freedom is 24 which is still far from 1.
#'
#' ### 1000 Simulations
#'
# simulate 1000
n.sims <- 1000
riskyBehaviours.sims3<- arm::sim(riskyBehaviours.glm3, n.sims)
n<- length(riskyBehaviours.df$fupacts)
y.rep <- array(NA, c(n.sims,n))
beta <- coef(riskyBehaviours.sims3)
X = cbind(1,as.numeric(riskyBehaviours.df$bs_hiv_bin),riskyBehaviours.df$bupacts)
# do 1000 simulations
overdisp <- summary(riskyBehaviours.glm3)$dispersion
for(i in 1:n.sims){
y.hat <- exp(X%*%beta[i,])
a <- y.hat/(overdisp-1) # dispersion param
y.rep[i,]<-rnegbin(n,y.hat, a)
}
# test statistics
test.rep <- rep(NA, n.sims)
test.rep.gt10 <- rep(NA, n.sims)
for (i in 1:n.sims){
test.rep[i]<- mean(y.rep[i,]==0)
test.rep.gt10[i]<-mean(y.rep[i,]>10)
}
#summary(rowSums(y.hat == 0)/ncol(y.hat))
#summary(rowSums(y.hat > 10)/ncol(y.hat))
real.gt.0= mean(riskyBehaviours.df$fupacts == 0)
real.gt.10= mean(riskyBehaviours.df$fupacts > 10)
# summary(test.rep)
# summary(test.rep.gt10)
par(mfrow = c(1, 2))
hist(test.rep, main="# Plot E ( 1K Simulations)", xlab="proportion of sex acts = 0", col="blue")
hist(test.rep.gt10, main="# Plot F (1K Simulations)", xlab="proportion of of sex acts > 10", col="blue")
#'
#'
#' ### Plot E: Frequency of number of unprotected sex acts at followup = 0
#'
#' ```
#' ## Min. 1st Qu. Median Mean 3rd Qu. Max
#' ## 0.1198 0.1843 0.2028 0.2041 0.2235 0.2857
#' ```
#'
#' After performing predictive simulation based on overdispersed model with added predictor (bupacts), we found out that the percentage of hypothesized observations with number of unprotected sex acts at followup equal to 0 varied from 18% to 22%, with most of replication averaging at 20% - most of which were closer to the observed test statistic of 29%
#'
#' ### Plot F: Frequency of number of unprotected sex acts at followup > 10
#'
#' ```
#' ## Min. 1st Qu. Median Mean 3rd Qu. Max.
#' ## 0.3111 0.3802 0.4032 0.4033 0.4263 0.4908
#' ```
#'
#' The actual dataset had 36% of the observations having number of unprotected sex acts at followup that is greater than 10. After the predictive simulation, we found out that the percentage of hypothesized observations with number of unprotected sex acts at followup greater than 10 varied from 38% `(1stIQR)` to 42% `(3rdIQR)` with an average of 40% - all of which were closer to the observed test statistic of 36%.
#'
#'
#' In summary this model really does well in compromising and balancing between the 2 aspect of data we have accessed. The upshot is that this model fit the data much much better compared to the previous 2 model
#'
#'
#' \onecolumn
#'
#' # Chapter 11 Question 4:
#' The folder cd4 has CD4 percentages for a set of young children with HIV who were measured several times over a period of two years. The dataset also includes the ages of the children at each measurement.
#'
#' ### Time Function
#'
# load required data
cd4.df=read.csv("data/cd4.csv")
cd4.df$newpid= as.character(cd4.df$newpid)
cd4.df$treatmnt= as.factor(cd4.df$treatmnt)
# transfrom data
time<-cd4.df$visage-cd4.df$baseage
cd4.df<-cbind(cd4.df,time)
# define ggplot alpha level
cd4.df = ddply(cd4.df, .(newpid), function(x){
x$alpha = ifelse(runif(n = 1) > 0.9, 1, 0.1)
x$is_test = factor(ifelse(x$newpid<5, 0, 1))
x
})
# define timne function
time_function_plot <- ggplot(cd4.df, aes(x = time, y = cd4pct) )+
geom_point()+
labs(y = "CD4 Percentage", x="Time (visit_age - base_Age) ")+
aes(alpha=alpha, color=factor(newpid)) +
theme_classic()+
theme(legend.position="none")
time_function_plot
#'
#'
#'
#' We define our time function as visit_age - base_Age. This ensured that all children visit started from day 0 and had a serial/sequential progression of events as shown in the graph above. Some data points (children) have been grey scaled for interchangeability.
#'
#' It is also interesting to note that most visit occurred quarterly
#'
#'
#' ## Part A:
#' ``Graph the outcome (the CD4 percentage, on the square root scale) for each child as a function of time.``
#'
#'
#'
#' ### Overall Plot - all 254 children
#'
cd4_c11_q4.plot<- ggplot(cd4.df, aes(x = time, y = cd4pct)) +
guides(colour=FALSE) +
labs(y = "CD4 Percentage", x="Time (visit_age - base_Age) ")+
theme_classic()+ guides(fill=FALSE)+
theme(legend.position="none") +
scale_y_sqrt()
#geom_smooth(se=FALSE, colour="black", method = "lm", size=2)
#facet_wrap(~person_id)
cd4_c11_q4.plot +
geom_line()+
geom_point()+
aes(color = factor(newpid))
#'
#'
#' The plot above is difficult synthesize and inter prate, let's try plotting 5% of the children.
#' The below shows 5% of the children (colored) and the remaining 90% of children (grey scale).
#'
#' ### Plot for 5% of all children
#'
cd4_c11_q4a.plot.2<- cd4_c11_q4.plot+
geom_line(aes(alpha=alpha, color=factor(newpid)) )+
geom_point(aes(alpha=alpha, color=factor(newpid)) )+
guides(alpha=FALSE)
#geom_smooth(se=T, colour="black", method = "lm", size=1, linetype=6)
subset <- c(3,4,6,7,10,11,12,15,17,19,62,63)
cd4_c11_q4a.plot.3<- ggplot(cd4.df[cd4.df$newpid %in% subset,],
aes(x = time, y = cd4pct)) +
guides(colour=FALSE) +
labs(y = "CD4 Percentage", x="Time (visit_age - base_Age) ")+
theme_bw()+
theme(legend.position="none") +
geom_line()+
geom_point()+
aes(color = factor(newpid)) +
# geom_smooth(se=T, colour="red", method = "lm", linetype=6)+
facet_wrap(~factor(newpid))
#grid.arrange(cd4_c11_q4a.plot.2, cd4_c11_q4a.plot.3, nrow = 2.)
cd4_c11_q4a.plot.2
cd4_c11_q4a.plot.3
#'
#'
#'
#'
#'
#'
#' ## Part B:
#' ``Each child’s data has a time course that can be summarized by a linear fit. Estimate these lines and plot them for all the children.``
#'
#'
#'
cd4_c11_q4b.plot.3<- ggplot(cd4.df, aes(x = time, y = cd4pct, color=factor(newpid)) )+
# geom_point()+
geom_smooth(se=F,method = "lm", linetype=1) +
labs(y = "CD4 Percentage", x="Time (visit_age - base_Age) ")+
theme_classic()+
theme(legend.position="none")
cd4_c11_q4b.plot.3
#'
#'
#' From the above plot, it is difficult to decipher the magnitude and direction of each regression line. The next page provides a plot for only a subset of these children which is more interpretable.
#'
#'
cd4_c11_q4b.plot.4 <- ggplot(cd4.df)+
#geom_point(
# aes(x = time, y = cd4pct, alpha=alpha, color=factor(newpid) ) )+
geom_line(stat = "smooth",
method = lm,
aes(x = time, y = cd4pct, alpha=alpha, color=factor(newpid))
) +
labs(y = "CD4 Percentage", x="Time (visit_age - base_Age) ")+
coord_cartesian(ylim = c(0,60), xlim = c(0, 2))+
theme_classic()+
theme(legend.position="none")
cd4_c11_q4b.plot.4
#'
#'
#'
#'
# selective subsetting
subset <- c(3,4,6,7,10,11,12,15,17,19,62,63)
cd4_c11_q4b.plot.2<- ggplot(cd4.df[cd4.df$newpid %in% subset,],
aes(x = time, y = cd4pct)) +
guides(colour=FALSE) +
labs(y = "CD4 Percentage", x="Time (visit_age - base_Age) ")+
theme_bw()+
theme(legend.position="none") +
geom_line()+
geom_point()+
aes(color = factor(newpid)) +
# geom_smooth(se=T, colour="red", method = "lm", linetype=6)+
facet_wrap(~factor(newpid)) +
geom_smooth(se=F, colour="black", method = "lm", linetype=6)
cd4_c11_q4b.plot.2
#'
#'
#' For some children, cd4 percent increases over time. However, for most children, cd4 percent decreases with time indicating that time is negatively associated with the outcome.
#'
#' This is rather odd! we expect that most of the children to have increasing cd4 percent over time. Lets take a look at the general trend of all children in general (Group level regression) as shown below.
#'
#'
cd4_c11_q4b.plot<- ggplot(cd4.df, aes(x = time, y = cd4pct)) +
geom_line() +
# geom_point()+
guides(colour=FALSE) +
labs(y = "CD4 Percentage", x="Time (visit_age - base_Age) ")+
theme_classic()+ guides(fill=FALSE)+
theme(legend.position="none") +
aes(alpha=alpha, color=factor(newpid)) +
geom_smooth(se=T, colour="black", method = "lm", linetype=6)
cd4_c11_q4b.plot
#'
#'
#' The dotted line is the population level regression and apparently it has a negative slope. It's likely that some treatment/ARVs works better than others or maybe other child level factors might be confounded with the design variables. This is beyond the scope of this analysis, therefore we shall not make any adjustment or include other predictors.
#'
#'
#' ## Part C:
#' ``Set up a model for the children’s slopes and intercepts as a function of the treatment and age at baseline. Estimate this model using the two-step procedure–first estimate the intercept and slope separately for each child, then fit the between-child models using the point estimates from the first step.``
#'
#'
# step 1
df_1 = cd4.df %>% na.omit() %>%
group_by(newpid) %>%
dplyr::select(newpid,cd4pct,time) %>%
mutate(intercept = coef(lm(cd4pct~time))[1]) %>%
mutate(slope = coef(lm(cd4pct~time))[2]) %>%
dplyr::select(newpid, intercept,slope)
# step 2
df_2 = inner_join(cd4.df,df_1,by="newpid")
model.intercept = lm(intercept~baseage+treatmnt, data =df_2 )
model.slope = lm(slope~baseage+treatmnt, data =df_2 )
texreg(list(model.intercept,model.slope),
custom.model.names = c("Intercept Model", "Slope Model"),
single.row=TRUE, float.pos = "h")
#'
#'
#' \onecolumn
#'
#' # Chapter 12 Question 2:
#' Continuing with the analysis of the CD4 data from Exercise 11.4:
#'
#' ## Part A:
#' ``Write a model predicting CD4 percentage as a function of time with varying intercepts across children. Fit using lmer() and interpret the coefficient for time.``
#'
#'
model_chap12_q2a = lmer(cd4pct~1+time+(1|newpid),data=cd4.df,REML=F) # varying intercepts
texreg(list(model_chap12_q2a),
custom.model.names = c("Model 12.2 A"),
single.row=TRUE, float.pos = "h")
#summary(model_chap12_q2a)
#coef(model_chap12_q2a)
#'
#'
#' Given that this model is a random intercept model, it only accounts for "idiosyncratic" variation that is due to individual level differences in baseline cd4 percent. It also assumes that time effects on cd4 percent is fixed meaning that every child cd4 percent varies systematically with time.
#'
#' * **Time** - Given that time is a fixed effect on every child, each year on the study is associated with a decreased in cd4 percentage by 3 % on average. In this model, time was statistically significant and had a substantial effect on cd4 percentage.
#'
#' ## Part B:
#' ``Extend the model in (a) to include child-level predictors (that is, group-level predictors) for treatment and age at baseline. Fit using lmer() and interpret the coefficients on time, treatment, and age at baseline.``
#'
#'
#'
#'
#model_chap12_q2b=lmer(cd4pct~1+time+(1+treatmnt+baseage|newpid),data=cd4.df,REML=F)
model_chap12_q2b=lmer(cd4pct~1+time+treatmnt+baseage+(1|newpid),data=cd4.df,REML=F)
texreg(list(model_chap12_q2b),
custom.model.names = c("Model 12.2 B"),
single.row=TRUE, float.pos = "h")
#summary(model_chap12_q2b)
#coef(model_chap12_q2b)
#toLatex(mtable(model_chap12_q2b))
#'
#'
#' In general, adding child-level predictors (i.e, group-level predictors) definitely improves the model. In fact it increased the precision of our estimates. Note that the variance of our random effect estimate has reduced from 128 to 123 meaning some of the variations has been explained by addition of fixed effect predictors like base age and treatment. Conversely, residual has not reduced by much possibly because we haven't added another random effect predictor that is due to child-specific idiosyncrasies.
#'
#' ### Interpretation
#'
#' * **Treatment** - Children who were on second treatment had 1.2125% higher cd4 percent on average than children on first treatment. This predictor had substantial effect on cd4 percent however it was insignificant.
#'
#' * **Baseline Age** - 1 year difference in baseline age is associate with changing cd4 percentage by 0.9485 % on average. Meaning children who were enrolled into the study early had higher cd4 percentage. This predictor was highly significant.
#'
#' * **Time** - Each year on treatment was associated with a decreased in cd4 percentage by 2.9615 % on average. This predictor was highly significant.
#'
#' ## Part C:
#' ``Investigate the change in partial pooling from (a) to (b) both graphically and numerically.``
#'
#'
#'
#'
#' ### Graphically
#'
#'
# complete pooling
complete_pooling.fit<-lm(cd4pct~time, data=cd4.df)
df_complete_pooling <- data_frame(
model = "Complete pooling",
newpid = unique(cd4.df$newpid),
intercept = coef(complete_pooling.fit)[1],
slope_time = coef(complete_pooling.fit)[2])
# model for question 2 A
df = coef(model_chap12_q2a)[["newpid"]]
df$intercept = df$'(Intercept)'
df_chap12_q2a <- df %>%
dplyr::select(intercept, slope_time = time) %>%
as_tibble() %>%
rownames_to_column("newpid") %>%
add_column(model = "Model 12.2 A")
# model for question 2 B
df = coef(model_chap12_q2b)[["newpid"]]
df$intercept = df$'(Intercept)'
df_chap12_q2b <- df %>%
dplyr::select(intercept,
slope_time = time,
treatmnt_slope= treatmnt2, baseage_slope= baseage ) %>%
as_tibble() %>%
rownames_to_column("newpid") %>%
add_column(model = "Model 12.2 B")
# bind data
df_models <- bind_rows(df_chap12_q2a, df_chap12_q2b, df_complete_pooling) %>%
left_join(cd4.df, by = "newpid")
# create subset
subset <- c(67,72,77,78,79,11,12,19,27,23,62,63)
# construct plot
model_comparison <- ggplot(df_models[df_models$newpid %in% subset,]) +
aes(x = time, y = cd4pct) +
geom_abline(aes(intercept = intercept, slope = slope_time, color = model),
size = .75) +
#theme_bw()+
#geom_line( linetype=6)+
geom_point()+
facet_wrap("newpid") +
labs(y = "CD4 Percentage", x="Time (visit_age - base_Age) ")+
scale_x_continuous(breaks = 0:4 * 2) +
scale_color_brewer(palette = "Dark2") +
theme(legend.position = "top")
model_comparison
#'
#'
#'
#' A few observation to make.
#'
#' * From the plot, observe that the partial pooling model with added child-level predictors (Model 12.2B) is pulled substantially towards the complete pooling model (group average) compared to Model 12.2.A
#'
#' * If you take a look at children with less observations (incomplete data), you will notice that Model 12.2.B is pulled more towards the complete pooling model (group average) compared to Model 12.2.A. See child 67, 79, 77
#'
#'
#' ### Numerically
#'
anova_mixd <- anova(model_chap12_q2a,model_chap12_q2b)
xtable(anova_mixd, comment=FALSE)
#'
#'
#' Using ANOVA to check the changes between the 2 models, we find out that adding additional child-level predictors actually reduced the residual sum of squares . This reduction was statistically significant. We also had lower AIC and BIC in the second model implying that the additional regressors actually improved the model
#'
#'
#'
#'
#coef(model_chap12_q2a)
q2a_plot <- as.data.frame(lme4::ranef(model_chap12_q2a,condVar=T)) %>%
ggplot(aes(y=grp,x=condval))+
geom_point()+facet_wrap(~term,scales="free_x")+
geom_errorbarh(aes(xmin=condval-2*condsd,xmax=condval+2*condsd),height=0)+
labs(x="Random Effects", y="Children") +
theme_classic()+
guides(fill=FALSE) +
ggtitle("Model 12.2 A")
q2b_plot <- as.data.frame(lme4::ranef(model_chap12_q2b,condVar=T)) %>%
ggplot(aes(y=grp,x=condval))+
geom_point()+facet_wrap(~term,scales="free_x")+
geom_errorbarh(aes(xmin=condval-2*condsd,xmax=condval+2*condsd),height=0)+
labs(x="Random Effects", y="Children") +
theme_classic()+
guides(fill=FALSE)+
ggtitle("Model 12.2 B")
grid.arrange(q2a_plot,q2b_plot, ncol = 2.)
#'
#'
#' From the above plot, notice the scale of each plot. Model 12.2.A has higher random error than Model 12.2.B. This implies that adding child-level predictors reduces the variance of our random effect estimate. We noticed a substantial reduction in the variance of the intercept - 128 to 123. This is mainly because some of the variations had been explained by the addition of fixed effect predictors like base age and treatment.
#'
#' ## Part D:
#' Compare results in (b) to those obtained in part (c).
#'
#' As seen in the previous question, We observed that by adding child-level predictors (i.e, group-level predictors) the model improved significantly hence increasing the precision of our estimates. The addition of the 2 fixed effect predictors like base age and treatment made the variance of the random effect estimates to reduce by 5 units essentially by compromising between group level and individual level variation.
#' However, the residual did not reduced by much possibly because we did not add another random effect predictor that is due to child-specific idiosyncrasies e.g ARVS.
#'
#'
df_zoom <- df_models %>%
filter(newpid %in% c("63", "67", "79", "77"))
model_comparison %+% df_zoom
#'
#'
#' In summary, this shrinkage effect observed in model 12.2.B ensures that our model compromised between individual-level effects and group level effects at the same regularizing extreme estimates and taming children with less visits/observations as depicted by child 67,79 and 77.
#'
#' \onecolumn
#'
#' # Chapter 12 Question 3:
#'
#' `Predictions for new observations and new groups:`
#'
#' ## Part A:
#' ``Use the model fit from Exercise 12.2(b) to generate simulation of predicted CD4 percentages for each child in the dataset at a hypothetical next time point.``
#'
#'
set.seed(111)
n.sims = 1000
# create hypothetical next time point
cd4.df3 = cd4.df %>%
na.omit() %>% # remove missing values
group_by(newpid) %>% # group by PID
mutate(current.time = max(time)) %>% # get maximum time for each child
mutate(future.time = current.time + .25) # add .25 year beacause appintments are done quarterly
# slelect only relevant time
cd4.df4 = cd4.df3 %>%
dplyr::select(newpid,treatmnt,baseage,future.time) %>%
rename(time=future.time) %>%
unique() # make sure it is a one row per patient dataset