-
Notifications
You must be signed in to change notification settings - Fork 3
/
ConvolutionsSAR.R
249 lines (217 loc) · 9.73 KB
/
ConvolutionsSAR.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
# Authors: Christophe Fraser, Chris Wymant, Luca Ferretti
library(dplyr)
library(tidyr)
library(ggplot2)
library(readr)
# WARNING: command below deletes all objects currently in R's memory
rm(list=ls())
# A csv file with columns startDate, receivedRiskyContactNotification,
# and hasHadRiskyContact_AND_receivedPositiveTestResult. These are explained in
# supplementary material. Briefly: the date, the number of people notified,
# and the number of people reporting a positive test result after having been
# recently notified.
file.app.data <- "notifications_and_positives.csv"
##############################################################################
# PARAMETERS:
# Time step size
dt <- 0.1
# Parameters for the exposure-to-notification distribution. These are the
# median values of all days in December.
mean.exposure.to.notification <- 2.6 # days.
variance.exposure.to.notification <- 2.2^2 # days^2
#Incubation period: lognormal with mean 5.42 days and SD 2.7 days
mean.log.exposure.to.symptoms <- 1.58
sd.log.exposure.to.symptoms <- 0.47
# Symptoms to positive distribution parameters
mean.symptoms.to.positive <- 3 # days
variance.symptoms.to.positive <- 2.5 # days^2
# Neglect the probability that notification to positive could take longer than
# this many days:
max.days.n.to.p <- 30
# When calculating the likelihood of the observed number of positives given
# the notifications on previous days (and the SAR), censor this many days at the
# start (because positives lag behind notifications, so "burn in" is needed to
# get enough data).
first.n.days.to.censor <- 7
################################################################################
# CALCULATION
# NB Gamma distribution: shape = mean^2/variance, scale = variance/mean
sh.exposure.to.notification <- (mean.exposure.to.notification)^2/variance.exposure.to.notification
sc.exposure.to.notification <- variance.exposure.to.notification/mean.exposure.to.notification
pdf.exposure.to.notification <- Vectorize(function(tau) {
return(dgamma(tau,shape = sh.exposure.to.notification, scale = sc.exposure.to.notification))
})
pdf.exposure.to.symptoms <- Vectorize(function(tau) { # aka the incubation period
return(dlnorm(tau, meanlog = mean.log.exposure.to.symptoms,
sdlog = sd.log.exposure.to.symptoms))
})
sh.symptoms.to.positive <- (mean.symptoms.to.positive)^2/variance.symptoms.to.positive
sc.symptoms.to.positive <- variance.symptoms.to.positive/mean.symptoms.to.positive
pdf.symptoms.to.positive <- Vectorize(function(tau) {
return(dgamma(tau,shape = sh.symptoms.to.positive, scale = sc.symptoms.to.positive))
})
#convolution of exposure to symptoms, and symptoms to positive
pdf.exposure.to.positive <- Vectorize(function(tau) {
if (tau==0) return(0) else {
dtau = tau/100
s <- seq(0, tau, by = dtau)
conv <- dtau * sum(pdf.exposure.to.symptoms(s) * pdf.symptoms.to.positive(tau - s))
return(conv)
}
})
tau.precompute <- seq(0,100,by=dt)
pdf.exposure.to.positive.precompute <- sapply(tau.precompute, pdf.exposure.to.positive)
tau2.precompute <- seq(-10,90,by=dt)
convolution <- tau2.precompute
for(i in 1:length(tau2.precompute)) {
tau2 <- tau2.precompute[i]
conv <- 0
blist <- seq(0,min(100,100-tau2),by=dt)
for(j in 1:length(blist)){
b <- blist[j]
if(tau2+b>0) {
# tau2 = notification-to-positive time. b = exposure-to-notification time.
# tau2 + b = exposure-to-positive time
conv <- conv + pdf.exposure.to.positive.precompute[1+(tau2+b)/dt] *
pdf.exposure.to.notification(b)
}
}
conv <- conv * dt
convolution[i] <- conv
}
# A df with the values of notification-to-positive pdf, by time step
df <- tibble(tau = tau2.precompute, pdf = convolution)
df$pdf <- df$pdf/sum(df$pdf)
# A df with the values of notification-to-positive pdf, discretised by day
df.discrete <- tibble(tau = (-10):max.days.n.to.p,
pmf = NA_real_)
for (i in 1:nrow(df.discrete)) {
df.discrete$pmf[[i]] <- df %>%
filter(tau < df.discrete$tau[[i]] + 0.5,
tau >= df.discrete$tau[[i]] - 0.5) %>%
pull(pdf) %>%
sum()
}
# Neglect negative delays (testing positive before being notified)
df.discrete <- df.discrete %>% filter(tau>=0)
df.discrete$pmf <- df.discrete$pmf/sum(df.discrete$pmf)
sar.df <- read_csv(file.app.data)
sar.df <- sar.df %>%
# The receivedRiskyContactNotification field was introduced in the release on Dec 17:
filter(startDate >= as.Date("2020-12-17")) %>%
select(startDate, receivedRiskyContactNotification,
hasHadRiskyContact_AND_receivedPositiveTestResult)
# Check that dates in the df consistently increase by 1
stopifnot(sar.df$startDate == sort(sar.df$startDate))
num.days <- nrow(sar.df)
stopifnot(as.integer(difftime(sar.df$startDate[[num.days]], sar.df$startDate[[1]],
units = "days")) == num.days - 1L)
sar.df$day.as.int <- 1:num.days
# Calculate the number of positives-among-those-recently-notified that would be
# expected if the SAR were 1:
sar.df$positives.expected.with.unit.sar <- 0
for (day.positive in 1:num.days) {
for (days.since.notification in 0:min(day.positive - 1, max.days.n.to.p)) {
sar.df$positives.expected.with.unit.sar[[day.positive]] <-
sar.df$positives.expected.with.unit.sar[[day.positive]] +
sar.df$receivedRiskyContactNotification[[day.positive - days.since.notification]] *
df.discrete$pmf[[days.since.notification + 1]]
}
}
# Calculate the negative log likelihood as a function of SAR. Assume Poisson
# distribution for numbers of positives observed given the number expected.
neg.log.likelihood <- Vectorize(function(sar) {
positives.expected <- sar.df$positives.expected.with.unit.sar * sar
neg_log_likelihood_by_day <- positives.expected -
sar.df$hasHadRiskyContact_AND_receivedPositiveTestResult *
log(positives.expected)
sum(neg_log_likelihood_by_day[seq(first.n.days.to.censor + 1, num.days)]) # censor early days
})
sar.ml <- optimize(neg.log.likelihood, c(0.001, 0.999))$minimum
# Calculate 95% confidence interval by likelihood profiling
best.neg.log.likelihood <- neg.log.likelihood(sar.ml)
sar.ml.lower <- uniroot(function(sar) {
neg.log.likelihood(sar) - 1.92 - best.neg.log.likelihood},
lower = 0.001, upper = sar.ml)$root
sar.ml.upper <- uniroot(function(sar) {
neg.log.likelihood(sar) - 1.92 - best.neg.log.likelihood},
lower = sar.ml, upper = 0.99)$root
################################################################################
# RESULT
cat(paste0("SAR = ", round(100 * sar.ml, 2), "% (", round(100 * sar.ml.lower, 2),
" - ", round(100* sar.ml.upper, 2), "%)"))
################################################################################
# DEPRECATED THINGS
# Our original SAR calculation calculated one value of SAR per day, summarising
# these values with a median:
use.deprecated.daily.sar <- FALSE
if (use.deprecated.daily.sar) {
# Calculate the denominator terms appearing in the final expression for SAR.
# For each day on which one could test positive, y, sum notifications from
# previous days x multiplied by the probability of a notification-to-positive
# delay being y-x.
sar.df$sar.denominator.term <- NA_real_
for(y in 1:num.days) {
denominator_today <- 0
for (x in 1:num.days) {
notification.to.positive <- y - x
if (notification.to.positive >= 0 && notification.to.positive <= max.days.n.to.p) {
denominator_today <- denominator_today +
df.discrete$pmf[[notification.to.positive + 1]] *
sar.df$receivedRiskyContactNotification[[x]]
}
}
sar.df$sar.denominator.term[[y]] <- denominator_today
}
# For each day t, calculate SAR(t) by summing over the contributing
# y terms as shown in our final expression for SAR(t)
sar.df$sar <- NA_real_
for(t in 1:num.days) {
sar_today <- 0
for (y in 1:num.days) {
notification.to.positive <- y - t
if (notification.to.positive >= 0 && notification.to.positive <= max.days.n.to.p) {
sar_today <- sar_today +
sar.df$hasHadRiskyContact_AND_receivedPositiveTestResult[[y]] *
df.discrete$pmf[[notification.to.positive + 1]] /
sar.df$sar.denominator.term[[y]]
}
}
sar.df$sar[[t]] <- sar_today
}
# The P_NP(t1|t2) matrix defined in our supplement: the probability of being
# notified at time t1 conditional on testing positive at time t2. The matrix
# row indexes t1 and the column indexes t2.
matrix_p_np <- array(0, dim = c(num.days, num.days))
for (t1 in 1:num.days) {
for (t2 in 1:num.days) {
notification.to.positive <- t2 - t1
if (notification.to.positive < 0 ||
notification.to.positive > max.days.n.to.p) next
x.range.for.sum <- seq(max(1, t2 - max.days.n.to.p), t2)
denominator <- sum(df.discrete$pmf[t2 - x.range.for.sum + 1] *
sar.df$receivedRiskyContactNotification[x.range.for.sum])
matrix_p_np[t1, t2] <- df.discrete$pmf[[notification.to.positive + 1]] *
sar.df$receivedRiskyContactNotification[[t1]] / denominator
}
}
# Summing over one column of this matrix is summing over t1, which by definition
# gives 1.
tolerance <- 1e-10
stopifnot(max(abs(colSums(matrix_p_np) - 1)) < tolerance)
# Calculate the SAR each day using the P_NP(t1|t2) matrix as an alternative.
# Verify they agree.
sar.df$sar.alt <- NA_real_
for(t in 1:num.days) {
y.range.for.sum <- seq(t, min(t + max.days.n.to.p, num.days))
sar.df$sar.alt[[t]] <- sum(matrix_p_np[t, y.range.for.sum] *
sar.df$hasHadRiskyContact_AND_receivedPositiveTestResult[y.range.for.sum]) /
sar.df$receivedRiskyContactNotification[[t]]
}
stopifnot(max(abs(sar.df$sar - sar.df$sar.alt)) < tolerance)
sar.median <- sar.df %>%
filter(! censored) %>%
pull(sar) %>%
median()
sar.median
}