-
Notifications
You must be signed in to change notification settings - Fork 0
/
correct_bias.R
142 lines (117 loc) · 5.12 KB
/
correct_bias.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
library(here)
library(tidyverse)
library(metacal)
library(Polychrome)
library(withr)
# Read data ---------------------------------------------------------------
counts <- readr::read_rds(here("data", "species_counts.rds"))
expdesign <- readr::read_tsv(here("data", "metadata.tsv"), col_types="ccddc")
# Prepare -----------------------------------------------------------------
# We will use the metacal package for estimating bias and performing
# calibration in the special case where the bias of all the taxa of interest
# can be directly measured from the control sample. Since we know the exact
# concentration of cells added in the begining of the experiment we can
# estimate a corrected/unbiased relative abundance.
# These are the control samples
controls <- c("HAMBI1","HAMBI2", "HAMBI3", "HAMBI4", "HAMBI5")
# Make a matrix of observed counts
observed_mat <- counts %>%
filter(sample %in% controls) %>%
select(sample, strainID, count) %>%
pivot_wider(names_from="strainID", values_from="count") %>%
column_to_rownames(var="sample") %>%
as.matrix()
# Make a matrix of true proportions
actual_mat <- counts %>%
filter(sample %in% controls) %>%
mutate(prop=round(1/24, 3)) %>%
select(sample, strainID, prop) %>%
pivot_wider(names_from="strainID", values_from="prop") %>%
column_to_rownames(var="sample") %>%
as.matrix()
# Bias estimation ---------------------------------------------------------
with_seed(12378, mc_fit <- estimate_bias(observed_mat, actual_mat, 1, boot = TRUE) %>% print())
bias <- coef(mc_fit) %>% print
mc_fit.summary <- summary(mc_fit)
coef_tb <- mc_fit.summary$coefficients
print(mc_fit.summary)
# Plot bias estimation
bias_est <- coef_tb %>%
mutate(taxon = fct_reorder(taxon, estimate)) %>%
ggplot(aes(taxon, estimate,
ymin = estimate / gm_se^2, ymax = estimate * gm_se^2)) +
geom_hline(yintercept = 1, color = "grey") +
geom_pointrange() +
scale_y_log10() +
coord_flip() +
theme_bw() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
ggsave(here::here("figs", "metalcal_bias_est.svg"), plot=bias_est, device="svg", units="cm", width=17.8, height=12.8)
# Plot model fit
mypal <- unname(createPalette(24, c("#F3874AFF", "#FCD125FF"), M=5000))
observed.fitted <- fitted(mc_fit)
a <- as_tibble(observed.fitted) %>%
pivot_longer(everything()) %>%
mutate(type="Fitted")
b <- as_tibble(actual_mat) %>%
pivot_longer(everything()) %>%
mutate(type="Actual")
c <- as_tibble(observed_mat/rowSums(observed_mat)) %>%
pivot_longer(everything(), values_to="observed")
metacal_fit_plot <- bind_rows(a,b) %>%
left_join(., c) %>%
ggplot(aes(x=observed, y=value, color = name)) +
geom_abline(color = "darkgrey") +
geom_point(show.legend = T) +
scale_color_manual(values=mypal) +
facet_wrap(~type) +
theme_bw() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
ggsave(here::here("figs", "metalcal_fit.svg"), plot=metacal_fit_plot, device="svg", units="cm", width=17.8, height=8.8)
# HAMBI-1842 is off by so much because it actually has 4 copies of the 16S
# rRNA gene!! Metacal procedure accounts for this!
# Calibrate ---------------------------------------------------------------
# Make a matrix of observed counts
ps.mat <- counts %>%
select(sample, strainID, count) %>%
pivot_wider(names_from="strainID", values_from="count") %>%
column_to_rownames(var="sample") %>%
as.matrix()
with_seed(435761, ps.cal <- calibrate(ps.mat, bias, margin=1))
# convert to long
counts_final <- rownames_to_column(as.data.frame(ps.cal), var="sample") %>%
as_tibble() %>%
pivot_longer(`HAMBI-0006`:last_col(), names_to="strainID", values_to="f.correct") %>%
left_join(., counts) %>%
group_by(sample) %>%
mutate(total=sum(count)) %>%
ungroup() %>%
mutate(count.correct=total*f.correct,
f.obs=count/total)
samples <- c("HAMBI1", "HN2-d5", "HN2-d9", "HN2-d13")
metalcal_abund_correct <- counts_final %>%
filter(sample %in% samples) %>%
dplyr::select(sample, strainID, f.correct, f.obs) %>%
pivot_longer(cols=c(f.correct, f.obs), names_to="type", values_to="prop") %>%
ggplot(aes(x=type, prop, fill = strainID)) +
geom_col(width = 0.7) +
scale_fill_manual(values=mypal) +
facet_wrap(~sample, ncol = 5) +
theme_bw() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.text.x = element_text(angle = 90, vjust = 0.5))
ggsave(here::here("figs", "metalcal_abund_correct.svg"), plot=metalcal_abund_correct, device="svg", units="cm", width=17.8, height=8.8)
# Output ------------------------------------------------------------------
# What I am doing here is multiplying each species relative abundance by the
# total number of reads per sample so you get corrected counts. Counts are
# necessary for DEseq normalization etc...
# Also make sure to remove the negative contols
counts_final %>%
mutate(count.correct = round(count.correct)) %>%
filter(!str_detect(sample, "negCTRL")) %>%
dplyr::select(sample, strainID, genus, species, count, count.correct) %>%
write_rds(., here::here("data", "corrected_species_counts.rds"))
#sessioninfo::session_info()