-
Notifications
You must be signed in to change notification settings - Fork 17
/
3-combine_category_kappa.R
178 lines (150 loc) · 7.81 KB
/
3-combine_category_kappa.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# J. Taroni Jul 2016
# The purpose of this script is to combine and save Kappa statistics from category
# predictions on hold-out data. It should be run from the command line
# through the classifier_repeat_wrapper.R script or alternatively
# USAGE: Rscript 3-combine_category_kappa.R
option_list <- list(
optparse::make_option("--cancer_type",
default = NA_character_,
help = "Cancer type"),
optparse::make_option("--predictor",
default = NA_character_,
help = "Predictor used"),
optparse::make_option("--null_model",
action = "store_true",
default = FALSE,
help = "Use null model as baseline for plotting delta kappa")
)
opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list))
source(here::here("util/option_functions.R"))
check_options(opt)
# load libraries
suppressMessages(source(here::here("load_packages.R")))
source(here::here("util", "color_blind_friendly_palette.R"))
# set options
cancer_type <- opt$cancer_type
predictor <- opt$predictor
null_model <- opt$null_model
file_identifier <- str_c(cancer_type, predictor, sep = "_")
# define directories
plot.dir <- here::here("plots")
plot.data.dir <- file.path(plot.dir, "data")
res.dir <- here::here("results")
# list array and seq files from results directory
lf <- list.files(res.dir, full.names = TRUE)
array.files <- lf[grepl(paste0(file_identifier,
"_train_3_models_array_kappa_"), lf)]
seq.files <- lf[grepl(paste0(file_identifier,
"_train_3_models_seq_kappa_"), lf)]
if (null_model) {
null_array.files <- lf[grepl(paste0(file_identifier,
"_null_train_3_models_array_kappa_"), lf)]
null_seq.files <- lf[grepl(paste0(file_identifier,
"_null_train_3_models_seq_kappa_"), lf)]
# check that we have ordered pairs of regular and null files for array and seq
array_seeds <- stringr::str_sub(array.files, -8, -5)
null_array_seeds <- stringr::str_sub(null_array.files, -8, -5)
seq_seeds <- stringr::str_sub(seq.files, -8, -5)
null_seq_seeds <- stringr::str_sub(null_seq.files, -8, -5)
if (!(all(array_seeds == null_array_seeds) &
all(seq_seeds == null_seq_seeds))) {
stop("Array or seq seeds do not match in delta kappa plotting script.")
}
}
# define output files
test.df.filename <- ifelse(null_model,
file.path(plot.data.dir,
paste0(file_identifier,
"_train_3_models_delta_kappa.tsv")),
file.path(plot.data.dir,
paste0(file_identifier,
"_train_3_models_kappa.tsv")))
summary.df.filename <- ifelse(null_model,
file.path(plot.data.dir,
paste0(file_identifier,
"_train_3_models_delta_kappa_summary_table.tsv")),
file.path(plot.data.dir,
paste0(file_identifier,
"_train_3_models_kappa_summary_table.tsv")))
#### read in data --------------------------------------------------------------
# read in the tables that contain the kappa statistics for predictions on test
# data
array.list <- list() # initialize list that will hold all array tables
seq.list <- list() # initialize list that will hold all the RNA-seq tables
for (file_index in 1:length(array.files)) {
array.list[[file_index]] <- fread(array.files[file_index], data.table = F)
seq.list[[file_index]] <- fread(seq.files[file_index], data.table = F)
}
if (null_model) {
null_array.list <- list() # initialize list that will hold null array tables
null_seq.list <- list() # initialize list that will hold null RNA-seq tables
for (null_file_index in 1:length(null_array.files)) {
null_array.list[[null_file_index]] <- fread(null_array.files[null_file_index], data.table = F)
null_seq.list[[null_file_index]] <- fread(null_seq.files[null_file_index], data.table = F)
}
# calculate delta kappa values
delta_kappa_array.list <- list() # list for delta kappa array values
delta_kappa_seq.list <- list() # list for delta kappa seq values
for (pair_index in 1:length(array.files)) {
delta_kappa_array.list[[pair_index]] <- array.list[[pair_index]] %>%
left_join(null_array.list[[pair_index]],
by = c("perc.seq", "classifier", "norm.method"),
suffix = c(".true", ".null")) %>%
mutate(delta_kappa = kappa.true - kappa.null) %>% # regular kappa - null kappa
select(delta_kappa, auc.true, sensitivity.true, specificity.true, perc.seq, classifier, norm.method)
delta_kappa_seq.list[[pair_index]] <- seq.list[[pair_index]] %>%
left_join(null_seq.list[[pair_index]],
by = c("perc.seq", "classifier", "norm.method"),
suffix = c(".true", ".null")) %>%
mutate(delta_kappa = kappa.true - kappa.null) %>% # regular kappa - null kappa
select(delta_kappa, auc.true, sensitivity.true, specificity.true, perc.seq, classifier, norm.method)
}
}
# combine all tables from each platform into a data.frame
# cannot use ifelse() because return value must be same dim as conditional test
if (null_model) {
array.df <- data.table::rbindlist(delta_kappa_array.list)
seq.df <- data.table::rbindlist(delta_kappa_seq.list)
} else {
array.df <- data.table::rbindlist(array.list) %>%
select(kappa, auc, sensitivity, specificity, perc.seq, classifier, norm.method)
seq.df <- data.table::rbindlist(seq.list) %>%
select(kappa, auc, sensitivity, specificity, perc.seq, classifier, norm.method)
}
#### save test set results -----------------------------------------------------
# bind all kappa stats together
test.df <- cbind(rbind(array.df, seq.df),
c(rep("Microarray", nrow(array.df)),
rep("RNA-seq", nrow(seq.df))))
colnames(test.df) <- c("Kappa", "AUC", "Sensitivity", "Specificity", "Perc.Seq", "Classifier",
"Normalization", "Platform")
# order %seq to display 0-100
test.df$Perc.Seq <- factor(test.df$Perc.Seq, levels = seq(0, 100, 10))
# recode model types
cls.recode.str <-
"'glmnet' = 'LASSO'; 'rf' = 'Random Forest'; 'svm' = 'Linear SVM'"
test.df$Classifier <- car::recode(test.df$Classifier,
recodes = cls.recode.str)
# capitalize norm methods
test.df$Normalization <- as.factor(toupper(test.df$Normalization))
test.df$Classifier <- as.factor(test.df$Classifier)
readr::write_tsv(test.df,
test.df.filename) # delta or not delta in file name
# get summary data.frame + write to file
summary.df <- test.df %>%
dplyr::group_by(Classifier, Normalization, Platform, Perc.Seq) %>%
dplyr::summarise(Median_Kappa = median(Kappa, na.rm = TRUE),
Mean_Kappa = mean(Kappa, na.rm = TRUE),
SD_Kappa = sd(Kappa, na.rm = TRUE),
Median_AUC = median(AUC, na.rm = TRUE),
Mean_AUC = mean(AUC, na.rm = TRUE),
SD_AUC = sd(AUC, na.rm = TRUE),
Median_Sensitivity = median(Sensitivity, na.rm = TRUE),
Mean_Sensitivity = mean(Sensitivity, na.rm = TRUE),
SD_Sensitivity = sd(Sensitivity, na.rm = TRUE),
Median_Specificity = median(Specificity, na.rm = TRUE),
Mean_Specificity = mean(Specificity, na.rm = TRUE),
SD_Specificity = sd(Specificity, na.rm = TRUE),
.groups = "drop")
readr::write_tsv(summary.df,
summary.df.filename) # delta or not delta in file name