-
Notifications
You must be signed in to change notification settings - Fork 1
/
004C_run_bootstraptest_sig_importances.R
79 lines (57 loc) · 2.45 KB
/
004C_run_bootstraptest_sig_importances.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#Get cohort number as argument
args = commandArgs(trailingOnly = TRUE)
cohort_index = as.integer(args[1])
sig_index = as.integer(args[2])
date_tag = "210317"
source(paste0("/.mounts/labs/reimandlab/private/users/oocsenas/CA2M/",
# source(paste0("/.mounts/labs/reimandlab/private/users/jreimand/CA2M/",
date_tag,
"/bin/000_HEADER.R"))
project_codes = list.files(pff("/data/001I_Sig_RMVs/"))
project_code = project_codes[cohort_index]
print(project_code)
#Load in predictors
ATAC_seq = fread(pff("/data/001B_TCGA_ATACSeq_1MBwindow_processed.csv"))[, -c(1,2)]
Roadmap = fread(pff("/data/001A_Roadmap_DNaseSeq_1MBwindow_processed.csv"))[,-c(1, 2)]
Vierstra = fread(pff("/data/001E_vierstra_dnaseseq_1MB.csv"))[, -c(1,2)]
RT = fread(pff("/data/001F_ENCODE_repliseq_1MBwindow_processed.csv"))[, -c(1, 2)]
Preds = as.data.table(cbind.data.frame(ATAC_seq, Roadmap, Vierstra, RT))
Sigs = unlist(lapply(list.files(paste0(pff("/data/001I_Sig_RMVs/"), project_code)),
function(x) unlist(strsplit(x, split = ".csv"))[1]))
#Function to convert R2 to adjusted R2
adjust_R2 = function(R2, n, k){
adjusted_R2 = 1 - (1 - R2)*(n - 1)/(n - k - 1)
return(adjusted_R2)
}
signature = Sigs[sig_index]
print(signature)
if(!is.na(signature)){
#Process MAF into RMV
Output = as.numeric(fread(paste0(pff("/data/001I_Sig_RMVs/"),
project_code,
"/", signature, ".csv"))[["Mut_counts"]])
if(project_code %in% c("Lymph-CLL", "Lymph-BNHL")){
Preds = Preds[-c(292, 2438)]
Output = Output[-c(292, 2438)]}
get_RF_dt = function(seed){
print(seed)
set.seed(seed)
#Convert input and output to data.table
bootstrap_sample = sample(nrow(Preds), replace = T)
input_bootstrap = Preds[bootstrap_sample]
output_bootstrap = Output[bootstrap_sample]
#Train randomForest
rf = randomForest(x = input_bootstrap,
y = output_bootstrap,
keep.forest = T,
ntree = 1000,
do.trace = F,
importance = T)
return(as.numeric(importance(rf, type = 1, scale = F)))}
RF_results = as.data.table(do.call("rbind.data.frame",
mclapply(seq(1000), get_RF_dt, mc.cores = 20)))
colnames(RF_results) = colnames(Preds)
dir.create(paste0(pff("data/004C_Sig_importance_bootstrapRF_results/"), project_code, "/"))
fwrite(RF_results, paste0(pff("data/004C_Sig_importance_bootstrapRF_results/"), project_code, "/", signature, ".csv"))
print("Finished")
}