-
Notifications
You must be signed in to change notification settings - Fork 0
/
RESPECTEx.R
149 lines (140 loc) · 7.47 KB
/
RESPECTEx.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# RESPECTEx: Reconstituting specific cell type expression
RESPECTEx = function(tag, tissueTypes, expression, cibersort,
purity, purityCol,
cibersortCelltypes, cibersortFilter = TRUE){
# OUTPUT a matrix of mean gene expression values reconstituted for each cell
# type specified. 1 row per gene, and 1 column per cell type.
# parameters:
# (1) tag: the identifier for tumour cells, to be appended to the output.
# Precisely, this will be the column name of the first column
# (representing the tumour cells) of the output.
# (2) tissueTypes: a list of tissueTypes, its order corresponding to the list "expression".
# This will be taken to name the list of expression matrices inputted.
# (3) expression: a list of filepaths pointing to expression matrices of each cohort.
# 1 row per gene (gene names as row names) and 1 column per bulk tumour
# sample. Take a string if only 1 expression matrix is to be analysed.
# (4) cibersort: a list of filepaths pointing to the immune cell proportion data.
# For instance, generated by CIBERSORT. 1 column per cell type and
# 1 row per sample. Take a string if only 1 immune-cell-proportion
# matrix is to be anaylsed.
# (5) purity: a data.frame object, 1 row per sample, of tumour purity estimates.
# (6) purityCol: a string representing the column names corresponding to a column in
# "purity", from which tumour purity estimates are taken.
# (7) cibersortCelltypes: a list of immune cell types included. These must correspond to
# input columns in "cibersort". They will be taken as column
# names of the output matrix.
# (8) cibersortFilter: default = TRUE. whether to filter CIBERSORT immune cell proportion
# estimates to include only cases with a significant (p < 0.05)
# inference as determined by CIBERSORT. Recommended if cohort is of
# substantial (e.g. n > 100) size
print(" === RESPECTEx: Reconstituting Specific Cell Type Expression ===")
#adjustment for non-immune APOBEC expression
expression = lapply(expression, function(t) return(read.table(t, header = T, sep = "\t", row.names = 1)))
names(expression) = tissueTypes
cibersort = lapply(cibersort, function(t) return(read.table(t, header = T, sep = "\t", stringsAsFactors = F)))
if( isTRUE(cibersortFilter) ){
# if isTRUE(cibersortFilter) filter away cases with P > 0.05
# in the CIBERSORT inference according to CIBERSORT output
cibersort = lapply(cibersort, function(tb){
# check names (legacy of TCGA sample codes in case in the
# source file '-' in the sample codes were already converted)
tb[, 1] = sapply(tb[, 1], function(x) return(gsub(".", "-", x, fixed = T)))
tb = tb[tb[, "P.value"] < 0.05, ]
return(tb)
})
}
names(cibersort) = tissueTypes
require(nnls)
adjust = lapply(tissueTypes, function(c){
tp = purity[purity[, "tissueType"] == c, ]
tp = tp[!is.na(tp[, purityCol]), ]
e = expression[[c]]
cib = cibersort[[c]]
colnames(e) = sapply(colnames(e), function(x) return(gsub(".", "-", x, fixed = T)))
#filtering cases included in all three data
tp = tp[which(rownames(tp) %in% colnames(e)), ]
e = e[, which(colnames(e) %in% rownames(tp))]
cib = cib[which(cib[, 1] %in% rownames(tp)), ]
cib = data.frame(cib[which(cib[, 1] %in% colnames(e)), ])
tp = data.frame(tp[which(rownames(tp) %in% cib[, 1]), ])
e = e[, which(colnames(e) %in% rownames(tp))]
print(paste0("Here we analyse ", ncol(e), " cases ..."))
# tidying and sorting according to sample codes
if (nrow(cib) > 0 | nrow(tp) > 0){
e = data.frame(e[, which(colnames(e) %in% cib[, 1])])
e = e[, order(colnames(e))]
cib = cib[order(cib[, 1]), ]
tp = tp[order(rownames(tp)), ]
}
if (nrow(tp) > 1){
for (i in 1:nrow(tp)){
# adjust the immune proportions so the sum of all immune cell
# proportions equal to (1 - tumour purity)
cib[i, 2:23] = (1 - tp[i, purityCol]) * cib[i, 2:23]
}
# the tumour and immune proportions are the covariate matrix.
betas = cbind(tp[, purityCol], cib[, 2:23])
print("covariate matrix ready ... ")
# here we do the regression
print("start the regression ... ")
adjust = t(apply(e, MARGIN = 1, function(x){
regress = nnls(as.matrix(betas), x)
# extract coefficients, and weight by the median sample
# proportion for the respective cell type to reflect the
# realistic expression level in tissue/tumour samples
ybar = coefficients(regress) * apply(as.matrix(betas), 2, median)
ybar[is.na(ybar)] = 0
return(ybar)
}))
print("regression complete!")
colnames(adjust) = c(tag, cibersortCelltypes)
return(adjust)
}
})
names(adjust) = tissueTypes
adjust = adjust[sapply(adjust, function(g) return(!is.null(g)))]
}
# --------- For iterating through all tissue types in a cohort --------- #
# Here are the codes to generate a vector of tissue Types and
# CIBERSORT files for RESPECTEx to loop through
# -------------------- expression files --------------------- #
#expression_dir = "Documents/2018/GTEx.RNAseq"
#expression = list.files(path = expression_dir,
# pattern = glob2rx("*log2_normalized.txt"),
# full.names = T)
# -------------------- tissue types ------------------------- #
# Need changes to match the specific format users name their files.
# Here we assume the tissue types are written in the filenames of
# the expression files - specifically in the format
# GTEx.<tissueType>.*
#tissueTypes = sapply(expression, function(x){
# f = unlist(strsplit(x, split = "/", fixed = T))
# f = f[length(f)]
# f = unlist(strsplit(f, split = ".", fixed = T))[2]
# return(f)
#})
# -------------------- CIBERSORT files ---------------------- #
# Need changes to match the specific format users name their files.
# Here they are named in the format GTEx.<tissueType>.CIBERSORT.txt
#cibersort_dir = "Documents/2018/CIBERSORT/GTEx.CIBERSORT"
#cibersort = sapply(tissueTypes, function(x) {
# return(paste(cibersort_dir, "/", tag, ".", x, ".CIBERSORT.txt",
# sep = ""))
#})
# ----------------------------------------------------------------- #
cibersort_celltypes = c("B-naive", "B-memory", "Plasma_cells",
"CD8", "CD4-naive", "CD4-memory_resting", "CD4-memory_activated",
"Tfh", "Treg", "gdT", "NK_rest", "NK_activated", "Monocyte",
"M0", "M1", "M2", "DC_resting", "DC_activated",
"MC_resting", "MC_activated", "Eosinophil", "Neutrophil")
GSE75688_purity = read.table("GSE75688_ESTIMATEpurity.txt",
header = T, sep = "\t", stringsAsFactors = F, check.names = F, quote = "")
GSE75688_purity[, "tissueType"] = rep("BRCA", nrow(GSE75688_purity))
GSE75688_adjusted = RESPECTEx("GSE75688", "BRCA",
"GSE75688_pooledOrTumor_TPM_normalized.txt",
"GSE75688_pooledOrTumor_TPM_CIBERSORT.txt",
GSE75688_purity, "immune_purity",
cibersort_celltypes, cibersortFilter = F)
# here we don't filter cases since the cohort is already very small
write.table(GSE75688_adjusted$BRCA, "GSE75688_RESPECTEx.txt",
row.names = T, col.names = T, sep = "\t", quote= F)