RESPECTEx.R

# RESPECTEx: Reconstituting specific cell type expression
RESPECTEx = function(tag, tissueTypes, expression, cibersort, 
                     purity, purityCol,
                     cibersortCelltypes, cibersortFilter = TRUE){
  # OUTPUT  a matrix of mean gene expression values reconstituted for each cell 
  #         type specified. 1 row per gene, and 1 column per cell type.
  # parameters:
  # (1) tag:  the identifier for tumour cells, to be appended to the output. 
  #           Precisely, this will be the column name of the first column 
  #           (representing the tumour cells) of the output.
  # (2) tissueTypes:  a list of tissueTypes, its order corresponding to the list "expression".
  #                   This will be taken to name the list of expression matrices inputted.
  # (3) expression:   a list of filepaths pointing to expression matrices of each cohort.
  #                   1 row per gene (gene names as row names) and 1 column per bulk tumour 
  #                   sample. Take a string if only 1 expression matrix is to be analysed.
  # (4) cibersort:    a list of filepaths pointing to the immune cell proportion data.
  #                   For instance, generated by CIBERSORT. 1 column per cell type and 
  #                   1 row per sample. Take a string if only 1 immune-cell-proportion 
  #                   matrix is to be anaylsed.
  # (5) purity:       a data.frame object, 1 row per sample, of tumour purity estimates.
  # (6) purityCol:    a string representing the column names corresponding to a column in
  #                   "purity", from which tumour purity estimates are taken.
  # (7) cibersortCelltypes: a list of immune cell types included. These must correspond to
  #                         input columns in "cibersort". They will be taken as column 
  #                         names of the output matrix.
  # (8) cibersortFilter:  default = TRUE. whether to filter CIBERSORT immune cell proportion
  #                       estimates to include only cases with a significant (p < 0.05)
  #                       inference as determined by CIBERSORT. Recommended if cohort is of 
  #                       substantial (e.g. n > 100) size
  print(" === RESPECTEx: Reconstituting Specific Cell Type Expression ===")
  #adjustment for non-immune APOBEC expression
  expression = lapply(expression, function(t) return(read.table(t, header = T, sep = "\t", row.names = 1)))
  names(expression) = tissueTypes
  cibersort = lapply(cibersort, function(t) return(read.table(t, header = T, sep = "\t", stringsAsFactors = F)))
  
  if( isTRUE(cibersortFilter) ){
    # if isTRUE(cibersortFilter) filter away cases with P > 0.05 
    # in the CIBERSORT inference according to CIBERSORT output
    cibersort = lapply(cibersort, function(tb){
      # check names (legacy of TCGA sample codes in case in the 
      # source file '-' in the sample codes were already converted)
      tb[, 1] = sapply(tb[, 1], function(x) return(gsub(".", "-", x, fixed = T)))
      tb = tb[tb[, "P.value"] < 0.05, ]
      return(tb)
    })
  }  
  names(cibersort) = tissueTypes
  
  require(nnls)
  adjust = lapply(tissueTypes, function(c){
    tp = purity[purity[, "tissueType"] == c, ]
    tp = tp[!is.na(tp[, purityCol]), ]
    e = expression[[c]]
    cib = cibersort[[c]]
    colnames(e) = sapply(colnames(e), function(x) return(gsub(".", "-", x, fixed = T)))
    #filtering cases included in all three data
    tp = tp[which(rownames(tp) %in% colnames(e)), ]
    e = e[, which(colnames(e) %in% rownames(tp))]
    cib = cib[which(cib[, 1] %in% rownames(tp)), ]
    cib = data.frame(cib[which(cib[, 1] %in% colnames(e)), ])
    tp = data.frame(tp[which(rownames(tp) %in% cib[, 1]), ])
    e = e[, which(colnames(e) %in% rownames(tp))]
    print(paste0("Here we analyse ", ncol(e), " cases ..."))
    # tidying and sorting according to sample codes
    if (nrow(cib) > 0 | nrow(tp) > 0){
      e = data.frame(e[, which(colnames(e) %in% cib[, 1])])
      e = e[, order(colnames(e))]
      cib = cib[order(cib[, 1]), ]
      tp = tp[order(rownames(tp)), ]
    }
    if (nrow(tp) > 1){
      for (i in 1:nrow(tp)){
        # adjust the immune proportions so the sum of all immune cell 
        # proportions equal to (1 - tumour purity)
        cib[i, 2:23] = (1 - tp[i, purityCol]) * cib[i, 2:23]
      }
      # the tumour and immune proportions are the covariate matrix.
      betas = cbind(tp[, purityCol], cib[, 2:23])
      print("covariate matrix ready ... ")
      # here we do the regression
      print("start the regression ... ")
      adjust = t(apply(e, MARGIN = 1, function(x){
        regress = nnls(as.matrix(betas), x)
        # extract coefficients, and weight by the median sample 
        # proportion for the respective cell type to reflect the 
        # realistic expression level in tissue/tumour samples
        ybar = coefficients(regress) * apply(as.matrix(betas), 2, median)
        ybar[is.na(ybar)] = 0
        return(ybar)
      }))
      print("regression complete!")
      colnames(adjust) = c(tag, cibersortCelltypes)
      return(adjust)
    }
  })
  names(adjust) = tissueTypes
  adjust = adjust[sapply(adjust, function(g) return(!is.null(g)))]
}


# --------- For iterating through all tissue types in a cohort  --------- #
# Here are the codes to generate a vector of tissue Types and
# CIBERSORT files for RESPECTEx to loop through

# -------------------- expression files --------------------- #
#expression_dir = "Documents/2018/GTEx.RNAseq"
#expression = list.files(path = expression_dir, 
#                        pattern = glob2rx("*log2_normalized.txt"), 
#                        full.names = T)

# -------------------- tissue types ------------------------- #
# Need changes to match the specific format users name their files.
# Here we assume the tissue types are written in the filenames of 
# the expression files - specifically in the format 
# GTEx.<tissueType>.*
#tissueTypes = sapply(expression, function(x){
#  f = unlist(strsplit(x, split = "/", fixed = T))
#  f = f[length(f)]
#  f = unlist(strsplit(f, split = ".", fixed = T))[2]
#  return(f)
#})

# -------------------- CIBERSORT files ---------------------- #
# Need changes to match the specific format users name their files.
# Here they are named in the format GTEx.<tissueType>.CIBERSORT.txt
#cibersort_dir = "Documents/2018/CIBERSORT/GTEx.CIBERSORT"
#cibersort = sapply(tissueTypes, function(x) {
#  return(paste(cibersort_dir, "/", tag, ".", x, ".CIBERSORT.txt", 
#               sep = ""))
#})
# ----------------------------------------------------------------- #

cibersort_celltypes = c("B-naive", "B-memory", "Plasma_cells",
                     "CD8", "CD4-naive", "CD4-memory_resting", "CD4-memory_activated",
                     "Tfh", "Treg", "gdT", "NK_rest", "NK_activated", "Monocyte",
                     "M0", "M1", "M2", "DC_resting", "DC_activated",
                     "MC_resting", "MC_activated", "Eosinophil", "Neutrophil")

GSE75688_purity = read.table("GSE75688_ESTIMATEpurity.txt", 
                             header = T, sep = "\t", stringsAsFactors = F, check.names = F, quote = "")
GSE75688_purity[, "tissueType"] = rep("BRCA", nrow(GSE75688_purity))
GSE75688_adjusted = RESPECTEx("GSE75688", "BRCA",
  "GSE75688_pooledOrTumor_TPM_normalized.txt",
  "GSE75688_pooledOrTumor_TPM_CIBERSORT.txt",
  GSE75688_purity, "immune_purity", 
  cibersort_celltypes, cibersortFilter = F) 
# here we don't filter cases since the cohort is already very small
write.table(GSE75688_adjusted$BRCA, "GSE75688_RESPECTEx.txt", 
            row.names = T, col.names = T, sep = "\t", quote= F)