From bf2f099558cb7e2f43ae2cc360f17c74b56a9e07 Mon Sep 17 00:00:00 2001 From: Patrick Deelen Date: Sun, 13 Nov 2022 22:09:51 +0100 Subject: [PATCH] Update normalizeRecount3PerTissueAndDoPca.R --- .../normalizeRecount3PerTissueAndDoPca.R | 61 ++++++++++++++++--- 1 file changed, 51 insertions(+), 10 deletions(-) diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/normalizeRecount3PerTissueAndDoPca.R b/Downstreamer/src/main/r/downstreamer_main/recount3/normalizeRecount3PerTissueAndDoPca.R index 18e52fac2..b0e3d8f20 100644 --- a/Downstreamer/src/main/r/downstreamer_main/recount3/normalizeRecount3PerTissueAndDoPca.R +++ b/Downstreamer/src/main/r/downstreamer_main/recount3/normalizeRecount3PerTissueAndDoPca.R @@ -4,7 +4,7 @@ -remoter::client("localhost", port = 55508) +remoter::client("localhost", port = 55506) library(DESeq2) library(parallel) @@ -65,6 +65,50 @@ perTissueExp <- lapply(tissueClasses, function(tissue){ #names(perTissueExp) <- tissueClasses #save(perTissueExp, file = "perTissueNormalization/selectedSamplesRawExpressionPerTissue.RData") tissue = "Kidney" +load(file = "Metadata/combinedMeta_2022_09_15.RData", verbose = T) + +covariatesToCorrectFor <- read.delim("CovariateNames.txt", header = F)$V1 + +str(sum(is.na(combinedMeta[,"sra.sample_spots"]))) + +#TCGA samples don't have sra.sample_spots instead use recount_qc.bc_frag.count +missingSampleSpots <- is.na(combinedMeta[,"sra.sample_spots"]) +combinedMeta[missingSampleSpots,"sra.sample_spots"] <- combinedMeta[missingSampleSpots,"recount_qc.bc_frag.count"] + +#TCGA and Gtex don't report layout but is all paired +missingLayout <- is.na(combinedMeta[,"sra.library_layout"]) +combinedMeta[missingLayout,"sra.library_layout"] <- "paired" + +combinedMetaSelection <- combinedMeta[,covariatesToCorrectFor] +combinedMetaSelection$sra.library_layout <- as.factor(combinedMetaSelection$sra.library_layout) +table(combinedMeta[,"sra.library_layout"],useNA = "a") + +cl <- makeCluster(20) + +tissue <- "Salivary Gland-Minor Salivary Gland" + +sink <- lapply(tissueClasses, function(tissue){ + cat(tissue, "\n") + + + load(file = paste0("perTissueNormalization/vstExp/",make.names(tissue),".RData")) + + combinedMetaTissue <- combinedMetaSelection[colnames(vstExp),] + if(min(table(combinedMetaTissue$sra.library_layout)) < 10){ + combinedMetaTissue <- combinedMetaTissue[,-match("sra.library_layout",colnames(combinedMetaTissue))] + } + + vstExpCovCor <- parApply(cl, vstExp, 1 ,function(geneExp, combinedMetaTissue){ + return(residuals(lm(geneExp ~ . ,data = combinedMetaTissue))) + }, combinedMetaTissue = combinedMetaTissue) + vstExpCovCor <- t(vstExpCovCor) + + + save(vstExpCovCor, file = paste0("perTissueNormalization/vstExpCovCor/",make.names(tissue),".RData")) + +}) + +stopCluster(cl) sink <- lapply(tissueClasses, function(tissue){ @@ -74,7 +118,7 @@ sink <- lapply(tissueClasses, function(tissue){ load(file = paste0("perTissueNormalization/vstExp/",make.names(tissue),".RData")) #https://stackoverflow.com/questions/18964837/fast-correlation-in-r-using-c-and-parallelization/18965892#18965892 - expScale = vstExp - rowMeans(vstExp); + expScale = vstExpCovCor - rowMeans(vstExpCovCor); # Standardize each variable expScale = expScale / sqrt(rowSums(expScale^2)); #expCov = tcrossprod(expScale);#equevelent to correlation due to center scale @@ -121,7 +165,6 @@ eigenVec <- tissueVstPca$eigenVectors rownames(vstExp) <- (gsub("\\..+", "", rownames(eigenVec))) write.table(eigenVec, file = gzfile(paste0("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/perTissueNormalization/vstPca/",make.names(tissue),"_eigenVec.txt.gz")), sep = "\t", col.names = NA, quote = F) -load(file = "Metadata/combinedMeta_2022_09_15.RData", verbose = T) metaNumeric <- as.matrix(combinedMeta[,sapply(combinedMeta, is.numeric)]) @@ -134,7 +177,7 @@ marCorZPerTissue <- lapply(tissueClasses, function(tissue){ pcs <- tissueVstPca$expPcs metaTest <- metaNumeric[rownames(pcs),] - metaVPcsZ <- apply(expPcs, 2, function(x){ + metaVPcsZ <- apply(pcs, 2, function(x){ apply(metaTest, 2, function(y, x){ z <- !is.na(x) & !is.na(y) if(sum(z) < 10){ @@ -157,7 +200,7 @@ str(marCorZPerTissue) marCorZPerTissue2 <- do.call(cbind, marCorZPerTissue) str(marCorZPerTissue2) -sort(apply(marCorZPerTissue2,1,max)) +covariateZscores <- apply(marCorZPerTissue2,1,mean) marCorZPerTissue2["recount_qc.star.%_of_reads_mapped_to_multiple_loci",] library(vioplot) @@ -174,6 +217,8 @@ cor.test(pcs[,1], metaTest[,"recount_qc.star.%_of_reads_mapped_to_multiple_loci" plot(pcs[,1], metaTest[,"recount_qc.star.%_of_reads_mapped_to_multiple_loci"], pch = 16, col=adjustcolor("grey", alpha.f = 0.5)) dev.off() +pcs <- expPcs + tissueSamplesInfo <- samplesWithPredictionNoOutliers[rownames(pcs),] studies <- length(unique(tissueSamplesInfo$study)) @@ -184,7 +229,7 @@ rpng(width = 1500, height = 1500) palette(adjustcolor(viridis(studies, option = "H"), alpha.f = 0.5)) pchMap <- rep(c(15,16,17), length.out = studies) plot(pcs[,1],metaTest[,"recount_qc.star.%_of_reads_mapped_to_multiple_loci"], col = as.factor(tissueSamplesInfo$study), pch = pchMap[as.factor(tissueSamplesInfo$study)], cex = 2, main = paste0("Studies (", studies,")"), xlab = paste0("Comp 1 (", round(explainedVariance[1],2) ,"%)"), ylab = "Percentage read map multiple loci", bty = "n") - +plot(pcs[,1],pcs[,2], col = as.factor(tissueSamplesInfo$study), pch = pchMap[as.factor(tissueSamplesInfo$study)], cex = 2, main = paste0("Studies (", studies,")"), xlab = paste0("Comp 1 (", round(explainedVariance[1],2) ,"%)"), ylab = "Percentage read map multiple loci", bty = "n") dev.off() @@ -202,10 +247,6 @@ dev.off() - - - - #OLD