diff --git a/.gitignore b/.gitignore index 1e72b30bf..7668984af 100644 --- a/.gitignore +++ b/.gitignore @@ -35,3 +35,4 @@ deconvolutionTestResults/ .Rproj.user .Rhistory /DEPICT2/src/main/r/downstreamer_main/downstreamer_main.Rproj +Downstreamer/src/main/r/downstreamer_main/.remoterserverlog diff --git a/Downstreamer/pom.xml b/Downstreamer/pom.xml index ac6432d85..64e8ca675 100644 --- a/Downstreamer/pom.xml +++ b/Downstreamer/pom.xml @@ -7,7 +7,7 @@ 1.0.4-SNAPSHOT Downstreamer - 1.29-SNAPSHOT + 1.30-SNAPSHOT jar diff --git a/Downstreamer/src/main/r/downstreamer_main/downstreamer_main.Rproj b/Downstreamer/src/main/r/downstreamer_main/downstreamer_main.Rproj new file mode 100644 index 000000000..8e3c2ebc9 --- /dev/null +++ b/Downstreamer/src/main/r/downstreamer_main/downstreamer_main.Rproj @@ -0,0 +1,13 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX diff --git a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/combineAnnotations.R b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/combineAnnotations.R index 5b3d371fc..892d655c4 100644 --- a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/combineAnnotations.R +++ b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/combineAnnotations.R @@ -1564,7 +1564,7 @@ combinedMeta$CelllineName[studySamples] <- "" combinedMeta$Cancer[studySamples] <- FALSE #SRP081020 is mis-anotated in SRA as PBMC, paper and clustering both state wholeblood -studySamples <- combinedMeta$study %in% c("ERP114104", "SRP051848", "SRP056784", "SRP071965", "SRP077975", "SRP081020", "SRP098758", "SRP113245", "SRP126580", "SRP126582", "SRP126583", "SRP136057", "SRP144583", "SRP150872", "SRP214077") +studySamples <- combinedMeta$study %in% c("ERP114104", "SRP051848", "SRP056784", "SRP071965", "SRP077975", "SRP081020", "SRP098758", "SRP113245", 
"SRP126580", "SRP126582", "SRP126583", "SRP136057", "SRP144583", "SRP150872", "SRP214077", "SRP056443") combinedMeta$Tissue[studySamples] <- "" combinedMeta$Tissue2[studySamples] <- "" combinedMeta$Cellline[studySamples] <- NA @@ -1573,6 +1573,9 @@ combinedMeta$Cancer[studySamples] <- NA + + + combinedMeta$Cellline[!is.na(combinedMeta$CelllineName)&combinedMeta$CelllineName=="iPSC"] <- TRUE combinedMeta$Cancer[!is.na(combinedMeta$Cellline) & combinedMeta$Cellline] <- NA @@ -1613,7 +1616,7 @@ combinedMeta$Tissue[combinedMeta$Cohort == "GSA"] -#save(combinedMeta, file = "combinedMeta_2022_09_02.RData") +#save(combinedMeta, file = "combinedMeta_2022_09_15.RData") load(file = "combinedMeta_2022_08_19.RData") diff --git a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/metaBrain2.R b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/metaBrain2.R new file mode 100644 index 000000000..8d261ed03 --- /dev/null +++ b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/metaBrain2.R @@ -0,0 +1,31 @@ +setwd("D:\\UMCG\\Genetica\\Projects\\Depict2Pgs") + +source(paste0("C:\\Users\\patri\\Documents\\GitHub\\systemsgenetics\\Downstreamer\\src\\main\\r\\downstreamer_main/downstreamer_functions.r")) + +traits <- read.delim("MetaBrain/traits.txt") + + +i <- 1 + +pdf("MetaBrain/withAndWithoutEqtls.pdf", height = 20, width = 10) +#png("MetaBrain/withAndWithoutEqtls.png", height = 2000, width = 1000) +layout(matrix(1:8, ncol =2)) +par(pty="s") +for(i in 1:nrow(traits)){ + + + +trait <- traits[i, "trait"] +name <- traits[i, "name"] + +enrichments <- read.depict2(paste0("MetaBrain/normal/",trait,"_enrichtments.xlsx"))$GenePrioritization_MetaBrain +enrichmentsIncEqtl <- read.depict2(paste0("MetaBrain/inceqt/",trait,"_enrichtments.xlsx"))$GenePrioritization_MetaBrain + +enrichmentsBoth <- merge(enrichments, enrichmentsIncEqtl, "Gene.ID" , suffixes= c("Normal", "incEqtl")) + +maxZ <- max(range(enrichmentsBoth$Enrichment.Z.scoreNormal, 
enrichmentsBoth$Enrichment.Z.scoreincEqtl)) +r <- cor(enrichmentsBoth$Enrichment.Z.scoreNormal, enrichmentsBoth$Enrichment.Z.scoreincEqtl) +plot(enrichmentsBoth$Enrichment.Z.scoreNormal, enrichmentsBoth$Enrichment.Z.scoreincEqtl, bg = adjustcolor("dodgerblue2", alpha.f = 0.3), pch = 21, col=adjustcolor("dodgerblue2", alpha.f = 0.5), asp = 1, xlab = "Key gene score without eqtl information", ylab = "Key gene score with eqtl information", xlim = c(-maxZ,maxZ), ylim = c(-maxZ,maxZ), main = name) +mtext(paste0("Pearson r: ", signif(r,2))) +} +dev.off() diff --git a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/playingWithPcaSvd.R b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/playingWithPcaSvd.R index f12897f41..ebff773a5 100644 --- a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/playingWithPcaSvd.R +++ b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/playingWithPcaSvd.R @@ -270,12 +270,6 @@ plot(log(expSvd$d)) abline(v=60) -library(rpca) - - -expRpca <- rpca(t(expSub2)) - - library(corpcor) expSvdFast <- fast.svd(expSubScale) diff --git a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/umap.R b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/umap.R deleted file mode 100644 index 850281aed..000000000 --- a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/umap.R +++ /dev/null @@ -1,166 +0,0 @@ -#srun --cpus-per-task=1 --mem=50gb --nodes=1 --qos=priority --time=168:00:00 --pty bash -i -#remoter::server(verbose = T, port = 55556, password = "laberkak", sync = T) - - -remoter::client("localhost", port = 55501, password = "laberkak") - - - -library(uwot) - -setwd("D:\\UMCG\\Genetica\\Projects\\Depict2Pgs\\Recount3\\") -setwd("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/") - -tissueCol <- read.delim("umap/col.txt", row.names = 1, na.strings = "") - -load(file = "DataForPredictions.RData") - -#load(file = "combinedMeta_2022_08_30.RData", verbose = T) -#str(combinedMeta) 
-#updatedAnnotations <- combinedMeta[,c("Tissue", "Tissue2", "Cellline", "CelllineName", "Cancer", "Cohort", "Fetal")] - -#all(rownames(pcsAndMeta) %in% rownames(updatedAnnotations)) -#updatedAnnotations <- updatedAnnotations[rownames(pcsAndMeta),] -#all(rownames(pcsAndMeta) == rownames(updatedAnnotations)) - -#pcsAndMeta[,colnames(updatedAnnotations)] <- updatedAnnotations - -#pcsAndMeta$selectedSamples <- !pcsAndMeta$excludeBasedOnPredictionCellline2 & !pcsAndMeta$excludeBasedOnPredictionCancer & !(!is.na(pcsAndMeta$Cancer) & pcsAndMeta$Cancer) & !(!is.na(pcsAndMeta$Cellline) & pcsAndMeta$Cellline) - -table(pcsAndMeta$selectedSamples, useNA = "a") - - -clusterAnnotations <- read.delim("umap/annotationsBasedOnOldUmap.txt", row.names = 1) -pcsAndMeta <- merge(pcsAndMeta, clusterAnnotations, by = 0, all.x = T) -rownames(pcsAndMeta) <- pcsAndMeta$Row.names -table(pcsAndMeta$ClusterAnnotation) - - - - -#pcsAndMeta[!is.na(pcsAndMeta$study) & (pcsAndMeta$study== "ERP104864") & (grepl("synovium", pcsAndMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "" - - -tissueSamples <- pcsAndMeta[pcsAndMeta$selectedSamples,] - -tissueSamples$class <- tissueSamples$Tissue - -hasT2 <- tissueSamples$Tissue2 != "" -tissueSamples$class[hasT2] <- paste0(tissueSamples$class[hasT2], "-", tissueSamples$Tissue2[hasT2]) - -isFetal <- !is.na(tissueSamples$Fetal) & tissueSamples$Fetal -tissueSamples$class[isFetal] <- paste0(tissueSamples$class[isFetal], "-Fetal") - -noTbutCluster <- tissueSamples$class == "" & !is.na(tissueSamples$ClusterAnnotation) -table(noTbutCluster, useNA = "a") -tissueSamples$class[noTbutCluster] <- tissueSamples$ClusterAnnotation[noTbutCluster] - -table(tissueSamples$class) -write.table(table(tissueSamples$class, useNA = "always"), file = "umap/tissues.txt", sep = "\t", quote = F, row.names = F) - -str(tissueSamples) - - - -mapping <- read.delim("umap/tissuesMapping.txt") -str(mapping) - -all(tissueSamples$class %in% mapping$Class) - - -tissueSamples$umapFactor <- 
as.factor(mapping$ClassificationClass[match(tissueSamples$class, mapping$Class)]) - -table(tissueSamples$umapFactor, useNA = "always") - - -defaultCol <- adjustcolor("grey", alpha.f = 0.6) -tissueCol <- read.delim("umap/col.txt", row.names = 1) - - -tissueSamples$TissueCol <- defaultCol -sum(unique(tissueSamples$umapFactor) %in% rownames(tissueCol)) -sum(tissueSamples$umapFactor %in% rownames(tissueCol)) -tissueSamples$TissueCol[tissueSamples$umapFactor %in% rownames(tissueCol)] <- adjustcolor(tissueCol[as.character(tissueSamples$umapFactor[tissueSamples$umapFactor %in% rownames(tissueCol)]),1], alpha.f = 0.5) -#tissueSamples$TissueCol[tissueSamples$umapFactor %in% rownames(tissueCol)] <- tissueCol[as.character(tissueSamples$umapFactor[tissueSamples$umapFactor %in% rownames(tissueCol)]),1] -table(tissueSamples$TissueCol, useNA = "a") - -tissueSamples$plotOrderTissues <- order(tissueSamples$TissueCol != defaultCol) - - -#, n_threads = 22 - -compsToUseForUmap <- compsToUse -init <- as.matrix(tissueSamples[,paste0("PC_",1:2)]) -umapInput <- as.matrix(tissueSamples[,paste0("PC_",1:compsToUseForUmap)]) - -sampleUmap <- umap( - umapInput, - n_epochs = 1000, - init = init, - n_neighbors = 500, - min_dist = 2, init_sdev = 1e-4, learning_rate = 1, - spread = 15, - bandwidth = 10, - scale = "scale", - local_connectivity = 1, - metric = "correlation") - - -rownames(sampleUmap) <- rownames(tissueSamples) -colnames(sampleUmap) <- c("UMAP1", "UMAP2") -umapAndMeta <- merge(sampleUmap, tissueSamples, by = 0) -dim(umapAndMeta) - - - - - -rpng() - -par(mar = c(3,5,0.1,0.1), xpd = NA) -plot(umapAndMeta[umapAndMeta$plotOrderTissues,"UMAP1"], umapAndMeta[umapAndMeta$plotOrderTissues,"UMAP2"], col = umapAndMeta$TissueCol[umapAndMeta$plotOrderTissues], cex = 0.2, pch = 16) - -dev.off() - - -locator(n =2, type = "l") - - - - -write.table(umapAndMeta,file = "umaptest.txt", sep = "\t", quote = F, col.names = NA) - -#save.image( file="umap_tmp.RData") -#load("umap_tmp.RData") - -rpng() - 
-par(mar = c(3,5,0.1,0.1), xpd = NA) -plot(umapAndMeta[plotOrder,"UMAP1"], umapAndMeta[plotOrder,"UMAP2"], col = umapAndMeta$TissueCol[plotOrder], cex = 0.8, pch = 16, xlim = c(-25,25), ylim = c(-25,25)) - -dev.off() - - - -#png(file = "umaptest.png", width = 1600, height = 800) - -pdf(file = "umaptest.pdf", width = 16, height = 8) -#rpng() - -layout(matrix(1:2,ncol = 2)) - -par(mar = c(3,5,0.1,0.1), xpd = NA) -plot(umapAndMeta[plotOrderTissues,"UMAP1"], umapAndMeta[plotOrderTissues,"UMAP2"], col = umapAndMeta$TissueCol[plotOrderTissues], cex = 0.4, pch = 16) - -par(mar = c(0,0,0,0), xpd = NA) -plot.new() -plot.window(xlim = 0:1, ylim = 0:1) -legend("center", fill = tissueCol[,1], legend = row.names(tissueCol), bty = "n", ncol = 2,cex = 0.7) - - -dev.off() - - - - - diff --git a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/smartSeqTest.R b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/smartSeqTest.R new file mode 100644 index 000000000..f56829394 --- /dev/null +++ b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/smartSeqTest.R @@ -0,0 +1,167 @@ +smartSeq <- read.delim("smartseqSamples.txt")[,1] +str(smartSeq) + + +defaultCol <- adjustcolor("grey", alpha.f = 0.3) +pcsAndMeta$colSmartseq <- defaultCol +pcsAndMeta$colSmartseq[pcsAndMeta$Row.names %in% smartSeq] <- "darkslategray" +pcsAndMeta$colSmartseq[pcsAndMeta$Row.names %in% qseq] <- "orangered" +pcsAndMeta$colSmartseq[outliersPc1 == "TRUE" ] <- "darkblue" +plotOrderSmartseq <- order((pcsAndMeta$colSmartseq != defaultCol) + 1) + +rpng() +plot(pcsAndMeta[plotOrderSmartseq,"PC_1"], pcsAndMeta[plotOrderSmartseq,"PC_2"], col = pcsAndMeta$colSmartseq[plotOrderSmartseq], cex = 0.4, main = "Smartseq") +dev.off() + +rpng() +plot(pcsAndMeta[plotOrderSmartseq,"PC_1"], pcsAndMeta[plotOrderSmartseq,"PC_6"], col = pcsAndMeta$colSmartseq[plotOrderSmartseq], cex = 0.4, main = "Quantseq and Smartseq", pch =16) + +library(pROC) + +smartSeqClass <- as.factor(pcsAndMeta$Row.names %in% smartSeq) 
+table(smartSeqClass) +dim(pcsAndMeta) +smartSeqAuc <- apply(pcsAndMeta[,2:100],2,function(x){ + tryCatch( + { + #wilcox.test(x ~ smartSeqClass)$p.value + as.numeric(auc(response = smartSeqClass, predictor = x)) + }, + error=function(cond){return(1)} + ) +}) +sort(smartSeqAuc) +str(pcsAndMeta[,2]) +boxplot(pcsAndMeta[,2]~smartSeqClass) + +boxplot(log10(pcsAndMeta[,"sra.sample_spots"]) ~ smartSeqClass ) + +sum(pcsAndMeta[,"sra.sample_spots"] < 10000000, na.rm = T) + + +library(vioplot) +vioplot(log10(pcsAndMeta[,"sra.sample_spots"]) ~ smartSeqClass) + + +plot(pcsAndMeta[plotOrderSmartseq,"PC_6"], log10(pcsAndMeta[plotOrderSmartseq,"recount_qc.star.number_of_splices:_total"]), col = pcsAndMeta$colSmartseq[plotOrderSmartseq], cex = 0.6) + + + + +defaultCol <- adjustcolor("grey", alpha.f = 0.3) +sum(!is.na(pcsAndMeta[,"recount_qc.star.number_of_splices:_total"]) & pcsAndMeta[,"recount_qc.star.number_of_splices:_total"] < 150000) + +pcsAndMeta$colSmartseq <- defaultCol +pcsAndMeta$colSmartseq[!is.na(pcsAndMeta[,"recount_qc.star.number_of_splices:_total"]) & pcsAndMeta[,"recount_qc.star.number_of_splices:_total"] < 150000] <- "aquamarine2" +table(pcsAndMeta$colSmartseq) +plotOrderSmartseq <- order((pcsAndMeta$colSmartseq != defaultCol) + 1) + +plot(pcsAndMeta[plotOrderSmartseq,"PC_6"], pcsAndMeta[plotOrderSmartseq,"PC_1"], col = pcsAndMeta$colSmartseq[plotOrderSmartseq], cex = 0.4, main = "recount_qc.star.number_of_splices:_total < 150000") + + + +defaultCol <- adjustcolor("grey", alpha.f = 0.5) +pcsAndMeta$colSmartseq <- defaultCol +pcsAndMeta$colSmartseq[!is.na(pcsAndMeta[,"recount_seq_qc.%a"]) & pcsAndMeta[,"recount_seq_qc.%a"] < 20] <- "darkslateblue" +table(pcsAndMeta$colSmartseq) +plotOrderSmartseq <- order((pcsAndMeta$colSmartseq != defaultCol) + 1) + +plot(pcsAndMeta[plotOrderSmartseq,"PC_6"], pcsAndMeta[plotOrderSmartseq,"PC_1"], col = pcsAndMeta$colSmartseq[plotOrderSmartseq], cex = 0.6, main = "recount_seq_qc.%a < 20") + + + + + + +pcNoCenter <- 
read.delim("Components.txt", sep = ",", row.names = 1) +pcNoCenter <- merge(pcNoCenter, combinedMeta, all.x = T, by = 0) +rownames(pcNoCenter) <- pcNoCenter$Row.names + +defaultCol <- adjustcolor("grey", alpha.f = 0.3) +pcNoCenter$colSmartseq <- defaultCol +pcNoCenter$colSmartseq[rownames(pcNoCenter) %in% smartSeq] <- "darkslategray" +table(pcNoCenter$colSmartseq) +plotOrderSmartseq <- order((pcNoCenter$colSmartseq != defaultCol) + 1) + +plot(pcNoCenter[plotOrderSmartseq,"X0"], pcNoCenter[plotOrderSmartseq,"X1"], col = pcNoCenter$colSmartseq[plotOrderSmartseq], cex = 0.4, main = "Smartseq") + + +plot(pcNoCenter[plotOrderSmartseq,"X0"], pcNoCenter[plotOrderSmartseq,"X1"], col = adjustcolor("grey", alpha.f = 0.2), cex = 0.3) + + + + + + + + + +defaultCol <- adjustcolor("grey", alpha.f = 0.6) +pcNoCenter$col <- defaultCol + +tissueAndCol <- tolower(pcNoCenter[,"Tissue"]) %in% tolower(tissueCol$PlotClass) + +pcNoCenter$col[tissueAndCol] <- tissueCol$col[match(tolower(pcNoCenter[tissueAndCol,"Tissue"]), tolower(tissueCol$PlotClass))] + + +tissue2AndCol <- tolower(pcNoCenter[,"Tissue2"]) %in% tolower(tissueCol$PlotClass) +sum(tissue2AndCol) +pcNoCenter$col[tissue2AndCol] <- tissueCol$col[match(tolower(pcNoCenter[tissue2AndCol,"Tissue2"]), tolower(tissueCol$PlotClass))] + + + +sum(is.na(tolower(pcNoCenter[,"Tissue"]) %in% tolower(tissueCol$PlotClass))) + +#pcNoCenter$col <- tissueCol$col[match(tolower(pcNoCenter[,"Tissue"]), tolower(tissueCol$PlotClass), nomatch = nrow(tissueCol))] + +plotOrder <- order((pcNoCenter$col != defaultCol) + 1) + +plot(pcNoCenter[plotOrder,"X0"], pcNoCenter[plotOrder,"X1"], col = pcNoCenter$col[plotOrder], cex = 0.4) + + +pcNoCenter$gtexCol <- defaultCol +pcNoCenter$gtexCol[pcNoCenter$Cohort == "GTEx" ] <- "goldenrod3" +pcNoCenter$gtexCol[pcNoCenter$Cohort == "TCGA" ] <- "cyan1" + +plotOrder <- order((pcNoCenter$gtexCol != defaultCol) + 1) +plot(pcNoCenter[plotOrder,"X0"], pcNoCenter[plotOrder,"X1"], col = pcNoCenter$gtexCol[plotOrder], cex = 
0.4) + + + + +toExclude <- + (!is.na(pcsAndMeta[,"recount_qc.star.number_of_splices:_total"]) & pcsAndMeta[,"recount_qc.star.number_of_splices:_total"] < 15000) | + (!is.na(pcsAndMeta[,"recount_seq_qc.%a"]) & pcsAndMeta[,"recount_seq_qc.%a"] < 20) | + (!is.na(pcsAndMeta[,"recount_seq_qc.%t"]) & pcsAndMeta[,"recount_seq_qc.%t"] < 20) | + (!is.na(pcsAndMeta[,"recount_seq_qc.%c"]) & pcsAndMeta[,"recount_seq_qc.%c"] < 20) | + (!is.na(pcsAndMeta[,"recount_seq_qc.%g"]) & pcsAndMeta[,"recount_seq_qc.%g"] < 20) +sum(toExclude) + + +samplesToKeep <- pcsAndMeta$Row.names[!toExclude] +length(samplesToKeep) + sum(toExclude) == nrow(pcs) + +write.table(samplesToKeep, file = "samplesToKeep.txt", row.names = F, quote = F) + + +boxplot(pcsAndMeta[,"PC_1"]) + +outliersPc1 <- as.factor(pcsAndMeta[,"PC_2"] >= 120) +table(outliersPc1) +library(pROC) +outlierAuc <- sapply(colnames(pcsAndMeta),function(x){ + tryCatch( + { + #wilcox.test(x ~ smartSeqClass)$p.value + as.numeric(auc(response = outliersPc1, predictor = pcsAndMeta[,x])) + }, + error=function(cond){return(NA)} + ) + }) +sort(outlierAuc) + + + + + +auc(response = outliersPc1, predictor = pcsAndMeta[,"PC_2"]) diff --git a/Downstreamer/src/main/r/downstreamer_main/umap/downstreamer_umap.rmd b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/umap/downstreamer_umap.rmd similarity index 100% rename from Downstreamer/src/main/r/downstreamer_main/umap/downstreamer_umap.rmd rename to Downstreamer/src/main/r/downstreamer_main/legacy_scripts/umap/downstreamer_umap.rmd diff --git a/Downstreamer/src/main/r/downstreamer_main/umap/pathwayUmap.R b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/umap/pathwayUmap.R similarity index 100% rename from Downstreamer/src/main/r/downstreamer_main/umap/pathwayUmap.R rename to Downstreamer/src/main/r/downstreamer_main/legacy_scripts/umap/pathwayUmap.R diff --git a/Downstreamer/src/main/r/downstreamer_main/umap/sampleUmap.R 
b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/umap/sampleUmap.R similarity index 100% rename from Downstreamer/src/main/r/downstreamer_main/umap/sampleUmap.R rename to Downstreamer/src/main/r/downstreamer_main/legacy_scripts/umap/sampleUmap.R diff --git a/Downstreamer/src/main/r/downstreamer_main/umap/tryUmap.R b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/umap/tryUmap.R similarity index 100% rename from Downstreamer/src/main/r/downstreamer_main/umap/tryUmap.R rename to Downstreamer/src/main/r/downstreamer_main/legacy_scripts/umap/tryUmap.R diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/README.md b/Downstreamer/src/main/r/downstreamer_main/recount3/README.md new file mode 100644 index 000000000..5edb92e21 --- /dev/null +++ b/Downstreamer/src/main/r/downstreamer_main/recount3/README.md @@ -0,0 +1,20 @@ +# Processing of recount3 data + +## Downloading and extracting the expression values and meta data + + + +## Harmonize and expand the annotations + +In `harmonizeAndExtentSampleAnnotations.R` we expand and map the sample annotations. + +For a large part the annotations were added manually. 
+ +## QQ normalization of the data + + +## PCA on the co-expression matrix + + +## Excluding the cell line and cancer samples + diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/createUmapOfRecount3.R b/Downstreamer/src/main/r/downstreamer_main/recount3/createUmapOfRecount3.R new file mode 100644 index 000000000..cf4b191d4 --- /dev/null +++ b/Downstreamer/src/main/r/downstreamer_main/recount3/createUmapOfRecount3.R @@ -0,0 +1,631 @@ +#srun --cpus-per-task=1 --mem=50gb --nodes=1 --qos=priority --time=168:00:00 --pty bash -i +#remoter::server(verbose = T, port = 55556, password = "laberkak", sync = T) + + +remoter::client("localhost", port = 55501, password = "laberkak") + + + +library(uwot) + +setwd("D:\\UMCG\\Genetica\\Projects\\Depict2Pgs\\Recount3\\") +setwd("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/") + + +load(file = "DataForPredictions.RData") +rownames(pcsAndMeta) <- pcsAndMeta$Row.names +load(file = "tissuePredictions/samplesWithPrediction_16_09_22.RData", verbose = T) +tissueCol <- read.delim("umap/col.txt", row.names = 1, na.strings = "") + + +colnamesToUpdate <- colnames(pcsAndMeta)[colnames(pcsAndMeta) %in% colnames(combinedMeta)] +all(rownames(pcsAndMeta) %in% rownames(combinedMeta)) +pcsAndMeta[,colnamesToUpdate] <- combinedMeta[rownames(pcsAndMeta),colnamesToUpdate] + + +table(pcsAndMeta$selectedSamples, useNA = "a") + + +clusterAnnotations <- read.delim("umap/annotationsBasedOnOldUmap.txt", row.names = 1) +samplesWithClusterAnnotation <- rownames(pcsAndMeta)[rownames(pcsAndMeta) %in% rownames(clusterAnnotations)] + +pcsAndMeta$ClusterAnnotation <- NA +pcsAndMeta[samplesWithClusterAnnotation, "ClusterAnnotation"] <- clusterAnnotations[samplesWithClusterAnnotation,"ClusterAnnotation"] +table(pcsAndMeta$ClusterAnnotation, useNA = "a") + +tissueSamples <- pcsAndMeta[pcsAndMeta$selectedSamples,] + +tissueSamples$class <- tissueSamples$Tissue + + +hasT2 <- tissueSamples$Tissue2 != "" +tissueSamples$class[hasT2] <- 
paste0(tissueSamples$Tissue[hasT2], "-", tissueSamples$Tissue2[hasT2]) +table(tissueSamples$class) +isFetal <- !is.na(tissueSamples$Fetal) & tissueSamples$Fetal +tissueSamples$class[isFetal] <- paste0(tissueSamples$class[isFetal], "-Fetal") + +noTbutCluster <- tissueSamples$class == "" & !is.na(tissueSamples$ClusterAnnotation) +table(noTbutCluster, useNA = "a") +tissueSamples$class[noTbutCluster] <- tissueSamples$ClusterAnnotation[noTbutCluster] + +table(tissueSamples$class) +write.table(table(tissueSamples$class, useNA = "always"), file = "umap/tissues.txt", sep = "\t", quote = F, row.names = F) + +str(tissueSamples) + + + +mapping <- read.delim("umap/tissuesMapping.txt") +str(mapping) + +all(tissueSamples$class %in% mapping$Class) +tissueSamples$class[!tissueSamples$class %in% mapping$Class] + +tissueSamples$umapFactor <- as.factor(mapping$ClassificationClass[match(tissueSamples$class, mapping$Class)]) + +table(tissueSamples$umapFactor, useNA = "always") + + +defaultCol <- adjustcolor("grey", alpha.f = 0.6) +tissueCol <- read.delim("umap/col.txt", row.names = 1) + + +tissueSamples$TissueCol <- defaultCol +sum(unique(tissueSamples$umapFactor) %in% rownames(tissueCol)) +sum(tissueSamples$umapFactor %in% rownames(tissueCol)) +tissueSamples$TissueCol[tissueSamples$umapFactor %in% rownames(tissueCol)] <- adjustcolor(tissueCol[as.character(tissueSamples$umapFactor[tissueSamples$umapFactor %in% rownames(tissueCol)]),1], alpha.f = 0.5) +#tissueSamples$TissueCol[tissueSamples$umapFactor %in% rownames(tissueCol)] <- tissueCol[as.character(tissueSamples$umapFactor[tissueSamples$umapFactor %in% rownames(tissueCol)]),1] +table(tissueSamples$TissueCol, useNA = "a") + +tissueSamples$plotOrderTissues <- order(tissueSamples$TissueCol != defaultCol) + + +#, n_threads = 22 + +compsToUseForUmap <- compsToUse +init <- as.matrix(tissueSamples[,paste0("PC_",1:2)]) +umapInput <- as.matrix(tissueSamples[,paste0("PC_",1:compsToUseForUmap)]) + +sampleUmap <- umap( + umapInput, + n_epochs = 
1000, + init = init, + n_neighbors = 500, + min_dist = 1, init_sdev = 1e-4, learning_rate = 2, + spread = 20, + bandwidth = 10, + scale = "scale", + local_connectivity = 10, + repulsion_strength = 0.5, + metric = "correlation") + + +rownames(sampleUmap) <- rownames(tissueSamples) +colnames(sampleUmap) <- c("UMAP1", "UMAP2") +save(sampleUmap, file = "umap/sampleUmap6.RData") + +#load(file = "umap/sampleUmap3.RData") + + + + +umapAndMeta <- merge(sampleUmap, tissueSamples, by = 0) +rownames(umapAndMeta) <- umapAndMeta$Row.names +dim(umapAndMeta) + + + + +rpng() + +par(mar = c(3,5,0.1,0.1), xpd = NA) +plot(umapAndMeta[umapAndMeta$plotOrderTissues,"UMAP1"], umapAndMeta[umapAndMeta$plotOrderTissues,"UMAP2"], col = umapAndMeta$TissueCol[umapAndMeta$plotOrderTissues], cex = 0.2, pch = 16) + +dev.off() + +plot(umapAndMeta[umapAndMeta$plotOrderTissues,"UMAP1"], umapAndMeta[umapAndMeta$plotOrderTissues,"UMAP2"], col = umapAndMeta$TissueCol[umapAndMeta$plotOrderTissues], cex = 0.2, pch = 16, xlim = c(-100,100), ylim = c(-100,100)) + + + +locator(n =2, type = "l") +cluster1 <- locator(n =2, type = "l") +cluster2 <- locator(n =2, type = "l") + + +write.table(umapAndMeta[,!grepl("PC_",colnames(umapAndMeta))],file = "umaptest.txt", sep = "\t", quote = F, col.names = NA) +#save(umapAndMeta, file = "umaptest.RData") +#load("umaptest.RData") + +#save.image( file="umap_tmp.RData") +#load("umap_tmp.RData") + +rpng() + +par(mar = c(3,5,0.1,0.1), xpd = NA) +plot(umapAndMeta[plotOrder,"UMAP1"], umapAndMeta[plotOrder,"UMAP2"], col = umapAndMeta$TissueCol[plotOrder], cex = 0.8, pch = 16, xlim = c(-25,25), ylim = c(-25,25)) + +dev.off() + + + +#png(file = "umaptest.png", width = 1600, height = 800) + +pdf(file = "umaptest.pdf", width = 16, height = 8) +#rpng() + +layout(matrix(1:2,ncol = 2)) + +par(mar = c(3,5,0.1,0.1), xpd = NA) +plot(umapAndMeta[umapAndMeta$plotOrderTissues,"UMAP1"], umapAndMeta[umapAndMeta$plotOrderTissues,"UMAP2"], col = 
umapAndMeta$TissueCol[umapAndMeta$plotOrderTissues], cex = 0.2, pch = 16) + +par(mar = c(0,0,0,0), xpd = NA) +plot.new() +plot.window(xlim = 0:1, ylim = 0:1) +legend("center", fill = tissueCol[,1], legend = row.names(tissueCol), bty = "n", ncol = 2,cex = 0.7) + + +dev.off() + + + + + + + +#smartseq plots + +someSmartSeqStudies <- read.delim("selectionSmartseqStudies.txt", header = F)[,1] +str(someSmartSeqStudies) + +someSmartSeqSamples <- read.delim("smartseqSamples.txt", header = T)[,1] +str(someSmartSeqSamples) + +umapAndMeta$smartseqcol <- defaultCol +umapAndMeta$smartseqcol[umapAndMeta$study %in% someSmartSeqStudies] <- "pink" +umapAndMeta$smartseqcol[umapAndMeta$Row.names %in% someSmartSeqSamples] <- "pink" + +umapAndMeta$plotOrdersq <- order(umapAndMeta$smartseqcol != defaultCol) + + +par(mar = c(3,5,0.1,0.1), xpd = NA) +plot(umapAndMeta[umapAndMeta$plotOrderTissues,"UMAP1"], umapAndMeta[umapAndMeta$plotOrderTissues,"UMAP2"], col = umapAndMeta$smartseqcol[umapAndMeta$plotOrderTissues], cex = 0.2, pch = 16) + + +#instestine clusters + +umapAndMeta$intestineCluster <- "" +umapAndMeta$intestineCluster[umapAndMeta$UMAP1 >= cluster1$x[1] & umapAndMeta$UMAP1 <= cluster1$x[2] & umapAndMeta$UMAP2 >= cluster1$y[1] & umapAndMeta$UMAP2 <= cluster1$y[2]] <- "c1" +umapAndMeta$intestineCluster[umapAndMeta$UMAP1 >= cluster2$x[1] & umapAndMeta$UMAP1 <= cluster2$x[2] & umapAndMeta$UMAP2 >= cluster2$y[1] & umapAndMeta$UMAP2 <= cluster2$y[2]] <- "c2" +table(umapAndMeta$intestineCluster) + +table(factor(umapAndMeta$umapFactor[umapAndMeta$intestineCluster=="c1"])) +table(factor(umapAndMeta$umapFactor[umapAndMeta$intestineCluster=="c2"])) + +table(factor(umapAndMeta$class[umapAndMeta$intestineCluster=="c1"])) +table(factor(umapAndMeta$class[umapAndMeta$intestineCluster=="c2"])) + +a <- as.data.frame(table(paste(umapAndMeta$Cohort, umapAndMeta$class)[umapAndMeta$intestineCluster=="c1"])) +b <- as.data.frame(table(paste(umapAndMeta$Cohort, 
umapAndMeta$class)[umapAndMeta$intestineCluster=="c2"])) + + +table(paste(umapAndMeta$Cohort, umapAndMeta$class)[umapAndMeta$intestineCluster!=""], umapAndMeta$intestineCluster[umapAndMeta$intestineCluster!=""]) + +str(a) +c <- merge(a,b,by = "Var1", all = T) +c + +load("metadata_gtex.Rda", verbose = T) +View(metadata_gtex) + + +gtexTansverse <- umapAndMeta[umapAndMeta$study == "GTEx" & umapAndMeta$Tissue2 == "Transverse" & umapAndMeta$intestineCluster != "",] + +rownames(gtexTansverse) <- gtexTansverse$Row.names + +rownames(metadata_gtex) <- metadata_gtex$external_id + +dim(gtexTansverse) +gtexTansverse <- merge(gtexTansverse, metadata_gtex[,!colnames(metadata_gtex) %in% colnames(gtexTansverse)], by = 0) +dim(gtexTansverse) + +table(gtexTansverse$gtex.smatsscr, gtexTansverse$intestineCluster) + +fisher.test(table(gtexTansverse$gtex.smatsscr, gtexTansverse$intestineCluster)) +grep("MHBCTINF", colnames(gtexTansverse), ignore.case = T) + + + + +numCols <- colnames(gtexTansverse)[unlist(lapply(gtexTansverse, is.numeric)) ] + +colName <- "sra.paired_nominal_length" +clusterCompare <- sapply(numCols, function(colName){ + #print(colName) + if(!all(is.na(gtexTansverse[,colName])) & sd(gtexTansverse[,colName], na.rm =T) > 0 ){ + t.test(gtexTansverse[,colName] ~ gtexTansverse$intestineCluster)$p.value + } + +}) +clusterCompare <- unlist(clusterCompare) +clusterCompare2 <- clusterCompare[grep("PC_", names(clusterCompare), invert = T)] +sort(clusterCompare2, decreasing = T) +boxplot(gtexTansverse$`recount_qc.aligned_reads%.chrx` ~ gtexTansverse$intestineCluster) +boxplot(gtexTansverse$`recount_qc.aligned_reads%.chrx` ~ paste0(gtexTansverse$intestineCluster, "_",gtexTansverse$gtex.sex)) +boxplot(gtexTansverse$`recount_qc.aligned_reads%.chrm` ~ gtexTansverse$intestineCluster) +# FIXME: column name was left empty; a zero-length backtick name is a parse error, so the call is disabled: boxplot(gtexTansverse$`` ~ gtexTansverse$intestineCluster) + +boxplot(gtexTansverse$`recount_qc.star.number_of_reads_unmapped:_other_both` ~ gtexTansverse$intestineCluster) + 
+boxplot(gtexTansverse$`gtex.smtsisch` ~ gtexTansverse$intestineCluster)
+boxplot(gtexTansverse$`CnvAutoCor` ~ gtexTansverse$intestineCluster)
+
+#save(gtexTansverse, file = "gtexTansverse.RData")
+load("gtexTansverse.RData")
+
+
+str(row.names(gtexTansverse))
+str(gtexTansverse$Row.names)
+str(exp)
+#Keep only the expression columns of the samples listed in gtexTansverse
+expgT <- exp[,gtexTansverse$Row.names]
+save(expgT, file = "expgT.RData")
+load( "expgT.RData")
+
+
+colnames(expgT)
+#Transpose so that the selected samples are the rows
+expgT <- t(expgT)
+all(rownames(expgT) == gtexTansverse$Row.names)
+
+x <- expgT[,1]
+
+#Per column: t-statistic of the expression against the intestine cluster
+diffExp <- apply(expgT, 2, function(x){
+  t.test(x ~gtexTansverse$intestineCluster)$statistic
+})
+#diffExp holds signed t-statistics, not p-values, -log10() of it is NaN for every negative value
+hist(diffExp)
+names(diffExp)[order(diffExp)[1:100]]
+cat(sub("\\..+","",names(diffExp)[order(diffExp, decreasing = TRUE)[1:200]]), sep = "\n")
+
+load("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Recount3_QC_2ndRun/SRA_Studies_Annotations_Patrick/Fibroblasts.rda", verbose = TRUE)
+str(fibroblasts)
+
+load("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Recount3_QC_2ndRun/SRA_Studies_Annotations_Patrick/BloodVessels.rda", verbose = TRUE)
+
+
+
+
+minSamplesTraining <- 50
+maxFractionOfStudy <- 0.8
+
+#Take only samples that have an annotation
+umapAndMetaClassified <- umapAndMeta[!is.na(umapAndMeta$umapFactor),]
+#First put all samples in the test set, the loop below moves a selection to training
+umapAndMetaClassified$training <- FALSE
+
+tissueClass <- levels(umapAndMetaClassified$umapFactor)[2]
+study <- "GTEx"
+
+set.seed(42)
+#For each tissue select samples for training
+for(tissueClass in levels(umapAndMetaClassified$umapFactor)){
+  thisTissueSamples <- umapAndMetaClassified$umapFactor==tissueClass
+  studiesForThisTissue <- unique(umapAndMetaClassified$study[thisTissueSamples])
+  numberOfStudies <- length(studiesForThisTissue)
+  #Spread the requested number of training samples evenly over the studies of this tissue
+  numberOfSamplesPerStudy <- ceiling(minSamplesTraining / numberOfStudies)
+  print(paste(tissueClass, length(studiesForThisTissue), numberOfSamplesPerStudy, sep = " - "))
+  #For each study put samples to training or test
+  for(study in studiesForThisTissue){
+    
+    thisTissueAndStudySamples <- thisTissueSamples & umapAndMetaClassified$study == study
+    thisTissueAndStudySamplesCount <- sum(thisTissueAndStudySamples)
+    
+    #Don't select more samples from a study than the study has and also no more than the set fraction. Do floor to put studies with a single sample in the test set
+    potentialMax <- floor(thisTissueAndStudySamplesCount * maxFractionOfStudy)
+    numberTrainingSamplesThisStudy <- min(potentialMax, numberOfSamplesPerStudy)
+    if(numberTrainingSamplesThisStudy > 0){
+      #Row indices of all samples of this study-tissue combination
+      candidateSamples <- which(thisTissueAndStudySamples)
+      #Index through sample.int(): sample(x, n) would draw from 1:x when x is a single index.
+      #For more than one candidate this draws exactly what sample(candidateSamples, n) draws
+      trainingSamplesThisStudy <- candidateSamples[sample.int(length(candidateSamples), numberTrainingSamplesThisStudy)]
+      #Set selected to TRUE
+      umapAndMetaClassified$training[trainingSamplesThisStudy] <- TRUE
+    }
+    
+    
+    #print(paste0(thisTissueAndStudySamplesCount, " - ", numberTrainingSamplesThisStudy))
+  }
+  
+}
+
+sum(umapAndMetaClassified$training)
+
+umapAndMetaClassifiedTraining <- umapAndMetaClassified[umapAndMetaClassified$training,]
+table(umapAndMetaClassifiedTraining$umapFactor)
+umapAndMetaClassifiedTest <- umapAndMetaClassified[!umapAndMetaClassified$training,]
+dim(umapAndMetaClassifiedTest)
+
+
+library(glmnet)
+#Cross-validated multinomial model predicting umapFactor from the first compsToUse PC_ columns
+cfit <- cv.glmnet(x = as.matrix(umapAndMetaClassifiedTraining[,paste0("PC_",1:compsToUse)]), y = umapAndMetaClassifiedTraining$umapFactor, family = "multinomial", type.measure = "class")
+cfit
+
+rpng()
+plot(cfit)
+dev.off()
+
+
+
+assess.glmnet(cfit, newx = as.matrix(umapAndMetaClassifiedTest[,paste0("PC_",1:compsToUse)]), newy = umapAndMetaClassifiedTest$umapFactor, family = "multinomial", type.measure = "class", keep = TRUE, alpha=1, lambda = "1se")
+
+
+
+predictionsTest <- predict(cfit, s = "lambda.1se", newx = as.matrix(umapAndMetaClassifiedTest[,paste0("PC_",1:compsToUse)]), type = "class")
+
+predictionsTestScores <- predict(cfit, s = "lambda.1se", newx = as.matrix(umapAndMetaClassifiedTest[,paste0("PC_",1:compsToUse)]), type = "response")
+predictionsTestScores <- predictionsTestScores[,,1]
+#Score of a sample is its highest class response
+umapAndMetaClassifiedTest$predictedTissueScore <- apply(predictionsTestScores, 1, max)
+
+prop <- 0.5
+
+#Summarise predictions when every prediction with score <= prop is treated as not predicted.
+#annotated: annotated classes, predicted: predicted classes, score: prediction scores (all same length)
+#Returns c(Threshold, MissedPerc, ErrorPerc), both fractions are relative to length(annotated)
+evaluateThreshold <- function(annotated, predicted, score, prop){
+  predicted[score <= prop] <- NA
+  misclasified <- !is.na(annotated) & !is.na(predicted) & annotated != predicted
+  notPredictedBack <- !is.na(annotated) & is.na(predicted)
+  total <- length(annotated)
+  return(c("Threshold" = prop, "MissedPerc" = sum(notPredictedBack) / total , "ErrorPerc" = sum(misclasified) / total ))
+}
+
+#Rows: Threshold, MissedPerc, ErrorPerc. Columns: thresholds 0, 0.05, ..., 1
+predictionsInTest <- vapply(seq(0,1,0.05), function(prop){
+  evaluateThreshold(umapAndMetaClassifiedTest$umapFactor, predictionsTest[,1], umapAndMetaClassifiedTest$predictedTissueScore, prop)
+}, numeric(3))
+predictionsInTest
+
+tissueClass <- levels(umapAndMetaClassified$umapFactor)[1]
+
+#Same sweep restricted to the test samples annotated as each tissue
+predictionsInTestPerTissue <- lapply(levels(umapAndMetaClassified$umapFactor), function(tissueClass){
+  thisTissue <- umapAndMetaClassifiedTest$umapFactor == tissueClass
+  predictionsInTestThisTissue <- vapply(seq(0,1,0.05), function(prop){
+    evaluateThreshold(umapAndMetaClassifiedTest$umapFactor[thisTissue], predictionsTest[thisTissue,1], umapAndMetaClassifiedTest$predictedTissueScore[thisTissue], prop)
+  }, numeric(3))
+  return(predictionsInTestThisTissue)
+})
+names(predictionsInTestPerTissue) <- levels(umapAndMetaClassified$umapFactor)
+str(predictionsInTestPerTissue)
+
+#ErrorPerc per tissue at the 11th threshold (0.5)
+x <- vapply(predictionsInTestPerTissue, function(predictionsInTestThisTissue){
+  return(predictionsInTestThisTissue[3,11])
+}, numeric(1))
+sort(x)
+
+predictionsInTest[2,15]
+
+predictionsInTestPerTissue[["Whole Blood Fetal"]]
+
+layout(matrix(1:2, nrow = 1))
+plot(t(predictionsInTest[1:2,]), main = "Percentage classification missed in test dataset")
+for(tissueClass in levels(umapAndMetaClassified$umapFactor)){
+  predictionsInTestThisTissue <- predictionsInTestPerTissue[[tissueClass]]
+  points(t(predictionsInTestThisTissue[1:2,]), type = "l", col=adjustcolor("grey", alpha.f = 0.5))
+}
+plot(t(predictionsInTest[c(1,3),]), main = "Percentage wrong classification in test dataset")
+#Same loop as for the first panel, no result variable that masks base::sink
+for(tissueClass in levels(umapAndMetaClassified$umapFactor)){
+  predictionsInTestThisTissue <- predictionsInTestPerTissue[[tissueClass]]
+  points(t(predictionsInTestThisTissue[c(1,3),]), type = "l", col=adjustcolor("grey", alpha.f = 0.5))
+}
+
+
+
+
+confusion <- confusion.glmnet(cfit, newx = as.matrix(umapAndMetaClassifiedTest[,paste0("PC_",1:compsToUse)]), newy = umapAndMetaClassifiedTest$umapFactor, family = "multinomial", type.measure = "class", keep = TRUE, alpha=1, lambda = "1se")
+#Blank the diagonal so only the off-diagonal counts are coloured
+diag(confusion) <- 0
+
+library(heatmap3)
+
+#rpng() is not combined with pdf() here: the single dev.off() below closes only one device
+#rpng()
+pdf("confusion.pdf", width = 12, height = 12)
+heatmap3(confusion, Rowv = NA, Colv = NA, balanceColor = TRUE, scale = "none")
+dev.off()
+
+
+#Now predict all samples, also the ones without annotation
+predictions <- predict(cfit, s = "lambda.1se", newx = as.matrix(umapAndMeta[,paste0("PC_",1:compsToUse)]), type = "class")
+umapAndMeta$predictedTissue <- predictions[,1]
+
+predictionsScores <- predict(cfit, s = "lambda.1se", newx = as.matrix(umapAndMeta[,paste0("PC_",1:compsToUse)]), type = "response")
+predictionsScores <- predictionsScores[,,1]
+rownames(predictionsScores) <- umapAndMeta$Row.names
+umapAndMeta$predictedTissueScore <- apply(predictionsScores, 1, max)
+
+#Predictions with a score of at most 0.5 are set to missing
+sum(umapAndMeta$predictedTissueScore <= 0.5)
+umapAndMeta$predictedTissue[umapAndMeta$predictedTissueScore <= 0.5] <- NA
+
+
+rpng()
+hist(umapAndMeta$predictedTissueScore)
+dev.off()
+
+umapAndMeta$misclasified <- FALSE
+umapAndMeta$misclasified[!is.na(umapAndMeta$umapFactor) & !is.na(umapAndMeta$predictedTissue) & umapAndMeta$umapFactor != umapAndMeta$predictedTissue] <- TRUE
+sum(umapAndMeta$misclasified )
+
+umapAndMeta$notPredictedBack <- FALSE
+umapAndMeta$notPredictedBack[!is.na(umapAndMeta$umapFactor) & is.na(umapAndMeta$predictedTissue) ] <- TRUE
+sum(umapAndMeta$notPredictedBack)
+
+sum(!is.na(umapAndMeta$predictedTissue))
+
+length(unique((umapAndMeta$predictedTissue)))
+
+sum(table((umapAndMeta$predictedTissue)) >= 1000)
+hist(table((umapAndMeta$predictedTissue)), breaks =25)
+barplot(table((umapAndMeta$predictedTissue)))
+
+sort(table(umapAndMeta[umapAndMeta$misclasified, "umapFactor"]))
+sort(table(umapAndMeta[umapAndMeta$notPredictedBack, "umapFactor"]))
+
+tissueClass <- levels(umapAndMeta$umapFactor)[1]
+
+#One page per tissue: UMAP with the samples coloured by how annotation and prediction relate
+pdf("tissuePrediction.pdf")
+for(tissueClass in levels(umapAndMeta$umapFactor)){
+  
+  annotatedThisTissue <- !is.na(umapAndMeta$umapFactor) & tissueClass == umapAndMeta$umapFactor
+  predictedThisTissue <- !is.na(umapAndMeta$predictedTissue) & tissueClass == umapAndMeta$predictedTissue
+  
+  #Samples without a prediction are not misclasified either, exclude them explicitly so the
+  #count in the legend matches the green points
+  isPredictedBack <- annotatedThisTissue & !umapAndMeta$misclasified & !umapAndMeta$notPredictedBack
+  isNotPredictedBack <- annotatedThisTissue & umapAndMeta$notPredictedBack
+  isPredictedAsOther <- annotatedThisTissue & umapAndMeta$misclasified
+  isOtherPredicted <- !is.na(umapAndMeta$umapFactor) & !annotatedThisTissue & predictedThisTissue
+  isNewPredicted <- is.na(umapAndMeta$umapFactor) & predictedThisTissue
+  
+  umapAndMeta$ThisTissueCol <- defaultCol
+  umapAndMeta$ThisTissueCol[isPredictedBack] <- adjustcolor("forestgreen", alpha.f = 0.5)
+  umapAndMeta$ThisTissueCol[isNotPredictedBack] <- adjustcolor("hotpink", alpha.f = 0.5)
+  umapAndMeta$ThisTissueCol[isPredictedAsOther] <- adjustcolor("violetred3", alpha.f = 0.5)
+  umapAndMeta$ThisTissueCol[isOtherPredicted] <- adjustcolor("orange1", alpha.f = 0.5)
+  umapAndMeta$ThisTissueCol[isNewPredicted] <- adjustcolor("dodgerblue1", alpha.f = 0.5)
+  
+  predictedBack <- sum(isPredictedBack)
+  notPredictedBack <- sum(isNotPredictedBack)
+  predictedAsOther <- sum(isPredictedAsOther)
+  otherPredicted <- sum(isOtherPredicted)
+  newPredicted <- sum(isNewPredicted)
+  
+  table(umapAndMeta$ThisTissueCol, useNA = "always")
+  
+  #Samples with the default colour first, so the coloured samples are drawn on top
+  umapAndMeta$plotOrderThisTissues <- order(umapAndMeta$ThisTissueCol != defaultCol)
+  
+  #rpng()
+  layout(matrix(c(1,2,3), ncol = 1, byrow = TRUE), heights = c(0.05,0.85,0.1))
+  par(mar = c(0,0,0,0), xpd = NA)
+  plot.new()
+  plot.window(xlim = 0:1, ylim = 0:1)
+  text(0.5,0.5,tissueClass, cex = 2 , font = 2)
+  
+  par(mar = c(5,5,0,0.1), xpd = NA)
+  plot(umapAndMeta[umapAndMeta$plotOrderThisTissues,"UMAP1"], umapAndMeta[umapAndMeta$plotOrderThisTissues,"UMAP2"], col = umapAndMeta$ThisTissueCol[umapAndMeta$plotOrderThisTissues], cex = 0.2, pch = 16, bty="n", xlab = "UMAP-1", ylab = "UMAP-2")
+  
+  par(mar = c(0,0,0,0), xpd = NA)
+  plot.new()
+  plot.window(xlim = 0:1, ylim = 0:1)
+  legend("center", fill = c(
+    "forestgreen", 
+    "hotpink", 
+    "violetred3", 
+    "orange1", 
+    "dodgerblue1"
+  ), 
+  legend = c(
+    paste0(tissueClass, " correctly predicted back (", predictedBack,")"),
+    paste0(tissueClass, " not predicted back (", notPredictedBack,")"),
+    paste0(tissueClass, " predicted as other (", predictedAsOther,")"),
+    paste0("Other tissue predicted as ", tissueClass," (", otherPredicted,")"),
+    paste0("Unkown predicted as ", tissueClass, " (", newPredicted,")")
+  ),
+  bty = "n")
+  
+  
+  
+  #dev.off()
+  
+}
+dev.off()
+
+#save(umapAndMeta, file = "tissuePredictions/tissuePredictions_16_09_22.RData")
+load("tissuePredictions/tissuePredictions_16_09_22.RData", verbose = TRUE)
+
+#Predicted tissues that have no row in tissueCol
+unique(umapAndMeta$predictedTissue)[!unique(umapAndMeta$predictedTissue) %in% rownames(tissueCol)]
+
+
+
+clusterToExclude <- c("U2-OS", "Leukemia_blood-cell-line", "HAP1", "LNCaP")
+
+
+
+
+samplesWithPrediction <- umapAndMeta[!is.na(umapAndMeta$predictedTissue) & !umapAndMeta$predictedTissue %in% clusterToExclude, c(
+  "predictedTissue", 
+  "predictedTissueScore", 
+  "umapFactor", 
+  "misclasified", 
+  "study", 
+  "sra.library_layout"
+)]
+colnames(samplesWithPrediction)[3] <- "annotatedTissue"
+str(samplesWithPrediction)
+#save(samplesWithPrediction, file = "tissuePredictions/samplesWithPrediction_16_09_22.RData")
+
+write.table(samplesWithPrediction, file = "samplesWithPrediction.txt")
+load("tissuePredictions/samplesWithPrediction_16_09_22.RData")
+str(samplesWithPrediction)
+table(samplesWithPrediction$predictedTissue)
+
+load(file = "umap/sampleUmap6.RData", verbose = TRUE)
+
+
+umapAndPredictions <- merge(samplesWithPrediction, sampleUmap, by = 0 )
+rownames(umapAndPredictions) <- umapAndPredictions$Row.names
+
+
+#Colour by predicted tissue where tissueCol has a colour, default colour otherwise
+umapAndPredictions$TissuePredictedCol <- defaultCol
+umapAndPredictions$TissuePredictedCol[umapAndPredictions$predictedTissue %in% rownames(tissueCol)] <- adjustcolor(tissueCol[as.character(umapAndPredictions$predictedTissue[umapAndPredictions$predictedTissue %in% rownames(tissueCol)]),1], alpha.f = 0.5)
+umapAndPredictions$plotOrderTissuePredicted <- order(umapAndPredictions$TissuePredictedCol != defaultCol)
+
+#rpng()
+
+par(mar = c(3,3,0.1,0.1), xpd = NA)
+plot(umapAndPredictions[umapAndPredictions$plotOrderTissuePredicted,"UMAP1"], umapAndPredictions[umapAndPredictions$plotOrderTissuePredicted,"UMAP2"], col = umapAndPredictions$TissuePredictedCol[umapAndPredictions$plotOrderTissuePredicted], cex = 0.2, pch = 16)
+
+plot(umapAndPredictions[umapAndPredictions$plotOrderTissuePredicted,"UMAP1"], umapAndPredictions[umapAndPredictions$plotOrderTissuePredicted,"UMAP2"], col = umapAndPredictions$TissuePredictedCol[umapAndPredictions$plotOrderTissuePredicted], cex = 0.2, pch = 16, xlim = c(-100,70), ylim = c(-50,50))
+
+
+
+
+#dev.off()
+
+locator(n =2, type = "l")
+
+
+pdf(file = "umapPredicted.pdf", width = 16, height = 8)
+#rpng()
+
+layout(matrix(1:2,ncol = 2))
+
+par(mar = c(5,5,0.1,0.1), xpd = NA)
+plot(umapAndPredictions[umapAndPredictions$plotOrderTissuePredicted,"UMAP1"], umapAndPredictions[umapAndPredictions$plotOrderTissuePredicted,"UMAP2"], col = umapAndPredictions$TissuePredictedCol[umapAndPredictions$plotOrderTissuePredicted], cex = 0.2, pch = 16, bty = "n", xlab = "UMAP-1", ylab = "UMAP-2")
+
+par(mar = c(0,0,0,0), xpd = NA)
+plot.new()
+plot.window(xlim = 0:1, ylim = 0:1)
+legend("center", fill = tissueCol[rownames(tissueCol) %in% umapAndPredictions$predictedTissue,1], legend = row.names(tissueCol)[rownames(tissueCol) %in% umapAndPredictions$predictedTissue], bty = "n", ncol = 2,cex = 0.7)
+
+
+dev.off()
+
+
+
+
+countTable <- table(umapAndPredictions$predictedTissue)
+sum(countTable)
+sum(countTable >= 500)
+#Barplot of the number of samples per predicted tissue, counts written above the bars
+pdf("baplotTissues.pdf", width = 15, height = 10)
+par(mar = c(25,5,2,0.1), xpd = NA)
+b <- barplot(countTable, las =2, col = tissueCol[names(countTable),])
+text(b, countTable + 280, countTable, font=1, srt = 90)
+dev.off()
diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/extractRawReadCounts.R b/Downstreamer/src/main/r/downstreamer_main/recount3/extractRawReadCounts.R
new file mode 100644
index 000000000..a01f9d656
--- /dev/null
+++ b/Downstreamer/src/main/r/downstreamer_main/recount3/extractRawReadCounts.R
@@ -0,0 +1,64 @@
+#srun --cpus-per-task=1 --mem=50gb --nodes=1 --qos=priority --time=168:00:00 --pty bash -i
+#remoter::server(verbose = T, port = 55556, sync = T)
+
+
+remoter::client("localhost", port = 55501)
+
+
+
+setwd("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/")
+
+
+
+#NOTE(review): pattern is a regular expression, not a glob. "sra*" matches every file name
+#containing "sr" and "rse*" every name containing "rs", confirm that is the intended selection
+sraFiles <- list.files(path="rse-sra/SRA_Files/", pattern="sra*", full.names=TRUE, recursive=FALSE)
+gtexFiles <- list.files(path="rse-gtex/rse_gtex", pattern="rse*", full.names=TRUE, recursive=FALSE)
+allFiles <- c(sraFiles, gtexFiles, "rse-tcga/rseTCGA.rda", "rse-tcga/rse_ESCA_TCGA.rda")
+
+load("tissuePredictions/samplesWithPrediction_16_09_22.RData")
+selectedSamples <- rownames(samplesWithPrediction)
+str(selectedSamples)
+
+
+#file = allFiles[10]
+
+#Per file: raw count matrix of the selected samples found in that file.
+#lapply and not sapply: do.call(cbind, ...) below needs a list, sapply may simplify to a matrix
+perChunkExp <- lapply(allFiles, function(file){
+  
+  #load() returns the names of the loaded objects, only the first object is used
+  loadedObject <- load(file)
+  
+  sreObjects <- get(loadedObject[1])
+  
+  #sometimes single RSE is not in list. Put in list of one to make code uniform
+  if(!is.list(sreObjects)){
+    sreObjects <- list(sreObjects)
+  }
+  
+  #sreObject <- sreObjects[[1]]
+  
+  perStudyExp <- lapply(sreObjects, function(sreObject){
+    studyExp <- sreObject@assays@data@listData$raw_counts
+    return(studyExp[,colnames(studyExp) %in% selectedSamples, drop = FALSE])
+  })
+  
+  return(do.call(cbind, perStudyExp))
+  
+})
+
+#sreObject only exists inside the function above, str() on it at top level is an error
+#str(sreObject)
+
+selectedSamplesExp <- do.call(cbind, perChunkExp)
+str(selectedSamplesExp)
+all(selectedSamples %in% colnames(selectedSamplesExp ))
+table(selectedSamples %in% colnames(selectedSamplesExp ))
+
+
+
+#Some samples are duplicated in the chunks, now make sure only one is in the matrix
+uniqueSamplesIndex <- match(selectedSamples, colnames(selectedSamplesExp))
+#A sample without expression column gives NA in match(), indexing with it would add an all-NA column
+if(anyNA(uniqueSamplesIndex)){
+  warning(sum(is.na(uniqueSamplesIndex)), " selected samples have no expression column and are left out")
+  uniqueSamplesIndex <- uniqueSamplesIndex[!is.na(uniqueSamplesIndex)]
+}
+selectedSamplesExp <- selectedSamplesExp[,uniqueSamplesIndex, drop = FALSE]
+
+
+
+#save(selectedSamplesExp, file = "perTissueNormalization/selectedSamplesRawExpression.RData")
+
+
+
diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/harmonizeAndExtentSampleAnnotations.R b/Downstreamer/src/main/r/downstreamer_main/recount3/harmonizeAndExtentSampleAnnotations.R
new file mode 100644
index 000000000..fdb5022b5
--- /dev/null
+++ b/Downstreamer/src/main/r/downstreamer_main/recount3/harmonizeAndExtentSampleAnnotations.R
@@ -0,0 +1,1881 @@
+#srun --cpus-per-task=1 --mem=50gb --nodes=1 --qos=priority --time=168:00:00 --pty bash -i
+#remoter::server(verbose = T, port = 55556, password = "laberkak", sync = T)
+
+
+#NOTE(review): the remoter password is committed in plain text here and in the comment above,
+#consider supplying it from outside the script
+remoter::client("localhost", port = 55501, password = "laberkak")
+
+
+#save.image("tmp2.RData")
+
+
+
+#NOTE(review): both setwd() calls run, the second one fails on a machine without the D: path
+setwd("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/")
+setwd("D:\\UMCG\\Genetica\\Projects\\Depict2Pgs\\Recount3\\")
+#load("tmp2.RData")
+
+
+library(readr)
+
+
+#The two SRA metadata files, the first column of each file is dropped
+table_tmp <- read_delim("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Metadata/metadata_sra1.txt", delim = "\t", quote = "", guess_max = 20000)
+sraMeta1 <- as.data.frame(table_tmp[,-1])
+rm(table_tmp)
+
+table_tmp <- read_delim("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Metadata/metadata_sra2.txt", delim = "\t", quote = "", guess_max = 20000)
+sraMeta2 <- as.data.frame(table_tmp[,-1])
+rm(table_tmp)
+
+sraSharedCol <- intersect(colnames(sraMeta2), colnames(sraMeta1))
+length(sraSharedCol)
+
+sraMeta <- rbind(sraMeta1[,sraSharedCol], sraMeta2[,sraSharedCol])
+
+#For some reason some runs are duplicated in the meta data file.
+#Quick inspection showed that they have the same values
+#Solution exclude duplicate row
+sraUniqueIds <- unique(sraMeta$external_id)
+str(sraUniqueIds)
+sraMeta <- sraMeta[ match(sraUniqueIds, sraMeta$external_id), ]
+rownames(sraMeta) <- sraMeta$external_id
+
+
+
+#extra columns in part 2
+sraPart2Col <- colnames(sraMeta2)[!colnames(sraMeta2) %in% colnames(sraMeta1)]
+
+#One row per unique run, all NA for runs that are not in part 2
+sraMetaExtended <- sraMeta2[match(sraUniqueIds, sraMeta2$external_id),sraPart2Col, drop = FALSE]
+#This table is merged on row names further down, without the run ids as row names nothing matches
+rownames(sraMetaExtended) <- sraUniqueIds
+
+str(sraMetaExtended)
+
+#Number of unique runs per part
+length(unique(sraMeta2$external_id))
+length(unique(sraMeta1$external_id))
+
+#Total number of runs per part
+length(sraMeta2$external_id)
+length(sraMeta1$external_id)
+
+#metadata_gtex
+load(file = "/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Metadata/metadata_gtex.Rda", verbose = TRUE)
+rownames(metadata_gtex) <- metadata_gtex$external_id
+metadata_gtex2 <- metadata_gtex[,c("gtex.smts", "gtex.smtsd")]
+str(metadata_gtex2)
+
+#metadata_tcga
+load(file = "/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Metadata/metadata_tcga.Rda", verbose = TRUE)
+rownames(metadata_tcga) <- metadata_tcga$external_id
+metadata_tcga2 <- metadata_tcga[,c("tcga.gdc_cases.project.primary_site", "tcga.cgc_sample_sample_type")]
+
+
+
+#ARCH4 data
+table_tmp <- read_delim("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Metadata/metadataSRA.txt", delim = "\t", quote = "")
+metadata_archs4 <- as.data.frame(table_tmp[,-1])
+#Row names come from the third column of the file as read
+rownames(metadata_archs4) <- table_tmp[,3][[1]]
+rm(table_tmp)
+metadata_archs4_2 <- metadata_archs4[,c("Tissue", "CellType", "CellLine")]
+#Prefix the ARCHS4 columns with their source
+names(metadata_archs4_2) <- c("archs4.Tissue", "archs4.CellType", "archs4.CellLine")
+
+#GADO annotations, indexed by sample, columns prefixed with "gado."
+metadata_Gado <- read.delim("celllinesAndCancer/oldAnnotations/sampleAnnotations.txt")
+row.names(metadata_Gado) <- metadata_Gado$Sample
+metadata_Gado2 <- metadata_Gado[,c("CellLine", "TissueType", "CellType", "PlotClass")]
+names(metadata_Gado2) <- paste0("gado.",names(metadata_Gado2))
+gadoTissueCol <- read.delim("celllinesAndCancer/oldAnnotations/tissueCol5.txt")
+
+
+#Kidney Network annotations, indexed by sample, columns prefixed with "KidneyNetwork."
+metadata_Kn <- read.delim("Metadata/KidneyNetwork.txt")
+row.names(metadata_Kn) <- metadata_Kn$Sample
+metadata_Kn2 <- metadata_Kn[,c("Origin", "Cell_type", "Cell_type_simplified", "Cell_type_manual")]
+names(metadata_Kn2) <- paste0("KidneyNetwork.",names(metadata_Kn2))
+
+
+#Mahmoud annotations, indexed by SampleID, without the first two columns
+load("Recount3_QC_2ndRun/SRA_Studies_Annotations_Patrick/Annotations.rda", verbose = TRUE)
+str(Annotations)
+row.names(Annotations) <- Annotations$SampleID
+mahmoudAnnotations <- Annotations[,-(1:2)]
+#write.table(Annotations, sep = "\t", quote = F, col.names = NA, file = "tmp.txt")
+
+#All sample ids: GTEx, TCGA and SRA
+allSamples <- c(rownames(metadata_gtex2), rownames(metadata_tcga2), rownames(sraMeta))
+
+#Should print TRUE: ids are used as row names below
+length(allSamples) == length(unique(allSamples))
+
+#Empty annotation table: one row per sample, "" for text, NA for the flags, cohort SRA unless set below
+numberSamples <- length(allSamples)
+finalAnnotations <- data.frame(
+  Tissue = character(numberSamples),
+  Tissue2 = character(numberSamples),
+  Cellline = rep(NA, numberSamples),
+  CelllineName = character(numberSamples),
+  Cancer = rep(NA, numberSamples),
+  Cohort = rep("SRA",numberSamples),
+  Fetal = rep(NA, numberSamples),
+  row.names = allSamples, stringsAsFactors = FALSE)
+
+#TCGA is assigned last, so it wins if an id were in both
+inGtex <- rownames(finalAnnotations) %in% rownames(metadata_gtex2)
+inTcga <- rownames(finalAnnotations) %in% rownames(metadata_tcga2)
+finalAnnotations$Cohort[inGtex] <- "GTEx"
+finalAnnotations$Cohort[inTcga] <- "TCGA"
+rm(inGtex, inTcga)
+table(finalAnnotations$Cohort, useNA = "always")
+str(finalAnnotations)
+
+
+dim(finalAnnotations)
+dim(metadata_gtex2)
+
+
+sum(rownames(finalAnnotations) %in% rownames(metadata_gtex2))
+
+#Left join y to x on row names. merge() puts the key in a Row.names column, use it to restore
+#the row names and drop it again so that repeated merges do not pile up Row.names columns
+mergeByRowNames <- function(x, y){
+  merged <- merge(x, y, all.x = TRUE, by = 0)
+  rownames(merged) <- merged$Row.names
+  return(merged[, colnames(merged) != "Row.names", drop = FALSE])
+}
+
+#Add the sources one by one to the annotation skeleton, same order as before:
+#gtex, tcga, archs4, gado, sra part 2 only columns, sra shared columns, kidney network
+combinedMeta <- Reduce(
+  mergeByRowNames,
+  list(metadata_gtex2, metadata_tcga2, metadata_archs4_2, metadata_Gado2, sraMetaExtended, sraMeta, metadata_Kn2),
+  finalAnnotations)
+str(combinedMeta)
+
+#now fillin the gtex and gcta recount meta data.
+
+tmp <- metadata_gtex[,colnames(metadata_gtex) %in% sraSharedCol]
+combinedMeta[rownames(tmp),colnames(tmp)] <- tmp
+
+tmp <- metadata_tcga[,colnames(metadata_tcga) %in% sraSharedCol]
+combinedMeta[rownames(tmp),colnames(tmp)] <- tmp
+
+rm(tmp)
+
+isGtex <- combinedMeta$Cohort == "GTEx"
+isTcga <- combinedMeta$Cohort == "TCGA"
+
+#set study make column uniform
+combinedMeta$study[isGtex] <- "GTEx"
+combinedMeta$study[isTcga] <- "TCGA"
+
+combinedMeta$exclude <- FALSE
+
+#save(combinedMeta, file = "combinedMeta.RData")
+#load(file = "combinedMeta.RData")
+
+#GTEx defaults: tissue from the GTEx annotation, no cell line, no cancer. Exceptions below
+combinedMeta$Tissue[isGtex] <- combinedMeta$gtex.smts[isGtex]
+combinedMeta$Tissue2[isGtex] <- combinedMeta$gtex.smtsd[isGtex]
+combinedMeta$Cellline[isGtex] <- FALSE
+combinedMeta$Cancer[isGtex] <- FALSE
+
+#The GTEx "Cells - ..." samples become cell lines without tissue
+gtexLcl <- isGtex & (!is.na(combinedMeta$gtex.smtsd) & combinedMeta$gtex.smtsd == "Cells - EBV-transformed lymphocytes")
+combinedMeta$Cellline[gtexLcl] <- TRUE
+combinedMeta$CelllineName[gtexLcl] <- "lcl"
+combinedMeta$Tissue[gtexLcl] <- ""
+combinedMeta$Tissue2[gtexLcl] <- ""
+
+gtexCml <- isGtex & (!is.na(combinedMeta$gtex.smtsd) & combinedMeta$gtex.smtsd == "Cells - Leukemia cell line (CML)")
+combinedMeta$Cellline[gtexCml] <- TRUE
+combinedMeta$CelllineName[gtexCml] <- "cml"
+combinedMeta$Tissue[gtexCml] <- ""
+combinedMeta$Tissue2[gtexCml] <- ""
+
+gtexFibroblasts <- isGtex & (!is.na(combinedMeta$gtex.smtsd) & combinedMeta$gtex.smtsd == "Cells - Cultured fibroblasts")
+combinedMeta$Cellline[gtexFibroblasts] <- TRUE
+combinedMeta$CelllineName[gtexFibroblasts] <- "Fibroblasts"
+combinedMeta$Tissue[gtexFibroblasts] <- ""
+combinedMeta$Tissue2[gtexFibroblasts] <- ""
+
+
+table(combinedMeta$gtex.smtsd[isGtex])
+
+combinedMeta$Tissue[isTcga] <- combinedMeta$tcga.gdc_cases.project.primary_site[isTcga]
+combinedMeta$Cellline[isTcga] <- FALSE
+combinedMeta$Cancer[isTcga] <- TRUE #default for TCGA exception below
+combinedMeta$Cancer[isTcga & combinedMeta$tcga.cgc_sample_sample_type == "Solid Tissue Normal"] <- FALSE
+
+
+#Map GADO names to gtex names
+#First letter to upper case: https://stackoverflow.com/questions/18509527/first-letter-to-upper-case
+hasGadoTissue <- !is.na(combinedMeta$gado.TissueType) & combinedMeta$gado.TissueType != ""
+combinedMeta$gado.TissueType[hasGadoTissue] <- gsub("(^[[:alpha:]])", "\\U\\1", combinedMeta$gado.TissueType[hasGadoTissue], perl=TRUE)
+gadoTissueCol$PlotClass <- gsub("(^[[:alpha:]])", "\\U\\1", gadoTissueCol$PlotClass, perl=TRUE)
+combinedMeta$gado.TissueType[!is.na(combinedMeta$gado.TissueType) & combinedMeta$gado.TissueType == "Adipose"] <- "Adipose Tissue"
+gadoTissueCol$PlotClass[gadoTissueCol$PlotClass == "Adipose"] <- "Adipose Tissue"
+
+#Fix
+combinedMeta$gado.CellType[!is.na(combinedMeta$gado.CellType) & combinedMeta$gado.CellType == "acute myeloid leukemia"] <- "AML"
+
+#Only annotations with a color are checked and highly reliable
+gadoAnnotatedTissues <- !is.na(combinedMeta$gado.TissueType) & combinedMeta$gado.TissueType != "" & combinedMeta$gado.TissueType %in% gadoTissueCol$PlotClass
+combinedMeta$Tissue[gadoAnnotatedTissues] <- combinedMeta$gado.TissueType[gadoAnnotatedTissues]
+combinedMeta$Cancer[gadoAnnotatedTissues] <- FALSE
+combinedMeta$Cellline[gadoAnnotatedTissues] <- FALSE
+
+gadoAnnotatedCelltypes <- !is.na(combinedMeta$gado.CellType) & combinedMeta$gado.CellType != "" & combinedMeta$gado.CellType %in% gadoTissueCol$PlotClass
+combinedMeta$Tissue2[gadoAnnotatedCelltypes] <- combinedMeta$gado.CellType[gadoAnnotatedCelltypes]
+combinedMeta$Cancer[gadoAnnotatedCelltypes] <- FALSE
+combinedMeta$Cellline[gadoAnnotatedCelltypes] <- FALSE
+
+#These two cell types are cancers, overrule the FALSE set just above
+combinedMeta$Cancer[!is.na(combinedMeta$gado.CellType) & combinedMeta$gado.CellType == "AML"] <- TRUE
+combinedMeta$Cancer[!is.na(combinedMeta$gado.CellType) & combinedMeta$gado.CellType == "DLBCL"] <- TRUE
+
+
+#Cell line names are compared case insensitive with the colored classes
+gadoAnnotatedCelllines <- !is.na(combinedMeta$gado.CellLine) & combinedMeta$gado.CellLine != "" & tolower(combinedMeta$gado.CellLine) %in% tolower(gadoTissueCol$PlotClass)
+combinedMeta$CelllineName[gadoAnnotatedCelllines] <- combinedMeta$gado.CellLine[gadoAnnotatedCelllines]
+combinedMeta$Cancer[gadoAnnotatedCelllines] <- FALSE
+combinedMeta$Cellline[gadoAnnotatedCelllines] <- TRUE
+
+#Some manual stuff for big studies
+
+combinedMeta$CelllineName[!is.na(combinedMeta$study) & combinedMeta$study == "SRP166108"] <- "HepaRG"
+combinedMeta$Cellline[!is.na(combinedMeta$study) & combinedMeta$study == "SRP166108"] <- TRUE
+
+
+combinedMeta$Tissue[!is.na(combinedMeta$study) & combinedMeta$study == "SRP192714"] <- "Blood"
+combinedMeta$Tissue2[!is.na(combinedMeta$study) & combinedMeta$study == "SRP192714"] <- "Whole Blood"
+combinedMeta$Cellline[!is.na(combinedMeta$study) & combinedMeta$study == "SRP192714"] <- FALSE
+combinedMeta$Cancer[!is.na(combinedMeta$study) & combinedMeta$study == "SRP192714"] <- FALSE + +combinedMeta$Cellline[!is.na(combinedMeta$study) & combinedMeta$study == "SRP186687"] <- TRUE + +combinedMeta$Tissue[!is.na(combinedMeta$study) & combinedMeta$study == "SRP092402"] <- "Blood" +combinedMeta$Tissue2[!is.na(combinedMeta$study) & combinedMeta$study == "SRP092402"] <- "Whole Blood" +combinedMeta$Cellline[!is.na(combinedMeta$study) & combinedMeta$study == "SRP092402"] <- FALSE +combinedMeta$Cancer[!is.na(combinedMeta$study) & combinedMeta$study == "SRP092402"] <- FALSE + + +combinedMeta$Tissue[combinedMeta$study == "SRP116272" & grepl("source_name;;T cells", combinedMeta$sra.sample_attributes)] <- "Blood" +combinedMeta$Tissue2[combinedMeta$study == "SRP116272" & grepl("source_name;;T cells", combinedMeta$sra.sample_attributes)] <- "T-cells" +combinedMeta$Cellline[combinedMeta$study == "SRP116272" & grepl("source_name;;T cells", combinedMeta$sra.sample_attributes)] <- FALSE +combinedMeta$Cancer[combinedMeta$study == "SRP116272" & grepl("source_name;;T cells", combinedMeta$sra.sample_attributes)] <- FALSE + +combinedMeta$Tissue[combinedMeta$study == "SRP116272" & grepl("source_name;;Monocytes", combinedMeta$sra.sample_attributes)] <- "Blood" +combinedMeta$Tissue2[combinedMeta$study == "SRP116272" & grepl("source_name;;Monocytes", combinedMeta$sra.sample_attributes)] <- "Monocytes" +combinedMeta$Cellline[combinedMeta$study == "SRP116272" & grepl("source_name;;Monocytes", combinedMeta$sra.sample_attributes)] <- FALSE +combinedMeta$Cancer[combinedMeta$study == "SRP116272" & grepl("source_name;;Monocytes", combinedMeta$sra.sample_attributes)] <- FALSE + +combinedMeta$Tissue[combinedMeta$study == "SRP061932"] <- "" +combinedMeta$Tissue2[combinedMeta$study == "SRP061932"] <- "" +combinedMeta$Cellline[combinedMeta$study == "SRP061932"] <- FALSE +combinedMeta$Cancer[combinedMeta$study == "SRP061932"] <- FALSE + + + +combinedMeta$Tissue[combinedMeta$study == 
"SRP047323"] <- "" +combinedMeta$Tissue2[combinedMeta$study == "SRP047323"] <- "" +combinedMeta$Cellline[combinedMeta$study == "SRP047323"] <- FALSE +combinedMeta$Cancer[combinedMeta$study == "SRP047323"] <- FALSE + + + +combinedMeta$CelllineName[combinedMeta$study == "ERP001942"] <- "lcl" +combinedMeta$Cellline[combinedMeta$study == "ERP001942"] <- TRUE + + +combinedMeta$Tissue2[!is.na(combinedMeta$study) & combinedMeta$study == "ERP007111"] <- "iPSC" + +combinedMeta$Cellline[!is.na(combinedMeta$study) & combinedMeta$study == "ERP012914"] <- TRUE +combinedMeta$CelllineName[!is.na(combinedMeta$study) & combinedMeta$study == "ERP012914"] <- "HAP1" + + +combinedMeta$Tissue[!is.na(combinedMeta$study) & combinedMeta$study == "SRP151763"] <- "Eye" +combinedMeta$Tissue2[!is.na(combinedMeta$study) & combinedMeta$study == "SRP151763"] <- "Retina" +combinedMeta$Cellline[combinedMeta$study == "SRP151763"] <- FALSE +combinedMeta$Cancer[combinedMeta$study == "SRP151763"] <- FALSE + + + +combinedMeta$Tissue[!is.na(combinedMeta$study) & combinedMeta$study == "SRP162411"] <- "Blood" +combinedMeta$Tissue2[!is.na(combinedMeta$study) & combinedMeta$study == "SRP162411"] <- "Whole Blood" + +studySamples <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP102542" +combinedMeta$Tissue[studySamples] <- "Muscle" +combinedMeta$Tissue2[studySamples] <- "" +combinedMeta$Cellline[studySamples] <- FALSE +combinedMeta$Cancer[studySamples] <- FALSE + + +studySamples <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP150311" +combinedMeta$Tissue[studySamples] <- "Muscle" +combinedMeta$Tissue2[studySamples] <- "" +combinedMeta$Cellline[studySamples] <- FALSE +combinedMeta$Cancer[studySamples] <- FALSE + + +studySamples <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP162873" +combinedMeta$Tissue[studySamples] <- "Muscle" +combinedMeta$Tissue2[studySamples] <- "" +combinedMeta$Cellline[studySamples] <- FALSE +combinedMeta$Cancer[studySamples] <- FALSE + + + +studySamples <- 
!is.na(combinedMeta$study) & combinedMeta$study == "SRP163524"  # right-hand side of the `studySamples <-` assignment opened just above
combinedMeta$Tissue[studySamples] <- "Muscle"
combinedMeta$Tissue2[studySamples] <- ""
combinedMeta$Cellline[studySamples] <- FALSE
combinedMeta$Cancer[studySamples] <- FALSE

# ---- Manual per-study annotation overrides ----
# Every section below builds a logical row mask (`studySamples` or `samples`)
# and then overwrites the curated columns (Tissue, Tissue2, Cellline,
# CelllineName, Cancer) for those rows. Sections run top to bottom, so when two
# masks overlap the later section wins.
# Masks built without an is.na() guard can contain NA; a length-one replacement
# through `$column[mask] <-` does not touch rows whose subscript is NA.

studySamples <- !is.na(combinedMeta$study) &
  combinedMeta$study %in% c("SRP006676", "SRP071758", "SRP081599", "SRP086078", "SRP119923")
combinedMeta$Tissue[studySamples] <- "Airway Epithelial"
combinedMeta$Tissue2[studySamples] <- ""
combinedMeta$Cellline[studySamples] <- FALSE
combinedMeta$Cancer[studySamples] <- FALSE

samples <- combinedMeta$study == "SRP188219" &
  grepl("left atrial appendage", combinedMeta$sra.sample_attributes)
combinedMeta$Tissue[samples] <- "Heart"
combinedMeta$Tissue2[samples] <- "Left atrial appendage"
combinedMeta$Cellline[samples] <- FALSE
combinedMeta$Cancer[samples] <- FALSE

samples <- combinedMeta$study == "SRP188219" &
  grepl("right atrial appendage", combinedMeta$sra.sample_attributes)
combinedMeta$Tissue[samples] <- "Heart"
combinedMeta$Tissue2[samples] <- "Right atrial appendage"
combinedMeta$Cellline[samples] <- FALSE
combinedMeta$Cancer[samples] <- FALSE

# iPSC studies
studySamples <- !is.na(combinedMeta$study) & combinedMeta$study == "SRA755613"
combinedMeta$Tissue[studySamples] <- ""
combinedMeta$Tissue2[studySamples] <- ""
combinedMeta$Cellline[studySamples] <- TRUE
combinedMeta$CelllineName[studySamples] <- "iPSC"
combinedMeta$Cancer[studySamples] <- FALSE

studySamples <- !is.na(combinedMeta$study) & combinedMeta$study == "SRA755626"
combinedMeta$Tissue[studySamples] <- ""
combinedMeta$Tissue2[studySamples] <- ""
combinedMeta$Cellline[studySamples] <- TRUE
combinedMeta$CelllineName[studySamples] <- "iPSC"
combinedMeta$Cancer[studySamples] <- FALSE

studySamples <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP148659"
combinedMeta$Tissue[studySamples] <- ""
combinedMeta$Tissue2[studySamples] <- ""
combinedMeta$Cellline[studySamples] <- TRUE
combinedMeta$CelllineName[studySamples] <- "iPSC"
combinedMeta$Cancer[studySamples] <- FALSE

# Annotations unclear: set to missing
studySamples <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP009316"
combinedMeta$Tissue[studySamples] <- ""
combinedMeta$Tissue2[studySamples] <- ""
combinedMeta$Cellline[studySamples] <- NA
combinedMeta$CelllineName[studySamples] <- ""
combinedMeta$Cancer[studySamples] <- NA

studySamples <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP021509"
combinedMeta$Tissue[studySamples] <- ""
combinedMeta$Tissue2[studySamples] <- ""
combinedMeta$Cellline[studySamples] <- NA
combinedMeta$CelllineName[studySamples] <- ""
combinedMeta$Cancer[studySamples] <- NA

# Unsure how to classify airway smooth muscle
studySamples <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP043162"
combinedMeta$Tissue[studySamples] <- ""
combinedMeta$Tissue2[studySamples] <- ""
combinedMeta$Cellline[studySamples] <- NA
combinedMeta$CelllineName[studySamples] <- ""
combinedMeta$Cancer[studySamples] <- NA

studySamples <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP052896"
combinedMeta$Cancer[studySamples] <- TRUE

# Some organoids and cancers
studySamples <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP058722"
combinedMeta$Tissue[studySamples] <- ""
combinedMeta$Tissue2[studySamples] <- ""
combinedMeta$Cellline[studySamples] <- NA
combinedMeta$CelllineName[studySamples] <- ""
combinedMeta$Cancer[studySamples] <- NA

# DPN and Tamoxifen treatments of parathyroid adenoma cells have a cancer CNV profile
studySamples <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP012167"
combinedMeta$Tissue[studySamples] <- ""
combinedMeta$Tissue2[studySamples] <- ""
combinedMeta$Cellline[studySamples] <- NA
combinedMeta$CelllineName[studySamples] <- ""
combinedMeta$Cancer[studySamples] <- NA

studySamples <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP019936"
combinedMeta$Cancer[studySamples] <- TRUE

# Single-sample title correction (row is addressed by its row name)
combinedMeta["SRR5341594", "sra.sample_title"] <- "Human differentiating macrophage"

studySamples <- !is.na(combinedMeta$study) & combinedMeta$study == "ERP011411"
combinedMeta$Tissue[studySamples] <- ""
combinedMeta$Tissue2[studySamples] <- ""
combinedMeta$Cellline[studySamples] <- NA
combinedMeta$CelllineName[studySamples] <- ""
combinedMeta$Cancer[studySamples] <- NA

studySamples <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP010166"
combinedMeta$Cancer[studySamples] <- TRUE

studySamples <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP012656"
combinedMeta$Cancer[studySamples] <- TRUE

# ERP006077: prostate tumours and their matched adjacent tissue
samples <- combinedMeta$study == "ERP006077" &
  grepl("Primary Prostate Tumour", combinedMeta$sra.sample_attributes)
combinedMeta$Tissue[samples] <- "Prostate"
combinedMeta$Tissue2[samples] <- ""
combinedMeta$Cellline[samples] <- FALSE
combinedMeta$Cancer[samples] <- TRUE

samples <- combinedMeta$study == "ERP006077" &
  grepl("Matched Adjacent", combinedMeta$sra.sample_attributes)
combinedMeta$Tissue[samples] <- "Prostate"
combinedMeta$Tissue2[samples] <- ""
combinedMeta$Cellline[samples] <- FALSE
combinedMeta$Cancer[samples] <- FALSE

# NOTE(review): this section used to apply to *every* ERP006077 sample, which
# overwrote the two Prostate sections directly above (they had no effect).
# It is now limited to the ERP006077 samples that neither of those sections
# matched, so the Prostate labels survive. "Pancreas" was presumably meant for
# a different study accession -- TODO confirm the intended study ID.
samples <- combinedMeta$study == "ERP006077" &
  !grepl("Primary Prostate Tumour|Matched Adjacent", combinedMeta$sra.sample_attributes)
combinedMeta$Tissue[samples] <- "Pancreas"
combinedMeta$Tissue2[samples] <- ""
combinedMeta$Cellline[samples] <- FALSE
combinedMeta$Cancer[samples] <- TRUE

samples <- combinedMeta$study == "SRP058587"
combinedMeta$Tissue[samples] <- "Breast"
combinedMeta$Tissue2[samples] <- ""
combinedMeta$Cellline[samples] <- FALSE
combinedMeta$Cancer[samples] <- TRUE

samples <- combinedMeta$study == "SRP062332"
combinedMeta$Tissue[samples] <- ""
combinedMeta$Tissue2[samples] <- ""
combinedMeta$Cellline[samples] <- TRUE
combinedMeta$Cancer[samples] <- NA

samples <- combinedMeta$study == "SRP030401"
combinedMeta$Tissue[samples] <- "Breast"
combinedMeta$Tissue2[samples] <- ""
combinedMeta$Cellline[samples] <- FALSE
combinedMeta$Cancer[samples] <- TRUE

samples <- combinedMeta$study == "SRP028344"
combinedMeta$Tissue[samples] <- ""
combinedMeta$Tissue2[samples] <- ""
combinedMeta$Cellline[samples] <- TRUE
combinedMeta$Cancer[samples] <- NA

# Only the samples whose experiment title mentions "Tumor"
samples <- combinedMeta$study == "SRP073061" &
  grepl("Tumor", combinedMeta$sra.experiment_title)
combinedMeta$Tissue[samples] <- "Breast"
combinedMeta$Tissue2[samples] <- ""
combinedMeta$Cellline[samples] <- FALSE
combinedMeta$Cancer[samples] <- TRUE

samples <- combinedMeta$study == "SRP028346"
combinedMeta$Tissue[samples] <- ""
combinedMeta$Tissue2[samples] <- ""
combinedMeta$Cellline[samples] <- TRUE
combinedMeta$Cancer[samples] <- NA

samples <- combinedMeta$study == "SRP058571"
combinedMeta$Tissue[samples] <- ""
combinedMeta$Tissue2[samples] <- ""
combinedMeta$Cellline[samples] <- TRUE
combinedMeta$Cancer[samples] <- NA

# Studies whose curated annotations are reset to missing
studySamples <- combinedMeta$study %in%
  c("SRP014027", "SRP006575", "SRP071932", "ERP004617", "SRP034592", "SRP049695")
combinedMeta$Tissue[studySamples] <- ""
combinedMeta$Tissue2[studySamples] <- ""
combinedMeta$Cellline[studySamples] <- NA
combinedMeta$CelllineName[studySamples] <- ""
combinedMeta$Cancer[studySamples] <- NA

samples <- combinedMeta$study == "SRP066596"
combinedMeta$Tissue[samples] <- ""
combinedMeta$Tissue2[samples] <- ""
combinedMeta$Cellline[samples] <- NA
combinedMeta$Cancer[samples] <- TRUE

samples <- combinedMeta$study == "SRP049648"
combinedMeta$Tissue[samples] <- ""
combinedMeta$Tissue2[samples] <- ""
combinedMeta$Cellline[samples] <- TRUE
combinedMeta$Cancer[samples] <- NA

studySamples <- combinedMeta$study == "SRP039694"
combinedMeta$Cancer[studySamples] <- TRUE

studySamples <- combinedMeta$study == "SRP066260"
combinedMeta$Cancer[studySamples] <- TRUE


# ---- Merge the mahmoudAnnotations table ----
# Align its column names with combinedMeta before copying values across.
colnames(mahmoudAnnotations)[colnames(mahmoudAnnotations) == "Cell_Line"] <- "Cellline"
colnames(mahmoudAnnotations)[colnames(mahmoudAnnotations) == "Cell_Line_Name"] <- "CelllineName"

# Interactive sanity checks: both are expected to print TRUE. Their result is
# not enforced; if a row name were missing from combinedMeta the assignment
# below would append a new row instead of updating an existing one.
all(colnames(mahmoudAnnotations) %in% colnames(combinedMeta))
all(rownames(mahmoudAnnotations) %in% rownames(combinedMeta))

# Overwrite the matching rows/columns of combinedMeta (matched by row name and
# column name) with the values from mahmoudAnnotations.
combinedMeta[rownames(mahmoudAnnotations), colnames(mahmoudAnnotations)] <- mahmoudAnnotations


# Every row flagged as a cell line gets Cancer = FALSE; rows whose Cellline is
# NA are left untouched.
combinedMeta$Cancer[combinedMeta$Cellline] <- FALSE

# ---- Harmonise Tissue labels ----
tmp <- !is.na(combinedMeta$Tissue) & combinedMeta$Tissue == "Cervix Uteri"
combinedMeta$Tissue[tmp] <- "Uterus"
combinedMeta$Tissue2[tmp] <- "Cervix"

tmp <- !is.na(combinedMeta$Tissue) & combinedMeta$Tissue == "Cervix"
combinedMeta$Tissue[tmp] <- "Uterus"
combinedMeta$Tissue2[tmp] <- "Cervix"

tmp <- !is.na(combinedMeta$Tissue) & combinedMeta$Tissue == "Lymph node"
combinedMeta$Tissue[tmp] <- "Lymph Nodes"

tmp <- !is.na(combinedMeta$Tissue) & combinedMeta$Tissue == "Bone marrow"
combinedMeta$Tissue[tmp] <- "Bone Marrow"

tmp <- !is.na(combinedMeta$Tissue) & combinedMeta$Tissue == "Colorectal"
combinedMeta$Tissue[tmp] <- "Colon"

tmp <- !is.na(combinedMeta$Tissue) & combinedMeta$Tissue == "Whole blood"
combinedMeta$Tissue2[tmp] <- "Whole Blood"
combinedMeta$Tissue[tmp] <- "Blood"


# ---- Tissue2 fixes by Mahmoud ----
# * annotations already present in Tissue are removed from Tissue2
# * duplicates are harmonised
# * rare annotations are set to NA
# * all parts of the basal ganglia (including substantia nigra) were annotated
#   as basal ganglia
# * "brain fragment" was set to NA
# * retina needs to have Eye as Tissue
# * the sample annotated as both brain & stomach was annotated as NA

# Adipose Tissue
# Tissue2 includes "Adipose - Subcutaneous" & "Adipose - Visceral (Omentum)".
# The column is addressed by name (it was positional index 2) so the statement
# keeps working if the column order of combinedMeta changes.
combinedMeta[!is.na(combinedMeta$Tissue2) & combinedMeta$Tissue2 == "Adipose - Subcutaneous", "Tissue2"] <- "Subcutaneous"
combinedMeta[!is.na(combinedMeta$Tissue2) & combinedMeta$Tissue2 == "Adipose - Visceral (Omentum)", "Tissue2"] <- "Visceral"
# Tissue / Tissue2 label harmonisation, part 1 (Adipose ... midbrain).
#
# .recodeColumn() applies the relabelling rules in `mapping` to one column of
# `meta` and returns the modified data frame.
#   meta    - data frame to update
#   column  - name of the column to relabel
#   mapping - named vector: name = current label, value = new label
#             (an NA value sets the annotation to missing)
# Rules are applied one after another in the order given; only non-missing
# cells that equal the current label exactly are rewritten.
# Columns are addressed by name instead of by position (previously 1 and 2),
# so the rules keep targeting Tissue / Tissue2 if the column order changes.
.recodeColumn <- function(meta, column, mapping) {
  for (from in names(mapping)) {
    rows <- !is.na(meta[[column]]) & meta[[column]] == from
    meta[rows, column] <- mapping[[from]]
  }
  meta
}

# Adipose Tissue
combinedMeta <- .recodeColumn(combinedMeta, "Tissue", c("Adipose Tissue" = "Adipose"))

combinedMeta <- .recodeColumn(combinedMeta, "Tissue2", c(
  # Adrenal gland
  "Adrenal Gland" = NA,

  # AML: keep as is

  # Arteries
  "Artery - Aorta" = "Aorta",
  "Artery - Coronary" = "Coronary",
  "Artery - Tibial" = "Tibial",

  # B-cells: keep as is

  # Basal ganglion
  "basal ganglion" = "Basal Ganglia",

  # Bladder
  "Bladder" = NA,

  # Brain (keep as GTEx)
  # TODO: check brain cortex vs cortex vs cerebral cortex
  "Brain - Amygdala" = "Amygdala",
  "Brain - Anterior cingulate cortex (BA24)" = "Anterior cingulate cortex",
  "Brain - Caudate (basal ganglia)" = "Caudate (basal ganglia)",
  "Brain - Cerebellar Hemisphere" = "Cerebellar Hemisphere",
  "Brain - Cerebellum" = "Cerebellum",
  "Brain - Cortex" = "Cortex",
  "Brain - Frontal Cortex (BA9)" = "Frontal Cortex",
  "Brain - Hippocampus" = "Hippocampus",
  "Brain - Hypothalamus" = "Hypothalamus",
  "Brain - Nucleus accumbens (basal ganglia)" = "Nucleus accumbens (basal ganglia)",
  "Brain - Putamen (basal ganglia)" = "Putamen (basal ganglia)",
  "Brain - Spinal cord (cervical c-1)" = "Spinal Cord (cervical c-1)",
  "Brain - Substantia nigra" = "Substantia nigra",
  "brain fragment" = NA,

  # Breast
  "Breast - Mammary Tissue" = "Mammary Tissue",

  # CD34+: keep as is

  # Cultured fibroblasts
  "Cells - Cultured fibroblasts" = "Cultured Fibroblasts",

  # Cerebellum
  "cerebellum" = "Cerebellum",

  # Cerebral cortex (TODO: recheck)
  "cerebral cortex" = "Cortex",

  # Cervix: keep as is

  # Choroid plexus
  "choroid plexus" = NA,

  # Colon
  "Colon - Sigmoid" = "Sigmoid",
  "Colon - Transverse" = "Transverse",

  # Diencephalon & diencephalon and midbrain
  "diencephalon" = "Diencephalon",
  "diencephalon and midbrain" = NA,

  # DLBCL: keep as is

  # Esophagus
  "Esophagus - Gastroesophageal Junction" = "Gastroesophageal Junction",
  "Esophagus - Mucosa" = "Mucosa",
  "Esophagus - Muscularis" = "Muscularis",

  # Fallopian tube
  "Fallopian Tube" = NA,

  # Forebrain
  "forebrain" = NA,
  "forebrain and midbrain" = NA,
  "forebrain fragment" = NA,

  # Heart
  "Heart - Atrial Appendage" = "Atrial Appendage",
  "Heart - Left Ventricle" = "Left Ventricle",

  # Hindbrain
  "hindbrain" = NA,
  "hindbrain fragment" = NA,
  "hindbrain without cerebellum" = NA,
  "hippocampus" = "Hippocampus",

  # Kidney
  "Kidney - Cortex" = "Cortex",
  "Kidney - Medulla" = "Medulla",

  # Liver
  "Liver" = NA,

  # Lung
  "Lung" = NA,

  # Medulla oblongata, midbrain
  "medulla oblongata" = "Medulla Oblongata",
  "midbrain" = "Midbrain"
))

# Minor salivary gland: keep as is

# Monocytes: keep as is

# Muscle - Skeletal
# Tissue2 label harmonisation, part 2 (Muscle ... Vagina), NA clean-up,
# cell-line name casing and the first per-study fixes.
# Columns are addressed by name instead of by position (previously 1-5), so
# the statements keep targeting the intended column if the order changes.

# .recodeColumn(): apply the relabelling rules in `mapping` (name = current
# label, value = new label, NA = set to missing) to `column` of `meta`, one
# rule after another in the order given, and return the modified data frame.
# Only non-missing cells that equal the current label exactly are rewritten.
.recodeColumn <- function(meta, column, mapping) {
  for (from in names(mapping)) {
    rows <- !is.na(meta[[column]]) & meta[[column]] == from
    meta[rows, column] <- mapping[[from]]
  }
  meta
}

# .annotateStudy(): for the rows of `meta` whose `study` equals `studyId`,
# assign each `column = value` pair given in `...` and return the modified
# data frame.
.annotateStudy <- function(meta, studyId, ...) {
  rows <- !is.na(meta$study) & meta$study == studyId
  values <- list(...)
  for (column in names(values)) {
    meta[rows, column] <- values[[column]]
  }
  meta
}

# Stomach: the sample annotated as both brain & stomach (Tissue2 == "stomach",
# lower case) gets Tissue set to NA as well. This has to happen BEFORE Tissue2
# is cleared below: the earlier ordering cleared Tissue2 first, so the Tissue
# reset never matched any row.
combinedMeta[!is.na(combinedMeta$Tissue2) & combinedMeta$Tissue2 == "stomach", "Tissue"] <- NA

combinedMeta <- .recodeColumn(combinedMeta, "Tissue2", c(
  # Muscle - Skeletal
  "Muscle - Skeletal" = "Skeletal",

  # Nerve
  "Nerve - Tibial" = "Tibial",

  # NK-cells: keep as is

  # Ovary
  "Ovary" = NA,

  # Pancreas
  "Pancreas" = NA,

  # PBMC: keep as is

  # Pituitary
  "Pituitary" = NA,

  # Pituitary and diencephalon & pons
  "pituitary and diencephalon" = NA,
  "pons" = NA,

  # Prostate
  "Prostate" = NA,

  # Skin
  "Skin - Not Sun Exposed (Suprapubic)" = "Suprapubic",
  "Skin - Sun Exposed (Lower leg)" = "Lower Leg",

  # Small intestine
  "Small Intestine - Terminal Ileum" = "Terminal Ileum",

  # Spinal cord
  "spinal cord" = "",

  # Spleen
  "Spleen" = NA,

  # Stomach
  "Stomach" = NA,
  "stomach" = NA,

  # T-cells: keep as is

  # Telencephalon: too general
  "telencephalon" = NA,

  # Temporal lobe: one of the four major lobes of the cerebral cortex
  "temporal lobe" = "Cortex",

  # Testis
  "Testis" = NA,

  # Thyroid
  "Thyroid" = NA,

  # Uterus
  "Uterus" = NA,

  # Vagina
  "Vagina" = NA

  # Whole blood: keep as is
))

# Move iPSC out of Tissue2 and into CelllineName. The name must be copied
# first, because the second statement removes the label the mask looks for.
combinedMeta[!is.na(combinedMeta$Tissue2) & combinedMeta$Tissue2 == "iPSC", "CelllineName"] <- "iPSC"
combinedMeta[!is.na(combinedMeta$Tissue2) & combinedMeta$Tissue2 == "iPSC", "Tissue2"] <- ""

# Remove the NA problem: missing tissue labels become empty strings
combinedMeta$Tissue2[is.na(combinedMeta$Tissue2)] <- ""
combinedMeta$Tissue[is.na(combinedMeta$Tissue)] <- ""


# Harmonising cell line names for samples in recount3
combinedMeta <- .recodeColumn(combinedMeta, "CelllineName", c(
  "a549" = "A549",
  # H-STS NET: keep as is
  "h1" = "H1",
  "h9" = "H9",
  "hap1" = "HAP1",
  "hct116" = "HCT116",
  "hek293" = "HEK293",
  "hela" = "HeLa",
  # HepaRG: keep as is
  "hepg2" = "HepG2",
  "ipsc" = "iPSC",
  "k562" = "K562",
  "lcl" = "LCL",
  "lcl_s4u_capturing" = "LCL_S4U_Capturing",
  "mcf10a" = "MCF10A",
  "mcf7" = "MCF7",
  "mda231" = "MDA231",
  "t47d" = "T47D"
))


# Fix SRP045234 annotations
combinedMeta <- .annotateStudy(combinedMeta, "SRP045234",
  Tissue = "", Tissue2 = "", Cellline = TRUE, CelllineName = "iPSC", Cancer = FALSE)

# Fix SRP007525 annotations
combinedMeta <- .annotateStudy(combinedMeta, "SRP007525",
  Tissue = "", Tissue2 = "", Cellline = TRUE, CelllineName = "OCI-LY1", Cancer = FALSE)

# Fix SRP027358 & SRP032926: flag as fetal
combinedMeta$Fetal[!is.na(combinedMeta$study) & combinedMeta$study == "SRP027358"] <- TRUE
combinedMeta$Fetal[!is.na(combinedMeta$study) & combinedMeta$study == "SRP032926"] <- TRUE

# Fix SRP026537 annotations (the remaining columns are set right after this)
combinedMeta <- .annotateStudy(combinedMeta, "SRP026537", Tissue = "")
# Per-study annotation fixes: SRP026537 (remaining columns), SRP049063,
# SRP053034, SRP056197 and SRP013565 (Tissue only; the other SRP013565 columns
# are set by the statements that follow this section).
# Columns are addressed by name instead of by position (previously 1-5),
# matching the name-based statements used for the later studies.

# .annotateStudy(): for the rows of `meta` whose `study` equals `studyId`,
# assign each `column = value` pair given in `...` and return the modified
# data frame.
.annotateStudy <- function(meta, studyId, ...) {
  rows <- !is.na(meta$study) & meta$study == studyId
  values <- list(...)
  for (column in names(values)) {
    meta[rows, column] <- values[[column]]
  }
  meta
}

# Fix SRP026537 annotations (Tissue is cleared by the preceding statement)
combinedMeta <- .annotateStudy(combinedMeta, "SRP026537",
  Tissue2 = "", Cellline = TRUE, CelllineName = "", Cancer = FALSE)

# Fix SRP049063 annotations
combinedMeta <- .annotateStudy(combinedMeta, "SRP049063",
  Tissue = "", Tissue2 = "", Cellline = TRUE, CelllineName = "HT-29", Cancer = FALSE)

# Fix SRP053034 annotations
combinedMeta <- .annotateStudy(combinedMeta, "SRP053034",
  Tissue = "", Tissue2 = "", Cellline = TRUE, CelllineName = "RPE-1", Cancer = FALSE)

# Fix SRP056197 annotations: AML samples, split by sample source
samples <- combinedMeta$study == "SRP056197" &
  grepl("Bone marrow", combinedMeta$sra.sample_attributes)
combinedMeta$Tissue[samples] <- "Bone Marrow"
combinedMeta$Tissue2[samples] <- "AML"
combinedMeta$Cellline[samples] <- FALSE
combinedMeta$Cancer[samples] <- TRUE

samples <- combinedMeta$study == "SRP056197" &
  grepl("Heparinised blood", combinedMeta$sra.sample_attributes)
combinedMeta$Tissue[samples] <- "Blood"
combinedMeta$Tissue2[samples] <- "AML"
combinedMeta$Cellline[samples] <- FALSE
combinedMeta$Cancer[samples] <- TRUE


# Fix SRP013565 annotations
combinedMeta <- .annotateStudy(combinedMeta, "SRP013565", Tissue = "")
# Per-study annotation fixes: SRP013565 (remaining columns) through SRP074425
# (Tissue, Tissue2 and Cellline; its last two columns are set by the
# statements that follow this section).

# .annotateStudy(): select the rows of `meta` whose `study` equals `studyId`
# and, when `pattern` is given, whose `sra.sample_attributes` also matches it
# (case-insensitive regular expression). Each `column = value` pair in `...`
# is then assigned to those rows, in the order given, and the modified data
# frame is returned.
.annotateStudy <- function(meta, studyId, ..., pattern = NULL) {
  rows <- !is.na(meta$study) & (meta$study == studyId)
  if (!is.null(pattern)) {
    rows <- rows & grepl(pattern, meta$sra.sample_attributes, ignore.case = TRUE)
  }
  values <- list(...)
  for (column in names(values)) {
    meta[rows, column] <- values[[column]]
  }
  meta
}

# SRP013565 (its Tissue is cleared by the preceding statement)
combinedMeta <- .annotateStudy(combinedMeta, "SRP013565",
  Tissue2 = "", Cellline = TRUE, CelllineName = "", Cancer = FALSE)

# ERP008682
combinedMeta <- .annotateStudy(combinedMeta, "ERP008682",
  Tissue = "", Tissue2 = "", Cellline = TRUE, CelllineName = "H9", Cancer = FALSE)

# SRP033646
combinedMeta <- .annotateStudy(combinedMeta, "SRP033646",
  Tissue = "", Tissue2 = "", Cellline = TRUE, CelllineName = "Caco2", Cancer = FALSE)

# SRP027383
combinedMeta <- .annotateStudy(combinedMeta, "SRP027383",
  Tissue = "Brain", Tissue2 = "", Cellline = FALSE, CelllineName = "", Cancer = TRUE)

# SRP050003: Cancer depends on the sample attributes; the "carcinoma" rule
# runs last, so it wins for samples that match both patterns.
combinedMeta <- .annotateStudy(combinedMeta, "SRP050003",
  Tissue = "Liver", Tissue2 = "", Cellline = FALSE, CelllineName = "")
combinedMeta <- .annotateStudy(combinedMeta, "SRP050003", Cancer = FALSE, pattern = "non-tumoral")
combinedMeta <- .annotateStudy(combinedMeta, "SRP050003", Cancer = TRUE, pattern = "carcinoma")

# SRP073253
combinedMeta <- .annotateStudy(combinedMeta, "SRP073253",
  Tissue = "Kidney", Tissue2 = "", Cellline = FALSE, CelllineName = "", Cancer = TRUE)

# SRP069235
combinedMeta <- .annotateStudy(combinedMeta, "SRP069235",
  Tissue = "Brain", Tissue2 = "", Cellline = FALSE, CelllineName = "", Cancer = TRUE)

# SRP074425 (CelllineName and Cancer follow right after this section)
combinedMeta <- .annotateStudy(combinedMeta, "SRP074425",
  Tissue = "Brain", Tissue2 = "", Cellline = FALSE)
# Per-study annotation fixes: SRP074425 (remaining columns) through SRP028336.

# .annotateStudy(): select the rows of `meta` whose `study` equals `studyId`.
# When `pattern` is given, the selection is narrowed to rows whose
# `sra.sample_attributes` matches it (case-insensitive regular expression), or
# to rows that do NOT match it when `negate = TRUE`. Each `column = value`
# pair in `...` is then assigned to the selected rows, in the order given, and
# the modified data frame is returned.
.annotateStudy <- function(meta, studyId, ..., pattern = NULL, negate = FALSE) {
  rows <- !is.na(meta$study) & meta$study == studyId
  if (!is.null(pattern)) {
    matched <- grepl(pattern, meta$sra.sample_attributes, ignore.case = TRUE)
    rows <- rows & (if (negate) !matched else matched)
  }
  values <- list(...)
  for (column in names(values)) {
    meta[rows, column] <- values[[column]]
  }
  meta
}

# SRP074425 (Tissue, Tissue2 and Cellline are set by the preceding statements)
combinedMeta <- .annotateStudy(combinedMeta, "SRP074425", CelllineName = "", Cancer = TRUE)

# Fix SRP044668 annotations
combinedMeta <- .annotateStudy(combinedMeta, "SRP044668",
  Tissue = "Brain", Tissue2 = "", Cellline = FALSE, CelllineName = "")
combinedMeta <- .annotateStudy(combinedMeta, "SRP044668", Cancer = FALSE, pattern = "non-neoplastic")
combinedMeta <- .annotateStudy(combinedMeta, "SRP044668", Cancer = TRUE, pattern = "glioma")

# Fix SRP009123 annotations
combinedMeta <- .annotateStudy(combinedMeta, "SRP009123",
  Tissue = "Liver", Tissue2 = "", Cellline = FALSE, CelllineName = "")
combinedMeta <- .annotateStudy(combinedMeta, "SRP009123", Cancer = FALSE, pattern = "non-tumor")
combinedMeta <- .annotateStudy(combinedMeta, "SRP009123", Cancer = TRUE, pattern = "carcinoma")

# Fix SRP041094 annotations
combinedMeta <- .annotateStudy(combinedMeta, "SRP041094",
  Tissue = "Prostate", Tissue2 = "", Cellline = FALSE, CelllineName = "", Cancer = TRUE)

# Fix SRP040998 annotations
combinedMeta <- .annotateStudy(combinedMeta, "SRP040998",
  Tissue = "Liver", Tissue2 = "", Cellline = FALSE, CelllineName = "", Cancer = NA)

# Fix SRP052056 annotations
combinedMeta <- .annotateStudy(combinedMeta, "SRP052056",
  Tissue = "Thyroid", Tissue2 = "", Cellline = FALSE, CelllineName = "")
combinedMeta <- .annotateStudy(combinedMeta, "SRP052056", Cancer = FALSE, pattern = "healthy")
combinedMeta <- .annotateStudy(combinedMeta, "SRP052056", Cancer = TRUE, pattern = "carcinoma")

# Fix SRP029880 annotations
combinedMeta <- .annotateStudy(combinedMeta, "SRP029880",
  Tissue = "Colon", Tissue2 = "", Cellline = FALSE, CelllineName = "", Cancer = TRUE)

# Fix SRP056696 annotations
combinedMeta <- .annotateStudy(combinedMeta, "SRP056696",
  Tissue = "Liver", Tissue2 = "", Cellline = FALSE, CelllineName = "")
combinedMeta <- .annotateStudy(combinedMeta, "SRP056696", Cancer = FALSE, pattern = "Normal")
combinedMeta <- .annotateStudy(combinedMeta, "SRP056696", Cancer = TRUE, pattern = "Tumor")

# Fix SRP066794 annotations
combinedMeta <- .annotateStudy(combinedMeta, "SRP066794",
  Tissue = "Lung", Tissue2 = "", Cellline = FALSE, CelllineName = "", Cancer = TRUE)

# Fix SRP149374 annotations
combinedMeta <- .annotateStudy(combinedMeta, "SRP149374",
  Tissue = "Bone Marrow", Tissue2 = "CD34+", Cellline = FALSE, CelllineName = "")
combinedMeta <- .annotateStudy(combinedMeta, "SRP149374", Cancer = FALSE, pattern = "Healthy")
combinedMeta <- .annotateStudy(combinedMeta, "SRP149374", Cancer = TRUE, pattern = "Myelodysplastic")

# Fix SRP019250 annotations: cell line name is taken from the sample attributes
combinedMeta <- .annotateStudy(combinedMeta, "SRP019250",
  Tissue = "", Tissue2 = "", Cellline = TRUE)
combinedMeta <- .annotateStudy(combinedMeta, "SRP019250", CelllineName = "HEK", pattern = "HEK")
combinedMeta <- .annotateStudy(combinedMeta, "SRP019250", CelllineName = "LCL", pattern = "LCL")
combinedMeta <- .annotateStudy(combinedMeta, "SRP019250", Cancer = FALSE)

# Fix SRP074349 annotations: samples mentioning NSCLC are cancer, the rest are not
combinedMeta <- .annotateStudy(combinedMeta, "SRP074349",
  Tissue = "Lung", Tissue2 = "", Cellline = FALSE, CelllineName = "")
combinedMeta <- .annotateStudy(combinedMeta, "SRP074349", Cancer = FALSE,
  pattern = "NSCLC", negate = TRUE)
combinedMeta <- .annotateStudy(combinedMeta, "SRP074349", Cancer = TRUE, pattern = "NSCLC")


# Fix SRP009067 annotations
combinedMeta <- .annotateStudy(combinedMeta, "SRP009067",
  Tissue = "", Tissue2 = "", Cellline = TRUE, CelllineName = "LCL", Cancer = FALSE)

# Fix SRP007885 annotations
combinedMeta <- .annotateStudy(combinedMeta, "SRP007885",
  Tissue = "", Tissue2 = "", Cellline = TRUE, CelllineName = "LCL", Cancer = FALSE)

# Fix SRP018218 annotations: samples mentioning "cell line" are cell lines,
# everything else is annotated as pancreatic stellate cells
combinedMeta <- .annotateStudy(combinedMeta, "SRP018218",
  Tissue = "Pancreas", Tissue2 = "Stellate Cells", Cellline = FALSE,
  pattern = "cell line", negate = TRUE)
combinedMeta <- .annotateStudy(combinedMeta, "SRP018218",
  Tissue = "", Tissue2 = "", Cellline = TRUE,
  pattern = "cell line")
combinedMeta <- .annotateStudy(combinedMeta, "SRP018218", CelllineName = "", Cancer = TRUE)

# Fix SRP019275 annotations
combinedMeta <- .annotateStudy(combinedMeta, "SRP019275",
  Tissue = "", Tissue2 = "", Cellline = TRUE, CelllineName = "", Cancer = FALSE)

# Fix SRP042186 annotations
combinedMeta <- .annotateStudy(combinedMeta, "SRP042186",
  Tissue = "", Tissue2 = "", Cellline = TRUE, CelllineName = "", Cancer = FALSE)

# Fix SRP042620 annotations
combinedMeta <- .annotateStudy(combinedMeta, "SRP042620", Tissue = "Breast",
  pattern = "cell line", negate = TRUE)
combinedMeta <- .annotateStudy(combinedMeta, "SRP042620", Tissue = "", pattern = "cell line")
combinedMeta <- .annotateStudy(combinedMeta, "SRP042620", Tissue2 = "")
combinedMeta <- .annotateStudy(combinedMeta, "SRP042620", Cellline = TRUE, pattern = "cell line")
combinedMeta <- .annotateStudy(combinedMeta, "SRP042620", Cellline = FALSE,
  pattern = "cell line", negate = TRUE)
combinedMeta <- .annotateStudy(combinedMeta, "SRP042620", CelllineName = "")
# Cancer status, applied in this order so the "Uninvolved" / "No Known" rules
# override the tumour rules for samples that match both.
# The "+" must be escaped: as a regular expression, "ER+" means "E followed by
# one or more R", which (case-insensitively) matches any text containing "er"
# and used to mark unrelated samples of this study as cancer.
combinedMeta <- .annotateStudy(combinedMeta, "SRP042620", Cancer = TRUE, pattern = "ER\\+")
combinedMeta <- .annotateStudy(combinedMeta, "SRP042620", Cancer = TRUE, pattern = "Triple Negative")
combinedMeta <- .annotateStudy(combinedMeta, "SRP042620", Cancer = FALSE, pattern = "Uninvolved")
combinedMeta <- .annotateStudy(combinedMeta, "SRP042620", Cancer = FALSE, pattern = "No Known")


# Fix ERP010142 annotations
combinedMeta <- .annotateStudy(combinedMeta, "ERP010142",
  Tissue = "Breast", Tissue2 = "", Cellline = FALSE, CelllineName = "", Cancer = TRUE)

# Fix SRP026600 annotations
combinedMeta <- .annotateStudy(combinedMeta, "SRP026600",
  Tissue = "", Tissue2 = "", Cellline = NA, CelllineName = "", Cancer = NA)

# Fix SRP028336 annotations: tissue is taken from the sample attributes
combinedMeta <- .annotateStudy(combinedMeta, "SRP028336", Tissue = "Muscle", pattern = "muscle")
combinedMeta <- .annotateStudy(combinedMeta, "SRP028336", Tissue = "Kidney", pattern = "kidney")
combinedMeta <- .annotateStudy(combinedMeta, "SRP028336", Tissue = "Brain", pattern = "prefrontal cortex")
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP028336") & (grepl("cerebellar Cortex", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Brain" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP028336") & (grepl("primary visual cortex", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Brain" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP028336") & (grepl("muscle", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP028336") & (grepl("kidney", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP028336") & (grepl("prefrontal cortex", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "Prefrontal Cortex" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP028336") & (grepl("cerebellar Cortex", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "Cerebellar Cortex" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP028336") & (grepl("primary visual cortex", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "Primary Visual Cortex" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP028336"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP028336"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP028336"),"Cancer"]= FALSE +#Fix SRP009029 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP009029"),"Tissue"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP009029"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP009029"),"Cellline"]= NA +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP009029"),"CelllineName"]= "" 
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP009029"),"Cancer"]= NA +#Fix SRP006912 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP006912"),"Tissue"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP006912"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP006912"),"Cellline"]= TRUE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP006912"),"CelllineName"]= "HK-2" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP006912"),"Cancer"]= FALSE +#Fix SRP055444 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP055444"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP055444"),"Tissue2"]= "CLL" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP055444"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP055444"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP055444"),"Cancer"]= TRUE +#Fix SRP022942 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP022942"),"Tissue"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP022942"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP022942"),"Cellline"]= TRUE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP022942"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP022942"),"Cancer"]= FALSE +#Fix ERP012180 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP012180"),"Tissue"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP012180"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP012180"),"Cellline"]= NA +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== 
"ERP012180"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP012180"),"Cancer"]= NA +#Fix SRP058717 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP058717"),"Tissue"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP058717"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP058717"),"Cellline"]= TRUE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP058717"),"CelllineName"]= "HT-29" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP058717"),"Cancer"]= FALSE +#Fix SRP012568 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP012568"),"Tissue"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP012568"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP012568"),"Cellline"]= NA +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP012568"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP012568"),"Cancer"]= NA +#Fix SRP065146 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP065146"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP065146"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP065146"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP065146"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP065146"),"Cancer"]= FALSE +#Fix ERP012188 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP012188"),"Tissue"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP012188"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP012188"),"Cellline"]= NA +combinedMeta[!is.na(combinedMeta$study) & 
(combinedMeta$study== "ERP012188"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP012188"),"Cancer"]= NA +#Fix SRP055390 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP055390"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP055390"),"Tissue2"]= "CLL" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP055390"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP055390"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP055390") & (grepl("normal", combinedMeta$sra.sample_attributes, ignore.case=T))==F,"Cancer"]= TRUE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP055390") & (grepl("normal", combinedMeta$sra.sample_attributes, ignore.case=T)),"Cancer"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP055390") & (grepl("normal", combinedMeta$sra.sample_attributes, ignore.case=T))==F,"Tissue2"]="CLL" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP055390") & (grepl("normal", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "B-cells" + + +#Fix SRP036145 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP036145"),"Tissue"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP036145"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP036145"),"Cellline"]= NA +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP036145"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP036145"),"Cancer"]= NA +#Fix SRP050533 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP050533"),"Tissue"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP050533"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & 
(combinedMeta$study== "SRP050533"),"Cellline"]= NA +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP050533"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP050533"),"Cancer"]= NA + + + + + + +#Fix ERP016243 +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP150552"),"Fetal"]= TRUE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP150552"),"Tissue2"]= "" + +table(paste0(combinedMeta$Tissue, " - ", combinedMeta$Tissue2, " - ", combinedMeta$Fetal)[combinedMeta$study == "ERP109002"]) + +#Fix SRP078234 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP078234") & (grepl("hindbrain", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Brain" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP078234") & (grepl("hindbrain", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "Hindbrain" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP078234") & (grepl("spinal cord", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Brain" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP078234") & (grepl("spinal cord", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "Spinal Cord (cervical c-1)" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP078234"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP078234"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP078234"),"Cancer"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP078234"),"Fetal"]= TRUE + + +#Fix SRP041044 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP041044"),"Tissue"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP041044"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== 
"SRP041044"),"Cellline"]= NA +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP041044"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP041044"),"Cancer"]= NA +#Fix SRP050260 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP050260"),"Tissue"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP050260"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP050260"),"Cellline"]= NA +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP050260"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP050260"),"Cancer"]= NA +#Fix SRP076099 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP076099"),"Tissue"]= "Brain" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP076099"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP076099"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP076099"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP076099"),"Cancer"]= NA +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP076099"),"Fetal"]= TRUE +# Fix ERP115010 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP115010"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP115010"),"Tissue2"]= "Whole Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP115010"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP115010"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP115010"),"Cancer"]= FALSE +# Fix SRP221482 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP221482"),"Tissue"]= "Blood" 
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP221482"),"Tissue2"]= "Whole Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP221482"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP221482"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP221482"),"Cancer"]= FALSE +# Fix SRP059039 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP059039"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP059039"),"Tissue2"]= "Whole Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP059039"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP059039"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP059039"),"Cancer"]= FALSE +# Fix SRP059172 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP059172"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP059172"),"Tissue2"]= "Whole Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP059172"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP059172"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP059172"),"Cancer"]= FALSE +# Fix SRP062966 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP062966"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP062966"),"Tissue2"]= "Whole Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP062966"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP062966"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP062966"),"Cancer"]= FALSE +# Fix SRP081605 Annotations 
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP081605"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP081605"),"Tissue2"]= "Whole Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP081605"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP081605"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP081605"),"Cancer"]= FALSE +# Fix SRP103772 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP103772"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP103772"),"Tissue2"]= "Whole Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP103772"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP103772"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP103772"),"Cancer"]= FALSE +# Fix SRP132939 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP132939"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP132939"),"Tissue2"]= "Whole Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP132939"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP132939"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP132939"),"Cancer"]= FALSE +# Fix SRP136938 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP136938"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP136938"),"Tissue2"]= "Whole Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP136938"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP136938"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & 
(combinedMeta$study== "SRP136938"),"Cancer"]= FALSE +# Fix SRP150552 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP150552"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP150552"),"Tissue2"]= "Whole Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP150552"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP150552"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP150552"),"Cancer"]= FALSE +# Fix SRP174223 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP174223"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP174223"),"Tissue2"]= "Whole Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP174223"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP174223"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP174223"),"Cancer"]= FALSE +# Fix SRP174638 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP174638"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP174638"),"Tissue2"]= "Whole Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP174638"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP174638"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP174638"),"Cancer"]= FALSE +# Fix SRP150552 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP150552"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP150552"),"Tissue2"]= "Whole Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP150552"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & 
(combinedMeta$study== "SRP150552"),"CelllineName"]= ""
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP150552"),"Cancer"]= FALSE
+
+
+
+
+#SRP033266: AML samples, tissue taken from the sample source (bone marrow or heparinised blood)
+samples <- combinedMeta$study == "SRP033266" & grepl("Bone marrow", combinedMeta$sra.sample_attributes)
+combinedMeta$Tissue[samples] <- "Bone Marrow"
+combinedMeta$Tissue2[samples] <- "AML"
+combinedMeta$Cellline[samples] <- FALSE
+combinedMeta$Cancer[samples] <- TRUE
+samples <- combinedMeta$study == "SRP033266" & grepl("Heparinised blood", combinedMeta$sra.sample_attributes)
+combinedMeta$Tissue[samples] <- "Blood"
+combinedMeta$Tissue2[samples] <- "AML"
+combinedMeta$Cellline[samples] <- FALSE
+combinedMeta$Cancer[samples] <- TRUE
+
+
+#SRP048759: same AML annotation scheme as SRP033266
+samples <- combinedMeta$study == "SRP048759" & grepl("Bone marrow", combinedMeta$sra.sample_attributes)
+combinedMeta$Tissue[samples] <- "Bone Marrow"
+combinedMeta$Tissue2[samples] <- "AML"
+combinedMeta$Cellline[samples] <- FALSE
+combinedMeta$Cancer[samples] <- TRUE
+samples <- combinedMeta$study == "SRP048759" & grepl("Heparinised blood", combinedMeta$sra.sample_attributes)
+combinedMeta$Tissue[samples] <- "Blood"
+combinedMeta$Tissue2[samples] <- "AML"
+combinedMeta$Cellline[samples] <- FALSE
+combinedMeta$Cancer[samples] <- TRUE
+
+#Fix SRP045500 Annotations
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP045500"),"Tissue"]= "Blood"
+#pattern was misspelled as "wholw blood"; "whol[ew] blood" matches the correct spelling and still matches the old literal
+#the cell-type specific patterns below run afterwards and overwrite Tissue2 where they match
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP045500") & (grepl("whol[ew] blood", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "Whole Blood"
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP045500") & (grepl("CD4", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "T-cells"
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP045500") & (grepl("CD8", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "T-cells"
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP045500") & (grepl("Neutrophils",
combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "Neutrophils" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP045500") & (grepl("Monocytes", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "Monocytes" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP045500") & (grepl("B-cells", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "B-cells" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP045500") & (grepl("NK", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "NK-cells" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP045500"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP045500"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP045500"),"Cancer"]= FALSE + +#Fix SRP076719 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP076719") & (grepl("pbmc", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP076719") & (grepl("ln", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Blood"#in other cases we also put all t-cell to blood regardless of source +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP076719"),"Tissue2"]= "T-cells" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP076719"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP076719"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP076719"),"Cancer"]= FALSE + +#Fix SRP051688 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP051688"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP051688") & (grepl("T cells", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= 
"T-cells" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP051688") & (grepl("B cells", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "B-cells" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP051688") & (grepl("monocytes", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "Monocytes" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP051688") & (grepl("NK cells", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "NK-cells" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP051688") & (grepl("PBMC", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "PBMC" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP051688") & (grepl("myeloid DC", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "Dendritic cells" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP051688") & (grepl("neutrophils", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "Neutrophils" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP051688"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP051688"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP051688"),"Cancer"]= FALSE + +#Fix SRP078912 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP078912"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP078912") & (grepl("T cells", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "T-cells" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP078912") & (grepl("Monocyte", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "Monocytes" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP078912"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== 
"SRP078912"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP078912"),"Cancer"]= FALSE + +#Fix SRP110609 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP110609"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP110609") & (grepl("lymphocytes", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP110609") & (grepl("Monocyte", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "Monocytes" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP110609"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP110609"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP110609"),"Cancer"]= FALSE + +#Fix SRP158943 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP158943"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP158943") & (grepl("cll", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "CLL" #It doesn't state further clasification +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP158943") & (grepl("B cells", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "B-cells" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP158943"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP158943"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP158943") & (grepl("cll", combinedMeta$sra.sample_attributes, ignore.case=T)),"Cancer"]= TRUE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP158943") & (grepl("B cells", combinedMeta$sra.sample_attributes, ignore.case=T)),"Cancer"]= FALSE + +#Fix ERP104864 Annotations +combinedMeta[!is.na(combinedMeta$study) & 
(combinedMeta$study== "ERP104864") & (grepl("blood", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP104864") & (grepl("synovium", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Synovium" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP104864") & (grepl("blood", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "Whole Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP104864") & (grepl("synovium", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP104864"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP104864"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP104864"),"Cancer"]= FALSE + +#Fix intestine samples +#ERP000546 +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP000546") & (combinedMeta$Tissue=="Intestine"),"Tissue"]= "Colon" +#ERP003613 +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP003613") & (combinedMeta$Tissue=="Intestine") & (grepl("colon", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Colon" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP003613") & (combinedMeta$Tissue=="Intestine") & (grepl("smallintestine", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Small Intestine" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP003613") & (combinedMeta$Tissue=="Intestine") & (grepl("duodenum", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Small Intestine" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP003613") & (combinedMeta$Tissue=="Intestine") & (grepl("duodenum", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "Duodenum" +#ERP006650 
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP006650") & (combinedMeta$Tissue=="Intestine"),"Tissue"]= "Colon" +#SRP039090 +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP039090") & (combinedMeta$Tissue=="Intestine") & (grepl("Small Intestine", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Small Intestine" +#SRP043391 +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP043391") & (combinedMeta$Tissue=="Intestine") & (grepl("colon", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Colon" +#SRP048801 +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP048801") & (combinedMeta$Tissue=="Intestine") & (grepl("ileum", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Small Intestine" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP048801") & (combinedMeta$Tissue=="Intestine") & (grepl("ileum", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "Ileum" +#SRP055438 +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP055438") & (combinedMeta$Tissue=="Intestine"),"Tissue"]= "Colon" +#SRP056520 +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP056520") & (combinedMeta$Tissue=="Intestine"),"Tissue"]= "Colon" +#SRP006900 +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP006900") & (combinedMeta$Tissue=="Intestine"),"Tissue"]= "Colon" +#SRP063496 +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP063496") & (combinedMeta$Tissue=="Intestine"),"Tissue"]= "Colon" +#SRP000941 +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP000941") & (combinedMeta$Tissue=="Intestine") & (grepl("colon", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Colon" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP000941") & (combinedMeta$Tissue=="Intestine") & (grepl("small intestine", combinedMeta$sra.sample_attributes, 
ignore.case=T)),"Tissue"]= "Small Intestine" +#SRP021221 +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP021221") & (combinedMeta$Tissue=="Intestine"),"Tissue"]= "Colon" +#SRP009386 +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP009386") & (combinedMeta$Tissue=="Intestine"),"Tissue"]= "Colon" +#exclude SRP048804 (Cell line) +combinedMeta=combinedMeta[!combinedMeta$study=="SRP048804",] +#exclude the remaining sample of Intestine +combinedMeta=combinedMeta[!combinedMeta$Tissue=="Intestine",] + +#Fix ERP109002 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Heart", combinedMeta$sra.library_name, ignore.case=T)),"Tissue"]= "Heart" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Heart", combinedMeta$sra.library_name, ignore.case=T)),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Kidney", combinedMeta$sra.library_name, ignore.case=T)),"Tissue"]= "Kidney" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Kidney", combinedMeta$sra.library_name, ignore.case=T)),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Testis", combinedMeta$sra.library_name, ignore.case=T)),"Tissue"]= "Testis" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Testis", combinedMeta$sra.library_name, ignore.case=T)),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Liver", combinedMeta$sra.library_name, ignore.case=T)),"Tissue"]= "Liver" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Liver", combinedMeta$sra.library_name, ignore.case=T)),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Brain", combinedMeta$sra.library_name, 
ignore.case=T)),"Tissue"]= "Brain" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Brain", combinedMeta$sra.library_name, ignore.case=T)),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Forebrain", combinedMeta$sra.experiment_attributes, ignore.case=T)),"Tissue"]= "Brain" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Forebrain", combinedMeta$sra.experiment_attributes, ignore.case=T)),"Tissue2"]= "Forebrain" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Hindbrain", combinedMeta$sra.experiment_attributes, ignore.case=T)),"Tissue"]= "Brain" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Hindbrain", combinedMeta$sra.experiment_attributes, ignore.case=T)),"Tissue2"]= "Hindbrain" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Ovary", combinedMeta$sra.library_name, ignore.case=T)),"Tissue"]= "Ovary" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Ovary", combinedMeta$sra.library_name, ignore.case=T)),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002"),"Cancer"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002"),"Fetal"] <- FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("embryo", combinedMeta$sra.sample_attributes, ignore.case=T)),"Fetal"]= TRUE + + +studySamples <- combinedMeta$study %in% c("SRP105369", "ERP006121", "SRP062144") +combinedMeta$Tissue[studySamples] <- "" +combinedMeta$Tissue2[studySamples] <- "AML" +combinedMeta$Cellline[studySamples] <- FALSE 
+combinedMeta$CelllineName[studySamples] <- "" +combinedMeta$Cancer[studySamples] <- TRUE + + + +studySamples <- combinedMeta$study %in% c("SRP221351", "SRP110313", "SRP115151", "SRP133278", "SRP156583", "SRP201603") +combinedMeta$Tissue[studySamples] <- "Blood" +combinedMeta$Tissue2[studySamples] <- "B-cells" +combinedMeta$Cellline[studySamples] <- FALSE +combinedMeta$CelllineName[studySamples] <- "" +combinedMeta$Cancer[studySamples] <- FALSE + + +studySamples <- combinedMeta$study %in% c("ERP107715", "ERP111116", "SRP092158", "SRP133442", "SRP065795", "SRP119636") +combinedMeta$Tissue[studySamples] <- "" +combinedMeta$Tissue2[studySamples] <- "" +combinedMeta$Cellline[studySamples] <- NA +combinedMeta$CelllineName[studySamples] <- "" +combinedMeta$Cancer[studySamples] <- TRUE + +studySamples <- combinedMeta$study %in% c("ERP109703", "SRP100686", "SRP161505") +combinedMeta$Tissue[studySamples] <- "Blood" +combinedMeta$Tissue2[studySamples] <- "CLL" +combinedMeta$Cellline[studySamples] <- NA +combinedMeta$CelllineName[studySamples] <- "" +combinedMeta$Cancer[studySamples] <- TRUE + + + +studySamples <- combinedMeta$study == "SRP123604" +combinedMeta$Tissue[studySamples] <- "Colon" +combinedMeta$Tissue2[studySamples] <- "" +combinedMeta$Cellline[studySamples] <- FALSE +combinedMeta$CelllineName[studySamples] <- "" +combinedMeta$Cancer[studySamples] <- TRUE + + + +studySamples <- combinedMeta$study %in% c("ERP113862", "ERP002323", "ERP114921", "SRP051368", "SRP097893", "SRP101856", "SRP151577") +combinedMeta$Tissue[studySamples] <- "Blood" +combinedMeta$Tissue2[studySamples] <- "Dendritic cells" +combinedMeta$Cellline[studySamples] <- FALSE +combinedMeta$CelllineName[studySamples] <- "" +combinedMeta$Cancer[studySamples] <- FALSE + + + +studySamples <- combinedMeta$study %in% c("SRP056733", "SRP062278", "SRP064515", "SRP074274", "SRP076097", "SRP095287", "SRP103821", "SRP109107", "SRP110187", "SRP118741", "SRP118760", "SRP145599", "SRP190161", "SRP218274", 
"SRP155941") +combinedMeta$Tissue[studySamples] <- "Blood" +combinedMeta$Tissue2[studySamples] <- "Macrophages" +combinedMeta$Cellline[studySamples] <- FALSE +combinedMeta$CelllineName[studySamples] <- "" +combinedMeta$Cancer[studySamples] <- FALSE + + +studySamples <- combinedMeta$study %in% c("ERP020977", "ERP022909") +combinedMeta$Tissue[studySamples] <- "Blood" +combinedMeta$Tissue2[studySamples] <- "Macrophages-iPSC" +combinedMeta$Cellline[studySamples] <- FALSE +combinedMeta$CelllineName[studySamples] <- "" +combinedMeta$Cancer[studySamples] <- FALSE + + + +studySamples <- combinedMeta$study %in% c("ERP014531", "SRP041826", "SRP058953", "SRP096201", "SRP113586", "SRP192825", "SRP173842", "SRP045352", "SRP055514", "SRP069333", "SRP101726") +combinedMeta$Tissue[studySamples] <- "Blood" +combinedMeta$Tissue2[studySamples] <- "Monocytes" +combinedMeta$Cellline[studySamples] <- FALSE +combinedMeta$CelllineName[studySamples] <- "" +combinedMeta$Cancer[studySamples] <- FALSE + + +studySamples <- combinedMeta$study %in% c("SRP150456") +combinedMeta$Tissue[studySamples] <- "Nasal Lavage" +combinedMeta$Tissue2[studySamples] <- "" +combinedMeta$Cellline[studySamples] <- FALSE +combinedMeta$CelllineName[studySamples] <- "" +combinedMeta$Cancer[studySamples] <- FALSE + + + +studySamples <- combinedMeta$study %in% c("SRP102104", "SRP162654", "SRP042596", "SRP049605", "SRP074736", "SRP090282", "SRP125882", "SRP140711", "SRP162023", "SRP168421", "SRP201023", "SRP212077", "SRP140558") +combinedMeta$Tissue[studySamples] <- "Blood" +combinedMeta$Tissue2[studySamples] <- "PBMC" +combinedMeta$Cellline[studySamples] <- FALSE +combinedMeta$CelllineName[studySamples] <- "" +combinedMeta$Cancer[studySamples] <- FALSE + + +studySamples <- combinedMeta$study %in% c("SRP072980", "SRP169062", "SRP086613", "SRP092010", "ERP105662", "SRP093990", "SRP215282", "SRP032926", "SRP053186", "SRP059057", "SRP098715", "SRP101784", "SRP117629", "SRP140710", "SRP155217", "SRP158900", "SRP192607") 
+combinedMeta$Tissue[studySamples] <- "Blood" +combinedMeta$Tissue2[studySamples] <- "T-cells" +combinedMeta$Cellline[studySamples] <- FALSE +combinedMeta$CelllineName[studySamples] <- "" +combinedMeta$Cancer[studySamples] <- FALSE + +#SRP081020 is mis-anotated in SRA as PBMC, paper and clustering both state wholeblood +studySamples <- combinedMeta$study %in% c("ERP114104", "SRP051848", "SRP056784", "SRP071965", "SRP077975", "SRP081020", "SRP098758", "SRP113245", "SRP126580", "SRP126582", "SRP126583", "SRP136057", "SRP144583", "SRP150872", "SRP214077", "SRP056443") +combinedMeta$Tissue[studySamples] <- "" +combinedMeta$Tissue2[studySamples] <- "" +combinedMeta$Cellline[studySamples] <- NA +combinedMeta$CelllineName[studySamples] <- "" +combinedMeta$Cancer[studySamples] <- NA + + + + + + +combinedMeta$Cellline[!is.na(combinedMeta$CelllineName)&combinedMeta$CelllineName=="iPSC"] <- TRUE +combinedMeta$Cancer[!is.na(combinedMeta$Cellline) & combinedMeta$Cellline] <- NA + + +table(paste0(combinedMeta$Tissue, " - ", combinedMeta$Tissue2, " - ", combinedMeta$Fetal)[combinedMeta$study == "ERP109002"]) + +#Exclude spike in +combinedMeta$exclude[combinedMeta$study == "SRP041955"] <- TRUE + +(x <- table(paste0(combinedMeta$Tissue, " - ", combinedMeta$Tissue2),combinedMeta$Cancer)) +write.table(x, file = "test.txt", row.names = T, col.names = NA, quote = F, sep = "\t") + + +table(paste0(combinedMeta$Tissue, " - ", combinedMeta$Tissue2),combinedMeta$Cancer) + +table(combinedMeta$Tissue) +table(combinedMeta$Tissue2) + +table(combinedMeta$Tissue, combinedMeta$Cellline) + +table(combinedMeta$Cancer) + +table(combinedMeta$Tissue, combinedMeta$Cancer) + +table(combinedMeta$Tissue[combinedMeta$Cohort == "TCGA"], combinedMeta$Cancer[combinedMeta$Cohort == "TCGA"]) + + +sum(combinedMeta$gado.TissueType %in% combinedMeta$gado.PlotClass) + +sum((!is.na(combinedMeta$gado.TissueType) & combinedMeta$gado.TissueType != "")) + +sum( ) +unique(combinedMeta$gado.TissueType) + 
+table(combinedMeta$gado.PlotClass, useNA = "a") + +combinedMeta$Tissue[combinedMeta$Cohort == "GSA"] + + + +#save(combinedMeta, file = "combinedMeta_2022_09_15.RData") + +load(file = "combinedMeta_2022_08_19.RData") + +pcsAndMeta <- merge(expPcs[,1:100], combinedMeta, by = 0, all.x = T) +dim(pcsAndMeta) +str(combinedMeta) + +tissueCol <- read.delim("Recount3_QC_2ndRun/SRA_Studies_Annotations_Patrick/Annotations_color2.txt", row.names = 1) + +sum(unique(pcsAndMeta[,"Tissue"]) %in% tissueCol$PlotClass) +sum(unique(pcsAndMeta[,"Tissue2"]) %in% tissueCol$PlotClass) + +x <- unique(pcsAndMeta[,"Tissue2"]) +x[!x %in% tissueCol$PlotClass] + + +defaultCol <- adjustcolor("grey", alpha.f = 0.6) +pcsAndMeta$col <- defaultCol + +tissueAndCol <- pcsAndMeta[,"Tissue"] %in% tissueCol$PlotClass + +pcsAndMeta$col[tissueAndCol] <- adjustcolor(tissueCol$col[match(pcsAndMeta[tissueAndCol,"Tissue"], tissueCol$PlotClass)], alpha.f = 0.6) + + +tissue2AndCol <- pcsAndMeta[,"Tissue2"] %in% tissueCol$PlotClass +sum(tissue2AndCol) +pcsAndMeta$col[tissue2AndCol] <- adjustcolor(tissueCol$col[match(pcsAndMeta[tissue2AndCol,"Tissue2"], tissueCol$PlotClass)], alpha.f = 0.6) + +table(pcsAndMeta[pcsAndMeta[,"PC_2"] >= 0,"Tissue2"]) + + +sum(is.na(tolower(pcsAndMeta[,"Tissue"]) %in% tolower(tissueCol$PlotClass))) + +#pcsAndMeta$col <- tissueCol$col[match(tolower(pcsAndMeta[,"Tissue"]), tolower(tissueCol$PlotClass), nomatch = nrow(tissueCol))] + +plotOrder <- order((pcsAndMeta$col != defaultCol) + 1) + +rpng(width = 800, height = 800) +#pdf(file = "test.pdf") +plot(pcsAndMeta[plotOrder,"PC_1"], pcsAndMeta[plotOrder,"PC_2"], col = pcsAndMeta$col[plotOrder], cex = 0.3, pch = 16) +dev.off() + + +rpng(width = 800, height = 800) +#pdf(file = "test.pdf") +plot(pcsAndMeta[plotOrder,"PC_3"], pcsAndMeta[plotOrder,"PC_7"], col = pcsAndMeta$col[plotOrder], cex = 0.3, pch = 16) +dev.off() + + +#rpng(width = 800, height = 800) +png("tissues.png",width = 2000, height = 2000) 
+pairs(pcsAndMeta[plotOrder,paste0("PC_",1:5)], col = pcsAndMeta$col[plotOrder], cex = 0.4, upper.panel = NULL, pch = 16) +dev.off() + +png("tissues2.png",width = 2000, height = 2000) +pairs(pcsAndMeta[plotOrder,paste0("PC_",6:10)], col = pcsAndMeta$col[plotOrder], cex = 0.4, upper.panel = NULL) +dev.off() + +png("tissues3.png",width = 2000, height = 2000) +pairs(pcsAndMeta[plotOrder,paste0("PC_",11:15)], col = pcsAndMeta$col[plotOrder], cex = 0.4, upper.panel = NULL) +dev.off() + +defaultCol <- adjustcolor("grey", alpha.f = 0.3) +pcsAndMeta$colCelline <- defaultCol +pcsAndMeta$colCelline[!is.na(pcsAndMeta[,"Cellline"]) & pcsAndMeta[,"Cellline"]] <- adjustcolor("magenta", alpha.f = 0.3) +pcsAndMeta$colCelline[!is.na(pcsAndMeta[,"Cellline"]) & !pcsAndMeta[,"Cellline"]] <- adjustcolor("royalblue1", alpha.f = 0.3) +pcsAndMeta$colCelline[!is.na(pcsAndMeta[,"Cancer"]) & pcsAndMeta[,"Cancer"]] <- adjustcolor("forestgreen", alpha.f = 0.3) +plotOrder <- order((pcsAndMeta$colCelline != defaultCol) + 1) + + +pcsAndMeta$cellineTissueCancer <- "Unkown" +pcsAndMeta$cellineTissueCancer[!is.na(pcsAndMeta[,"Cellline"]) & pcsAndMeta[,"Cellline"]] <- "Cellline" +pcsAndMeta$cellineTissueCancer[!is.na(pcsAndMeta[,"Cellline"]) & !pcsAndMeta[,"Cellline"]] <- "Tissue" +pcsAndMeta$cellineTissueCancer[!is.na(pcsAndMeta[,"Cancer"]) & pcsAndMeta[,"Cancer"]] <- "Cancer" + +pcsAndMeta$cellineTissueCancer <- factor(pcsAndMeta$cellineTissueCancer, levels = c("Tissue", "Cancer", "Cellline", "Unkown")) + +table(pcsAndMeta$cellineTissueCancer, useNA = "always") + +rpng(width = 800, height = 800) +plot(pcsAndMeta[plotOrder,"PC_1"], pcsAndMeta[plotOrder,"PC_2"], col = pcsAndMeta$colCelline[plotOrder], cex = 0.4, pch = 16) +dev.off() + +rpng(width = 800, height = 800) +plot(pcsAndMeta[plotOrder,"PC_3"], pcsAndMeta[plotOrder,"PC_75"], col = pcsAndMeta$colCelline[plotOrder], cex = 0.4, pch = 16) +dev.off() + +for(i in c(1,3:100)){ + png(paste0("cellinePlots/pc",i,".png"),width = 1000, height = 1000) + 
#rpng() + plot(pcsAndMeta[plotOrder,"PC_2"], pcsAndMeta[plotOrder,paste0("PC_",i)], col = pcsAndMeta$colCelline[plotOrder], cex = 1, pch = 16, xlab = "PC2", ylab = paste0("PC", i)) + dev.off() +} + +library(vioplot) + +for(i in 1:100){ +png(paste0("cellinePlots2/pc",i,".png"),width = 500, height = 500) +vioplot( pcsAndMeta[,paste0("PC_",i)] ~ pcsAndMeta$cellineTissueCancer, col = c(adjustcolor("royalblue1", alpha.f = 0.3), adjustcolor("forestgreen", alpha.f = 0.3), adjustcolor("magenta", alpha.f = 0.3), defaultCol)) +dev.off() +} +table(paste0(combinedMeta$Tissue, " - ", combinedMeta$Tissue2)) + +png("celllines_c.png",width = 2000, height = 2000) +pairs(pcsAndMeta[plotOrder,paste0("PC_",1:5, "_c")], col = pcsAndMeta$colCelline[plotOrder], cex = 0.4, upper.panel = NULL, pch = 16) +dev.off() + + +png("celllines2.png",width = 2000, height = 2000) +pairs(pcsAndMeta[plotOrder,paste0("PC_",6:10, "")], col = pcsAndMeta$colCelline[plotOrder], cex = 0.4, upper.panel = NULL, pch = 16) +dev.off() + + +defaultCol <- adjustcolor("grey", alpha.f = 0.3) +pcsAndMeta$colCancer <- defaultCol +pcsAndMeta$colCancer[!is.na(pcsAndMeta[,"Cancer"]) & pcsAndMeta[,"Cancer"]] <- adjustcolor("chartreuse1", alpha.f = 0.6) +plotOrder <- order((pcsAndMeta$colCancer != defaultCol) + 1) + +rpng(width = 800, height = 800) +plot(pcsAndMeta[plotOrder,"PC_1"], pcsAndMeta[plotOrder,"PC_2"], col = pcsAndMeta$colCancer[plotOrder], cex = 0.4) +dev.off() + +png("cancers.png",width = 2000, height = 2000) +pairs(pcsAndMeta[plotOrder,paste0("PC_",1:5)], col = pcsAndMeta$colCancer[plotOrder], cex = 0.4, upper.panel = NULL, pch = 16) +dev.off() + +library(pROC) +cancerAuc <- apply(pcsAndMeta[,paste0("PC_",1:100)], 2, function(x){as.numeric(auc(response = pcsAndMeta$Cancer, predictor = x))}) +sort(cancerAuc) + +rpng(width = 800, height = 800) +plot(pcsAndMeta[plotOrder,"PC_33"], pcsAndMeta[plotOrder,"PC_9"], col = pcsAndMeta$col[plotOrder], cex = 0.4) +dev.off() + + +library(pROC) +cancerAuc <- 
apply(pcsAndMeta[,paste0("PC_",1:100)], 2, function(x){as.numeric(auc(response = pcsAndMeta$Cancer, predictor = x))}) +sort(cancerAuc) + +celllineAuc <- apply(pcsAndMeta[,paste0("PC_",1:100)], 2, function(x){as.numeric(auc(response = pcsAndMeta$Cellline, predictor = x))}) +sort(celllineAuc) + + +rpng(width = 800, height = 800) +plot(pcsAndMeta[plotOrder,"PC_33"], pcsAndMeta[plotOrder,"PC_9"], col = pcsAndMeta$colCancer[plotOrder], cex = 0.4) +dev.off() + +rpng() +pairs(pcsAndMeta[plotOrder,paste0("PC_",c(33,32,9,10,21))], col = pcsAndMeta$colCancer[plotOrder], cex = 0.4, upper.panel = NULL) +dev.off() + + +rpng() +pairs(pcsAndMeta[plotOrder,paste0("PC_",c(3,2,27,6,20))], col = pcsAndMeta$colCelline[plotOrder], cex = 0.4, upper.panel = NULL) +dev.off() + +combinedMeta$sra.sample_title <- gsub("\"", "", combinedMeta$sra.sample_title) + +tmp <- merge(combinedMeta,pcs[,1:100], by = 0, all.y = T) +dim(tmp) +write.table(tmp, file = "tmpAnnotations.txt", sep = "\t", quote = FALSE, col.names = NA) + +qseq <- read.delim("quantseqSamples.txt")[,1] +str(qseq) + + +defaultCol <- adjustcolor("grey", alpha.f = 0.3) +pcsAndMeta$colQseq <- defaultCol +pcsAndMeta$colQseq[pcsAndMeta$Row.names %in% qseq] <- "orangered" +plotOrderQseq <- order((pcsAndMeta$colQseq != defaultCol) + 1) + +plot(pcsAndMeta[plotOrderQseq,"PC_1"], pcsAndMeta[plotOrderQseq,"PC_2"], col = pcsAndMeta$colQseq[plotOrderQseq], cex = 0.4, main = "Quantseq") + +plot(pcsAndMeta[plotOrderQseq,"PC_6"], pcsAndMeta[plotOrderQseq,"PC_1"], col = pcsAndMeta$colQseq[plotOrderQseq], cex = 0.4, main = "Quantseq") + + +table(pcsAndMeta$sra.library_layout) + + +numColumns <- unlist(lapply(combinedMeta, is.numeric)) + + +combinedMetaMatrix <- as.matrix(combinedMeta[,numColumns]) + +library(pROC) + +qseqClass <- as.factor(rownames(combinedMeta) %in% qseq) +table(qseqClass) +dim(combinedMetaMatrix) +qseqPValues <- apply(combinedMetaMatrix,2,function(x){ + tryCatch( + { + #wilcox.test(x ~ qseqClass)$p.value + as.numeric(auc(response 
= qseqClass, predictor = x)) + }, + error=function(cond){return(1)} + ) +}) +sort(qseqPValues) + + +boxplot(combinedMetaMatrix[,"recount_qc.bc_frag.kallisto_mean_length"] ~ qseqClass ) + + + + + + + + +plot(pcsAndMeta[plotOrderQseq,"recount_seq_qc.%c"], log10(pcsAndMeta[plotOrderQseq,"recount_qc.star.number_of_splices:_total"]), col = pcsAndMeta$colQseq[plotOrderQseq], cex = 0.6) +10^5.2 +abline(h=log10(150000)) +abline(v=60) +log10(10^5) + +plot(log10(pcsAndMeta[plotOrderQseq,"recount_qc.star.number_of_splices:_total"]), pcsAndMeta[plotOrderQseq,"recount_qc.bc_frag.kallisto_mean_length"], col = pcsAndMeta$colQseq[plotOrderQseq], cex = 0.4) + +plot(pcsAndMeta[plotOrderQseq,"PC_6"], log10(pcsAndMeta[plotOrderQseq,"recount_qc.star.number_of_splices:_total"]), col = pcsAndMeta$colQseq[plotOrderQseq], cex = 0.4) + + +pc1Cor <- cor(pcsAndMeta[,"PC_1"], pcsAndMeta[,numColumns], use = "pairwise.complete.obs") +sort(pc1Cor[1,]) + +pc6Cor <- apply(combinedMetaMatrix[pcsAndMeta$Row.names,],2,function(x){ + tryCatch( + { + #wilcox.test(x ~ qseqClass)$p.value + cor(pcsAndMeta[,"PC_6"], x, use = "pairwise.complete.obs") + }, + error=function(cond){return(0)}, + warning=function(cond){return(0)} + ) +}) +sort(pc6Cor^2) + +load("testPcPAtrickFrist100.RData", verbose = T) +colnames(expPcs) +str(expPcs) +colnames(expPcs) <- paste0("PC_", 1:ncol(expPcs)) +pcsAndMeta <- merge(expPcs, combinedMeta, by = 0, all.x = T) +dim(pcsAndMeta) + + + +load("gadoPca.RData", verbose = T) +colnames(expGadoPcsSub) +str(expGadoPcsSub) +colnames(expGadoPcsSub) <- paste0("PC_", 1:ncol(expGadoPcsSub)) +pcsAndMeta <- merge(expGadoPcsSub, combinedMeta, by = 0, all.x = T) +dim(pcsAndMeta) + + + +table(pcsAndMeta$CelllineName) +pcsAndMeta$Cellline[grepl("s4u", pcsAndMeta$CelllineName)] + + + + + + diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/normalizeRecount3PerTissueAndDoPca.R b/Downstreamer/src/main/r/downstreamer_main/recount3/normalizeRecount3PerTissueAndDoPca.R new file mode 100644 
index 000000000..ee8f5db99 --- /dev/null +++ b/Downstreamer/src/main/r/downstreamer_main/recount3/normalizeRecount3PerTissueAndDoPca.R @@ -0,0 +1,165 @@ +#srun --cpus-per-task=20 --mem=200gb --nodes=1 --qos=priority --time=168:00:00 --pty bash -i +#remoter::server(verbose = T, port = 55556, sync = T) + + + + +remoter::client("localhost", port = 55504) + +library(DESeq2) +library(parallel) +library(viridisLite, lib.loc = .libPaths()[2]) + +setwd("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/") +setwd("D:\\UMCG\\Genetica\\Projects\\Depict2Pgs\\Recount3\\") + +load(file = "perTissueNormalization/selectedSamplesRawExpression.RData", verbose = T) +load("tissuePredictions/samplesWithPrediction_16_09_22.RData", verbose = T) + +sort(table(samplesWithPrediction$predictedTissue)) +tissueClasses <- unique(samplesWithPrediction$predictedTissue) + + +#tissueClasses <- tissueClasses[1:29] +#tissueClasses <- tissueClasses[30:57] + +#tissueClasses <- tissueClasses[c(1,2,6,14,55)] + +#limit expression to max int +selectedSamplesExp[selectedSamplesExp > .Machine$integer.max] <- .Machine$integer.max + + +mclapply(tissueClasses, mc.cores = 10, function(tissue){ + + tissueSamples <- rownames(samplesWithPrediction)[samplesWithPrediction$predictedTissue == tissue] + tissueExp <- selectedSamplesExp[,tissueSamples] + numberOfSamples <- length(tissueSamples) + + includedGenes <- apply(tissueExp, 1, function(x){(sum(x==0)/numberOfSamples) <= 0.5}) + + tissueExp <- tissueExp[includedGenes,] + + mode(tissueExp) <- "integer" + + save(tissueExp, file = paste0("perTissueNormalization/raw/",make.names(tissue),".RData")) + +}) + + +tissueClasses <- unique(samplesWithPrediction$predictedTissue) + +#Run 1 +#tissueClasses <- tissueClasses[1:5] +#Run 2 +run2Tisses <- c("Whole Blood", "T-Cells", "fibroblasts_cell-lines_smooth-muscle-cell_mesenchymal-stem-cells", "PBMC") +tissueClasses <- run2Tisses +#Run3 +run3Tisses <- c("derived-neural-progenitor_derived-neurons", "Macrophages", "Liver", 
"Macrophages-iPSC") +tissueClasses <- run3Tisses + +#Run4 colorectal en prostate + +perTissueExp <- mclapply(tissueClasses, mc.cores = 4, function(tissue){ + + load(file = paste0("perTissueNormalization/raw/",make.names(tissue),".RData")) + rlogExp <- rlog(tissueExp) + save(rlogExp, file = paste0("perTissueNormalization/rlogExp/",make.names(tissue),".RData")) + return(NULL) + +}) + +#names(perTissueExp) <- tissueClasses +#save(perTissueExp, file = "perTissueNormalization/selectedSamplesRawExpressionPerTissue.RData") +tissue = "Kidney" + +load(file = paste0("perTissueNormalization/rlogExp/",make.names(tissue),".RData")) + +perTissuePca <- lapply(perTissueExp, function(exp){ + + #https://stackoverflow.com/questions/18964837/fast-correlation-in-r-using-c-and-parallelization/18965892#18965892 + expScale = rlogExp - rowMeans(rlogExp); + # Standardize each variable + expScale = expScale / sqrt(rowSums(expScale^2)); + #expCov = tcrossprod(expScale);#equevelent to correlation due to center scale + #expEigen <- eigen(expCov) + #eigenVectors <- expEigen$vectors + #colnames(eigenVectors) <- paste0("PC_",1:ncol(eigenVectors)) + #rownames(eigenVectors) <- rownames(expScale) + + #eigenValues <- expEigen$values + #names(eigenValues) <- paste0("PC_",1:length(eigenValues)) + + #Here calculate sample principle components. 
Number needed is arbritary (no more than eigen vectors) + #expPcs <- t(expScale) %*% expEigen$vectors[,1:10] + #colnames(expPcs) <- paste0("PC_",1:ncol(expPcs)) + + expSvd <- svd(expScale, nu = 1000, nv = 1000) + + eigenValues <- expSvd$d^2 + eigenVectors <- expSvd$u + colnames(eigenVectors) <- paste0("PC_",1:ncol(eigenVectors)) + rownames(eigenVectors) <- rownames(expScale) + + expPcs <- expSvd$v[,1:25] %*% diag(expSvd$d[1:25]) + colnames(expPcs) <- paste0("PC_",1:ncol(expPcs)) + rownames(expPcs) <- colnames(expScale) + + return(list(eigenVectors, eigenValues, expPcs)) + +}) + +save(perTissueExp, perTissuePca, file = "perTissueNormalization/tmpTestSession.RData") +#load(file = "perTissueNormalization/tmpTestRlog.RData") + +save(expPcs, samplesWithPrediction, file = "perTissueNormalization/tmpTest2.RData") +load("perTissueNormalization/tmpTest2.RData") + + +samplesWithPrediction$sra.library_layout[samplesWithPrediction$study=="TCGA"] <- "paired" +samplesWithPrediction$sra.library_layout[samplesWithPrediction$study=="GTEx"] <- "paired" + + +tissueSamplesInfo <- samplesWithPrediction[rownames(expPcs),] +str(tissueSamplesInfo) + +#Put TCGA and GTEx to paired end + +studies <- length(unique(tissueSamplesInfo$study)) + + + +palette(adjustcolor(viridis(studies, option = "H"), alpha.f = 0.5)) + +pchMap <- rep(c(15,16,17), length.out = studies) + +rpng(width = 1000, height = 1000) +plot(expPcs[,1],expPcs[,2], col = as.factor(tissueSamplesInfo$study), pch = pchMap[as.factor(tissueSamplesInfo$study)], cex = 1) +pairs(expPcs[,1:5], col = as.factor(tissueSamplesInfo$study), pch = pchMap[as.factor(tissueSamplesInfo$study)], cex = 1, upper.panel = NULL) +dev.off() +View(tissueSamplesInfo) + +palette(adjustcolor(c("dodgerblue1", "maroon2"), alpha.f = 0.5)) +plot(expPcs[,1],expPcs[,2], col = as.factor(tissueSamplesInfo$sra.library_layout), pch = pchMap[as.factor(tissueSamplesInfo$study)], cex = 1) + + + + +breakPoints <- seq(0.5,1,by = 0.05) +breakCols <- 
(adjustcolor(viridis(length(breakPoints), option = "inferno"), alpha.f = 0.5)) + + +plot(expPcs[,1],expPcs[,2], col = breakCols[as.numeric(cut(tissueSamplesInfo$predictedTissueScore, breaks = breakPoints ))], pch = 16, cex = 1) +legend("bottomright",title="PredictionScore",legend=seq(0.5,1,by = 0.05),col = breakCols,pch=16) + + +plot(expPcs[,1],expPcs[,5], col = breakCols[as.numeric(cut(tissueSamplesInfo$predictedTissueScore, breaks = breakPoints ))], pch = 16, cex = 1) + +legend("topleft",title="PredictionScore",legend=seq(0.5,1,by = 0.05),col = breakCols,pch=16) + +pairs(expPcs[,1:10], col = breakCols[as.numeric(cut(tissueSamplesInfo$predictedTissueScore, breaks = breakPoints ))], cex = 1, upper.panel = NULL, pch = 16) + + + + sum(expPcs[,2]>10) +x <- cbind(expPcs, tissueSamplesInfo) +View(x) diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/perTissueQqnormAndPcaForQc.R b/Downstreamer/src/main/r/downstreamer_main/recount3/perTissueQqnormAndPcaForQc.R new file mode 100644 index 000000000..e4c35f6c2 --- /dev/null +++ b/Downstreamer/src/main/r/downstreamer_main/recount3/perTissueQqnormAndPcaForQc.R @@ -0,0 +1,520 @@ +#srun --cpus-per-task=20 --mem=100gb --nodes=1 --qos=priority --time=168:00:00 --pty bash -i +#remoter::server(verbose = T, port = 55556, sync = T) + + + + +remoter::client("localhost", port = 55506) + +library(DESeq2) +library(parallel) +library(viridisLite, lib.loc = .libPaths()[2]) +library(preprocessCore) + + + + +setwd("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/") +setwd("D:\\UMCG\\Genetica\\Projects\\Depict2Pgs\\Recount3\\") + +load(file = "perTissueNormalization/selectedSamplesRawExpression.RData", verbose = T) +load("tissuePredictions/samplesWithPrediction_16_09_22.RData", verbose = T) + +samplesWithPrediction$sra.library_layout[samplesWithPrediction$study=="TCGA"] <- "paired" +samplesWithPrediction$sra.library_layout[samplesWithPrediction$study=="GTEx"] <- "paired" + +table(samplesWithPrediction$predictedTissue) + + 
+sort(table(samplesWithPrediction$predictedTissue)) +tissueClasses <- unique(samplesWithPrediction$predictedTissue) + +#not used currently, we now use the expression data used for the primary QC and sample predictions. +mclapply(tissueClasses, mc.cores = 10, function(tissue){ + + tissueSamples <- rownames(samplesWithPrediction)[samplesWithPrediction$predictedTissue == tissue] + tissueExp <- selectedSamplesExp[,tissueSamples] + numberOfSamples <- length(tissueSamples) + + includedGenes <- apply(tissueExp, 1, function(x){(sum(x==0)/numberOfSamples) <= 0.5}) + + tissueExp <- tissueExp[includedGenes,] + + tissueExp <- log2(tissueExp + 1) + + normalize.quantiles(tissueExp,copy=FALSE) + + save(tissueExp, file = paste0("perTissueNormalization/perTissueQq/",make.names(tissue),".RData")) + +}) + +load(file = "/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Recount3_QC_2ndRun/Filtered_Matrices/TPM_log2_QNorm_QCed_CovCorrected_AllCovariates.RData", verbose = T) + + +mclapply(tissueClasses, mc.cores = 10, function(tissue, exp){ + + #load(file = paste0("perTissueNormalization/perTissueQq/",make.names(tissue),".RData")) + + tissueSamples <- rownames(samplesWithPrediction)[samplesWithPrediction$predictedTissue == tissue] + tissueExp <- exp[,tissueSamples] + + + #https://stackoverflow.com/questions/18964837/fast-correlation-in-r-using-c-and-parallelization/18965892#18965892 + expScale = tissueExp - rowMeans(tissueExp); + # Standardize each variable + expScale = expScale / sqrt(rowSums(expScale^2)); + #expCov = tcrossprod(expScale);#equevelent to correlation due to center scale + #expEigen <- eigen(expCov) + #eigenVectors <- expEigen$vectors + #colnames(eigenVectors) <- paste0("PC_",1:ncol(eigenVectors)) + #rownames(eigenVectors) <- rownames(expScale) + + #eigenValues <- expEigen$values + #names(eigenValues) <- paste0("PC_",1:length(eigenValues)) + + #Here calculate sample principle components. 
Number needed is arbritary (no more than eigen vectors) + #expPcs <- t(expScale) %*% expEigen$vectors[,1:10] + #colnames(expPcs) <- paste0("PC_",1:ncol(expPcs)) + + expSvd <- svd(expScale, nu = 50, nv = 50) + + + + + eigenValues <- expSvd$d^2 + eigenVectors <- expSvd$u + colnames(eigenVectors) <- paste0("PC_",1:ncol(eigenVectors)) + rownames(eigenVectors) <- rownames(expScale) + + expPcs <- expSvd$v[,1:50] %*% diag(expSvd$d[1:50]) + colnames(expPcs) <- paste0("PC_",1:ncol(expPcs)) + rownames(expPcs) <- colnames(expScale) + + + explainedVariance <- eigenValues * 100 / nrow(expScale) + + + pcaRes <- list(eigenVectors = eigenVectors, eigenValues = eigenValues, expPcs = expPcs, explainedVariance = explainedVariance) + + save(pcaRes, file = paste0("perTissueNormalization/perTissueQqPca/",make.names(tissue),".RData")) + + return(NULL) + +}, exp = exp) + +tissue = "Kidney" +tissue = "Brain-Nucleus accumbens (basal ganglia)" + +ERP009290 +ERP009290 +samplesWithPrediction[samplesWithPrediction$study=="ERP009290",] + +nonOutlierSampleList <- lapply(tissueClasses, function(tissue, samplesWithPrediction){ + + load(file = paste0("perTissueNormalization/perTissueQqPca/",make.names(tissue),".RData")) + + expPcs <- pcaRes$expPcs[,1:10] + explainedVariance <- pcaRes$explainedVariance + tissueSamplesInfo <- samplesWithPrediction[rownames(expPcs),] + studies <- length(unique(tissueSamplesInfo$study)) + + #are in the same order + write.table(cbind(tissueSamplesInfo, expPcs), col.names = T, row.names = F, sep = "\t", quote = F, file = paste0("perTissueNormalization/qcPlots/",make.names(tissue),".txt")) + + shortTissue <- ifelse(nchar(tissue) > 20, paste0(substr(tissue,0,17),"..."), tissue) + + breakPoints <- seq(0.5,1,by = 0.05) + breakCols <- (adjustcolor(viridis(length(breakPoints), option = "inferno"), alpha.f = 0.5)) + + expPcsMeans <- apply(expPcs, 2, mean) + expPcsSds <- apply(expPcs, 2, sd) + + #Larger threshold is needed for wholeblood + sdThreshold <- ifelse(tissue == "Whole 
Blood", 4,3) + + threshold <- expPcsMeans + sdThreshold * expPcsSds + + outlierPerComp <- sapply(1:10, function(i){ + + abs(expPcs[,i]) > threshold[i] + }) + tissueSamplesInfo$outlier <- apply(outlierPerComp, 1, any) + sum(tissueSamplesInfo$outlier) + + ### Do some manual corrections + if(tissue == "Airway basal cells"){ + tissueSamplesInfo$outlier[expPcs[,1] < 2] <- TRUE #Checked annotation, these are wrongly predicted + } + tissueSamplesInfo$outlier[tissueSamplesInfo$study == "ERP009290"] <- TRUE #Mixed tissue samples + + if(tissue == "Brain-Hindbrain-Fetal"){ + tissueSamplesInfo$outlier[expPcs[,1] < -10] <- TRUE #Checked annotation, these are wrongly predicted + tissueSamplesInfo$outlier[expPcs[,3] > 3] <- TRUE #Checked annotation, these are wrongly predicted + } + if(tissue == "Brain-Nucleus accumbens (basal ganglia)"){ + tissueSamplesInfo$outlier[expPcs[,1] < -5] <- TRUE #Checked annotation, these are wrongly predicted + tissueSamplesInfo$outlier[expPcs[,3] < -10] <- TRUE #Checked annotation, these are wrongly predicted + } + if(tissue == "Kidney"){ + tissueSamplesInfo$outlier[expPcs[,3] < -6] <- TRUE #Checked annotation, these are wrongly predicted + } + if(tissue == "Macrophages-iPSC"){ + tissueSamplesInfo$outlier[tissueSamplesInfo$study == "ERP020977"] <- FALSE #not real outlier, strange distribution due to stimulations. 
+ } + if(tissue == "Monocytes"){ + tissueSamplesInfo$outlier[expPcs[,2] > 3] <- TRUE #Checked annotation, these are wrongly predicted + } + if(tissue == "Nasal Lavage"){ + tissueSamplesInfo$outlier[expPcs[,1] < -5] <- TRUE #Checked annotation, these are wrongly predicted + } + if(tissue == "Vagina"){ + tissueSamplesInfo$outlier[expPcs[,1] > 10] <- TRUE #Checked annotation, these are wrongly predicted + tissueSamplesInfo$outlier[expPcs[,2] < -8] <- TRUE #Checked annotation, these are wrongly predicted + } + + + + colnames(expPcs) <- paste0("Comp ",1:10, " (", round(explainedVariance[1:10],2) ,"%)") + write.table(cbind(tissueSamplesInfo, expPcs), col.names = NA, row.names = T, sep = "\t", quote = F, file = paste0("perTissueNormalization/qcPlots/",make.names(tissue),".txt")) + + + png(file = paste0("perTissueNormalization/qcPlots/",make.names(tissue),"1.png"), width = 1200, height = 900) + #rpng(width = 1000, height = 1000) + layout(matrix(c(1,1,1,1,2,3,4,8,5,6,7,8),ncol = 4, byrow = T), heights = c(0.1,1,1), widths = c(1,1,1,0.1)) + par(mar = c(0,0,0,0), xpd = NA, cex = 1.2) + plot.new() + plot.window(xlim = 0:1, ylim = 0:1) + text(0.5,0.5, paste0(tissue, " (", nrow(tissueSamplesInfo) ,")"), cex = 2, font = 2) + + par(mar = c(4,4,3,0.5), xpd = NA) + + palette(adjustcolor(viridis(studies, option = "H"), alpha.f = 0.5)) + pchMap <- rep(c(15,16,17), length.out = studies) + plot(expPcs[,1],expPcs[,2], col = as.factor(tissueSamplesInfo$study), pch = pchMap[as.factor(tissueSamplesInfo$study)], cex = 1, main = paste0("Studies (", studies,")"), xlab = paste0("Comp 1 (", round(explainedVariance[1],2) ,"%)"), ylab = paste0("Comp 2 (", round(explainedVariance[2],2) ,"%)"), bty = "n") + + + palette(adjustcolor(c("lemonchiffon3", "darkorange1", "springgreen2"), alpha.f = 0.5)) + annotated <- factor(rep("Unkown", nrow(tissueSamplesInfo)), levels = c("Unkown", "Other", "Current")) + annotated[!is.na(tissueSamplesInfo$annotatedTissue) & tissueSamplesInfo$annotatedTissue != tissue] 
<- "Other" + annotated[!is.na(tissueSamplesInfo$annotatedTissue) & tissueSamplesInfo$annotatedTissue == tissue] <- "Current" + plot(expPcs[,1],expPcs[,2], col = annotated, pch = pchMap[as.factor(tissueSamplesInfo$study)], cex = 1, main = paste0("Annotated as " , shortTissue), xlab = paste0("Comp 1 (", round(explainedVariance[1],2) ,"%)"), ylab = paste0("Comp 2 (", round(explainedVariance[2],2) ,"%)"), bty = "n") + + + palette(adjustcolor(c("dodgerblue1", "maroon2"), alpha.f = 0.5)) + plot(expPcs[,1],expPcs[,2], col = factor(tissueSamplesInfo$sra.library_layout, levels = c("paired", "single")), pch = pchMap[as.factor(tissueSamplesInfo$study)], cex = 1, main = "Sequencing layout", xlab = paste0("Comp 1 (", round(explainedVariance[1],2) ,"%)"), ylab = paste0("Comp 2 (", round(explainedVariance[2],2) ,"%)"), bty = "n") + + plot(expPcs[,1],expPcs[,2], col = breakCols[as.numeric(cut(tissueSamplesInfo$predictedTissueScore, breaks = breakPoints ))], pch = 16, cex = 1, main = "Prediction posterior probability", xlab = paste0("Comp 1 (", round(explainedVariance[1],2) ,"%)"), ylab = paste0("Comp 2 (", round(explainedVariance[2],2) ,"%)"), bty = "n") + + plot(cumsum(explainedVariance)[1:10], bty = "n", pch = 16, xlab = "Components", ylab = "Cumulative explained variance (%)", main = "Explained variance", ylim = c(0,100), xlim = c(0,10)) + + + + + + par(mar = c(0,2,3,1), xpd = NA) + plot.new() + plot.window(xlim = 0:1, ylim = 0:1) + legend("topleft",title="Annotation",legend=c("Unkown", "Other", shortTissue), col = c("lemonchiffon3", "darkorange1", "springgreen2") , pch = 16, bty = "n") + legend("top",title="Layout",legend=c("Single", "Paired"), col = c("maroon2", "dodgerblue1") , pch = 16, bty = "n") + legend("topright",title="Probability",legend=seq(0.5,1,by = 0.05),col = breakCols,pch=16, bty = "n") + + + dev.off() + + + + #png(file = paste0("perTissueNormalization/qcPlots/",make.names(tissue),"2.png"), width = 2000, height = 2000) + #pairs(expPcs[,1:10], col = 
breakCols[as.numeric(cut(tissueSamplesInfo$predictedTissueScore, breaks = breakPoints ))], cex = 2, upper.panel = NULL, pch = 16) + #dev.off() + + #png(file = paste0("perTissueNormalization/qcPlots/",make.names(tissue),"3.png"), width = 2000, height = 2000) + #palette(adjustcolor(viridis(studies, option = "H"), alpha.f = 0.5)) + #pchMap <- rep(c(15,16,17), length.out = studies) + #pairs(expPcs[,1:10], col = as.factor(tissueSamplesInfo$study), pch = pchMap[as.factor(tissueSamplesInfo$study)], cex = 2, upper.panel = NULL) + #dev.off() + + #png(file = paste0("perTissueNormalization/qcPlots/",make.names(tissue),"3.png"), width = 2000, height = 2000) + #palette(adjustcolor(c("grey", "firebrick3"), alpha.f = 0.5)) + #pairs(expPcs[,1:10], col = tissueSamplesInfo$outlier + 1, pch = 16, cex = 2, upper.panel = NULL) + #dev.off() + + png(file = paste0("perTissueNormalization/qcPlots/",make.names(tissue),"4.png"), width = 1500, height = 700) + #rpng(width = 1000, height = 1000) + palette(adjustcolor(c("grey", "firebrick3"), alpha.f = 0.5)) + layout(matrix(c(1,1,1,1,1,2:11),ncol = 5, byrow = T), heights = c(0.1,1,1)) + par(mar = c(0,0,0,0), xpd = NA, cex = 1.2) + plot.new() + plot.window(xlim = 0:1, ylim = 0:1) + text(0.5,0.5, paste0(tissue, " (", nrow(tissueSamplesInfo) ,")"), cex = 2, font = 2) + + par(mar = c(4,4,3,0.5), xpd = NA) + + for(i in 2:10){ + plot(expPcs[,1],expPcs[,i], col = tissueSamplesInfo$outlier + 1, pch = 16, cex = 1, main = paste0("Comp ", i), xlab = paste0("Comp 1 (", round(explainedVariance[1],2) ,"%)"), ylab = paste0("Comp ", i," (", round(explainedVariance[i],2) ,"%)"), bty = "n") + abline(v=c(-threshold[1],threshold[1]), lwd = 2, col = "firebrick3", xpd = FALSE) + abline(h=c(-threshold[i],threshold[i]), lwd = 2, col = "firebrick3", xpd = FALSE) + } + + par(mar = c(0,2,3,1), xpd = NA) + plot.new() + plot.window(xlim = 0:1, ylim = 0:1) + legend("top",title="Outliers",legend=c("Included", "Excluded"), col = c("grey", "firebrick3") , pch = 16, bty = "n") + 
+ + + dev.off() + + + png(file = paste0("perTissueNormalization/qcPlots/",make.names(tissue),"5.png"), width = 1500, height = 700) + #rpng() + layout(matrix(c(1,1,1,1,1,2:11),ncol = 5, byrow = T), heights = c(0.1,1,1)) + par(mar = c(0,0,0,0), xpd = NA, cex = 1.2) + plot.new() + plot.window(xlim = 0:1, ylim = 0:1) + text(0.5,0.5, paste0(tissue, " (", nrow(tissueSamplesInfo) ,")"), cex = 2, font = 2) + + par(mar = c(4,4,3,0.5), xpd = NA) + + for(i in 2:10){ + plot(expPcs[,1],expPcs[,i], col = breakCols[as.numeric(cut(tissueSamplesInfo$predictedTissueScore, breaks = breakPoints ))], pch = 16, cex = 1, main = paste0("Comp ", i), xlab = paste0("Comp 1 (", round(explainedVariance[1],2) ,"%)"), ylab = paste0("Comp ", i," (", round(explainedVariance[i],2) ,"%)"), bty = "n") + abline(v=c(-threshold[1],threshold[1]), lwd = 2, col = "firebrick3", xpd = FALSE) + abline(h=c(-threshold[i],threshold[i]), lwd = 2, col = "firebrick3", xpd = FALSE) + } + + par(mar = c(0,2,3,1), xpd = NA) + plot.new() + plot.window(xlim = 0:1, ylim = 0:1) + legend("top",title="Probability",legend=seq(0.5,1,by = 0.05),col = breakCols,pch=16, bty = "n") + + + dev.off() + + + + return(tissueSamplesInfo[!tissueSamplesInfo$outlier,1:(ncol(tissueSamplesInfo)-1)]) + +}, samplesWithPrediction = samplesWithPrediction) + +samplesWithPredictionNoOutliers <- do.call(rbind, nonOutlierSampleList) +#save(samplesWithPredictionNoOutliers, file = "tissuePredictions/samplesWithPrediction_16_09_22_noOutliers.RData", verbose = T) + +load("tissuePredictions/samplesWithPrediction_16_09_22_noOutliers.RData", verbose = T) + +tissue = "fibroblasts_cell-lines_smooth-muscle-cell_mesenchymal-stem-cells" +tissue = "HUVEC" + +sink <- lapply(tissueClasses, function(tissue, exp){ + + #load(file = paste0("perTissueNormalization/perTissueQq/",make.names(tissue),".RData")) + + tissueSamples <- rownames(samplesWithPredictionNoOutliers)[samplesWithPredictionNoOutliers$predictedTissue == tissue] + tissueExp <- exp[,tissueSamples] + + 
save(tissueExp, file = paste0("perTissueNormalization/globalQqnorm/",make.names(tissue),".RData")) + +}, exp = exp) + +#Create co-expression matrices +sink <- lapply(tissueClasses, function(tissue){ + + load(file = paste0("perTissueNormalization/globalQqnorm/",make.names(tissue),".RData"), verbose = T) + + #https://stackoverflow.com/questions/18964837/fast-correlation-in-r-using-c-and-parallelization/18965892#18965892 + expScale = tissueExp - rowMeans(tissueExp); + # Standardize each variable + expScale = expScale / sqrt(rowSums(expScale^2)); + expCov = tcrossprod(expScale);#equevelent to correlation due to center scale + + write.table(expCov, file = paste0("perTissueNormalization/qqCoExp/",make.names(tissue),".txt"), sep = "\t", quote = F, col.names = NA) + +}) + +sink <- lapply(tissueClasses, function(tissue){ + + #load(file = paste0("perTissueNormalization/perTissueQq/",make.names(tissue),".RData")) + + #tissueSamples <- rownames(samplesWithPredictionNoOutliers)[samplesWithPredictionNoOutliers$predictedTissue == tissue] + #tissueExp <- exp[,tissueSamples] + + load(file = paste0("perTissueNormalization/globalQqnorm/",make.names(tissue),".RData"), verbose = T) + + write.table(tissueExp, file = gzfile("huvec.txt.gz"), sep = "\t", quote = F, col.names = NA) + + + #https://stackoverflow.com/questions/18964837/fast-correlation-in-r-using-c-and-parallelization/18965892#18965892 + expScale = tissueExp - rowMeans(tissueExp); + # Standardize each variable + expScale = expScale / sqrt(rowSums(expScale^2)); + #expCov = tcrossprod(expScale);#equevelent to correlation due to center scale + #expEigen <- eigen(expCov) + #eigenVectors <- expEigen$vectors + #colnames(eigenVectors) <- paste0("PC_",1:ncol(eigenVectors)) + #rownames(eigenVectors) <- rownames(expScale) + + #eigenValues <- expEigen$values + #names(eigenValues) <- paste0("PC_",1:length(eigenValues)) + + #Here calculate sample principle components. 
Number needed is arbritary (no more than eigen vectors) + #expPcs <- t(expScale) %*% expEigen$vectors[,1:10] + #colnames(expPcs) <- paste0("PC_",1:ncol(expPcs)) + + expSvd <- svd(expScale, nu = 50, nv = 50) + + + + + eigenValues <- expSvd$d^2 + eigenVectors <- expSvd$u + colnames(eigenVectors) <- paste0("PC_",1:ncol(eigenVectors)) + rownames(eigenVectors) <- rownames(expScale) + + expPcs <- expSvd$v[,1:50] %*% diag(expSvd$d[1:50]) + colnames(expPcs) <- paste0("PC_",1:ncol(expPcs)) + rownames(expPcs) <- colnames(expScale) + + + explainedVariance <- eigenValues * 100 / nrow(expScale) + + + pcaRes <- list(eigenVectors = eigenVectors, eigenValues = eigenValues, expPcs = expPcs, explainedVariance = explainedVariance) + + save(pcaRes, file = paste0("perTissueNormalization/perTissueQqPcaNoOutliers/",make.names(tissue),".RData")) + + return(expPcs) + +}) + +write.table(pcaRes$eigenVectors, file = "huvecEigenVectors.txt", sep = "\t", quote = F, col.names = NA) +write.table(pcaRes$eigenValues, file = "huvecEigenValues.txt", sep = "\t", quote = F, col.names = NA) +write.table(pcaRes$expPcs, file = "huvecPcs.txt", sep = "\t", quote = F, col.names = NA) + +pcsPerTissue <- lapply(tissueClasses, function(tissue){ + load(file = paste0("perTissueNormalization/perTissueQqPcaNoOutliers/",make.names(tissue),".RData")) + eigenvectors <- pcaRes$eigenVectors + colnames(eigenvectors) <- paste0(tissue,"_",colnames(eigenvectors)) + + return(eigenvectors) +}) +#str(pcsPerTissue) +pcsPerTissue2 <- do.call(cbind, pcsPerTissue) + +str(pcsPerTissue2) + +rownames(pcsPerTissue2) <- (gsub("\\..+", "", rownames(pcsPerTissue2))) +write.table(pcsPerTissue2, file = "perTissueNormalization/perTissueQqPcaNoOutliers/combinedComponents.txt", sep = "\t", quote = FALSE, col.names = NA) + +pcsPerTissue2t <- t(pcsPerTissue2) +pcsPerTissue2Scale = pcsPerTissue2t - rowMeans(pcsPerTissue2t) +# Standardize each variable +pcsPerTissue2Scale = pcsPerTissue2Scale / sqrt(rowSums(pcsPerTissue2Scale^2)) + +pcCorMatrix 
<- pcsPerTissue2Scale %*% t(pcsPerTissue2Scale) + +range(pcCorMatrix) +range(diag(pcCorMatrix)) + +sum(pcCorMatrix[lower.tri(pcCorMatrix)] >= 0.8) + +identicalPerPc <- apply(pcCorMatrix, 2, function(x){sum(x>=0.7)}) +tail(sort(identicalPerPc)) + +hist(pcCorMatrix[,"Brain-Cortex_PC_3"]) +dev.off() + +pcCorMatrix[,"Whole Blood Fetal_PC_1"][pcCorMatrix[,"Whole Blood Fetal_PC_1"] >= 0.7] + + +compEigen <- eigen(pcCorMatrix) +str(compEigen) +sum(compEigen$values) + +(numberOfCompsEigenvalue1 <- sum(as.numeric(compEigen$values) >= 1)) + +str(compEigen) +pcsOfComps <- t(pcsPerTissue2t) %*% compEigen$vectors[,1:numberOfCompsEigenvalue1] +colnames(pcsOfComps) <- paste0("PC_",1:ncol(pcsOfComps)) +rownames(pcsOfComps) <- (gsub("\\..+", "", rownames(pcsOfComps))) +write.table(pcsOfComps, col.names = NA, sep = "\t", quote = F, file = gzfile("perTissueNormalization/perTissueQqPcaNoOutliers/pcaCombinedComponents.txt.gz")) +str(pcsOfComps) + +rpng() +plot(cumsum(as.numeric(compEigen$values) * 100 / sum(as.numeric(compEigen$values)))) +dev.off() + +rpng() +plot(as.numeric(compEigen$values)) +dev.off() + + + + + + + + + + + + + + + + +head(as.numeric(compEigen$values)) + +sum(eigenValues >= 1) + +compSvd <- svd(t(pcsPerTissue2Scale)) +str(compSvd) + +(numberOfCompsEigenvalue1 <- sum(compSvd$d^2>=1)) + +setwd("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/") + +load("problem.RData") + + +combinedCompsPcs <- compSvd$u[,1:numberOfCompsEigenvalue1] %*% diag(compSvd$d[1:numberOfCompsEigenvalue1]) +combinedCompsPcs2 <- compSvd$u[,1:numberOfCompsEigenvalue1] %*% diag(compSvd$d[1:numberOfCompsEigenvalue1]) +combinedCompsPcs3 <- compSvd$u[,1:numberOfCompsEigenvalue1] %*% diag(compSvd$d[1:numberOfCompsEigenvalue1]) +combinedCompsPcs4 <- compSvd$u[,1:numberOfCompsEigenvalue1] %*% diag(compSvd$d[1:numberOfCompsEigenvalue1]) +combinedCompsPcs5 <- compSvd$u[,1:numberOfCompsEigenvalue1] %*% diag(compSvd$d[1:numberOfCompsEigenvalue1]) + +cor.test(combinedCompsPcs[,1], combinedCompsPcs2[,1]) 
+cor.test(combinedCompsPcs[,1], combinedCompsPcs3[,1]) +cor.test(combinedCompsPcs[,1], combinedCompsPcs4[,1]) +cor.test(combinedCompsPcs[,1], combinedCompsPcs5[,1]) + +range(abs(combinedCompsPcs[,1]) - abs(combinedCompsPcs2[,1])) + +plot(combinedCompsPcs[,1], combinedCompsPcs2[,1]) +plot(combinedCompsPcs[,1], combinedCompsPcs3[,1]) +plot(combinedCompsPcs[,1], combinedCompsPcs4[,1]) +plot(combinedCompsPcs[,1], combinedCompsPcs5[,1]) +dev.off() + +save(compSvd, numberOfCompsEigenvalue1, file = "problem.RData") + +str(combinedCompsPcs) +range(combinedCompsPcs) + +plot(as.numeric(teest[,1]),combinedCompsPcs[,1]) +dev.off() + + +head(compSvd$d^2) +head(compSvd$d^2) + +eigenValues <- compSvd$d^2 + +rpng() +plot(cumsum(eigenValues * 100 / sum(eigenValues))) +dev.off() + + + + +rpng() +plot(eigenValues) +dev.off() + diff --git a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/doPcaUsingCorMatrix.R b/Downstreamer/src/main/r/downstreamer_main/recount3/performPcaOnFullRecount3Data.R similarity index 91% rename from Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/doPcaUsingCorMatrix.R rename to Downstreamer/src/main/r/downstreamer_main/recount3/performPcaOnFullRecount3Data.R index 25d056ae6..e312a3287 100644 --- a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/doPcaUsingCorMatrix.R +++ b/Downstreamer/src/main/r/downstreamer_main/recount3/performPcaOnFullRecount3Data.R @@ -18,15 +18,16 @@ rm(table_tmp) str(exp) #save(exp, file = "/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Recount3_QC_2ndRun/Filtered_Matrices/TPM_log2_QNorm_QCed_CovCorrected_AllCovariates.RData") +load(file = "/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Recount3_QC_2ndRun/Filtered_Matrices/TPM_log2_QNorm_QCed_CovCorrected_AllCovariates.RData") +#exp contains expression rows genes cols samples +#First center and scale each row #https://stackoverflow.com/questions/18964837/fast-correlation-in-r-using-c-and-parallelization/18965892#18965892 
expScale = exp - rowMeans(exp); # Standardize each variable expScale = expScale / sqrt(rowSums(expScale^2)); -expCov = tcrossprod(expScale);#equevelent to correlation due to center scale -range(expCov) -str(expCov) +expCov = tcrossprod(expScale);#equivalent to correlation due to center scale expEigen <- eigen(expCov) @@ -41,8 +42,7 @@ str(eigenValues) save(eigenVectors, eigenValues, expFile, file = "/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Recount3_QC_2ndRun/PCA_Patrick/eigen.RData") - - +#Here calculate sample principle components. Number needed is arbritary (no more than eigen vectors) expPcs <- t(expScale) %*% expEigen$vectors[,1:1000] colnames(expPcs) <- paste0("PC_",1:ncol(expPcs)) diff --git a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/recountCancerCellline2.R b/Downstreamer/src/main/r/downstreamer_main/recount3/predictRecount3CancerCellllines.R similarity index 99% rename from Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/recountCancerCellline2.R rename to Downstreamer/src/main/r/downstreamer_main/recount3/predictRecount3CancerCellllines.R index e6020bd90..02791d9a6 100644 --- a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/recountCancerCellline2.R +++ b/Downstreamer/src/main/r/downstreamer_main/recount3/predictRecount3CancerCellllines.R @@ -2,7 +2,7 @@ library(parallel) setwd("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/") -load(file = "combinedMeta_2022_09_02.RData", verbose = T) +load(file = "combinedMeta_2022_09_15.RData", verbose = T) load(file = "Recount3_QC_2ndRun/PCA_Patrick/pcs.RData", verbose = T) load(file = "Recount3_QC_2ndRun/PCA_Patrick/eigen.RData", verbose = T) @@ -20,7 +20,7 @@ pcsAndMeta <- merge(expPcs[,1:compsToUse], combinedMeta, by = 0) rownames(pcsAndMeta) <- pcsAndMeta$Row.names -save(pcsAndMeta, compsToUse, file = "DataForPredictions.RData") +#save(pcsAndMeta, compsToUse, file = "DataForPredictions.RData") dim(pcsAndMeta) pcsAndMeta <- 
pcsAndMeta[!pcsAndMeta$exclude,] diff --git a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/qqNorm.R b/Downstreamer/src/main/r/downstreamer_main/recount3/qqNormOfFullRecount3.R similarity index 100% rename from Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/qqNorm.R rename to Downstreamer/src/main/r/downstreamer_main/recount3/qqNormOfFullRecount3.R diff --git a/Genotype-Harmonizer/pom.xml b/Genotype-Harmonizer/pom.xml index 4414958d1..44e9bf74a 100644 --- a/Genotype-Harmonizer/pom.xml +++ b/Genotype-Harmonizer/pom.xml @@ -7,7 +7,7 @@ 4.0.0 Genotype-Harmonizer - 1.4.23-SNAPSHOT + 1.4.24-SNAPSHOT Genotype Harmonizer jar