diff --git a/.gitignore b/.gitignore
index 1e72b30bf..7668984af 100644
--- a/.gitignore
+++ b/.gitignore
@@ -35,3 +35,4 @@ deconvolutionTestResults/
.Rproj.user
.Rhistory
/DEPICT2/src/main/r/downstreamer_main/downstreamer_main.Rproj
+Downstreamer/src/main/r/downstreamer_main/.remoterserverlog
diff --git a/Downstreamer/pom.xml b/Downstreamer/pom.xml
index ac6432d85..64e8ca675 100644
--- a/Downstreamer/pom.xml
+++ b/Downstreamer/pom.xml
@@ -7,7 +7,7 @@
1.0.4-SNAPSHOT
Downstreamer
- 1.29-SNAPSHOT
+ 1.30-SNAPSHOT
jar
diff --git a/Downstreamer/src/main/r/downstreamer_main/downstreamer_main.Rproj b/Downstreamer/src/main/r/downstreamer_main/downstreamer_main.Rproj
new file mode 100644
index 000000000..8e3c2ebc9
--- /dev/null
+++ b/Downstreamer/src/main/r/downstreamer_main/downstreamer_main.Rproj
@@ -0,0 +1,13 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
diff --git a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/combineAnnotations.R b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/combineAnnotations.R
index 5b3d371fc..892d655c4 100644
--- a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/combineAnnotations.R
+++ b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/combineAnnotations.R
@@ -1564,7 +1564,7 @@ combinedMeta$CelllineName[studySamples] <- ""
combinedMeta$Cancer[studySamples] <- FALSE
#SRP081020 is mis-anotated in SRA as PBMC, paper and clustering both state wholeblood
-studySamples <- combinedMeta$study %in% c("ERP114104", "SRP051848", "SRP056784", "SRP071965", "SRP077975", "SRP081020", "SRP098758", "SRP113245", "SRP126580", "SRP126582", "SRP126583", "SRP136057", "SRP144583", "SRP150872", "SRP214077")
+studySamples <- combinedMeta$study %in% c("ERP114104", "SRP051848", "SRP056784", "SRP071965", "SRP077975", "SRP081020", "SRP098758", "SRP113245", "SRP126580", "SRP126582", "SRP126583", "SRP136057", "SRP144583", "SRP150872", "SRP214077", "SRP056443")
combinedMeta$Tissue[studySamples] <- ""
combinedMeta$Tissue2[studySamples] <- ""
combinedMeta$Cellline[studySamples] <- NA
@@ -1573,6 +1573,9 @@ combinedMeta$Cancer[studySamples] <- NA
+
+
+
combinedMeta$Cellline[!is.na(combinedMeta$CelllineName)&combinedMeta$CelllineName=="iPSC"] <- TRUE
combinedMeta$Cancer[!is.na(combinedMeta$Cellline) & combinedMeta$Cellline] <- NA
@@ -1613,7 +1616,7 @@ combinedMeta$Tissue[combinedMeta$Cohort == "GSA"]
-#save(combinedMeta, file = "combinedMeta_2022_09_02.RData")
+#save(combinedMeta, file = "combinedMeta_2022_09_15.RData")
load(file = "combinedMeta_2022_08_19.RData")
diff --git a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/metaBrain2.R b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/metaBrain2.R
new file mode 100644
index 000000000..8d261ed03
--- /dev/null
+++ b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/metaBrain2.R
@@ -0,0 +1,31 @@
+setwd("D:\\UMCG\\Genetica\\Projects\\Depict2Pgs")
+
+source(paste0("C:\\Users\\patri\\Documents\\GitHub\\systemsgenetics\\Downstreamer\\src\\main\\r\\downstreamer_main/downstreamer_functions.r"))
+
+traits <- read.delim("MetaBrain/traits.txt")
+
+
+i <- 1
+
+pdf("MetaBrain/withAndWithoutEqtls.pdf", height = 20, width = 10)
+#png("MetaBrain/withAndWithoutEqtls.png", height = 2000, width = 1000)
+layout(matrix(1:8, ncol =2))
+par(pty="s")
+for(i in seq_len(nrow(traits))){
+  # One panel per trait: key gene Z-scores from the run without eQTL information (x-axis)
+  # against the run that includes eQTL information (y-axis).
+
+  trait <- traits[i, "trait"]
+  name <- traits[i, "name"]
+  # read.depict2 is presumably provided by the sourced downstreamer_functions.r; only its GenePrioritization_MetaBrain element is used.
+  enrichments <- read.depict2(paste0("MetaBrain/normal/",trait,"_enrichtments.xlsx"))$GenePrioritization_MetaBrain
+  enrichmentsIncEqtl <- read.depict2(paste0("MetaBrain/inceqt/",trait,"_enrichtments.xlsx"))$GenePrioritization_MetaBrain
+  # Keep only genes present in both runs; the suffixes tell the two sets of columns apart.
+  enrichmentsBoth <- merge(enrichments, enrichmentsIncEqtl, "Gene.ID" , suffixes= c("Normal", "incEqtl"))
+  # Symmetric axis limit based on the largest absolute Z-score, so strongly negative scores are not cut off.
+  maxZ <- max(abs(range(enrichmentsBoth$Enrichment.Z.scoreNormal, enrichmentsBoth$Enrichment.Z.scoreincEqtl)))
+  r <- cor(enrichmentsBoth$Enrichment.Z.scoreNormal, enrichmentsBoth$Enrichment.Z.scoreincEqtl)
+  plot(enrichmentsBoth$Enrichment.Z.scoreNormal, enrichmentsBoth$Enrichment.Z.scoreincEqtl, bg = adjustcolor("dodgerblue2", alpha.f = 0.3), pch = 21, col=adjustcolor("dodgerblue2", alpha.f = 0.5), asp = 1, xlab = "Key gene score without eqtl information", ylab = "Key gene score with eqtl information", xlim = c(-maxZ,maxZ), ylim = c(-maxZ,maxZ), main = name)
+  mtext(paste0("Pearson r: ", signif(r,2)))
+}
+dev.off()
diff --git a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/playingWithPcaSvd.R b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/playingWithPcaSvd.R
index f12897f41..ebff773a5 100644
--- a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/playingWithPcaSvd.R
+++ b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/playingWithPcaSvd.R
@@ -270,12 +270,6 @@ plot(log(expSvd$d))
abline(v=60)
-library(rpca)
-
-
-expRpca <- rpca(t(expSub2))
-
-
library(corpcor)
expSvdFast <- fast.svd(expSubScale)
diff --git a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/umap.R b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/umap.R
deleted file mode 100644
index 850281aed..000000000
--- a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/umap.R
+++ /dev/null
@@ -1,166 +0,0 @@
-#srun --cpus-per-task=1 --mem=50gb --nodes=1 --qos=priority --time=168:00:00 --pty bash -i
-#remoter::server(verbose = T, port = 55556, password = "laberkak", sync = T)
-
-
-remoter::client("localhost", port = 55501, password = "laberkak")
-
-
-
-library(uwot)
-
-setwd("D:\\UMCG\\Genetica\\Projects\\Depict2Pgs\\Recount3\\")
-setwd("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/")
-
-tissueCol <- read.delim("umap/col.txt", row.names = 1, na.strings = "")
-
-load(file = "DataForPredictions.RData")
-
-#load(file = "combinedMeta_2022_08_30.RData", verbose = T)
-#str(combinedMeta)
-#updatedAnnotations <- combinedMeta[,c("Tissue", "Tissue2", "Cellline", "CelllineName", "Cancer", "Cohort", "Fetal")]
-
-#all(rownames(pcsAndMeta) %in% rownames(updatedAnnotations))
-#updatedAnnotations <- updatedAnnotations[rownames(pcsAndMeta),]
-#all(rownames(pcsAndMeta) == rownames(updatedAnnotations))
-
-#pcsAndMeta[,colnames(updatedAnnotations)] <- updatedAnnotations
-
-#pcsAndMeta$selectedSamples <- !pcsAndMeta$excludeBasedOnPredictionCellline2 & !pcsAndMeta$excludeBasedOnPredictionCancer & !(!is.na(pcsAndMeta$Cancer) & pcsAndMeta$Cancer) & !(!is.na(pcsAndMeta$Cellline) & pcsAndMeta$Cellline)
-
-table(pcsAndMeta$selectedSamples, useNA = "a")
-
-
-clusterAnnotations <- read.delim("umap/annotationsBasedOnOldUmap.txt", row.names = 1)
-pcsAndMeta <- merge(pcsAndMeta, clusterAnnotations, by = 0, all.x = T)
-rownames(pcsAndMeta) <- pcsAndMeta$Row.names
-table(pcsAndMeta$ClusterAnnotation)
-
-
-
-
-#pcsAndMeta[!is.na(pcsAndMeta$study) & (pcsAndMeta$study== "ERP104864") & (grepl("synovium", pcsAndMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= ""
-
-
-tissueSamples <- pcsAndMeta[pcsAndMeta$selectedSamples,]
-
-tissueSamples$class <- tissueSamples$Tissue
-
-hasT2 <- tissueSamples$Tissue2 != ""
-tissueSamples$class[hasT2] <- paste0(tissueSamples$class[hasT2], "-", tissueSamples$Tissue2[hasT2])
-
-isFetal <- !is.na(tissueSamples$Fetal) & tissueSamples$Fetal
-tissueSamples$class[isFetal] <- paste0(tissueSamples$class[isFetal], "-Fetal")
-
-noTbutCluster <- tissueSamples$class == "" & !is.na(tissueSamples$ClusterAnnotation)
-table(noTbutCluster, useNA = "a")
-tissueSamples$class[noTbutCluster] <- tissueSamples$ClusterAnnotation[noTbutCluster]
-
-table(tissueSamples$class)
-write.table(table(tissueSamples$class, useNA = "always"), file = "umap/tissues.txt", sep = "\t", quote = F, row.names = F)
-
-str(tissueSamples)
-
-
-
-mapping <- read.delim("umap/tissuesMapping.txt")
-str(mapping)
-
-all(tissueSamples$class %in% mapping$Class)
-
-
-tissueSamples$umapFactor <- as.factor(mapping$ClassificationClass[match(tissueSamples$class, mapping$Class)])
-
-table(tissueSamples$umapFactor, useNA = "always")
-
-
-defaultCol <- adjustcolor("grey", alpha.f = 0.6)
-tissueCol <- read.delim("umap/col.txt", row.names = 1)
-
-
-tissueSamples$TissueCol <- defaultCol
-sum(unique(tissueSamples$umapFactor) %in% rownames(tissueCol))
-sum(tissueSamples$umapFactor %in% rownames(tissueCol))
-tissueSamples$TissueCol[tissueSamples$umapFactor %in% rownames(tissueCol)] <- adjustcolor(tissueCol[as.character(tissueSamples$umapFactor[tissueSamples$umapFactor %in% rownames(tissueCol)]),1], alpha.f = 0.5)
-#tissueSamples$TissueCol[tissueSamples$umapFactor %in% rownames(tissueCol)] <- tissueCol[as.character(tissueSamples$umapFactor[tissueSamples$umapFactor %in% rownames(tissueCol)]),1]
-table(tissueSamples$TissueCol, useNA = "a")
-
-tissueSamples$plotOrderTissues <- order(tissueSamples$TissueCol != defaultCol)
-
-
-#, n_threads = 22
-
-compsToUseForUmap <- compsToUse
-init <- as.matrix(tissueSamples[,paste0("PC_",1:2)])
-umapInput <- as.matrix(tissueSamples[,paste0("PC_",1:compsToUseForUmap)])
-
-sampleUmap <- umap(
- umapInput,
- n_epochs = 1000,
- init = init,
- n_neighbors = 500,
- min_dist = 2, init_sdev = 1e-4, learning_rate = 1,
- spread = 15,
- bandwidth = 10,
- scale = "scale",
- local_connectivity = 1,
- metric = "correlation")
-
-
-rownames(sampleUmap) <- rownames(tissueSamples)
-colnames(sampleUmap) <- c("UMAP1", "UMAP2")
-umapAndMeta <- merge(sampleUmap, tissueSamples, by = 0)
-dim(umapAndMeta)
-
-
-
-
-
-rpng()
-
-par(mar = c(3,5,0.1,0.1), xpd = NA)
-plot(umapAndMeta[umapAndMeta$plotOrderTissues,"UMAP1"], umapAndMeta[umapAndMeta$plotOrderTissues,"UMAP2"], col = umapAndMeta$TissueCol[umapAndMeta$plotOrderTissues], cex = 0.2, pch = 16)
-
-dev.off()
-
-
-locator(n =2, type = "l")
-
-
-
-
-write.table(umapAndMeta,file = "umaptest.txt", sep = "\t", quote = F, col.names = NA)
-
-#save.image( file="umap_tmp.RData")
-#load("umap_tmp.RData")
-
-rpng()
-
-par(mar = c(3,5,0.1,0.1), xpd = NA)
-plot(umapAndMeta[plotOrder,"UMAP1"], umapAndMeta[plotOrder,"UMAP2"], col = umapAndMeta$TissueCol[plotOrder], cex = 0.8, pch = 16, xlim = c(-25,25), ylim = c(-25,25))
-
-dev.off()
-
-
-
-#png(file = "umaptest.png", width = 1600, height = 800)
-
-pdf(file = "umaptest.pdf", width = 16, height = 8)
-#rpng()
-
-layout(matrix(1:2,ncol = 2))
-
-par(mar = c(3,5,0.1,0.1), xpd = NA)
-plot(umapAndMeta[plotOrderTissues,"UMAP1"], umapAndMeta[plotOrderTissues,"UMAP2"], col = umapAndMeta$TissueCol[plotOrderTissues], cex = 0.4, pch = 16)
-
-par(mar = c(0,0,0,0), xpd = NA)
-plot.new()
-plot.window(xlim = 0:1, ylim = 0:1)
-legend("center", fill = tissueCol[,1], legend = row.names(tissueCol), bty = "n", ncol = 2,cex = 0.7)
-
-
-dev.off()
-
-
-
-
-
diff --git a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/smartSeqTest.R b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/smartSeqTest.R
new file mode 100644
index 000000000..f56829394
--- /dev/null
+++ b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/smartSeqTest.R
@@ -0,0 +1,167 @@
+smartSeq <- read.delim("smartseqSamples.txt")[,1]
+str(smartSeq)
+
+
+defaultCol <- adjustcolor("grey", alpha.f = 0.3)
+pcsAndMeta$colSmartseq <- defaultCol
+pcsAndMeta$colSmartseq[pcsAndMeta$Row.names %in% smartSeq] <- "darkslategray"
+pcsAndMeta$colSmartseq[pcsAndMeta$Row.names %in% qseq] <- "orangered"
+pcsAndMeta$colSmartseq[outliersPc1 == "TRUE" ] <- "darkblue"
+plotOrderSmartseq <- order((pcsAndMeta$colSmartseq != defaultCol) + 1)
+
+rpng()
+plot(pcsAndMeta[plotOrderSmartseq,"PC_1"], pcsAndMeta[plotOrderSmartseq,"PC_2"], col = pcsAndMeta$colSmartseq[plotOrderSmartseq], cex = 0.4, main = "Smartseq")
+dev.off()
+
+rpng()
+plot(pcsAndMeta[plotOrderSmartseq,"PC_1"], pcsAndMeta[plotOrderSmartseq,"PC_6"], col = pcsAndMeta$colSmartseq[plotOrderSmartseq], cex = 0.4, main = "Quantseq and Smartseq", pch =16)
+
+library(pROC)
+
+smartSeqClass <- as.factor(pcsAndMeta$Row.names %in% smartSeq)
+table(smartSeqClass)
+dim(pcsAndMeta)
+smartSeqAuc <- apply(pcsAndMeta[,2:100],2,function(x){
+ tryCatch(
+ {
+ #wilcox.test(x ~ smartSeqClass)$p.value
+ as.numeric(auc(response = smartSeqClass, predictor = x))
+ },
+ error=function(cond){return(1)}
+ )
+})
+sort(smartSeqAuc)
+str(pcsAndMeta[,2])
+boxplot(pcsAndMeta[,2]~smartSeqClass)
+
+boxplot(log10(pcsAndMeta[,"sra.sample_spots"]) ~ smartSeqClass )
+
+sum(pcsAndMeta[,"sra.sample_spots"] < 10000000, na.rm = T)
+
+
+library(vioplot)
+vioplot(log10(pcsAndMeta[,"sra.sample_spots"]) ~ smartSeqClass)
+
+
+plot(pcsAndMeta[plotOrderSmartseq,"PC_6"], log10(pcsAndMeta[plotOrderSmartseq,"recount_qc.star.number_of_splices:_total"]), col = pcsAndMeta$colSmartseq[plotOrderSmartseq], cex = 0.6)
+
+
+
+
+defaultCol <- adjustcolor("grey", alpha.f = 0.3)
+sum(!is.na(pcsAndMeta[,"recount_qc.star.number_of_splices:_total"]) & pcsAndMeta[,"recount_qc.star.number_of_splices:_total"] < 150000)
+
+pcsAndMeta$colSmartseq <- defaultCol
+pcsAndMeta$colSmartseq[!is.na(pcsAndMeta[,"recount_qc.star.number_of_splices:_total"]) & pcsAndMeta[,"recount_qc.star.number_of_splices:_total"] < 150000] <- "aquamarine2"
+table(pcsAndMeta$colSmartseq)
+plotOrderSmartseq <- order((pcsAndMeta$colSmartseq != defaultCol) + 1)
+
+plot(pcsAndMeta[plotOrderSmartseq,"PC_6"], pcsAndMeta[plotOrderSmartseq,"PC_1"], col = pcsAndMeta$colSmartseq[plotOrderSmartseq], cex = 0.4, main = "recount_qc.star.number_of_splices:_total < 150000")
+
+
+
+defaultCol <- adjustcolor("grey", alpha.f = 0.5)
+pcsAndMeta$colSmartseq <- defaultCol
+pcsAndMeta$colSmartseq[!is.na(pcsAndMeta[,"recount_seq_qc.%a"]) & pcsAndMeta[,"recount_seq_qc.%a"] < 20] <- "darkslateblue" # highlight samples with a non-missing recount_seq_qc.%a below 20
+table(pcsAndMeta$colSmartseq)
+plotOrderSmartseq <- order((pcsAndMeta$colSmartseq != defaultCol) + 1) # default-coloured samples first, so highlighted ones are drawn on top
+# The title names the column that is actually thresholded above (recount_seq_qc.%a).
+plot(pcsAndMeta[plotOrderSmartseq,"PC_6"], pcsAndMeta[plotOrderSmartseq,"PC_1"], col = pcsAndMeta$colSmartseq[plotOrderSmartseq], cex = 0.6, main = "recount_seq_qc.%a < 20")
+
+
+
+
+
+
+pcNoCenter <- read.delim("Components.txt", sep = ",", row.names = 1)
+pcNoCenter <- merge(pcNoCenter, combinedMeta, all.x = T, by = 0)
+rownames(pcNoCenter) <- pcNoCenter$Row.names
+
+defaultCol <- adjustcolor("grey", alpha.f = 0.3)
+pcNoCenter$colSmartseq <- defaultCol
+pcNoCenter$colSmartseq[rownames(pcNoCenter) %in% smartSeq] <- "darkslategray"
+table(pcNoCenter$colSmartseq)
+plotOrderSmartseq <- order((pcNoCenter$colSmartseq != defaultCol) + 1)
+
+plot(pcNoCenter[plotOrderSmartseq,"X0"], pcNoCenter[plotOrderSmartseq,"X1"], col = pcNoCenter$colSmartseq[plotOrderSmartseq], cex = 0.4, main = "Smartseq")
+
+
+plot(pcNoCenter[plotOrderSmartseq,"X0"], pcNoCenter[plotOrderSmartseq,"X1"], col = adjustcolor("grey", alpha.f = 0.2), cex = 0.3)
+
+
+
+
+
+
+
+
+
+defaultCol <- adjustcolor("grey", alpha.f = 0.6)
+pcNoCenter$col <- defaultCol
+
+tissueAndCol <- tolower(pcNoCenter[,"Tissue"]) %in% tolower(tissueCol$PlotClass)
+
+pcNoCenter$col[tissueAndCol] <- tissueCol$col[match(tolower(pcNoCenter[tissueAndCol,"Tissue"]), tolower(tissueCol$PlotClass))]
+
+
+tissue2AndCol <- tolower(pcNoCenter[,"Tissue2"]) %in% tolower(tissueCol$PlotClass)
+sum(tissue2AndCol)
+pcNoCenter$col[tissue2AndCol] <- tissueCol$col[match(tolower(pcNoCenter[tissue2AndCol,"Tissue2"]), tolower(tissueCol$PlotClass))]
+
+
+
+sum(is.na(tolower(pcNoCenter[,"Tissue"]) %in% tolower(tissueCol$PlotClass)))
+
+#pcNoCenter$col <- tissueCol$col[match(tolower(pcNoCenter[,"Tissue"]), tolower(tissueCol$PlotClass), nomatch = nrow(tissueCol))]
+
+plotOrder <- order((pcNoCenter$col != defaultCol) + 1)
+
+plot(pcNoCenter[plotOrder,"X0"], pcNoCenter[plotOrder,"X1"], col = pcNoCenter$col[plotOrder], cex = 0.4)
+
+
+pcNoCenter$gtexCol <- defaultCol
+pcNoCenter$gtexCol[pcNoCenter$Cohort == "GTEx" ] <- "goldenrod3"
+pcNoCenter$gtexCol[pcNoCenter$Cohort == "TCGA" ] <- "cyan1"
+
+plotOrder <- order((pcNoCenter$gtexCol != defaultCol) + 1)
+plot(pcNoCenter[plotOrder,"X0"], pcNoCenter[plotOrder,"X1"], col = pcNoCenter$gtexCol[plotOrder], cex = 0.4)
+
+
+
+
+toExclude <-
+ (!is.na(pcsAndMeta[,"recount_qc.star.number_of_splices:_total"]) & pcsAndMeta[,"recount_qc.star.number_of_splices:_total"] < 15000) |
+ (!is.na(pcsAndMeta[,"recount_seq_qc.%a"]) & pcsAndMeta[,"recount_seq_qc.%a"] < 20) |
+ (!is.na(pcsAndMeta[,"recount_seq_qc.%t"]) & pcsAndMeta[,"recount_seq_qc.%t"] < 20) |
+ (!is.na(pcsAndMeta[,"recount_seq_qc.%c"]) & pcsAndMeta[,"recount_seq_qc.%c"] < 20) |
+ (!is.na(pcsAndMeta[,"recount_seq_qc.%g"]) & pcsAndMeta[,"recount_seq_qc.%g"] < 20)
+sum(toExclude)
+
+
+samplesToKeep <- pcsAndMeta$Row.names[!toExclude]
+length(samplesToKeep) + sum(toExclude) == nrow(pcs)
+
+write.table(samplesToKeep, file = "samplesToKeep.txt", row.names = F, quote = F)
+
+
+boxplot(pcsAndMeta[,"PC_1"])
+
+outliersPc1 <- as.factor(pcsAndMeta[,"PC_2"] >= 120)
+table(outliersPc1)
+library(pROC)
+outlierAuc <- sapply(colnames(pcsAndMeta),function(x){
+ tryCatch(
+ {
+ #wilcox.test(x ~ smartSeqClass)$p.value
+ as.numeric(auc(response = outliersPc1, predictor = pcsAndMeta[,x]))
+ },
+ error=function(cond){return(NA)}
+ )
+ })
+sort(outlierAuc)
+
+
+
+
+
+auc(response = outliersPc1, predictor = pcsAndMeta[,"PC_2"])
diff --git a/Downstreamer/src/main/r/downstreamer_main/umap/downstreamer_umap.rmd b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/umap/downstreamer_umap.rmd
similarity index 100%
rename from Downstreamer/src/main/r/downstreamer_main/umap/downstreamer_umap.rmd
rename to Downstreamer/src/main/r/downstreamer_main/legacy_scripts/umap/downstreamer_umap.rmd
diff --git a/Downstreamer/src/main/r/downstreamer_main/umap/pathwayUmap.R b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/umap/pathwayUmap.R
similarity index 100%
rename from Downstreamer/src/main/r/downstreamer_main/umap/pathwayUmap.R
rename to Downstreamer/src/main/r/downstreamer_main/legacy_scripts/umap/pathwayUmap.R
diff --git a/Downstreamer/src/main/r/downstreamer_main/umap/sampleUmap.R b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/umap/sampleUmap.R
similarity index 100%
rename from Downstreamer/src/main/r/downstreamer_main/umap/sampleUmap.R
rename to Downstreamer/src/main/r/downstreamer_main/legacy_scripts/umap/sampleUmap.R
diff --git a/Downstreamer/src/main/r/downstreamer_main/umap/tryUmap.R b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/umap/tryUmap.R
similarity index 100%
rename from Downstreamer/src/main/r/downstreamer_main/umap/tryUmap.R
rename to Downstreamer/src/main/r/downstreamer_main/legacy_scripts/umap/tryUmap.R
diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/README.md b/Downstreamer/src/main/r/downstreamer_main/recount3/README.md
new file mode 100644
index 000000000..5edb92e21
--- /dev/null
+++ b/Downstreamer/src/main/r/downstreamer_main/recount3/README.md
@@ -0,0 +1,20 @@
+# Processing of recount3 data
+
+## Downloading and extracting the expression values and meta data
+
+
+
+## Harmonize and expand the annotations
+
+In `harmonizeAndExtentSampleAnnotations.R` we expand and map the sample annotations.
+
+For a large part the annotations were added manually.
+
+## QQ normalization of the data
+
+
+## PCA on the co-expression matrix
+
+
+## Excluding the cell line and cancer samples
+
diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/createUmapOfRecount3.R b/Downstreamer/src/main/r/downstreamer_main/recount3/createUmapOfRecount3.R
new file mode 100644
index 000000000..cf4b191d4
--- /dev/null
+++ b/Downstreamer/src/main/r/downstreamer_main/recount3/createUmapOfRecount3.R
@@ -0,0 +1,631 @@
+#srun --cpus-per-task=1 --mem=50gb --nodes=1 --qos=priority --time=168:00:00 --pty bash -i
+#remoter::server(verbose = T, port = 55556, password = "laberkak", sync = T)
+
+
+remoter::client("localhost", port = 55501, password = "laberkak")
+
+
+
+library(uwot)
+
+setwd("D:\\UMCG\\Genetica\\Projects\\Depict2Pgs\\Recount3\\")
+setwd("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/")
+
+
+load(file = "DataForPredictions.RData")
+rownames(pcsAndMeta) <- pcsAndMeta$Row.names
+load(file = "tissuePredictions/samplesWithPrediction_16_09_22.RData", verbose = T)
+tissueCol <- read.delim("umap/col.txt", row.names = 1, na.strings = "")
+
+
+colnamesToUpdate <- colnames(pcsAndMeta)[colnames(pcsAndMeta) %in% colnames(combinedMeta)]
+all(rownames(pcsAndMeta) %in% rownames(combinedMeta))
+pcsAndMeta[,colnamesToUpdate] <- combinedMeta[rownames(pcsAndMeta),colnamesToUpdate]
+
+
+table(pcsAndMeta$selectedSamples, useNA = "a")
+
+
+clusterAnnotations <- read.delim("umap/annotationsBasedOnOldUmap.txt", row.names = 1)
+samplesWithClusterAnnotation <- rownames(pcsAndMeta)[rownames(pcsAndMeta) %in% rownames(clusterAnnotations)]
+
+pcsAndMeta$ClusterAnnotation <- NA
+pcsAndMeta[samplesWithClusterAnnotation, "ClusterAnnotation"] <- clusterAnnotations[samplesWithClusterAnnotation,"ClusterAnnotation"]
+table(pcsAndMeta$ClusterAnnotation, useNA = "a")
+
+tissueSamples <- pcsAndMeta[pcsAndMeta$selectedSamples,]
+
+tissueSamples$class <- tissueSamples$Tissue
+
+
+hasT2 <- tissueSamples$Tissue2 != ""
+tissueSamples$class[hasT2] <- paste0(tissueSamples$Tissue[hasT2], "-", tissueSamples$Tissue2[hasT2])
+table(tissueSamples$class)
+isFetal <- !is.na(tissueSamples$Fetal) & tissueSamples$Fetal
+tissueSamples$class[isFetal] <- paste0(tissueSamples$class[isFetal], "-Fetal")
+
+noTbutCluster <- tissueSamples$class == "" & !is.na(tissueSamples$ClusterAnnotation)
+table(noTbutCluster, useNA = "a")
+tissueSamples$class[noTbutCluster] <- tissueSamples$ClusterAnnotation[noTbutCluster]
+
+table(tissueSamples$class)
+write.table(table(tissueSamples$class, useNA = "always"), file = "umap/tissues.txt", sep = "\t", quote = F, row.names = F)
+
+str(tissueSamples)
+
+
+
+mapping <- read.delim("umap/tissuesMapping.txt")
+str(mapping)
+
+all(tissueSamples$class %in% mapping$Class)
+tissueSamples$class[!tissueSamples$class %in% mapping$Class]
+
+tissueSamples$umapFactor <- as.factor(mapping$ClassificationClass[match(tissueSamples$class, mapping$Class)])
+
+table(tissueSamples$umapFactor, useNA = "always")
+
+
+defaultCol <- adjustcolor("grey", alpha.f = 0.6)
+tissueCol <- read.delim("umap/col.txt", row.names = 1)
+
+
+tissueSamples$TissueCol <- defaultCol
+sum(unique(tissueSamples$umapFactor) %in% rownames(tissueCol))
+sum(tissueSamples$umapFactor %in% rownames(tissueCol))
+tissueSamples$TissueCol[tissueSamples$umapFactor %in% rownames(tissueCol)] <- adjustcolor(tissueCol[as.character(tissueSamples$umapFactor[tissueSamples$umapFactor %in% rownames(tissueCol)]),1], alpha.f = 0.5)
+#tissueSamples$TissueCol[tissueSamples$umapFactor %in% rownames(tissueCol)] <- tissueCol[as.character(tissueSamples$umapFactor[tissueSamples$umapFactor %in% rownames(tissueCol)]),1]
+table(tissueSamples$TissueCol, useNA = "a")
+
+tissueSamples$plotOrderTissues <- order(tissueSamples$TissueCol != defaultCol)
+
+
+#, n_threads = 22
+
+compsToUseForUmap <- compsToUse
+init <- as.matrix(tissueSamples[,paste0("PC_",1:2)])
+umapInput <- as.matrix(tissueSamples[,paste0("PC_",1:compsToUseForUmap)])
+
+sampleUmap <- umap(
+ umapInput,
+ n_epochs = 1000,
+ init = init,
+ n_neighbors = 500,
+ min_dist = 1, init_sdev = 1e-4, learning_rate = 2,
+ spread = 20,
+ bandwidth = 10,
+ scale = "scale",
+ local_connectivity = 10,
+ repulsion_strength = 0.5,
+ metric = "correlation")
+
+
+rownames(sampleUmap) <- rownames(tissueSamples)
+colnames(sampleUmap) <- c("UMAP1", "UMAP2")
+save(sampleUmap, file = "umap/sampleUmap6.RData")
+
+#load(file = "umap/sampleUmap3.RData")
+
+
+
+
+umapAndMeta <- merge(sampleUmap, tissueSamples, by = 0)
+rownames(umapAndMeta) <- umapAndMeta$Row.names
+dim(umapAndMeta)
+
+
+
+
+rpng()
+
+par(mar = c(3,5,0.1,0.1), xpd = NA)
+plot(umapAndMeta[umapAndMeta$plotOrderTissues,"UMAP1"], umapAndMeta[umapAndMeta$plotOrderTissues,"UMAP2"], col = umapAndMeta$TissueCol[umapAndMeta$plotOrderTissues], cex = 0.2, pch = 16)
+
+dev.off()
+
+plot(umapAndMeta[umapAndMeta$plotOrderTissues,"UMAP1"], umapAndMeta[umapAndMeta$plotOrderTissues,"UMAP2"], col = umapAndMeta$TissueCol[umapAndMeta$plotOrderTissues], cex = 0.2, pch = 16, xlim = c(-100,100), ylim = c(-100,100))
+
+
+
+locator(n =2, type = "l")
+cluster1 <- locator(n =2, type = "l")
+cluster2 <- locator(n =2, type = "l")
+
+
+write.table(umapAndMeta[,!grepl("PC_",colnames(umapAndMeta))],file = "umaptest.txt", sep = "\t", quote = F, col.names = NA)
+#save(umapAndMeta, file = "umaptest.RData")
+#load("umaptest.RData")
+
+#save.image( file="umap_tmp.RData")
+#load("umap_tmp.RData")
+
+rpng()
+# Zoomed-in view (both axes limited to -25..25); plotOrder is never assigned in this script, so use the tissue plot order stored on umapAndMeta like the other UMAP plots here.
+par(mar = c(3,5,0.1,0.1), xpd = NA)
+plot(umapAndMeta[umapAndMeta$plotOrderTissues,"UMAP1"], umapAndMeta[umapAndMeta$plotOrderTissues,"UMAP2"], col = umapAndMeta$TissueCol[umapAndMeta$plotOrderTissues], cex = 0.8, pch = 16, xlim = c(-25,25), ylim = c(-25,25))
+
+dev.off()
+
+
+
+#png(file = "umaptest.png", width = 1600, height = 800)
+
+pdf(file = "umaptest.pdf", width = 16, height = 8)
+#rpng()
+
+layout(matrix(1:2,ncol = 2))
+
+par(mar = c(3,5,0.1,0.1), xpd = NA)
+plot(umapAndMeta[umapAndMeta$plotOrderTissues,"UMAP1"], umapAndMeta[umapAndMeta$plotOrderTissues,"UMAP2"], col = umapAndMeta$TissueCol[umapAndMeta$plotOrderTissues], cex = 0.2, pch = 16)
+
+par(mar = c(0,0,0,0), xpd = NA)
+plot.new()
+plot.window(xlim = 0:1, ylim = 0:1)
+legend("center", fill = tissueCol[,1], legend = row.names(tissueCol), bty = "n", ncol = 2,cex = 0.7)
+
+
+dev.off()
+
+
+
+
+
+
+
+#smartseq plots
+
+someSmartSeqStudies <- read.delim("selectionSmartseqStudies.txt", header = F)[,1]
+str(someSmartSeqStudies)
+
+someSmartSeqSamples <- read.delim("smartseqSamples.txt", header = T)[,1]
+str(someSmartSeqSamples)
+
+umapAndMeta$smartseqcol <- defaultCol
+umapAndMeta$smartseqcol[umapAndMeta$study %in% someSmartSeqStudies] <- "pink"
+umapAndMeta$smartseqcol[umapAndMeta$Row.names %in% someSmartSeqSamples] <- "pink"
+
+umapAndMeta$plotOrdersq <- order(umapAndMeta$smartseqcol != defaultCol)
+
+# Draw with the smartseq ordering computed above so the pink (smartseq) points end up on top of the default-coloured ones.
+par(mar = c(3,5,0.1,0.1), xpd = NA)
+plot(umapAndMeta[umapAndMeta$plotOrdersq,"UMAP1"], umapAndMeta[umapAndMeta$plotOrdersq,"UMAP2"], col = umapAndMeta$smartseqcol[umapAndMeta$plotOrdersq], cex = 0.2, pch = 16)
+
+
+#instestine clusters
+
+umapAndMeta$intestineCluster <- ""
+umapAndMeta$intestineCluster[umapAndMeta$UMAP1 >= cluster1$x[1] & umapAndMeta$UMAP1 <= cluster1$x[2] & umapAndMeta$UMAP2 >= cluster1$y[1] & umapAndMeta$UMAP2 <= cluster1$y[2]] <- "c1"
+umapAndMeta$intestineCluster[umapAndMeta$UMAP1 >= cluster2$x[1] & umapAndMeta$UMAP1 <= cluster2$x[2] & umapAndMeta$UMAP2 >= cluster2$y[1] & umapAndMeta$UMAP2 <= cluster2$y[2]] <- "c2"
+table(umapAndMeta$intestineCluster)
+
+table(factor(umapAndMeta$umapFactor[umapAndMeta$intestineCluster=="c1"]))
+table(factor(umapAndMeta$umapFactor[umapAndMeta$intestineCluster=="c2"]))
+
+table(factor(umapAndMeta$class[umapAndMeta$intestineCluster=="c1"]))
+table(factor(umapAndMeta$class[umapAndMeta$intestineCluster=="c2"]))
+
+a <- as.data.frame(table(paste(umapAndMeta$Cohort, umapAndMeta$class)[umapAndMeta$intestineCluster=="c1"]))
+b <- as.data.frame(table(paste(umapAndMeta$Cohort, umapAndMeta$class)[umapAndMeta$intestineCluster=="c2"]))
+
+
+table(paste(umapAndMeta$Cohort, umapAndMeta$class)[umapAndMeta$intestineCluster!=""], umapAndMeta$intestineCluster[umapAndMeta$intestineCluster!=""])
+
+str(a)
+c <- merge(a, b, by = "Var1", all = TRUE, suffixes = c(".c1", ".c2")) # join on the "cohort class" label (Var1); joining on row names (by = 0) would pair unrelated labels of the two tables
+c
+
+load("metadata_gtex.Rda", verbose = T)
+View(metadata_gtex)
+
+
+gtexTansverse <- umapAndMeta[umapAndMeta$study == "GTEx" & umapAndMeta$Tissue2 == "Transverse" & umapAndMeta$intestineCluster != "",]
+
+rownames(gtexTansverse) <- gtexTansverse$Row.names
+
+rownames(metadata_gtex) <- metadata_gtex$external_id
+
+dim(gtexTansverse)
+gtexTansverse <- merge(gtexTansverse, metadata_gtex[,!colnames(metadata_gtex) %in% colnames(gtexTansverse)], by = 0)
+dim(gtexTansverse)
+
+table(gtexTansverse$gtex.smatsscr, gtexTansverse$intestineCluster)
+
+fisher.test(table(gtexTansverse$gtex.smatsscr, gtexTansverse$intestineCluster))
+grep("MHBCTINF", colnames(gtexTansverse), ignore.case = T)
+
+
+
+
+numCols <- colnames(gtexTansverse)[unlist(lapply(gtexTansverse, is.numeric)) ]
+
+colName <- "sra.paired_nominal_length"
+clusterCompare <- sapply(numCols, function(colName){
+ #print(colName)
+ if(!all(is.na(gtexTansverse[,colName])) & sd(gtexTansverse[,colName], na.rm =T) > 0 ){
+ t.test(gtexTansverse[,colName] ~ gtexTansverse$intestineCluster)$p.value
+ }
+
+})
+clusterCompare <- unlist(clusterCompare)
+clusterCompare2 <- clusterCompare[grep("PC_", names(clusterCompare), invert = T)]
+sort(clusterCompare2, decreasing = T)
+boxplot(gtexTansverse$`recount_qc.aligned_reads%.chrx` ~ gtexTansverse$intestineCluster)
+boxplot(gtexTansverse$`recount_qc.aligned_reads%.chrx` ~ paste0(gtexTansverse$intestineCluster, "_",gtexTansverse$gtex.sex))
+boxplot(gtexTansverse$`recount_qc.aligned_reads%.chrm` ~ gtexTansverse$intestineCluster)
+#boxplot(gtexTansverse$`` ~ gtexTansverse$intestineCluster) # disabled: an empty backtick name is a parse error ("attempt to use zero-length variable name") that prevents the file from being sourced; fill in a column name to re-enable
+
+boxplot(gtexTansverse$`recount_qc.star.number_of_reads_unmapped:_other_both` ~ gtexTansverse$intestineCluster)
+
+boxplot(gtexTansverse$`gtex.smtsisch` ~ gtexTansverse$intestineCluster)
+boxplot(gtexTansverse$`CnvAutoCor` ~ gtexTansverse$intestineCluster)
+
+#save(gtexTansverse, file = "gtexTansverse.RData")
+load("gtexTansverse.RData")
+
+
+str(row.names(gtexTansverse))
+str(gtexTansverse$Row.names)
+str(exp)
+expgT <- exp[,gtexTansverse$Row.names]
+save(expgT, file = "expgT.RData")
+load( "expgT.RData")
+
+
+colnames(expgT)
+expgT <- t(expgT)
+all(rownames(expgT) == gtexTansverse$Row.names)
+
+x <- expgT[,1]
+
+diffExp <- apply(expgT, 2, function(x){
+ t.test(x ~gtexTansverse$intestineCluster)$statistic
+})
+hist(diffExp) # diffExp holds signed t statistics (not p-values); -log10() of them is NaN for every negative value
+names(diffExp)[order(diffExp)[1:100]]
+cat(sub("\\..+","",names(diffExp)[order(diffExp, decreasing = T)[1:200]]), sep = "\n")
+
+load("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Recount3_QC_2ndRun/SRA_Studies_Annotations_Patrick/Fibroblasts.rda", verbose = T)
+str(fibroblasts)
+
+load("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Recount3_QC_2ndRun/SRA_Studies_Annotations_Patrick/BloodVessels.rda", verbose = T)
+
+
+
+
+minSamplesTraining <- 50
+maxFractionOfStudy <- 0.8
+
+#Take only samples that have an annotation
+umapAndMetaClassified <- umapAndMeta[!is.na(umapAndMeta$umapFactor),]
+#First put all in test, algorithm will put some
+umapAndMetaClassified$training <- FALSE
+
+tissueClass <- levels(umapAndMetaClassified$umapFactor)[2]
+study <- "GTEx"
+
+set.seed(42)
+# For each tissue class, select the samples that go into the training set.
+for(tissueClass in levels(umapAndMetaClassified$umapFactor)){
+  thisTissueSamples <- umapAndMetaClassified$umapFactor==tissueClass
+  studiesForThisTissue <- unique(umapAndMetaClassified$study[thisTissueSamples])
+  numberOfStudies <- length(studiesForThisTissue)
+  # Spread the minSamplesTraining target evenly over the studies that contain this tissue.
+  numberOfSamplesPerStudy <- ceiling(minSamplesTraining / numberOfStudies)
+  print(paste(tissueClass, numberOfStudies, numberOfSamplesPerStudy, sep = " - "))
+  # For each study, move part of its samples for this tissue to the training set.
+  for(study in studiesForThisTissue){
+
+    thisTissueAndStudySamples <- thisTissueSamples & umapAndMetaClassified$study == study
+    thisTissueAndStudySamplesCount <- sum(thisTissueAndStudySamples)
+
+    # Never take more than maxFractionOfStudy of a study; floor() sends single-sample studies entirely to the test set.
+    potentialMax <- floor(thisTissueAndStudySamplesCount * maxFractionOfStudy)
+    numberTrainingSamplesThisStudy <- min(potentialMax, numberOfSamplesPerStudy)
+    if(numberTrainingSamplesThisStudy > 0){
+      # Row indices of this study-tissue combination.
+      candidateRows <- which(thisTissueAndStudySamples)
+      # Draw positions with sample.int(): sample(x, n) treats a length-one numeric x as 1:x and would then pick unrelated rows.
+      trainingSamplesThisStudy <- candidateRows[sample.int(length(candidateRows), numberTrainingSamplesThisStudy)]
+      umapAndMetaClassified$training[trainingSamplesThisStudy] <- TRUE
+    }
+
+    #print(paste0(thisTissueAndStudySamplesCount, " - ", numberTrainingSamplesThisStudy))
+  }
+}
+# Number of samples selected for training
+sum(umapAndMetaClassified$training)
+# Split into the training set (with its per-tissue counts) and the held-out test set
+umapAndMetaClassifiedTraining <- umapAndMetaClassified[umapAndMetaClassified$training,]
+table(umapAndMetaClassifiedTraining$umapFactor)
+umapAndMetaClassifiedTest <- umapAndMetaClassified[!umapAndMetaClassified$training,]
+dim(umapAndMetaClassifiedTest)
+
+# Cross-validated multinomial glmnet with the columns PC_1 .. PC_<compsToUse> as predictors and the annotated tissue as response
+library(glmnet)
+cfit <- cv.glmnet(x = as.matrix(umapAndMetaClassifiedTraining[,paste0("PC_",1:compsToUse)]), y = umapAndMetaClassifiedTraining$umapFactor, family = "multinomial", type.measure = "class")
+cfit
+# Plot the fitted object; rpng() is defined outside this section -- presumably it opens the remote plotting device, confirm
+rpng()
+plot(cfit)
+dev.off()
+
+
+# Evaluate the fitted model on the held-out test samples
+assess.glmnet(cfit, newx = as.matrix(umapAndMetaClassifiedTest[,paste0("PC_",1:compsToUse)]), newy = umapAndMetaClassifiedTest$umapFactor, family = "multinomial", type.measure = "class", keep = TRUE, alpha=1, lambda = "1se")
+
+
+# Predicted class per test sample at s = "lambda.1se"
+predictionsTest <- predict(cfit, s = "lambda.1se", newx = as.matrix(umapAndMetaClassifiedTest[,paste0("PC_",1:compsToUse)]), type = "class")
+# Per-class scores at the same s; [,,1] takes the first slice of the third dimension and the row maximum is stored as the score of the call
+predictionsTestScores <- predict(cfit, s = "lambda.1se", newx = as.matrix(umapAndMetaClassifiedTest[,paste0("PC_",1:compsToUse)]), type = "response")
+predictionsTestScores <- predictionsTestScores[,,1]
+umapAndMetaClassifiedTest$predictedTissueScore <- apply(predictionsTestScores, 1, max)
+# Debug default so the function body below can be stepped through by hand; the sweep itself uses its own argument
+prop <- 0.5
+# Sweep the score threshold from 0 to 1: one result column (threshold, missed fraction, error fraction) per cut-off
+predictionsInTest <- sapply(seq(0,1,0.05), function(prop){
+
+  annotatedTissue <- umapAndMetaClassifiedTest$umapFactor
+  hasAnnotation <- !is.na(annotatedTissue)
+
+  # A call is withheld (NA) when the best class score does not exceed the threshold
+  calledTissue <- predictionsTest[,1]
+  calledTissue[umapAndMetaClassifiedTest$predictedTissueScore <= prop] <- NA
+  hasCall <- !is.na(calledTissue)
+
+  # Errors: annotated samples that received a call for a different tissue
+  nWrong <- sum(hasAnnotation & hasCall & annotatedTissue != calledTissue)
+
+  # Missed: annotated samples whose call was withheld
+  nWithheld <- sum(hasAnnotation & !hasCall)
+
+  nTest <- nrow(umapAndMetaClassifiedTest)
+
+  c("Threshold" = prop, "MissedPerc" = nWithheld / nTest , "ErrorPerc" = nWrong / nTest )
+
+})
+# Print the 3 x 21 result matrix (rows: Threshold, MissedPerc, ErrorPerc)
+predictionsInTest
+
+# Debug default so the function bodies below can be stepped through by hand
+tissueClass <- levels(umapAndMetaClassified$umapFactor)[1]
+# Same threshold sweep as for the whole test set, but separately for the test samples annotated with each tissue
+predictionsInTestPerTissue <- lapply(levels(umapAndMetaClassified$umapFactor), function(tissueClass){
+
+  # The tissue subset does not depend on the threshold, so select it once per tissue instead of once per threshold
+  isThisTissue <- !is.na(umapAndMetaClassifiedTest$umapFactor) & umapAndMetaClassifiedTest$umapFactor == tissueClass
+  annotatedTissue <- umapAndMetaClassifiedTest$umapFactor[isThisTissue]
+  scoreThisTissue <- umapAndMetaClassifiedTest$predictedTissueScore[isThisTissue]
+  predictedThisTissue <- predictionsTest[isThisTissue,1]
+  total <- sum(isThisTissue)
+
+  predictionsInTestThisTissue <- sapply(seq(0,1,0.05), function(prop){
+    # Calls whose best class score does not exceed the threshold are withheld (NA)
+    predictedTissue <- predictedThisTissue
+    predictedTissue[scoreThisTissue <= prop] <- NA
+
+    # Plain vectors instead of columns on a subset data frame: assigning a scalar column to a
+    # zero-row data frame is an error, which made a tissue without test samples abort the whole sweep
+    errors <- sum(!is.na(predictedTissue) & annotatedTissue != predictedTissue)
+    missed <- sum(is.na(predictedTissue))
+
+    # total is 0 for a tissue without test samples, which gives NaN fractions for that tissue
+    missedPercentage <- missed / total
+    errorPercentage <- errors / total
+
+    return(c("Threshold" = prop, "MissedPerc" = missedPercentage , "ErrorPerc" = errorPercentage ))
+  })
+  return(predictionsInTestThisTissue)
+})
+names(predictionsInTestPerTissue) <- levels(umapAndMetaClassified$umapFactor)
+str(predictionsInTestPerTissue)
+# Per-tissue error fraction (row 3) at the 11th threshold of the sweep, i.e. seq(0,1,0.05)[11] = 0.5, sorted ascending
+x <- sapply(predictionsInTestPerTissue, function(predictionsInTestThisTissue){
+ return(predictionsInTestThisTissue[3,11])
+})
+sort(x)
+# Overall missed fraction (row 2) at the 15th threshold, i.e. 0.7
+predictionsInTest[2,15]
+# Inspect the full sweep for a single tissue
+predictionsInTestPerTissue[["Whole Blood Fetal"]]
+# Two panels: missed and wrong fraction against threshold; points are the whole test set, grey lines the individual tissues
+layout(matrix(1:2, nrow = 1))
+plot(t(predictionsInTest[1:2,]), main = "Percentage classification missed in test dataset")
+for(tissueClass in levels(umapAndMetaClassified$umapFactor)){
+ predictionsInTestThisTissue <- predictionsInTestPerTissue[[tissueClass]]
+ points(t(predictionsInTestThisTissue[1:2,]), type = "l", col=adjustcolor("grey", alpha.f = 0.5))
+}
+plot(t(predictionsInTest[c(1,3),]), main = "Percentage wrong classification in test dataset")
+sink <- sapply(predictionsInTestPerTissue, function(predictionsInTestThisTissue){
+ points(t(predictionsInTestThisTissue[c(1,3),]), type = "l", col=adjustcolor("grey", alpha.f = 0.5))
+})
+# NOTE(review): the variable named sink above only captures the sapply result; as a data object it does not block calls to base::sink()
+
+
+# Confusion matrix on the held-out test samples; the diagonal (correct calls) is zeroed so that only the errors show in the heatmap
+confusion <- confusion.glmnet(cfit, newx = as.matrix(umapAndMetaClassifiedTest[,paste0("PC_",1:compsToUse)]), newy = umapAndMetaClassifiedTest$umapFactor, family = "multinomial", type.measure = "class", keep = TRUE, alpha=1, lambda = "1se")
+diag(confusion) <- 0
+
+library(heatmap3)
+# rpng() is disabled as in the other PDF sections of this script: the heatmap is written to a PDF, and a device opened before pdf() would be left open by the single dev.off() below
+#rpng()
+pdf("confusion.pdf", width = 12, height = 12)
+heatmap3(confusion, Rowv = NA, Colv = NA, balanceColor = TRUE, scale = "none")
+dev.off()
+
+# Apply the classifier to every sample in umapAndMeta, annotated or not
+predictions <- predict(cfit, s = "lambda.1se", newx = as.matrix(umapAndMeta[,paste0("PC_",1:compsToUse)]), type = "class")
+umapAndMeta$predictedTissue <- predictions[,1]
+# Per-class scores for every sample; the highest score per sample is stored as the score of the call
+predictionsScores <- predict(cfit, s = "lambda.1se", newx = as.matrix(umapAndMeta[,paste0("PC_",1:compsToUse)]), type = "response")
+predictionsScores <- predictionsScores[,,1]
+rownames(predictionsScores) <- umapAndMeta$Row.names
+umapAndMeta$predictedTissueScore <- apply(predictionsScores, 1, max)
+# Withhold calls whose best score is at most 0.5 (the number of such samples is printed first)
+sum(umapAndMeta$predictedTissueScore <= 0.5)
+umapAndMeta$predictedTissue[umapAndMeta$predictedTissueScore <= 0.5] <- NA
+
+# Distribution of the scores
+rpng()
+hist(umapAndMeta$predictedTissueScore)
+dev.off()
+# Annotated samples that received a call for a different tissue
+umapAndMeta$misclasified <- FALSE
+umapAndMeta$misclasified[!is.na(umapAndMeta$umapFactor) & !is.na(umapAndMeta$predictedTissue) & umapAndMeta$umapFactor != umapAndMeta$predictedTissue] <- TRUE
+sum(umapAndMeta$misclasified )
+# Annotated samples whose call was withheld
+umapAndMeta$notPredictedBack <- FALSE
+umapAndMeta$notPredictedBack[!is.na(umapAndMeta$umapFactor) & is.na(umapAndMeta$predictedTissue) ] <- TRUE
+sum(umapAndMeta$notPredictedBack)
+# Number of samples with a call, number of distinct values in predictedTissue (NA counts as one), and class sizes
+sum(!is.na(umapAndMeta$predictedTissue))
+
+length(unique((umapAndMeta$predictedTissue)))
+
+sum(table((umapAndMeta$predictedTissue)) >= 1000)
+hist(table((umapAndMeta$predictedTissue)), breaks =25)
+barplot(table((umapAndMeta$predictedTissue)))
+# Which annotated tissues are most often miscalled or left without a call
+sort(table(umapAndMeta[umapAndMeta$misclasified, "umapFactor"]))
+sort(table(umapAndMeta[umapAndMeta$notPredictedBack, "umapFactor"]))
+# Debug default so the loop body below can be stepped through by hand
+tissueClass <- levels(umapAndMeta$umapFactor)[1]
+# One PDF page per tissue: title, UMAP of all samples coloured by how the classifier treated that tissue, and a legend with counts
+pdf("tissuePrediction.pdf")
+for(tissueClass in levels(umapAndMeta$umapFactor)){
+  # Colour per sample for this tissue; later assignments override earlier ones (e.g. hotpink replaces forestgreen for withheld calls)
+  umapAndMeta$ThisTissueCol <- defaultCol
+  umapAndMeta$ThisTissueCol[!is.na(umapAndMeta$umapFactor) & tissueClass == umapAndMeta$umapFactor & !umapAndMeta$misclasified] <- adjustcolor("forestgreen", alpha.f = 0.5)
+  umapAndMeta$ThisTissueCol[!is.na(umapAndMeta$umapFactor) & tissueClass == umapAndMeta$umapFactor & umapAndMeta$notPredictedBack] <- adjustcolor("hotpink", alpha.f = 0.5)
+  umapAndMeta$ThisTissueCol[!is.na(umapAndMeta$umapFactor) & tissueClass == umapAndMeta$umapFactor & umapAndMeta$misclasified] <- adjustcolor("violetred3", alpha.f = 0.5)
+  umapAndMeta$ThisTissueCol[!is.na(umapAndMeta$umapFactor) & !is.na(umapAndMeta$predictedTissue) & tissueClass != umapAndMeta$umapFactor & tissueClass == umapAndMeta$predictedTissue] <- adjustcolor("orange1", alpha.f = 0.5)
+  umapAndMeta$ThisTissueCol[is.na(umapAndMeta$umapFactor) & !is.na(umapAndMeta$predictedTissue) & tissueClass == umapAndMeta$predictedTissue] <- adjustcolor("dodgerblue1", alpha.f = 0.5)
+  # Legend counts. Withheld calls are not misclasified either, so they must be excluded explicitly from predictedBack; otherwise they are counted both as "correctly predicted back" and as "not predicted back"
+  predictedBack <- sum(!is.na(umapAndMeta$umapFactor) & tissueClass == umapAndMeta$umapFactor & !umapAndMeta$misclasified & !umapAndMeta$notPredictedBack)
+  notPredictedBack <- sum(!is.na(umapAndMeta$umapFactor) & tissueClass == umapAndMeta$umapFactor & umapAndMeta$notPredictedBack)
+  predictedAsOther <- sum(!is.na(umapAndMeta$umapFactor) & tissueClass == umapAndMeta$umapFactor & umapAndMeta$misclasified)
+  otherPredicted <- sum(!is.na(umapAndMeta$umapFactor) & !is.na(umapAndMeta$predictedTissue) & tissueClass != umapAndMeta$umapFactor & tissueClass == umapAndMeta$predictedTissue)
+  newPredicted <- sum(is.na(umapAndMeta$umapFactor) & !is.na(umapAndMeta$predictedTissue) & tissueClass == umapAndMeta$predictedTissue)
+  # The table below is not auto-printed inside a loop; it is only useful when stepping through the body by hand
+  table(umapAndMeta$ThisTissueCol, useNA = "a")
+
+  umapAndMeta$plotOrderThisTissues <- order(umapAndMeta$ThisTissueCol != defaultCol)
+
+  #rpng()
+  layout(matrix(c(1,2,3), ncol = 1, byrow = TRUE), heights = c(0.05,0.85,0.1))
+  par(mar = c(0,0,0,0), xpd = NA)
+  plot.new()
+  plot.window(xlim = 0:1, ylim = 0:1)
+  text(0.5,0.5,tissueClass, cex = 2 , font = 2)
+
+  par(mar = c(5,5,0,0.1), xpd = NA)
+  plot(umapAndMeta[umapAndMeta$plotOrderThisTissues,"UMAP1"], umapAndMeta[umapAndMeta$plotOrderThisTissues,"UMAP2"], col = umapAndMeta$ThisTissueCol[umapAndMeta$plotOrderThisTissues], cex = 0.2, pch = 16, bty="n", xlab = "UMAP-1", ylab = "UMAP-2")
+
+  par(mar = c(0,0,0,0), xpd = NA)
+  plot.new()
+  plot.window(xlim = 0:1, ylim = 0:1)
+  legend("center", fill = c(
+    "forestgreen",
+    "hotpink",
+    "violetred3",
+    "orange1",
+    "dodgerblue1"
+  ),
+  legend = c(
+    paste0(tissueClass, " correctly predicted back (", predictedBack,")"),
+    paste0(tissueClass, " not predicted back (", notPredictedBack,")"),
+    paste0(tissueClass, " predicted as other (", predictedAsOther,")"),
+    paste0("Other tissue predicted as ", tissueClass," (", otherPredicted,")"),
+    paste0("Unkown predicted as ", tissueClass, " (", newPredicted,")")
+  ),
+  bty = "n")
+
+
+
+  #dev.off()
+
+}
+dev.off()
+# NOTE(review): the save is commented out, so the load below replaces the umapAndMeta computed above with the stored copy
+#save(umapAndMeta, file = "tissuePredictions/tissuePredictions_16_09_22.RData")
+load("tissuePredictions/tissuePredictions_16_09_22.RData", verbose = T)
+# Predicted tissues that have no row in tissueCol (tissueCol is defined outside this section)
+unique(umapAndMeta$predictedTissue)[!unique(umapAndMeta$predictedTissue) %in% rownames(tissueCol)]
+
+
+# Predicted classes that are left out of the exported sample selection
+clusterToExclude <- c("U2-OS", "Leukemia_blood-cell-line", "HAP1", "LNCaP")
+
+
+
+# Samples with a prediction outside the excluded classes, reduced to six columns; umapFactor is renamed to annotatedTissue
+samplesWithPrediction <- umapAndMeta[!is.na(umapAndMeta$predictedTissue) & !umapAndMeta$predictedTissue %in% clusterToExclude, c(
+ "predictedTissue",
+ "predictedTissueScore",
+ "umapFactor",
+ "misclasified",
+ "study",
+ "sra.library_layout"
+)]
+colnames(samplesWithPrediction)[3] <- "annotatedTissue"
+str(samplesWithPrediction)
+#save(samplesWithPrediction, file = "tissuePredictions/samplesWithPrediction_16_09_22.RData")
+# NOTE(review): after the text file is written, the load below overwrites samplesWithPrediction with the stored copy
+write.table(samplesWithPrediction, file = "samplesWithPrediction.txt")
+load("tissuePredictions/samplesWithPrediction_16_09_22.RData")
+str(samplesWithPrediction)
+table(samplesWithPrediction$predictedTissue)
+# Load the stored UMAP; the merge below uses an object named sampleUmap, presumably provided by this file -- confirm
+load(file = "umap/sampleUmap6.RData", verbose = T)
+
+# Join predictions and UMAP coordinates on row names (by = 0); the key comes back as a Row.names column
+umapAndPredictions <- merge(samplesWithPrediction, sampleUmap, by = 0 )
+rownames(umapAndPredictions) <- umapAndPredictions$Row.names
+
+# Colour samples by predicted tissue where tissueCol has a row for it; the plot order puts default-coloured samples first
+umapAndPredictions$TissuePredictedCol <- defaultCol
+umapAndPredictions$TissuePredictedCol[umapAndPredictions$predictedTissue %in% rownames(tissueCol)] <- adjustcolor(tissueCol[as.character(umapAndPredictions$predictedTissue[umapAndPredictions$predictedTissue %in% rownames(tissueCol)]),1], alpha.f = 0.5)
+umapAndPredictions$plotOrderTissuePredicted <- order(umapAndPredictions$TissuePredictedCol != defaultCol)
+
+#rpng()
+# Full view, followed by a view restricted with xlim / ylim
+par(mar = c(3,3,0.1,0.1), xpd = NA)
+plot(umapAndPredictions[umapAndPredictions$plotOrderTissuePredicted,"UMAP1"], umapAndPredictions[umapAndPredictions$plotOrderTissuePredicted,"UMAP2"], col = umapAndPredictions$TissuePredictedCol[umapAndPredictions$plotOrderTissuePredicted], cex = 0.2, pch = 16)
+
+plot(umapAndPredictions[umapAndPredictions$plotOrderTissuePredicted,"UMAP1"], umapAndPredictions[umapAndPredictions$plotOrderTissuePredicted,"UMAP2"], col = umapAndPredictions$TissuePredictedCol[umapAndPredictions$plotOrderTissuePredicted], cex = 0.2, pch = 16, xlim = c(-100,70), ylim = c(-50,50))
+
+
+
+
+#dev.off()
+# Interactive: click two points on the current plot to read off their coordinates
+locator(n =2, type = "l")
+
+# PDF with two panels side by side: the UMAP coloured by predicted tissue and the colour legend
+pdf(file = "umapPredicted.pdf", width = 16, height = 8)
+#rpng()
+
+layout(matrix(1:2,ncol = 2))
+# First panel: UMAP scatter
+par(mar = c(5,5,0.1,0.1), xpd = NA)
+plot(umapAndPredictions[umapAndPredictions$plotOrderTissuePredicted,"UMAP1"], umapAndPredictions[umapAndPredictions$plotOrderTissuePredicted,"UMAP2"], col = umapAndPredictions$TissuePredictedCol[umapAndPredictions$plotOrderTissuePredicted], cex = 0.2, pch = 16, bty = "n", xlab = "UMAP-1", ylab = "UMAP-2")
+# Second panel: empty plot that only holds the legend of the tissueCol entries occurring in the predictions
+par(mar = c(0,0,0,0), xpd = NA)
+plot.new()
+plot.window(xlim = 0:1, ylim = 0:1)
+legend("center", fill = tissueCol[rownames(tissueCol) %in% umapAndPredictions$predictedTissue,1], legend = row.names(tissueCol)[rownames(tissueCol) %in% umapAndPredictions$predictedTissue], bty = "n", ncol = 2,cex = 0.7)
+
+
+dev.off()
+
+
+
+# Samples per predicted tissue: total, number of tissues with at least 500 samples, and a bar plot with the counts written above the bars
+countTable <- table(umapAndPredictions$predictedTissue)
+sum(countTable)
+sum(countTable >= 500)
+pdf("baplotTissues.pdf", width = 15, height = 10)
+par(mar = c(25,5,2,0.1), xpd = NA)
+b <- barplot(countTable, las =2, col = tissueCol[names(countTable),])
+text(b, countTable + 280, countTable, font=1, srt = 90)
+dev.off()
diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/extractRawReadCounts.R b/Downstreamer/src/main/r/downstreamer_main/recount3/extractRawReadCounts.R
new file mode 100644
index 000000000..a01f9d656
--- /dev/null
+++ b/Downstreamer/src/main/r/downstreamer_main/recount3/extractRawReadCounts.R
@@ -0,0 +1,64 @@
+#srun --cpus-per-task=1 --mem=50gb --nodes=1 --qos=priority --time=168:00:00 --pty bash -i
+#remoter::server(verbose = T, port = 55556, sync = T)
+# Extract the raw read counts of the samples that received a tissue prediction from the recount3 RSE files.
+# Written to be run interactively through a remoter session.
+remoter::client("localhost", port = 55501)
+
+
+
+setwd("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/")
+
+# NOTE(review): pattern is a regular expression, not a glob, so "sra*" matches any name containing "sr" and
+# "rse*" any name containing "rs"; left unchanged because the directory contents are not visible here -- confirm
+sraFiles <- list.files(path="rse-sra/SRA_Files/", pattern="sra*", full.names=TRUE, recursive=FALSE)
+gtexFiles <- list.files(path="rse-gtex/rse_gtex", pattern="rse*", full.names=TRUE, recursive=FALSE)
+allFiles <- c(sraFiles, gtexFiles, "rse-tcga/rseTCGA.rda", "rse-tcga/rse_ESCA_TCGA.rda")
+
+load("tissuePredictions/samplesWithPrediction_16_09_22.RData")
+selectedSamples <- rownames(samplesWithPrediction)
+str(selectedSamples)
+
+# Debug default so the function body below can be stepped through by hand
+#file = allFiles[10]
+# lapply instead of sapply: sapply simplifies the result to a single matrix when all per-file matrices happen to have the same size
+perChunkExp <- lapply(allFiles, function(file){
+  # load() is called without envir, so the object is created in this function's frame, where get() finds it
+  loadedObject <- load(file)
+
+  sreObjects <- get(loadedObject[1])
+
+  #sometimes single RSE is not in list. Put in list of one to make code uniform
+  if(!is.list(sreObjects)){
+    sreObjects <- list(sreObjects)
+  }
+
+  #sreObject <- sreObjects[[1]]
+  # Keep only the raw-count columns of the selected samples; drop = FALSE keeps a matrix even with 0 or 1 matching columns
+  perStudyExp <- lapply(sreObjects, function(sreObject){
+    studyExp <- sreObject@assays@data@listData$raw_counts
+    return(studyExp[,colnames(studyExp) %in% selectedSamples, drop = FALSE])
+  })
+
+  return(do.call(cbind, perStudyExp))
+
+})
+# sreObject only exists inside the function above; calling str() on it at top level stopped the script with "object not found"
+#str(sreObject)
+
+selectedSamplesExp <- do.call(cbind, perChunkExp)
+str(selectedSamplesExp)
+all(selectedSamples %in% colnames(selectedSamplesExp ))
+table(selectedSamples %in% colnames(selectedSamplesExp ))
+
+
+
+#Some samples are duplicated in the chunks, match() keeps only the first column of each selected sample
+uniqueSamplesIndex <- match(selectedSamples, colnames(selectedSamplesExp))
+# Selected samples found in none of the files give NA indices, which fail or yield all-NA columns depending on the matrix class: report and drop them
+if(anyNA(uniqueSamplesIndex)) warning(sum(is.na(uniqueSamplesIndex)), " selected samples were not found in any RSE file")
+selectedSamplesExp <- selectedSamplesExp[, uniqueSamplesIndex[!is.na(uniqueSamplesIndex)], drop = FALSE]
+
+#save(selectedSamplesExp, file = "perTissueNormalization/selectedSamplesRawExpression.RData")
+
+
+
diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/harmonizeAndExtentSampleAnnotations.R b/Downstreamer/src/main/r/downstreamer_main/recount3/harmonizeAndExtentSampleAnnotations.R
new file mode 100644
index 000000000..fdb5022b5
--- /dev/null
+++ b/Downstreamer/src/main/r/downstreamer_main/recount3/harmonizeAndExtentSampleAnnotations.R
@@ -0,0 +1,1881 @@
+#srun --cpus-per-task=1 --mem=50gb --nodes=1 --qos=priority --time=168:00:00 --pty bash -i
+#remoter::server(verbose = T, port = 55556, password = Sys.getenv("REMOTER_PASSWORD"), sync = T)
+# The session password is read from the REMOTER_PASSWORD environment variable instead of being stored in the script.
+# NOTE(review): the literal that used to be here is in version-control history and should be treated as compromised.
+remoter::client("localhost", port = 55501, password = Sys.getenv("REMOTER_PASSWORD"))
+
+
+#save.image("tmp2.RData")
+
+# Working directory on the cluster, then on a local Windows machine; run only the line that applies -- the
+# second call replaces the first and stops with an error where that path does not exist
+setwd("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/")
+setwd("D:\\UMCG\\Genetica\\Projects\\Depict2Pgs\\Recount3\\")
+#load("tmp2.RData")
+
+
+library(readr)
+
+# The SRA metadata is read from two files; the first column of each is dropped after reading
+table_tmp <- read_delim("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Metadata/metadata_sra1.txt", delim = "\t", quote = "", guess_max = 20000)
+sraMeta1 <- as.data.frame(table_tmp[,-1])
+rm(table_tmp)
+
+table_tmp <- read_delim("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Metadata/metadata_sra2.txt", delim = "\t", quote = "", guess_max = 20000)
+sraMeta2 <- as.data.frame(table_tmp[,-1])
+rm(table_tmp)
+# Stack the two parts on the columns they have in common
+sraSharedCol <- intersect(colnames(sraMeta2), colnames(sraMeta1))
+length(sraSharedCol)
+
+sraMeta <- rbind(sraMeta1[,sraSharedCol], sraMeta2[,sraSharedCol])
+
+#For some reason some runs are duplicated in the meta data file.
+#Quick inspection showed that they have the same values
+#Solution: keep only the first row of each run id
+sraUniqueIds <- unique(sraMeta$external_id)
+str(sraUniqueIds)
+sraMeta <- sraMeta[ match(sraUniqueIds, sraMeta$external_id), ]
+rownames(sraMeta) <- sraMeta$external_id
+
+# Columns that exist only in part 2, one row per unique run id (all NA for ids that are absent from part 2).
+# The row names must be the run ids: the row-name merge into the combined table further down would otherwise
+# match none of these rows and leave every part-2-only column NA. drop = FALSE keeps a data frame for a single column.
+sraPart2Col <- colnames(sraMeta2)[!colnames(sraMeta2) %in% colnames(sraMeta1)]
+sraMetaExtended <- sraMeta2[match(sraUniqueIds, sraMeta2$external_id), sraPart2Col, drop = FALSE]
+rownames(sraMetaExtended) <- sraUniqueIds
+
+str(sraMetaExtended)
+# Number of unique run ids versus number of rows in each part
+length(unique(sraMeta2$external_id))
+length(unique(sraMeta1$external_id))
+
+length(sraMeta2$external_id)
+length(sraMeta1$external_id)
+
+# GTEx metadata: indexed by external_id, reduced to the two tissue columns gtex.smts and gtex.smtsd
+load(file = "/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Metadata/metadata_gtex.Rda", verbose = T)
+rownames(metadata_gtex) <- metadata_gtex$external_id
+metadata_gtex2 <- metadata_gtex[,c("gtex.smts", "gtex.smtsd")]
+str(metadata_gtex2)
+
+# TCGA metadata: indexed by external_id, reduced to primary site and sample type
+load(file = "/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Metadata/metadata_tcga.Rda", verbose = T)
+rownames(metadata_tcga) <- metadata_tcga$external_id
+metadata_tcga2 <- metadata_tcga[,c("tcga.gdc_cases.project.primary_site", "tcga.cgc_sample_sample_type")]
+
+
+# ARCHS4 annotations; the kept columns are prefixed with "archs4." to mark their origin
+# Row names are taken from the third column of the file as read, i.e. counted before the first column is dropped
+table_tmp <- read_delim("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Metadata/metadataSRA.txt", delim = "\t", quote = "")
+metadata_archs4 <- as.data.frame(table_tmp[,-1])
+rownames(metadata_archs4) <- table_tmp[,3][[1]]
+rm(table_tmp)
+metadata_archs4_2 <- metadata_archs4[,c("Tissue", "CellType", "CellLine")]
+colnames(metadata_archs4_2) <- c("archs4.Tissue", "archs4.CellType", "archs4.CellLine")
+
+# GADO annotations (columns prefixed with "gado.") and the GADO table whose PlotClass column is used further down
+metadata_Gado <- read.delim("celllinesAndCancer/oldAnnotations/sampleAnnotations.txt")
+rownames(metadata_Gado) <- metadata_Gado$Sample
+metadata_Gado2 <- metadata_Gado[,c("CellLine", "TissueType", "CellType", "PlotClass")]
+colnames(metadata_Gado2) <- paste0("gado.",colnames(metadata_Gado2))
+gadoTissueCol <- read.delim("celllinesAndCancer/oldAnnotations/tissueCol5.txt")
+
+
+# Kidney Network annotations, columns prefixed with "KidneyNetwork."
+metadata_Kn <- read.delim("Metadata/KidneyNetwork.txt")
+rownames(metadata_Kn) <- metadata_Kn$Sample
+metadata_Kn2 <- metadata_Kn[,c("Origin", "Cell_type", "Cell_type_simplified", "Cell_type_manual")]
+colnames(metadata_Kn2) <- paste0("KidneyNetwork.",colnames(metadata_Kn2))
+
+
+# Mahmoud annotations: indexed by SampleID, first two columns dropped
+load("Recount3_QC_2ndRun/SRA_Studies_Annotations_Patrick/Annotations.rda", verbose = T)
+str(Annotations)
+rownames(Annotations) <- Annotations$SampleID
+mahmoudAnnotations <- Annotations[,-(1:2)]
+#write.table(Annotations, sep = "\t", quote = F, col.names = NA, file = "tmp.txt")
+# All sample ids across GTEx, TCGA and SRA; these become the row names of the harmonized annotation table
+allSamples <- c(rownames(metadata_gtex2), rownames(metadata_tcga2), rownames(sraMeta))
+# Sanity check, should print TRUE: the ids are used as row names below and therefore must be unique
+length(unique(allSamples)) == length(allSamples)
+# Empty annotation skeleton: blank tissue and cell line names, NA for Cellline, Cancer and Fetal, cohort defaulting to "SRA"
+numberSamples = length(allSamples)
+finalAnnotations <- data.frame(
+ Tissue = rep("",numberSamples),
+ Tissue2 = rep("",numberSamples),
+ Cellline = vector(mode = "logical", length = numberSamples),
+ CelllineName = rep("",numberSamples),
+ Cancer = vector(mode = "logical", length = numberSamples),
+ Cohort = rep("SRA",numberSamples),
+ row.names = allSamples, stringsAsFactors = F)
+finalAnnotations$Cellline = NA
+finalAnnotations$Cancer = NA
+finalAnnotations$Fetal <- NA
+# Cohort is overwritten for the samples that appear in the GTEx or TCGA metadata
+finalAnnotations$Cohort[rownames(finalAnnotations) %in% rownames(metadata_gtex2)] <- "GTEx"
+finalAnnotations$Cohort[rownames(finalAnnotations) %in% rownames(metadata_tcga2)] <- "TCGA"
+table(finalAnnotations$Cohort, useNA = "always")
+str(finalAnnotations)
+
+# Size checks
+dim(finalAnnotations)
+dim(metadata_gtex2)
+
+
+sum(rownames(finalAnnotations) %in% rownames(metadata_gtex2))
+# Left-join every annotation source onto the skeleton by row name (by = 0, all.x = T), one source at a time
+a <- merge(finalAnnotations, metadata_gtex2, all.x = T, by = 0)
+row.names(a) <- a$Row.names
+# After each merge the key comes back as a Row.names column; it is restored as row names for the next merge
+b <- merge(a, metadata_tcga2, all.x = T, by = 0)
+row.names(b) <- b$Row.names
+
+c <- merge(b, metadata_archs4_2, all.x = T, by = 0)
+row.names(c) <- c$Row.names
+
+d <- merge(c, metadata_Gado2, all.x = T, by = 0)
+row.names(d) <- d$Row.names
+
+e <- merge(d, sraMetaExtended, all.x = T, by = 0)
+row.names(e) <- e$Row.names
+
+f <- merge(e, sraMeta, all.x = T, by = 0)
+row.names(f) <- f$Row.names
+
+g <- merge(f, metadata_Kn2, all.x = T, by = 0)
+row.names(g) <- g$Row.names
+
+str(g)
+# Drop the first seven columns -- presumably the Row.names key columns accumulated by the seven merges; verify with str(g)
+combinedMeta <- g[,-c(1:7)]
+str(combinedMeta)
+
+# Now fill in the recount metadata columns shared with SRA for the GTEx and TCGA samples
+
+tmp <- metadata_gtex[,colnames(metadata_gtex) %in% sraSharedCol]
+combinedMeta[rownames(tmp),colnames(tmp)] <- tmp
+
+tmp <- metadata_tcga[,colnames(metadata_tcga) %in% sraSharedCol]
+combinedMeta[rownames(tmp),colnames(tmp)] <- tmp
+
+rm(tmp)
+
+# Make the study column uniform for the two consortium cohorts
+combinedMeta$study[combinedMeta$Cohort == "GTEx"] <- "GTEx"
+combinedMeta$study[combinedMeta$Cohort == "TCGA"] <- "TCGA"
+
+combinedMeta$exclude <- FALSE
+
+#save(combinedMeta, file = "combinedMeta.RData")
+#load(file = "combinedMeta.RData")
+# GTEx: tissue from gtex.smts, detailed tissue from gtex.smtsd; flagged as neither cell line nor cancer by default
+combinedMeta$Tissue[combinedMeta$Cohort == "GTEx"] <- combinedMeta$gtex.smts[combinedMeta$Cohort == "GTEx"]
+combinedMeta$Tissue2[combinedMeta$Cohort == "GTEx"] <- combinedMeta$gtex.smtsd[combinedMeta$Cohort == "GTEx"]
+combinedMeta$Cellline[combinedMeta$Cohort == "GTEx"] <- FALSE
+combinedMeta$Cancer[combinedMeta$Cohort == "GTEx"] <- FALSE
+# The three GTEx "Cells - ..." classes are flagged as cell lines and their tissue fields are blanked
+gtexLcl <- combinedMeta$Cohort == "GTEx" & (!is.na(combinedMeta$gtex.smtsd) & combinedMeta$gtex.smtsd == "Cells - EBV-transformed lymphocytes")
+combinedMeta$Cellline[gtexLcl] <- TRUE
+combinedMeta$CelllineName[gtexLcl] <- "lcl"
+combinedMeta$Tissue[gtexLcl] <- ""
+combinedMeta$Tissue2[gtexLcl] <- ""
+
+gtexCml <- combinedMeta$Cohort == "GTEx" & (!is.na(combinedMeta$gtex.smtsd) & combinedMeta$gtex.smtsd == "Cells - Leukemia cell line (CML)")
+combinedMeta$Cellline[gtexCml] <- TRUE
+combinedMeta$CelllineName[gtexCml] <- "cml"
+combinedMeta$Tissue[gtexCml] <- ""
+combinedMeta$Tissue2[gtexCml] <- ""
+
+gtexFibroblasts <- combinedMeta$Cohort == "GTEx" & (!is.na(combinedMeta$gtex.smtsd) & combinedMeta$gtex.smtsd == "Cells - Cultured fibroblasts")
+combinedMeta$Cellline[gtexFibroblasts] <- TRUE
+combinedMeta$CelllineName[gtexFibroblasts] <- "Fibroblasts"
+combinedMeta$Tissue[gtexFibroblasts] <- ""
+combinedMeta$Tissue2[gtexFibroblasts] <- ""
+
+# Overview of the GTEx detailed tissue labels
+table(combinedMeta$gtex.smtsd[combinedMeta$Cohort == "GTEx"])
+# TCGA: tissue from the primary site; cancer unless the sample type is "Solid Tissue Normal"
+combinedMeta$Tissue[combinedMeta$Cohort == "TCGA"] <- combinedMeta$tcga.gdc_cases.project.primary_site[combinedMeta$Cohort == "TCGA"]
+combinedMeta$Cellline[combinedMeta$Cohort == "TCGA"] <- FALSE
+combinedMeta$Cancer[combinedMeta$Cohort == "TCGA"] <- TRUE #default for TCGA exception below
+combinedMeta$Cancer[combinedMeta$Cohort == "TCGA" & combinedMeta$tcga.cgc_sample_sample_type == "Solid Tissue Normal"] <- FALSE
+
+
+# Map GADO names to GTEx names: capitalise the first letter and rename "Adipose" to "Adipose Tissue"
+combinedMeta$gado.TissueType[!is.na(combinedMeta$gado.TissueType) & combinedMeta$gado.TissueType != ""] <- gsub("(^[[:alpha:]])", "\\U\\1", combinedMeta$gado.TissueType[!is.na(combinedMeta$gado.TissueType) & combinedMeta$gado.TissueType != ""], perl=TRUE)#https://stackoverflow.com/questions/18509527/first-letter-to-upper-case
+gadoTissueCol$PlotClass <- gsub("(^[[:alpha:]])", "\\U\\1", gadoTissueCol$PlotClass, perl=TRUE)#https://stackoverflow.com/questions/18509527/first-letter-to-upper-case
+combinedMeta$gado.TissueType[!is.na(combinedMeta$gado.TissueType) & combinedMeta$gado.TissueType == "Adipose"] <- "Adipose Tissue"
+gadoTissueCol$PlotClass[gadoTissueCol$PlotClass == "Adipose"] <- "Adipose Tissue"
+
+# Shorten the GADO label "acute myeloid leukemia" to "AML"
+combinedMeta$gado.CellType[!is.na(combinedMeta$gado.CellType) & combinedMeta$gado.CellType == "acute myeloid leukemia"] <- "AML"
+
+# Only annotations with a color (present in gadoTissueCol$PlotClass) are checked and highly reliable
+gadoAnnotatedTissues <- !is.na(combinedMeta$gado.TissueType) & combinedMeta$gado.TissueType != "" & combinedMeta$gado.TissueType %in% gadoTissueCol$PlotClass
+combinedMeta$Tissue[gadoAnnotatedTissues] <- combinedMeta$gado.TissueType[gadoAnnotatedTissues]
+combinedMeta$Cancer[gadoAnnotatedTissues] <- FALSE
+combinedMeta$Cellline[gadoAnnotatedTissues] <- FALSE
+# GADO cell types go into Tissue2 under the same restriction
+gadoAnnotatedCelltypes <- !is.na(combinedMeta$gado.CellType) & combinedMeta$gado.CellType != "" & combinedMeta$gado.CellType %in% gadoTissueCol$PlotClass
+combinedMeta$Tissue2[gadoAnnotatedCelltypes] <- combinedMeta$gado.CellType[gadoAnnotatedCelltypes]
+combinedMeta$Cancer[gadoAnnotatedCelltypes] <- FALSE
+combinedMeta$Cellline[gadoAnnotatedCelltypes] <- FALSE
+# AML and DLBCL are flagged as cancer, overriding the FALSE set just above
+combinedMeta$Cancer[!is.na(combinedMeta$gado.CellType) & combinedMeta$gado.CellType == "AML"] <- TRUE
+combinedMeta$Cancer[!is.na(combinedMeta$gado.CellType) & combinedMeta$gado.CellType == "DLBCL"] <- TRUE
+
+# GADO cell lines, matched case-insensitively against the PlotClass list
+gadoAnnotatedCelllines <- !is.na(combinedMeta$gado.CellLine) & combinedMeta$gado.CellLine != "" & tolower(combinedMeta$gado.CellLine) %in% tolower(gadoTissueCol$PlotClass)
+combinedMeta$CelllineName[gadoAnnotatedCelllines] <- combinedMeta$gado.CellLine[gadoAnnotatedCelllines]
+combinedMeta$Cancer[gadoAnnotatedCelllines] <- FALSE
+combinedMeta$Cellline[gadoAnnotatedCelllines] <- TRUE
+
+#Some manual stuff for big studies
+
+combinedMeta$CelllineName[!is.na(combinedMeta$study) & combinedMeta$study == "SRP166108"] <- "HepaRG"
+combinedMeta$Cellline[!is.na(combinedMeta$study) & combinedMeta$study == "SRP166108"] <- TRUE
+
+
+combinedMeta$Tissue[!is.na(combinedMeta$study) & combinedMeta$study == "SRP192714"] <- "Blood"
+combinedMeta$Tissue2[!is.na(combinedMeta$study) & combinedMeta$study == "SRP192714"] <- "Whole Blood"
+combinedMeta$Cellline[!is.na(combinedMeta$study) & combinedMeta$study == "SRP192714"] <- FALSE
+combinedMeta$Cancer[!is.na(combinedMeta$study) & combinedMeta$study == "SRP192714"] <- FALSE
+
+combinedMeta$Cellline[!is.na(combinedMeta$study) & combinedMeta$study == "SRP186687"] <- TRUE
+
+combinedMeta$Tissue[!is.na(combinedMeta$study) & combinedMeta$study == "SRP092402"] <- "Blood"
+combinedMeta$Tissue2[!is.na(combinedMeta$study) & combinedMeta$study == "SRP092402"] <- "Whole Blood"
+combinedMeta$Cellline[!is.na(combinedMeta$study) & combinedMeta$study == "SRP092402"] <- FALSE
+combinedMeta$Cancer[!is.na(combinedMeta$study) & combinedMeta$study == "SRP092402"] <- FALSE
+
+
+combinedMeta$Tissue[combinedMeta$study == "SRP116272" & grepl("source_name;;T cells", combinedMeta$sra.sample_attributes)] <- "Blood"
+combinedMeta$Tissue2[combinedMeta$study == "SRP116272" & grepl("source_name;;T cells", combinedMeta$sra.sample_attributes)] <- "T-cells"
+combinedMeta$Cellline[combinedMeta$study == "SRP116272" & grepl("source_name;;T cells", combinedMeta$sra.sample_attributes)] <- FALSE
+combinedMeta$Cancer[combinedMeta$study == "SRP116272" & grepl("source_name;;T cells", combinedMeta$sra.sample_attributes)] <- FALSE
+
+combinedMeta$Tissue[combinedMeta$study == "SRP116272" & grepl("source_name;;Monocytes", combinedMeta$sra.sample_attributes)] <- "Blood"
+combinedMeta$Tissue2[combinedMeta$study == "SRP116272" & grepl("source_name;;Monocytes", combinedMeta$sra.sample_attributes)] <- "Monocytes"
+combinedMeta$Cellline[combinedMeta$study == "SRP116272" & grepl("source_name;;Monocytes", combinedMeta$sra.sample_attributes)] <- FALSE
+combinedMeta$Cancer[combinedMeta$study == "SRP116272" & grepl("source_name;;Monocytes", combinedMeta$sra.sample_attributes)] <- FALSE
+
+combinedMeta$Tissue[combinedMeta$study == "SRP061932"] <- ""
+combinedMeta$Tissue2[combinedMeta$study == "SRP061932"] <- ""
+combinedMeta$Cellline[combinedMeta$study == "SRP061932"] <- FALSE
+combinedMeta$Cancer[combinedMeta$study == "SRP061932"] <- FALSE
+
+
+
+combinedMeta$Tissue[combinedMeta$study == "SRP047323"] <- ""
+combinedMeta$Tissue2[combinedMeta$study == "SRP047323"] <- ""
+combinedMeta$Cellline[combinedMeta$study == "SRP047323"] <- FALSE
+combinedMeta$Cancer[combinedMeta$study == "SRP047323"] <- FALSE
+
+
+
+combinedMeta$CelllineName[combinedMeta$study == "ERP001942"] <- "lcl"
+combinedMeta$Cellline[combinedMeta$study == "ERP001942"] <- TRUE
+
+
+combinedMeta$Tissue2[!is.na(combinedMeta$study) & combinedMeta$study == "ERP007111"] <- "iPSC"
+
+combinedMeta$Cellline[!is.na(combinedMeta$study) & combinedMeta$study == "ERP012914"] <- TRUE
+combinedMeta$CelllineName[!is.na(combinedMeta$study) & combinedMeta$study == "ERP012914"] <- "HAP1"
+
+
+combinedMeta$Tissue[!is.na(combinedMeta$study) & combinedMeta$study == "SRP151763"] <- "Eye"
+combinedMeta$Tissue2[!is.na(combinedMeta$study) & combinedMeta$study == "SRP151763"] <- "Retina"
+combinedMeta$Cellline[combinedMeta$study == "SRP151763"] <- FALSE
+combinedMeta$Cancer[combinedMeta$study == "SRP151763"] <- FALSE
+
+
+
+combinedMeta$Tissue[!is.na(combinedMeta$study) & combinedMeta$study == "SRP162411"] <- "Blood"
+combinedMeta$Tissue2[!is.na(combinedMeta$study) & combinedMeta$study == "SRP162411"] <- "Whole Blood"
+
+studySamples <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP102542"
+combinedMeta$Tissue[studySamples] <- "Muscle"
+combinedMeta$Tissue2[studySamples] <- ""
+combinedMeta$Cellline[studySamples] <- FALSE
+combinedMeta$Cancer[studySamples] <- FALSE
+
+
+studySamples <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP150311"
+combinedMeta$Tissue[studySamples] <- "Muscle"
+combinedMeta$Tissue2[studySamples] <- ""
+combinedMeta$Cellline[studySamples] <- FALSE
+combinedMeta$Cancer[studySamples] <- FALSE
+
+
+studySamples <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP162873"
+combinedMeta$Tissue[studySamples] <- "Muscle"
+combinedMeta$Tissue2[studySamples] <- ""
+combinedMeta$Cellline[studySamples] <- FALSE
+combinedMeta$Cancer[studySamples] <- FALSE
+
+
+
+studySamples <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP163524"
+combinedMeta$Tissue[studySamples] <- "Muscle"
+combinedMeta$Tissue2[studySamples] <- ""
+combinedMeta$Cellline[studySamples] <- FALSE
+combinedMeta$Cancer[studySamples] <- FALSE
+
+
+studySamples <- !is.na(combinedMeta$study) & combinedMeta$study %in% c("SRP006676", "SRP071758", "SRP081599", "SRP086078", "SRP119923")
+combinedMeta$Tissue[studySamples] <- "Airway Epithelial"
+combinedMeta$Tissue2[studySamples] <- ""
+combinedMeta$Cellline[studySamples] <- FALSE
+combinedMeta$Cancer[studySamples] <- FALSE
+
+
+samples <- combinedMeta$study == "SRP188219" & grepl("left atrial appendage", combinedMeta$sra.sample_attributes)
+combinedMeta$Tissue[samples] <- "Heart"
+combinedMeta$Tissue2[samples] <- "Left atrial appendage"
+combinedMeta$Cellline[samples] <- FALSE
+combinedMeta$Cancer[samples] <- FALSE
+
+
+samples <- combinedMeta$study == "SRP188219" & grepl("right atrial appendage", combinedMeta$sra.sample_attributes)
+combinedMeta$Tissue[samples] <- "Heart"
+combinedMeta$Tissue2[samples] <- "Right atrial appendage"
+combinedMeta$Cellline[samples] <- FALSE
+combinedMeta$Cancer[samples] <- FALSE
+
+
+
+# The three studies below are flagged as cell line "iPSC": no tissue, not cancer
+studySamples <- combinedMeta$study %in% "SRA755613"
+combinedMeta$Tissue[studySamples] <- ""
+combinedMeta$Tissue2[studySamples] <- ""
+combinedMeta$Cellline[studySamples] <- TRUE
+combinedMeta$CelllineName[studySamples] <- "iPSC"
+combinedMeta$Cancer[studySamples] <- FALSE
+
+studySamples <- combinedMeta$study %in% "SRA755626"
+combinedMeta$Tissue[studySamples] <- ""
+combinedMeta$Tissue2[studySamples] <- ""
+combinedMeta$Cellline[studySamples] <- TRUE
+combinedMeta$CelllineName[studySamples] <- "iPSC"
+combinedMeta$Cancer[studySamples] <- FALSE
+
+
+
+
+studySamples <- combinedMeta$study %in% "SRP148659"
+combinedMeta$Tissue[studySamples] <- ""
+combinedMeta$Tissue2[studySamples] <- ""
+combinedMeta$Cellline[studySamples] <- TRUE
+combinedMeta$CelllineName[studySamples] <- "iPSC"
+combinedMeta$Cancer[studySamples] <- FALSE
+
+# Annotations unclear: tissue emptied, cell line and cancer status set to missing
+studySamples <- combinedMeta$study %in% "SRP009316"
+combinedMeta$Tissue[studySamples] <- ""
+combinedMeta$Tissue2[studySamples] <- ""
+combinedMeta$Cellline[studySamples] <- NA
+combinedMeta$CelllineName[studySamples] <- ""
+combinedMeta$Cancer[studySamples] <- NA
+
+
+studySamples <- combinedMeta$study %in% "SRP021509"
+combinedMeta$Tissue[studySamples] <- ""
+combinedMeta$Tissue2[studySamples] <- ""
+combinedMeta$Cellline[studySamples] <- NA
+combinedMeta$CelllineName[studySamples] <- ""
+combinedMeta$Cancer[studySamples] <- NA
+
+
+# Unsure how to classify airway smooth muscle
+studySamples <- combinedMeta$study %in% "SRP043162"
+combinedMeta$Tissue[studySamples] <- ""
+combinedMeta$Tissue2[studySamples] <- ""
+combinedMeta$Cellline[studySamples] <- NA
+combinedMeta$CelllineName[studySamples] <- ""
+combinedMeta$Cancer[studySamples] <- NA
+
+studySamples <- combinedMeta$study %in% "SRP052896"
+combinedMeta$Cancer[studySamples] <- TRUE
+
+
+
+
+# Some organoids and cancers: annotations set to missing
+studySamples <- combinedMeta$study %in% "SRP058722"
+combinedMeta$Tissue[studySamples] <- ""
+combinedMeta$Tissue2[studySamples] <- ""
+combinedMeta$Cellline[studySamples] <- NA
+combinedMeta$CelllineName[studySamples] <- ""
+combinedMeta$Cancer[studySamples] <- NA
+
+
+
+
+# DPN and Tamoxifen treatments of parathyroid adenoma cells have a cancer CNV profile
+studySamples <- combinedMeta$study %in% "SRP012167"
+combinedMeta$Tissue[studySamples] <- ""
+combinedMeta$Tissue2[studySamples] <- ""
+combinedMeta$Cellline[studySamples] <- NA
+combinedMeta$CelllineName[studySamples] <- ""
+combinedMeta$Cancer[studySamples] <- NA
+
+
+
+studySamples <- combinedMeta$study %in% "SRP019936"
+combinedMeta$Cancer[studySamples] <- TRUE
+
+
+# Overwrite the sample title of a single run
+combinedMeta["SRR5341594", "sra.sample_title"] <- "Human differentiating macrophage"
+
+
+
+studySamples <- combinedMeta$study %in% "ERP011411"
+combinedMeta$Tissue[studySamples] <- ""
+combinedMeta$Tissue2[studySamples] <- ""
+combinedMeta$Cellline[studySamples] <- NA
+combinedMeta$CelllineName[studySamples] <- ""
+combinedMeta$Cancer[studySamples] <- NA
+
+
+studySamples <- combinedMeta$study %in% "SRP010166"
+combinedMeta$Cancer[studySamples] <- TRUE
+
+
+
+
+studySamples <- combinedMeta$study %in% "SRP012656"
+combinedMeta$Cancer[studySamples] <- TRUE
+
+
+# ERP006077: apply the study-wide default first so the attribute-specific rules below are not overwritten.
+# NOTE(review): a "Pancreas" default on a study with prostate samples looks like a wrong accession - confirm.
+samples <- combinedMeta$study %in% "ERP006077"
+combinedMeta$Tissue[samples] <- "Pancreas"
+combinedMeta$Tissue2[samples] <- ""
+combinedMeta$Cellline[samples] <- FALSE
+combinedMeta$Cancer[samples] <- TRUE
+samples <- combinedMeta$study %in% "ERP006077" & grepl("Primary Prostate Tumour", combinedMeta$sra.sample_attributes)
+combinedMeta$Tissue[samples] <- "Prostate"
+combinedMeta$Tissue2[samples] <- ""
+combinedMeta$Cellline[samples] <- FALSE
+combinedMeta$Cancer[samples] <- TRUE
+samples <- combinedMeta$study %in% "ERP006077" & grepl("Matched Adjacent", combinedMeta$sra.sample_attributes)
+combinedMeta$Tissue[samples] <- "Prostate"
+combinedMeta$Tissue2[samples] <- ""
+combinedMeta$Cellline[samples] <- FALSE
+combinedMeta$Cancer[samples] <- FALSE
+
+samples <- combinedMeta$study %in% "SRP058587"
+combinedMeta$Tissue[samples] <- "Breast"
+combinedMeta$Tissue2[samples] <- ""
+combinedMeta$Cellline[samples] <- FALSE
+combinedMeta$Cancer[samples] <- TRUE
+
+samples <- combinedMeta$study %in% "SRP062332"
+combinedMeta$Tissue[samples] <- ""
+combinedMeta$Tissue2[samples] <- ""
+combinedMeta$Cellline[samples] <- TRUE
+combinedMeta$Cancer[samples] <- NA
+
+
+
+samples <- combinedMeta$study %in% "SRP030401"
+combinedMeta$Tissue[samples] <- "Breast"
+combinedMeta$Tissue2[samples] <- ""
+combinedMeta$Cellline[samples] <- FALSE
+combinedMeta$Cancer[samples] <- TRUE
+
+
+
+samples <- combinedMeta$study %in% "SRP028344"
+combinedMeta$Tissue[samples] <- ""
+combinedMeta$Tissue2[samples] <- ""
+combinedMeta$Cellline[samples] <- TRUE
+combinedMeta$Cancer[samples] <- NA
+
+
+
+# SRP073061: only runs whose experiment title mentions "Tumor" are annotated
+samples <- combinedMeta$study %in% "SRP073061" & grepl("Tumor", combinedMeta$sra.experiment_title)
+combinedMeta$Tissue[samples] <- "Breast"
+combinedMeta$Tissue2[samples] <- ""
+combinedMeta$Cellline[samples] <- FALSE
+combinedMeta$Cancer[samples] <- TRUE
+
+
+
+
+samples <- combinedMeta$study %in% "SRP028346"
+combinedMeta$Tissue[samples] <- ""
+combinedMeta$Tissue2[samples] <- ""
+combinedMeta$Cellline[samples] <- TRUE
+combinedMeta$Cancer[samples] <- NA
+
+
+samples <- combinedMeta$study %in% "SRP058571"
+combinedMeta$Tissue[samples] <- ""
+combinedMeta$Tissue2[samples] <- ""
+combinedMeta$Cellline[samples] <- TRUE
+combinedMeta$Cancer[samples] <- NA
+
+studySamples <- combinedMeta$study %in% c("SRP014027", "SRP006575", "SRP071932", "ERP004617", "SRP034592", "SRP049695")
+combinedMeta$Tissue[studySamples] <- ""
+combinedMeta$Tissue2[studySamples] <- ""
+combinedMeta$Cellline[studySamples] <- NA
+combinedMeta$CelllineName[studySamples] <- ""
+combinedMeta$Cancer[studySamples] <- NA
+
+
+
+samples <- combinedMeta$study %in% "SRP066596"
+combinedMeta$Tissue[samples] <- ""
+combinedMeta$Tissue2[samples] <- ""
+combinedMeta$Cellline[samples] <- NA
+combinedMeta$Cancer[samples] <- TRUE
+
+
+
+samples <- combinedMeta$study %in% "SRP049648"
+combinedMeta$Tissue[samples] <- ""
+combinedMeta$Tissue2[samples] <- ""
+combinedMeta$Cellline[samples] <- TRUE
+combinedMeta$Cancer[samples] <- NA
+
+
+
+# Studies where only the cancer flag is set
+studySamples <- combinedMeta$study %in% "SRP039694"
+combinedMeta$Cancer[studySamples] <- TRUE
+
+
+studySamples <- combinedMeta$study %in% "SRP066260"
+combinedMeta$Cancer[studySamples] <- TRUE
+
+
+
+# Merge the annotations in mahmoudAnnotations into combinedMeta.
+# Its column names are first aligned with the ones used in combinedMeta.
+colnames(mahmoudAnnotations)[colnames(mahmoudAnnotations) == "Cell_Line"] <- "Cellline"
+colnames(mahmoudAnnotations)[colnames(mahmoudAnnotations) == "Cell_Line_Name"] <- "CelllineName"
+
+# Abort on unknown names: the assignment below would otherwise silently append new rows/columns.
+stopifnot(all(colnames(mahmoudAnnotations) %in% colnames(combinedMeta)))
+stopifnot(all(rownames(mahmoudAnnotations) %in% rownames(combinedMeta)))
+
+combinedMeta[rownames(mahmoudAnnotations), colnames(mahmoudAnnotations)] <- mahmoudAnnotations
+
+
+# Samples flagged as cell line get their Cancer flag set to FALSE
+combinedMeta$Cancer[combinedMeta$Cellline] <- FALSE
+
+tmp <- combinedMeta$Tissue %in% "Cervix Uteri"
+combinedMeta$Tissue[tmp] <- "Uterus"
+combinedMeta$Tissue2[tmp] <- "Cervix"
+
+tmp <- combinedMeta$Tissue %in% "Cervix"
+combinedMeta$Tissue[tmp] <- "Uterus"
+combinedMeta$Tissue2[tmp] <- "Cervix"
+
+tmp <- combinedMeta$Tissue %in% "Lymph node"
+combinedMeta$Tissue[tmp] <- "Lymph Nodes"
+
+tmp <- combinedMeta$Tissue %in% "Bone marrow"
+combinedMeta$Tissue[tmp] <- "Bone Marrow"
+
+tmp <- combinedMeta$Tissue %in% "Colorectal"
+combinedMeta$Tissue[tmp] <- "Colon"
+
+# "Whole blood" becomes Tissue "Blood" with Tissue2 "Whole Blood"
+tmp <- combinedMeta$Tissue %in% "Whole blood"
+combinedMeta$Tissue2[tmp] <- "Whole Blood"
+combinedMeta$Tissue[tmp] <- "Blood"
+
+
+
+# Below are the Tissue2 fixes by Mahmoud:
+# - annotations already present in Tissue are removed from Tissue2
+# - duplicated labels are harmonized
+# - rare annotations are set to NA
+# - all parts of the basal ganglia (including substantia nigra) were annotated as basal ganglia
+# - brain fragments were set to NA
+# - retina needs to have Eye as Tissue
+# - a sample annotated as both brain & stomach was annotated as NA
+
+# Adipose Tissue (columns are addressed by name instead of by position)
+# Tissue2 includes "Adipose - Subcutaneous" & "Adipose - Visceral (Omentum)"
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Adipose - Subcutaneous"] <- "Subcutaneous"
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Adipose - Visceral (Omentum)"] <- "Visceral"
+combinedMeta$Tissue[combinedMeta$Tissue %in% "Adipose Tissue"] <- "Adipose"
+
+# Adrenal Gland
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Adrenal Gland"] <- NA
+
+# AML
+# keep as is
+
+# Arteries
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Artery - Aorta"] <- "Aorta"
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Artery - Coronary"] <- "Coronary"
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Artery - Tibial"] <- "Tibial"
+
+# B-cells
+# keep as is
+
+# basal ganglion
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "basal ganglion"] <- "Basal Ganglia"
+
+# Bladder
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Bladder"] <- NA
+
+# Brain (keep as GTEX)****
+# Check for brain cortex vs cortex vs cerebral cortex
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Brain - Amygdala"] <- "Amygdala"
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Brain - Anterior cingulate cortex (BA24)"] <- "Anterior cingulate cortex"
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Brain - Caudate (basal ganglia)"] <- "Caudate (basal ganglia)"
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Brain - Cerebellar Hemisphere"] <- "Cerebellar Hemisphere"
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Brain - Cerebellum"] <- "Cerebellum"
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Brain - Cortex"] <- "Cortex"
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Brain - Frontal Cortex (BA9)"] <- "Frontal Cortex"
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Brain - Hippocampus"] <- "Hippocampus"
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Brain - Hypothalamus"] <- "Hypothalamus"
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Brain - Nucleus accumbens (basal ganglia)"] <- "Nucleus accumbens (basal ganglia)"
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Brain - Putamen (basal ganglia)"] <- "Putamen (basal ganglia)"
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Brain - Spinal cord (cervical c-1)"] <- "Spinal Cord (cervical c-1)"
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Brain - Substantia nigra"] <- "Substantia nigra"
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "brain fragment"] <- NA
+
+# Breast (columns are addressed by name instead of by position)
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Breast - Mammary Tissue"] <- "Mammary Tissue"
+
+# CD34+
+# keep as is
+
+# Cultured fibroblasts
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Cells - Cultured fibroblasts"] <- "Cultured Fibroblasts"
+
+# cerebellum
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "cerebellum"] <- "Cerebellum"
+
+# cerebral cortex
+# recheck****
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "cerebral cortex"] <- "Cortex"
+
+# Cervix
+# keep as is
+
+# choroid plexus
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "choroid plexus"] <- NA
+
+# Colon
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Colon - Sigmoid"] <- "Sigmoid"
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Colon - Transverse"] <- "Transverse"
+
+# diencephalon & diencephalon and midbrain
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "diencephalon"] <- "Diencephalon"
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "diencephalon and midbrain"] <- NA
+
+# DLBCL
+# keep as is
+
+# Esophagus
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Esophagus - Gastroesophageal Junction"] <- "Gastroesophageal Junction"
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Esophagus - Mucosa"] <- "Mucosa"
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Esophagus - Muscularis"] <- "Muscularis"
+
+# Fallopian Tube
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Fallopian Tube"] <- NA
+
+# forebrain
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "forebrain"] <- NA
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "forebrain and midbrain"] <- NA
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "forebrain fragment"] <- NA
+
+# Heart
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Heart - Atrial Appendage"] <- "Atrial Appendage"
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Heart - Left Ventricle"] <- "Left Ventricle"
+
+# hindbrain
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "hindbrain"] <- NA
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "hindbrain fragment"] <- NA
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "hindbrain without cerebellum"] <- NA
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "hippocampus"] <- "Hippocampus"
+
+# Kidney
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Kidney - Cortex"] <- "Cortex"
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Kidney - Medulla"] <- "Medulla"
+
+# Liver
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Liver"] <- NA
+
+# Lung
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Lung"] <- NA
+
+# medulla oblongata
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "medulla oblongata"] <- "Medulla Oblongata"
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "midbrain"] <- "Midbrain"
+
+# Minor salivary gland (columns are addressed by name instead of by position)
+# keep as is
+
+# Monocytes
+# keep as is
+
+# Muscle-skeletal
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Muscle - Skeletal"] <- "Skeletal"
+
+# Nerve
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Nerve - Tibial"] <- "Tibial"
+
+# NK-cells
+# keep as is
+
+# Ovary
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Ovary"] <- NA
+
+# Pancreas
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Pancreas"] <- NA
+
+# PBMC
+# keep as is
+
+# Pituitary
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Pituitary"] <- NA
+
+# pituitary and diencephalon & pons
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "pituitary and diencephalon"] <- NA
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "pons"] <- NA
+
+# prostate
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Prostate"] <- NA
+
+# skin
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Skin - Not Sun Exposed (Suprapubic)"] <- "Suprapubic"
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Skin - Sun Exposed (Lower leg)"] <- "Lower Leg"
+
+# Small Intestine
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Small Intestine - Terminal Ileum"] <- "Terminal Ileum"
+
+# spinal cord
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "spinal cord"] <- ""
+
+# Spleen
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Spleen"] <- NA
+
+# Stomach
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Stomach"] <- NA
+# Lower-case "stomach": clear Tissue first, while the Tissue2 value is still there to match on
+combinedMeta$Tissue[combinedMeta$Tissue2 %in% "stomach"] <- NA
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "stomach"] <- NA
+# T-cells (columns are addressed by name instead of by position)
+# keep as is
+
+# telencephalon
+# too general
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "telencephalon"] <- NA
+
+# temporal lobe
+# The temporal lobe is one of the four major lobes of the cerebral cortex
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "temporal lobe"] <- "Cortex"
+
+# Testis
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Testis"] <- NA
+
+# Thyroid
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Thyroid"] <- NA
+
+# Uterus
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Uterus"] <- NA
+
+# Vagina
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Vagina"] <- NA
+
+# whole blood
+# keep as is
+
+# Move iPSC from Tissue2 to CelllineName (name first, while Tissue2 still matches)
+combinedMeta$CelllineName[combinedMeta$Tissue2 %in% "iPSC"] <- "iPSC"
+combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "iPSC"] <- ""
+
+# Replace the remaining NA tissue annotations by empty strings
+combinedMeta$Tissue2[is.na(combinedMeta$Tissue2)] <- ""
+combinedMeta$Tissue[is.na(combinedMeta$Tissue)] <- ""
+
+
+
+
+
+
+
+# Harmonizing cell line names for samples in recount3 (CelllineName addressed by name, not position)
+
+# A549
+combinedMeta$CelllineName[combinedMeta$CelllineName %in% "a549"] <- "A549"
+
+# H-STS NET
+# keep as is
+
+# H1
+combinedMeta$CelllineName[combinedMeta$CelllineName %in% "h1"] <- "H1"
+
+# H9
+combinedMeta$CelllineName[combinedMeta$CelllineName %in% "h9"] <- "H9"
+
+# HAP1
+combinedMeta$CelllineName[combinedMeta$CelllineName %in% "hap1"] <- "HAP1"
+
+# HCT116
+combinedMeta$CelllineName[combinedMeta$CelllineName %in% "hct116"] <- "HCT116"
+
+# HEK293
+combinedMeta$CelllineName[combinedMeta$CelllineName %in% "hek293"] <- "HEK293"
+
+# HeLa
+combinedMeta$CelllineName[combinedMeta$CelllineName %in% "hela"] <- "HeLa"
+
+# HepaRG
+# keep as is
+
+# HepG2
+combinedMeta$CelllineName[combinedMeta$CelllineName %in% "hepg2"] <- "HepG2"
+
+# iPSC
+combinedMeta$CelllineName[combinedMeta$CelllineName %in% "ipsc"] <- "iPSC"
+
+# K562
+combinedMeta$CelllineName[combinedMeta$CelllineName %in% "k562"] <- "K562"
+
+# LCLs
+combinedMeta$CelllineName[combinedMeta$CelllineName %in% "lcl"] <- "LCL"
+
+# lcl_s4u_capturing
+combinedMeta$CelllineName[combinedMeta$CelllineName %in% "lcl_s4u_capturing"] <- "LCL_S4U_Capturing"
+
+# MCF10A
+combinedMeta$CelllineName[combinedMeta$CelllineName %in% "mcf10a"] <- "MCF10A"
+
+# MCF7
+combinedMeta$CelllineName[combinedMeta$CelllineName %in% "mcf7"] <- "MCF7"
+
+# MDA231
+combinedMeta$CelllineName[combinedMeta$CelllineName %in% "mda231"] <- "MDA231"
+
+# T47D
+combinedMeta$CelllineName[combinedMeta$CelllineName %in% "t47d"] <- "T47D"
+
+
+# Fix SRP045234 annotations (columns are addressed by name instead of by position 1-5)
+combinedMeta$Tissue[combinedMeta$study %in% "SRP045234"] <- ""
+combinedMeta$Tissue2[combinedMeta$study %in% "SRP045234"] <- ""
+combinedMeta$Cellline[combinedMeta$study %in% "SRP045234"] <- TRUE
+combinedMeta$CelllineName[combinedMeta$study %in% "SRP045234"] <- "iPSC"
+combinedMeta$Cancer[combinedMeta$study %in% "SRP045234"] <- FALSE
+# Fix SRP007525 annotations
+combinedMeta$Tissue[combinedMeta$study %in% "SRP007525"] <- ""
+combinedMeta$Tissue2[combinedMeta$study %in% "SRP007525"] <- ""
+combinedMeta$Cellline[combinedMeta$study %in% "SRP007525"] <- TRUE
+combinedMeta$CelllineName[combinedMeta$study %in% "SRP007525"] <- "OCI-LY1"
+combinedMeta$Cancer[combinedMeta$study %in% "SRP007525"] <- FALSE
+# Fix SRP027358 & SRP032926: flag as fetal
+combinedMeta$Fetal[combinedMeta$study %in% "SRP027358"] <- TRUE
+combinedMeta$Fetal[combinedMeta$study %in% "SRP032926"] <- TRUE
+
+# Fix SRP026537 annotations
+combinedMeta$Tissue[combinedMeta$study %in% "SRP026537"] <- ""
+combinedMeta$Tissue2[combinedMeta$study %in% "SRP026537"] <- ""
+combinedMeta$Cellline[combinedMeta$study %in% "SRP026537"] <- TRUE
+combinedMeta$CelllineName[combinedMeta$study %in% "SRP026537"] <- ""
+combinedMeta$Cancer[combinedMeta$study %in% "SRP026537"] <- FALSE
+
+# Fix SRP049063 annotations
+combinedMeta$Tissue[combinedMeta$study %in% "SRP049063"] <- ""
+combinedMeta$Tissue2[combinedMeta$study %in% "SRP049063"] <- ""
+combinedMeta$Cellline[combinedMeta$study %in% "SRP049063"] <- TRUE
+combinedMeta$CelllineName[combinedMeta$study %in% "SRP049063"] <- "HT-29"
+combinedMeta$Cancer[combinedMeta$study %in% "SRP049063"] <- FALSE
+# Fix SRP053034 annotations
+combinedMeta$Tissue[combinedMeta$study %in% "SRP053034"] <- ""
+combinedMeta$Tissue2[combinedMeta$study %in% "SRP053034"] <- ""
+combinedMeta$Cellline[combinedMeta$study %in% "SRP053034"] <- TRUE
+combinedMeta$CelllineName[combinedMeta$study %in% "SRP053034"] <- "RPE-1"
+combinedMeta$Cancer[combinedMeta$study %in% "SRP053034"] <- FALSE
+# Fix SRP056197 annotations: AML samples, tissue chosen from the SRA sample attributes
+samples <- combinedMeta$study %in% "SRP056197" & grepl("Bone marrow", combinedMeta$sra.sample_attributes)
+combinedMeta$Tissue[samples] <- "Bone Marrow"
+combinedMeta$Tissue2[samples] <- "AML"
+combinedMeta$Cellline[samples] <- FALSE
+combinedMeta$Cancer[samples] <- TRUE
+samples <- combinedMeta$study %in% "SRP056197" & grepl("Heparinised blood", combinedMeta$sra.sample_attributes)
+combinedMeta$Tissue[samples] <- "Blood"
+combinedMeta$Tissue2[samples] <- "AML"
+combinedMeta$Cellline[samples] <- FALSE
+combinedMeta$Cancer[samples] <- TRUE
+
+
+# Fix SRP013565 annotations
+combinedMeta$Tissue[combinedMeta$study %in% "SRP013565"] <- ""
+combinedMeta$Tissue2[combinedMeta$study %in% "SRP013565"] <- ""
+combinedMeta$Cellline[combinedMeta$study %in% "SRP013565"] <- TRUE
+combinedMeta$CelllineName[combinedMeta$study %in% "SRP013565"] <- ""
+combinedMeta$Cancer[combinedMeta$study %in% "SRP013565"] <- FALSE
+
+# Fix ERP008682 annotations
+combinedMeta$Tissue[combinedMeta$study %in% "ERP008682"] <- ""
+combinedMeta$Tissue2[combinedMeta$study %in% "ERP008682"] <- ""
+combinedMeta$Cellline[combinedMeta$study %in% "ERP008682"] <- TRUE
+combinedMeta$CelllineName[combinedMeta$study %in% "ERP008682"] <- "H9"
+combinedMeta$Cancer[combinedMeta$study %in% "ERP008682"] <- FALSE
+# Fix SRP033646 annotations
+combinedMeta$Tissue[combinedMeta$study %in% "SRP033646"] <- ""
+combinedMeta$Tissue2[combinedMeta$study %in% "SRP033646"] <- ""
+combinedMeta$Cellline[combinedMeta$study %in% "SRP033646"] <- TRUE
+combinedMeta$CelllineName[combinedMeta$study %in% "SRP033646"] <- "Caco2"
+combinedMeta$Cancer[combinedMeta$study %in% "SRP033646"] <- FALSE
+# Fix SRP027383 annotations
+combinedMeta$Tissue[combinedMeta$study %in% "SRP027383"] <- "Brain"
+combinedMeta$Tissue2[combinedMeta$study %in% "SRP027383"] <- ""
+combinedMeta$Cellline[combinedMeta$study %in% "SRP027383"] <- FALSE
+combinedMeta$CelllineName[combinedMeta$study %in% "SRP027383"] <- ""
+combinedMeta$Cancer[combinedMeta$study %in% "SRP027383"] <- TRUE
+# Fix SRP050003 annotations (cancer status taken from the SRA sample attributes)
+combinedMeta$Tissue[combinedMeta$study %in% "SRP050003"] <- "Liver"
+combinedMeta$Tissue2[combinedMeta$study %in% "SRP050003"] <- ""
+combinedMeta$Cellline[combinedMeta$study %in% "SRP050003"] <- FALSE
+combinedMeta$CelllineName[combinedMeta$study %in% "SRP050003"] <- ""
+combinedMeta$Cancer[combinedMeta$study %in% "SRP050003" & grepl("non-tumoral", combinedMeta$sra.sample_attributes, ignore.case = TRUE)] <- FALSE
+combinedMeta$Cancer[combinedMeta$study %in% "SRP050003" & grepl("carcinoma", combinedMeta$sra.sample_attributes, ignore.case = TRUE)] <- TRUE
+# Fix SRP073253 annotations
+combinedMeta$Tissue[combinedMeta$study %in% "SRP073253"] <- "Kidney"
+combinedMeta$Tissue2[combinedMeta$study %in% "SRP073253"] <- ""
+combinedMeta$Cellline[combinedMeta$study %in% "SRP073253"] <- FALSE
+combinedMeta$CelllineName[combinedMeta$study %in% "SRP073253"] <- ""
+combinedMeta$Cancer[combinedMeta$study %in% "SRP073253"] <- TRUE
+# Fix SRP069235 annotations
+combinedMeta$Tissue[combinedMeta$study %in% "SRP069235"] <- "Brain"
+combinedMeta$Tissue2[combinedMeta$study %in% "SRP069235"] <- ""
+combinedMeta$Cellline[combinedMeta$study %in% "SRP069235"] <- FALSE
+combinedMeta$CelllineName[combinedMeta$study %in% "SRP069235"] <- ""
+combinedMeta$Cancer[combinedMeta$study %in% "SRP069235"] <- TRUE
+# Fix SRP074425 annotations
+combinedMeta$Tissue[combinedMeta$study %in% "SRP074425"] <- "Brain"
+combinedMeta$Tissue2[combinedMeta$study %in% "SRP074425"] <- ""
+combinedMeta$Cellline[combinedMeta$study %in% "SRP074425"] <- FALSE
+combinedMeta$CelllineName[combinedMeta$study %in% "SRP074425"] <- ""
+combinedMeta$Cancer[combinedMeta$study %in% "SRP074425"] <- TRUE
+# Fix SRP044668 annotations (cancer status taken from the SRA sample attributes)
+combinedMeta$Tissue[combinedMeta$study %in% "SRP044668"] <- "Brain"
+combinedMeta$Tissue2[combinedMeta$study %in% "SRP044668"] <- ""
+combinedMeta$Cellline[combinedMeta$study %in% "SRP044668"] <- FALSE
+combinedMeta$CelllineName[combinedMeta$study %in% "SRP044668"] <- ""
+combinedMeta$Cancer[combinedMeta$study %in% "SRP044668" & grepl("non-neoplastic", combinedMeta$sra.sample_attributes, ignore.case = TRUE)] <- FALSE
+combinedMeta$Cancer[combinedMeta$study %in% "SRP044668" & grepl("glioma", combinedMeta$sra.sample_attributes, ignore.case = TRUE)] <- TRUE
+# Fix SRP009123 annotations (cancer status taken from the SRA sample attributes)
+combinedMeta$Tissue[combinedMeta$study %in% "SRP009123"] <- "Liver"
+combinedMeta$Tissue2[combinedMeta$study %in% "SRP009123"] <- ""
+combinedMeta$Cellline[combinedMeta$study %in% "SRP009123"] <- FALSE
+combinedMeta$CelllineName[combinedMeta$study %in% "SRP009123"] <- ""
+combinedMeta$Cancer[combinedMeta$study %in% "SRP009123" & grepl("non-tumor", combinedMeta$sra.sample_attributes, ignore.case = TRUE)] <- FALSE
+combinedMeta$Cancer[combinedMeta$study %in% "SRP009123" & grepl("carcinoma", combinedMeta$sra.sample_attributes, ignore.case = TRUE)] <- TRUE
+# Fix SRP041094 annotations
+combinedMeta$Tissue[combinedMeta$study %in% "SRP041094"] <- "Prostate"
+combinedMeta$Tissue2[combinedMeta$study %in% "SRP041094"] <- ""
+combinedMeta$Cellline[combinedMeta$study %in% "SRP041094"] <- FALSE
+combinedMeta$CelllineName[combinedMeta$study %in% "SRP041094"] <- ""
+combinedMeta$Cancer[combinedMeta$study %in% "SRP041094"] <- TRUE
+# Fix SRP040998 annotations (cancer status left missing)
+combinedMeta$Tissue[combinedMeta$study %in% "SRP040998"] <- "Liver"
+combinedMeta$Tissue2[combinedMeta$study %in% "SRP040998"] <- ""
+combinedMeta$Cellline[combinedMeta$study %in% "SRP040998"] <- FALSE
+combinedMeta$CelllineName[combinedMeta$study %in% "SRP040998"] <- ""
+combinedMeta$Cancer[combinedMeta$study %in% "SRP040998"] <- NA
+# Fix SRP052056 annotations (cancer status taken from the SRA sample attributes)
+combinedMeta$Tissue[combinedMeta$study %in% "SRP052056"] <- "Thyroid"
+combinedMeta$Tissue2[combinedMeta$study %in% "SRP052056"] <- ""
+combinedMeta$Cellline[combinedMeta$study %in% "SRP052056"] <- FALSE
+combinedMeta$CelllineName[combinedMeta$study %in% "SRP052056"] <- ""
+combinedMeta$Cancer[combinedMeta$study %in% "SRP052056" & grepl("healthy", combinedMeta$sra.sample_attributes, ignore.case = TRUE)] <- FALSE
+combinedMeta$Cancer[combinedMeta$study %in% "SRP052056" & grepl("carcinoma", combinedMeta$sra.sample_attributes, ignore.case = TRUE)] <- TRUE
+# Fix SRP029880 annotations
+combinedMeta$Tissue[combinedMeta$study %in% "SRP029880"] <- "Colon"
+combinedMeta$Tissue2[combinedMeta$study %in% "SRP029880"] <- ""
+combinedMeta$Cellline[combinedMeta$study %in% "SRP029880"] <- FALSE
+combinedMeta$CelllineName[combinedMeta$study %in% "SRP029880"] <- ""
+combinedMeta$Cancer[combinedMeta$study %in% "SRP029880"] <- TRUE
+# Fix SRP056696 annotations (cancer status taken from the SRA sample attributes)
+combinedMeta$Tissue[combinedMeta$study %in% "SRP056696"] <- "Liver"
+combinedMeta$Tissue2[combinedMeta$study %in% "SRP056696"] <- ""
+combinedMeta$Cellline[combinedMeta$study %in% "SRP056696"] <- FALSE
+combinedMeta$CelllineName[combinedMeta$study %in% "SRP056696"] <- ""
+combinedMeta$Cancer[combinedMeta$study %in% "SRP056696" & grepl("Normal", combinedMeta$sra.sample_attributes, ignore.case = TRUE)] <- FALSE
+combinedMeta$Cancer[combinedMeta$study %in% "SRP056696" & grepl("Tumor", combinedMeta$sra.sample_attributes, ignore.case = TRUE)] <- TRUE
+# Fix SRP066794 annotations
+combinedMeta$Tissue[combinedMeta$study %in% "SRP066794"] <- "Lung"
+combinedMeta$Tissue2[combinedMeta$study %in% "SRP066794"] <- ""
+combinedMeta$Cellline[combinedMeta$study %in% "SRP066794"] <- FALSE
+combinedMeta$CelllineName[combinedMeta$study %in% "SRP066794"] <- ""
+combinedMeta$Cancer[combinedMeta$study %in% "SRP066794"] <- TRUE
+# Fix SRP149374 annotations (cancer status taken from the SRA sample attributes)
+combinedMeta$Tissue[combinedMeta$study %in% "SRP149374"] <- "Bone Marrow"
+combinedMeta$Tissue2[combinedMeta$study %in% "SRP149374"] <- "CD34+"
+combinedMeta$Cellline[combinedMeta$study %in% "SRP149374"] <- FALSE
+combinedMeta$CelllineName[combinedMeta$study %in% "SRP149374"] <- ""
+combinedMeta$Cancer[combinedMeta$study %in% "SRP149374" & grepl("Healthy", combinedMeta$sra.sample_attributes, ignore.case = TRUE)] <- FALSE
+combinedMeta$Cancer[combinedMeta$study %in% "SRP149374" & grepl("Myelodysplastic", combinedMeta$sra.sample_attributes, ignore.case = TRUE)] <- TRUE
+# Fix SRP019250 annotations (cell line name taken from the SRA sample attributes)
+combinedMeta$Tissue[combinedMeta$study %in% "SRP019250"] <- ""
+combinedMeta$Tissue2[combinedMeta$study %in% "SRP019250"] <- ""
+combinedMeta$Cellline[combinedMeta$study %in% "SRP019250"] <- TRUE
+combinedMeta$CelllineName[combinedMeta$study %in% "SRP019250" & grepl("HEK", combinedMeta$sra.sample_attributes, ignore.case = TRUE)] <- "HEK"
+combinedMeta$CelllineName[combinedMeta$study %in% "SRP019250" & grepl("LCL", combinedMeta$sra.sample_attributes, ignore.case = TRUE)] <- "LCL"
+combinedMeta$Cancer[combinedMeta$study %in% "SRP019250"] <- FALSE
+# Fix SRP074349 annotations: samples mentioning NSCLC are cancer, all others are not
+combinedMeta$Tissue[combinedMeta$study %in% "SRP074349"] <- "Lung"
+combinedMeta$Tissue2[combinedMeta$study %in% "SRP074349"] <- ""
+combinedMeta$Cellline[combinedMeta$study %in% "SRP074349"] <- FALSE
+combinedMeta$CelllineName[combinedMeta$study %in% "SRP074349"] <- ""
+combinedMeta$Cancer[combinedMeta$study %in% "SRP074349" & !grepl("NSCLC", combinedMeta$sra.sample_attributes, ignore.case = TRUE)] <- FALSE
+combinedMeta$Cancer[combinedMeta$study %in% "SRP074349" & grepl("NSCLC", combinedMeta$sra.sample_attributes, ignore.case = TRUE)] <- TRUE
+
+# SRP009067: LCL cell-line samples; no tissue, not cancer.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP009067"
+combinedMeta[studyRows, c("Tissue", "Tissue2")] <- ""
+combinedMeta[studyRows, "Cellline"] <- TRUE
+combinedMeta[studyRows, "CelllineName"] <- "LCL"
+combinedMeta[studyRows, "Cancer"] <- FALSE
+# SRP007885: LCL cell-line samples; no tissue, not cancer.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP007885"
+combinedMeta[studyRows, c("Tissue", "Tissue2")] <- ""
+combinedMeta[studyRows, "Cellline"] <- TRUE
+combinedMeta[studyRows, "CelllineName"] <- "LCL"
+combinedMeta[studyRows, "Cancer"] <- FALSE
+# SRP018218: rows whose attributes mention "cell line" are cell lines without tissue; the rest are pancreas stellate cells. All are flagged as cancer.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP018218"
+isCellLine <- grepl("cell line", combinedMeta$sra.sample_attributes, ignore.case = TRUE)
+combinedMeta[studyRows & !isCellLine, "Tissue"] <- "Pancreas"
+combinedMeta[studyRows & !isCellLine, "Tissue2"] <- "Stellate Cells"
+combinedMeta[studyRows & isCellLine, c("Tissue", "Tissue2")] <- ""
+combinedMeta[studyRows & isCellLine, "Cellline"] <- TRUE
+combinedMeta[studyRows & !isCellLine, "Cellline"] <- FALSE
+combinedMeta[studyRows, "CelllineName"] <- ""
+combinedMeta[studyRows, "Cancer"] <- TRUE
+# SRP019275: cell-line samples with no tissue and no line name; not cancer.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP019275"
+combinedMeta[studyRows, c("Tissue", "Tissue2")] <- ""
+combinedMeta[studyRows, "Cellline"] <- TRUE
+combinedMeta[studyRows, "CelllineName"] <- ""
+combinedMeta[studyRows, "Cancer"] <- FALSE
+# SRP042186: cell-line samples with no tissue and no line name; not cancer.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP042186"
+combinedMeta[studyRows, c("Tissue", "Tissue2")] <- ""
+combinedMeta[studyRows, "Cellline"] <- TRUE
+combinedMeta[studyRows, "CelllineName"] <- ""
+combinedMeta[studyRows, "Cancer"] <- FALSE
+# SRP042620: breast samples and cell lines; Cancer is TRUE for ER+ / triple negative and FALSE for uninvolved / no-known-cancer samples.
+# "ER\\+" escapes the plus sign; unescaped, "ER+" is a regex quantifier and (ignoring case) matches any "er", e.g. inside "cancer".
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP042620"
+isCellLine <- grepl("cell line", combinedMeta$sra.sample_attributes, ignore.case = TRUE)
+combinedMeta[studyRows & !isCellLine, "Tissue"] <- "Breast"
+combinedMeta[studyRows & isCellLine, "Tissue"] <- ""
+combinedMeta[studyRows, c("Tissue2", "CelllineName")] <- ""
+combinedMeta[studyRows & isCellLine, "Cellline"] <- TRUE
+combinedMeta[studyRows & !isCellLine, "Cellline"] <- FALSE
+combinedMeta[studyRows & grepl("ER\\+", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Cancer"] <- TRUE
+combinedMeta[studyRows & grepl("Triple Negative", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Cancer"] <- TRUE
+combinedMeta[studyRows & grepl("Uninvolved", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Cancer"] <- FALSE
+combinedMeta[studyRows & grepl("No Known", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Cancer"] <- FALSE
+# ERP010142: breast samples, not cell lines; every sample is flagged as cancer.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "ERP010142"
+combinedMeta[studyRows, "Tissue"] <- "Breast"
+combinedMeta[studyRows, c("Tissue2", "CelllineName")] <- ""
+combinedMeta[studyRows, "Cellline"] <- FALSE
+combinedMeta[studyRows, "Cancer"] <- TRUE
+# SRP026600: tissue and cell-line name are blanked; Cellline and Cancer are set to NA.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP026600"
+combinedMeta[studyRows, c("Tissue", "Tissue2")] <- ""
+combinedMeta[studyRows, "CelllineName"] <- ""
+combinedMeta[studyRows, "Cellline"] <- NA
+combinedMeta[studyRows, "Cancer"] <- NA
+# SRP028336: tissue is picked from keywords in the sample attributes (muscle, kidney, three brain regions); not cell lines, not cancer.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP028336"
+combinedMeta[studyRows & grepl("muscle", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue"] <- "Muscle"
+combinedMeta[studyRows & grepl("muscle", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue2"] <- ""
+combinedMeta[studyRows & grepl("kidney", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue"] <- "Kidney"
+combinedMeta[studyRows & grepl("kidney", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue2"] <- ""
+combinedMeta[studyRows & grepl("prefrontal cortex", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue"] <- "Brain"
+combinedMeta[studyRows & grepl("prefrontal cortex", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue2"] <- "Prefrontal Cortex"
+combinedMeta[studyRows & grepl("cerebellar Cortex", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue"] <- "Brain"
+combinedMeta[studyRows & grepl("cerebellar Cortex", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue2"] <- "Cerebellar Cortex"
+combinedMeta[studyRows & grepl("primary visual cortex", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue"] <- "Brain"
+combinedMeta[studyRows & grepl("primary visual cortex", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue2"] <- "Primary Visual Cortex"
+combinedMeta[studyRows, c("Cellline", "Cancer")] <- FALSE
+combinedMeta[studyRows, "CelllineName"] <- ""
+# SRP009029: tissue and cell-line name are blanked; Cellline and Cancer are set to NA.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP009029"
+combinedMeta[studyRows, c("Tissue", "Tissue2")] <- ""
+combinedMeta[studyRows, "CelllineName"] <- ""
+combinedMeta[studyRows, "Cellline"] <- NA
+combinedMeta[studyRows, "Cancer"] <- NA
+# SRP006912: HK-2 cell-line samples; no tissue, not cancer.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP006912"
+combinedMeta[studyRows, c("Tissue", "Tissue2")] <- ""
+combinedMeta[studyRows, "Cellline"] <- TRUE
+combinedMeta[studyRows, "CelllineName"] <- "HK-2"
+combinedMeta[studyRows, "Cancer"] <- FALSE
+# SRP055444: blood samples labelled CLL, not cell lines; every sample is flagged as cancer.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP055444"
+combinedMeta[studyRows, "Tissue"] <- "Blood"
+combinedMeta[studyRows, "Tissue2"] <- "CLL"
+combinedMeta[studyRows, "Cellline"] <- FALSE
+combinedMeta[studyRows, "CelllineName"] <- ""
+combinedMeta[studyRows, "Cancer"] <- TRUE
+# SRP022942: cell-line samples with no tissue and no line name; not cancer.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP022942"
+combinedMeta[studyRows, c("Tissue", "Tissue2", "CelllineName")] <- ""
+combinedMeta[studyRows, "Cellline"] <- TRUE
+combinedMeta[studyRows, "Cancer"] <- FALSE
+# ERP012180: tissue and cell-line name are blanked; Cellline and Cancer are set to NA.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "ERP012180"
+combinedMeta[studyRows, c("Tissue", "Tissue2")] <- ""
+combinedMeta[studyRows, "CelllineName"] <- ""
+combinedMeta[studyRows, "Cellline"] <- NA
+combinedMeta[studyRows, "Cancer"] <- NA
+# SRP058717: HT-29 cell-line samples; no tissue, Cancer set to FALSE.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP058717"
+combinedMeta[studyRows, c("Tissue", "Tissue2")] <- ""
+combinedMeta[studyRows, "Cellline"] <- TRUE
+combinedMeta[studyRows, "CelllineName"] <- "HT-29"
+combinedMeta[studyRows, "Cancer"] <- FALSE
+# SRP012568: tissue and cell-line name are blanked; Cellline and Cancer are set to NA.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP012568"
+combinedMeta[studyRows, c("Tissue", "Tissue2")] <- ""
+combinedMeta[studyRows, "CelllineName"] <- ""
+combinedMeta[studyRows, "Cellline"] <- NA
+combinedMeta[studyRows, "Cancer"] <- NA
+# SRP065146: blood samples without a finer tissue label; not cell lines, not cancer.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP065146"
+combinedMeta[studyRows, "Tissue"] <- "Blood"
+combinedMeta[studyRows, c("Tissue2", "CelllineName")] <- ""
+combinedMeta[studyRows, "Cellline"] <- FALSE
+combinedMeta[studyRows, "Cancer"] <- FALSE
+# ERP012188: tissue and cell-line name are blanked; Cellline and Cancer are set to NA.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "ERP012188"
+combinedMeta[studyRows, c("Tissue", "Tissue2")] <- ""
+combinedMeta[studyRows, "CelllineName"] <- ""
+combinedMeta[studyRows, "Cellline"] <- NA
+combinedMeta[studyRows, "Cancer"] <- NA
+# SRP055390: blood samples; rows whose attributes mention "normal" become non-cancer B-cells, all other rows are CLL and flagged as cancer.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP055390"
+isNormal <- grepl("normal", combinedMeta$sra.sample_attributes, ignore.case = TRUE)
+combinedMeta[studyRows, "Tissue"] <- "Blood"
+combinedMeta[studyRows, "Tissue2"] <- "CLL"
+combinedMeta[studyRows & isNormal, "Tissue2"] <- "B-cells"
+combinedMeta[studyRows, "Cancer"] <- TRUE
+combinedMeta[studyRows & isNormal, "Cancer"] <- FALSE
+combinedMeta[studyRows, "Cellline"] <- FALSE
+combinedMeta[studyRows, "CelllineName"] <- ""
+
+# SRP036145: tissue and cell-line name are blanked; Cellline and Cancer are set to NA.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP036145"
+combinedMeta[studyRows, c("Tissue", "Tissue2")] <- ""
+combinedMeta[studyRows, "CelllineName"] <- ""
+combinedMeta[studyRows, "Cellline"] <- NA
+combinedMeta[studyRows, "Cancer"] <- NA
+# SRP050533: tissue and cell-line name are blanked; Cellline and Cancer are set to NA.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP050533"
+combinedMeta[studyRows, c("Tissue", "Tissue2")] <- ""
+combinedMeta[studyRows, "CelllineName"] <- ""
+combinedMeta[studyRows, "Cellline"] <- NA
+combinedMeta[studyRows, "Cancer"] <- NA
+
+
+
+
+
+# NOTE(review): this section's header read "Fix ERP016243", but the statements below select study "SRP150552" - confirm which study is intended.
+# Marks every SRP150552 row as Fetal and blanks its Tissue2 (Tissue2 for SRP150552 is assigned "Whole Blood" again further down in this script).
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP150552"),"Fetal"]= TRUE
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP150552"),"Tissue2"]= ""
+# Tabulates the Tissue / Tissue2 / Fetal combinations of study ERP109002; the result is not stored - presumably a leftover sanity check, confirm.
+table(paste0(combinedMeta$Tissue, " - ", combinedMeta$Tissue2, " - ", combinedMeta$Fetal)[combinedMeta$study == "ERP109002"])
+
+# SRP078234: fetal samples filed under Brain; Tissue2 separates hindbrain from spinal cord. Not cell lines, not cancer.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP078234"
+combinedMeta[studyRows & grepl("hindbrain", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue"] <- "Brain"
+combinedMeta[studyRows & grepl("hindbrain", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue2"] <- "Hindbrain"
+combinedMeta[studyRows & grepl("spinal cord", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue"] <- "Brain"
+combinedMeta[studyRows & grepl("spinal cord", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue2"] <- "Spinal Cord (cervical c-1)"
+combinedMeta[studyRows, c("Cellline", "Cancer")] <- FALSE
+combinedMeta[studyRows, "CelllineName"] <- ""
+combinedMeta[studyRows, "Fetal"] <- TRUE
+
+
+# SRP041044: tissue and cell-line name are blanked; Cellline and Cancer are set to NA.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP041044"
+combinedMeta[studyRows, c("Tissue", "Tissue2")] <- ""
+combinedMeta[studyRows, "CelllineName"] <- ""
+combinedMeta[studyRows, "Cellline"] <- NA
+combinedMeta[studyRows, "Cancer"] <- NA
+# SRP050260: tissue and cell-line name are blanked; Cellline and Cancer are set to NA.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP050260"
+combinedMeta[studyRows, c("Tissue", "Tissue2")] <- ""
+combinedMeta[studyRows, "CelllineName"] <- ""
+combinedMeta[studyRows, "Cellline"] <- NA
+combinedMeta[studyRows, "Cancer"] <- NA
+# SRP076099: fetal brain samples, not cell lines; Cancer is set to NA.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP076099"
+combinedMeta[studyRows, "Tissue"] <- "Brain"
+combinedMeta[studyRows, c("Tissue2", "CelllineName")] <- ""
+combinedMeta[studyRows, "Cellline"] <- FALSE
+combinedMeta[studyRows, "Cancer"] <- NA
+combinedMeta[studyRows, "Fetal"] <- TRUE
+# ERP115010: whole blood; not a cell line, not cancer.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "ERP115010"
+combinedMeta[studyRows, "Tissue"] <- "Blood"
+combinedMeta[studyRows, "Tissue2"] <- "Whole Blood"
+combinedMeta[studyRows, "CelllineName"] <- ""
+combinedMeta[studyRows, c("Cellline", "Cancer")] <- FALSE
+# SRP221482: whole blood; not a cell line, not cancer.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP221482"
+combinedMeta[studyRows, "Tissue"] <- "Blood"
+combinedMeta[studyRows, "Tissue2"] <- "Whole Blood"
+combinedMeta[studyRows, "CelllineName"] <- ""
+combinedMeta[studyRows, c("Cellline", "Cancer")] <- FALSE
+# SRP059039: whole blood; not a cell line, not cancer.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP059039"
+combinedMeta[studyRows, "Tissue"] <- "Blood"
+combinedMeta[studyRows, "Tissue2"] <- "Whole Blood"
+combinedMeta[studyRows, "CelllineName"] <- ""
+combinedMeta[studyRows, c("Cellline", "Cancer")] <- FALSE
+# SRP059172: whole blood; not a cell line, not cancer.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP059172"
+combinedMeta[studyRows, "Tissue"] <- "Blood"
+combinedMeta[studyRows, "Tissue2"] <- "Whole Blood"
+combinedMeta[studyRows, "CelllineName"] <- ""
+combinedMeta[studyRows, c("Cellline", "Cancer")] <- FALSE
+# SRP062966: whole blood; not a cell line, not cancer.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP062966"
+combinedMeta[studyRows, "Tissue"] <- "Blood"
+combinedMeta[studyRows, "Tissue2"] <- "Whole Blood"
+combinedMeta[studyRows, "CelllineName"] <- ""
+combinedMeta[studyRows, c("Cellline", "Cancer")] <- FALSE
+# SRP081605: whole blood; not a cell line, not cancer.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP081605"
+combinedMeta[studyRows, "Tissue"] <- "Blood"
+combinedMeta[studyRows, "Tissue2"] <- "Whole Blood"
+combinedMeta[studyRows, "CelllineName"] <- ""
+combinedMeta[studyRows, c("Cellline", "Cancer")] <- FALSE
+# SRP103772: whole blood; not a cell line, not cancer.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP103772"
+combinedMeta[studyRows, "Tissue"] <- "Blood"
+combinedMeta[studyRows, "Tissue2"] <- "Whole Blood"
+combinedMeta[studyRows, "CelllineName"] <- ""
+combinedMeta[studyRows, c("Cellline", "Cancer")] <- FALSE
+# SRP132939: whole blood; not a cell line, not cancer.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP132939"
+combinedMeta[studyRows, "Tissue"] <- "Blood"
+combinedMeta[studyRows, "Tissue2"] <- "Whole Blood"
+combinedMeta[studyRows, "CelllineName"] <- ""
+combinedMeta[studyRows, c("Cellline", "Cancer")] <- FALSE
+# SRP136938: whole blood; not a cell line, not cancer.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP136938"
+combinedMeta[studyRows, "Tissue"] <- "Blood"
+combinedMeta[studyRows, "Tissue2"] <- "Whole Blood"
+combinedMeta[studyRows, "CelllineName"] <- ""
+combinedMeta[studyRows, c("Cellline", "Cancer")] <- FALSE
+# SRP150552: whole blood; not a cell line, not cancer.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP150552"
+combinedMeta[studyRows, "Tissue"] <- "Blood"
+combinedMeta[studyRows, "Tissue2"] <- "Whole Blood"
+combinedMeta[studyRows, "CelllineName"] <- ""
+combinedMeta[studyRows, c("Cellline", "Cancer")] <- FALSE
+# SRP174223: whole blood; not a cell line, not cancer.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP174223"
+combinedMeta[studyRows, "Tissue"] <- "Blood"
+combinedMeta[studyRows, "Tissue2"] <- "Whole Blood"
+combinedMeta[studyRows, "CelllineName"] <- ""
+combinedMeta[studyRows, c("Cellline", "Cancer")] <- FALSE
+# SRP174638: whole blood; not a cell line, not cancer.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP174638"
+combinedMeta[studyRows, "Tissue"] <- "Blood"
+combinedMeta[studyRows, "Tissue2"] <- "Whole Blood"
+combinedMeta[studyRows, "CelllineName"] <- ""
+combinedMeta[studyRows, c("Cellline", "Cancer")] <- FALSE
+# SRP150552: whole blood; not a cell line, not cancer. NOTE(review): SRP150552 already receives these same values earlier in the script - confirm whether a different study ID was meant here.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP150552"
+combinedMeta[studyRows, "Tissue"] <- "Blood"
+combinedMeta[studyRows, "Tissue2"] <- "Whole Blood"
+combinedMeta[studyRows, "CelllineName"] <- ""
+combinedMeta[studyRows, c("Cellline", "Cancer")] <- FALSE
+
+
+
+# SRP033266: AML samples flagged as cancer and not cell lines; Tissue follows the source named in the attributes (case-sensitive match).
+samples <- grepl("Bone marrow", combinedMeta$sra.sample_attributes) & combinedMeta$study == "SRP033266"
+combinedMeta$Cancer[samples] <- TRUE
+combinedMeta$Cellline[samples] <- FALSE
+combinedMeta$Tissue2[samples] <- "AML"
+combinedMeta$Tissue[samples] <- "Bone Marrow"
+samples <- grepl("Heparinised blood", combinedMeta$sra.sample_attributes) & combinedMeta$study == "SRP033266"
+combinedMeta$Cancer[samples] <- TRUE
+combinedMeta$Cellline[samples] <- FALSE
+combinedMeta$Tissue2[samples] <- "AML"
+combinedMeta$Tissue[samples] <- "Blood"
+
+# SRP048759: same rules as SRP033266 - AML, cancer, not a cell line; Tissue is Bone Marrow or Blood depending on the attributes.
+samples <- grepl("Bone marrow", combinedMeta$sra.sample_attributes) & combinedMeta$study == "SRP048759"
+combinedMeta$Cancer[samples] <- TRUE
+combinedMeta$Cellline[samples] <- FALSE
+combinedMeta$Tissue2[samples] <- "AML"
+combinedMeta$Tissue[samples] <- "Bone Marrow"
+samples <- grepl("Heparinised blood", combinedMeta$sra.sample_attributes) & combinedMeta$study == "SRP048759"
+combinedMeta$Cancer[samples] <- TRUE
+combinedMeta$Cellline[samples] <- FALSE
+combinedMeta$Tissue2[samples] <- "AML"
+combinedMeta$Tissue[samples] <- "Blood"
+# SRP045500: blood samples; Tissue2 is the cell type named in the sample attributes. Not cell lines, not cancer.
+# The whole-blood pattern accepts "whole blood" as well as the misspelt "wholw blood", which was the only spelling matched before.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP045500"
+combinedMeta[studyRows, "Tissue"] <- "Blood"
+combinedMeta[studyRows & grepl("whol[ew] blood", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue2"] <- "Whole Blood"
+combinedMeta[studyRows & grepl("CD4", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue2"] <- "T-cells"
+combinedMeta[studyRows & grepl("CD8", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue2"] <- "T-cells"
+combinedMeta[studyRows & grepl("Neutrophils", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue2"] <- "Neutrophils"
+combinedMeta[studyRows & grepl("Monocytes", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue2"] <- "Monocytes"
+combinedMeta[studyRows & grepl("B-cells", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue2"] <- "B-cells"
+combinedMeta[studyRows & grepl("NK", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue2"] <- "NK-cells"
+combinedMeta[studyRows, c("Cellline", "Cancer")] <- FALSE
+combinedMeta[studyRows, "CelllineName"] <- ""
+
+# SRP076719: T-cell samples; rows whose attributes mention "pbmc" or "ln" are filed under Blood. Not cell lines, not cancer.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP076719"
+combinedMeta[studyRows & grepl("pbmc", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue"] <- "Blood"
+combinedMeta[studyRows & grepl("ln", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue"] <- "Blood" # elsewhere all T-cells are also filed under blood regardless of source
+combinedMeta[studyRows, "Tissue2"] <- "T-cells"
+combinedMeta[studyRows, c("Cellline", "Cancer")] <- FALSE
+combinedMeta[studyRows, "CelllineName"] <- ""
+
+# SRP051688: blood samples; Tissue2 is the cell type named in the sample attributes. Not cell lines, not cancer.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP051688"
+combinedMeta[studyRows, "Tissue"] <- "Blood"
+combinedMeta[studyRows & grepl("T cells", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue2"] <- "T-cells"
+combinedMeta[studyRows & grepl("B cells", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue2"] <- "B-cells"
+combinedMeta[studyRows & grepl("monocytes", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue2"] <- "Monocytes"
+combinedMeta[studyRows & grepl("NK cells", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue2"] <- "NK-cells"
+combinedMeta[studyRows & grepl("PBMC", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue2"] <- "PBMC"
+combinedMeta[studyRows & grepl("myeloid DC", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue2"] <- "Dendritic cells"
+combinedMeta[studyRows & grepl("neutrophils", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue2"] <- "Neutrophils"
+combinedMeta[studyRows, c("Cellline", "Cancer")] <- FALSE
+combinedMeta[studyRows, "CelllineName"] <- ""
+
+# SRP078912: blood samples; Tissue2 is T-cells or Monocytes depending on the sample attributes. Not cell lines, not cancer.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP078912"
+combinedMeta[studyRows, "Tissue"] <- "Blood"
+combinedMeta[studyRows & grepl("T cells", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue2"] <- "T-cells"
+combinedMeta[studyRows & grepl("Monocyte", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue2"] <- "Monocytes"
+combinedMeta[studyRows, c("Cellline", "Cancer")] <- FALSE
+combinedMeta[studyRows, "CelllineName"] <- ""
+
+# SRP110609: blood samples; lymphocyte rows get an empty Tissue2, monocyte rows get Monocytes. Not cell lines, not cancer.
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP110609"
+combinedMeta[studyRows, "Tissue"] <- "Blood"
+combinedMeta[studyRows & grepl("lymphocytes", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue2"] <- ""
+combinedMeta[studyRows & grepl("Monocyte", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue2"] <- "Monocytes"
+combinedMeta[studyRows, c("Cellline", "Cancer")] <- FALSE
+combinedMeta[studyRows, "CelllineName"] <- ""
+
+# SRP158943: blood samples, not cell lines; "cll" rows are CLL and cancer, "B cells" rows are B-cells and not cancer (applied in that order).
+studyRows <- !is.na(combinedMeta$study) & combinedMeta$study == "SRP158943"
+combinedMeta[studyRows, "Tissue"] <- "Blood"
+combinedMeta[studyRows & grepl("cll", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue2"] <- "CLL" # the attributes give no further classification
+combinedMeta[studyRows & grepl("B cells", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Tissue2"] <- "B-cells"
+combinedMeta[studyRows, "Cellline"] <- FALSE
+combinedMeta[studyRows, "CelllineName"] <- ""
+combinedMeta[studyRows & grepl("cll", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Cancer"] <- TRUE
+combinedMeta[studyRows & grepl("B cells", combinedMeta$sra.sample_attributes, ignore.case = TRUE), "Cancer"] <- FALSE
+#Fix ERP104864 Annotations
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP104864") & (grepl("blood", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Blood"
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP104864") & (grepl("synovium", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Synovium"
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP104864") & (grepl("blood", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "Whole Blood"
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP104864") & (grepl("synovium", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= ""
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP104864"),"Cellline"]= FALSE
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP104864"),"CelllineName"]= ""
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP104864"),"Cancer"]= FALSE
+
+#Fix intestine samples
+#ERP000546
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP000546") & (combinedMeta$Tissue=="Intestine"),"Tissue"]= "Colon"
+#ERP003613
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP003613") & (combinedMeta$Tissue=="Intestine") & (grepl("colon", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Colon"
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP003613") & (combinedMeta$Tissue=="Intestine") & (grepl("smallintestine", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Small Intestine"
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP003613") & (combinedMeta$Tissue=="Intestine") & (grepl("duodenum", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Small Intestine"
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP003613") & (combinedMeta$Tissue=="Intestine") & (grepl("duodenum", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "Duodenum"
+#ERP006650
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP006650") & (combinedMeta$Tissue=="Intestine"),"Tissue"]= "Colon"
+#SRP039090
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP039090") & (combinedMeta$Tissue=="Intestine") & (grepl("Small Intestine", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Small Intestine"
+#SRP043391
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP043391") & (combinedMeta$Tissue=="Intestine") & (grepl("colon", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Colon"
+#SRP048801
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP048801") & (combinedMeta$Tissue=="Intestine") & (grepl("ileum", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Small Intestine"
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP048801") & (combinedMeta$Tissue=="Intestine") & (grepl("ileum", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "Ileum"
+#SRP055438
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP055438") & (combinedMeta$Tissue=="Intestine"),"Tissue"]= "Colon"
+#SRP056520
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP056520") & (combinedMeta$Tissue=="Intestine"),"Tissue"]= "Colon"
+#SRP006900
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP006900") & (combinedMeta$Tissue=="Intestine"),"Tissue"]= "Colon"
+#SRP063496
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP063496") & (combinedMeta$Tissue=="Intestine"),"Tissue"]= "Colon"
+#SRP000941
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP000941") & (combinedMeta$Tissue=="Intestine") & (grepl("colon", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Colon"
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP000941") & (combinedMeta$Tissue=="Intestine") & (grepl("small intestine", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Small Intestine"
+#SRP021221: samples still labelled with the generic "Intestine" tissue are relabelled as Colon
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP021221") & (combinedMeta$Tissue=="Intestine"),"Tissue"]= "Colon"
+#SRP009386
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP009386") & (combinedMeta$Tissue=="Intestine"),"Tissue"]= "Colon"
+#exclude SRP048804 (Cell line); %in% never returns NA, so rows with a missing study are kept unchanged instead of turning into all-NA rows
+combinedMeta=combinedMeta[!(combinedMeta$study %in% "SRP048804"),]
+#exclude the remaining samples still labelled Intestine; rows with a missing Tissue are kept unchanged
+combinedMeta=combinedMeta[!(combinedMeta$Tissue %in% "Intestine"),]
+
+#Fix ERP109002 Annotations
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Heart", combinedMeta$sra.library_name, ignore.case=T)),"Tissue"]= "Heart"
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Heart", combinedMeta$sra.library_name, ignore.case=T)),"Tissue2"]= ""
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Kidney", combinedMeta$sra.library_name, ignore.case=T)),"Tissue"]= "Kidney"
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Kidney", combinedMeta$sra.library_name, ignore.case=T)),"Tissue2"]= ""
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Testis", combinedMeta$sra.library_name, ignore.case=T)),"Tissue"]= "Testis"
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Testis", combinedMeta$sra.library_name, ignore.case=T)),"Tissue2"]= ""
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Liver", combinedMeta$sra.library_name, ignore.case=T)),"Tissue"]= "Liver"
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Liver", combinedMeta$sra.library_name, ignore.case=T)),"Tissue2"]= ""
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Brain", combinedMeta$sra.library_name, ignore.case=T)),"Tissue"]= "Brain"
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Brain", combinedMeta$sra.library_name, ignore.case=T)),"Tissue2"]= ""
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Forebrain", combinedMeta$sra.experiment_attributes, ignore.case=T)),"Tissue"]= "Brain"
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Forebrain", combinedMeta$sra.experiment_attributes, ignore.case=T)),"Tissue2"]= "Forebrain"
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Hindbrain", combinedMeta$sra.experiment_attributes, ignore.case=T)),"Tissue"]= "Brain"
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Hindbrain", combinedMeta$sra.experiment_attributes, ignore.case=T)),"Tissue2"]= "Hindbrain"
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Ovary", combinedMeta$sra.library_name, ignore.case=T)),"Tissue"]= "Ovary"
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Ovary", combinedMeta$sra.library_name, ignore.case=T)),"Tissue2"]= ""
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002"),"Cellline"]= FALSE
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002"),"CelllineName"]= ""
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002"),"Cancer"]= FALSE
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002"),"Fetal"] <- FALSE
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("embryo", combinedMeta$sra.sample_attributes, ignore.case=T)),"Fetal"]= TRUE
+
+
+studySamples <- is.element(combinedMeta$study, c("SRP105369", "ERP006121", "SRP062144")) #studies annotated as AML below
+combinedMeta$Tissue <- replace(combinedMeta$Tissue, studySamples, "")
+combinedMeta$Tissue2 <- replace(combinedMeta$Tissue2, studySamples, "AML")
+combinedMeta$Cellline <- replace(combinedMeta$Cellline, studySamples, FALSE)
+combinedMeta$CelllineName <- replace(combinedMeta$CelllineName, studySamples, "")
+combinedMeta$Cancer <- replace(combinedMeta$Cancer, studySamples, TRUE)
+
+
+
+studySamples <- combinedMeta$study %in% c("SRP221351", "SRP110313", "SRP115151", "SRP133278", "SRP156583", "SRP201603")
+combinedMeta$Tissue[studySamples] <- "Blood"
+combinedMeta$Tissue2[studySamples] <- "B-cells"
+combinedMeta$Cellline[studySamples] <- FALSE
+combinedMeta$CelllineName[studySamples] <- ""
+combinedMeta$Cancer[studySamples] <- FALSE
+
+
+studySamples <- combinedMeta$study %in% c("ERP107715", "ERP111116", "SRP092158", "SRP133442", "SRP065795", "SRP119636")
+combinedMeta$Tissue[studySamples] <- ""
+combinedMeta$Tissue2[studySamples] <- ""
+combinedMeta$Cellline[studySamples] <- NA
+combinedMeta$CelllineName[studySamples] <- ""
+combinedMeta$Cancer[studySamples] <- TRUE
+
+studySamples <- combinedMeta$study %in% c("ERP109703", "SRP100686", "SRP161505")
+combinedMeta$Tissue[studySamples] <- "Blood"
+combinedMeta$Tissue2[studySamples] <- "CLL"
+combinedMeta$Cellline[studySamples] <- NA
+combinedMeta$CelllineName[studySamples] <- ""
+combinedMeta$Cancer[studySamples] <- TRUE
+
+
+
+studySamples <- combinedMeta$study == "SRP123604"
+combinedMeta$Tissue[studySamples] <- "Colon"
+combinedMeta$Tissue2[studySamples] <- ""
+combinedMeta$Cellline[studySamples] <- FALSE
+combinedMeta$CelllineName[studySamples] <- ""
+combinedMeta$Cancer[studySamples] <- TRUE
+
+
+
+studySamples <- combinedMeta$study %in% c("ERP113862", "ERP002323", "ERP114921", "SRP051368", "SRP097893", "SRP101856", "SRP151577")
+combinedMeta$Tissue[studySamples] <- "Blood"
+combinedMeta$Tissue2[studySamples] <- "Dendritic cells"
+combinedMeta$Cellline[studySamples] <- FALSE
+combinedMeta$CelllineName[studySamples] <- ""
+combinedMeta$Cancer[studySamples] <- FALSE
+
+
+
+studySamples <- combinedMeta$study %in% c("SRP056733", "SRP062278", "SRP064515", "SRP074274", "SRP076097", "SRP095287", "SRP103821", "SRP109107", "SRP110187", "SRP118741", "SRP118760", "SRP145599", "SRP190161", "SRP218274", "SRP155941")
+combinedMeta$Tissue[studySamples] <- "Blood"
+combinedMeta$Tissue2[studySamples] <- "Macrophages"
+combinedMeta$Cellline[studySamples] <- FALSE
+combinedMeta$CelllineName[studySamples] <- ""
+combinedMeta$Cancer[studySamples] <- FALSE
+
+
+studySamples <- combinedMeta$study %in% c("ERP020977", "ERP022909")
+combinedMeta$Tissue[studySamples] <- "Blood"
+combinedMeta$Tissue2[studySamples] <- "Macrophages-iPSC"
+combinedMeta$Cellline[studySamples] <- FALSE
+combinedMeta$CelllineName[studySamples] <- ""
+combinedMeta$Cancer[studySamples] <- FALSE
+
+
+
+studySamples <- combinedMeta$study %in% c("ERP014531", "SRP041826", "SRP058953", "SRP096201", "SRP113586", "SRP192825", "SRP173842", "SRP045352", "SRP055514", "SRP069333", "SRP101726")
+combinedMeta$Tissue[studySamples] <- "Blood"
+combinedMeta$Tissue2[studySamples] <- "Monocytes"
+combinedMeta$Cellline[studySamples] <- FALSE
+combinedMeta$CelllineName[studySamples] <- ""
+combinedMeta$Cancer[studySamples] <- FALSE
+
+
+studySamples <- combinedMeta$study %in% c("SRP150456")
+combinedMeta$Tissue[studySamples] <- "Nasal Lavage"
+combinedMeta$Tissue2[studySamples] <- ""
+combinedMeta$Cellline[studySamples] <- FALSE
+combinedMeta$CelllineName[studySamples] <- ""
+combinedMeta$Cancer[studySamples] <- FALSE
+
+
+
+studySamples <- combinedMeta$study %in% c("SRP102104", "SRP162654", "SRP042596", "SRP049605", "SRP074736", "SRP090282", "SRP125882", "SRP140711", "SRP162023", "SRP168421", "SRP201023", "SRP212077", "SRP140558")
+combinedMeta$Tissue[studySamples] <- "Blood"
+combinedMeta$Tissue2[studySamples] <- "PBMC"
+combinedMeta$Cellline[studySamples] <- FALSE
+combinedMeta$CelllineName[studySamples] <- ""
+combinedMeta$Cancer[studySamples] <- FALSE
+
+
+studySamples <- combinedMeta$study %in% c("SRP072980", "SRP169062", "SRP086613", "SRP092010", "ERP105662", "SRP093990", "SRP215282", "SRP032926", "SRP053186", "SRP059057", "SRP098715", "SRP101784", "SRP117629", "SRP140710", "SRP155217", "SRP158900", "SRP192607")
+combinedMeta$Tissue[studySamples] <- "Blood"
+combinedMeta$Tissue2[studySamples] <- "T-cells"
+combinedMeta$Cellline[studySamples] <- FALSE
+combinedMeta$CelllineName[studySamples] <- ""
+combinedMeta$Cancer[studySamples] <- FALSE
+
+#SRP081020 is mis-annotated in SRA as PBMC; paper and clustering both indicate whole blood
+studySamples <- combinedMeta$study %in% c("ERP114104", "SRP051848", "SRP056784", "SRP071965", "SRP077975", "SRP081020", "SRP098758", "SRP113245", "SRP126580", "SRP126582", "SRP126583", "SRP136057", "SRP144583", "SRP150872", "SRP214077", "SRP056443")
+combinedMeta$Tissue[studySamples] <- ""
+combinedMeta$Tissue2[studySamples] <- ""
+combinedMeta$Cellline[studySamples] <- NA
+combinedMeta$CelllineName[studySamples] <- ""
+combinedMeta$Cancer[studySamples] <- NA
+
+
+
+
+
+
+combinedMeta$Cellline[!is.na(combinedMeta$CelllineName)&combinedMeta$CelllineName=="iPSC"] <- TRUE
+combinedMeta$Cancer[!is.na(combinedMeta$Cellline) & combinedMeta$Cellline] <- NA
+
+
+table(paste0(combinedMeta$Tissue, " - ", combinedMeta$Tissue2, " - ", combinedMeta$Fetal)[combinedMeta$study == "ERP109002"])
+
+#Exclude spike in
+combinedMeta$exclude[combinedMeta$study == "SRP041955"] <- TRUE
+
+(x <- table(paste0(combinedMeta$Tissue, " - ", combinedMeta$Tissue2),combinedMeta$Cancer))
+write.table(x, file = "test.txt", row.names = T, col.names = NA, quote = F, sep = "\t")
+
+
+table(paste0(combinedMeta$Tissue, " - ", combinedMeta$Tissue2),combinedMeta$Cancer)
+
+table(combinedMeta$Tissue)
+table(combinedMeta$Tissue2)
+
+table(combinedMeta$Tissue, combinedMeta$Cellline)
+
+table(combinedMeta$Cancer)
+
+table(combinedMeta$Tissue, combinedMeta$Cancer)
+
+table(combinedMeta$Tissue[combinedMeta$Cohort == "TCGA"], combinedMeta$Cancer[combinedMeta$Cohort == "TCGA"])
+
+
+sum(combinedMeta$gado.TissueType %in% combinedMeta$gado.PlotClass)
+
+sum((!is.na(combinedMeta$gado.TissueType) & combinedMeta$gado.TissueType != ""))
+
+sum( )
+unique(combinedMeta$gado.TissueType)
+
+table(combinedMeta$gado.PlotClass, useNA = "a")
+
+combinedMeta$Tissue[combinedMeta$Cohort == "GSA"]
+
+
+
+#save(combinedMeta, file = "combinedMeta_2022_09_15.RData")
+
+load(file = "combinedMeta_2022_08_19.RData")
+
+pcsAndMeta <- merge(expPcs[,1:100], combinedMeta, by = 0, all.x = T)
+dim(pcsAndMeta)
+str(combinedMeta)
+
+tissueCol <- read.delim("Recount3_QC_2ndRun/SRA_Studies_Annotations_Patrick/Annotations_color2.txt", row.names = 1)
+
+sum(unique(pcsAndMeta[,"Tissue"]) %in% tissueCol$PlotClass)
+sum(unique(pcsAndMeta[,"Tissue2"]) %in% tissueCol$PlotClass)
+
+x <- unique(pcsAndMeta[,"Tissue2"])
+x[!x %in% tissueCol$PlotClass]
+
+
+defaultCol <- adjustcolor("grey", alpha.f = 0.6)
+pcsAndMeta$col <- defaultCol
+
+tissueAndCol <- pcsAndMeta[,"Tissue"] %in% tissueCol$PlotClass
+
+pcsAndMeta$col[tissueAndCol] <- adjustcolor(tissueCol$col[match(pcsAndMeta[tissueAndCol,"Tissue"], tissueCol$PlotClass)], alpha.f = 0.6)
+
+
+tissue2AndCol <- pcsAndMeta[,"Tissue2"] %in% tissueCol$PlotClass
+sum(tissue2AndCol)
+pcsAndMeta$col[tissue2AndCol] <- adjustcolor(tissueCol$col[match(pcsAndMeta[tissue2AndCol,"Tissue2"], tissueCol$PlotClass)], alpha.f = 0.6)
+
+table(pcsAndMeta[pcsAndMeta[,"PC_2"] >= 0,"Tissue2"])
+
+
+sum(!(tolower(pcsAndMeta[,"Tissue"]) %in% tolower(tissueCol$PlotClass))) #number of samples whose Tissue has no case-insensitive match in tissueCol$PlotClass (%in% never yields NA, so is.na() could only count 0)
+
+#pcsAndMeta$col <- tissueCol$col[match(tolower(pcsAndMeta[,"Tissue"]), tolower(tissueCol$PlotClass), nomatch = nrow(tissueCol))]
+
+plotOrder <- order((pcsAndMeta$col != defaultCol) + 1)
+
+rpng(width = 800, height = 800)
+#pdf(file = "test.pdf")
+plot(pcsAndMeta[plotOrder,"PC_1"], pcsAndMeta[plotOrder,"PC_2"], col = pcsAndMeta$col[plotOrder], cex = 0.3, pch = 16)
+dev.off()
+
+
+rpng(width = 800, height = 800)
+#pdf(file = "test.pdf")
+plot(pcsAndMeta[plotOrder,"PC_3"], pcsAndMeta[plotOrder,"PC_7"], col = pcsAndMeta$col[plotOrder], cex = 0.3, pch = 16)
+dev.off()
+
+
+#rpng(width = 800, height = 800)
+png("tissues.png",width = 2000, height = 2000)
+pairs(pcsAndMeta[plotOrder,paste0("PC_",1:5)], col = pcsAndMeta$col[plotOrder], cex = 0.4, upper.panel = NULL, pch = 16)
+dev.off()
+
+png("tissues2.png",width = 2000, height = 2000)
+pairs(pcsAndMeta[plotOrder,paste0("PC_",6:10)], col = pcsAndMeta$col[plotOrder], cex = 0.4, upper.panel = NULL)
+dev.off()
+
+png("tissues3.png",width = 2000, height = 2000)
+pairs(pcsAndMeta[plotOrder,paste0("PC_",11:15)], col = pcsAndMeta$col[plotOrder], cex = 0.4, upper.panel = NULL)
+dev.off()
+
+defaultCol <- adjustcolor("grey", alpha.f = 0.3)
+pcsAndMeta$colCelline <- defaultCol
+pcsAndMeta$colCelline[!is.na(pcsAndMeta[,"Cellline"]) & pcsAndMeta[,"Cellline"]] <- adjustcolor("magenta", alpha.f = 0.3)
+pcsAndMeta$colCelline[!is.na(pcsAndMeta[,"Cellline"]) & !pcsAndMeta[,"Cellline"]] <- adjustcolor("royalblue1", alpha.f = 0.3)
+pcsAndMeta$colCelline[!is.na(pcsAndMeta[,"Cancer"]) & pcsAndMeta[,"Cancer"]] <- adjustcolor("forestgreen", alpha.f = 0.3)
+plotOrder <- order((pcsAndMeta$colCelline != defaultCol) + 1)
+
+
+pcsAndMeta$cellineTissueCancer <- "Unkown"
+pcsAndMeta$cellineTissueCancer[!is.na(pcsAndMeta[,"Cellline"]) & pcsAndMeta[,"Cellline"]] <- "Cellline"
+pcsAndMeta$cellineTissueCancer[!is.na(pcsAndMeta[,"Cellline"]) & !pcsAndMeta[,"Cellline"]] <- "Tissue"
+pcsAndMeta$cellineTissueCancer[!is.na(pcsAndMeta[,"Cancer"]) & pcsAndMeta[,"Cancer"]] <- "Cancer"
+
+pcsAndMeta$cellineTissueCancer <- factor(pcsAndMeta$cellineTissueCancer, levels = c("Tissue", "Cancer", "Cellline", "Unkown"))
+
+table(pcsAndMeta$cellineTissueCancer, useNA = "always")
+
+rpng(width = 800, height = 800)
+plot(pcsAndMeta[plotOrder,"PC_1"], pcsAndMeta[plotOrder,"PC_2"], col = pcsAndMeta$colCelline[plotOrder], cex = 0.4, pch = 16)
+dev.off()
+
+rpng(width = 800, height = 800)
+plot(pcsAndMeta[plotOrder,"PC_3"], pcsAndMeta[plotOrder,"PC_75"], col = pcsAndMeta$colCelline[plotOrder], cex = 0.4, pch = 16)
+dev.off()
+
+for(i in c(1,3:100)){
+ png(paste0("cellinePlots/pc",i,".png"),width = 1000, height = 1000)
+ #rpng()
+ plot(pcsAndMeta[plotOrder,"PC_2"], pcsAndMeta[plotOrder,paste0("PC_",i)], col = pcsAndMeta$colCelline[plotOrder], cex = 1, pch = 16, xlab = "PC2", ylab = paste0("PC", i))
+ dev.off()
+}
+
+library(vioplot)
+
+for(i in 1:100){
+png(paste0("cellinePlots2/pc",i,".png"),width = 500, height = 500)
+vioplot( pcsAndMeta[,paste0("PC_",i)] ~ pcsAndMeta$cellineTissueCancer, col = c(adjustcolor("royalblue1", alpha.f = 0.3), adjustcolor("forestgreen", alpha.f = 0.3), adjustcolor("magenta", alpha.f = 0.3), defaultCol))
+dev.off()
+}
+table(paste0(combinedMeta$Tissue, " - ", combinedMeta$Tissue2))
+
+png("celllines_c.png",width = 2000, height = 2000)
+pairs(pcsAndMeta[plotOrder,paste0("PC_",1:5, "_c")], col = pcsAndMeta$colCelline[plotOrder], cex = 0.4, upper.panel = NULL, pch = 16)
+dev.off()
+
+
+png("celllines2.png",width = 2000, height = 2000)
+pairs(pcsAndMeta[plotOrder,paste0("PC_",6:10, "")], col = pcsAndMeta$colCelline[plotOrder], cex = 0.4, upper.panel = NULL, pch = 16)
+dev.off()
+
+
+defaultCol <- adjustcolor("grey", alpha.f = 0.3)
+pcsAndMeta$colCancer <- defaultCol
+pcsAndMeta$colCancer[!is.na(pcsAndMeta[,"Cancer"]) & pcsAndMeta[,"Cancer"]] <- adjustcolor("chartreuse1", alpha.f = 0.6)
+plotOrder <- order((pcsAndMeta$colCancer != defaultCol) + 1)
+
+rpng(width = 800, height = 800)
+plot(pcsAndMeta[plotOrder,"PC_1"], pcsAndMeta[plotOrder,"PC_2"], col = pcsAndMeta$colCancer[plotOrder], cex = 0.4)
+dev.off()
+
+png("cancers.png",width = 2000, height = 2000)
+pairs(pcsAndMeta[plotOrder,paste0("PC_",1:5)], col = pcsAndMeta$colCancer[plotOrder], cex = 0.4, upper.panel = NULL, pch = 16)
+dev.off()
+
+library(pROC)
+cancerAuc <- apply(pcsAndMeta[,paste0("PC_",1:100)], 2, function(x){as.numeric(auc(response = pcsAndMeta$Cancer, predictor = x))})
+sort(cancerAuc)
+
+rpng(width = 800, height = 800)
+plot(pcsAndMeta[plotOrder,"PC_33"], pcsAndMeta[plotOrder,"PC_9"], col = pcsAndMeta$col[plotOrder], cex = 0.4)
+dev.off()
+
+
+library(pROC)
+cancerAuc <- apply(pcsAndMeta[,paste0("PC_",1:100)], 2, function(x){as.numeric(auc(response = pcsAndMeta$Cancer, predictor = x))})
+sort(cancerAuc)
+
+celllineAuc <- apply(pcsAndMeta[,paste0("PC_",1:100)], 2, function(x){as.numeric(auc(response = pcsAndMeta$Cellline, predictor = x))})
+sort(celllineAuc)
+
+
+rpng(width = 800, height = 800)
+plot(pcsAndMeta[plotOrder,"PC_33"], pcsAndMeta[plotOrder,"PC_9"], col = pcsAndMeta$colCancer[plotOrder], cex = 0.4)
+dev.off()
+
+rpng()
+pairs(pcsAndMeta[plotOrder,paste0("PC_",c(33,32,9,10,21))], col = pcsAndMeta$colCancer[plotOrder], cex = 0.4, upper.panel = NULL)
+dev.off()
+
+
+rpng()
+pairs(pcsAndMeta[plotOrder,paste0("PC_",c(3,2,27,6,20))], col = pcsAndMeta$colCelline[plotOrder], cex = 0.4, upper.panel = NULL)
+dev.off()
+
+combinedMeta$sra.sample_title <- gsub("\"", "", combinedMeta$sra.sample_title)
+
+tmp <- merge(combinedMeta,pcs[,1:100], by = 0, all.y = T)
+dim(tmp)
+write.table(tmp, file = "tmpAnnotations.txt", sep = "\t", quote = FALSE, col.names = NA)
+
+qseq <- read.delim("quantseqSamples.txt")[,1]
+str(qseq)
+
+
+defaultCol <- adjustcolor("grey", alpha.f = 0.3)
+pcsAndMeta$colQseq <- defaultCol
+pcsAndMeta$colQseq[pcsAndMeta$Row.names %in% qseq] <- "orangered"
+plotOrderQseq <- order((pcsAndMeta$colQseq != defaultCol) + 1)
+
+plot(pcsAndMeta[plotOrderQseq,"PC_1"], pcsAndMeta[plotOrderQseq,"PC_2"], col = pcsAndMeta$colQseq[plotOrderQseq], cex = 0.4, main = "Quantseq")
+
+plot(pcsAndMeta[plotOrderQseq,"PC_6"], pcsAndMeta[plotOrderQseq,"PC_1"], col = pcsAndMeta$colQseq[plotOrderQseq], cex = 0.4, main = "Quantseq")
+
+
+table(pcsAndMeta$sra.library_layout)
+
+
+numColumns <- unlist(lapply(combinedMeta, is.numeric))
+
+
+combinedMetaMatrix <- as.matrix(combinedMeta[,numColumns])
+
+library(pROC)
+
+qseqClass <- as.factor(rownames(combinedMeta) %in% qseq)
+table(qseqClass)
+dim(combinedMetaMatrix)
+qseqPValues <- apply(combinedMetaMatrix,2,function(x){ #one value per column of combinedMetaMatrix; despite the name these are AUCs (qseqClass as response), not p-values
+ tryCatch(
+ {
+ #wilcox.test(x ~ qseqClass)$p.value
+ as.numeric(auc(response = qseqClass, predictor = x))
+ },
+ error=function(cond){return(1)} #NOTE(review): fallback 1 looks like a leftover from the p-value version above; as an AUC it reads as a perfect score - confirm
+ )
+})
+sort(qseqPValues)
+
+
+boxplot(combinedMetaMatrix[,"recount_qc.bc_frag.kallisto_mean_length"] ~ qseqClass )
+
+
+
+
+
+
+
+
+plot(pcsAndMeta[plotOrderQseq,"recount_seq_qc.%c"], log10(pcsAndMeta[plotOrderQseq,"recount_qc.star.number_of_splices:_total"]), col = pcsAndMeta$colQseq[plotOrderQseq], cex = 0.6)
+10^5.2
+abline(h=log10(150000))
+abline(v=60)
+log10(10^5)
+
+plot(log10(pcsAndMeta[plotOrderQseq,"recount_qc.star.number_of_splices:_total"]), pcsAndMeta[plotOrderQseq,"recount_qc.bc_frag.kallisto_mean_length"], col = pcsAndMeta$colQseq[plotOrderQseq], cex = 0.4)
+
+plot(pcsAndMeta[plotOrderQseq,"PC_6"], log10(pcsAndMeta[plotOrderQseq,"recount_qc.star.number_of_splices:_total"]), col = pcsAndMeta$colQseq[plotOrderQseq], cex = 0.4)
+
+
+pc1Cor <- cor(pcsAndMeta[,"PC_1"], pcsAndMeta[,numColumns], use = "pairwise.complete.obs")
+sort(pc1Cor[1,])
+
+pc6Cor <- apply(combinedMetaMatrix[pcsAndMeta$Row.names,],2,function(x){
+ tryCatch(
+ {
+ #wilcox.test(x ~ qseqClass)$p.value
+ cor(pcsAndMeta[,"PC_6"], x, use = "pairwise.complete.obs")
+ },
+ error=function(cond){return(0)},
+ warning=function(cond){return(0)}
+ )
+})
+sort(pc6Cor^2)
+
+load("testPcPAtrickFrist100.RData", verbose = T)
+colnames(expPcs)
+str(expPcs)
+colnames(expPcs) <- paste0("PC_", 1:ncol(expPcs))
+pcsAndMeta <- merge(expPcs, combinedMeta, by = 0, all.x = T)
+dim(pcsAndMeta)
+
+
+
+load("gadoPca.RData", verbose = T)
+colnames(expGadoPcsSub)
+str(expGadoPcsSub)
+colnames(expGadoPcsSub) <- paste0("PC_", 1:ncol(expGadoPcsSub))
+pcsAndMeta <- merge(expGadoPcsSub, combinedMeta, by = 0, all.x = T)
+dim(pcsAndMeta)
+
+
+
+table(pcsAndMeta$CelllineName)
+pcsAndMeta$Cellline[grepl("s4u", pcsAndMeta$CelllineName)]
+
+
+
+
+
+
diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/normalizeRecount3PerTissueAndDoPca.R b/Downstreamer/src/main/r/downstreamer_main/recount3/normalizeRecount3PerTissueAndDoPca.R
new file mode 100644
index 000000000..ee8f5db99
--- /dev/null
+++ b/Downstreamer/src/main/r/downstreamer_main/recount3/normalizeRecount3PerTissueAndDoPca.R
@@ -0,0 +1,165 @@
+#srun --cpus-per-task=20 --mem=200gb --nodes=1 --qos=priority --time=168:00:00 --pty bash -i
+#remoter::server(verbose = T, port = 55556, sync = T)
+
+
+
+
+remoter::client("localhost", port = 55504)
+
+library(DESeq2)
+library(parallel)
+library(viridisLite, lib.loc = .libPaths()[2])
+
+setwd("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/")
+setwd("D:\\UMCG\\Genetica\\Projects\\Depict2Pgs\\Recount3\\")
+
+load(file = "perTissueNormalization/selectedSamplesRawExpression.RData", verbose = T)
+load("tissuePredictions/samplesWithPrediction_16_09_22.RData", verbose = T)
+
+sort(table(samplesWithPrediction$predictedTissue))
+tissueClasses <- unique(samplesWithPrediction$predictedTissue)
+
+
+#tissueClasses <- tissueClasses[1:29]
+#tissueClasses <- tissueClasses[30:57]
+
+#tissueClasses <- tissueClasses[c(1,2,6,14,55)]
+
+#limit expression to max int
+selectedSamplesExp[selectedSamplesExp > .Machine$integer.max] <- .Machine$integer.max
+
+
+mclapply(tissueClasses, mc.cores = 10, function(tissue){
+  #Per predicted tissue: subset the expression matrix, drop genes that are zero in more than half of the samples, save with integer storage mode
+  #which() discards NA comparisons, so samples without a predicted tissue never end up in the column index
+  tissueSamples <- rownames(samplesWithPrediction)[which(samplesWithPrediction$predictedTissue == tissue)]
+  #drop = FALSE keeps a matrix even when a tissue has only a single sample
+  tissueExp <- selectedSamplesExp[,tissueSamples, drop = FALSE]
+  numberOfSamples <- length(tissueSamples)
+
+  #vectorized form of the per-row apply(): fraction of zero values per gene must not exceed 0.5
+  includedGenes <- (rowSums(tissueExp == 0) / numberOfSamples) <= 0.5
+  tissueExp <- tissueExp[includedGenes, , drop = FALSE]
+
+  mode(tissueExp) <- "integer"
+  save(tissueExp, file = paste0("perTissueNormalization/raw/",make.names(tissue),".RData"))
+})
+
+
+tissueClasses <- unique(samplesWithPrediction$predictedTissue)
+
+#Run 1
+#tissueClasses <- tissueClasses[1:5]
+#Run 2
+run2Tisses <- c("Whole Blood", "T-Cells", "fibroblasts_cell-lines_smooth-muscle-cell_mesenchymal-stem-cells", "PBMC")
+tissueClasses <- run2Tisses
+#Run3
+run3Tisses <- c("derived-neural-progenitor_derived-neurons", "Macrophages", "Liver", "Macrophages-iPSC")
+tissueClasses <- run3Tisses
+
+#Run4 colorectal and prostate
+
+perTissueExp <- mclapply(tissueClasses, mc.cores = 4, function(tissue){
+ #For each tissue: load the matching file from perTissueNormalization/raw/ (presumably providing tissueExp), apply rlog() and save the result under rlogExp/
+ load(file = paste0("perTissueNormalization/raw/",make.names(tissue),".RData"))
+ rlogExp <- rlog(tissueExp)
+ save(rlogExp, file = paste0("perTissueNormalization/rlogExp/",make.names(tissue),".RData"))
+ return(NULL) #results are only written to disk, so perTissueExp ends up as a list of NULLs
+ 
+})
+
+#names(perTissueExp) <- tissueClasses
+#save(perTissueExp, file = "perTissueNormalization/selectedSamplesRawExpressionPerTissue.RData")
+tissue = "Kidney"
+
+load(file = paste0("perTissueNormalization/rlogExp/",make.names(tissue),".RData"))
+
+perTissuePca <- lapply(perTissueExp, function(exp){
+ #PCA through svd() of a row-centered, row-scaled matrix; returns an unnamed list: eigenVectors, eigenValues, expPcs
+ #https://stackoverflow.com/questions/18964837/fast-correlation-in-r-using-c-and-parallelization/18965892#18965892
+ expScale = rlogExp - rowMeans(rlogExp); #NOTE(review): reads rlogExp from the enclosing environment; the exp argument is never referenced, so every list element gets the same result - confirm
+ # Scale each centered row to unit Euclidean length
+ expScale = expScale / sqrt(rowSums(expScale^2));
+ #expCov = tcrossprod(expScale);#equivalent to correlation due to center scale
+ #expEigen <- eigen(expCov)
+ #eigenVectors <- expEigen$vectors
+ #colnames(eigenVectors) <- paste0("PC_",1:ncol(eigenVectors))
+ #rownames(eigenVectors) <- rownames(expScale)
+ 
+ #eigenValues <- expEigen$values
+ #names(eigenValues) <- paste0("PC_",1:length(eigenValues))
+ 
+ #Here calculate sample principal components. Number needed is arbitrary (no more than eigen vectors)
+ #expPcs <- t(expScale) %*% expEigen$vectors[,1:10]
+ #colnames(expPcs) <- paste0("PC_",1:ncol(expPcs))
+ 
+ expSvd <- svd(expScale, nu = 1000, nv = 1000) #requests 1000 left and 1000 right singular vectors
+ 
+ eigenValues <- expSvd$d^2 #squared singular values
+ eigenVectors <- expSvd$u
+ colnames(eigenVectors) <- paste0("PC_",1:ncol(eigenVectors))
+ rownames(eigenVectors) <- rownames(expScale)
+ 
+ expPcs <- expSvd$v[,1:25] %*% diag(expSvd$d[1:25]) #first 25 right singular vectors, each scaled by its singular value; needs at least 25 components
+ colnames(expPcs) <- paste0("PC_",1:ncol(expPcs))
+ rownames(expPcs) <- colnames(expScale)
+ 
+ return(list(eigenVectors, eigenValues, expPcs))
+ 
+})
+
+save(perTissueExp, perTissuePca, file = "perTissueNormalization/tmpTestSession.RData")
+#load(file = "perTissueNormalization/tmpTestRlog.RData")
+
+save(expPcs, samplesWithPrediction, file = "perTissueNormalization/tmpTest2.RData")
+load("perTissueNormalization/tmpTest2.RData")
+
+
+samplesWithPrediction$sra.library_layout[samplesWithPrediction$study=="TCGA"] <- "paired"
+samplesWithPrediction$sra.library_layout[samplesWithPrediction$study=="GTEx"] <- "paired"
+
+
+tissueSamplesInfo <- samplesWithPrediction[rownames(expPcs),]
+str(tissueSamplesInfo)
+
+#Put TCGA and GTEx to paired end
+
+studies <- length(unique(tissueSamplesInfo$study))
+
+
+
+palette(adjustcolor(viridis(studies, option = "H"), alpha.f = 0.5))
+
+pchMap <- rep(c(15,16,17), length.out = studies)
+
+rpng(width = 1000, height = 1000)
+plot(expPcs[,1],expPcs[,2], col = as.factor(tissueSamplesInfo$study), pch = pchMap[as.factor(tissueSamplesInfo$study)], cex = 1)
+pairs(expPcs[,1:5], col = as.factor(tissueSamplesInfo$study), pch = pchMap[as.factor(tissueSamplesInfo$study)], cex = 1, upper.panel = NULL)
+dev.off()
+View(tissueSamplesInfo)
+
+palette(adjustcolor(c("dodgerblue1", "maroon2"), alpha.f = 0.5))
+plot(expPcs[,1],expPcs[,2], col = as.factor(tissueSamplesInfo$sra.library_layout), pch = pchMap[as.factor(tissueSamplesInfo$study)], cex = 1)
+
+
+
+
+breakPoints <- seq(0.5,1,by = 0.05)
+breakCols <- (adjustcolor(viridis(length(breakPoints), option = "inferno"), alpha.f = 0.5))
+
+
+plot(expPcs[,1],expPcs[,2], col = breakCols[as.numeric(cut(tissueSamplesInfo$predictedTissueScore, breaks = breakPoints ))], pch = 16, cex = 1)
+legend("bottomright",title="PredictionScore",legend=seq(0.5,1,by = 0.05),col = breakCols,pch=16)
+
+
+plot(expPcs[,1],expPcs[,5], col = breakCols[as.numeric(cut(tissueSamplesInfo$predictedTissueScore, breaks = breakPoints ))], pch = 16, cex = 1)
+
+legend("topleft",title="PredictionScore",legend=seq(0.5,1,by = 0.05),col = breakCols,pch=16)
+
+pairs(expPcs[,1:10], col = breakCols[as.numeric(cut(tissueSamplesInfo$predictedTissueScore, breaks = breakPoints ))], cex = 1, upper.panel = NULL, pch = 16)
+
+
+
+ sum(expPcs[,2]>10)
+x <- cbind(expPcs, tissueSamplesInfo)
+View(x)
diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/perTissueQqnormAndPcaForQc.R b/Downstreamer/src/main/r/downstreamer_main/recount3/perTissueQqnormAndPcaForQc.R
new file mode 100644
index 000000000..e4c35f6c2
--- /dev/null
+++ b/Downstreamer/src/main/r/downstreamer_main/recount3/perTissueQqnormAndPcaForQc.R
@@ -0,0 +1,520 @@
+#srun --cpus-per-task=20 --mem=100gb --nodes=1 --qos=priority --time=168:00:00 --pty bash -i
+#remoter::server(verbose = T, port = 55556, sync = T)
+
+#Connect this session to a remoter server on localhost.
+#NOTE(review): the server example above uses port 55556 while the client call
+#below uses port 55506 - confirm which port is intended.
+remoter::client("localhost", port = 55506)
+
+library(DESeq2)
+library(parallel)
+library(viridisLite, lib.loc = .libPaths()[2])
+library(preprocessCore)
+
+#Two working directories: a Linux path followed by a Windows path.
+#NOTE(review): presumably only the call matching the current machine is meant to
+#be run; when both succeed the second one is the directory left in effect.
+setwd("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/")
+setwd("D:\\UMCG\\Genetica\\Projects\\Depict2Pgs\\Recount3\\")
+
+load(file = "perTissueNormalization/selectedSamplesRawExpression.RData", verbose = T)
+load("tissuePredictions/samplesWithPrediction_16_09_22.RData", verbose = T)
+#Set the library layout of every TCGA and GTEx sample to "paired"
+samplesWithPrediction$sra.library_layout[samplesWithPrediction$study=="TCGA"] <- "paired"
+samplesWithPrediction$sra.library_layout[samplesWithPrediction$study=="GTEx"] <- "paired"
+
+table(samplesWithPrediction$predictedTissue)
+
+#The same counts sorted ascending, followed by the distinct predicted tissues used in the loops of this script
+sort(table(samplesWithPrediction$predictedTissue))
+tissueClasses <- unique(samplesWithPrediction$predictedTissue)
+
+#not used currently, we now use the expression data used for the primary QC and sample predictions.
+#Per predicted tissue: keep the genes that are zero in at most half of the samples, log2(x + 1) transform,
+#quantile normalize and save the matrix as perTissueNormalization/perTissueQq/<tissue>.RData
+mclapply(tissueClasses, mc.cores = 10, function(tissue){
+ isTissueSample <- samplesWithPrediction$predictedTissue == tissue
+ tissueSamples <- rownames(samplesWithPrediction)[isTissueSample]
+ tissueExp <- selectedSamplesExp[,tissueSamples]
+
+ #Number of zero values a gene may have at most: half the number of samples
+ maxZeros <- 0.5 * length(tissueSamples)
+ includedGenes <- rowSums(tissueExp == 0) <= maxZeros
+ tissueExp <- log2(tissueExp[includedGenes,] + 1)
+
+ #The result is not assigned: the call relies on copy=FALSE changing tissueExp itself
+ normalize.quantiles(tissueExp,copy=FALSE)
+
+ save(tissueExp, file = paste0("perTissueNormalization/perTissueQq/",make.names(tissue),".RData"))
+})
+
+load(file = "/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Recount3_QC_2ndRun/Filtered_Matrices/TPM_log2_QNorm_QCed_CovCorrected_AllCovariates.RData", verbose = TRUE)
+
+#Per predicted tissue: PCA (through an SVD) of the gene-wise centered and scaled expression of its samples.
+#exp: expression matrix with genes in rows and samples in columns, column names matching rownames(samplesWithPrediction).
+#Saves pcaRes (eigenVectors, eigenValues, expPcs, explainedVariance) to perTissueNormalization/perTissueQqPca/<tissue>.RData, returns NULL.
+#mclapply() returns a "try-error" element instead of stopping when a tissue fails, so check the printed result.
+mclapply(tissueClasses, mc.cores = 10, function(tissue, exp){
+ #load(file = paste0("perTissueNormalization/perTissueQq/",make.names(tissue),".RData"))
+
+ tissueSamples <- rownames(samplesWithPrediction)[samplesWithPrediction$predictedTissue == tissue]
+ #drop = FALSE: the selection stays a matrix whatever the number of samples
+ tissueExp <- exp[,tissueSamples, drop = FALSE]
+
+ #https://stackoverflow.com/questions/18964837/fast-correlation-in-r-using-c-and-parallelization/18965892#18965892
+ #Center each gene
+ expScale <- tissueExp - rowMeans(tissueExp)
+ #Scale each gene to unit length
+ expScale <- expScale / sqrt(rowSums(expScale^2))
+ #expCov = tcrossprod(expScale);#equivalent to correlation due to center scale
+ #expEigen <- eigen(expCov)
+ #eigenVectors <- expEigen$vectors
+ #colnames(eigenVectors) <- paste0("PC_",1:ncol(eigenVectors))
+ #rownames(eigenVectors) <- rownames(expScale)
+
+ #eigenValues <- expEigen$values
+ #names(eigenValues) <- paste0("PC_",1:length(eigenValues))
+
+ #Here calculate sample principal components. Number needed is arbitrary (no more than eigen vectors)
+ #expPcs <- t(expScale) %*% expEigen$vectors[,1:10]
+ #colnames(expPcs) <- paste0("PC_",1:ncol(expPcs))
+
+ #Keep 50 components, or fewer when the tissue has fewer samples (or genes) than that
+ nComp <- min(50, dim(expScale))
+ expSvd <- svd(expScale, nu = nComp, nv = nComp)
+
+ eigenValues <- expSvd$d^2
+ eigenVectors <- expSvd$u
+ colnames(eigenVectors) <- paste0("PC_",seq_len(ncol(eigenVectors)))
+ rownames(eigenVectors) <- rownames(expScale)
+
+ #Sample scores: right singular vectors multiplied by their singular values.
+ #sweep() rather than %*% diag(): diag() of a single number builds an identity matrix instead
+ expPcs <- sweep(expSvd$v, 2, expSvd$d[seq_len(nComp)], "*")
+ colnames(expPcs) <- paste0("PC_",seq_len(ncol(expPcs)))
+ rownames(expPcs) <- colnames(expScale)
+
+ #Every gene has unit length, so the squared singular values sum to the number of genes
+ explainedVariance <- eigenValues * 100 / nrow(expScale)
+
+ pcaRes <- list(eigenVectors = eigenVectors, eigenValues = eigenValues, expPcs = expPcs, explainedVariance = explainedVariance)
+ save(pcaRes, file = paste0("perTissueNormalization/perTissueQqPca/",make.names(tissue),".RData"))
+ return(NULL)
+}, exp = exp)
+
+#Manual tissue selection, presumably for stepping through the function bodies of this script by hand (TODO confirm)
+tissue <- "Kidney"
+tissue <- "Brain-Nucleus accumbens (basal ganglia)"
+
+#Show the samples of study ERP009290 (the outlier step further down marks this study as mixed tissue samples)
+samplesWithPrediction[samplesWithPrediction$study=="ERP009290",]
+
+nonOutlierSampleList <- lapply(tissueClasses, function(tissue, samplesWithPrediction){
+ #Per tissue QC: flags outlier samples on the first 10 components, writes a table and three PNG files to perTissueNormalization/qcPlots/ and returns the annotation rows of the samples that are kept.
+ load(file = paste0("perTissueNormalization/perTissueQqPca/",make.names(tissue),".RData"))
+ #pcaRes comes from the file loaded above; only its first 10 sample components are used here
+ expPcs <- pcaRes$expPcs[,1:10]
+ explainedVariance <- pcaRes$explainedVariance
+ tissueSamplesInfo <- samplesWithPrediction[rownames(expPcs),]
+ studies <- length(unique(tissueSamplesInfo$study))
+ #NOTE(review): the same file is written again further down, with the outlier column and the row names added, which replaces this first version
+ #are in the same order
+ write.table(cbind(tissueSamplesInfo, expPcs), col.names = T, row.names = F, sep = "\t", quote = F, file = paste0("perTissueNormalization/qcPlots/",make.names(tissue),".txt"))
+ #Tissue label for titles and legends: the first 17 characters plus "..." when the name is longer than 20 characters
+ shortTissue <- ifelse(nchar(tissue) > 20, paste0(substr(tissue,0,17),"..."), tissue)
+ #Colour scale for the prediction score: 11 break points from 0.5 to 1 and one semi-transparent colour per break point
+ breakPoints <- seq(0.5,1,by = 0.05)
+ breakCols <- (adjustcolor(viridis(length(breakPoints), option = "inferno"), alpha.f = 0.5))
+ #Outlier rule per component: absolute score above mean + sdThreshold * sd
+ expPcsMeans <- apply(expPcs, 2, mean)
+ expPcsSds <- apply(expPcs, 2, sd)
+
+ #Larger threshold is needed for wholeblood
+ sdThreshold <- ifelse(tissue == "Whole Blood", 4,3)
+
+ threshold <- expPcsMeans + sdThreshold * expPcsSds
+ #Per component: TRUE for the samples whose absolute score exceeds that component's threshold
+ outlierPerComp <- sapply(1:10, function(i){
+
+ abs(expPcs[,i]) > threshold[i]
+ })
+ tissueSamplesInfo$outlier <- apply(outlierPerComp, 1, any)
+ sum(tissueSamplesInfo$outlier)
+ #A sample is an outlier when any component exceeds its threshold. NOTE(review): the value of the sum() call above is discarded inside this function
+ ### Do some manual corrections
+ if(tissue == "Airway basal cells"){
+ tissueSamplesInfo$outlier[expPcs[,1] < 2] <- TRUE #Checked annotation, these are wrongly predicted
+ }
+ tissueSamplesInfo$outlier[tissueSamplesInfo$study == "ERP009290"] <- TRUE #Mixed tissue samples
+ #The ERP009290 rule above is outside the if() blocks, so it is applied for every tissue
+ if(tissue == "Brain-Hindbrain-Fetal"){
+ tissueSamplesInfo$outlier[expPcs[,1] < -10] <- TRUE #Checked annotation, these are wrongly predicted
+ tissueSamplesInfo$outlier[expPcs[,3] > 3] <- TRUE #Checked annotation, these are wrongly predicted
+ }
+ if(tissue == "Brain-Nucleus accumbens (basal ganglia)"){
+ tissueSamplesInfo$outlier[expPcs[,1] < -5] <- TRUE #Checked annotation, these are wrongly predicted
+ tissueSamplesInfo$outlier[expPcs[,3] < -10] <- TRUE #Checked annotation, these are wrongly predicted
+ }
+ if(tissue == "Kidney"){
+ tissueSamplesInfo$outlier[expPcs[,3] < -6] <- TRUE #Checked annotation, these are wrongly predicted
+ }
+ if(tissue == "Macrophages-iPSC"){
+ tissueSamplesInfo$outlier[tissueSamplesInfo$study == "ERP020977"] <- FALSE #not real outlier, strange distribution due to stimulations.
+ }
+ if(tissue == "Monocytes"){
+ tissueSamplesInfo$outlier[expPcs[,2] > 3] <- TRUE #Checked annotation, these are wrongly predicted
+ }
+ if(tissue == "Nasal Lavage"){
+ tissueSamplesInfo$outlier[expPcs[,1] < -5] <- TRUE #Checked annotation, these are wrongly predicted
+ }
+ if(tissue == "Vagina"){
+ tissueSamplesInfo$outlier[expPcs[,1] > 10] <- TRUE #Checked annotation, these are wrongly predicted
+ tissueSamplesInfo$outlier[expPcs[,2] < -8] <- TRUE #Checked annotation, these are wrongly predicted
+ }
+
+
+ #Label the components "Comp i (explained variance%)" and write the table including the outlier column
+ colnames(expPcs) <- paste0("Comp ",1:10, " (", round(explainedVariance[1:10],2) ,"%)")
+ write.table(cbind(tissueSamplesInfo, expPcs), col.names = NA, row.names = T, sep = "\t", quote = F, file = paste0("perTissueNormalization/qcPlots/",make.names(tissue),".txt"))
+
+ #Figure "<tissue>1.png": a title row, five plots and a legend panel
+ png(file = paste0("perTissueNormalization/qcPlots/",make.names(tissue),"1.png"), width = 1200, height = 900)
+ #rpng(width = 1000, height = 1000)
+ layout(matrix(c(1,1,1,1,2,3,4,8,5,6,7,8),ncol = 4, byrow = T), heights = c(0.1,1,1), widths = c(1,1,1,0.1))
+ par(mar = c(0,0,0,0), xpd = NA, cex = 1.2)
+ plot.new()
+ plot.window(xlim = 0:1, ylim = 0:1)
+ text(0.5,0.5, paste0(tissue, " (", nrow(tissueSamplesInfo) ,")"), cex = 2, font = 2)
+
+ par(mar = c(4,4,3,0.5), xpd = NA)
+ #Plot: colour and symbol per study
+ palette(adjustcolor(viridis(studies, option = "H"), alpha.f = 0.5))
+ pchMap <- rep(c(15,16,17), length.out = studies)
+ plot(expPcs[,1],expPcs[,2], col = as.factor(tissueSamplesInfo$study), pch = pchMap[as.factor(tissueSamplesInfo$study)], cex = 1, main = paste0("Studies (", studies,")"), xlab = paste0("Comp 1 (", round(explainedVariance[1],2) ,"%)"), ylab = paste0("Comp 2 (", round(explainedVariance[2],2) ,"%)"), bty = "n")
+
+ #Plot: annotated tissue missing ("Unkown"), different from this tissue ("Other") or equal to it ("Current")
+ palette(adjustcolor(c("lemonchiffon3", "darkorange1", "springgreen2"), alpha.f = 0.5))
+ annotated <- factor(rep("Unkown", nrow(tissueSamplesInfo)), levels = c("Unkown", "Other", "Current"))
+ annotated[!is.na(tissueSamplesInfo$annotatedTissue) & tissueSamplesInfo$annotatedTissue != tissue] <- "Other"
+ annotated[!is.na(tissueSamplesInfo$annotatedTissue) & tissueSamplesInfo$annotatedTissue == tissue] <- "Current"
+ plot(expPcs[,1],expPcs[,2], col = annotated, pch = pchMap[as.factor(tissueSamplesInfo$study)], cex = 1, main = paste0("Annotated as " , shortTissue), xlab = paste0("Comp 1 (", round(explainedVariance[1],2) ,"%)"), ylab = paste0("Comp 2 (", round(explainedVariance[2],2) ,"%)"), bty = "n")
+
+ #Plot: library layout, with factor levels "paired" and "single"
+ palette(adjustcolor(c("dodgerblue1", "maroon2"), alpha.f = 0.5))
+ plot(expPcs[,1],expPcs[,2], col = factor(tissueSamplesInfo$sra.library_layout, levels = c("paired", "single")), pch = pchMap[as.factor(tissueSamplesInfo$study)], cex = 1, main = "Sequencing layout", xlab = paste0("Comp 1 (", round(explainedVariance[1],2) ,"%)"), ylab = paste0("Comp 2 (", round(explainedVariance[2],2) ,"%)"), bty = "n")
+ #Plot: prediction score on the breakCols scale
+ plot(expPcs[,1],expPcs[,2], col = breakCols[as.numeric(cut(tissueSamplesInfo$predictedTissueScore, breaks = breakPoints ))], pch = 16, cex = 1, main = "Prediction posterior probability", xlab = paste0("Comp 1 (", round(explainedVariance[1],2) ,"%)"), ylab = paste0("Comp 2 (", round(explainedVariance[2],2) ,"%)"), bty = "n")
+ #Plot: cumulative explained variance of the first 10 components
+ plot(cumsum(explainedVariance)[1:10], bty = "n", pch = 16, xlab = "Components", ylab = "Cumulative explained variance (%)", main = "Explained variance", ylim = c(0,100), xlim = c(0,10))
+
+
+
+
+ #Legend panel
+ par(mar = c(0,2,3,1), xpd = NA)
+ plot.new()
+ plot.window(xlim = 0:1, ylim = 0:1)
+ legend("topleft",title="Annotation",legend=c("Unkown", "Other", shortTissue), col = c("lemonchiffon3", "darkorange1", "springgreen2") , pch = 16, bty = "n")
+ legend("top",title="Layout",legend=c("Single", "Paired"), col = c("maroon2", "dodgerblue1") , pch = 16, bty = "n")
+ legend("topright",title="Probability",legend=seq(0.5,1,by = 0.05),col = breakCols,pch=16, bty = "n")
+
+
+ dev.off()
+
+
+ #The pairs plots below are disabled
+ #png(file = paste0("perTissueNormalization/qcPlots/",make.names(tissue),"2.png"), width = 2000, height = 2000)
+ #pairs(expPcs[,1:10], col = breakCols[as.numeric(cut(tissueSamplesInfo$predictedTissueScore, breaks = breakPoints ))], cex = 2, upper.panel = NULL, pch = 16)
+ #dev.off()
+
+ #png(file = paste0("perTissueNormalization/qcPlots/",make.names(tissue),"3.png"), width = 2000, height = 2000)
+ #palette(adjustcolor(viridis(studies, option = "H"), alpha.f = 0.5))
+ #pchMap <- rep(c(15,16,17), length.out = studies)
+ #pairs(expPcs[,1:10], col = as.factor(tissueSamplesInfo$study), pch = pchMap[as.factor(tissueSamplesInfo$study)], cex = 2, upper.panel = NULL)
+ #dev.off()
+
+ #png(file = paste0("perTissueNormalization/qcPlots/",make.names(tissue),"3.png"), width = 2000, height = 2000)
+ #palette(adjustcolor(c("grey", "firebrick3"), alpha.f = 0.5))
+ #pairs(expPcs[,1:10], col = tissueSamplesInfo$outlier + 1, pch = 16, cex = 2, upper.panel = NULL)
+ #dev.off()
+ #Figure "<tissue>4.png": component 1 against components 2 to 10, outliers coloured firebrick3, lines at plus and minus the threshold
+ png(file = paste0("perTissueNormalization/qcPlots/",make.names(tissue),"4.png"), width = 1500, height = 700)
+ #rpng(width = 1000, height = 1000)
+ palette(adjustcolor(c("grey", "firebrick3"), alpha.f = 0.5))
+ layout(matrix(c(1,1,1,1,1,2:11),ncol = 5, byrow = T), heights = c(0.1,1,1))
+ par(mar = c(0,0,0,0), xpd = NA, cex = 1.2)
+ plot.new()
+ plot.window(xlim = 0:1, ylim = 0:1)
+ text(0.5,0.5, paste0(tissue, " (", nrow(tissueSamplesInfo) ,")"), cex = 2, font = 2)
+
+ par(mar = c(4,4,3,0.5), xpd = NA)
+
+ for(i in 2:10){
+ plot(expPcs[,1],expPcs[,i], col = tissueSamplesInfo$outlier + 1, pch = 16, cex = 1, main = paste0("Comp ", i), xlab = paste0("Comp 1 (", round(explainedVariance[1],2) ,"%)"), ylab = paste0("Comp ", i," (", round(explainedVariance[i],2) ,"%)"), bty = "n")
+ abline(v=c(-threshold[1],threshold[1]), lwd = 2, col = "firebrick3", xpd = FALSE)
+ abline(h=c(-threshold[i],threshold[i]), lwd = 2, col = "firebrick3", xpd = FALSE)
+ }
+
+ par(mar = c(0,2,3,1), xpd = NA)
+ plot.new()
+ plot.window(xlim = 0:1, ylim = 0:1)
+ legend("top",title="Outliers",legend=c("Included", "Excluded"), col = c("grey", "firebrick3") , pch = 16, bty = "n")
+
+
+
+ dev.off()
+
+ #Figure "<tissue>5.png": the same panels coloured by prediction score
+ png(file = paste0("perTissueNormalization/qcPlots/",make.names(tissue),"5.png"), width = 1500, height = 700)
+ #rpng()
+ layout(matrix(c(1,1,1,1,1,2:11),ncol = 5, byrow = T), heights = c(0.1,1,1))
+ par(mar = c(0,0,0,0), xpd = NA, cex = 1.2)
+ plot.new()
+ plot.window(xlim = 0:1, ylim = 0:1)
+ text(0.5,0.5, paste0(tissue, " (", nrow(tissueSamplesInfo) ,")"), cex = 2, font = 2)
+
+ par(mar = c(4,4,3,0.5), xpd = NA)
+
+ for(i in 2:10){
+ plot(expPcs[,1],expPcs[,i], col = breakCols[as.numeric(cut(tissueSamplesInfo$predictedTissueScore, breaks = breakPoints ))], pch = 16, cex = 1, main = paste0("Comp ", i), xlab = paste0("Comp 1 (", round(explainedVariance[1],2) ,"%)"), ylab = paste0("Comp ", i," (", round(explainedVariance[i],2) ,"%)"), bty = "n")
+ abline(v=c(-threshold[1],threshold[1]), lwd = 2, col = "firebrick3", xpd = FALSE)
+ abline(h=c(-threshold[i],threshold[i]), lwd = 2, col = "firebrick3", xpd = FALSE)
+ }
+
+ par(mar = c(0,2,3,1), xpd = NA)
+ plot.new()
+ plot.window(xlim = 0:1, ylim = 0:1)
+ legend("top",title="Probability",legend=seq(0.5,1,by = 0.05),col = breakCols,pch=16, bty = "n")
+
+
+ dev.off()
+
+
+ #Return the samples not flagged as outlier, without the last column (the outlier column when it was appended above)
+ return(tissueSamplesInfo[!tissueSamplesInfo$outlier,1:(ncol(tissueSamplesInfo)-1)])
+
+}, samplesWithPrediction = samplesWithPrediction)
+
+samplesWithPredictionNoOutliers <- do.call(rbind, nonOutlierSampleList)
+#save(samplesWithPredictionNoOutliers, file = "tissuePredictions/samplesWithPrediction_16_09_22_noOutliers.RData", verbose = T)
+#NOTE(review): the save() above is commented out; this load presumably replaces the object built just above with the version stored on disk - confirm they match
+load("tissuePredictions/samplesWithPrediction_16_09_22_noOutliers.RData", verbose = T)
+#Manual tissue selection, presumably for running the function bodies below by hand (TODO confirm)
+tissue = "fibroblasts_cell-lines_smooth-muscle-cell_mesenchymal-stem-cells"
+tissue = "HUVEC"
+
+#Store, per predicted tissue, the expression columns of its non-outlier samples as perTissueNormalization/globalQqnorm/<tissue>.RData
+sink <- lapply(tissueClasses, function(tissue, exp){
+ #load(file = paste0("perTissueNormalization/perTissueQq/",make.names(tissue),".RData"))
+ inTissue <- samplesWithPredictionNoOutliers$predictedTissue == tissue
+ tissueSamples <- rownames(samplesWithPredictionNoOutliers)[inTissue]
+ outFile <- paste0("perTissueNormalization/globalQqnorm/",make.names(tissue),".RData")
+ #The object has to be called tissueExp: the later steps load the file and use it under that name
+ tissueExp <- exp[,tissueSamples]
+ save(tissueExp, file = outFile)
+}, exp = exp)
+
+#Create co-expression matrices
+#Per tissue: gene-gene correlation of the stored expression, written as tab separated text to perTissueNormalization/qqCoExp/
+sink <- lapply(tissueClasses, function(tissue){
+ #Brings tissueExp into the function environment
+ load(file = paste0("perTissueNormalization/globalQqnorm/",make.names(tissue),".RData"), verbose = TRUE)
+
+ #https://stackoverflow.com/questions/18964837/fast-correlation-in-r-using-c-and-parallelization/18965892#18965892
+ centered <- tissueExp - rowMeans(tissueExp)
+ #Divide every gene by its length, the cross product is then equivalent to the correlation matrix
+ expScale <- centered / sqrt(rowSums(centered^2))
+ expCov <- tcrossprod(expScale)
+ coExpFile <- paste0("perTissueNormalization/qqCoExp/",make.names(tissue),".txt")
+ write.table(expCov, file = coExpFile, sep = "\t", quote = FALSE, col.names = NA)
+})
+
+#Per tissue PCA (through an SVD) of the outlier-free expression stored in perTissueNormalization/globalQqnorm/.
+#Saves pcaRes (eigenVectors, eigenValues, expPcs, explainedVariance) to perTissueNormalization/perTissueQqPcaNoOutliers/<tissue>.RData and returns the sample scores (expPcs).
+sink <- lapply(tissueClasses, function(tissue){
+ #load(file = paste0("perTissueNormalization/perTissueQq/",make.names(tissue),".RData"))
+ #tissueSamples <- rownames(samplesWithPredictionNoOutliers)[samplesWithPredictionNoOutliers$predictedTissue == tissue]
+ #tissueExp <- exp[,tissueSamples]
+
+ load(file = paste0("perTissueNormalization/globalQqnorm/",make.names(tissue),".RData"), verbose = TRUE)
+
+ #Text export for HUVEC only: without this condition every tissue overwrote huvec.txt.gz in turn
+ if(isTRUE(tissue == "HUVEC")){
+ write.table(tissueExp, file = gzfile("huvec.txt.gz"), sep = "\t", quote = FALSE, col.names = NA)
+ }
+
+ #https://stackoverflow.com/questions/18964837/fast-correlation-in-r-using-c-and-parallelization/18965892#18965892
+ #Center each gene
+ expScale <- tissueExp - rowMeans(tissueExp)
+ #Scale each gene to unit length
+ expScale <- expScale / sqrt(rowSums(expScale^2))
+ #expCov = tcrossprod(expScale);#equivalent to correlation due to center scale
+ #expEigen <- eigen(expCov)
+ #eigenVectors <- expEigen$vectors
+ #colnames(eigenVectors) <- paste0("PC_",1:ncol(eigenVectors))
+ #rownames(eigenVectors) <- rownames(expScale)
+
+ #eigenValues <- expEigen$values
+ #names(eigenValues) <- paste0("PC_",1:length(eigenValues))
+
+ #Here calculate sample principal components. Number needed is arbitrary (no more than eigen vectors)
+ #expPcs <- t(expScale) %*% expEigen$vectors[,1:10]
+ #colnames(expPcs) <- paste0("PC_",1:ncol(expPcs))
+
+ #Keep 50 components, or fewer when the tissue has fewer samples (or genes) than that
+ nComp <- min(50, dim(expScale))
+ expSvd <- svd(expScale, nu = nComp, nv = nComp)
+
+ eigenValues <- expSvd$d^2
+ eigenVectors <- expSvd$u
+ colnames(eigenVectors) <- paste0("PC_",seq_len(ncol(eigenVectors)))
+ rownames(eigenVectors) <- rownames(expScale)
+
+ #Sample scores: right singular vectors multiplied by their singular values.
+ #sweep() rather than %*% diag(): diag() of a single number builds an identity matrix instead
+ expPcs <- sweep(expSvd$v, 2, expSvd$d[seq_len(nComp)], "*")
+ colnames(expPcs) <- paste0("PC_",seq_len(ncol(expPcs)))
+ rownames(expPcs) <- colnames(expScale)
+
+ #Every gene has unit length, so the squared singular values sum to the number of genes
+ explainedVariance <- eigenValues * 100 / nrow(expScale)
+
+ pcaRes <- list(eigenVectors = eigenVectors, eigenValues = eigenValues, expPcs = expPcs, explainedVariance = explainedVariance)
+ save(pcaRes, file = paste0("perTissueNormalization/perTissueQqPcaNoOutliers/",make.names(tissue),".RData"))
+ return(expPcs)
+})
+#pcaRes only exists inside the functions above, so the stored result is loaded first. NOTE(review): assumes the huvec*.txt exports are meant for the "HUVEC" tissue class - confirm
+load(file = paste0("perTissueNormalization/perTissueQqPcaNoOutliers/",make.names("HUVEC"),".RData"))
+write.table(pcaRes$eigenVectors, file = "huvecEigenVectors.txt", sep = "\t", quote = F, col.names = NA)
+write.table(pcaRes$eigenValues, file = "huvecEigenValues.txt", sep = "\t", quote = F, col.names = NA)
+write.table(pcaRes$expPcs, file = "huvecPcs.txt", sep = "\t", quote = F, col.names = NA)
+#Gene eigenvectors of every tissue, with the tissue name prefixed to the component names
+pcsPerTissue <- lapply(tissueClasses, function(tissue){
+ pcaFile <- paste0("perTissueNormalization/perTissueQqPcaNoOutliers/",make.names(tissue),".RData")
+ load(file = pcaFile)
+ tissueVectors <- pcaRes$eigenVectors
+ colnames(tissueVectors) <- paste0(tissue,"_",colnames(tissueVectors))
+ tissueVectors
+})
+#str(pcsPerTissue)
+pcsPerTissue2 <- do.call(cbind, pcsPerTissue)
+str(pcsPerTissue2)
+#Cut the row names at the first "." that is followed by at least one character
+rownames(pcsPerTissue2) <- gsub("\\..+", "", rownames(pcsPerTissue2))
+write.table(pcsPerTissue2, file = "perTissueNormalization/perTissueQqPcaNoOutliers/combinedComponents.txt", sep = "\t", quote = FALSE, col.names = NA)
+
+pcsPerTissue2t <- t(pcsPerTissue2)
+pcsPerTissue2Scale = pcsPerTissue2t - rowMeans(pcsPerTissue2t)
+# Standardize each variable
+pcsPerTissue2Scale = pcsPerTissue2Scale / sqrt(rowSums(pcsPerTissue2Scale^2))
+#Cross product of the centered, unit-length rows: one row and one column per tissue component
+pcCorMatrix <- pcsPerTissue2Scale %*% t(pcsPerTissue2Scale)
+#Range of all values and of the diagonal
+range(pcCorMatrix)
+range(diag(pcCorMatrix))
+#Number of component pairs with a value of at least 0.8
+sum(pcCorMatrix[lower.tri(pcCorMatrix)] >= 0.8)
+#Per component: number of components (the component itself included) with a value of at least 0.7
+identicalPerPc <- apply(pcCorMatrix, 2, function(x){sum(x>=0.7)})
+tail(sort(identicalPerPc))
+#NOTE(review): no rpng() call precedes this hist(), unlike the plots further down - confirm which device the dev.off() closes
+hist(pcCorMatrix[,"Brain-Cortex_PC_3"])
+dev.off()
+
+pcCorMatrix[,"Whole Blood Fetal_PC_1"][pcCorMatrix[,"Whole Blood Fetal_PC_1"] >= 0.7]
+
+#Eigen decomposition of pcCorMatrix
+compEigen <- eigen(pcCorMatrix)
+str(compEigen)
+sum(compEigen$values)
+#Number of eigenvalues of at least 1; used below as the number of eigenvectors to keep
+(numberOfCompsEigenvalue1 <- sum(as.numeric(compEigen$values) >= 1))
+#NOTE(review): the projection below uses the unscaled matrix (t(pcsPerTissue2t)) while pcCorMatrix was built from the centered and scaled one - confirm this is intended
+str(compEigen)
+pcsOfComps <- t(pcsPerTissue2t) %*% compEigen$vectors[,1:numberOfCompsEigenvalue1]
+colnames(pcsOfComps) <- paste0("PC_",1:ncol(pcsOfComps))
+rownames(pcsOfComps) <- (gsub("\\..+", "", rownames(pcsOfComps)))
+write.table(pcsOfComps, col.names = NA, sep = "\t", quote = F, file = gzfile("perTissueNormalization/perTissueQqPcaNoOutliers/pcaCombinedComponents.txt.gz"))
+str(pcsOfComps)
+#Cumulative percentage of the eigenvalue sum
+rpng()
+plot(cumsum(as.numeric(compEigen$values) * 100 / sum(as.numeric(compEigen$values))))
+dev.off()
+#The eigenvalues themselves
+rpng()
+plot(as.numeric(compEigen$values))
+dev.off()
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+head(as.numeric(compEigen$values))
+#NOTE(review): at top level eigenValues is only assigned further down (from compSvd$d^2); here it has to come from an earlier interactive step - confirm
+sum(eigenValues >= 1)
+#SVD of the transposed scaled component matrix; its squared singular values are compared with 1 below
+compSvd <- svd(t(pcsPerTissue2Scale))
+str(compSvd)
+
+(numberOfCompsEigenvalue1 <- sum(compSvd$d^2>=1))
+
+setwd("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/")
+#NOTE(review): problem.RData is written by the save() further down; loading it here presumably replaces compSvd and numberOfCompsEigenvalue1 computed above - confirm
+load("problem.RData")
+
+#The five assignments below are the same expression stored under different names; presumably evaluated in separate runs to compare the outcome (TODO confirm)
+combinedCompsPcs <- compSvd$u[,1:numberOfCompsEigenvalue1] %*% diag(compSvd$d[1:numberOfCompsEigenvalue1])
+combinedCompsPcs2 <- compSvd$u[,1:numberOfCompsEigenvalue1] %*% diag(compSvd$d[1:numberOfCompsEigenvalue1])
+combinedCompsPcs3 <- compSvd$u[,1:numberOfCompsEigenvalue1] %*% diag(compSvd$d[1:numberOfCompsEigenvalue1])
+combinedCompsPcs4 <- compSvd$u[,1:numberOfCompsEigenvalue1] %*% diag(compSvd$d[1:numberOfCompsEigenvalue1])
+combinedCompsPcs5 <- compSvd$u[,1:numberOfCompsEigenvalue1] %*% diag(compSvd$d[1:numberOfCompsEigenvalue1])
+#Compare the first column of the first copy with that of the other copies
+cor.test(combinedCompsPcs[,1], combinedCompsPcs2[,1])
+cor.test(combinedCompsPcs[,1], combinedCompsPcs3[,1])
+cor.test(combinedCompsPcs[,1], combinedCompsPcs4[,1])
+cor.test(combinedCompsPcs[,1], combinedCompsPcs5[,1])
+
+range(abs(combinedCompsPcs[,1]) - abs(combinedCompsPcs2[,1]))
+
+plot(combinedCompsPcs[,1], combinedCompsPcs2[,1])
+plot(combinedCompsPcs[,1], combinedCompsPcs3[,1])
+plot(combinedCompsPcs[,1], combinedCompsPcs4[,1])
+plot(combinedCompsPcs[,1], combinedCompsPcs5[,1])
+dev.off()
+
+save(compSvd, numberOfCompsEigenvalue1, file = "problem.RData")
+
+str(combinedCompsPcs)
+range(combinedCompsPcs)
+#NOTE(review): teest is not defined anywhere in the visible part of this script
+plot(as.numeric(teest[,1]),combinedCompsPcs[,1])
+dev.off()
+
+
+head(compSvd$d^2)
+head(compSvd$d^2)
+
+eigenValues <- compSvd$d^2
+#Cumulative percentage of the squared singular values, followed by the values themselves
+rpng()
+plot(cumsum(eigenValues * 100 / sum(eigenValues)))
+dev.off()
+
+
+
+
+rpng()
+plot(eigenValues)
+dev.off()
+
diff --git a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/doPcaUsingCorMatrix.R b/Downstreamer/src/main/r/downstreamer_main/recount3/performPcaOnFullRecount3Data.R
similarity index 91%
rename from Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/doPcaUsingCorMatrix.R
rename to Downstreamer/src/main/r/downstreamer_main/recount3/performPcaOnFullRecount3Data.R
index 25d056ae6..e312a3287 100644
--- a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/doPcaUsingCorMatrix.R
+++ b/Downstreamer/src/main/r/downstreamer_main/recount3/performPcaOnFullRecount3Data.R
@@ -18,15 +18,16 @@ rm(table_tmp)
str(exp)
#save(exp, file = "/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Recount3_QC_2ndRun/Filtered_Matrices/TPM_log2_QNorm_QCed_CovCorrected_AllCovariates.RData")
+load(file = "/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Recount3_QC_2ndRun/Filtered_Matrices/TPM_log2_QNorm_QCed_CovCorrected_AllCovariates.RData")
+#exp contains expression rows genes cols samples
+#First center and scale each row
#https://stackoverflow.com/questions/18964837/fast-correlation-in-r-using-c-and-parallelization/18965892#18965892
expScale = exp - rowMeans(exp);
# Standardize each variable
expScale = expScale / sqrt(rowSums(expScale^2));
-expCov = tcrossprod(expScale);#equevelent to correlation due to center scale
-range(expCov)
-str(expCov)
+expCov = tcrossprod(expScale);#equivalent to correlation due to center scale
expEigen <- eigen(expCov)
@@ -41,8 +42,7 @@ str(eigenValues)
save(eigenVectors, eigenValues, expFile, file = "/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Recount3_QC_2ndRun/PCA_Patrick/eigen.RData")
-
-
+#Here calculate sample principal components. Number needed is arbitrary (no more than eigen vectors)
expPcs <- t(expScale) %*% expEigen$vectors[,1:1000]
colnames(expPcs) <- paste0("PC_",1:ncol(expPcs))
diff --git a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/recountCancerCellline2.R b/Downstreamer/src/main/r/downstreamer_main/recount3/predictRecount3CancerCellllines.R
similarity index 99%
rename from Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/recountCancerCellline2.R
rename to Downstreamer/src/main/r/downstreamer_main/recount3/predictRecount3CancerCellllines.R
index e6020bd90..02791d9a6 100644
--- a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/recountCancerCellline2.R
+++ b/Downstreamer/src/main/r/downstreamer_main/recount3/predictRecount3CancerCellllines.R
@@ -2,7 +2,7 @@ library(parallel)
setwd("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/")
-load(file = "combinedMeta_2022_09_02.RData", verbose = T)
+load(file = "combinedMeta_2022_09_15.RData", verbose = T)
load(file = "Recount3_QC_2ndRun/PCA_Patrick/pcs.RData", verbose = T)
load(file = "Recount3_QC_2ndRun/PCA_Patrick/eigen.RData", verbose = T)
@@ -20,7 +20,7 @@ pcsAndMeta <- merge(expPcs[,1:compsToUse], combinedMeta, by = 0)
rownames(pcsAndMeta) <- pcsAndMeta$Row.names
-save(pcsAndMeta, compsToUse, file = "DataForPredictions.RData")
+#save(pcsAndMeta, compsToUse, file = "DataForPredictions.RData")
dim(pcsAndMeta)
pcsAndMeta <- pcsAndMeta[!pcsAndMeta$exclude,]
diff --git a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/qqNorm.R b/Downstreamer/src/main/r/downstreamer_main/recount3/qqNormOfFullRecount3.R
similarity index 100%
rename from Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/qqNorm.R
rename to Downstreamer/src/main/r/downstreamer_main/recount3/qqNormOfFullRecount3.R
diff --git a/Genotype-Harmonizer/pom.xml b/Genotype-Harmonizer/pom.xml
index 4414958d1..44e9bf74a 100644
--- a/Genotype-Harmonizer/pom.xml
+++ b/Genotype-Harmonizer/pom.xml
@@ -7,7 +7,7 @@
4.0.0
Genotype-Harmonizer
- 1.4.23-SNAPSHOT
+ 1.4.24-SNAPSHOT
Genotype Harmonizer
jar