From b8cfb73702a964ecb6762b7c52685bfd87449b5f Mon Sep 17 00:00:00 2001 From: Patrick Deelen Date: Tue, 13 Sep 2022 22:41:34 +0200 Subject: [PATCH 01/22] Update umap.R --- .../legacy_scripts/recount3/umap.R | 185 +++++++++++++++++- 1 file changed, 183 insertions(+), 2 deletions(-) diff --git a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/umap.R b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/umap.R index 850281aed..a3cf01281 100644 --- a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/umap.R +++ b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/umap.R @@ -124,8 +124,8 @@ dev.off() locator(n =2, type = "l") - - +cluster1 <- locator(n =2, type = "l") +cluster2 <- locator(n =2, type = "l") write.table(umapAndMeta,file = "umaptest.txt", sep = "\t", quote = F, col.names = NA) @@ -164,3 +164,184 @@ dev.off() + + +#smartseq plots + +someSmartSeqStudies <- read.delim("selectionSmartseqStudies.txt", header = F)[,1] +str(someSmartSeqStudies) + +someSmartSeqSamples <- read.delim("smartseqSamples.txt", header = T)[,1] +str(someSmartSeqSamples) + +umapAndMeta$smartseqcol <- defaultCol +umapAndMeta$smartseqcol[umapAndMeta$study %in% someSmartSeqStudies] <- "pink" +umapAndMeta$smartseqcol[umapAndMeta$Row.names %in% someSmartSeqSamples] <- "pink" + +umapAndMeta$plotOrdersq <- order(umapAndMeta$smartseqcol != defaultCol) + + +par(mar = c(3,5,0.1,0.1), xpd = NA) +plot(umapAndMeta[umapAndMeta$plotOrderTissues,"UMAP1"], umapAndMeta[umapAndMeta$plotOrderTissues,"UMAP2"], col = umapAndMeta$smartseqcol[umapAndMeta$plotOrderTissues], cex = 0.2, pch = 16) + + +#instestine clusters + +umapAndMeta$intestineCluster <- "" +umapAndMeta$intestineCluster[umapAndMeta$UMAP1 >= cluster1$x[1] & umapAndMeta$UMAP1 <= cluster1$x[2] & umapAndMeta$UMAP2 >= cluster1$y[1] & umapAndMeta$UMAP2 <= cluster1$y[2]] <- "c1" +umapAndMeta$intestineCluster[umapAndMeta$UMAP1 >= cluster2$x[1] & umapAndMeta$UMAP1 <= cluster2$x[2] & 
umapAndMeta$UMAP2 >= cluster2$y[1] & umapAndMeta$UMAP2 <= cluster2$y[2]] <- "c2" +table(umapAndMeta$intestineCluster) + +table(factor(umapAndMeta$umapFactor[umapAndMeta$intestineCluster=="c1"])) +table(factor(umapAndMeta$umapFactor[umapAndMeta$intestineCluster=="c2"])) + +table(factor(umapAndMeta$class[umapAndMeta$intestineCluster=="c1"])) +table(factor(umapAndMeta$class[umapAndMeta$intestineCluster=="c2"])) + +a <- as.data.frame(table(paste(umapAndMeta$Cohort, umapAndMeta$class)[umapAndMeta$intestineCluster=="c1"])) +b <- as.data.frame(table(paste(umapAndMeta$Cohort, umapAndMeta$class)[umapAndMeta$intestineCluster=="c2"])) + + +table(paste(umapAndMeta$Cohort, umapAndMeta$class)[umapAndMeta$intestineCluster!=""], umapAndMeta$intestineCluster[umapAndMeta$intestineCluster!=""]) + +str(a) +c <- merge(a,b,by = 0, all = T) +c + +load("metadata_gtex.Rda", verbose = T) +View(metadata_gtex) + + +gtexTansverse <- umapAndMeta[umapAndMeta$study == "GTEx" & umapAndMeta$Tissue2 == "Transverse" & umapAndMeta$intestineCluster != "",] + +rownames(gtexTansverse) <- gtexTansverse$Row.names + +rownames(metadata_gtex) <- metadata_gtex$external_id + +dim(gtexTansverse) +gtexTansverse <- merge(gtexTansverse, metadata_gtex[,!colnames(metadata_gtex) %in% colnames(gtexTansverse)], by = 0) +dim(gtexTansverse) + +table(gtexTansverse$gtex.smatsscr, gtexTansverse$intestineCluster) + +fisher.test(table(gtexTansverse$gtex.smatsscr, gtexTansverse$intestineCluster)) +grep("MHBCTINF", colnames(gtexTansverse), ignore.case = T) + + + + +numCols <- colnames(gtexTansverse)[unlist(lapply(gtexTansverse, is.numeric)) ] + +colName <- "sra.paired_nominal_length" +clusterCompare <- sapply(numCols, function(colName){ + #print(colName) + if(!all(is.na(gtexTansverse[,colName])) & sd(gtexTansverse[,colName], na.rm =T) > 0 ){ + t.test(gtexTansverse[,colName] ~ gtexTansverse$intestineCluster)$p.value + } + +}) +clusterCompare <- unlist(clusterCompare) +clusterCompare2 <- clusterCompare[grep("PC_", 
names(clusterCompare), invert = T)] +sort(clusterCompare2, decreasing = T) +boxplot(gtexTansverse$`recount_qc.aligned_reads%.chrx` ~ gtexTansverse$intestineCluster) +boxplot(gtexTansverse$`recount_qc.aligned_reads%.chrx` ~ paste0(gtexTansverse$intestineCluster, "_",gtexTansverse$gtex.sex)) +boxplot(gtexTansverse$`recount_qc.aligned_reads%.chrm` ~ gtexTansverse$intestineCluster) +boxplot(gtexTansverse$`` ~ gtexTansverse$intestineCluster) + +boxplot(gtexTansverse$`recount_qc.star.number_of_reads_unmapped:_other_both` ~ gtexTansverse$intestineCluster) + +boxplot(gtexTansverse$`gtex.smtsisch` ~ gtexTansverse$intestineCluster) +boxplot(gtexTansverse$`CnvAutoCor` ~ gtexTansverse$intestineCluster) + +#save(gtexTansverse, file = "gtexTansverse.RData") +load("gtexTansverse.RData") + + +str(row.names(gtexTansverse)) +str(gtexTansverse$Row.names) +str(exp) +expgT <- exp[,gtexTansverse$Row.names] +save(expgT, file = "expgT.RData") +load( "expgT.RData") + + +colnames(expgT) +expgT <- t(expgT) +all(rownames(expgT) == gtexTansverse$Row.names) + +x <- expgT[,1] + +diffExp <- apply(expgT, 2, function(x){ + t.test(x ~gtexTansverse$intestineCluster)$statistic +}) +hist(-log10(diffExp)) +names(diffExp)[order(diffExp)[1:100]] +cat(sub("\\..+","",names(diffExp)[order(diffExp, decreasing = T)[1:200]]), sep = "\n") + +load("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Recount3_QC_2ndRun/SRA_Studies_Annotations_Patrick/Fibroblasts.rda", verbose = T) +str(fibroblasts) + +load("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Recount3_QC_2ndRun/SRA_Studies_Annotations_Patrick/BloodVessels.rda", verbose = T) + + + + +minSamplesTraining <- 50 +maxFractionOfStudy <- 0.8 + + +umapAndMetaClassified <- umapAndMeta[!is.na(umapAndMeta$umapFactor),] +umapAndMetaClassified$training <- FALSE + +tissueClass <- levels(umapAndMetaClassified$umapFactor)[2] +study <- "GTEx" + +set.seed(42) +#for each tissue slecect samples for training +for(tissueClass in levels(umapAndMetaClassified$umapFactor)){ 
+ thisTissueSamples <- umapAndMetaClassified$umapFactor==tissueClass + studiesForThisTissue <- unique(umapAndMetaClassified$study[thisTissueSamples]) + numberOfStudies <- length(studiesForThisTissue) + numberOfSamplesPerStudy <- ceiling(minSamplesTraining / numberOfStudies) + print(paste(tissueClass, length(studiesForThisTissue), numberOfSamplesPerStudy, sep = " - ")) + #for each studies put samples to training or test + for(study in studiesForThisTissue){ + + thisTissueAndStudySamples <- thisTissueSamples & umapAndMetaClassified$study == study + thisTissueAndStudySamplesCount <- sum(thisTissueAndStudySamples) + + #Don't select more samples from study then the study has and also no more then set fraction. Do floor to put studies with single sample to testset + potentialMax <- floor(thisTissueAndStudySamplesCount * maxFractionOfStudy) + numberTrainingSamplesThisStudy <- if(potentialMax > numberOfSamplesPerStudy) numberOfSamplesPerStudy else potentialMax + if(numberTrainingSamplesThisStudy > 0){ + #The which will get all indices for the samples of this study-tissue combination. 
These are then samples for the samples used for training + trainingSamplesThisStudy <- sample(which(thisTissueAndStudySamples), numberTrainingSamplesThisStudy) + #Set selected to TRUE + umapAndMetaClassified$training[trainingSamplesThisStudy] <- TRUE + } + + + #print(paste0(thisTissueAndStudySamplesCount, " - ", numberTrainingSamplesThisStudy)) + } + +} + +sum(umapAndMetaClassified$training) + +umapAndMetaClassifiedTraining <- umapAndMetaClassified[umapAndMetaClassified$training,] +table(umapAndMetaClassifiedTraining$umapFactor) +umapAndMetaClassifiedTest <- umapAndMetaClassified[!umapAndMetaClassified$training,] +dim(umapAndMetaClassifiedTest) + + + +cfit <- cv.glmnet(x = as.matrix(umapAndMetaClassifiedTraining[,paste0("PC_",1:compsToUse)]), y = umapAndMetaClassifiedTraining$umapFactor, family = "multinomial", type.measure = "class", alpha=1, nlambda=100) +best_lambda <- cfit$lambda.min +cfit + + + +fibTraining <- fibroblasts$Row.names +bvTraining <- bloodVessels$Row.names +bvTraining +unique(bvTraining) From 940366ed9d7c7679726d03f84ec880bb568547a5 Mon Sep 17 00:00:00 2001 From: Patrick Deelen Date: Fri, 16 Sep 2022 12:16:14 +0200 Subject: [PATCH 02/22] recount3 --- .../legacy_scripts/combineAnnotations.R | 7 +- .../recount3/recountCancerCellline2.R | 4 +- .../legacy_scripts/recount3/umap.R | 260 +++++++++++++++--- 3 files changed, 234 insertions(+), 37 deletions(-) diff --git a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/combineAnnotations.R b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/combineAnnotations.R index 5b3d371fc..892d655c4 100644 --- a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/combineAnnotations.R +++ b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/combineAnnotations.R @@ -1564,7 +1564,7 @@ combinedMeta$CelllineName[studySamples] <- "" combinedMeta$Cancer[studySamples] <- FALSE #SRP081020 is mis-anotated in SRA as PBMC, paper and clustering both state wholeblood -studySamples <- combinedMeta$study 
%in% c("ERP114104", "SRP051848", "SRP056784", "SRP071965", "SRP077975", "SRP081020", "SRP098758", "SRP113245", "SRP126580", "SRP126582", "SRP126583", "SRP136057", "SRP144583", "SRP150872", "SRP214077") +studySamples <- combinedMeta$study %in% c("ERP114104", "SRP051848", "SRP056784", "SRP071965", "SRP077975", "SRP081020", "SRP098758", "SRP113245", "SRP126580", "SRP126582", "SRP126583", "SRP136057", "SRP144583", "SRP150872", "SRP214077", "SRP056443") combinedMeta$Tissue[studySamples] <- "" combinedMeta$Tissue2[studySamples] <- "" combinedMeta$Cellline[studySamples] <- NA @@ -1573,6 +1573,9 @@ combinedMeta$Cancer[studySamples] <- NA + + + combinedMeta$Cellline[!is.na(combinedMeta$CelllineName)&combinedMeta$CelllineName=="iPSC"] <- TRUE combinedMeta$Cancer[!is.na(combinedMeta$Cellline) & combinedMeta$Cellline] <- NA @@ -1613,7 +1616,7 @@ combinedMeta$Tissue[combinedMeta$Cohort == "GSA"] -#save(combinedMeta, file = "combinedMeta_2022_09_02.RData") +#save(combinedMeta, file = "combinedMeta_2022_09_15.RData") load(file = "combinedMeta_2022_08_19.RData") diff --git a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/recountCancerCellline2.R b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/recountCancerCellline2.R index e6020bd90..02791d9a6 100644 --- a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/recountCancerCellline2.R +++ b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/recountCancerCellline2.R @@ -2,7 +2,7 @@ library(parallel) setwd("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/") -load(file = "combinedMeta_2022_09_02.RData", verbose = T) +load(file = "combinedMeta_2022_09_15.RData", verbose = T) load(file = "Recount3_QC_2ndRun/PCA_Patrick/pcs.RData", verbose = T) load(file = "Recount3_QC_2ndRun/PCA_Patrick/eigen.RData", verbose = T) @@ -20,7 +20,7 @@ pcsAndMeta <- merge(expPcs[,1:compsToUse], combinedMeta, by = 0) rownames(pcsAndMeta) <- pcsAndMeta$Row.names -save(pcsAndMeta, compsToUse, 
file = "DataForPredictions.RData") +#save(pcsAndMeta, compsToUse, file = "DataForPredictions.RData") dim(pcsAndMeta) pcsAndMeta <- pcsAndMeta[!pcsAndMeta$exclude,] diff --git a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/umap.R b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/umap.R index a3cf01281..4a6d1148c 100644 --- a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/umap.R +++ b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/umap.R @@ -14,40 +14,33 @@ setwd("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/") tissueCol <- read.delim("umap/col.txt", row.names = 1, na.strings = "") load(file = "DataForPredictions.RData") +rownames(pcsAndMeta) <- pcsAndMeta$Row.names +load(file = "combinedMeta_2022_09_15.RData", verbose = T) -#load(file = "combinedMeta_2022_08_30.RData", verbose = T) -#str(combinedMeta) -#updatedAnnotations <- combinedMeta[,c("Tissue", "Tissue2", "Cellline", "CelllineName", "Cancer", "Cohort", "Fetal")] - -#all(rownames(pcsAndMeta) %in% rownames(updatedAnnotations)) -#updatedAnnotations <- updatedAnnotations[rownames(pcsAndMeta),] -#all(rownames(pcsAndMeta) == rownames(updatedAnnotations)) -#pcsAndMeta[,colnames(updatedAnnotations)] <- updatedAnnotations +colnamesToUpdate <- colnames(pcsAndMeta)[colnames(pcsAndMeta) %in% colnames(combinedMeta)] +all(rownames(pcsAndMeta) %in% rownames(combinedMeta)) +pcsAndMeta[,colnamesToUpdate] <- combinedMeta[rownames(pcsAndMeta),colnamesToUpdate] -#pcsAndMeta$selectedSamples <- !pcsAndMeta$excludeBasedOnPredictionCellline2 & !pcsAndMeta$excludeBasedOnPredictionCancer & !(!is.na(pcsAndMeta$Cancer) & pcsAndMeta$Cancer) & !(!is.na(pcsAndMeta$Cellline) & pcsAndMeta$Cellline) table(pcsAndMeta$selectedSamples, useNA = "a") clusterAnnotations <- read.delim("umap/annotationsBasedOnOldUmap.txt", row.names = 1) -pcsAndMeta <- merge(pcsAndMeta, clusterAnnotations, by = 0, all.x = T) -rownames(pcsAndMeta) <- pcsAndMeta$Row.names 
-table(pcsAndMeta$ClusterAnnotation) - - - - -#pcsAndMeta[!is.na(pcsAndMeta$study) & (pcsAndMeta$study== "ERP104864") & (grepl("synovium", pcsAndMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "" +samplesWithClusterAnnotation <- rownames(pcsAndMeta)[rownames(pcsAndMeta) %in% rownames(clusterAnnotations)] +pcsAndMeta$ClusterAnnotation <- NA +pcsAndMeta[samplesWithClusterAnnotation, "ClusterAnnotation"] <- clusterAnnotations[samplesWithClusterAnnotation,"ClusterAnnotation"] +table(pcsAndMeta$ClusterAnnotation, useNA = "a") tissueSamples <- pcsAndMeta[pcsAndMeta$selectedSamples,] tissueSamples$class <- tissueSamples$Tissue -hasT2 <- tissueSamples$Tissue2 != "" -tissueSamples$class[hasT2] <- paste0(tissueSamples$class[hasT2], "-", tissueSamples$Tissue2[hasT2]) +hasT2 <- tissueSamples$Tissue2 != "" +tissueSamples$class[hasT2] <- paste0(tissueSamples$Tissue[hasT2], "-", tissueSamples$Tissue2[hasT2]) +table(tissueSamples$class) isFetal <- !is.na(tissueSamples$Fetal) & tissueSamples$Fetal tissueSamples$class[isFetal] <- paste0(tissueSamples$class[isFetal], "-Fetal") @@ -66,7 +59,7 @@ mapping <- read.delim("umap/tissuesMapping.txt") str(mapping) all(tissueSamples$class %in% mapping$Class) - +tissueSamples$class[!tissueSamples$class %in% mapping$Class] tissueSamples$umapFactor <- as.factor(mapping$ClassificationClass[match(tissueSamples$class, mapping$Class)]) @@ -95,11 +88,11 @@ umapInput <- as.matrix(tissueSamples[,paste0("PC_",1:compsToUseForUmap)]) sampleUmap <- umap( umapInput, - n_epochs = 1000, + n_epochs = 300, init = init, n_neighbors = 500, min_dist = 2, init_sdev = 1e-4, learning_rate = 1, - spread = 15, + spread = 20, bandwidth = 10, scale = "scale", local_connectivity = 1, @@ -109,10 +102,12 @@ sampleUmap <- umap( rownames(sampleUmap) <- rownames(tissueSamples) colnames(sampleUmap) <- c("UMAP1", "UMAP2") umapAndMeta <- merge(sampleUmap, tissueSamples, by = 0) +rownames(umapAndMeta) <- umapAndMeta$Row.names dim(umapAndMeta) - +#save(sampleUmap, file = 
"umap/sampleUmap.RData") +#load(file = "umap/sampleUmap.RData") rpng() @@ -123,12 +118,16 @@ plot(umapAndMeta[umapAndMeta$plotOrderTissues,"UMAP1"], umapAndMeta[umapAndMeta$ dev.off() + + locator(n =2, type = "l") cluster1 <- locator(n =2, type = "l") cluster2 <- locator(n =2, type = "l") -write.table(umapAndMeta,file = "umaptest.txt", sep = "\t", quote = F, col.names = NA) +write.table(umapAndMeta[,!grepl("PC_",colnames(umapAndMeta))],file = "umaptest.txt", sep = "\t", quote = F, col.names = NA) +#save(umapAndMeta, file = "umaptest.RData") +#load("umaptest.RData") #save.image( file="umap_tmp.RData") #load("umap_tmp.RData") @@ -150,7 +149,7 @@ pdf(file = "umaptest.pdf", width = 16, height = 8) layout(matrix(1:2,ncol = 2)) par(mar = c(3,5,0.1,0.1), xpd = NA) -plot(umapAndMeta[plotOrderTissues,"UMAP1"], umapAndMeta[plotOrderTissues,"UMAP2"], col = umapAndMeta$TissueCol[plotOrderTissues], cex = 0.4, pch = 16) +plot(umapAndMeta[umapAndMeta$plotOrderTissues,"UMAP1"], umapAndMeta[umapAndMeta$plotOrderTissues,"UMAP2"], col = umapAndMeta$umapAndMeta$TissueCol[plotOrderTissues], cex = 0.4, pch = 16) par(mar = c(0,0,0,0), xpd = NA) plot.new() @@ -289,8 +288,9 @@ load("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Recount3_QC_2ndRun/SRA minSamplesTraining <- 50 maxFractionOfStudy <- 0.8 - +#Take only samples that have an annotation umapAndMetaClassified <- umapAndMeta[!is.na(umapAndMeta$umapFactor),] +#First put all in test, algorithm will put some umapAndMetaClassified$training <- FALSE tissueClass <- levels(umapAndMetaClassified$umapFactor)[2] @@ -334,14 +334,208 @@ umapAndMetaClassifiedTest <- umapAndMetaClassified[!umapAndMetaClassified$traini dim(umapAndMetaClassifiedTest) - -cfit <- cv.glmnet(x = as.matrix(umapAndMetaClassifiedTraining[,paste0("PC_",1:compsToUse)]), y = umapAndMetaClassifiedTraining$umapFactor, family = "multinomial", type.measure = "class", alpha=1, nlambda=100) -best_lambda <- cfit$lambda.min +library(glmnet) +cfit <- cv.glmnet(x = 
as.matrix(umapAndMetaClassifiedTraining[,paste0("PC_",1:compsToUse)]), y = umapAndMetaClassifiedTraining$umapFactor, family = "multinomial", type.measure = "class") cfit +rpng() +plot(cfit) +dev.off() + + + +assess.glmnet(cfit, newx = as.matrix(umapAndMetaClassifiedTest[,paste0("PC_",1:compsToUse)]), newy = umapAndMetaClassifiedTest$umapFactor, family = "multinomial", type.measure = "class", keep = TRUE, alpha=1, lambda = "1se") + + + +predictionsTest <- predict(cfit, s = "lambda.1se", newx = as.matrix(umapAndMetaClassifiedTest[,paste0("PC_",1:compsToUse)]), type = "class") + +predictionsTestScores <- predict(cfit, s = "lambda.1se", newx = as.matrix(umapAndMetaClassifiedTest[,paste0("PC_",1:compsToUse)]), type = "response") +predictionsTestScores <- predictionsTestScores[,,1] +umapAndMetaClassifiedTest$predictedTissueScore <- apply(predictionsTestScores, 1, max) + +prop = 0.5 + +predictionsInTest <- sapply(seq(0,1,0.05), function(prop){ + + umapAndMetaClassifiedTest$predictedTissue <- predictionsTest[,1] + + + umapAndMetaClassifiedTest$predictedTissue[umapAndMetaClassifiedTest$predictedTissueScore <= prop] <- NA + + umapAndMetaClassifiedTest$misclasified <- FALSE + umapAndMetaClassifiedTest$misclasified[!is.na(umapAndMetaClassifiedTest$umapFactor) & !is.na(umapAndMetaClassifiedTest$predictedTissue) & umapAndMetaClassifiedTest$umapFactor != umapAndMetaClassifiedTest$predictedTissue] <- TRUE + errors <- sum(umapAndMetaClassifiedTest$misclasified ) + + umapAndMetaClassifiedTest$notPredictedBack <- FALSE + umapAndMetaClassifiedTest$notPredictedBack[!is.na(umapAndMetaClassifiedTest$umapFactor) & is.na(umapAndMetaClassifiedTest$predictedTissue) ] <- TRUE + missed <- sum(umapAndMetaClassifiedTest$notPredictedBack) + + total <- nrow(umapAndMetaClassifiedTest) + + missedPercentage <- missed / total + errorPercentage <- errors / total + + return(c("Threshold" = prop, "MissedPerc" = missedPercentage , "ErrorPerc" = errorPercentage )) + +}) +predictionsInTest + +tissueClass <- 
levels(umapAndMetaClassified$umapFactor)[1] + +predictionsInTestPerTissue <- lapply(levels(umapAndMetaClassified$umapFactor), function(tissueClass){ + predictionsInTestThisTissue <- sapply(seq(0,1,0.05), function(prop){ + + umapAndMetaClassifiedTestTissue <- umapAndMetaClassifiedTest[umapAndMetaClassifiedTest$umapFactor == tissueClass,] + umapAndMetaClassifiedTestTissue$predictedTissue <- predictionsTest[umapAndMetaClassifiedTest$umapFactor == tissueClass,1] + + + umapAndMetaClassifiedTestTissue$predictedTissue[umapAndMetaClassifiedTestTissue$predictedTissueScore <= prop] <- NA + + umapAndMetaClassifiedTestTissue$misclasified <- FALSE + umapAndMetaClassifiedTestTissue$misclasified[!is.na(umapAndMetaClassifiedTestTissue$umapFactor) & !is.na(umapAndMetaClassifiedTestTissue$predictedTissue) & umapAndMetaClassifiedTestTissue$umapFactor != umapAndMetaClassifiedTestTissue$predictedTissue] <- TRUE + errors <- sum(umapAndMetaClassifiedTestTissue$misclasified ) + + umapAndMetaClassifiedTestTissue$notPredictedBack <- FALSE + umapAndMetaClassifiedTestTissue$notPredictedBack[!is.na(umapAndMetaClassifiedTestTissue$umapFactor) & is.na(umapAndMetaClassifiedTestTissue$predictedTissue) ] <- TRUE + missed <- sum(umapAndMetaClassifiedTestTissue$notPredictedBack) + + total <- nrow(umapAndMetaClassifiedTestTissue) + + missedPercentage <- missed / total + errorPercentage <- errors / total + + return(c("Threshold" = prop, "MissedPerc" = missedPercentage , "ErrorPerc" = errorPercentage )) + + }) + return(predictionsInTestThisTissue) +}) + +str(predictionsInTestPerTissue) + +layout(matrix(1:2, nrow = 1)) +plot(t(predictionsInTest[1:2,]), main = "Percentage classification missed in test dataset") +sink <- sapply(predictionsInTestPerTissue, function(predictionsInTestThisTissue){ + points(t(predictionsInTestThisTissue[1:2,]), type = "l", col=adjustcolor("grey", alpha.f = 0.5)) +}) +plot(t(predictionsInTest[c(1,3),]), main = "Percentage wrong classification in test dataset") +sink <- 
sapply(predictionsInTestPerTissue, function(predictionsInTestThisTissue){ + points(t(predictionsInTestThisTissue[c(1,3),]), type = "l", col=adjustcolor("grey", alpha.f = 0.5)) +}) + + + + +confusion <- confusion.glmnet(cfit, newx = as.matrix(umapAndMetaClassifiedTest[,paste0("PC_",1:compsToUse)]), newy = umapAndMetaClassifiedTest$umapFactor, family = "multinomial", type.measure = "class", keep = TRUE, alpha=1, lambda = "1se") +diag(confusion) <- 0 + +library(heatmap3) + +rpng() +pdf("confusion.pdf", width = 12, height = 12) +heatmap3(confusion, Rowv = NA, Colv = NA, balanceColor =T, scale = "none") +dev.off() + + +predictions <- predict(cfit, s = "lambda.1se", newx = as.matrix(umapAndMeta[,paste0("PC_",1:compsToUse)]), type = "class") +umapAndMeta$predictedTissue <- predictions[,1] + +predictionsScores <- predict(cfit, s = "lambda.1se", newx = as.matrix(umapAndMeta[,paste0("PC_",1:compsToUse)]), type = "response") +predictionsScores <- predictionsScores[,,1] +rownames(predictionsScores) <- umapAndMeta$Row.names +str(predictionsScores) +sort(predictionsScores["SRR5499181",]) +umapAndMeta$predictedTissueScore <- apply(predictionsScores, 1, max) + +sum(umapAndMeta$predictedTissueScore <= 0.5) +umapAndMeta$predictedTissue[umapAndMeta$predictedTissueScore <= 0.5] <- NA +sum(umapAndMeta$predictedTissueScore <= 0.5) + +rpng() +hist(umapAndMeta$predictedTissueScore) +dev.off() + +umapAndMeta$misclasified <- FALSE +umapAndMeta$misclasified[!is.na(umapAndMeta$umapFactor) & !is.na(umapAndMeta$predictedTissue) & umapAndMeta$umapFactor != umapAndMeta$predictedTissue] <- TRUE +sum(umapAndMeta$misclasified ) + +umapAndMeta$notPredictedBack <- FALSE +umapAndMeta$notPredictedBack[!is.na(umapAndMeta$umapFactor) & is.na(umapAndMeta$predictedTissue) ] <- TRUE +sum(umapAndMeta$notPredictedBack) + +sort(table(umapAndMeta[umapAndMeta$misclasified, "umapFactor"])) +sort(table(umapAndMeta[umapAndMeta$notPredictedBack, "umapFactor"])) +tissueClass <- levels(umapAndMeta$umapFactor)[1] 
-fibTraining <- fibroblasts$Row.names -bvTraining <- bloodVessels$Row.names -bvTraining -unique(bvTraining) +pdf("tissuePrediction.pdf") +for(tissueClass in levels(umapAndMeta$umapFactor)){ + + umapAndMeta$ThisTissueCol <- defaultCol + umapAndMeta$ThisTissueCol[!is.na(umapAndMeta$umapFactor) & tissueClass == umapAndMeta$umapFactor & !umapAndMeta$misclasified] <- adjustcolor("forestgreen", alpha.f = 0.5) + umapAndMeta$ThisTissueCol[!is.na(umapAndMeta$umapFactor) & tissueClass == umapAndMeta$umapFactor & umapAndMeta$notPredictedBack] <- adjustcolor("hotpink", alpha.f = 0.5) + umapAndMeta$ThisTissueCol[!is.na(umapAndMeta$umapFactor) & tissueClass == umapAndMeta$umapFactor & umapAndMeta$misclasified] <- adjustcolor("violetred3", alpha.f = 0.5) + umapAndMeta$ThisTissueCol[!is.na(umapAndMeta$umapFactor) & !is.na(umapAndMeta$predictedTissue) & tissueClass != umapAndMeta$umapFactor & tissueClass == umapAndMeta$predictedTissue] <- adjustcolor("orange1", alpha.f = 0.5) + umapAndMeta$ThisTissueCol[is.na(umapAndMeta$umapFactor) & !is.na(umapAndMeta$predictedTissue) & tissueClass == umapAndMeta$predictedTissue] <- adjustcolor("dodgerblue1", alpha.f = 0.5) + + predictedBack <- sum(!is.na(umapAndMeta$umapFactor) & tissueClass == umapAndMeta$umapFactor & !umapAndMeta$misclasified) + notPredictedBack <- sum(!is.na(umapAndMeta$umapFactor) & tissueClass == umapAndMeta$umapFactor & umapAndMeta$notPredictedBack) + predictedAsOther <- sum(!is.na(umapAndMeta$umapFactor) & tissueClass == umapAndMeta$umapFactor & umapAndMeta$misclasified) + otherPredicted <- sum(!is.na(umapAndMeta$umapFactor) & !is.na(umapAndMeta$predictedTissue) & tissueClass != umapAndMeta$umapFactor & tissueClass == umapAndMeta$predictedTissue) + newPredicted <- sum(is.na(umapAndMeta$umapFactor) & !is.na(umapAndMeta$predictedTissue) & tissueClass == umapAndMeta$predictedTissue) + + table(umapAndMeta$ThisTissueCol, useNA = "a") + + umapAndMeta$plotOrderThisTissues <- order(umapAndMeta$ThisTissueCol != defaultCol) + + 
#rpng() + layout(matrix(c(1,2,3), ncol = 1, byrow = T), heights = c(0.05,0.85,0.1)) + par(mar = c(0,0,0,0), xpd = NA) + plot.new() + plot.window(xlim = 0:1, ylim = 0:1) + text(0.5,0.5,tissueClass, cex = 2 , font = 2) + + par(mar = c(5,5,0,0.1), xpd = NA) + plot(umapAndMeta[umapAndMeta$plotOrderThisTissues,"UMAP1"], umapAndMeta[umapAndMeta$plotOrderThisTissues,"UMAP2"], col = umapAndMeta$ThisTissueCol[umapAndMeta$plotOrderThisTissues], cex = 0.2, pch = 16, bty="n", xlab = "UMAP-1", ylab = "UMAP-2") + + par(mar = c(0,0,0,0), xpd = NA) + plot.new() + plot.window(xlim = 0:1, ylim = 0:1) + legend("center", fill = c( + "forestgreen", + "hotpink", + "violetred3", + "orange1", + "dodgerblue1" + ), + legend = c( + paste0(tissueClass, " correctly predicted back (", predictedBack,")"), + paste0(tissueClass, " not predicted back (", notPredictedBack,")"), + paste0(tissueClass, " predicted as other (", predictedAsOther,")"), + paste0("Other tissue predicted as ", tissueClass," (", otherPredicted,")"), + paste0("Unkown predicted as ", tissueClass, " (", newPredicted,")") + ), + bty = "n") + + + + #dev.off() + +} +dev.off() + + +unique(umapAndMeta$predictedTissue)[!unique(umapAndMeta$predictedTissue) %in% rownames(tissueCol)] + +umapAndMeta$TissuePredictedCol <- defaultCol +umapAndMeta$TissuePredictedCol[umapAndMeta$predictedTissue %in% rownames(tissueCol)] <- adjustcolor(tissueCol[as.character(umapAndMeta$predictedTissue[umapAndMeta$predictedTissue %in% rownames(tissueCol)]),1], alpha.f = 0.5) +umapAndMeta$plotOrderTissuePredicted <- order(umapAndMeta$TissuePredictedCol != defaultCol) + +rpng() + +par(mar = c(3,3,0.1,0.1), xpd = NA) +plot(umapAndMeta[umapAndMeta$plotOrderTissuePredicted,"UMAP1"], umapAndMeta[umapAndMeta$plotOrderTissuePredicted,"UMAP2"], col = umapAndMeta$TissuePredictedCol[umapAndMeta$plotOrderTissuePredicted], cex = 0.2, pch = 16) + +dev.off() + +locator(n =2, type = "l") From 39cd5b330f9c348f8b4db18819252208afe4db45 Mon Sep 17 00:00:00 2001 From: Patrick 
Deelen Date: Wed, 21 Sep 2022 14:35:28 +0200 Subject: [PATCH 03/22] Update umap.R --- .../legacy_scripts/recount3/umap.R | 84 +++++++++++++++---- 1 file changed, 70 insertions(+), 14 deletions(-) diff --git a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/umap.R b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/umap.R index 4a6d1148c..04d19aea2 100644 --- a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/umap.R +++ b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/umap.R @@ -88,11 +88,11 @@ umapInput <- as.matrix(tissueSamples[,paste0("PC_",1:compsToUseForUmap)]) sampleUmap <- umap( umapInput, - n_epochs = 300, + n_epochs = 1000, init = init, n_neighbors = 500, min_dist = 2, init_sdev = 1e-4, learning_rate = 1, - spread = 20, + spread = 15, bandwidth = 10, scale = "scale", local_connectivity = 1, @@ -101,13 +101,18 @@ sampleUmap <- umap( rownames(sampleUmap) <- rownames(tissueSamples) colnames(sampleUmap) <- c("UMAP1", "UMAP2") +save(sampleUmap, file = "umap/sampleUmap2.RData") + +#load(file = "umap/sampleUmap.RData") + + + + umapAndMeta <- merge(sampleUmap, tissueSamples, by = 0) rownames(umapAndMeta) <- umapAndMeta$Row.names dim(umapAndMeta) -#save(sampleUmap, file = "umap/sampleUmap.RData") -#load(file = "umap/sampleUmap.RData") rpng() @@ -410,14 +415,24 @@ predictionsInTestPerTissue <- lapply(levels(umapAndMetaClassified$umapFactor), f }) return(predictionsInTestThisTissue) }) - +names(predictionsInTestPerTissue) <- levels(umapAndMetaClassified$umapFactor) str(predictionsInTestPerTissue) +x <- sapply(predictionsInTestPerTissue, function(predictionsInTestThisTissue){ + return(predictionsInTestThisTissue[3,11]) +}) +sort(x) + +predictionsInTest[2,15] + +predictionsInTestPerTissue[["Whole Blood Fetal"]] + layout(matrix(1:2, nrow = 1)) plot(t(predictionsInTest[1:2,]), main = "Percentage classification missed in test dataset") -sink <- sapply(predictionsInTestPerTissue, 
function(predictionsInTestThisTissue){ +for(tissueClass in levels(umapAndMetaClassified$umapFactor)){ + predictionsInTestThisTissue <- predictionsInTestPerTissue[[tissueClass]] points(t(predictionsInTestThisTissue[1:2,]), type = "l", col=adjustcolor("grey", alpha.f = 0.5)) -}) +} plot(t(predictionsInTest[c(1,3),]), main = "Percentage wrong classification in test dataset") sink <- sapply(predictionsInTestPerTissue, function(predictionsInTestThisTissue){ points(t(predictionsInTestThisTissue[c(1,3),]), type = "l", col=adjustcolor("grey", alpha.f = 0.5)) @@ -443,13 +458,11 @@ umapAndMeta$predictedTissue <- predictions[,1] predictionsScores <- predict(cfit, s = "lambda.1se", newx = as.matrix(umapAndMeta[,paste0("PC_",1:compsToUse)]), type = "response") predictionsScores <- predictionsScores[,,1] rownames(predictionsScores) <- umapAndMeta$Row.names -str(predictionsScores) -sort(predictionsScores["SRR5499181",]) umapAndMeta$predictedTissueScore <- apply(predictionsScores, 1, max) sum(umapAndMeta$predictedTissueScore <= 0.5) umapAndMeta$predictedTissue[umapAndMeta$predictedTissueScore <= 0.5] <- NA -sum(umapAndMeta$predictedTissueScore <= 0.5) + rpng() hist(umapAndMeta$predictedTissueScore) @@ -463,6 +476,14 @@ umapAndMeta$notPredictedBack <- FALSE umapAndMeta$notPredictedBack[!is.na(umapAndMeta$umapFactor) & is.na(umapAndMeta$predictedTissue) ] <- TRUE sum(umapAndMeta$notPredictedBack) +sum(!is.na(umapAndMeta$predictedTissue)) + +length(unique((umapAndMeta$predictedTissue))) + +sum(table((umapAndMeta$predictedTissue)) >= 1000) +hist(table((umapAndMeta$predictedTissue)), breaks =25) +barplot(table((umapAndMeta$predictedTissue))) + sort(table(umapAndMeta[umapAndMeta$misclasified, "umapFactor"])) sort(table(umapAndMeta[umapAndMeta$notPredictedBack, "umapFactor"])) @@ -524,18 +545,53 @@ for(tissueClass in levels(umapAndMeta$umapFactor)){ } dev.off() +save(umapAndMeta, file = "tissuePredictions_16_09_22.RData") 
unique(umapAndMeta$predictedTissue)[!unique(umapAndMeta$predictedTissue) %in% rownames(tissueCol)] -umapAndMeta$TissuePredictedCol <- defaultCol -umapAndMeta$TissuePredictedCol[umapAndMeta$predictedTissue %in% rownames(tissueCol)] <- adjustcolor(tissueCol[as.character(umapAndMeta$predictedTissue[umapAndMeta$predictedTissue %in% rownames(tissueCol)]),1], alpha.f = 0.5) -umapAndMeta$plotOrderTissuePredicted <- order(umapAndMeta$TissuePredictedCol != defaultCol) +table(umapAndMeta$predictedTissue) + +clusterToExclude <- c("U2-OS", "Leukemia_blood-cell-line", "HAP1", "LNCaP") + +umapAndMetaSelected <- umapAndMeta[!is.na(umapAndMeta$predictedTissue) & !umapAndMeta$predictedTissue %in% clusterToExclude, ] + +umapAndMetaSelected$TissuePredictedCol <- defaultCol +umapAndMetaSelected$TissuePredictedCol[umapAndMetaSelected$predictedTissue %in% rownames(tissueCol)] <- adjustcolor(tissueCol[as.character(umapAndMetaSelected$predictedTissue[umapAndMetaSelected$predictedTissue %in% rownames(tissueCol)]),1], alpha.f = 0.5) +umapAndMetaSelected$plotOrderTissuePredicted <- order(umapAndMetaSelected$TissuePredictedCol != defaultCol) rpng() par(mar = c(3,3,0.1,0.1), xpd = NA) -plot(umapAndMeta[umapAndMeta$plotOrderTissuePredicted,"UMAP1"], umapAndMeta[umapAndMeta$plotOrderTissuePredicted,"UMAP2"], col = umapAndMeta$TissuePredictedCol[umapAndMeta$plotOrderTissuePredicted], cex = 0.2, pch = 16) +plot(umapAndMetaSelected[umapAndMetaSelected$plotOrderTissuePredicted,"UMAP1"], umapAndMetaSelected[umapAndMetaSelected$plotOrderTissuePredicted,"UMAP2"], col = umapAndMetaSelected$TissuePredictedCol[umapAndMetaSelected$plotOrderTissuePredicted], cex = 0.2, pch = 16) dev.off() locator(n =2, type = "l") + + +pdf(file = "umapPredicted.pdf", width = 16, height = 8) +#rpng() + +layout(matrix(1:2,ncol = 2)) + +par(mar = c(5,5,0.1,0.1), xpd = NA) +plot(umapAndMetaSelected[umapAndMetaSelected$plotOrderTissuePredicted,"UMAP1"], umapAndMetaSelected[umapAndMetaSelected$plotOrderTissuePredicted,"UMAP2"], 
col = umapAndMetaSelected$TissuePredictedCol[umapAndMetaSelected$plotOrderTissuePredicted], cex = 0.4, pch = 16, bty = "n", xlab = "UMAP-1", ylab = "UMAP-2") + +par(mar = c(0,0,0,0), xpd = NA) +plot.new() +plot.window(xlim = 0:1, ylim = 0:1) +legend("center", fill = tissueCol[rownames(tissueCol) %in% umapAndMetaSelected$predictedTissue,1], legend = row.names(tissueCol)[rownames(tissueCol) %in% umapAndMetaSelected$predictedTissue], bty = "n", ncol = 2,cex = 0.7) + + +dev.off() + + + + +countTable <- table(umapAndMetaSelected$predictedTissue) +sum(countTable) +pdf("baplotTissues.pdf", width = 12, height = 10) +par(mar = c(20,5,2,0.1), xpd = NA) +b <- barplot(countTable, las =2, col = tissueCol[names(countTable),]) +text(b, countTable + 250, countTable, font=1, srt = 90) +dev.off() \ No newline at end of file From 8f4638909d560aba2ad45e6d14d9ef23b776c598 Mon Sep 17 00:00:00 2001 From: Patrick Deelen Date: Thu, 22 Sep 2022 15:55:26 +0200 Subject: [PATCH 04/22] Update umap.R --- .../legacy_scripts/recount3/umap.R | 64 +++++++++++++------ 1 file changed, 45 insertions(+), 19 deletions(-) diff --git a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/umap.R b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/umap.R index 04d19aea2..73b276d03 100644 --- a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/umap.R +++ b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/umap.R @@ -91,19 +91,20 @@ sampleUmap <- umap( n_epochs = 1000, init = init, n_neighbors = 500, - min_dist = 2, init_sdev = 1e-4, learning_rate = 1, - spread = 15, - bandwidth = 10, + min_dist = 1, init_sdev = 1e-4, learning_rate = 2, + spread = 20, + bandwidth = 5, scale = "scale", - local_connectivity = 1, + local_connectivity = 10, + repulsion_strength = 0.5, metric = "correlation") rownames(sampleUmap) <- rownames(tissueSamples) colnames(sampleUmap) <- c("UMAP1", "UMAP2") -save(sampleUmap, file = "umap/sampleUmap2.RData") +save(sampleUmap, file 
= "umap/sampleUmap6.RData") -#load(file = "umap/sampleUmap.RData") +#load(file = "umap/sampleUmap3.RData") @@ -122,6 +123,7 @@ plot(umapAndMeta[umapAndMeta$plotOrderTissues,"UMAP1"], umapAndMeta[umapAndMeta$ dev.off() +plot(umapAndMeta[umapAndMeta$plotOrderTissues,"UMAP1"], umapAndMeta[umapAndMeta$plotOrderTissues,"UMAP2"], col = umapAndMeta$TissueCol[umapAndMeta$plotOrderTissues], cex = 0.2, pch = 16, xlim = c(-100,100), ylim = c(-100,100)) @@ -154,7 +156,7 @@ pdf(file = "umaptest.pdf", width = 16, height = 8) layout(matrix(1:2,ncol = 2)) par(mar = c(3,5,0.1,0.1), xpd = NA) -plot(umapAndMeta[umapAndMeta$plotOrderTissues,"UMAP1"], umapAndMeta[umapAndMeta$plotOrderTissues,"UMAP2"], col = umapAndMeta$umapAndMeta$TissueCol[plotOrderTissues], cex = 0.4, pch = 16) +plot(umapAndMeta[umapAndMeta$plotOrderTissues,"UMAP1"], umapAndMeta[umapAndMeta$plotOrderTissues,"UMAP2"], col = umapAndMeta$TissueCol[umapAndMeta$plotOrderTissues], cex = 0.2, pch = 16) par(mar = c(0,0,0,0), xpd = NA) plot.new() @@ -549,22 +551,46 @@ save(umapAndMeta, file = "tissuePredictions_16_09_22.RData") unique(umapAndMeta$predictedTissue)[!unique(umapAndMeta$predictedTissue) %in% rownames(tissueCol)] -table(umapAndMeta$predictedTissue) + clusterToExclude <- c("U2-OS", "Leukemia_blood-cell-line", "HAP1", "LNCaP") -umapAndMetaSelected <- umapAndMeta[!is.na(umapAndMeta$predictedTissue) & !umapAndMeta$predictedTissue %in% clusterToExclude, ] -umapAndMetaSelected$TissuePredictedCol <- defaultCol -umapAndMetaSelected$TissuePredictedCol[umapAndMetaSelected$predictedTissue %in% rownames(tissueCol)] <- adjustcolor(tissueCol[as.character(umapAndMetaSelected$predictedTissue[umapAndMetaSelected$predictedTissue %in% rownames(tissueCol)]),1], alpha.f = 0.5) -umapAndMetaSelected$plotOrderTissuePredicted <- order(umapAndMetaSelected$TissuePredictedCol != defaultCol) -rpng() + +samplesWithPrediction <- umapAndMeta[!is.na(umapAndMeta$predictedTissue) & !umapAndMeta$predictedTissue %in% clusterToExclude, c( + 
"predictedTissue", + "predictedTissueScore", + "umapFactor", + "misclasified" +)] +colnames(samplesWithPrediction)[3] <- "annotatedTissue" +str(samplesWithPrediction) +save(samplesWithPrediction, file = "samplesWithPrediction_16_09_22.RData") + + +load(file = "umap/sampleUmap6.RData", verbose = T) + + +umapAndPredictions <- merge(samplesWithPrediction, sampleUmap, by = 0 ) +rownames(umapAndPredictions) <- umapAndPredictions$Row.names + + +umapAndPredictions$TissuePredictedCol <- defaultCol +umapAndPredictions$TissuePredictedCol[umapAndPredictions$predictedTissue %in% rownames(tissueCol)] <- adjustcolor(tissueCol[as.character(umapAndPredictions$predictedTissue[umapAndPredictions$predictedTissue %in% rownames(tissueCol)]),1], alpha.f = 0.5) +umapAndPredictions$plotOrderTissuePredicted <- order(umapAndPredictions$TissuePredictedCol != defaultCol) + +#rpng() par(mar = c(3,3,0.1,0.1), xpd = NA) -plot(umapAndMetaSelected[umapAndMetaSelected$plotOrderTissuePredicted,"UMAP1"], umapAndMetaSelected[umapAndMetaSelected$plotOrderTissuePredicted,"UMAP2"], col = umapAndMetaSelected$TissuePredictedCol[umapAndMetaSelected$plotOrderTissuePredicted], cex = 0.2, pch = 16) +plot(umapAndPredictions[umapAndPredictions$plotOrderTissuePredicted,"UMAP1"], umapAndPredictions[umapAndPredictions$plotOrderTissuePredicted,"UMAP2"], col = umapAndPredictions$TissuePredictedCol[umapAndPredictions$plotOrderTissuePredicted], cex = 0.2, pch = 16) + +plot(umapAndPredictions[umapAndPredictions$plotOrderTissuePredicted,"UMAP1"], umapAndPredictions[umapAndPredictions$plotOrderTissuePredicted,"UMAP2"], col = umapAndPredictions$TissuePredictedCol[umapAndPredictions$plotOrderTissuePredicted], cex = 0.2, pch = 16, xlim = c(-100,70), ylim = c(-50,50)) -dev.off() + + + +#dev.off() locator(n =2, type = "l") @@ -575,12 +601,12 @@ pdf(file = "umapPredicted.pdf", width = 16, height = 8) layout(matrix(1:2,ncol = 2)) par(mar = c(5,5,0.1,0.1), xpd = NA) 
-plot(umapAndMetaSelected[umapAndMetaSelected$plotOrderTissuePredicted,"UMAP1"], umapAndMetaSelected[umapAndMetaSelected$plotOrderTissuePredicted,"UMAP2"], col = umapAndMetaSelected$TissuePredictedCol[umapAndMetaSelected$plotOrderTissuePredicted], cex = 0.4, pch = 16, bty = "n", xlab = "UMAP-1", ylab = "UMAP-2") +plot(umapAndPredictions[umapAndPredictions$plotOrderTissuePredicted,"UMAP1"], umapAndPredictions[umapAndPredictions$plotOrderTissuePredicted,"UMAP2"], col = umapAndPredictions$TissuePredictedCol[umapAndPredictions$plotOrderTissuePredicted], cex = 0.2, pch = 16, bty = "n", xlab = "UMAP-1", ylab = "UMAP-2") par(mar = c(0,0,0,0), xpd = NA) plot.new() plot.window(xlim = 0:1, ylim = 0:1) -legend("center", fill = tissueCol[rownames(tissueCol) %in% umapAndMetaSelected$predictedTissue,1], legend = row.names(tissueCol)[rownames(tissueCol) %in% umapAndMetaSelected$predictedTissue], bty = "n", ncol = 2,cex = 0.7) +legend("center", fill = tissueCol[rownames(tissueCol) %in% umapAndPredictions$predictedTissue,1], legend = row.names(tissueCol)[rownames(tissueCol) %in% umapAndPredictions$predictedTissue], bty = "n", ncol = 2,cex = 0.7) dev.off() @@ -588,10 +614,10 @@ dev.off() -countTable <- table(umapAndMetaSelected$predictedTissue) +countTable <- table(umapAndPredictions$predictedTissue) sum(countTable) pdf("baplotTissues.pdf", width = 12, height = 10) par(mar = c(20,5,2,0.1), xpd = NA) b <- barplot(countTable, las =2, col = tissueCol[names(countTable),]) text(b, countTable + 250, countTable, font=1, srt = 90) -dev.off() \ No newline at end of file +dev.off() From 76b39cb2d844d64c5897ac2828eeda13f193d872 Mon Sep 17 00:00:00 2001 From: Patrick Deelen Date: Thu, 22 Sep 2022 16:06:43 +0200 Subject: [PATCH 05/22] recount3 --- .../legacy_scripts/metaBrain2.R | 31 ++++ .../legacy_scripts/smartSeqTest.R | 167 ++++++++++++++++++ 2 files changed, 198 insertions(+) create mode 100644 Downstreamer/src/main/r/downstreamer_main/legacy_scripts/metaBrain2.R create mode 100644 
Downstreamer/src/main/r/downstreamer_main/legacy_scripts/smartSeqTest.R diff --git a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/metaBrain2.R b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/metaBrain2.R new file mode 100644 index 000000000..8d261ed03 --- /dev/null +++ b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/metaBrain2.R @@ -0,0 +1,31 @@ +setwd("D:\\UMCG\\Genetica\\Projects\\Depict2Pgs") + +source(paste0("C:\\Users\\patri\\Documents\\GitHub\\systemsgenetics\\Downstreamer\\src\\main\\r\\downstreamer_main/downstreamer_functions.r")) + +traits <- read.delim("MetaBrain/traits.txt") + + +i <- 1 + +pdf("MetaBrain/withAndWithoutEqtls.pdf", height = 20, width = 10) +#png("MetaBrain/withAndWithoutEqtls.png", height = 2000, width = 1000) +layout(matrix(1:8, ncol =2)) +par(pty="s") +for(i in 1:nrow(traits)){ + + + +trait <- traits[i, "trait"] +name <- traits[i, "name"] + +enrichments <- read.depict2(paste0("MetaBrain/normal/",trait,"_enrichtments.xlsx"))$GenePrioritization_MetaBrain +enrichmentsIncEqtl <- read.depict2(paste0("MetaBrain/inceqt/",trait,"_enrichtments.xlsx"))$GenePrioritization_MetaBrain + +enrichmentsBoth <- merge(enrichments, enrichmentsIncEqtl, "Gene.ID" , suffixes= c("Normal", "incEqtl")) + +maxZ <- max(range(enrichmentsBoth$Enrichment.Z.scoreNormal, enrichmentsBoth$Enrichment.Z.scoreincEqtl)) +r <- cor(enrichmentsBoth$Enrichment.Z.scoreNormal, enrichmentsBoth$Enrichment.Z.scoreincEqtl) +plot(enrichmentsBoth$Enrichment.Z.scoreNormal, enrichmentsBoth$Enrichment.Z.scoreincEqtl, bg = adjustcolor("dodgerblue2", alpha.f = 0.3), pch = 21, col=adjustcolor("dodgerblue2", alpha.f = 0.5), asp = 1, xlab = "Key gene score without eqtl information", ylab = "Key gene score without eqtl information", xlim = c(-maxZ,maxZ), ylim = c(-maxZ,maxZ), main = name) +mtext(paste0("Pearson r: ", signif(r,2))) +} +dev.off() diff --git a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/smartSeqTest.R 
b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/smartSeqTest.R new file mode 100644 index 000000000..f56829394 --- /dev/null +++ b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/smartSeqTest.R @@ -0,0 +1,167 @@ +smartSeq <- read.delim("smartseqSamples.txt")[,1] +str(smartSeq) + + +defaultCol <- adjustcolor("grey", alpha.f = 0.3) +pcsAndMeta$colSmartseq <- defaultCol +pcsAndMeta$colSmartseq[pcsAndMeta$Row.names %in% smartSeq] <- "darkslategray" +pcsAndMeta$colSmartseq[pcsAndMeta$Row.names %in% qseq] <- "orangered" +pcsAndMeta$colSmartseq[outliersPc1 == "TRUE" ] <- "darkblue" +plotOrderSmartseq <- order((pcsAndMeta$colSmartseq != defaultCol) + 1) + +rpng() +plot(pcsAndMeta[plotOrderSmartseq,"PC_1"], pcsAndMeta[plotOrderSmartseq,"PC_2"], col = pcsAndMeta$colSmartseq[plotOrderSmartseq], cex = 0.4, main = "Smartseq") +dev.off() + +rpng() +plot(pcsAndMeta[plotOrderSmartseq,"PC_1"], pcsAndMeta[plotOrderSmartseq,"PC_6"], col = pcsAndMeta$colSmartseq[plotOrderSmartseq], cex = 0.4, main = "Quantseq and Smartseq", pch =16) + +library(pROC) + +smartSeqClass <- as.factor(pcsAndMeta$Row.names %in% smartSeq) +table(smartSeqClass) +dim(pcsAndMeta) +smartSeqAuc <- apply(pcsAndMeta[,2:100],2,function(x){ + tryCatch( + { + #wilcox.test(x ~ smartSeqClass)$p.value + as.numeric(auc(response = smartSeqClass, predictor = x)) + }, + error=function(cond){return(1)} + ) +}) +sort(smartSeqAuc) +str(pcsAndMeta[,2]) +boxplot(pcsAndMeta[,2]~smartSeqClass) + +boxplot(log10(pcsAndMeta[,"sra.sample_spots"]) ~ smartSeqClass ) + +sum(pcsAndMeta[,"sra.sample_spots"] < 10000000, na.rm = T) + + +library(vioplot) +vioplot(log10(pcsAndMeta[,"sra.sample_spots"]) ~ smartSeqClass) + + +plot(pcsAndMeta[plotOrderSmartseq,"PC_6"], log10(pcsAndMeta[plotOrderSmartseq,"recount_qc.star.number_of_splices:_total"]), col = pcsAndMeta$colSmartseq[plotOrderSmartseq], cex = 0.6) + + + + +defaultCol <- adjustcolor("grey", alpha.f = 0.3) 
+sum(!is.na(pcsAndMeta[,"recount_qc.star.number_of_splices:_total"]) & pcsAndMeta[,"recount_qc.star.number_of_splices:_total"] < 150000) + +pcsAndMeta$colSmartseq <- defaultCol +pcsAndMeta$colSmartseq[!is.na(pcsAndMeta[,"recount_qc.star.number_of_splices:_total"]) & pcsAndMeta[,"recount_qc.star.number_of_splices:_total"] < 150000] <- "aquamarine2" +table(pcsAndMeta$colSmartseq) +plotOrderSmartseq <- order((pcsAndMeta$colSmartseq != defaultCol) + 1) + +plot(pcsAndMeta[plotOrderSmartseq,"PC_6"], pcsAndMeta[plotOrderSmartseq,"PC_1"], col = pcsAndMeta$colSmartseq[plotOrderSmartseq], cex = 0.4, main = "recount_qc.star.number_of_splices:_total < 150000") + + + +defaultCol <- adjustcolor("grey", alpha.f = 0.5) +pcsAndMeta$colSmartseq <- defaultCol +pcsAndMeta$colSmartseq[!is.na(pcsAndMeta[,"recount_seq_qc.%a"]) & pcsAndMeta[,"recount_seq_qc.%a"] < 20] <- "darkslateblue" +table(pcsAndMeta$colSmartseq) +plotOrderSmartseq <- order((pcsAndMeta$colSmartseq != defaultCol) + 1) + +plot(pcsAndMeta[plotOrderSmartseq,"PC_6"], pcsAndMeta[plotOrderSmartseq,"PC_1"], col = pcsAndMeta$colSmartseq[plotOrderSmartseq], cex = 0.6, main = "recount_seq_qc.%c < 20") + + + + + + +pcNoCenter <- read.delim("Components.txt", sep = ",", row.names = 1) +pcNoCenter <- merge(pcNoCenter, combinedMeta, all.x = T, by = 0) +rownames(pcNoCenter) <- pcNoCenter$Row.names + +defaultCol <- adjustcolor("grey", alpha.f = 0.3) +pcNoCenter$colSmartseq <- defaultCol +pcNoCenter$colSmartseq[rownames(pcNoCenter) %in% smartSeq] <- "darkslategray" +table(pcNoCenter$colSmartseq) +plotOrderSmartseq <- order((pcNoCenter$colSmartseq != defaultCol) + 1) + +plot(pcNoCenter[plotOrderSmartseq,"X0"], pcNoCenter[plotOrderSmartseq,"X1"], col = pcNoCenter$colSmartseq[plotOrderSmartseq], cex = 0.4, main = "Smartseq") + + +plot(pcNoCenter[plotOrderSmartseq,"X0"], pcNoCenter[plotOrderSmartseq,"X1"], col = adjustcolor("grey", alpha.f = 0.2), cex = 0.3) + + + + + + + + + +defaultCol <- adjustcolor("grey", alpha.f = 0.6) +pcNoCenter$col 
<- defaultCol + +tissueAndCol <- tolower(pcNoCenter[,"Tissue"]) %in% tolower(tissueCol$PlotClass) + +pcNoCenter$col[tissueAndCol] <- tissueCol$col[match(tolower(pcNoCenter[tissueAndCol,"Tissue"]), tolower(tissueCol$PlotClass))] + + +tissue2AndCol <- tolower(pcNoCenter[,"Tissue2"]) %in% tolower(tissueCol$PlotClass) +sum(tissue2AndCol) +pcNoCenter$col[tissue2AndCol] <- tissueCol$col[match(tolower(pcNoCenter[tissue2AndCol,"Tissue2"]), tolower(tissueCol$PlotClass))] + + + +sum(is.na(tolower(pcNoCenter[,"Tissue"]) %in% tolower(tissueCol$PlotClass))) + +#pcNoCenter$col <- tissueCol$col[match(tolower(pcNoCenter[,"Tissue"]), tolower(tissueCol$PlotClass), nomatch = nrow(tissueCol))] + +plotOrder <- order((pcNoCenter$col != defaultCol) + 1) + +plot(pcNoCenter[plotOrder,"X0"], pcNoCenter[plotOrder,"X1"], col = pcNoCenter$col[plotOrder], cex = 0.4) + + +pcNoCenter$gtexCol <- defaultCol +pcNoCenter$gtexCol[pcNoCenter$Cohort == "GTEx" ] <- "goldenrod3" +pcNoCenter$gtexCol[pcNoCenter$Cohort == "TCGA" ] <- "cyan1" + +plotOrder <- order((pcNoCenter$gtexCol != defaultCol) + 1) +plot(pcNoCenter[plotOrder,"X0"], pcNoCenter[plotOrder,"X1"], col = pcNoCenter$gtexCol[plotOrder], cex = 0.4) + + + + +toExclude <- + (!is.na(pcsAndMeta[,"recount_qc.star.number_of_splices:_total"]) & pcsAndMeta[,"recount_qc.star.number_of_splices:_total"] < 15000) | + (!is.na(pcsAndMeta[,"recount_seq_qc.%a"]) & pcsAndMeta[,"recount_seq_qc.%a"] < 20) | + (!is.na(pcsAndMeta[,"recount_seq_qc.%t"]) & pcsAndMeta[,"recount_seq_qc.%t"] < 20) | + (!is.na(pcsAndMeta[,"recount_seq_qc.%c"]) & pcsAndMeta[,"recount_seq_qc.%c"] < 20) | + (!is.na(pcsAndMeta[,"recount_seq_qc.%g"]) & pcsAndMeta[,"recount_seq_qc.%g"] < 20) +sum(toExclude) + + +samplesToKeep <- pcsAndMeta$Row.names[!toExclude] +length(samplesToKeep) + sum(toExclude) == nrow(pcs) + +write.table(samplesToKeep, file = "samplesToKeep.txt", row.names = F, quote = F) + + +boxplot(pcsAndMeta[,"PC_1"]) + +outliersPc1 <- as.factor(pcsAndMeta[,"PC_2"] >= 120) 
+table(outliersPc1) +library(pROC) +outlierAuc <- sapply(colnames(pcsAndMeta),function(x){ + tryCatch( + { + #wilcox.test(x ~ smartSeqClass)$p.value + as.numeric(auc(response = outliersPc1, predictor = pcsAndMeta[,x])) + }, + error=function(cond){return(NA)} + ) + }) +sort(outlierAuc) + + + + + +auc(response = outliersPc1, predictor = pcsAndMeta[,"PC_2"]) From d1471516a3eee2cffea34afa0e78aa9c3286067c Mon Sep 17 00:00:00 2001 From: Patrick Deelen Date: Thu, 22 Sep 2022 16:27:01 +0200 Subject: [PATCH 06/22] refactor --- .../downstreamer_main/downstreamer_main.Rproj | 13 + .../umap/downstreamer_umap.rmd | 0 .../{ => legacy_scripts}/umap/pathwayUmap.R | 0 .../{ => legacy_scripts}/umap/sampleUmap.R | 0 .../{ => legacy_scripts}/umap/tryUmap.R | 0 .../r/downstreamer_main/recount3/README.md | 0 .../recount3/combineAnnotations.R | 1881 +++++++++++++++++ .../recount3/doPcaUsingCorMatrix.R | 0 .../{legacy_scripts => }/recount3/qqNorm.R | 0 .../recount3/recountCancerCellline2.R | 0 .../{legacy_scripts => }/recount3/umap.R | 0 11 files changed, 1894 insertions(+) create mode 100644 Downstreamer/src/main/r/downstreamer_main/downstreamer_main.Rproj rename Downstreamer/src/main/r/downstreamer_main/{ => legacy_scripts}/umap/downstreamer_umap.rmd (100%) rename Downstreamer/src/main/r/downstreamer_main/{ => legacy_scripts}/umap/pathwayUmap.R (100%) rename Downstreamer/src/main/r/downstreamer_main/{ => legacy_scripts}/umap/sampleUmap.R (100%) rename Downstreamer/src/main/r/downstreamer_main/{ => legacy_scripts}/umap/tryUmap.R (100%) create mode 100644 Downstreamer/src/main/r/downstreamer_main/recount3/README.md create mode 100644 Downstreamer/src/main/r/downstreamer_main/recount3/combineAnnotations.R rename Downstreamer/src/main/r/downstreamer_main/{legacy_scripts => }/recount3/doPcaUsingCorMatrix.R (100%) rename Downstreamer/src/main/r/downstreamer_main/{legacy_scripts => }/recount3/qqNorm.R (100%) rename Downstreamer/src/main/r/downstreamer_main/{legacy_scripts => 
}/recount3/recountCancerCellline2.R (100%) rename Downstreamer/src/main/r/downstreamer_main/{legacy_scripts => }/recount3/umap.R (100%) diff --git a/Downstreamer/src/main/r/downstreamer_main/downstreamer_main.Rproj b/Downstreamer/src/main/r/downstreamer_main/downstreamer_main.Rproj new file mode 100644 index 000000000..8e3c2ebc9 --- /dev/null +++ b/Downstreamer/src/main/r/downstreamer_main/downstreamer_main.Rproj @@ -0,0 +1,13 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX diff --git a/Downstreamer/src/main/r/downstreamer_main/umap/downstreamer_umap.rmd b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/umap/downstreamer_umap.rmd similarity index 100% rename from Downstreamer/src/main/r/downstreamer_main/umap/downstreamer_umap.rmd rename to Downstreamer/src/main/r/downstreamer_main/legacy_scripts/umap/downstreamer_umap.rmd diff --git a/Downstreamer/src/main/r/downstreamer_main/umap/pathwayUmap.R b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/umap/pathwayUmap.R similarity index 100% rename from Downstreamer/src/main/r/downstreamer_main/umap/pathwayUmap.R rename to Downstreamer/src/main/r/downstreamer_main/legacy_scripts/umap/pathwayUmap.R diff --git a/Downstreamer/src/main/r/downstreamer_main/umap/sampleUmap.R b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/umap/sampleUmap.R similarity index 100% rename from Downstreamer/src/main/r/downstreamer_main/umap/sampleUmap.R rename to Downstreamer/src/main/r/downstreamer_main/legacy_scripts/umap/sampleUmap.R diff --git a/Downstreamer/src/main/r/downstreamer_main/umap/tryUmap.R b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/umap/tryUmap.R similarity index 100% rename from Downstreamer/src/main/r/downstreamer_main/umap/tryUmap.R rename to Downstreamer/src/main/r/downstreamer_main/legacy_scripts/umap/tryUmap.R 
diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/README.md b/Downstreamer/src/main/r/downstreamer_main/recount3/README.md new file mode 100644 index 000000000..e69de29bb diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/combineAnnotations.R b/Downstreamer/src/main/r/downstreamer_main/recount3/combineAnnotations.R new file mode 100644 index 000000000..892d655c4 --- /dev/null +++ b/Downstreamer/src/main/r/downstreamer_main/recount3/combineAnnotations.R @@ -0,0 +1,1881 @@ +#srun --cpus-per-task=1 --mem=50gb --nodes=1 --qos=priority --time=168:00:00 --pty bash -i +#remoter::server(verbose = T, port = 55556, password = "laberkak", sync = T) + + +remoter::client("localhost", port = 55501, password = "laberkak") + + +#save.image("tmp2.RData") + + + +setwd("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/") +setwd("D:\\UMCG\\Genetica\\Projects\\Depict2Pgs\\Recount3\\") +#load("tmp2.RData") + + +library(readr) + + +table_tmp <- read_delim("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Metadata/metadata_sra1.txt", delim = "\t", quote = "", guess_max = 20000) +sraMeta1 <- as.data.frame(table_tmp[,-1]) +rm(table_tmp) + +table_tmp <- read_delim("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Metadata/metadata_sra2.txt", delim = "\t", quote = "", guess_max = 20000) +sraMeta2 <- as.data.frame(table_tmp[,-1]) +rm(table_tmp) + +sraSharedCol <- intersect(colnames(sraMeta2), colnames(sraMeta1)) +length(sraSharedCol) + +sraMeta <- rbind(sraMeta1[,sraSharedCol], sraMeta2[,sraSharedCol]) + +#For some reason some runs are duplicated in the meta data file. 
+#Quick inspection showed that they have the same values +#Solution exclude duplicate row +sraUniqueIds <- unique(sraMeta$external_id) +str(sraUniqueIds) +sraMeta <- sraMeta[ match(sraUniqueIds, sraMeta$external_id), ] +rownames(sraMeta) <- sraMeta$external_id + + + +#extra columns in part 2 +sraPart2Col <- colnames(sraMeta2)[!colnames(sraMeta2) %in% colnames(sraMeta1)] + +sraMetaExtended <- sraMeta2[match(sraUniqueIds, sraMeta2$external_id),sraPart2Col] + +str(sraMetaExtended) + +sum(length(unique(sraMeta2$external_id))) +sum(length(unique(sraMeta1$external_id))) + +sum(unique(length(sraMeta2$external_id))) +sum(unique(length(sraMeta1$external_id))) + +#metadata_gtex +load(file = "/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Metadata/metadata_gtex.Rda", verbose = T) +rownames(metadata_gtex) <- metadata_gtex$external_id +metadata_gtex2 <- metadata_gtex[,c("gtex.smts", "gtex.smtsd")] +str(metadata_gtex2) + +#metadata_tcga +load(file = "/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Metadata/metadata_tcga.Rda", verbose = T) +rownames(metadata_tcga) <- metadata_tcga$external_id +metadata_tcga2 <- metadata_tcga[,c("tcga.gdc_cases.project.primary_site", "tcga.cgc_sample_sample_type")] + + + +#ARCH4 data +table_tmp <- read_delim("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Metadata/metadataSRA.txt", delim = "\t", quote = "") +metadata_archs4 <- as.data.frame(table_tmp[,-1]) +rownames(metadata_archs4) <- table_tmp[,3][[1]] +rm(table_tmp) +metadata_archs4_2 <- metadata_archs4[,c("Tissue", "CellType", "CellLine")] +colnames(metadata_archs4_2) <- c("archs4.Tissue", "archs4.CellType", "archs4.CellLine") + +#GADO data +metadata_Gado <- read.delim("celllinesAndCancer/oldAnnotations/sampleAnnotations.txt") +rownames(metadata_Gado) <- metadata_Gado$Sample +metadata_Gado2 <- metadata_Gado[,c("CellLine", "TissueType", "CellType", "PlotClass")] +colnames(metadata_Gado2) <- paste0("gado.",colnames(metadata_Gado2)) +gadoTissueCol <- 
read.delim("celllinesAndCancer/oldAnnotations/tissueCol5.txt") + + +#Kidney Network Annotaions +metadata_Kn <- read.delim("Metadata/KidneyNetwork.txt") +rownames(metadata_Kn) <- metadata_Kn$Sample +metadata_Kn2 <- metadata_Kn[,c("Origin", "Cell_type", "Cell_type_simplified", "Cell_type_manual")] +colnames(metadata_Kn2) <- paste0("KidneyNetwork.",colnames(metadata_Kn2)) + + +#Mahmoud annotations +load("Recount3_QC_2ndRun/SRA_Studies_Annotations_Patrick/Annotations.rda", verbose = T) +str(Annotations) +rownames(Annotations) <- Annotations$SampleID +mahmoudAnnotations <- Annotations[,-(1:2)] +#write.table(Annotations, sep = "\t", quote = F, col.names = NA, file = "tmp.txt") + +allSamples <- c(rownames(metadata_gtex2), rownames(metadata_tcga2), rownames(sraMeta)) + +length(unique(allSamples)) == length(allSamples) + +numberSamples = length(allSamples) +finalAnnotations <- data.frame( + Tissue = rep("",numberSamples), + Tissue2 = rep("",numberSamples), + Cellline = vector(mode = "logical", length = numberSamples), + CelllineName = rep("",numberSamples), + Cancer = vector(mode = "logical", length = numberSamples), + Cohort = rep("SRA",numberSamples), + row.names = allSamples, stringsAsFactors = F) +finalAnnotations$Cellline = NA +finalAnnotations$Cancer = NA +finalAnnotations$Fetal <- NA + +finalAnnotations$Cohort[rownames(finalAnnotations) %in% rownames(metadata_gtex2)] <- "GTEx" +finalAnnotations$Cohort[rownames(finalAnnotations) %in% rownames(metadata_tcga2)] <- "TCGA" +table(finalAnnotations$Cohort, useNA = "always") +str(finalAnnotations) + + +dim(finalAnnotations) +dim(metadata_gtex2) + + +sum(rownames(finalAnnotations) %in% rownames(metadata_gtex2)) + +a <- merge(finalAnnotations, metadata_gtex2, all.x = T, by = 0) +row.names(a) <- a$Row.names + +b <- merge(a, metadata_tcga2, all.x = T, by = 0) +row.names(b) <- b$Row.names + +c <- merge(b, metadata_archs4_2, all.x = T, by = 0) +row.names(c) <- c$Row.names + +d <- merge(c, metadata_Gado2, all.x = T, by = 0) 
+row.names(d) <- d$Row.names + +e <- merge(d, sraMetaExtended, all.x = T, by = 0) +row.names(e) <- e$Row.names + +f <- merge(e, sraMeta, all.x = T, by = 0) +row.names(f) <- f$Row.names + +g <- merge(f, metadata_Kn2, all.x = T, by = 0) +row.names(g) <- g$Row.names + +str(g) + +combinedMeta <- g[,-c(1:7)] +str(combinedMeta) + +#now fillin the gtex and gcta recount meta data. + +tmp <- metadata_gtex[,colnames(metadata_gtex) %in% sraSharedCol] +combinedMeta[rownames(tmp),colnames(tmp)] <- tmp + +tmp <- metadata_tcga[,colnames(metadata_tcga) %in% sraSharedCol] +combinedMeta[rownames(tmp),colnames(tmp)] <- tmp + +rm(tmp) + +#set study make column uniform +combinedMeta$study[combinedMeta$Cohort == "GTEx"] <- "GTEx" +combinedMeta$study[combinedMeta$Cohort == "TCGA"] <- "TCGA" + +combinedMeta$exclude <- FALSE + +#save(combinedMeta, file = "combinedMeta.RData") +#load(file = "combinedMeta.RData") + +combinedMeta$Tissue[combinedMeta$Cohort == "GTEx"] <- combinedMeta$gtex.smts[combinedMeta$Cohort == "GTEx"] +combinedMeta$Tissue2[combinedMeta$Cohort == "GTEx"] <- combinedMeta$gtex.smtsd[combinedMeta$Cohort == "GTEx"] +combinedMeta$Cellline[combinedMeta$Cohort == "GTEx"] <- FALSE +combinedMeta$Cancer[combinedMeta$Cohort == "GTEx"] <- FALSE + +gtexLcl <- combinedMeta$Cohort == "GTEx" & (!is.na(combinedMeta$gtex.smtsd) & combinedMeta$gtex.smtsd == "Cells - EBV-transformed lymphocytes") +combinedMeta$Cellline[gtexLcl] <- TRUE +combinedMeta$CelllineName[gtexLcl] <- "lcl" +combinedMeta$Tissue[gtexLcl] <- "" +combinedMeta$Tissue2[gtexLcl] <- "" + +gtexCml <- combinedMeta$Cohort == "GTEx" & (!is.na(combinedMeta$gtex.smtsd) & combinedMeta$gtex.smtsd == "Cells - Leukemia cell line (CML)") +combinedMeta$Cellline[gtexCml] <- TRUE +combinedMeta$CelllineName[gtexCml] <- "cml" +combinedMeta$Tissue[gtexCml] <- "" +combinedMeta$Tissue2[gtexCml] <- "" + +gtexFibroblasts <- combinedMeta$Cohort == "GTEx" & (!is.na(combinedMeta$gtex.smtsd) & combinedMeta$gtex.smtsd == "Cells - Cultured 
fibroblasts") +combinedMeta$Cellline[gtexFibroblasts] <- TRUE +combinedMeta$CelllineName[gtexFibroblasts] <- "Fibroblasts" +combinedMeta$Tissue[gtexFibroblasts] <- "" +combinedMeta$Tissue2[gtexFibroblasts] <- "" + + +table(combinedMeta$gtex.smtsd[combinedMeta$Cohort == "GTEx"]) + +combinedMeta$Tissue[combinedMeta$Cohort == "TCGA"] <- combinedMeta$tcga.gdc_cases.project.primary_site[combinedMeta$Cohort == "TCGA"] +combinedMeta$Cellline[combinedMeta$Cohort == "TCGA"] <- FALSE +combinedMeta$Cancer[combinedMeta$Cohort == "TCGA"] <- TRUE #default for TCGA exception below +combinedMeta$Cancer[combinedMeta$Cohort == "TCGA" & combinedMeta$tcga.cgc_sample_sample_type == "Solid Tissue Normal"] <- FALSE + + +#Map GADO names to gtex names +combinedMeta$gado.TissueType[!is.na(combinedMeta$gado.TissueType) & combinedMeta$gado.TissueType != ""] <- gsub("(^[[:alpha:]])", "\\U\\1", combinedMeta$gado.TissueType[!is.na(combinedMeta$gado.TissueType) & combinedMeta$gado.TissueType != ""], perl=TRUE)#https://stackoverflow.com/questions/18509527/first-letter-to-upper-case +gadoTissueCol$PlotClass <- gsub("(^[[:alpha:]])", "\\U\\1", gadoTissueCol$PlotClass, perl=TRUE)#https://stackoverflow.com/questions/18509527/first-letter-to-upper-case +combinedMeta$gado.TissueType[!is.na(combinedMeta$gado.TissueType) & combinedMeta$gado.TissueType == "Adipose"] <- "Adipose Tissue" +gadoTissueCol$PlotClass[gadoTissueCol$PlotClass == "Adipose"] <- "Adipose Tissue" + +#Fix +combinedMeta$gado.CellType[!is.na(combinedMeta$gado.CellType) & combinedMeta$gado.CellType == "acute myeloid leukemia"] <- "AML" + +#Only annotations with a color are checked and highly realiable +gadoAnnotatedTissues <- !is.na(combinedMeta$gado.TissueType) & combinedMeta$gado.TissueType != "" & combinedMeta$gado.TissueType %in% gadoTissueCol$PlotClass +combinedMeta$Tissue[gadoAnnotatedTissues] <- combinedMeta$gado.TissueType[gadoAnnotatedTissues] +combinedMeta$Cancer[gadoAnnotatedTissues] <- FALSE 
+combinedMeta$Cellline[gadoAnnotatedTissues] <- FALSE + +gadoAnnotatedCelltypes <- !is.na(combinedMeta$gado.CellType) & combinedMeta$gado.CellType != "" & combinedMeta$gado.CellType %in% gadoTissueCol$PlotClass +combinedMeta$Tissue2[gadoAnnotatedCelltypes] <- combinedMeta$gado.CellType[gadoAnnotatedCelltypes] +combinedMeta$Cancer[gadoAnnotatedCelltypes] <- FALSE +combinedMeta$Cellline[gadoAnnotatedCelltypes] <- FALSE + +combinedMeta$Cancer[!is.na(combinedMeta$gado.CellType) & combinedMeta$gado.CellType == "AML"] <- TRUE +combinedMeta$Cancer[!is.na(combinedMeta$gado.CellType) & combinedMeta$gado.CellType == "DLBCL"] <- TRUE + + +gadoAnnotatedCelllines <- !is.na(combinedMeta$gado.CellLine) & combinedMeta$gado.CellLine != "" & tolower(combinedMeta$gado.CellLine) %in% tolower(gadoTissueCol$PlotClass) +combinedMeta$CelllineName[gadoAnnotatedCelllines] <- combinedMeta$gado.CellLine[gadoAnnotatedCelllines] +combinedMeta$Cancer[gadoAnnotatedCelllines] <- FALSE +combinedMeta$Cellline[gadoAnnotatedCelllines] <- TRUE + +#Some manual stuff for big studies + +combinedMeta$CelllineName[!is.na(combinedMeta$study) & combinedMeta$study == "SRP166108"] <- "HepaRG" +combinedMeta$Cellline[!is.na(combinedMeta$study) & combinedMeta$study == "SRP166108"] <- TRUE + + +combinedMeta$Tissue[!is.na(combinedMeta$study) & combinedMeta$study == "SRP192714"] <- "Blood" +combinedMeta$Tissue2[!is.na(combinedMeta$study) & combinedMeta$study == "SRP192714"] <- "Whole Blood" +combinedMeta$Cellline[!is.na(combinedMeta$study) & combinedMeta$study == "SRP192714"] <- FALSE +combinedMeta$Cancer[!is.na(combinedMeta$study) & combinedMeta$study == "SRP192714"] <- FALSE + +combinedMeta$Cellline[!is.na(combinedMeta$study) & combinedMeta$study == "SRP186687"] <- TRUE + +combinedMeta$Tissue[!is.na(combinedMeta$study) & combinedMeta$study == "SRP092402"] <- "Blood" +combinedMeta$Tissue2[!is.na(combinedMeta$study) & combinedMeta$study == "SRP092402"] <- "Whole Blood" 
+combinedMeta$Cellline[!is.na(combinedMeta$study) & combinedMeta$study == "SRP092402"] <- FALSE +combinedMeta$Cancer[!is.na(combinedMeta$study) & combinedMeta$study == "SRP092402"] <- FALSE + + +combinedMeta$Tissue[combinedMeta$study == "SRP116272" & grepl("source_name;;T cells", combinedMeta$sra.sample_attributes)] <- "Blood" +combinedMeta$Tissue2[combinedMeta$study == "SRP116272" & grepl("source_name;;T cells", combinedMeta$sra.sample_attributes)] <- "T-cells" +combinedMeta$Cellline[combinedMeta$study == "SRP116272" & grepl("source_name;;T cells", combinedMeta$sra.sample_attributes)] <- FALSE +combinedMeta$Cancer[combinedMeta$study == "SRP116272" & grepl("source_name;;T cells", combinedMeta$sra.sample_attributes)] <- FALSE + +combinedMeta$Tissue[combinedMeta$study == "SRP116272" & grepl("source_name;;Monocytes", combinedMeta$sra.sample_attributes)] <- "Blood" +combinedMeta$Tissue2[combinedMeta$study == "SRP116272" & grepl("source_name;;Monocytes", combinedMeta$sra.sample_attributes)] <- "Monocytes" +combinedMeta$Cellline[combinedMeta$study == "SRP116272" & grepl("source_name;;Monocytes", combinedMeta$sra.sample_attributes)] <- FALSE +combinedMeta$Cancer[combinedMeta$study == "SRP116272" & grepl("source_name;;Monocytes", combinedMeta$sra.sample_attributes)] <- FALSE + +combinedMeta$Tissue[combinedMeta$study == "SRP061932"] <- "" +combinedMeta$Tissue2[combinedMeta$study == "SRP061932"] <- "" +combinedMeta$Cellline[combinedMeta$study == "SRP061932"] <- FALSE +combinedMeta$Cancer[combinedMeta$study == "SRP061932"] <- FALSE + + + +combinedMeta$Tissue[combinedMeta$study == "SRP047323"] <- "" +combinedMeta$Tissue2[combinedMeta$study == "SRP047323"] <- "" +combinedMeta$Cellline[combinedMeta$study == "SRP047323"] <- FALSE +combinedMeta$Cancer[combinedMeta$study == "SRP047323"] <- FALSE + + + +combinedMeta$CelllineName[combinedMeta$study == "ERP001942"] <- "lcl" +combinedMeta$Cellline[combinedMeta$study == "ERP001942"] <- TRUE + + 
# ERP007111: iPSC (stored in Tissue2 at this point)
combinedMeta$Tissue2[combinedMeta$study %in% "ERP007111"] <- "iPSC"

# ERP012914: HAP1 cell line
combinedMeta$CelllineName[combinedMeta$study %in% "ERP012914"] <- "HAP1"
combinedMeta$Cellline[combinedMeta$study %in% "ERP012914"] <- TRUE

# SRP151763: retina
retinaStudy <- combinedMeta$study %in% "SRP151763"
combinedMeta$Tissue[retinaStudy] <- "Eye"
combinedMeta$Tissue2[retinaStudy] <- "Retina"
combinedMeta$Cellline[retinaStudy] <- FALSE
combinedMeta$Cancer[retinaStudy] <- FALSE

# SRP162411: whole blood (Cellline / Cancer are left as they were)
combinedMeta$Tissue[combinedMeta$study %in% "SRP162411"] <- "Blood"
combinedMeta$Tissue2[combinedMeta$study %in% "SRP162411"] <- "Whole Blood"

# Muscle studies: Tissue "Muscle", no Tissue2, not cell line, not cancer
for (muscleStudy in c("SRP102542", "SRP150311", "SRP162873", "SRP163524")) {
  studySamples <- combinedMeta$study %in% muscleStudy
  combinedMeta$Tissue[studySamples] <- "Muscle"
  combinedMeta$Tissue2[studySamples] <- ""
  combinedMeta$Cellline[studySamples] <- FALSE
  combinedMeta$Cancer[studySamples] <- FALSE
}

# Airway epithelial studies
studySamples <- combinedMeta$study %in%
  c("SRP006676", "SRP071758", "SRP081599", "SRP086078", "SRP119923")
combinedMeta$Tissue[studySamples] <- "Airway Epithelial"
combinedMeta$Tissue2[studySamples] <- ""
combinedMeta$Cellline[studySamples] <- FALSE
combinedMeta$Cancer[studySamples] <- FALSE

# SRP188219: heart, side taken from the sample attributes
samples <- combinedMeta$study == "SRP188219" &
  grepl("left atrial appendage", combinedMeta$sra.sample_attributes)
combinedMeta$Tissue[samples] <- "Heart"
combinedMeta$Tissue2[samples] <- "Left atrial appendage"
combinedMeta$Cellline[samples] <- FALSE
combinedMeta$Cancer[samples] <- FALSE

samples <- combinedMeta$study == "SRP188219" &
  grepl("right atrial appendage", combinedMeta$sra.sample_attributes)
combinedMeta$Tissue[samples] <- "Heart"
combinedMeta$Tissue2[samples] <- "Right atrial appendage"
combinedMeta$Cellline[samples] <- FALSE
combinedMeta$Cancer[samples] <- FALSE

# iPSC cell line studies: no tissue, cell line "iPSC", not cancer
for (ipscStudy in c("SRA755613", "SRA755626", "SRP148659")) {
  studySamples <- combinedMeta$study %in% ipscStudy
  combinedMeta$Tissue[studySamples] <- ""
  combinedMeta$Tissue2[studySamples] <- ""
  combinedMeta$Cellline[studySamples] <- TRUE
  combinedMeta$CelllineName[studySamples] <- "iPSC"
  combinedMeta$Cancer[studySamples] <- FALSE
}

# Put to missing: annotations unclear.
# SRP009316: the statements that follow this block reuse studySamples to
# finish this study, so it must still hold the SRP009316 mask here.
studySamples <- combinedMeta$study %in% "SRP009316"
combinedMeta$Tissue[studySamples] <- ""
combinedMeta$Tissue2[studySamples] <- ""
combinedMeta$Cellline[studySamples] <- NA
# SRP009316 (continued): studySamples still holds this study's mask, set by
# the statements directly before this block.
combinedMeta$CelllineName[studySamples] <- ""
combinedMeta$Cancer[studySamples] <- NA

# Studies whose annotation is unclear: blank tissue and cell line name, and
# set both flags to NA.
#   SRP021509
#   SRP043162  unsure how to classify airway smooth muscle
#   SRP058722  some organoids and cancers
#   SRP012167  DPN and Tamoxifen treatments of parathyroid adenoma cells have
#              cancer CNV profile
#   ERP011411
for (unclearStudy in c("SRP021509", "SRP043162", "SRP058722", "SRP012167", "ERP011411")) {
  studySamples <- combinedMeta$study %in% unclearStudy
  combinedMeta$Tissue[studySamples] <- ""
  combinedMeta$Tissue2[studySamples] <- ""
  combinedMeta$Cellline[studySamples] <- NA
  combinedMeta$CelllineName[studySamples] <- ""
  combinedMeta$Cancer[studySamples] <- NA
}

# Studies that only get the cancer flag switched on
for (cancerStudy in c("SRP052896", "SRP019936", "SRP010166", "SRP012656")) {
  studySamples <- combinedMeta$study %in% cancerStudy
  combinedMeta$Cancer[studySamples] <- TRUE
}

# Single sample title correction
combinedMeta["SRR5341594", "sra.sample_title"] <- "Human differentiating macrophage"

# ERP006077
# NOTE(review): the third assignment below has no attribute filter, so it
# overwrites the two prostate annotations above it and leaves every ERP006077
# sample as Pancreas / cancer. This looks like a copy-pasted study id; please
# confirm which study the pancreas annotation was meant for. Behaviour is kept
# unchanged here.
samples <- combinedMeta$study == "ERP006077" &
  grepl("Primary Prostate Tumour", combinedMeta$sra.sample_attributes)
combinedMeta$Tissue[samples] <- "Prostate"
combinedMeta$Tissue2[samples] <- ""
combinedMeta$Cellline[samples] <- FALSE
combinedMeta$Cancer[samples] <- TRUE

samples <- combinedMeta$study == "ERP006077" &
  grepl("Matched Adjacent", combinedMeta$sra.sample_attributes)
combinedMeta$Tissue[samples] <- "Prostate"
combinedMeta$Tissue2[samples] <- ""
combinedMeta$Cellline[samples] <- FALSE
combinedMeta$Cancer[samples] <- FALSE

samples <- combinedMeta$study == "ERP006077"
combinedMeta$Tissue[samples] <- "Pancreas"
combinedMeta$Tissue2[samples] <- ""
combinedMeta$Cellline[samples] <- FALSE
combinedMeta$Cancer[samples] <- TRUE

# Breast cancer tissue studies
for (breastStudy in c("SRP058587", "SRP030401")) {
  samples <- combinedMeta$study == breastStudy
  combinedMeta$Tissue[samples] <- "Breast"
  combinedMeta$Tissue2[samples] <- ""
  combinedMeta$Cellline[samples] <- FALSE
  combinedMeta$Cancer[samples] <- TRUE
}

# SRP073061: only the samples whose experiment title mentions "Tumor"
samples <- combinedMeta$study == "SRP073061" &
  grepl("Tumor", combinedMeta$sra.experiment_title)
combinedMeta$Tissue[samples] <- "Breast"
combinedMeta$Tissue2[samples] <- ""
combinedMeta$Cellline[samples] <- FALSE
combinedMeta$Cancer[samples] <- TRUE

# More studies with unclear annotation (same treatment as above)
studySamples <- combinedMeta$study %in%
  c("SRP014027", "SRP006575", "SRP071932", "ERP004617", "SRP034592", "SRP049695")
combinedMeta$Tissue[studySamples] <- ""
combinedMeta$Tissue2[studySamples] <- ""
combinedMeta$Cellline[studySamples] <- NA
combinedMeta$CelllineName[studySamples] <- ""
combinedMeta$Cancer[studySamples] <- NA

# SRP066596: cancer, cell line status unknown
samples <- combinedMeta$study == "SRP066596"
combinedMeta$Tissue[samples] <- ""
combinedMeta$Tissue2[samples] <- ""
combinedMeta$Cellline[samples] <- NA
combinedMeta$Cancer[samples] <- TRUE

# Cell line studies with unknown cancer status (cell line name not touched)
for (cellLineStudy in c("SRP062332", "SRP028344", "SRP028346", "SRP058571", "SRP049648")) {
  samples <- combinedMeta$study == cellLineStudy
  combinedMeta$Tissue[samples] <- ""
  combinedMeta$Tissue2[samples] <- ""
  combinedMeta$Cellline[samples] <- TRUE
  combinedMeta$Cancer[samples] <- NA
}

studySamples <- combinedMeta$study == "SRP039694"
combinedMeta$Cancer[studySamples] <- TRUE

studySamples <- combinedMeta$study == "SRP066260"
combinedMeta$Cancer[studySamples] <- TRUE


# mahmoudAnnotations: rename its columns to the combinedMeta names, then
# overwrite the matching cells of combinedMeta by row name and column name.
colnames(mahmoudAnnotations)[colnames(mahmoudAnnotations) == "Cell_Line"] <- "Cellline"
colnames(mahmoudAnnotations)[colnames(mahmoudAnnotations) == "Cell_Line_Name"] <- "CelllineName"

# Interactive sanity checks; both are expected to print TRUE. Their result is
# not enforced.
all(colnames(mahmoudAnnotations) %in% colnames(combinedMeta))
all(rownames(mahmoudAnnotations) %in% rownames(combinedMeta))

combinedMeta[rownames(mahmoudAnnotations), colnames(mahmoudAnnotations)] <- mahmoudAnnotations


# Set Cancer to FALSE for all cell lines
combinedMeta$Cancer[combinedMeta$Cellline] <- FALSE

# Harmonise Tissue spelling / granularity
tmp <- combinedMeta$Tissue %in% c("Cervix Uteri", "Cervix")
combinedMeta$Tissue[tmp] <- "Uterus"
combinedMeta$Tissue2[tmp] <- "Cervix"

tissueSpelling <- c(
  "Lymph node" = "Lymph Nodes",
  "Bone marrow" = "Bone Marrow",
  "Colorectal" = "Colon"
)
for (i in seq_along(tissueSpelling)) {
  tmp <- combinedMeta$Tissue %in% names(tissueSpelling)[i]
  combinedMeta$Tissue[tmp] <- tissueSpelling[[i]]
}

tmp <- combinedMeta$Tissue %in% "Whole blood"
combinedMeta$Tissue2[tmp] <- "Whole Blood"
combinedMeta$Tissue[tmp] <- "Blood"


# Below are Tissue2 fixes by Mahmoud:
#  - annotations already present in Tissue are removed from Tissue2
#  - duplicates are harmonized
#  - rare annotations are set to NA
#  - all parts of the basal ganglia (including substantia nigra) were
#    annotated as basal ganglia
#  - brain fragments were set to NA
#  - retina needs to have Eye as Tissue
#  - the sample annotated as both brain & stomach was annotated as NA
#
# Columns are addressed by name (Tissue, Tissue2) instead of by position, so
# the fixes keep hitting the right column if the column order ever changes.

# Adipose Tissue
# Tissue2 includes "Adipose - Subcutaneous" & "Adipose - Visceral (Omentum)"
combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Adipose - Subcutaneous"] <- "Subcutaneous"
combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Adipose - Visceral (Omentum)"] <- "Visceral"
combinedMeta$Tissue[combinedMeta$Tissue %in% "Adipose Tissue"] <- "Adipose"

# Adrenal Gland
combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Adrenal Gland"] <- NA

# AML
# keep as is

# Arteries
combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Artery - Aorta"] <- "Aorta"
# Tissue2 harmonisation (continued), old label -> new label. NA means the
# label is dropped from Tissue2.
# The recodes are applied one at a time in the order listed, and Tissue2 is
# addressed by name rather than by column position so a change in column
# order cannot redirect the fix to another column.
tissue2RecodeArteriesToOvary <- c(
  # Arteries (continued)
  "Artery - Coronary" = "Coronary",
  "Artery - Tibial" = "Tibial",
  # B-cells: keep as is
  # basal ganglion
  "basal ganglion" = "Basal Ganglia",
  # Bladder
  "Bladder" = NA,
  # Brain (keep as GTEX)****
  # Check for brain cortex vs cortex vs cerebral cortex
  "Brain - Amygdala" = "Amygdala",
  "Brain - Anterior cingulate cortex (BA24)" = "Anterior cingulate cortex",
  "Brain - Caudate (basal ganglia)" = "Caudate (basal ganglia)",
  "Brain - Cerebellar Hemisphere" = "Cerebellar Hemisphere",
  "Brain - Cerebellum" = "Cerebellum",
  "Brain - Cortex" = "Cortex",
  "Brain - Frontal Cortex (BA9)" = "Frontal Cortex",
  "Brain - Hippocampus" = "Hippocampus",
  "Brain - Hypothalamus" = "Hypothalamus",
  "Brain - Nucleus accumbens (basal ganglia)" = "Nucleus accumbens (basal ganglia)",
  "Brain - Putamen (basal ganglia)" = "Putamen (basal ganglia)",
  "Brain - Spinal cord (cervical c-1)" = "Spinal Cord (cervical c-1)",
  "Brain - Substantia nigra" = "Substantia nigra",
  "brain fragment" = NA,
  # Breast
  "Breast - Mammary Tissue" = "Mammary Tissue",
  # CD34+: keep as is
  # Cultured fibroblasts
  "Cells - Cultured fibroblasts" = "Cultured Fibroblasts",
  # cerebellum
  "cerebellum" = "Cerebellum",
  # cerebral cortex
  # recheck****
  "cerebral cortex" = "Cortex",
  # Cervix: keep as is
  # choroid plexus
  "choroid plexus" = NA,
  # Colon
  "Colon - Sigmoid" = "Sigmoid",
  "Colon - Transverse" = "Transverse",
  # diencephalon & diencephalon and midbrain
  "diencephalon" = "Diencephalon",
  "diencephalon and midbrain" = NA,
  # DLBCL: keep as is
  # Esophagus
  "Esophagus - Gastroesophageal Junction" = "Gastroesophageal Junction",
  "Esophagus - Mucosa" = "Mucosa",
  "Esophagus - Muscularis" = "Muscularis",
  # Fallopian Tube
  "Fallopian Tube" = NA,
  # forebrain
  "forebrain" = NA,
  "forebrain and midbrain" = NA,
  "forebrain fragment" = NA,
  # Heart
  "Heart - Atrial Appendage" = "Atrial Appendage",
  "Heart - Left Ventricle" = "Left Ventricle",
  # hindbrain
  "hindbrain" = NA,
  "hindbrain fragment" = NA,
  "hindbrain without cerebellum" = NA,
  "hippocampus" = "Hippocampus",
  # Kidney
  "Kidney - Cortex" = "Cortex",
  "Kidney - Medulla" = "Medulla",
  # Liver
  "Liver" = NA,
  # Lung
  "Lung" = NA,
  # medulla oblongata
  "medulla oblongata" = "Medulla Oblongata",
  "midbrain" = "Midbrain",
  # Minor salivary gland: keep as is
  # Monocytes: keep as is
  # Muscle-skeletal
  "Muscle - Skeletal" = "Skeletal",
  # Nerve
  "Nerve - Tibial" = "Tibial",
  # NK-cells: keep as is
  # Ovary
  "Ovary" = NA
)
for (i in seq_along(tissue2RecodeArteriesToOvary)) {
  # %in% is FALSE for a missing Tissue2, so NA rows are never touched
  oldLabel <- names(tissue2RecodeArteriesToOvary)[i]
  combinedMeta$Tissue2[combinedMeta$Tissue2 %in% oldLabel] <- tissue2RecodeArteriesToOvary[[i]]
}

#Pancreas
# Tissue2 harmonisation (continued), old label -> new label. NA means the
# label is dropped from Tissue2. Columns are addressed by name rather than by
# position.

# Sample annotated as both brain & stomach: clear Tissue AND Tissue2.
# The mask is computed once, up front. Previously Tissue2 was set to NA first,
# so the second statement's condition `Tissue2 == "stomach"` could no longer
# match and Tissue was never cleared.
stomachSamples <- combinedMeta$Tissue2 %in% "stomach"
combinedMeta$Tissue[stomachSamples] <- NA
combinedMeta$Tissue2[stomachSamples] <- NA

tissue2RecodePancreasToThyroid <- c(
  # Pancreas
  "Pancreas" = NA,
  # PBMC: keep as is
  # Pituitary
  "Pituitary" = NA,
  # pituitary and diencephalon & pons
  "pituitary and diencephalon" = NA,
  "pons" = NA,
  # prostate
  "Prostate" = NA,
  # skin
  "Skin - Not Sun Exposed (Suprapubic)" = "Suprapubic",
  "Skin - Sun Exposed (Lower leg)" = "Lower Leg",
  # Small Intestine
  "Small Intestine - Terminal Ileum" = "Terminal Ileum",
  # spinal cord (set to the empty string, not NA)
  "spinal cord" = "",
  # Spleen
  "Spleen" = NA,
  # Stomach
  "Stomach" = NA,
  # T-cells: keep as is
  # telencephalon: too general
  "telencephalon" = NA,
  # temporal lobe
  # The temporal lobe is one of the four major lobes of the cerebral cortex
  "temporal lobe" = "Cortex",
  # Testis
  "Testis" = NA,
  # Thyroid
  "Thyroid" = NA
)
for (i in seq_along(tissue2RecodePancreasToThyroid)) {
  # %in% is FALSE for a missing Tissue2, so NA rows are never touched
  oldLabel <- names(tissue2RecodePancreasToThyroid)[i]
  combinedMeta$Tissue2[combinedMeta$Tissue2 %in% oldLabel] <- tissue2RecodePancreasToThyroid[[i]]
}

#Uterus
# Tissue2 harmonisation (end). Columns are addressed by name (Tissue2,
# CelllineName) rather than by position.

# Uterus
combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Uterus"] <- NA

# Vagina
combinedMeta$Tissue2[combinedMeta$Tissue2 %in% "Vagina"] <- NA

# whole blood
# keep as is

# Move iPSC out of Tissue2 and into the cell line name. The mask is taken
# before Tissue2 is blanked, so both assignments hit the same rows.
ipscInTissue2 <- combinedMeta$Tissue2 %in% "iPSC"
combinedMeta$CelllineName[ipscInTissue2] <- "iPSC"
combinedMeta$Tissue2[ipscInTissue2] <- ""

# Remove NA problem: missing tissue annotations become empty strings
combinedMeta$Tissue2[is.na(combinedMeta$Tissue2)] <- ""
combinedMeta$Tissue[is.na(combinedMeta$Tissue)] <- ""


# Harmonizing cell line names for samples in recount3
# (lower-case spelling -> canonical spelling; applied in the order listed)
cellLineSpelling <- c(
  "a549" = "A549",
  # H-STS NET: keep as is
  "h1" = "H1",
  "h9" = "H9",
  "hap1" = "HAP1",
  "hct116" = "HCT116",
  "hek293" = "HEK293",
  "hela" = "HeLa",
  # HepaRG: keep as is
  "hepg2" = "HepG2",
  "ipsc" = "iPSC",
  "k562" = "K562",
  "lcl" = "LCL",
  "lcl_s4u_capturing" = "LCL_S4U_Capturing"
)
for (i in seq_along(cellLineSpelling)) {
  # %in% is FALSE for a missing cell line name, so NA rows are never touched
  oldName <- names(cellLineSpelling)[i]
  combinedMeta$CelllineName[combinedMeta$CelllineName %in% oldName] <- cellLineSpelling[[i]]
}

#MCF10A
# Cell line name harmonisation (continued): lower-case -> canonical spelling.
# All columns in this block are addressed by name instead of by position
# (1 = Tissue, 2 = Tissue2, 3 = Cellline, 4 = CelllineName, 5 = Cancer),
# matching the named form used for SRP013565 onward.
cellLineSpellingTail <- c(
  "mcf10a" = "MCF10A",
  "mcf7" = "MCF7",
  "mda231" = "MDA231",
  "t47d" = "T47D"
)
for (i in seq_along(cellLineSpellingTail)) {
  oldName <- names(cellLineSpellingTail)[i]
  combinedMeta$CelllineName[combinedMeta$CelllineName %in% oldName] <- cellLineSpellingTail[[i]]
}


# Cell line studies: no tissue, Cellline TRUE, Cancer FALSE, and the cell line
# name given below ("" = name left empty).
cellLineStudyNames <- c(
  "SRP045234" = "iPSC",
  "SRP007525" = "OCI-LY1",
  "SRP026537" = "",
  "SRP049063" = "HT-29",
  "SRP053034" = "RPE-1",
  "SRP013565" = "",
  "ERP008682" = "H9",
  "SRP033646" = "Caco2"
)
for (i in seq_along(cellLineStudyNames)) {
  # %in% is FALSE for a missing study id
  studyMask <- combinedMeta$study %in% names(cellLineStudyNames)[i]
  combinedMeta$Tissue[studyMask] <- ""
  combinedMeta$Tissue2[studyMask] <- ""
  combinedMeta$Cellline[studyMask] <- TRUE
  combinedMeta$CelllineName[studyMask] <- cellLineStudyNames[[i]]
  combinedMeta$Cancer[studyMask] <- FALSE
}

# Fix SRP027358 & SRP032926: fetal samples
combinedMeta$Fetal[combinedMeta$study %in% "SRP027358"] <- TRUE
combinedMeta$Fetal[combinedMeta$study %in% "SRP032926"] <- TRUE

# Fix SRP056197 annotations: AML, tissue taken from the sample attributes
samples <- combinedMeta$study == "SRP056197" &
  grepl("Bone marrow", combinedMeta$sra.sample_attributes)
combinedMeta$Tissue[samples] <- "Bone Marrow"
combinedMeta$Tissue2[samples] <- "AML"
combinedMeta$Cellline[samples] <- FALSE
combinedMeta$Cancer[samples] <- TRUE

samples <- combinedMeta$study == "SRP056197" &
  grepl("Heparinised blood", combinedMeta$sra.sample_attributes)
combinedMeta$Tissue[samples] <- "Blood"
combinedMeta$Tissue2[samples] <- "AML"
combinedMeta$Cellline[samples] <- FALSE
combinedMeta$Cancer[samples] <- TRUE

# Fix SRP027383 annotations: brain cancer tissue
studyMask <- combinedMeta$study %in% "SRP027383"
combinedMeta$Tissue[studyMask] <- "Brain"
combinedMeta$Tissue2[studyMask] <- ""
combinedMeta$Cellline[studyMask] <- FALSE
combinedMeta$CelllineName[studyMask] <- ""
combinedMeta$Cancer[studyMask] <- TRUE

# Fix SRP050003 annotations: liver tissue (its Cancer flag is set by the
# statements that follow this block)
studyMask <- combinedMeta$study %in% "SRP050003"
combinedMeta$Tissue[studyMask] <- "Liver"
combinedMeta$Tissue2[studyMask] <- ""
combinedMeta$Cellline[studyMask] <- FALSE
combinedMeta$CelllineName[studyMask] <- ""
# `study %in% id` selects exactly the rows whose study is non-missing and
# equal to id.
sampleAttrs <- combinedMeta$sra.sample_attributes

# SRP050003 (continued): cancer status from the sample attributes
# (case-insensitive). "carcinoma" is applied second, so it wins when a sample
# matches both patterns.
studyMask <- combinedMeta$study %in% "SRP050003"
combinedMeta[studyMask & grepl("non-tumoral", sampleAttrs, ignore.case = TRUE), "Cancer"] <- FALSE
combinedMeta[studyMask & grepl("carcinoma", sampleAttrs, ignore.case = TRUE), "Cancer"] <- TRUE

# Fix SRP073253, SRP069235 and SRP074425 annotations: cancer tissue studies
# that differ only in the tissue.
cancerTissueByStudy <- c(
  "SRP073253" = "Kidney",
  "SRP069235" = "Brain",
  "SRP074425" = "Brain"
)
for (cancerStudy in names(cancerTissueByStudy)) {
  studyMask <- combinedMeta$study %in% cancerStudy
  combinedMeta[studyMask, "Tissue"] <- cancerTissueByStudy[[cancerStudy]]
  combinedMeta[studyMask, "Tissue2"] <- ""
  combinedMeta[studyMask, "Cellline"] <- FALSE
  combinedMeta[studyMask, "CelllineName"] <- ""
  combinedMeta[studyMask, "Cancer"] <- TRUE
}

# Fix SRP044668 annotations (the remaining columns are set right after this)
combinedMeta[combinedMeta$study %in% "SRP044668", "Tissue"] <- "Brain"
# `study %in% id` selects exactly the rows whose study is non-missing and
# equal to id. Where Cancer is derived from the sample attributes
# (case-insensitive), the second pattern is applied last and therefore wins if
# a sample matches both.
sampleAttrs <- combinedMeta$sra.sample_attributes

# SRP044668 (continued)
studyMask <- combinedMeta$study %in% "SRP044668"
combinedMeta[studyMask, "Tissue2"] <- ""
combinedMeta[studyMask, "Cellline"] <- FALSE
combinedMeta[studyMask, "CelllineName"] <- ""
combinedMeta[studyMask & grepl("non-neoplastic", sampleAttrs, ignore.case = TRUE), "Cancer"] <- FALSE
combinedMeta[studyMask & grepl("glioma", sampleAttrs, ignore.case = TRUE), "Cancer"] <- TRUE

# Fix SRP009123 annotations
studyMask <- combinedMeta$study %in% "SRP009123"
combinedMeta[studyMask, "Tissue"] <- "Liver"
combinedMeta[studyMask, "Tissue2"] <- ""
combinedMeta[studyMask, "Cellline"] <- FALSE
combinedMeta[studyMask, "CelllineName"] <- ""
combinedMeta[studyMask & grepl("non-tumor", sampleAttrs, ignore.case = TRUE), "Cancer"] <- FALSE
combinedMeta[studyMask & grepl("carcinoma", sampleAttrs, ignore.case = TRUE), "Cancer"] <- TRUE

# Fix SRP041094 annotations
studyMask <- combinedMeta$study %in% "SRP041094"
combinedMeta[studyMask, "Tissue"] <- "Prostate"
combinedMeta[studyMask, "Tissue2"] <- ""
combinedMeta[studyMask, "Cellline"] <- FALSE
combinedMeta[studyMask, "CelllineName"] <- ""
combinedMeta[studyMask, "Cancer"] <- TRUE

# Fix SRP040998 annotations (cancer status set to missing)
studyMask <- combinedMeta$study %in% "SRP040998"
combinedMeta[studyMask, "Tissue"] <- "Liver"
combinedMeta[studyMask, "Tissue2"] <- ""
combinedMeta[studyMask, "Cellline"] <- FALSE
combinedMeta[studyMask, "CelllineName"] <- ""
combinedMeta[studyMask, "Cancer"] <- NA

# Fix SRP052056 annotations
studyMask <- combinedMeta$study %in% "SRP052056"
combinedMeta[studyMask, "Tissue"] <- "Thyroid"
combinedMeta[studyMask, "Tissue2"] <- ""
combinedMeta[studyMask, "Cellline"] <- FALSE
combinedMeta[studyMask, "CelllineName"] <- ""
combinedMeta[studyMask & grepl("healthy", sampleAttrs, ignore.case = TRUE), "Cancer"] <- FALSE
combinedMeta[studyMask & grepl("carcinoma", sampleAttrs, ignore.case = TRUE), "Cancer"] <- TRUE

# Fix SRP029880 annotations
studyMask <- combinedMeta$study %in% "SRP029880"
combinedMeta[studyMask, "Tissue"] <- "Colon"
combinedMeta[studyMask, "Tissue2"] <- ""
combinedMeta[studyMask, "Cellline"] <- FALSE
combinedMeta[studyMask, "CelllineName"] <- ""
combinedMeta[studyMask, "Cancer"] <- TRUE

# Fix SRP056696 annotations (CelllineName and Cancer are set right after this)
studyMask <- combinedMeta$study %in% "SRP056696"
combinedMeta[studyMask, "Tissue"] <- "Liver"
combinedMeta[studyMask, "Tissue2"] <- ""
combinedMeta[studyMask, "Cellline"] <- FALSE
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP056696"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP056696") & (grepl("Normal", combinedMeta$sra.sample_attributes, ignore.case=T)),"Cancer"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP056696") & (grepl("Tumor", combinedMeta$sra.sample_attributes, ignore.case=T)),"Cancer"]= TRUE +#Fix SRP066794 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP066794"),"Tissue"]= "Lung" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP066794"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP066794"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP066794"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP066794"),"Cancer"]= TRUE +#Fix SRP149374 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP149374"),"Tissue"]= "Bone Marrow" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP149374"),"Tissue2"]= "CD34+" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP149374"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP149374"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP149374") & (grepl("Healthy", combinedMeta$sra.sample_attributes, ignore.case=T)),"Cancer"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP149374") & (grepl("Myelodysplastic", combinedMeta$sra.sample_attributes, ignore.case=T)),"Cancer"]= TRUE +#Fix SRP019250 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP019250"),"Tissue"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP019250"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== 
"SRP019250"),"Cellline"]= TRUE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP019250") & (grepl("HEK", combinedMeta$sra.sample_attributes, ignore.case=T)),"CelllineName"]= "HEK" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP019250") & (grepl("LCL", combinedMeta$sra.sample_attributes, ignore.case=T)),"CelllineName"]= "LCL" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP019250"),"Cancer"]= FALSE +#Fix SRP074349 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP074349"),"Tissue"]= "Lung" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP074349"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP074349"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP074349"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP074349") & (grepl("NSCLC", combinedMeta$sra.sample_attributes, ignore.case=T))==F,"Cancer"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP074349") & (grepl("NSCLC", combinedMeta$sra.sample_attributes, ignore.case=T)),"Cancer"]= TRUE + + +#Fix SRP009067 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP009067"),"Tissue"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP009067"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP009067"),"Cellline"]= TRUE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP009067"),"CelllineName"]= "LCL" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP009067"),"Cancer"]= FALSE +#Fix SRP007885 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP007885"),"Tissue"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP007885"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== 
"SRP007885"),"Cellline"]= TRUE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP007885"),"CelllineName"]= "LCL" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP007885"),"Cancer"]= FALSE +#Fix SRP018218 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP018218") & (grepl("cell line", combinedMeta$sra.sample_attributes, ignore.case=T))==F,"Tissue"]= "Pancreas" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP018218") & (grepl("cell line", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP018218") & (grepl("cell line", combinedMeta$sra.sample_attributes, ignore.case=T))==F,"Tissue2"]= "Stellate Cells" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP018218") & (grepl("cell line", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP018218") & (grepl("cell line", combinedMeta$sra.sample_attributes, ignore.case=T)),"Cellline"]= TRUE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP018218") & (grepl("cell line", combinedMeta$sra.sample_attributes, ignore.case=T))==F,"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP018218"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP018218"),"Cancer"]= TRUE +#Fix SRP019275 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP019275"),"Tissue"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP019275"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP019275"),"Cellline"]= TRUE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP019275"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP019275"),"Cancer"]= FALSE +#Fix 
SRP042186 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP042186"),"Tissue"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP042186"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP042186"),"Cellline"]= TRUE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP042186"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP042186"),"Cancer"]= FALSE +#Fix SRP042620 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP042620") & (grepl("cell line", combinedMeta$sra.sample_attributes, ignore.case=T))==F,"Tissue"]= "Breast" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP042620") & (grepl("cell line", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP042620"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP042620") & (grepl("cell line", combinedMeta$sra.sample_attributes, ignore.case=T)),"Cellline"]= TRUE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP042620") & (grepl("cell line", combinedMeta$sra.sample_attributes, ignore.case=T))==F,"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP042620"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP042620") & (grepl("ER+", combinedMeta$sra.sample_attributes, ignore.case=T)),"Cancer"]= TRUE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP042620") & (grepl("Triple Negative", combinedMeta$sra.sample_attributes, ignore.case=T)),"Cancer"]= TRUE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP042620") & (grepl("Uninvolved", combinedMeta$sra.sample_attributes, ignore.case=T)),"Cancer"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP042620") & (grepl("No 
Known", combinedMeta$sra.sample_attributes, ignore.case=T)),"Cancer"]= FALSE + + +#Fix ERP010142 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP010142"),"Tissue"]= "Breast" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP010142"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP010142"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP010142"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP010142"),"Cancer"]= TRUE +#Fix SRP026600 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP026600"),"Tissue"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP026600"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP026600"),"Cellline"]= NA +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP026600"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP026600"),"Cancer"]= NA +#Fix SRP028336 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP028336") & (grepl("muscle", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Muscle" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP028336") & (grepl("kidney", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Kidney" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP028336") & (grepl("prefrontal cortex", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Brain" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP028336") & (grepl("cerebellar Cortex", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Brain" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP028336") & (grepl("primary visual cortex", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Brain" 
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP028336") & (grepl("muscle", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP028336") & (grepl("kidney", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP028336") & (grepl("prefrontal cortex", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "Prefrontal Cortex" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP028336") & (grepl("cerebellar Cortex", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "Cerebellar Cortex" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP028336") & (grepl("primary visual cortex", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "Primary Visual Cortex" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP028336"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP028336"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP028336"),"Cancer"]= FALSE +#Fix SRP009029 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP009029"),"Tissue"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP009029"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP009029"),"Cellline"]= NA +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP009029"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP009029"),"Cancer"]= NA +#Fix SRP006912 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP006912"),"Tissue"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP006912"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== 
"SRP006912"),"Cellline"]= TRUE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP006912"),"CelllineName"]= "HK-2" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP006912"),"Cancer"]= FALSE +#Fix SRP055444 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP055444"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP055444"),"Tissue2"]= "CLL" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP055444"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP055444"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP055444"),"Cancer"]= TRUE +#Fix SRP022942 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP022942"),"Tissue"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP022942"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP022942"),"Cellline"]= TRUE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP022942"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP022942"),"Cancer"]= FALSE +#Fix ERP012180 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP012180"),"Tissue"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP012180"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP012180"),"Cellline"]= NA +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP012180"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP012180"),"Cancer"]= NA +#Fix SRP058717 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP058717"),"Tissue"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP058717"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & 
(combinedMeta$study== "SRP058717"),"Cellline"]= TRUE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP058717"),"CelllineName"]= "HT-29" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP058717"),"Cancer"]= FALSE +#Fix SRP012568 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP012568"),"Tissue"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP012568"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP012568"),"Cellline"]= NA +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP012568"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP012568"),"Cancer"]= NA +#Fix SRP065146 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP065146"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP065146"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP065146"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP065146"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP065146"),"Cancer"]= FALSE +#Fix ERP012188 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP012188"),"Tissue"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP012188"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP012188"),"Cellline"]= NA +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP012188"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP012188"),"Cancer"]= NA +#Fix SRP055390 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP055390"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP055390"),"Tissue2"]= "CLL" 
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP055390"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP055390"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP055390") & (grepl("normal", combinedMeta$sra.sample_attributes, ignore.case=T))==F,"Cancer"]= TRUE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP055390") & (grepl("normal", combinedMeta$sra.sample_attributes, ignore.case=T)),"Cancer"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP055390") & (grepl("normal", combinedMeta$sra.sample_attributes, ignore.case=T))==F,"Tissue2"]="CLL" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP055390") & (grepl("normal", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "B-cells" + + +#Fix SRP036145 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP036145"),"Tissue"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP036145"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP036145"),"Cellline"]= NA +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP036145"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP036145"),"Cancer"]= NA +#Fix SRP050533 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP050533"),"Tissue"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP050533"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP050533"),"Cellline"]= NA +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP050533"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP050533"),"Cancer"]= NA + + + + + + +#Fix ERP016243 +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP150552"),"Fetal"]= TRUE 
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP150552"),"Tissue2"]= "" + +table(paste0(combinedMeta$Tissue, " - ", combinedMeta$Tissue2, " - ", combinedMeta$Fetal)[combinedMeta$study == "ERP109002"]) + +#Fix SRP078234 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP078234") & (grepl("hindbrain", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Brain" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP078234") & (grepl("hindbrain", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "Hindbrain" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP078234") & (grepl("spinal cord", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Brain" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP078234") & (grepl("spinal cord", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "Spinal Cord (cervical c-1)" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP078234"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP078234"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP078234"),"Cancer"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP078234"),"Fetal"]= TRUE + + +#Fix SRP041044 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP041044"),"Tissue"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP041044"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP041044"),"Cellline"]= NA +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP041044"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP041044"),"Cancer"]= NA +#Fix SRP050260 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP050260"),"Tissue"]= "" 
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP050260"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP050260"),"Cellline"]= NA +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP050260"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP050260"),"Cancer"]= NA +#Fix SRP076099 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP076099"),"Tissue"]= "Brain" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP076099"),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP076099"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP076099"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP076099"),"Cancer"]= NA +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP076099"),"Fetal"]= TRUE +# Fix ERP115010 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP115010"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP115010"),"Tissue2"]= "Whole Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP115010"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP115010"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP115010"),"Cancer"]= FALSE +# Fix SRP221482 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP221482"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP221482"),"Tissue2"]= "Whole Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP221482"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP221482"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== 
"SRP221482"),"Cancer"]= FALSE +# Fix SRP059039 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP059039"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP059039"),"Tissue2"]= "Whole Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP059039"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP059039"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP059039"),"Cancer"]= FALSE +# Fix SRP059172 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP059172"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP059172"),"Tissue2"]= "Whole Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP059172"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP059172"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP059172"),"Cancer"]= FALSE +# Fix SRP062966 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP062966"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP062966"),"Tissue2"]= "Whole Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP062966"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP062966"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP062966"),"Cancer"]= FALSE +# Fix SRP081605 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP081605"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP081605"),"Tissue2"]= "Whole Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP081605"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== 
"SRP081605"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP081605"),"Cancer"]= FALSE +# Fix SRP103772 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP103772"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP103772"),"Tissue2"]= "Whole Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP103772"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP103772"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP103772"),"Cancer"]= FALSE +# Fix SRP132939 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP132939"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP132939"),"Tissue2"]= "Whole Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP132939"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP132939"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP132939"),"Cancer"]= FALSE +# Fix SRP136938 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP136938"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP136938"),"Tissue2"]= "Whole Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP136938"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP136938"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP136938"),"Cancer"]= FALSE +# Fix SRP150552 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP150552"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP150552"),"Tissue2"]= "Whole Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== 
"SRP150552"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP150552"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP150552"),"Cancer"]= FALSE +# Fix SRP174223 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP174223"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP174223"),"Tissue2"]= "Whole Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP174223"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP174223"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP174223"),"Cancer"]= FALSE +# Fix SRP174638 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP174638"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP174638"),"Tissue2"]= "Whole Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP174638"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP174638"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP174638"),"Cancer"]= FALSE +# Fix SRP150552 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP150552"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP150552"),"Tissue2"]= "Whole Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP150552"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP150552"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP150552"),"Cancer"]= FALSE + + + + +samples <- combinedMeta$study == "SRP033266" & grepl("Bone marrow", combinedMeta$sra.sample_attributes) +combinedMeta$Tissue[samples] <- "Bone Marrow" +combinedMeta$Tissue2[samples] <- "AML" 
+combinedMeta$Cellline[samples] <- FALSE +combinedMeta$Cancer[samples] <- TRUE +samples <- combinedMeta$study == "SRP033266" & grepl("Heparinised blood", combinedMeta$sra.sample_attributes) +combinedMeta$Tissue[samples] <- "Blood" +combinedMeta$Tissue2[samples] <- "AML" +combinedMeta$Cellline[samples] <- FALSE +combinedMeta$Cancer[samples] <- TRUE + + +samples <- combinedMeta$study == "SRP048759" & grepl("Bone marrow", combinedMeta$sra.sample_attributes) +combinedMeta$Tissue[samples] <- "Bone Marrow" +combinedMeta$Tissue2[samples] <- "AML" +combinedMeta$Cellline[samples] <- FALSE +combinedMeta$Cancer[samples] <- TRUE +samples <- combinedMeta$study == "SRP048759" & grepl("Heparinised blood", combinedMeta$sra.sample_attributes) +combinedMeta$Tissue[samples] <- "Blood" +combinedMeta$Tissue2[samples] <- "AML" +combinedMeta$Cellline[samples] <- FALSE +combinedMeta$Cancer[samples] <- TRUE + +#Fix SRP045500 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP045500"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP045500") & (grepl("wholw blood", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "Whole Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP045500") & (grepl("CD4", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "T-cells" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP045500") & (grepl("CD8", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "T-cells" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP045500") & (grepl("Neutrophils", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "Neutrophils" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP045500") & (grepl("Monocytes", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "Monocytes" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP045500") & (grepl("B-cells", 
combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "B-cells" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP045500") & (grepl("NK", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "NK-cells" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP045500"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP045500"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP045500"),"Cancer"]= FALSE + +#Fix SRP076719 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP076719") & (grepl("pbmc", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP076719") & (grepl("ln", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Blood"#in other cases we also put all t-cell to blood regardless of source +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP076719"),"Tissue2"]= "T-cells" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP076719"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP076719"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP076719"),"Cancer"]= FALSE + +#Fix SRP051688 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP051688"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP051688") & (grepl("T cells", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "T-cells" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP051688") & (grepl("B cells", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "B-cells" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP051688") & (grepl("monocytes", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "Monocytes" 
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP051688") & (grepl("NK cells", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "NK-cells" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP051688") & (grepl("PBMC", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "PBMC" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP051688") & (grepl("myeloid DC", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "Dendritic cells" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP051688") & (grepl("neutrophils", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "Neutrophils" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP051688"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP051688"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP051688"),"Cancer"]= FALSE + +#Fix SRP078912 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP078912"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP078912") & (grepl("T cells", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "T-cells" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP078912") & (grepl("Monocyte", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "Monocytes" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP078912"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP078912"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP078912"),"Cancer"]= FALSE + +#Fix SRP110609 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP110609"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP110609") & (grepl("lymphocytes", 
combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP110609") & (grepl("Monocyte", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "Monocytes" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP110609"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP110609"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP110609"),"Cancer"]= FALSE + +#Fix SRP158943 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP158943"),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP158943") & (grepl("cll", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "CLL" #It doesn't state further clasification +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP158943") & (grepl("B cells", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "B-cells" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP158943"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP158943"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP158943") & (grepl("cll", combinedMeta$sra.sample_attributes, ignore.case=T)),"Cancer"]= TRUE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP158943") & (grepl("B cells", combinedMeta$sra.sample_attributes, ignore.case=T)),"Cancer"]= FALSE + +#Fix ERP104864 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP104864") & (grepl("blood", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP104864") & (grepl("synovium", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Synovium" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== 
"ERP104864") & (grepl("blood", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "Whole Blood" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP104864") & (grepl("synovium", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP104864"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP104864"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP104864"),"Cancer"]= FALSE + +#Fix intestine samples +#ERP000546 +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP000546") & (combinedMeta$Tissue=="Intestine"),"Tissue"]= "Colon" +#ERP003613 +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP003613") & (combinedMeta$Tissue=="Intestine") & (grepl("colon", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Colon" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP003613") & (combinedMeta$Tissue=="Intestine") & (grepl("smallintestine", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Small Intestine" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP003613") & (combinedMeta$Tissue=="Intestine") & (grepl("duodenum", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Small Intestine" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP003613") & (combinedMeta$Tissue=="Intestine") & (grepl("duodenum", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "Duodenum" +#ERP006650 +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP006650") & (combinedMeta$Tissue=="Intestine"),"Tissue"]= "Colon" +#SRP039090 +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP039090") & (combinedMeta$Tissue=="Intestine") & (grepl("Small Intestine", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Small Intestine" +#SRP043391 
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP043391") & (combinedMeta$Tissue=="Intestine") & (grepl("colon", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Colon" +#SRP048801 +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP048801") & (combinedMeta$Tissue=="Intestine") & (grepl("ileum", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Small Intestine" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP048801") & (combinedMeta$Tissue=="Intestine") & (grepl("ileum", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue2"]= "Ileum" +#SRP055438 +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP055438") & (combinedMeta$Tissue=="Intestine"),"Tissue"]= "Colon" +#SRP056520 +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP056520") & (combinedMeta$Tissue=="Intestine"),"Tissue"]= "Colon" +#SRP006900 +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP006900") & (combinedMeta$Tissue=="Intestine"),"Tissue"]= "Colon" +#SRP063496 +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP063496") & (combinedMeta$Tissue=="Intestine"),"Tissue"]= "Colon" +#SRP000941 +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP000941") & (combinedMeta$Tissue=="Intestine") & (grepl("colon", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Colon" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP000941") & (combinedMeta$Tissue=="Intestine") & (grepl("small intestine", combinedMeta$sra.sample_attributes, ignore.case=T)),"Tissue"]= "Small Intestine" +#SRP021221 +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP021221") & (combinedMeta$Tissue=="Intestine"),"Tissue"]= "Colon" +#SRP009386 +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "SRP009386") & (combinedMeta$Tissue=="Intestine"),"Tissue"]= "Colon" +#exclude SRP048804 (Cell line) 
+combinedMeta=combinedMeta[!combinedMeta$study=="SRP048804",] +#exclude the remaining sample of Intestine +combinedMeta=combinedMeta[!combinedMeta$Tissue=="Intestine",] + +#Fix ERP109002 Annotations +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Heart", combinedMeta$sra.library_name, ignore.case=T)),"Tissue"]= "Heart" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Heart", combinedMeta$sra.library_name, ignore.case=T)),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Kidney", combinedMeta$sra.library_name, ignore.case=T)),"Tissue"]= "Kidney" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Kidney", combinedMeta$sra.library_name, ignore.case=T)),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Testis", combinedMeta$sra.library_name, ignore.case=T)),"Tissue"]= "Testis" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Testis", combinedMeta$sra.library_name, ignore.case=T)),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Liver", combinedMeta$sra.library_name, ignore.case=T)),"Tissue"]= "Liver" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Liver", combinedMeta$sra.library_name, ignore.case=T)),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Brain", combinedMeta$sra.library_name, ignore.case=T)),"Tissue"]= "Brain" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Brain", combinedMeta$sra.library_name, ignore.case=T)),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Forebrain", combinedMeta$sra.experiment_attributes, ignore.case=T)),"Tissue"]= "Brain" 
+combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Forebrain", combinedMeta$sra.experiment_attributes, ignore.case=T)),"Tissue2"]= "Forebrain" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Hindbrain", combinedMeta$sra.experiment_attributes, ignore.case=T)),"Tissue"]= "Brain" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Hindbrain", combinedMeta$sra.experiment_attributes, ignore.case=T)),"Tissue2"]= "Hindbrain" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Ovary", combinedMeta$sra.library_name, ignore.case=T)),"Tissue"]= "Ovary" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("Ovary", combinedMeta$sra.library_name, ignore.case=T)),"Tissue2"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002"),"Cellline"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002"),"CelllineName"]= "" +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002"),"Cancer"]= FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002"),"Fetal"] <- FALSE +combinedMeta[!is.na(combinedMeta$study) & (combinedMeta$study== "ERP109002") & (grepl("embryo", combinedMeta$sra.sample_attributes, ignore.case=T)),"Fetal"]= TRUE + + +studySamples <- combinedMeta$study %in% c("SRP105369", "ERP006121", "SRP062144") +combinedMeta$Tissue[studySamples] <- "" +combinedMeta$Tissue2[studySamples] <- "AML" +combinedMeta$Cellline[studySamples] <- FALSE +combinedMeta$CelllineName[studySamples] <- "" +combinedMeta$Cancer[studySamples] <- TRUE + + + +studySamples <- combinedMeta$study %in% c("SRP221351", "SRP110313", "SRP115151", "SRP133278", "SRP156583", "SRP201603") +combinedMeta$Tissue[studySamples] <- "Blood" +combinedMeta$Tissue2[studySamples] <- "B-cells" +combinedMeta$Cellline[studySamples] <- FALSE 
+combinedMeta$CelllineName[studySamples] <- "" +combinedMeta$Cancer[studySamples] <- FALSE + + +studySamples <- combinedMeta$study %in% c("ERP107715", "ERP111116", "SRP092158", "SRP133442", "SRP065795", "SRP119636") +combinedMeta$Tissue[studySamples] <- "" +combinedMeta$Tissue2[studySamples] <- "" +combinedMeta$Cellline[studySamples] <- NA +combinedMeta$CelllineName[studySamples] <- "" +combinedMeta$Cancer[studySamples] <- TRUE + +studySamples <- combinedMeta$study %in% c("ERP109703", "SRP100686", "SRP161505") +combinedMeta$Tissue[studySamples] <- "Blood" +combinedMeta$Tissue2[studySamples] <- "CLL" +combinedMeta$Cellline[studySamples] <- NA +combinedMeta$CelllineName[studySamples] <- "" +combinedMeta$Cancer[studySamples] <- TRUE + + + +studySamples <- combinedMeta$study == "SRP123604" +combinedMeta$Tissue[studySamples] <- "Colon" +combinedMeta$Tissue2[studySamples] <- "" +combinedMeta$Cellline[studySamples] <- FALSE +combinedMeta$CelllineName[studySamples] <- "" +combinedMeta$Cancer[studySamples] <- TRUE + + + +studySamples <- combinedMeta$study %in% c("ERP113862", "ERP002323", "ERP114921", "SRP051368", "SRP097893", "SRP101856", "SRP151577") +combinedMeta$Tissue[studySamples] <- "Blood" +combinedMeta$Tissue2[studySamples] <- "Dendritic cells" +combinedMeta$Cellline[studySamples] <- FALSE +combinedMeta$CelllineName[studySamples] <- "" +combinedMeta$Cancer[studySamples] <- FALSE + + + +studySamples <- combinedMeta$study %in% c("SRP056733", "SRP062278", "SRP064515", "SRP074274", "SRP076097", "SRP095287", "SRP103821", "SRP109107", "SRP110187", "SRP118741", "SRP118760", "SRP145599", "SRP190161", "SRP218274", "SRP155941") +combinedMeta$Tissue[studySamples] <- "Blood" +combinedMeta$Tissue2[studySamples] <- "Macrophages" +combinedMeta$Cellline[studySamples] <- FALSE +combinedMeta$CelllineName[studySamples] <- "" +combinedMeta$Cancer[studySamples] <- FALSE + + +studySamples <- combinedMeta$study %in% c("ERP020977", "ERP022909") +combinedMeta$Tissue[studySamples] <- "Blood" 
+combinedMeta$Tissue2[studySamples] <- "Macrophages-iPSC" +combinedMeta$Cellline[studySamples] <- FALSE +combinedMeta$CelllineName[studySamples] <- "" +combinedMeta$Cancer[studySamples] <- FALSE + + + +studySamples <- combinedMeta$study %in% c("ERP014531", "SRP041826", "SRP058953", "SRP096201", "SRP113586", "SRP192825", "SRP173842", "SRP045352", "SRP055514", "SRP069333", "SRP101726") +combinedMeta$Tissue[studySamples] <- "Blood" +combinedMeta$Tissue2[studySamples] <- "Monocytes" +combinedMeta$Cellline[studySamples] <- FALSE +combinedMeta$CelllineName[studySamples] <- "" +combinedMeta$Cancer[studySamples] <- FALSE + + +studySamples <- combinedMeta$study %in% c("SRP150456") +combinedMeta$Tissue[studySamples] <- "Nasal Lavage" +combinedMeta$Tissue2[studySamples] <- "" +combinedMeta$Cellline[studySamples] <- FALSE +combinedMeta$CelllineName[studySamples] <- "" +combinedMeta$Cancer[studySamples] <- FALSE + + + +studySamples <- combinedMeta$study %in% c("SRP102104", "SRP162654", "SRP042596", "SRP049605", "SRP074736", "SRP090282", "SRP125882", "SRP140711", "SRP162023", "SRP168421", "SRP201023", "SRP212077", "SRP140558") +combinedMeta$Tissue[studySamples] <- "Blood" +combinedMeta$Tissue2[studySamples] <- "PBMC" +combinedMeta$Cellline[studySamples] <- FALSE +combinedMeta$CelllineName[studySamples] <- "" +combinedMeta$Cancer[studySamples] <- FALSE + + +studySamples <- combinedMeta$study %in% c("SRP072980", "SRP169062", "SRP086613", "SRP092010", "ERP105662", "SRP093990", "SRP215282", "SRP032926", "SRP053186", "SRP059057", "SRP098715", "SRP101784", "SRP117629", "SRP140710", "SRP155217", "SRP158900", "SRP192607") +combinedMeta$Tissue[studySamples] <- "Blood" +combinedMeta$Tissue2[studySamples] <- "T-cells" +combinedMeta$Cellline[studySamples] <- FALSE +combinedMeta$CelllineName[studySamples] <- "" +combinedMeta$Cancer[studySamples] <- FALSE + +#SRP081020 is mis-anotated in SRA as PBMC, paper and clustering both state wholeblood +studySamples <- combinedMeta$study %in% 
c("ERP114104", "SRP051848", "SRP056784", "SRP071965", "SRP077975", "SRP081020", "SRP098758", "SRP113245", "SRP126580", "SRP126582", "SRP126583", "SRP136057", "SRP144583", "SRP150872", "SRP214077", "SRP056443") +combinedMeta$Tissue[studySamples] <- "" +combinedMeta$Tissue2[studySamples] <- "" +combinedMeta$Cellline[studySamples] <- NA +combinedMeta$CelllineName[studySamples] <- "" +combinedMeta$Cancer[studySamples] <- NA + + + + + + +combinedMeta$Cellline[!is.na(combinedMeta$CelllineName)&combinedMeta$CelllineName=="iPSC"] <- TRUE +combinedMeta$Cancer[!is.na(combinedMeta$Cellline) & combinedMeta$Cellline] <- NA + + +table(paste0(combinedMeta$Tissue, " - ", combinedMeta$Tissue2, " - ", combinedMeta$Fetal)[combinedMeta$study == "ERP109002"]) + +#Exclude spike in +combinedMeta$exclude[combinedMeta$study == "SRP041955"] <- TRUE + +(x <- table(paste0(combinedMeta$Tissue, " - ", combinedMeta$Tissue2),combinedMeta$Cancer)) +write.table(x, file = "test.txt", row.names = T, col.names = NA, quote = F, sep = "\t") + + +table(paste0(combinedMeta$Tissue, " - ", combinedMeta$Tissue2),combinedMeta$Cancer) + +table(combinedMeta$Tissue) +table(combinedMeta$Tissue2) + +table(combinedMeta$Tissue, combinedMeta$Cellline) + +table(combinedMeta$Cancer) + +table(combinedMeta$Tissue, combinedMeta$Cancer) + +table(combinedMeta$Tissue[combinedMeta$Cohort == "TCGA"], combinedMeta$Cancer[combinedMeta$Cohort == "TCGA"]) + + +sum(combinedMeta$gado.TissueType %in% combinedMeta$gado.PlotClass) + +sum((!is.na(combinedMeta$gado.TissueType) & combinedMeta$gado.TissueType != "")) + +sum( ) +unique(combinedMeta$gado.TissueType) + +table(combinedMeta$gado.PlotClass, useNA = "a") + +combinedMeta$Tissue[combinedMeta$Cohort == "GSA"] + + + +#save(combinedMeta, file = "combinedMeta_2022_09_15.RData") + +load(file = "combinedMeta_2022_08_19.RData") + +pcsAndMeta <- merge(expPcs[,1:100], combinedMeta, by = 0, all.x = T) +dim(pcsAndMeta) +str(combinedMeta) + +tissueCol <- 
read.delim("Recount3_QC_2ndRun/SRA_Studies_Annotations_Patrick/Annotations_color2.txt", row.names = 1) + +sum(unique(pcsAndMeta[,"Tissue"]) %in% tissueCol$PlotClass) +sum(unique(pcsAndMeta[,"Tissue2"]) %in% tissueCol$PlotClass) + +x <- unique(pcsAndMeta[,"Tissue2"]) +x[!x %in% tissueCol$PlotClass] + + +defaultCol <- adjustcolor("grey", alpha.f = 0.6) +pcsAndMeta$col <- defaultCol + +tissueAndCol <- pcsAndMeta[,"Tissue"] %in% tissueCol$PlotClass + +pcsAndMeta$col[tissueAndCol] <- adjustcolor(tissueCol$col[match(pcsAndMeta[tissueAndCol,"Tissue"], tissueCol$PlotClass)], alpha.f = 0.6) + + +tissue2AndCol <- pcsAndMeta[,"Tissue2"] %in% tissueCol$PlotClass +sum(tissue2AndCol) +pcsAndMeta$col[tissue2AndCol] <- adjustcolor(tissueCol$col[match(pcsAndMeta[tissue2AndCol,"Tissue2"], tissueCol$PlotClass)], alpha.f = 0.6) + +table(pcsAndMeta[pcsAndMeta[,"PC_2"] >= 0,"Tissue2"]) + + +sum(is.na(tolower(pcsAndMeta[,"Tissue"]) %in% tolower(tissueCol$PlotClass))) + +#pcsAndMeta$col <- tissueCol$col[match(tolower(pcsAndMeta[,"Tissue"]), tolower(tissueCol$PlotClass), nomatch = nrow(tissueCol))] + +plotOrder <- order((pcsAndMeta$col != defaultCol) + 1) + +rpng(width = 800, height = 800) +#pdf(file = "test.pdf") +plot(pcsAndMeta[plotOrder,"PC_1"], pcsAndMeta[plotOrder,"PC_2"], col = pcsAndMeta$col[plotOrder], cex = 0.3, pch = 16) +dev.off() + + +rpng(width = 800, height = 800) +#pdf(file = "test.pdf") +plot(pcsAndMeta[plotOrder,"PC_3"], pcsAndMeta[plotOrder,"PC_7"], col = pcsAndMeta$col[plotOrder], cex = 0.3, pch = 16) +dev.off() + + +#rpng(width = 800, height = 800) +png("tissues.png",width = 2000, height = 2000) +pairs(pcsAndMeta[plotOrder,paste0("PC_",1:5)], col = pcsAndMeta$col[plotOrder], cex = 0.4, upper.panel = NULL, pch = 16) +dev.off() + +png("tissues2.png",width = 2000, height = 2000) +pairs(pcsAndMeta[plotOrder,paste0("PC_",6:10)], col = pcsAndMeta$col[plotOrder], cex = 0.4, upper.panel = NULL) +dev.off() + +png("tissues3.png",width = 2000, height = 2000) 
+pairs(pcsAndMeta[plotOrder,paste0("PC_",11:15)], col = pcsAndMeta$col[plotOrder], cex = 0.4, upper.panel = NULL) +dev.off() + +defaultCol <- adjustcolor("grey", alpha.f = 0.3) +pcsAndMeta$colCelline <- defaultCol +pcsAndMeta$colCelline[!is.na(pcsAndMeta[,"Cellline"]) & pcsAndMeta[,"Cellline"]] <- adjustcolor("magenta", alpha.f = 0.3) +pcsAndMeta$colCelline[!is.na(pcsAndMeta[,"Cellline"]) & !pcsAndMeta[,"Cellline"]] <- adjustcolor("royalblue1", alpha.f = 0.3) +pcsAndMeta$colCelline[!is.na(pcsAndMeta[,"Cancer"]) & pcsAndMeta[,"Cancer"]] <- adjustcolor("forestgreen", alpha.f = 0.3) +plotOrder <- order((pcsAndMeta$colCelline != defaultCol) + 1) + + +pcsAndMeta$cellineTissueCancer <- "Unkown" +pcsAndMeta$cellineTissueCancer[!is.na(pcsAndMeta[,"Cellline"]) & pcsAndMeta[,"Cellline"]] <- "Cellline" +pcsAndMeta$cellineTissueCancer[!is.na(pcsAndMeta[,"Cellline"]) & !pcsAndMeta[,"Cellline"]] <- "Tissue" +pcsAndMeta$cellineTissueCancer[!is.na(pcsAndMeta[,"Cancer"]) & pcsAndMeta[,"Cancer"]] <- "Cancer" + +pcsAndMeta$cellineTissueCancer <- factor(pcsAndMeta$cellineTissueCancer, levels = c("Tissue", "Cancer", "Cellline", "Unkown")) + +table(pcsAndMeta$cellineTissueCancer, useNA = "always") + +rpng(width = 800, height = 800) +plot(pcsAndMeta[plotOrder,"PC_1"], pcsAndMeta[plotOrder,"PC_2"], col = pcsAndMeta$colCelline[plotOrder], cex = 0.4, pch = 16) +dev.off() + +rpng(width = 800, height = 800) +plot(pcsAndMeta[plotOrder,"PC_3"], pcsAndMeta[plotOrder,"PC_75"], col = pcsAndMeta$colCelline[plotOrder], cex = 0.4, pch = 16) +dev.off() + +for(i in c(1,3:100)){ + png(paste0("cellinePlots/pc",i,".png"),width = 1000, height = 1000) + #rpng() + plot(pcsAndMeta[plotOrder,"PC_2"], pcsAndMeta[plotOrder,paste0("PC_",i)], col = pcsAndMeta$colCelline[plotOrder], cex = 1, pch = 16, xlab = "PC2", ylab = paste0("PC", i)) + dev.off() +} + +library(vioplot) + +for(i in 1:100){ +png(paste0("cellinePlots2/pc",i,".png"),width = 500, height = 500) +vioplot( pcsAndMeta[,paste0("PC_",i)] ~ 
pcsAndMeta$cellineTissueCancer, col = c(adjustcolor("royalblue1", alpha.f = 0.3), adjustcolor("forestgreen", alpha.f = 0.3), adjustcolor("magenta", alpha.f = 0.3), defaultCol)) +dev.off() +} +table(paste0(combinedMeta$Tissue, " - ", combinedMeta$Tissue2)) + +png("celllines_c.png",width = 2000, height = 2000) +pairs(pcsAndMeta[plotOrder,paste0("PC_",1:5, "_c")], col = pcsAndMeta$colCelline[plotOrder], cex = 0.4, upper.panel = NULL, pch = 16) +dev.off() + + +png("celllines2.png",width = 2000, height = 2000) +pairs(pcsAndMeta[plotOrder,paste0("PC_",6:10, "")], col = pcsAndMeta$colCelline[plotOrder], cex = 0.4, upper.panel = NULL, pch = 16) +dev.off() + + +defaultCol <- adjustcolor("grey", alpha.f = 0.3) +pcsAndMeta$colCancer <- defaultCol +pcsAndMeta$colCancer[!is.na(pcsAndMeta[,"Cancer"]) & pcsAndMeta[,"Cancer"]] <- adjustcolor("chartreuse1", alpha.f = 0.6) +plotOrder <- order((pcsAndMeta$colCancer != defaultCol) + 1) + +rpng(width = 800, height = 800) +plot(pcsAndMeta[plotOrder,"PC_1"], pcsAndMeta[plotOrder,"PC_2"], col = pcsAndMeta$colCancer[plotOrder], cex = 0.4) +dev.off() + +png("cancers.png",width = 2000, height = 2000) +pairs(pcsAndMeta[plotOrder,paste0("PC_",1:5)], col = pcsAndMeta$colCancer[plotOrder], cex = 0.4, upper.panel = NULL, pch = 16) +dev.off() + +library(pROC) +cancerAuc <- apply(pcsAndMeta[,paste0("PC_",1:100)], 2, function(x){as.numeric(auc(response = pcsAndMeta$Cancer, predictor = x))}) +sort(cancerAuc) + +rpng(width = 800, height = 800) +plot(pcsAndMeta[plotOrder,"PC_33"], pcsAndMeta[plotOrder,"PC_9"], col = pcsAndMeta$col[plotOrder], cex = 0.4) +dev.off() + + +library(pROC) +cancerAuc <- apply(pcsAndMeta[,paste0("PC_",1:100)], 2, function(x){as.numeric(auc(response = pcsAndMeta$Cancer, predictor = x))}) +sort(cancerAuc) + +celllineAuc <- apply(pcsAndMeta[,paste0("PC_",1:100)], 2, function(x){as.numeric(auc(response = pcsAndMeta$Cellline, predictor = x))}) +sort(celllineAuc) + + +rpng(width = 800, height = 800) 
+plot(pcsAndMeta[plotOrder,"PC_33"], pcsAndMeta[plotOrder,"PC_9"], col = pcsAndMeta$colCancer[plotOrder], cex = 0.4) +dev.off() + +rpng() +pairs(pcsAndMeta[plotOrder,paste0("PC_",c(33,32,9,10,21))], col = pcsAndMeta$colCancer[plotOrder], cex = 0.4, upper.panel = NULL) +dev.off() + + +rpng() +pairs(pcsAndMeta[plotOrder,paste0("PC_",c(3,2,27,6,20))], col = pcsAndMeta$colCelline[plotOrder], cex = 0.4, upper.panel = NULL) +dev.off() + +combinedMeta$sra.sample_title <- gsub("\"", "", combinedMeta$sra.sample_title) + +tmp <- merge(combinedMeta,pcs[,1:100], by = 0, all.y = T) +dim(tmp) +write.table(tmp, file = "tmpAnnotations.txt", sep = "\t", quote = FALSE, col.names = NA) + +qseq <- read.delim("quantseqSamples.txt")[,1] +str(qseq) + + +defaultCol <- adjustcolor("grey", alpha.f = 0.3) +pcsAndMeta$colQseq <- defaultCol +pcsAndMeta$colQseq[pcsAndMeta$Row.names %in% qseq] <- "orangered" +plotOrderQseq <- order((pcsAndMeta$colQseq != defaultCol) + 1) + +plot(pcsAndMeta[plotOrderQseq,"PC_1"], pcsAndMeta[plotOrderQseq,"PC_2"], col = pcsAndMeta$colQseq[plotOrderQseq], cex = 0.4, main = "Quantseq") + +plot(pcsAndMeta[plotOrderQseq,"PC_6"], pcsAndMeta[plotOrderQseq,"PC_1"], col = pcsAndMeta$colQseq[plotOrderQseq], cex = 0.4, main = "Quantseq") + + +table(pcsAndMeta$sra.library_layout) + + +numColumns <- unlist(lapply(combinedMeta, is.numeric)) + + +combinedMetaMatrix <- as.matrix(combinedMeta[,numColumns]) + +library(pROC) + +qseqClass <- as.factor(rownames(combinedMeta) %in% qseq) +table(qseqClass) +dim(combinedMetaMatrix) +qseqPValues <- apply(combinedMetaMatrix,2,function(x){ + tryCatch( + { + #wilcox.test(x ~ qseqClass)$p.value + as.numeric(auc(response = qseqClass, predictor = x)) + }, + error=function(cond){return(1)} + ) +}) +sort(qseqPValues) + + +boxplot(combinedMetaMatrix[,"recount_qc.bc_frag.kallisto_mean_length"] ~ qseqClass ) + + + + + + + + +plot(pcsAndMeta[plotOrderQseq,"recount_seq_qc.%c"], 
log10(pcsAndMeta[plotOrderQseq,"recount_qc.star.number_of_splices:_total"]), col = pcsAndMeta$colQseq[plotOrderQseq], cex = 0.6) +10^5.2 +abline(h=log10(150000)) +abline(v=60) +log10(10^5) + +plot(log10(pcsAndMeta[plotOrderQseq,"recount_qc.star.number_of_splices:_total"]), pcsAndMeta[plotOrderQseq,"recount_qc.bc_frag.kallisto_mean_length"], col = pcsAndMeta$colQseq[plotOrderQseq], cex = 0.4) + +plot(pcsAndMeta[plotOrderQseq,"PC_6"], log10(pcsAndMeta[plotOrderQseq,"recount_qc.star.number_of_splices:_total"]), col = pcsAndMeta$colQseq[plotOrderQseq], cex = 0.4) + + +pc1Cor <- cor(pcsAndMeta[,"PC_1"], pcsAndMeta[,numColumns], use = "pairwise.complete.obs") +sort(pc1Cor[1,]) + +pc6Cor <- apply(combinedMetaMatrix[pcsAndMeta$Row.names,],2,function(x){ + tryCatch( + { + #wilcox.test(x ~ qseqClass)$p.value + cor(pcsAndMeta[,"PC_6"], x, use = "pairwise.complete.obs") + }, + error=function(cond){return(0)}, + warning=function(cond){return(0)} + ) +}) +sort(pc6Cor^2) + +load("testPcPAtrickFrist100.RData", verbose = T) +colnames(expPcs) +str(expPcs) +colnames(expPcs) <- paste0("PC_", 1:ncol(expPcs)) +pcsAndMeta <- merge(expPcs, combinedMeta, by = 0, all.x = T) +dim(pcsAndMeta) + + + +load("gadoPca.RData", verbose = T) +colnames(expGadoPcsSub) +str(expGadoPcsSub) +colnames(expGadoPcsSub) <- paste0("PC_", 1:ncol(expGadoPcsSub)) +pcsAndMeta <- merge(expGadoPcsSub, combinedMeta, by = 0, all.x = T) +dim(pcsAndMeta) + + + +table(pcsAndMeta$CelllineName) +pcsAndMeta$Cellline[grepl("s4u", pcsAndMeta$CelllineName)] + + + + + + diff --git a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/doPcaUsingCorMatrix.R b/Downstreamer/src/main/r/downstreamer_main/recount3/doPcaUsingCorMatrix.R similarity index 100% rename from Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/doPcaUsingCorMatrix.R rename to Downstreamer/src/main/r/downstreamer_main/recount3/doPcaUsingCorMatrix.R diff --git 
a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/qqNorm.R b/Downstreamer/src/main/r/downstreamer_main/recount3/qqNorm.R similarity index 100% rename from Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/qqNorm.R rename to Downstreamer/src/main/r/downstreamer_main/recount3/qqNorm.R diff --git a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/recountCancerCellline2.R b/Downstreamer/src/main/r/downstreamer_main/recount3/recountCancerCellline2.R similarity index 100% rename from Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/recountCancerCellline2.R rename to Downstreamer/src/main/r/downstreamer_main/recount3/recountCancerCellline2.R diff --git a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/umap.R b/Downstreamer/src/main/r/downstreamer_main/recount3/umap.R similarity index 100% rename from Downstreamer/src/main/r/downstreamer_main/legacy_scripts/recount3/umap.R rename to Downstreamer/src/main/r/downstreamer_main/recount3/umap.R From d44990191f0dc0c883d911975540241bd757cdf1 Mon Sep 17 00:00:00 2001 From: Patrick Deelen Date: Thu, 22 Sep 2022 16:29:44 +0200 Subject: [PATCH 07/22] Update README.md --- .../src/main/r/downstreamer_main/recount3/README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/README.md b/Downstreamer/src/main/r/downstreamer_main/recount3/README.md index e69de29bb..696882a71 100644 --- a/Downstreamer/src/main/r/downstreamer_main/recount3/README.md +++ b/Downstreamer/src/main/r/downstreamer_main/recount3/README.md @@ -0,0 +1,9 @@ +# Processing of recount3 data + +## Downloading and extracting the expression values + + +## QQ normalization of the data + + +## PCA on the co-expression matrix \ No newline at end of file From 699c15de5756257a621d03abd7eb03b67b018741 Mon Sep 17 00:00:00 2001 From: Patrick Deelen Date: Thu, 22 Sep 2022 16:37:48 +0200 Subject: [PATCH 08/22] refactor --- 
.../main/r/downstreamer_main/recount3/README.md | 15 +++++++++++++-- .../recount3/{umap.R => createUmapOfRecount3.R} | 0 ...ns.R => harmonizeAndExtentSampleAnnotations.R} | 0 ...CorMatrix.R => performPcaOnFullRecount3Data.R} | 0 ...lline2.R => predictRecount3CancerCellllines.R} | 0 .../recount3/{qqNorm.R => qqNormOfFullRecount3.R} | 0 6 files changed, 13 insertions(+), 2 deletions(-) rename Downstreamer/src/main/r/downstreamer_main/recount3/{umap.R => createUmapOfRecount3.R} (100%) rename Downstreamer/src/main/r/downstreamer_main/recount3/{combineAnnotations.R => harmonizeAndExtentSampleAnnotations.R} (100%) rename Downstreamer/src/main/r/downstreamer_main/recount3/{doPcaUsingCorMatrix.R => performPcaOnFullRecount3Data.R} (100%) rename Downstreamer/src/main/r/downstreamer_main/recount3/{recountCancerCellline2.R => predictRecount3CancerCellllines.R} (100%) rename Downstreamer/src/main/r/downstreamer_main/recount3/{qqNorm.R => qqNormOfFullRecount3.R} (100%) diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/README.md b/Downstreamer/src/main/r/downstreamer_main/recount3/README.md index 696882a71..5edb92e21 100644 --- a/Downstreamer/src/main/r/downstreamer_main/recount3/README.md +++ b/Downstreamer/src/main/r/downstreamer_main/recount3/README.md @@ -1,9 +1,20 @@ # Processing of recount3 data -## Downloading and extracting the expression values +## Downloading and extracting the expression values and meta data + +## Harmonize and expand the annotations + +In `harmonizeAndExtentSampleAnnotations.R` we expand and map the sample annotations. + +For a large part the annotations were added manually. 
+ ## QQ normalization of the data -## PCA on the co-expression matrix \ No newline at end of file +## PCA on the co-expression matrix + + +## Excluding the cell line and cancer samples + diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/umap.R b/Downstreamer/src/main/r/downstreamer_main/recount3/createUmapOfRecount3.R similarity index 100% rename from Downstreamer/src/main/r/downstreamer_main/recount3/umap.R rename to Downstreamer/src/main/r/downstreamer_main/recount3/createUmapOfRecount3.R diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/combineAnnotations.R b/Downstreamer/src/main/r/downstreamer_main/recount3/harmonizeAndExtentSampleAnnotations.R similarity index 100% rename from Downstreamer/src/main/r/downstreamer_main/recount3/combineAnnotations.R rename to Downstreamer/src/main/r/downstreamer_main/recount3/harmonizeAndExtentSampleAnnotations.R diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/doPcaUsingCorMatrix.R b/Downstreamer/src/main/r/downstreamer_main/recount3/performPcaOnFullRecount3Data.R similarity index 100% rename from Downstreamer/src/main/r/downstreamer_main/recount3/doPcaUsingCorMatrix.R rename to Downstreamer/src/main/r/downstreamer_main/recount3/performPcaOnFullRecount3Data.R diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/recountCancerCellline2.R b/Downstreamer/src/main/r/downstreamer_main/recount3/predictRecount3CancerCellllines.R similarity index 100% rename from Downstreamer/src/main/r/downstreamer_main/recount3/recountCancerCellline2.R rename to Downstreamer/src/main/r/downstreamer_main/recount3/predictRecount3CancerCellllines.R diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/qqNorm.R b/Downstreamer/src/main/r/downstreamer_main/recount3/qqNormOfFullRecount3.R similarity index 100% rename from Downstreamer/src/main/r/downstreamer_main/recount3/qqNorm.R rename to Downstreamer/src/main/r/downstreamer_main/recount3/qqNormOfFullRecount3.R From 
7fbe2deea7ed850eb205b497b5786b56cafaff7e Mon Sep 17 00:00:00 2001 From: Patrick Deelen Date: Sat, 24 Sep 2022 11:05:29 +0200 Subject: [PATCH 09/22] recount3 --- .../recount3/createUmapOfRecount3.R | 8 ++- .../recount3/extractRawReadCounts.R | 67 +++++++++++++++++++ 2 files changed, 73 insertions(+), 2 deletions(-) create mode 100644 Downstreamer/src/main/r/downstreamer_main/recount3/extractRawReadCounts.R diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/createUmapOfRecount3.R b/Downstreamer/src/main/r/downstreamer_main/recount3/createUmapOfRecount3.R index 73b276d03..7cd2549fd 100644 --- a/Downstreamer/src/main/r/downstreamer_main/recount3/createUmapOfRecount3.R +++ b/Downstreamer/src/main/r/downstreamer_main/recount3/createUmapOfRecount3.R @@ -547,7 +547,8 @@ for(tissueClass in levels(umapAndMeta$umapFactor)){ } dev.off() -save(umapAndMeta, file = "tissuePredictions_16_09_22.RData") +#save(umapAndMeta, file = "tissuePredictions/tissuePredictions_16_09_22.RData") +load("tissuePredictions/tissuePredictions_16_09_22.RData") unique(umapAndMeta$predictedTissue)[!unique(umapAndMeta$predictedTissue) %in% rownames(tissueCol)] @@ -566,8 +567,11 @@ samplesWithPrediction <- umapAndMeta[!is.na(umapAndMeta$predictedTissue) & !umap )] colnames(samplesWithPrediction)[3] <- "annotatedTissue" str(samplesWithPrediction) -save(samplesWithPrediction, file = "samplesWithPrediction_16_09_22.RData") +#save(samplesWithPrediction, file = "tissuePredictions/samplesWithPrediction_16_09_22.RData") +load("tissuePredictions/samplesWithPrediction_16_09_22.RData") +str(samplesWithPrediction) +table(samplesWithPrediction$predictedTissue) load(file = "umap/sampleUmap6.RData", verbose = T) diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/extractRawReadCounts.R b/Downstreamer/src/main/r/downstreamer_main/recount3/extractRawReadCounts.R new file mode 100644 index 000000000..db6535abf --- /dev/null +++ 
b/Downstreamer/src/main/r/downstreamer_main/recount3/extractRawReadCounts.R @@ -0,0 +1,67 @@ +#srun --cpus-per-task=1 --mem=50gb --nodes=1 --qos=priority --time=168:00:00 --pty bash -i +#remoter::server(verbose = T, port = 55556, password = "laberkak", sync = T) + + +remoter::client("localhost", port = 55501, password = "laberkak") + + + +setwd("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/") + + + +sraFiles <- list.files(path="rse-sra/SRA_Files/", pattern="sra*", full.names=TRUE, recursive=FALSE) +gtexFiles <- list.files(path="rse-gtex/rse_gtex", pattern="rse*", full.names=TRUE, recursive=FALSE) +allFiles <- c(sraFiles, gtexFiles, "rse-tcga/rseTCGA.rda") + +load("tissuePredictions/samplesWithPrediction_16_09_22.RData") +selectedSamples <- rownames(samplesWithPrediction) +str(selectedSamples) + + +#file = sraFiles[10] + + + +perChunkExp <- sapply(allFiles, function(file){ + + loadedObject <- load(file) + + sreObjects <- get(loadedObject[1]) + + #sreObject <- sreObject[[1]] + + perStudyExp <- lapply(sreObjects, function(sreObject){ + studyExp <- sreObject@assays@data@listData$raw_counts + return(studyExp[,colnames(studyExp) %in% selectedSamples, drop = F]) + }) + + return(do.call(cbind, perStudyExp)) + +}) + +selectedSamplesExp <- do.call(cbind, perChunkExp) + +all(selectedSamples %in% colnames(selectedSamplesExp )) +table(selectedSamples %in% colnames(selectedSamplesExp )) + +#Some samples are duplicated in the chunks, now make sure only one is in the matrix +uniqueSamplesIndex <- match(selectedSamples, colnames(selectedSamplesExp)) +selectedSamplesExp <- selectedSamplesExp[,uniqueSamplesIndex] + +save(selectedSamplesExp, file = "tissuePredictions/selectedSamplesRawExpression.RData") + +tissueClasses <- levels(samplesWithPrediction$predictedTissue) + +tissue <- tissueClasses[1] + +rownames(samplesWithPrediction)[!rownames(samplesWithPrediction) %in% colnames(selectedSamplesExp)] +table(rownames(samplesWithPrediction) %in% colnames(selectedSamplesExp)) 
+dim(samplesWithPrediction) +dim(selectedSamplesExp) + +perTissueExp <- lapply(tissueClasses, function(tissue){ + tissueSamples <- rownames(samplesWithPrediction)[samplesWithPrediction$predictedTissue == tissue] + tissueExp <- selectedSamplesExp[,tissueSamples] +}) + From 57238c16bcb67c337651fc5f199cc5577943b180 Mon Sep 17 00:00:00 2001 From: Patrick Deelen Date: Thu, 29 Sep 2022 09:14:51 +0200 Subject: [PATCH 10/22] recount3 --- .gitignore | 1 + .../recount3/createUmapOfRecount3.R | 8 +- .../recount3/extractRawReadCounts.R | 45 +++++---- .../harmonizeAndExtentSampleAnnotations.R | 2 +- .../normalizeRecount3PerTissueAndDoPca.R | 92 +++++++++++++++++++ .../recount3/performPcaOnFullRecount3Data.R | 9 +- 6 files changed, 128 insertions(+), 29 deletions(-) create mode 100644 Downstreamer/src/main/r/downstreamer_main/recount3/normalizeRecount3PerTissueAndDoPca.R diff --git a/.gitignore b/.gitignore index 1e72b30bf..7668984af 100644 --- a/.gitignore +++ b/.gitignore @@ -35,3 +35,4 @@ deconvolutionTestResults/ .Rproj.user .Rhistory /DEPICT2/src/main/r/downstreamer_main/downstreamer_main.Rproj +Downstreamer/src/main/r/downstreamer_main/.remoterserverlog diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/createUmapOfRecount3.R b/Downstreamer/src/main/r/downstreamer_main/recount3/createUmapOfRecount3.R index 7cd2549fd..2ff4218fb 100644 --- a/Downstreamer/src/main/r/downstreamer_main/recount3/createUmapOfRecount3.R +++ b/Downstreamer/src/main/r/downstreamer_main/recount3/createUmapOfRecount3.R @@ -11,11 +11,11 @@ library(uwot) setwd("D:\\UMCG\\Genetica\\Projects\\Depict2Pgs\\Recount3\\") setwd("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/") -tissueCol <- read.delim("umap/col.txt", row.names = 1, na.strings = "") load(file = "DataForPredictions.RData") rownames(pcsAndMeta) <- pcsAndMeta$Row.names -load(file = "combinedMeta_2022_09_15.RData", verbose = T) +load(file = "tissuePredictions/samplesWithPrediction_16_09_22.RData", verbose = T) +tissueCol <- 
read.delim("umap/col.txt", row.names = 1, na.strings = "") colnamesToUpdate <- colnames(pcsAndMeta)[colnames(pcsAndMeta) %in% colnames(combinedMeta)] @@ -93,7 +93,7 @@ sampleUmap <- umap( n_neighbors = 500, min_dist = 1, init_sdev = 1e-4, learning_rate = 2, spread = 20, - bandwidth = 5, + bandwidth = 10, scale = "scale", local_connectivity = 10, repulsion_strength = 0.5, @@ -548,7 +548,7 @@ for(tissueClass in levels(umapAndMeta$umapFactor)){ dev.off() #save(umapAndMeta, file = "tissuePredictions/tissuePredictions_16_09_22.RData") -load("tissuePredictions/tissuePredictions_16_09_22.RData") +load("tissuePredictions/tissuePredictions_16_09_22.RData", verbose = T) unique(umapAndMeta$predictedTissue)[!unique(umapAndMeta$predictedTissue) %in% rownames(tissueCol)] diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/extractRawReadCounts.R b/Downstreamer/src/main/r/downstreamer_main/recount3/extractRawReadCounts.R index db6535abf..4f1dc35cc 100644 --- a/Downstreamer/src/main/r/downstreamer_main/recount3/extractRawReadCounts.R +++ b/Downstreamer/src/main/r/downstreamer_main/recount3/extractRawReadCounts.R @@ -1,8 +1,8 @@ #srun --cpus-per-task=1 --mem=50gb --nodes=1 --qos=priority --time=168:00:00 --pty bash -i -#remoter::server(verbose = T, port = 55556, password = "laberkak", sync = T) +#remoter::server(verbose = T, port = 55556, sync = T) -remoter::client("localhost", port = 55501, password = "laberkak") +remoter::client("localhost", port = 55501) @@ -12,16 +12,14 @@ setwd("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/") sraFiles <- list.files(path="rse-sra/SRA_Files/", pattern="sra*", full.names=TRUE, recursive=FALSE) gtexFiles <- list.files(path="rse-gtex/rse_gtex", pattern="rse*", full.names=TRUE, recursive=FALSE) -allFiles <- c(sraFiles, gtexFiles, "rse-tcga/rseTCGA.rda") +allFiles <- c(sraFiles, gtexFiles, "rse-tcga/rseTCGA.rda", "rse-tcga/rse_ESCA_TCGA.rda") load("tissuePredictions/samplesWithPrediction_16_09_22.RData") selectedSamples <- 
rownames(samplesWithPrediction) str(selectedSamples) -#file = sraFiles[10] - - +#file = allFiles[10] perChunkExp <- sapply(allFiles, function(file){ @@ -29,7 +27,12 @@ perChunkExp <- sapply(allFiles, function(file){ sreObjects <- get(loadedObject[1]) - #sreObject <- sreObject[[1]] + #sometimes single RSE is not in list. Put in list of one to make code uniform + if(!is.list(sreObjects)){ + sreObjects <- list(sreObjects) + } + + #sreObject <- sreObjects[[1]] perStudyExp <- lapply(sreObjects, function(sreObject){ studyExp <- sreObject@assays@data@listData$raw_counts @@ -40,28 +43,32 @@ perChunkExp <- sapply(allFiles, function(file){ }) -selectedSamplesExp <- do.call(cbind, perChunkExp) +str(sreObject) + +hist(log2(sreObject@assays@data@listData$raw_counts[,1])) +dev.off() + +hist(studyExp[,1]) +dev.off() +sum(!perChunkExp[[1]]==0) +hist(perChunkExp[[1]][,1]) +dev.off() + +selectedSamplesExp <- do.call(cbind, perChunkExp) +str(selectedSamplesExp) all(selectedSamples %in% colnames(selectedSamplesExp )) table(selectedSamples %in% colnames(selectedSamplesExp )) + + #Some samples are duplicated in the chunks, now make sure only one is in the matrix uniqueSamplesIndex <- match(selectedSamples, colnames(selectedSamplesExp)) selectedSamplesExp <- selectedSamplesExp[,uniqueSamplesIndex] -save(selectedSamplesExp, file = "tissuePredictions/selectedSamplesRawExpression.RData") -tissueClasses <- levels(samplesWithPrediction$predictedTissue) -tissue <- tissueClasses[1] +#save(selectedSamplesExp, file = "perTissueNormalization/selectedSamplesRawExpression.RData") -rownames(samplesWithPrediction)[!rownames(samplesWithPrediction) %in% colnames(selectedSamplesExp)] -table(rownames(samplesWithPrediction) %in% colnames(selectedSamplesExp)) -dim(samplesWithPrediction) -dim(selectedSamplesExp) -perTissueExp <- lapply(tissueClasses, function(tissue){ - tissueSamples <- rownames(samplesWithPrediction)[samplesWithPrediction$predictedTissue == tissue] - tissueExp <- 
selectedSamplesExp[,tissueSamples] -}) diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/harmonizeAndExtentSampleAnnotations.R b/Downstreamer/src/main/r/downstreamer_main/recount3/harmonizeAndExtentSampleAnnotations.R index 892d655c4..fdb5022b5 100644 --- a/Downstreamer/src/main/r/downstreamer_main/recount3/harmonizeAndExtentSampleAnnotations.R +++ b/Downstreamer/src/main/r/downstreamer_main/recount3/harmonizeAndExtentSampleAnnotations.R @@ -8,7 +8,7 @@ remoter::client("localhost", port = 55501, password = "laberkak") #save.image("tmp2.RData") - + setwd("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/") setwd("D:\\UMCG\\Genetica\\Projects\\Depict2Pgs\\Recount3\\") #load("tmp2.RData") diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/normalizeRecount3PerTissueAndDoPca.R b/Downstreamer/src/main/r/downstreamer_main/recount3/normalizeRecount3PerTissueAndDoPca.R new file mode 100644 index 000000000..7f2d55cd3 --- /dev/null +++ b/Downstreamer/src/main/r/downstreamer_main/recount3/normalizeRecount3PerTissueAndDoPca.R @@ -0,0 +1,92 @@ +#srun --cpus-per-task=1 --mem=50gb --nodes=1 --qos=priority --time=168:00:00 --pty bash -i +#remoter::server(verbose = T, port = 55556, sync = T) + + +remoter::client("localhost", port = 55501) + +load(file = "perTissueNormalization/selectedSamplesRawExpression.RData") + + +is.wholenumber <- function(x, tol = .Machine$double.eps^0.5) { + abs(x - round(x)) < tol +} +table(is.wholenumber(selectedSamplesExp)) + + +tissueClasses <- unique(samplesWithPrediction$predictedTissue) + +tissue <- tissueClasses[1] + +perTissueExp <- lapply(tissueClasses, function(tissue){ + tissueSamples <- rownames(samplesWithPrediction)[samplesWithPrediction$predictedTissue == tissue] + tissueExp <- selectedSamplesExp[,tissueSamples] + numberOfSamples <- length(tissueSamples) + + includedGenes <- apply(tissueExp, 1, function(x){(sum(x==0)/numberOfSamples) >= 0.5}) + table(includedGenes) + +}) +names(perTissueExp) <- tissueClasses 
+#save(perTissueExp, file = "perTissueNormalization/selectedSamplesRawExpressionPerTissue.RData") + + + + +library(DESeq2) + +perTissueExpRlog <- lapply(tissueClasses, function(tissue){ + + + rawCounts <- perTissueExp[[tissue]] + rlogExp <- rlog(rawCounts) + + x <- apply(log2(rawCounts), 1, mean) + hist(x) + dev.off() + + + +}) + + +perTissueExpRlog <- lapply(tissueClasses, function(tissue){ + + + + +}) + + +#https://stackoverflow.com/questions/18964837/fast-correlation-in-r-using-c-and-parallelization/18965892#18965892 +expScale = exp - rowMeans(exp); +# Standardize each variable +expScale = expScale / sqrt(rowSums(expScale^2)); +expCov = tcrossprod(expScale);#equevelent to correlation due to center scale +range(expCov) +str(expCov) + +expEigen <- eigen(expCov) + +eigenVectors <- expEigen$vectors +colnames(eigenVectors) <- paste0("PC_",1:ncol(eigenVectors)) +rownames(eigenVectors) <- rownames(expScale) +str(eigenVectors) + +eigenValues <- expEigen$values +names(eigenValues) <- paste0("PC_",1:length(eigenValues)) +str(eigenValues) + + + + + + + + + + + + + + + diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/performPcaOnFullRecount3Data.R b/Downstreamer/src/main/r/downstreamer_main/recount3/performPcaOnFullRecount3Data.R index 25d056ae6..aca649870 100644 --- a/Downstreamer/src/main/r/downstreamer_main/recount3/performPcaOnFullRecount3Data.R +++ b/Downstreamer/src/main/r/downstreamer_main/recount3/performPcaOnFullRecount3Data.R @@ -19,14 +19,14 @@ str(exp) #save(exp, file = "/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Recount3_QC_2ndRun/Filtered_Matrices/TPM_log2_QNorm_QCed_CovCorrected_AllCovariates.RData") +#exp contains expression rows genes cols samples +#First center and scale each row #https://stackoverflow.com/questions/18964837/fast-correlation-in-r-using-c-and-parallelization/18965892#18965892 expScale = exp - rowMeans(exp); # Standardize each variable expScale = expScale / sqrt(rowSums(expScale^2)); -expCov = 
tcrossprod(expScale);#equevelent to correlation due to center scale -range(expCov) -str(expCov) +expCov = tcrossprod(expScale);#equivalent to correlation due to center scale expEigen <- eigen(expCov) @@ -41,8 +41,7 @@ str(eigenValues) save(eigenVectors, eigenValues, expFile, file = "/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Recount3_QC_2ndRun/PCA_Patrick/eigen.RData") - - +#Here calculate sample principle components. Number needed is arbritary (no more than eigen vectors) expPcs <- t(expScale) %*% expEigen$vectors[,1:1000] colnames(expPcs) <- paste0("PC_",1:ncol(expPcs)) From 72180c52f5e843c226ac236fdc37f03330cb1417 Mon Sep 17 00:00:00 2001 From: Patrick Deelen Date: Thu, 29 Sep 2022 22:48:25 +0200 Subject: [PATCH 11/22] recount3 --- .../recount3/createUmapOfRecount3.R | 5 +- .../recount3/extractRawReadCounts.R | 10 -- .../normalizeRecount3PerTissueAndDoPca.R | 94 +++++++++---------- 3 files changed, 50 insertions(+), 59 deletions(-) diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/createUmapOfRecount3.R b/Downstreamer/src/main/r/downstreamer_main/recount3/createUmapOfRecount3.R index 2ff4218fb..26932db3f 100644 --- a/Downstreamer/src/main/r/downstreamer_main/recount3/createUmapOfRecount3.R +++ b/Downstreamer/src/main/r/downstreamer_main/recount3/createUmapOfRecount3.R @@ -563,12 +563,15 @@ samplesWithPrediction <- umapAndMeta[!is.na(umapAndMeta$predictedTissue) & !umap "predictedTissue", "predictedTissueScore", "umapFactor", - "misclasified" + "misclasified", + "study", + "sra.library_layout" )] colnames(samplesWithPrediction)[3] <- "annotatedTissue" str(samplesWithPrediction) #save(samplesWithPrediction, file = "tissuePredictions/samplesWithPrediction_16_09_22.RData") +write.table(samplesWithPrediction, file = "samplesWithPrediction.txt") load("tissuePredictions/samplesWithPrediction_16_09_22.RData") str(samplesWithPrediction) table(samplesWithPrediction$predictedTissue) diff --git 
a/Downstreamer/src/main/r/downstreamer_main/recount3/extractRawReadCounts.R b/Downstreamer/src/main/r/downstreamer_main/recount3/extractRawReadCounts.R index 4f1dc35cc..a01f9d656 100644 --- a/Downstreamer/src/main/r/downstreamer_main/recount3/extractRawReadCounts.R +++ b/Downstreamer/src/main/r/downstreamer_main/recount3/extractRawReadCounts.R @@ -45,16 +45,6 @@ perChunkExp <- sapply(allFiles, function(file){ str(sreObject) -hist(log2(sreObject@assays@data@listData$raw_counts[,1])) -dev.off() - -hist(studyExp[,1]) -dev.off() - -sum(!perChunkExp[[1]]==0) -hist(perChunkExp[[1]][,1]) -dev.off() - selectedSamplesExp <- do.call(cbind, perChunkExp) str(selectedSamplesExp) all(selectedSamples %in% colnames(selectedSamplesExp )) diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/normalizeRecount3PerTissueAndDoPca.R b/Downstreamer/src/main/r/downstreamer_main/recount3/normalizeRecount3PerTissueAndDoPca.R index 7f2d55cd3..017840fc4 100644 --- a/Downstreamer/src/main/r/downstreamer_main/recount3/normalizeRecount3PerTissueAndDoPca.R +++ b/Downstreamer/src/main/r/downstreamer_main/recount3/normalizeRecount3PerTissueAndDoPca.R @@ -4,89 +4,87 @@ remoter::client("localhost", port = 55501) -load(file = "perTissueNormalization/selectedSamplesRawExpression.RData") +library(DESeq2) + + +setwd("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/") + +load(file = "perTissueNormalization/selectedSamplesRawExpression.RData", verbose = T) -is.wholenumber <- function(x, tol = .Machine$double.eps^0.5) { - abs(x - round(x)) < tol -} -table(is.wholenumber(selectedSamplesExp)) tissueClasses <- unique(samplesWithPrediction$predictedTissue) -tissue <- tissueClasses[1] +tissueClasses <- tissueClasses[c(1,2,6,14,55)] + perTissueExp <- lapply(tissueClasses, function(tissue){ tissueSamples <- rownames(samplesWithPrediction)[samplesWithPrediction$predictedTissue == tissue] tissueExp <- selectedSamplesExp[,tissueSamples] numberOfSamples <- length(tissueSamples) - includedGenes <- 
apply(tissueExp, 1, function(x){(sum(x==0)/numberOfSamples) >= 0.5}) - table(includedGenes) + includedGenes <- apply(tissueExp, 1, function(x){(sum(x==0)/numberOfSamples) <= 0.5}) + + + tissueExp <- tissueExp[includedGenes,] + + mode(tissueExp) <- "integer" + + rlogExp <- rlog(tissueExp) + + return(rlogExp) }) names(perTissueExp) <- tissueClasses #save(perTissueExp, file = "perTissueNormalization/selectedSamplesRawExpressionPerTissue.RData") - - -library(DESeq2) - -perTissueExpRlog <- lapply(tissueClasses, function(tissue){ +perTissuePca <- lapply(perTissueExp, function(exp){ + #https://stackoverflow.com/questions/18964837/fast-correlation-in-r-using-c-and-parallelization/18965892#18965892 + expScale = exp - rowMeans(exp); + # Standardize each variable + expScale = expScale / sqrt(rowSums(expScale^2)); + expCov = tcrossprod(expScale);#equevelent to correlation due to center scale - rawCounts <- perTissueExp[[tissue]] - rlogExp <- rlog(rawCounts) + expEigen <- eigen(expCov) - x <- apply(log2(rawCounts), 1, mean) - hist(x) - dev.off() - - - -}) - - -perTissueExpRlog <- lapply(tissueClasses, function(tissue){ + eigenVectors <- expEigen$vectors + colnames(eigenVectors) <- paste0("PC_",1:ncol(eigenVectors)) + rownames(eigenVectors) <- rownames(expScale) + eigenValues <- expEigen$values + names(eigenValues) <- paste0("PC_",1:length(eigenValues)) + #Here calculate sample principle components. 
Number needed is arbritary (no more than eigen vectors) + expPcs <- t(expScale) %*% expEigen$vectors[,1:10] + colnames(expPcs) <- paste0("PC_",1:ncol(expPcs)) + return(list(eigenVectors, eigenValues, expPcs)) }) +#save(rlogExp, file = "perTissueNormalization/tmpTestRlog.RData") +#load(file = "perTissueNormalization/tmpTestRlog.RData") -#https://stackoverflow.com/questions/18964837/fast-correlation-in-r-using-c-and-parallelization/18965892#18965892 -expScale = exp - rowMeans(exp); -# Standardize each variable -expScale = expScale / sqrt(rowSums(expScale^2)); -expCov = tcrossprod(expScale);#equevelent to correlation due to center scale -range(expCov) -str(expCov) - -expEigen <- eigen(expCov) - -eigenVectors <- expEigen$vectors -colnames(eigenVectors) <- paste0("PC_",1:ncol(eigenVectors)) -rownames(eigenVectors) <- rownames(expScale) -str(eigenVectors) - -eigenValues <- expEigen$values -names(eigenValues) <- paste0("PC_",1:length(eigenValues)) -str(eigenValues) - - - - - +exp <- rlogExp +tissueSamplesInfo <- samplesWithPrediction[rownames(expPcs),] +str(tissueSamplesInfo) +studies <- length(unique(tissueSamplesInfo$study)) +library(viridisLite, lib.loc = .libPaths()[2]) +palette(adjustcolor(viridis(studies, option = "H"), alpha.f = 0.5)) +pchMap <- rep(c(15,17,19), length.out = studies) +rpng(width = 1000, height = 1000) +plot(expPcs[,1],expPcs[,2], col = as.factor(tissueSamplesInfo$study), pch = pchMap[as.factor(tissueSamplesInfo$study)], cex = 2) +dev.off() From f49660d2f5f6c5b3b4c557de52f639dc21caa4db Mon Sep 17 00:00:00 2001 From: Patrick Deelen Date: Mon, 10 Oct 2022 11:17:48 +0200 Subject: [PATCH 12/22] Update normalizeRecount3PerTissueAndDoPca.R --- .../normalizeRecount3PerTissueAndDoPca.R | 86 +++++++++++++++---- 1 file changed, 70 insertions(+), 16 deletions(-) diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/normalizeRecount3PerTissueAndDoPca.R b/Downstreamer/src/main/r/downstreamer_main/recount3/normalizeRecount3PerTissueAndDoPca.R index 
017840fc4..54e96f375 100644 --- a/Downstreamer/src/main/r/downstreamer_main/recount3/normalizeRecount3PerTissueAndDoPca.R +++ b/Downstreamer/src/main/r/downstreamer_main/recount3/normalizeRecount3PerTissueAndDoPca.R @@ -1,42 +1,65 @@ -#srun --cpus-per-task=1 --mem=50gb --nodes=1 --qos=priority --time=168:00:00 --pty bash -i +#srun --cpus-per-task=20 --mem=200gb --nodes=1 --qos=priority --time=168:00:00 --pty bash -i #remoter::server(verbose = T, port = 55556, sync = T) -remoter::client("localhost", port = 55501) +remoter::client("localhost", port = 55504) library(DESeq2) - +library(parallel) +library(viridisLite, lib.loc = .libPaths()[2]) setwd("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/") load(file = "perTissueNormalization/selectedSamplesRawExpression.RData", verbose = T) - +load("tissuePredictions/samplesWithPrediction_16_09_22.RData", verbose = T) tissueClasses <- unique(samplesWithPrediction$predictedTissue) -tissueClasses <- tissueClasses[c(1,2,6,14,55)] +#tissueClasses <- tissueClasses[1:29] +#tissueClasses <- tissueClasses[30:57] + +#tissueClasses <- tissueClasses[c(1,2,6,14,55)] + +#limit expression to max int +selectedSamplesExp[selectedSamplesExp > .Machine$integer.max] <- .Machine$integer.max -perTissueExp <- lapply(tissueClasses, function(tissue){ +mclapply(tissueClasses, mc.cores = 20, function(tissue){ + tissueSamples <- rownames(samplesWithPrediction)[samplesWithPrediction$predictedTissue == tissue] tissueExp <- selectedSamplesExp[,tissueSamples] numberOfSamples <- length(tissueSamples) includedGenes <- apply(tissueExp, 1, function(x){(sum(x==0)/numberOfSamples) <= 0.5}) - tissueExp <- tissueExp[includedGenes,] mode(tissueExp) <- "integer" - rlogExp <- rlog(tissueExp) + save(tissueExp, file = paste0("perTissueNormalization/raw/",make.names(tissue),".RData")) + +}) + + + + +#cl <- makeCluster(20) - return(rlogExp) +#clusterExport(cl, c("samplesWithPrediction", "selectedSamplesExp")) +#tissue <- "Prostate" +# +perTissueExp <- 
mclapply(tissueClasses, mc.cores = 20, function(tissue){ + + load(file = paste0("perTissueNormalization/raw/",make.names(tissue),".RData")) + rlogExp <- rlog(tissueExp) + save(rlogExp, file = paste0("perTissueNormalization/rlogExp/",make.names(tissue),".RData")) + return(NULL) }) -names(perTissueExp) <- tissueClasses + +#names(perTissueExp) <- tissueClasses #save(perTissueExp, file = "perTissueNormalization/selectedSamplesRawExpressionPerTissue.RData") @@ -65,26 +88,57 @@ perTissuePca <- lapply(perTissueExp, function(exp){ }) -#save(rlogExp, file = "perTissueNormalization/tmpTestRlog.RData") +save(perTissueExp, perTissuePca, file = "perTissueNormalization/tmpTestSession.RData") #load(file = "perTissueNormalization/tmpTestRlog.RData") -exp <- rlogExp - +save(expPcs, samplesWithPrediction, file = "perTissueNormalization/tmpTest2.RData") +load("perTissueNormalization/tmpTest2.RData") +samplesWithPrediction$sra.library_layout[samplesWithPrediction$study=="TCGA"] <- "paired" +samplesWithPrediction$sra.library_layout[samplesWithPrediction$study=="GTEx"] <- "paired" tissueSamplesInfo <- samplesWithPrediction[rownames(expPcs),] str(tissueSamplesInfo) +#Put TCGA and GTEx to paired end + studies <- length(unique(tissueSamplesInfo$study)) -library(viridisLite, lib.loc = .libPaths()[2]) + palette(adjustcolor(viridis(studies, option = "H"), alpha.f = 0.5)) -pchMap <- rep(c(15,17,19), length.out = studies) +pchMap <- rep(c(15,16,17), length.out = studies) rpng(width = 1000, height = 1000) -plot(expPcs[,1],expPcs[,2], col = as.factor(tissueSamplesInfo$study), pch = pchMap[as.factor(tissueSamplesInfo$study)], cex = 2) +plot(expPcs[,1],expPcs[,2], col = as.factor(tissueSamplesInfo$study), pch = pchMap[as.factor(tissueSamplesInfo$study)], cex = 1) +pairs(expPcs[,1:5], col = as.factor(tissueSamplesInfo$study), pch = pchMap[as.factor(tissueSamplesInfo$study)], cex = 1, upper.panel = NULL) dev.off() +View(tissueSamplesInfo) + +palette(adjustcolor(c("dodgerblue1", "maroon2"), alpha.f = 
0.5)) +plot(expPcs[,1],expPcs[,2], col = as.factor(tissueSamplesInfo$sra.library_layout), pch = pchMap[as.factor(tissueSamplesInfo$study)], cex = 1) + + + + +breakPoints <- seq(0.5,1,by = 0.05) +breakCols <- (adjustcolor(viridis(length(breakPoints), option = "inferno"), alpha.f = 0.5)) + + +plot(expPcs[,1],expPcs[,2], col = breakCols[as.numeric(cut(tissueSamplesInfo$predictedTissueScore, breaks = breakPoints ))], pch = 16, cex = 1) +legend("bottomright",title="PredictionScore",legend=seq(0.5,1,by = 0.05),col = breakCols,pch=16) + + +plot(expPcs[,1],expPcs[,5], col = breakCols[as.numeric(cut(tissueSamplesInfo$predictedTissueScore, breaks = breakPoints ))], pch = 16, cex = 1) + + +pairs(expPcs[,1:10], col = breakCols[as.numeric(cut(tissueSamplesInfo$predictedTissueScore, breaks = breakPoints ))], cex = 1, upper.panel = NULL, pch = 16) + + + + sum(expPcs[,2]>10) +x <- cbind(expPcs, tissueSamplesInfo) +View(x) From 71f72b698c2c5732680dc773e4105d49d506c7a6 Mon Sep 17 00:00:00 2001 From: Patrick Deelen Date: Thu, 13 Oct 2022 10:59:38 +0200 Subject: [PATCH 13/22] recount3 --- .../recount3/createUmapOfRecount3.R | 7 +- .../normalizeRecount3PerTissueAndDoPca.R | 55 ++++-- .../recount3/perTissueQqnormAndPcaForQc.R | 184 ++++++++++++++++++ 3 files changed, 226 insertions(+), 20 deletions(-) create mode 100644 Downstreamer/src/main/r/downstreamer_main/recount3/perTissueQqnormAndPcaForQc.R diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/createUmapOfRecount3.R b/Downstreamer/src/main/r/downstreamer_main/recount3/createUmapOfRecount3.R index 26932db3f..cf4b191d4 100644 --- a/Downstreamer/src/main/r/downstreamer_main/recount3/createUmapOfRecount3.R +++ b/Downstreamer/src/main/r/downstreamer_main/recount3/createUmapOfRecount3.R @@ -623,8 +623,9 @@ dev.off() countTable <- table(umapAndPredictions$predictedTissue) sum(countTable) -pdf("baplotTissues.pdf", width = 12, height = 10) -par(mar = c(20,5,2,0.1), xpd = NA) +sum(countTable >= 500) +pdf("baplotTissues.pdf", 
width = 15, height = 10) +par(mar = c(25,5,2,0.1), xpd = NA) b <- barplot(countTable, las =2, col = tissueCol[names(countTable),]) -text(b, countTable + 250, countTable, font=1, srt = 90) +text(b, countTable + 280, countTable, font=1, srt = 90) dev.off() diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/normalizeRecount3PerTissueAndDoPca.R b/Downstreamer/src/main/r/downstreamer_main/recount3/normalizeRecount3PerTissueAndDoPca.R index 54e96f375..ee8f5db99 100644 --- a/Downstreamer/src/main/r/downstreamer_main/recount3/normalizeRecount3PerTissueAndDoPca.R +++ b/Downstreamer/src/main/r/downstreamer_main/recount3/normalizeRecount3PerTissueAndDoPca.R @@ -2,6 +2,8 @@ #remoter::server(verbose = T, port = 55556, sync = T) + + remoter::client("localhost", port = 55504) library(DESeq2) @@ -9,14 +11,15 @@ library(parallel) library(viridisLite, lib.loc = .libPaths()[2]) setwd("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/") - +setwd("D:\\UMCG\\Genetica\\Projects\\Depict2Pgs\\Recount3\\") load(file = "perTissueNormalization/selectedSamplesRawExpression.RData", verbose = T) load("tissuePredictions/samplesWithPrediction_16_09_22.RData", verbose = T) - +sort(table(samplesWithPrediction$predictedTissue)) tissueClasses <- unique(samplesWithPrediction$predictedTissue) + #tissueClasses <- tissueClasses[1:29] #tissueClasses <- tissueClasses[30:57] @@ -26,7 +29,7 @@ tissueClasses <- unique(samplesWithPrediction$predictedTissue) selectedSamplesExp[selectedSamplesExp > .Machine$integer.max] <- .Machine$integer.max -mclapply(tissueClasses, mc.cores = 20, function(tissue){ +mclapply(tissueClasses, mc.cores = 10, function(tissue){ tissueSamples <- rownames(samplesWithPrediction)[samplesWithPrediction$predictedTissue == tissue] tissueExp <- selectedSamplesExp[,tissueSamples] @@ -43,14 +46,20 @@ mclapply(tissueClasses, mc.cores = 20, function(tissue){ }) +tissueClasses <- unique(samplesWithPrediction$predictedTissue) +#Run 1 +#tissueClasses <- tissueClasses[1:5] +#Run 2 
+run2Tisses <- c("Whole Blood", "T-Cells", "fibroblasts_cell-lines_smooth-muscle-cell_mesenchymal-stem-cells", "PBMC") +tissueClasses <- run2Tisses +#Run3 +run3Tisses <- c("derived-neural-progenitor_derived-neurons", "Macrophages", "Liver", "Macrophages-iPSC") +tissueClasses <- run3Tisses -#cl <- makeCluster(20) +#Run4 colorectal en prostate -#clusterExport(cl, c("samplesWithPrediction", "selectedSamplesExp")) -#tissue <- "Prostate" -# -perTissueExp <- mclapply(tissueClasses, mc.cores = 20, function(tissue){ +perTissueExp <- mclapply(tissueClasses, mc.cores = 4, function(tissue){ load(file = paste0("perTissueNormalization/raw/",make.names(tissue),".RData")) rlogExp <- rlog(tissueExp) @@ -61,28 +70,39 @@ perTissueExp <- mclapply(tissueClasses, mc.cores = 20, function(tissue){ #names(perTissueExp) <- tissueClasses #save(perTissueExp, file = "perTissueNormalization/selectedSamplesRawExpressionPerTissue.RData") +tissue = "Kidney" +load(file = paste0("perTissueNormalization/rlogExp/",make.names(tissue),".RData")) perTissuePca <- lapply(perTissueExp, function(exp){ #https://stackoverflow.com/questions/18964837/fast-correlation-in-r-using-c-and-parallelization/18965892#18965892 - expScale = exp - rowMeans(exp); + expScale = rlogExp - rowMeans(rlogExp); # Standardize each variable expScale = expScale / sqrt(rowSums(expScale^2)); - expCov = tcrossprod(expScale);#equevelent to correlation due to center scale + #expCov = tcrossprod(expScale);#equevelent to correlation due to center scale + #expEigen <- eigen(expCov) + #eigenVectors <- expEigen$vectors + #colnames(eigenVectors) <- paste0("PC_",1:ncol(eigenVectors)) + #rownames(eigenVectors) <- rownames(expScale) - expEigen <- eigen(expCov) + #eigenValues <- expEigen$values + #names(eigenValues) <- paste0("PC_",1:length(eigenValues)) - eigenVectors <- expEigen$vectors + #Here calculate sample principle components. 
Number needed is arbritary (no more than eigen vectors) + #expPcs <- t(expScale) %*% expEigen$vectors[,1:10] + #colnames(expPcs) <- paste0("PC_",1:ncol(expPcs)) + + expSvd <- svd(expScale, nu = 1000, nv = 1000) + + eigenValues <- expSvd$d^2 + eigenVectors <- expSvd$u colnames(eigenVectors) <- paste0("PC_",1:ncol(eigenVectors)) rownames(eigenVectors) <- rownames(expScale) - eigenValues <- expEigen$values - names(eigenValues) <- paste0("PC_",1:length(eigenValues)) - - #Here calculate sample principle components. Number needed is arbritary (no more than eigen vectors) - expPcs <- t(expScale) %*% expEigen$vectors[,1:10] + expPcs <- expSvd$v[,1:25] %*% diag(expSvd$d[1:25]) colnames(expPcs) <- paste0("PC_",1:ncol(expPcs)) + rownames(expPcs) <- colnames(expScale) return(list(eigenVectors, eigenValues, expPcs)) @@ -134,6 +154,7 @@ legend("bottomright",title="PredictionScore",legend=seq(0.5,1,by = 0.05),col = b plot(expPcs[,1],expPcs[,5], col = breakCols[as.numeric(cut(tissueSamplesInfo$predictedTissueScore, breaks = breakPoints ))], pch = 16, cex = 1) +legend("topleft",title="PredictionScore",legend=seq(0.5,1,by = 0.05),col = breakCols,pch=16) pairs(expPcs[,1:10], col = breakCols[as.numeric(cut(tissueSamplesInfo$predictedTissueScore, breaks = breakPoints ))], cex = 1, upper.panel = NULL, pch = 16) diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/perTissueQqnormAndPcaForQc.R b/Downstreamer/src/main/r/downstreamer_main/recount3/perTissueQqnormAndPcaForQc.R new file mode 100644 index 000000000..2cb31672d --- /dev/null +++ b/Downstreamer/src/main/r/downstreamer_main/recount3/perTissueQqnormAndPcaForQc.R @@ -0,0 +1,184 @@ +#srun --cpus-per-task=20 --mem=200gb --nodes=1 --qos=priority --time=168:00:00 --pty bash -i +#remoter::server(verbose = T, port = 55556, sync = T) + + + + +remoter::client("localhost", port = 55504) + +library(DESeq2) +library(parallel) +library(viridisLite, lib.loc = .libPaths()[2]) +library(preprocessCore) + + + + 
+setwd("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/") +setwd("D:\\UMCG\\Genetica\\Projects\\Depict2Pgs\\Recount3\\") + +load(file = "perTissueNormalization/selectedSamplesRawExpression.RData", verbose = T) +load("tissuePredictions/samplesWithPrediction_16_09_22.RData", verbose = T) + +samplesWithPrediction$sra.library_layout[samplesWithPrediction$study=="TCGA"] <- "paired" +samplesWithPrediction$sra.library_layout[samplesWithPrediction$study=="GTEx"] <- "paired" + + + +sort(table(samplesWithPrediction$predictedTissue)) +tissueClasses <- unique(samplesWithPrediction$predictedTissue) + +mclapply(tissueClasses, mc.cores = 10, function(tissue){ + + tissueSamples <- rownames(samplesWithPrediction)[samplesWithPrediction$predictedTissue == tissue] + tissueExp <- selectedSamplesExp[,tissueSamples] + numberOfSamples <- length(tissueSamples) + + includedGenes <- apply(tissueExp, 1, function(x){(sum(x==0)/numberOfSamples) <= 0.5}) + + tissueExp <- tissueExp[includedGenes,] + + tissueExp <- log2(tissueExp + 1) + + normalize.quantiles(tissueExp,copy=FALSE) + + save(tissueExp, file = paste0("perTissueNormalization/perTissueQq/",make.names(tissue),".RData")) + +}) + + +perTissuePca <- mclapply(tissueClasses, mc.cores = 10, function(tissue){ + + load(file = paste0("perTissueNormalization/perTissueQq/",make.names(tissue),".RData")) + + #https://stackoverflow.com/questions/18964837/fast-correlation-in-r-using-c-and-parallelization/18965892#18965892 + expScale = tissueExp - rowMeans(tissueExp); + # Standardize each variable + expScale = expScale / sqrt(rowSums(expScale^2)); + #expCov = tcrossprod(expScale);#equevelent to correlation due to center scale + #expEigen <- eigen(expCov) + #eigenVectors <- expEigen$vectors + #colnames(eigenVectors) <- paste0("PC_",1:ncol(eigenVectors)) + #rownames(eigenVectors) <- rownames(expScale) + + #eigenValues <- expEigen$values + #names(eigenValues) <- paste0("PC_",1:length(eigenValues)) + + #Here calculate sample principle components. 
Number needed is arbritary (no more than eigen vectors) + #expPcs <- t(expScale) %*% expEigen$vectors[,1:10] + #colnames(expPcs) <- paste0("PC_",1:ncol(expPcs)) + + expSvd <- svd(expScale, nu = 50, nv = 50) + + eigenValues <- expSvd$d^2 + eigenVectors <- expSvd$u + colnames(eigenVectors) <- paste0("PC_",1:ncol(eigenVectors)) + rownames(eigenVectors) <- rownames(expScale) + + expPcs <- expSvd$v[,1:50] %*% diag(expSvd$d[1:50]) + colnames(expPcs) <- paste0("PC_",1:ncol(expPcs)) + rownames(expPcs) <- colnames(expScale) + + pcaRes <- list(eigenVectors, eigenValues, expPcs) + + save(pcaRes, file = paste0("perTissueNormalization/perTissueQqPca/",make.names(tissue),".RData")) + + return(pcaRes) + +}) + + +sink <- mclapply(tissueClasses, mc.cores = 10, function(tissue, samplesWithPrediction){ + + load(file = paste0("perTissueNormalization/perTissueQqPca/",make.names(tissue),".RData")) + + expPcs <- pcaRes$expPcs + + tissueSamplesInfo <- samplesWithPrediction[rownames(expPcs),] + studies <- length(unique(tissueSamplesInfo$study)) + + + palette(adjustcolor(viridis(studies, option = "H"), alpha.f = 0.5)) + + pchMap <- rep(c(15,16,17), length.out = studies) + + + plot(expPcs[,1],expPcs[,2], col = as.factor(tissueSamplesInfo$study), pch = pchMap[as.factor(tissueSamplesInfo$study)], cex = 1) + + + return(NULL) + +}, samplesWithPrediction = samplesWithPrediction) + + + + + + + + + + + + + + + + + + + + + +save(perTissueExp, perTissuePca, file = "perTissueNormalization/tmpTestSession.RData") +#load(file = "perTissueNormalization/tmpTestRlog.RData") + +save(expPcs, samplesWithPrediction, file = "perTissueNormalization/tmpTest2.RData") +load("perTissueNormalization/tmpTest2.RData") + + + +tissueSamplesInfo <- samplesWithPrediction[rownames(expPcs),] +str(tissueSamplesInfo) + +#Put TCGA and GTEx to paired end + +studies <- length(unique(tissueSamplesInfo$study)) + + + +palette(adjustcolor(viridis(studies, option = "H"), alpha.f = 0.5)) + +pchMap <- rep(c(15,16,17), length.out = 
studies) + +rpng(width = 1000, height = 1000) +plot(expPcs[,1],expPcs[,2], col = as.factor(tissueSamplesInfo$study), pch = pchMap[as.factor(tissueSamplesInfo$study)], cex = 1) +pairs(expPcs[,1:5], col = as.factor(tissueSamplesInfo$study), pch = pchMap[as.factor(tissueSamplesInfo$study)], cex = 1, upper.panel = NULL) +dev.off() +View(tissueSamplesInfo) + +palette(adjustcolor(c("dodgerblue1", "maroon2"), alpha.f = 0.5)) +plot(expPcs[,1],expPcs[,2], col = as.factor(tissueSamplesInfo$sra.library_layout), pch = pchMap[as.factor(tissueSamplesInfo$study)], cex = 1) + + + + +breakPoints <- seq(0.5,1,by = 0.05) +breakCols <- (adjustcolor(viridis(length(breakPoints), option = "inferno"), alpha.f = 0.5)) + + +plot(expPcs[,1],expPcs[,2], col = breakCols[as.numeric(cut(tissueSamplesInfo$predictedTissueScore, breaks = breakPoints ))], pch = 16, cex = 1) +legend("bottomright",title="PredictionScore",legend=seq(0.5,1,by = 0.05),col = breakCols,pch=16) + + +plot(expPcs[,1],expPcs[,4], col = breakCols[as.numeric(cut(tissueSamplesInfo$predictedTissueScore, breaks = breakPoints ))], pch = 16, cex = 1) + +legend("topleft",title="PredictionScore",legend=seq(0.5,1,by = 0.05),col = breakCols,pch=16) + +pairs(expPcs[,1:10], col = breakCols[as.numeric(cut(tissueSamplesInfo$predictedTissueScore, breaks = breakPoints ))], cex = 1, upper.panel = NULL, pch = 16) + + + +sum(expPcs[,2]>10) +x <- cbind(expPcs, tissueSamplesInfo) +View(x) From a6105100e1a201d0d60424d07f950c63ea0c6efa Mon Sep 17 00:00:00 2001 From: Patrick Deelen Date: Fri, 14 Oct 2022 08:31:27 +0200 Subject: [PATCH 14/22] recount3 --- .../recount3/perTissueQqnormAndPcaForQc.R | 157 +++++++++++++++--- .../recount3/performPcaOnFullRecount3Data.R | 1 + 2 files changed, 133 insertions(+), 25 deletions(-) diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/perTissueQqnormAndPcaForQc.R b/Downstreamer/src/main/r/downstreamer_main/recount3/perTissueQqnormAndPcaForQc.R index 2cb31672d..d30e18720 100644 --- 
a/Downstreamer/src/main/r/downstreamer_main/recount3/perTissueQqnormAndPcaForQc.R +++ b/Downstreamer/src/main/r/downstreamer_main/recount3/perTissueQqnormAndPcaForQc.R @@ -4,7 +4,7 @@ -remoter::client("localhost", port = 55504) +remoter::client("localhost", port = 55505) library(DESeq2) library(parallel) @@ -23,6 +23,7 @@ load("tissuePredictions/samplesWithPrediction_16_09_22.RData", verbose = T) samplesWithPrediction$sra.library_layout[samplesWithPrediction$study=="TCGA"] <- "paired" samplesWithPrediction$sra.library_layout[samplesWithPrediction$study=="GTEx"] <- "paired" +table(samplesWithPrediction$predictedTissue) sort(table(samplesWithPrediction$predictedTissue)) @@ -46,10 +47,16 @@ mclapply(tissueClasses, mc.cores = 10, function(tissue){ }) +load(file = "/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Recount3_QC_2ndRun/Filtered_Matrices/TPM_log2_QNorm_QCed_CovCorrected_AllCovariates.RData") + + +mclapply(tissueClasses, mc.cores = 10, function(tissue, exp){ -perTissuePca <- mclapply(tissueClasses, mc.cores = 10, function(tissue){ + #load(file = paste0("perTissueNormalization/perTissueQq/",make.names(tissue),".RData")) + + tissueSamples <- rownames(samplesWithPrediction)[samplesWithPrediction$predictedTissue == tissue] + tissueExp <- exp[,tissueSamples] - load(file = paste0("perTissueNormalization/perTissueQq/",make.names(tissue),".RData")) #https://stackoverflow.com/questions/18964837/fast-correlation-in-r-using-c-and-parallelization/18965892#18965892 expScale = tissueExp - rowMeans(tissueExp); @@ -70,6 +77,9 @@ perTissuePca <- mclapply(tissueClasses, mc.cores = 10, function(tissue){ expSvd <- svd(expScale, nu = 50, nv = 50) + + + eigenValues <- expSvd$d^2 eigenVectors <- expSvd$u colnames(eigenVectors) <- paste0("PC_",1:ncol(eigenVectors)) @@ -79,31 +89,141 @@ perTissuePca <- mclapply(tissueClasses, mc.cores = 10, function(tissue){ colnames(expPcs) <- paste0("PC_",1:ncol(expPcs)) rownames(expPcs) <- colnames(expScale) - pcaRes <- list(eigenVectors, 
eigenValues, expPcs) + + explainedVariance <- eigenValues * 100 / nrow(expScale) + + + pcaRes <- list(eigenVectors = eigenVectors, eigenValues = eigenValues, expPcs = expPcs, explainedVariance = explainedVariance) save(pcaRes, file = paste0("perTissueNormalization/perTissueQqPca/",make.names(tissue),".RData")) - return(pcaRes) + return(NULL) -}) +}, exp = exp) +tissue = "Kidney" -sink <- mclapply(tissueClasses, mc.cores = 10, function(tissue, samplesWithPrediction){ +sink <- lapply(tissueClasses, function(tissue, samplesWithPrediction){ load(file = paste0("perTissueNormalization/perTissueQqPca/",make.names(tissue),".RData")) - expPcs <- pcaRes$expPcs - + expPcs <- pcaRes$expPcs[,1:10] + explainedVariance <- pcaRes$explainedVariance tissueSamplesInfo <- samplesWithPrediction[rownames(expPcs),] studies <- length(unique(tissueSamplesInfo$study)) + breakPoints <- seq(0.5,1,by = 0.05) + breakCols <- (adjustcolor(viridis(length(breakPoints), option = "inferno"), alpha.f = 0.5)) - palette(adjustcolor(viridis(studies, option = "H"), alpha.f = 0.5)) + expPcsMeans <- apply(expPcs, 2, mean) + expPcsSds <- apply(expPcs, 2, sd) + threshold <- expPcsMeans + 3 * expPcsSds + + outlierPerComp <- sapply(1:10, function(i){ + + abs(expPcs[,i]) > threshold[i] + }) + outlier <- apply(outlierPerComp, 1, any) + sum(outlier) + + png(file = paste0("perTissueNormalization/qcPlots/",make.names(tissue),"1.png"), width = 1200, height = 900) + #rpng() + layout(matrix(c(1,1,1,1,2,3,4,8,5,6,7,8),ncol = 4, byrow = T), heights = c(0.1,1,1), widths = c(1,1,1,0.1)) + par(mar = c(0,0,0,0), xpd = NA, cex = 1.2) + plot.new() + plot.window(xlim = 0:1, ylim = 0:1) + text(0.5,0.5, paste0(tissue, " (", nrow(tissueSamplesInfo) ,")"), cex = 2, font = 2) + + par(mar = c(4,4,3,0.5), xpd = NA) + palette(adjustcolor(viridis(studies, option = "H"), alpha.f = 0.5)) pchMap <- rep(c(15,16,17), length.out = studies) + plot(expPcs[,1],expPcs[,2], col = as.factor(tissueSamplesInfo$study), pch = 
pchMap[as.factor(tissueSamplesInfo$study)], cex = 1, main = paste0("Studies (", studies,")"), xlab = paste0("Comp 1 (", round(explainedVariance[1],2) ,"%)"), ylab = paste0("Comp 2 (", round(explainedVariance[2],2) ,"%)"), bty = "n") + + + palette(adjustcolor(c("lemonchiffon3", "darkorange1", "springgreen2"), alpha.f = 0.5)) + annotated <- factor(rep("Unkown", nrow(tissueSamplesInfo)), levels = c("Unkown", "Other", "Current")) + annotated[!is.na(tissueSamplesInfo$annotatedTissue) & tissueSamplesInfo$annotatedTissue != tissue] <- "Other" + annotated[!is.na(tissueSamplesInfo$annotatedTissue) & tissueSamplesInfo$annotatedTissue == tissue] <- "Current" + plot(expPcs[,1],expPcs[,2], col = annotated, pch = pchMap[as.factor(tissueSamplesInfo$study)], cex = 1, main = paste0("Annotated as " , tissue), xlab = paste0("Comp 1 (", round(explainedVariance[1],2) ,"%)"), ylab = paste0("Comp 2 (", round(explainedVariance[2],2) ,"%)"), bty = "n") + + + palette(adjustcolor(c("dodgerblue1", "maroon2"), alpha.f = 0.5)) + plot(expPcs[,1],expPcs[,2], col = factor(tissueSamplesInfo$sra.library_layout, levels = c("paired", "single")), pch = pchMap[as.factor(tissueSamplesInfo$study)], cex = 1, main = "Sequencing layout", xlab = paste0("Comp 1 (", round(explainedVariance[1],2) ,"%)"), ylab = paste0("Comp 2 (", round(explainedVariance[2],2) ,"%)"), bty = "n") + + plot(expPcs[,1],expPcs[,2], col = breakCols[as.numeric(cut(tissueSamplesInfo$predictedTissueScore, breaks = breakPoints ))], pch = 16, cex = 1, main = "Prediction posterior probability", xlab = paste0("Comp 1 (", round(explainedVariance[1],2) ,"%)"), ylab = paste0("Comp 2 (", round(explainedVariance[2],2) ,"%)"), bty = "n") + + plot(cumsum(explainedVariance)[1:10], bty = "n", pch = 16, xlab = "Components", ylab = "Cumulative explained variance (%)", main = "Explained variance", ylim = c(0,100), xlim = c(0,10)) + + + + + + par(mar = c(0,2,3,1), xpd = NA) + plot.new() + plot.window(xlim = 0:1, ylim = 0:1) + 
legend("topleft",title="Annotation",legend=c("Unkown", "Other", tissue), col = c("lemonchiffon3", "darkorange1", "springgreen2") , pch = 16, bty = "n") + legend("top",title="Layout",legend=c("Single", "Paired"), col = c("maroon2", "dodgerblue1") , pch = 16, bty = "n") + legend("topright",title="Probability",legend=seq(0.5,1,by = 0.05),col = breakCols,pch=16, bty = "n") + + + dev.off() + + colnames(expPcs) <- paste0("Comp ",1:10, " (", round(explainedVariance[1:10],2) ,"%)") + + #png(file = paste0("perTissueNormalization/qcPlots/",make.names(tissue),"2.png"), width = 2000, height = 2000) + #pairs(expPcs[,1:10], col = breakCols[as.numeric(cut(tissueSamplesInfo$predictedTissueScore, breaks = breakPoints ))], cex = 2, upper.panel = NULL, pch = 16) + #dev.off() + + #png(file = paste0("perTissueNormalization/qcPlots/",make.names(tissue),"3.png"), width = 2000, height = 2000) + #palette(adjustcolor(viridis(studies, option = "H"), alpha.f = 0.5)) + #pchMap <- rep(c(15,16,17), length.out = studies) + #pairs(expPcs[,1:10], col = as.factor(tissueSamplesInfo$study), pch = pchMap[as.factor(tissueSamplesInfo$study)], cex = 2, upper.panel = NULL) + #dev.off() + + #png(file = paste0("perTissueNormalization/qcPlots/",make.names(tissue),"3.png"), width = 2000, height = 2000) + #palette(adjustcolor(c("grey", "firebrick3"), alpha.f = 0.5)) + #pairs(expPcs[,1:10], col = outlier + 1, pch = 16, cex = 2, upper.panel = NULL) + #dev.off() + + png(file = paste0("perTissueNormalization/qcPlots/",make.names(tissue),"4.png"), width = 1500, height = 700) + #rpng() + palette(adjustcolor(c("grey", "firebrick3"), alpha.f = 0.5)) + layout(matrix(c(1,1,1,1,1,2:11),ncol = 5, byrow = T), heights = c(0.1,1,1)) + par(mar = c(0,0,0,0), xpd = NA, cex = 1.2) + plot.new() + plot.window(xlim = 0:1, ylim = 0:1) + text(0.5,0.5, paste0(tissue, " (", nrow(tissueSamplesInfo) ,")"), cex = 2, font = 2) + + par(mar = c(4,4,3,0.5), xpd = NA) + + for(i in 2:10){ + plot(expPcs[,1],expPcs[,i], col = outlier + 1, pch = 
16, cex = 1, main = paste0("Comp ", i), xlab = paste0("Comp 1 (", round(explainedVariance[1],2) ,"%)"), ylab = paste0("Comp ", i," (", round(explainedVariance[i],2) ,"%)"), bty = "n") + abline(v=c(-threshold[1],threshold[1]), lwd = 2, col = "firebrick3", xpd = FALSE) + abline(h=c(-threshold[i],threshold[i]), lwd = 2, col = "firebrick3", xpd = FALSE) + } + + dev.off() + + + png(file = paste0("perTissueNormalization/qcPlots/",make.names(tissue),"5.png"), width = 1500, height = 700) + #rpng() + layout(matrix(c(1,1,1,1,1,2:11),ncol = 5, byrow = T), heights = c(0.1,1,1)) + par(mar = c(0,0,0,0), xpd = NA, cex = 1.2) + plot.new() + plot.window(xlim = 0:1, ylim = 0:1) + text(0.5,0.5, paste0(tissue, " (", nrow(tissueSamplesInfo) ,")"), cex = 2, font = 2) + + par(mar = c(4,4,3,0.5), xpd = NA) + + for(i in 2:10){ + plot(expPcs[,1],expPcs[,i], col = breakCols[as.numeric(cut(tissueSamplesInfo$predictedTissueScore, breaks = breakPoints ))], pch = 16, cex = 1, main = paste0("Comp ", i), xlab = paste0("Comp 1 (", round(explainedVariance[1],2) ,"%)"), ylab = paste0("Comp ", i," (", round(explainedVariance[i],2) ,"%)"), bty = "n") + abline(v=c(-threshold[1],threshold[1]), lwd = 2, col = "firebrick3", xpd = FALSE) + abline(h=c(-threshold[i],threshold[i]), lwd = 2, col = "firebrick3", xpd = FALSE) + } + + dev.off() - - plot(expPcs[,1],expPcs[,2], col = as.factor(tissueSamplesInfo$study), pch = pchMap[as.factor(tissueSamplesInfo$study)], cex = 1) return(NULL) @@ -157,19 +277,6 @@ pairs(expPcs[,1:5], col = as.factor(tissueSamplesInfo$study), pch = pchMap[as.fa dev.off() View(tissueSamplesInfo) -palette(adjustcolor(c("dodgerblue1", "maroon2"), alpha.f = 0.5)) -plot(expPcs[,1],expPcs[,2], col = as.factor(tissueSamplesInfo$sra.library_layout), pch = pchMap[as.factor(tissueSamplesInfo$study)], cex = 1) - - - - -breakPoints <- seq(0.5,1,by = 0.05) -breakCols <- (adjustcolor(viridis(length(breakPoints), option = "inferno"), alpha.f = 0.5)) - - -plot(expPcs[,1],expPcs[,2], col = 
breakCols[as.numeric(cut(tissueSamplesInfo$predictedTissueScore, breaks = breakPoints ))], pch = 16, cex = 1) -legend("bottomright",title="PredictionScore",legend=seq(0.5,1,by = 0.05),col = breakCols,pch=16) - plot(expPcs[,1],expPcs[,4], col = breakCols[as.numeric(cut(tissueSamplesInfo$predictedTissueScore, breaks = breakPoints ))], pch = 16, cex = 1) diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/performPcaOnFullRecount3Data.R b/Downstreamer/src/main/r/downstreamer_main/recount3/performPcaOnFullRecount3Data.R index aca649870..e312a3287 100644 --- a/Downstreamer/src/main/r/downstreamer_main/recount3/performPcaOnFullRecount3Data.R +++ b/Downstreamer/src/main/r/downstreamer_main/recount3/performPcaOnFullRecount3Data.R @@ -18,6 +18,7 @@ rm(table_tmp) str(exp) #save(exp, file = "/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Recount3_QC_2ndRun/Filtered_Matrices/TPM_log2_QNorm_QCed_CovCorrected_AllCovariates.RData") +load(file = "/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Recount3_QC_2ndRun/Filtered_Matrices/TPM_log2_QNorm_QCed_CovCorrected_AllCovariates.RData") #exp contains expression rows genes cols samples From adb25d5e04b8249408da59bafd5191879ba431e5 Mon Sep 17 00:00:00 2001 From: Patrick Deelen Date: Sat, 15 Oct 2022 20:32:32 +0200 Subject: [PATCH 15/22] Update perTissueQqnormAndPcaForQc.R --- .../recount3/perTissueQqnormAndPcaForQc.R | 147 ++++++++++++------ 1 file changed, 100 insertions(+), 47 deletions(-) diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/perTissueQqnormAndPcaForQc.R b/Downstreamer/src/main/r/downstreamer_main/recount3/perTissueQqnormAndPcaForQc.R index d30e18720..33e0cc967 100644 --- a/Downstreamer/src/main/r/downstreamer_main/recount3/perTissueQqnormAndPcaForQc.R +++ b/Downstreamer/src/main/r/downstreamer_main/recount3/perTissueQqnormAndPcaForQc.R @@ -101,7 +101,12 @@ mclapply(tissueClasses, mc.cores = 10, function(tissue, exp){ }, exp = exp) -tissue = "Kidney" +tissue = "Whole Blood" +tissue = 
"Brain-Nucleus accumbens (basal ganglia)" + +ERP009290 +ERP009290 +samplesWithPrediction[samplesWithPrediction$study=="ERP009290",] sink <- lapply(tissueClasses, function(tissue, samplesWithPrediction){ @@ -112,19 +117,99 @@ sink <- lapply(tissueClasses, function(tissue, samplesWithPrediction){ tissueSamplesInfo <- samplesWithPrediction[rownames(expPcs),] studies <- length(unique(tissueSamplesInfo$study)) + #are in the same order + write.table(cbind(tissueSamplesInfo, expPcs), col.names = T, row.names = F, sep = "\t", quote = F, file = paste0("perTissueNormalization/qcPlots/",make.names(tissue),".txt")) + + shortTissue <- ifelse(nchar(tissue) > 20, paste0(substr(tissue,0,17),"..."), tissue) + breakPoints <- seq(0.5,1,by = 0.05) breakCols <- (adjustcolor(viridis(length(breakPoints), option = "inferno"), alpha.f = 0.5)) expPcsMeans <- apply(expPcs, 2, mean) expPcsSds <- apply(expPcs, 2, sd) - threshold <- expPcsMeans + 3 * expPcsSds + + #Larger threshold is needed for wholeblood + sdThreshold <- ifelse(tissue == "Whole Blood", 4,3) + + threshold <- expPcsMeans + sdThreshold * expPcsSds outlierPerComp <- sapply(1:10, function(i){ abs(expPcs[,i]) > threshold[i] }) - outlier <- apply(outlierPerComp, 1, any) - sum(outlier) + tissueSamplesInfo$outlier <- apply(outlierPerComp, 1, any) + sum(tissueSamplesInfo$outlier) + + ### Do some manual corrections + if(tissue == "Airway basal cells"){ + tissueSamplesInfo$outlier[expPcs[,1] < 2] <- TRUE #Checked annotation, these are wrongly predicted + } + tissueSamplesInfo$outlier[tissueSamplesInfo$study == "ERP009290"] <- TRUE #Mixed tissue samples + + if(tissue == "Brain-Hindbrain-Fetal"){ + tissueSamplesInfo$outlier[expPcs[,1] < -10] <- TRUE #Checked annotation, these are wrongly predicted + tissueSamplesInfo$outlier[expPcs[,3] > 3] <- TRUE #Checked annotation, these are wrongly predicted + } + if(tissue == "Brain-Nucleus accumbens (basal ganglia)"){ + tissueSamplesInfo$outlier[expPcs[,1] < -5] <- TRUE #Checked annotation, these 
are wrongly predicted + tissueSamplesInfo$outlier[expPcs[,3] < -10] <- TRUE #Checked annotation, these are wrongly predicted + } + if(tissue == "Kidney"){ + tissueSamplesInfo$outlier[expPcs[,3] < -6] <- TRUE #Checked annotation, these are wrongly predicted + } + if(tissue == "Macrophages-iPSC"){ + tissueSamplesInfo$outlier[tissueSamplesInfo$study == "ERP020977"] <- FALSE #not real outlier, strange distribution due to stimulations. + } + if(tissue == "Monocytes"){ + tissueSamplesInfo$outlier[expPcs[,2] > 3] <- TRUE #Checked annotation, these are wrongly predicted + } + if(tissue == "Nasal Lavage"){ + tissueSamplesInfo$outlier[expPcs[,1] < -5] <- TRUE #Checked annotation, these are wrongly predicted + } + if(tissue == "Vagina"){ + tissueSamplesInfo$outlier[expPcs[,1] > 10] <- TRUE #Checked annotation, these are wrongly predicted + tissueSamplesInfo$outlier[expPcs[,2] < -8] <- TRUE #Checked annotation, these are wrongly predicted + } + + + + ########################### + rpng(width = 1000, height = 1000) + palette(adjustcolor(c("grey", "firebrick3"), alpha.f = 0.5)) + layout(matrix(c(1,1,1,1,1,2:11),ncol = 5, byrow = T), heights = c(0.1,1,1)) + par(mar = c(0,0,0,0), xpd = NA, cex = 1.2) + plot.new() + plot.window(xlim = 0:1, ylim = 0:1) + text(0.5,0.5, paste0(tissue, " (", nrow(tissueSamplesInfo) ,")"), cex = 2, font = 2) + + par(mar = c(4,4,3,0.5), xpd = NA) + + for(i in 2:10){ + plot(expPcs[,1],expPcs[,i], col = tissueSamplesInfo$outlier + 1, pch = 16, cex = 1, main = paste0("Comp ", i), xlab = paste0("Comp 1 (", round(explainedVariance[1],2) ,"%)"), ylab = paste0("Comp ", i," (", round(explainedVariance[i],2) ,"%)"), bty = "n") + abline(v=c(-threshold[1],threshold[1]), lwd = 2, col = "firebrick3", xpd = FALSE) + abline(h=c(-threshold[i],threshold[i]), lwd = 2, col = "firebrick3", xpd = FALSE) + } + + par(mar = c(0,2,3,1), xpd = NA) + plot.new() + plot.window(xlim = 0:1, ylim = 0:1) + legend("top",title="Outliers",legend=c("Included", "Excluded"), col = c("grey", 
"firebrick3") , pch = 16, bty = "n") + + + + dev.off() + + + + + + + + + ###################################3 + colnames(expPcs) <- paste0("Comp ",1:10, " (", round(explainedVariance[1:10],2) ,"%)") + write.table(cbind(tissueSamplesInfo, expPcs), col.names = NA, row.names = T, sep = "\t", quote = F, file = paste0("perTissueNormalization/qcPlots/",make.names(tissue),".txt")) + png(file = paste0("perTissueNormalization/qcPlots/",make.names(tissue),"1.png"), width = 1200, height = 900) #rpng() @@ -145,7 +230,7 @@ sink <- lapply(tissueClasses, function(tissue, samplesWithPrediction){ annotated <- factor(rep("Unkown", nrow(tissueSamplesInfo)), levels = c("Unkown", "Other", "Current")) annotated[!is.na(tissueSamplesInfo$annotatedTissue) & tissueSamplesInfo$annotatedTissue != tissue] <- "Other" annotated[!is.na(tissueSamplesInfo$annotatedTissue) & tissueSamplesInfo$annotatedTissue == tissue] <- "Current" - plot(expPcs[,1],expPcs[,2], col = annotated, pch = pchMap[as.factor(tissueSamplesInfo$study)], cex = 1, main = paste0("Annotated as " , tissue), xlab = paste0("Comp 1 (", round(explainedVariance[1],2) ,"%)"), ylab = paste0("Comp 2 (", round(explainedVariance[2],2) ,"%)"), bty = "n") + plot(expPcs[,1],expPcs[,2], col = annotated, pch = pchMap[as.factor(tissueSamplesInfo$study)], cex = 1, main = paste0("Annotated as " , shortTissue), xlab = paste0("Comp 1 (", round(explainedVariance[1],2) ,"%)"), ylab = paste0("Comp 2 (", round(explainedVariance[2],2) ,"%)"), bty = "n") palette(adjustcolor(c("dodgerblue1", "maroon2"), alpha.f = 0.5)) @@ -162,14 +247,14 @@ sink <- lapply(tissueClasses, function(tissue, samplesWithPrediction){ par(mar = c(0,2,3,1), xpd = NA) plot.new() plot.window(xlim = 0:1, ylim = 0:1) - legend("topleft",title="Annotation",legend=c("Unkown", "Other", tissue), col = c("lemonchiffon3", "darkorange1", "springgreen2") , pch = 16, bty = "n") + legend("topleft",title="Annotation",legend=c("Unkown", "Other", shortTissue), col = c("lemonchiffon3", "darkorange1", 
"springgreen2") , pch = 16, bty = "n") legend("top",title="Layout",legend=c("Single", "Paired"), col = c("maroon2", "dodgerblue1") , pch = 16, bty = "n") legend("topright",title="Probability",legend=seq(0.5,1,by = 0.05),col = breakCols,pch=16, bty = "n") dev.off() - colnames(expPcs) <- paste0("Comp ",1:10, " (", round(explainedVariance[1:10],2) ,"%)") + #png(file = paste0("perTissueNormalization/qcPlots/",make.names(tissue),"2.png"), width = 2000, height = 2000) #pairs(expPcs[,1:10], col = breakCols[as.numeric(cut(tissueSamplesInfo$predictedTissueScore, breaks = breakPoints ))], cex = 2, upper.panel = NULL, pch = 16) @@ -183,27 +268,13 @@ sink <- lapply(tissueClasses, function(tissue, samplesWithPrediction){ #png(file = paste0("perTissueNormalization/qcPlots/",make.names(tissue),"3.png"), width = 2000, height = 2000) #palette(adjustcolor(c("grey", "firebrick3"), alpha.f = 0.5)) - #pairs(expPcs[,1:10], col = outlier + 1, pch = 16, cex = 2, upper.panel = NULL) + #pairs(expPcs[,1:10], col = tissueSamplesInfo$outlier + 1, pch = 16, cex = 2, upper.panel = NULL) #dev.off() png(file = paste0("perTissueNormalization/qcPlots/",make.names(tissue),"4.png"), width = 1500, height = 700) - #rpng() - palette(adjustcolor(c("grey", "firebrick3"), alpha.f = 0.5)) - layout(matrix(c(1,1,1,1,1,2:11),ncol = 5, byrow = T), heights = c(0.1,1,1)) - par(mar = c(0,0,0,0), xpd = NA, cex = 1.2) - plot.new() - plot.window(xlim = 0:1, ylim = 0:1) - text(0.5,0.5, paste0(tissue, " (", nrow(tissueSamplesInfo) ,")"), cex = 2, font = 2) - par(mar = c(4,4,3,0.5), xpd = NA) + ########################################## - for(i in 2:10){ - plot(expPcs[,1],expPcs[,i], col = outlier + 1, pch = 16, cex = 1, main = paste0("Comp ", i), xlab = paste0("Comp 1 (", round(explainedVariance[1],2) ,"%)"), ylab = paste0("Comp ", i," (", round(explainedVariance[i],2) ,"%)"), bty = "n") - abline(v=c(-threshold[1],threshold[1]), lwd = 2, col = "firebrick3", xpd = FALSE) - abline(h=c(-threshold[i],threshold[i]), lwd = 2, 
col = "firebrick3", xpd = FALSE) - } - - dev.off() png(file = paste0("perTissueNormalization/qcPlots/",make.names(tissue),"5.png"), width = 1500, height = 700) @@ -222,6 +293,12 @@ sink <- lapply(tissueClasses, function(tissue, samplesWithPrediction){ abline(h=c(-threshold[i],threshold[i]), lwd = 2, col = "firebrick3", xpd = FALSE) } + par(mar = c(0,2,3,1), xpd = NA) + plot.new() + plot.window(xlim = 0:1, ylim = 0:1) + legend("top",title="Probability",legend=seq(0.5,1,by = 0.05),col = breakCols,pch=16, bty = "n") + + dev.off() @@ -234,30 +311,6 @@ sink <- lapply(tissueClasses, function(tissue, samplesWithPrediction){ - - - - - - - - - - - - - - - - -save(perTissueExp, perTissuePca, file = "perTissueNormalization/tmpTestSession.RData") -#load(file = "perTissueNormalization/tmpTestRlog.RData") - -save(expPcs, samplesWithPrediction, file = "perTissueNormalization/tmpTest2.RData") -load("perTissueNormalization/tmpTest2.RData") - - - tissueSamplesInfo <- samplesWithPrediction[rownames(expPcs),] str(tissueSamplesInfo) From a061521f067202bcc2807bf8ab119a194673f45d Mon Sep 17 00:00:00 2001 From: Patrick Deelen Date: Sun, 16 Oct 2022 14:21:27 +0200 Subject: [PATCH 16/22] Update playingWithPcaSvd.R --- .../r/downstreamer_main/legacy_scripts/playingWithPcaSvd.R | 6 ------ 1 file changed, 6 deletions(-) diff --git a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/playingWithPcaSvd.R b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/playingWithPcaSvd.R index f12897f41..ebff773a5 100644 --- a/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/playingWithPcaSvd.R +++ b/Downstreamer/src/main/r/downstreamer_main/legacy_scripts/playingWithPcaSvd.R @@ -270,12 +270,6 @@ plot(log(expSvd$d)) abline(v=60) -library(rpca) - - -expRpca <- rpca(t(expSub2)) - - library(corpcor) expSvdFast <- fast.svd(expSubScale) From 009c653b295d31df481b3cb43ef3a859497ddc8b Mon Sep 17 00:00:00 2001 From: Patrick Deelen Date: Sun, 16 Oct 2022 14:22:39 +0200 Subject: [PATCH 17/22] 
Update perTissueQqnormAndPcaForQc.R --- .../recount3/perTissueQqnormAndPcaForQc.R | 66 ++++++++----------- 1 file changed, 27 insertions(+), 39 deletions(-) diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/perTissueQqnormAndPcaForQc.R b/Downstreamer/src/main/r/downstreamer_main/recount3/perTissueQqnormAndPcaForQc.R index 33e0cc967..f96aa2f41 100644 --- a/Downstreamer/src/main/r/downstreamer_main/recount3/perTissueQqnormAndPcaForQc.R +++ b/Downstreamer/src/main/r/downstreamer_main/recount3/perTissueQqnormAndPcaForQc.R @@ -29,6 +29,7 @@ table(samplesWithPrediction$predictedTissue) sort(table(samplesWithPrediction$predictedTissue)) tissueClasses <- unique(samplesWithPrediction$predictedTissue) +#not used currently, we now use the expression data used for the primary QC and sample predictions. mclapply(tissueClasses, mc.cores = 10, function(tissue){ tissueSamples <- rownames(samplesWithPrediction)[samplesWithPrediction$predictedTissue == tissue] @@ -108,7 +109,7 @@ ERP009290 ERP009290 samplesWithPrediction[samplesWithPrediction$study=="ERP009290",] -sink <- lapply(tissueClasses, function(tissue, samplesWithPrediction){ +nonOutlierSampleList <- lapply(tissueClasses, function(tissue, samplesWithPrediction){ load(file = paste0("perTissueNormalization/perTissueQqPca/",make.names(tissue),".RData")) @@ -172,41 +173,7 @@ sink <- lapply(tissueClasses, function(tissue, samplesWithPrediction){ } - - ########################### - rpng(width = 1000, height = 1000) - palette(adjustcolor(c("grey", "firebrick3"), alpha.f = 0.5)) - layout(matrix(c(1,1,1,1,1,2:11),ncol = 5, byrow = T), heights = c(0.1,1,1)) - par(mar = c(0,0,0,0), xpd = NA, cex = 1.2) - plot.new() - plot.window(xlim = 0:1, ylim = 0:1) - text(0.5,0.5, paste0(tissue, " (", nrow(tissueSamplesInfo) ,")"), cex = 2, font = 2) - - par(mar = c(4,4,3,0.5), xpd = NA) - - for(i in 2:10){ - plot(expPcs[,1],expPcs[,i], col = tissueSamplesInfo$outlier + 1, pch = 16, cex = 1, main = paste0("Comp ", i), xlab = 
paste0("Comp 1 (", round(explainedVariance[1],2) ,"%)"), ylab = paste0("Comp ", i," (", round(explainedVariance[i],2) ,"%)"), bty = "n") - abline(v=c(-threshold[1],threshold[1]), lwd = 2, col = "firebrick3", xpd = FALSE) - abline(h=c(-threshold[i],threshold[i]), lwd = 2, col = "firebrick3", xpd = FALSE) - } - - par(mar = c(0,2,3,1), xpd = NA) - plot.new() - plot.window(xlim = 0:1, ylim = 0:1) - legend("top",title="Outliers",legend=c("Included", "Excluded"), col = c("grey", "firebrick3") , pch = 16, bty = "n") - - - - dev.off() - - - - - - - - - ###################################3 + colnames(expPcs) <- paste0("Comp ",1:10, " (", round(explainedVariance[1:10],2) ,"%)") write.table(cbind(tissueSamplesInfo, expPcs), col.names = NA, row.names = T, sep = "\t", quote = F, file = paste0("perTissueNormalization/qcPlots/",make.names(tissue),".txt")) @@ -272,10 +239,31 @@ sink <- lapply(tissueClasses, function(tissue, samplesWithPrediction){ #dev.off() png(file = paste0("perTissueNormalization/qcPlots/",make.names(tissue),"4.png"), width = 1500, height = 700) + #rpng(width = 1000, height = 1000) + palette(adjustcolor(c("grey", "firebrick3"), alpha.f = 0.5)) + layout(matrix(c(1,1,1,1,1,2:11),ncol = 5, byrow = T), heights = c(0.1,1,1)) + par(mar = c(0,0,0,0), xpd = NA, cex = 1.2) + plot.new() + plot.window(xlim = 0:1, ylim = 0:1) + text(0.5,0.5, paste0(tissue, " (", nrow(tissueSamplesInfo) ,")"), cex = 2, font = 2) + + par(mar = c(4,4,3,0.5), xpd = NA) + + for(i in 2:10){ + plot(expPcs[,1],expPcs[,i], col = tissueSamplesInfo$outlier + 1, pch = 16, cex = 1, main = paste0("Comp ", i), xlab = paste0("Comp 1 (", round(explainedVariance[1],2) ,"%)"), ylab = paste0("Comp ", i," (", round(explainedVariance[i],2) ,"%)"), bty = "n") + abline(v=c(-threshold[1],threshold[1]), lwd = 2, col = "firebrick3", xpd = FALSE) + abline(h=c(-threshold[i],threshold[i]), lwd = 2, col = "firebrick3", xpd = FALSE) + } + + par(mar = c(0,2,3,1), xpd = NA) + plot.new() + plot.window(xlim = 0:1, ylim = 
0:1) + legend("top",title="Outliers",legend=c("Included", "Excluded"), col = c("grey", "firebrick3") , pch = 16, bty = "n") - ########################################## + dev.off() + png(file = paste0("perTissueNormalization/qcPlots/",make.names(tissue),"5.png"), width = 1500, height = 700) #rpng() @@ -303,13 +291,13 @@ sink <- lapply(tissueClasses, function(tissue, samplesWithPrediction){ - return(NULL) + return(tissueSamplesInfo[!tissueSamplesInfo$outlier,1:(ncol(tissueSamplesInfo)-1)]) }, samplesWithPrediction = samplesWithPrediction) - +str(nonOutlierSampleList) tissueSamplesInfo <- samplesWithPrediction[rownames(expPcs),] str(tissueSamplesInfo) From beda4b502bcdb97f121c8ac0675c4fb4bf7d94b0 Mon Sep 17 00:00:00 2001 From: Patrick Deelen Date: Tue, 18 Oct 2022 14:36:37 +0200 Subject: [PATCH 18/22] Update perTissueQqnormAndPcaForQc.R --- .../recount3/perTissueQqnormAndPcaForQc.R | 134 +++++++++++++++--- 1 file changed, 115 insertions(+), 19 deletions(-) diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/perTissueQqnormAndPcaForQc.R b/Downstreamer/src/main/r/downstreamer_main/recount3/perTissueQqnormAndPcaForQc.R index f96aa2f41..4cdb0747c 100644 --- a/Downstreamer/src/main/r/downstreamer_main/recount3/perTissueQqnormAndPcaForQc.R +++ b/Downstreamer/src/main/r/downstreamer_main/recount3/perTissueQqnormAndPcaForQc.R @@ -102,7 +102,7 @@ mclapply(tissueClasses, mc.cores = 10, function(tissue, exp){ }, exp = exp) -tissue = "Whole Blood" +tissue = "Kidney" tissue = "Brain-Nucleus accumbens (basal ganglia)" ERP009290 @@ -179,7 +179,7 @@ nonOutlierSampleList <- lapply(tissueClasses, function(tissue, samplesWithPredic png(file = paste0("perTissueNormalization/qcPlots/",make.names(tissue),"1.png"), width = 1200, height = 900) - #rpng() + #rpng(width = 1000, height = 1000) layout(matrix(c(1,1,1,1,2,3,4,8,5,6,7,8),ncol = 4, byrow = T), heights = c(0.1,1,1), widths = c(1,1,1,0.1)) par(mar = c(0,0,0,0), xpd = NA, cex = 1.2) plot.new() @@ -295,38 +295,134 @@ 
nonOutlierSampleList <- lapply(tissueClasses, function(tissue, samplesWithPredic }, samplesWithPrediction = samplesWithPrediction) +samplesWithPredictionNoOutliers <- do.call(rbind, nonOutlierSampleList) +save(samplesWithPredictionNoOutliers, file = "tissuePredictions/samplesWithPrediction_16_09_22_noOutliers.RData", verbose = T) + + + + +sink <- lapply(tissueClasses, function(tissue, exp){ + + #load(file = paste0("perTissueNormalization/perTissueQq/",make.names(tissue),".RData")) + + tissueSamples <- rownames(samplesWithPredictionNoOutliers)[samplesWithPredictionNoOutliers$predictedTissue == tissue] + tissueExp <- exp[,tissueSamples] + + + #https://stackoverflow.com/questions/18964837/fast-correlation-in-r-using-c-and-parallelization/18965892#18965892 + expScale = tissueExp - rowMeans(tissueExp); + # Standardize each variable + expScale = expScale / sqrt(rowSums(expScale^2)); + #expCov = tcrossprod(expScale);#equevelent to correlation due to center scale + #expEigen <- eigen(expCov) + #eigenVectors <- expEigen$vectors + #colnames(eigenVectors) <- paste0("PC_",1:ncol(eigenVectors)) + #rownames(eigenVectors) <- rownames(expScale) + + #eigenValues <- expEigen$values + #names(eigenValues) <- paste0("PC_",1:length(eigenValues)) + + #Here calculate sample principle components. 
Number needed is arbritary (no more than eigen vectors) + #expPcs <- t(expScale) %*% expEigen$vectors[,1:10] + #colnames(expPcs) <- paste0("PC_",1:ncol(expPcs)) + + expSvd <- svd(expScale, nu = 50, nv = 50) + + + + + eigenValues <- expSvd$d^2 + eigenVectors <- expSvd$u + colnames(eigenVectors) <- paste0("PC_",1:ncol(eigenVectors)) + rownames(eigenVectors) <- rownames(expScale) + + expPcs <- expSvd$v[,1:50] %*% diag(expSvd$d[1:50]) + colnames(expPcs) <- paste0("PC_",1:ncol(expPcs)) + rownames(expPcs) <- colnames(expScale) + + + explainedVariance <- eigenValues * 100 / nrow(expScale) + + + pcaRes <- list(eigenVectors = eigenVectors, eigenValues = eigenValues, expPcs = expPcs, explainedVariance = explainedVariance) + + save(pcaRes, file = paste0("perTissueNormalization/perTissueQqPcaNoOutliers/",make.names(tissue),".RData")) + + return(expPcs) + +}, exp = exp) + -str(nonOutlierSampleList) +pcsPerTissue <- lapply(tissueClasses, function(tissue){ + load(file = paste0("perTissueNormalization/perTissueQqPcaNoOutliers/",make.names(tissue),".RData")) + eigenvectors <- pcaRes$eigenVectors + colnames(eigenvectors) <- paste0(tissue,"_",colnames(eigenvectors)) + return(eigenvectors) +}) +str(pcsPerTissue) +pcsPerTissue2 <- do.call(cbind, pcsPerTissue) -tissueSamplesInfo <- samplesWithPrediction[rownames(expPcs),] -str(tissueSamplesInfo) +str(pcsPerTissue2) -#Put TCGA and GTEx to paired end +rownames(pcsPerTissue2) <- (gsub("\\..+", "", rownames(pcsPerTissue2))) +write.table(pcsPerTissue2, file = "perTissueNormalization/perTissueQqPcaNoOutliers/combinedComponents.txt", sep = "\t", quote = FALSE, col.names = NA) -studies <- length(unique(tissueSamplesInfo$study)) +pcsPerTissue2t <- t(pcsPerTissue2) +pcsPerTissue2Scale = pcsPerTissue2t - rowMeans(pcsPerTissue2t) +# Standardize each variable +pcsPerTissue2Scale = pcsPerTissue2Scale / sqrt(rowSums(pcsPerTissue2Scale^2)) +pcCorMatrix <- pcsPerTissue2Scale %*% t(pcsPerTissue2Scale) +range(diag(pcCorMatrix)) +range(pcCorMatrix) 
+range(lower.tri(pcCorMatrix)) -palette(adjustcolor(viridis(studies, option = "H"), alpha.f = 0.5)) +sum(pcCorMatrix[lower.tri(pcCorMatrix)] >= 0.5) -pchMap <- rep(c(15,16,17), length.out = studies) +identicalPerPc <- apply(pcCorMatrix, 2, function(x){sum(x>=0.3)}) +tail(sort(identicalPerPc)) -rpng(width = 1000, height = 1000) -plot(expPcs[,1],expPcs[,2], col = as.factor(tissueSamplesInfo$study), pch = pchMap[as.factor(tissueSamplesInfo$study)], cex = 1) -pairs(expPcs[,1:5], col = as.factor(tissueSamplesInfo$study), pch = pchMap[as.factor(tissueSamplesInfo$study)], cex = 1, upper.panel = NULL) +hist(pcCorMatrix[,"Brain-Cortex_PC_3"]) dev.off() -View(tissueSamplesInfo) +pcCorMatrix[,"Brain-Cerebellum_PC_2"][pcCorMatrix[,"Brain-Cerebellum_PC_2"] > 0.3] + + +compEigen <- eigen(pcCorMatrix) +str(compEigen) +sum(compEigen$values) -plot(expPcs[,1],expPcs[,4], col = breakCols[as.numeric(cut(tissueSamplesInfo$predictedTissueScore, breaks = breakPoints ))], pch = 16, cex = 1) +sum(as.numeric(compEigen$values) >= 1) -legend("topleft",title="PredictionScore",legend=seq(0.5,1,by = 0.05),col = breakCols,pch=16) -pairs(expPcs[,1:10], col = breakCols[as.numeric(cut(tissueSamplesInfo$predictedTissueScore, breaks = breakPoints ))], cex = 1, upper.panel = NULL, pch = 16) +rpng() +plot(cumsum(as.numeric(compEigen$values) * 100 / sum(as.numeric(compEigen$values)))) +dev.off() + +rpng() +plot(as.numeric(compEigen$values)) +dev.off() + +head(as.numeric(compEigen$values)) +sum(eigenValues >= 1) + +compSvd <- svd(t(pcsPerTissue2Scale)) +str(compSvd) + +head(compSvd$d^2) + +eigenValues <- compSvd$d^2 + +rpng() +plot(cumsum(eigenValues * 100 / sum(eigenValues))) +dev.off() + + +rpng() +plot(eigenValues) +dev.off() -sum(expPcs[,2]>10) -x <- cbind(expPcs, tissueSamplesInfo) -View(x) From 64d691db6f20e1e18fcb33b57eb46c6fc92c405c Mon Sep 17 00:00:00 2001 From: Patrick Deelen Date: Fri, 21 Oct 2022 13:16:05 +0200 Subject: [PATCH 19/22] Update perTissueQqnormAndPcaForQc.R --- 
.../recount3/perTissueQqnormAndPcaForQc.R | 104 +++++++++++++++--- 1 file changed, 90 insertions(+), 14 deletions(-) diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/perTissueQqnormAndPcaForQc.R b/Downstreamer/src/main/r/downstreamer_main/recount3/perTissueQqnormAndPcaForQc.R index 4cdb0747c..ab93807f3 100644 --- a/Downstreamer/src/main/r/downstreamer_main/recount3/perTissueQqnormAndPcaForQc.R +++ b/Downstreamer/src/main/r/downstreamer_main/recount3/perTissueQqnormAndPcaForQc.R @@ -1,10 +1,10 @@ -#srun --cpus-per-task=20 --mem=200gb --nodes=1 --qos=priority --time=168:00:00 --pty bash -i +#srun --cpus-per-task=20 --mem=100gb --nodes=1 --qos=priority --time=168:00:00 --pty bash -i #remoter::server(verbose = T, port = 55556, sync = T) -remoter::client("localhost", port = 55505) +remoter::client("localhost", port = 55506) library(DESeq2) library(parallel) @@ -296,9 +296,11 @@ nonOutlierSampleList <- lapply(tissueClasses, function(tissue, samplesWithPredic }, samplesWithPrediction = samplesWithPrediction) samplesWithPredictionNoOutliers <- do.call(rbind, nonOutlierSampleList) -save(samplesWithPredictionNoOutliers, file = "tissuePredictions/samplesWithPrediction_16_09_22_noOutliers.RData", verbose = T) +#save(samplesWithPredictionNoOutliers, file = "tissuePredictions/samplesWithPrediction_16_09_22_noOutliers.RData", verbose = T) +load("tissuePredictions/samplesWithPrediction_16_09_22_noOutliers.RData", verbose = T) +tissue = "fibroblasts_cell-lines_smooth-muscle-cell_mesenchymal-stem-cells" sink <- lapply(tissueClasses, function(tissue, exp){ @@ -308,6 +310,20 @@ sink <- lapply(tissueClasses, function(tissue, exp){ tissueSamples <- rownames(samplesWithPredictionNoOutliers)[samplesWithPredictionNoOutliers$predictedTissue == tissue] tissueExp <- exp[,tissueSamples] + write.table(tissueExp, file = "fibro.txt", sep = "\t", quote = F, col.names = NA) + + save(tissueExp, file = paste0("perTissueNormalization/globalQqnorm/",make.names(tissue),".RData")) + +}, 
exp = exp) + +sink <- lapply(tissueClasses, function(tissue){ + + #load(file = paste0("perTissueNormalization/perTissueQq/",make.names(tissue),".RData")) + + #tissueSamples <- rownames(samplesWithPredictionNoOutliers)[samplesWithPredictionNoOutliers$predictedTissue == tissue] + #tissueExp <- exp[,tissueSamples] + + load(file = paste0("perTissueNormalization/globalQqnorm/",make.names(tissue),".RData")) #https://stackoverflow.com/questions/18964837/fast-correlation-in-r-using-c-and-parallelization/18965892#18965892 expScale = tissueExp - rowMeans(tissueExp); @@ -346,21 +362,24 @@ sink <- lapply(tissueClasses, function(tissue, exp){ pcaRes <- list(eigenVectors = eigenVectors, eigenValues = eigenValues, expPcs = expPcs, explainedVariance = explainedVariance) - save(pcaRes, file = paste0("perTissueNormalization/perTissueQqPcaNoOutliers/",make.names(tissue),".RData")) + save(pcaRes, file = paste0("perTissueNormalization/perTissueQqPcaNoOutliers/",make.names(tissue),".RData")) return(expPcs) -}, exp = exp) - +}) +write.table(pcaRes$eigenVectors, file = "fibroEigenVectors.txt", sep = "\t", quote = F, col.names = NA) +write.table(pcaRes$eigenValues, file = "fibroEigenValues.txt", sep = "\t", quote = F, col.names = NA) +write.table(pcaRes$expPcs, file = "fibroPcs.txt", sep = "\t", quote = F, col.names = NA) pcsPerTissue <- lapply(tissueClasses, function(tissue){ load(file = paste0("perTissueNormalization/perTissueQqPcaNoOutliers/",make.names(tissue),".RData")) eigenvectors <- pcaRes$eigenVectors colnames(eigenvectors) <- paste0(tissue,"_",colnames(eigenvectors)) + return(eigenvectors) }) -str(pcsPerTissue) +#str(pcsPerTissue) pcsPerTissue2 <- do.call(cbind, pcsPerTissue) str(pcsPerTissue2) @@ -375,27 +394,32 @@ pcsPerTissue2Scale = pcsPerTissue2Scale / sqrt(rowSums(pcsPerTissue2Scale^2)) pcCorMatrix <- pcsPerTissue2Scale %*% t(pcsPerTissue2Scale) -range(diag(pcCorMatrix)) range(pcCorMatrix) -range(lower.tri(pcCorMatrix)) +range(diag(pcCorMatrix)) 
-sum(pcCorMatrix[lower.tri(pcCorMatrix)] >= 0.5) +sum(pcCorMatrix[lower.tri(pcCorMatrix)] >= 0.8) -identicalPerPc <- apply(pcCorMatrix, 2, function(x){sum(x>=0.3)}) +identicalPerPc <- apply(pcCorMatrix, 2, function(x){sum(x>=0.7)}) tail(sort(identicalPerPc)) hist(pcCorMatrix[,"Brain-Cortex_PC_3"]) dev.off() -pcCorMatrix[,"Brain-Cerebellum_PC_2"][pcCorMatrix[,"Brain-Cerebellum_PC_2"] > 0.3] +pcCorMatrix[,"Whole Blood Fetal_PC_1"][pcCorMatrix[,"Whole Blood Fetal_PC_1"] >= 0.7] compEigen <- eigen(pcCorMatrix) str(compEigen) sum(compEigen$values) -sum(as.numeric(compEigen$values) >= 1) +(numberOfCompsEigenvalue1 <- sum(as.numeric(compEigen$values) >= 1)) +str(compEigen) +pcsOfComps <- t(pcsPerTissue2t) %*% compEigen$vectors[,1:numberOfCompsEigenvalue1] +colnames(pcsOfComps) <- paste0("PC_",1:ncol(pcsOfComps)) +rownames(pcsOfComps) <- (gsub("\\..+", "", rownames(pcsOfComps))) +write.table(pcsOfComps, col.names = NA, sep = "\t", quote = F, file = gzfile("perTissueNormalization/perTissueQqPcaNoOutliers/pcaCombinedComponents.txt.gz")) +str(pcsOfComps) rpng() plot(cumsum(as.numeric(compEigen$values) * 100 / sum(as.numeric(compEigen$values)))) @@ -405,6 +429,21 @@ rpng() plot(as.numeric(compEigen$values)) dev.off() + + + + + + + + + + + + + + + head(as.numeric(compEigen$values)) sum(eigenValues >= 1) @@ -412,6 +451,42 @@ sum(eigenValues >= 1) compSvd <- svd(t(pcsPerTissue2Scale)) str(compSvd) +(numberOfCompsEigenvalue1 <- sum(compSvd$d^2>=1)) + +setwd("/groups/umcg-fg/tmp01/projects/genenetwork/recount3/") + +load("problem.RData") + + +combinedCompsPcs <- compSvd$u[,1:numberOfCompsEigenvalue1] %*% diag(compSvd$d[1:numberOfCompsEigenvalue1]) +combinedCompsPcs2 <- compSvd$u[,1:numberOfCompsEigenvalue1] %*% diag(compSvd$d[1:numberOfCompsEigenvalue1]) +combinedCompsPcs3 <- compSvd$u[,1:numberOfCompsEigenvalue1] %*% diag(compSvd$d[1:numberOfCompsEigenvalue1]) +combinedCompsPcs4 <- compSvd$u[,1:numberOfCompsEigenvalue1] %*% diag(compSvd$d[1:numberOfCompsEigenvalue1]) 
+combinedCompsPcs5 <- compSvd$u[,1:numberOfCompsEigenvalue1] %*% diag(compSvd$d[1:numberOfCompsEigenvalue1]) + +cor.test(combinedCompsPcs[,1], combinedCompsPcs2[,1]) +cor.test(combinedCompsPcs[,1], combinedCompsPcs3[,1]) +cor.test(combinedCompsPcs[,1], combinedCompsPcs4[,1]) +cor.test(combinedCompsPcs[,1], combinedCompsPcs5[,1]) + +range(abs(combinedCompsPcs[,1]) - abs(combinedCompsPcs2[,1])) + +plot(combinedCompsPcs[,1], combinedCompsPcs2[,1]) +plot(combinedCompsPcs[,1], combinedCompsPcs3[,1]) +plot(combinedCompsPcs[,1], combinedCompsPcs4[,1]) +plot(combinedCompsPcs[,1], combinedCompsPcs5[,1]) +dev.off() + +save(compSvd, numberOfCompsEigenvalue1, file = "problem.RData") + +str(combinedCompsPcs) +range(combinedCompsPcs) + +plot(as.numeric(teest[,1]),combinedCompsPcs[,1]) +dev.off() + + +head(compSvd$d^2) head(compSvd$d^2) eigenValues <- compSvd$d^2 @@ -421,8 +496,9 @@ plot(cumsum(eigenValues * 100 / sum(eigenValues))) dev.off() + + rpng() plot(eigenValues) dev.off() - From 9ff72fc57ad667110336c6734bf9fa8c38a5bb5f Mon Sep 17 00:00:00 2001 From: Patrick Deelen Date: Sat, 22 Oct 2022 14:52:56 +0200 Subject: [PATCH 20/22] Update perTissueQqnormAndPcaForQc.R --- .../recount3/perTissueQqnormAndPcaForQc.R | 32 ++++++++++++++----- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/Downstreamer/src/main/r/downstreamer_main/recount3/perTissueQqnormAndPcaForQc.R b/Downstreamer/src/main/r/downstreamer_main/recount3/perTissueQqnormAndPcaForQc.R index ab93807f3..e4c35f6c2 100644 --- a/Downstreamer/src/main/r/downstreamer_main/recount3/perTissueQqnormAndPcaForQc.R +++ b/Downstreamer/src/main/r/downstreamer_main/recount3/perTissueQqnormAndPcaForQc.R @@ -48,7 +48,7 @@ mclapply(tissueClasses, mc.cores = 10, function(tissue){ }) -load(file = "/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Recount3_QC_2ndRun/Filtered_Matrices/TPM_log2_QNorm_QCed_CovCorrected_AllCovariates.RData") +load(file = 
"/groups/umcg-fg/tmp01/projects/genenetwork/recount3/Recount3_QC_2ndRun/Filtered_Matrices/TPM_log2_QNorm_QCed_CovCorrected_AllCovariates.RData", verbose = T) mclapply(tissueClasses, mc.cores = 10, function(tissue, exp){ @@ -301,7 +301,7 @@ samplesWithPredictionNoOutliers <- do.call(rbind, nonOutlierSampleList) load("tissuePredictions/samplesWithPrediction_16_09_22_noOutliers.RData", verbose = T) tissue = "fibroblasts_cell-lines_smooth-muscle-cell_mesenchymal-stem-cells" - +tissue = "HUVEC" sink <- lapply(tissueClasses, function(tissue, exp){ @@ -310,12 +310,25 @@ sink <- lapply(tissueClasses, function(tissue, exp){ tissueSamples <- rownames(samplesWithPredictionNoOutliers)[samplesWithPredictionNoOutliers$predictedTissue == tissue] tissueExp <- exp[,tissueSamples] - write.table(tissueExp, file = "fibro.txt", sep = "\t", quote = F, col.names = NA) - save(tissueExp, file = paste0("perTissueNormalization/globalQqnorm/",make.names(tissue),".RData")) }, exp = exp) +#Create co-expression matrices +sink <- lapply(tissueClasses, function(tissue){ + + load(file = paste0("perTissueNormalization/globalQqnorm/",make.names(tissue),".RData"), verbose = T) + + #https://stackoverflow.com/questions/18964837/fast-correlation-in-r-using-c-and-parallelization/18965892#18965892 + expScale = tissueExp - rowMeans(tissueExp); + # Standardize each variable + expScale = expScale / sqrt(rowSums(expScale^2)); + expCov = tcrossprod(expScale);#equevelent to correlation due to center scale + + write.table(expCov, file = paste0("perTissueNormalization/qqCoExp/",make.names(tissue),".txt"), sep = "\t", quote = F, col.names = NA) + +}) + sink <- lapply(tissueClasses, function(tissue){ #load(file = paste0("perTissueNormalization/perTissueQq/",make.names(tissue),".RData")) @@ -323,7 +336,10 @@ sink <- lapply(tissueClasses, function(tissue){ #tissueSamples <- rownames(samplesWithPredictionNoOutliers)[samplesWithPredictionNoOutliers$predictedTissue == tissue] #tissueExp <- exp[,tissueSamples] - load(file 
= paste0("perTissueNormalization/globalQqnorm/",make.names(tissue),".RData")) + load(file = paste0("perTissueNormalization/globalQqnorm/",make.names(tissue),".RData"), verbose = T) + + write.table(tissueExp, file = gzfile("huvec.txt.gz"), sep = "\t", quote = F, col.names = NA) + #https://stackoverflow.com/questions/18964837/fast-correlation-in-r-using-c-and-parallelization/18965892#18965892 expScale = tissueExp - rowMeans(tissueExp); @@ -368,9 +384,9 @@ sink <- lapply(tissueClasses, function(tissue){ }) -write.table(pcaRes$eigenVectors, file = "fibroEigenVectors.txt", sep = "\t", quote = F, col.names = NA) -write.table(pcaRes$eigenValues, file = "fibroEigenValues.txt", sep = "\t", quote = F, col.names = NA) -write.table(pcaRes$expPcs, file = "fibroPcs.txt", sep = "\t", quote = F, col.names = NA) +write.table(pcaRes$eigenVectors, file = "huvecEigenVectors.txt", sep = "\t", quote = F, col.names = NA) +write.table(pcaRes$eigenValues, file = "huvecEigenValues.txt", sep = "\t", quote = F, col.names = NA) +write.table(pcaRes$expPcs, file = "huvecPcs.txt", sep = "\t", quote = F, col.names = NA) pcsPerTissue <- lapply(tissueClasses, function(tissue){ load(file = paste0("perTissueNormalization/perTissueQqPcaNoOutliers/",make.names(tissue),".RData")) From 638eb303addba2e107b7e211f5873522a314e780 Mon Sep 17 00:00:00 2001 From: Patrick Deelen Date: Sat, 22 Oct 2022 15:31:14 +0200 Subject: [PATCH 21/22] Update pom.xml --- Downstreamer/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Downstreamer/pom.xml b/Downstreamer/pom.xml index ac6432d85..64e8ca675 100644 --- a/Downstreamer/pom.xml +++ b/Downstreamer/pom.xml @@ -7,7 +7,7 @@ 1.0.4-SNAPSHOT Downstreamer - 1.29-SNAPSHOT + 1.30-SNAPSHOT jar From 887cd41f1bc72c7292b6250233f72a4933e20f4d Mon Sep 17 00:00:00 2001 From: Patrick Deelen Date: Sat, 22 Oct 2022 15:34:50 +0200 Subject: [PATCH 22/22] Update pom.xml --- Genotype-Harmonizer/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/Genotype-Harmonizer/pom.xml b/Genotype-Harmonizer/pom.xml index 4414958d1..44e9bf74a 100644 --- a/Genotype-Harmonizer/pom.xml +++ b/Genotype-Harmonizer/pom.xml @@ -7,7 +7,7 @@ 4.0.0 Genotype-Harmonizer - 1.4.23-SNAPSHOT + 1.4.24-SNAPSHOT Genotype Harmonizer jar