Move github repo

sc-eQTLgen-consortium · Feb 8, 2023 · 92e842a · 92e842a
commit 92e842a
Show file tree

Hide file tree

Showing 108 changed files with 109,771 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+.idea/*
+05_coeqtl_mapping/launch_sbatch_files.sh
diff --git a/01_association_metrics/.ipynb_checkpoints/GRNBoost2-checkpoint.ipynb b/01_association_metrics/.ipynb_checkpoints/GRNBoost2-checkpoint.ipynb
diff --git a/01_association_metrics/.ipynb_checkpoints/compare_cell_classification-checkpoint.ipynb b/01_association_metrics/.ipynb_checkpoints/compare_cell_classification-checkpoint.ipynb
diff --git a/01_association_metrics/GRNBoost2.ipynb b/01_association_metrics/GRNBoost2.ipynb
diff --git a/01_association_metrics/README.md b/01_association_metrics/README.md
@@ -0,0 +1,31 @@
+# 01_association_metrics
+
+
+In *setting_files_for_grnboost2* there are setting files for running GRNBoost2 on scRNAseq and BIOS data
+
+*GRNBoost2.ipynb*: Prepare the input files for BEELINE, and examine the GRNBoost2 results for scRNAseq and for BIOS data
+
+*rho_comparison_lowlyexpressed.R*:  explores the differences between Spearman correlation and Rho propensity specially for very lowly expressed genes
+
+*scorpius_and_slingshot_clean.R*: calculates the pseudotime ordering for Oelen v2 classical monocytes, using SCORPIUS and Slingshot algorithms
+
+*scvelo_analysis_dm.py*: runs RNA velocity analysis on Oelen v3 dataset classical monocytes after creating loom files using [velocyto](http://velocyto.org/velocyto.py/tutorial/cli.html) to get both spliced and unspliced gene count matrices
+
+*compare_cell_classification.ipynb*: compares the aximuth cell type classification with the marker gene cell type classification in Oelen v2 and v3 dataset for untreated cells
+
+Metacell calculation and evaluation files are all in the directory *metacell*:
+
+*metacell_per_sample_original_algorithm.R*: calculates metacells based on original algorithm (implemented in the metacell R package)
+
+*metacells_from_leiden.R*: calculates metacells based on grouping from leiden clustering
+
+*create_genesets.R*: split all genes expressed in Oelen v3 dataset, Monocytes, into different expression bins for treshold-dependent evaluation with BLUEPRINT
+
+*metacell_general_correlation_tp.R*: calculates correlation from metacells (original or leiden) for different expression tresholds from *create_genesets.R* for comparison with BLUEPRINT
+
+*single_cell_correlation_tp.R*: calculates correlation from single cell dataset for different expression tresholds from *create_genesets.R* for comparison with BLUEPRINT
+
+*eval_blueprint_genesets.R*: compares correlation from BLUEPRINT with correlation from metacells/single cell for different expression tresholds, using correlation vlaues from *metacell_general_correlation_tp.R* and *single_cell_correlation_tp.R*
+
+*plot_overview_metacell.R*: visualize outputs from metacell evaluation in one plot
+
diff --git a/01_association_metrics/compare_cell_classification.ipynb b/01_association_metrics/compare_cell_classification.ipynb
diff --git a/01_association_metrics/metacell/create_genesets.R b/01_association_metrics/metacell/create_genesets.R
@@ -0,0 +1,39 @@
+# ------------------------------------------------------------------------------
+# Gene selections for metacell evaluation: generate files for different
+# gene subsets (expressed in x% until (x+20)% of the cells with x between 20-80)
+# for Oelen v3 dataset, Monocytes
+# This allows threshold dependent evaluation for BLUEPRINT comparison. 
+# See details downstream scripts metacell_general_correlation_tp.R, 
+# single_cell_correlation_tp.R, eval_blueprint_genesets.R
+# ------------------------------------------------------------------------------
+
+library(Seurat)
+
+#Load complete seurat object
+seurat<-readRDS("seurat_objects/1M_v3_mediumQC_ctd_rnanormed_demuxids_20201106.rds")
+DefaultAssay(seurat)<-"RNA"
+
+#Filter for monocytes
+seurat<-seurat[,seurat$cell_type_lowerres=="monocyte"]
+
+#Selected cutoffs
+cutoffs<-c(1,0.8,0.6,0.4,0.2)
+
+#Split into lists dependent on expression cutoff
+exprGenes.singleCell<-rowSums(as.matrix(seurat@assays$RNA@counts)>0)/ncol(seurat)
+
+print(paste("Number of genes expressed in at least 50% of cells:",
+            sum(exprGenes.singleCell>=0.5)))
+
+for(i in 1:(length(cutoffs)-1)){
+  gene.subset<-rownames(seurat)[exprGenes.singleCell<=cutoffs[i] &
+                                  exprGenes.singleCell>cutoffs[i+1]]
+  print(paste("Number of genes with expression cutoff",
+              cutoffs[i+1],":",length(gene.subset)))
+
+  write.table(gene.subset,
+              file=paste0("metacell_general/eval_allmethods/gene_lists/",
+                          "mono_expr_genes_cut_",cutoffs[i+1],".txt"),
+              row.names = FALSE,col.names = FALSE,quote=FALSE)
+}
+
diff --git a/01_association_metrics/metacell/eval_blueprint_genesets.R b/01_association_metrics/metacell/eval_blueprint_genesets.R
@@ -0,0 +1,127 @@
+# ------------------------------------------------------------------------------
+# Compare correlation from BLUEPRINT with correlation from metacells/single cell
+# for different expression tresholds
+# Here: shown for leiden metacells, calculation done the same way for original
+#       metacells and single cell
+# ------------------------------------------------------------------------------
+
+library(reticulate) # to read the single cell data (numpy)
+library(data.table)
+library(ggplot2) # only required if plotting=TRUE
+
+np <- import("numpy")
+
+#Iterate over the list of different gene sets
+corr_files<-c("metacell_general/leiden_metacells/correlation_r_leiden_SCT_cutoff08.tsv",
+              "metacell_general/leiden_metacells/correlation_r_leiden_SCT_cutoff06.tsv",
+              "metacell_general/leiden_metacells/correlation_r_leiden_SCT_cutoff04.tsv",
+              "metacell_general/leiden_metacells/correlation_r_leiden_SCT_cutoff02.tsv")
+
+res_file<-"metacell_general/eval_allmethods/sc_leiden_SCT_eval.tsv"
+
+plotting<-TRUE
+
+#Write column headers
+write.table(data.frame("Condition","Num_pairs","Corr_corr","Test","File"),
+            file=res_file,quote=FALSE,sep="\t", 
+            row.names=FALSE, col.names = FALSE)
+
+#Load the large blueprint data set
+path<-"blueprint/allGenePairs_BlueprintScMonocytes_GeneGeneCorrelationComparison.pairwiseSpearman."
+corr.blue.vals <- np$load(paste0(path,"npy"))
+corr.blue.vals<-corr.blue.vals[,1]
+
+corr.blue<-fread(paste0(path,"genePairs.txt"),header=FALSE)
+
+#Split into gene1 and gene2
+corr.blue$gene1<-sapply(corr.blue$V1,function(s) strsplit(s,"/")[[1]][1])
+corr.blue$gene2<-sapply(corr.blue$V1,function(s) strsplit(s,"/")[[1]][2])
+corr.blue$V1<-NULL
+corr.blue$corr.blue<-corr.blue.vals
+rm(corr.blue.vals)
+
+for(cfile in corr_files){
+
+  print(paste("Processing:",cfile))
+
+  #Load corr.mc.r
+  corr.mc.r<-read.table(cfile)
+
+  #Filter Blueprint matrix
+  corr.genes<-unique(c(corr.mc.r$Gene1,corr.mc.r$Gene2))
+  corr.blue.subset<-corr.blue[gene1 %in% corr.genes & 
+                         gene2 %in% corr.genes]
+
+  #Order correctly so that gene1 smaller than gene2
+  corr.blue.subset$swap<-ifelse(corr.blue.subset$gene1 < corr.blue.subset$gene2, 
+                                corr.blue.subset$gene1,corr.blue.subset$gene2)
+  corr.blue.subset$gene2<-ifelse(corr.blue.subset$gene1 < corr.blue.subset$gene2, 
+                                 corr.blue.subset$gene2,corr.blue.subset$gene1)
+  corr.blue.subset$gene1<-corr.blue.subset$swap
+  corr.blue.subset$swap<-NULL
+  colnames(corr.blue.subset)<-c("Gene1","Gene2","Correlation.blue")
+
+  #Merge everything
+  corr.mc.r<-merge(corr.mc.r,corr.blue.subset,by=c("Gene1","Gene2"))
+  corr.mc.r<-reshape2::melt(corr.mc.r,id.vars=c("Gene1","Gene2","Correlation.blue"))
+  colnames(corr.mc.r)[4:5]<-c("Condition","Correlation")
+
+  #Correlation for all genes
+  corr.corr<-sapply(unique(corr.mc.r$Condition), function(tp)  
+    cor(corr.mc.r$Correlation.blue[corr.mc.r$Condition==tp],
+        corr.mc.r$Correlation[corr.mc.r$Condition==tp],
+        method="pearson",use = "pairwise.complete.obs"))
+
+  res<-data.frame(condition=unique(corr.mc.r$Condition),
+                  num.pairs=as.vector(table(corr.mc.r$Condition)),
+                  corr.corr,
+                  test="allGenes",
+                  file=cfile)
+
+  write.table(res, file=res_file,quote=FALSE,sep="\t",
+              append=TRUE,row.names=FALSE, col.names = FALSE)
+
+  corr.mc.r<-corr.mc.r[! is.na(corr.mc.r$Correlation.blue),]
+  if(plotting){
+    corr.mc.r$class<-ifelse(corr.mc.r$Correlation.blue>0,
+                            "positive","negative")
+
+    g<-ggplot(corr.mc.r,aes(x=Correlation.blue,y=Correlation,color=class))+
+      geom_point()+facet_wrap(~Condition,ncol=3)+
+      xlab("Correlation Blueprint")+ylab("Correlation MC")
+    ggsave(g,file=paste0("metacell_general/eval_allmethods/plots/",
+                         "comp_corr_blue_",strsplit(cfile,"/")[[1]][2],".png"))
+  }
+
+  #Correlation for genes with positive correlation
+  corr.mc.r.pos<-corr.mc.r[corr.mc.r$Correlation.blue>0,]
+  corr.corr<-sapply(unique(corr.mc.r.pos$Condition), function(tp)  
+    cor(corr.mc.r.pos$Correlation.blue[corr.mc.r.pos$Condition==tp],
+        corr.mc.r.pos$Correlation[corr.mc.r.pos$Condition==tp],
+        method="pearson",use = "pairwise.complete.obs"))
+
+  res<-data.frame(condition=unique(corr.mc.r.pos$Condition),
+                  num.pairs=as.vector(table(corr.mc.r.pos$Condition)),
+                  corr.corr,
+                  test="posGenes",
+                  file=cfile)
+
+  write.table(res, file=res_file,quote=FALSE,sep="\t",
+              append=TRUE,row.names=FALSE, col.names = FALSE)
+
+  #Correlation for genes with negative correlation
+  corr.mc.r.neg<-corr.mc.r[corr.mc.r$Correlation.blue<0,]
+  corr.corr<-sapply(unique(corr.mc.r.neg$Condition), function(tp)  
+    cor(corr.mc.r.neg$Correlation.blue[corr.mc.r.neg$Condition==tp],
+        corr.mc.r.neg$Correlation[corr.mc.r.neg$Condition==tp],
+        method="pearson",use = "pairwise.complete.obs"))
+
+  res<-data.frame(condition=unique(corr.mc.r.neg$Condition),
+                  num.pairs=as.vector(table(corr.mc.r.neg$Condition)),
+                  corr.corr,
+                  test="negGenes",
+                  file=cfile)
+
+  write.table(res, file=res_file,quote=FALSE,sep="\t",
+              append=TRUE,row.names=FALSE, col.names = FALSE)
+}
diff --git a/01_association_metrics/metacell/metacell_general_correlation_tp.R b/01_association_metrics/metacell/metacell_general_correlation_tp.R
@@ -0,0 +1,188 @@
+# ------------------------------------------------------------------------------
+# Calculate correlation per timepoint (and sample if stated) from the
+# metacells (original or leiden) for different
+# gene sets (split dependent on gene expression cutoff) for comparison with
+# metacells (see corresponding files create_genesets.R,
+# metacell_general_correlation_tp.R and eval_blueprint_genesets.R)
+# Input: Seurat object, file with selected genes
+# Output: files with correlation values (r-values and p-values)
+# ------------------------------------------------------------------------------
+
+library(Hmisc)
+library(optparse)
+
+#Parse arguments
+option_list = list(
+  make_option(c("-g","--selectedGenes"), 
+              default="../../benchmark/celltypes/gene_expressed_over_hald_cells.txt",
+              help="path to list with selected genes"),
+  make_option(c("-m","--method"),
+              default="metacell",
+              help="method for metacell grouping (leiden[_SCT] or metacell)"),
+  make_option(c("-s","--perSample"),action="store_true",
+              default=FALSE,
+              help="Shall the evaluation be done for each sample separatly"),
+  make_option(c("-o","--outputFile"),
+              default="timepoint_monocytes",
+              help="Suffix of the output files")
+)
+
+opt_parser = OptionParser(option_list=option_list)
+opt = parse_args(opt_parser)
+
+pathSelectedGenes<-opt$selectedGenes
+type<-opt$method
+perSample<-opt$perSample
+outputSuffix<-opt$outputFile
+
+print(paste("Evaluating",type,"for gene set:"))
+print(pathSelectedGenes)
+
+print(paste("Evaluating each sample individually:", perSample))
+
+#For leiden clustering
+if(type=="leiden") {
+  setwd("leiden_metacells/")
+  pseudobulkFile<-"metacell_leiden.RDS"
+  annotationFile<-"annotations_mc_leiden_tp.tsv"
+
+  #Read pseudobulk data frame
+  metacell.allsamples<-readRDS(pseudobulkFile)
+
+  #For leiden clustering based on SCT counts
+} else if(type=="leiden_SCT") {
+  setwd("leiden_metacells/")
+  pseudobulkFile<-"metacell_leiden_SCT.RDS"
+  annotationFile<-"annotations_mc_leiden_SCT_tp.tsv"
+
+  #Read pseudobulk data frame
+  metacell.allsamples<-readRDS(pseudobulkFile)
+
+} else if(type=="metacell"){
+  setwd("metacell_general/metacell")
+
+  metacellDir<-"metacells_K20_minCells10"
+  setwd(metacellDir)
+
+  pseudobulkFile<-"pseudobulk_metacell.RDS"
+  annotationFile<-"annotations_metacell.tsv"
+
+  metacell.allsamples<-readRDS(pseudobulkFile)
+} else {
+  stop("Metacell method type not known!")
+}
+
+##########################
+
+#Read annotation data frame
+annotations.allsamples<-read.table(annotationFile)
+colnames(annotations.allsamples)[2]<-"timepoint"
+
+#Select which genes shall be chosen for correlation (same as for single cell)
+selected.genes<-read.table(pathSelectedGenes,
+                           header=FALSE)
+metacell.allsamples<-metacell.allsamples[selected.genes$V1,]
+
+#Result data frame (correlation and pvalues)
+corr.df<-NULL
+pval.df<-NULL
+
+correlationRes<-function(meta_counts,colName){
+
+  #Be carefull: rcorr does not work with less than 5 samples
+  corr.mc<-rcorr(t(meta_counts), type="spearman")
+
+  #Create a pairwise data frame for the correlation
+  corr.pairs.mc<-as.data.frame(as.table(corr.mc$r),
+                               stringsAsFactors = FALSE)
+  corr.pairs.mc<-corr.pairs.mc[corr.pairs.mc$Var1<corr.pairs.mc$Var2,]
+  colnames(corr.pairs.mc)<-c("Gene1","Gene2",colName)
+
+  #Create a pairwise data frame for the pvalue
+  corr.pairs.pval<-as.data.frame(as.table(corr.mc$P),
+                                 stringsAsFactors = FALSE)
+  corr.pairs.pval<-corr.pairs.pval[corr.pairs.pval$Var1<corr.pairs.pval$Var2,]
+  colnames(corr.pairs.pval)<-c("Gene1","Gene2",colName)
+
+  return(list(corr.pairs.mc,corr.pairs.pval))
+}
+
+if(perSample){
+
+  #Calculate correlation for each sample
+  samples<-unique(annotations.allsamples$sample)
+  for(sample in samples){
+    print(paste("Processing sample:",sample))
+
+    annot.sample<-annotations.allsamples[annotations.allsamples$sample==sample,]
+    for(timepoint in unique(annot.sample$timepoint)){
+
+      #Run the analysis only if at least 5 meta-cells exists
+      #Probably increase the threshold again to more later ...
+      mc.ids.timepoint<-annot.sample$metacell[annot.sample$timepoint==timepoint]
+      if(length(mc.ids.timepoint)>4){
+
+        print(paste("Calculate correlation for timepoint",timepoint))
+
+        meta_counts<-metacell.allsamples[,mc.ids.timepoint]
+        tmp<-correlationRes(meta_counts,colName = paste0(timepoint,"-",sample))
+        corr.pairs.mc<-tmp[[1]]
+        corr.pairs.pval<-tmp[[2]]
+
+        #Concatinate the sample - timepoint pairs
+        if(is.null(corr.df)){
+          corr.df<-corr.pairs.mc
+          pval.df<-corr.pairs.pval
+        } else {
+          corr.df<-merge(corr.df,corr.pairs.mc,by=c("Gene1","Gene2"),
+                         all=TRUE)
+          pval.df<-merge(pval.df,corr.pairs.pval,by=c("Gene1","Gene2"),
+                         all=TRUE)
+        }
+
+      } else {
+        print(paste("Skip timepoint",timepoint,"(too less metacells)"))
+      }
+    }
+  }
+
+} else {
+
+  annot.sample<-annotations.allsamples
+  for(timepoint in unique(annot.sample$timepoint)){
+
+    #Run the analysis only if at least 5 meta-cells exists
+    #Probably increase the threshold again to more later ...
+    mc.ids.timepoint<-annot.sample$metacell[annot.sample$timepoint==timepoint]
+    if(length(mc.ids.timepoint)>4){
+
+      print(paste("Calculate correlation for timepoint",timepoint))
+
+      meta_counts<-metacell.allsamples[,mc.ids.timepoint]
+      tmp<-correlationRes(meta_counts,colName = timepoint)
+      corr.pairs.mc<-tmp[[1]]
+      corr.pairs.pval<-tmp[[2]]
+
+      #Concatinate the sample - timepoint pairs
+      if(is.null(corr.df)){
+        corr.df<-corr.pairs.mc
+        pval.df<-corr.pairs.pval
+      } else {
+        corr.df<-merge(corr.df,corr.pairs.mc,by=c("Gene1","Gene2"),
+                       all=TRUE)
+        pval.df<-merge(pval.df,corr.pairs.pval,by=c("Gene1","Gene2"),
+                       all=TRUE)
+      }
+
+    } else {
+      print(paste("Skip timepoint",timepoint,"(too less metacells)"))
+    }
+  }
+}
+
+write.table(corr.df,
+            file=paste0("correlation_r_",outputSuffix,".tsv"),
+            sep="\t",quote=FALSE)
+write.table(pval.df,
+            file=paste0("correlation_pval_",outputSuffix,".tsv"),
+            sep="\t",quote=FALSE)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		.idea/*
		05_coeqtl_mapping/launch_sbatch_files.sh