Skip to content

Commit

Permalink
Move github repo
Browse files Browse the repository at this point in the history
  • Loading branch information
KatharinaSchmid committed Feb 8, 2023
0 parents commit 92e842a
Show file tree
Hide file tree
Showing 108 changed files with 109,771 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
.idea/*
05_coeqtl_mapping/launch_sbatch_files.sh
349 changes: 349 additions & 0 deletions 01_association_metrics/.ipynb_checkpoints/GRNBoost2-checkpoint.ipynb

Large diffs are not rendered by default.

Large diffs are not rendered by default.

349 changes: 349 additions & 0 deletions 01_association_metrics/GRNBoost2.ipynb

Large diffs are not rendered by default.

31 changes: 31 additions & 0 deletions 01_association_metrics/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# 01_association_metrics


In *setting_files_for_grnboost2* there are setting files for running GRNBoost2 on scRNAseq and BIOS data

*GRNBoost2.ipynb*: Prepare the input files for BEELINE, and examine the GRNBoost2 results for scRNAseq and for BIOS data

*rho_comparison_lowlyexpressed.R*: explores the differences between Spearman correlation and Rho propensity specially for very lowly expressed genes

*scorpius_and_slingshot_clean.R*: calculates the pseudotime ordering for Oelen v2 classical monocytes, using SCORPIUS and Slingshot algorithms

*scvelo_analysis_dm.py*: runs RNA velocity analysis on Oelen v3 dataset classical monocytes after creating loom files using [velocyto](http://velocyto.org/velocyto.py/tutorial/cli.html) to get both spliced and unspliced gene count matrices

*compare_cell_classification.ipynb*: compares the aximuth cell type classification with the marker gene cell type classification in Oelen v2 and v3 dataset for untreated cells

Metacell calculation and evaluation files are all in the directory *metacell*:

*metacell_per_sample_original_algorithm.R*: calculates metacells based on original algorithm (implemented in the metacell R package)

*metacells_from_leiden.R*: calculates metacells based on grouping from leiden clustering

*create_genesets.R*: split all genes expressed in Oelen v3 dataset, Monocytes, into different expression bins for treshold-dependent evaluation with BLUEPRINT

*metacell_general_correlation_tp.R*: calculates correlation from metacells (original or leiden) for different expression tresholds from *create_genesets.R* for comparison with BLUEPRINT

*single_cell_correlation_tp.R*: calculates correlation from single cell dataset for different expression tresholds from *create_genesets.R* for comparison with BLUEPRINT

*eval_blueprint_genesets.R*: compares correlation from BLUEPRINT with correlation from metacells/single cell for different expression tresholds, using correlation vlaues from *metacell_general_correlation_tp.R* and *single_cell_correlation_tp.R*

*plot_overview_metacell.R*: visualize outputs from metacell evaluation in one plot

1,164 changes: 1,164 additions & 0 deletions 01_association_metrics/compare_cell_classification.ipynb

Large diffs are not rendered by default.

39 changes: 39 additions & 0 deletions 01_association_metrics/metacell/create_genesets.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# ------------------------------------------------------------------------------
# Gene selections for metacell evaluation: generate files for different
# gene subsets (expressed in x% until (x+20)% of the cells with x between 20-80)
# for Oelen v3 dataset, Monocytes
# This allows threshold dependent evaluation for BLUEPRINT comparison.
# See details downstream scripts metacell_general_correlation_tp.R,
# single_cell_correlation_tp.R, eval_blueprint_genesets.R
# ------------------------------------------------------------------------------

library(Seurat)

#Load complete seurat object
seurat<-readRDS("seurat_objects/1M_v3_mediumQC_ctd_rnanormed_demuxids_20201106.rds")
DefaultAssay(seurat)<-"RNA"

#Filter for monocytes
seurat<-seurat[,seurat$cell_type_lowerres=="monocyte"]

#Selected cutoffs
cutoffs<-c(1,0.8,0.6,0.4,0.2)

#Split into lists dependent on expression cutoff
exprGenes.singleCell<-rowSums(as.matrix(seurat@assays$RNA@counts)>0)/ncol(seurat)

print(paste("Number of genes expressed in at least 50% of cells:",
sum(exprGenes.singleCell>=0.5)))

for(i in 1:(length(cutoffs)-1)){
gene.subset<-rownames(seurat)[exprGenes.singleCell<=cutoffs[i] &
exprGenes.singleCell>cutoffs[i+1]]
print(paste("Number of genes with expression cutoff",
cutoffs[i+1],":",length(gene.subset)))

write.table(gene.subset,
file=paste0("metacell_general/eval_allmethods/gene_lists/",
"mono_expr_genes_cut_",cutoffs[i+1],".txt"),
row.names = FALSE,col.names = FALSE,quote=FALSE)
}

127 changes: 127 additions & 0 deletions 01_association_metrics/metacell/eval_blueprint_genesets.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
# ------------------------------------------------------------------------------
# Compare correlation from BLUEPRINT with correlation from metacells/single cell
# for different expression tresholds
# Here: shown for leiden metacells, calculation done the same way for original
# metacells and single cell
# ------------------------------------------------------------------------------

library(reticulate) # to read the single cell data (numpy)
library(data.table)
library(ggplot2) # only required if plotting=TRUE

np <- import("numpy")

#Iterate over the list of different gene sets
corr_files<-c("metacell_general/leiden_metacells/correlation_r_leiden_SCT_cutoff08.tsv",
"metacell_general/leiden_metacells/correlation_r_leiden_SCT_cutoff06.tsv",
"metacell_general/leiden_metacells/correlation_r_leiden_SCT_cutoff04.tsv",
"metacell_general/leiden_metacells/correlation_r_leiden_SCT_cutoff02.tsv")

res_file<-"metacell_general/eval_allmethods/sc_leiden_SCT_eval.tsv"

plotting<-TRUE

#Write column headers
write.table(data.frame("Condition","Num_pairs","Corr_corr","Test","File"),
file=res_file,quote=FALSE,sep="\t",
row.names=FALSE, col.names = FALSE)

#Load the large blueprint data set
path<-"blueprint/allGenePairs_BlueprintScMonocytes_GeneGeneCorrelationComparison.pairwiseSpearman."
corr.blue.vals <- np$load(paste0(path,"npy"))
corr.blue.vals<-corr.blue.vals[,1]

corr.blue<-fread(paste0(path,"genePairs.txt"),header=FALSE)

#Split into gene1 and gene2
corr.blue$gene1<-sapply(corr.blue$V1,function(s) strsplit(s,"/")[[1]][1])
corr.blue$gene2<-sapply(corr.blue$V1,function(s) strsplit(s,"/")[[1]][2])
corr.blue$V1<-NULL
corr.blue$corr.blue<-corr.blue.vals
rm(corr.blue.vals)

for(cfile in corr_files){

print(paste("Processing:",cfile))

#Load corr.mc.r
corr.mc.r<-read.table(cfile)

#Filter Blueprint matrix
corr.genes<-unique(c(corr.mc.r$Gene1,corr.mc.r$Gene2))
corr.blue.subset<-corr.blue[gene1 %in% corr.genes &
gene2 %in% corr.genes]

#Order correctly so that gene1 smaller than gene2
corr.blue.subset$swap<-ifelse(corr.blue.subset$gene1 < corr.blue.subset$gene2,
corr.blue.subset$gene1,corr.blue.subset$gene2)
corr.blue.subset$gene2<-ifelse(corr.blue.subset$gene1 < corr.blue.subset$gene2,
corr.blue.subset$gene2,corr.blue.subset$gene1)
corr.blue.subset$gene1<-corr.blue.subset$swap
corr.blue.subset$swap<-NULL
colnames(corr.blue.subset)<-c("Gene1","Gene2","Correlation.blue")

#Merge everything
corr.mc.r<-merge(corr.mc.r,corr.blue.subset,by=c("Gene1","Gene2"))
corr.mc.r<-reshape2::melt(corr.mc.r,id.vars=c("Gene1","Gene2","Correlation.blue"))
colnames(corr.mc.r)[4:5]<-c("Condition","Correlation")

#Correlation for all genes
corr.corr<-sapply(unique(corr.mc.r$Condition), function(tp)
cor(corr.mc.r$Correlation.blue[corr.mc.r$Condition==tp],
corr.mc.r$Correlation[corr.mc.r$Condition==tp],
method="pearson",use = "pairwise.complete.obs"))

res<-data.frame(condition=unique(corr.mc.r$Condition),
num.pairs=as.vector(table(corr.mc.r$Condition)),
corr.corr,
test="allGenes",
file=cfile)

write.table(res, file=res_file,quote=FALSE,sep="\t",
append=TRUE,row.names=FALSE, col.names = FALSE)

corr.mc.r<-corr.mc.r[! is.na(corr.mc.r$Correlation.blue),]
if(plotting){
corr.mc.r$class<-ifelse(corr.mc.r$Correlation.blue>0,
"positive","negative")

g<-ggplot(corr.mc.r,aes(x=Correlation.blue,y=Correlation,color=class))+
geom_point()+facet_wrap(~Condition,ncol=3)+
xlab("Correlation Blueprint")+ylab("Correlation MC")
ggsave(g,file=paste0("metacell_general/eval_allmethods/plots/",
"comp_corr_blue_",strsplit(cfile,"/")[[1]][2],".png"))
}

#Correlation for genes with positive correlation
corr.mc.r.pos<-corr.mc.r[corr.mc.r$Correlation.blue>0,]
corr.corr<-sapply(unique(corr.mc.r.pos$Condition), function(tp)
cor(corr.mc.r.pos$Correlation.blue[corr.mc.r.pos$Condition==tp],
corr.mc.r.pos$Correlation[corr.mc.r.pos$Condition==tp],
method="pearson",use = "pairwise.complete.obs"))

res<-data.frame(condition=unique(corr.mc.r.pos$Condition),
num.pairs=as.vector(table(corr.mc.r.pos$Condition)),
corr.corr,
test="posGenes",
file=cfile)

write.table(res, file=res_file,quote=FALSE,sep="\t",
append=TRUE,row.names=FALSE, col.names = FALSE)

#Correlation for genes with negative correlation
corr.mc.r.neg<-corr.mc.r[corr.mc.r$Correlation.blue<0,]
corr.corr<-sapply(unique(corr.mc.r.neg$Condition), function(tp)
cor(corr.mc.r.neg$Correlation.blue[corr.mc.r.neg$Condition==tp],
corr.mc.r.neg$Correlation[corr.mc.r.neg$Condition==tp],
method="pearson",use = "pairwise.complete.obs"))

res<-data.frame(condition=unique(corr.mc.r.neg$Condition),
num.pairs=as.vector(table(corr.mc.r.neg$Condition)),
corr.corr,
test="negGenes",
file=cfile)

write.table(res, file=res_file,quote=FALSE,sep="\t",
append=TRUE,row.names=FALSE, col.names = FALSE)
}
188 changes: 188 additions & 0 deletions 01_association_metrics/metacell/metacell_general_correlation_tp.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
# ------------------------------------------------------------------------------
# Calculate correlation per timepoint (and sample if stated) from the
# metacells (original or leiden) for different
# gene sets (split dependent on gene expression cutoff) for comparison with
# metacells (see corresponding files create_genesets.R,
# metacell_general_correlation_tp.R and eval_blueprint_genesets.R)
# Input: Seurat object, file with selected genes
# Output: files with correlation values (r-values and p-values)
# ------------------------------------------------------------------------------

library(Hmisc)
library(optparse)

#Parse arguments
option_list = list(
make_option(c("-g","--selectedGenes"),
default="../../benchmark/celltypes/gene_expressed_over_hald_cells.txt",
help="path to list with selected genes"),
make_option(c("-m","--method"),
default="metacell",
help="method for metacell grouping (leiden[_SCT] or metacell)"),
make_option(c("-s","--perSample"),action="store_true",
default=FALSE,
help="Shall the evaluation be done for each sample separatly"),
make_option(c("-o","--outputFile"),
default="timepoint_monocytes",
help="Suffix of the output files")
)

opt_parser = OptionParser(option_list=option_list)
opt = parse_args(opt_parser)

pathSelectedGenes<-opt$selectedGenes
type<-opt$method
perSample<-opt$perSample
outputSuffix<-opt$outputFile

print(paste("Evaluating",type,"for gene set:"))
print(pathSelectedGenes)

print(paste("Evaluating each sample individually:", perSample))

#For leiden clustering
if(type=="leiden") {
setwd("leiden_metacells/")
pseudobulkFile<-"metacell_leiden.RDS"
annotationFile<-"annotations_mc_leiden_tp.tsv"

#Read pseudobulk data frame
metacell.allsamples<-readRDS(pseudobulkFile)

#For leiden clustering based on SCT counts
} else if(type=="leiden_SCT") {
setwd("leiden_metacells/")
pseudobulkFile<-"metacell_leiden_SCT.RDS"
annotationFile<-"annotations_mc_leiden_SCT_tp.tsv"

#Read pseudobulk data frame
metacell.allsamples<-readRDS(pseudobulkFile)

} else if(type=="metacell"){
setwd("metacell_general/metacell")

metacellDir<-"metacells_K20_minCells10"
setwd(metacellDir)

pseudobulkFile<-"pseudobulk_metacell.RDS"
annotationFile<-"annotations_metacell.tsv"

metacell.allsamples<-readRDS(pseudobulkFile)
} else {
stop("Metacell method type not known!")
}

##########################

#Read annotation data frame
annotations.allsamples<-read.table(annotationFile)
colnames(annotations.allsamples)[2]<-"timepoint"

#Select which genes shall be chosen for correlation (same as for single cell)
selected.genes<-read.table(pathSelectedGenes,
header=FALSE)
metacell.allsamples<-metacell.allsamples[selected.genes$V1,]

#Result data frame (correlation and pvalues)
corr.df<-NULL
pval.df<-NULL

correlationRes<-function(meta_counts,colName){

#Be carefull: rcorr does not work with less than 5 samples
corr.mc<-rcorr(t(meta_counts), type="spearman")

#Create a pairwise data frame for the correlation
corr.pairs.mc<-as.data.frame(as.table(corr.mc$r),
stringsAsFactors = FALSE)
corr.pairs.mc<-corr.pairs.mc[corr.pairs.mc$Var1<corr.pairs.mc$Var2,]
colnames(corr.pairs.mc)<-c("Gene1","Gene2",colName)

#Create a pairwise data frame for the pvalue
corr.pairs.pval<-as.data.frame(as.table(corr.mc$P),
stringsAsFactors = FALSE)
corr.pairs.pval<-corr.pairs.pval[corr.pairs.pval$Var1<corr.pairs.pval$Var2,]
colnames(corr.pairs.pval)<-c("Gene1","Gene2",colName)

return(list(corr.pairs.mc,corr.pairs.pval))
}

if(perSample){

#Calculate correlation for each sample
samples<-unique(annotations.allsamples$sample)
for(sample in samples){
print(paste("Processing sample:",sample))

annot.sample<-annotations.allsamples[annotations.allsamples$sample==sample,]
for(timepoint in unique(annot.sample$timepoint)){

#Run the analysis only if at least 5 meta-cells exists
#Probably increase the threshold again to more later ...
mc.ids.timepoint<-annot.sample$metacell[annot.sample$timepoint==timepoint]
if(length(mc.ids.timepoint)>4){

print(paste("Calculate correlation for timepoint",timepoint))

meta_counts<-metacell.allsamples[,mc.ids.timepoint]
tmp<-correlationRes(meta_counts,colName = paste0(timepoint,"-",sample))
corr.pairs.mc<-tmp[[1]]
corr.pairs.pval<-tmp[[2]]

#Concatinate the sample - timepoint pairs
if(is.null(corr.df)){
corr.df<-corr.pairs.mc
pval.df<-corr.pairs.pval
} else {
corr.df<-merge(corr.df,corr.pairs.mc,by=c("Gene1","Gene2"),
all=TRUE)
pval.df<-merge(pval.df,corr.pairs.pval,by=c("Gene1","Gene2"),
all=TRUE)
}

} else {
print(paste("Skip timepoint",timepoint,"(too less metacells)"))
}
}
}

} else {

annot.sample<-annotations.allsamples
for(timepoint in unique(annot.sample$timepoint)){

#Run the analysis only if at least 5 meta-cells exists
#Probably increase the threshold again to more later ...
mc.ids.timepoint<-annot.sample$metacell[annot.sample$timepoint==timepoint]
if(length(mc.ids.timepoint)>4){

print(paste("Calculate correlation for timepoint",timepoint))

meta_counts<-metacell.allsamples[,mc.ids.timepoint]
tmp<-correlationRes(meta_counts,colName = timepoint)
corr.pairs.mc<-tmp[[1]]
corr.pairs.pval<-tmp[[2]]

#Concatinate the sample - timepoint pairs
if(is.null(corr.df)){
corr.df<-corr.pairs.mc
pval.df<-corr.pairs.pval
} else {
corr.df<-merge(corr.df,corr.pairs.mc,by=c("Gene1","Gene2"),
all=TRUE)
pval.df<-merge(pval.df,corr.pairs.pval,by=c("Gene1","Gene2"),
all=TRUE)
}

} else {
print(paste("Skip timepoint",timepoint,"(too less metacells)"))
}
}
}

write.table(corr.df,
file=paste0("correlation_r_",outputSuffix,".tsv"),
sep="\t",quote=FALSE)
write.table(pval.df,
file=paste0("correlation_pval_",outputSuffix,".tsv"),
sep="\t",quote=FALSE)
Loading

0 comments on commit 92e842a

Please sign in to comment.