Commit af5661a

Extensive changes to exome pipeline, bamstats and exploratory analysis

jscaber committed Sep 20, 2023
1 parent 3231f31 commit af5661a

Showing 5 changed files with 175 additions and 53 deletions.
21 changes: 1 addition & 20 deletions cgatpipelines/Rtools/exploratory.R

@@ -74,26 +74,7 @@ run <- function(opt) {
dds = DESeqDataSetFromMatrix(experiment$counts, experiment$sample, design = formula(opt$model))
}
futile.logger::flog.info(paste("reading Experiment object", paste(dim(counts(experiment)), collapse = ",")))

-### SVA - ANALYSIS OF BIASES ###
-futile.logger::flog.info(paste("Performing Surrogate Variable Analysis"))
-dds <- estimateSizeFactors(dds)
-dat <- counts(dds, normalized = TRUE)
-idx <- rowMeans(dat) > 1
-dat <- dat[idx, ]
-mod <- model.matrix(formula(opt$model), colData(dds))
-mod0 <- model.matrix(~ 1, colData(dds))
-svseq <- svaseq(dat, mod, mod0, n.sv = 2)
-for(factor in opt$factors){
-start_plot(paste0('SVA for ', factor), outdir=opt$outdir)
-par(mfrow = c(2, 1), mar = c(3,5,3,1))
-for (i in 1:2) {
-stripchart(svseq$sv[, i] ~ colData(dds)[, factor], vertical = TRUE, main = paste0("SV", i))
-abline(h = 0)
-}
-end_plot()
-}


### TRANSFORMATION OF DATA ###
futile.logger::flog.info(paste("Transforming data"))
rld<- rlog(dds)
14 changes: 7 additions & 7 deletions cgatpipelines/tasks/bamstats.py

@@ -67,7 +67,7 @@ def buildPicardInsertSizeStats(infile, outfile, genome_file,
Filename with genomic sequence.
'''
job_memory = picardmem
-picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
+picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC' % locals()
job_threads = 3

if BamTools.getNumReads(infile) == 0:
@@ -159,7 +159,7 @@ def buildPicardAlignmentStats(infile, outfile, genome_file,
'''

job_memory = picardmem
-picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
+picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals()
job_threads = 3

if BamTools.getNumReads(infile) == 0:
@@ -191,7 +191,7 @@ def buildPicardDuplicationStats(infile, outfile, picardmem):
'''

job_memory = picardmem
-picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
+picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals()
job_threads = 3

if BamTools.getNumReads(infile) == 0:
@@ -244,7 +244,7 @@ def buildPicardDuplicateStats(infile, outfile, picardmem):
Output filename with picard output.
'''
job_memory = picardmem
-picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
+picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals()
job_threads = 3

if BamTools.getNumReads(infile) == 0:
@@ -279,7 +279,7 @@ def buildPicardCoverageStats(infile, outfile, baits, regions,
'''

job_memory = picardmem
-picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
+picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals()
job_threads = 3

if BamTools.getNumReads(infile) == 0:
@@ -310,7 +310,7 @@ def buildPicardGCStats(infile, outfile, genome_file, picardmem):
"""

job_memory = picardmem
-picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
+picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals()
job_threads = 3

if BamTools.getNumReads(infile) == 0:
@@ -1291,7 +1291,7 @@ def buildPicardRnaSeqMetrics(infiles, strand, outfile, picardmem):
'''
job_memory = picardmem
-picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
+picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals()
job_threads = 3
infile, genome, rRNA_intervals = infiles

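Note on the recurring picard_opts change above: ParNew and CMS (the collectors behind -XX:+UseParNewGC and -XX:+UseConcMarkSweepGC) were deprecated in JDK 9 and removed in later releases, and an unrecognised GC flag prevents the JVM from starting at all, so the old options break Picard on current Java runtimes. G1 (-XX:+UseG1GC) has been the default collector since JDK 9.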
20 changes: 20 additions & 0 deletions cgatpipelines/tools/pipeline_bamstats.py

@@ -356,6 +356,26 @@ def buildPicardDuplicationStats(infile, outfile):
PICARD_MEMORY)


+@follows(mkdir("RSeQC.dir"))
+@follows(countReads)
+@transform(intBam,
+regex("BamFiles.dir/(.*).bam$"),
+r"RSeQC.dir/\1.read_distribution")
+def runRSeQC(infile, outfile):
+
+bed_file = "/ceph/project/talbotlab/jscaber/iPS-C9TrapYan/bamstats-fullgenome/hg38_GENCODE.v38.bed"
+
+job_memory = "6G"
+
+statement = '''
+read_distribution.py
+-i %(infile)s
+-r %(bed_file)s
+> %(outfile)s
+'''
+
+P.run(statement)
+
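RSeQC's read_distribution.py tallies how mapped reads fall across genomic features (CDS exons, UTRs, introns, intergenic regions) given a BED12 gene model. The gene model path above is hard-coded to a machine-specific location; a minimal sketch of reading it from the pipeline configuration instead, assuming a hypothetical rseqc_bed key in pipeline.yml:

    # hypothetical: take the gene-model BED from pipeline.yml rather than hard-coding it
    bed_file = PARAMS["rseqc_bed"]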
@follows(mkdir("BamStats.dir"))
@follows(countReads)
@transform(intBam,
171 changes: 146 additions & 25 deletions cgatpipelines/tools/pipeline_exome.py

@@ -164,7 +164,7 @@
from ruffus import transform, mkdir, follows, merge, regex, suffix, \
jobs_limit, files, collate, add_inputs, formatter, \
active_if, originate, subdivide
-from ruffus.combinatorics import permutations
+from ruffus.combinatorics import permutations, product
import sys
import os
import csv
@@ -201,6 +201,8 @@

PANELS = glob.glob("*.panel.tsv")

+FILTERS = glob.glob("*.filter.tsv")


###############################################################################
###############################################################################
@@ -277,7 +279,7 @@ def createEnsemblDictionary(outfile):
@subdivide([x for x in PANELS],
regex("(\S+).panel.tsv"),
add_inputs(createEnsemblDictionary),
r"\1.bed")
r"panels/\1.bed")
def createPanelBed(infiles, outfile):

geneset = PARAMS['geneset']
@@ -713,13 +715,34 @@ def ancestrySomalier(infile,outfile):
P.run(statement)


+###############################################################################
+# Expansionhunter
+
+
+@follows(mkdir("expansionhunter"))
+@transform(RemoveDuplicatesSample, regex(r"gatk/(\S+).dedup2.bam"),
+r"expansionhunter/\1.vcf")
+def expansionHunter(infile, outfile):
+
+genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa"
+catalog = PARAMS["expansionhunter"]
+outfile = P.snip(outfile)
+
+statement = '''ExpansionHunter
+--reads %(infile)s
+--reference %(genome)s
+--variant-catalog %(catalog)s
+--output-prefix %(outfile)s
+> %(outfile)s.log 2>&1
+'''
+P.run(statement, job_condaenv="expansionhunter")

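ExpansionHunter genotypes the short tandem repeat loci listed in the variant catalog directly from the deduplicated BAM. P.snip strips the .vcf suffix so --output-prefix receives expansionhunter/<sample>; the tool then writes <prefix>.vcf (the task's target) and, in recent versions, a JSON profile and a realigned BAM alongside it.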
###############################################################################
# VEP

@transform(applyVQSRIndels,
regex(r"variants/all_samples.vqsr.vcf.gz"),
r"variants/all_samples.vep.txt.gz")
r"variants/all_samples.vep.vcf.gz")
def annotateVariantsVEP(infile, outfile):

#Adds annotations as specified in the pipeline.yml using Ensembl
@@ -744,11 +767,12 @@ def annotateVariantsVEP(infile, outfile):

statement = '''vep
-i %(infile)s
---cache --dir %(vep_cache)s --tab
+--cache --dir %(vep_cache)s --vcf
--species %(vep_species)s
--fork %(job_threads)s
--compress_output bgzip
--assembly %(vep_assembly)s
+--plugin SameCodon
-o %(outfile)s --force_overwrite'''
if dbNSFP_path is not None:
statement +='''
@@ -763,30 +787,62 @@ def annotateVariantsVEP(infile, outfile):
P.run(statement)

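Two effects of the VEP changes above: emitting VCF rather than tab-delimited output (--vcf) keeps the annotations in a format that the downstream filter_vep, GATK SelectVariants and bcftools query steps can all consume, and SameCodon is a VEP plugin that reports known variants falling in the same codon as the query variant.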

-@transform(annotateVariantsVEP, regex("variants/all_samples.vep.vcf.gz"),
-r'all_samples.filtered.vcf.gz')
-def filterVEP(infile, outfiles):

+@subdivide([x for x in FILTERS],
+regex("(\S+).filter.tsv"),
+add_inputs(annotateVariantsVEP),
+r"variants/all_samples.\1.vcf.gz")
+def filterVEP(infiles, outfile):
'''
Filter variants using vep-filter.
More docs
More docs
'''

-filters = PARAMS["annotation"]
+filter_file,infile = infiles
+with open(filter_file) as f:
+filters = f.read()
+outfile = P.snip(outfile, ".gz")

statement = '''filter_vep
--gz
-i %(infile)s
-o %(outfile)s
---only-matched
---force-overwrite
--f %(filter)s
+--only_matched
+--force_overwrite
+-f "%(filters)s"
+> %(outfile)s.log 2>&1;
+bgzip %(outfile)s;
+tabix -p vcf %(outfile)s.gz
'''
P.run(statement)

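Each *.filter.tsv file is read verbatim and passed to filter_vep -f, so it should hold a single filter expression in filter_vep syntax. A hypothetical example (the usable field names depend on which VEP annotations are configured):

    CANONICAL is YES and Consequence match missense and gnomADg_AF < 0.001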

+@product(filterVEP, formatter("variants/all_samples.(?P<FILTER>.*).vcf.gz$"),
+createPanelBed, formatter("panels/(?P<PANEL>.*).bed$"),
+"results/{FILTER[0][0]}/{PANEL[1][0]}.vcf.gz")
+def filterPanel(infiles, outfile):
+'''
+Subset each filtered VCF to the regions of a gene panel
+using GATK SelectVariants.
+'''
+vcf, panelbed = infiles
+outfile = P.snip(outfile, ".gz")
+
+statement = '''gatk
+SelectVariants
+-V %(vcf)s
+-O %(outfile)s
+-L %(panelbed)s
+> %(outfile)s.log 2>&1;
+bgzip %(outfile)s;
+tabix -p vcf %(outfile)s.gz
+'''
+P.run(statement)

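Because @product crosses every filterVEP output with every panel BED, one SelectVariants job runs per (filter, panel) combination: with, say, a rare.filter.tsv and a cardiac.panel.tsv, the subset restricted to the panel's intervals via -L lands in results/rare/cardiac.vcf.gz.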

-@follows(mkdir("variant_tables"))
-@transform(RemoveDuplicatesSample, regex(r"gatk/(.*).dedup2.bam"),
-add_inputs(filterVEP), r"variant_tables/\1.tsv")
+@product(RemoveDuplicatesSample, formatter("gatk/(?P<TRACK>.*).dedup2.bam$"),
+filterPanel, formatter("results/(?P<FILTER>.*)/(?P<PANEL>.*).vcf.gz$"),
+"results/{FILTER[1][0]}/{PANEL[1][0]}/{TRACK[0][0]}.tsv")
def makeAnnotationsTables(infiles, outfile):
'''
Converts the multi sample vcf into
@@ -814,27 +870,92 @@ def makeAnnotationsTables(infiles, outfile):
cols2.append(line.split("\t")[0])
l1 = "\t".join(cols2)
l2 = "\t".join(colds)
-out = open(outfile, "w")
+os.unlink(TF)
+out = open(TF, "w")
out.write('''CHROM\tPOS\tQUAL\tID\tFILTER\tREF1\tALT\tGT\t%s\n\
chromosome\tposition\tquality\tid\tfilter\tref\talt\t\
genotype\t%s\n''' % (l1, l2))
out.close()
cstring = "\\t".join(cols)
cstring = "%CHROM\\t%POS\\t%QUAL\\t%ID\\t%FILTER\\t%REF\\t\
%ALT\\t[%GT]\\t" + cstring
-if PARAMS['test'] == 1:
-statement = '''bcftools query
--f '%(cstring)s\\n'
--i 'FILTER=="PASS" && GT!="0/0" && GT!="./."'
-%(inputvcf)s >> %(outfile)s'''
+statement = '''bcftools query -s %(samplename)s
+-f '%(cstring)s\\n'
+-i 'FILTER=="PASS" && GT!="0/0" && GT!="./."'
+%(inputvcf)s >> %(TF)s'''
P.run(statement)

+maxInt = sys.maxsize
+while True:
+# decrease the maxInt value by factor 10
+# as long as the OverflowError occurs.
+try:
+csv.field_size_limit(maxInt)
+break
+except OverflowError:
+maxInt = int(maxInt/10)
+
+df = pd.read_table(TF, sep = '\t', engine='python', quoting=3)
+CSQ_index = df['CSQ'][0].split("Format:_")[-1].rstrip('\"')
+df = df.drop(0)
+
+if len(df)==0:
+df.drop('CSQ',axis=1,inplace=True)
+temp_df = pd.DataFrame(columns=CSQ_index.split("|"))
else:
-statement = '''bcftools query -s %(samplename)s
--f '%(cstring)s\\n'
--i 'FILTER=="PASS" && GT!="0/0" && GT!="./."'
-%(inputvcf)s >> %(outfile)s'''
+df['CSQ'] = df.CSQ.str.split(",")
+df = df.explode('CSQ').reset_index(drop=True)
+temp_df = df.CSQ.str.split("|", expand = True)
+temp_df.columns = CSQ_index.split("|")
+df.drop('CSQ',axis=1,inplace=True)
+df = df.join(temp_df, rsuffix="_VEP")
+if "SYMBOL" in df.columns:
+col = df.pop("SYMBOL")
+df.insert(0, "SYMBOL", col)
+if "HGVSp_VEP" in df.columns:
+col = df.pop("HGVSp_VEP")
+df.insert(0, "HGVSp_VEP", col)
+if len(df)!=0 and "CANONICAL" in df.columns and "PolyPhen" in df.columns:
+# Select canonical variant, then most deleterious
+df_final = df.sort_values(by = ['CANONICAL', 'PolyPhen'], ascending = [False, False]).drop_duplicates(["CHROM", "POS"])
+df_final.to_csv(outfile, sep="\t")
+outfile_full = P.snip(outfile, ".tsv") + ".tsv_full"
+df.to_csv(outfile_full, sep="\t")
+else:
+df.to_csv(outfile, sep="\t")
+os.unlink(TF)


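The reshaping above turns bcftools' comma-joined, pipe-delimited CSQ strings into one row per transcript consequence. A minimal self-contained sketch of the same pandas pattern, with toy field names standing in for the pipeline's real VEP header:

    import pandas as pd

    # one variant row carrying two comma-separated transcript annotations
    df = pd.DataFrame({"POS": [12345],
                       "CSQ": ["missense|GENE1|YES,intron|GENE1|NO"]})
    df["CSQ"] = df.CSQ.str.split(",")               # list of per-transcript strings
    df = df.explode("CSQ").reset_index(drop=True)   # one row per transcript
    csq = df.CSQ.str.split("|", expand=True)        # split the pipe-delimited fields
    csq.columns = ["Consequence", "SYMBOL", "CANONICAL"]
    df = df.drop(columns="CSQ").join(csq)
    print(df)                                       # two rows, one per consequence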

+@collate(makeAnnotationsTables, formatter("results/(?P<FILTER>.*)/(?P<PANEL>.*)/.*.tsv$"),
+"results/{FILTER[0]}/{PANEL[0]}_merged.tsv")
+def mergeAnnotationsTables(infiles, outfile):
+
+result = False
+for infile in infiles:
+lines = 0
+with open(infile) as f:
+for line in f:
+lines = lines + 1
+if lines > 1:
+result = True
+break
+
+path = os.path.dirname(infiles[1])
+infiles = " ".join(infiles)
+
+if result == True:
+statement = ''' cgat tables2table %(infiles)s
+--cat patient_id
+--regex-filename '%(path)s/(\S+).tsv'
+| awk '!/^#/{print > "%(outfile)s" ;next} 1'
+> %(outfile)s.log 2>&1'''
+else:
+statement = '''touch %(outfile)s'''
+P.run(statement)

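The line-count check makes the merge a no-op (an empty touch) when every per-sample table contains only its header row; otherwise cgat tables2table concatenates the tables, deriving a patient_id column from each filename via --regex-filename, and the awk filter writes the table body to the merged output while letting comment lines fall through to the log.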


###############################################################################
# Quality Filtering #

2 changes: 1 addition & 1 deletion cgatpipelines/tools/pipeline_genesets.py

@@ -638,7 +638,7 @@ def buildCdsTranscript(infile, outfile):


@transform(buildUCSCGeneSet,
regex("*.gtf.gz"),
regex(".*.gtf.gz"),
PARAMS['interface_geneset_exons_gtf'])
def buildExonTranscript(infile, outfile):
'''
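The fix above matters because "*.gtf.gz" is not a valid regular expression (a leading * has nothing to repeat, so Python's re module rejects the pattern), whereas ".*.gtf.gz" compiles and matches the intended filenames.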
