Skip to content

Commit

Permalink
Merge pull request #97 from stjude/p8-egs
Browse files Browse the repository at this point in the history
calculated effective genome size using for SICER & MACS
  • Loading branch information
madetunj authored Feb 23, 2022
2 parents 363db47 + 498289c commit a64289b
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 40 deletions.
21 changes: 17 additions & 4 deletions seaseq-case.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,12 @@ workflow seaseq {
reference=reference
}
call util.effective_genome_size as egs {
# effective genome size for FASTA
input :
reference=reference
}
# Process FASTQs
if ( defined(sample_fastq) ) {

Expand Down Expand Up @@ -409,9 +415,11 @@ workflow seaseq {
call macs.macs {
input :
bamfile=sample_bam,
pvalue = "1e-9",
pvalue="1e-9",
keep_dup="auto",
egs=egs.genomesize,
default_location=sub(basename(sample_bam),'.sorted.b.*$','') + '/PEAKS/NARROW_peaks' + '/' + basename(sample_bam,'.bam') + '-p9_kd-auto'
}
call util.addreadme {
Expand All @@ -422,16 +430,19 @@ workflow seaseq {
call macs.macs as all {
input :
bamfile=sample_bam,
pvalue = "1e-9",
pvalue="1e-9",
keep_dup="all",
egs=egs.genomesize,
default_location=sub(basename(sample_bam),'.sorted.b.*$','') + '/PEAKS/NARROW_peaks' + '/' + basename(sample_bam,'.bam') + '-p9_kd-all'
}
call macs.macs as nomodel {
input :
bamfile=sample_bam,
nomodel=true,
egs=egs.genomesize,
default_location=sub(basename(sample_bam),'.sorted.b.*$','') + '/PEAKS/NARROW_peaks' + '/' + basename(sample_bam,'.bam') + '-nm'
}
call bamtogff.bamtogff {
Expand All @@ -452,7 +463,9 @@ workflow seaseq {
input :
bedfile=forsicerbed.bedfile,
chromsizes=samtools_faidx.chromsizes,
genome_fraction=egs.genomefraction,
default_location=sub(basename(sample_bam),'.sorted.b.*$','') + '/PEAKS/BROAD_peaks'
}
call rose.rose {
Expand Down Expand Up @@ -743,11 +756,11 @@ workflow seaseq {
File? pdf_gene = bamtogff.pdf_gene
File? pdf_h_gene = bamtogff.pdf_h_gene
File? png_h_gene = bamtogff.png_h_gene
File? jpg_h_gene = bamtogff.jpg_h_gene
File? jpg_h_gene = bamtogff.jpg_h_gene
File? pdf_promoters = bamtogff.pdf_promoters
File? pdf_h_promoters = bamtogff.pdf_h_promoters
File? png_h_promoters = bamtogff.png_h_promoters
File? jpg_h_promoters = bamtogff.jpg_h_promoters
File? jpg_h_promoters = bamtogff.jpg_h_promoters

#PEAKS-ANNOTATION
File? peak_promoters = peaksanno.peak_promoters
Expand Down
24 changes: 20 additions & 4 deletions seaseq-control.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,12 @@ workflow seaseq {
reference=reference
}
call util.effective_genome_size as egs {
# effective genome size for FASTA
input :
reference=reference
}
# Process FASTQs
if ( defined(sample_fastq) ) {

Expand Down Expand Up @@ -661,10 +667,12 @@ workflow seaseq {
input :
bamfile=sample_bam,
control=control_bam,
pvalue = "1e-9",
pvalue="1e-9",
keep_dup="auto",
egs=egs.genomesize,
output_name = if defined(results_name) then results_name + '-p9_kd-auto' else basename(sample_bam,'.bam') + '+control-p9_kd-auto',
default_location = if defined(results_name) then results_name + '/PEAKS/NARROW_peaks/' + results_name + '-p9_kd-auto' else sub(basename(sample_bam),'.sorted.b.*$','') + '+control/PEAKS/NARROW_peaks/' + basename(sample_bam,'.bam') + '+control-p9_kd-auto'
}
call util.addreadme {
Expand All @@ -676,8 +684,9 @@ workflow seaseq {
input :
bamfile=sample_bam,
control=control_bam,
pvalue = "1e-9",
pvalue="1e-9",
keep_dup="all",
egs=egs.genomesize,
output_name = if defined(results_name) then results_name + '-p9_kd-all' else basename(sample_bam,'.bam') + '+control-p9_kd-all',
default_location = if defined(results_name) then results_name + '/PEAKS/NARROW_peaks/' + results_name + '-p9_kd-all' else sub(basename(sample_bam),'.sorted.b.*$','') + '+control/PEAKS/NARROW_peaks/' + basename(sample_bam,'.bam') + '+control-p9_kd-all'
}
Expand All @@ -687,6 +696,7 @@ workflow seaseq {
bamfile=sample_bam,
control=control_bam,
nomodel=true,
egs=egs.genomesize,
output_name = if defined(results_name) then results_name + '-nm' else basename(sample_bam,'.bam') + '+control-nm',
default_location = if defined(results_name) then results_name + '/PEAKS/NARROW_peaks/' + results_name + '-nm' else sub(basename(sample_bam),'.sorted.b.*$','') + '+control/PEAKS/NARROW_peaks/' + basename(sample_bam,'.bam') + '+control-nm'
}
Expand Down Expand Up @@ -718,8 +728,10 @@ workflow seaseq {
bedfile=s_forsicerbed.bedfile,
control_bed=c_forsicerbed.bedfile,
chromsizes=samtools_faidx.chromsizes,
genome_fraction=egs.genomefraction,
outputname=if defined(results_name) then results_name else basename(s_forsicerbed.bedfile,'.bed') + '+control',
default_location=if defined(results_name) then results_name + '/PEAKS/BROAD_peaks' else sub(basename(sample_bam),'.sorted.b.*$','') + '+control/PEAKS/BROAD_peaks'
}
call rose.rose {
Expand Down Expand Up @@ -874,18 +886,22 @@ workflow seaseq {
call macs.macs as only_s_macs {
input :
bamfile=sample_bam,
pvalue = "1e-9",
pvalue="1e-9",
keep_dup="auto",
egs=egs.genomesize,
default_location='SAMPLE/' + sub(basename(sample_bam),'.sorted.b.*$','') + '/PEAKS_forQC/' + basename(sample_bam,'.bam') + '-p9_kd-auto'
}
#Peak Calling for Control BAM only
call macs.macs as only_c_macs {
input :
bamfile=control_bam,
pvalue = "1e-9",
pvalue="1e-9",
keep_dup="auto",
egs=egs.genomesize,
default_location='CONTROL/' + sub(basename(control_bam),'.sorted.b.*$','') + '/PEAKS_forQC/' + basename(control_bam,'.bam') + '-p9_kd-auto'
}
call bedtools.bamtobed as only_c_finalbed {
Expand Down
3 changes: 3 additions & 0 deletions workflows/tasks/macs.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ task macs {

String pvalue = '1e-9'
String keep_dup = 'auto'
String egs = 'hs'
Int space = 50
Int shiftsize = 200
Boolean wiggle = true
Expand All @@ -32,6 +33,7 @@ task macs {
-t ~{bamfile} \
~{if defined(control) then "-c" + control else ""} \
--space=~{space} \
--gsize=~{egs} \
--shiftsize=~{shiftsize} \
~{true="-w" false="" wiggle} \
~{true="-S" false="" single_profile} \
Expand All @@ -47,6 +49,7 @@ task macs {
-p ~{pvalue} \
--keep-dup=~{keep_dup} \
--space=~{space} \
--gsize=~{egs} \
~{true="-w" false="" wiggle} \
~{true="-S" false="" single_profile} \
-n ~{output_name} \
Expand Down
73 changes: 41 additions & 32 deletions workflows/tasks/util.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ task basicfastqstats {
runtime {
memory: ceil(memory_gb * ncpu) + " GB"
maxRetries: max_retries
docker: 'abralab/seaseq:v2.2.0'
cpu: ncpu
}
output {
Expand Down Expand Up @@ -88,6 +89,7 @@ task flankbed {
runtime {
memory: ceil(memory_gb * ncpu) + " GB"
maxRetries: max_retries
docker: 'abralab/seaseq:v2.2.0'
cpu: ncpu
}
output {
Expand Down Expand Up @@ -115,7 +117,7 @@ task summaryreport {
command <<<
# Printing header
head -n 121 /usr/local/bin/scripts/seaseq_overall.header > ~{outputfile}
head -n 121 /usr/local/bin/seaseq_overall.header > ~{outputfile}
if [ -f "~{sampleqc_html}" ]; then
# Printing Sample Quality Reports
echo '<h2>Sample FASTQs Quality Results</h2>' >> ~{outputfile}
Expand Down Expand Up @@ -148,7 +150,7 @@ task summaryreport {
echo "<p><b>*</b> Peaks identified after Input/Control correction.</p>" >> ~{outputfile}
fi
echo '</div>' >> ~{outputfile}
tail -n 13 /usr/local/bin/scripts/seaseq_overall.header >> ~{outputfile}
tail -n 13 /usr/local/bin/seaseq_overall.header >> ~{outputfile}
echo -e '\n' >> ~{outputtxt}
>>>
Expand Down Expand Up @@ -209,11 +211,11 @@ task evalstats {
~{if defined(superenhancers) then "-rs " + superenhancers else ""} \
-outfile ~{outputfile}
head -n 245 /usr/local/bin/scripts/seaseq_overall.header | tail -n 123 > ~{outputhtml}
head -n 245 /usr/local/bin/seaseq_overall.header | tail -n 123 > ~{outputhtml}
cat ~{outputhtml}x >> ~{outputhtml}
sed -i "s/SEAseq Sample FASTQ Report/SEAseq ~{fastq_type} Report/" ~{outputhtml}
echo '</table></div>' >> ~{outputhtml}
tail -n 13 /usr/local/bin/scripts/seaseq_overall.header >> ~{outputhtml}
tail -n 13 /usr/local/bin/seaseq_overall.header >> ~{outputhtml}
>>>
Expand Down Expand Up @@ -370,13 +372,13 @@ task mergehtml {
mkdir -p ~{default_location} && cd ~{default_location}
#extract header information
head -n 245 /usr/local/bin/scripts/seaseq_overall.header | tail -n 123 > ~{outputfile}
head -n 245 /usr/local/bin/seaseq_overall.header | tail -n 123 > ~{outputfile}
mergeoutput=$(cat ~{sep='; tail -n 1 ' htmlfiles})
echo $mergeoutput >> ~{outputfile}
sed -i "s/SEAseq Sample FASTQ Report/SEAseq ~{fastq_type} Report/" ~{outputfile}
echo '</table></div>' >> ~{outputfile}
tail -n 13 /usr/local/bin/scripts/seaseq_overall.header >> ~{outputfile}
tail -n 13 /usr/local/bin/seaseq_overall.header >> ~{outputfile}
echo $mergeoutput > ~{outputfile}x
Expand All @@ -399,30 +401,6 @@ task mergehtml {
}
}
task linkname {
input {
String prefix
File in_fastq
Int memory_gb = 10
Int max_retries = 1
Int ncpu = 1
}
command <<<
mv ~{in_fastq} ~{prefix}.fastq.gz
>>>
runtime {
memory: ceil(memory_gb * ncpu) + " GB"
maxRetries: max_retries
cpu: ncpu
}
output {
File out_fastq = "~{prefix}.fastq.gz"
}
}
task concatstats {
input {
File sample_config
Expand Down Expand Up @@ -541,16 +519,17 @@ task concatstats {
CODE
head -n 245 /usr/local/bin/scripts/seaseq_overall.header | tail -n 123 > ~{outputfile}-stats.html
head -n 245 /usr/local/bin/seaseq_overall.header | tail -n 123 > ~{outputfile}-stats.html
cat ~{outputfile}-stats.htmlx >> ~{outputfile}-stats.html
echo "</table><p><b>*</b> Peaks identified after Input/Control correction.</p></div>" >> ~{outputfile}-stats.html
tail -n 13 /usr/local/bin/scripts/seaseq_overall.header >> ~{outputfile}-stats.html
tail -n 13 /usr/local/bin/seaseq_overall.header >> ~{outputfile}-stats.html
sed -i "s/SEAseq Sample FASTQ Report/SEAseq Comprehensive Report/" ~{outputfile}-stats.html
>>>
runtime {
memory: ceil(memory_gb * ncpu) + " GB"
maxRetries: max_retries
docker: 'abralab/seaseq:v2.2.0'
cpu: ncpu
}
output {
Expand Down Expand Up @@ -590,9 +569,39 @@ task addreadme {
runtime {
memory: ceil(memory_gb * ncpu) + " GB"
maxRetries: max_retries
docker: 'abralab/seaseq:v2.2.0'
cpu: ncpu
}
output {
File readme_peaks = "~{default_location}/~{output_file}"
}
}
task effective_genome_size {
# Calculate effective genome size and fraction
input {
File reference
Int memory_gb = 5
Int max_retries = 1
Int ncpu = 1
}
command <<<
faCount -summary ~{reference} | cut -f3,4,5,6 | tail -n 2 > genomeinformation.txt
head -n 1 genomeinformation.txt | awk -F'[\t ]' '{print $1 + $2 + $3 + $4}' > genomesize
tail -n 1 genomeinformation.txt | awk -F'[\t ]' '{print $1 + $2 + $3 + $4}' > genomefraction
>>>
runtime {
memory: ceil(memory_gb * ncpu) + " GB"
maxRetries: max_retries
docker: 'abralab/kentutils:latest'
cpu: ncpu
}
output {
Float genomefraction = read_float('genomefraction')
String genomesize = read_string('genomesize')
}
}

0 comments on commit a64289b

Please sign in to comment.