Merge pull request #97 from stjude/p8-egs

calculated effective genome size using for SICER & MACS
stjude · Feb 23, 2022 · a64289b · a64289b
2 parents 363db47 + 498289c
commit a64289b
Show file tree

Hide file tree

Showing 4 changed files with 81 additions and 40 deletions.
diff --git a/seaseq-case.wdl b/seaseq-case.wdl
@@ -181,6 +181,12 @@ workflow seaseq {
             reference=reference
     }
 
+    call util.effective_genome_size as egs {
+        # effective genome size for FASTA
+        input :
+            reference=reference
+    }
+
     # Process FASTQs
     if ( defined(sample_fastq) ) {
 
@@ -409,9 +415,11 @@ workflow seaseq {
     call macs.macs {
         input :
             bamfile=sample_bam,
-            pvalue = "1e-9",
+            pvalue="1e-9",
             keep_dup="auto",
+            egs=egs.genomesize,
             default_location=sub(basename(sample_bam),'.sorted.b.*$','') + '/PEAKS/NARROW_peaks' + '/' + basename(sample_bam,'.bam') + '-p9_kd-auto'
+
     }
 
     call util.addreadme {
@@ -422,16 +430,19 @@ workflow seaseq {
     call macs.macs as all {
         input :
             bamfile=sample_bam,
-            pvalue = "1e-9",
+            pvalue="1e-9",
             keep_dup="all",
+            egs=egs.genomesize,
             default_location=sub(basename(sample_bam),'.sorted.b.*$','') + '/PEAKS/NARROW_peaks' + '/' + basename(sample_bam,'.bam') + '-p9_kd-all'
     }
 
     call macs.macs as nomodel {
         input :
             bamfile=sample_bam,
             nomodel=true,
+            egs=egs.genomesize,
             default_location=sub(basename(sample_bam),'.sorted.b.*$','') + '/PEAKS/NARROW_peaks' + '/' + basename(sample_bam,'.bam') + '-nm'
+
     }
 
     call bamtogff.bamtogff {
@@ -452,7 +463,9 @@ workflow seaseq {
         input :
             bedfile=forsicerbed.bedfile,
             chromsizes=samtools_faidx.chromsizes,
+            genome_fraction=egs.genomefraction,
             default_location=sub(basename(sample_bam),'.sorted.b.*$','') + '/PEAKS/BROAD_peaks'
+
     }
 
     call rose.rose {
@@ -743,11 +756,11 @@ workflow seaseq {
         File? pdf_gene = bamtogff.pdf_gene
         File? pdf_h_gene = bamtogff.pdf_h_gene
         File? png_h_gene = bamtogff.png_h_gene
-	File? jpg_h_gene = bamtogff.jpg_h_gene
+        File? jpg_h_gene = bamtogff.jpg_h_gene
         File? pdf_promoters = bamtogff.pdf_promoters
         File? pdf_h_promoters = bamtogff.pdf_h_promoters
         File? png_h_promoters = bamtogff.png_h_promoters
-	File? jpg_h_promoters = bamtogff.jpg_h_promoters
+        File? jpg_h_promoters = bamtogff.jpg_h_promoters
 
         #PEAKS-ANNOTATION
         File? peak_promoters = peaksanno.peak_promoters

diff --git a/seaseq-control.wdl b/seaseq-control.wdl
@@ -206,6 +206,12 @@ workflow seaseq {
             reference=reference
     }
 
+    call util.effective_genome_size as egs {
+        # effective genome size for FASTA
+        input :
+            reference=reference
+    }
+    
     # Process FASTQs
     if ( defined(sample_fastq) ) {
 
@@ -661,10 +667,12 @@ workflow seaseq {
         input :
             bamfile=sample_bam,
             control=control_bam,
-            pvalue = "1e-9",
+            pvalue="1e-9",
             keep_dup="auto",
+            egs=egs.genomesize,
             output_name = if defined(results_name) then results_name + '-p9_kd-auto' else basename(sample_bam,'.bam') + '+control-p9_kd-auto',
             default_location = if defined(results_name) then results_name + '/PEAKS/NARROW_peaks/' + results_name + '-p9_kd-auto' else sub(basename(sample_bam),'.sorted.b.*$','') + '+control/PEAKS/NARROW_peaks/' + basename(sample_bam,'.bam') + '+control-p9_kd-auto'
+
     }
 
     call util.addreadme {
@@ -676,8 +684,9 @@ workflow seaseq {
         input :
             bamfile=sample_bam,
             control=control_bam,
-            pvalue = "1e-9",
+            pvalue="1e-9",
             keep_dup="all",
+            egs=egs.genomesize,
             output_name = if defined(results_name) then results_name + '-p9_kd-all' else basename(sample_bam,'.bam') + '+control-p9_kd-all',
             default_location = if defined(results_name) then results_name + '/PEAKS/NARROW_peaks/' + results_name + '-p9_kd-all' else sub(basename(sample_bam),'.sorted.b.*$','') + '+control/PEAKS/NARROW_peaks/' + basename(sample_bam,'.bam') + '+control-p9_kd-all'
     }
@@ -687,6 +696,7 @@ workflow seaseq {
             bamfile=sample_bam,
             control=control_bam,
             nomodel=true,
+            egs=egs.genomesize,
             output_name = if defined(results_name) then results_name + '-nm' else basename(sample_bam,'.bam') + '+control-nm',
             default_location = if defined(results_name) then results_name + '/PEAKS/NARROW_peaks/' + results_name + '-nm' else sub(basename(sample_bam),'.sorted.b.*$','') + '+control/PEAKS/NARROW_peaks/' + basename(sample_bam,'.bam') + '+control-nm'
     }
@@ -718,8 +728,10 @@ workflow seaseq {
             bedfile=s_forsicerbed.bedfile,
             control_bed=c_forsicerbed.bedfile,
             chromsizes=samtools_faidx.chromsizes,
+            genome_fraction=egs.genomefraction,
             outputname=if defined(results_name) then results_name else basename(s_forsicerbed.bedfile,'.bed') + '+control',
             default_location=if defined(results_name) then results_name + '/PEAKS/BROAD_peaks' else sub(basename(sample_bam),'.sorted.b.*$','') + '+control/PEAKS/BROAD_peaks'
+
     }
 
     call rose.rose {
@@ -874,18 +886,22 @@ workflow seaseq {
     call macs.macs as only_s_macs {
         input :
             bamfile=sample_bam,
-            pvalue = "1e-9",
+            pvalue="1e-9",
             keep_dup="auto",
+            egs=egs.genomesize,
             default_location='SAMPLE/' + sub(basename(sample_bam),'.sorted.b.*$','') + '/PEAKS_forQC/' + basename(sample_bam,'.bam') + '-p9_kd-auto'
+
     }
 
     #Peak Calling for Control BAM only
     call macs.macs as only_c_macs {
         input :
             bamfile=control_bam,
-            pvalue = "1e-9",
+            pvalue="1e-9",
             keep_dup="auto",
+            egs=egs.genomesize,
             default_location='CONTROL/' + sub(basename(control_bam),'.sorted.b.*$','') + '/PEAKS_forQC/' + basename(control_bam,'.bam') + '-p9_kd-auto'
+
     }
 
     call bedtools.bamtobed as only_c_finalbed {

diff --git a/workflows/tasks/macs.wdl b/workflows/tasks/macs.wdl
@@ -12,6 +12,7 @@ task macs {
 
         String pvalue = '1e-9'
         String keep_dup = 'auto'
+        String egs = 'hs'
         Int space = 50
         Int shiftsize = 200
         Boolean wiggle = true
@@ -32,6 +33,7 @@ task macs {
                 -t ~{bamfile} \
                 ~{if defined(control) then "-c" + control else ""} \
                 --space=~{space} \
+                --gsize=~{egs} \
                 --shiftsize=~{shiftsize} \
                 ~{true="-w" false="" wiggle} \
                 ~{true="-S" false="" single_profile} \
@@ -47,6 +49,7 @@ task macs {
                 -p ~{pvalue} \
                 --keep-dup=~{keep_dup} \
                 --space=~{space} \
+                --gsize=~{egs} \
                 ~{true="-w" false="" wiggle} \
                 ~{true="-S" false="" single_profile} \
                 -n ~{output_name} \

diff --git a/workflows/tasks/util.wdl b/workflows/tasks/util.wdl
@@ -44,6 +44,7 @@ task basicfastqstats {
     runtime {
         memory: ceil(memory_gb * ncpu) + " GB"
         maxRetries: max_retries
+        docker: 'abralab/seaseq:v2.2.0'
         cpu: ncpu
     }
     output {
@@ -88,6 +89,7 @@ task flankbed {
     runtime {
         memory: ceil(memory_gb * ncpu) + " GB"
         maxRetries: max_retries
+        docker: 'abralab/seaseq:v2.2.0'
         cpu: ncpu
     }
     output {
@@ -115,7 +117,7 @@ task summaryreport {
     command <<<
 
         # Printing header
-        head -n 121 /usr/local/bin/scripts/seaseq_overall.header > ~{outputfile}
+        head -n 121 /usr/local/bin/seaseq_overall.header > ~{outputfile}
         if [ -f "~{sampleqc_html}" ]; then
             # Printing Sample Quality Reports
             echo '<h2>Sample FASTQs Quality Results</h2>' >> ~{outputfile}
@@ -148,7 +150,7 @@ task summaryreport {
             echo "<p><b>*</b> Peaks identified after Input/Control correction.</p>" >> ~{outputfile}
         fi
         echo '</div>' >> ~{outputfile}
-        tail -n 13 /usr/local/bin/scripts/seaseq_overall.header >> ~{outputfile}
+        tail -n 13 /usr/local/bin/seaseq_overall.header >> ~{outputfile}
         echo -e '\n' >> ~{outputtxt}
 
     >>>
@@ -209,11 +211,11 @@ task evalstats {
             ~{if defined(superenhancers) then "-rs " + superenhancers else ""} \
             -outfile ~{outputfile}
 
-        head -n 245 /usr/local/bin/scripts/seaseq_overall.header  | tail -n 123 > ~{outputhtml}
+        head -n 245 /usr/local/bin/seaseq_overall.header  | tail -n 123 > ~{outputhtml}
         cat ~{outputhtml}x >> ~{outputhtml}
         sed -i "s/SEAseq Sample FASTQ Report/SEAseq ~{fastq_type} Report/" ~{outputhtml}
         echo '</table></div>' >> ~{outputhtml}
-        tail -n 13 /usr/local/bin/scripts/seaseq_overall.header >> ~{outputhtml}
+        tail -n 13 /usr/local/bin/seaseq_overall.header >> ~{outputhtml}
 
 
     >>> 
@@ -370,13 +372,13 @@ task mergehtml {
         mkdir -p ~{default_location} && cd ~{default_location}
 
         #extract header information
-        head -n 245 /usr/local/bin/scripts/seaseq_overall.header  | tail -n 123 > ~{outputfile}
+        head -n 245 /usr/local/bin/seaseq_overall.header  | tail -n 123 > ~{outputfile}
 
         mergeoutput=$(cat ~{sep='; tail -n 1 ' htmlfiles})
         echo $mergeoutput >> ~{outputfile}
         sed -i "s/SEAseq Sample FASTQ Report/SEAseq ~{fastq_type} Report/" ~{outputfile}
         echo '</table></div>' >> ~{outputfile}
-        tail -n 13 /usr/local/bin/scripts/seaseq_overall.header >> ~{outputfile}
+        tail -n 13 /usr/local/bin/seaseq_overall.header >> ~{outputfile}
 
         echo $mergeoutput > ~{outputfile}x
 
@@ -399,30 +401,6 @@ task mergehtml {
     }
 }
 
-task linkname {
-    input {
-        String prefix
-        File in_fastq
-
-        Int memory_gb = 10
-        Int max_retries = 1
-        Int ncpu = 1
-    }
-    command <<<
-        mv ~{in_fastq} ~{prefix}.fastq.gz
-        
-    >>>
-    runtime {
-        memory: ceil(memory_gb * ncpu) + " GB"
-        maxRetries: max_retries
-        cpu: ncpu
-    }
-    output {
-        File out_fastq = "~{prefix}.fastq.gz"
-    }
-}
-
-
 task concatstats {
     input {
         File sample_config
@@ -541,16 +519,17 @@ task concatstats {
 
         CODE
 
-        head -n 245 /usr/local/bin/scripts/seaseq_overall.header | tail -n 123 > ~{outputfile}-stats.html
+        head -n 245 /usr/local/bin/seaseq_overall.header | tail -n 123 > ~{outputfile}-stats.html
         cat ~{outputfile}-stats.htmlx >> ~{outputfile}-stats.html
         echo "</table><p><b>*</b> Peaks identified after Input/Control correction.</p></div>" >> ~{outputfile}-stats.html
-        tail -n 13 /usr/local/bin/scripts/seaseq_overall.header >> ~{outputfile}-stats.html
+        tail -n 13 /usr/local/bin/seaseq_overall.header >> ~{outputfile}-stats.html
         sed -i "s/SEAseq Sample FASTQ Report/SEAseq Comprehensive Report/" ~{outputfile}-stats.html
 
     >>> 
     runtime {
         memory: ceil(memory_gb * ncpu) + " GB"
         maxRetries: max_retries
+        docker: 'abralab/seaseq:v2.2.0'
         cpu: ncpu
     }
     output {
@@ -590,9 +569,39 @@ task addreadme {
     runtime {
         memory: ceil(memory_gb * ncpu) + " GB"
         maxRetries: max_retries
+        docker: 'abralab/seaseq:v2.2.0'
         cpu: ncpu
     }
     output {
 	File readme_peaks = "~{default_location}/~{output_file}"
     }
 }
+
+
+task effective_genome_size {
+    # Calculate effective genome size and fraction
+    input {
+        File reference
+
+        Int memory_gb = 5
+        Int max_retries = 1
+        Int ncpu = 1
+    }
+    command <<<
+        faCount -summary ~{reference} | cut -f3,4,5,6 | tail -n 2 > genomeinformation.txt
+        
+        head -n 1 genomeinformation.txt | awk -F'[\t ]' '{print $1 + $2 + $3 + $4}' > genomesize
+        tail -n 1 genomeinformation.txt | awk -F'[\t ]' '{print $1 + $2 + $3 + $4}' > genomefraction
+    >>>
+    runtime {
+        memory: ceil(memory_gb * ncpu) + " GB"
+        maxRetries: max_retries
+        docker: 'abralab/kentutils:latest'
+        cpu: ncpu
+    }
+    output {
+        Float genomefraction = read_float('genomefraction')
+        String genomesize = read_string('genomesize')
+    }
+}
+