Consensus sequence workflow

Also fixed an issue where multiple headers were written to the splitcode config file when resuming the pipeline
WEHIGenomicsRnD · Jun 21, 2024 · 0a86dc9 · 0a86dc9
1 parent 0ec414a
commit 0a86dc9
Show file tree

Hide file tree

Showing 10 changed files with 216 additions and 5 deletions.
diff --git a/.test/test.config b/.test/test.config
@@ -52,6 +52,12 @@ params {
     // set this to a higher value if using a reference with a large number of
     // repetitive sequences
     minimap_f               = 0.0002
+
+    // whether to generate consensus sequence per sample
+    consensus               = true
+
+    // model to use for medaka polishing
+    medaka_model            = "r1041_e82_400bps_sup_v4.2.0"
 }
 
 //SQL DB Plugin
@@ -101,7 +107,9 @@ profiles {
         stubRun = true
         cleanup = false
     }
-    log {
-        includeConfig 'logger.config'
+    debug {
+        dumpHashes             = true
+        process.beforeScript   = 'echo $HOSTNAME'
+        cleanup                = false
     }
 }
diff --git a/envs/racon-medaka.yaml b/envs/racon-medaka.yaml
@@ -0,0 +1,8 @@
+# Conda environment referenced by the Racon and Medaka processes
+# (modules/racon.nf and modules/medaka.nf) when no pre-built
+# params.conda_env_location is supplied.
+# NOTE(review): no versions are pinned, so the solver picks whatever is
+# current; confirm the resolved medaka release provides the model named by
+# params.medaka_model.
+name: racon-medaka
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - racon
+  - medaka
+
diff --git a/main.nf b/main.nf
@@ -17,6 +17,7 @@ include { SplitCode } from './modules/demux.nf'
 include { IndexGuides } from './modules/count.nf'
 include { CountGuides } from './modules/count.nf'
 include { CollateCounts } from './modules/count.nf'
+include { Consensus } from './subworkflows/consensus'
 if (params.use_db) {
     include { fromQuery } from 'plugin/nf-sqldb'
 }
@@ -89,7 +90,7 @@ workflow {
 
                         return "$group\t$id\t$sequence\t$distances\t$nextTag\t1\t1\t$locations"
                 }
-                .collectFile(name: 'config.txt', newLine: true).set{config_ch}
+                .collectFile(name: 'config_tmp.txt', newLine: true).set{config_ch}
         } else {
             // build the config file from the index template
             def indexes = []
@@ -108,7 +109,7 @@ workflow {
                 }
             }
             Channel.from( indexes )
-                .collectFile(name: 'config.txt', newLine: true).set{config_ch}
+                .collectFile(name: 'config_tmp.txt', newLine: true).set{config_ch}
         }
         CreateConfigFile(config_ch).set{configFile}
         GenerateSelectFile(file(params.index_template_file)).set{selectTxt}
@@ -126,5 +127,22 @@ workflow {
         IndexGuides(params.guides_fasta).set{index_ch}
         CountGuides(index_ch.done, demux_ch, file("${params.outdir}/${guidesIndex}")).set{count_ch}
         CollateCounts(count_ch.counts.collect())
+
+        if (params.consensus) {
+            // Reformat the CountGuides `alignments` output for the Consensus
+            // subworkflow: one (sampleName, bamFile, fastqFile) tuple per BAM.
+            // The "unmapped" and "out" files from splitcode are filtered out.
+            count_ch.alignments.flatMap { sample ->
+                def (sampleName, bamFiles, fastqFiles) = sample
+                // A path output that matches a single file is emitted as a
+                // bare path rather than a list, so normalise both sides
+                // before indexing into them.
+                def bams = bamFiles instanceof List ? bamFiles : [bamFiles]
+                def fastqs = fastqFiles instanceof List ? fastqFiles : [fastqFiles]
+                // NOTE(review): BAMs and FASTQs are paired purely by position;
+                // confirm CountGuides emits both in the same order.
+                return bams.indices.collect { index ->
+                    [sampleName, bams[index], fastqs[index]]
+                }
+            }.filter{ sampleName, bamFile, fastqFile ->
+                !bamFile.getName().startsWith("unmapped.bam") &&
+                !bamFile.getName().startsWith("out.bam")
+            }.set{ bam_ch }
+
+            Consensus(bam_ch, file(params.guides_fasta), params.medaka_model)
+        }
     }
 }
diff --git a/modules/bam2sam.nf b/modules/bam2sam.nf
@@ -0,0 +1,22 @@
+// Convert a sample's BAM to SAM and re-compress its FASTQ with bgzip.
+//
+// Input:  tuple (sampleName, bam, fastq)
+// Output: racon_input - tuple (sampleName, <bam simple name>.sam,
+//                       rezip_<bam simple name>.fastq.gz)
+process PrepareForConsensus {
+    // `label` is a directive (called, not assigned), as in the Racon and
+    // Medaka processes.
+    label 'PrepareForConsensus'
+
+    publishDir "${params.outdir}/count/${sampleName}"
+
+    conda "${ params.conda_env_location != null && params.conda_env_location != '' ?
+              params.conda_env_location + '/minimap-samtools' :
+              projectDir + '/envs/minimap-samtools.yaml' }"
+
+    input:
+    tuple val(sampleName), path(bam), path(fastq)
+
+    output:
+    // bgzip writes gzip-compatible BGZF, so the file is named .fastq.gz
+    // (not .bz); naming and emit label match the copy of this process in
+    // subworkflows/consensus.nf. The rezip_ prefix keeps the new file's name
+    // distinct from the staged input FASTQ.
+    tuple val(sampleName), path("*.sam"), path("rezip_*.fastq.gz"), emit: racon_input
+
+    script:
+    def sample = bam.getSimpleName()
+    """
+    samtools view -h ${bam} > ${sample}.sam
+    zcat ${fastq} | bgzip -c - > rezip_${sample}.fastq.gz
+    """
+}
diff --git a/modules/count.nf b/modules/count.nf
@@ -37,7 +37,7 @@ process CountGuides {
     path guides_index
 
     output:
-    path "*.bam*"
+    tuple val(sampleName), path("*.bam"), path(fastqs), emit: alignments
     path "*.txt", emit: counts
 
     script:

diff --git a/modules/medaka.nf b/modules/medaka.nf
@@ -0,0 +1,49 @@
+// based on https://github.com/nf-core/modules/blob/master/modules/nf-core/medaka/main.nf
+
+// Run medaka_consensus on a draft sequence and emit the result gzip-compressed.
+//
+// Inputs:
+//   sampleName   - sample identifier; selects the publish sub-directory
+//   reads        - reads file passed to medaka_consensus with -i
+//   assembly     - draft sequence passed with -d
+//   medaka_model - model name passed with -m
+// Outputs:
+//   assembly - tuple (sampleName, <prefix>.fa.gz)
+//   versions - versions.yml recording the medaka version
+process Medaka {
+    tag "${reads.getSimpleName()}"
+    label 'Medaka'
+
+    publishDir "${params.outdir}/consensus/${sampleName}", mode: 'copy'
+
+    // use a pre-built env under conda_env_location when set, otherwise build
+    // from the yaml shipped with the pipeline
+    conda "${ params.conda_env_location != null && params.conda_env_location != '' ?
+              params.conda_env_location + '/racon-medaka' :
+              projectDir + '/envs/racon-medaka.yaml' }"
+
+    // NOTE(review): the container is pinned to medaka 1.4.4 while the conda
+    // env is unpinned; confirm this release ships the model supplied as
+    // medaka_model (the config default is a v4.2.0 model name).
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/medaka:1.4.4--py38h130def0_0' :
+        'biocontainers/medaka:1.4.4--py38h130def0_0' }"
+
+    input:
+    tuple val(sampleName), path(reads), path(assembly)
+    val(medaka_model)
+
+    output:
+    tuple val(sampleName), path("*.fa.gz"), emit: assembly
+    path "versions.yml"             , emit: versions
+
+    // run unless task.ext.when is set to a false value
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // extra command-line options and output prefix can be overridden via task.ext
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: reads.getSimpleName()
+    """
+    medaka_consensus \\
+        -t $task.cpus \\
+        $args \\
+        -i $reads \\
+        -d $assembly \\
+        -m $medaka_model \\
+        -o ./
+
+    mv consensus.fasta ${prefix}.fa
+
+    gzip -n ${prefix}.fa
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        medaka: \$( medaka --version 2>&1 | sed 's/medaka //g' )
+    END_VERSIONS
+    """
+}
diff --git a/modules/racon.nf b/modules/racon.nf
@@ -0,0 +1,44 @@
+// adapted from https://github.com/nf-core/modules/blob/master/modules/nf-core/racon/main.nf
+
+// Run racon with the given reads, SAM overlaps and target sequences, writing
+// the result to <prefix>_racon_consensus.fasta.
+//
+// Inputs:
+//   sampleName - sample identifier; selects the publish sub-directory
+//   sam        - SAM file passed to racon as its second positional argument
+//   reads      - reads file passed to racon as its first positional argument
+//   assembly   - sequences passed to racon as its last positional argument
+// Outputs:
+//   racon_consensus - tuple (sampleName, reads, <prefix>_racon_consensus.fasta);
+//                     the input reads are passed through unchanged
+//   versions        - versions.yml recording the racon version
+process Racon {
+    tag "${sam.getSimpleName()}"
+    label 'Racon'
+
+    publishDir "${params.outdir}/consensus/${sampleName}", mode: 'copy'
+
+    // use a pre-built env under conda_env_location when set, otherwise build
+    // from the yaml shipped with the pipeline
+    conda "${ params.conda_env_location != null && params.conda_env_location != '' ?
+              params.conda_env_location + '/racon-medaka' :
+              projectDir + '/envs/racon-medaka.yaml' }"
+
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/racon:1.4.20--h9a82719_1' :
+        'biocontainers/racon:1.4.20--h9a82719_1' }"
+
+    input:
+    tuple val(sampleName), path(sam), path(reads)
+    path(assembly)
+
+    output:
+    tuple val(sampleName), path(reads), path('*_racon_consensus.fasta') , emit: racon_consensus
+    path "versions.yml"          , emit: versions
+
+    // run unless task.ext.when is set to a false value
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // extra command-line options and output prefix can be overridden via task.ext
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: sam.getSimpleName()
+    """
+    racon -t "$task.cpus" \\
+        "${reads}" \\
+        "${sam}" \\
+        $args \\
+        "${assembly}" > \\
+        ${prefix}_racon_consensus.fasta
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        racon: \$( racon --version 2>&1 | sed 's/^.*v//' )
+    END_VERSIONS
+    """
+}
diff --git a/nextflow.config b/nextflow.config
@@ -46,6 +46,12 @@ params {
     // set this to a higher value if using a reference with a large number of
     // repetitive sequences
     minimap_f               = 0.0002
+
+    // whether to generate consensus sequence per sample
+    consensus               = false
+
+    // model to use for medaka polishing
+    medaka_model            = "r1041_e82_400bps_sup_v4.2.0"
 }
 
 //SQL DB Plugin
@@ -99,6 +105,11 @@ profiles {
         stubRun = true
         cleanup = false
     }
+    debug {
+        dumpHashes             = true
+        process.beforeScript   = 'echo $HOSTNAME'
+        cleanup                = false
+    }
 }
 
 def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss')

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -122,6 +122,11 @@
                     "type": "boolean",
                     "default":false,
                     "description": "If true, only count reads and skip trimming and demultiplexing."
+                },
+                "consensus": {
+                    "type": "boolean",
+                    "default": false,
+                    "description": "If true, generate consensus sequences from guide alignments."
                 }
             }
         },

diff --git a/subworkflows/consensus.nf b/subworkflows/consensus.nf
@@ -0,0 +1,46 @@
+include { Racon } from '../modules/racon'
+include { Medaka } from '../modules/medaka'
+
+// Convert a sample's BAM to SAM for racon and re-compress its FASTQ with
+// bgzip for medaka.
+//
+// Input:  tuple (sampleName, bam, fastq)
+// Output: racon_input - tuple (sampleName, <bam simple name>.sam,
+//                       rezip_<bam simple name>.fastq.gz)
+process PrepareForConsensus {
+    // `label` is a directive (called, not assigned), as in the Racon and
+    // Medaka processes.
+    label 'PrepareForConsensus'
+
+    publishDir "${params.outdir}/count/${sampleName}"
+
+    // use a pre-built env under conda_env_location when set, otherwise build
+    // from the yaml shipped with the pipeline
+    conda "${ params.conda_env_location != null && params.conda_env_location != '' ?
+              params.conda_env_location + '/minimap-samtools' :
+              projectDir + '/envs/minimap-samtools.yaml' }"
+
+    input:
+    tuple val(sampleName), path(bam), path(fastq)
+
+    output:
+    tuple val(sampleName), path("*.sam"), path("rezip_*.fastq.gz"), emit: racon_input
+
+    script:
+    // output files are named after the BAM's simple name
+    def sample = bam.getSimpleName()
+    """
+    samtools view -h ${bam} > ${sample}.sam
+    zcat ${fastq} | bgzip -c - > rezip_${sample}.fastq.gz
+    """
+}
+
+// Per-sample consensus: PrepareForConsensus -> Racon -> Medaka.
+// Takes (sampleName, bam, fastq) tuples, a reference FASTA and a medaka model
+// name; emits Medaka's `assembly` output as `consensus_sequences`.
+workflow Consensus {
+    take:
+        consensus_input
+        reference
+        medaka_model
+
+    main:
+        // BAM -> SAM plus bgzip-recompressed FASTQ
+        PrepareForConsensus(consensus_input)
+        racon_input_ch = PrepareForConsensus.out.racon_input
+
+        // racon pass against the reference
+        Racon(racon_input_ch, reference)
+        racon_consensus_ch = Racon.out.racon_consensus
+
+        // medaka pass using the racon output as the draft
+        Medaka(racon_consensus_ch, medaka_model)
+
+    emit:
+        consensus_sequences = Medaka.out.assembly
+}