Skip to content

Commit

Permalink
Added demultiplexing option
Browse files Browse the repository at this point in the history
Added shared conda env location
  • Loading branch information
mcmero committed Feb 26, 2024
1 parent b518aff commit 1ea7c9a
Show file tree
Hide file tree
Showing 7 changed files with 289 additions and 16 deletions.
5 changes: 5 additions & 0 deletions .test/data/config.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
groups ids tags distances next minFindsG maxFindsG locations
Fwd Fwd_01 TAGATCGC 0 {{Rev}} 1 1 0:0:12
Fwd Fwd_02 CTCTCTAT 0 {{Rev}} 1 1 0:0:12
Rev Rev_01 TCGCCTTA 0 - 1 1 0:-12:0
Rev Rev_02 CTAGTACG 0 - 1 1 0:-12:0
4 changes: 4 additions & 0 deletions .test/data/indexes.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Fwd_01
Fwd_02
Rev_01
Rev_02
54 changes: 52 additions & 2 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,13 @@ println "* genomicsrnd@wehi.edu.au *"
println "* *"
println "*********************************************************"

include { trim_primer } from './modules/trim.nf'
include { TrimPrimer } from './modules/trim.nf'
include { GenerateSelectFile } from './modules/demux.nf'
include { CreateConfigFile } from './modules/demux.nf'
include { SplitCode } from './modules/demux.nf'
if (!workflow.stubRun) {
include { fromQuery } from 'plugin/nf-sqldb'
}

workflow {

Expand All @@ -22,6 +28,50 @@ workflow {
""")
}.set{input_ch}

trim_primer(input_ch, params.fwd_primer, params.rev_primer, params.mismatches, params.barcode_length, params.output_untrimmed)
trim_ch = TrimPrimer(input_ch,
params.fwd_primer,
params.rev_primer,
params.primer_mismatches,
params.barcode_length,
params.output_untrimmed)

if (params.demultiplex) {
if (!params.is_config_file_provided) {
def where_ch = []
// Construct the where clause for the query
new File(params.index_template_file).readLines().each { line ->
if (line.trim() != 'index_name') {
where_ch << "'${line.trim()}'"
}
}
def where_clause = where_ch.join(",")
def query = """SELECT index_name,
index_sequence,
index_sequence_rc,
index_direction
FROM amplicon_index
WHERE index_name IN (${where_clause});"""

Channel.fromQuery(query, db: 'my-db', batchSize:100)
.map { index ->
def id = index[0]
def direction = index[3]
def group = direction == "F" ? "Fwd" : "Rev"
def tag = direction == "F" ? index[1] : index[2] // index_sequence or index_sequence_rc
def distances = direction == 'F' ? "${params.idx_5p_mismatch}" : "${params.idx_3p_mismatch}"
def next = direction == 'F' ? '{{Rev}}' : '-'
def locations = direction == 'F' ? "0:0:${params.bases_num_r1}" : "0:${params.bases_num_r2}:0"

return "$group\t$id\t$tag\t$distances\t$next\t1\t1\t$locations"
}
.collectFile(name: 'config.txt', newLine: true).set{config_ch}
CreateConfigFile(config_ch).set{configFile}
} else {
Channel.fromPath("${params.input_dir}/config.txt").set { configFile }
}
GenerateSelectFile(file(params.index_template_file)).set{selectTxt}
SplitCode(trim_ch.trimmed_ch,
configFile,
selectTxt)
}
}
100 changes: 100 additions & 0 deletions modules/demux.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
// This module was written by WEHI Research Computing
// see https://github.com/WEHIGenomicsRnD/demultiplex-paired-end-library

// Builds the splitcode --keep "select" file: one line per Fwd/Rev index
// pairing, mapping the pair "Fwd_xx,Rev_yy" to the output name "Fwd_xx-Rev_yy".
process GenerateSelectFile {
    label 'CreateSFile'
    queue 'regular'
    cpus 2
    memory 1.GB
    time '1h'
    publishDir "${params.outdir}/", mode: 'copy'

    input:
    path primer_template_file

    output:
    path "select.txt"


    script:
    // NOTE(review): the original also assigned an unused `primer_template`
    // variable holding the whole file; it has been removed.
    """
    #!/usr/bin/env bash
    # Strip Windows carriage returns, keep the first comma-separated field,
    # then partition the index names into forward and reverse sets.
    fwd_primers=\$(cat ${primer_template_file} |tr -d '\r'| awk -F ',' '{print \$1}' | grep Fwd)
    rev_primers=\$(cat ${primer_template_file} |tr -d '\r'| awk -F ',' '{print \$1}' | grep Rev)
    > select.txt
    # Emit every Fwd x Rev combination as "<fwd>,<rev>\t<fwd>-<rev>".
    for fwd_primer in \${fwd_primers[@]}; do
        for rev_primer in \${rev_primers[@]}; do
            echo -e "\${fwd_primer},\${rev_primer}\t\${fwd_primer}-\${rev_primer}" >> select.txt
        done
    done
    """
}

// Prepends the splitcode column header to the generated per-index config
// lines and publishes the result as config.txt.
process CreateConfigFile {

    label 'CreateCFile'
    queue 'regular'
    cpus 2
    memory 1.GB
    time '1h'

    // was: tag "${sampleId}" — sampleId is not defined anywhere in this
    // process (the only input is configtxt) and would fail at runtime.
    tag "${configtxt}"
    publishDir "${params.outdir}/", mode: 'copy'

    input:
    // The upstream collectFile names this file 'config.txt', which collides
    // with the declared output of the same name; stage it under a different
    // name so the output redirection does not write through the staged link.
    path(configtxt, stageAs: 'config_body.txt')

    output:
    path('config.txt')

    script:
    """
    echo -e "groups\tids\ttags\tdistances\tnext\tminFindsG\tmaxFindsG\tlocations\n\$(cat ${configtxt})" > config.txt
    """

}
// Runs splitcode to demultiplex the trimmed reads using the generated
// config and select files, then renames splitcode's "_0" single-FASTQ
// outputs to plain ".fastq.gz" names.
process SplitCode {
    label 'SplitCode'

    // Skip scheduler/resource directives on stub runs so `-stub` can execute
    // without a cluster.
    if ( "${workflow.stubRun}" == "false" ) {
        queue 'regular'
        cpus 16
        memory { 8.GB * task.attempt }
        errorStrategy { 'retry' }
        maxRetries 5
        time '24h'
    }

    tag "${reads.getSimpleName()}"
    publishDir "${params.outdir}/split/${reads.getSimpleName()}", mode: 'copy'
    container 'oras://ghcr.io/wehi-researchcomputing/splitcode_container:latest'


    input:
    path(reads)
    path(config)
    path(select)


    output:
    path "*.fastq*"
    path "*.txt"


    script:
    """
    splitcode -c ${config} --keep=${select} -t ${task.cpus} --nFastqs=1 \
        --assign --summary summary.txt -o out.fastq --gzip \
        --no-outb --mapping mapping.txt --seq-names \
        --mod-names --com-names --unassigned=unmapped.fastq.gz \
        ${reads}
    # Drop the "_0" suffix splitcode appends when --nFastqs=1.
    # nullglob: without it an unmatched pattern is passed literally to mv
    # and the task fails even though splitcode succeeded.
    shopt -s nullglob
    for file in *_0.fastq.gz; do
        newname="\${file/_0.fastq.gz/.fastq.gz}"
        mv "\$file" "\$newname"
    done
    """
}
11 changes: 7 additions & 4 deletions modules/trim.nf
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
process trim_primer {
label = 'trim_primer'
process TrimPrimer {
label = 'TrimPrimer'

publishDir params.outdir, mode: 'copy'

conda "${projectDir}/envs/biopython.yaml"
conda "${ params.conda_env_location != null && params.conda_env_location != '' ?
params.conda_env_location + '/biopython' :
projectDir + '/envs/biopython.yaml' }"

input:
path fastq
Expand All @@ -14,7 +16,8 @@ process trim_primer {
val output_untrimmed

output:
path "*.fastq*"
path "*_trimmed.fastq*", emit: trimmed_ch
path "*_untrimmed.fastq*", emit: untrimmed_ch, optional: true
path "*.txt"

script:
Expand Down
56 changes: 50 additions & 6 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,66 @@ params {

rev_primer = "CCTAATATACGGACGCAATC"

mismatches = 1
primer_mismatches = 1

barcode_length = 12

output_untrimmed = true

index_template_file = "$projectDir/.test/data/indexes.txt"

demultiplex = false

//number of bases to search for index 1 at start of read
bases_num_r1 = 12
//number of bases to search for index 2 at end of read
//note that this number should be negative
bases_num_r2 = -12

idx_5p_mismatch = 1

idx_3p_mismatch = 1

//if false, the database is queried to create one
//if true, config.txt should be in the input directory
is_config_file_provided = true

// if specified, use this location to load prebuilt conda environments
conda_env_location = ""
}

//SQL DB Plugin
plugins {
id 'nf-sqldb@0.1.0'
}

sql {
db {
'my-db' {
url = 'jdbc:mysql://pipelinepilot.hpc.wehi.edu.au:3306/genomics?useLegacyDatetimeCode=false&serverTimezone=Australia/Melbourne'
user = 'genomics'
password = secrets.DB
}
}
}

profiles {
milton {
conda.enabled = true
cleanup = true
process.cpus = 1
process.memory = { 8.GB * task.attempt }
process.time = '12h'
process.errorStrategy = { 'retry' }
process.maxRetries = 5
apptainer.enabled = false
apptainer.autoMounts = false
singularity.enabled = false
process {
cpus = 1
memory = { 8.GB * task.attempt }
time = '12h'
errorStrategy = { 'retry' }
maxRetries = 4
withLabel:SplitCode {
module = 'splitcode/0.28.3'
}
}
}
test {
conda.enabled = true
Expand Down
75 changes: 71 additions & 4 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@
}
}
},
"other_options": {
"title": "Other Options",
"primer_options": {
"title": "Primer Options",
"type": "object",
"description": "",
"default": "",
Expand All @@ -45,7 +45,7 @@
"default": "",
"description": "Reverse primer sequence"
},
"mismatches": {
"primer_mismatches": {
"type": "integer",
"default": 3,
"description": "number of mismatches allowed in fwd or rev primers"
Expand All @@ -67,14 +67,81 @@
"mismatches",
"barcode_length"
]
},
"demultiplex_options": {
"title": "Demultiplexing Options",
"type": "object",
"description": "",
"default": "",
"properties": {
"demultiplex" : {
"type": "boolean",
"default": false,
"description": "Whether to demultiplex the reads."
},
"bases_num_r1": {
"type": "integer",
"default": 10,
"description": "Number of bases to search for forward index from start of read."
},
"bases_num_r2": {
"type": "integer",
"default": 13,
"description": "Number of bases to search for reverse index from end of read (number should be negative)."
},
"idx_5p_mismatch": {
"type": "integer",
"default": 0,
"description": "Number of mismatches allowed in 5' index."
},
"idx_3p_mismatch": {
"type": "integer",
"default": 0,
"description": "Number of mismatches allowed in 3' index."
},
"index_template_file": {
"type": "string",
"fa_icon": "fas fa-file-csv",
"format": "file-path",
"description": "Text file containing index names used (must match database or config file IDs)."
},
"is_config_file_provided": {
"type": "boolean",
"default": false,
"description": "Optional ready-made config file for splitcode. Use this to skip the databse index lookup. Must be called config.txt and put in input directory."
}
}
},
"generic_options": {
"title": "Generic options",
"type": "object",
"fa_icon": "fas fa-file-import",
"description": "Less common options for the pipeline, typically set in a config file.",
"help_text": "These options are common to all nf-core pipelines and allow you to customise some of the core preferences for how the pipeline runs.\n\nTypically these options would be set in a Nextflow config file loaded for all pipeline runs, such as `~/.nextflow/config`.",
"properties": {
"conda_env_location": {
"type": "string",
"format": "directory-path",
"description": "Use this location for pre-build conda environments (if blank, new conda environments will be created).",
"fa_icon": "fas fa-cog",
"default": "",
"hidden": true
}
}
}
},
"allOf": [
{
"$ref": "#/definitions/input_output_options"
},
{
"$ref": "#/definitions/other_options"
"$ref": "#/definitions/primer_options"
},
{
"$ref": "#/definitions/demultiplex_options"
},
{
"$ref": "#/definitions/generic_options"
}
]
}

0 comments on commit 1ea7c9a

Please sign in to comment.