Easier way to generate config.txt file

- can now specify csv file with index name, direction and sequence, and the splitcode config file will be built automatically - use_db config option to simplify if we are using the database for index lookup or not - update readme + nextflow json
WEHIGenomicsRnD · May 5, 2024 · eb340cb · eb340cb
1 parent bfbb1ec
commit eb340cb
Show file tree

Hide file tree

Showing 6 changed files with 88 additions and 25 deletions.
diff --git a/.test/data/indexes.txt b/.test/data/indexes.txt
@@ -1,4 +1,5 @@
-Fwd_01
-Fwd_02
-Rev_01
-Rev_02
+index_name,direction,sequence
+Fwd_01,F,TAGATCGC
+Fwd_02,F,CTCTCTAT
+Rev_01,R,TCGCCTTA
+Rev_02,R,CTAGTACG
diff --git a/README.md b/README.md
@@ -8,7 +8,6 @@ The read structure is typically:
 
 `[sequence][fwd_index][fwd_primer][sequence_of_interest][rev_primer][rev_index][sequence]`
 
-
 ## How to install (WEHI only)
 
 The easiest way to run the pipeline is to use the [Seqera Platform](https://seqera.services.biocommons.org.au/) service provided to WEHI by Australian Biocommons. You can find more information about Seqera Platform (formerly Nextflow Tower) on WEHI's [Research Computing page](https://wehieduau.sharepoint.com/sites/rc2/SitePages/Nextflow-Tower.aspx). See the [Configuration](https://github.com/WEHIGenomicsRnD/nf-qc-pipe#tower-configuration) section for more info.
@@ -34,7 +33,9 @@ nextflow run main.nf \
     --rev_primer $rev_primer \
     --mismatches 3 \
     --barcode_length 12 \
-    --guides_fasta $guides_fasta
+    --index_template_file $index_file \
+    --guides_fasta $guides_fasta \
+    --use_db false
 ```
 
 If you are running on WEHI's Milton HPC, remember to run `module load nextflow` before running nextflow and also run with `-profile log,milton`.
@@ -49,4 +50,30 @@ Here are the parameters you will need to set:
 - `--rev_primer`: reverse primer sequence.
 - `--mismatches`: how many mismatches are allowed in the primer sequences. Calculated as the Levenshtein edit distance using [edlib](https://github.com/Martinsos/edlib). You may want to set this higher for longer primer sequences.
 - `--barcode_length`: how many bases to trim to the left and right of the primer sequences. If your barcode includes spacers make sure to take that into account (i.e., non-informative bases between the index and primer). Set this to 0 if you do not have barcodes.
+- `--index_template_file`: if demultiplexing, use this index file to specify or look up indexes (see below for format).
 - `--guides_fasta`: (optional) fasta file containing guide sequences to count.
+- `--use_db`: boolean value, whether or not to look up indexes in the Genomics database.
+
+### Index template file format
+
+If you are using the Genomics database for index lookup, your index file should look like this:
+
+```
+index_name
+Fwd_01
+Fwd_02
+Rev_01
+Rev_02
+```
+
+This will look up the sequences for these index names in the database. If your indexes are custom ones, or you do not want to use the database, use the following file format:
+
+```
+index_name,direction,sequence
+Fwd_01,F,TAGATCGC
+Fwd_02,F,CTCTCTAT
+Rev_01,R,TCGCCTTA
+Rev_02,R,CTAGTACG
+```
+
+Note that both sequences must match the forward direction. We do not perform any reverse complementing of the reverse sequence.
diff --git a/main.nf b/main.nf
@@ -16,7 +16,7 @@ include { CreateConfigFile } from './modules/demux.nf'
 include { SplitCode } from './modules/demux.nf'
 include { IndexGuides } from './modules/count.nf'
 include { CountGuides } from './modules/count.nf'
-if (!workflow.stubRun && params.demultiplex && !params.is_config_file_provided) {
+if (params.use_db) {
     include { fromQuery } from 'plugin/nf-sqldb'
 }
 
@@ -41,7 +41,9 @@ workflow {
                          params.output_untrimmed)
 
     if (params.demultiplex) {
-        if (!params.is_config_file_provided) {
+        if (params.splitcode_config_file != null && params.splitcode_config_file != '') {
+            Channel.fromPath(params.splitcode_config_file).set{config_ch}
+        } else if (params.use_db) {
             def where_ch = []
             // Construct the where clause for the query
             new File(params.index_template_file).readLines().each { line ->
@@ -62,16 +64,33 @@ workflow {
                         def id = index[0]
                         def direction = index[3]
                         def group = direction == "F" ? "Fwd" : "Rev"
-                        def tag = direction == "F" ? index[1] : index[2] // index_sequence or index_sequence_rc
+                        def sequence = direction == "F" ? index[1] : index[2] // index_sequence or index_sequence_rc
                         def distances = direction == 'F' ? "${params.idx_5p_mismatch}" : "${params.idx_3p_mismatch}"
-                        def next = direction == 'F' ? '{{Rev}}' : '-'
+                        def nextTag = direction == 'F' ? '{{Rev}}' : '-'
                         def locations = direction == 'F' ? "0:0:${params.bases_num_r1}" : "0:${params.bases_num_r2}:0"
 
-                        return "$group\t$id\t$tag\t$distances\t$next\t1\t1\t$locations"
+                        return "$group\t$id\t$sequence\t$distances\t$nextTag\t1\t1\t$locations"
                 }
                 .collectFile(name: 'config.txt', newLine: true).set{config_ch}
         } else {
-            Channel.fromPath("${params.input_dir}/config.txt").set{config_ch}
+            // build the splitcode config rows from the CSV index template (index_name,direction,sequence)
+            def indexes = []
+            new File(params.index_template_file).readLines().each { line ->
+                if (line.trim() && !line.trim().startsWith('index_name')) { // skip blank lines and the header row
+                    def index = line.split(',').collect { it.trim() } // collect (not each) so the trimmed fields are kept
+                    def id = index[0]
+                    def direction = index[1]
+                    def group = direction == "F" ? "Fwd" : "Rev"
+                    def sequence = index[2]
+                    def distances = direction == 'F' ? "${params.idx_5p_mismatch}" : "${params.idx_3p_mismatch}"
+                    def nextTag = direction == 'F' ? '{{Rev}}' : '-'
+                    def locations = direction == 'F' ? "0:0:${params.bases_num_r1}" : "0:${params.bases_num_r2}:0"
+                    // one tab-separated row per index: group, id, tag, distance, next, minFindsG, maxFindsG, location
+                    indexes << "$group\t$id\t$sequence\t$distances\t$nextTag\t1\t1\t$locations"
+                }
+            }
+            Channel.from( indexes )
+                .collectFile(name: 'config.txt', newLine: true).set{config_ch}
         }
         CreateConfigFile(config_ch).set{configFile}
         GenerateSelectFile(file(params.index_template_file)).set{selectTxt}

diff --git a/modules/demux.nf b/modules/demux.nf
@@ -52,7 +52,9 @@ process CreateConfigFile {
         val true, emit: done
 
         script:
-        def header = params.is_config_file_provided ? "" : "groups\tids\ttags\tdistances\tnext\tminFindsG\tmaxFindsG\tlocations\n"
+        // add a header if not using a custom config file
+        def header = params.splitcode_config_file == "" || params.splitcode_config_file == null ?
+            "groups\tids\ttags\tdistances\tnext\tminFindsG\tmaxFindsG\tlocations\n" : ""
         """
         echo -e "${header}\$(cat ${configtxt})" > config.txt
         """
@@ -98,4 +100,4 @@ process SplitCode{
         mv "\$file" "\$newname"
     done
     """
-}
+}
diff --git a/nextflow.config b/nextflow.config
@@ -27,9 +27,14 @@ params {
 
     idx_3p_mismatch         = 1
 
-    //if false, the database is queried to create one
-    //if true, config.txt should be in the input directory
-    is_config_file_provided = true
+    // whether to use genomics database for index lookup
+    // if false, barcode info has to be provided in the
+    // index template file
+    use_db                  = false
+
+    // if you want to use a custom splitcode config file,
+    // you can provide it here
+    splitcode_config_file   = ""
 
     // if specified, use this location to load prebuilt conda environments
     conda_env_location      = ""

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -91,26 +91,35 @@
                 },
                 "idx_5p_mismatch": {
                     "type": "integer",
-                    "default": 0,
-                    "description": "Number of mismatches allowed in 5' index."
+                    "default": 1,
+                    "description": "Number of mismatches allowed in 5' index (default = 1)."
                 },
                 "idx_3p_mismatch": {
                     "type": "integer",
-                    "default": 0,
-                    "description": "Number of mismatches allowed in 3' index."
+                    "default": 1,
+                    "description": "Number of mismatches allowed in 3' index (default = 1)."
                 },
                 "index_template_file": {
                     "type": "string",
                     "fa_icon": "fas fa-file-csv",
                     "format": "file-path",
                     "description": "Text file containing index names used (must match database or config file IDs)."
                 },
-                "is_config_file_provided": {
+                "use_db" : {
                     "type": "boolean",
                     "default": false,
-                    "description": "Optional ready-made config file for splitcode. Use this to skip the databse index lookup. Must be called config.txt and put in input directory."
+                    "description": "Whether to use the Genomics database for index lookup."
+                },
+                "splitcode_config_file": {
+                    "type": "string",
+                    "default": "",
+                    "description": "Optional ready-made config file for splitcode."
                 }
-            }
+            },
+            "required": [
+                "bases_num_r1",
+                "bases_num_r2"
+            ]
         },
         "count": {
             "title": "Count options",
@@ -161,4 +170,4 @@
             "$ref": "#/definitions/generic_options"
         }
     ]
-}
+}