From 7c368fe1870eade5365b859baf3554db5dcc0a68 Mon Sep 17 00:00:00 2001
From: mcmero <7799828+mcmero@users.noreply.github.com>
Date: Tue, 21 May 2024 11:29:58 +1000
Subject: [PATCH] Update documentation + remove bases_num_r args

bases_num_r[1,2] no longer required as we can just use the
barcode length.
---
 README.md            | 32 +++++++++++++++++++++++++++++---
 main.nf              |  4 ++--
 nextflow.config      | 10 ++--------
 nextflow_schema.json | 10 ----------
 4 files changed, 33 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index 131a0d7..132523e 100644
--- a/README.md
+++ b/README.md
@@ -36,14 +36,37 @@ nextflow run main.nf \
     --outdir $output_directory \
     --fwd_primer $fwd_primer \
     --rev_primer $rev_primer \
-    --mismatches 3 \
+    --primer_mismatches 3 \
     --barcode_length 12 \
+    --idx_5p_mismatch 1 \
+    --idx_3p_mismatch 1 \
     --index_template_file $index_file \
     --guides_fasta $guides_fasta \
     --use_db false
 ```
 
-If you are running on WEHI's Milton HPC, remember to run `module load nextflow` before running nextflow and also run with `-profile log,milton`.
+If you are running on WEHI's Milton HPC:
+
+```bash
+module load nextflow
+nextflow run main.nf \
+    --input_dir $path_to_fastqs \
+    --outdir $output_directory \
+    --fwd_primer $fwd_primer \
+    --rev_primer $rev_primer \
+    --primer_mismatches 3 \
+    --barcode_length 12 \
+    --idx_5p_mismatch 1 \
+    --idx_3p_mismatch 1 \
+    --index_template_file $index_file \
+    --guides_fasta $guides_fasta \
+    --use_db true \
+    -profile log,milton
+```
+
+Note that this uses the Genomics database for the primer lookup. To do this you will need to either run through Seqera or have set your `DB` secret key to access the database.
+
+If you are running against a large number of highly similar reference sequences, you may need to set the `--minimap_f` parameter. For example, if you have tens of thousands of sequences, set this to something like `1000`.
 
 ## Configuration
 
@@ -53,12 +76,15 @@ Here are the parameters you will need to set:
 - `--outdir`: directory path where output is written.
 - `--fwd_primer`: forward primer sequence.
 - `--rev_primer`: reverse primer sequence.
-- `--mismatches`: how many mismatches are allowed in the primer sequences. Calculated as the levehnstein edit distance using [edlib](https://github.com/Martinsos/edlib). You may want to set this higher for longer primer sequnces.
+- `--primer_mismatches`: how many mismatches are allowed in the primer sequences. Calculated as the Levenshtein edit distance using [edlib](https://github.com/Martinsos/edlib). You may want to set this higher for longer primer sequences.
 - `--barcode_length`: how many bases to trim to the left and right of the primer sequences. If your barcode includes spacers make sure to take that into account (i.e., non-informative bases between the index and primer). Set this to 0 if you do not have barcodes.
+- `--idx_5p_mismatch`: number of mismatches allowed in the 5' index.
+- `--idx_3p_mismatch`: number of mismatches allowed in the 3' index.
 - `--index_template_file`: if demultiplexing, use this index file to specify or lookup indexes (see below for format).
 - `--guides_fasta`: (optional) fasta file contains guide sequences to count.
 - `--use_db`: boolean value, default: false. Whether or not to look up indexes in the Genomics database.
 - `--lenient_counts`: boolean value, default: false. If true, reads do not have to span the whole guide sequence to be counted (they will be counted as a partial map).
+- `--count_only`: only perform counting of fastq files input via `--input_dir`, i.e., skip primer trimming and demultiplexing.
 
 ### Index template file format
 
diff --git a/main.nf b/main.nf
index 583e12c..b263fec 100644
--- a/main.nf
+++ b/main.nf
@@ -85,7 +85,7 @@ workflow {
                         def sequence = direction == "F" ? index[1] : index[2] // index_sequence or index_sequence_rc
                         def distances = direction == 'F' ? "${params.idx_5p_mismatch}" : "${params.idx_3p_mismatch}"
                         def nextTag = direction == 'F' ? '{{Rev}}' : '-'
-                        def locations = direction == 'F' ? "0:0:${params.bases_num_r1}" : "0:${params.bases_num_r2}:0"
+                        def locations = direction == 'F' ? "0:0:${params.barcode_length}" : "0:-${params.barcode_length}:0"
 
                         return "$group\t$id\t$sequence\t$distances\t$nextTag\t1\t1\t$locations"
                 }
@@ -102,7 +102,7 @@ workflow {
                     def sequence = index[2]
                     def distances = direction == 'F' ? "${params.idx_5p_mismatch}" : "${params.idx_3p_mismatch}"
                     def nextTag = direction == 'F' ? '{{Rev}}' : '-'
-                    def locations = direction == 'F' ? "0:0:${params.bases_num_r1}" : "0:${params.bases_num_r2}:0"
+                    def locations = direction == 'F' ? "0:0:${params.barcode_length}" : "0:-${params.barcode_length}:0"
 
                     indexes << "$group\t$id\t$sequence\t$distances\t$nextTag\t1\t1\t$locations"
                 }
diff --git a/nextflow.config b/nextflow.config
index e93bc89..496653f 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -7,7 +7,7 @@ params {
 
     rev_primer              = ""
 
-    primer_mismatches       = 1
+    primer_mismatches       = 3
 
     barcode_length          = 12
 
@@ -17,14 +17,8 @@ params {
 
     demultiplex             = true
 
-    //number of bases to search for index 1 at start of read
-    bases_num_r1            = 12
-    //number of bases to search for index 2 at end of read
-    //note that this number should be negative
-    bases_num_r2            = -12
-
+    // mismatches allowed for indexes
     idx_5p_mismatch         = 1
-
     idx_3p_mismatch         = 1
 
     // whether to use genomics database for index lookup
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 974ff96..487d15b 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -73,16 +73,6 @@
                     "default": false,
                     "description": "Whether to demultiplex the reads."
                 },
-                "bases_num_r1": {
-                    "type": "integer",
-                    "default": 10,
-                    "description": "Number of bases to search for forward index from start of read."
-                },
-                "bases_num_r2": {
-                    "type": "integer",
-                    "default": 13,
-                    "description": "Number of bases to search for reverse index from end of read (number should be negative)."
-                },
                 "idx_5p_mismatch": {
                     "type": "integer",
                     "default": 1,