Update documentation + remove bases_num_r args

bases_num_r[1,2] no longer required as we can just use the barcode length.
WEHIGenomicsRnD · May 21, 2024 · 7c368fe · 7c368fe
1 parent 1b04d11
commit 7c368fe
Show file tree

Hide file tree

Showing 4 changed files with 33 additions and 23 deletions.
diff --git a/README.md b/README.md
@@ -36,14 +36,37 @@ nextflow run main.nf \
  --outdir $output_directory \
  --fwd_primer $fwd_primer \
  --rev_primer $rev_primer \
- --mismatches 3 \
+ --primer_mismatches 3 \
  --barcode_length 12 \
+ --idx_5p_mismatch 1 \
+ --idx_3p_mismatch 1 \
  --index_template_file $index_file \
  --guides_fasta $guides_fasta \
  --use_db false
 ```
 
-If you are running on WEHI's Milton HPC, remember to run `module load nextflow` before running nextflow and also run with `-profile log,milton`.
+If you are running on WEHI's Milton HPC:
+
+```bash
+module load nextflow
+nextflow run main.nf \
+ --input_dir $path_to_fastqs \
+ --outdir $output_directory \
+ --fwd_primer $fwd_primer \
+ --rev_primer $rev_primer \
+ --primer_mismatches 3 \
+ --barcode_length 12 \
+ --idx_5p_mismatch 1 \
+ --idx_3p_mismatch 1 \
+ --index_template_file $index_file \
+ --guides_fasta $guides_fasta \
+ --use_db true \
+ -profile log,milton
+```
+
+Note that this uses the Genomics database for the index lookup. To do this you will need to either run through Seqera or have set your `DB` secret key to access the database.
+
+If you are running against a large number of highly similar reference sequences, you may need to set the `--minimap_f` parameter. For example, if you have tens of thousands of sequences, set this to something like `1000`.
 
 ## Configuration
 
@@ -53,12 +76,15 @@ Here are the parameters you will need to set:
 - `--outdir`: directory path where output is written.
 - `--fwd_primer`: forward primer sequence.
 - `--rev_primer`: reverse primer sequence.
-- `--mismatches`: how many mismatches are allowed in the primer sequences. Calculated as the levehnstein edit distance using [edlib](https://github.com/Martinsos/edlib). You may want to set this higher for longer primer sequnces.
+- `--primer_mismatches`: how many mismatches are allowed in the primer sequences. Calculated as the Levenshtein edit distance using [edlib](https://github.com/Martinsos/edlib). You may want to set this higher for longer primer sequences.
 - `--barcode_length`: how many bases to trim to the left and right of the primer sequences. If your barcode includes spacers make sure to take that into account (i.e., non-informative bases between the index and primer). Set this to 0 if you do not have barcodes.
+- `--idx_5p_mismatch`: mismatches allowed in the 5' index.
+- `--idx_3p_mismatch`: mismatches allowed in the 3' index.
 - `--index_template_file`: if demultiplexing, use this index file to specify or lookup indexes (see below for format).
 - `--guides_fasta`: (optional) fasta file contains guide sequences to count.
 - `--use_db`: boolean value, default: false. Whether or not to look up indexes in the Genomics database.
 - `--lenient_counts`: boolean value, default: false. If true, reads do not have to span the whole guide sequence to be counted (they will be counted as a partial map).
+- `--count_only`: only perform counting of fastq files input via `--input_dir`, i.e., skip primer trimming and demultiplexing.
 
 ### Index template file format
 

diff --git a/main.nf b/main.nf
@@ -85,7 +85,7 @@ workflow {
  def sequence = direction == "F" ? index[1] : index[2] // index_sequence or index_sequence_rc
  def distances = direction == 'F' ? "${params.idx_5p_mismatch}" : "${params.idx_3p_mismatch}"
  def nextTag = direction == 'F' ? '{{Rev}}' : '-'
- def locations = direction == 'F' ? "0:0:${params.bases_num_r1}" : "0:${params.bases_num_r2}:0"
+ def locations = direction == 'F' ? "0:0:${params.barcode_length}" : "0:-${params.barcode_length}:0"
 
  return "$group\t$id\t$sequence\t$distances\t$nextTag\t1\t1\t$locations"
  }
@@ -102,7 +102,7 @@ workflow {
  def sequence = index[2]
  def distances = direction == 'F' ? "${params.idx_5p_mismatch}" : "${params.idx_3p_mismatch}"
  def nextTag = direction == 'F' ? '{{Rev}}' : '-'
- def locations = direction == 'F' ? "0:0:${params.bases_num_r1}" : "0:${params.bases_num_r2}:0"
+ def locations = direction == 'F' ? "0:0:${params.barcode_length}" : "0:-${params.barcode_length}:0"
 
  indexes << "$group\t$id\t$sequence\t$distances\t$nextTag\t1\t1\t$locations"
  }

diff --git a/nextflow.config b/nextflow.config
@@ -7,7 +7,7 @@ params {
 
  rev_primer = ""
 
- primer_mismatches = 1
+ primer_mismatches = 3
 
  barcode_length = 12
 
@@ -17,14 +17,8 @@ params {
 
  demultiplex = true
 
- //number of bases to search for index 1 at start of read
- bases_num_r1 = 12
- //number of bases to search for index 2 at end of read
- //note that this number should be negative
- bases_num_r2 = -12
-
+ // mismatches allowed for indexes
  idx_5p_mismatch = 1
-
  idx_3p_mismatch = 1
 
  // whether to use genomics database for index lookup

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -73,16 +73,6 @@
  "default": false,
  "description": "Whether to demultiplex the reads."
  },
- "bases_num_r1": {
- "type": "integer",
- "default": 10,
- "description": "Number of bases to search for forward index from start of read."
- },
- "bases_num_r2": {
- "type": "integer",
- "default": 13,
- "description": "Number of bases to search for reverse index from end of read (number should be negative)."
- },
  "idx_5p_mismatch": {
  "type": "integer",
  "default": 1,