From 7c368fe1870eade5365b859baf3554db5dcc0a68 Mon Sep 17 00:00:00 2001 From: mcmero <7799828+mcmero@users.noreply.github.com> Date: Tue, 21 May 2024 11:29:58 +1000 Subject: [PATCH] Update documentation + remove bases_num_r args bases_num_r[1,2] no longer required as we can just use the barcode length. --- README.md | 32 +++++++++++++++++++++++++++++--- main.nf | 4 ++-- nextflow.config | 10 ++-------- nextflow_schema.json | 10 ---------- 4 files changed, 33 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 131a0d7..132523e 100644 --- a/README.md +++ b/README.md @@ -36,14 +36,37 @@ nextflow run main.nf \ --outdir $output_directory \ --fwd_primer $fwd_primer \ --rev_primer $rev_primer \ - --mismatches 3 \ + --primer_mismatches 3 \ --barcode_length 12 \ + --idx_5p_mismatch 1 \ + --idx_3p_mismatch 1 \ --index_template_file $index_file \ --guides_fasta $guides_fasta \ --use_db false ``` -If you are running on WEHI's Milton HPC, remember to run `module load nextflow` before running nextflow and also run with `-profile log,milton`. +If you are running on WEHI's Milton HPC: + +```bash +module load nextflow +nextflow run main.nf \ + --input_dir $path_to_fastqs \ + --outdir $output_directory \ + --fwd_primer $fwd_primer \ + --rev_primer $rev_primer \ + --primer_mismatches 3 \ + --barcode_length 12 \ + --idx_5p_mismatch 1 \ + --idx_3p_mismatch 1 \ + --index_template_file $index_file \ + --guides_fasta $guides_fasta \ + --use_db true \ + -profile log,milton +``` + +Note that this uses the Genomics database for the index lookup. To do this you will need to either run through Seqera or have set your `DB` secret key to access the database. + +If you are running against a large number of highly similar reference sequences, you may need to set the `--minimap_f` parameter. For example, if you have 10s of thousands of sequences, set this to something like `1000`. 
## Configuration @@ -53,12 +76,15 @@ Here are the parameters you will need to set: - `--outdir`: directory path where output is written. - `--fwd_primer`: forward primer sequence. - `--rev_primer`: reverse primer sequence. -- `--mismatches`: how many mismatches are allowed in the primer sequences. Calculated as the levehnstein edit distance using [edlib](https://github.com/Martinsos/edlib). You may want to set this higher for longer primer sequnces. +- `--primer_mismatches`: how many mismatches are allowed in the primer sequences. Calculated as the Levenshtein edit distance using [edlib](https://github.com/Martinsos/edlib). You may want to set this higher for longer primer sequences. - `--barcode_length`: how many bases to trim to the left and right of the primer sequences. If your barcode includes spacers make sure to take that into account (i.e., non-informative bases between the index and primer). Set this to 0 if you do not have barcodes. +- `--idx_5p_mismatch`: mismatches allowed in the 5' index. +- `--idx_3p_mismatch`: mismatches allowed in the 3' index. - `--index_template_file`: if demultiplexing, use this index file to specify or lookup indexes (see below for format). - `--guides_fasta`: (optional) fasta file contains guide sequences to count. - `--use_db`: boolean value, default: false. Whether or not to look up indexes in the Genomics database. - `--lenient_counts`: boolean value, default: false. If true, reads do not have to span the whole guide sequence to be counted (they will be counted as a partial map). +- `--count_only`: only perform counting of fastq files input via `--input_dir`, i.e., skip primer trimming and demultiplexing. ### Index template file format diff --git a/main.nf b/main.nf index 583e12c..b263fec 100644 --- a/main.nf +++ b/main.nf @@ -85,7 +85,7 @@ workflow { def sequence = direction == "F" ? index[1] : index[2] // index_sequence or index_sequence_rc def distances = direction == 'F' ? 
"${params.idx_5p_mismatch}" : "${params.idx_3p_mismatch}" def nextTag = direction == 'F' ? '{{Rev}}' : '-' - def locations = direction == 'F' ? "0:0:${params.bases_num_r1}" : "0:${params.bases_num_r2}:0" + def locations = direction == 'F' ? "0:0:${params.barcode_length}" : "0:-${params.barcode_length}:0" return "$group\t$id\t$sequence\t$distances\t$nextTag\t1\t1\t$locations" } @@ -102,7 +102,7 @@ workflow { def sequence = index[2] def distances = direction == 'F' ? "${params.idx_5p_mismatch}" : "${params.idx_3p_mismatch}" def nextTag = direction == 'F' ? '{{Rev}}' : '-' - def locations = direction == 'F' ? "0:0:${params.bases_num_r1}" : "0:${params.bases_num_r2}:0" + def locations = direction == 'F' ? "0:0:${params.barcode_length}" : "0:-${params.barcode_length}:0" indexes << "$group\t$id\t$sequence\t$distances\t$nextTag\t1\t1\t$locations" } diff --git a/nextflow.config b/nextflow.config index e93bc89..496653f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -7,7 +7,7 @@ params { rev_primer = "" - primer_mismatches = 1 + primer_mismatches = 3 barcode_length = 12 @@ -17,14 +17,8 @@ params { demultiplex = true - //number of bases to search for index 1 at start of read - bases_num_r1 = 12 - //number of bases to search for index 2 at end of read - //note that this number should be negative - bases_num_r2 = -12 - + // mismatches allowed for indexes idx_5p_mismatch = 1 - idx_3p_mismatch = 1 // whether to use genomics database for index lookup diff --git a/nextflow_schema.json b/nextflow_schema.json index 974ff96..487d15b 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -73,16 +73,6 @@ "default": false, "description": "Whether to demultiplex the reads." }, - "bases_num_r1": { - "type": "integer", - "default": 10, - "description": "Number of bases to search for forward index from start of read." 
- }, - "bases_num_r2": { - "type": "integer", - "default": 13, - "description": "Number of bases to search for reverse index from end of read (number should be negative)." - }, "idx_5p_mismatch": { "type": "integer", "default": 1,