diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6583953..5cda947 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,22 +1,14 @@ repos: - repo: local hooks: - - id: docs_schema - name: docs_schema - entry: parse_docs -p docs -e .md -s intro links -oj nextflow_schema.json - language: python - always_run: true - pass_filenames: false - additional_dependencies: - - epi2melabs - id: docs_readme name: docs_readme - entry: parse_docs -p docs -e .md -s header intro quickstart links -ot README.md + entry: parse_docs -p docs -e .md -s 01_brief_description 02_introduction 03_compute_requirements 04_install_and_run 05_related_protocols 06_inputs 07_outputs 08_pipeline_overview 09_troubleshooting 10_FAQ 11_other -ot README.md -od output_definition.json -ns nextflow_schema.json language: python always_run: true pass_filenames: false additional_dependencies: - - epi2melabs + - epi2melabs>=0.0.48 - id: build_models name: build_models entry: datamodel-codegen --strict-nullable --base-class workflow_glue.results_schema_helpers.BaseModel --use-schema-description --disable-timestamp --input results_schema.yml --input-file-type openapi --output bin/workflow_glue/results_schema.py diff --git a/CHANGELOG.md b/CHANGELOG.md index d760c23..9437a93 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,13 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+## [v0.3.33] +### Changed +- Updates for cloud readiness +- Docs updated +### Removed +- Default local executor CPU and RAM limits + ## [v0.3.32] ### Fixed - reporting of sequence summaries crashing with `TypeError` diff --git a/README.md b/README.md index e7d23d1..e0981ca 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,6 @@ -# ARTIC SARS-CoV-2 Workflow - -This repository contains a [Nextflow](https://www.nextflow.io/) workflow for -running the ARTIC SARS-CoV-2 workflow on multiplexed MinION, GridION, and -PromethION runs. +# Artic Network SARS-CoV-2 Analysis +Run the ARTIC SARS-CoV-2 methodology on multiplexed MinION, GridION, and PromethION data. @@ -16,75 +13,207 @@ genomes that have been DNA sequenced using a pooled tiling amplicon strategy. The workflow consumes a folder containing demultiplexed sequence reads as prepared by either MinKNOW or Guppy. The workflow needs to know the primer scheme that has been used during genome amplification and library preparation -e.g. ARTIC/V3 or ONT_Midnight/V1. Other parameters can be specified too e.g. +e.g. `ARTIC/V3` or `ONT_Midnight/V1`. Other parameters can be specified too e.g. assign sample names to the barcodes or to adjust the length distribution of acceptable amplicon sequences. The Medaka variant model is selected based on the provided basecaller configuration (using the parameter `--basecaller_cfg`), or alternatively the Medaka model can be provided directly via the `--medaka_variant_model` parameter. -DNA sequences in FASTQ format are aggregated, filtered for sequence length and -quality characteristics and are mapped to the reference SARS-CoV-2 genome using -minimap2. A primer-scheme specific bed file is used to identify the regions of -the mapped sequences that correspond to synthetic sequences (primers) - these -regions are clipped to ensure that sequences are entirely of biological origin. 
-The retained sequences are used to prepare a consensus sequence that is then -polished using Medaka and variant calling is performed to produce a VCF file of -genetic differences relative to the reference genome. The consensus sequence is -annotated for virus clade information using NextClade and a strain assignment -is performed using Pangolin. -The completed analysis is summarised in an HTML format report that summarises -key information that includes number of reads, depth of coverage information -per amplicon and both the Nextclade and Pangolin information. -More information can be found in these two blog posts: -* [SARS-CoV-2 Midnight Analysis](https://labs.epi2me.io/sarscov2-midnight-analysis/) -* [Midnight Scheme Update](https://labs.epi2me.io/ont-midnight-scheme-update/) +## Compute requirements + +Recommended requirements: + ++ CPUs = 4 ++ memory = 8GB + +Minimum requirement: + ++ CPUs = 2 ++ memory = 4GB +Approximate run time: 5 minutes per sample +ARM processor support: False -## Quickstart -The workflow uses [Nextflow](https://www.nextflow.io/) to manage compute and -software resources, as such Nextflow will need to be installed before attempting -to run the workflow. -The workflow can currently be run using either -[Docker](https://www.docker.com/products/docker-desktop) or -[Singularity](https://docs.sylabs.io/guides/latest/user-guide/) to provide isolation of -the required software. Both methods are automated out-of-the-box provided -either Docker or Singularity is installed. +## Install and run -It is not required to clone or download the git repository in order to run the workflow. -For more information on running EPI2ME Labs workflows [visit our website](https://labs.epi2me.io/wfindex). +These are instructions to install and run the workflow on command line. You can also access the workflow via the [EPI2ME application](https://labs.epi2me.io/downloads/). 
-**Workflow options** +The workflow uses [nextflow](https://www.nextflow.io/) to manage compute and software resources, therefore nextflow will need to be installed before attempting to run the workflow. -To obtain the workflow, having installed `nextflow`, users can run: +The workflow can currently be run using either [Docker](https://www.docker.com/products/docker-desktop) or +[singularity](https://docs.sylabs.io/guides/3.0/user-guide/index.html) to provide isolation of +the required software. Both methods are automated out-of-the-box provided +either docker or singularity is installed. This is controlled by the [`-profile`](https://www.nextflow.io/docs/latest/config.html#config-profiles) parameter as exemplified below. +It is not required to clone or download the git repository in order to run the workflow. +More information on running EPI2ME workflows can be found on our [website](https://labs.epi2me.io/wfindex). + +The following command can be used to obtain the workflow. This will pull the repository into the assets folder of nextflow and provide a list of all parameters available for the workflow as well as an example command: + +``` +nextflow run epi2me-labs/wf-artic --help +``` +A demo dataset is provided for testing of the workflow. It can be downloaded using: ``` -nextflow run epi2me-labs/wf-artic --help +wget https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-artic/wf-artic-demo.tar.gz +tar -xzvf wf-artic-demo.tar.gz ``` +The workflow can be run with the demo data using: +``` +nextflow run epi2me-labs/wf-artic \ +--fastq test_data/reads.fastq.gz \ +-profile standard +``` +For further information about running a workflow on the cmd line see https://labs.epi2me.io/wfquickstart/ + + + +## Related protocols + +This workflow is designed to take input sequences that have been produced from [Oxford Nanopore Technologies](https://nanoporetech.com/) devices. 
+ +The Midnight protocol for sample preparation and sequencing can be found in the [Nanopore community](https://community.nanoporetech.com/docs/prepare/library_prep_protocols/pcr-tiling-of-sars-cov-2-virus-rbk114-and-midnight-rt/v/mrt_9186_v114_revd_19apr2023). + + + + +## Inputs + +### Input Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| fastq | string | FASTQ files to use in the analysis. | This accepts one of three cases: (i) the path to a single FASTQ file; (ii) the path to a top-level directory containing FASTQ files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ files. In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`. | | +| analyse_unclassified | boolean | Analyse unclassified reads from input directory. By default the workflow will not process reads in the unclassified directory. | If selected and if the input is a multiplex directory the workflow will also process the unclassified directory. | False | +| basecaller_cfg | string | Name of the model that was used to basecall signal data, used to select an appropriate Medaka model. | The basecaller configuration is used to automatically select the appropriate Medaka model. The automatic selection can be overridden with the 'medaka_variant_model' and 'medaka_consensus_model' parameters. The model list only shows models that are compatible with this workflow. | dna_r9.4.1_450bps_hac | + + +### Primer Scheme Selection + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| scheme_name | string | Primer scheme name. 
| This should be set to `SARS-CoV-2`, or `spike-seq` or your custom scheme name. This affects the choice of scheme versions you can use. The only scheme versions compatible with `spike-seq` are `ONT/V1` and `ONT/V4.1` | SARS-CoV-2 | +| scheme_version | string | Primer scheme version. | This is the version of the primer scheme to use, more details about primer schemes can be found [here](https://labs.epi2me.io/ont-midnight-scheme-update/). | ARTIC/V3 | +| custom_scheme | string | Path to a custom scheme. | If you have a custom primer scheme you can enter the details here. This must be the full path to the directory containing your appropriately named scheme bed and fasta files; .bed and .fasta. More details [here](https://labs.epi2me.io/ont-midnight-scheme-update/). | | + + +### Sample Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| sample_sheet | string | A CSV file used to map barcodes to sample aliases. The sample sheet can be provided when the input data is a directory containing sub-directories with FASTQ files. | The sample sheet is a CSV file with, minimally, columns named `barcode` and `alias`. Extra columns are allowed. A `type` column is required for certain workflows and should have the following values; `test_sample`, `positive_control`, `negative_control`, `no_template_control`. | | +| sample | string | A single sample name for non-multiplexed data. Permissible if passing a single .fastq(.gz) file or directory of .fastq(.gz) files. | | | + + +### Reporting Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| report_depth | integer | Min. depth for percentage coverage. (e.g. 89% genome covered at > `report_depth`) | | 100 | +| report_clade | boolean | Show results of Nextclade analysis in report. 
| | True | +| report_coverage | boolean | Show genome coverage traces in report. | | True | +| report_lineage | boolean | Show results of Pangolin analysis in report. | | True | +| report_variant_summary | boolean | Show variant information in report. | | True | + + +### Advanced Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| artic_threads | number | Number of CPU threads to use per artic task. | The total CPU resource used by the workflow is constrained by the executor configuration. | 4 | +| pangolin_threads | number | Number of CPU threads to use per pangolin task. | The total CPU resource used by the workflow is constrained by the executor configuration. | 4 | +| genotype_variants | string | Report genotyping information for scheme's known variants of interest, optionally provide file path as argument. | | | +| list_schemes | boolean | List primer schemes and exit without running analysis. | | False | +| min_len | number | Minimum read length (default: set by scheme). | | | +| max_len | number | Maximum read length (default: set by scheme). | | | +| max_softclip_length | integer | Remove reads with alignments showing large soft clipping | | | +| update_data | boolean | Update Pangolin and Nextclade data at runtime. | | True | +| pangolin_options | string | Pass options to Pangolin, for example "--analysis-mode fast --min-length 26000". | | | +| normalise | integer | Depth ceiling for depth of coverage normalisation | | 200 | +| medaka_variant_model | string | The name of a Medaka variant model to use. This name will override the model automatically chosen based on the provided basecaller configuration. | The workflow will attempt to map the basecalling model used to a suitable Medaka variant model. You can override this by providing a model with this option instead. 
| | + + +### Miscellaneous Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| lab_id | string | Laboratory identifier, used in reporting. | | | +| testkit | string | Test kit identifier, used in reporting. | | | +| disable_ping | boolean | Enable to prevent sending a workflow ping. | | False | + + + + + + +## Outputs + +Outputs files may be aggregated including information for all samples or provided per sample. Per sample files will be prefixed with respective aliases and represented below as {{ alias }}. + +| Title | File path | Description | Per sample or aggregated | +|-------|-----------|-------------|--------------------------| +| Workflow report | ./wf-artic-report.html | Report for all samples. | aggregated | +| Consensus sequences | ./all_consensus.fasta | Final consensus sequences for all samples in the analysis. | aggregated | +| Pangolin results | ./lineage_report.csv | Pangolin results for each of the samples in the analysis. | aggregated | +| Nextclade results | ./nextclade.json | Nextclade results for each of the samples in the analysis. | aggregated | +| Coverage data | ./all_depth.txt | Coverage of the reference genome in 20 base windows in all the samples in the analysis. | aggregated | +| Variants | ./{{ alias }}.pass.named.vcf.gz | A VCF file containing high confidence variants in the sample when compared to the reference. | per-sample | +| Variants index | ./{{ alias }}.pass.named.vcf.gz.tbi | An index file for the variants. | per-sample | +| Alignments | ./{{ alias }}.primertrimmed.rg.sorted.bam | A BAM file containing the reads for the sample aligned to the reference. | per-sample | +| Alignments index | ./{{ alias }}.primertrimmed.rg.sorted.bam.bai | An index file for the alignments. 
| per-sample | + + + + +## Pipeline overview + +The pipeline is largely a wrapper around the [Artic Network](https://artic.network/) [Field Bioinformatics](https://github.com/artic-network/fieldbioinformatics) analysis package. + +### 1. Concatenates input files and generate per read stats. + +The [fastcat/bamstats](https://github.com/epi2me-labs/fastcat) tool is used to concatenate multifile samples to be processed by the workflow. It will also output per read stats including average read lengths and qualities. Reads are additionally filtered for sequence length and quality characteristics. + +### 2. Mapping and primer trimming (Artic) + +Concatenated reads are mapped to the reference SARS-CoV-2 genome using [minimap2](https://github.com/lh3/minimap2). A primer scheme-specific BED file is used to identify the regions of +the mapped sequences that correspond to synthetic sequences (primers) - these regions are clipped to ensure that sequences are entirely of biological origin. + +### 3. Variant calling and consensus generation (Artic) + +The retained sequences are used to prepare a consensus sequence that is then polished using [Medaka](https://github.com/nanoporetech/medaka) and variant calling is performed to produce a VCF file of genetic differences relative to the reference genome. + +### 4. Lineage/clade assignment + +The consensus sequence is annotated for virus clade information using [NextClade](https://clades.nextstrain.org/), and strain assignment is performed using [Pangolin](https://github.com/cov-lineages/pangolin). + + + +## Troubleshooting + ++ If the workflow fails please run it with the demo data set to ensure the workflow itself is working. This will help us determine if the issue is related to the environment, input parameters or a bug. ++ See how to interpret some common nextflow exit codes [here](https://labs.epi2me.io/trouble-shooting/). + + + +## FAQ's -to see the options for the workflow. 
+If your question is not answered here, please report any issues or suggestions on the [github issues](https://github.com/epi2me-labs/wf-template/issues) page or start a discussion on the [community](https://community.nanoporetech.com/). -**Workflow outputs** -The primary outputs of the workflow include: -* a [FASTA](https://en.wikipedia.org/wiki/FASTA) file containing the consensus sequence for all samples, -* a [VCF](https://en.wikipedia.org/wiki/Variant_Call_Format) file sample all samples, -* an HTML report document detailing QC metrics and the primary findings of the workflow. +## Related blog posts ++ [SARS-CoV-2 Midnight Analysis](https://labs.epi2me.io/sarscov2-midnight-analysis/) ++ [Midnight Scheme Update](https://labs.epi2me.io/ont-midnight-scheme-update/) +See the [EPI2ME website](https://labs.epi2me.io/) for lots of other resources and blog posts. -## Useful links -* [Medaka](https://www.github.com/nanoporetech/medaka) -* [Artic](https://github.com/artic-network/fieldbioinformatics) -* [Nextflow](https://www.nextflow.io/) -* [Docker](https://www.docker.com/products/docker-desktop) -* [Singularity](https://docs.sylabs.io/guides/latest/user-guide/) diff --git a/bin/workflow_glue/check_sample_sheet.py b/bin/workflow_glue/check_sample_sheet.py index d77f1ab..fe4fc37 100755 --- a/bin/workflow_glue/check_sample_sheet.py +++ b/bin/workflow_glue/check_sample_sheet.py @@ -2,6 +2,7 @@ import codecs import csv import os +import re import sys from .util import get_named_logger, wf_parser # noqa: ABS101 @@ -79,6 +80,19 @@ def main(args): sys.stdout.write(f"Parsing error: {e}") sys.exit() + # check barcodes are correct format + for barcode in barcodes: + if not re.match(r'^barcode\d\d+$', barcode): + sys.stdout.write("values in 'barcode' column are incorrect format") + sys.exit() + + # check barcodes are all the same length + first_length = len(barcodes[0]) + for barcode in barcodes[1:]: + if len(barcode) != first_length: + sys.stdout.write("values in 'barcode' 
column are different lengths") + sys.exit() + # check barcode and alias values are unique if len(barcodes) > len(set(barcodes)): sys.stdout.write("values in 'barcode' column not unique") diff --git a/docs/01_brief_description.md b/docs/01_brief_description.md new file mode 100644 index 0000000..a4b8f4f --- /dev/null +++ b/docs/01_brief_description.md @@ -0,0 +1 @@ +Run the ARTIC SARS-CoV-2 methodology on multiplexed MinION, GridION, and PromethION data. \ No newline at end of file diff --git a/docs/02_introduction.md b/docs/02_introduction.md new file mode 100644 index 0000000..2cf66a7 --- /dev/null +++ b/docs/02_introduction.md @@ -0,0 +1,13 @@ +The wf-artic workflow implements a slightly modified ARTIC FieldBioinformatics +workflow for the purpose of preparing consensus sequences from SARS-CoV-2 +genomes that have been DNA sequenced using a pooled tiling amplicon strategy. + +The workflow consumes a folder containing demultiplexed sequence reads as +prepared by either MinKNOW or Guppy. The workflow needs to know the primer +scheme that has been used during genome amplification and library preparation +e.g. `ARTIC/V3` or `ONT_Midnight/V1`. Other parameters can be specified too e.g. +assign sample names to the barcodes or to adjust the length distribution of +acceptable amplicon sequences. The Medaka variant model is selected based on the +provided basecaller configuration (using the parameter `--basecaller_cfg`), or +alternatively the Medaka model can be provided directly via the `--medaka_variant_model` +parameter. 
\ No newline at end of file diff --git a/docs/03_compute_requirements.md b/docs/03_compute_requirements.md new file mode 100644 index 0000000..1d0e57b --- /dev/null +++ b/docs/03_compute_requirements.md @@ -0,0 +1,13 @@ +Recommended requirements: + ++ CPUs = 4 ++ memory = 8GB + +Minimum requirement: + ++ CPUs = 2 ++ memory = 4GB + +Approximate run time: 5 minutes per sample + +ARM processor support: False diff --git a/docs/04_install_and_run.md b/docs/04_install_and_run.md new file mode 100644 index 0000000..27d5bea --- /dev/null +++ b/docs/04_install_and_run.md @@ -0,0 +1,29 @@ +These are instructions to install and run the workflow on command line. You can also access the workflow via the [EPI2ME application](https://labs.epi2me.io/downloads/). + +The workflow uses [nextflow](https://www.nextflow.io/) to manage compute and software resources, therefore nextflow will need to be installed before attempting to run the workflow. + +The workflow can currently be run using either [Docker](https://www.docker.com/products/docker-desktop) or +[singularity](https://docs.sylabs.io/guides/3.0/user-guide/index.html) to provide isolation of +the required software. Both methods are automated out-of-the-box provided +either docker or singularity is installed. This is controlled by the [`-profile`](https://www.nextflow.io/docs/latest/config.html#config-profiles) parameter as exemplified below. + +It is not required to clone or download the git repository in order to run the workflow. +More information on running EPI2ME workflows can be found on our [website](https://labs.epi2me.io/wfindex). + +The following command can be used to obtain the workflow. This will pull the repository into the assets folder of nextflow and provide a list of all parameters available for the workflow as well as an example command: + +``` +nextflow run epi2me-labs/wf-artic --help +``` +A demo dataset is provided for testing of the workflow. 
It can be downloaded using: +``` +wget https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-artic/wf-artic-demo.tar.gz +tar -xzvf wf-artic-demo.tar.gz +``` +The workflow can be run with the demo data using: +``` +nextflow run epi2me-labs/wf-artic \ +--fastq test_data/reads.fastq.gz \ +-profile standard +``` +For further information about running a workflow on the cmd line see https://labs.epi2me.io/wfquickstart/ \ No newline at end of file diff --git a/docs/05_related_protocols.md b/docs/05_related_protocols.md new file mode 100644 index 0000000..fc5ecc6 --- /dev/null +++ b/docs/05_related_protocols.md @@ -0,0 +1,3 @@ +This workflow is designed to take input sequences that have been produced from [Oxford Nanopore Technologies](https://nanoporetech.com/) devices. + +The Midnight protocol for sample preparation and sequencing can be found in the [Nanopore community](https://community.nanoporetech.com/docs/prepare/library_prep_protocols/pcr-tiling-of-sars-cov-2-virus-rbk114-and-midnight-rt/v/mrt_9186_v114_revd_19apr2023). diff --git a/docs/06_inputs.md b/docs/06_inputs.md new file mode 100644 index 0000000..6b4e415 --- /dev/null +++ b/docs/06_inputs.md @@ -0,0 +1,63 @@ +### Input Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| fastq | string | FASTQ files to use in the analysis. | This accepts one of three cases: (i) the path to a single FASTQ file; (ii) the path to a top-level directory containing FASTQ files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ files. In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`. | | +| analyse_unclassified | boolean | Analyse unclassified reads from input directory. 
By default the workflow will not process reads in the unclassified directory. | If selected and if the input is a multiplex directory the workflow will also process the unclassified directory. | False | +| basecaller_cfg | string | Name of the model that was used to basecall signal data, used to select an appropriate Medaka model. | The basecaller configuration is used to automatically select the appropriate Medaka model. The automatic selection can be overridden with the 'medaka_variant_model' and 'medaka_consensus_model' parameters. The model list only shows models that are compatible with this workflow. | dna_r9.4.1_450bps_hac | + + +### Primer Scheme Selection + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| scheme_name | string | Primer scheme name. | This should be set to `SARS-CoV-2`, or `spike-seq` or your custom scheme name. This affects the choice of scheme versions you can use. The only scheme versions compatible with `spike-seq` are `ONT/V1` and `ONT/V4.1` | SARS-CoV-2 | +| scheme_version | string | Primer scheme version. | This is the version of the primer scheme to use, more details about primer schemes can be found [here](https://labs.epi2me.io/ont-midnight-scheme-update/). | ARTIC/V3 | +| custom_scheme | string | Path to a custom scheme. | If you have a custom primer scheme you can enter the details here. This must be the full path to the directory containing your appropriately named scheme bed and fasta files; .bed and .fasta. More details [here](https://labs.epi2me.io/ont-midnight-scheme-update/). | | + + +### Sample Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| sample_sheet | string | A CSV file used to map barcodes to sample aliases. The sample sheet can be provided when the input data is a directory containing sub-directories with FASTQ files. 
| The sample sheet is a CSV file with, minimally, columns named `barcode` and `alias`. Extra columns are allowed. A `type` column is required for certain workflows and should have the following values; `test_sample`, `positive_control`, `negative_control`, `no_template_control`. | | +| sample | string | A single sample name for non-multiplexed data. Permissible if passing a single .fastq(.gz) file or directory of .fastq(.gz) files. | | | + + +### Reporting Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| report_depth | integer | Min. depth for percentage coverage. (e.g. 89% genome covered at > `report_depth`) | | 100 | +| report_clade | boolean | Show results of Nextclade analysis in report. | | True | +| report_coverage | boolean | Show genome coverage traces in report. | | True | +| report_lineage | boolean | Show results of Pangolin analysis in report. | | True | +| report_variant_summary | boolean | Show variant information in report. | | True | + + +### Advanced Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| artic_threads | number | Number of CPU threads to use per artic task. | The total CPU resource used by the workflow is constrained by the executor configuration. | 4 | +| pangolin_threads | number | Number of CPU threads to use per pangolin task. | The total CPU resource used by the workflow is constrained by the executor configuration. | 4 | +| genotype_variants | string | Report genotyping information for scheme's known variants of interest, optionally provide file path as argument. | | | +| list_schemes | boolean | List primer schemes and exit without running analysis. | | False | +| min_len | number | Minimum read length (default: set by scheme). | | | +| max_len | number | Maximum read length (default: set by scheme). 
| | | +| max_softclip_length | integer | Remove reads with alignments showing large soft clipping | | | +| update_data | boolean | Update Pangolin and Nextclade data at runtime. | | True | +| pangolin_options | string | Pass options to Pangolin, for example "--analysis-mode fast --min-length 26000". | | | +| normalise | integer | Depth ceiling for depth of coverage normalisation | | 200 | +| medaka_variant_model | string | The name of a Medaka variant model to use. This name will override the model automatically chosen based on the provided basecaller configuration. | The workflow will attempt to map the basecalling model used to a suitable Medaka variant model. You can override this by providing a model with this option instead. | | + + +### Miscellaneous Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| lab_id | string | Laboratory identifier, used in reporting. | | | +| testkit | string | Test kit identifier, used in reporting. | | | +| disable_ping | boolean | Enable to prevent sending a workflow ping. | | False | + + diff --git a/docs/07_outputs.md b/docs/07_outputs.md new file mode 100644 index 0000000..b29a713 --- /dev/null +++ b/docs/07_outputs.md @@ -0,0 +1,13 @@ +Outputs files may be aggregated including information for all samples or provided per sample. Per sample files will be prefixed with respective aliases and represented below as {{ alias }}. + +| Title | File path | Description | Per sample or aggregated | +|-------|-----------|-------------|--------------------------| +| Workflow report | ./wf-artic-report.html | Report for all samples. | aggregated | +| Consensus sequences | ./all_consensus.fasta | Final consensus sequences for all samples in the analysis. | aggregated | +| Pangolin results | ./lineage_report.csv | Pangolin results for each of the samples in the analysis. 
| aggregated | +| Nextclade results | ./nextclade.json | Nextclade results for each of the samples in the analysis. | aggregated | +| Coverage data | ./all_depth.txt | Coverage of the reference genome in 20 base windows in all the samples in the analysis. | aggregated | +| Variants | ./{{ alias }}.pass.named.vcf.gz | A VCF file containing high confidence variants in the sample when compared to the reference. | per-sample | +| Variants index | ./{{ alias }}.pass.named.vcf.gz.tbi | An index file for the variants. | per-sample | +| Alignments | ./{{ alias }}.primertrimmed.rg.sorted.bam | A BAM file containing the reads for the sample aligned to the reference. | per-sample | +| Alignments index | ./{{ alias }}.primertrimmed.rg.sorted.bam.bai | An index file for the alignments. | per-sample | diff --git a/docs/08_pipeline_overview.md b/docs/08_pipeline_overview.md new file mode 100644 index 0000000..0579e25 --- /dev/null +++ b/docs/08_pipeline_overview.md @@ -0,0 +1,18 @@ +The pipeline is largely a wrapper around the [Artic Network](https://artic.network/) [Field Bioinformatics](https://github.com/artic-network/fieldbioinformatics) analysis package. + +### 1. Concatenates input files and generate per read stats. + +The [fastcat/bamstats](https://github.com/epi2me-labs/fastcat) tool is used to concatenate multifile samples to be processed by the workflow. It will also output per read stats including average read lengths and qualities. Reads are additionally filtered for sequence length and quality characteristics. + +### 2. Mapping and primer trimming (Artic) + +Concatenated reads are mapped to the reference SARS-CoV-2 genome using [minimap2](https://github.com/lh3/minimap2). A primer scheme-specific BED file is used to identify the regions of +the mapped sequences that correspond to synthetic sequences (primers) - these regions are clipped to ensure that sequences are entirely of biological origin. + +### 3. 
Variant calling and consensus generation (Artic) + +The retained sequences are used to prepare a consensus sequence that is then polished using [Medaka](https://github.com/nanoporetech/medaka) and variant calling is performed to produce a VCF file of genetic differences relative to the reference genome. + +### 4. Lineage/clade assignment + +The consensus sequence is annotated for virus clade information using [NextClade](https://clades.nextstrain.org/), and strain assignment is performed using [Pangolin](https://github.com/cov-lineages/pangolin). \ No newline at end of file diff --git a/docs/09_troubleshooting.md b/docs/09_troubleshooting.md new file mode 100644 index 0000000..ecb81f2 --- /dev/null +++ b/docs/09_troubleshooting.md @@ -0,0 +1,2 @@ ++ If the workflow fails, please run it with the demo data set to ensure the workflow itself is working. This will help us determine if the issue is related to the environment, input parameters or a bug. ++ See how to interpret some common Nextflow exit codes [here](https://labs.epi2me.io/trouble-shooting/). \ No newline at end of file diff --git a/docs/10_FAQ.md b/docs/10_FAQ.md new file mode 100644 index 0000000..2414605 --- /dev/null +++ b/docs/10_FAQ.md @@ -0,0 +1 @@ +If your question is not answered here, please report any issues or suggestions on the [GitHub issues](https://github.com/epi2me-labs/wf-template/issues) page or start a discussion on the [community](https://community.nanoporetech.com/). \ No newline at end of file diff --git a/docs/11_other.md b/docs/11_other.md new file mode 100644 index 0000000..e780bc0 --- /dev/null +++ b/docs/11_other.md @@ -0,0 +1,4 @@ ++ [SARS-CoV-2 Midnight Analysis](https://labs.epi2me.io/sarscov2-midnight-analysis/) ++ [Midnight Scheme Update](https://labs.epi2me.io/ont-midnight-scheme-update/) + +See the [EPI2ME website](https://labs.epi2me.io/) for lots of other resources and blog posts. 
\ No newline at end of file diff --git a/docs/header.md b/docs/header.md deleted file mode 100644 index b4f3f0c..0000000 --- a/docs/header.md +++ /dev/null @@ -1,5 +0,0 @@ -# ARTIC SARS-CoV-2 Workflow - -This repository contains a [Nextflow](https://www.nextflow.io/) workflow for -running the ARTIC SARS-CoV-2 workflow on multiplexed MinION, GridION, and -PromethION runs. diff --git a/docs/intro.md b/docs/intro.md deleted file mode 100644 index 5964e44..0000000 --- a/docs/intro.md +++ /dev/null @@ -1,34 +0,0 @@ -## Introduction - -The wf-artic workflow implements a slightly modified ARTIC FieldBioinformatics -workflow for the purpose of preparing consensus sequences from SARS-CoV-2 -genomes that have been DNA sequenced using a pooled tiling amplicon strategy. - -The workflow consumes a folder containing demultiplexed sequence reads as -prepared by either MinKNOW or Guppy. The workflow needs to know the primer -scheme that has been used during genome amplification and library preparation -e.g. ARTIC/V3 or ONT_Midnight/V1. Other parameters can be specified too e.g. -assign sample names to the barcodes or to adjust the length distribution of -acceptable amplicon sequences. The Medaka variant model is selected based on the -provided basecaller configuration (using the parameter `--basecaller_cfg`), or -alternatively the Medaka model can be provided directly via the `--medaka_variant_model` -parameter. - -DNA sequences in FASTQ format are aggregated, filtered for sequence length and -quality characteristics and are mapped to the reference SARS-CoV-2 genome using -minimap2. A primer-scheme specific bed file is used to identify the regions of -the mapped sequences that correspond to synthetic sequences (primers) - these -regions are clipped to ensure that sequences are entirely of biological origin. 
-The retained sequences are used to prepare a consensus sequence that is then -polished using Medaka and variant calling is performed to produce a VCF file of -genetic differences relative to the reference genome. The consensus sequence is -annotated for virus clade information using NextClade and a strain assignment -is performed using Pangolin. - -The completed analysis is summarised in an HTML format report that summarises -key information that includes number of reads, depth of coverage information -per amplicon and both the Nextclade and Pangolin information. - -More information can be found in these two blog posts: -* [SARS-CoV-2 Midnight Analysis](https://labs.epi2me.io/sarscov2-midnight-analysis/) -* [Midnight Scheme Update](https://labs.epi2me.io/ont-midnight-scheme-update/) diff --git a/docs/links.md b/docs/links.md deleted file mode 100644 index c3065af..0000000 --- a/docs/links.md +++ /dev/null @@ -1,7 +0,0 @@ -## Useful links - -* [Medaka](https://www.github.com/nanoporetech/medaka) -* [Artic](https://github.com/artic-network/fieldbioinformatics) -* [Nextflow](https://www.nextflow.io/) -* [Docker](https://www.docker.com/products/docker-desktop) -* [Singularity](https://docs.sylabs.io/guides/latest/user-guide/) diff --git a/docs/quickstart.md b/docs/quickstart.md deleted file mode 100644 index d553c36..0000000 --- a/docs/quickstart.md +++ /dev/null @@ -1,32 +0,0 @@ -## Quickstart - -The workflow uses [Nextflow](https://www.nextflow.io/) to manage compute and -software resources, as such Nextflow will need to be installed before attempting -to run the workflow. - -The workflow can currently be run using either -[Docker](https://www.docker.com/products/docker-desktop) or -[Singularity](https://docs.sylabs.io/guides/latest/user-guide/) to provide isolation of -the required software. Both methods are automated out-of-the-box provided -either Docker or Singularity is installed. 
- -It is not required to clone or download the git repository in order to run the workflow. -For more information on running EPI2ME Labs workflows [visit our website](https://labs.epi2me.io/wfindex). - -**Workflow options** - -To obtain the workflow, having installed `nextflow`, users can run: - -``` -nextflow run epi2me-labs/wf-artic --help -``` - -to see the options for the workflow. - -**Workflow outputs** - -The primary outputs of the workflow include: - -* a [FASTA](https://en.wikipedia.org/wiki/FASTA) file containing the consensus sequence for all samples, -* a [VCF](https://en.wikipedia.org/wiki/Variant_Call_Format) file sample all samples, -* an HTML report document detailing QC metrics and the primary findings of the workflow. diff --git a/nextflow.config b/nextflow.config index 8304feb..359d278 100644 --- a/nextflow.config +++ b/nextflow.config @@ -72,7 +72,7 @@ manifest { description = 'Run the ARTIC SARS-CoV-2 methodology on multiplexed MinION, GridION, and PromethION data.' mainScript = 'main.nf' nextflowVersion = '>=23.04.2' - version = 'v0.3.32' + version = 'v0.3.33' } epi2melabs { @@ -85,21 +85,26 @@ env { PYTHONNOUSERSITE = 1 } -executor { - $local { - cpus = 4 - memory = "8 GB" +process { + withLabel:wf_common { + container = "ontresearch/wf-common:${params.wf.common_sha}" + memory = '1G' + } + withLabel:artic { + container = "ontresearch/wf-artic:${params.wf.container_sha}" + memory = '2G' + } + withLabel:pangolin { + container = "ontresearch/pangolin:${params.wf.pangolin_sha}" + memory = '2G' + } + withLabel:nextclade { + container = "ontresearch/nextclade:${params.wf.nextclade_sha}" + memory = '1G' } + shell = ['/bin/bash', '-euo', 'pipefail'] } -process { - withLabel:wf_common { container = "ontresearch/wf-common:${params.wf.common_sha}" } - withLabel:artic { container = "ontresearch/wf-artic:${params.wf.container_sha}" } - withLabel:pangolin { container = "ontresearch/pangolin:${params.wf.pangolin_sha}" } - withLabel:nextclade { container = 
"ontresearch/nextclade:${params.wf.nextclade_sha}" } - shell = ['/bin/bash', '-euo', 'pipefail'] - } - profiles { // the "standard" profile is used implicitely by nextflow @@ -131,18 +136,21 @@ profiles { process { executor = 'awsbatch' queue = "${params.aws_queue}" - memory = '8G' withLabel:wf_common { container = "${params.aws_image_prefix}-wf-common:${params.wf.common_sha}-root" + memory = '1G' } withLabel:artic { container = "${params.aws_image_prefix}-wf-artic:${params.wf.container_sha}-root" + memory = '2G' } withLabel:pangolin { container = "${params.aws_image_prefix}-pangolin:${params.wf.pangolin_sha}-root" + memory = '2G' } withLabel:nextclade { container = "${params.aws_image_prefix}-nextclade:${params.wf.nextclade_sha}-root" + memory = '1G' } shell = ['/bin/bash', '-euo', 'pipefail'] } diff --git a/nextflow_schema.json b/nextflow_schema.json index 12e8abc..565cb67 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -2,6 +2,7 @@ "$schema": "http://json-schema.org/draft-07/schema", "$id": "https://raw.githubusercontent.com/./master/nextflow_schema.json", "title": "epi2me-labs/wf-artic", + "workflow_title": "Artic Network SARS-CoV-2 Analysis", "description": "Run the ARTIC SARS-CoV-2 methodology on multiplexed MinION, GridION, and PromethION data.", "demo_url": "https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-artic/wf-artic-demo.tar.gz", "aws_demo_url": "https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-artic/wf-artic-demo/aws.nextflow.config", @@ -354,6 +355,18 @@ "type": "boolean" } }, + "resources": { + "recommended": { + "cpus": 4, + "memory": "8GB" + }, + "minimum": { + "cpus": 2, + "memory": "4GB" + }, + "run_time": "5 minutes per sample", + "arm_support": false + }, "docs": { "intro": "## Introduction\n\nThe wf-artic workflow implements a slightly modified ARTIC FieldBioinformatics\nworkflow for the purpose of preparing consensus sequences from SARS-CoV-2\ngenomes that have been DNA sequenced using a pooled 
tiling amplicon strategy.\n\nThe workflow consumes a folder containing demultiplexed sequence reads as\nprepared by either MinKNOW or Guppy. The workflow needs to know the primer\nscheme that has been used during genome amplification and library preparation\ne.g. ARTIC/V3 or ONT_Midnight/V1. Other parameters can be specified too e.g.\nassign sample names to the barcodes or to adjust the length distribution of\nacceptable amplicon sequences. The Medaka variant model is selected based on the\nprovided basecaller configuration (using the parameter `--basecaller_cfg`), or\nalternatively the Medaka model can be provided directly via the `--medaka_variant_model`\nparameter.\n\nDNA sequences in FASTQ format are aggregated, filtered for sequence length and\nquality characteristics and are mapped to the reference SARS-CoV-2 genome using\nminimap2. A primer-scheme specific bed file is used to identify the regions of\nthe mapped sequences that correspond to synthetic sequences (primers) - these\nregions are clipped to ensure that sequences are entirely of biological origin.\nThe retained sequences are used to prepare a consensus sequence that is then\npolished using Medaka and variant calling is performed to produce a VCF file of\ngenetic differences relative to the reference genome. 
The consensus sequence is\nannotated for virus clade information using NextClade and a strain assignment\nis performed using Pangolin.\n\nThe completed analysis is summarised in an HTML format report that summarises\nkey information that includes number of reads, depth of coverage information\nper amplicon and both the Nextclade and Pangolin information.\n\nMore information can be found in these two blog posts:\n* [SARS-CoV-2 Midnight Analysis](https://labs.epi2me.io/sarscov2-midnight-analysis/)\n* [Midnight Scheme Update](https://labs.epi2me.io/ont-midnight-scheme-update/)\n", "links": "## Useful links\n\n* [Medaka](https://www.github.com/nanoporetech/medaka)\n* [Artic](https://github.com/artic-network/fieldbioinformatics)\n* [Nextflow](https://www.nextflow.io/)\n* [Docker](https://www.docker.com/products/docker-desktop)\n* [Singularity](https://docs.sylabs.io/guides/latest/user-guide/)\n" diff --git a/output_definition.json b/output_definition.json new file mode 100644 index 0000000..2da8465 --- /dev/null +++ b/output_definition.json @@ -0,0 +1,76 @@ +{ + "files": { + "workflow-report": { + "filepath": "./wf-artic-report.html", + "title": "Workflow report", + "description": "Report for all samples.", + "mime-type": "text/html", + "optional": false, + "type": "aggregated" + }, + "consensus": { + "filepath": "./all_consensus.fasta", + "title": "Consensus sequences", + "description": "Final consensus sequences for all samples in the analysis.", + "mime-type": "text/plain", + "optional": false, + "type": "aggregated" + }, + "pangolin-results": { + "filepath": "./lineage_report.csv", + "title": "Pangolin results", + "description": "Pangolin results for each of the samples in the analysis.", + "mime-type": "text/csv", + "optional": false, + "type": "aggregated" + }, + "nextclade-results": { + "filepath": "./nextclade.json", + "title": "Nextclade results", + "description": "Nextclade results for each of the samples in the analysis.", + "mime-type": "application/json", + 
"optional": false, + "type": "aggregated" + }, + "coverage": { + "filepath": "./all_depth.txt", + "title": "Coverage data", + "description": "Coverage of the reference genome in 20 base windows in all the samples in the analysis.", + "mime-type": "text/plain", + "optional": false, + "type": "aggregated" + }, + "variants": { + "filepath": "./{{ alias }}.pass.named.vcf.gz", + "title": "Variants", + "description": "A VCF file containing high confidence variants in the sample when compared to the reference.", + "mime-type": "application/gzip", + "optional": false, + "type": "per-sample" + }, + "variants-index": { + "filepath": "./{{ alias }}.pass.named.vcf.gz.tbi", + "title": "Variants index", + "description": "An index file for the variants.", + "mime-type": "application/octet-stream", + "optional": false, + "type": "per-sample" + }, + "alignments": { + "filepath": "./{{ alias }}.primertrimmed.rg.sorted.bam", + "title": "Alignments", + "description": "A BAM file containing the reads for the sample aligned to the reference.", + "mime-type": "application/gzip", + "optional": false, + "type": "per-sample" + }, + "alignments-index": { + "filepath": "./{{ alias }}.primertrimmed.rg.sorted.bam.bai", + "title": "Alignments index", + "description": "An index file for the alignments.", + "mime-type": "application/octet-stream", + "optional": false, + "type": "per-sample" + } + } +} \ No newline at end of file