diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index d8f9579..9c53149 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -122,3 +122,22 @@ body: render: shell validations: required: false + - type: dropdown + id: run-demo + attributes: + label: Were you able to successfully run the latest version of the workflow with the demo data? + description: For CLI execution, were you able to successfully run the workflow using the demo data available in the [Install and run](./README.md#install-and-run) section of the `README.md`? For execution in the EPI2ME application, were you able to successfully run the workflow via the "Use demo data" button? + options: + - 'yes' + - 'no' + - other (please describe below) + validations: + required: true + - type: textarea + id: demo-other + attributes: + label: Other demo data information + render: shell + validations: + required: false + diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index e47d70a..a60f6fd 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -37,5 +37,3 @@ docker-run: - if: $MATRIX_NAME == "NEB-VarSkip/v2" variables: NF_WORKFLOW_OPTS: " --fastq test_data/fastq --samples test_data/sample_sheet.csv --scheme_name SARS-CoV-2 --scheme_version NEB-VarSkip/v2" - - diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5cda947..2f0bb9f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,12 +3,12 @@ repos: hooks: - id: docs_readme name: docs_readme - entry: parse_docs -p docs -e .md -s 01_brief_description 02_introduction 03_compute_requirements 04_install_and_run 05_related_protocols 06_inputs 07_outputs 08_pipeline_overview 09_troubleshooting 10_FAQ 11_other -ot README.md -od output_definition.json -ns nextflow_schema.json + entry: parse_docs -p docs -e .md -s 01_brief_description 02_introduction 03_compute_requirements 04_install_and_run 05_related_protocols 06_input_example 06_input_parameters 
07_outputs 08_pipeline_overview 09_troubleshooting 10_FAQ 11_other -ot README.md -od output_definition.json -ns nextflow_schema.json language: python always_run: true pass_filenames: false additional_dependencies: - - epi2melabs>=0.0.48 + - epi2melabs>=0.0.52 - id: build_models name: build_models entry: datamodel-codegen --strict-nullable --base-class workflow_glue.results_schema_helpers.BaseModel --use-schema-description --disable-timestamp --input results_schema.yml --input-file-type openapi --output bin/workflow_glue/results_schema.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 9437a93..9405562 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [v1.0.0] +## Changes +- Documentation +- Defined resource requirements + ## [v0.3.33] ## Changes - Updates for cloud readiness diff --git a/README.md b/README.md index c44e6a4..be8c63e 100644 --- a/README.md +++ b/README.md @@ -27,12 +27,12 @@ parameter. Recommended requirements: + CPUs = 4 -+ memory = 8GB ++ Memory = 8GB -Minimum requirement: +Minimum requirements: + CPUs = 2 -+ memory = 4GB ++ Memory = 4GB Approximate run time: 5 minutes per sample @@ -85,7 +85,30 @@ The Midnight protocol for sample preparation and sequencing can be found in the -## Inputs +## Input example + + +This workflow accepts FASTQ files as input. + +The FASTQ input parameters for this workflow accept one of three cases: (i) the path to a single FASTQ; (ii) the path to a top-level directory containing FASTQ files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ files. In the first and second cases (i and ii), a sample name can be supplied with `--sample`. 
In the last case (iii), the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`. + +``` +(i) (ii) (iii) +input_reads.fastq ─── input_directory ─── input_directory + ├── reads0.fastq ├── barcode01 + └── reads1.fastq │ ├── reads0.fastq + │ └── reads1.fastq + ├── barcode02 + │ ├── reads0.fastq + │ ├── reads1.fastq + │ └── reads2.fastq + └── barcode03 + └── reads0.fastq +``` + + + +## Input parameters ### Input Options @@ -113,6 +136,13 @@ The Midnight protocol for sample preparation and sequencing can be found in the | sample | string | A single sample name for non-multiplexed data. Permissible if passing a single .fastq(.gz) file or directory of .fastq(.gz) files. | | | +### Output Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| out_dir | string | Directory for output of all workflow results. | | output | + + ### Reporting Options | Nextflow parameter name | Type | Description | Help | Default | @@ -137,6 +167,7 @@ The Midnight protocol for sample preparation and sequencing can be found in the | max_softclip_length | integer | Remove reads with alignments showing large soft clipping | | | | update_data | boolean | Update Pangolin and Nextclade data at runtime. | | True | | pangolin_options | string | Pass options to Pangolin, for example "--analysis-mode fast --min-length 26000". | | | +| nextclade_data_tag | string | The tag of the nextclade data packet | | | | normalise | integer | Depth ceiling for depth of coverage normalisation | | 200 | | medaka_variant_model | string | The name of a Medaka variant model to use. This name will override the model automatically chosen based on the provided basecaller configuration. | The workflow will attempt to map the basecalling model used to a suitable Medaka variant model. 
You can override this by providing a model with this option instead. | | @@ -156,7 +187,7 @@ The Midnight protocol for sample preparation and sequencing can be found in the ## Outputs -Outputs files may be aggregated including information for all samples or provided per sample. Per sample files will be prefixed with respective aliases and represented below as {{ alias }}. +Output files may be aggregated including information for all samples or provided per sample. Per-sample files will be prefixed with respective aliases and represented below as {{ alias }}. | Title | File path | Description | Per sample or aggregated | |-------|-----------|-------------|--------------------------| diff --git a/bin/workflow_glue/check_sample_sheet.py b/bin/workflow_glue/check_sample_sheet.py index fe4fc37..62e3483 100755 --- a/bin/workflow_glue/check_sample_sheet.py +++ b/bin/workflow_glue/check_sample_sheet.py @@ -43,7 +43,7 @@ def main(args): ] if not os.path.exists(args.sample_sheet) or not os.path.isfile(args.sample_sheet): - sys.stdout.write(f"Could not open sample sheet '{args.sample_sheet}'.") + sys.stdout.write("Could not open sample sheet file.") sys.exit() try: diff --git a/docs/03_compute_requirements.md b/docs/03_compute_requirements.md index 1d0e57b..682ad63 100644 --- a/docs/03_compute_requirements.md +++ b/docs/03_compute_requirements.md @@ -1,12 +1,12 @@ Recommended requirements: + CPUs = 4 -+ memory = 8GB ++ Memory = 8GB -Minimum requirement: +Minimum requirements: + CPUs = 2 -+ memory = 4GB ++ Memory = 4GB Approximate run time: 5 minutes per sample diff --git a/docs/06_input_example.md b/docs/06_input_example.md new file mode 100644 index 0000000..a64db34 --- /dev/null +++ b/docs/06_input_example.md @@ -0,0 +1,18 @@ + +This workflow accepts FASTQ files as input. 
+ +The FASTQ input parameters for this workflow accept one of three cases: (i) the path to a single FASTQ; (ii) the path to a top-level directory containing FASTQ files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ files. In the first and second cases (i and ii), a sample name can be supplied with `--sample`. In the last case (iii), the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`. + +``` +(i) (ii) (iii) +input_reads.fastq ─── input_directory ─── input_directory + ├── reads0.fastq ├── barcode01 + └── reads1.fastq │ ├── reads0.fastq + │ └── reads1.fastq + ├── barcode02 + │ ├── reads0.fastq + │ ├── reads1.fastq + │ └── reads2.fastq + └── barcode03 + └── reads0.fastq +``` \ No newline at end of file diff --git a/docs/06_inputs.md b/docs/06_input_parameters.md similarity index 94% rename from docs/06_inputs.md rename to docs/06_input_parameters.md index 6b4e415..05cad67 100644 --- a/docs/06_inputs.md +++ b/docs/06_input_parameters.md @@ -24,6 +24,13 @@ | sample | string | A single sample name for non-multiplexed data. Permissible if passing a single .fastq(.gz) file or directory of .fastq(.gz) files. | | | +### Output Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| out_dir | string | Directory for output of all workflow results. | | output | + + ### Reporting Options | Nextflow parameter name | Type | Description | Help | Default | @@ -48,6 +55,7 @@ | max_softclip_length | integer | Remove reads with alignments showing large soft clipping | | | | update_data | boolean | Update Pangolin and Nextclade data at runtime. | | True | | pangolin_options | string | Pass options to Pangolin, for example "--analysis-mode fast --min-length 26000". 
| | | +| nextclade_data_tag | string | The tag of the nextclade data packet | | | | normalise | integer | Depth ceiling for depth of coverage normalisation | | 200 | | medaka_variant_model | string | The name of a Medaka variant model to use. This name will override the model automatically chosen based on the provided basecaller configuration. | The workflow will attempt to map the basecalling model used to a suitable Medaka variant model. You can override this by providing a model with this option instead. | | diff --git a/docs/07_outputs.md b/docs/07_outputs.md index b29a713..678436d 100644 --- a/docs/07_outputs.md +++ b/docs/07_outputs.md @@ -1,4 +1,4 @@ -Outputs files may be aggregated including information for all samples or provided per sample. Per sample files will be prefixed with respective aliases and represented below as {{ alias }}. +Output files may be aggregated including information for all samples or provided per sample. Per-sample files will be prefixed with respective aliases and represented below as {{ alias }}. | Title | File path | Description | Per sample or aggregated | |-------|-----------|-------------|--------------------------| diff --git a/lib/NfcoreSchema.groovy b/lib/NfcoreSchema.groovy index 3b29be1..81fdc2e 100644 --- a/lib/NfcoreSchema.groovy +++ b/lib/NfcoreSchema.groovy @@ -141,7 +141,7 @@ class NfcoreSchema { for (specifiedParam in params.keySet()) { // nextflow params if (nf_params.contains(specifiedParam)) { - log.error "ERROR: You used a core Nextflow option with two hyphens: '--${specifiedParam}'. Please resubmit with '-${specifiedParam}'" + log.error "You used a core Nextflow option with two hyphens: '--${specifiedParam}'. Please resubmit with '-${specifiedParam}'" has_error = true } // unexpected params @@ -180,7 +180,7 @@ class NfcoreSchema { schema.validate(params_json) } catch (ValidationException e) { println '' - log.error 'ERROR: Validation of pipeline parameters failed!' 
+ log.error 'Validation of pipeline parameters failed!' JSONObject exceptionJSON = e.toJSON() HashSet observed_exceptions = [] printExceptions(exceptionJSON, params_json, log, enums, raw_schema, observed_exceptions) diff --git a/lib/ingress.nf b/lib/ingress.nf index 5839730..0e492f4 100644 --- a/lib/ingress.nf +++ b/lib/ingress.nf @@ -149,13 +149,12 @@ def xam_ingress(Map arguments) def input = get_valid_inputs(margs, xam_extensions) + // check BAM headers to see if any samples are uBAM ch_result = input.dirs | map { meta, path -> [meta, get_target_files_in_dir(path, xam_extensions)] } | mix(input.files) - - ch_is_unaligned = ch_result | checkBamHeaders - | map { meta, is_unaligned_env, mixed_headers_env -> + | map { meta, paths, is_unaligned_env, mixed_headers_env -> // convert the env. variables from strings ('0' or '1') into bools boolean is_unaligned = is_unaligned_env as int as boolean boolean mixed_headers = mixed_headers_env as int as boolean @@ -163,14 +162,11 @@ def xam_ingress(Map arguments) if (mixed_headers) { error "Found mixed headers in (u)BAM files of sample '${meta.alias}'." 
} - [meta, is_unaligned] + // add `is_unaligned` to the metamap (note the use of `+` to create a copy of + // `meta` to avoid modifying every item in the channel; + // https://github.com/nextflow-io/nextflow/issues/2660) + [meta + [is_unaligned: is_unaligned], paths] } - - ch_result = ch_result | join(ch_is_unaligned) - // add `is_unaligned` to the metamap (note the use of `+` to create a copy of `meta` - // to avoid modifying every item in the channel; - // https://github.com/nextflow-io/nextflow/issues/2660) - | map { meta, paths, is_unaligned -> [meta + [is_unaligned: is_unaligned], paths] } | branch { meta, paths -> // set `paths` to `null` for uBAM samples if unallowed (they will be added to // the results channel in shape of `[meta, null]` at the end of the function @@ -240,11 +236,17 @@ process checkBamHeaders { label "ingress" label "wf_common" cpus 1 + memory "2 GB" input: tuple val(meta), path("input_dir/reads*.bam") output: // set the two env variables by `eval`-ing the output of the python script // checking the XAM headers - tuple val(meta), env(IS_UNALIGNED), env(MIXED_HEADERS) + tuple( + val(meta), + path("input_dir/reads*.bam", includeInputs: true), + env(IS_UNALIGNED), + env(MIXED_HEADERS), + ) script: """ workflow-glue check_bam_headers_in_dir input_dir > env.vars @@ -257,6 +259,7 @@ process mergeBams { label "ingress" label "wf_common" cpus 3 + memory "4 GB" input: tuple val(meta), path("input_bams/reads*.bam") output: tuple val(meta), path("reads.bam") shell: @@ -271,6 +274,7 @@ process catSortBams { label "ingress" label "wf_common" cpus 4 + memory "4 GB" input: tuple val(meta), path("input_bams/reads*.bam") output: tuple val(meta), path("reads.bam") script: @@ -285,6 +289,7 @@ process sortBam { label "ingress" label "wf_common" cpus 3 + memory "4 GB" input: tuple val(meta), path("reads.bam") output: tuple val(meta), path("reads.sorted.bam") script: @@ -298,17 +303,22 @@ process bamstats { label "ingress" label "wf_common" cpus 3 + memory "4 
GB" input: tuple val(meta), path("reads.bam") output: - tuple val(meta), path("reads.bam"), path("bamstats_results") + tuple val(meta), + path("reads.bam"), + path("bamstats_results") script: def bamstats_threads = Math.max(1, task.cpus - 1) """ mkdir bamstats_results bamstats reads.bam -s $meta.alias -u \ -f bamstats_results/bamstats.flagstat.tsv -t $bamstats_threads \ + --histograms histograms \ | bgzip > bamstats_results/bamstats.readstats.tsv.gz + mv histograms/* bamstats_results/ # extract the run IDs from the per-read stats csvtk cut -tf runid bamstats_results/bamstats.readstats.tsv.gz \ @@ -414,6 +424,7 @@ process move_or_compress_fq_file { label "ingress" label "wf_common" cpus 1 + memory "2 GB" input: // don't stage `input` with a literal because we check the file extension tuple val(meta), path(input) @@ -439,11 +450,14 @@ process fastcat { label "ingress" label "wf_common" cpus 3 + memory "2 GB" input: tuple val(meta), path("input") val extra_args output: - tuple val(meta), path("seqs.fastq.gz"), path("fastcat_stats") + tuple val(meta), + path("seqs.fastq.gz"), + path("fastcat_stats") script: String out = "seqs.fastq.gz" String fastcat_stats_outdir = "fastcat_stats" @@ -453,10 +467,12 @@ process fastcat { -s ${meta["alias"]} \ -r >(bgzip -c > $fastcat_stats_outdir/per-read-stats.tsv.gz) \ -f $fastcat_stats_outdir/per-file-stats.tsv \ + --histograms histograms \ $extra_args \ input \ | bgzip > $out + mv histograms/* $fastcat_stats_outdir # extract the run IDs from the per-read stats csvtk cut -tf runid $fastcat_stats_outdir/per-read-stats.tsv.gz \ | csvtk del-header | sort | uniq > $fastcat_stats_outdir/run_ids @@ -737,6 +753,7 @@ process validate_sample_sheet { cpus 1 label "ingress" label "wf_common" + memory "2 GB" input: path "sample_sheet.csv" val required_sample_types diff --git a/nextflow.config b/nextflow.config index 359d278..6a1130b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -58,7 +58,7 @@ params { "--scheme_name 'SARS-CoV-2'", 
"--scheme_version 'Midnight-ONT/V3'" ] - common_sha = 'sha91452ece4f647f62b32dac3a614635a6f0d7f8b5' + common_sha = 'sha1c5febff9f75143710826498b093d9769a5edbb9' container_sha = 'sha6e8c02f120faf92b4e61e1d0797d71210aaec20b' nextclade_sha = 'shae56aff3b5b498b8cb950993692f914033397f8da' pangolin_sha = 'shae304dd3bc308a519f26908eb9d5ffa7686131d17' @@ -72,7 +72,7 @@ manifest { description = 'Run the ARTIC SARS-CoV-2 methodology on multiplexed MinION, GridION, and PromethION data.' mainScript = 'main.nf' nextflowVersion = '>=23.04.2' - version = 'v0.3.33' + version = 'v1.0.0' } epi2melabs { @@ -137,7 +137,7 @@ profiles { executor = 'awsbatch' queue = "${params.aws_queue}" withLabel:wf_common { - container = "${params.aws_image_prefix}-wf-common:${params.wf.common_sha}-root" + container = "${params.aws_image_prefix}-wf-common:${params.wf.common_sha}" memory = '1G' } withLabel:artic { diff --git a/nextflow_schema.json b/nextflow_schema.json index 565cb67..1220600 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -232,10 +232,12 @@ "nextclade_version": { "type": "string", "default": "2.14.0", + "description": "The version of nextclade", "hidden": true }, "nextclade_data_tag": { "type": "string", + "description": "The tag of the nextclade data packet", "hidden": false }, "normalise": {