diff --git a/CITATIONS.md b/CITATIONS.md index 8451b47..e9b5164 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -40,6 +40,10 @@ > Kurtzer GM, Sochat V, Bauer MW. Singularity: Scientific containers for mobility of compute. PLoS One. 2017 May 11;12(5):e0177459. doi: 10.1371/journal.pone.0177459. eCollection 2017. PubMed PMID: 28494014; PubMed Central PMCID: PMC5426675. +- [Bracken](https://doi.org/10.7717/peerj-cs.104) + + > Lu, J., Breitwieser, F. P., Thielen, P., & Salzberg, S. L. (2017). Bracken: estimating species abundance in metagenomics data. PeerJ. Computer Science, 3(e104), e104. https://doi.org/10.7717/peerj-cs.104 + - [Centrifuge](https://doi.org/10.1101/gr.210641.116) > Kim, D., Song, L., Breitwieser, F. P., & Salzberg, S. L. (2016). Centrifuge: rapid and sensitive classification of metagenomic sequences. Genome Research, 26(12), 1721–1729. https://doi.org/10.1101/gr.210641.116 diff --git a/README.md b/README.md index 7b1787c..c072d55 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,7 @@ 1. Prepares input FASTA files for building 2. Builds databases for: + - [Bracken](https://doi.org/10.7717/peerj-cs.104) - [Centrifuge](https://doi.org/10.1101/gr.210641.116) - [DIAMOND](https://doi.org/10.1038/nmeth.3176) - [Kaiju](https://doi.org/10.1038/ncomms11257) @@ -84,7 +85,7 @@ For more details about the output files and reports, please refer to the ## Credits -nf-core/createtaxdb was originally written by James A. Fellows Yates and the nf-core community. +nf-core/createtaxdb was originally written by James A. Fellows Yates, Joon Klaps, Alexander Ramos Díaz and the nf-core community. 
We thank the following people for their extensive assistance in the development of this pipeline: diff --git a/conf/test.config b/conf/test.config index c1be203..4046af5 100644 --- a/conf/test.config +++ b/conf/test.config @@ -22,13 +22,16 @@ params { // Input data // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = params.pipelines_testdata_base_path + 'createtaxdb/samplesheets/test.csv' + input = params.pipelines_testdata_base_path + 'createtaxdb/samplesheets/test.csv' + + dbname = "database" build_diamond = true build_kaiju = true build_malt = true build_centrifuge = true build_kraken2 = true + build_bracken = true accession2taxid = params.pipelines_testdata_base_path + 'createtaxdb/data/taxonomy/nucl_gb.accession2taxid' nucl2taxid = params.pipelines_testdata_base_path + 'createtaxdb/data/taxonomy/nucl2tax.map' diff --git a/conf/test_full.config b/conf/test_full.config index 591cce3..b43690d 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -18,4 +18,12 @@ params { // TODO nf-core: Specify the paths to your full test data ( on nf-core/test-datasets or directly in repositories, e.g. 
SRA) // TODO nf-core: Give any required params for the test so that command line flags are not needed input = params.pipelines_testdata_base_path + 'viralrecon/samplesheet/samplesheet_full_illumina_amplicon.csv' + + build_diamond = true + build_kaiju = true + build_malt = true + build_centrifuge = true + build_kraken2 = true + build_bracken = true + } diff --git a/conf/test_nothing.config b/conf/test_nothing.config index 72c07e3..4fd0e0f 100644 --- a/conf/test_nothing.config +++ b/conf/test_nothing.config @@ -30,5 +30,6 @@ params { build_malt = false build_centrifuge = false build_kraken2 = false + build_bracken = false } diff --git a/docs/output.md b/docs/output.md index 5a54002..1a30100 100644 --- a/docs/output.md +++ b/docs/output.md @@ -14,6 +14,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution +- [Bracken](#bracken) - Database files for Bracken - [Centrifuge](#centrifuge) - Database files for Centrifuge - [DIAMOND](#diamond) - Database files for DIAMOND - [Kaiju](#kaiju) - Database files for Kaiju @@ -51,6 +52,31 @@ Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQ [Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage. +### Bracken + +[Bracken](https://github.com/jenniferlu717/Bracken/) (Bayesian Reestimation of Abundance with KrakEN) is a highly accurate statistical method that computes the abundance of species in DNA sequences from a metagenomics sample. + +
+Output files + +- `bracken/` + - `/` + - `database100mers.kmer_distrib`: Bracken kmer distribution file + - `database100mers.kraken`: Bracken index file + - `database.kraken`: Bracken database file + - `hash.k2d`: Kraken2 hash database file + - `opts.k2d`: Kraken2 opts database file + - `taxo.k2d`: Kraken2 taxo database file + - `library/`: Intermediate Kraken2 directory containing FASTAs and related files of added genomes + - `taxonomy/`: Intermediate Kraken2 directory containing taxonomy files of added genomes + - `seqid2taxid.map`: Intermediate Kraken2 file containing taxonomy files of added genomes + +
+ +Note that all intermediate files are required for Bracken2 database, even if Kraken2 itself only requires the `*.k2d` files. + +The resulting `/` directory can be given to Bracken itself with `bracken -d ` etc. + ### Centrifuge [Centrifuge](https://github.com/bbuchfink/diamond) is a very rapid and memory-efficient system for the classification of DNA sequences from microbial samples. @@ -105,6 +131,9 @@ The `fmi` file can be given to Kaiju itself with `kaiju -f .fmi` - `hash.k2d`: Kraken2 hash database file - `opts.k2d`: Kraken2 opts database file - `taxo.k2d`: Kraken2 taxo database file + - `library/`: Intermediate directory containing FASTAs and related files of added genomes (only present if `--build_bracken` or `--kraken2_keepintermediate` supplied) + - `taxonomy/`: Intermediate directory containing taxonomy files of added genomes (only present if `--build_bracken` or `--kraken2_keepintermediate` supplied) + - `seqid2taxid.map`: Intermediate file containing taxonomy files of added genomes (only present if `--build_bracken` or `--kraken2_keepintermediate` supplied) diff --git a/modules.json b/modules.json index fb3d228..a318afd 100644 --- a/modules.json +++ b/modules.json @@ -5,6 +5,11 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { + "bracken/build": { + "branch": "master", + "git_sha": "dcbe6e77bc6cc0843ce93e6c7bd884d65c215984", + "installed_by": ["fasta_build_add_kraken2_bracken"] + }, "cat/cat": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", @@ -38,12 +43,12 @@ "kraken2/add": { "branch": "master", "git_sha": "ca87ad032a62f025f0c373facacef2df0c5411b2", - "installed_by": ["fasta_build_add_kraken2"] + "installed_by": ["fasta_build_add_kraken2_bracken"] }, "kraken2/build": { "branch": "master", "git_sha": "ca87ad032a62f025f0c373facacef2df0c5411b2", - "installed_by": ["fasta_build_add_kraken2"] + "installed_by": ["fasta_build_add_kraken2_bracken"] }, "malt/build": { "branch": "master", @@ -69,9 +74,9 @@ }, 
"subworkflows": { "nf-core": { - "fasta_build_add_kraken2": { + "fasta_build_add_kraken2_bracken": { "branch": "master", - "git_sha": "a4d1e13a2da05307deb65a87d501aa6520162dcd", + "git_sha": "9758e4dedd5788369e61b57e7d6f4751e682b17a", "installed_by": ["subworkflows"] }, "utils_nextflow_pipeline": { diff --git a/modules/nf-core/bracken/build/environment.yml b/modules/nf-core/bracken/build/environment.yml new file mode 100644 index 0000000..7288a38 --- /dev/null +++ b/modules/nf-core/bracken/build/environment.yml @@ -0,0 +1,8 @@ +--- +name: "bracken_build" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "bioconda::bracken=2.9" diff --git a/modules/nf-core/bracken/build/main.nf b/modules/nf-core/bracken/build/main.nf new file mode 100644 index 0000000..a2ee2c8 --- /dev/null +++ b/modules/nf-core/bracken/build/main.nf @@ -0,0 +1,48 @@ +process BRACKEN_BUILD { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/bracken:2.9--py38h2494328_0': + 'biocontainers/bracken:2.9--py38h2494328_0' }" + + input: + tuple val(meta), path(kraken2db) + + output: + tuple val(meta), path(kraken2db , includeInputs: true), emit: db + tuple val(meta), path("${kraken2db}/database*", includeInputs: true), emit: bracken_files + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + bracken-build \\ + $args \\ + -t $task.cpus \\ + -d $kraken2db + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bracken: \$(echo \$(bracken -v) | cut -f2 -d'v') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${kraken2db}/database100mers.kmer_distrib + touch ${kraken2db}/database100mers.kraken + touch ${kraken2db}/database.kraken + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bracken: \$(echo \$(bracken -v) | cut -f2 -d'v') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bracken/build/meta.yml b/modules/nf-core/bracken/build/meta.yml new file mode 100644 index 0000000..2bce245 --- /dev/null +++ b/modules/nf-core/bracken/build/meta.yml @@ -0,0 +1,47 @@ +--- +name: "bracken_build" +description: Extends a Kraken2 database to be compatible with Bracken +keywords: + - kraken2 + - bracken + - database + - build +tools: + - "bracken": + description: "Bracken (Bayesian Reestimation of Abundance with KrakEN) is a highly accurate statistical method that computes the abundance of species in DNA sequences from a metagenomics sample." + homepage: "https://ccb.jhu.edu/software/bracken/" + documentation: "https://ccb.jhu.edu/software/bracken/" + tool_dev_url: "https://github.com/jenniferlu717/Bracken/" + doi: "10.7717/peerj-cs.104 " + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1', single_end:false ]` + - kraken2db: + type: directory + description: A Kraken2 database directory + pattern: "*/" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - db: + type: directory + description: A Kraken2 database directory with required bracken files in side + pattern: "*/" + +authors: + - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/bracken/build/tests/main.nf.test b/modules/nf-core/bracken/build/tests/main.nf.test new file mode 100644 index 0000000..f4168a7 --- /dev/null +++ b/modules/nf-core/bracken/build/tests/main.nf.test @@ -0,0 +1,72 @@ +nextflow_process { + + name "Test Process BRACKEN_BUILD" + script "../main.nf" + process "BRACKEN_BUILD" + + tag "modules" + tag "modules_nfcore" + tag "bracken" + tag "bracken/build" + tag "untar" + + setup { + run ("UNTAR") { + script "../../../untar/main.nf" + process { + """ + input[0] = [[id: 'db'],file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/db/kraken2_intermediate.tar.gz', checkIfExists: true)] + """ + } + } + } + + test("kraken2 - db") { + + when { + process { + """ + input[0] = UNTAR.out.untar + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file("${process.out.db[0][1]}/database100mers.kmer_distrib").name, + file("${process.out.db[0][1]}/database100mers.kraken").name, + file("${process.out.db[0][1]}/database.kraken").name, + file("${process.out.bracken_files[0][1]}/database100mers.kmer_distrib").name, + file("${process.out.bracken_files[0][1]}/database100mers.kraken").name, + file("${process.out.bracken_files[0][1]}/database.kraken").name, + ).match() + } + ) + } + + } + + test("kraken2 - db - stub") { + + options "-stub" + + when { + process { + """ + input[0] = UNTAR.out.untar + """ + } + } + + then { + assertAll( + 
{ assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/bracken/build/tests/main.nf.test.snap b/modules/nf-core/bracken/build/tests/main.nf.test.snap new file mode 100644 index 0000000..49f4240 --- /dev/null +++ b/modules/nf-core/bracken/build/tests/main.nf.test.snap @@ -0,0 +1,84 @@ +{ + "kraken2 - db": { + "content": [ + "database100mers.kmer_distrib", + "database100mers.kraken", + "database.kraken", + "database100mers.kmer_distrib", + "database100mers.kraken", + "database.kraken" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-17T18:41:03.693430543" + }, + "kraken2 - db - stub": { + "content": [ + { + "0": [ + [ + { + "id": "db" + }, + [ + "database.kraken:md5,d41d8cd98f00b204e9800998ecf8427e", + "database100mers.kmer_distrib:md5,d41d8cd98f00b204e9800998ecf8427e", + "database100mers.kraken:md5,d41d8cd98f00b204e9800998ecf8427e", + "file.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "1": [ + [ + { + "id": "db" + }, + [ + "database.kraken:md5,d41d8cd98f00b204e9800998ecf8427e", + "database100mers.kmer_distrib:md5,d41d8cd98f00b204e9800998ecf8427e", + "database100mers.kraken:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "2": [ + "versions.yml:md5,925c6ae1387eaf6dbd14656125bc6d9b" + ], + "bracken_files": [ + [ + { + "id": "db" + }, + [ + "database.kraken:md5,d41d8cd98f00b204e9800998ecf8427e", + "database100mers.kmer_distrib:md5,d41d8cd98f00b204e9800998ecf8427e", + "database100mers.kraken:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "db": [ + [ + { + "id": "db" + }, + [ + "database.kraken:md5,d41d8cd98f00b204e9800998ecf8427e", + "database100mers.kmer_distrib:md5,d41d8cd98f00b204e9800998ecf8427e", + "database100mers.kraken:md5,d41d8cd98f00b204e9800998ecf8427e", + "file.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "versions": [ + "versions.yml:md5,925c6ae1387eaf6dbd14656125bc6d9b" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + 
"nextflow": "23.10.1" + }, + "timestamp": "2024-04-17T18:41:14.406736156" + } +} \ No newline at end of file diff --git a/modules/nf-core/bracken/build/tests/tags.yml b/modules/nf-core/bracken/build/tests/tags.yml new file mode 100644 index 0000000..92d7c26 --- /dev/null +++ b/modules/nf-core/bracken/build/tests/tags.yml @@ -0,0 +1,2 @@ +bracken/build: + - "modules/nf-core/bracken/build/**" diff --git a/nextflow.config b/nextflow.config index 2767aad..15cec1c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -54,7 +54,7 @@ params { validate_params = true // General parameters - dbname = "database" + dbname = null save_concatenated_fastas = false accession2taxid = null @@ -72,7 +72,7 @@ params { build_centrifuge = false build_kraken2 = false kraken2_keepintermediate = false - + build_bracken = false } // Load base.config by default for all pipelines diff --git a/nextflow_schema.json b/nextflow_schema.json index 85dfcae..853449f 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": ["input", "outdir", "dbname"], "properties": { "input": { "type": "string", @@ -43,7 +43,6 @@ }, "dbname": { "type": "string", - "default": "database", "description": "Specify name that resulting databases will be prefixed with.", "fa_icon": "fas fa-id-badge" }, @@ -145,6 +144,12 @@ "type": "boolean", "fa_icon": "fas fa-save", "description": "Retain intermediate Kraken2 build files for inspection." + }, + "build_bracken": { + "type": "boolean", + "fa_icon": "fas fa-save", + "description": "Turn on extending of Kraken2 database to include Bracken files. 
Requires nucleotide FASTA File input.", + "help_text": "Bracken2 databases are simply just a Kraken2 database with two additional files.\n\nNote however this requires a Kraken2 database _with_ intermediate files still in it, thus can result in large database directories." } }, "fa_icon": "fas fa-database" diff --git a/subworkflows/local/utils_nfcore_createtaxdb_pipeline/main.nf b/subworkflows/local/utils_nfcore_createtaxdb_pipeline/main.nf index 1a9db9f..adf9a3d 100644 --- a/subworkflows/local/utils_nfcore_createtaxdb_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_createtaxdb_pipeline/main.nf @@ -194,6 +194,7 @@ def toolCitationText() { // Uncomment function in methodsDescriptionText to render in MultiQC report def citation_text = [ "Tools used in the workflow included:", + params.build_bracken ? "Bracken (Lu et al. 2017)," : "", params.build_centrifuge ? "Centrifuge (Kim et al. 2016)," : "", params.build_diamond ? "DIAMOND (Buchfink et al. 2015)," : "", params.build_kaiju ? "Kaiju (Menzel et al. 2016)," : "", @@ -211,6 +212,7 @@ def toolBibliographyText() { // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "
  • Author (2023) Pub name, Journal, DOI
  • " : "", // Uncomment function in methodsDescriptionText to render in MultiQC report def reference_text = [ + params.build_bracken ? "
  • Lu, J., Breitwieser, F. P., Thielen, P., & Salzberg, S. L. (2017). Bracken: estimating species abundance in metagenomics data. PeerJ. Computer Science, 3(e104), e104. 10.7717/peerj-cs.104
  • " : "", params.build_centrifuge ? "
  • Kim, D., Song, L., Breitwieser, F. P., & Salzberg, S. L. (2016). Centrifuge: rapid and sensitive classification of metagenomic sequences. Genome Research, 26(12), 1721–1729. 10.1101/gr.210641.116
  • " : "", params.build_diamond ? "
  • Buchfink, B., Xie, C., & Huson, D. H. (2015). Fast and sensitive protein alignment using DIAMOND. Nature Methods, 12(1), 59–60. 10.1038/nmeth.3176
  • " : "", params.build_kaiju ? "
  • Menzel, P., Ng, K. L., & Krogh, A. (2016). Fast and sensitive taxonomic classification for metagenomics with Kaiju. Nature Communications, 7, 11257. 10.1038/ncomms11257
  • " : "", diff --git a/subworkflows/nf-core/fasta_build_add_kraken2/main.nf b/subworkflows/nf-core/fasta_build_add_kraken2/main.nf deleted file mode 100644 index 306896b..0000000 --- a/subworkflows/nf-core/fasta_build_add_kraken2/main.nf +++ /dev/null @@ -1,35 +0,0 @@ -include { KRAKEN2_ADD } from '../../../modules/nf-core/kraken2/add/main' -include { KRAKEN2_BUILD } from '../../../modules/nf-core/kraken2/build/main' - -workflow FASTA_BUILD_ADD_KRAKEN2 { - - take: - ch_fasta // channel: [ val(meta), fasta ] - ch_taxonomy_names // channel: [ names.dmp ] - ch_taxonomy_nodes // channel: [ nodes.dmp ] - ch_accession2taxid // channel: [ acc2taxidfile ] - val_cleanintermediate // value: [ true | false ] - - main: - - ch_versions = Channel.empty() - - ch_fastas_for_kraken2add = ch_fasta - .map { - meta, fasta -> - - [[id: 'db'], fasta] - } - .groupTuple() - - KRAKEN2_ADD ( ch_fastas_for_kraken2add, ch_taxonomy_names, ch_taxonomy_nodes, ch_accession2taxid ) - ch_versions = ch_versions.mix(KRAKEN2_ADD.out.versions.first()) - - KRAKEN2_BUILD ( KRAKEN2_ADD.out.db, val_cleanintermediate ) - ch_versions = ch_versions.mix(KRAKEN2_BUILD.out.versions.first()) - - emit: - db = KRAKEN2_BUILD.out.db // channel: [ val(meta), [ db ] ] - versions = ch_versions // channel: [ versions.yml ] -} - diff --git a/subworkflows/nf-core/fasta_build_add_kraken2/tests/main.nf.test b/subworkflows/nf-core/fasta_build_add_kraken2/tests/main.nf.test deleted file mode 100644 index a7baca6..0000000 --- a/subworkflows/nf-core/fasta_build_add_kraken2/tests/main.nf.test +++ /dev/null @@ -1,103 +0,0 @@ -nextflow_workflow { - - name "Test Subworkflow FASTA_BUILD_ADD_KRAKEN2" - script "../main.nf" - workflow "FASTA_BUILD_ADD_KRAKEN2" - - tag "subworkflows" - tag "subworkflows_nfcore" - tag "subworkflows/fasta_build_add_kraken2" - tag "gunzip" - tag "kraken2" - tag "kraken2/add" - tag "kraken2/build" - - test("metagenome - fasta") { - - setup { - run("GUNZIP") { - script "modules/nf-core/gunzip/main.nf" - 
process { - """ - input[0] = Channel.of([\ - [id:'haemophilus_influenzae'], - file(params.modules_testdata_base_path + '/genomics/prokaryotes/metagenome/fasta/haemophilus_influenzae.fna.gz', checkIfExists: true) - ] - ) - """ - } - } - } - - when { - workflow { - """ - input[0] = Channel.of([[id:'sarscov2'], file(params.modules_testdata_base_path + '/genomics/prokaryotes/metagenome/fasta/sarscov2.fasta', checkIfExists: true)]).mix(GUNZIP.out.gunzip) - input[1] = file(params.modules_testdata_base_path + 'genomics/prokaryotes/metagenome/taxonomy/taxdmp/names.dmp', checkIfExists: true) - input[2] = file(params.modules_testdata_base_path + 'genomics/prokaryotes/metagenome/taxonomy/taxdmp/nodes.dmp', checkIfExists: true) - input[3] = file(params.modules_testdata_base_path + 'genomics/prokaryotes/metagenome/taxonomy/accession2taxid/nucl_gb.accession2taxid', checkIfExists: true) - input[4] = true - """ - } - } - - then { - assertAll( - { assert workflow.success}, - { assert workflow.out.db.get(0).get(1) ==~ ".*/db" }, - { assert snapshot ( - workflow.out.versions, - path("${workflow.out.db[0][1]}/hash.k2d"), - path("${workflow.out.db[0][1]}/taxo.k2d"), - file("${workflow.out.db[0][1]}/opts.k2d").name, - ).match() - } - ) - } - } - - test("metagenome - fasta - nocleanup") { - - setup { - run("GUNZIP") { - script "modules/nf-core/gunzip/main.nf" - process { - """ - input[0] = Channel.of([\ - [id:'haemophilus_influenzae'], - file(params.modules_testdata_base_path + '/genomics/prokaryotes/metagenome/fasta/haemophilus_influenzae.fna.gz', checkIfExists: true) - ] - ) - """ - } - } - } - - when { - workflow { - """ - input[0] = Channel.of([[id:'sarscov2'], file(params.modules_testdata_base_path + '/genomics/prokaryotes/metagenome/fasta/sarscov2.fasta', checkIfExists: true)]).mix(GUNZIP.out.gunzip) - input[1] = file(params.modules_testdata_base_path + 'genomics/prokaryotes/metagenome/taxonomy/taxdmp/names.dmp', checkIfExists: true) - input[2] = 
file(params.modules_testdata_base_path + 'genomics/prokaryotes/metagenome/taxonomy/taxdmp/nodes.dmp', checkIfExists: true) - input[3] = file(params.modules_testdata_base_path + 'genomics/prokaryotes/metagenome/taxonomy/accession2taxid/nucl_gb.accession2taxid', checkIfExists: true) - input[4] = false - """ - } - } - - then { - assertAll( - { assert workflow.success}, - { assert workflow.out.db.get(0).get(1) ==~ ".*/db" }, - { assert snapshot ( - workflow.out.versions, - path("${workflow.out.db[0][1]}/hash.k2d"), - path("${workflow.out.db[0][1]}/taxo.k2d"), - file("${workflow.out.db[0][1]}/opts.k2d").name, - file("${workflow.out.db[0][1]}/unmapped.txt").name - ).match() - } - ) - } - } -} diff --git a/subworkflows/nf-core/fasta_build_add_kraken2/tests/main.nf.test.snap b/subworkflows/nf-core/fasta_build_add_kraken2/tests/main.nf.test.snap deleted file mode 100644 index 57b4a48..0000000 --- a/subworkflows/nf-core/fasta_build_add_kraken2/tests/main.nf.test.snap +++ /dev/null @@ -1,35 +0,0 @@ -{ - "metagenome - fasta - nocleanup": { - "content": [ - [ - "versions.yml:md5,62fb719633dd8f110bbc2c1bec53d0a9", - "versions.yml:md5,82f39c3ef1ba0742da3105cbe5ed3cf7" - ], - "hash.k2d:md5,4717689f8ba88d4cae51ecc7c9d9b372", - "taxo.k2d:md5,24338e2d78f803f48bcc5653c6e51816", - "opts.k2d", - "unmapped.txt" - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-04-05T08:17:49.670974771" - }, - "metagenome - fasta": { - "content": [ - [ - "versions.yml:md5,62fb719633dd8f110bbc2c1bec53d0a9", - "versions.yml:md5,82f39c3ef1ba0742da3105cbe5ed3cf7" - ], - "hash.k2d:md5,4717689f8ba88d4cae51ecc7c9d9b372", - "taxo.k2d:md5,24338e2d78f803f48bcc5653c6e51816", - "opts.k2d" - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-04-05T08:17:31.501399396" - } -} \ No newline at end of file diff --git a/subworkflows/nf-core/fasta_build_add_kraken2/tests/tags.yml b/subworkflows/nf-core/fasta_build_add_kraken2/tests/tags.yml deleted 
file mode 100644 index af5f2a0..0000000 --- a/subworkflows/nf-core/fasta_build_add_kraken2/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -subworkflows/fasta_build_add_kraken2: - - subworkflows/nf-core/fasta_build_add_kraken2/** diff --git a/subworkflows/nf-core/fasta_build_add_kraken2_bracken/main.nf b/subworkflows/nf-core/fasta_build_add_kraken2_bracken/main.nf new file mode 100644 index 0000000..cb9e370 --- /dev/null +++ b/subworkflows/nf-core/fasta_build_add_kraken2_bracken/main.nf @@ -0,0 +1,42 @@ +include { KRAKEN2_ADD } from '../../../modules/nf-core/kraken2/add/main' +include { KRAKEN2_BUILD } from '../../../modules/nf-core/kraken2/build/main' +include { BRACKEN_BUILD } from '../../../modules/nf-core/bracken/build/main' + +workflow FASTA_BUILD_ADD_KRAKEN2_BRACKEN { + + take: + ch_fasta // channel: [ val(meta), [ fasta1, fasta2, fasta3] ] + ch_taxonomy_names // channel: [ names.dmp ] + ch_taxonomy_nodes // channel: [ nodes.dmp ] + ch_accession2taxid // channel: [ acc2taxidfile ] + val_cleanintermediates // value: [ true | false ] + val_runbrackenbuild // value: [ true | false ] + + main: + + if ( val_cleanintermediates && val_runbrackenbuild ) { error("Cannot perform Kraken2 cleanup and build Bracken database. Bracken requires intermediate files") } + val_cleanup = [ val_cleanintermediates && !val_runbrackenbuild ].any() ? 
true : false + + ch_versions = Channel.empty() + + KRAKEN2_ADD ( ch_fasta, ch_taxonomy_names, ch_taxonomy_nodes, ch_accession2taxid ) + ch_versions = ch_versions.mix( KRAKEN2_ADD.out.versions.first() ) + + KRAKEN2_BUILD ( KRAKEN2_ADD.out.db, val_cleanup ) + ch_versions = ch_versions.mix( KRAKEN2_BUILD.out.versions.first() ) + + if ( val_runbrackenbuild ) { + BRACKEN_BUILD ( KRAKEN2_BUILD.out.db ) + ch_final_db = BRACKEN_BUILD.out.db + ch_versions = ch_versions.mix( BRACKEN_BUILD.out.versions.first() ) + } + else { + ch_final_db = KRAKEN2_BUILD.out.db + ch_versions = ch_versions.mix( KRAKEN2_BUILD.out.versions.first() ) + } + + emit: + db = ch_final_db // channel: [ val(meta), [ db ] ] + versions = ch_versions // channel: [ versions.yml ] +} + diff --git a/subworkflows/nf-core/fasta_build_add_kraken2/meta.yml b/subworkflows/nf-core/fasta_build_add_kraken2_bracken/meta.yml similarity index 61% rename from subworkflows/nf-core/fasta_build_add_kraken2/meta.yml rename to subworkflows/nf-core/fasta_build_add_kraken2_bracken/meta.yml index 1506709..8125c60 100644 --- a/subworkflows/nf-core/fasta_build_add_kraken2/meta.yml +++ b/subworkflows/nf-core/fasta_build_add_kraken2_bracken/meta.yml @@ -1,21 +1,23 @@ # yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json -name: "fasta_build_add_kraken2" -description: KRAKEN2 build custom database subworkflow +name: "fasta_build_add_kraken2_bracken" +description: KRAKEN2 and BRACKEN build custom database subworkflow keywords: - metagenomics - kraken2 - database - build - custom + - bracken components: - kraken2/add - kraken2/build + - bracken/build input: - ch_fasta: type: file description: | - Channel containing each fasta as a distinct element with meta - Structure: [ val(meta), path(fasta) ] + Channel containing a meta with a list of FASTAs to be built + Structure: [ val(meta), [ fasta1, fasta2, fasta3 ] ] pattern: "*.{fasta,fa,fna}" - ch_taxonomy_names: type: file @@ 
-35,18 +37,26 @@ input: Channel containing a NCBI-style taxdump accession2taxid (acc2tax) file Structure: [ accession2taxid_file ] pattern: "*.accession2taxid" - - val_cleanintermediate: + - val_cleanintermediates: type: boolean description: | - Boolean flag whether to clean up intermediate files after build or not + Boolean flag whether to clean up intermediate files after build or not. + If val_runbrackenbuild set, will be ignored as BRACKEN requires intermediate files. Structure: [ val_cleanintermediate ] pattern: "true|false" + - val_runbrackenbuild: + type: boolean + description: | + Boolean flag whether to additionally insert required BRACKEN database files into KRAKEN2 directory. + Note any changes for k-mer or read lengths must come via Nextflow config `ext.args`. + Structure: [ val_runbrackenbuild ] + pattern: "true|false" output: - db: type: directory description: | - Channel containing KRAKEN2 database directory. - Use `$ext.prefix` in a modules.conf file to change default name + Channel containing KRAKEN2 (and BRACKEN) database directory files. 
+ Use `ext.prefix` in a modules.conf file to change default name Structure: [ val(meta), path(db) ] pattern: "*/" - versions: diff --git a/subworkflows/nf-core/fasta_build_add_kraken2_bracken/tests/main.nf.test b/subworkflows/nf-core/fasta_build_add_kraken2_bracken/tests/main.nf.test new file mode 100644 index 0000000..94bc184 --- /dev/null +++ b/subworkflows/nf-core/fasta_build_add_kraken2_bracken/tests/main.nf.test @@ -0,0 +1,160 @@ +nextflow_workflow { + + name "Test Subworkflow FASTA_BUILD_ADD_KRAKEN2_BRACKEN" + script "../main.nf" + workflow "FASTA_BUILD_ADD_KRAKEN2_BRACKEN" + + tag "subworkflows" + tag "subworkflows_nfcore" + tag "subworkflows/fasta_build_add_kraken2_bracken" + tag "gunzip" + tag "kraken2" + tag "kraken2/add" + tag "kraken2/build" + tag "bracken/build" + + test("metagenome - nocleanup - nobracken - fasta") { + + setup { + run("GUNZIP") { + script "modules/nf-core/gunzip/main.nf" + process { + """ + input[0] = Channel.of([ + [id:'db'], + file(params.modules_testdata_base_path + '/genomics/prokaryotes/metagenome/fasta/haemophilus_influenzae.fna.gz', checkIfExists: true) + ] + ) + """ + } + } + } + + when { + workflow { + """ + input[0] = Channel.of([[id:'db'], file(params.modules_testdata_base_path + '/genomics/prokaryotes/metagenome/fasta/sarscov2.fasta', checkIfExists: true)]).mix(GUNZIP.out.gunzip).groupTuple() + input[1] = file(params.modules_testdata_base_path + 'genomics/prokaryotes/metagenome/taxonomy/taxdmp/names.dmp', checkIfExists: true) + input[2] = file(params.modules_testdata_base_path + 'genomics/prokaryotes/metagenome/taxonomy/taxdmp/nodes.dmp', checkIfExists: true) + input[3] = file(params.modules_testdata_base_path + 'genomics/prokaryotes/metagenome/taxonomy/accession2taxid/nucl_gb.accession2taxid', checkIfExists: true) + input[4] = false + input[5] = false + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert snapshot ( + workflow.out.versions, + path("${workflow.out.db[0][1]}/hash.k2d"), + 
path("${workflow.out.db[0][1]}/taxo.k2d"), + file("${workflow.out.db[0][1]}/opts.k2d").name, + ).match() + }, + { assert path("${workflow.out.db[0][1]}/library/").exists() }, + { assert path("${workflow.out.db[0][1]}/taxonomy/").exists() } + ) + } + } + + test("metagenome - withcleanup - nobracken - fasta") { + + setup { + run("GUNZIP") { + script "modules/nf-core/gunzip/main.nf" + process { + """ + input[0] = Channel.of([\ + [id:'db'], + file(params.modules_testdata_base_path + '/genomics/prokaryotes/metagenome/fasta/haemophilus_influenzae.fna.gz', checkIfExists: true) + ] + ) + """ + } + } + } + + when { + workflow { + """ + input[0] = Channel.of([[id:'db'], file(params.modules_testdata_base_path + '/genomics/prokaryotes/metagenome/fasta/sarscov2.fasta', checkIfExists: true)]).mix(GUNZIP.out.gunzip).groupTuple() + input[1] = file(params.modules_testdata_base_path + 'genomics/prokaryotes/metagenome/taxonomy/taxdmp/names.dmp', checkIfExists: true) + input[2] = file(params.modules_testdata_base_path + 'genomics/prokaryotes/metagenome/taxonomy/taxdmp/nodes.dmp', checkIfExists: true) + input[3] = file(params.modules_testdata_base_path + 'genomics/prokaryotes/metagenome/taxonomy/accession2taxid/nucl_gb.accession2taxid', checkIfExists: true) + input[4] = true + input[5] = false + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert workflow.out.db.get(0).get(1) ==~ ".*/db" }, + { assert snapshot ( + workflow.out.versions, + path("${workflow.out.db[0][1]}/hash.k2d"), + path("${workflow.out.db[0][1]}/taxo.k2d"), + file("${workflow.out.db[0][1]}/opts.k2d").name, + file("${workflow.out.db[0][1]}/unmapped.txt").name + ).match() + }, + { assert !path("${workflow.out.db[0][1]}/library/").exists() }, + { assert !path("${workflow.out.db[0][1]}/taxonomy/").exists() } + ) + } + } + +test("metagenome - nocleanup - withbracken - fasta") { + + setup { + run("GUNZIP") { + script "modules/nf-core/gunzip/main.nf" + process { + """ + input[0] = Channel.of([\ + 
[id:'db'], + file(params.modules_testdata_base_path + '/genomics/prokaryotes/metagenome/fasta/haemophilus_influenzae.fna.gz', checkIfExists: true) + ] + ) + """ + } + } + } + + when { + workflow { + """ + input[0] = Channel.of([[id:'db'], file(params.modules_testdata_base_path + '/genomics/prokaryotes/metagenome/fasta/sarscov2.fasta', checkIfExists: true)]).mix(GUNZIP.out.gunzip).groupTuple() + input[1] = file(params.modules_testdata_base_path + 'genomics/prokaryotes/metagenome/taxonomy/taxdmp/names.dmp', checkIfExists: true) + input[2] = file(params.modules_testdata_base_path + 'genomics/prokaryotes/metagenome/taxonomy/taxdmp/nodes.dmp', checkIfExists: true) + input[3] = file(params.modules_testdata_base_path + 'genomics/prokaryotes/metagenome/taxonomy/accession2taxid/nucl_gb.accession2taxid', checkIfExists: true) + input[4] = false + input[5] = true + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert workflow.out.db.get(0).get(1) ==~ ".*/db" }, + { assert path("${workflow.out.db[0][1]}/library/").exists() }, + { assert path("${workflow.out.db[0][1]}/taxonomy/").exists() }, + { assert snapshot ( + workflow.out.versions, + path("${workflow.out.db[0][1]}/hash.k2d"), + path("${workflow.out.db[0][1]}/taxo.k2d"), + file("${workflow.out.db[0][1]}/opts.k2d").name, + file("${workflow.out.db[0][1]}/unmapped.txt").name, + file("${workflow.out.db[0][1]}/database100mers.kmer_distrib").name, + file("${workflow.out.db[0][1]}/database100mers.kraken").name, + file("${workflow.out.db[0][1]}/database.kraken").name + ).match() + } + ) + } + } +} diff --git a/subworkflows/nf-core/fasta_build_add_kraken2_bracken/tests/main.nf.test.snap b/subworkflows/nf-core/fasta_build_add_kraken2_bracken/tests/main.nf.test.snap new file mode 100644 index 0000000..9ad0c78 --- /dev/null +++ b/subworkflows/nf-core/fasta_build_add_kraken2_bracken/tests/main.nf.test.snap @@ -0,0 +1,58 @@ +{ + "metagenome - nocleanup - nobracken - fasta": { + "content": [ + [ + 
"versions.yml:md5,b5f92f68a6af1f422ccc1a5c75178793", + "versions.yml:md5,f815f0afa0f648fb6532bf6d780ce0ae", + "versions.yml:md5,f815f0afa0f648fb6532bf6d780ce0ae" + ], + "hash.k2d:md5,4717689f8ba88d4cae51ecc7c9d9b372", + "taxo.k2d:md5,24338e2d78f803f48bcc5653c6e51816", + "opts.k2d" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-18T09:04:48.196774778" + }, + "metagenome - withcleanup - nobracken - fasta": { + "content": [ + [ + "versions.yml:md5,b5f92f68a6af1f422ccc1a5c75178793", + "versions.yml:md5,f815f0afa0f648fb6532bf6d780ce0ae", + "versions.yml:md5,f815f0afa0f648fb6532bf6d780ce0ae" + ], + "hash.k2d:md5,4717689f8ba88d4cae51ecc7c9d9b372", + "taxo.k2d:md5,24338e2d78f803f48bcc5653c6e51816", + "opts.k2d", + "unmapped.txt" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-18T08:42:24.87325051" + }, + "metagenome - nocleanup - withbracken - fasta": { + "content": [ + [ + "versions.yml:md5,130d220d293e4f75863b6c0756bb8324", + "versions.yml:md5,b5f92f68a6af1f422ccc1a5c75178793", + "versions.yml:md5,f815f0afa0f648fb6532bf6d780ce0ae" + ], + "hash.k2d:md5,4717689f8ba88d4cae51ecc7c9d9b372", + "taxo.k2d:md5,24338e2d78f803f48bcc5653c6e51816", + "opts.k2d", + "unmapped.txt", + "database100mers.kmer_distrib", + "database100mers.kraken", + "database.kraken" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-18T10:58:01.065026262" + } +} \ No newline at end of file diff --git a/subworkflows/nf-core/fasta_build_add_kraken2_bracken/tests/tags.yml b/subworkflows/nf-core/fasta_build_add_kraken2_bracken/tests/tags.yml new file mode 100644 index 0000000..40273bf --- /dev/null +++ b/subworkflows/nf-core/fasta_build_add_kraken2_bracken/tests/tags.yml @@ -0,0 +1,2 @@ +subworkflows/fasta_build_add_kraken2_bracken: + - subworkflows/nf-core/fasta_build_add_kraken2_bracken/** diff --git a/tests/test.nf.test b/tests/test.nf.test index 46dd23f..54bea3e 100644 --- 
a/tests/test.nf.test +++ b/tests/test.nf.test @@ -21,9 +21,12 @@ nextflow_pipeline { path("$outputDir/centrifuge/"), path("$outputDir/diamond/database.dmnd"), path("$outputDir/kaiju/database.fmi"), - path("$outputDir/kraken2/db/hash.k2d"), - file("$outputDir/kraken2/db/opts.k2d").name, - path("$outputDir/kraken2/db/taxo.k2d"), + path("$outputDir/kraken2/database/hash.k2d"), + file("$outputDir/kraken2/database/opts.k2d").name, + path("$outputDir/kraken2/database/taxo.k2d"), + file("$outputDir/bracken/database/database100mers.kmer_distrib").name, + file("$outputDir/bracken/database/database100mers.kraken").name, + file("$outputDir/bracken/database/database.kraken").name, path("$outputDir/malt/malt-build.log").readLines().last().contains('Peak memory'), path("$outputDir/malt/malt_index/index0.idx"), path("$outputDir/malt/malt_index/ref.db"), diff --git a/tests/test.nf.test.snap b/tests/test.nf.test.snap index 73e0d6d..bf9427f 100644 --- a/tests/test.nf.test.snap +++ b/tests/test.nf.test.snap @@ -12,6 +12,9 @@ "hash.k2d:md5,01122a04dcef29ceb3baa68a9f6e6ef5", "opts.k2d", "taxo.k2d:md5,cd8170a8c5a1b763a9ac1ffa2107cc88", + "database100mers.kmer_distrib", + "database100mers.kraken", + "database.kraken", true, "index0.idx:md5,876139dc930e68992cd2625e08bba48a", "ref.db:md5,377073f58a9f9b85acca59fcf21744a9", @@ -23,8 +26,8 @@ ], "meta": { "nf-test": "0.8.4", - "nextflow": "24.02.0" + "nextflow": "24.04.1" }, - "timestamp": "2024-04-11T10:59:28.687364796" + "timestamp": "2024-05-23T08:15:27.641419595" } } \ No newline at end of file diff --git a/workflows/createtaxdb.nf b/workflows/createtaxdb.nf index fa0fe83..0192d99 100644 --- a/workflows/createtaxdb.nf +++ b/workflows/createtaxdb.nf @@ -21,7 +21,7 @@ include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pi include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_createtaxdb_pipeline' -include { 
FASTA_BUILD_ADD_KRAKEN2 } from '../subworkflows/nf-core/fasta_build_add_kraken2/main' +include { FASTA_BUILD_ADD_KRAKEN2_BRACKEN } from '../subworkflows/nf-core/fasta_build_add_kraken2_bracken/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -53,7 +53,7 @@ workflow CREATETAXDB { // PREPARE: Prepare input for single file inputs modules - if ( [params.build_malt, params.build_centrifuge, params.build_kraken2].any() ) { // Pull just DNA sequences + if ( [params.build_malt, params.build_centrifuge, params.build_kraken2, params.build_bracken].any() ) { // Pull just DNA sequences ch_dna_refs_for_singleref = ch_samplesheet .map{meta, fasta_dna, fasta_aa -> [[id: params.dbname], fasta_dna]} @@ -141,13 +141,16 @@ workflow CREATETAXDB { ch_kaiju_output = Channel.empty() } - // SUBWORKFLOW: Kraken2 - if ( params.build_kraken2 ) { - FASTA_BUILD_ADD_KRAKEN2 ( CAT_CAT_DNA.out.file_out, ch_taxonomy_namesdmp, ch_taxonomy_nodesdmp, ch_accession2taxid, !params.kraken2_keepintermediate ) - ch_versions = ch_versions.mix(FASTA_BUILD_ADD_KRAKEN2.out.versions.first()) - ch_kraken2_output = FASTA_BUILD_ADD_KRAKEN2.out.db + // SUBWORKFLOW: Kraken2 and Bracken + // Bracken requires intermediate files, if build_bracken=true then kraken2_keepintermediate=true, otherwise an error will be raised + // Condition is inverted because subworkflow asks if you want to 'clean' (true) or not, but pipeline says to 'keep' + if ( params.build_kraken2 || params.build_bracken ) { + def k2_keepintermediates = params.kraken2_keepintermediate || params.build_bracken ? 
false : true + FASTA_BUILD_ADD_KRAKEN2_BRACKEN ( CAT_CAT_DNA.out.file_out, ch_taxonomy_namesdmp, ch_taxonomy_nodesdmp, ch_accession2taxid, k2_keepintermediates, params.build_bracken ) + ch_versions = ch_versions.mix(FASTA_BUILD_ADD_KRAKEN2_BRACKEN.out.versions.first()) + ch_kraken2_bracken_output = FASTA_BUILD_ADD_KRAKEN2_BRACKEN.out.db } else { - ch_kraken2_output = Channel.empty() + ch_kraken2_bracken_output = Channel.empty() } // Module: Run MALT/BUILD @@ -226,13 +229,13 @@ workflow CREATETAXDB { multiqc_report = MULTIQC.out.report.toList() emit: - versions = ch_collated_versions - multiqc_report = MULTIQC.out.report.toList() // channel: /path/to/multiqc_report.html - centrifuge_database = ch_centrifuge_output - diamond_database = ch_diamond_output - kaiju_database = ch_kaiju_output - kraken2_database = ch_kraken2_output - malt_database = ch_malt_output + versions = ch_collated_versions + multiqc_report = MULTIQC.out.report.toList() // channel: /path/to/multiqc_report.html + centrifuge_database = ch_centrifuge_output + diamond_database = ch_diamond_output + kaiju_database = ch_kaiju_output + kraken2_bracken_database = ch_kraken2_bracken_output + malt_database = ch_malt_output } /*