From e37c84d69b73dc9dc2075937987bf9a8194aaa6a Mon Sep 17 00:00:00 2001 From: alxndrdiaz Date: Thu, 2 May 2024 18:01:24 -0600 Subject: [PATCH 1/8] include fasta_build_add_kraken2_bracken --- modules.json | 13 +- modules/nf-core/bracken/build/environment.yml | 8 + modules/nf-core/bracken/build/main.nf | 48 ++++++ modules/nf-core/bracken/build/meta.yml | 47 +++++ .../nf-core/bracken/build/tests/main.nf.test | 72 ++++++++ .../bracken/build/tests/main.nf.test.snap | 84 +++++++++ modules/nf-core/bracken/build/tests/tags.yml | 2 + nextflow.config | 1 + .../nf-core/fasta_build_add_kraken2/main.nf | 35 ---- .../tests/main.nf.test | 103 ----------- .../tests/main.nf.test.snap | 35 ---- .../fasta_build_add_kraken2/tests/tags.yml | 2 - .../fasta_build_add_kraken2_bracken/main.nf | 42 +++++ .../meta.yml | 26 ++- .../tests/main.nf.test | 160 ++++++++++++++++++ .../tests/main.nf.test.snap | 58 +++++++ .../tests/tags.yml | 2 + workflows/createtaxdb.nf | 9 +- 18 files changed, 556 insertions(+), 191 deletions(-) create mode 100644 modules/nf-core/bracken/build/environment.yml create mode 100644 modules/nf-core/bracken/build/main.nf create mode 100644 modules/nf-core/bracken/build/meta.yml create mode 100644 modules/nf-core/bracken/build/tests/main.nf.test create mode 100644 modules/nf-core/bracken/build/tests/main.nf.test.snap create mode 100644 modules/nf-core/bracken/build/tests/tags.yml delete mode 100644 subworkflows/nf-core/fasta_build_add_kraken2/main.nf delete mode 100644 subworkflows/nf-core/fasta_build_add_kraken2/tests/main.nf.test delete mode 100644 subworkflows/nf-core/fasta_build_add_kraken2/tests/main.nf.test.snap delete mode 100644 subworkflows/nf-core/fasta_build_add_kraken2/tests/tags.yml create mode 100644 subworkflows/nf-core/fasta_build_add_kraken2_bracken/main.nf rename subworkflows/nf-core/{fasta_build_add_kraken2 => fasta_build_add_kraken2_bracken}/meta.yml (61%) create mode 100644 subworkflows/nf-core/fasta_build_add_kraken2_bracken/tests/main.nf.test create 
mode 100644 subworkflows/nf-core/fasta_build_add_kraken2_bracken/tests/main.nf.test.snap create mode 100644 subworkflows/nf-core/fasta_build_add_kraken2_bracken/tests/tags.yml diff --git a/modules.json b/modules.json index cb08493..0b6df73 100644 --- a/modules.json +++ b/modules.json @@ -5,6 +5,11 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { + "bracken/build": { + "branch": "master", + "git_sha": "dcbe6e77bc6cc0843ce93e6c7bd884d65c215984", + "installed_by": ["fasta_build_add_kraken2_bracken"] + }, "cat/cat": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", @@ -38,12 +43,12 @@ "kraken2/add": { "branch": "master", "git_sha": "ca87ad032a62f025f0c373facacef2df0c5411b2", - "installed_by": ["fasta_build_add_kraken2"] + "installed_by": ["fasta_build_add_kraken2_bracken"] }, "kraken2/build": { "branch": "master", "git_sha": "ca87ad032a62f025f0c373facacef2df0c5411b2", - "installed_by": ["fasta_build_add_kraken2"] + "installed_by": ["fasta_build_add_kraken2_bracken"] }, "malt/build": { "branch": "master", @@ -69,9 +74,9 @@ }, "subworkflows": { "nf-core": { - "fasta_build_add_kraken2": { + "fasta_build_add_kraken2_bracken": { "branch": "master", - "git_sha": "a4d1e13a2da05307deb65a87d501aa6520162dcd", + "git_sha": "9758e4dedd5788369e61b57e7d6f4751e682b17a", "installed_by": ["subworkflows"] }, "utils_nextflow_pipeline": { diff --git a/modules/nf-core/bracken/build/environment.yml b/modules/nf-core/bracken/build/environment.yml new file mode 100644 index 0000000..7288a38 --- /dev/null +++ b/modules/nf-core/bracken/build/environment.yml @@ -0,0 +1,8 @@ +--- +name: "bracken_build" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "bioconda::bracken=2.9" diff --git a/modules/nf-core/bracken/build/main.nf b/modules/nf-core/bracken/build/main.nf new file mode 100644 index 0000000..a2ee2c8 --- /dev/null +++ b/modules/nf-core/bracken/build/main.nf @@ -0,0 +1,48 @@ +process BRACKEN_BUILD { + tag "$meta.id" + 
label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bracken:2.9--py38h2494328_0': + 'biocontainers/bracken:2.9--py38h2494328_0' }" + + input: + tuple val(meta), path(kraken2db) + + output: + tuple val(meta), path(kraken2db , includeInputs: true), emit: db + tuple val(meta), path("${kraken2db}/database*", includeInputs: true), emit: bracken_files + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + bracken-build \\ + $args \\ + -t $task.cpus \\ + -d $kraken2db + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bracken: \$(echo \$(bracken -v) | cut -f2 -d'v') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${kraken2db}/database100mers.kmer_distrib + touch ${kraken2db}/database100mers.kraken + touch ${kraken2db}/database.kraken + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bracken: \$(echo \$(bracken -v) | cut -f2 -d'v') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bracken/build/meta.yml b/modules/nf-core/bracken/build/meta.yml new file mode 100644 index 0000000..2bce245 --- /dev/null +++ b/modules/nf-core/bracken/build/meta.yml @@ -0,0 +1,47 @@ +--- +name: "bracken_build" +description: Extends a Kraken2 database to be compatible with Bracken +keywords: + - kraken2 + - bracken + - database + - build +tools: + - "bracken": + description: "Bracken (Bayesian Reestimation of Abundance with KrakEN) is a highly accurate statistical method that computes the abundance of species in DNA sequences from a metagenomics sample." 
+ homepage: "https://ccb.jhu.edu/software/bracken/" + documentation: "https://ccb.jhu.edu/software/bracken/" + tool_dev_url: "https://github.com/jenniferlu717/Bracken/" + doi: "10.7717/peerj-cs.104 " + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - kraken2db: + type: directory + description: A Kraken2 database directory + pattern: "*/" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - db: + type: directory + description: A Kraken2 database directory with required bracken files in side + pattern: "*/" + +authors: + - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/bracken/build/tests/main.nf.test b/modules/nf-core/bracken/build/tests/main.nf.test new file mode 100644 index 0000000..f4168a7 --- /dev/null +++ b/modules/nf-core/bracken/build/tests/main.nf.test @@ -0,0 +1,72 @@ +nextflow_process { + + name "Test Process BRACKEN_BUILD" + script "../main.nf" + process "BRACKEN_BUILD" + + tag "modules" + tag "modules_nfcore" + tag "bracken" + tag "bracken/build" + tag "untar" + + setup { + run ("UNTAR") { + script "../../../untar/main.nf" + process { + """ + input[0] = [[id: 'db'],file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/db/kraken2_intermediate.tar.gz', checkIfExists: true)] + """ + } + } + } + + test("kraken2 - db") { + + when { + process { + """ + input[0] = UNTAR.out.untar + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file("${process.out.db[0][1]}/database100mers.kmer_distrib").name, + file("${process.out.db[0][1]}/database100mers.kraken").name, + file("${process.out.db[0][1]}/database.kraken").name, + file("${process.out.bracken_files[0][1]}/database100mers.kmer_distrib").name, + 
file("${process.out.bracken_files[0][1]}/database100mers.kraken").name, + file("${process.out.bracken_files[0][1]}/database.kraken").name, + ).match() + } + ) + } + + } + + test("kraken2 - db - stub") { + + options "-stub" + + when { + process { + """ + input[0] = UNTAR.out.untar + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/bracken/build/tests/main.nf.test.snap b/modules/nf-core/bracken/build/tests/main.nf.test.snap new file mode 100644 index 0000000..49f4240 --- /dev/null +++ b/modules/nf-core/bracken/build/tests/main.nf.test.snap @@ -0,0 +1,84 @@ +{ + "kraken2 - db": { + "content": [ + "database100mers.kmer_distrib", + "database100mers.kraken", + "database.kraken", + "database100mers.kmer_distrib", + "database100mers.kraken", + "database.kraken" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-17T18:41:03.693430543" + }, + "kraken2 - db - stub": { + "content": [ + { + "0": [ + [ + { + "id": "db" + }, + [ + "database.kraken:md5,d41d8cd98f00b204e9800998ecf8427e", + "database100mers.kmer_distrib:md5,d41d8cd98f00b204e9800998ecf8427e", + "database100mers.kraken:md5,d41d8cd98f00b204e9800998ecf8427e", + "file.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "1": [ + [ + { + "id": "db" + }, + [ + "database.kraken:md5,d41d8cd98f00b204e9800998ecf8427e", + "database100mers.kmer_distrib:md5,d41d8cd98f00b204e9800998ecf8427e", + "database100mers.kraken:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "2": [ + "versions.yml:md5,925c6ae1387eaf6dbd14656125bc6d9b" + ], + "bracken_files": [ + [ + { + "id": "db" + }, + [ + "database.kraken:md5,d41d8cd98f00b204e9800998ecf8427e", + "database100mers.kmer_distrib:md5,d41d8cd98f00b204e9800998ecf8427e", + "database100mers.kraken:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "db": [ + [ + { + "id": "db" + }, + [ + 
"database.kraken:md5,d41d8cd98f00b204e9800998ecf8427e", + "database100mers.kmer_distrib:md5,d41d8cd98f00b204e9800998ecf8427e", + "database100mers.kraken:md5,d41d8cd98f00b204e9800998ecf8427e", + "file.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "versions": [ + "versions.yml:md5,925c6ae1387eaf6dbd14656125bc6d9b" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-17T18:41:14.406736156" + } +} \ No newline at end of file diff --git a/modules/nf-core/bracken/build/tests/tags.yml b/modules/nf-core/bracken/build/tests/tags.yml new file mode 100644 index 0000000..92d7c26 --- /dev/null +++ b/modules/nf-core/bracken/build/tests/tags.yml @@ -0,0 +1,2 @@ +bracken/build: + - "modules/nf-core/bracken/build/**" diff --git a/nextflow.config b/nextflow.config index 4725942..56b7e6f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -71,6 +71,7 @@ params { build_centrifuge = false build_kraken2 = false kraken2_keepintermediate = false + run_bracken = false } diff --git a/subworkflows/nf-core/fasta_build_add_kraken2/main.nf b/subworkflows/nf-core/fasta_build_add_kraken2/main.nf deleted file mode 100644 index 306896b..0000000 --- a/subworkflows/nf-core/fasta_build_add_kraken2/main.nf +++ /dev/null @@ -1,35 +0,0 @@ -include { KRAKEN2_ADD } from '../../../modules/nf-core/kraken2/add/main' -include { KRAKEN2_BUILD } from '../../../modules/nf-core/kraken2/build/main' - -workflow FASTA_BUILD_ADD_KRAKEN2 { - - take: - ch_fasta // channel: [ val(meta), fasta ] - ch_taxonomy_names // channel: [ names.dmp ] - ch_taxonomy_nodes // channel: [ nodes.dmp ] - ch_accession2taxid // channel: [ acc2taxidfile ] - val_cleanintermediate // value: [ true | false ] - - main: - - ch_versions = Channel.empty() - - ch_fastas_for_kraken2add = ch_fasta - .map { - meta, fasta -> - - [[id: 'db'], fasta] - } - .groupTuple() - - KRAKEN2_ADD ( ch_fastas_for_kraken2add, ch_taxonomy_names, ch_taxonomy_nodes, ch_accession2taxid ) - ch_versions = 
ch_versions.mix(KRAKEN2_ADD.out.versions.first()) - - KRAKEN2_BUILD ( KRAKEN2_ADD.out.db, val_cleanintermediate ) - ch_versions = ch_versions.mix(KRAKEN2_BUILD.out.versions.first()) - - emit: - db = KRAKEN2_BUILD.out.db // channel: [ val(meta), [ db ] ] - versions = ch_versions // channel: [ versions.yml ] -} - diff --git a/subworkflows/nf-core/fasta_build_add_kraken2/tests/main.nf.test b/subworkflows/nf-core/fasta_build_add_kraken2/tests/main.nf.test deleted file mode 100644 index a7baca6..0000000 --- a/subworkflows/nf-core/fasta_build_add_kraken2/tests/main.nf.test +++ /dev/null @@ -1,103 +0,0 @@ -nextflow_workflow { - - name "Test Subworkflow FASTA_BUILD_ADD_KRAKEN2" - script "../main.nf" - workflow "FASTA_BUILD_ADD_KRAKEN2" - - tag "subworkflows" - tag "subworkflows_nfcore" - tag "subworkflows/fasta_build_add_kraken2" - tag "gunzip" - tag "kraken2" - tag "kraken2/add" - tag "kraken2/build" - - test("metagenome - fasta") { - - setup { - run("GUNZIP") { - script "modules/nf-core/gunzip/main.nf" - process { - """ - input[0] = Channel.of([\ - [id:'haemophilus_influenzae'], - file(params.modules_testdata_base_path + '/genomics/prokaryotes/metagenome/fasta/haemophilus_influenzae.fna.gz', checkIfExists: true) - ] - ) - """ - } - } - } - - when { - workflow { - """ - input[0] = Channel.of([[id:'sarscov2'], file(params.modules_testdata_base_path + '/genomics/prokaryotes/metagenome/fasta/sarscov2.fasta', checkIfExists: true)]).mix(GUNZIP.out.gunzip) - input[1] = file(params.modules_testdata_base_path + 'genomics/prokaryotes/metagenome/taxonomy/taxdmp/names.dmp', checkIfExists: true) - input[2] = file(params.modules_testdata_base_path + 'genomics/prokaryotes/metagenome/taxonomy/taxdmp/nodes.dmp', checkIfExists: true) - input[3] = file(params.modules_testdata_base_path + 'genomics/prokaryotes/metagenome/taxonomy/accession2taxid/nucl_gb.accession2taxid', checkIfExists: true) - input[4] = true - """ - } - } - - then { - assertAll( - { assert workflow.success}, - { assert 
workflow.out.db.get(0).get(1) ==~ ".*/db" }, - { assert snapshot ( - workflow.out.versions, - path("${workflow.out.db[0][1]}/hash.k2d"), - path("${workflow.out.db[0][1]}/taxo.k2d"), - file("${workflow.out.db[0][1]}/opts.k2d").name, - ).match() - } - ) - } - } - - test("metagenome - fasta - nocleanup") { - - setup { - run("GUNZIP") { - script "modules/nf-core/gunzip/main.nf" - process { - """ - input[0] = Channel.of([\ - [id:'haemophilus_influenzae'], - file(params.modules_testdata_base_path + '/genomics/prokaryotes/metagenome/fasta/haemophilus_influenzae.fna.gz', checkIfExists: true) - ] - ) - """ - } - } - } - - when { - workflow { - """ - input[0] = Channel.of([[id:'sarscov2'], file(params.modules_testdata_base_path + '/genomics/prokaryotes/metagenome/fasta/sarscov2.fasta', checkIfExists: true)]).mix(GUNZIP.out.gunzip) - input[1] = file(params.modules_testdata_base_path + 'genomics/prokaryotes/metagenome/taxonomy/taxdmp/names.dmp', checkIfExists: true) - input[2] = file(params.modules_testdata_base_path + 'genomics/prokaryotes/metagenome/taxonomy/taxdmp/nodes.dmp', checkIfExists: true) - input[3] = file(params.modules_testdata_base_path + 'genomics/prokaryotes/metagenome/taxonomy/accession2taxid/nucl_gb.accession2taxid', checkIfExists: true) - input[4] = false - """ - } - } - - then { - assertAll( - { assert workflow.success}, - { assert workflow.out.db.get(0).get(1) ==~ ".*/db" }, - { assert snapshot ( - workflow.out.versions, - path("${workflow.out.db[0][1]}/hash.k2d"), - path("${workflow.out.db[0][1]}/taxo.k2d"), - file("${workflow.out.db[0][1]}/opts.k2d").name, - file("${workflow.out.db[0][1]}/unmapped.txt").name - ).match() - } - ) - } - } -} diff --git a/subworkflows/nf-core/fasta_build_add_kraken2/tests/main.nf.test.snap b/subworkflows/nf-core/fasta_build_add_kraken2/tests/main.nf.test.snap deleted file mode 100644 index 57b4a48..0000000 --- a/subworkflows/nf-core/fasta_build_add_kraken2/tests/main.nf.test.snap +++ /dev/null @@ -1,35 +0,0 @@ -{ - 
"metagenome - fasta - nocleanup": { - "content": [ - [ - "versions.yml:md5,62fb719633dd8f110bbc2c1bec53d0a9", - "versions.yml:md5,82f39c3ef1ba0742da3105cbe5ed3cf7" - ], - "hash.k2d:md5,4717689f8ba88d4cae51ecc7c9d9b372", - "taxo.k2d:md5,24338e2d78f803f48bcc5653c6e51816", - "opts.k2d", - "unmapped.txt" - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-04-05T08:17:49.670974771" - }, - "metagenome - fasta": { - "content": [ - [ - "versions.yml:md5,62fb719633dd8f110bbc2c1bec53d0a9", - "versions.yml:md5,82f39c3ef1ba0742da3105cbe5ed3cf7" - ], - "hash.k2d:md5,4717689f8ba88d4cae51ecc7c9d9b372", - "taxo.k2d:md5,24338e2d78f803f48bcc5653c6e51816", - "opts.k2d" - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-04-05T08:17:31.501399396" - } -} \ No newline at end of file diff --git a/subworkflows/nf-core/fasta_build_add_kraken2/tests/tags.yml b/subworkflows/nf-core/fasta_build_add_kraken2/tests/tags.yml deleted file mode 100644 index af5f2a0..0000000 --- a/subworkflows/nf-core/fasta_build_add_kraken2/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -subworkflows/fasta_build_add_kraken2: - - subworkflows/nf-core/fasta_build_add_kraken2/** diff --git a/subworkflows/nf-core/fasta_build_add_kraken2_bracken/main.nf b/subworkflows/nf-core/fasta_build_add_kraken2_bracken/main.nf new file mode 100644 index 0000000..cb9e370 --- /dev/null +++ b/subworkflows/nf-core/fasta_build_add_kraken2_bracken/main.nf @@ -0,0 +1,42 @@ +include { KRAKEN2_ADD } from '../../../modules/nf-core/kraken2/add/main' +include { KRAKEN2_BUILD } from '../../../modules/nf-core/kraken2/build/main' +include { BRACKEN_BUILD } from '../../../modules/nf-core/bracken/build/main' + +workflow FASTA_BUILD_ADD_KRAKEN2_BRACKEN { + + take: + ch_fasta // channel: [ val(meta), [ fasta1, fasta2, fasta3] ] + ch_taxonomy_names // channel: [ names.dmp ] + ch_taxonomy_nodes // channel: [ nodes.dmp ] + ch_accession2taxid // channel: [ acc2taxidfile ] + 
val_cleanintermediates // value: [ true | false ] + val_runbrackenbuild // value: [ true | false ] + + main: + + if ( val_cleanintermediates && val_runbrackenbuild ) { error("Cannot perform Kraken2 cleanup and build Bracken database. Bracken requires intermediate files") } + val_cleanup = [ val_cleanintermediates && !val_runbrackenbuild ].any() ? true : false + + ch_versions = Channel.empty() + + KRAKEN2_ADD ( ch_fasta, ch_taxonomy_names, ch_taxonomy_nodes, ch_accession2taxid ) + ch_versions = ch_versions.mix( KRAKEN2_ADD.out.versions.first() ) + + KRAKEN2_BUILD ( KRAKEN2_ADD.out.db, val_cleanup ) + ch_versions = ch_versions.mix( KRAKEN2_BUILD.out.versions.first() ) + + if ( val_runbrackenbuild ) { + BRACKEN_BUILD ( KRAKEN2_BUILD.out.db ) + ch_final_db = BRACKEN_BUILD.out.db + ch_versions = ch_versions.mix( BRACKEN_BUILD.out.versions.first() ) + } + else { + ch_final_db = KRAKEN2_BUILD.out.db + ch_versions = ch_versions.mix( KRAKEN2_BUILD.out.versions.first() ) + } + + emit: + db = ch_final_db // channel: [ val(meta), [ db ] ] + versions = ch_versions // channel: [ versions.yml ] +} + diff --git a/subworkflows/nf-core/fasta_build_add_kraken2/meta.yml b/subworkflows/nf-core/fasta_build_add_kraken2_bracken/meta.yml similarity index 61% rename from subworkflows/nf-core/fasta_build_add_kraken2/meta.yml rename to subworkflows/nf-core/fasta_build_add_kraken2_bracken/meta.yml index 1506709..8125c60 100644 --- a/subworkflows/nf-core/fasta_build_add_kraken2/meta.yml +++ b/subworkflows/nf-core/fasta_build_add_kraken2_bracken/meta.yml @@ -1,21 +1,23 @@ # yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json -name: "fasta_build_add_kraken2" -description: KRAKEN2 build custom database subworkflow +name: "fasta_build_add_kraken2_bracken" +description: KRAKEN2 and BRACKEN build custom database subworkflow keywords: - metagenomics - kraken2 - database - build - custom + - bracken components: - kraken2/add - 
kraken2/build + - bracken/build input: - ch_fasta: type: file description: | - Channel containing each fasta as a distinct element with meta - Structure: [ val(meta), path(fasta) ] + Channel containing a meta with a list of FASTAs to be built + Structure: [ val(meta), [ fasta1, fasta2, fasta3 ] ] pattern: "*.{fasta,fa,fna}" - ch_taxonomy_names: type: file @@ -35,18 +37,26 @@ input: Channel containing a NCBI-style taxdump accession2taxid (acc2tax) file Structure: [ accession2taxid_file ] pattern: "*.accession2taxid" - - val_cleanintermediate: + - val_cleanintermediates: type: boolean description: | - Boolean flag whether to clean up intermediate files after build or not + Boolean flag whether to clean up intermediate files after build or not. + If val_runbrackenbuild set, will be ignored as BRACKEN requires intermediate files. Structure: [ val_cleanintermediate ] pattern: "true|false" + - val_runbrackenbuild: + type: boolean + description: | + Boolean flag whether to additionally insert required BRACKEN database files into KRAKEN2 directory. + Note any changes for k-mer or read lengths must come via Nextflow config `ext.args`. + Structure: [ val_runbrackenbuild ] + pattern: "true|false" output: - db: type: directory description: | - Channel containing KRAKEN2 database directory. - Use `$ext.prefix` in a modules.conf file to change default name + Channel containing KRAKEN2 (and BRACKEN) database directory files. 
+ Use `ext.prefix` in a modules.conf file to change default name Structure: [ val(meta), path(db) ] pattern: "*/" - versions: diff --git a/subworkflows/nf-core/fasta_build_add_kraken2_bracken/tests/main.nf.test b/subworkflows/nf-core/fasta_build_add_kraken2_bracken/tests/main.nf.test new file mode 100644 index 0000000..94bc184 --- /dev/null +++ b/subworkflows/nf-core/fasta_build_add_kraken2_bracken/tests/main.nf.test @@ -0,0 +1,160 @@ +nextflow_workflow { + + name "Test Subworkflow FASTA_BUILD_ADD_KRAKEN2_BRACKEN" + script "../main.nf" + workflow "FASTA_BUILD_ADD_KRAKEN2_BRACKEN" + + tag "subworkflows" + tag "subworkflows_nfcore" + tag "subworkflows/fasta_build_add_kraken2_bracken" + tag "gunzip" + tag "kraken2" + tag "kraken2/add" + tag "kraken2/build" + tag "bracken/build" + + test("metagenome - nocleanup - nobracken - fasta") { + + setup { + run("GUNZIP") { + script "modules/nf-core/gunzip/main.nf" + process { + """ + input[0] = Channel.of([ + [id:'db'], + file(params.modules_testdata_base_path + '/genomics/prokaryotes/metagenome/fasta/haemophilus_influenzae.fna.gz', checkIfExists: true) + ] + ) + """ + } + } + } + + when { + workflow { + """ + input[0] = Channel.of([[id:'db'], file(params.modules_testdata_base_path + '/genomics/prokaryotes/metagenome/fasta/sarscov2.fasta', checkIfExists: true)]).mix(GUNZIP.out.gunzip).groupTuple() + input[1] = file(params.modules_testdata_base_path + 'genomics/prokaryotes/metagenome/taxonomy/taxdmp/names.dmp', checkIfExists: true) + input[2] = file(params.modules_testdata_base_path + 'genomics/prokaryotes/metagenome/taxonomy/taxdmp/nodes.dmp', checkIfExists: true) + input[3] = file(params.modules_testdata_base_path + 'genomics/prokaryotes/metagenome/taxonomy/accession2taxid/nucl_gb.accession2taxid', checkIfExists: true) + input[4] = false + input[5] = false + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert snapshot ( + workflow.out.versions, + path("${workflow.out.db[0][1]}/hash.k2d"), + 
path("${workflow.out.db[0][1]}/taxo.k2d"), + file("${workflow.out.db[0][1]}/opts.k2d").name, + ).match() + }, + { assert path("${workflow.out.db[0][1]}/library/").exists() }, + { assert path("${workflow.out.db[0][1]}/taxonomy/").exists() } + ) + } + } + + test("metagenome - withcleanup - nobracken - fasta") { + + setup { + run("GUNZIP") { + script "modules/nf-core/gunzip/main.nf" + process { + """ + input[0] = Channel.of([\ + [id:'db'], + file(params.modules_testdata_base_path + '/genomics/prokaryotes/metagenome/fasta/haemophilus_influenzae.fna.gz', checkIfExists: true) + ] + ) + """ + } + } + } + + when { + workflow { + """ + input[0] = Channel.of([[id:'db'], file(params.modules_testdata_base_path + '/genomics/prokaryotes/metagenome/fasta/sarscov2.fasta', checkIfExists: true)]).mix(GUNZIP.out.gunzip).groupTuple() + input[1] = file(params.modules_testdata_base_path + 'genomics/prokaryotes/metagenome/taxonomy/taxdmp/names.dmp', checkIfExists: true) + input[2] = file(params.modules_testdata_base_path + 'genomics/prokaryotes/metagenome/taxonomy/taxdmp/nodes.dmp', checkIfExists: true) + input[3] = file(params.modules_testdata_base_path + 'genomics/prokaryotes/metagenome/taxonomy/accession2taxid/nucl_gb.accession2taxid', checkIfExists: true) + input[4] = true + input[5] = false + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert workflow.out.db.get(0).get(1) ==~ ".*/db" }, + { assert snapshot ( + workflow.out.versions, + path("${workflow.out.db[0][1]}/hash.k2d"), + path("${workflow.out.db[0][1]}/taxo.k2d"), + file("${workflow.out.db[0][1]}/opts.k2d").name, + file("${workflow.out.db[0][1]}/unmapped.txt").name + ).match() + }, + { assert !path("${workflow.out.db[0][1]}/library/").exists() }, + { assert !path("${workflow.out.db[0][1]}/taxonomy/").exists() } + ) + } + } + +test("metagenome - nocleanup - withbracken - fasta") { + + setup { + run("GUNZIP") { + script "modules/nf-core/gunzip/main.nf" + process { + """ + input[0] = Channel.of([\ + 
[id:'db'], + file(params.modules_testdata_base_path + '/genomics/prokaryotes/metagenome/fasta/haemophilus_influenzae.fna.gz', checkIfExists: true) + ] + ) + """ + } + } + } + + when { + workflow { + """ + input[0] = Channel.of([[id:'db'], file(params.modules_testdata_base_path + '/genomics/prokaryotes/metagenome/fasta/sarscov2.fasta', checkIfExists: true)]).mix(GUNZIP.out.gunzip).groupTuple() + input[1] = file(params.modules_testdata_base_path + 'genomics/prokaryotes/metagenome/taxonomy/taxdmp/names.dmp', checkIfExists: true) + input[2] = file(params.modules_testdata_base_path + 'genomics/prokaryotes/metagenome/taxonomy/taxdmp/nodes.dmp', checkIfExists: true) + input[3] = file(params.modules_testdata_base_path + 'genomics/prokaryotes/metagenome/taxonomy/accession2taxid/nucl_gb.accession2taxid', checkIfExists: true) + input[4] = false + input[5] = true + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert workflow.out.db.get(0).get(1) ==~ ".*/db" }, + { assert path("${workflow.out.db[0][1]}/library/").exists() }, + { assert path("${workflow.out.db[0][1]}/taxonomy/").exists() }, + { assert snapshot ( + workflow.out.versions, + path("${workflow.out.db[0][1]}/hash.k2d"), + path("${workflow.out.db[0][1]}/taxo.k2d"), + file("${workflow.out.db[0][1]}/opts.k2d").name, + file("${workflow.out.db[0][1]}/unmapped.txt").name, + file("${workflow.out.db[0][1]}/database100mers.kmer_distrib").name, + file("${workflow.out.db[0][1]}/database100mers.kraken").name, + file("${workflow.out.db[0][1]}/database.kraken").name + ).match() + } + ) + } + } +} diff --git a/subworkflows/nf-core/fasta_build_add_kraken2_bracken/tests/main.nf.test.snap b/subworkflows/nf-core/fasta_build_add_kraken2_bracken/tests/main.nf.test.snap new file mode 100644 index 0000000..9ad0c78 --- /dev/null +++ b/subworkflows/nf-core/fasta_build_add_kraken2_bracken/tests/main.nf.test.snap @@ -0,0 +1,58 @@ +{ + "metagenome - nocleanup - nobracken - fasta": { + "content": [ + [ + 
"versions.yml:md5,b5f92f68a6af1f422ccc1a5c75178793", + "versions.yml:md5,f815f0afa0f648fb6532bf6d780ce0ae", + "versions.yml:md5,f815f0afa0f648fb6532bf6d780ce0ae" + ], + "hash.k2d:md5,4717689f8ba88d4cae51ecc7c9d9b372", + "taxo.k2d:md5,24338e2d78f803f48bcc5653c6e51816", + "opts.k2d" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-18T09:04:48.196774778" + }, + "metagenome - withcleanup - nobracken - fasta": { + "content": [ + [ + "versions.yml:md5,b5f92f68a6af1f422ccc1a5c75178793", + "versions.yml:md5,f815f0afa0f648fb6532bf6d780ce0ae", + "versions.yml:md5,f815f0afa0f648fb6532bf6d780ce0ae" + ], + "hash.k2d:md5,4717689f8ba88d4cae51ecc7c9d9b372", + "taxo.k2d:md5,24338e2d78f803f48bcc5653c6e51816", + "opts.k2d", + "unmapped.txt" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-18T08:42:24.87325051" + }, + "metagenome - nocleanup - withbracken - fasta": { + "content": [ + [ + "versions.yml:md5,130d220d293e4f75863b6c0756bb8324", + "versions.yml:md5,b5f92f68a6af1f422ccc1a5c75178793", + "versions.yml:md5,f815f0afa0f648fb6532bf6d780ce0ae" + ], + "hash.k2d:md5,4717689f8ba88d4cae51ecc7c9d9b372", + "taxo.k2d:md5,24338e2d78f803f48bcc5653c6e51816", + "opts.k2d", + "unmapped.txt", + "database100mers.kmer_distrib", + "database100mers.kraken", + "database.kraken" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-18T10:58:01.065026262" + } +} \ No newline at end of file diff --git a/subworkflows/nf-core/fasta_build_add_kraken2_bracken/tests/tags.yml b/subworkflows/nf-core/fasta_build_add_kraken2_bracken/tests/tags.yml new file mode 100644 index 0000000..40273bf --- /dev/null +++ b/subworkflows/nf-core/fasta_build_add_kraken2_bracken/tests/tags.yml @@ -0,0 +1,2 @@ +subworkflows/fasta_build_add_kraken2_bracken: + - subworkflows/nf-core/fasta_build_add_kraken2_bracken/** diff --git a/workflows/createtaxdb.nf b/workflows/createtaxdb.nf index ea8c59a..01317a3 
100644 --- a/workflows/createtaxdb.nf +++ b/workflows/createtaxdb.nf @@ -21,7 +21,7 @@ include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pi include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_createtaxdb_pipeline' -include { FASTA_BUILD_ADD_KRAKEN2 } from '../subworkflows/nf-core/fasta_build_add_kraken2/main' +include { FASTA_BUILD_ADD_KRAKEN2_BRACKEN } from '../subworkflows/nf-core/fasta_build_add_kraken2_bracken/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -142,10 +142,11 @@ workflow CREATETAXDB { } // SUBWORKFLOW: Kraken2 + // Bracken requires intermediate files, if run_bracken=true then kraken2_keepintermediate=true, otherwise an error will be raised if ( params.build_kraken2 ) { - FASTA_BUILD_ADD_KRAKEN2 ( CAT_CAT_DNA.out.file_out, ch_taxonomy_namesdmp, ch_taxonomy_nodesdmp, ch_accession2taxid, !params.kraken2_keepintermediate ) - ch_versions = ch_versions.mix(FASTA_BUILD_ADD_KRAKEN2.out.versions.first()) - ch_kraken2_output = FASTA_BUILD_ADD_KRAKEN2.out.db + FASTA_BUILD_ADD_KRAKEN2_BRACKEN ( CAT_CAT_DNA.out.file_out, ch_taxonomy_namesdmp, ch_taxonomy_nodesdmp, ch_accession2taxid, !params.kraken2_keepintermediate, params.run_bracken ) + ch_versions = ch_versions.mix(FASTA_BUILD_ADD_KRAKEN2_BRACKEN.out.versions.first()) + ch_kraken2_output = FASTA_BUILD_ADD_KRAKEN2_BRACKEN.out.db } else { ch_kraken2_output = Channel.empty() } From 130d8d2b872572132de3cc52231198dd97cebb15 Mon Sep 17 00:00:00 2001 From: alxndrdiaz Date: Thu, 2 May 2024 18:11:14 -0600 Subject: [PATCH 2/8] add run_bracken --- nextflow_schema.json | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/nextflow_schema.json b/nextflow_schema.json index b1344ea..52e702e 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -145,6 +145,11 @@ "type": "boolean", "fa_icon": "fas fa-save", "description": 
"Retain intermediate Kraken2 build files for inspection." + }, + "run_bracken": { + "type": "boolean", + "fa_icon": "fas fa-save", + "description": "Generate Bracken files required for abundance estimation." } }, "fa_icon": "fas fa-database" From 312b878208cc2c0665d90b04de0cb56f643cee06 Mon Sep 17 00:00:00 2001 From: alxndrdiaz Date: Thu, 2 May 2024 18:26:08 -0600 Subject: [PATCH 3/8] update utils_nfcore_pipeline --- modules.json | 2 +- subworkflows/nf-core/utils_nfcore_pipeline/main.nf | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/modules.json b/modules.json index 0b6df73..a318afd 100644 --- a/modules.json +++ b/modules.json @@ -86,7 +86,7 @@ }, "utils_nfcore_pipeline": { "branch": "master", - "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", + "git_sha": "92de218a329bfc9a9033116eb5f65fd270e72ba3", "installed_by": ["subworkflows"] }, "utils_nfvalidation_plugin": { diff --git a/subworkflows/nf-core/utils_nfcore_pipeline/main.nf b/subworkflows/nf-core/utils_nfcore_pipeline/main.nf index a8b55d6..14558c3 100644 --- a/subworkflows/nf-core/utils_nfcore_pipeline/main.nf +++ b/subworkflows/nf-core/utils_nfcore_pipeline/main.nf @@ -65,9 +65,15 @@ def checkProfileProvided(nextflow_cli_args) { // Citation string for pipeline // def workflowCitation() { + def temp_doi_ref = "" + String[] manifest_doi = workflow.manifest.doi.tokenize(",") + // Using a loop to handle multiple DOIs + // Removing `https://doi.org/` to handle pipelines using DOIs vs DOI resolvers + // Removing ` ` since the manifest.doi is a string and not a proper list + for (String doi_ref: manifest_doi) temp_doi_ref += " https://doi.org/${doi_ref.replace('https://doi.org/', '').replace(' ', '')}\n" return "If you use ${workflow.manifest.name} for your analysis please cite:\n\n" + "* The pipeline\n" + - " ${workflow.manifest.doi}\n\n" + + temp_doi_ref + "\n" + "* The nf-core framework\n" + " https://doi.org/10.1038/s41587-020-0439-x\n\n" + "* Software dependencies\n" + From 
23a8c1bfd58f3394991a6101e0aa8db036a2518b Mon Sep 17 00:00:00 2001 From: alxndrdiaz Date: Thu, 2 May 2024 18:37:08 -0600 Subject: [PATCH 4/8] remove trailing whitespace --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 56b7e6f..5880434 100644 --- a/nextflow.config +++ b/nextflow.config @@ -71,7 +71,7 @@ params { build_centrifuge = false build_kraken2 = false kraken2_keepintermediate = false - run_bracken = false + run_bracken = false } From 9673e38450196bd912ec26a9dffef021216ca53c Mon Sep 17 00:00:00 2001 From: alxndrdiaz Date: Thu, 2 May 2024 18:40:47 -0600 Subject: [PATCH 5/8] remove whitespace --- nextflow.config | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nextflow.config b/nextflow.config index 5880434..96d52c4 100644 --- a/nextflow.config +++ b/nextflow.config @@ -71,8 +71,7 @@ params { build_centrifuge = false build_kraken2 = false kraken2_keepintermediate = false - run_bracken = false - + run_bracken = false } // Load base.config by default for all pipelines From 3e2377cc9a33194279a52c0f8b74dbe4efe28c21 Mon Sep 17 00:00:00 2001 From: "James A. 
Fellows Yates" Date: Thu, 23 May 2024 07:31:35 +0000 Subject: [PATCH 6/8] Get build running correctly with keeping intermediate logic, --- conf/test.config | 1 + conf/test_nothing.config | 1 + nextflow.config | 2 +- nextflow_schema.json | 2 +- workflows/createtaxdb.nf | 12 +++++++----- 5 files changed, 11 insertions(+), 7 deletions(-) diff --git a/conf/test.config b/conf/test.config index c1be203..a817d27 100644 --- a/conf/test.config +++ b/conf/test.config @@ -29,6 +29,7 @@ params { build_malt = true build_centrifuge = true build_kraken2 = true + build_bracken = true accession2taxid = params.pipelines_testdata_base_path + 'createtaxdb/data/taxonomy/nucl_gb.accession2taxid' nucl2taxid = params.pipelines_testdata_base_path + 'createtaxdb/data/taxonomy/nucl2tax.map' diff --git a/conf/test_nothing.config b/conf/test_nothing.config index 72c07e3..4fd0e0f 100644 --- a/conf/test_nothing.config +++ b/conf/test_nothing.config @@ -30,5 +30,6 @@ params { build_malt = false build_centrifuge = false build_kraken2 = false + build_bracken = false } diff --git a/nextflow.config b/nextflow.config index b036851..648516b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -72,7 +72,7 @@ params { build_centrifuge = false build_kraken2 = false kraken2_keepintermediate = false - run_bracken = false + build_bracken = false } // Load base.config by default for all pipelines diff --git a/nextflow_schema.json b/nextflow_schema.json index e318efd..99b1445 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -146,7 +146,7 @@ "fa_icon": "fas fa-save", "description": "Retain intermediate Kraken2 build files for inspection." }, - "run_bracken": { + "build_bracken": { "type": "boolean", "fa_icon": "fas fa-save", "description": "Generate Bracken files required for abundance estimation." 
diff --git a/workflows/createtaxdb.nf b/workflows/createtaxdb.nf index bd0b4cc..b461f4d 100644 --- a/workflows/createtaxdb.nf +++ b/workflows/createtaxdb.nf @@ -53,7 +53,7 @@ workflow CREATETAXDB { // PREPARE: Prepare input for single file inputs modules - if ( [params.build_malt, params.build_centrifuge, params.build_kraken2].any() ) { // Pull just DNA sequences + if ( [params.build_malt, params.build_centrifuge, params.build_kraken2, params.build_bracken].any() ) { // Pull just DNA sequences ch_dna_refs_for_singleref = ch_samplesheet .map{meta, fasta_dna, fasta_aa -> [[id: params.dbname], fasta_dna]} @@ -141,10 +141,12 @@ workflow CREATETAXDB { ch_kaiju_output = Channel.empty() } - // SUBWORKFLOW: Kraken2 - // Bracken requires intermediate files, if run_bracken=true then kraken2_keepintermediate=true, otherwise an error will be raised - if ( params.build_kraken2 ) { - FASTA_BUILD_ADD_KRAKEN2_BRACKEN ( CAT_CAT_DNA.out.file_out, ch_taxonomy_namesdmp, ch_taxonomy_nodesdmp, ch_accession2taxid, !params.kraken2_keepintermediate, params.run_bracken ) + // SUBWORKFLOW: Kraken2 and Bracken + // Bracken requires intermediate files, if build_bracken=true then kraken2_keepintermediate=true, otherwise an error will be raised + // Condition is inverted because subworkflow asks if you want to 'clean' (true) or not, but pipeline says to 'keep' + if ( params.build_kraken2 || params.build_bracken ) { + def k2_keepintermediates = params.kraken2_keepintermediate || params.build_bracken ? false : true + FASTA_BUILD_ADD_KRAKEN2_BRACKEN ( CAT_CAT_DNA.out.file_out, ch_taxonomy_namesdmp, ch_taxonomy_nodesdmp, ch_accession2taxid, k2_keepintermediates, params.build_bracken ) ch_versions = ch_versions.mix(FASTA_BUILD_ADD_KRAKEN2_BRACKEN.out.versions.first()) ch_kraken2_output = FASTA_BUILD_ADD_KRAKEN2_BRACKEN.out.db } else { From 526ef0dd6705e8faa91241db59d3391f415d4907 Mon Sep 17 00:00:00 2001 From: "James A. 
Fellows Yates" Date: Thu, 23 May 2024 08:00:58 +0000 Subject: [PATCH 7/8] Get testing working --- conf/test.config | 4 +++- conf/test_full.config | 8 ++++++++ nextflow.config | 2 +- nextflow_schema.json | 6 +++--- tests/test.nf.test | 8 +++++--- tests/test.nf.test.snap | 6 ++++-- workflows/createtaxdb.nf | 18 +++++++++--------- 7 files changed, 33 insertions(+), 19 deletions(-) diff --git a/conf/test.config b/conf/test.config index a817d27..4046af5 100644 --- a/conf/test.config +++ b/conf/test.config @@ -22,7 +22,9 @@ params { // Input data // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = params.pipelines_testdata_base_path + 'createtaxdb/samplesheets/test.csv' + input = params.pipelines_testdata_base_path + 'createtaxdb/samplesheets/test.csv' + + dbname = "database" build_diamond = true build_kaiju = true diff --git a/conf/test_full.config b/conf/test_full.config index 591cce3..b43690d 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -18,4 +18,12 @@ params { // TODO nf-core: Specify the paths to your full test data ( on nf-core/test-datasets or directly in repositories, e.g. 
SRA) // TODO nf-core: Give any required params for the test so that command line flags are not needed input = params.pipelines_testdata_base_path + 'viralrecon/samplesheet/samplesheet_full_illumina_amplicon.csv' + + build_diamond = true + build_kaiju = true + build_malt = true + build_centrifuge = true + build_kraken2 = true + build_bracken = true + } diff --git a/nextflow.config b/nextflow.config index 648516b..15cec1c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -54,7 +54,7 @@ params { validate_params = true // General parameters - dbname = "database" + dbname = null save_concatenated_fastas = false accession2taxid = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 99b1445..853449f 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": ["input", "outdir", "dbname"], "properties": { "input": { "type": "string", @@ -43,7 +43,6 @@ }, "dbname": { "type": "string", - "default": "database", "description": "Specify name that resulting databases will be prefixed with.", "fa_icon": "fas fa-id-badge" }, @@ -149,7 +148,8 @@ "build_bracken": { "type": "boolean", "fa_icon": "fas fa-save", - "description": "Generate Bracken files required for abundance estimation." + "description": "Turn on extending of Kraken2 database to include Bracken files. Requires nucleotide FASTA file input.", + "help_text": "Bracken databases are simply a Kraken2 database with two additional files.\n\nNote, however, that this requires a Kraken2 database _with_ intermediate files still in it, and thus can result in large database directories."
} }, "fa_icon": "fas fa-database" diff --git a/tests/test.nf.test b/tests/test.nf.test index 46dd23f..81a76d2 100644 --- a/tests/test.nf.test +++ b/tests/test.nf.test @@ -21,9 +21,11 @@ nextflow_pipeline { path("$outputDir/centrifuge/"), path("$outputDir/diamond/database.dmnd"), path("$outputDir/kaiju/database.fmi"), - path("$outputDir/kraken2/db/hash.k2d"), - file("$outputDir/kraken2/db/opts.k2d").name, - path("$outputDir/kraken2/db/taxo.k2d"), + path("$outputDir/kraken2/database/hash.k2d"), + file("$outputDir/kraken2/database/opts.k2d").name, + path("$outputDir/kraken2/database/taxo.k2d"), + file("$outputDir/bracken/database/database100mers.kmer_distrib").name, + file("$outputDir/bracken/database/database100mers.kraken").name, path("$outputDir/malt/malt-build.log").readLines().last().contains('Peak memory'), path("$outputDir/malt/malt_index/index0.idx"), path("$outputDir/malt/malt_index/ref.db"), diff --git a/tests/test.nf.test.snap b/tests/test.nf.test.snap index 73e0d6d..bf8fe23 100644 --- a/tests/test.nf.test.snap +++ b/tests/test.nf.test.snap @@ -12,6 +12,8 @@ "hash.k2d:md5,01122a04dcef29ceb3baa68a9f6e6ef5", "opts.k2d", "taxo.k2d:md5,cd8170a8c5a1b763a9ac1ffa2107cc88", + "database100mers.kmer_distrib", + "database100mers.kraken", true, "index0.idx:md5,876139dc930e68992cd2625e08bba48a", "ref.db:md5,377073f58a9f9b85acca59fcf21744a9", @@ -23,8 +25,8 @@ ], "meta": { "nf-test": "0.8.4", - "nextflow": "24.02.0" + "nextflow": "24.04.1" }, - "timestamp": "2024-04-11T10:59:28.687364796" + "timestamp": "2024-05-23T08:00:31.799820635" } } \ No newline at end of file diff --git a/workflows/createtaxdb.nf b/workflows/createtaxdb.nf index b461f4d..0192d99 100644 --- a/workflows/createtaxdb.nf +++ b/workflows/createtaxdb.nf @@ -148,9 +148,9 @@ workflow CREATETAXDB { def k2_keepintermediates = params.kraken2_keepintermediate || params.build_bracken ? 
false : true FASTA_BUILD_ADD_KRAKEN2_BRACKEN ( CAT_CAT_DNA.out.file_out, ch_taxonomy_namesdmp, ch_taxonomy_nodesdmp, ch_accession2taxid, k2_keepintermediates, params.build_bracken ) ch_versions = ch_versions.mix(FASTA_BUILD_ADD_KRAKEN2_BRACKEN.out.versions.first()) - ch_kraken2_output = FASTA_BUILD_ADD_KRAKEN2_BRACKEN.out.db + ch_kraken2_bracken_output = FASTA_BUILD_ADD_KRAKEN2_BRACKEN.out.db } else { - ch_kraken2_output = Channel.empty() + ch_kraken2_bracken_output = Channel.empty() } // Module: Run MALT/BUILD @@ -229,13 +229,13 @@ workflow CREATETAXDB { multiqc_report = MULTIQC.out.report.toList() emit: - versions = ch_collated_versions - multiqc_report = MULTIQC.out.report.toList() // channel: /path/to/multiqc_report.html - centrifuge_database = ch_centrifuge_output - diamond_database = ch_diamond_output - kaiju_database = ch_kaiju_output - kraken2_database = ch_kraken2_output - malt_database = ch_malt_output + versions = ch_collated_versions + multiqc_report = MULTIQC.out.report.toList() // channel: /path/to/multiqc_report.html + centrifuge_database = ch_centrifuge_output + diamond_database = ch_diamond_output + kaiju_database = ch_kaiju_output + kraken2_bracken_database = ch_kraken2_bracken_output + malt_database = ch_malt_output } /* From 0d7ed76a7fe4e44f0729adfcbc341650bf34457c Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Thu, 23 May 2024 08:16:52 +0000 Subject: [PATCH 8/8] Add missing file in tests, and update docs eveywhere --- CITATIONS.md | 4 +++ README.md | 3 +- docs/output.md | 29 +++++++++++++++++++ .../utils_nfcore_createtaxdb_pipeline/main.nf | 2 ++ tests/test.nf.test | 1 + tests/test.nf.test.snap | 3 +- 6 files changed, 40 insertions(+), 2 deletions(-) diff --git a/CITATIONS.md b/CITATIONS.md index 8451b47..e9b5164 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -40,6 +40,10 @@ > Kurtzer GM, Sochat V, Bauer MW. Singularity: Scientific containers for mobility of compute. PLoS One. 2017 May 11;12(5):e0177459. 
doi: 10.1371/journal.pone.0177459. eCollection 2017. PubMed PMID: 28494014; PubMed Central PMCID: PMC5426675. +- [Bracken](https://doi.org/10.7717/peerj-cs.104) + + > Lu, J., Breitwieser, F. P., Thielen, P., & Salzberg, S. L. (2017). Bracken: estimating species abundance in metagenomics data. PeerJ. Computer Science, 3(e104), e104. https://doi.org/10.7717/peerj-cs.104 + - [Centrifuge](https://doi.org/10.1101/gr.210641.116) > Kim, D., Song, L., Breitwieser, F. P., & Salzberg, S. L. (2016). Centrifuge: rapid and sensitive classification of metagenomic sequences. Genome Research, 26(12), 1721–1729. https://doi.org/10.1101/gr.210641.116 diff --git a/README.md b/README.md index 7b1787c..c072d55 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,7 @@ 1. Prepares input FASTA files for building 2. Builds databases for: + - [Bracken](https://doi.org/10.7717/peerj-cs.104) - [Centrifuge](https://doi.org/10.1101/gr.210641.116) - [DIAMOND](https://doi.org/10.1038/nmeth.3176) - [Kaiju](https://doi.org/10.1038/ncomms11257) @@ -84,7 +85,7 @@ For more details about the output files and reports, please refer to the ## Credits -nf-core/createtaxdb was originally written by James A. Fellows Yates and the nf-core community. +nf-core/createtaxdb was originally written by James A. Fellows Yates, Joon Klaps, Alexander Ramos Díaz and the nf-core community. 
We thank the following people for their extensive assistance in the development of this pipeline: diff --git a/docs/output.md b/docs/output.md index 5a54002..1a30100 100644 --- a/docs/output.md +++ b/docs/output.md @@ -14,6 +14,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution +- [Bracken](#bracken) - Database files for Bracken - [Centrifuge](#centrifuge) - Database files for Centrifuge - [DIAMOND](#diamond) - Database files for DIAMOND - [Kaiju](#kaiju) - Database files for Kaiju @@ -51,6 +52,31 @@ Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQ [Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage. +### Bracken + +[Bracken](https://github.com/jenniferlu717/Bracken/) (Bayesian Reestimation of Abundance with KrakEN) is a highly accurate statistical method that computes the abundance of species in DNA sequences from a metagenomics sample. + +
+Output files + +- `bracken/` + - `<dbname>/` + - `database100mers.kmer_distrib`: Bracken kmer distribution file + - `database100mers.kraken`: Bracken index file + - `database.kraken`: Bracken database file + - `hash.k2d`: Kraken2 hash database file + - `opts.k2d`: Kraken2 opts database file + - `taxo.k2d`: Kraken2 taxo database file + - `library/`: Intermediate Kraken2 directory containing FASTAs and related files of added genomes + - `taxonomy/`: Intermediate Kraken2 directory containing taxonomy files of added genomes + - `seqid2taxid.map`: Intermediate Kraken2 file mapping the sequence IDs of added genomes to their taxonomy IDs + +
+ +Note that all intermediate files are required for a Bracken database, even if Kraken2 itself only requires the `*.k2d` files. + +The resulting `<dbname>/` directory can be given to Bracken itself with `bracken -d <dbname>` etc. + ### Centrifuge [Centrifuge](https://github.com/bbuchfink/diamond) is a very rapid and memory-efficient system for the classification of DNA sequences from microbial samples. @@ -105,6 +131,9 @@ The `fmi` file can be given to Kaiju itself with `kaiju -f .fmi` - `hash.k2d`: Kraken2 hash database file - `opts.k2d`: Kraken2 opts database file - `taxo.k2d`: Kraken2 taxo database file + - `library/`: Intermediate directory containing FASTAs and related files of added genomes (only present if `--build_bracken` or `--kraken2_keepintermediate` supplied) + - `taxonomy/`: Intermediate directory containing taxonomy files of added genomes (only present if `--build_bracken` or `--kraken2_keepintermediate` supplied) + - `seqid2taxid.map`: Intermediate file mapping the sequence IDs of added genomes to their taxonomy IDs (only present if `--build_bracken` or `--kraken2_keepintermediate` supplied) diff --git a/subworkflows/local/utils_nfcore_createtaxdb_pipeline/main.nf b/subworkflows/local/utils_nfcore_createtaxdb_pipeline/main.nf index 1a9db9f..adf9a3d 100644 --- a/subworkflows/local/utils_nfcore_createtaxdb_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_createtaxdb_pipeline/main.nf @@ -194,6 +194,7 @@ def toolCitationText() { // Uncomment function in methodsDescriptionText to render in MultiQC report def citation_text = [ "Tools used in the workflow included:", + params.build_bracken ? "Bracken (Lu et al. 2017)," : "", params.build_centrifuge ? "Centrifuge (Kim et al. 2016)," : "", params.build_diamond ? "DIAMOND (Buchfink et al. 2015)," : "", params.build_kaiju ? "Kaiju (Menzel et al. 2016)," : "", @@ -211,6 +212,7 @@ def toolBibliographyText() { // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "
  • Author (2023) Pub name, Journal, DOI
  • " : "", // Uncomment function in methodsDescriptionText to render in MultiQC report def reference_text = [ + params.build_bracken ? "
  • Lu, J., Breitwieser, F. P., Thielen, P., & Salzberg, S. L. (2017). Bracken: estimating species abundance in metagenomics data. PeerJ. Computer Science, 3(e104), e104. 10.7717/peerj-cs.104
  • " : "", params.build_centrifuge ? "
  • Kim, D., Song, L., Breitwieser, F. P., & Salzberg, S. L. (2016). Centrifuge: rapid and sensitive classification of metagenomic sequences. Genome Research, 26(12), 1721–1729. 10.1101/gr.210641.116
  • " : "", params.build_diamond ? "
  • Buchfink, B., Xie, C., & Huson, D. H. (2015). Fast and sensitive protein alignment using DIAMOND. Nature Methods, 12(1), 59–60. 10.1038/nmeth.3176
  • " : "", params.build_kaiju ? "
  • Menzel, P., Ng, K. L., & Krogh, A. (2016). Fast and sensitive taxonomic classification for metagenomics with Kaiju. Nature Communications, 7, 11257. 10.1038/ncomms11257
  • " : "", diff --git a/tests/test.nf.test b/tests/test.nf.test index 81a76d2..54bea3e 100644 --- a/tests/test.nf.test +++ b/tests/test.nf.test @@ -26,6 +26,7 @@ nextflow_pipeline { path("$outputDir/kraken2/database/taxo.k2d"), file("$outputDir/bracken/database/database100mers.kmer_distrib").name, file("$outputDir/bracken/database/database100mers.kraken").name, + file("$outputDir/bracken/database/database.kraken").name, path("$outputDir/malt/malt-build.log").readLines().last().contains('Peak memory'), path("$outputDir/malt/malt_index/index0.idx"), path("$outputDir/malt/malt_index/ref.db"), diff --git a/tests/test.nf.test.snap b/tests/test.nf.test.snap index bf8fe23..bf9427f 100644 --- a/tests/test.nf.test.snap +++ b/tests/test.nf.test.snap @@ -14,6 +14,7 @@ "taxo.k2d:md5,cd8170a8c5a1b763a9ac1ffa2107cc88", "database100mers.kmer_distrib", "database100mers.kraken", + "database.kraken", true, "index0.idx:md5,876139dc930e68992cd2625e08bba48a", "ref.db:md5,377073f58a9f9b85acca59fcf21744a9", @@ -27,6 +28,6 @@ "nf-test": "0.8.4", "nextflow": "24.04.1" }, - "timestamp": "2024-05-23T08:00:31.799820635" + "timestamp": "2024-05-23T08:15:27.641419595" } } \ No newline at end of file