diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c7fab61..3bdbf58 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,43 +1,87 @@ -name: nf-core CI # This workflow runs the pipeline with the minimal test dataset to check that it completes without any syntax errors +name: nf-core CI on: push: branches: - - dev + - "dev" pull_request: + branches: + - "dev" + - "master" release: - types: [published] + types: + - "published" env: NXF_ANSI_LOG: false + NFTEST_VER: "0.7.3" concurrency: - group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}" + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true jobs: + define_nxf_versions: + name: Choose nextflow versions to test against depending on target branch + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.nxf_versions.outputs.matrix }} + steps: + - id: nxf_versions + run: | + if [[ "${{ github.event_name }}" == "pull_request" && "${{ github.base_ref }}" == "dev" ]]; then + echo matrix='["latest-everything"]' | tee -a "$GITHUB_OUTPUT" + else + echo matrix='["latest-everything", "23.10.0"]' | tee -a "$GITHUB_OUTPUT" + fi + test: - name: Run pipeline with test data - # Only run on push if this is the nf-core dev branch (merged PRs) - if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/createtaxdb') }}" + name: nf-test + needs: define_nxf_versions runs-on: ubuntu-latest strategy: + fail-fast: false matrix: - NXF_VER: - - "23.04.0" - - "latest-everything" + NXF_VER: ${{ fromJson(needs.define_nxf_versions.outputs.matrix) }} + tags: + - "test" + profile: + - "docker" + steps: - name: Check out pipeline code uses: actions/checkout@v4 + - name: Check out test data + uses: actions/checkout@v4 + with: + repository: nf-core/test-datasets + ref: createtaxdb + path: test-datasets/ + fetch-depth: 1 + - name: 
Install Nextflow uses: nf-core/setup-nextflow@v1 with: version: "${{ matrix.NXF_VER }}" - - name: Run pipeline with test data - # TODO nf-core: You can customise CI pipeline run tests as required - # For example: adding multiple test runs with different parameters - # Remember that you can parallelise this by using strategy.matrix + - name: Install nf-test + run: | + wget -qO- https://code.askimed.com/install/nf-test | bash -s $NFTEST_VER + sudo mv nf-test /usr/local/bin/ + + - name: Run nf-test run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results + nf-test test --tag ${{ matrix.tags }} --profile ${{ matrix.tags }},${{ matrix.profile }} --junitxml=test.xml + + - name: Output log on failure + if: failure() + run: | + sudo apt install bat > /dev/null + batcat --decorations=always --color=always ${{ github.workspace }}/.nf-test/tests/*/output/pipeline_info/software_versions.yml + + - name: Publish Test Report + uses: mikepenz/action-junit-report@v3 + if: always() # always run even if the previous step fails + with: + report_paths: "*.xml" diff --git a/.gitignore b/.gitignore index 5124c9a..f704e54 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,5 @@ results/ testing/ testing* *.pyc +.nf-test* +test.xml diff --git a/.nf-core.yml b/.nf-core.yml index 3805dc8..04e9f03 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1 +1,4 @@ repository_type: pipeline +## TODO: re-activate once nf-test ci.yml structure updated +lint: + actions_ci: False diff --git a/CHANGELOG.md b/CHANGELOG.md index 7706b5c..233fe7e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 Initial release of nf-core/createtaxdb, created with the [nf-core](https://nf-co.re/) template. 
+Adds database building support for: + +- DIAMOND (added by @jfy133) +- Kaiju (added by @jfy133) + ### `Added` ### `Fixed` diff --git a/CITATIONS.md b/CITATIONS.md index 4dfe4c1..4f201cb 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -39,3 +39,11 @@ - [Singularity](https://pubmed.ncbi.nlm.nih.gov/28494014/) > Kurtzer GM, Sochat V, Bauer MW. Singularity: Scientific containers for mobility of compute. PLoS One. 2017 May 11;12(5):e0177459. doi: 10.1371/journal.pone.0177459. eCollection 2017. PubMed PMID: 28494014; PubMed Central PMCID: PMC5426675. + +- [DIAMOND](https://doi.org/10.1038/nmeth.3176) + + > Buchfink, B., Xie, C., & Huson, D. H. (2015). Fast and sensitive protein alignment using DIAMOND. Nature Methods, 12(1), 59–60. https://doi.org/10.1038/nmeth.3176 + +- [Kaiju](https://doi.org/10.1038/ncomms11257) + + > Menzel, P., Ng, K. L., & Krogh, A. (2016). Fast and sensitive taxonomic classification for metagenomics with Kaiju. Nature Communications, 7, 11257. https://doi.org/10.1038/ncomms11257 diff --git a/assets/schema_input.json b/assets/schema_input.json index cec9d78..8a9c010 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -7,30 +7,60 @@ "items": { "type": "object", "properties": { - "sample": { + "id": { "type": "string", "pattern": "^\\S+$", - "errorMessage": "Sample name must be provided and cannot contain spaces" + "unique": true, + "errorMessage": "Sequence reference name must be provided and cannot contain spaces", + "meta": ["id"] }, - "fastq_1": { - "type": "string", - "pattern": "^\\S+\\.f(ast)?q\\.gz$", - "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" + "taxid": { + "type": "integer", + "unique": true, + "errorMessage": "Please provide a valid taxonomic ID in integer format", + "meta": ["taxid"] + }, + "fasta_dna": { + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+\\.(fasta|fas|fa|fna)(\\.gz)?$" + }, + { + "type": "string", + 
"maxLength": 0 + } + ], + "unique": true, + "errorMessage": "FASTA file for nucleotide sequence cannot contain spaces and must have a valid FASTA extension (fasta, fna, fa, fas, faa), optionally gzipped", + "exists": true, + "format": "file-path" }, - "fastq_2": { - "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'", + "fasta_aa": { "anyOf": [ { "type": "string", - "pattern": "^\\S+\\.f(ast)?q\\.gz$" + "pattern": "^\\S+\\.(fasta|fas|fa|faa)(\\.gz)?$" }, { "type": "string", "maxLength": 0 } - ] + ], + "unique": true, + "errorMessage": "FASTA file for amino acid reference sequence cannot contain spaces and must have a valid FASTA extension (fasta, fna, fa, fas, faa), optionally gzipped", + "exists": true, + "format": "file-path" } }, - "required": ["sample", "fastq_1"] + "required": ["id", "taxid"], + "anyOf": [ + { + "required": ["fasta_dna"] + }, + { + "required": ["fasta_aa"] + } + ] } } diff --git a/assets/test.csv b/assets/test.csv new file mode 100644 index 0000000..52ac082 --- /dev/null +++ b/assets/test.csv @@ -0,0 +1,3 @@ +id,taxid,fasta_dna,fasta_aa +Severe_acute_respiratory_syndrome_coronavirus_2,2697049,https://raw.githubusercontent.com/nf-core/test-datasets/createtaxdb/data/fasta/sarscov2.fasta,https://raw.githubusercontent.com/nf-core/test-datasets/createtaxdb/data/fasta/sarscov2.faa +Haemophilus_influenzae,727,https://raw.githubusercontent.com/nf-core/test-datasets/createtaxdb/data/fasta/haemophilus_influenzae.fna.gz, diff --git a/conf/modules.config b/conf/modules.config index d91c6ab..512da6e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -18,18 +18,6 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] - withName: SAMPLESHEET_CHECK { - publishDir = [ - path: { "${params.outdir}/pipeline_info" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] - } - - withName: FASTQC { - ext.args = '--quiet' - } - withName: CUSTOM_DUMPSOFTWAREVERSIONS { publishDir = [ path: { "${params.outdir}/pipeline_info" }, diff --git a/conf/test.config b/conf/test.config index b75cfc8..02c5cb4 100644 --- a/conf/test.config +++ b/conf/test.config @@ -20,10 +20,13 @@ params { max_time = '6.h' // Input data - // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv' - // Genome references - genome = 'R64-1-1' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/createtaxdb/samplesheets/test.csv' + + build_kaiju = true + build_diamond = true + + prot2taxid = 'https://raw.githubusercontent.com/nf-core/test-datasets/createtaxdb/data/taxonomy/prot.accession2taxid.gz' + nodesdmp = 'https://raw.githubusercontent.com/nf-core/test-datasets/createtaxdb/data/taxonomy/prot_nodes.dmp' + namesdmp = 'https://raw.githubusercontent.com/nf-core/test-datasets/createtaxdb/data/taxonomy/prot_names.dmp' } diff --git a/docs/output.md b/docs/output.md index ccf3960..1254ad4 100644 --- a/docs/output.md +++ b/docs/output.md @@ -12,32 +12,36 @@ The directories listed below will be created in the results directory after the The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: -- [FastQC](#fastqc) - Raw read QC - [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution -### FastQC +### Diamond
Output files -- `fastqc/` - - `*_fastqc.html`: FastQC report containing quality metrics. - - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. +- `diamond/` + - `.dmnd`: DIAMOND dmnd database file
-[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). +[DIAMOND](https://github.com/bbuchfink/diamond) is an accelerated BLAST-compatible local sequence aligner particularly used for protein alignment. -![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png) +The `dmnd` file can be given to one of the DIAMOND alignment commands with `diamond blast -d .dmnd` etc. -![MultiQC - FastQC mean quality scores plot](images/mqc_fastqc_quality.png) +### Kaiju -![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png) +
+Output files + +- `kaiju/` + - `.fmi`: Kaiju FMI index file + +
+ +[Kaiju](https://bioinformatics-centre.github.io/kaiju/) is a fast and sensitive taxonomic classifier for metagenomics utilising nucleotide to protein translations. -:::note -The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. -::: +The `fmi` file can be given to Kaiju itself with `kaiju -f .fmi` etc. ### MultiQC diff --git a/lib/WorkflowCreatetaxdb.groovy b/lib/WorkflowCreatetaxdb.groovy index 63827ee..ace0fe0 100755 --- a/lib/WorkflowCreatetaxdb.groovy +++ b/lib/WorkflowCreatetaxdb.groovy @@ -15,9 +15,9 @@ class WorkflowCreatetaxdb { genomeExistsError(params, log) - if (!params.fasta) { - Nextflow.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." - } + // if (!params.fasta) { + // Nextflow.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." + // } } // @@ -58,8 +58,9 @@ class WorkflowCreatetaxdb { // Uncomment function in methodsDescriptionText to render in MultiQC report def citation_text = [ "Tools used in the workflow included:", - "FastQC (Andrews 2010),", - "MultiQC (Ewels et al. 2016)", + params.build_diamond ? "DIAMOND (Buchfink et al. 2015)," : "", + params.build_kaiju ? "Kaiju (Menzel et al. 2016)," : "", + "and MultiQC (Ewels et al. 2016)", "." ].join(' ').trim() @@ -68,11 +69,11 @@ class WorkflowCreatetaxdb { public static String toolBibliographyText(params) { - // TODO Optionally add bibliographic entries to this list. // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "
  • Author (2023) Pub name, Journal, DOI
  • " : "", // Uncomment function in methodsDescriptionText to render in MultiQC report def reference_text = [ - "
  • Andrews S, (2010) FastQC, URL: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/).
  • ", + params.build_diamond ? "
  • Buchfink, B., Xie, C., & Huson, D. H. (2015). Fast and sensitive protein alignment using DIAMOND. Nature Methods, 12(1), 59–60. 10.1038/nmeth.3176
  • " : "", + params.build_kaiju ? "
  • Menzel, P., Ng, K. L., & Krogh, A. (2016). Fast and sensitive taxonomic classification for metagenomics with Kaiju. Nature Communications, 7, 11257. 10.1038/ncomms11257
  • " : "", "
  • Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics , 32(19), 3047–3048. doi: /10.1093/bioinformatics/btw354
  • " ].join(' ').trim() @@ -93,9 +94,8 @@ class WorkflowCreatetaxdb { meta["tool_citations"] = "" meta["tool_bibliography"] = "" - // TODO Only uncomment below if logic in toolCitationText/toolBibliographyText has been filled! - //meta["tool_citations"] = toolCitationText(params).replaceAll(", \\.", ".").replaceAll("\\. \\.", ".").replaceAll(", \\.", ".") - //meta["tool_bibliography"] = toolBibliographyText(params) + meta["tool_citations"] = toolCitationText(params).replaceAll(", \\.", ".").replaceAll("\\. \\.", ".").replaceAll(", \\.", ".") + meta["tool_bibliography"] = toolBibliographyText(params) def methods_text = mqc_methods_yaml.text diff --git a/modules.json b/modules.json index 9d8a6cd..b1dffc7 100644 --- a/modules.json +++ b/modules.json @@ -5,14 +5,29 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { + "cat/cat": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, "custom/dumpsoftwareversions": { "branch": "master", "git_sha": "bba7e362e4afead70653f84d8700588ea28d0f9e", "installed_by": ["modules"] }, - "fastqc": { + "diamond/makedb": { + "branch": "master", + "git_sha": "b29f6beb86d1d24d680277fb1a3f4de7b8b8a92c", + "installed_by": ["modules"] + }, + "kaiju/mkfmi": { + "branch": "master", + "git_sha": "7365564c402cbd01e9407810730efd10039997a3", + "installed_by": ["modules"] + }, + "malt/build": { "branch": "master", - "git_sha": "65ad3e0b9a4099592e1102e92e10455dc661cf53", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "multiqc": { diff --git a/modules/nf-core/cat/cat/environment.yml b/modules/nf-core/cat/cat/environment.yml new file mode 100644 index 0000000..17a04ef --- /dev/null +++ b/modules/nf-core/cat/cat/environment.yml @@ -0,0 +1,7 @@ +name: cat_cat +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::pigz=2.3.4 diff --git a/modules/nf-core/cat/cat/main.nf b/modules/nf-core/cat/cat/main.nf 
new file mode 100644 index 0000000..4264a92 --- /dev/null +++ b/modules/nf-core/cat/cat/main.nf @@ -0,0 +1,62 @@ +process CAT_CAT { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/pigz:2.3.4' : + 'biocontainers/pigz:2.3.4' }" + + input: + tuple val(meta), path(files_in) + + output: + tuple val(meta), path("${prefix}"), emit: file_out + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def file_list = files_in.collect { it.toString() } + + // | input | output | command1 | command2 | + // |-----------|------------|----------|----------| + // | gzipped | gzipped | cat | | + // | ungzipped | ungzipped | cat | | + // | gzipped | ungzipped | zcat | | + // | ungzipped | gzipped | cat | pigz | + + // Use input file ending as default + prefix = task.ext.prefix ?: "${meta.id}${file_list[0].substring(file_list[0].lastIndexOf('.'))}" + out_zip = prefix.endsWith('.gz') + in_zip = file_list[0].endsWith('.gz') + command1 = (in_zip && !out_zip) ? 'zcat' : 'cat' + command2 = (!in_zip && out_zip) ? 
"| pigz -c -p $task.cpus $args2" : '' + """ + $command1 \\ + $args \\ + ${file_list.join(' ')} \\ + $command2 \\ + > ${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ + + stub: + def file_list = files_in.collect { it.toString() } + prefix = task.ext.prefix ?: "${meta.id}${file_list[0].substring(file_list[0].lastIndexOf('.'))}" + """ + touch $prefix + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/cat/cat/meta.yml b/modules/nf-core/cat/cat/meta.yml new file mode 100644 index 0000000..00a8db0 --- /dev/null +++ b/modules/nf-core/cat/cat/meta.yml @@ -0,0 +1,36 @@ +name: cat_cat +description: A module for concatenation of gzipped or uncompressed files +keywords: + - concatenate + - gzip + - cat +tools: + - cat: + description: Just concatenation + documentation: https://man7.org/linux/man-pages/man1/cat.1.html + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - files_in: + type: file + description: List of compressed / uncompressed files + pattern: "*" +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - file_out: + type: file + description: Concatenated file. 
Will be gzipped if file_out ends with ".gz" + pattern: "${file_out}" +authors: + - "@erikrikarddaniel" + - "@FriederikeHanssen" +maintainers: + - "@erikrikarddaniel" + - "@FriederikeHanssen" diff --git a/modules/nf-core/cat/cat/tests/main.nf.test b/modules/nf-core/cat/cat/tests/main.nf.test new file mode 100644 index 0000000..5766daa --- /dev/null +++ b/modules/nf-core/cat/cat/tests/main.nf.test @@ -0,0 +1,153 @@ +nextflow_process { + + name "Test Process CAT_CAT" + script "../main.nf" + process "CAT_CAT" + tag "modules" + tag "modules_nfcore" + tag "cat" + tag "cat/cat" + + test("test_cat_unzipped_unzipped") { + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['genome_sizes'], checkIfExists: true) + ] + ] + """ + } + } + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + + test("test_cat_zipped_zipped") { + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_gff3_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['contigs_genome_maf_gz'], checkIfExists: true) + ] + ] + """ + } + } + then { + def lines = path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert process.success }, + { assert snapshot(lines[0..5]).match("test_cat_zipped_zipped_lines") }, + { assert snapshot(lines.size()).match("test_cat_zipped_zipped_size")} + ) + } + } + + test("test_cat_zipped_unzipped") { + config './nextflow_zipped_unzipped.config' + + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_gff3_gz'], checkIfExists: true), + 
file(params.test_data['sarscov2']['genome']['contigs_genome_maf_gz'], checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("test_cat_unzipped_zipped") { + config './nextflow_unzipped_zipped.config' + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['genome_sizes'], checkIfExists: true) + ] + ] + """ + } + } + then { + def lines = path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert process.success }, + { assert snapshot(lines[0..5]).match("test_cat_unzipped_zipped_lines") }, + { assert snapshot(lines.size()).match("test_cat_unzipped_zipped_size")} + ) + } + } + + test("test_cat_one_file_unzipped_zipped") { + config './nextflow_unzipped_zipped.config' + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + ] + """ + } + } + then { + def lines = path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert process.success }, + { assert snapshot(lines[0..5]).match("test_cat_one_file_unzipped_zipped_lines") }, + { assert snapshot(lines.size()).match("test_cat_one_file_unzipped_zipped_size")} + ) + } + } +} + diff --git a/modules/nf-core/cat/cat/tests/main.nf.test.snap b/modules/nf-core/cat/cat/tests/main.nf.test.snap new file mode 100644 index 0000000..423571b --- /dev/null +++ b/modules/nf-core/cat/cat/tests/main.nf.test.snap @@ -0,0 +1,121 @@ +{ + "test_cat_unzipped_zipped_size": { + "content": [ + 375 + ], + "timestamp": "2023-10-16T14:33:08.049445686" + }, + "test_cat_unzipped_unzipped": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + 
"test.fasta:md5,f44b33a0e441ad58b2d3700270e2dbe2" + ] + ], + "1": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ], + "file_out": [ + [ + { + "id": "test", + "single_end": true + }, + "test.fasta:md5,f44b33a0e441ad58b2d3700270e2dbe2" + ] + ], + "versions": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ] + } + ], + "timestamp": "2023-10-16T14:32:18.500464399" + }, + "test_cat_zipped_unzipped": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "cat.txt:md5,c439d3b60e7bc03e8802a451a0d9a5d9" + ] + ], + "1": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ], + "file_out": [ + [ + { + "id": "test", + "single_end": true + }, + "cat.txt:md5,c439d3b60e7bc03e8802a451a0d9a5d9" + ] + ], + "versions": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ] + } + ], + "timestamp": "2023-10-16T14:32:49.642741302" + }, + "test_cat_zipped_zipped_lines": { + "content": [ + [ + "MT192765.1\tGenbank\ttranscript\t259\t29667\t.\t+\t.\tID=unknown_transcript_1;geneID=orf1ab;gene_name=orf1ab", + "MT192765.1\tGenbank\tgene\t259\t21548\t.\t+\t.\tParent=unknown_transcript_1", + "MT192765.1\tGenbank\tCDS\t259\t13461\t.\t+\t0\tParent=unknown_transcript_1;exception=\"ribosomal slippage\";gbkey=CDS;gene=orf1ab;note=\"pp1ab;translated=by -1 ribosomal frameshift\";product=\"orf1ab polyprotein\";protein_id=QIK50426.1", + "MT192765.1\tGenbank\tCDS\t13461\t21548\t.\t+\t0\tParent=unknown_transcript_1;exception=\"ribosomal slippage\";gbkey=CDS;gene=orf1ab;note=\"pp1ab;translated=by -1 ribosomal frameshift\";product=\"orf1ab polyprotein\";protein_id=QIK50426.1", + "MT192765.1\tGenbank\tCDS\t21556\t25377\t.\t+\t0\tParent=unknown_transcript_1;gbkey=CDS;gene=S;note=\"structural protein\";product=\"surface glycoprotein\";protein_id=QIK50427.1", + "MT192765.1\tGenbank\tgene\t21556\t25377\t.\t+\t.\tParent=unknown_transcript_1" + ] + ], + "timestamp": "2023-10-16T14:32:33.629048645" + }, + "test_cat_unzipped_zipped_lines": { + "content": [ + [ 
+ ">MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome", + "GTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGT", + "GTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAG", + "TAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTTGTCCGG", + "GTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTT", + "ACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAG" + ] + ], + "timestamp": "2023-10-16T14:33:08.038830506" + }, + "test_cat_one_file_unzipped_zipped_lines": { + "content": [ + [ + ">MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome", + "GTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGT", + "GTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAG", + "TAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTTGTCCGG", + "GTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTT", + "ACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAG" + ] + ], + "timestamp": "2023-10-16T14:33:21.39642399" + }, + "test_cat_zipped_zipped_size": { + "content": [ + 78 + ], + "timestamp": "2023-10-16T14:32:33.641869244" + }, + "test_cat_one_file_unzipped_zipped_size": { + "content": [ + 374 + ], + "timestamp": "2023-10-16T14:33:21.4094373" + } +} \ No newline at end of file diff --git a/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config b/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config new file mode 100644 index 0000000..ec26b0f --- /dev/null +++ b/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config @@ -0,0 +1,6 @@ + +process { + withName: CAT_CAT { + ext.prefix = 'cat.txt.gz' + } +} diff --git a/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config 
b/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config new file mode 100644 index 0000000..fbc7978 --- /dev/null +++ b/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config @@ -0,0 +1,8 @@ + +process { + + withName: CAT_CAT { + ext.prefix = 'cat.txt' + } + +} diff --git a/modules/nf-core/cat/cat/tests/tags.yml b/modules/nf-core/cat/cat/tests/tags.yml new file mode 100644 index 0000000..37b578f --- /dev/null +++ b/modules/nf-core/cat/cat/tests/tags.yml @@ -0,0 +1,2 @@ +cat/cat: + - modules/nf-core/cat/cat/** diff --git a/modules/nf-core/diamond/makedb/environment.yml b/modules/nf-core/diamond/makedb/environment.yml new file mode 100644 index 0000000..0196bd6 --- /dev/null +++ b/modules/nf-core/diamond/makedb/environment.yml @@ -0,0 +1,7 @@ +name: diamond_makedb +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::diamond=2.1.8 diff --git a/modules/nf-core/diamond/makedb/main.nf b/modules/nf-core/diamond/makedb/main.nf new file mode 100644 index 0000000..94011cf --- /dev/null +++ b/modules/nf-core/diamond/makedb/main.nf @@ -0,0 +1,65 @@ +process DIAMOND_MAKEDB { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/diamond:2.1.8--h43eeafb_0' : + 'biocontainers/diamond:2.1.8--h43eeafb_0' }" + + input: + tuple val(meta), path(fasta) + path taxonmap + path taxonnodes + path taxonnames + + output: + tuple val(meta), path("*.dmnd"), emit: db + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def is_compressed = fasta.getExtension() == "gz" ? true : false + def fasta_name = is_compressed ? fasta.getBaseName() : fasta + def insert_taxonmap = taxonmap ? "--taxonmap $taxonmap" : "" + def insert_taxonnodes = taxonnodes ? 
"--taxonnodes $taxonnodes" : "" + def insert_taxonnames = taxonnames ? "--taxonnames $taxonnames" : "" + + """ + if [ "${is_compressed}" == "true" ]; then + gzip -c -d ${fasta} > ${fasta_name} + fi + + diamond \\ + makedb \\ + --threads ${task.cpus} \\ + --in ${fasta_name} \\ + -d ${prefix} \\ + ${args} \\ + ${insert_taxonmap} \\ + ${insert_taxonnodes} \\ + ${insert_taxonnames} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + diamond: \$(diamond --version 2>&1 | tail -n 1 | sed 's/^diamond version //') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + touch ${prefix}.dmnd + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + diamond: \$(diamond --version 2>&1 | tail -n 1 | sed 's/^diamond version //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/diamond/makedb/meta.yml b/modules/nf-core/diamond/makedb/meta.yml new file mode 100644 index 0000000..fdcf032 --- /dev/null +++ b/modules/nf-core/diamond/makedb/meta.yml @@ -0,0 +1,57 @@ +name: diamond_makedb +description: Builds a DIAMOND database +keywords: + - fasta + - diamond + - index + - database +tools: + - diamond: + description: Accelerated BLAST compatible local sequence aligner + homepage: https://github.com/bbuchfink/diamond + documentation: https://github.com/bbuchfink/diamond/wiki + tool_dev_url: https://github.com/bbuchfink/diamond + doi: "10.1038/s41592-021-01101-x" + licence: ["GPL v3.0"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Input fasta file + pattern: "*.{fa,fasta,fa.gz,fasta.gz}" + - taxonmap: + type: file + description: Optional mapping file of NCBI protein accession numbers to taxon ids (gzip compressed), required for taxonomy functionality. 
+ pattern: "*.gz" + - taxonnodes: + type: file + description: Optional NCBI taxonomy nodes.dmp file, required for taxonomy functionality. + pattern: "*.dmp" + - taxonnames: + type: file + description: Optional NCBI taxonomy names.dmp file, required for taxonomy functionality. + pattern: "*.dmp" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - db: + type: file + description: File of the indexed DIAMOND database + pattern: "*.dmnd" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@spficklin" +maintainers: + - "@spficklin" + - "@vagkaratzas" + - "@jfy133" diff --git a/modules/nf-core/diamond/makedb/tests/main.nf.test b/modules/nf-core/diamond/makedb/tests/main.nf.test new file mode 100644 index 0000000..e9f7df2 --- /dev/null +++ b/modules/nf-core/diamond/makedb/tests/main.nf.test @@ -0,0 +1,89 @@ +nextflow_process { + + name "Test Process DIAMOND_MAKEDB" + script "../main.nf" + process "DIAMOND_MAKEDB" + tag "modules" + tag "modules_nfcore" + tag "diamond" + tag "diamond/makedb" + + test("Should build a DIAMOND db file from a fasta file without taxonomic information") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [id:'test'], [ file(params.test_data['sarscov2']['genome']['proteome_fasta'], checkIfExists: true) ] ] + input[1] = [] + input[2] = [] + input[3] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.db).match("db") }, + { assert process.out.versions } + ) + } + + } + + test("Should build a DIAMOND db file from a zipped fasta file without taxonomic information") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [id:'test'], [ file(params.test_data['sarscov2']['genome']['proteome_fasta_gz'], checkIfExists: true) ] ] + input[1] = [] + input[2] = [] + input[3] = [] + """ + } + } + + then { 
+ assertAll( + { assert process.success }, + { assert snapshot(process.out.db).match("gz_db") }, + { assert process.out.versions } + ) + } + + } + + test("Should build a DIAMOND db file from a fasta file with taxonomic information") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [id:'test'], [ file(params.test_data['sarscov2']['genome']['proteome_fasta'], checkIfExists: true) ] ] + input[1] = [ file(params.test_data['sarscov2']['metagenome']['prot_accession2taxid_gz'], checkIfExists: true) ] + input[2] = [ file(params.test_data['sarscov2']['metagenome']['prot_nodes_dmp'], checkIfExists: true) ] + input[3] = [ file(params.test_data['sarscov2']['metagenome']['prot_names_dmp'], checkIfExists: true) ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.db).match("db_tax") }, + { assert process.out.versions } + ) + } + + } + +} diff --git a/modules/nf-core/diamond/makedb/tests/main.nf.test.snap b/modules/nf-core/diamond/makedb/tests/main.nf.test.snap new file mode 100644 index 0000000..a8fe065 --- /dev/null +++ b/modules/nf-core/diamond/makedb/tests/main.nf.test.snap @@ -0,0 +1,41 @@ +{ + "db_tax": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.dmnd:md5,9d57aa88cd1766adfda8360876fc0e4f" + ] + ] + ], + "timestamp": "2023-12-05T14:28:48.616684409" + }, + "db": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.dmnd:md5,6039420745dd4db6e761244498460ae1" + ] + ] + ], + "timestamp": "2023-12-05T14:22:02.696726038" + }, + "gz_db": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.dmnd:md5,6039420745dd4db6e761244498460ae1" + ] + ] + ], + "timestamp": "2023-12-05T14:22:07.430248018" + } +} \ No newline at end of file diff --git a/modules/nf-core/diamond/makedb/tests/tags.yml b/modules/nf-core/diamond/makedb/tests/tags.yml new file mode 100644 index 0000000..6fc7762 --- /dev/null +++ b/modules/nf-core/diamond/makedb/tests/tags.yml @@ -0,0 +1,2 @@ +diamond/makedb: + - 
modules/nf-core/diamond/makedb/** diff --git a/modules/nf-core/fastqc/main.nf b/modules/nf-core/fastqc/main.nf deleted file mode 100644 index 9e19a74..0000000 --- a/modules/nf-core/fastqc/main.nf +++ /dev/null @@ -1,55 +0,0 @@ -process FASTQC { - tag "$meta.id" - label 'process_medium' - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/fastqc:0.12.1--hdfd78af_0' : - 'biocontainers/fastqc:0.12.1--hdfd78af_0' }" - - input: - tuple val(meta), path(reads) - - output: - tuple val(meta), path("*.html"), emit: html - tuple val(meta), path("*.zip") , emit: zip - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - // Make list of old name and new name pairs to use for renaming in the bash while loop - def old_new_pairs = reads instanceof Path || reads.size() == 1 ? 
[[ reads, "${prefix}.${reads.extension}" ]] : reads.withIndex().collect { entry, index -> [ entry, "${prefix}_${index + 1}.${entry.extension}" ] } - def rename_to = old_new_pairs*.join(' ').join(' ') - def renamed_files = old_new_pairs.collect{ old_name, new_name -> new_name }.join(' ') - """ - printf "%s %s\\n" $rename_to | while read old_name new_name; do - [ -f "\${new_name}" ] || ln -s \$old_name \$new_name - done - - fastqc \\ - $args \\ - --threads $task.cpus \\ - $renamed_files - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - fastqc: \$( fastqc --version | sed '/FastQC v/!d; s/.*v//' ) - END_VERSIONS - """ - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - """ - touch ${prefix}.html - touch ${prefix}.zip - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - fastqc: \$( fastqc --version | sed '/FastQC v/!d; s/.*v//' ) - END_VERSIONS - """ -} diff --git a/modules/nf-core/fastqc/meta.yml b/modules/nf-core/fastqc/meta.yml deleted file mode 100644 index ee5507e..0000000 --- a/modules/nf-core/fastqc/meta.yml +++ /dev/null @@ -1,57 +0,0 @@ -name: fastqc -description: Run FastQC on sequenced reads -keywords: - - quality control - - qc - - adapters - - fastq -tools: - - fastqc: - description: | - FastQC gives general quality metrics about your reads. - It provides information about the quality score distribution - across your reads, the per base sequence content (%A/C/G/T). - You get information about adapter contamination and other - overrepresented sequences. - homepage: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/ - documentation: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/ - licence: ["GPL-2.0-only"] -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - reads: - type: file - description: | - List of input FastQ files of size 1 and 2 for single-end and paired-end data, - respectively. 
-output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - html: - type: file - description: FastQC report - pattern: "*_{fastqc.html}" - - zip: - type: file - description: FastQC report archive - pattern: "*_{fastqc.zip}" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - "@drpatelh" - - "@grst" - - "@ewels" - - "@FelixKrueger" -maintainers: - - "@drpatelh" - - "@grst" - - "@ewels" - - "@FelixKrueger" diff --git a/modules/nf-core/fastqc/tests/main.nf.test b/modules/nf-core/fastqc/tests/main.nf.test deleted file mode 100644 index b9e8f92..0000000 --- a/modules/nf-core/fastqc/tests/main.nf.test +++ /dev/null @@ -1,109 +0,0 @@ -nextflow_process { - - name "Test Process FASTQC" - script "../main.nf" - process "FASTQC" - tag "modules" - tag "modules_nfcore" - tag "fastqc" - - test("Single-Read") { - - when { - params { - outdir = "$outputDir" - } - process { - """ - input[0] = [ - [ id: 'test', single_end:true ], - [ - file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) - ] - ] - """ - } - } - - then { - assertAll ( - { assert process.success }, - // NOTE The report contains the date inside it, which means that the md5sum is stable per day, but not longer than that. So you can't md5sum it. - // looks like this:
    Mon 2 Oct 2023
    test.gz
    - // https://github.com/nf-core/modules/pull/3903#issuecomment-1743620039 - { assert process.out.html.get(0).get(1) ==~ ".*/test_fastqc.html" }, - { assert path(process.out.html.get(0).get(1)).getText().contains("File typeConventional base calls") }, - { assert snapshot(process.out.versions).match("versions") }, - { assert process.out.zip.get(0).get(1) ==~ ".*/test_fastqc.zip" } - ) - } - } -// TODO -// // -// // Test with paired-end data -// // -// workflow test_fastqc_paired_end { -// input = [ -// [id: 'test', single_end: false], // meta map -// [ -// file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), -// file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) -// ] -// ] - -// FASTQC ( input ) -// } - -// // -// // Test with interleaved data -// // -// workflow test_fastqc_interleaved { -// input = [ -// [id: 'test', single_end: false], // meta map -// file(params.test_data['sarscov2']['illumina']['test_interleaved_fastq_gz'], checkIfExists: true) -// ] - -// FASTQC ( input ) -// } - -// // -// // Test with bam data -// // -// workflow test_fastqc_bam { -// input = [ -// [id: 'test', single_end: false], // meta map -// file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true) -// ] - -// FASTQC ( input ) -// } - -// // -// // Test with multiple samples -// // -// workflow test_fastqc_multiple { -// input = [ -// [id: 'test', single_end: false], // meta map -// [ -// file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), -// file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true), -// file(params.test_data['sarscov2']['illumina']['test2_1_fastq_gz'], checkIfExists: true), -// file(params.test_data['sarscov2']['illumina']['test2_2_fastq_gz'], checkIfExists: true) -// ] -// ] - -// FASTQC ( input ) -// } - -// // -// // Test with custom prefix -// // -// workflow test_fastqc_custom_prefix { 
-// input = [ -// [ id:'mysample', single_end:true ], // meta map -// file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) -// ] - -// FASTQC ( input ) -// } -} diff --git a/modules/nf-core/fastqc/tests/main.nf.test.snap b/modules/nf-core/fastqc/tests/main.nf.test.snap deleted file mode 100644 index 636a32c..0000000 --- a/modules/nf-core/fastqc/tests/main.nf.test.snap +++ /dev/null @@ -1,10 +0,0 @@ -{ - "versions": { - "content": [ - [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ] - ], - "timestamp": "2023-10-09T23:40:54+0000" - } -} \ No newline at end of file diff --git a/modules/nf-core/fastqc/tests/tags.yml b/modules/nf-core/fastqc/tests/tags.yml deleted file mode 100644 index 7834294..0000000 --- a/modules/nf-core/fastqc/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -fastqc: - - modules/nf-core/fastqc/** diff --git a/modules/nf-core/kaiju/mkfmi/main.nf b/modules/nf-core/kaiju/mkfmi/main.nf new file mode 100644 index 0000000..bd9bad6 --- /dev/null +++ b/modules/nf-core/kaiju/mkfmi/main.nf @@ -0,0 +1,36 @@ +process KAIJU_MKFMI { + tag "$meta.id" + label 'process_high' + + conda "bioconda::kaiju=1.9.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/kaiju:1.9.2--h5b5514e_0': + 'biocontainers/kaiju:1.9.2--h5b5514e_0' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path("*.fmi"), emit: fmi + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + kaiju-mkbwt \\ + $args \\ + -n $task.cpus \\ + -o ${prefix} \\ + ${fasta} + kaiju-mkfmi ${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kaiju: \$(echo \$( kaiju -h 2>&1 | sed -n 1p | sed 's/^.*Kaiju //' )) + END_VERSIONS + """ +} diff --git a/modules/nf-core/kaiju/mkfmi/meta.yml b/modules/nf-core/kaiju/mkfmi/meta.yml new file mode 100644 index 0000000..fe06f35 --- /dev/null +++ b/modules/nf-core/kaiju/mkfmi/meta.yml @@ -0,0 +1,44 @@ +name: "kaiju_mkfmi" +description: Make Kaiju FMI-index file from a protein FASTA file +keywords: + - classify + - metagenomics + - fastq + - taxonomic profiling + - database + - index +tools: + - "kaiju": + description: "Fast and sensitive taxonomic classification for metagenomics" + homepage: "https://bioinformatics-centre.github.io/kaiju/" + documentation: "https://github.com/bioinformatics-centre/kaiju/blob/master/README.md" + tool_dev_url: "https://github.com/bioinformatics-centre/kaiju" + doi: "10.1038/ncomms11257" + licence: ["GNU GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test', single_end:false ]` + - fasta: + type: file + description: Uncompressed Protein FASTA file (mandatory) + pattern: "*.{fa,faa,fasta}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'test', single_end:false ]` + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - fmi: + type: file + description: Kaiju FM-index file + pattern: "*.{fmi}" + +authors: + - "@alxndrdiaz" diff --git a/modules/nf-core/fastqc/environment.yml b/modules/nf-core/malt/build/environment.yml similarity index 61% rename from modules/nf-core/fastqc/environment.yml rename to modules/nf-core/malt/build/environment.yml index 1787b38..a3b37a6 100644 --- a/modules/nf-core/fastqc/environment.yml +++ b/modules/nf-core/malt/build/environment.yml @@ -1,7 +1,7 @@ -name: fastqc +name: malt_build channels: - conda-forge - bioconda - defaults dependencies: - - bioconda::fastqc=0.12.1 + - bioconda::malt=0.61 diff --git a/modules/nf-core/malt/build/main.nf b/modules/nf-core/malt/build/main.nf new file mode 100644 index 0000000..f55aeee --- /dev/null +++ b/modules/nf-core/malt/build/main.nf @@ -0,0 +1,43 @@ +process MALT_BUILD { + + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/malt:0.61--hdfd78af_0' : + 'biocontainers/malt:0.61--hdfd78af_0' }" + + input: + path fastas + path gff + path mapping_db + + output: + path "malt_index/" , emit: index + path "versions.yml" , emit: versions + path "malt-build.log", emit: log + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + + def igff = gff ? 
"-igff ${gff}" : "" + + """ + malt-build \\ + -v \\ + --input ${fastas.join(' ')} \\ + $igff \\ + -d 'malt_index/' \\ + -t $task.cpus \\ + $args \\ + -mdb ${mapping_db}/*.db |&tee malt-build.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + malt: \$(malt-build --help |& tail -n 3 | head -n 1 | cut -f 2 -d'(' | cut -f 1 -d ',' | cut -d ' ' -f 2) + END_VERSIONS + """ +} diff --git a/modules/nf-core/malt/build/meta.yml b/modules/nf-core/malt/build/meta.yml new file mode 100644 index 0000000..b95fc52 --- /dev/null +++ b/modules/nf-core/malt/build/meta.yml @@ -0,0 +1,49 @@ +name: malt_build +description: MALT, an acronym for MEGAN alignment tool, is a sequence alignment and analysis tool designed for processing high-throughput sequencing data, especially in the context of metagenomics. +keywords: + - malt + - alignment + - metagenomics + - ancient DNA + - aDNA + - palaeogenomics + - archaeogenomics + - microbiome + - database +tools: + - malt: + description: A tool for mapping metagenomic data + homepage: https://www.wsi.uni-tuebingen.de/lehrstuehle/algorithms-in-bioinformatics/software/malt/ + documentation: https://software-ab.cs.uni-tuebingen.de/download/malt/manual.pdf + doi: "10.1038/s41559-017-0446-6" + licence: ["GPL v3"] +input: + - fastas: + type: file + description: Directory of, or list of FASTA reference files for indexing + pattern: "*/|*.fasta" + - gff: + type: file + description: Directory of, or GFF3 files of input FASTA files + pattern: "*/|*.gff|*.gff3" + - mapping_db: + type: file + description: MEGAN .db file from https://software-ab.cs.uni-tuebingen.de/download/megan6/welcome.html + pattern: "*.db" +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - index: + type: directory + description: Directory containing MALT database index directory + pattern: "malt_index/" + - log: + type: file + description: Log file from STD out of malt-build + pattern: "malt-build.log" +authors: 
+ - "@jfy133" +maintainers: + - "@jfy133" diff --git a/nextflow.config b/nextflow.config index 7b88cca..4eada08 100644 --- a/nextflow.config +++ b/nextflow.config @@ -16,7 +16,7 @@ params { genome = null igenomes_base = 's3://ngi-igenomes/igenomes/' igenomes_ignore = false - + // MultiQC options multiqc_config = null @@ -43,7 +43,7 @@ params { custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" config_profile_contact = null config_profile_url = null - + // Max resource options // Defaults only, expecting to be overwritten @@ -58,6 +58,18 @@ params { validationShowHiddenParams = false validate_params = true + // General parameters + dbname = "database" + + prot2taxid = null + nuc2taxid = null + nodesdmp = null + namesdmp = null + + // tool specific options + build_kaiju = false + build_diamond = false + } // Load base.config by default for all pipelines diff --git a/nextflow_schema.json b/nextflow_schema.json index cea6bc0..6114641 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -19,8 +19,9 @@ "mimetype": "text/csv", "pattern": "^\\S+\\.csv$", "description": "Path to comma-separated file containing information about the samples in the experiment.", - "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/createtaxdb/usage#samplesheet-input).", - "fa_icon": "fas fa-file-csv" + "help_text": "You will need to create a design file with information about the reference genomes you wish to build into a metagenomic profiling database. Use this parameter to specify its location. It has to be a comma-separated file with 4 columns, and a header row. 
See [usage docs](https://nf-co.re/createtaxdb/usage#samplesheet-input).", + "fa_icon": "fas fa-file-csv", + "schema": "assets/schema_input.json" }, "outdir": { "type": "string", @@ -39,9 +40,58 @@ "type": "string", "description": "MultiQC report title. Printed as page header, used for filename if not otherwise specified.", "fa_icon": "fas fa-file-signature" + }, + "dbname": { + "type": "string", + "default": "database", + "description": "Specify the name that the resulting databases will be prefixed with.", + "fa_icon": "fas fa-id-badge" + }, + "prot2taxid": { + "type": "string", + "fa_icon": "fas fa-address-card", + "description": "Path to (NCBI-style) protein accession2taxid file.", + "help_text": "A two-column, tab-separated file with `accession.version` and `taxid` columns. The first column refers to an accession ID in each FASTA entry header. The second column refers to the taxonomy ID of the organism the sequence belongs to, as listed in `nodes.dmp`." + }, + "nuc2taxid": { + "type": "string", + "fa_icon": "far fa-address-card", + "description": "Path to (NCBI-style) nucleotide accession2taxid file.", + "help_text": "A two-column, tab-separated file with `accession.version` and `taxid` columns. The first column refers to an accession ID in each FASTA entry header. The second column refers to the taxonomy ID of the organism the sequence belongs to, as listed in `nodes.dmp`." + }, + "nodesdmp": { + "type": "string", + "fa_icon": "fas fa-circle", + "description": "Path to NCBI-style taxonomy nodes dmp file.", + "help_text": "A tab/pipe/tab separated table file. See nodes.dmp section of [NCBI taxdump README](https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/taxdump_readme.txt) for the column structure of the file." + }, + "namesdmp": { + "type": "string", + "fa_icon": "fas fa-tag", + "description": "Path to NCBI-style taxonomy names dmp file.", + "help_text": "A tab/pipe/tab separated table file. 
See names.dmp section of [NCBI taxdump README](https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/taxdump_readme.txt) for column file structure." } } }, + "database_building_options": { + "title": "Database Building Options", + "type": "object", + "description": "", + "default": "", + "properties": { + "build_diamond": { + "type": "boolean", + "fa_icon": "fas fa-toggle-on", + "description": "Turn on building of DIAMOND database. Requires amino-acid FASTA file input." + }, + "build_kaiju": { + "type": "boolean", + "description": "Turn on building of Kaiju database. Requires amino-acid FASTA file input.", + "fa_icon": "fas fa-toggle-on" + } + }, + "fa_icon": "fas fa-database" + }, "reference_genome_options": { "title": "Reference genome options", "type": "object", @@ -52,7 +102,8 @@ "type": "string", "description": "Name of iGenomes reference.", "fa_icon": "fas fa-book", - "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." + "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details.", + "hidden": true }, "fasta": { "type": "string", @@ -62,7 +113,8 @@ "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$", "description": "Path to FASTA genome file.", "help_text": "This parameter is *mandatory* if `--genome` is not specified. If you don't have a BWA index available this will be generated for you automatically. 
Combine with `--save_reference` to save BWA index for future runs.", - "fa_icon": "far fa-file-code" + "fa_icon": "far fa-file-code", + "hidden": true }, "igenomes_ignore": { "type": "boolean", @@ -272,6 +324,9 @@ { "$ref": "#/definitions/input_output_options" }, + { + "$ref": "#/definitions/database_building_options" + }, { "$ref": "#/definitions/reference_genome_options" }, diff --git a/nf-test.config b/nf-test.config new file mode 100644 index 0000000..e92f96b --- /dev/null +++ b/nf-test.config @@ -0,0 +1,16 @@ +config { + // location for all nf-tests + testsDir "tests/" + + // nf-test directory including temporary files for each test + workDir "/tmp" + + // location of library folder that is added automatically to the classpath + libDir "lib/" + + // location of an optional nextflow.config file specific for executing tests + configFile "nextflow.config" + + // default profile(s) from the main nextflow.config applied to all tests; left empty here, so pass one at run time (e.g. `--profile test,docker`) + profile "" +} diff --git a/tests/nextflow.config b/tests/nextflow.config new file mode 100644 index 0000000..c19b1ad --- /dev/null +++ b/tests/nextflow.config @@ -0,0 +1,5 @@ +/* +======================================================================================== + Nextflow config file for running tests +======================================================================================== +*/ diff --git a/tests/tags.yml b/tests/tags.yml new file mode 100644 index 0000000..1e63ed9 --- /dev/null +++ b/tests/tags.yml @@ -0,0 +1,2 @@ +nfcore_createtaxdb: + - ./** diff --git a/tests/test.nf.test b/tests/test.nf.test new file mode 100644 index 0000000..6b4e9a8 --- /dev/null +++ b/tests/test.nf.test @@ -0,0 +1,29 @@ +nextflow_pipeline { + + name "Test pipeline: NFCORE_CREATETAXDB" + script "main.nf" + tag "pipeline" + tag "nfcore_createtaxdb" + tag "test" + + test("test_profile") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot( +
path("$outputDir/diamond/database.dmnd"), + path("$outputDir/kaiju/database.fmi"), + ).match() }, + { assert new File("$outputDir/pipeline_info/software_versions.yml").exists() }, + { assert new File("$outputDir/multiqc/multiqc_report.html").exists() } + ) + } + } +} diff --git a/tests/test.nf.test.snap b/tests/test.nf.test.snap new file mode 100644 index 0000000..21c8776 --- /dev/null +++ b/tests/test.nf.test.snap @@ -0,0 +1,9 @@ +{ + "test_profile": { + "content": [ + "database.dmnd:md5,9d57aa88cd1766adfda8360876fc0e4f", + "database.fmi:md5,54fd89f5e4eab61af30175e8aa389598" + ], + "timestamp": "2023-12-14T12:55:54.070245428" + } +} \ No newline at end of file diff --git a/workflows/createtaxdb.nf b/workflows/createtaxdb.nf index ff56f63..b27b79a 100644 --- a/workflows/createtaxdb.nf +++ b/workflows/createtaxdb.nf @@ -4,7 +4,7 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { paramsSummaryLog; paramsSummaryMap } from 'plugin/nf-validation' +include { paramsSummaryLog; paramsSummaryMap; fromSamplesheet } from 'plugin/nf-validation' def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) def citation = '\n' + WorkflowMain.citation(workflow) + '\n' @@ -15,6 +15,19 @@ log.info logo + paramsSummaryLog(workflow) + citation WorkflowCreatetaxdb.initialise(params, log) +// Validate input files parameters (from Sarek) +def checkPathParamList = [ + params.prot2taxid, + params.nuc2taxid, + params.nodesdmp, + params.namesdmp, +] + +for (param in checkPathParamList) if (param) file(param, checkIfExists: true) + +// Validate parameter combinations +if ( params.build_diamond && [!params.prot2taxid, !params.nodesdmp, !params.namesdmp,].any() ) { error('[nf-core/createtaxdb] Supplied --build_diamond, but missing at least one of: --prot2taxid, --nodesdmp, or --namesdmp (all are mandatory for DIAMOND)') } + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONFIG FILES @@ -32,11 
+45,6 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? fil ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -// -// SUBWORKFLOW: Consisting of a mix of local and nf-core/modules -// -include { INPUT_CHECK } from '../subworkflows/local/input_check' - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT NF-CORE MODULES/SUBWORKFLOWS @@ -46,10 +54,13 @@ include { INPUT_CHECK } from '../subworkflows/local/input_check' // // MODULE: Installed directly from nf-core/modules // -include { FASTQC } from '../modules/nf-core/fastqc/main' + include { MULTIQC } from '../modules/nf-core/multiqc/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' +include { CAT_CAT as CAT_CAT_AA } from '../modules/nf-core/cat/cat/main' +include { KAIJU_MKFMI } from '../modules/nf-core/kaiju/mkfmi/main' +include { DIAMOND_MAKEDB } from '../modules/nf-core/diamond/makedb/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN MAIN WORKFLOW @@ -64,23 +75,47 @@ workflow CREATETAXDB { ch_versions = Channel.empty() // - // SUBWORKFLOW: Read in samplesheet, validate and stage input files + // INPUT: Read in samplesheet, validate and stage input files // - INPUT_CHECK ( - file(params.input) - ) - ch_versions = ch_versions.mix(INPUT_CHECK.out.versions) - // TODO: OPTIONAL, you can use nf-validation plugin to create an input channel from the samplesheet with Channel.fromSamplesheet("input") - // See the documentation https://nextflow-io.github.io/nf-validation/samplesheets/fromSamplesheet/ - // ! There is currently no tooling to help you write a sample sheet schema + ch_input = Channel.fromSamplesheet("input") + + // Prepare input for single file inputs modules + + // TODO: Need to have a modification step to get header correct to actually run with kaiju... + // TEST first! 
+ // docs: https://github.com/bioinformatics-centre/kaiju#custom-database + // docs: https://github.com/nf-core/test-datasets/tree/taxprofiler#kaiju + // idea: try just appending `_` to end of each sequence header using a local sed module... it might be sufficient + if ( [params.build_kaiju, params.build_diamond].any() ) { + + // Pull just AA sequences + ch_refs_for_singleref = ch_input + .map{meta, fasta_dna, fasta_aa -> [[id: params.dbname], fasta_aa]} + .filter{meta, fasta_aa -> + fasta_aa + } + .groupTuple() + + // Place in single file + ch_singleref_for_aa = CAT_CAT_AA ( ch_refs_for_singleref ) + ch_versions = ch_versions.mix(CAT_CAT_AA.out.versions.first()) + } // - // MODULE: Run FastQC + // MODULE: Run KAIJU/MKFMI // - FASTQC ( - INPUT_CHECK.out.reads - ) - ch_versions = ch_versions.mix(FASTQC.out.versions.first()) + + if ( params.build_kaiju ) { + KAIJU_MKFMI ( CAT_CAT_AA.out.file_out ) + ch_versions = ch_versions.mix(KAIJU_MKFMI.out.versions.first()) + } + + // TODO + // - nf-test + if ( params.build_diamond ) { + DIAMOND_MAKEDB ( CAT_CAT_AA.out.file_out, params.prot2taxid, params.nodesdmp, params.namesdmp ) + ch_versions = ch_versions.mix(DIAMOND_MAKEDB.out.versions.first()) + } CUSTOM_DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml') @@ -99,7 +134,6 @@ workflow CREATETAXDB { ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) - ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) MULTIQC ( ch_multiqc_files.collect(),