Merge branch 'dev' into nf-core-template-merge-3.0.0

nf-core · Oct 8, 2024 · 9db7722 · 9db7722
2 parents c2908ae + 4b056b2
commit 9db7722
Show file tree

Hide file tree

Showing 126 changed files with 7,315 additions and 1,104 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -1,36 +1,65 @@
-name: nf-core CI
 # This workflow runs the pipeline with the minimal test dataset to check that it completes without any syntax errors
+name: nf-core CI
 on:
   push:
     branches:
-      - dev
+      - "dev"
   pull_request:
+    branches:
+      - "dev"
+      - "master"
   release:
     types: [published]
   workflow_dispatch:
 
 env:
   NXF_ANSI_LOG: false
+  NFTEST_VER: "0.7.3"
 
 concurrency:
-  group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}"
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
   cancel-in-progress: true
 
 jobs:
+  # Emits the JSON list of Nextflow versions consumed by the `test` job's matrix:
+  # pull requests targeting `dev` get "latest-everything" only; every other
+  # trigger additionally gets "23.10.0".
+  define_nxf_versions:
+    name: Choose nextflow versions to test against depending on target branch
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.nxf_versions.outputs.matrix }}
+    steps:
+      - id: nxf_versions
+        run: |
+          # This job defines no strategy.matrix, so only the event and base branch decide.
+          if [[ "${{ github.event_name }}" == "pull_request" && "${{ github.base_ref }}" == "dev" ]]; then
+            echo matrix='["latest-everything"]' | tee -a "$GITHUB_OUTPUT"
+          else
+            echo matrix='["latest-everything", "23.10.0"]' | tee -a "$GITHUB_OUTPUT"
+          fi
+
   test:
-    name: Run pipeline with test data
-    # Only run on push if this is the nf-core dev branch (merged PRs)
-    if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/createtaxdb') }}"
+    name: nf-test
+    needs: define_nxf_versions
     runs-on: ubuntu-latest
     strategy:
+      fail-fast: false
       matrix:
-        NXF_VER:
-          - "24.04.2"
-          - "latest-everything"
+        NXF_VER: ${{ fromJson(needs.define_nxf_versions.outputs.matrix) }}
+        tags:
+          - "test"
+        profile:
+          - "docker"
+
     steps:
       - name: Check out pipeline code
         uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4
 
+      # Shallow-clone the createtaxdb branch of nf-core/test-datasets into ./test-datasets/.
+      # Pinned to the same commit as the pipeline checkout step above instead of a floating tag.
+      - name: Check out test data
+        uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4
+        with:
+          repository: nf-core/test-datasets
+          ref: createtaxdb
+          path: test-datasets/
+          fetch-depth: 1
+
       - name: Install Nextflow
         uses: nf-core/setup-nextflow@v2
         with:
@@ -39,21 +68,23 @@ jobs:
       - name: Disk space cleanup
         uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1
 
-      - name: Run pipeline with test data (docker)
-        # TODO nf-core: You can customise CI pipeline run tests as required
-        # For example: adding multiple test runs with different parameters
-        # Remember that you can parallelise this by using strategy.matrix
+      - name: Install nf-test
         run: |
-          nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results
+          wget -qO- https://code.askimed.com/install/nf-test | bash -s $NFTEST_VER
+          sudo mv nf-test /usr/local/bin/
 
-      - name: Run pipeline with test data (singularity)
-        # TODO nf-core: You can customise CI pipeline run tests as required
+      - name: Run nf-test
         run: |
-          nextflow run ${GITHUB_WORKSPACE} -profile test,singularity --outdir ./results
-        if: "${{ github.base_ref == 'master' }}"
+          nf-test test --tag ${{ matrix.tags }} --profile ${{ matrix.tags }},${{ matrix.profile }} --junitxml=test.xml
 
-      - name: Run pipeline with test data (conda)
-        # TODO nf-core: You can customise CI pipeline run tests as required
+      - name: Output log on failure
+        if: failure()
         run: |
-          nextflow run ${GITHUB_WORKSPACE} -profile test,conda --outdir ./results
-        if: "${{ github.base_ref == 'master' }}"
+          sudo apt install bat > /dev/null
+          batcat --decorations=always --color=always ${{ github.workspace }}/.nf-test/tests/*/output/pipeline_info/software_versions.yml
+
+      - name: Publish Test Report
+        uses: mikepenz/action-junit-report@v3
+        if: always() # always run even if the previous step fails
+        with:
+          report_paths: "*.xml" # glob covers the test.xml written by the nf-test step's --junitxml option
diff --git a/.gitignore b/.gitignore
@@ -6,3 +6,5 @@ results/
 testing/
 testing*
 *.pyc
+.nf-test*
+test.xml
diff --git a/.nf-core.yml b/.nf-core.yml
@@ -13,6 +13,8 @@ template:
   name: createtaxdb
   org: nf-core
   outdir: .
-  skip_features: null
+  skip_features:
+    - fastqc
+    - igenomes
   version: 1.0dev
 update: null
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 Initial release of nf-core/createtaxdb, created with the [nf-core](https://nf-co.re/) template.
 
+Adds database building support for:
+
+- DIAMOND (added by @jfy133)
+- Kaiju (added by @jfy133)
+- MALT (added by @jfy133)
+
 ### `Added`
 
 ### `Fixed`

diff --git a/CITATIONS.md b/CITATIONS.md
@@ -39,3 +39,31 @@
 - [Singularity](https://pubmed.ncbi.nlm.nih.gov/28494014/)
 
   > Kurtzer GM, Sochat V, Bauer MW. Singularity: Scientific containers for mobility of compute. PLoS One. 2017 May 11;12(5):e0177459. doi: 10.1371/journal.pone.0177459. eCollection 2017. PubMed PMID: 28494014; PubMed Central PMCID: PMC5426675.
+
+- [Bracken](https://doi.org/10.7717/peerj-cs.104)
+
+  > Lu, J., Breitwieser, F. P., Thielen, P., & Salzberg, S. L. (2017). Bracken: estimating species abundance in metagenomics data. PeerJ. Computer Science, 3(e104), e104. https://doi.org/10.7717/peerj-cs.104
+
+- [Centrifuge](https://doi.org/10.1101/gr.210641.116)
+
+  > Kim, D., Song, L., Breitwieser, F. P., & Salzberg, S. L. (2016). Centrifuge: rapid and sensitive classification of metagenomic sequences. Genome Research, 26(12), 1721–1729. https://doi.org/10.1101/gr.210641.116
+
+- [DIAMOND](https://doi.org/10.1038/nmeth.3176)
+
+  > Buchfink, B., Xie, C., & Huson, D. H. (2015). Fast and sensitive protein alignment using DIAMOND. Nature Methods, 12(1), 59–60. https://doi.org/10.1038/nmeth.3176
+
+- [Kaiju](https://doi.org/10.1038/ncomms11257)
+
+  > Menzel, P., Ng, K. L., & Krogh, A. (2016). Fast and sensitive taxonomic classification for metagenomics with Kaiju. Nature Communications, 7, 11257. https://doi.org/10.1038/ncomms11257
+
+- [Kraken2](https://doi.org/10.1186/s13059-019-1891-0)
+
+  > Wood, D. E., Lu, J., & Langmead, B. (2019). Improved metagenomic analysis with Kraken 2. Genome Biology, 20(1), 257. https://doi.org/10.1186/s13059-019-1891-0
+
+- [KrakenUniq](https://doi.org/10.1186/s13059-018-1568-0)
+
+  > Breitwieser, F. P., Baker, D. N., & Salzberg, S. L. (2018). KrakenUniq: confident and fast metagenomics classification using unique k-mer counts. Genome Biology, 19(1), 198. https://doi.org/10.1186/s13059-018-1568-0
+
+- [MALT](https://doi.org/10.1038/s41559-017-0446-6)
+
+  > Vågene, Å. J., Herbig, A., Campana, M. G., Robles García, N. M., Warinner, C., Sabin, S., Spyrou, M. A., Andrades Valtueña, A., Huson, D., Tuross, N., Bos, K. I., & Krause, J. (2018). Salmonella enterica genomes from victims of a major sixteenth-century epidemic in Mexico. Nature Ecology & Evolution, 2(3), 520–528. https://doi.org/10.1038/s41559-017-0446-6
diff --git a/README.md b/README.md
@@ -1,7 +1,7 @@
 <h1>
   <picture>
-    <source media="(prefers-color-scheme: dark)" srcset="docs/images/nf-core-createtaxdb_logo_dark.png">
-    <img alt="nf-core/createtaxdb" src="docs/images/nf-core-createtaxdb_logo_light.png">
+    <source media="(prefers-color-scheme: dark)" srcset="docs/images/nf-core-createtaxdb_logo_dark_tax.png">
+    <img alt="nf-core/createtaxdb" src="docs/images/nf-core-createtaxdb_logo_light_tax.png">
   </picture>
 </h1>
 
@@ -19,7 +19,7 @@
 
 ## Introduction
 
-**nf-core/createtaxdb** is a bioinformatics pipeline that ...
+**nf-core/createtaxdb** is a bioinformatics pipeline that constructs custom metagenomic classifier databases from the same input reference genome set for multiple classifiers and profilers in a highly automated and parallelised manner.
 
 <!-- TODO nf-core:
    Complete this sentence with a 2-3 sentence summary of what types of data the pipeline ingests, a brief overview of the
@@ -29,10 +29,16 @@
 
 <!-- TODO nf-core: Include a figure that guides the user through the major workflow steps. Many nf-core
      workflows use the "tube map" design for that. See https://nf-co.re/docs/contributing/design_guidelines#examples for examples.   -->
-<!-- TODO nf-core: Fill in short bullet-pointed list of the default steps in the pipeline -->
 
-1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/))
-2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/))
+1. Prepares input FASTA files for building
+2. Builds databases for:
+   - [Bracken](https://doi.org/10.7717/peerj-cs.104)
+   - [Centrifuge](https://doi.org/10.1101/gr.210641.116)
+   - [DIAMOND](https://doi.org/10.1038/nmeth.3176)
+   - [Kaiju](https://doi.org/10.1038/ncomms11257)
+   - [Kraken2](https://doi.org/10.1186/s13059-019-1891-0)
+   - [KrakenUniq](https://doi.org/10.1186/s13059-018-1568-0)
+   - [MALT](https://doi.org/10.1038/s41559-017-0446-6)
 
 ## Usage
 
@@ -79,12 +85,14 @@ For more details about the output files and reports, please refer to the
 
 ## Credits
 
-nf-core/createtaxdb was originally written by James A. Fellows Yates and the nf-core community.
+nf-core/createtaxdb was originally written by James A. Fellows Yates, Joon Klaps, Alexander Ramos Díaz and the nf-core community.
 
 We thank the following people for their extensive assistance in the development of this pipeline:
 
 <!-- TODO nf-core: If applicable, make list of people who have also contributed -->
 
+- Zandra Fagernäs for logo design
+
 ## Contributions and Support
 
 If you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).

diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
@@ -13,3 +13,7 @@ report_section_order:
 export_plots: true
 
 disable_version_detection: true
+
+custom_logo: "nf-core-createtaxdb_logo_light_tax.svg"
+custom_logo_url: https://nf-co.re/createtaxdb
+custom_logo_title: "nf-core/createtaxdb"
diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -7,27 +7,62 @@
     "items": {
         "type": "object",
         "properties": {
-            "sample": {
+            "id": {
                 "type": "string",
                 "pattern": "^\\S+$",
-                "errorMessage": "Sample name must be provided and cannot contain spaces",
+                "unique": true,
+                "errorMessage": "Sequence reference name must be provided and cannot contain spaces",
                 "meta": ["id"]
             },
-            "fastq_1": {
-                "type": "string",
-                "format": "file-path",
+            "taxid": {
+                "type": "integer",
+                "unique": true,
+                "errorMessage": "Please provide a valid taxonomic ID in integer format",
+                "meta": ["taxid"]
+            },
+            "fasta_dna": {
+                "anyOf": [
+                    {
+                        "type": "string",
+                        "format": "file-path",
+                        "pattern": "^\\S+\\.(fasta|fas|fa|fna)(\\.gz)?$"
+                    },
+                    {
+                        "type": "string",
+                        "maxLength": 0
+                    }
+                ],
+                "unique": true,
+                "errorMessage": "FASTA file for nucleotide sequence cannot contain spaces and must have a valid FASTA extension (fasta, fas, fa, fna), optionally gzipped",
                 "exists": true,
-                "pattern": "^\\S+\\.f(ast)?q\\.gz$",
-                "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
+                "format": "file-path"
             },
-            "fastq_2": {
-                "type": "string",
-                "format": "file-path",
+            "fasta_aa": {
+                "anyOf": [
+                    {
+                        "type": "string",
+                        "format": "file-path",
+                        "pattern": "^\\S+\\.(fasta|fas|fa|faa)(\\.gz)?$"
+                    },
+                    {
+                        "type": "string",
+                        "maxLength": 0
+                    }
+                ],
+                "unique": true,
+                "errorMessage": "FASTA file for amino acid reference sequence cannot contain spaces and must have a valid FASTA extension (fasta, fas, fa, faa), optionally gzipped",
                 "exists": true,
-                "pattern": "^\\S+\\.f(ast)?q\\.gz$",
-                "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
+                "format": "file-path"
             }
         },
-        "required": ["sample", "fastq_1"]
+        "required": ["id", "taxid"],
+        "anyOf": [
+            {
+                "required": ["fasta_dna"]
+            },
+            {
+                "required": ["fasta_aa"]
+            }
+        ]
     }
 }
diff --git a/assets/test.csv b/assets/test.csv
@@ -0,0 +1,3 @@
+id,taxid,fasta_dna,fasta_aa
+Severe_acute_respiratory_syndrome_coronavirus_2,2697049,https://raw.githubusercontent.com/nf-core/test-datasets/createtaxdb/data/fasta/sarscov2.fasta,https://raw.githubusercontent.com/nf-core/test-datasets/createtaxdb/data/fasta/sarscov2.faa
+Haemophilus_influenzae,727,https://raw.githubusercontent.com/nf-core/test-datasets/createtaxdb/data/fasta/haemophilus_influenzae.fna.gz,
diff --git a/conf/base.config b/conf/base.config
@@ -59,4 +59,9 @@ process {
         errorStrategy = 'retry'
         maxRetries    = 2
     }
+
+    withName:'KAIJU_MKFMI'{ // resource override applied only to the KAIJU_MKFMI process
+        memory = { check_max( 24.GB * task.attempt, 'memory'  ) } // 24 GB multiplied by the attempt number, passed through check_max (defined elsewhere; presumably caps at the configured maximum - verify)
+
+    }
 }
diff --git a/conf/modules.config b/conf/modules.config
@@ -18,16 +18,36 @@ process {
         saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
     ]
 
-    withName: FASTQC {
-        ext.args = '--quiet'
-    }
-    withName: 'MULTIQC' {
-        ext.args   = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' }
+    withName: MULTIQC {
+        ext.args   = { params.multiqc_title ? "--title \"${params.multiqc_title}\"" : '' }
         publishDir = [
             path: { "${params.outdir}/multiqc" },
             mode: params.publish_dir_mode,
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
         ]
     }
 
+    withName: CAT_CAT_DNA { // settings applied only to the CAT_CAT_DNA process
+        ext.prefix = { "${meta.id}.fna" } // output prefix: the meta id followed by '.fna'
+        publishDir = [
+            path: { "${params.outdir}/cat" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, // versions.yml is never published
+            enabled: params.save_concatenated_fastas // publishing is switched by this parameter
+        ]
+    }
+
+    withName: CAT_CAT_AA { // settings applied only to the CAT_CAT_AA process
+        ext.prefix = { "${meta.id}.faa" } // output prefix: the meta id followed by '.faa'
+        publishDir = [
+            path: { "${params.outdir}/cat" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, // versions.yml is never published
+            enabled: params.save_concatenated_fastas // publishing is switched by this parameter
+        ]
+    }
+
+    withName: MALT_BUILD { // settings applied only to the MALT_BUILD process
+        ext.args = { "--sequenceType ${params.malt_sequencetype}" } // forwards params.malt_sequencetype as the --sequenceType argument
+    }
 }