nf-core · jfy133 · Jan 5, 2024 · Dec 5, 2023 · Dec 5, 2023 · Dec 14, 2023
diff --git a/CITATIONS.md b/CITATIONS.md
@@ -39,3 +39,11 @@
 - [Singularity](https://pubmed.ncbi.nlm.nih.gov/28494014/)
 
   > Kurtzer GM, Sochat V, Bauer MW. Singularity: Scientific containers for mobility of compute. PLoS One. 2017 May 11;12(5):e0177459. doi: 10.1371/journal.pone.0177459. eCollection 2017. PubMed PMID: 28494014; PubMed Central PMCID: PMC5426675.
+
+- [DIAMOND](https://doi.org/10.1038/nmeth.3176)
+
+  > Buchfink, B., Xie, C., & Huson, D. H. (2015). Fast and sensitive protein alignment using DIAMOND. Nature Methods, 12(1), 59–60. https://doi.org/10.1038/nmeth.3176
+
+- [Kaiju](https://doi.org/10.1038/ncomms11257)
+
+  > Menzel, P., Ng, K. L., & Krogh, A. (2016). Fast and sensitive taxonomic classification for metagenomics with Kaiju. Nature Communications, 7, 11257. https://doi.org/10.1038/ncomms11257
diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -7,30 +7,60 @@
     "items": {
         "type": "object",
         "properties": {
-            "sample": {
+            "id": {
                 "type": "string",
                 "pattern": "^\\S+$",
-                "errorMessage": "Sample name must be provided and cannot contain spaces"
+                "unique": true,
+                "errorMessage": "Sequence reference name must be provided and cannot contain spaces",
+                "meta": ["id"],
+                "anyOf": [
+                    {
+                        "dependentRequired": ["fasta_dna"]
+                    },
+                    {
+                        "dependentRequired": ["fasta_aa"]
+                    }
+                ]
             },
-            "fastq_1": {
-                "type": "string",
-                "pattern": "^\\S+\\.f(ast)?q\\.gz$",
-                "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
+            "taxid": {
+                "type": "integer",
+                "unique": true,
+                "errorMessage": "Please provide a valid taxonomic ID in integer format",
+                "meta": ["taxid"]
             },
-            "fastq_2": {
-                "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'",
+            "fasta_dna": {
                 "anyOf": [
                     {
                         "type": "string",
-                        "pattern": "^\\S+\\.f(ast)?q\\.gz$"
+                        "pattern": "^\\S+\\.(fasta|fas|fa|fna)(\\.gz)?$"
                     },
                     {
                         "type": "string",
                         "maxLength": 0
                     }
-                ]
+                ],
+                "unique": true,
+                "errorMessage": "FASTA file for nucleotide sequence cannot contain spaces and must have a valid FASTA extension (fasta, fas, fa, fna), optionally gzipped",
+                "exists": true,
+                "format": "file-path"
+            },
+            "fasta_aa": {
+                "anyOf": [
+                    {
+                        "type": "string",
+                        "pattern": "^\\S+\\.(fasta|fas|fa|faa)(\\.gz)?$"
+                    },
+                    {
+                        "type": "string",
+                        "maxLength": 0
+                    }
+                ],
+                "unique": true,
+                "errorMessage": "FASTA file for amino acid reference sequence cannot contain spaces and must have a valid FASTA extension (fasta, fas, fa, faa), optionally gzipped",
+                "exists": true,
+                "format": "file-path"
             }
         },
-        "required": ["sample", "fastq_1"]
+        "required": ["id", "taxid"]
     }
 }
diff --git a/assets/test.csv b/assets/test.csv
@@ -0,0 +1,3 @@
+id,taxid,fasta_dna,fasta_aa
+Severe_acute_respiratory_syndrome_coronavirus_2,2697049,/home/james/Downloads/createtaxdb/sarscov2.fasta,/home/james/Downloads/createtaxdb/sarscov2.faa
+Haemophilus_influenzae,727,/home/james/Downloads/createtaxdb/haemophilus_infuenzae.fna.gz,
diff --git a/conf/modules.config b/conf/modules.config
@@ -18,18 +18,6 @@ process {
         saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
     ]
 
-    withName: SAMPLESHEET_CHECK {
-        publishDir = [
-            path: { "${params.outdir}/pipeline_info" },
-            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-        ]
-    }
-
-    withName: FASTQC {
-        ext.args = '--quiet'
-    }
-
     withName: CUSTOM_DUMPSOFTWAREVERSIONS {
         publishDir = [
             path: { "${params.outdir}/pipeline_info" },

diff --git a/docs/output.md b/docs/output.md
@@ -12,32 +12,36 @@ The directories listed below will be created in the results directory after the
 
 The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:
 
-- [FastQC](#fastqc) - Raw read QC
 - [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline
 - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
 
-### FastQC
+### DIAMOND
 
 <details markdown="1">
 <summary>Output files</summary>
 
-- `fastqc/`
-  - `*_fastqc.html`: FastQC report containing quality metrics.
-  - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images.
+- `diamond/`
+  - `<database>.dmnd`: DIAMOND dmnd database file
 
 </details>
 
-[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/).
+[DIAMOND](https://github.com/bbuchfink/diamond) is an accelerated BLAST-compatible local sequence aligner particularly used for protein alignment.
 
-![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png)
+The `dmnd` file can be given to one of the DIAMOND alignment commands with `diamond blast<x/p> -d <your_database>.dmnd` etc.
 
-![MultiQC - FastQC mean quality scores plot](images/mqc_fastqc_quality.png)
+### Kaiju
 
-![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png)
+<details markdown="1">
+<summary>Output files</summary>
+
+- `kaiju/`
+  - `<database_name>.fmi`: Kaiju FMI index file
+
+</details>
+
+[Kaiju](https://bioinformatics-centre.github.io/kaiju/) is a fast and sensitive taxonomic classifier for metagenomics utilising nucleotide to protein translations.
 
-:::note
-The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality.
-:::
+The `fmi` file can be given to Kaiju itself with `kaiju -f <your_database>.fmi` etc.
 
 ### MultiQC
 

diff --git a/modules.json b/modules.json
@@ -5,16 +5,36 @@
         "https://github.com/nf-core/modules.git": {
             "modules": {
                 "nf-core": {
+                    "cat/cat": {
+                        "branch": "master",
+                        "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
+                        "installed_by": ["modules"]
+                    },
                     "custom/dumpsoftwareversions": {
                         "branch": "master",
                         "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
                         "installed_by": ["modules"]
                     },
+                    "diamond/makedb": {
+                        "branch": "master",
+                        "git_sha": "b29f6beb86d1d24d680277fb1a3f4de7b8b8a92c",
+                        "installed_by": ["modules"]
+                    },
                     "fastqc": {
                         "branch": "master",
                         "git_sha": "bd8092b67b5103bdd52e300f75889442275c3117",
                         "installed_by": ["modules"]
                     },
+                    "kaiju/mkfmi": {
+                        "branch": "master",
+                        "git_sha": "7365564c402cbd01e9407810730efd10039997a3",
+                        "installed_by": ["modules"]
+                    },
+                    "malt/build": {
+                        "branch": "master",
+                        "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
+                        "installed_by": ["modules"]
+                    },
                     "multiqc": {
                         "branch": "master",
                         "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",

diff --git a/modules/nf-core/cat/cat/environment.yml b/modules/nf-core/cat/cat/environment.yml
diff --git a/modules/nf-core/cat/cat/main.nf b/modules/nf-core/cat/cat/main.nf
diff --git a/modules/nf-core/cat/cat/meta.yml b/modules/nf-core/cat/cat/meta.yml