diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index d6e526b..289877e 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -15,11 +15,10 @@ Learn more about contributing: [CONTRIBUTING.md](https://github.com/CFIA-NCFAD/n
 - [ ] This comment contains a description of changes (with reason).
 - [ ] If you've fixed a bug or added code that should be tested, add tests!
-  - [ ] If you've added a new tool - add to the software_versions process and a regex to `scrape_software_versions.py`
+  - [ ] If you've added a new tool - ensure that you've added the version info to a `versions.yml` and added it to the `ch_versions` channel in the workflow you added the tool to.
   - [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/CFIA-NCFAD/nf-flu/tree/master/.github/CONTRIBUTING.md)
 - [ ] If necessary, also make a PR on [the CFIA-NCFAD/nf-test-datasets repo](https://github.com/CFIA-NCFAD/nf-test-datasets/pull/new)
-- [ ] Make sure your code lints (`nf-core lint .`).
-- [ ] Ensure the test suite passes (`nextflow run . -profile test,docker`).
+- [ ] Ensure the test suite passes (`nextflow run . -profile test_{illumina,nanopore},docker`).
 - [ ] Usage Documentation in `docs/usage.md` is updated.
 - [ ] Output Documentation in `docs/output.md` is updated.
 - [ ] `CHANGELOG.md` is updated.
diff --git a/.github/workflows/branch.yml b/.github/workflows/branch.yml
new file mode 100644
index 0000000..0431835
--- /dev/null
+++ b/.github/workflows/branch.yml
@@ -0,0 +1,44 @@
+name: nf-core branch protection
+# This workflow is triggered on PRs to master branch on the repository
+# It fails when someone tries to make a PR against the nf-core `master` branch instead of `dev`
+on:
+  pull_request_target:
+    branches: [master]
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      # PRs to the nf-core repo master branch are only ok if coming from the nf-core repo `dev` or any `patch` branches
+      - name: Check PRs
+        if: github.repository == 'CFIA-NCFAD'
+        run: |
+          { [[ ${{github.event.pull_request.head.repo.full_name }} == CFIA-NCFAD ]] && [[ $GITHUB_HEAD_REF = "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]]
+
+      # If the above check failed, post a comment on the PR explaining the failure
+      # NOTE - this doesn't currently work if the PR is coming from a fork, due to limitations in GitHub actions secrets
+      - name: Post PR comment
+        if: failure()
+        uses: mshick/add-pr-comment@v1
+        with:
+          message: |
+            ## This PR is against the `master` branch :x:
+
+            * Do not close this PR
+            * Click _Edit_ and change the `base` to `dev`
+            * This CI test will remain failed until you push a new commit
+
+            ---
+
+            Hi @${{ github.event.pull_request.user.login }},
+
+            It looks like this pull-request has been made against the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `master` branch.
+            The `master` branch on nf-core repositories should always contain code from the latest release.
+            Because of this, PRs to `master` are only allowed if they come from the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `dev` branch.
+
+            You do not need to close this PR, you can change the target branch to `dev` by clicking the _"Edit"_ button at the top of this page.
+            Note that even after this, the test will continue to show as failing until you push a new commit.
+ + Thanks again for your contribution! + repo-token: ${{ secrets.GITHUB_TOKEN }} + allow-repeats: false \ No newline at end of file diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b4ad97c..10d33ae 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -3,14 +3,18 @@ name: CI on: push: branches: - - master - dev pull_request: - branches: - - '*' release: types: [published] +env: + NXF_ANSI_LOG: false + +concurrency: + group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}" + cancel-in-progress: true + jobs: test_illumina: name: Run Illumina test diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml deleted file mode 100644 index 0471add..0000000 --- a/.github/workflows/linting_comment.yml +++ /dev/null @@ -1,27 +0,0 @@ -name: nf-core linting comment -# This workflow is triggered after the linting action is complete -# It posts an automated comment to the PR, even if the PR is coming from a fork - -on: - workflow_run: - workflows: ["nf-core linting"] - -jobs: - test: - runs-on: ubuntu-latest - steps: - - name: Download lint results - uses: dawidd6/action-download-artifact@v2 - with: - workflow: linting.yml - - - name: Get PR number - id: pr_number - run: echo "::set-output name=pr_number::$(cat linting-logs/PR_number.txt)" - - - name: Post PR comment - uses: marocchino/sticky-pull-request-comment@v2 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - number: ${{ steps.pr_number.outputs.pr_number }} - path: linting-logs/lint_results.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 41c82e9..f2b39b5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,21 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [[3.2.0](https://github.com/CFIA-NCFAD/nf-flu/releases/tag/3.2.0)] - 2023-06-22 + +### Added + +* Influenza B virus support (#14) +* Polars for faster parsing of BLAST results (#14) + +### Fixes + +* Irregular Illumina paired-end FASTQ files not producing IRMA assemblies (#20) + +### Updates + +* Updated README.md to include references and citations + ## [[3.1.6](https://github.com/CFIA-NCFAD/nf-flu/releases/tag/3.1.6)] - 2023-05-31 This is a patch release for a minor change to use Biocontainers Docker and Singularity images for Clair3 to avoid hitting limits on pulls from Docker Hub and since Biocontainers images are half the size of [hkubal/clair3](https://hub.docker.com/r/hkubal/clair3/) images. diff --git a/README.md b/README.md index ad23ccf..aee1421 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,6 @@ -# CFIA-NCFAD/nf-flu - Influenza A Virus Genome Assembly Nextflow Workflow +# CFIA-NCFAD/nf-flu - Influenza A and B Virus Genome Assembly Nextflow Workflow + +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.7011213.svg)](https://doi.org/10.5281/zenodo.7011213) [![CI](https://github.com/CFIA-NCFAD/nf-flu/actions/workflows/ci.yml/badge.svg)](https://github.com/CFIA-NCFAD/nf-flu/actions/workflows/ci.yml) @@ -9,11 +11,10 @@ ## Introduction -**nf-flu** is a bioinformatics analysis pipeline for assembly and H/N subtyping of Influenza A virus. The pipeline supports both Illumina and Nanopore Platform. -Since Influenza is a special virus with multiple gene segments (8 segments) and there might be a reference or multiple we would want to align against, the pipeline will automatically pull top match references for each segment. 
-To achieve this task, the pipeline downloads Influenza database from NCBI and user could provide their own reference database. The pipline performs read mapping against each reference segment, variant calling and genome assembly. - -The pipeline is implemented in [Nextflow][] +**nf-flu** is a [Nextflow][] bioinformatics analysis pipeline for assembly and H/N subtyping of Influenza A and B viruses from Illumina or Nanopore sequencing data. +Since Influenza has a segmented genome consisting of 8 gene segments, the pipeline will automatically select the top matching reference sequence from NCBI for each gene segment based on [IRMA][] assembly and nucleotide [BLAST][] against all Influenza sequences from NCBI. +Users can also provide their own reference sequences to include in the top reference sequence selection process. +After reference sequence selection, the pipeline performs read mapping to each reference sequence, variant calling and depth-masked consensus sequence generation. ## Pipeline summary @@ -21,7 +22,7 @@ The pipeline is implemented in [Nextflow][] 2. Merge reads of re-sequenced samples ([`cat`](http://www.linfo.org/cat.html)) (if needed) 3. Assembly of Influenza gene segments with [IRMA][] using the built-in FLU module 4. Nucleotide [BLAST][] search against [NCBI Influenza DB][] -5. Automatically pull top match references for segments +5. Automatically select top match references for segments 6. H/N subtype prediction and Excel XLSX report generation based on BLAST results 7. Perform Variant calling and genome assembly for all segments. @@ -73,11 +74,99 @@ The nf-flu pipeline comes with: * [Usage](docs/usage.md) and * [Output](docs/output.md) documentation. -## Resources +## Resources and References + +### [BcfTools][] and [Samtools][] + +```text +Danecek, P., Bonfield, J.K., Liddle, J., Marshall, J., Ohan, V., Pollard, M.O., Whitwham, A., Keane, T., McCarthy, S.A., Davies, R.M., Li, H., 2021. Twelve years of SAMtools and BCFtools. Gigascience 10, giab008. https://doi.org/10.1093/gigascience/giab008 +``` + +### [BLAST][] Basic Local Alignment Search Tool + +```text +Altschul, S.F., Gish, W., Miller, W., Myers, E.W., Lipman, D.J., 1990. Basic local alignment search tool. J. Mol. Biol. 215, 403–410. https://doi.org/10.1016/S0022-2836(05)80360-2 +``` + +```text +Camacho, C., Coulouris, G., Avagyan, V., Ma, N., Papadopoulos, J., Bealer, K., Madden, T.L., 2009. BLAST+: architecture and applications. BMC Bioinformatics 10, 421. https://doi.org/10.1186/1471-2105-10-421 +``` + +### [Clair3][] + +```text +Zheng, Z., Li, S., Su, J., Leung, A.W.-S., Lam, T.-W., Luo, R., 2022. Symphonizing pileup and full-alignment for deep learning-based long-read variant calling. Nat Comput Sci 2, 797–803. https://doi.org/10.1038/s43588-022-00387-x +``` + +### [IRMA][] Iterative Refinement Meta-Assembler + +```text +Shepard, S.S., Meno, S., Bahl, J., Wilson, M.M., Barnes, J., Neuhaus, E., 2016. Viral deep sequencing needs an adaptive approach: IRMA, the iterative refinement meta-assembler. BMC Genomics 17, 708. https://doi.org/10.1186/s12864-016-3030-6 +``` + +### [Medaka][] + +[Medaka][] is deprecated in favour of [Clair3][] for variant calling of Nanopore data. + +### [Minimap2][] + +[Minimap2][] is used for rapid and accurate read alignment to reference sequences. + +```text +Li, H., 2018. Minimap2: pairwise alignment for nucleotide sequences. Bioinformatics 34, 3094–3100. 
https://doi.org/10.1093/bioinformatics/bty191 +``` + +### [Mosdepth][] + +[Mosdepth][] is used for rapid sequencing coverage calculation and summary statistics. + +```text +Pedersen, B.S., Quinlan, A.R., 2017. Mosdepth: quick coverage calculation for genomes and exomes. Bioinformatics 34, 867–868. https://doi.org/10.1093/bioinformatics/btx699 +``` + +### [MultiQC][] + +[MultiQC][] is used for generation of a single report for multiple tools. + +```text +Ewels, P., Magnusson, M., Lundin, S., Käller, M., 2016. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics 32, 3047–3048. https://doi.org/10.1093/bioinformatics/btw354 +``` + +### [NCBI Influenza Virus Resource][] + +**nf-flu** relies on publicly available Influenza sequence data from NCBI available at the [NCBI Influenza Virus Resource][], which is downloaded from the [FTP site](https://ftp.ncbi.nih.gov/genomes/INFLUENZA/). + +NCBI Influenza Virus Resource: + +```text +Bao, Y., Bolotov, P., Dernovoy, D., Kiryutin, B., Zaslavsky, L., Tatusova, T., Ostell, J., Lipman, D., 2008. The influenza virus resource at the National Center for Biotechnology Information. J Virol 82, 596–601. https://doi.org/10.1128/JVI.02005-07 +``` + +NCBI Influenza Virus Sequence Annotation Tool: + +```text +Bao, Y., Bolotov, P., Dernovoy, D., Kiryutin, B., Tatusova, T., 2007. FLAN: a web server for influenza virus genome annotation. Nucleic Acids Res 35, W280-284. https://doi.org/10.1093/nar/gkm354 +``` + +### [Nextflow][] + +**nf-flu** is implemented in [Nextflow][]. + +```text +Tommaso, P.D., Chatzou, M., Floden, E.W., Barja, P.P., Palumbo, E., Notredame, C., 2017. Nextflow enables reproducible computational workflows. Nat Biotechnol 35, 316–319. https://doi.org/10.1038/nbt.3820 +``` + +### [nf-core][] + +[nf-core][] is a great resource for building robust and reproducible bioinformatics pipelines. + +```text +Ewels, P.A., Peltzer, A., Fillinger, S., Patel, H., Alneberg, J., Wilm, A., Garcia, M.U., Di Tommaso, P., Nahnsen, S., 2020. The nf-core framework for community-curated bioinformatics pipelines. Nat Biotechnol 38, 276–278. https://doi.org/10.1038/s41587-020-0439-x +``` + +### [seqtk][] -* [NCBI Influenza FTP site](https://ftp.ncbi.nih.gov/genomes/INFLUENZA/) -* [IRMA][] Iterative Refinement Meta-Assembler - * [IRMA Publication](https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-016-3030-6) +[seqtk][] is used for rapid manipulation of FASTA/Q files. 
Available from GitHub at [lh3/seqtk](https://github.com/lh3/seqtk) ## Credits @@ -94,3 +183,13 @@ The nf-flu pipeline was originally developed by [Peter Kruczkiewicz](https://git [Nextflow]: https://www.nextflow.io/ [Docker]: https://www.docker.com/ [Singularity]: https://www.sylabs.io/guides/3.0/user-guide/quick_start.html#quick-installation-steps +[NCBI Influenza Virus Resource]: https://www.ncbi.nlm.nih.gov/genomes/FLU/Database/nph-select.cgi?go=database +[BcfTools]: https://samtools.github.io/bcftools/ +[Samtools]: https://www.htslib.org/ +[nf-core]: https://nf-co.re/ +[Minimap2]: https://github.com/lh3/minimap2/ +[Clair3]: https://github.com/HKU-BAL/Clair3 +[Medaka]: https://github.com/nanoporetech/medaka +[Mosdepth]: https://github.com/brentp/mosdepth +[seqtk]: https://github.com/lh3/seqtk +[MultiQC]: https://multiqc.info/ diff --git a/bin/parse_influenza_blast_results.py b/bin/parse_influenza_blast_results.py index f09a1c3..90881b7 100755 --- a/bin/parse_influenza_blast_results.py +++ b/bin/parse_influenza_blast_results.py @@ -12,17 +12,19 @@ import re import logging from collections import defaultdict -from multiprocessing import Pool +import multiprocessing import click import pandas as pd import numpy as np -from rich.console import Console +import polars as pl from rich.logging import RichHandler LOG_FORMAT = "%(asctime)s %(levelname)s: %(message)s [in %(filename)s:%(lineno)d]" logging.basicConfig(format=LOG_FORMAT, level=logging.INFO) +pl.enable_string_cache(True) + influenza_segment = { 1: "1_PB2", 2: "2_PB1", @@ -36,21 +38,21 @@ # Column names/types/final report names blast_cols = [ - ("qaccver", "category"), + ("qaccver", str), ("saccver", str), ("pident", float), - ("length", "uint16"), - ("mismatch", "uint16"), - ("gapopen", "uint16"), - ("qstart", "uint16"), - ("qend", "uint16"), - ("sstart", "uint16"), - ("send", "uint16"), - ("evalue", np.float16), - ("bitscore", np.float16), - ("qlen", "uint16"), - ("slen", "uint16"), - ("qcovs", np.float16), + ("length", pl.UInt16), + ("mismatch", pl.UInt16), + ("gapopen", pl.UInt16), + ("qstart", pl.UInt16), + ("qend", pl.UInt16), + ("sstart", pl.UInt16), + ("send", pl.UInt16), + ("evalue", pl.Float32), + ("bitscore", pl.Float32), + ("qlen", pl.UInt16), + ("slen", pl.UInt16), + ("qcovs", pl.Float32), ("stitle", str), ] @@ -181,85 +183,83 @@ def parse_blast_result( blast_result: str, - df_metadata: pd.DataFrame, + df_metadata: pl.DataFrame, regex_subtype_pattern: str, + get_top_ref: bool, top: int = 3, pident_threshold: float = 0.85, min_aln_length: int = 50, ) -> Optional[ - Tuple[pd.DataFrame, Optional[pd.DataFrame], Optional[pd.DataFrame], Dict] + Tuple[pl.DataFrame, Optional[pl.DataFrame], Optional[pl.DataFrame], Dict] ]: logging.info(f"Parsing BLAST results from {blast_result}") - df = pd.read_csv( + df_filtered = pl.scan_csv( blast_result, - sep="\t", - names=[name for name, coltype in blast_cols], - dtype={name: coltype for name, coltype in blast_cols}, - ) - if df.empty: - logging.error(f"No BLASTN results in {blast_result}!") - return - # Assuming that all BLAST result entries have the same sample name so - # extracting the sample name from the first entry qaccver field - sample_name: str = re.sub(r"^(.+)_\d$", r"\1", df["qaccver"][0]) - logging.info(f"Parsed {df.shape[0]} BLAST results from {blast_result}") - logging.info( - f"{sample_name} | n={df.shape[0]} | Filtering for hits above {pident_threshold}% identity." 
- ) - df_filtered = df[ - (df["pident"] >= (pident_threshold * 100)) & (df["length"] >= min_aln_length) - ] + has_header=False, + separator="\t", + new_columns=[name for name, coltype in blast_cols], + dtypes={name: coltype for name, coltype in blast_cols}, + ).filter((pl.col("pident") >= (pident_threshold * 100)) & + (pl.col("length") >= min_aln_length)).collect(streaming=True) + + sample_name: str = re.sub(r"^(.+)_\d$", r"\1", df_filtered["qaccver"][0]) logging.info( f"{sample_name} | n={df_filtered.shape[0]} | Filtered for hits above {pident_threshold}% identity." + f"and Min Alignment length > {min_aln_length}" ) - # Sequences header has been corrected by GUNZIP Process - # zcat $archive | sed -E 's/^>gi\\|[0-9]+\\|gb\\|(\\w+)\\|(.*)/>\\1 \\2/' > influenza.fna - df_filtered["accession"] = df_filtered.saccver.str.strip() - df_filtered["sample"] = sample_name - df_filtered["sample"] = pd.Categorical(df_filtered["sample"]) - df_filtered["sample_segment"] = df_filtered.qaccver.str.extract(r".+_(\d)$").astype( - "category" - ) - df_filtered["sample_segment"] = pd.Categorical(df_filtered["sample_segment"]) - df_filtered["subtype_from_match_title"] = ( - df_filtered["stitle"].str.extract(regex_subtype_pattern).astype("category") - ) + df_filtered = df_filtered.with_columns([ + pl.col('saccver').str.strip().alias("accession"), + pl.lit(sample_name, dtype=pl.Categorical).alias("sample"), + pl.col('qaccver').str.extract(r".+_(\d)$").cast(pl.Categorical).alias("sample_segment"), + pl.col("stitle").str.extract(regex_subtype_pattern).alias("subtype_from_match_title").cast(pl.Categorical) + ]) logging.info( f"{sample_name} | Merging NCBI Influenza DB genome metadata with BLAST results on accession." ) - df_merge = pd.merge(df_filtered, df_metadata, on="accession", how="left") + df_merge = df_filtered.join(df_metadata, on="accession", how="left") del df_filtered del df_metadata - df_merge["subtype"] = df_merge["subtype"].combine_first(df_merge["subtype_from_match_title"]) - df_merge = df_merge.sort_values( - by=["sample_segment", "bitscore"], ascending=[True, False] - ).set_index("sample_segment") - subtype_results_summary = {} - H_results = None - N_results = None - if "4" in df_merge.index: - H_results = find_h_or_n_type(df_merge, "4") - subtype_results_summary.update(H_results) - if "6" in df_merge.index: - N_results = find_h_or_n_type(df_merge, "6") - subtype_results_summary.update(N_results) - - subtype_results_summary["sample"] = sample_name - subtype_results_summary["subtype"] = get_subtype_value(H_results, N_results) + df_merge = df_merge.with_columns( + pl.when(pl.col("subtype").is_null()) + .then(pl.col("subtype_from_match_title")) + .otherwise(pl.col("subtype")) + .alias("subtype") + ) + df_merge = df_merge.sort( + by=["sample_segment", "bitscore"], descending=[False, True] + ) + + segments = df_merge["sample_segment"].unique().sort() dfs = [] - segments = df_merge.index.unique() for seg in segments: - dfs.append(df_merge.loc[seg, :].head(top).reset_index()) - df_top_seg_matches = pd.concat(dfs) - cols = pd.Series([x for x, _ in blast_results_report_columns]) - cols = cols[cols.isin(df_top_seg_matches.columns)] - df_top_seg_matches = df_top_seg_matches[cols] + dfs.append(df_merge.filter(pl.col("sample_segment") == seg).head(top)) + df_top_seg_matches = pl.concat(dfs, how="vertical") + cols = pl.Series([x for x, _ in blast_results_report_columns]) + df_top_seg_matches = df_top_seg_matches.select(pl.col(cols)) + subtype_results_summary = {"sample": sample_name} + if not get_top_ref: + is_iav 
= True + if df_top_seg_matches.select(pl.col("subtype").is_null().all())[0, 0]: + is_iav = False + H_results = None + N_results = None + if "4" in segments: + H_results = find_h_or_n_type(df_merge, "4", is_iav) + subtype_results_summary.update(H_results) + if "6" in segments: + N_results = find_h_or_n_type(df_merge, "6", is_iav) + subtype_results_summary.update(N_results) + subtype_results_summary["subtype"] = get_subtype_value(H_results, N_results, is_iav) + return df_top_seg_matches, subtype_results_summary -def get_subtype_value(H_results: Optional[Dict], N_results: Optional[Dict]) -> str: +def get_subtype_value(H_results: Optional[Dict], N_results: Optional[Dict], is_iav: bool) -> str: subtype = "" + if not is_iav: + subtype = "N/A" + return subtype if H_results is None and N_results is None: subtype = "-" elif H_results is not None and N_results is None: @@ -282,41 +282,50 @@ def get_subtype_value(H_results: Optional[Dict], N_results: Optional[Dict]) -> s return subtype -def find_h_or_n_type(df_merge, seg): +def find_h_or_n_type(df_merge, seg, is_iav): assert seg in [ "4", "6", ], "Can only determine H or N type from segments 4 or 6, respectively!" type_name = "H_type" if seg == "4" else "N_type" h_or_n = type_name[0] - df_segment = df_merge.loc[seg, :] - type_counts = df_segment.subtype.value_counts() - df_type_counts = type_counts.index.str.extract(h_or_n + r"(\d+)", flags=re.IGNORECASE) - df_type_counts.columns = [type_name] - df_type_counts["count"] = type_counts.values - df_type_counts["subtype"] = type_counts.index - df_type_counts = df_type_counts[~pd.isnull(df_type_counts[type_name])] - logging.debug(f"{df_type_counts}") - type_to_count = defaultdict(int) - for _, x in df_type_counts.iterrows(): - if pd.isna(x[type_name]): - continue - type_to_count[x[type_name]] += x["count"] - type_to_count = [(h, c) for h, c in type_to_count.items()] - type_to_count.sort(key=lambda x: x[1], reverse=True) - top_type, top_type_count = type_to_count[0] - total_count = type_counts.sum() - logging.info( - f"{h_or_n}{top_type} n={top_type_count}/{total_count} ({top_type_count / total_count:.1%})" - ) - type_mask = df_segment.subtype.str.match( - r".*" + h_or_n + top_type + r".*", na=False - ) - type_mask[pd.isnull(type_mask)] = False - df_seg_top_type = df_segment[type_mask] - top_result: pd.Series = [r for _, r in df_seg_top_type.head(1).iterrows()][0] + reg_h_or_n_type = "" + if h_or_n == "H": + reg_h_or_n_type = "[Hh]" + else: + reg_h_or_n_type = "[Nn]" + df_segment = df_merge.filter(pl.col("sample_segment") == seg) + if is_iav: + type_counts = df_segment["subtype"].value_counts(sort=True) + type_counts = type_counts.filter(~pl.col("subtype").is_null()) + df_type_counts = type_counts.with_columns(pl.lit(type_counts["subtype"].str.extract(reg_h_or_n_type + r"(\d+)"). 
+ alias(type_name))) + df_type_counts = df_type_counts.filter(~pl.col(type_name).is_null()) + logging.debug(f"{df_type_counts}") + type_to_count = defaultdict(int) + for x in df_type_counts.iter_rows(named=True): + type_to_count[x[type_name]] += x["counts"] + type_to_count = [(h, c) for h, c in type_to_count.items()] + type_to_count.sort(key=lambda x: x[1], reverse=True) + top_type, top_type_count = type_to_count[0] + total_count = type_counts["counts"].sum() + logging.info( + f"{h_or_n}{top_type} n={top_type_count}/{total_count} ({top_type_count / total_count:.1%})" + ) + df_segment = df_segment.with_columns( + pl.lit(df_segment["subtype"].str.contains(r".*" + reg_h_or_n_type + top_type + r".*") + .fill_null(False) + .alias("type_mask"))) + df_seg_top_type = df_segment.filter(pl.col("type_mask") == True).drop("type_mask") + top_result: pl.Series = [r for r in df_seg_top_type.head(1).iter_rows(named=True)][0] + else: + top_type = "N/A" + top_type_count = "N/A" + total_count = "N/A" + top_result: pl.Series = [r for r in df_segment.head(1).iter_rows(named=True)][0] + results_summary = { - f"{h_or_n}_type": top_type, + f"{h_or_n}_type": top_type if is_iav else "N/A", f"{h_or_n}_sample_segment_length": top_result["qlen"], f"{h_or_n}_top_pident": top_result["pident"], f"{h_or_n}_top_mismatch": top_result["mismatch"], @@ -331,7 +340,7 @@ def find_h_or_n_type(df_merge, seg): f"{h_or_n}_virus_name": top_result["virus_name"], f"{h_or_n}_NCBI_Influenza_DB_subtype_matches": top_type_count, f"{h_or_n}_NCBI_Influenza_DB_total_matches": total_count, - f"{h_or_n}_NCBI_Influenza_DB_proportion_matches": top_type_count / total_count, + f"{h_or_n}_NCBI_Influenza_DB_proportion_matches": top_type_count / total_count if is_iav else "N/A", } logging.info(f"Seg {seg} results: {results_summary}") return results_summary @@ -352,7 +361,6 @@ def find_h_or_n_type(df_merge, seg): def report(flu_metadata, blast_results, excel_report, top, pident_threshold, min_aln_length, threads, get_top_ref, sample_name): from rich.traceback import install - install(show_locals=True, width=120, word_wrap=True) logging.basicConfig( format="%(message)s", @@ -364,77 +372,62 @@ def report(flu_metadata, blast_results, excel_report, top, pident_threshold, logging.info(f'Parsing Influenza metadata file "{flu_metadata}"') md_cols = [ ("accession", str), - ("host", "category"), - ("segment", "category"), - ("subtype", "str"), - ("country", "category"), - ("date", "category"), - ("seq_length", "uint16"), - ("virus_name", "category"), - ("age", "category"), - ("gender", "category"), - ("group_id", "category"), + ("host", pl.Categorical), + ("segment", pl.Categorical), + ("subtype", str), + ("country", pl.Categorical), + ("date", pl.Categorical), + ("seq_length", pl.UInt16), + ("virus_name", pl.Categorical), + ("age", pl.Categorical), + ("gender", pl.Categorical), + ("group_id", pl.Categorical), ] - df_md = pd.read_csv( + df_md = pl.read_csv( flu_metadata, - sep="\t", - names=[name for name, _ in md_cols], - dtype={name: t for name, t in md_cols}, + has_header=False, + separator="\t", + new_columns=[name for name, _ in md_cols], + dtypes={name: t for name, t in md_cols}, ) - unique_subtypes = df_md.subtype.unique() - unique_subtypes = unique_subtypes[~pd.isna(unique_subtypes)] + + unique_subtypes = df_md.select("subtype").unique() + unique_subtypes = unique_subtypes.filter(~pl.col("subtype").is_null()) logging.info( - f"Parsed Influenza metadata file into DataFrame with n={df_md.shape[0]} rows and n={df_md.shape[1]} columns. 
There are {unique_subtypes.size} unique subtypes. " + f"Parsed Influenza metadata file into DataFrame with n={df_md.shape[0]} rows and n={df_md.shape[1]} columns. There are {len(unique_subtypes)} unique subtypes. " ) - regex_subtype_pattern = r"\((H\d+N\d+|" + "|".join(list(unique_subtypes)) + r")\)" - if threads > 1: - pool = Pool(processes=threads) - logging.info( - f"Initialized multiprocessing pool with {threads} processes. Submitting async parsing jobs." + regex_subtype_pattern = r"\((H\d+N\d+|" + "|".join(list(unique_subtypes["subtype"])) + r")\)" + results = [ + parse_blast_result(blast_result, df_md, regex_subtype_pattern, get_top_ref, top=top, + pident_threshold=pident_threshold, + min_aln_length=min_aln_length) for blast_result in blast_results] + + if not get_top_ref: + dfs_blast = [] + all_subtype_results = {} + for parsed_result in results: + if parsed_result is None: + continue + df_blast, subtype_results_summary = parsed_result + if df_blast is not None: + dfs_blast.append(df_blast.to_pandas()) + sample = subtype_results_summary["sample"] + all_subtype_results[sample] = subtype_results_summary + df_all_blast = pd.concat(dfs_blast).rename( + columns={k: v for k, v in blast_results_report_columns} ) - async_objects = [ - pool.apply_async( - parse_blast_result, - (blast_result, df_md, regex_subtype_pattern), - dict(top=top, pident_threshold=pident_threshold), - ) - for blast_result in blast_results - ] - logging.info(f"Getting async results...") - results = [x.get() for x in async_objects] - logging.info( - f'Got {len(results)} async parsing results. Merging into report "{excel_report}".' + df_subtype_results = pd.DataFrame(all_subtype_results).transpose() + cols = pd.Series(subtype_results_summary_columns) + cols = cols[cols.isin(df_subtype_results.columns)] + df_subtype_predictions = df_subtype_results[cols].rename( + columns=subtype_results_summary_final_names ) - else: - results = [ - parse_blast_result(blast_result, df_md, regex_subtype_pattern, top=top, pident_threshold=pident_threshold, - min_aln_length=min_aln_length) for blast_result in blast_results] - dfs_blast = [] - all_subtype_results = {} - for parsed_result in results: - if parsed_result is None: - continue - df_blast, subtype_results_summary = parsed_result - if df_blast is not None: - dfs_blast.append(df_blast) - sample = subtype_results_summary["sample"] - all_subtype_results[sample] = subtype_results_summary - df_subtype_results = pd.DataFrame(all_subtype_results).transpose() - df_all_blast = pd.concat(dfs_blast).rename( - columns={k: v for k, v in blast_results_report_columns} - ) - cols = pd.Series(subtype_results_summary_columns) - cols = cols[cols.isin(df_subtype_results.columns)] - df_subtype_predictions = df_subtype_results[cols].rename( - columns=subtype_results_summary_final_names - ) - cols = pd.Series(columns_H_summary_results) - cols = cols[cols.isin(df_subtype_results.columns)] - df_H = df_subtype_results[cols].rename(columns=subtype_results_summary_final_names) - cols = pd.Series(columns_N_summary_results) - cols = cols[cols.isin(df_subtype_results.columns)] - df_N = df_subtype_results[cols].rename(columns=subtype_results_summary_final_names) - if not get_top_ref: + cols = pd.Series(columns_H_summary_results) + cols = cols[cols.isin(df_subtype_results.columns)] + df_H = df_subtype_results[cols].rename(columns=subtype_results_summary_final_names) + cols = pd.Series(columns_N_summary_results) + cols = cols[cols.isin(df_subtype_results.columns)] + df_N = 
df_subtype_results[cols].rename(columns=subtype_results_summary_final_names)
         # Add segment name for more informative
         df_all_blast["Sample Genome Segment Number"] = df_all_blast["Sample Genome Segment Number"]. \
             apply(lambda x: influenza_segment[int(x)])
@@ -448,16 +441,23 @@ def report(flu_metadata, blast_results, excel_report, top, pident_threshold,
             output_dest=excel_report,
         )
     else:
-        df_ref_id = df_all_blast[
-            ['Sample', 'Sample Genome Segment Number', 'Reference NCBI Accession', 'BLASTN Bitscore',
-             'Reference Sequence ID']]
-        df_ref_id = df_ref_id.reset_index(drop=True)
-        df_ref_id.loc[df_ref_id['Reference NCBI Accession'].isna(), 'Reference NCBI Accession'] = df_ref_id[
-            'Reference Sequence ID']
-        df_ref_id['Reference NCBI Accession'] = df_ref_id['Reference NCBI Accession'].str.strip()
-        df_ref_id['Sample Genome Segment Number'] = df_ref_id['Sample Genome Segment Number']. \
-            apply(lambda x: influenza_segment[int(x)])
-        df_ref_id.to_csv(sample_name + ".topsegments.csv", header=True, index=False)
+        df_blast, subtype_results_summary = results[0]
+        df_blast = df_blast.rename(
+            mapping={k: v for k, v in blast_results_report_columns}
+        )
+        df_ref_id = df_blast.select(pl.col(['Sample', 'Sample Genome Segment Number',
+                                            'Reference NCBI Accession', 'BLASTN Bitscore', 'Reference Sequence ID']))
+        df_ref_id = df_ref_id.with_columns(
+            pl.when(pl.col("Reference NCBI Accession").is_null())
+            .then(pl.col("Reference Sequence ID"))
+            .otherwise(pl.col("Reference NCBI Accession"))
+            .str.strip()
+            .alias('Reference NCBI Accession')
+        )
+        df_ref_id = df_ref_id.with_columns(
+            pl.col("Sample Genome Segment Number").apply(lambda x: influenza_segment[int(x)])
+            .alias("Sample Genome Segment Number"))
+        df_ref_id.write_csv(sample_name + ".topsegments.csv", separator=",", has_header=True)


 def get_col_widths(df, index=False):
diff --git a/conf/modules_illumina.config b/conf/modules_illumina.config
index 104c248..d4f35d7 100644
--- a/conf/modules_illumina.config
+++ b/conf/modules_illumina.config
@@ -52,7 +52,7 @@ process {
             ]
         ]
     }
-    withName: 'CAT_FASTQ' {
+    withName: 'CAT_ILLUMINA_FASTQ' {
         publishDir = [
             [
                 path: { "${params.outdir}/fastq"},
diff --git a/main.nf b/main.nf
index 6cc7874..b7eb14b 100755
--- a/main.nf
+++ b/main.nf
@@ -5,7 +5,7 @@ nextflow.enable.dsl = 2

 def json_schema = "$projectDir/nextflow_schema.json"
 if (params.help){
-    def command = "nextflow run CFIA-NCFAD/nf-flu --input --platfrom illumina samplesheet.csv -profile singularity/docker/conda"
+    def command = "nextflow run CFIA-NCFAD/nf-flu --input samplesheet.csv --platform illumina -profile "
     log.info NfcoreSchema.params_help(workflow, params, json_schema, command)
     exit 0
 }
diff --git a/modules.json b/modules.json
index 4f98ec1..1e336bf 100644
--- a/modules.json
+++ b/modules.json
@@ -3,12 +3,6 @@
     "homePage": "https://github.com/CFIA-NCFAD/nf-flu",
     "repos": {
         "nf-core/modules": {
-            "blast/blastn": {
-                "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
-            },
-            "cat/fastq": {
-                "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
-            },
             "custom/dumpsoftwareversions": {
                 "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
             },
diff --git a/modules/local/blast_makeblastdb.nf b/modules/local/blast_makeblastdb.nf
index d12ca9d..3fc0884 100644
--- a/modules/local/blast_makeblastdb.nf
+++ b/modules/local/blast_makeblastdb.nf
@@ -3,11 +3,11 @@ process BLAST_MAKEBLASTDB {
     tag "$fasta"
     label 'process_low'
-    conda (params.enable_conda ? 'bioconda::blast=2.12.0' : null)
+    conda (params.enable_conda ?
'bioconda::blast=2.14.0' : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container 'https://depot.galaxyproject.org/singularity/blast:2.12.0--hf3cf87c_4' + container 'https://depot.galaxyproject.org/singularity/blast:2.14.0--h7d5a4b4_1' } else { - container 'quay.io/biocontainers/blast:2.12.0--hf3cf87c_4' + container 'quay.io/biocontainers/blast:2.14.0--h7d5a4b4_1' } input: diff --git a/modules/local/blastn.nf b/modules/local/blastn.nf new file mode 100644 index 0000000..33090e7 --- /dev/null +++ b/modules/local/blastn.nf @@ -0,0 +1,39 @@ +process BLAST_BLASTN { + tag "$meta.id" + label 'process_high' + + conda (params.enable_conda ? 'bioconda::blast=2.14.0' : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container 'https://depot.galaxyproject.org/singularity/blast:2.14.0--h7d5a4b4_1' + } else { + container 'quay.io/biocontainers/blast:2.14.0--h7d5a4b4_1' + } + + input: + tuple val(meta), path(fasta) + path db + + output: + tuple val(meta), path('*.blastn.txt'), emit: txt + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + DB=`find -L ./ -name "*.ndb" | sed 's/.ndb//'` + blastn \\ + -num_threads $task.cpus \\ + -db \$DB \\ + -query $fasta \\ + $args \\ + -out ${prefix}.blastn.txt + cat <<-END_VERSIONS > versions.yml + "${task.process}": + blast: \$(blastn -version 2>&1 | sed 's/^.*blastn: //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/local/cat_illumina_fastq.nf b/modules/local/cat_illumina_fastq.nf new file mode 100644 index 0000000..fe62e55 --- /dev/null +++ b/modules/local/cat_illumina_fastq.nf @@ -0,0 +1,102 @@ +// modified nf-core/modules CAT_FASTQ +// for paired end reads append 1:N:0:. or 2:N:0:. forward and reverse reads +// for compatability with IRMA assembly +process CAT_ILLUMINA_FASTQ { + tag "$meta.id" + label 'process_single' + + conda (params.enable_conda ? "conda-forge::perl" : null) + // use BLAST container here since it has Perl and is required by other + // processes in the pipeline + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container 'https://depot.galaxyproject.org/singularity/blast:2.14.0--h7d5a4b4_1' + } else { + container 'quay.io/biocontainers/blast:2.14.0--h7d5a4b4_1' + } + + input: + tuple val(meta), path(reads, stageAs: "input*/*") + + output: + tuple val(meta), path("*.merged.fastq.gz"), emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def readList = reads instanceof List ? 
reads.collect{ it.toString() } : [reads.toString()] + def fqList = [] + def fqgzList = [] + readList.each { + if (it ==~ /^.*\.(fastq|fq)$/) { + fqList << it + } else if (it ==~ /^.*\.(fastq|fq)\.gz$/) { + fqgzList << it + } + } + if (meta.single_end) { + if (fqList.size >= 1 || fqgzList.size >= 1) { + """ + touch ${prefix}.merged.fastq.gz + if [[ ${fqList.size} > 0 ]]; then + cat ${readList.join(' ')} | gzip -ck >> ${prefix}.merged.fastq.gz + fi + if [[ ${fqgzList.size} > 0 ]]; then + cat ${readList.join(' ')} >> ${prefix}.merged.fastq.gz + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } else { + if (readList.size >= 2) { + def read1 = [] + def read1gz = [] + def read2 = [] + def read2gz = [] + fqList.eachWithIndex{ v, ix -> ( ix & 1 ? read2 : read1 ) << v } + fqgzList.eachWithIndex{ v, ix -> ( ix & 1 ? read2gz : read1gz ) << v } + """ + # append 1:N:0:. or 2:N:0:. to forward and reverse reads if "[12]:N:.*" + # not present in the FASTQ header for compatability with IRMA assembly + touch ${prefix}_1.merged.fastq.gz + touch ${prefix}_2.merged.fastq.gz + if [[ ${read1.size} > 0 ]]; then + cat ${read1.join(' ')} \\ + | perl -ne 'if (\$_ =~ /^@.*/ && !(\$_ =~ /^@.* [12]:N:.*/)){ chomp \$_; print "\$_ 1:N:0:.\n"; } else { print "\$_"; }' \\ + | gzip -ck \\ + >> ${prefix}_1.merged.fastq.gz + fi + if [[ ${read1gz.size} > 0 ]]; then + zcat ${read1gz.join(' ')} \\ + | perl -ne 'if (\$_ =~ /^@.*/ && !(\$_ =~ /^@.* [12]:N:.*/)){ chomp \$_; print "\$_ 1:N:0:.\n"; } else { print "\$_"; }' \\ + | gzip -ck \\ + >> ${prefix}_1.merged.fastq.gz + fi + if [[ ${read2.size} > 0 ]]; then + cat ${read2.join(' ')} \\ + | perl -ne 'if (\$_ =~ /^@.*/ && !(\$_ =~ /^@.* [12]:N:.*/)){ chomp \$_; print "\$_ 2:N:0:.\n"; } else { print "\$_"; }' \\ + | gzip -ck \\ + >> ${prefix}_2.merged.fastq.gz + fi + if [[ ${read2gz.size} > 0 ]]; then + zcat ${read2gz.join(' ')} \\ + | perl -ne 'if (\$_ =~ /^@.*/ && !(\$_ =~ /^@.* [12]:N:.*/)){ chomp \$_; print "\$_ 2:N:0:.\n"; } else { print "\$_"; }' \\ + | gzip -ck \\ + >> ${prefix}_2.merged.fastq.gz + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } +} diff --git a/modules/local/pull_top_ref_id.nf b/modules/local/pull_top_ref_id.nf index 4409acf..6e9f0d7 100644 --- a/modules/local/pull_top_ref_id.nf +++ b/modules/local/pull_top_ref_id.nf @@ -2,11 +2,11 @@ process PULL_TOP_REF_ID { tag "$meta.id" label 'process_medium' - conda (params.enable_conda ? 'conda-forge::python=3.9 conda-forge::biopython=1.78 conda-forge::openpyxl=3.0.7 conda-forge::pandas=1.2.4 conda-forge::rich=10.2.2 conda-forge::typer=0.3.2 conda-forge::xlsxwriter=1.4.3' : null) + conda (params.enable_conda ? 
'conda-forge::python=3.10 conda-forge::biopython=1.80 conda-forge::openpyxl=3.1.0 conda-forge::pandas=1.5.3 conda-forge::rich=12.6.0 conda-forge::typer=0.7.0 conda-forge::xlsxwriter=3.0.8 conda-forge::polars=0.17.9 conda-forge::pyarrow=11.0.0' : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container 'https://depot.galaxyproject.org/singularity/mulled-v2-693e24f156d01a5f55647120be99929b01b30949:609c862c3470382215fc1b2d9d8a4e9637b2e25f-0' + container 'https://depot.galaxyproject.org/singularity/mulled-v2-cfa20dfeb068db79c8620a11753add64c23d013a:019cd79f70be602ca625a1a0a4eabab462611a3a-0' } else { - container 'quay.io/biocontainers/mulled-v2-693e24f156d01a5f55647120be99929b01b30949:609c862c3470382215fc1b2d9d8a4e9637b2e25f-0' + container 'quay.io/biocontainers/mulled-v2-cfa20dfeb068db79c8620a11753add64c23d013a:019cd79f70be602ca625a1a0a4eabab462611a3a-0' } input: diff --git a/modules/local/subtyping_report.nf b/modules/local/subtyping_report.nf index b6f3c8c..95280af 100644 --- a/modules/local/subtyping_report.nf +++ b/modules/local/subtyping_report.nf @@ -21,11 +21,11 @@ process SUBTYPING_REPORT { "2 GB" } } - conda (params.enable_conda ? 'conda-forge::python=3.9 conda-forge::biopython=1.78 conda-forge::openpyxl=3.0.7 conda-forge::pandas=1.2.4 conda-forge::rich=10.2.2 conda-forge::typer=0.3.2 conda-forge::xlsxwriter=1.4.3' : null) + conda (params.enable_conda ? 'conda-forge::python=3.10 conda-forge::biopython=1.80 conda-forge::openpyxl=3.1.0 conda-forge::pandas=1.5.3 conda-forge::rich=12.6.0 conda-forge::typer=0.7.0 conda-forge::xlsxwriter=3.0.8 conda-forge::polars=0.17.9 conda-forge::pyarrow=11.0.0' : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container 'https://depot.galaxyproject.org/singularity/mulled-v2-693e24f156d01a5f55647120be99929b01b30949:609c862c3470382215fc1b2d9d8a4e9637b2e25f-0' + container 'https://depot.galaxyproject.org/singularity/mulled-v2-cfa20dfeb068db79c8620a11753add64c23d013a:019cd79f70be602ca625a1a0a4eabab462611a3a-0' } else { - container 'quay.io/biocontainers/mulled-v2-693e24f156d01a5f55647120be99929b01b30949:609c862c3470382215fc1b2d9d8a4e9637b2e25f-0' + container 'quay.io/biocontainers/mulled-v2-cfa20dfeb068db79c8620a11753add64c23d013a:019cd79f70be602ca625a1a0a4eabab462611a3a-0' } input: diff --git a/modules/nf-core/modules/blast/blastn/functions.nf b/modules/nf-core/modules/blast/blastn/functions.nf deleted file mode 100644 index da9da09..0000000 --- a/modules/nf-core/modules/blast/blastn/functions.nf +++ /dev/null @@ -1,68 +0,0 @@ -// -// Utility functions used in nf-core DSL2 module files -// - -// -// Extract name of software tool from process name using $task.process -// -def getSoftwareName(task_process) { - return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() -} - -// -// Function to initialise default values and to generate a Groovy Map of available options for nf-core modules -// -def initOptions(Map args) { - def Map options = [:] - options.args = args.args ?: '' - options.args2 = args.args2 ?: '' - options.args3 = args.args3 ?: '' - options.publish_by_meta = args.publish_by_meta ?: [] - options.publish_dir = args.publish_dir ?: '' - options.publish_files = args.publish_files - options.suffix = args.suffix ?: '' - return options -} - -// -// Tidy up and join elements of a list to return a path string -// -def getPathFromList(path_list) { - def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries 
- paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes - return paths.join('/') -} - -// -// Function to save/publish module results -// -def saveFiles(Map args) { - if (!args.filename.endsWith('.version.txt')) { - def ioptions = initOptions(args.options) - def path_list = [ ioptions.publish_dir ?: args.publish_dir ] - if (ioptions.publish_by_meta) { - def key_list = ioptions.publish_by_meta instanceof List ? ioptions.publish_by_meta : args.publish_by_meta - for (key in key_list) { - if (args.meta && key instanceof String) { - def path = key - if (args.meta.containsKey(key)) { - path = args.meta[key] instanceof Boolean ? "${key}_${args.meta[key]}".toString() : args.meta[key] - } - path = path instanceof String ? path : '' - path_list.add(path) - } - } - } - if (ioptions.publish_files instanceof Map) { - for (ext in ioptions.publish_files) { - if (args.filename.endsWith(ext.key)) { - def ext_list = path_list.collect() - ext_list.add(ext.value) - return "${getPathFromList(ext_list)}/$args.filename" - } - } - } else if (ioptions.publish_files == null) { - return "${getPathFromList(path_list)}/$args.filename" - } - } -} diff --git a/modules/nf-core/modules/blast/blastn/main.nf b/modules/nf-core/modules/blast/blastn/main.nf deleted file mode 100644 index b85f6c8..0000000 --- a/modules/nf-core/modules/blast/blastn/main.nf +++ /dev/null @@ -1,37 +0,0 @@ -process BLAST_BLASTN { - tag "$meta.id" - label 'process_medium' - - conda (params.enable_conda ? 'bioconda::blast=2.12.0' : null) - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/blast:2.12.0--pl5262h3289130_0' : - 'quay.io/biocontainers/blast:2.12.0--pl5262h3289130_0' }" - - input: - tuple val(meta), path(fasta) - path db - - output: - tuple val(meta), path('*.blastn.txt'), emit: txt - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - """ - DB=`find -L ./ -name "*.ndb" | sed 's/.ndb//'` - blastn \\ - -num_threads $task.cpus \\ - -db \$DB \\ - -query $fasta \\ - $args \\ - -out ${prefix}.blastn.txt - cat <<-END_VERSIONS > versions.yml - "${task.process}": - blast: \$(blastn -version 2>&1 | sed 's/^.*blastn: //; s/ .*\$//') - END_VERSIONS - """ -} diff --git a/modules/nf-core/modules/blast/blastn/meta.yml b/modules/nf-core/modules/blast/blastn/meta.yml deleted file mode 100644 index 2742278..0000000 --- a/modules/nf-core/modules/blast/blastn/meta.yml +++ /dev/null @@ -1,41 +0,0 @@ -name: blast_blastn -description: Queries a BLAST DNA database -keywords: - - fasta - - blast - - blastn - - DNA sequence -tools: - - blast: - description: | - BLAST finds regions of similarity between biological sequences. - homepage: https://blast.ncbi.nlm.nih.gov/Blast.cgi - documentation: https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=Blastdocs - doi: 10.1016/S0022-2836(05)80360-2 - licence: ["US-Government-Work"] -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - fasta: - type: file - description: Input fasta file containing queries sequences - pattern: "*.{fa,fasta}" - - db: - type: directory - description: Directory containing blast database - pattern: "*" -output: - - txt: - type: file - description: File containing blastn hits - pattern: "*.{blastn.txt}" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - "@joseespinosa" - - "@drpatelh" diff --git a/modules/nf-core/modules/cat/fastq/functions.nf b/modules/nf-core/modules/cat/fastq/functions.nf deleted file mode 100644 index da9da09..0000000 --- a/modules/nf-core/modules/cat/fastq/functions.nf +++ /dev/null @@ -1,68 +0,0 @@ -// -// Utility functions used in nf-core DSL2 module files -// - -// -// Extract name of software tool from process name using $task.process -// -def getSoftwareName(task_process) { - return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() -} - -// -// Function to initialise default values and to generate a Groovy Map of available options for nf-core modules -// -def initOptions(Map args) { - def Map options = [:] - options.args = args.args ?: '' - options.args2 = args.args2 ?: '' - options.args3 = args.args3 ?: '' - options.publish_by_meta = args.publish_by_meta ?: [] - options.publish_dir = args.publish_dir ?: '' - options.publish_files = args.publish_files - options.suffix = args.suffix ?: '' - return options -} - -// -// Tidy up and join elements of a list to return a path string -// -def getPathFromList(path_list) { - def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries - paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes - return paths.join('/') -} - -// -// Function to save/publish module results -// -def saveFiles(Map args) { - if (!args.filename.endsWith('.version.txt')) { - def ioptions = initOptions(args.options) - def path_list = [ ioptions.publish_dir ?: args.publish_dir ] - if (ioptions.publish_by_meta) { - def key_list = ioptions.publish_by_meta instanceof List ? ioptions.publish_by_meta : args.publish_by_meta - for (key in key_list) { - if (args.meta && key instanceof String) { - def path = key - if (args.meta.containsKey(key)) { - path = args.meta[key] instanceof Boolean ? "${key}_${args.meta[key]}".toString() : args.meta[key] - } - path = path instanceof String ? path : '' - path_list.add(path) - } - } - } - if (ioptions.publish_files instanceof Map) { - for (ext in ioptions.publish_files) { - if (args.filename.endsWith(ext.key)) { - def ext_list = path_list.collect() - ext_list.add(ext.value) - return "${getPathFromList(ext_list)}/$args.filename" - } - } - } else if (ioptions.publish_files == null) { - return "${getPathFromList(path_list)}/$args.filename" - } - } -} diff --git a/modules/nf-core/modules/cat/fastq/main.nf b/modules/nf-core/modules/cat/fastq/main.nf deleted file mode 100644 index 4ff5f34..0000000 --- a/modules/nf-core/modules/cat/fastq/main.nf +++ /dev/null @@ -1,47 +0,0 @@ -// Import generic module functions -include { initOptions; saveFiles } from './functions' - -params.options = [:] -options = initOptions(params.options) - -process CAT_FASTQ { - tag "$meta.id" - label 'process_low' - publishDir "${params.outdir}", - mode: params.publish_dir_mode, - saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:'merged_fastq', meta:meta, publish_by_meta:['id']) } - - conda (params.enable_conda ? 
"conda-forge::sed=4.7" : null) - if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv1/biocontainers_v1.2.0_cv1.img" - } else { - container "biocontainers/biocontainers:v1.2.0_cv1" - } - - input: - tuple val(meta), path(reads) - - output: - tuple val(meta), path("*.merged.fastq.gz"), emit: reads - - script: - def prefix = options.suffix ? "${meta.id}${options.suffix}" : "${meta.id}" - def readList = reads.collect{ it.toString() } - if (meta.single_end) { - if (readList.size > 1) { - """ - cat ${readList.sort().join(' ')} > ${prefix}.merged.fastq.gz - """ - } - } else { - if (readList.size > 2) { - def read1 = [] - def read2 = [] - readList.eachWithIndex{ v, ix -> ( ix & 1 ? read2 : read1 ) << v } - """ - cat ${read1.sort().join(' ')} > ${prefix}_1.merged.fastq.gz - cat ${read2.sort().join(' ')} > ${prefix}_2.merged.fastq.gz - """ - } - } -} \ No newline at end of file diff --git a/modules/nf-core/modules/cat/fastq/meta.yml b/modules/nf-core/modules/cat/fastq/meta.yml deleted file mode 100644 index c836598..0000000 --- a/modules/nf-core/modules/cat/fastq/meta.yml +++ /dev/null @@ -1,39 +0,0 @@ -name: cat_fastq -description: Concatenates fastq files -keywords: - - fastq - - concatenate -tools: - - cat: - description: | - The cat utility reads files sequentially, writing them to the standard output. - documentation: https://www.gnu.org/software/coreutils/manual/html_node/cat-invocation.html - licence: ["GPL-3.0-or-later"] -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - reads: - type: list - description: | - List of input FastQ files to be concatenated. -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - reads: - type: file - description: Merged fastq file - pattern: "*.{merged.fastq.gz}" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - -authors: - - "@joseespinosa" - - "@drpatelh" diff --git a/nextflow.config b/nextflow.config index 5fcc40d..b3bd9bf 100644 --- a/nextflow.config +++ b/nextflow.config @@ -70,29 +70,57 @@ includeConfig 'conf/modules.config' profiles { charliecloud { - charliecloud.enabled = true + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false } conda { - params.enable_conda = true - conda.createTimeout = "120 min" - conda.enabled = true + conda.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + // Increase time available to build Conda environment + conda.createTimeout = "120 min" conda.useMamba = params.use_mamba } + mamba { + conda.enabled = true + conda.useMamba = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + // Increase time available to build Conda environment + conda.createTimeout = "120 min" + } debug { process.beforeScript = 'echo $HOSTNAME' } docker { - docker.enabled = true - // Avoid this error: - // WARNING: Your kernel does not support swap limit capabilities or the cgroup is not mounted. Memory limited without swap. 
- // Testing this in nf-core after discussion here https://github.com/nf-core/tools/pull/351 - // once this is established and works well, nextflow might implement this behavior as new default. - docker.runOptions = '-u \$(id -u):\$(id -g)' + docker.enabled = true + docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false } podman { - podman.enabled = true + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false } singularity { singularity.enabled = true singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false } slurm { includeConfig 'conf/slurm.config' @@ -125,9 +153,10 @@ manifest { description = 'Influenza A virus genome assembly pipeline' homePage = 'https://github.com/CFIA-NCFAD/nf-flu' author = 'Peter Kruczkiewicz, Hai Nguyen' - version = '3.1.6' + version = '3.2.0' nextflowVersion = '>=21.10' mainScript = 'main.nf' + doi = '10.5281/zenodo.7011213' } // Following function from https://github.com/nf-core/vipr/blob/master/nextflow.config#L88 diff --git a/workflows/illumina.nf b/workflows/illumina.nf index 0d06add..427f554 100644 --- a/workflows/illumina.nf +++ b/workflows/illumina.nf @@ -18,8 +18,8 @@ include { CHECK_SAMPLE_SHEET } from '../modules/local/check_sample_sheet' include { SUBTYPING_REPORT } from '../modules/local/subtyping_report' include { GUNZIP_NCBI_FLU_FASTA } from '../modules/local/misc' include { BLAST_MAKEBLASTDB } from '../modules/local/blast_makeblastdb' -include { BLAST_BLASTN } from '../modules/nf-core/modules/blast/blastn/main' -include { CAT_FASTQ } from '../modules/nf-core/modules/cat/fastq/main' +include { BLAST_BLASTN } from '../modules/local/blastn' +include { CAT_ILLUMINA_FASTQ } from '../modules/local/cat_illumina_fastq' //============================================================================= // Workflow Params Setup @@ -63,21 +63,16 @@ workflow ILLUMINA { [ meta, reads ] } .groupTuple(by: [0]) \ - .branch { meta, reads -> - single: reads.size() == 1 - return [ meta, reads.flatten() ] - multiple: reads.size() > 1 - return [ meta, reads.flatten() ] + .map { meta, reads -> + return [ meta, reads.flatten() ] } .set { ch_input } // Credit to nf-core/viralrecon. 
Source: https://github.com/nf-core/viralrecon/blob/a85d5969f9025409e3618d6c280ef15ce417df65/workflows/illumina.nf#L221 // Concatenate FastQ files from same sample if required - CAT_FASTQ(ch_input.multiple) - .mix(ch_input.single) - .set { ch_cat_reads } + CAT_ILLUMINA_FASTQ(ch_input) - IRMA(ch_cat_reads, irma_module) + IRMA(CAT_ILLUMINA_FASTQ.out.reads, irma_module) BLAST_BLASTN(IRMA.out.consensus, BLAST_MAKEBLASTDB.out.db) diff --git a/workflows/nanopore.nf b/workflows/nanopore.nf index 911d911..0d13d9a 100644 --- a/workflows/nanopore.nf +++ b/workflows/nanopore.nf @@ -28,9 +28,9 @@ include { CHECK_REF_FASTA } from '../modules // using modified BLAST_MAKEBLASTDB from nf-core/modules to only move/publish BLAST DB files include { BLAST_MAKEBLASTDB as BLAST_MAKEBLASTDB_NCBI } from '../modules/local/blast_makeblastdb' include { BLAST_MAKEBLASTDB as BLAST_MAKEBLASTDB_REFDB } from '../modules/local/blast_makeblastdb' -include { BLAST_BLASTN as BLAST_BLASTN_IRMA } from '../modules/nf-core/modules/blast/blastn/main' -include { BLAST_BLASTN as BLAST_BLASTN_CONSENSUS } from '../modules/nf-core/modules/blast/blastn/main' -include { BLAST_BLASTN as BLAST_BLASTN_CONSENSUS_REF_DB } from '../modules/nf-core/modules/blast/blastn/main' +include { BLAST_BLASTN as BLAST_BLASTN_IRMA } from '../modules/local/blastn' +include { BLAST_BLASTN as BLAST_BLASTN_CONSENSUS } from '../modules/local/blastn' +include { BLAST_BLASTN as BLAST_BLASTN_CONSENSUS_REF_DB } from '../modules/local/blastn' include { CUSTOM_DUMPSOFTWAREVERSIONS as SOFTWARE_VERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main' include { MULTIQC } from '../modules/local/multiqc'