From 4554989205069d9cafaf0f51656420254a1d67b7 Mon Sep 17 00:00:00 2001 From: Hai Nguyen Date: Thu, 20 Jul 2023 15:17:04 -0500 Subject: [PATCH 1/9] Assign Genotype null for IBV --- bin/parse_influenza_blast_results.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/bin/parse_influenza_blast_results.py b/bin/parse_influenza_blast_results.py index 18c4b0b..f054c62 100755 --- a/bin/parse_influenza_blast_results.py +++ b/bin/parse_influenza_blast_results.py @@ -243,6 +243,12 @@ def parse_blast_result( df_top_seg_matches = pl.concat(dfs, how="vertical") cols = pl.Series([x for x, _ in blast_results_report_columns]) df_top_seg_matches = df_top_seg_matches.select(pl.col(cols)) + df_top_seg_matches = df_top_seg_matches.with_columns( + pl.when(~pl.col("GenBank_Title").str.contains(r"^Influenza.[^BCD]*A")) + .then(pl.lit(None)) + .otherwise(pl.col("Genotype")) + .alias("Genotype") + ) subtype_results_summary = {"sample": sample_name} if not get_top_ref: is_iav = not df_top_seg_matches.select(pl.col("Genotype").is_null().all())[0, 0] From 2f3eed4e015d21dc07aa4481b8a088e10c7835da Mon Sep 17 00:00:00 2001 From: Hai Nguyen Date: Thu, 20 Jul 2023 15:44:26 -0500 Subject: [PATCH 2/9] Assign Genotype null for IBV (use if condition) --- bin/parse_influenza_blast_results.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/bin/parse_influenza_blast_results.py b/bin/parse_influenza_blast_results.py index f054c62..8ece163 100755 --- a/bin/parse_influenza_blast_results.py +++ b/bin/parse_influenza_blast_results.py @@ -243,15 +243,12 @@ def parse_blast_result( df_top_seg_matches = pl.concat(dfs, how="vertical") cols = pl.Series([x for x, _ in blast_results_report_columns]) df_top_seg_matches = df_top_seg_matches.select(pl.col(cols)) - df_top_seg_matches = df_top_seg_matches.with_columns( - pl.when(~pl.col("GenBank_Title").str.contains(r"^Influenza.[^BCD]*A")) - .then(pl.lit(None)) - .otherwise(pl.col("Genotype")) - .alias("Genotype") - ) subtype_results_summary = {"sample": sample_name} if not get_top_ref: - is_iav = not df_top_seg_matches.select(pl.col("Genotype").is_null().all())[0, 0] + is_iav = False + if not df_top_seg_matches.select(pl.col("Genotype").is_null().all())[0, 0] \ + and df_top_seg_matches.select(pl.col("GenBank_Title").str.contains(r"^Influenza.[^BCD]*A").any())[0, 0]: + is_iav = True H_results = None N_results = None if "4" in segments: From 7243e146f30d9dd2d98257a9ef3dd8607530155b Mon Sep 17 00:00:00 2001 From: Hai Nguyen Date: Thu, 20 Jul 2023 15:52:33 -0500 Subject: [PATCH 3/9] Update change log --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 54fa4fe..e1fa060 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [[3.3.1](https://github.com/CFIA-NCFAD/nf-flu/releases/tag/3.3.1)] - 2023-07-20 + +### Fixes + +Correct returned value for variable `is_iav` in `parse_influenza_blast_result.py` script (Issue [#32](https://github.com/CFIA-NCFAD/nf-flu/issues/32)) + ## [[3.3.0](https://github.com/CFIA-NCFAD/nf-flu/releases/tag/3.3.0)] - 2023-07-11 This release migrates to more recently updated Influenza virus sequences since the last update for the [NCBI Influenza DB FTP data](https://ftp.ncbi.nih.gov/genomes/INFLUENZA/) was in 2020-10-13. By default, all Orthomyxoviridae virus sequences were parsed from the daily updated NCBI Viruses [`AllNucleotide.fa`](https://ftp.ncbi.nlm.nih.gov/genomes/Viruses/AllNucleotide/) and [`AllNuclMetadata.csv.gz`](https://ftp.ncbi.nlm.nih.gov/genomes/Viruses/AllNuclMetadata/AllNuclMetadata.csv.gz) and uploaded to [Figshare](https://figshare.com/articles/dataset/2023-06-14_-_NCBI_Viruses_-_Orthomyxoviridae/23608782) as Zstd compressed files. nf-flu no longer uses the [influenza.fna.gz](https://ftp.ncbi.nih.gov/genomes/INFLUENZA/influenza.fna.gz) and [genomeset.dat.gz](https://ftp.ncbi.nih.gov/genomes/INFLUENZA/genomeset.dat.gz) files for Influenza sequences and metadata, respectively. From 93a7b6821e2cc4dcf8cc4cc88d99d6f466f3fea9 Mon Sep 17 00:00:00 2001 From: Hai Nguyen Date: Fri, 21 Jul 2023 12:51:22 -0500 Subject: [PATCH 4/9] Use Genus column to check iav or not --- CHANGELOG.md | 4 ++-- bin/parse_influenza_blast_results.py | 13 +++++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e1fa060..ab9d49d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,11 +3,11 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [[3.3.1](https://github.com/CFIA-NCFAD/nf-flu/releases/tag/3.3.1)] - 2023-07-20 +## [[3.3.1](https://github.com/CFIA-NCFAD/nf-flu/releases/tag/3.3.1)] - 2023-07-21 ### Fixes -Correct returned value for variable `is_iav` in `parse_influenza_blast_result.py` script (Issue [#32](https://github.com/CFIA-NCFAD/nf-flu/issues/32)) +This patch release fixes an IBV subtype/genotype parsing issue when generating subtyping report using the new metadata format introduced in 3.3.0 ([#32](https://github.com/CFIA-NCFAD/nf-flu/issues/32)). ## [[3.3.0](https://github.com/CFIA-NCFAD/nf-flu/releases/tag/3.3.0)] - 2023-07-11 diff --git a/bin/parse_influenza_blast_results.py b/bin/parse_influenza_blast_results.py index 8ece163..7351d5d 100755 --- a/bin/parse_influenza_blast_results.py +++ b/bin/parse_influenza_blast_results.py @@ -60,6 +60,7 @@ ("sample_segment", "Sample Genome Segment Number"), ("#Accession", "Reference NCBI Accession"), ("Genotype", "Reference Subtype"), + ("Genus", "Genus"), ("pident", "BLASTN Percent Identity"), ("length", "BLASTN Alignment Length"), ("mismatch", "BLASTN Mismatches"), @@ -245,10 +246,14 @@ def parse_blast_result( df_top_seg_matches = df_top_seg_matches.select(pl.col(cols)) subtype_results_summary = {"sample": sample_name} if not get_top_ref: - is_iav = False - if not df_top_seg_matches.select(pl.col("Genotype").is_null().all())[0, 0] \ - and df_top_seg_matches.select(pl.col("GenBank_Title").str.contains(r"^Influenza.[^BCD]*A").any())[0, 0]: - is_iav = True + df_genotype_genus = df_top_seg_matches.select(pl.col(["Genotype", "Genus"])) + df_genotype_genus = df_genotype_genus.with_columns( + pl.when(pl.col("Genus") == "Alphainfluenzavirus") + .then(pl.col("Genotype")) + .otherwise(pl.lit(None)) #Genotype is null for non-IAV + .alias("Genotype") + ) + is_iav = not df_genotype_genus.select(pl.col("Genotype").is_null().all())[0, 0] H_results = None N_results = None if "4" in segments: From f3772cd26d4ada0961e61e4de50f68ae84344c7a Mon Sep 17 00:00:00 2001 From: Peter Kruczkiewicz Date: Thu, 3 Aug 2023 17:23:10 -0500 Subject: [PATCH 5/9] Fix IAV check based off Genotype and Genus metadata check --- bin/parse_influenza_blast_results.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/bin/parse_influenza_blast_results.py b/bin/parse_influenza_blast_results.py index 7351d5d..389e4b1 100755 --- a/bin/parse_influenza_blast_results.py +++ b/bin/parse_influenza_blast_results.py @@ -247,13 +247,17 @@ def parse_blast_result( subtype_results_summary = {"sample": sample_name} if not get_top_ref: df_genotype_genus = df_top_seg_matches.select(pl.col(["Genotype", "Genus"])) + # where the genus is not IAV, set the genotype to "Not IAV" df_genotype_genus = df_genotype_genus.with_columns( pl.when(pl.col("Genus") == "Alphainfluenzavirus") .then(pl.col("Genotype")) - .otherwise(pl.lit(None)) #Genotype is null for non-IAV + .otherwise(pl.lit("Not IAV")) .alias("Genotype") ) - is_iav = not df_genotype_genus.select(pl.col("Genotype").is_null().all())[0, 0] + genotypes = df_genotype_genus["Genotype"] + genotype_counts = genotypes.value_counts(sort=True) + # if the top genotype is "Not IAV", then the sample is not IAV + is_iav = genotype_counts['Genotype'][0] != "Not IAV" H_results = None N_results = None if "4" in segments: @@ -298,8 +302,7 @@ def find_h_or_n_type(df_merge, seg, is_iav): "4", "6", ], "Can only determine H or N type from segments 4 or 6, respectively!" - type_name = "H_type" if seg == "4" else "N_type" - h_or_n = type_name[0] + h_or_n, type_name = ("H", "H_type") if seg == "4" else ("N", "N_type") df_segment = df_merge.filter(pl.col("sample_segment") == seg) if is_iav: type_counts = df_segment["Genotype"].value_counts(sort=True) From 93d1cb3f38f24a83e77c2e2f13ef4e421e85d639 Mon Sep 17 00:00:00 2001 From: Peter Kruczkiewicz Date: Tue, 8 Aug 2023 14:07:07 -0500 Subject: [PATCH 6/9] Update ci.yml to include IBV test data for Nanopore --- .github/workflows/ci.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0745703..8257fd5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -154,6 +154,9 @@ jobs: mkdir reads echo "Downloading ERR6359501 from EBI ENA" curl -SLk --silent ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR635/001/ERR6359501/ERR6359501.fastq.gz > reads/ERR6359501.fastq.gz + - name: Fetch IBV test seq + run: | + curl -SLk --silent https://github.com/CFIA-NCFAD/nf-test-datasets/blob/nf-flu/nanopore/fastq/SRR24826962.sampled.fastq.gz > reads/SRR24826962.fastq.gz - name: Prepare samplesheet.csv run: | echo "Subsample reads from ERR6359501.fastq.gz with seqtk to mock different runs and ways of specifying input" @@ -168,6 +171,7 @@ jobs: echo "ERR6359501-10k,$(realpath reads/ERR6359501-10k.fastq)" | tee -a samplesheet.csv echo "ERR6359501,$(realpath run1)" | tee -a samplesheet.csv echo "ERR6359501,$(realpath run2)" | tee -a samplesheet.csv + echo "SRR24826962,$(realpath reads/SRR24826962.fastq.gz)" | tee -a samplesheet.csv - name: Cache subsampled influenza.fna uses: actions/cache@v3 id: cache-influenza-fna From 31e2079590e37fd93b2d04e6e08ab5024d896250 Mon Sep 17 00:00:00 2001 From: Peter Kruczkiewicz Date: Tue, 8 Aug 2023 17:02:48 -0500 Subject: [PATCH 7/9] Update ci.yml --- .github/workflows/ci.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8257fd5..726df7a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -157,6 +157,11 @@ jobs: - name: Fetch IBV test seq run: | curl -SLk --silent https://github.com/CFIA-NCFAD/nf-test-datasets/blob/nf-flu/nanopore/fastq/SRR24826962.sampled.fastq.gz > reads/SRR24826962.fastq.gz + - name: Check IBV data + run: | + file reads/SRR24826962.fastq.gz + md5sum reads/SRR24826962.fastq.gz + sha256sum reads/SRR24826962.fastq.gz - name: Prepare samplesheet.csv run: | echo "Subsample reads from ERR6359501.fastq.gz with seqtk to mock different runs and ways of specifying input" From f12bc21d3e3979c51be8660ed1fce536ddf850bb Mon Sep 17 00:00:00 2001 From: Peter Kruczkiewicz Date: Tue, 8 Aug 2023 17:04:03 -0500 Subject: [PATCH 8/9] Update ci.yml --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 726df7a..39db619 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -156,7 +156,7 @@ jobs: curl -SLk --silent ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR635/001/ERR6359501/ERR6359501.fastq.gz > reads/ERR6359501.fastq.gz - name: Fetch IBV test seq run: | - curl -SLk --silent https://github.com/CFIA-NCFAD/nf-test-datasets/blob/nf-flu/nanopore/fastq/SRR24826962.sampled.fastq.gz > reads/SRR24826962.fastq.gz + curl -SLk --silent https://github.com/CFIA-NCFAD/nf-test-datasets/raw/nf-flu/nanopore/fastq/SRR24826962.sampled.fastq.gz > reads/SRR24826962.fastq.gz - name: Check IBV data run: | file reads/SRR24826962.fastq.gz From e9eb32a1c4f56f0338dbad6e2e544d2d5f45e4b9 Mon Sep 17 00:00:00 2001 From: Peter Kruczkiewicz Date: Wed, 9 Aug 2023 09:06:52 -0500 Subject: [PATCH 9/9] Update nextflow.config; bump to v3.3.2 --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index a0f26f0..7e3027e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -151,7 +151,7 @@ manifest { description = 'Influenza A virus genome assembly pipeline' homePage = 'https://github.com/CFIA-NCFAD/nf-flu' author = 'Peter Kruczkiewicz, Hai Nguyen' - version = '3.3.1' + version = '3.3.2' nextflowVersion = '!>=22.10.1' mainScript = 'main.nf' doi = '10.5281/zenodo.7011213'