From 4554989205069d9cafaf0f51656420254a1d67b7 Mon Sep 17 00:00:00 2001
From: Hai Nguyen <nhhaidee@gmail.com>
Date: Thu, 20 Jul 2023 15:17:04 -0500
Subject: [PATCH 1/9] Assign Genotype null for IBV

---
 bin/parse_influenza_blast_results.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/bin/parse_influenza_blast_results.py b/bin/parse_influenza_blast_results.py
index 18c4b0b..f054c62 100755
--- a/bin/parse_influenza_blast_results.py
+++ b/bin/parse_influenza_blast_results.py
@@ -243,6 +243,12 @@ def parse_blast_result(
     df_top_seg_matches = pl.concat(dfs, how="vertical")
     cols = pl.Series([x for x, _ in blast_results_report_columns])
     df_top_seg_matches = df_top_seg_matches.select(pl.col(cols))
+    df_top_seg_matches = df_top_seg_matches.with_columns(
+        pl.when(~pl.col("GenBank_Title").str.contains(r"^Influenza.[^BCD]*A"))
+        .then(pl.lit(None))
+        .otherwise(pl.col("Genotype"))
+        .alias("Genotype")
+    )
     subtype_results_summary = {"sample": sample_name}
     if not get_top_ref:
         is_iav = not df_top_seg_matches.select(pl.col("Genotype").is_null().all())[0, 0]

From 2f3eed4e015d21dc07aa4481b8a088e10c7835da Mon Sep 17 00:00:00 2001
From: Hai Nguyen <nhhaidee@gmail.com>
Date: Thu, 20 Jul 2023 15:44:26 -0500
Subject: [PATCH 2/9] Assign Genotype null for IBV (use if condition)

---
 bin/parse_influenza_blast_results.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/bin/parse_influenza_blast_results.py b/bin/parse_influenza_blast_results.py
index f054c62..8ece163 100755
--- a/bin/parse_influenza_blast_results.py
+++ b/bin/parse_influenza_blast_results.py
@@ -243,15 +243,12 @@ def parse_blast_result(
     df_top_seg_matches = pl.concat(dfs, how="vertical")
     cols = pl.Series([x for x, _ in blast_results_report_columns])
     df_top_seg_matches = df_top_seg_matches.select(pl.col(cols))
-    df_top_seg_matches = df_top_seg_matches.with_columns(
-        pl.when(~pl.col("GenBank_Title").str.contains(r"^Influenza.[^BCD]*A"))
-        .then(pl.lit(None))
-        .otherwise(pl.col("Genotype"))
-        .alias("Genotype")
-    )
     subtype_results_summary = {"sample": sample_name}
     if not get_top_ref:
-        is_iav = not df_top_seg_matches.select(pl.col("Genotype").is_null().all())[0, 0]
+        is_iav = False
+        if not df_top_seg_matches.select(pl.col("Genotype").is_null().all())[0, 0] \
+            and df_top_seg_matches.select(pl.col("GenBank_Title").str.contains(r"^Influenza.[^BCD]*A").any())[0, 0]:
+            is_iav = True
         H_results = None
         N_results = None
         if "4" in segments:

From 7243e146f30d9dd2d98257a9ef3dd8607530155b Mon Sep 17 00:00:00 2001
From: Hai Nguyen <nhhaidee@gmail.com>
Date: Thu, 20 Jul 2023 15:52:33 -0500
Subject: [PATCH 3/9] Update change log

---
 CHANGELOG.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 54fa4fe..e1fa060 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,12 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [[3.3.1](https://github.com/CFIA-NCFAD/nf-flu/releases/tag/3.3.1)] - 2023-07-20
+
+### Fixes
+
+Correct returned value for variable `is_iav` in `parse_influenza_blast_result.py` script (Issue [#32](https://github.com/CFIA-NCFAD/nf-flu/issues/32))
+
 ## [[3.3.0](https://github.com/CFIA-NCFAD/nf-flu/releases/tag/3.3.0)] - 2023-07-11
 
 This release migrates to more recently updated Influenza virus sequences since the last update for the [NCBI Influenza DB FTP data](https://ftp.ncbi.nih.gov/genomes/INFLUENZA/) was in 2020-10-13. By default, all Orthomyxoviridae virus sequences were parsed from the daily updated NCBI Viruses [`AllNucleotide.fa`](https://ftp.ncbi.nlm.nih.gov/genomes/Viruses/AllNucleotide/) and [`AllNuclMetadata.csv.gz`](https://ftp.ncbi.nlm.nih.gov/genomes/Viruses/AllNuclMetadata/AllNuclMetadata.csv.gz) and uploaded to [Figshare](https://figshare.com/articles/dataset/2023-06-14_-_NCBI_Viruses_-_Orthomyxoviridae/23608782) as Zstd compressed files. nf-flu no longer uses the [influenza.fna.gz](https://ftp.ncbi.nih.gov/genomes/INFLUENZA/influenza.fna.gz) and [genomeset.dat.gz](https://ftp.ncbi.nih.gov/genomes/INFLUENZA/genomeset.dat.gz) files for Influenza sequences and metadata, respectively.

From 93a7b6821e2cc4dcf8cc4cc88d99d6f466f3fea9 Mon Sep 17 00:00:00 2001
From: Hai Nguyen <nhhaidee@gmail.com>
Date: Fri, 21 Jul 2023 12:51:22 -0500
Subject: [PATCH 4/9] Use Genus column to check iav or not

---
 CHANGELOG.md                         |  4 ++--
 bin/parse_influenza_blast_results.py | 13 +++++++++----
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e1fa060..ab9d49d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,11 +3,11 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [[3.3.1](https://github.com/CFIA-NCFAD/nf-flu/releases/tag/3.3.1)] - 2023-07-20
+## [[3.3.1](https://github.com/CFIA-NCFAD/nf-flu/releases/tag/3.3.1)] - 2023-07-21
 
 ### Fixes
 
-Correct returned value for variable `is_iav` in `parse_influenza_blast_result.py` script (Issue [#32](https://github.com/CFIA-NCFAD/nf-flu/issues/32))
+This patch release fixes an IBV subtype/genotype parsing issue when generating the subtyping report using the new metadata format introduced in 3.3.0 ([#32](https://github.com/CFIA-NCFAD/nf-flu/issues/32)).
 
 ## [[3.3.0](https://github.com/CFIA-NCFAD/nf-flu/releases/tag/3.3.0)] - 2023-07-11
 
diff --git a/bin/parse_influenza_blast_results.py b/bin/parse_influenza_blast_results.py
index 8ece163..7351d5d 100755
--- a/bin/parse_influenza_blast_results.py
+++ b/bin/parse_influenza_blast_results.py
@@ -60,6 +60,7 @@
     ("sample_segment", "Sample Genome Segment Number"),
     ("#Accession", "Reference NCBI Accession"),
     ("Genotype", "Reference Subtype"),
+    ("Genus", "Genus"),
     ("pident", "BLASTN Percent Identity"),
     ("length", "BLASTN Alignment Length"),
     ("mismatch", "BLASTN Mismatches"),
@@ -245,10 +246,14 @@ def parse_blast_result(
     df_top_seg_matches = df_top_seg_matches.select(pl.col(cols))
     subtype_results_summary = {"sample": sample_name}
     if not get_top_ref:
-        is_iav = False
-        if not df_top_seg_matches.select(pl.col("Genotype").is_null().all())[0, 0] \
-            and df_top_seg_matches.select(pl.col("GenBank_Title").str.contains(r"^Influenza.[^BCD]*A").any())[0, 0]:
-            is_iav = True
+        df_genotype_genus = df_top_seg_matches.select(pl.col(["Genotype", "Genus"]))
+        df_genotype_genus = df_genotype_genus.with_columns(
+            pl.when(pl.col("Genus") == "Alphainfluenzavirus")
+            .then(pl.col("Genotype"))
+            .otherwise(pl.lit(None)) #Genotype is null for non-IAV
+            .alias("Genotype")
+        )
+        is_iav = not df_genotype_genus.select(pl.col("Genotype").is_null().all())[0, 0]
         H_results = None
         N_results = None
         if "4" in segments:

From f3772cd26d4ada0961e61e4de50f68ae84344c7a Mon Sep 17 00:00:00 2001
From: Peter Kruczkiewicz <peter.kruczkiewicz@gmail.com>
Date: Thu, 3 Aug 2023 17:23:10 -0500
Subject: [PATCH 5/9] Fix IAV check based off Genotype and Genus metadata check

---
 bin/parse_influenza_blast_results.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/bin/parse_influenza_blast_results.py b/bin/parse_influenza_blast_results.py
index 7351d5d..389e4b1 100755
--- a/bin/parse_influenza_blast_results.py
+++ b/bin/parse_influenza_blast_results.py
@@ -247,13 +247,17 @@ def parse_blast_result(
     subtype_results_summary = {"sample": sample_name}
     if not get_top_ref:
         df_genotype_genus = df_top_seg_matches.select(pl.col(["Genotype", "Genus"]))
+        # where the genus is not IAV, set the genotype to "Not IAV"
         df_genotype_genus = df_genotype_genus.with_columns(
             pl.when(pl.col("Genus") == "Alphainfluenzavirus")
             .then(pl.col("Genotype"))
-            .otherwise(pl.lit(None)) #Genotype is null for non-IAV
+            .otherwise(pl.lit("Not IAV"))
             .alias("Genotype")
         )
-        is_iav = not df_genotype_genus.select(pl.col("Genotype").is_null().all())[0, 0]
+        genotypes = df_genotype_genus["Genotype"]
+        genotype_counts = genotypes.value_counts(sort=True)
+        # if the top genotype is "Not IAV", then the sample is not IAV
+        is_iav = genotype_counts['Genotype'][0] != "Not IAV"
         H_results = None
         N_results = None
         if "4" in segments:
@@ -298,8 +302,7 @@ def find_h_or_n_type(df_merge, seg, is_iav):
         "4",
         "6",
     ], "Can only determine H or N type from segments 4 or 6, respectively!"
-    type_name = "H_type" if seg == "4" else "N_type"
-    h_or_n = type_name[0]
+    h_or_n, type_name = ("H", "H_type") if seg == "4" else ("N", "N_type")
     df_segment = df_merge.filter(pl.col("sample_segment") == seg)
     if is_iav:
         type_counts = df_segment["Genotype"].value_counts(sort=True)

From 93d1cb3f38f24a83e77c2e2f13ef4e421e85d639 Mon Sep 17 00:00:00 2001
From: Peter Kruczkiewicz <peter.kruczkiewicz@gmail.com>
Date: Tue, 8 Aug 2023 14:07:07 -0500
Subject: [PATCH 6/9] Update ci.yml to include IBV test data for Nanopore

---
 .github/workflows/ci.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 0745703..8257fd5 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -154,6 +154,9 @@ jobs:
           mkdir reads
           echo "Downloading ERR6359501 from EBI ENA"
           curl -SLk --silent ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR635/001/ERR6359501/ERR6359501.fastq.gz > reads/ERR6359501.fastq.gz
+      - name: Fetch IBV test seq
+        run: |
+          curl -SLk --silent https://github.com/CFIA-NCFAD/nf-test-datasets/blob/nf-flu/nanopore/fastq/SRR24826962.sampled.fastq.gz > reads/SRR24826962.fastq.gz
       - name: Prepare samplesheet.csv
         run: |
           echo "Subsample reads from ERR6359501.fastq.gz with seqtk to mock different runs and ways of specifying input"
@@ -168,6 +171,7 @@ jobs:
           echo "ERR6359501-10k,$(realpath reads/ERR6359501-10k.fastq)" | tee -a samplesheet.csv
           echo "ERR6359501,$(realpath run1)" | tee -a samplesheet.csv
           echo "ERR6359501,$(realpath run2)" | tee -a samplesheet.csv
+          echo "SRR24826962,$(realpath reads/SRR24826962.fastq.gz)" | tee -a samplesheet.csv
       - name: Cache subsampled influenza.fna
         uses: actions/cache@v3
         id: cache-influenza-fna

From 31e2079590e37fd93b2d04e6e08ab5024d896250 Mon Sep 17 00:00:00 2001
From: Peter Kruczkiewicz <peter.kruczkiewicz@gmail.com>
Date: Tue, 8 Aug 2023 17:02:48 -0500
Subject: [PATCH 7/9] Update ci.yml: add step to check downloaded IBV test data

---
 .github/workflows/ci.yml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 8257fd5..726df7a 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -157,6 +157,11 @@ jobs:
       - name: Fetch IBV test seq
         run: |
           curl -SLk --silent https://github.com/CFIA-NCFAD/nf-test-datasets/blob/nf-flu/nanopore/fastq/SRR24826962.sampled.fastq.gz > reads/SRR24826962.fastq.gz
+      - name: Check IBV data
+        run: |
+          file reads/SRR24826962.fastq.gz
+          md5sum reads/SRR24826962.fastq.gz
+          sha256sum reads/SRR24826962.fastq.gz
       - name: Prepare samplesheet.csv
         run: |
           echo "Subsample reads from ERR6359501.fastq.gz with seqtk to mock different runs and ways of specifying input"

From f12bc21d3e3979c51be8660ed1fce536ddf850bb Mon Sep 17 00:00:00 2001
From: Peter Kruczkiewicz <peter.kruczkiewicz@gmail.com>
Date: Tue, 8 Aug 2023 17:04:03 -0500
Subject: [PATCH 8/9] Update ci.yml: fetch IBV test data from raw URL instead of blob URL

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 726df7a..39db619 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -156,7 +156,7 @@ jobs:
           curl -SLk --silent ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR635/001/ERR6359501/ERR6359501.fastq.gz > reads/ERR6359501.fastq.gz
       - name: Fetch IBV test seq
         run: |
-          curl -SLk --silent https://github.com/CFIA-NCFAD/nf-test-datasets/blob/nf-flu/nanopore/fastq/SRR24826962.sampled.fastq.gz > reads/SRR24826962.fastq.gz
+          curl -SLk --silent https://github.com/CFIA-NCFAD/nf-test-datasets/raw/nf-flu/nanopore/fastq/SRR24826962.sampled.fastq.gz > reads/SRR24826962.fastq.gz
       - name: Check IBV data
         run: |
           file reads/SRR24826962.fastq.gz

From e9eb32a1c4f56f0338dbad6e2e544d2d5f45e4b9 Mon Sep 17 00:00:00 2001
From: Peter Kruczkiewicz <peter.kruczkiewicz@gmail.com>
Date: Wed, 9 Aug 2023 09:06:52 -0500
Subject: [PATCH 9/9] Update nextflow.config; bump to v3.3.2

---
 nextflow.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nextflow.config b/nextflow.config
index a0f26f0..7e3027e 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -151,7 +151,7 @@ manifest {
   description     = 'Influenza A virus genome assembly pipeline'
   homePage        = 'https://github.com/CFIA-NCFAD/nf-flu'
   author          = 'Peter Kruczkiewicz, Hai Nguyen'
-  version         = '3.3.1'
+  version         = '3.3.2'
   nextflowVersion = '!>=22.10.1'
   mainScript      = 'main.nf'
   doi             = '10.5281/zenodo.7011213'