From 7a1b84bf0a476490518de2eaa8736b8d81fa3256 Mon Sep 17 00:00:00 2001 From: Peter Kruczkiewicz Date: Fri, 23 Jun 2023 07:37:13 -0500 Subject: [PATCH 1/3] Update README.md --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index aee1421..473efca 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,6 @@ # CFIA-NCFAD/nf-flu - Influenza A and B Virus Genome Assembly Nextflow Workflow -[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.7011213.svg)](https://doi.org/10.5281/zenodo.7011213) - +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.8068086.svg)](https://doi.org/10.5281/zenodo.8068086) [![CI](https://github.com/CFIA-NCFAD/nf-flu/actions/workflows/ci.yml/badge.svg)](https://github.com/CFIA-NCFAD/nf-flu/actions/workflows/ci.yml) [![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A521.04.0-23aa62.svg?labelColor=000000)](https://www.nextflow.io/) From efb0a23216619c3010c93a4cbe24b77da7a4d6dc Mon Sep 17 00:00:00 2001 From: Peter Kruczkiewicz Date: Fri, 7 Jul 2023 16:30:37 -0500 Subject: [PATCH 2/3] Fix #27 by catching NoDataError when trying to parse empty BLAST results files in parse_influenza_blast_results.py --- CHANGELOG.md | 6 ++ bin/parse_influenza_blast_results.py | 91 ++++++++++++++-------------- nextflow.config | 2 +- 3 files changed, 54 insertions(+), 45 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f2b39b5..d8833b1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [[3.2.1](https://github.com/CFIA-NCFAD/nf-flu/releases/tag/3.2.1)] - 2023-07-07 + +### Fixes + +* Empty BLAST results file parsing `NoDataError` (#27) (Thanks @MatFish for reporting this issue!) 
+ ## [[3.2.0](https://github.com/CFIA-NCFAD/nf-flu/releases/tag/3.2.0)] - 2023-06-22 ### Added diff --git a/bin/parse_influenza_blast_results.py b/bin/parse_influenza_blast_results.py index 90881b7..5608325 100755 --- a/bin/parse_influenza_blast_results.py +++ b/bin/parse_influenza_blast_results.py @@ -189,19 +189,27 @@ def parse_blast_result( top: int = 3, pident_threshold: float = 0.85, min_aln_length: int = 50, -) -> Optional[ - Tuple[pl.DataFrame, Optional[pl.DataFrame], Optional[pl.DataFrame], Dict] -]: +) -> Optional[Tuple[pl.DataFrame, Dict]]: logging.info(f"Parsing BLAST results from {blast_result}") - df_filtered = pl.scan_csv( - blast_result, - has_header=False, - separator="\t", - new_columns=[name for name, coltype in blast_cols], - dtypes={name: coltype for name, coltype in blast_cols}, - ).filter((pl.col("pident") >= (pident_threshold * 100)) & - (pl.col("length") >= min_aln_length)).collect(streaming=True) + try: + df_filtered = ( + pl.scan_csv( + blast_result, + has_header=False, + separator="\t", + new_columns=[name for name, coltype in blast_cols], + dtypes=dict(blast_cols), + ) + .filter( + (pl.col("pident") >= (pident_threshold * 100)) + & (pl.col("length") >= min_aln_length) + ) + .collect(streaming=True) + ) + except pl.exceptions.NoDataError: + logging.warning(f"No BLAST results found in {blast_result}") + return None sample_name: str = re.sub(r"^(.+)_\d$", r"\1", df_filtered["qaccver"][0]) logging.info( @@ -231,22 +239,21 @@ def parse_blast_result( ) segments = df_merge["sample_segment"].unique().sort() - dfs = [] - for seg in segments: - dfs.append(df_merge.filter(pl.col("sample_segment") == seg).head(top)) + dfs = [ + df_merge.filter(pl.col("sample_segment") == seg).head(top) + for seg in segments + ] df_top_seg_matches = pl.concat(dfs, how="vertical") cols = pl.Series([x for x, _ in blast_results_report_columns]) df_top_seg_matches = df_top_seg_matches.select(pl.col(cols)) subtype_results_summary = {"sample": sample_name} if not get_top_ref: - is_iav = True - if df_top_seg_matches.select(pl.col("subtype").is_null().all())[0, 0]: - is_iav = False + is_iav = not df_top_seg_matches.select(pl.col("subtype").is_null().all())[0, 0] H_results = None N_results = None if "4" in segments: H_results = find_h_or_n_type(df_merge, "4", is_iav) - subtype_results_summary.update(H_results) + subtype_results_summary |= H_results if "6" in segments: N_results = find_h_or_n_type(df_merge, "6", is_iav) subtype_results_summary.update(N_results) @@ -258,27 +265,26 @@ def parse_blast_result( def get_subtype_value(H_results: Optional[Dict], N_results: Optional[Dict], is_iav: bool) -> str: subtype = "" if not is_iav: - subtype = "N/A" - return subtype + return "N/A" if H_results is None and N_results is None: subtype = "-" elif H_results is not None and N_results is None: H: str = H_results.get("H_type", "") subtype = f"H{H}" if H != "" else "-" - elif H_results is None and N_results is not None: + elif H_results is None: N: str = N_results.get("N_type", "") subtype = f"N{N}" if N != "" else "-" else: H: str = H_results.get("H_type", "") N: str = N_results.get("N_type", "") - if H == "" and N == "": - subtype = "-" - else: + if H or N: if H != "": H = f"H{H}" if N != "": N = f"N{N}" subtype = f"{H}{N}" + else: + subtype = "-" return subtype @@ -289,15 +295,11 @@ def find_h_or_n_type(df_merge, seg, is_iav): ], "Can only determine H or N type from segments 4 or 6, respectively!" 
type_name = "H_type" if seg == "4" else "N_type" h_or_n = type_name[0] - reg_h_or_n_type = "" - if h_or_n == "H": - reg_h_or_n_type = "[Hh]" - else: - reg_h_or_n_type = "[Nn]" df_segment = df_merge.filter(pl.col("sample_segment") == seg) if is_iav: type_counts = df_segment["subtype"].value_counts(sort=True) type_counts = type_counts.filter(~pl.col("subtype").is_null()) + reg_h_or_n_type = "[Hh]" if h_or_n == "H" else "[Nn]" df_type_counts = type_counts.with_columns(pl.lit(type_counts["subtype"].str.extract(reg_h_or_n_type + r"(\d+)"). alias(type_name))) df_type_counts = df_type_counts.filter(~pl.col(type_name).is_null()) @@ -305,7 +307,7 @@ def find_h_or_n_type(df_merge, seg, is_iav): type_to_count = defaultdict(int) for x in df_type_counts.iter_rows(named=True): type_to_count[x[type_name]] += x["counts"] - type_to_count = [(h, c) for h, c in type_to_count.items()] + type_to_count = list(type_to_count.items()) type_to_count.sort(key=lambda x: x[1], reverse=True) top_type, top_type_count = type_to_count[0] total_count = type_counts["counts"].sum() @@ -313,16 +315,20 @@ def find_h_or_n_type(df_merge, seg, is_iav): f"{h_or_n}{top_type} n={top_type_count}/{total_count} ({top_type_count / total_count:.1%})" ) df_segment = df_segment.with_columns( - pl.lit(df_segment["subtype"].str.contains(r".*" + reg_h_or_n_type + top_type + r".*") - .fill_null(False) - .alias("type_mask"))) + pl.lit( + df_segment["subtype"] + .str.contains(f".*{reg_h_or_n_type}" + top_type + r".*") + .fill_null(False) + .alias("type_mask") + ) + ) df_seg_top_type = df_segment.filter(pl.col("type_mask") == True).drop("type_mask") - top_result: pl.Series = [r for r in df_seg_top_type.head(1).iter_rows(named=True)][0] + top_result: pl.Series = list(df_seg_top_type.head(1).iter_rows(named=True))[0] else: top_type = "N/A" top_type_count = "N/A" total_count = "N/A" - top_result: pl.Series = [r for r in df_segment.head(1).iter_rows(named=True)][0] + top_result: pl.Series = list(df_segment.head(1).iter_rows(named=True))[0] results_summary = { f"{h_or_n}_type": top_type if is_iav else "N/A", @@ -388,7 +394,7 @@ def report(flu_metadata, blast_results, excel_report, top, pident_threshold, has_header=False, separator="\t", new_columns=[name for name, _ in md_cols], - dtypes={name: t for name, t in md_cols}, + dtypes=dict(md_cols), ) unique_subtypes = df_md.select("subtype").unique() @@ -414,7 +420,7 @@ def report(flu_metadata, blast_results, excel_report, top, pident_threshold, sample = subtype_results_summary["sample"] all_subtype_results[sample] = subtype_results_summary df_all_blast = pd.concat(dfs_blast).rename( - columns={k: v for k, v in blast_results_report_columns} + columns=dict(blast_results_report_columns) ) df_subtype_results = pd.DataFrame(all_subtype_results).transpose() cols = pd.Series(subtype_results_summary_columns) @@ -442,9 +448,7 @@ def report(flu_metadata, blast_results, excel_report, top, pident_threshold, ) else: df_blast, subtype_results_summary = results[0] - df_blast = df_blast.rename( - mapping={k: v for k, v in blast_results_report_columns} - ) + df_blast = df_blast.rename(mapping=dict(blast_results_report_columns)) df_ref_id = df_blast.select(pl.col(['Sample', 'Sample Genome Segment Number', 'Reference NCBI Accession', 'BLASTN Bitscore', 'Reference Sequence ID'])) df_ref_id = df_ref_id.with_columns( @@ -463,10 +467,9 @@ def report(flu_metadata, blast_results, excel_report, top, pident_threshold, def get_col_widths(df, index=False): """Calculate column widths based on column headers and contents""" if index: 
- idx_max = max( + yield max( [len(str(s)) for s in df.index.values] + [len(str(df.index.name))] ) - yield idx_max for c in df.columns: # get max length of column contents and length of column header yield np.max([df[c].astype(str).str.len().max() + 1, len(c) + 1]) @@ -493,7 +496,7 @@ def write_excel( # fixed max number of characters in sheet name due to compatibility if sheet_name_index: max_chars = 28 - fixed_sheetname = "{}_{}".format(idx, fixed_sheetname[:max_chars]) + fixed_sheetname = f"{idx}_{fixed_sheetname[:max_chars]}" else: max_chars = 31 fixed_sheetname = fixed_sheetname[:max_chars] @@ -506,7 +509,7 @@ def write_excel( len(fixed_sheetname), ) - logging.info('Writing table to Excel sheet "{}"'.format(fixed_sheetname)) + logging.info(f'Writing table to Excel sheet "{fixed_sheetname}"') df.to_excel( writer, sheet_name=fixed_sheetname, index=False, freeze_panes=(1, 1) ) diff --git a/nextflow.config b/nextflow.config index b3bd9bf..7ff7d09 100644 --- a/nextflow.config +++ b/nextflow.config @@ -153,7 +153,7 @@ manifest { description = 'Influenza A virus genome assembly pipeline' homePage = 'https://github.com/CFIA-NCFAD/nf-flu' author = 'Peter Kruczkiewicz, Hai Nguyen' - version = '3.2.0' + version = '3.2.1' nextflowVersion = '>=21.10' mainScript = 'main.nf' doi = '10.5281/zenodo.7011213' From d223e3b19131e72f0768b5e439f429469827b3c8 Mon Sep 17 00:00:00 2001 From: Peter Kruczkiewicz Date: Mon, 10 Jul 2023 11:22:24 -0500 Subject: [PATCH 3/3] bin/parse_influenza_blast_results.py: clean up imports, reindent --- bin/parse_influenza_blast_results.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/bin/parse_influenza_blast_results.py b/bin/parse_influenza_blast_results.py index 5608325..c4cc6a5 100755 --- a/bin/parse_influenza_blast_results.py +++ b/bin/parse_influenza_blast_results.py @@ -8,15 +8,14 @@ Segment 6 - neuraminidase (NA) gene """ -from typing import Dict, List, Optional, Tuple -import re import logging +import re from collections import defaultdict -import multiprocessing +from typing import Dict, List, Optional, Tuple import click -import pandas as pd import numpy as np +import pandas as pd import polars as pl from rich.logging import RichHandler @@ -230,9 +229,9 @@ def parse_blast_result( del df_metadata df_merge = df_merge.with_columns( pl.when(pl.col("subtype").is_null()) - .then(pl.col("subtype_from_match_title")) - .otherwise(pl.col("subtype")) - .alias("subtype") + .then(pl.col("subtype_from_match_title")) + .otherwise(pl.col("subtype")) + .alias("subtype") ) df_merge = df_merge.sort( by=["sample_segment", "bitscore"], descending=[False, True] @@ -453,14 +452,14 @@ def report(flu_metadata, blast_results, excel_report, top, pident_threshold, 'Reference NCBI Accession', 'BLASTN Bitscore', 'Reference Sequence ID'])) df_ref_id = df_ref_id.with_columns( pl.when(pl.col("Reference NCBI Accession").is_null()) - .then(pl.col("Reference Sequence ID")) - .otherwise(pl.col("Reference NCBI Accession")) - .str.strip() - .alias('Reference NCBI Accession') + .then(pl.col("Reference Sequence ID")) + .otherwise(pl.col("Reference NCBI Accession")) + .str.strip() + .alias('Reference NCBI Accession') ) df_ref_id = df_ref_id.with_columns( pl.col("Sample Genome Segment Number").apply(lambda x: influenza_segment[int(x)]) - .alias("Sample Genome Segment Number")) + .alias("Sample Genome Segment Number")) df_ref_id.write_csv(sample_name + ".topsegments.csv", separator=",", has_header=True)
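
A note on the pattern behind PATCH 2/3: BLAST writes a zero-byte outfmt6 file when a sample has no hits, and polars' pl.scan_csv() raises pl.exceptions.NoDataError on empty input rather than returning an empty frame, so the fix simply catches that and returns None. Below is a minimal, self-contained sketch of the pattern; the file path, the two-column schema, and the load_blast_tsv name are all hypothetical (the real script passes the full blast_cols name/dtype schema shown in the hunk above):

import logging
from typing import Optional

import polars as pl


def load_blast_tsv(path: str, pident_threshold: float = 0.85) -> Optional[pl.DataFrame]:
    """Read a headerless BLAST tab-separated file; return None if it is empty."""
    try:
        return (
            pl.scan_csv(
                path,
                has_header=False,
                separator="\t",
                # illustrative two-column subset, not the full BLAST outfmt6 schema
                new_columns=["qaccver", "pident"],
            )
            .filter(pl.col("pident") >= pident_threshold * 100)
            .collect(streaming=True)
        )
    except pl.exceptions.NoDataError:
        # polars raises NoDataError on a zero-byte input file instead of
        # returning an empty DataFrame; treat it as "no BLAST hits"
        logging.warning(f"No BLAST results found in {path}")
        return None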
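
The H/N calling itself, in find_h_or_n_type(), is a majority vote over the subtype strings of the filtered BLAST hits for segment 4 (HA) or segment 6 (NA): extract the digits after H/h or N/n, tally them, and keep the most frequent. The same logic in isolation, on hypothetical subtype calls:

import re
from collections import defaultdict

subtype_calls = ["H5N1", "H5N2", "h5n1", "H7N9", None]  # hypothetical BLAST subtype values
counts = defaultdict(int)
for s in subtype_calls:
    if s is None:
        continue  # null subtypes are filtered out, as in the script
    m = re.search(r"[Hh](\d+)", s)  # same pattern the script builds as reg_h_or_n_type + r"(\d+)"
    if m:
        counts[m.group(1)] += 1
top_type, top_count = max(counts.items(), key=lambda kv: kv[1])
assert (top_type, top_count) == ("5", 3)  # H5 wins, 3 of the 4 non-null calls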
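
Finally, the refactor of get_subtype_value() in PATCH 2/3 keeps the original branch behaviour. A doctest-style check of the four cases, assuming bin/ is on sys.path so the module can be imported:

from parse_influenza_blast_results import get_subtype_value

# both an H call and an N call -> combined subtype
assert get_subtype_value({"H_type": "5"}, {"N_type": "1"}, is_iav=True) == "H5N1"
# only an H call (e.g. no segment 6 hits)
assert get_subtype_value({"H_type": "5"}, None, is_iav=True) == "H5"
# hits, but no typed match on either segment
assert get_subtype_value(None, None, is_iav=True) == "-"
# non-IAV (e.g. influenza B) short-circuits to "N/A"
assert get_subtype_value(None, {"N_type": "2"}, is_iav=False) == "N/A"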