From fbf11860550c33628e4f69c4accaa02517c2fa25 Mon Sep 17 00:00:00 2001 From: Hai Nguyen Date: Fri, 28 Apr 2023 11:25:03 -0500 Subject: [PATCH 1/7] Fixed potential bug: str.match handles lower case of subtype --- bin/parse_influenza_blast_results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/parse_influenza_blast_results.py b/bin/parse_influenza_blast_results.py index f09a1c3..eea868b 100755 --- a/bin/parse_influenza_blast_results.py +++ b/bin/parse_influenza_blast_results.py @@ -310,7 +310,7 @@ def find_h_or_n_type(df_merge, seg): f"{h_or_n}{top_type} n={top_type_count}/{total_count} ({top_type_count / total_count:.1%})" ) type_mask = df_segment.subtype.str.match( - r".*" + h_or_n + top_type + r".*", na=False + r".*" + h_or_n + top_type + r".*", na=False, flags=re.IGNORECASE ) type_mask[pd.isnull(type_mask)] = False df_seg_top_type = df_segment[type_mask] From 9d9529e3887c6bf8d6841e69083040cb9ba5ba03 Mon Sep 17 00:00:00 2001 From: Hai Nguyen Date: Tue, 2 May 2023 11:23:14 -0500 Subject: [PATCH 2/7] Enhancements: add IBV support --- bin/parse_influenza_blast_results.py | 350 +++++++++++++-------------- modules/local/pull_top_ref_id.nf | 6 +- modules/local/subtyping_report.nf | 6 +- 3 files changed, 181 insertions(+), 181 deletions(-) diff --git a/bin/parse_influenza_blast_results.py b/bin/parse_influenza_blast_results.py index eea868b..90881b7 100755 --- a/bin/parse_influenza_blast_results.py +++ b/bin/parse_influenza_blast_results.py @@ -12,17 +12,19 @@ import re import logging from collections import defaultdict -from multiprocessing import Pool +import multiprocessing import click import pandas as pd import numpy as np -from rich.console import Console +import polars as pl from rich.logging import RichHandler LOG_FORMAT = "%(asctime)s %(levelname)s: %(message)s [in %(filename)s:%(lineno)d]" logging.basicConfig(format=LOG_FORMAT, level=logging.INFO) +pl.enable_string_cache(True) + influenza_segment = { 1: "1_PB2", 2: "2_PB1", @@ -36,21 +38,21 @@ # Column names/types/final report names blast_cols = [ - ("qaccver", "category"), + ("qaccver", str), ("saccver", str), ("pident", float), - ("length", "uint16"), - ("mismatch", "uint16"), - ("gapopen", "uint16"), - ("qstart", "uint16"), - ("qend", "uint16"), - ("sstart", "uint16"), - ("send", "uint16"), - ("evalue", np.float16), - ("bitscore", np.float16), - ("qlen", "uint16"), - ("slen", "uint16"), - ("qcovs", np.float16), + ("length", pl.UInt16), + ("mismatch", pl.UInt16), + ("gapopen", pl.UInt16), + ("qstart", pl.UInt16), + ("qend", pl.UInt16), + ("sstart", pl.UInt16), + ("send", pl.UInt16), + ("evalue", pl.Float32), + ("bitscore", pl.Float32), + ("qlen", pl.UInt16), + ("slen", pl.UInt16), + ("qcovs", pl.Float32), ("stitle", str), ] @@ -181,85 +183,83 @@ def parse_blast_result( blast_result: str, - df_metadata: pd.DataFrame, + df_metadata: pl.DataFrame, regex_subtype_pattern: str, + get_top_ref: bool, top: int = 3, pident_threshold: float = 0.85, min_aln_length: int = 50, ) -> Optional[ - Tuple[pd.DataFrame, Optional[pd.DataFrame], Optional[pd.DataFrame], Dict] + Tuple[pl.DataFrame, Optional[pl.DataFrame], Optional[pl.DataFrame], Dict] ]: logging.info(f"Parsing BLAST results from {blast_result}") - df = pd.read_csv( + df_filtered = pl.scan_csv( blast_result, - sep="\t", - names=[name for name, coltype in blast_cols], - dtype={name: coltype for name, coltype in blast_cols}, - ) - if df.empty: - logging.error(f"No BLASTN results in {blast_result}!") - return - # Assuming that all BLAST result entries have the same 
sample name so - # extracting the sample name from the first entry qaccver field - sample_name: str = re.sub(r"^(.+)_\d$", r"\1", df["qaccver"][0]) - logging.info(f"Parsed {df.shape[0]} BLAST results from {blast_result}") - logging.info( - f"{sample_name} | n={df.shape[0]} | Filtering for hits above {pident_threshold}% identity." - ) - df_filtered = df[ - (df["pident"] >= (pident_threshold * 100)) & (df["length"] >= min_aln_length) - ] + has_header=False, + separator="\t", + new_columns=[name for name, coltype in blast_cols], + dtypes={name: coltype for name, coltype in blast_cols}, + ).filter((pl.col("pident") >= (pident_threshold * 100)) & + (pl.col("length") >= min_aln_length)).collect(streaming=True) + + sample_name: str = re.sub(r"^(.+)_\d$", r"\1", df_filtered["qaccver"][0]) logging.info( f"{sample_name} | n={df_filtered.shape[0]} | Filtered for hits above {pident_threshold}% identity." + f"and Min Alignment length > {min_aln_length}" ) - # Sequences header has been corrected by GUNZIP Process - # zcat $archive | sed -E 's/^>gi\\|[0-9]+\\|gb\\|(\\w+)\\|(.*)/>\\1 \\2/' > influenza.fna - df_filtered["accession"] = df_filtered.saccver.str.strip() - df_filtered["sample"] = sample_name - df_filtered["sample"] = pd.Categorical(df_filtered["sample"]) - df_filtered["sample_segment"] = df_filtered.qaccver.str.extract(r".+_(\d)$").astype( - "category" - ) - df_filtered["sample_segment"] = pd.Categorical(df_filtered["sample_segment"]) - df_filtered["subtype_from_match_title"] = ( - df_filtered["stitle"].str.extract(regex_subtype_pattern).astype("category") - ) + df_filtered = df_filtered.with_columns([ + pl.col('saccver').str.strip().alias("accession"), + pl.lit(sample_name, dtype=pl.Categorical).alias("sample"), + pl.col('qaccver').str.extract(r".+_(\d)$").cast(pl.Categorical).alias("sample_segment"), + pl.col("stitle").str.extract(regex_subtype_pattern).alias("subtype_from_match_title").cast(pl.Categorical) + ]) logging.info( f"{sample_name} | Merging NCBI Influenza DB genome metadata with BLAST results on accession." 
) - df_merge = pd.merge(df_filtered, df_metadata, on="accession", how="left") + df_merge = df_filtered.join(df_metadata, on="accession", how="left") del df_filtered del df_metadata - df_merge["subtype"] = df_merge["subtype"].combine_first(df_merge["subtype_from_match_title"]) - df_merge = df_merge.sort_values( - by=["sample_segment", "bitscore"], ascending=[True, False] - ).set_index("sample_segment") - subtype_results_summary = {} - H_results = None - N_results = None - if "4" in df_merge.index: - H_results = find_h_or_n_type(df_merge, "4") - subtype_results_summary.update(H_results) - if "6" in df_merge.index: - N_results = find_h_or_n_type(df_merge, "6") - subtype_results_summary.update(N_results) - - subtype_results_summary["sample"] = sample_name - subtype_results_summary["subtype"] = get_subtype_value(H_results, N_results) + df_merge = df_merge.with_columns( + pl.when(pl.col("subtype").is_null()) + .then(pl.col("subtype_from_match_title")) + .otherwise(pl.col("subtype")) + .alias("subtype") + ) + df_merge = df_merge.sort( + by=["sample_segment", "bitscore"], descending=[False, True] + ) + + segments = df_merge["sample_segment"].unique().sort() dfs = [] - segments = df_merge.index.unique() for seg in segments: - dfs.append(df_merge.loc[seg, :].head(top).reset_index()) - df_top_seg_matches = pd.concat(dfs) - cols = pd.Series([x for x, _ in blast_results_report_columns]) - cols = cols[cols.isin(df_top_seg_matches.columns)] - df_top_seg_matches = df_top_seg_matches[cols] + dfs.append(df_merge.filter(pl.col("sample_segment") == seg).head(top)) + df_top_seg_matches = pl.concat(dfs, how="vertical") + cols = pl.Series([x for x, _ in blast_results_report_columns]) + df_top_seg_matches = df_top_seg_matches.select(pl.col(cols)) + subtype_results_summary = {"sample": sample_name} + if not get_top_ref: + is_iav = True + if df_top_seg_matches.select(pl.col("subtype").is_null().all())[0, 0]: + is_iav = False + H_results = None + N_results = None + if "4" in segments: + H_results = find_h_or_n_type(df_merge, "4", is_iav) + subtype_results_summary.update(H_results) + if "6" in segments: + N_results = find_h_or_n_type(df_merge, "6", is_iav) + subtype_results_summary.update(N_results) + subtype_results_summary["subtype"] = get_subtype_value(H_results, N_results, is_iav) + return df_top_seg_matches, subtype_results_summary -def get_subtype_value(H_results: Optional[Dict], N_results: Optional[Dict]) -> str: +def get_subtype_value(H_results: Optional[Dict], N_results: Optional[Dict], is_iav: bool) -> str: subtype = "" + if not is_iav: + subtype = "N/A" + return subtype if H_results is None and N_results is None: subtype = "-" elif H_results is not None and N_results is None: @@ -282,41 +282,50 @@ def get_subtype_value(H_results: Optional[Dict], N_results: Optional[Dict]) -> s return subtype -def find_h_or_n_type(df_merge, seg): +def find_h_or_n_type(df_merge, seg, is_iav): assert seg in [ "4", "6", ], "Can only determine H or N type from segments 4 or 6, respectively!" 
type_name = "H_type" if seg == "4" else "N_type" h_or_n = type_name[0] - df_segment = df_merge.loc[seg, :] - type_counts = df_segment.subtype.value_counts() - df_type_counts = type_counts.index.str.extract(h_or_n + r"(\d+)", flags=re.IGNORECASE) - df_type_counts.columns = [type_name] - df_type_counts["count"] = type_counts.values - df_type_counts["subtype"] = type_counts.index - df_type_counts = df_type_counts[~pd.isnull(df_type_counts[type_name])] - logging.debug(f"{df_type_counts}") - type_to_count = defaultdict(int) - for _, x in df_type_counts.iterrows(): - if pd.isna(x[type_name]): - continue - type_to_count[x[type_name]] += x["count"] - type_to_count = [(h, c) for h, c in type_to_count.items()] - type_to_count.sort(key=lambda x: x[1], reverse=True) - top_type, top_type_count = type_to_count[0] - total_count = type_counts.sum() - logging.info( - f"{h_or_n}{top_type} n={top_type_count}/{total_count} ({top_type_count / total_count:.1%})" - ) - type_mask = df_segment.subtype.str.match( - r".*" + h_or_n + top_type + r".*", na=False, flags=re.IGNORECASE - ) - type_mask[pd.isnull(type_mask)] = False - df_seg_top_type = df_segment[type_mask] - top_result: pd.Series = [r for _, r in df_seg_top_type.head(1).iterrows()][0] + reg_h_or_n_type = "" + if h_or_n == "H": + reg_h_or_n_type = "[Hh]" + else: + reg_h_or_n_type = "[Nn]" + df_segment = df_merge.filter(pl.col("sample_segment") == seg) + if is_iav: + type_counts = df_segment["subtype"].value_counts(sort=True) + type_counts = type_counts.filter(~pl.col("subtype").is_null()) + df_type_counts = type_counts.with_columns(pl.lit(type_counts["subtype"].str.extract(reg_h_or_n_type + r"(\d+)"). + alias(type_name))) + df_type_counts = df_type_counts.filter(~pl.col(type_name).is_null()) + logging.debug(f"{df_type_counts}") + type_to_count = defaultdict(int) + for x in df_type_counts.iter_rows(named=True): + type_to_count[x[type_name]] += x["counts"] + type_to_count = [(h, c) for h, c in type_to_count.items()] + type_to_count.sort(key=lambda x: x[1], reverse=True) + top_type, top_type_count = type_to_count[0] + total_count = type_counts["counts"].sum() + logging.info( + f"{h_or_n}{top_type} n={top_type_count}/{total_count} ({top_type_count / total_count:.1%})" + ) + df_segment = df_segment.with_columns( + pl.lit(df_segment["subtype"].str.contains(r".*" + reg_h_or_n_type + top_type + r".*") + .fill_null(False) + .alias("type_mask"))) + df_seg_top_type = df_segment.filter(pl.col("type_mask") == True).drop("type_mask") + top_result: pl.Series = [r for r in df_seg_top_type.head(1).iter_rows(named=True)][0] + else: + top_type = "N/A" + top_type_count = "N/A" + total_count = "N/A" + top_result: pl.Series = [r for r in df_segment.head(1).iter_rows(named=True)][0] + results_summary = { - f"{h_or_n}_type": top_type, + f"{h_or_n}_type": top_type if is_iav else "N/A", f"{h_or_n}_sample_segment_length": top_result["qlen"], f"{h_or_n}_top_pident": top_result["pident"], f"{h_or_n}_top_mismatch": top_result["mismatch"], @@ -331,7 +340,7 @@ def find_h_or_n_type(df_merge, seg): f"{h_or_n}_virus_name": top_result["virus_name"], f"{h_or_n}_NCBI_Influenza_DB_subtype_matches": top_type_count, f"{h_or_n}_NCBI_Influenza_DB_total_matches": total_count, - f"{h_or_n}_NCBI_Influenza_DB_proportion_matches": top_type_count / total_count, + f"{h_or_n}_NCBI_Influenza_DB_proportion_matches": top_type_count / total_count if is_iav else "N/A", } logging.info(f"Seg {seg} results: {results_summary}") return results_summary @@ -352,7 +361,6 @@ def find_h_or_n_type(df_merge, seg): def 
report(flu_metadata, blast_results, excel_report, top, pident_threshold, min_aln_length, threads, get_top_ref, sample_name): from rich.traceback import install - install(show_locals=True, width=120, word_wrap=True) logging.basicConfig( format="%(message)s", @@ -364,77 +372,62 @@ def report(flu_metadata, blast_results, excel_report, top, pident_threshold, logging.info(f'Parsing Influenza metadata file "{flu_metadata}"') md_cols = [ ("accession", str), - ("host", "category"), - ("segment", "category"), - ("subtype", "str"), - ("country", "category"), - ("date", "category"), - ("seq_length", "uint16"), - ("virus_name", "category"), - ("age", "category"), - ("gender", "category"), - ("group_id", "category"), + ("host", pl.Categorical), + ("segment", pl.Categorical), + ("subtype", str), + ("country", pl.Categorical), + ("date", pl.Categorical), + ("seq_length", pl.UInt16), + ("virus_name", pl.Categorical), + ("age", pl.Categorical), + ("gender", pl.Categorical), + ("group_id", pl.Categorical), ] - df_md = pd.read_csv( + df_md = pl.read_csv( flu_metadata, - sep="\t", - names=[name for name, _ in md_cols], - dtype={name: t for name, t in md_cols}, + has_header=False, + separator="\t", + new_columns=[name for name, _ in md_cols], + dtypes={name: t for name, t in md_cols}, ) - unique_subtypes = df_md.subtype.unique() - unique_subtypes = unique_subtypes[~pd.isna(unique_subtypes)] + + unique_subtypes = df_md.select("subtype").unique() + unique_subtypes = unique_subtypes.filter(~pl.col("subtype").is_null()) logging.info( - f"Parsed Influenza metadata file into DataFrame with n={df_md.shape[0]} rows and n={df_md.shape[1]} columns. There are {unique_subtypes.size} unique subtypes. " + f"Parsed Influenza metadata file into DataFrame with n={df_md.shape[0]} rows and n={df_md.shape[1]} columns. There are {len(unique_subtypes)} unique subtypes. " ) - regex_subtype_pattern = r"\((H\d+N\d+|" + "|".join(list(unique_subtypes)) + r")\)" - if threads > 1: - pool = Pool(processes=threads) - logging.info( - f"Initialized multiprocessing pool with {threads} processes. Submitting async parsing jobs." + regex_subtype_pattern = r"\((H\d+N\d+|" + "|".join(list(unique_subtypes["subtype"])) + r")\)" + results = [ + parse_blast_result(blast_result, df_md, regex_subtype_pattern, get_top_ref, top=top, + pident_threshold=pident_threshold, + min_aln_length=min_aln_length) for blast_result in blast_results] + + if not get_top_ref: + dfs_blast = [] + all_subtype_results = {} + for parsed_result in results: + if parsed_result is None: + continue + df_blast, subtype_results_summary = parsed_result + if df_blast is not None: + dfs_blast.append(df_blast.to_pandas()) + sample = subtype_results_summary["sample"] + all_subtype_results[sample] = subtype_results_summary + df_all_blast = pd.concat(dfs_blast).rename( + columns={k: v for k, v in blast_results_report_columns} ) - async_objects = [ - pool.apply_async( - parse_blast_result, - (blast_result, df_md, regex_subtype_pattern), - dict(top=top, pident_threshold=pident_threshold), - ) - for blast_result in blast_results - ] - logging.info(f"Getting async results...") - results = [x.get() for x in async_objects] - logging.info( - f'Got {len(results)} async parsing results. Merging into report "{excel_report}".' 
+ df_subtype_results = pd.DataFrame(all_subtype_results).transpose() + cols = pd.Series(subtype_results_summary_columns) + cols = cols[cols.isin(df_subtype_results.columns)] + df_subtype_predictions = df_subtype_results[cols].rename( + columns=subtype_results_summary_final_names ) - else: - results = [ - parse_blast_result(blast_result, df_md, regex_subtype_pattern, top=top, pident_threshold=pident_threshold, - min_aln_length=min_aln_length) for blast_result in blast_results] - dfs_blast = [] - all_subtype_results = {} - for parsed_result in results: - if parsed_result is None: - continue - df_blast, subtype_results_summary = parsed_result - if df_blast is not None: - dfs_blast.append(df_blast) - sample = subtype_results_summary["sample"] - all_subtype_results[sample] = subtype_results_summary - df_subtype_results = pd.DataFrame(all_subtype_results).transpose() - df_all_blast = pd.concat(dfs_blast).rename( - columns={k: v for k, v in blast_results_report_columns} - ) - cols = pd.Series(subtype_results_summary_columns) - cols = cols[cols.isin(df_subtype_results.columns)] - df_subtype_predictions = df_subtype_results[cols].rename( - columns=subtype_results_summary_final_names - ) - cols = pd.Series(columns_H_summary_results) - cols = cols[cols.isin(df_subtype_results.columns)] - df_H = df_subtype_results[cols].rename(columns=subtype_results_summary_final_names) - cols = pd.Series(columns_N_summary_results) - cols = cols[cols.isin(df_subtype_results.columns)] - df_N = df_subtype_results[cols].rename(columns=subtype_results_summary_final_names) - if not get_top_ref: + cols = pd.Series(columns_H_summary_results) + cols = cols[cols.isin(df_subtype_results.columns)] + df_H = df_subtype_results[cols].rename(columns=subtype_results_summary_final_names) + cols = pd.Series(columns_N_summary_results) + cols = cols[cols.isin(df_subtype_results.columns)] + df_N = df_subtype_results[cols].rename(columns=subtype_results_summary_final_names) # Add segment name for more informative df_all_blast["Sample Genome Segment Number"] = df_all_blast["Sample Genome Segment Number"]. \ apply(lambda x: influenza_segment[int(x)]) @@ -448,16 +441,23 @@ def report(flu_metadata, blast_results, excel_report, top, pident_threshold, output_dest=excel_report, ) else: - df_ref_id = df_all_blast[ - ['Sample', 'Sample Genome Segment Number', 'Reference NCBI Accession', 'BLASTN Bitscore', - 'Reference Sequence ID']] - df_ref_id = df_ref_id.reset_index(drop=True) - df_ref_id.loc[df_ref_id['Reference NCBI Accession'].isna(), 'Reference NCBI Accession'] = df_ref_id[ - 'Reference Sequence ID'] - df_ref_id['Reference NCBI Accession'] = df_ref_id['Reference NCBI Accession'].str.strip() - df_ref_id['Sample Genome Segment Number'] = df_ref_id['Sample Genome Segment Number']. 
\ - apply(lambda x: influenza_segment[int(x)]) - df_ref_id.to_csv(sample_name + ".topsegments.csv", header=True, index=False) + df_blast, subtype_results_summary = results[0] + df_blast = df_blast.rename( + mapping={k: v for k, v in blast_results_report_columns} + ) + df_ref_id = df_blast.select(pl.col(['Sample', 'Sample Genome Segment Number', + 'Reference NCBI Accession', 'BLASTN Bitscore', 'Reference Sequence ID'])) + df_ref_id = df_ref_id.with_columns( + pl.when(pl.col("Reference NCBI Accession").is_null()) + .then(pl.col("Reference Sequence ID")) + .otherwise(pl.col("Reference NCBI Accession")) + .str.strip() + .alias('Reference NCBI Accession') + ) + df_ref_id = df_ref_id.with_columns( + pl.col("Sample Genome Segment Number").apply(lambda x: influenza_segment[int(x)]) + .alias("Sample Genome Segment Number")) + df_ref_id.write_csv(sample_name + ".topsegments.csv", separator=",", has_header=True) def get_col_widths(df, index=False): diff --git a/modules/local/pull_top_ref_id.nf b/modules/local/pull_top_ref_id.nf index 4409acf..6e9f0d7 100644 --- a/modules/local/pull_top_ref_id.nf +++ b/modules/local/pull_top_ref_id.nf @@ -2,11 +2,11 @@ process PULL_TOP_REF_ID { tag "$meta.id" label 'process_medium' - conda (params.enable_conda ? 'conda-forge::python=3.9 conda-forge::biopython=1.78 conda-forge::openpyxl=3.0.7 conda-forge::pandas=1.2.4 conda-forge::rich=10.2.2 conda-forge::typer=0.3.2 conda-forge::xlsxwriter=1.4.3' : null) + conda (params.enable_conda ? 'conda-forge::python=3.10 conda-forge::biopython=1.80 conda-forge::openpyxl=3.1.0 conda-forge::pandas=1.5.3 conda-forge::rich=12.6.0 conda-forge::typer=0.7.0 conda-forge::xlsxwriter=3.0.8 conda-forge::polars=0.17.9 conda-forge::pyarrow=11.0.0' : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container 'https://depot.galaxyproject.org/singularity/mulled-v2-693e24f156d01a5f55647120be99929b01b30949:609c862c3470382215fc1b2d9d8a4e9637b2e25f-0' + container 'https://depot.galaxyproject.org/singularity/mulled-v2-cfa20dfeb068db79c8620a11753add64c23d013a:019cd79f70be602ca625a1a0a4eabab462611a3a-0' } else { - container 'quay.io/biocontainers/mulled-v2-693e24f156d01a5f55647120be99929b01b30949:609c862c3470382215fc1b2d9d8a4e9637b2e25f-0' + container 'quay.io/biocontainers/mulled-v2-cfa20dfeb068db79c8620a11753add64c23d013a:019cd79f70be602ca625a1a0a4eabab462611a3a-0' } input: diff --git a/modules/local/subtyping_report.nf b/modules/local/subtyping_report.nf index b6f3c8c..95280af 100644 --- a/modules/local/subtyping_report.nf +++ b/modules/local/subtyping_report.nf @@ -21,11 +21,11 @@ process SUBTYPING_REPORT { "2 GB" } } - conda (params.enable_conda ? 'conda-forge::python=3.9 conda-forge::biopython=1.78 conda-forge::openpyxl=3.0.7 conda-forge::pandas=1.2.4 conda-forge::rich=10.2.2 conda-forge::typer=0.3.2 conda-forge::xlsxwriter=1.4.3' : null) + conda (params.enable_conda ? 
'conda-forge::python=3.10 conda-forge::biopython=1.80 conda-forge::openpyxl=3.1.0 conda-forge::pandas=1.5.3 conda-forge::rich=12.6.0 conda-forge::typer=0.7.0 conda-forge::xlsxwriter=3.0.8 conda-forge::polars=0.17.9 conda-forge::pyarrow=11.0.0' : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container 'https://depot.galaxyproject.org/singularity/mulled-v2-693e24f156d01a5f55647120be99929b01b30949:609c862c3470382215fc1b2d9d8a4e9637b2e25f-0' + container 'https://depot.galaxyproject.org/singularity/mulled-v2-cfa20dfeb068db79c8620a11753add64c23d013a:019cd79f70be602ca625a1a0a4eabab462611a3a-0' } else { - container 'quay.io/biocontainers/mulled-v2-693e24f156d01a5f55647120be99929b01b30949:609c862c3470382215fc1b2d9d8a4e9637b2e25f-0' + container 'quay.io/biocontainers/mulled-v2-cfa20dfeb068db79c8620a11753add64c23d013a:019cd79f70be602ca625a1a0a4eabab462611a3a-0' } input: From 9bcafb617ba1590ebbf17c95bf3d9f5804005c38 Mon Sep 17 00:00:00 2001 From: Peter Kruczkiewicz Date: Wed, 21 Jun 2023 09:24:14 -0500 Subject: [PATCH 3/7] Update docs, nextflow.config and add branch protection GHA --- .github/PULL_REQUEST_TEMPLATE.md | 5 +- .github/workflows/branch.yml | 44 ++++++++++ .github/workflows/ci.yml | 10 ++- .github/workflows/linting_comment.yml | 27 ------ README.md | 122 +++++++++++++++++++++++--- main.nf | 2 +- nextflow.config | 51 ++++++++--- 7 files changed, 205 insertions(+), 56 deletions(-) create mode 100644 .github/workflows/branch.yml delete mode 100644 .github/workflows/linting_comment.yml diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index d6e526b..289877e 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -15,11 +15,10 @@ Learn more about contributing: [CONTRIBUTING.md](https://github.com/CFIA-NCFAD/n - [ ] This comment contains a description of changes (with reason). - [ ] If you've fixed a bug or added code that should be tested, add tests! - - [ ] If you've added a new tool - add to the software_versions process and a regex to `scrape_software_versions.py` + - [ ] If you've added a new tool - ensure that you've added the version info to a `versions.yml` and added it to the `ch_versions` channel in the workflow you added the tool to. - [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/CFIA-NCFAD/nf-flu/tree/master/.github/CONTRIBUTING.md) - [ ] If necessary, also make a PR on [the CFIA-NCFAD/nf-test-datasets repo](https://github.com/CFIA-NCFAD/nf-test-datasets/pull/new) -- [ ] Make sure your code lints (`nf-core lint .`). -- [ ] Ensure the test suite passes (`nextflow run . -profile test,docker`). +- [ ] Ensure the test suite passes (`nextflow run . -profile test_{illumina,nanopore},docker`). - [ ] Usage Documentation in `docs/usage.md` is updated. - [ ] Output Documentation in `docs/output.md` is updated. - [ ] `CHANGELOG.md` is updated. 
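An aside on the polars rewrite in PATCH 2/7 above, whose `conda-forge::polars=0.17.9` pin appears in the module environments just updated: the patch replaces the pandas `.set_index()`/`.loc`-based per-segment selection in `parse_influenza_blast_results.py` with filter/sort/head operations, and tallies H/N types from the `subtype` strings in `find_h_or_n_type()`. The following minimal sketch shows the core idea only; the toy DataFrame and its values are invented for illustration and are not from the pipeline:

```python
# Minimal sketch of the per-segment top-hit selection and H-type tally
# introduced in PATCH 2/7. Toy data; column names follow the patch.
import re
from collections import Counter

import polars as pl

df = pl.DataFrame(
    {
        "sample_segment": ["4", "4", "4", "6"],
        "subtype": ["H1N1", "H1N2", None, "H1N1"],
        "bitscore": [950.0, 900.0, 800.0, 700.0],
    }
)

# Keep the `top` best hits per segment, mirroring the patch's
# filter/sort/head loop over df_merge.
top = 2
dfs = []
for seg in df["sample_segment"].unique().sort():
    dfs.append(
        df.filter(pl.col("sample_segment") == seg)
        .sort("bitscore", descending=True)
        .head(top)
    )
df_top = pl.concat(dfs, how="vertical")

# Tally H types for segment 4 as find_h_or_n_type() does: extract
# "H<digits>" from each non-null subtype string and count occurrences.
subtypes = df.filter(pl.col("sample_segment") == "4")["subtype"].drop_nulls().to_list()
counts = Counter(
    m.group(1) for s in subtypes for m in [re.search(r"[Hh](\d+)", s)] if m
)
top_type, top_type_count = counts.most_common(1)[0]
print(f"H{top_type} n={top_type_count}/{len(subtypes)} ({top_type_count / len(subtypes):.1%})")
# -> H1 n=2/2 (100.0%)
```

The final `print` mimics the `f"{h_or_n}{top_type} n=.../..."` log line in the patch, which reports what fraction of NCBI Influenza DB matches support the predicted type.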
diff --git a/.github/workflows/branch.yml b/.github/workflows/branch.yml
new file mode 100644
index 0000000..0431835
--- /dev/null
+++ b/.github/workflows/branch.yml
@@ -0,0 +1,44 @@
+name: nf-core branch protection
+# This workflow is triggered on PRs to the master branch of the repository
+# It fails when someone tries to make a PR against the `master` branch instead of `dev`
+on:
+  pull_request_target:
+    branches: [master]
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      # PRs to the repo's `master` branch are only OK if coming from the repo's `dev` branch or any `patch` branches
+      - name: Check PRs
+        if: github.repository == 'CFIA-NCFAD/nf-flu'
+        run: |
+          { [[ ${{github.event.pull_request.head.repo.full_name }} == CFIA-NCFAD/nf-flu ]] && [[ $GITHUB_HEAD_REF = "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]]
+
+      # If the above check failed, post a comment on the PR explaining the failure
+      # NOTE - this doesn't currently work if the PR is coming from a fork, due to limitations in GitHub Actions secrets
+      - name: Post PR comment
+        if: failure()
+        uses: mshick/add-pr-comment@v1
+        with:
+          message: |
+            ## This PR is against the `master` branch :x:
+
+            * Do not close this PR
+            * Click _Edit_ and change the `base` to `dev`
+            * This CI test will remain failed until you push a new commit
+
+            ---
+
+            Hi @${{ github.event.pull_request.user.login }},
+
+            It looks like this pull request has been made against the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `master` branch.
+            The `master` branch should always contain code from the latest release.
+            Because of this, PRs to `master` are only allowed if they come from the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `dev` branch.
+
+            You do not need to close this PR, you can change the target branch to `dev` by clicking the _"Edit"_ button at the top of this page.
+            Note that even after this, the test will continue to show as failing until you push a new commit.
+
+            Thanks again for your contribution!
+ repo-token: ${{ secrets.GITHUB_TOKEN }} + allow-repeats: false \ No newline at end of file diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b4ad97c..10d33ae 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -3,14 +3,18 @@ name: CI on: push: branches: - - master - dev pull_request: - branches: - - '*' release: types: [published] +env: + NXF_ANSI_LOG: false + +concurrency: + group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}" + cancel-in-progress: true + jobs: test_illumina: name: Run Illumina test diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml deleted file mode 100644 index 0471add..0000000 --- a/.github/workflows/linting_comment.yml +++ /dev/null @@ -1,27 +0,0 @@ -name: nf-core linting comment -# This workflow is triggered after the linting action is complete -# It posts an automated comment to the PR, even if the PR is coming from a fork - -on: - workflow_run: - workflows: ["nf-core linting"] - -jobs: - test: - runs-on: ubuntu-latest - steps: - - name: Download lint results - uses: dawidd6/action-download-artifact@v2 - with: - workflow: linting.yml - - - name: Get PR number - id: pr_number - run: echo "::set-output name=pr_number::$(cat linting-logs/PR_number.txt)" - - - name: Post PR comment - uses: marocchino/sticky-pull-request-comment@v2 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - number: ${{ steps.pr_number.outputs.pr_number }} - path: linting-logs/lint_results.md diff --git a/README.md b/README.md index ad23ccf..13c2187 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,6 @@ -# CFIA-NCFAD/nf-flu - Influenza A Virus Genome Assembly Nextflow Workflow +# CFIA-NCFAD/nf-flu - Influenza A and B Virus Genome Assembly Nextflow Workflow + +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.7011213.svg)](https://doi.org/10.5281/zenodo.7011213) [![CI](https://github.com/CFIA-NCFAD/nf-flu/actions/workflows/ci.yml/badge.svg)](https://github.com/CFIA-NCFAD/nf-flu/actions/workflows/ci.yml) @@ -9,11 +11,10 @@ ## Introduction -**nf-flu** is a bioinformatics analysis pipeline for assembly and H/N subtyping of Influenza A virus. The pipeline supports both Illumina and Nanopore Platform. -Since Influenza is a special virus with multiple gene segments (8 segments) and there might be a reference or multiple we would want to align against, the pipeline will automatically pull top match references for each segment. -To achieve this task, the pipeline downloads Influenza database from NCBI and user could provide their own reference database. The pipline performs read mapping against each reference segment, variant calling and genome assembly. - -The pipeline is implemented in [Nextflow][] +**nf-flu** is a [Nextflow][] bioinformatics analysis pipeline for assembly and H/N subtyping of Influenza A and B viruses from Illumina or Nanopore sequencing data. +Since Influenza has a segmented genome consisting of 8 gene segments, the pipeline will automatically select the top matching reference sequence from NCBI for each gene segment based on [IRMA][] assembly and nucleotide [BLAST][] against all Influenza sequences from NCBI. +Users can also provide their own reference sequences to include in the top reference sequence selection process. +After reference sequence selection, the pipeline performs read mapping to each reference sequence, variant calling and depth-masked consensus sequence generation. ## Pipeline summary @@ -21,7 +22,7 @@ The pipeline is implemented in [Nextflow][] 2. 
Merge reads of re-sequenced samples ([`cat`](http://www.linfo.org/cat.html)) (if needed) 3. Assembly of Influenza gene segments with [IRMA][] using the built-in FLU module 4. Nucleotide [BLAST][] search against [NCBI Influenza DB][] -5. Automatically pull top match references for segments +5. Automatically select top match references for segments 6. H/N subtype prediction and Excel XLSX report generation based on BLAST results 7. Perform Variant calling and genome assembly for all segments. @@ -73,11 +74,100 @@ The nf-flu pipeline comes with: * [Usage](docs/usage.md) and * [Output](docs/output.md) documentation. -## Resources +## Resources and References + +### [BcfTools][] and [Samtools][] + +``` +Danecek, P., Bonfield, J.K., Liddle, J., Marshall, J., Ohan, V., Pollard, M.O., Whitwham, A., Keane, T., McCarthy, S.A., Davies, R.M., Li, H., 2021. Twelve years of SAMtools and BCFtools. Gigascience 10, giab008. https://doi.org/10.1093/gigascience/giab008 +``` + +### [BLAST][] Basic Local Alignment Search Tool + +``` +Altschul, S.F., Gish, W., Miller, W., Myers, E.W., Lipman, D.J., 1990. Basic local alignment search tool. J. Mol. Biol. 215, 403–410. https://doi.org/10.1016/S0022-2836(05)80360-2 +``` + +``` +Camacho, C., Coulouris, G., Avagyan, V., Ma, N., Papadopoulos, J., Bealer, K., Madden, T.L., 2009. BLAST+: architecture and applications. BMC Bioinformatics 10, 421. https://doi.org/10.1186/1471-2105-10-421 +``` + +### [Clair3][] + +``` +Zheng, Z., Li, S., Su, J., Leung, A.W.-S., Lam, T.-W., Luo, R., 2022. Symphonizing pileup and full-alignment for deep learning-based long-read variant calling. Nat Comput Sci 2, 797–803. https://doi.org/10.1038/s43588-022-00387-x +``` + +### [IRMA][] Iterative Refinement Meta-Assembler + +``` +Shepard, S.S., Meno, S., Bahl, J., Wilson, M.M., Barnes, J., Neuhaus, E., 2016. Viral deep sequencing needs an adaptive approach: IRMA, the iterative refinement meta-assembler. BMC Genomics 17, 708. https://doi.org/10.1186/s12864-016-3030-6 +``` + +### [Medaka][] + +[Medaka][] is deprecated in favour of [Clair3][] for variant calling of Nanopore data. + +### [Minimap2][] + +[Minimap2][] is used for rapid and accurate read alignment to reference sequences. + +``` +Li, H., 2018. Minimap2: pairwise alignment for nucleotide sequences. Bioinformatics 34, 3094–3100. https://doi.org/10.1093/bioinformatics/bty191 +``` + +### [Mosdepth][] + +[Mosdepth][] is used for rapid sequencing coverage calculation and summary statistics. + +``` +Pedersen, B.S., Quinlan, A.R., 2017. Mosdepth: quick coverage calculation for genomes and exomes. Bioinformatics 34, 867–868. https://doi.org/10.1093/bioinformatics/btx699 +``` + +### [MultiQC][] + +[MultiQC][] is used for generation of a single report for multiple tools. + +``` +Ewels, P., Magnusson, M., Lundin, S., Käller, M., 2016. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics 32, 3047–3048. https://doi.org/10.1093/bioinformatics/btw354 +``` + +### [NCBI Influenza Virus Resource][] + +**nf-flu** relies on publicly available Influenza sequence data from NCBI available at the [NCBI Influenza Virus Resource][], which is downloaded from the [FTP site](https://ftp.ncbi.nih.gov/genomes/INFLUENZA/). + +NCBI Influenza Virus Resource: + +``` +Bao, Y., Bolotov, P., Dernovoy, D., Kiryutin, B., Zaslavsky, L., Tatusova, T., Ostell, J., Lipman, D., 2008. The influenza virus resource at the National Center for Biotechnology Information. J Virol 82, 596–601. 
https://doi.org/10.1128/JVI.02005-07 +``` + +NCBI Influenza Virus Sequence Annotation Tool: + +``` +Bao, Y., Bolotov, P., Dernovoy, D., Kiryutin, B., Tatusova, T., 2007. FLAN: a web server for influenza virus genome annotation. Nucleic Acids Res 35, W280-284. https://doi.org/10.1093/nar/gkm354 +``` + +### [Nextflow][] + +**nf-flu** is implemented in [Nextflow][]. + +``` +Tommaso, P.D., Chatzou, M., Floden, E.W., Barja, P.P., Palumbo, E., Notredame, C., 2017. Nextflow enables reproducible computational workflows. Nat Biotechnol 35, 316–319. https://doi.org/10.1038/nbt.3820 +``` + +### [nf-core][] + +[nf-core][] is a great resource for building robust and reproducible bioinformatics pipelines. + +``` +Ewels, P.A., Peltzer, A., Fillinger, S., Patel, H., Alneberg, J., Wilm, A., Garcia, M.U., Di Tommaso, P., Nahnsen, S., 2020. The nf-core framework for community-curated bioinformatics pipelines. Nat Biotechnol 38, 276–278. https://doi.org/10.1038/s41587-020-0439-x +``` + +### [seqtk][] + +[seqtk][] is used for rapid manipulation of FASTA/Q files. Available from GitHub at [lh3/seqtk](https://github.com/lh3/seqtk) -* [NCBI Influenza FTP site](https://ftp.ncbi.nih.gov/genomes/INFLUENZA/) -* [IRMA][] Iterative Refinement Meta-Assembler - * [IRMA Publication](https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-016-3030-6) ## Credits @@ -94,3 +184,13 @@ The nf-flu pipeline was originally developed by [Peter Kruczkiewicz](https://git [Nextflow]: https://www.nextflow.io/ [Docker]: https://www.docker.com/ [Singularity]: https://www.sylabs.io/guides/3.0/user-guide/quick_start.html#quick-installation-steps +[NCBI Influenza Virus Resource]: https://www.ncbi.nlm.nih.gov/genomes/FLU/Database/nph-select.cgi?go=database +[BcfTools]: https://samtools.github.io/bcftools/ +[Samtools]: https://www.htslib.org/ +[nf-core]: https://nf-co.re/ +[Minimap2]: https://github.com/lh3/minimap2/ +[Clair3]: https://github.com/HKU-BAL/Clair3 +[Medaka]: https://github.com/nanoporetech/medaka +[Mosdepth]: https://github.com/brentp/mosdepth +[seqtk]: https://github.com/lh3/seqtk +[MultiQC]: https://multiqc.info/ \ No newline at end of file diff --git a/main.nf b/main.nf index 6cc7874..b7eb14b 100755 --- a/main.nf +++ b/main.nf @@ -5,7 +5,7 @@ nextflow.enable.dsl = 2 def json_schema = "$projectDir/nextflow_schema.json" if (params.help){ - def command = "nextflow run CFIA-NCFAD/nf-flu --input --platfrom illumina samplesheet.csv -profile singularity/docker/conda" + def command = "nextflow run CFIA-NCFAD/nf-flu --input samplesheet.csv --platfrom samplesheet.csv -profile " log.info NfcoreSchema.params_help(workflow, params, json_schema, command) exit 0 } diff --git a/nextflow.config b/nextflow.config index 5fcc40d..e203ff0 100644 --- a/nextflow.config +++ b/nextflow.config @@ -70,29 +70,57 @@ includeConfig 'conf/modules.config' profiles { charliecloud { - charliecloud.enabled = true + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false } conda { - params.enable_conda = true - conda.createTimeout = "120 min" - conda.enabled = true + conda.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + // Increase time available to build Conda environment + conda.createTimeout = "120 min" conda.useMamba = params.use_mamba } + mamba { + conda.enabled = true + conda.useMamba = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false 
+ shifter.enabled = false + charliecloud.enabled = false + // Increase time available to build Conda environment + conda.createTimeout = "120 min" + } debug { process.beforeScript = 'echo $HOSTNAME' } docker { - docker.enabled = true - // Avoid this error: - // WARNING: Your kernel does not support swap limit capabilities or the cgroup is not mounted. Memory limited without swap. - // Testing this in nf-core after discussion here https://github.com/nf-core/tools/pull/351 - // once this is established and works well, nextflow might implement this behavior as new default. - docker.runOptions = '-u \$(id -u):\$(id -g)' + docker.enabled = true + docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false } podman { - podman.enabled = true + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false } singularity { singularity.enabled = true singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false } slurm { includeConfig 'conf/slurm.config' @@ -128,6 +156,7 @@ manifest { version = '3.1.6' nextflowVersion = '>=21.10' mainScript = 'main.nf' + doi = '10.5281/zenodo.7011213' } // Following function from https://github.com/nf-core/vipr/blob/master/nextflow.config#L88 From c40d161262a0f92b8d7ea6d157b9e10bf5ecc472 Mon Sep 17 00:00:00 2001 From: Peter Kruczkiewicz Date: Wed, 21 Jun 2023 11:31:04 -0500 Subject: [PATCH 4/7] fix md lint --- README.md | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 13c2187..aee1421 100644 --- a/README.md +++ b/README.md @@ -78,29 +78,29 @@ The nf-flu pipeline comes with: ### [BcfTools][] and [Samtools][] -``` +```text Danecek, P., Bonfield, J.K., Liddle, J., Marshall, J., Ohan, V., Pollard, M.O., Whitwham, A., Keane, T., McCarthy, S.A., Davies, R.M., Li, H., 2021. Twelve years of SAMtools and BCFtools. Gigascience 10, giab008. https://doi.org/10.1093/gigascience/giab008 ``` ### [BLAST][] Basic Local Alignment Search Tool -``` +```text Altschul, S.F., Gish, W., Miller, W., Myers, E.W., Lipman, D.J., 1990. Basic local alignment search tool. J. Mol. Biol. 215, 403–410. https://doi.org/10.1016/S0022-2836(05)80360-2 ``` -``` +```text Camacho, C., Coulouris, G., Avagyan, V., Ma, N., Papadopoulos, J., Bealer, K., Madden, T.L., 2009. BLAST+: architecture and applications. BMC Bioinformatics 10, 421. https://doi.org/10.1186/1471-2105-10-421 ``` ### [Clair3][] -``` +```text Zheng, Z., Li, S., Su, J., Leung, A.W.-S., Lam, T.-W., Luo, R., 2022. Symphonizing pileup and full-alignment for deep learning-based long-read variant calling. Nat Comput Sci 2, 797–803. https://doi.org/10.1038/s43588-022-00387-x ``` ### [IRMA][] Iterative Refinement Meta-Assembler -``` +```text Shepard, S.S., Meno, S., Bahl, J., Wilson, M.M., Barnes, J., Neuhaus, E., 2016. Viral deep sequencing needs an adaptive approach: IRMA, the iterative refinement meta-assembler. BMC Genomics 17, 708. https://doi.org/10.1186/s12864-016-3030-6 ``` @@ -112,7 +112,7 @@ Shepard, S.S., Meno, S., Bahl, J., Wilson, M.M., Barnes, J., Neuhaus, E., 2016. [Minimap2][] is used for rapid and accurate read alignment to reference sequences. -``` +```text Li, H., 2018. Minimap2: pairwise alignment for nucleotide sequences. Bioinformatics 34, 3094–3100. https://doi.org/10.1093/bioinformatics/bty191 ``` @@ -120,7 +120,7 @@ Li, H., 2018. 
Minimap2: pairwise alignment for nucleotide sequences. Bioinformat
 
 [Mosdepth][] is used for rapid sequencing coverage calculation and summary statistics.
 
-```
+```text
 Pedersen, B.S., Quinlan, A.R., 2017. Mosdepth: quick coverage calculation for genomes and exomes. Bioinformatics 34, 867–868. https://doi.org/10.1093/bioinformatics/btx699
 ```
 
@@ -128,7 +128,7 @@ Pedersen, B.S., Quinlan, A.R., 2017. Mosdepth: quick coverage calculation for ge
 
 [MultiQC][] is used for generation of a single report for multiple tools.
 
-```
+```text
 Ewels, P., Magnusson, M., Lundin, S., Käller, M., 2016. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics 32, 3047–3048. https://doi.org/10.1093/bioinformatics/btw354
 ```
 
@@ -138,13 +138,13 @@ Ewels, P., Magnusson, M., Lundin, S., Käller, M., 2016. MultiQC: summarize anal
 
 NCBI Influenza Virus Resource:
 
-```
+```text
 Bao, Y., Bolotov, P., Dernovoy, D., Kiryutin, B., Zaslavsky, L., Tatusova, T., Ostell, J., Lipman, D., 2008. The influenza virus resource at the National Center for Biotechnology Information. J Virol 82, 596–601. https://doi.org/10.1128/JVI.02005-07
 ```
 
 NCBI Influenza Virus Sequence Annotation Tool:
 
-```
+```text
 Bao, Y., Bolotov, P., Dernovoy, D., Kiryutin, B., Tatusova, T., 2007. FLAN: a web server for influenza virus genome annotation. Nucleic Acids Res 35, W280-284. https://doi.org/10.1093/nar/gkm354
 ```
 
@@ -152,7 +152,7 @@ Bao, Y., Bolotov, P., Dernovoy, D., Kiryutin, B., Tatusova, T., 2007. FLAN: a we
 
 **nf-flu** is implemented in [Nextflow][].
 
-```
+```text
 Tommaso, P.D., Chatzou, M., Floden, E.W., Barja, P.P., Palumbo, E., Notredame, C., 2017. Nextflow enables reproducible computational workflows. Nat Biotechnol 35, 316–319. https://doi.org/10.1038/nbt.3820
 ```
 
@@ -160,7 +160,7 @@ Tommaso, P.D., Chatzou, M., Floden, E.W., Barja, P.P., Palumbo, E., Notredame, C
 
 [nf-core][] is a great resource for building robust and reproducible bioinformatics pipelines.
 
-```
+```text
 Ewels, P.A., Peltzer, A., Fillinger, S., Patel, H., Alneberg, J., Wilm, A., Garcia, M.U., Di Tommaso, P., Nahnsen, S., 2020. The nf-core framework for community-curated bioinformatics pipelines. Nat Biotechnol 38, 276–278. https://doi.org/10.1038/s41587-020-0439-x
 ```
 
@@ -168,7 +168,6 @@ Ewels, P.A., Peltzer, A., Fillinger, S., Patel, H., Alneberg, J., Wilm, A., Garc
 
 [seqtk][] is used for rapid manipulation of FASTA/Q files. Available from GitHub at [lh3/seqtk](https://github.com/lh3/seqtk)
 
-
 ## Credits
 
 The nf-flu pipeline was originally developed by [Peter Kruczkiewicz](https://github.com/peterk87) from [CFIA-NCFAD](https://github.com/CFIA-NCFAD); [Hai Nguyen](https://github.com/nhhaidee) extended the pipeline for Nanopore data analysis.
@@ -193,4 +192,4 @@ The nf-flu pipeline was originally developed by [Peter Kruczkiewicz](https://git [Medaka]: https://github.com/nanoporetech/medaka [Mosdepth]: https://github.com/brentp/mosdepth [seqtk]: https://github.com/lh3/seqtk -[MultiQC]: https://multiqc.info/ \ No newline at end of file +[MultiQC]: https://multiqc.info/ From 0d06252ea072aab2bcc5e600c725bff45fd3d093 Mon Sep 17 00:00:00 2001 From: Peter Kruczkiewicz Date: Wed, 21 Jun 2023 11:48:05 -0500 Subject: [PATCH 5/7] Bump to 3.2.0 --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index e203ff0..b3bd9bf 100644 --- a/nextflow.config +++ b/nextflow.config @@ -153,7 +153,7 @@ manifest { description = 'Influenza A virus genome assembly pipeline' homePage = 'https://github.com/CFIA-NCFAD/nf-flu' author = 'Peter Kruczkiewicz, Hai Nguyen' - version = '3.1.6' + version = '3.2.0' nextflowVersion = '>=21.10' mainScript = 'main.nf' doi = '10.5281/zenodo.7011213' From 50c6f6dacfe22338353084921f3487f20bfb31ce Mon Sep 17 00:00:00 2001 From: Peter Kruczkiewicz Date: Wed, 21 Jun 2023 16:58:28 -0500 Subject: [PATCH 6/7] Fixed #20; irregular Illumina paired-end reads should be more compatible with IRMA * "{1,2}:N:..." now added to paired-end reads if missing when concatenating reads * removed nf-core/modules for CAT_FASTQ and BLASTN and replaced with local modules * added better handling of mixed Gzip compressed and uncompressed Illumina FASTQs in the same analysis --- conf/modules_illumina.config | 2 +- modules.json | 6 -- modules/local/blast_makeblastdb.nf | 6 +- modules/local/blastn.nf | 39 +++++++ modules/local/cat_illumina_fastq.nf | 102 ++++++++++++++++++ .../nf-core/modules/blast/blastn/functions.nf | 68 ------------ modules/nf-core/modules/blast/blastn/main.nf | 37 ------- modules/nf-core/modules/blast/blastn/meta.yml | 41 ------- .../nf-core/modules/cat/fastq/functions.nf | 68 ------------ modules/nf-core/modules/cat/fastq/main.nf | 47 -------- modules/nf-core/modules/cat/fastq/meta.yml | 39 ------- workflows/illumina.nf | 17 ++- workflows/nanopore.nf | 6 +- 13 files changed, 154 insertions(+), 324 deletions(-) create mode 100644 modules/local/blastn.nf create mode 100644 modules/local/cat_illumina_fastq.nf delete mode 100644 modules/nf-core/modules/blast/blastn/functions.nf delete mode 100644 modules/nf-core/modules/blast/blastn/main.nf delete mode 100644 modules/nf-core/modules/blast/blastn/meta.yml delete mode 100644 modules/nf-core/modules/cat/fastq/functions.nf delete mode 100644 modules/nf-core/modules/cat/fastq/main.nf delete mode 100644 modules/nf-core/modules/cat/fastq/meta.yml diff --git a/conf/modules_illumina.config b/conf/modules_illumina.config index 104c248..d4f35d7 100644 --- a/conf/modules_illumina.config +++ b/conf/modules_illumina.config @@ -52,7 +52,7 @@ process { ] ] } - withName: 'CAT_FASTQ' { + withName: 'CAT_ILLUMINA_FASTQ' { publishDir = [ [ path: { "${params.outdir}/fastq"}, diff --git a/modules.json b/modules.json index 4f98ec1..1e336bf 100644 --- a/modules.json +++ b/modules.json @@ -3,12 +3,6 @@ "homePage": "https://github.com/CFIA-NCFAD/nf-flu", "repos": { "nf-core/modules": { - "blast/blastn": { - "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" - }, - "cat/fastq": { - "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" - }, "custom/dumpsoftwareversions": { "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" }, diff --git a/modules/local/blast_makeblastdb.nf b/modules/local/blast_makeblastdb.nf index d12ca9d..3fc0884 100644 --- 
a/modules/local/blast_makeblastdb.nf +++ b/modules/local/blast_makeblastdb.nf @@ -3,11 +3,11 @@ process BLAST_MAKEBLASTDB { tag "$fasta" label 'process_low' - conda (params.enable_conda ? 'bioconda::blast=2.12.0' : null) + conda (params.enable_conda ? 'bioconda::blast=2.14.0' : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container 'https://depot.galaxyproject.org/singularity/blast:2.12.0--hf3cf87c_4' + container 'https://depot.galaxyproject.org/singularity/blast:2.14.0--h7d5a4b4_1' } else { - container 'quay.io/biocontainers/blast:2.12.0--hf3cf87c_4' + container 'quay.io/biocontainers/blast:2.14.0--h7d5a4b4_1' } input: diff --git a/modules/local/blastn.nf b/modules/local/blastn.nf new file mode 100644 index 0000000..33090e7 --- /dev/null +++ b/modules/local/blastn.nf @@ -0,0 +1,39 @@ +process BLAST_BLASTN { + tag "$meta.id" + label 'process_high' + + conda (params.enable_conda ? 'bioconda::blast=2.14.0' : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container 'https://depot.galaxyproject.org/singularity/blast:2.14.0--h7d5a4b4_1' + } else { + container 'quay.io/biocontainers/blast:2.14.0--h7d5a4b4_1' + } + + input: + tuple val(meta), path(fasta) + path db + + output: + tuple val(meta), path('*.blastn.txt'), emit: txt + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + DB=`find -L ./ -name "*.ndb" | sed 's/.ndb//'` + blastn \\ + -num_threads $task.cpus \\ + -db \$DB \\ + -query $fasta \\ + $args \\ + -out ${prefix}.blastn.txt + cat <<-END_VERSIONS > versions.yml + "${task.process}": + blast: \$(blastn -version 2>&1 | sed 's/^.*blastn: //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/local/cat_illumina_fastq.nf b/modules/local/cat_illumina_fastq.nf new file mode 100644 index 0000000..fe62e55 --- /dev/null +++ b/modules/local/cat_illumina_fastq.nf @@ -0,0 +1,102 @@ +// modified nf-core/modules CAT_FASTQ +// for paired end reads append 1:N:0:. or 2:N:0:. forward and reverse reads +// for compatability with IRMA assembly +process CAT_ILLUMINA_FASTQ { + tag "$meta.id" + label 'process_single' + + conda (params.enable_conda ? "conda-forge::perl" : null) + // use BLAST container here since it has Perl and is required by other + // processes in the pipeline + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container 'https://depot.galaxyproject.org/singularity/blast:2.14.0--h7d5a4b4_1' + } else { + container 'quay.io/biocontainers/blast:2.14.0--h7d5a4b4_1' + } + + input: + tuple val(meta), path(reads, stageAs: "input*/*") + + output: + tuple val(meta), path("*.merged.fastq.gz"), emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def readList = reads instanceof List ? 
reads.collect{ it.toString() } : [reads.toString()] + def fqList = [] + def fqgzList = [] + readList.each { + if (it ==~ /^.*\.(fastq|fq)$/) { + fqList << it + } else if (it ==~ /^.*\.(fastq|fq)\.gz$/) { + fqgzList << it + } + } + if (meta.single_end) { + if (fqList.size >= 1 || fqgzList.size >= 1) { + """ + touch ${prefix}.merged.fastq.gz + if [[ ${fqList.size} > 0 ]]; then + cat ${readList.join(' ')} | gzip -ck >> ${prefix}.merged.fastq.gz + fi + if [[ ${fqgzList.size} > 0 ]]; then + cat ${readList.join(' ')} >> ${prefix}.merged.fastq.gz + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } else { + if (readList.size >= 2) { + def read1 = [] + def read1gz = [] + def read2 = [] + def read2gz = [] + fqList.eachWithIndex{ v, ix -> ( ix & 1 ? read2 : read1 ) << v } + fqgzList.eachWithIndex{ v, ix -> ( ix & 1 ? read2gz : read1gz ) << v } + """ + # append 1:N:0:. or 2:N:0:. to forward and reverse reads if "[12]:N:.*" + # not present in the FASTQ header for compatability with IRMA assembly + touch ${prefix}_1.merged.fastq.gz + touch ${prefix}_2.merged.fastq.gz + if [[ ${read1.size} > 0 ]]; then + cat ${read1.join(' ')} \\ + | perl -ne 'if (\$_ =~ /^@.*/ && !(\$_ =~ /^@.* [12]:N:.*/)){ chomp \$_; print "\$_ 1:N:0:.\n"; } else { print "\$_"; }' \\ + | gzip -ck \\ + >> ${prefix}_1.merged.fastq.gz + fi + if [[ ${read1gz.size} > 0 ]]; then + zcat ${read1gz.join(' ')} \\ + | perl -ne 'if (\$_ =~ /^@.*/ && !(\$_ =~ /^@.* [12]:N:.*/)){ chomp \$_; print "\$_ 1:N:0:.\n"; } else { print "\$_"; }' \\ + | gzip -ck \\ + >> ${prefix}_1.merged.fastq.gz + fi + if [[ ${read2.size} > 0 ]]; then + cat ${read2.join(' ')} \\ + | perl -ne 'if (\$_ =~ /^@.*/ && !(\$_ =~ /^@.* [12]:N:.*/)){ chomp \$_; print "\$_ 2:N:0:.\n"; } else { print "\$_"; }' \\ + | gzip -ck \\ + >> ${prefix}_2.merged.fastq.gz + fi + if [[ ${read2gz.size} > 0 ]]; then + zcat ${read2gz.join(' ')} \\ + | perl -ne 'if (\$_ =~ /^@.*/ && !(\$_ =~ /^@.* [12]:N:.*/)){ chomp \$_; print "\$_ 2:N:0:.\n"; } else { print "\$_"; }' \\ + | gzip -ck \\ + >> ${prefix}_2.merged.fastq.gz + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } +} diff --git a/modules/nf-core/modules/blast/blastn/functions.nf b/modules/nf-core/modules/blast/blastn/functions.nf deleted file mode 100644 index da9da09..0000000 --- a/modules/nf-core/modules/blast/blastn/functions.nf +++ /dev/null @@ -1,68 +0,0 @@ -// -// Utility functions used in nf-core DSL2 module files -// - -// -// Extract name of software tool from process name using $task.process -// -def getSoftwareName(task_process) { - return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() -} - -// -// Function to initialise default values and to generate a Groovy Map of available options for nf-core modules -// -def initOptions(Map args) { - def Map options = [:] - options.args = args.args ?: '' - options.args2 = args.args2 ?: '' - options.args3 = args.args3 ?: '' - options.publish_by_meta = args.publish_by_meta ?: [] - options.publish_dir = args.publish_dir ?: '' - options.publish_files = args.publish_files - options.suffix = args.suffix ?: '' - return options -} - -// -// Tidy up and join elements of a list to return a path string -// -def getPathFromList(path_list) { - def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries - paths = paths.collect { 
it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes - return paths.join('/') -} - -// -// Function to save/publish module results -// -def saveFiles(Map args) { - if (!args.filename.endsWith('.version.txt')) { - def ioptions = initOptions(args.options) - def path_list = [ ioptions.publish_dir ?: args.publish_dir ] - if (ioptions.publish_by_meta) { - def key_list = ioptions.publish_by_meta instanceof List ? ioptions.publish_by_meta : args.publish_by_meta - for (key in key_list) { - if (args.meta && key instanceof String) { - def path = key - if (args.meta.containsKey(key)) { - path = args.meta[key] instanceof Boolean ? "${key}_${args.meta[key]}".toString() : args.meta[key] - } - path = path instanceof String ? path : '' - path_list.add(path) - } - } - } - if (ioptions.publish_files instanceof Map) { - for (ext in ioptions.publish_files) { - if (args.filename.endsWith(ext.key)) { - def ext_list = path_list.collect() - ext_list.add(ext.value) - return "${getPathFromList(ext_list)}/$args.filename" - } - } - } else if (ioptions.publish_files == null) { - return "${getPathFromList(path_list)}/$args.filename" - } - } -} diff --git a/modules/nf-core/modules/blast/blastn/main.nf b/modules/nf-core/modules/blast/blastn/main.nf deleted file mode 100644 index b85f6c8..0000000 --- a/modules/nf-core/modules/blast/blastn/main.nf +++ /dev/null @@ -1,37 +0,0 @@ -process BLAST_BLASTN { - tag "$meta.id" - label 'process_medium' - - conda (params.enable_conda ? 'bioconda::blast=2.12.0' : null) - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/blast:2.12.0--pl5262h3289130_0' : - 'quay.io/biocontainers/blast:2.12.0--pl5262h3289130_0' }" - - input: - tuple val(meta), path(fasta) - path db - - output: - tuple val(meta), path('*.blastn.txt'), emit: txt - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - """ - DB=`find -L ./ -name "*.ndb" | sed 's/.ndb//'` - blastn \\ - -num_threads $task.cpus \\ - -db \$DB \\ - -query $fasta \\ - $args \\ - -out ${prefix}.blastn.txt - cat <<-END_VERSIONS > versions.yml - "${task.process}": - blast: \$(blastn -version 2>&1 | sed 's/^.*blastn: //; s/ .*\$//') - END_VERSIONS - """ -} diff --git a/modules/nf-core/modules/blast/blastn/meta.yml b/modules/nf-core/modules/blast/blastn/meta.yml deleted file mode 100644 index 2742278..0000000 --- a/modules/nf-core/modules/blast/blastn/meta.yml +++ /dev/null @@ -1,41 +0,0 @@ -name: blast_blastn -description: Queries a BLAST DNA database -keywords: - - fasta - - blast - - blastn - - DNA sequence -tools: - - blast: - description: | - BLAST finds regions of similarity between biological sequences. - homepage: https://blast.ncbi.nlm.nih.gov/Blast.cgi - documentation: https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=Blastdocs - doi: 10.1016/S0022-2836(05)80360-2 - licence: ["US-Government-Work"] -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - fasta: - type: file - description: Input fasta file containing queries sequences - pattern: "*.{fa,fasta}" - - db: - type: directory - description: Directory containing blast database - pattern: "*" -output: - - txt: - type: file - description: File containing blastn hits - pattern: "*.{blastn.txt}" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - "@joseespinosa" - - "@drpatelh" diff --git a/modules/nf-core/modules/cat/fastq/functions.nf b/modules/nf-core/modules/cat/fastq/functions.nf deleted file mode 100644 index da9da09..0000000 --- a/modules/nf-core/modules/cat/fastq/functions.nf +++ /dev/null @@ -1,68 +0,0 @@ -// -// Utility functions used in nf-core DSL2 module files -// - -// -// Extract name of software tool from process name using $task.process -// -def getSoftwareName(task_process) { - return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() -} - -// -// Function to initialise default values and to generate a Groovy Map of available options for nf-core modules -// -def initOptions(Map args) { - def Map options = [:] - options.args = args.args ?: '' - options.args2 = args.args2 ?: '' - options.args3 = args.args3 ?: '' - options.publish_by_meta = args.publish_by_meta ?: [] - options.publish_dir = args.publish_dir ?: '' - options.publish_files = args.publish_files - options.suffix = args.suffix ?: '' - return options -} - -// -// Tidy up and join elements of a list to return a path string -// -def getPathFromList(path_list) { - def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries - paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes - return paths.join('/') -} - -// -// Function to save/publish module results -// -def saveFiles(Map args) { - if (!args.filename.endsWith('.version.txt')) { - def ioptions = initOptions(args.options) - def path_list = [ ioptions.publish_dir ?: args.publish_dir ] - if (ioptions.publish_by_meta) { - def key_list = ioptions.publish_by_meta instanceof List ? ioptions.publish_by_meta : args.publish_by_meta - for (key in key_list) { - if (args.meta && key instanceof String) { - def path = key - if (args.meta.containsKey(key)) { - path = args.meta[key] instanceof Boolean ? "${key}_${args.meta[key]}".toString() : args.meta[key] - } - path = path instanceof String ? path : '' - path_list.add(path) - } - } - } - if (ioptions.publish_files instanceof Map) { - for (ext in ioptions.publish_files) { - if (args.filename.endsWith(ext.key)) { - def ext_list = path_list.collect() - ext_list.add(ext.value) - return "${getPathFromList(ext_list)}/$args.filename" - } - } - } else if (ioptions.publish_files == null) { - return "${getPathFromList(path_list)}/$args.filename" - } - } -} diff --git a/modules/nf-core/modules/cat/fastq/main.nf b/modules/nf-core/modules/cat/fastq/main.nf deleted file mode 100644 index 4ff5f34..0000000 --- a/modules/nf-core/modules/cat/fastq/main.nf +++ /dev/null @@ -1,47 +0,0 @@ -// Import generic module functions -include { initOptions; saveFiles } from './functions' - -params.options = [:] -options = initOptions(params.options) - -process CAT_FASTQ { - tag "$meta.id" - label 'process_low' - publishDir "${params.outdir}", - mode: params.publish_dir_mode, - saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:'merged_fastq', meta:meta, publish_by_meta:['id']) } - - conda (params.enable_conda ? 
"conda-forge::sed=4.7" : null) - if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv1/biocontainers_v1.2.0_cv1.img" - } else { - container "biocontainers/biocontainers:v1.2.0_cv1" - } - - input: - tuple val(meta), path(reads) - - output: - tuple val(meta), path("*.merged.fastq.gz"), emit: reads - - script: - def prefix = options.suffix ? "${meta.id}${options.suffix}" : "${meta.id}" - def readList = reads.collect{ it.toString() } - if (meta.single_end) { - if (readList.size > 1) { - """ - cat ${readList.sort().join(' ')} > ${prefix}.merged.fastq.gz - """ - } - } else { - if (readList.size > 2) { - def read1 = [] - def read2 = [] - readList.eachWithIndex{ v, ix -> ( ix & 1 ? read2 : read1 ) << v } - """ - cat ${read1.sort().join(' ')} > ${prefix}_1.merged.fastq.gz - cat ${read2.sort().join(' ')} > ${prefix}_2.merged.fastq.gz - """ - } - } -} \ No newline at end of file diff --git a/modules/nf-core/modules/cat/fastq/meta.yml b/modules/nf-core/modules/cat/fastq/meta.yml deleted file mode 100644 index c836598..0000000 --- a/modules/nf-core/modules/cat/fastq/meta.yml +++ /dev/null @@ -1,39 +0,0 @@ -name: cat_fastq -description: Concatenates fastq files -keywords: - - fastq - - concatenate -tools: - - cat: - description: | - The cat utility reads files sequentially, writing them to the standard output. - documentation: https://www.gnu.org/software/coreutils/manual/html_node/cat-invocation.html - licence: ["GPL-3.0-or-later"] -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - reads: - type: list - description: | - List of input FastQ files to be concatenated. -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - reads: - type: file - description: Merged fastq file - pattern: "*.{merged.fastq.gz}" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - -authors: - - "@joseespinosa" - - "@drpatelh" diff --git a/workflows/illumina.nf b/workflows/illumina.nf index 0d06add..427f554 100644 --- a/workflows/illumina.nf +++ b/workflows/illumina.nf @@ -18,8 +18,8 @@ include { CHECK_SAMPLE_SHEET } from '../modules/local/check_sample_sheet' include { SUBTYPING_REPORT } from '../modules/local/subtyping_report' include { GUNZIP_NCBI_FLU_FASTA } from '../modules/local/misc' include { BLAST_MAKEBLASTDB } from '../modules/local/blast_makeblastdb' -include { BLAST_BLASTN } from '../modules/nf-core/modules/blast/blastn/main' -include { CAT_FASTQ } from '../modules/nf-core/modules/cat/fastq/main' +include { BLAST_BLASTN } from '../modules/local/blastn' +include { CAT_ILLUMINA_FASTQ } from '../modules/local/cat_illumina_fastq' //============================================================================= // Workflow Params Setup @@ -63,21 +63,16 @@ workflow ILLUMINA { [ meta, reads ] } .groupTuple(by: [0]) \ - .branch { meta, reads -> - single: reads.size() == 1 - return [ meta, reads.flatten() ] - multiple: reads.size() > 1 - return [ meta, reads.flatten() ] + .map { meta, reads -> + return [ meta, reads.flatten() ] } .set { ch_input } // Credit to nf-core/viralrecon. 
    // Concatenate FastQ files from same sample if required
-    CAT_FASTQ(ch_input.multiple)
-        .mix(ch_input.single)
-        .set { ch_cat_reads }
+    CAT_ILLUMINA_FASTQ(ch_input)
 
-    IRMA(ch_cat_reads, irma_module)
+    IRMA(CAT_ILLUMINA_FASTQ.out.reads, irma_module)
 
     BLAST_BLASTN(IRMA.out.consensus, BLAST_MAKEBLASTDB.out.db)
diff --git a/workflows/nanopore.nf b/workflows/nanopore.nf
index 911d911..0d13d9a 100644
--- a/workflows/nanopore.nf
+++ b/workflows/nanopore.nf
@@ -28,9 +28,9 @@ include { CHECK_REF_FASTA } from '../modules
 // using modified BLAST_MAKEBLASTDB from nf-core/modules to only move/publish BLAST DB files
 include { BLAST_MAKEBLASTDB as BLAST_MAKEBLASTDB_NCBI } from '../modules/local/blast_makeblastdb'
 include { BLAST_MAKEBLASTDB as BLAST_MAKEBLASTDB_REFDB } from '../modules/local/blast_makeblastdb'
-include { BLAST_BLASTN as BLAST_BLASTN_IRMA } from '../modules/nf-core/modules/blast/blastn/main'
-include { BLAST_BLASTN as BLAST_BLASTN_CONSENSUS } from '../modules/nf-core/modules/blast/blastn/main'
-include { BLAST_BLASTN as BLAST_BLASTN_CONSENSUS_REF_DB } from '../modules/nf-core/modules/blast/blastn/main'
+include { BLAST_BLASTN as BLAST_BLASTN_IRMA } from '../modules/local/blastn'
+include { BLAST_BLASTN as BLAST_BLASTN_CONSENSUS } from '../modules/local/blastn'
+include { BLAST_BLASTN as BLAST_BLASTN_CONSENSUS_REF_DB } from '../modules/local/blastn'
 include { CUSTOM_DUMPSOFTWAREVERSIONS as SOFTWARE_VERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main'
 include { MULTIQC } from '../modules/local/multiqc'

From a07441c74d03afbc6e50c153cf50326d425416bc Mon Sep 17 00:00:00 2001
From: Peter Kruczkiewicz
Date: Wed, 21 Jun 2023 17:23:33 -0500
Subject: [PATCH 7/7] Added CHANGELOG entry for 3.2.0

---
 CHANGELOG.md | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 41c82e9..f2b39b5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,21 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [[3.2.0](https://github.com/CFIA-NCFAD/nf-flu/releases/tag/3.2.0)] - 2023-06-22
+
+### Added
+
+* Influenza B virus support (#14)
+* Polars for faster parsing of BLAST results (#14)
+
+### Fixed
+
+* Irregular Illumina paired-end FASTQ files not producing IRMA assemblies (#20)
+
+### Changed
+
+* Updated README.md to include references and citations
+
 ## [[3.1.6](https://github.com/CFIA-NCFAD/nf-flu/releases/tag/3.1.6)] - 2023-05-31
 
 This is a patch release for a minor change to use Biocontainers Docker and Singularity images for Clair3 to avoid hitting limits on pulls from Docker Hub and since Biocontainers images are half the size of [hkubal/clair3](https://hub.docker.com/r/hkubal/clair3/) images.
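
To make the paired-end fix under 3.2.0 (#20) concrete: the removed nf-core `CAT_FASTQ` module (deleted in full earlier in this series) partitions the incoming file list by even/odd index, which only works when reads arrive strictly interleaved as R1, R2, R1, R2, …. The sketch below is a minimal, self-contained Groovy illustration of that partitioning and of how an irregular listing silently mis-pairs files — the FASTQ file names are hypothetical examples, not taken from the pipeline:

```groovy
// Even/odd partitioning as used in the removed CAT_FASTQ module.
// Assumes the read list is strictly interleaved: [R1, R2, R1, R2, ...].
def readList = ['s1_L001_R1.fastq.gz', 's1_L001_R2.fastq.gz',
                's1_L002_R1.fastq.gz', 's1_L002_R2.fastq.gz']
def read1 = []
def read2 = []
// Even indices go to the R1 bucket, odd indices to the R2 bucket
readList.eachWithIndex { v, ix -> ( ix & 1 ? read2 : read1 ) << v }
assert read1 == ['s1_L001_R1.fastq.gz', 's1_L002_R1.fastq.gz']
assert read2 == ['s1_L001_R2.fastq.gz', 's1_L002_R2.fastq.gz']

// An "irregular" listing (e.g. all R1 files sorted before all R2 files)
// breaks the interleaving assumption and silently mis-pairs the reads:
def irregular = ['s1_L001_R1.fastq.gz', 's1_L002_R1.fastq.gz',
                 's1_L001_R2.fastq.gz', 's1_L002_R2.fastq.gz']
read1 = []
read2 = []
irregular.eachWithIndex { v, ix -> ( ix & 1 ? read2 : read1 ) << v }
assert read2[0] == 's1_L002_R1.fastq.gz' // an R1 file lands in the R2 bucket
```

Note that the reworked `workflows/illumina.nf` above also drops the single/multiple branch and passes every grouped sample through `CAT_ILLUMINA_FASTQ(ch_input)` directly, so the concatenation and pairing logic now lives in one local module rather than being split across channel branches.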