From dfab8f55bdb4495d8473f85eb197c85af85cec13 Mon Sep 17 00:00:00 2001
From: Peter Kruczkiewicz <peter.kruczkiewicz@gmail.com>
Date: Fri, 7 Jul 2023 12:31:21 -0500
Subject: [PATCH 01/11] Update Influenza ref seqs DB to use all
 Orthomyxoviridae viruses from NCBI FTP site

---
 .github/workflows/ci.yml             |  46 ++-
 bin/parse_influenza_blast_results.py | 103 +++---
 conf/base.config                     |   5 +
 conf/modules_illumina.config         | 120 ++++---
 conf/modules_nanopore.config         | 500 ++++++++++++++-------------
 modules/local/misc.nf                |  34 --
 modules/local/zstd_decompress.nf     |  30 ++
 nextflow.config                      |   4 +-
 workflows/illumina.nf                |  21 +-
 workflows/nanopore.nf                |  16 +-
 10 files changed, 473 insertions(+), 406 deletions(-)
 create mode 100644 modules/local/zstd_decompress.nf

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 10d33ae..b06d54d 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -10,6 +10,10 @@ on:
 
 env:
   NXF_ANSI_LOG: false
+  # Keep these Influenza reference data URLs in sync with the default
+  # ncbi_influenza_fasta and ncbi_influenza_metadata params in nextflow.config
+  FASTA_ZST_URL: https://api.figshare.com/v2/file/download/41415330
+  CSV_ZST_URL: https://api.figshare.com/v2/file/download/41415333
 
 concurrency:
   group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}"
@@ -58,23 +62,32 @@ jobs:
           make -j2
           make install
           which seqtk
-      - name: Cache subsampled influenza.fna.gz
+      - name: Cache subsampled influenza.fna
         uses: actions/cache@v3
         id: cache-influenza-fna
         with:
-          path: influenza-10k.fna.gz
+          path: influenza-10k.fna.zst
           key: influenza-fna
       - name: Subsample NCBI influenza.fna
         if: steps.cache-influenza-fna.outputs.cache-hit != 'true'
         run: |
-          curl --silent -SLk https://ftp.ncbi.nih.gov/genomes/INFLUENZA/influenza.fna.gz > influenza.fna.gz
-          echo "Subsample 10k seqs from influenza.fna.gz with seqtk"
-          seqtk sample -s 789 influenza.fna.gz 10000 | gzip -ck > influenza-10k.fna.gz
+          set -o pipefail; curl --silent -SLk --fail ${FASTA_ZST_URL} | zstdcat | seqtk sample -s 789 - 10000 | zstd -ck > influenza-10k.fna.zst
+      - name: Cache influenza.csv
+        uses: actions/cache@v3
+        id: cache-influenza-csv
+        with:
+          path: influenza.csv.zst
+          key: influenza-csv
+      - name: Download influenza.csv
+        if: steps.cache-influenza-csv.outputs.cache-hit != 'true'
+        run: |
+          curl --silent -SLk --fail ${CSV_ZST_URL} > influenza.csv.zst
       - name: Run pipeline with test data
         run: |
           nextflow run ${GITHUB_WORKSPACE} \
             -profile test_illumina,docker \
-            --ncbi_influenza_fasta influenza-10k.fna.gz
+            --ncbi_influenza_fasta influenza-10k.fna.zst \
+            --ncbi_influenza_metadata influenza.csv.zst
       - name: Upload Artifact
         if: success()
         uses: actions/upload-artifact@v1.0.0
@@ -155,25 +168,34 @@ jobs:
           echo "ERR6359501-10k,$(realpath reads/ERR6359501-10k.fastq)" | tee -a samplesheet.csv
           echo "ERR6359501,$(realpath run1)" | tee -a samplesheet.csv
           echo "ERR6359501,$(realpath run2)" | tee -a samplesheet.csv
-      - name: Cache subsampled influenza.fna.gz
+      - name: Cache subsampled influenza.fna
         uses: actions/cache@v3
         id: cache-influenza-fna
         with:
-          path: influenza-10k.fna.gz
+          path: influenza-10k.fna.zst
           key: influenza-fna
       - name: Subsample NCBI influenza.fna
         if: steps.cache-influenza-fna.outputs.cache-hit != 'true'
         run: |
-          curl --silent -SLk https://ftp.ncbi.nih.gov/genomes/INFLUENZA/influenza.fna.gz > influenza.fna.gz
-          echo "Subsample 10k seqs from influenza.fna.gz with seqtk"
-          seqtk sample -s 789 influenza.fna.gz 10000 | gzip -ck > influenza-10k.fna.gz
+          set -o pipefail; curl --silent -SLk --fail ${FASTA_ZST_URL} | zstdcat | seqtk sample -s 789 - 10000 | zstd -ck > influenza-10k.fna.zst
+      - name: Cache influenza.csv
+        uses: actions/cache@v3
+        id: cache-influenza-csv
+        with:
+          path: influenza.csv.zst
+          key: influenza-csv
+      - name: Download influenza.csv
+        if: steps.cache-influenza-csv.outputs.cache-hit != 'true'
+        run: |
+          curl --silent -SLk --fail ${CSV_ZST_URL} > influenza.csv.zst
       - name: Run pipeline with test data
         run: |
           nextflow run ${GITHUB_WORKSPACE} \
             -profile test_nanopore,docker \
             --platform nanopore \
             --input samplesheet.csv \
-            --ncbi_influenza_fasta influenza-10k.fna.gz
+            --ncbi_influenza_fasta influenza-10k.fna.zst \
+            --ncbi_influenza_metadata influenza.csv.zst
       - name: Upload pipeline_info/
         if: success()
         uses: actions/upload-artifact@v1.0.0
diff --git a/bin/parse_influenza_blast_results.py b/bin/parse_influenza_blast_results.py
index 90881b7..c2bdc6c 100755
--- a/bin/parse_influenza_blast_results.py
+++ b/bin/parse_influenza_blast_results.py
@@ -59,8 +59,8 @@
 blast_results_report_columns = [
     ("sample", "Sample"),
     ("sample_segment", "Sample Genome Segment Number"),
-    ("accession", "Reference NCBI Accession"),
-    ("subtype", "Reference Subtype"),
+    ("#Accession", "Reference NCBI Accession"),
+    ("Genotype", "Reference Subtype"),
     ("pident", "BLASTN Percent Identity"),
     ("length", "BLASTN Alignment Length"),
     ("mismatch", "BLASTN Mismatches"),
@@ -75,19 +75,17 @@
     ("slen", "Reference Sequence Length"),
     ("qcovs", "Sample Sequence Coverage of Reference Sequence"),
     ("stitle", "Reference Sequence ID"),
-    ("segment", "Reference Genome Segment Number"),
-    ("virus_name", "Reference Virus Name"),
-    ("host", "Reference Host"),
-    ("country", "Reference Country"),
-    ("date", "Reference Collection Date"),
-    ("age", "Reference Patient Age"),
-    ("gender", "Reference Patient Gender"),
-    ("group_id", "Reference Group ID"),
+    ("Segment", "Reference Genome Segment Number"),
+    ("GenBank_Title", "Reference Virus Name"),
+    ("Host", "Reference Host"),
+    ("Geo_Location", "Reference Geo Location"),
+    ("Collection_Date", "Reference Collection Date"),
+    ("Release_Date", "Reference Release Date"),
 ]
 
 subtype_results_summary_columns = [
     "sample",
-    "subtype",
+    "Genotype",
     "H_top_accession",
     "H_type",
     "H_virus_name",
@@ -100,7 +98,7 @@
 
 columns_H_summary_results = [
     "sample",
-    "subtype",
+    "Genotype",
     "H_top_accession",
     "H_NCBI_Influenza_DB_proportion_matches",
     "H_NCBI_Influenza_DB_subtype_matches",
@@ -121,7 +119,7 @@
 
 columns_N_summary_results = [
     "sample",
-    "subtype",
+    "Genotype",
     "N_top_accession",
     "N_NCBI_Influenza_DB_proportion_matches",
     "N_NCBI_Influenza_DB_subtype_matches",
@@ -142,7 +140,7 @@
 
 subtype_results_summary_final_names = {
     "sample": "Sample",
-    "subtype": "Subtype Prediction",
+    "Genotype": "Subtype Prediction",
     "N_type": "N: type prediction",
     "N_top_accession": "N: top match accession",
     "N_virus_name": "N: top match virus name",
@@ -209,7 +207,7 @@ def parse_blast_result(
         f"and Min Alignment length > {min_aln_length}"
     )
     df_filtered = df_filtered.with_columns([
-        pl.col('saccver').str.strip().alias("accession"),
+        pl.col('saccver').str.strip().alias("#Accession"),
         pl.lit(sample_name, dtype=pl.Categorical).alias("sample"),
         pl.col('qaccver').str.extract(r".+_(\d)$").cast(pl.Categorical).alias("sample_segment"),
         pl.col("stitle").str.extract(regex_subtype_pattern).alias("subtype_from_match_title").cast(pl.Categorical)
@@ -217,14 +215,14 @@ def parse_blast_result(
     logging.info(
         f"{sample_name} | Merging NCBI Influenza DB genome metadata with BLAST results on accession."
     )
-    df_merge = df_filtered.join(df_metadata, on="accession", how="left")
+    df_merge = df_filtered.join(df_metadata, on="#Accession", how="left")
     del df_filtered
     del df_metadata
     df_merge = df_merge.with_columns(
-        pl.when(pl.col("subtype").is_null())
+        pl.when(pl.col("Genotype").is_null())
             .then(pl.col("subtype_from_match_title"))
-            .otherwise(pl.col("subtype"))
-            .alias("subtype")
+            .otherwise(pl.col("Genotype"))
+            .alias("Genotype")
     )
     df_merge = df_merge.sort(
         by=["sample_segment", "bitscore"], descending=[False, True]
@@ -240,7 +238,7 @@ def parse_blast_result(
     subtype_results_summary = {"sample": sample_name}
     if not get_top_ref:
         is_iav = True
-        if df_top_seg_matches.select(pl.col("subtype").is_null().all())[0, 0]:
+        if df_top_seg_matches.select(pl.col("Genotype").is_null().all())[0, 0]:
             is_iav = False
         H_results = None
         N_results = None
@@ -250,7 +248,7 @@ def parse_blast_result(
         if "6" in segments:
             N_results = find_h_or_n_type(df_merge, "6", is_iav)
             subtype_results_summary.update(N_results)
-        subtype_results_summary["subtype"] = get_subtype_value(H_results, N_results, is_iav)
+        subtype_results_summary["Genotype"] = get_subtype_value(H_results, N_results, is_iav)
 
     return df_top_seg_matches, subtype_results_summary
 
@@ -296,9 +294,9 @@ def find_h_or_n_type(df_merge, seg, is_iav):
         reg_h_or_n_type = "[Nn]"
     df_segment = df_merge.filter(pl.col("sample_segment") == seg)
     if is_iav:
-        type_counts = df_segment["subtype"].value_counts(sort=True)
-        type_counts = type_counts.filter(~pl.col("subtype").is_null())
-        df_type_counts = type_counts.with_columns(pl.lit(type_counts["subtype"].str.extract(reg_h_or_n_type + r"(\d+)").
+        type_counts = df_segment["Genotype"].value_counts(sort=True)
+        type_counts = type_counts.filter(~pl.col("Genotype").is_null())
+        df_type_counts = type_counts.with_columns(pl.lit(type_counts["Genotype"].str.extract(reg_h_or_n_type + r"(\d+)").
                                                          alias(type_name)))
         df_type_counts = df_type_counts.filter(~pl.col(type_name).is_null())
         logging.debug(f"{df_type_counts}")
@@ -313,7 +311,7 @@ def find_h_or_n_type(df_merge, seg, is_iav):
             f"{h_or_n}{top_type} n={top_type_count}/{total_count} ({top_type_count / total_count:.1%})"
         )
         df_segment = df_segment.with_columns(
-            pl.lit(df_segment["subtype"].str.contains(r".*" + reg_h_or_n_type + top_type + r".*")
+            pl.lit(df_segment["Genotype"].str.contains(r".*" + reg_h_or_n_type + top_type + r".*")
                    .fill_null(False)
                    .alias("type_mask")))
         df_seg_top_type = df_segment.filter(pl.col("type_mask") == True).drop("type_mask")
@@ -332,12 +330,12 @@ def find_h_or_n_type(df_merge, seg, is_iav):
         f"{h_or_n}_top_gaps": top_result["gapopen"],
         f"{h_or_n}_top_bitscore": top_result["bitscore"],
         f"{h_or_n}_top_align_length": top_result["length"],
-        f"{h_or_n}_top_accession": top_result["accession"],
-        f"{h_or_n}_top_host": top_result["host"],
-        f"{h_or_n}_top_country": top_result["country"],
-        f"{h_or_n}_top_date": top_result["date"],
+        f"{h_or_n}_top_accession": top_result["#Accession"],
+        f"{h_or_n}_top_host": top_result["Host"],
+        f"{h_or_n}_top_country": top_result["Geo_Location"],
+        f"{h_or_n}_top_date": top_result["Collection_Date"],
         f"{h_or_n}_top_seq_length": top_result["slen"],
-        f"{h_or_n}_virus_name": top_result["virus_name"],
+        f"{h_or_n}_virus_name": top_result["GenBank_Title"],
         f"{h_or_n}_NCBI_Influenza_DB_subtype_matches": top_type_count,
         f"{h_or_n}_NCBI_Influenza_DB_total_matches": total_count,
         f"{h_or_n}_NCBI_Influenza_DB_proportion_matches": top_type_count / total_count if is_iav else "N/A",
@@ -370,33 +368,33 @@ def report(flu_metadata, blast_results, excel_report, top, pident_threshold,
     )
 
     logging.info(f'Parsing Influenza metadata file "{flu_metadata}"')
+
     md_cols = [
-        ("accession", str),
-        ("host", pl.Categorical),
-        ("segment", pl.Categorical),
-        ("subtype", str),
-        ("country", pl.Categorical),
-        ("date", pl.Categorical),
-        ("seq_length", pl.UInt16),
-        ("virus_name", pl.Categorical),
-        ("age", pl.Categorical),
-        ("gender", pl.Categorical),
-        ("group_id", pl.Categorical),
+        ("#Accession", str),
+        ("Release_Date", pl.Categorical),
+        ("Genus", pl.Categorical),
+        ("Length", pl.UInt16),
+        ("Genotype", pl.Categorical),
+        ("Segment", pl.Categorical),
+        ("Publications", str),
+        ("Geo_Location", pl.Categorical),
+        ("Host", pl.Categorical),
+        ("Isolation_Source", pl.Categorical),
+        ("Collection_Date", pl.Categorical),
+        ("GenBank_Title", str),
     ]
     df_md = pl.read_csv(
         flu_metadata,
-        has_header=False,
-        separator="\t",
-        new_columns=[name for name, _ in md_cols],
+        has_header=True,
         dtypes={name: t for name, t in md_cols},
     )
 
-    unique_subtypes = df_md.select("subtype").unique()
-    unique_subtypes = unique_subtypes.filter(~pl.col("subtype").is_null())
+    unique_subtypes = df_md.select("Genotype").unique()
+    unique_subtypes = unique_subtypes.filter(~pl.col("Genotype").is_null())
     logging.info(
         f"Parsed Influenza metadata file into DataFrame with n={df_md.shape[0]} rows and n={df_md.shape[1]} columns. There are {len(unique_subtypes)} unique subtypes. "
     )
-    regex_subtype_pattern = r"\((H\d+N\d+|" + "|".join(list(unique_subtypes["subtype"])) + r")\)"
+    regex_subtype_pattern = r"\((H\d+N\d+|" + "|".join(list(unique_subtypes["Genotype"])) + r")\)"
     results = [
         parse_blast_result(blast_result, df_md, regex_subtype_pattern, get_top_ref, top=top,
                            pident_threshold=pident_threshold,
@@ -445,8 +443,15 @@ def report(flu_metadata, blast_results, excel_report, top, pident_threshold,
         df_blast = df_blast.rename(
             mapping={k: v for k, v in blast_results_report_columns}
         )
-        df_ref_id = df_blast.select(pl.col(['Sample', 'Sample Genome Segment Number',
-                                            'Reference NCBI Accession', 'BLASTN Bitscore', 'Reference Sequence ID']))
+        df_ref_id = df_blast.select(
+            pl.col([
+                'Sample',
+                'Sample Genome Segment Number',
+                'Reference NCBI Accession',
+                'BLASTN Bitscore',
+                'Reference Sequence ID'
+            ])
+        )
         df_ref_id = df_ref_id.with_columns(
             pl.when(pl.col("Reference NCBI Accession").is_null())
                 .then(pl.col("Reference Sequence ID"))
diff --git a/conf/base.config b/conf/base.config
index 8e75895..e7e0f06 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -16,6 +16,11 @@ process {
   maxErrors = '-1'
 
   // Groupable resource requirements for processes
+  withLabel:process_single {
+    cpus = 1
+    memory = { check_max( 100.MB * task.attempt, 'memory' ) }
+    time = { check_max( 1.h * task.attempt, 'time' ) }
+  }
   withLabel:process_low {
     cpus = { check_max( 2 * task.attempt, 'cpus' ) }
     memory = { check_max( 4.GB * task.attempt, 'memory' ) }
diff --git a/conf/modules_illumina.config b/conf/modules_illumina.config
index d4f35d7..d6c7c65 100644
--- a/conf/modules_illumina.config
+++ b/conf/modules_illumina.config
@@ -1,63 +1,69 @@
-
+// Illumina subworkflow process configuration
 process {
-    withName: 'IRMA' {
-        publishDir = [
-            [
-                path: { "${params.outdir}/irma"},
-                mode: params.publish_dir_mode
-            ],
-            [
-                path: { "${params.outdir}/consensus/irma/" },
-                pattern: "*.consensus.fasta",
-                mode: params.publish_dir_mode
-            ]
-        ]
-    }
+  withName: 'IRMA' {
+    publishDir = [
+      [
+        path: { "${params.outdir}/irma"},
+        saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+        mode: params.publish_dir_mode
+      ],
+      [
+        path: { "${params.outdir}/consensus/irma/" },
+        pattern: "*.consensus.fasta",
+        mode: params.publish_dir_mode
+      ]
+    ]
+  }
 
-    withName: 'BLAST_MAKEBLASTDB' {
-        ext.args = '-dbtype nucl'
-        publishDir = [
-            [
-                path: { "${params.outdir}/blast"},
-                mode: params.publish_dir_mode
-            ]
-        ]
-    }
+  withName: 'BLAST_MAKEBLASTDB' {
+    ext.args = '-dbtype nucl'
+    publishDir = [
+      [
+        path: { "${params.outdir}/blast/db"},
+        saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+        mode: params.publish_dir_mode
+      ]
+    ]
+  }
 
-    withName: 'BLAST_BLASTN' {
-        ext.args = '-outfmt "6 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen qcovs stitle" -num_alignments 1000000 -evalue 1e-6'
-        publishDir = [
-            [
-                path: { "${params.outdir}/blast"},
-                mode: params.publish_dir_mode
-            ]
-        ]
-    }
+  withName: 'BLAST_BLASTN' {
+    ext.args = '-outfmt "6 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen qcovs stitle" -num_alignments 1000000 -evalue 1e-6'
+    publishDir = [
+      [
+        path: { "${params.outdir}/blast"},
+        saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+        mode: params.publish_dir_mode
+      ]
+    ]
+  }
 
+  withName: 'SUBTYPING_REPORT' {
+    publishDir = [
+      [
+        path: { "${params.outdir}/"},
+        saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+        mode: params.publish_dir_mode
+      ]
+    ]
+  }
 
-    withName: 'SUBTYPING_REPORT' {
-        publishDir = [
-            [
-                path: { "${params.outdir}/"},
-                mode: params.publish_dir_mode
-            ]
-        ]
-    }
+  withName: 'ZSTD_DECOMPRESS_.*' {
+    publishDir = [
+      [
+        path: { "${params.outdir}/ncbi-influenza-db"},
+        saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+        mode: params.publish_dir_mode
+      ]
+    ]
+  }
 
-    withName: 'GUNZIP_NCBI_FLU_FASTA' {
-        publishDir = [
-            [
-                path: { "${params.outdir}/flu_fasta"},
-                mode: params.publish_dir_mode
-            ]
-        ]
-    }
-    withName: 'CAT_ILLUMINA_FASTQ' {
-        publishDir = [
-            [
-                path: { "${params.outdir}/fastq"},
-                mode: params.publish_dir_mode
-            ]
-        ]
-    }
-}
\ No newline at end of file
+  withName: 'CAT_ILLUMINA_FASTQ' {
+    publishDir = [
+      [
+        path: { "${params.outdir}/fastq"},
+        saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+        mode: params.publish_dir_mode
+      ]
+    ]
+  }
+}
diff --git a/conf/modules_nanopore.config b/conf/modules_nanopore.config
index 4ae1c19..df0b951 100644
--- a/conf/modules_nanopore.config
+++ b/conf/modules_nanopore.config
@@ -1,266 +1,284 @@
-
+// Nanopore subworkflow process configuration
 process {
-    withName: 'IRMA' {
-        publishDir = [
-            [
-                path: { "${params.outdir}/irma"},
-                mode: params.publish_dir_mode
-            ],
-            [
-                path: { "${params.outdir}/consensus/irma/" },
-                pattern: "*.irma.consensus.fasta",
-                mode: params.publish_dir_mode
-            ]
-        ]
-    }
+  withName: 'IRMA' {
+    publishDir = [
+      [
+        path: { "${params.outdir}/irma"},
+        saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+        mode: params.publish_dir_mode
+      ],
+      [
+        path: { "${params.outdir}/consensus/irma/" },
+        pattern: "*.irma.consensus.fasta",
+        mode: params.publish_dir_mode
+      ]
+    ]
+  }
+
+  withName: 'BLAST_MAKEBLASTDB_NCBI' {
+    ext.args  = '-dbtype nucl'
+    publishDir = [
+      [
+        path: { "${params.outdir}/blast/db/ncbi" },
+        saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+        mode: params.publish_dir_mode
+      ]
+    ]
+  }
 
-    withName: 'BLAST_MAKEBLASTDB_NCBI' {
-        ext.args  = '-dbtype nucl'
-        publishDir = [
-            [
-                path: { "${params.outdir}/blast/db/ncbi" },
-                mode: params.publish_dir_mode
-            ]
-        ]
-    }
-    withName: 'BLAST_MAKEBLASTDB_REFDB' {
-        ext.args  = '-dbtype nucl'
-        publishDir = [
-            [
-                path: { "${params.outdir}/blast/db/ref_db" },
-                mode: params.publish_dir_mode
-            ]
-        ]
-    }
+  withName: 'BLAST_MAKEBLASTDB_REFDB' {
+    ext.args  = '-dbtype nucl'
+    publishDir = [
+      [
+        path: { "${params.outdir}/blast/db/ref_db" },
+        saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+        mode: params.publish_dir_mode
+      ]
+    ]
+  }
 
-    withName: 'BLAST_BLASTN_IRMA' {
-        ext.args = '-outfmt "6 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen qcovs stitle" -num_alignments 1000000 -evalue 1e-6'
-        publishDir = [
-            [
-                path: { "${params.outdir}/blast/blastn/irma" },
-                mode: params.publish_dir_mode
-            ]
-        ]
-    }
+  withName: 'BLAST_BLASTN_IRMA' {
+    ext.args = '-outfmt "6 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen qcovs stitle" -num_alignments 1000000 -evalue 1e-6'
+    publishDir = [
+      [
+        path: { "${params.outdir}/blast/blastn/irma" },
+        saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+        mode: params.publish_dir_mode
+      ]
+    ]
+  }
 
-    withName: 'BLAST_BLASTN_CONSENSUS' {
-        ext.args = '-outfmt "6 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen qcovs stitle" -num_alignments 1000000 -evalue 1e-6'
-        publishDir = [
-            [
-                path: { "${params.outdir}/blast/blastn/consensus" },
-                mode: params.publish_dir_mode
-            ]
-        ]
-    }
+  withName: 'BLAST_BLASTN_CONSENSUS' {
+    ext.args = '-outfmt "6 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen qcovs stitle" -num_alignments 1000000 -evalue 1e-6'
+    publishDir = [
+      [
+        path: { "${params.outdir}/blast/blastn/consensus" },
+        saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+        mode: params.publish_dir_mode
+      ]
+    ]
+  }
 
-    withName: 'BLAST_BLASTN_CONSENSUS_REF_DB' {
-        ext.args = '-outfmt "6 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen qcovs stitle" -num_alignments 1000000 -evalue 1e-6'
-        publishDir = [
-            [
-                path: { "${params.outdir}/blast/blastn/against_ref_db" },
-                mode: params.publish_dir_mode
-            ]
-        ]
-    }
+  withName: 'BLAST_BLASTN_CONSENSUS_REF_DB' {
+    ext.args = '-outfmt "6 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen qcovs stitle" -num_alignments 1000000 -evalue 1e-6'
+    publishDir = [
+      [
+        path: { "${params.outdir}/blast/blastn/against_ref_db" },
+        saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+        mode: params.publish_dir_mode
+      ]
+    ]
+  }
 
-    withName: 'BCF_CONSENSUS' {
-        publishDir = [
-            [
-                path: { "${params.outdir}/consensus/bcftools/${sample}" },
-                mode: params.publish_dir_mode
-            ]
-        ]
-    }
+  withName: 'BCF_CONSENSUS' {
+    publishDir = [
+      [
+        path: { "${params.outdir}/consensus/bcftools/${sample}" },
+        saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+        mode: params.publish_dir_mode
+      ]
+    ]
+  }
 
-    withName: 'BCFTOOLS_STATS' {
-        publishDir = [
-            [
-                path: { "${params.outdir}/variants/${sample}" },
-                mode: params.publish_dir_mode
-            ]
-        ]
-    }
+  withName: 'BCFTOOLS_STATS' {
+    publishDir = [
+      [
+        path: { "${params.outdir}/variants/${sample}" },
+        saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+        mode: params.publish_dir_mode
+      ]
+    ]
+  }
 
-    withName: 'CAT_CONSENSUS' {
-        publishDir = [
-            [
-                path: { "${params.outdir}/consensus/bcftools/"},
-                pattern: "*.consensus.fasta",
-                mode: params.publish_dir_mode
-            ]
-        ]
-    }
+  withName: 'CAT_CONSENSUS' {
+    publishDir = [
+      [
+        path: { "${params.outdir}/consensus/bcftools/"},
+        pattern: "*.consensus.fasta",
+        mode: params.publish_dir_mode
+      ]
+    ]
+  }
 
-    withName: 'COVERAGE_PLOT' {
-        publishDir = [
-            [
-                path: { "${params.outdir}/coverage_plots/${sample}" },
-                mode: params.publish_dir_mode
-            ]
-        ]
-    }
+  withName: 'COVERAGE_PLOT' {
+    publishDir = [
+      [
+        path: { "${params.outdir}/coverage_plots/${sample}" },
+        saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+        mode: params.publish_dir_mode
+      ]
+    ]
+  }
 
-    withName: 'MEDAKA' {
-        publishDir = [
-            [
-                path: { "${params.outdir}/variants/${sample}" },
-                pattern: "*.{vcf,log}",
-                mode: params.publish_dir_mode
-            ],
-            [
-                path: { "${params.outdir}/variants/${sample}/medaka"},
-                mode: params.publish_dir_mode,
-                enable: true
-            ]
-        ]
-    }
+  withName: 'MEDAKA' {
+    publishDir = [
+      [
+        path: { "${params.outdir}/variants/${sample}" },
+        pattern: "*.{vcf,log}",
+        mode: params.publish_dir_mode
+      ],
+      [
+        path: { "${params.outdir}/variants/${sample}/medaka"},
+        saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+        mode: params.publish_dir_mode,
+        enabled: true
+      ]
+    ]
+  }
 
-    withName: 'CLAIR3' {
-        publishDir = [
-            [
-                path: { "${params.outdir}/variants/${sample}"},
-                pattern: "*.{vcf.gz,log}",
-                mode: params.publish_dir_mode
-            ],
-            [
-                path: { "${params.outdir}/variants/${sample}/clair3"},
-                mode: params.publish_dir_mode,
-                enable: true
-            ]
-        ]
-    }
+  withName: 'CLAIR3' {
+    publishDir = [
+      [
+        path: { "${params.outdir}/variants/${sample}"},
+        pattern: "*.{vcf.gz,log}",
+        mode: params.publish_dir_mode
+      ],
+      [
+        path: { "${params.outdir}/variants/${sample}/clair3"},
+        saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+        mode: params.publish_dir_mode,
+        enabled: true
+      ]
+    ]
+  }
 
-    withName: 'MINIMAP2' {
-        publishDir = [
-            [
-                path: { "${params.outdir}/mapping/${sample}"},
-                mode: params.publish_dir_mode
-            ]
-        ]
-    }
+  withName: 'MINIMAP2' {
+    publishDir = [
+      [
+        path: { "${params.outdir}/mapping/${sample}"},
+        saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+        mode: params.publish_dir_mode
+      ]
+    ]
+  }
 
-    withName: 'MOSDEPTH_GENOME' {
-        publishDir = [
-            [
-                path: { "${params.outdir}/mosdepth/${sample}"},
-                mode: params.publish_dir_mode
-            ]
-        ]
-    }
+  withName: 'MOSDEPTH_GENOME' {
+    publishDir = [
+      [
+        path: { "${params.outdir}/mosdepth/${sample}"},
+        saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+        mode: params.publish_dir_mode
+      ]
+    ]
+  }
 
-    withName: 'PULL_TOP_REF_ID' {
-        publishDir = [
-            [
-                path: { "${params.outdir}/reference_sequences/${meta.id}"},
-                pattern: "*.csv",
-                mode: params.publish_dir_mode
-            ]
-        ]
-    }
+  withName: 'PULL_TOP_REF_ID' {
+    publishDir = [
+      [
+        path: { "${params.outdir}/reference_sequences/${meta.id}"},
+        pattern: "*.csv",
+        mode: params.publish_dir_mode
+      ]
+    ]
+  }
 
-    withName: 'CHECK_REF_FASTA' {
-        publishDir = [
-            [
-                path: { "${params.outdir}/reference_sequences/"},
-                pattern: "*.fasta",
-                mode: params.publish_dir_mode
-            ]
-        ]
-    }
+  withName: 'CHECK_REF_FASTA' {
+    publishDir = [
+      [
+        path: { "${params.outdir}/reference_sequences/"},
+        pattern: "*.fasta",
+        mode: params.publish_dir_mode
+      ]
+    ]
+  }
 
-    withName: 'SEQTK_SEQ' {
-        publishDir = [
-            [
-                path: { "${params.outdir}/reference_sequences/${sample}"},
-                pattern: "*.fasta",
-                mode: params.publish_dir_mode
-            ]
-        ]
-    }
+  withName: 'SEQTK_SEQ' {
+    publishDir = [
+      [
+        path: { "${params.outdir}/reference_sequences/${sample}"},
+        pattern: "*.fasta",
+        mode: params.publish_dir_mode
+      ]
+    ]
+  }
 
-    withName: 'SUBTYPING_REPORT_BCF_CONSENSUS' {
-        publishDir = [
-            [
-                path: { "${params.outdir}/"},
-                pattern: "*.{xlsx,log}",
-                mode: params.publish_dir_mode
-            ]
-        ]
-    }
+  withName: 'SUBTYPING_REPORT_BCF_CONSENSUS' {
+    publishDir = [
+      [
+        path: { "${params.outdir}/"},
+        pattern: "*.{xlsx,log}",
+        mode: params.publish_dir_mode
+      ]
+    ]
+  }
 
-    withName: 'BLASTN_REPORT' {
-        publishDir = [
-            [
-                path: { "${params.outdir}/mismatch_report"},
-                pattern: "*.{xlsx}",
-                mode: params.publish_dir_mode
-            ]
-        ]
-    }
+  withName: 'BLASTN_REPORT' {
+    publishDir = [
+      [
+        path: { "${params.outdir}/mismatch_report"},
+        pattern: "*.{xlsx}",
+        mode: params.publish_dir_mode
+      ]
+    ]
+  }
 
-    withName: 'SUBTYPING_REPORT_IRMA_CONSENSUS' {
-        publishDir = [
-            [
-                path: { "${params.outdir}/irma"},
-                pattern: "*.{xlsx,log}",
-                mode: params.publish_dir_mode
-            ]
-        ]
-    }
+  withName: 'SUBTYPING_REPORT_IRMA_CONSENSUS' {
+    publishDir = [
+      [
+        path: { "${params.outdir}/irma"},
+        pattern: "*.{xlsx,log}",
+        mode: params.publish_dir_mode
+      ]
+    ]
+  }
 
-    withName: 'VCF_FILTER_FRAMESHIFT' {
-        publishDir = [
-            [
-                path: { "${params.outdir}/variants/${sample}" },
-                pattern: "*.vcf",
-                mode: params.publish_dir_mode
-            ]
-        ]
-    }
+  withName: 'VCF_FILTER_FRAMESHIFT' {
+    publishDir = [
+      [
+        path: { "${params.outdir}/variants/${sample}" },
+        pattern: "*.vcf",
+        mode: params.publish_dir_mode
+      ]
+    ]
+  }
 
-    withName: 'GUNZIP_NCBI_FLU_FASTA' {
-        publishDir = [
-            [
-                path: { "${params.outdir}/flu_fasta" },
-                mode: params.publish_dir_mode
-            ]
-        ]
-    }
+  withName: 'ZSTD_DECOMPRESS_.*' {
+    publishDir = [
+      [
+        path: { "${params.outdir}/ncbi-influenza-db"},
+        saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+        mode: params.publish_dir_mode
+      ]
+    ]
+  }
 
-    withName: 'READ_COUNT_FAIL_TSV' {
-        publishDir = [
-            [
-                path: { "${params.outdir}/read_count" },
-                mode: params.publish_dir_mode
-            ]
-        ]
-    }
+  withName: 'READ_COUNT_FAIL_TSV' {
+    publishDir = [
+      [
+        path: { "${params.outdir}/read_count" },
+        saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+        mode: params.publish_dir_mode
+      ]
+    ]
+  }
 
-    withName: 'READ_COUNT_PASS_TSV' {
-        publishDir = [
-            [
-                path: { "${params.outdir}/read_count" },
-                mode: params.publish_dir_mode
-            ]
-        ]
-    }
+  withName: 'READ_COUNT_PASS_TSV' {
+    publishDir = [
+      [
+        path: { "${params.outdir}/read_count" },
+        saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+        mode: params.publish_dir_mode
+      ]
+    ]
+  }
 
-    withName: 'SOFTWARE_VERSIONS' {
-         publishDir = [
-            [
-                path: { "${params.outdir}/pipeline_info" },
-                pattern: "software_versions.yml",
-                mode: params.publish_dir_mode
-            ]
-        ]
-    }
+  withName: 'SOFTWARE_VERSIONS' {
+    publishDir = [
+      [
+        path: { "${params.outdir}/pipeline_info" },
+        pattern: "software_versions.yml",
+        mode: params.publish_dir_mode
+      ]
+    ]
+  }
 
-    withName: 'MULTIQC' {
-         publishDir = [
-            [
-                path: { "${params.outdir}/MultiQC" },
-                mode: params.publish_dir_mode
-            ]
-        ]
-    }
-}
\ No newline at end of file
+  withName: 'MULTIQC' {
+    publishDir = [
+      [
+        path: { "${params.outdir}/MultiQC" },
+        saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+        mode: params.publish_dir_mode
+      ]
+    ]
+  }
+}
diff --git a/modules/local/misc.nf b/modules/local/misc.nf
index 61c3a68..16a1090 100644
--- a/modules/local/misc.nf
+++ b/modules/local/misc.nf
@@ -58,40 +58,6 @@ process CAT_DB {
     """
 }
 
-process GUNZIP_NCBI_FLU_FASTA {
-    tag "$archive"
-    label 'process_low'
-
-    conda (params.enable_conda ? "conda-forge::sed=4.7" : null)
-    if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
-        container "https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv1/biocontainers_v1.2.0_cv1.img"
-    } else {
-        container "biocontainers/biocontainers:v1.2.0_cv1"
-    }
-
-    input:
-    path archive
-
-    output:
-    path "*.fna", emit: fna
-    path "versions.yml" , emit: versions
-
-    script:
-    def software = getSoftwareName(task.process)
-    // replace FASTA headers
-    // >gi|{gi}|gb|{accession}|{description}
-    // with
-    // >{accession} {description}
-    // for easier parsing and processing
-    """
-    zcat $archive | sed -E 's/^>gi\\|[0-9]+\\|gb\\|(\\w+)\\|(.*)/>\\1 \\2/' > influenza.fna
-    cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        zcat: \$(echo \$(zcat --version 2>&1) | sed 's/^.*(gzip) //; s/ Copyright.*\$//')
-    END_VERSIONS
-    """
-}
-
 process CAT_CONSENSUS {
   tag "$sample"
   conda (params.enable_conda ? 'bioconda::shiptv=0.4.0' : null)
diff --git a/modules/local/zstd_decompress.nf b/modules/local/zstd_decompress.nf
new file mode 100644
index 0000000..aecbe87
--- /dev/null
+++ b/modules/local/zstd_decompress.nf
@@ -0,0 +1,30 @@
+process ZSTD_DECOMPRESS {
+  // Decompress a Zstandard-compressed file with zstdcat and record the zstd version in versions.yml.
+  conda (params.enable_conda ? 'conda-forge::zstd=1.5.2' : null)
+  // TODO: using clair3 container here for zstd and since it might be used if running the Nanopore workflow, but should move to multi-package-container with just zstd and maybe curl to combine data fetch functionality
+  if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
+    container 'https://depot.galaxyproject.org/singularity/clair3:1.0.3--py39h8492097_0'
+  } else {
+    container 'quay.io/biocontainers/clair3:1.0.3--py39h8492097_0'
+  }
+
+  input:
+  path(zstd_file, stageAs: "input*/*") // staged inside an input*/ subdirectory; presumably so the output name cannot clash with the input - confirm
+  val(filename) // optional suffix; when empty/null the output is named after the input's base name only
+
+  output:
+  path(decompressed_file), emit: file
+  path('versions.yml'), emit: versions
+
+  script:
+  def basename = file(zstd_file).getBaseName()
+  decompressed_file = filename ? "${basename}-${filename}" : basename // no `def`: the output block above references this name
+  """
+  zstdcat $zstd_file > $decompressed_file
+
+  cat <<-END_VERSIONS > versions.yml
+  "${task.process}":
+      zstd: \$(echo \$(zstd --version 2>&1) | sed 's/^.* v//; s/,.*//')
+  END_VERSIONS
+  """
+}
diff --git a/nextflow.config b/nextflow.config
index b3bd9bf..fccf72f 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -23,8 +23,8 @@ params {
   min_aln_length                    = 700
   max_top_blastn                    = 3
   // reference data
-  ncbi_influenza_fasta              = 'https://ftp.ncbi.nih.gov/genomes/INFLUENZA/influenza.fna.gz'
-  ncbi_influenza_metadata           = 'https://ftp.ncbi.nih.gov/genomes/INFLUENZA/genomeset.dat.gz'
+  ncbi_influenza_fasta              = 'https://api.figshare.com/v2/file/download/41415330'
+  ncbi_influenza_metadata           = 'https://api.figshare.com/v2/file/download/41415333'
   // Slurm scheduler options
   slurm_queue                       = ''
   slurm_queue_size                  = 100
diff --git a/workflows/illumina.nf b/workflows/illumina.nf
index 427f554..34f51f9 100644
--- a/workflows/illumina.nf
+++ b/workflows/illumina.nf
@@ -20,6 +20,8 @@ include { GUNZIP_NCBI_FLU_FASTA } from '../modules/local/misc'
 include { BLAST_MAKEBLASTDB } from '../modules/local/blast_makeblastdb'
 include { BLAST_BLASTN } from '../modules/local/blastn'
 include { CAT_ILLUMINA_FASTQ } from '../modules/local/cat_illumina_fastq'
+include { FETCH_INFLUENZA_REF_DB } from '../modules/local/fetch_influenza_ref_db'
+include { ZSTD_DECOMPRESS as ZSTD_DECOMPRESS_FASTA; ZSTD_DECOMPRESS as ZSTD_DECOMPRESS_CSV } from '../modules/local/zstd_decompress'
 
 //=============================================================================
 // Workflow Params Setup
@@ -35,9 +37,14 @@ if (params.irma_module) {
 //=============================================================================
 
 workflow ILLUMINA {
-
-  GUNZIP_NCBI_FLU_FASTA(ch_influenza_db_fasta)
-  BLAST_MAKEBLASTDB(GUNZIP_NCBI_FLU_FASTA.out.fna)
+  ch_versions = Channel.empty()
+  // Decompress reference data
+  ZSTD_DECOMPRESS_FASTA(ch_influenza_db_fasta, "influenza.fasta")
+  ch_versions = ch_versions.mix(ZSTD_DECOMPRESS_FASTA.out.versions)
+  ZSTD_DECOMPRESS_CSV(ch_influenza_metadata, "influenza.csv")
+  ch_versions = ch_versions.mix(ZSTD_DECOMPRESS_CSV.out.versions)
+  BLAST_MAKEBLASTDB(ZSTD_DECOMPRESS_FASTA.out.file)
+  ch_versions = ch_versions.mix(BLAST_MAKEBLASTDB.out.versions)
 
   CHECK_SAMPLE_SHEET(Channel.fromPath( params.input, checkIfExists: true))
     .splitCsv(header: ['sample', 'fastq1', 'fastq2', 'single_end'], sep: ',', skip: 1)
@@ -71,11 +78,17 @@ workflow ILLUMINA {
   // Credit to nf-core/viralrecon. Source: https://github.com/nf-core/viralrecon/blob/a85d5969f9025409e3618d6c280ef15ce417df65/workflows/illumina.nf#L221
   // Concatenate FastQ files from same sample if required
   CAT_ILLUMINA_FASTQ(ch_input)
+  ch_versions = ch_versions.mix(CAT_ILLUMINA_FASTQ.out.versions)
 
   IRMA(CAT_ILLUMINA_FASTQ.out.reads, irma_module)
+  ch_versions = ch_versions.mix(IRMA.out.versions)
 
   BLAST_BLASTN(IRMA.out.consensus, BLAST_MAKEBLASTDB.out.db)
+  ch_versions = ch_versions.mix(BLAST_BLASTN.out.versions)
 
   ch_blast = BLAST_BLASTN.out.txt.collect({ it[1] })
-  SUBTYPING_REPORT(ch_influenza_metadata, ch_blast)
+  SUBTYPING_REPORT(ZSTD_DECOMPRESS_CSV.out, ch_blast)
+  ch_versions = ch_versions.mix(SUBTYPING_REPORT.out.versions)
+
+  SOFTWARE_VERSIONS(ch_versions.unique().collectFile(name: 'collated_versions.yml'))
 }
diff --git a/workflows/nanopore.nf b/workflows/nanopore.nf
index 0d13d9a..2dd21bd 100644
--- a/workflows/nanopore.nf
+++ b/workflows/nanopore.nf
@@ -19,7 +19,7 @@ include { BCF_CONSENSUS; BCFTOOLS_STATS                       } from '../modules
 include { CLAIR3                                              } from '../modules/local/clair3'
 include { MOSDEPTH_GENOME                                     } from '../modules/local/mosdepth'
 include { CAT_NANOPORE_FASTQ                                  } from '../modules/local/misc'
-include { GUNZIP_NCBI_FLU_FASTA                               } from '../modules/local/misc'
+include { ZSTD_DECOMPRESS as ZSTD_DECOMPRESS_FASTA; ZSTD_DECOMPRESS as ZSTD_DECOMPRESS_CSV } from '../modules/local/zstd_decompress'
 include { CAT_DB                                              } from '../modules/local/misc'
 include { CAT_CONSENSUS                                       } from '../modules/local/misc'
 include { SEQTK_SEQ                                           } from '../modules/local/seqtk_seq'
@@ -131,10 +131,12 @@ workflow NANOPORE {
     .map { sample, fqgz, fq, count -> [ [id: sample], fqgz, fq ] }
     .set { ch_reads }
 
-  GUNZIP_NCBI_FLU_FASTA(ch_influenza_db_fasta)
-  ch_versions = ch_versions.mix(GUNZIP_NCBI_FLU_FASTA.out.versions)
+  ZSTD_DECOMPRESS_FASTA(ch_influenza_db_fasta, "influenza.fasta")
+  ch_versions = ch_versions.mix(ZSTD_DECOMPRESS_FASTA.out.versions)
+  ZSTD_DECOMPRESS_CSV(ch_influenza_metadata, "influenza.csv")
+  ch_versions = ch_versions.mix(ZSTD_DECOMPRESS_CSV.out.versions)
 
-  ch_input_ref_db = GUNZIP_NCBI_FLU_FASTA.out.fna
+  ch_input_ref_db = ZSTD_DECOMPRESS_FASTA.out.file
 
   if (params.ref_db){
     ch_ref_fasta = file(params.ref_db, type: 'file')
@@ -159,11 +161,11 @@ workflow NANOPORE {
   //Generate suptype prediction report
   if (!params.skip_irma_subtyping_report){
     ch_blast_irma = BLAST_BLASTN_IRMA.out.txt.collect({ it[1] })
-    SUBTYPING_REPORT_IRMA_CONSENSUS(ch_influenza_metadata, ch_blast_irma)
+    SUBTYPING_REPORT_IRMA_CONSENSUS(ZSTD_DECOMPRESS_CSV.out.file, ch_blast_irma)
   }
 
   // Prepare top ncbi accession id for each segment of each sample sample (id which has top bitscore)
-  PULL_TOP_REF_ID(BLAST_BLASTN_IRMA.out.txt, ch_influenza_metadata)
+  PULL_TOP_REF_ID(BLAST_BLASTN_IRMA.out.txt, ZSTD_DECOMPRESS_CSV.out.file)
   ch_versions = ch_versions.mix(PULL_TOP_REF_ID.out.versions)
 
   PULL_TOP_REF_ID.out.accession_id
@@ -242,7 +244,7 @@ workflow NANOPORE {
   ch_versions = ch_versions.mix(BLAST_BLASTN_CONSENSUS.out.versions)
 
   ch_blastn_consensus = BLAST_BLASTN_CONSENSUS.out.txt.collect({ it[1] })
-  SUBTYPING_REPORT_BCF_CONSENSUS(ch_influenza_metadata, ch_blastn_consensus)
+  SUBTYPING_REPORT_BCF_CONSENSUS(ZSTD_DECOMPRESS_CSV.out.file, ch_blastn_consensus)
   ch_versions = ch_versions.mix(SUBTYPING_REPORT_BCF_CONSENSUS.out.versions)
 
   if (params.ref_db){

From d36b980c57d29c5bd64fc8be6b59da82f06b06e6 Mon Sep 17 00:00:00 2001
From: Peter Kruczkiewicz <peter.kruczkiewicz@gmail.com>
Date: Fri, 7 Jul 2023 13:51:17 -0500
Subject: [PATCH 02/11] Remove/replace references to GUNZIP_NCBI_FLU_FASTA

---
 workflows/illumina.nf | 1 -
 workflows/nanopore.nf | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/workflows/illumina.nf b/workflows/illumina.nf
index 34f51f9..663bf4d 100644
--- a/workflows/illumina.nf
+++ b/workflows/illumina.nf
@@ -16,7 +16,6 @@ ch_influenza_metadata = file(params.ncbi_influenza_metadata)
 include { IRMA } from '../modules/local/irma'
 include { CHECK_SAMPLE_SHEET } from '../modules/local/check_sample_sheet'
 include { SUBTYPING_REPORT } from '../modules/local/subtyping_report'
-include { GUNZIP_NCBI_FLU_FASTA } from '../modules/local/misc'
 include { BLAST_MAKEBLASTDB } from '../modules/local/blast_makeblastdb'
 include { BLAST_BLASTN } from '../modules/local/blastn'
 include { CAT_ILLUMINA_FASTQ } from '../modules/local/cat_illumina_fastq'
diff --git a/workflows/nanopore.nf b/workflows/nanopore.nf
index 2dd21bd..be3f1e9 100644
--- a/workflows/nanopore.nf
+++ b/workflows/nanopore.nf
@@ -142,7 +142,7 @@ workflow NANOPORE {
     ch_ref_fasta = file(params.ref_db, type: 'file')
     CHECK_REF_FASTA(ch_ref_fasta)
     ch_versions = ch_versions.mix(CHECK_REF_FASTA.out.versions)
-    CAT_DB(GUNZIP_NCBI_FLU_FASTA.out.fna, CHECK_REF_FASTA.out.fasta)
+    CAT_DB(ZSTD_DECOMPRESS_FASTA.out.file, CHECK_REF_FASTA.out.fasta)
     ch_input_ref_db = CAT_DB.out.fasta
   }
 

From 7371191598cf93db5704ebdbac0bf6440a3f56ee Mon Sep 17 00:00:00 2001
From: Peter Kruczkiewicz <peter.kruczkiewicz@gmail.com>
Date: Fri, 7 Jul 2023 13:53:19 -0500
Subject: [PATCH 03/11] remove missing proc from illumina.nf

---
 workflows/illumina.nf | 1 -
 1 file changed, 1 deletion(-)

diff --git a/workflows/illumina.nf b/workflows/illumina.nf
index 663bf4d..2ccc9e5 100644
--- a/workflows/illumina.nf
+++ b/workflows/illumina.nf
@@ -19,7 +19,6 @@ include { SUBTYPING_REPORT } from '../modules/local/subtyping_report'
 include { BLAST_MAKEBLASTDB } from '../modules/local/blast_makeblastdb'
 include { BLAST_BLASTN } from '../modules/local/blastn'
 include { CAT_ILLUMINA_FASTQ } from '../modules/local/cat_illumina_fastq'
-include { FETCH_INFLUENZA_REF_DB } from '../modules/local/fetch_influenza_ref_db'
 include { ZSTD_DECOMPRESS as ZSTD_DECOMPRESS_FASTA; ZSTD_DECOMPRESS as ZSTD_DECOMPRESS_CSV } from '../modules/local/zstd_decompress'
 
 //=============================================================================

From 00f040916e726b96a6773f1d02cfd6d38f25b849 Mon Sep 17 00:00:00 2001
From: Peter Kruczkiewicz <peter.kruczkiewicz@gmail.com>
Date: Fri, 7 Jul 2023 13:59:59 -0500
Subject: [PATCH 04/11] fix illumina.nf

---
 workflows/illumina.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflows/illumina.nf b/workflows/illumina.nf
index 2ccc9e5..e821739 100644
--- a/workflows/illumina.nf
+++ b/workflows/illumina.nf
@@ -85,7 +85,7 @@ workflow ILLUMINA {
   ch_versions = ch_versions.mix(BLAST_BLASTN.out.versions)
 
   ch_blast = BLAST_BLASTN.out.txt.collect({ it[1] })
-  SUBTYPING_REPORT(ZSTD_DECOMPRESS_CSV.out, ch_blast)
+  SUBTYPING_REPORT(ZSTD_DECOMPRESS_CSV.out.file, ch_blast)
   ch_versions = ch_versions.mix(SUBTYPING_REPORT.out.versions)
 
   SOFTWARE_VERSIONS(ch_versions.unique().collectFile(name: 'collated_versions.yml'))

From e38c9cadeb891b6e1518c29497d09cdc9539d72d Mon Sep 17 00:00:00 2001
From: Peter Kruczkiewicz <peter.kruczkiewicz@gmail.com>
Date: Fri, 7 Jul 2023 14:09:46 -0500
Subject: [PATCH 05/11] fix illumina.nf

---
 workflows/illumina.nf | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/workflows/illumina.nf b/workflows/illumina.nf
index e821739..564f33c 100644
--- a/workflows/illumina.nf
+++ b/workflows/illumina.nf
@@ -21,6 +21,8 @@ include { BLAST_BLASTN } from '../modules/local/blastn'
 include { CAT_ILLUMINA_FASTQ } from '../modules/local/cat_illumina_fastq'
 include { ZSTD_DECOMPRESS as ZSTD_DECOMPRESS_FASTA; ZSTD_DECOMPRESS as ZSTD_DECOMPRESS_CSV } from '../modules/local/zstd_decompress'
 
+include { CUSTOM_DUMPSOFTWAREVERSIONS  as SOFTWARE_VERSIONS   } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main'
+
 //=============================================================================
 // Workflow Params Setup
 //=============================================================================

From 0d9db7383edc06b76020f35760906e172b171892 Mon Sep 17 00:00:00 2001
From: Peter Kruczkiewicz <peter.kruczkiewicz@gmail.com>
Date: Fri, 7 Jul 2023 15:32:45 -0500
Subject: [PATCH 06/11] fix parse_influenza_blast_results.py issue with
 Categorical type and CAT_ILLUMINA_FASTQ versions.yml wonkiness

---
 bin/parse_influenza_blast_results.py |   2 +-
 conf/modules_illumina.config         |  10 +++
 modules/local/cat_illumina_fastq.nf  | 105 ++++++++++++++-------------
 workflows/illumina.nf                |   6 +-
 4 files changed, 70 insertions(+), 53 deletions(-)

diff --git a/bin/parse_influenza_blast_results.py b/bin/parse_influenza_blast_results.py
index c2bdc6c..b8c03d0 100755
--- a/bin/parse_influenza_blast_results.py
+++ b/bin/parse_influenza_blast_results.py
@@ -374,7 +374,7 @@ def report(flu_metadata, blast_results, excel_report, top, pident_threshold,
         ("Release_Date", pl.Categorical),
         ("Genus", pl.Categorical),
         ("Length", pl.UInt16),
-        ("Genotype", pl.Categorical),
+        ("Genotype", str),
         ("Segment", pl.Categorical),
         ("Publications", str),
         ("Geo_Location", pl.Categorical),
diff --git a/conf/modules_illumina.config b/conf/modules_illumina.config
index d6c7c65..e6560a5 100644
--- a/conf/modules_illumina.config
+++ b/conf/modules_illumina.config
@@ -66,4 +66,14 @@ process {
       ]
     ]
   }
+
+  withName: 'SOFTWARE_VERSIONS' {
+    publishDir = [
+      [
+        path: { "${params.outdir}/pipeline_info" },
+        pattern: "software_versions.yml", // only files matching this name are published
+        mode: params.publish_dir_mode
+      ]
+    ]
+  }
 }
diff --git a/modules/local/cat_illumina_fastq.nf b/modules/local/cat_illumina_fastq.nf
index fe62e55..0afe2bd 100644
--- a/modules/local/cat_illumina_fastq.nf
+++ b/modules/local/cat_illumina_fastq.nf
@@ -39,20 +39,21 @@ process CAT_ILLUMINA_FASTQ {
   }
   if (meta.single_end) {
     if (fqList.size >= 1 || fqgzList.size >= 1) {
-      """
-      touch ${prefix}.merged.fastq.gz
-      if [[ ${fqList.size} > 0 ]]; then
-        cat ${readList.join(' ')} | gzip -ck >> ${prefix}.merged.fastq.gz
-      fi
-      if [[ ${fqgzList.size} > 0 ]]; then
-        cat ${readList.join(' ')} >> ${prefix}.merged.fastq.gz
-      fi
+  """
+  touch ${prefix}.merged.fastq.gz
+  if [[ ${fqList.size} > 0 ]]; then
+    cat ${readList.join(' ')} | gzip -ck >> ${prefix}.merged.fastq.gz
+  fi
+  if [[ ${fqgzList.size} > 0 ]]; then
+    cat ${readList.join(' ')} >> ${prefix}.merged.fastq.gz
+  fi
 
-      cat <<-END_VERSIONS > versions.yml
-      "${task.process}":
-        cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//')
-      END_VERSIONS
-      """
+  cat <<-END_VERSIONS > versions.yml
+  "${task.process}":
+    cat: \$(echo \$(cat --help 2>&1) | sed 's/ (.*//')
+    gzip: \$(echo \$(gzip --help 2>&1) | sed 's/ (.*//')
+  END_VERSIONS
+  """
     }
   } else {
     if (readList.size >= 2) {
@@ -60,43 +61,49 @@ process CAT_ILLUMINA_FASTQ {
       def read1gz = []
       def read2 = []
       def read2gz = []
-      fqList.eachWithIndex{ v, ix -> ( ix & 1 ? read2 : read1 ) << v }
-      fqgzList.eachWithIndex{ v, ix -> ( ix & 1 ? read2gz : read1gz ) << v }
-      """
-      # append 1:N:0:. or 2:N:0:. to forward and reverse reads if "[12]:N:.*"
-      # not present in the FASTQ header for compatability with IRMA assembly
-      touch ${prefix}_1.merged.fastq.gz
-      touch ${prefix}_2.merged.fastq.gz
-      if [[ ${read1.size} > 0 ]]; then
-        cat ${read1.join(' ')} \\
-        | perl -ne 'if (\$_ =~ /^@.*/ && !(\$_ =~ /^@.* [12]:N:.*/)){  chomp \$_; print "\$_ 1:N:0:.\n"; } else { print "\$_"; }' \\
-        | gzip -ck \\
-        >> ${prefix}_1.merged.fastq.gz
-      fi
-      if [[ ${read1gz.size} > 0 ]]; then
-        zcat ${read1gz.join(' ')} \\
-        | perl -ne 'if (\$_ =~ /^@.*/ && !(\$_ =~ /^@.* [12]:N:.*/)){  chomp \$_; print "\$_ 1:N:0:.\n"; } else { print "\$_"; }' \\
-        | gzip -ck \\
-        >> ${prefix}_1.merged.fastq.gz
-      fi
-      if [[ ${read2.size} > 0 ]]; then
-        cat ${read2.join(' ')} \\
-        | perl -ne 'if (\$_ =~ /^@.*/ && !(\$_ =~ /^@.* [12]:N:.*/)){  chomp \$_; print "\$_ 2:N:0:.\n"; } else { print "\$_"; }' \\
-        | gzip -ck \\
-        >> ${prefix}_2.merged.fastq.gz
-      fi
-      if [[ ${read2gz.size} > 0 ]]; then
-        zcat ${read2gz.join(' ')} \\
-        | perl -ne 'if (\$_ =~ /^@.*/ && !(\$_ =~ /^@.* [12]:N:.*/)){  chomp \$_; print "\$_ 2:N:0:.\n"; } else { print "\$_"; }' \\
-        | gzip -ck \\
-        >> ${prefix}_2.merged.fastq.gz
-      fi
+      fqList.eachWithIndex { v, ix -> ( ix & 1 ? read2 : read1 ) << v }
+      fqgzList.eachWithIndex { v, ix -> ( ix & 1 ? read2gz : read1gz ) << v }
+      // append 1:N:0:. or 2:N:0:. to forward and reverse reads if "[12]:N:.*"
+      // not present in the FASTQ header for compatibility with IRMA assembly
+"""
+touch ${prefix}_1.merged.fastq.gz
 
-      cat <<-END_VERSIONS > versions.yml
-      "${task.process}":
-        cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//')
-      END_VERSIONS
-      """
+touch ${prefix}_2.merged.fastq.gz
+
+if [[ ${read1.size} > 0 ]]; then
+  cat ${read1.join(' ')} \\
+  | perl -ne 'if (\$_ =~ /^@.*/ && !(\$_ =~ /^@.* [12]:N:.*/)){  chomp \$_; print "\$_ 1:N:0:.\n"; } else { print "\$_"; }' \\
+  | gzip -ck \\
+  >> ${prefix}_1.merged.fastq.gz
+fi
+
+if [[ ${read1gz.size} > 0 ]]; then
+  zcat ${read1gz.join(' ')} \\
+  | perl -ne 'if (\$_ =~ /^@.*/ && !(\$_ =~ /^@.* [12]:N:.*/)){  chomp \$_; print "\$_ 1:N:0:.\n"; } else { print "\$_"; }' \\
+  | gzip -ck \\
+  >> ${prefix}_1.merged.fastq.gz
+fi
+
+if [[ ${read2.size} > 0 ]]; then
+  cat ${read2.join(' ')} \\
+  | perl -ne 'if (\$_ =~ /^@.*/ && !(\$_ =~ /^@.* [12]:N:.*/)){  chomp \$_; print "\$_ 2:N:0:.\n"; } else { print "\$_"; }' \\
+  | gzip -ck \\
+  >> ${prefix}_2.merged.fastq.gz
+fi
+
+if [[ ${read2gz.size} > 0 ]]; then
+  zcat ${read2gz.join(' ')} \\
+  | perl -ne 'if (\$_ =~ /^@.*/ && !(\$_ =~ /^@.* [12]:N:.*/)){  chomp \$_; print "\$_ 2:N:0:.\n"; } else { print "\$_"; }' \\
+  | gzip -ck \\
+  >> ${prefix}_2.merged.fastq.gz
+fi
+
+cat <<-END_VERSIONS > versions.yml
+"${task.process}":
+  cat: \$(echo \$(cat --help 2>&1) | sed 's/ (.*//')
+  gzip: \$(echo \$(gzip --help 2>&1) | sed 's/ (.*//')
+END_VERSIONS
+"""
     }
   }
 }
diff --git a/workflows/illumina.nf b/workflows/illumina.nf
index 564f33c..342e6a5 100644
--- a/workflows/illumina.nf
+++ b/workflows/illumina.nf
@@ -78,13 +78,13 @@ workflow ILLUMINA {
   // Credit to nf-core/viralrecon. Source: https://github.com/nf-core/viralrecon/blob/a85d5969f9025409e3618d6c280ef15ce417df65/workflows/illumina.nf#L221
   // Concatenate FastQ files from same sample if required
   CAT_ILLUMINA_FASTQ(ch_input)
-  ch_versions = ch_versions.mix(CAT_ILLUMINA_FASTQ.out.versions)
+  ch_versions = ch_versions.mix(CAT_ILLUMINA_FASTQ.out.versions.first().ifEmpty(null))
 
   IRMA(CAT_ILLUMINA_FASTQ.out.reads, irma_module)
-  ch_versions = ch_versions.mix(IRMA.out.versions)
+  ch_versions = ch_versions.mix(IRMA.out.versions.first().ifEmpty(null))
 
   BLAST_BLASTN(IRMA.out.consensus, BLAST_MAKEBLASTDB.out.db)
-  ch_versions = ch_versions.mix(BLAST_BLASTN.out.versions)
+  ch_versions = ch_versions.mix(BLAST_BLASTN.out.versions.first().ifEmpty(null))
 
   ch_blast = BLAST_BLASTN.out.txt.collect({ it[1] })
   SUBTYPING_REPORT(ZSTD_DECOMPRESS_CSV.out.file, ch_blast)

From 78c5c4b21c5a90bf8e7e2266c612158651577a13 Mon Sep 17 00:00:00 2001
From: Peter Kruczkiewicz <peter.kruczkiewicz@gmail.com>
Date: Mon, 10 Jul 2023 15:18:31 -0500
Subject: [PATCH 07/11] fix merge conflict typo

---
 bin/parse_influenza_blast_results.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/bin/parse_influenza_blast_results.py b/bin/parse_influenza_blast_results.py
index bb4c184..584c35a 100755
--- a/bin/parse_influenza_blast_results.py
+++ b/bin/parse_influenza_blast_results.py
@@ -389,7 +389,6 @@ def report(flu_metadata, blast_results, excel_report, top, pident_threshold,
     df_md = pl.read_csv(
         flu_metadata,
         has_header=True,
-        has_header=False,
         dtypes=dict(md_cols),
     )
 

From f7acabac11bb597c8db70796ba29ead17f483778 Mon Sep 17 00:00:00 2001
From: Peter Kruczkiewicz <peter.kruczkiewicz@gmail.com>
Date: Mon, 10 Jul 2023 16:05:50 -0500
Subject: [PATCH 08/11] parse_influenza_blast_results.py: fix missing var,
 remove unused threads cli opt

---
 bin/parse_influenza_blast_results.py | 14 +++++++++++---
 modules/local/subtyping_report.nf    | 11 ++---------
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/bin/parse_influenza_blast_results.py b/bin/parse_influenza_blast_results.py
index 584c35a..dc06e30 100755
--- a/bin/parse_influenza_blast_results.py
+++ b/bin/parse_influenza_blast_results.py
@@ -296,6 +296,7 @@ def find_h_or_n_type(df_merge, seg, is_iav):
     if is_iav:
         type_counts = df_segment["Genotype"].value_counts(sort=True)
         type_counts = type_counts.filter(~pl.col("Genotype").is_null())
+        reg_h_or_n_type = "[Hh]" if h_or_n == "H" else "[Nn]"
         df_type_counts = type_counts.with_columns(pl.lit(type_counts["Genotype"].str.extract(reg_h_or_n_type + r"(\d+)").alias(type_name)))
         df_type_counts = df_type_counts.filter(~pl.col(type_name).is_null())
         logging.debug(f"{df_type_counts}")
@@ -355,12 +356,19 @@ def find_h_or_n_type(df_merge, seg, is_iav):
     "--pident-threshold", default=0.85, help="BLAST percent identity threshold"
 )
 @click.option('--min-aln-length', default=50, help="Min BLAST alignment length threshold")
-@click.option("--threads", default=4, help="Number of BLAST result parsing threads.")
 @click.option("--get-top-ref", default=False, help="Get top ref accession id from ncbi database.")
 @click.option("--sample-name", default="", help="Sample Name.")
 @click.argument("blast_results", nargs=-1)
-def report(flu_metadata, blast_results, excel_report, top, pident_threshold,
-           min_aln_length, threads, get_top_ref, sample_name):
+def report(
+        flu_metadata,
+        blast_results,
+        excel_report,
+        top,
+        pident_threshold,
+        min_aln_length,
+        get_top_ref,
+        sample_name
+):
     from rich.traceback import install
     install(show_locals=True, width=120, word_wrap=True)
     logging.basicConfig(
diff --git a/modules/local/subtyping_report.nf b/modules/local/subtyping_report.nf
index 95280af..1c3c625 100644
--- a/modules/local/subtyping_report.nf
+++ b/modules/local/subtyping_report.nf
@@ -1,9 +1,3 @@
-// Import generic module functions
-include { initOptions; saveFiles; getSoftwareName } from './functions'
-
-params.options = [:]
-options        = initOptions(params.options)
-
 process SUBTYPING_REPORT {
   memory { 
     // Dynamically determine how much memory is required for this task based on 
@@ -33,17 +27,16 @@ process SUBTYPING_REPORT {
   path(blastn_results)
 
   output:
-  path('iav-subtyping-report.xlsx'), emit: report
+  path('nf-flu-subtyping-report.xlsx'), emit: report
   path('parse_influenza_blast_results.log'), emit: log
   path "versions.yml", emit: versions
 
   script:
   """
   parse_influenza_blast_results.py \\
-   --threads ${task.cpus} \\
    --flu-metadata $genomeset \\
    --top ${params.max_top_blastn} \\
-   --excel-report iav-subtyping-report.xlsx \\
+   --excel-report nf-flu-subtyping-report.xlsx \\
    --pident-threshold $params.pident_threshold \\
    $blastn_results
   ln -s .command.log parse_influenza_blast_results.log

From dedcfb5a72a15d285800437d879ac9dcf33c685c Mon Sep 17 00:00:00 2001
From: Peter Kruczkiewicz <peter.kruczkiewicz@gmail.com>
Date: Tue, 11 Jul 2023 11:09:07 -0500
Subject: [PATCH 09/11] fix pull_top_ref_id.nf

---
 bin/parse_influenza_blast_results.py | 84 +++++++++++++++++-----------
 modules/local/pull_top_ref_id.nf     |  3 +-
 2 files changed, 52 insertions(+), 35 deletions(-)

diff --git a/bin/parse_influenza_blast_results.py b/bin/parse_influenza_blast_results.py
index dc06e30..18c4b0b 100755
--- a/bin/parse_influenza_blast_results.py
+++ b/bin/parse_influenza_blast_results.py
@@ -11,13 +11,13 @@
 import logging
 import re
 from collections import defaultdict
-from typing import Dict, List, Optional, Tuple
 
 import click
 import numpy as np
 import pandas as pd
 import polars as pl
 from rich.logging import RichHandler
+from typing import Dict, List, Optional, Tuple
 
 LOG_FORMAT = "%(asctime)s %(levelname)s: %(message)s [in %(filename)s:%(lineno)d]"
 logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
@@ -297,7 +297,8 @@ def find_h_or_n_type(df_merge, seg, is_iav):
         type_counts = df_segment["Genotype"].value_counts(sort=True)
         type_counts = type_counts.filter(~pl.col("Genotype").is_null())
         reg_h_or_n_type = "[Hh]" if h_or_n == "H" else "[Nn]"
-        df_type_counts = type_counts.with_columns(pl.lit(type_counts["Genotype"].str.extract(reg_h_or_n_type + r"(\d+)").alias(type_name)))
+        df_type_counts = type_counts.with_columns(
+            pl.lit(type_counts["Genotype"].str.extract(reg_h_or_n_type + r"(\d+)").alias(type_name)))
         df_type_counts = df_type_counts.filter(~pl.col(type_name).is_null())
         logging.debug(f"{df_type_counts}")
         type_to_count = defaultdict(int)
@@ -369,47 +370,31 @@ def report(
         get_top_ref,
         sample_name
 ):
-    from rich.traceback import install
-    install(show_locals=True, width=120, word_wrap=True)
-    logging.basicConfig(
-        format="%(message)s",
-        datefmt="[%Y-%m-%d %X]",
-        level=logging.DEBUG,
-        handlers=[RichHandler(rich_tracebacks=True, tracebacks_show_locals=True)],
-    )
+    init_logging()
 
     logging.info(f'Parsing Influenza metadata file "{flu_metadata}"')
 
-    md_cols = [
-        ("#Accession", str),
-        ("Release_Date", pl.Categorical),
-        ("Genus", pl.Categorical),
-        ("Length", pl.UInt16),
-        ("Genotype", str),
-        ("Segment", pl.Categorical),
-        ("Publications", str),
-        ("Geo_Location", pl.Categorical),
-        ("Host", pl.Categorical),
-        ("Isolation_Source", pl.Categorical),
-        ("Collection_Date", pl.Categorical),
-        ("GenBank_Title", str),
-    ]
-    df_md = pl.read_csv(
-        flu_metadata,
-        has_header=True,
-        dtypes=dict(md_cols),
-    )
+    df_md = read_refseq_metadata(flu_metadata)
 
     unique_subtypes = df_md.select("Genotype").unique()
     unique_subtypes = unique_subtypes.filter(~pl.col("Genotype").is_null())
     logging.info(
-        f"Parsed Influenza metadata file into DataFrame with n={df_md.shape[0]} rows and n={df_md.shape[1]} columns. There are {len(unique_subtypes)} unique subtypes. "
+        f"Parsed Influenza metadata file into DataFrame with n={df_md.shape[0]} rows and n={df_md.shape[1]} columns. "
+        f"There are {len(unique_subtypes)} unique subtypes."
     )
     regex_subtype_pattern = r"\((H\d+N\d+|" + "|".join(list(unique_subtypes["Genotype"])) + r")\)"
     results = [
-        parse_blast_result(blast_result, df_md, regex_subtype_pattern, get_top_ref, top=top,
-                           pident_threshold=pident_threshold,
-                           min_aln_length=min_aln_length) for blast_result in blast_results]
+        parse_blast_result(
+            blast_result,
+            df_md,
+            regex_subtype_pattern,
+            get_top_ref,
+            top=top,
+            pident_threshold=pident_threshold,
+            min_aln_length=min_aln_length
+        )
+        for blast_result in blast_results
+    ]
 
     if not get_top_ref:
         dfs_blast = []
@@ -474,6 +459,39 @@ def report(
         df_ref_id.write_csv(sample_name + ".topsegments.csv", separator=",", has_header=True)
 
 
+def read_refseq_metadata(flu_metadata):
+    """Load the reference sequence metadata CSV into a polars DataFrame.
+
+    The file is read with its header row; the columns named below are parsed with the listed dtypes.
+    """
+    column_dtypes = {
+        "#Accession": str,
+        "Release_Date": pl.Categorical,
+        "Genus": pl.Categorical,
+        "Length": pl.UInt16,
+        "Genotype": str,
+        "Segment": pl.Categorical,
+        "Publications": str,
+        "Geo_Location": pl.Categorical,
+        "Host": pl.Categorical,
+        "Isolation_Source": pl.Categorical,
+        "Collection_Date": pl.Categorical,
+        "GenBank_Title": str,
+    }
+    return pl.read_csv(flu_metadata, has_header=True, dtypes=column_dtypes)
+
+
+def init_logging():
+    """Install rich tracebacks and route DEBUG-level root logging through a RichHandler."""
+    from rich.traceback import install
+    install(show_locals=True, width=120, word_wrap=True)
+    # force=True: the module-level logging.basicConfig() already ran, so without it this call is a no-op
+    logging.basicConfig(
+        format="%(message)s", datefmt="[%Y-%m-%d %X]", level=logging.DEBUG, force=True,
+        handlers=[RichHandler(rich_tracebacks=True, tracebacks_show_locals=True)],
+    )
+
+
 def get_col_widths(df, index=False):
     """Calculate column widths based on column headers and contents"""
     if index:
diff --git a/modules/local/pull_top_ref_id.nf b/modules/local/pull_top_ref_id.nf
index 6e9f0d7..59ccf64 100644
--- a/modules/local/pull_top_ref_id.nf
+++ b/modules/local/pull_top_ref_id.nf
@@ -1,6 +1,6 @@
 process PULL_TOP_REF_ID {
   tag "$meta.id"
-  label 'process_medium'
+  label 'process_low'
 
   conda (params.enable_conda ? 'conda-forge::python=3.10 conda-forge::biopython=1.80 conda-forge::openpyxl=3.1.0 conda-forge::pandas=1.5.3 conda-forge::rich=12.6.0 conda-forge::typer=0.7.0 conda-forge::xlsxwriter=3.0.8 conda-forge::polars=0.17.9 conda-forge::pyarrow=11.0.0' : null)
   if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
@@ -20,7 +20,6 @@ process PULL_TOP_REF_ID {
   script:
   """
   parse_influenza_blast_results.py \\
-    --threads ${task.cpus} \\
     --flu-metadata $genomeset \\
     --get-top-ref True \\
     --top 1 \\

From b21b89dc48ebad97dc93541fe0c0ad71ec4ca2d9 Mon Sep 17 00:00:00 2001
From: Peter Kruczkiewicz <peter.kruczkiewicz@gmail.com>
Date: Tue, 11 Jul 2023 12:12:32 -0500
Subject: [PATCH 10/11] Update ci.yml

---
 .github/workflows/ci.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index b06d54d..4648ac8 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -202,12 +202,12 @@ jobs:
         with:
           name: nanopore-test-results-pipline_info-${{ matrix.nxf_ver }}
           path: results/pipeline_info
-      - name: Upload iav-subtyping-report.xlsx
+      - name: Upload nf-flu-subtyping-report.xlsx
         if: success()
         uses: actions/upload-artifact@v1.0.0
         with:
           name: nanopore-test-results-subtyping-report-${{ matrix.nxf_ver }}
-          path: results/iav-subtyping-report.xlsx
+          path: results/nf-flu-subtyping-report.xlsx
       - name: Upload multiqc_report.html
         if: success()
         uses: actions/upload-artifact@v1.0.0

From 2ed8bf572eaabae94836ea0db576afb7f6abf335 Mon Sep 17 00:00:00 2001
From: Peter Kruczkiewicz <peter.kruczkiewicz@gmail.com>
Date: Tue, 11 Jul 2023 15:18:14 -0500
Subject: [PATCH 11/11] Update docs

---
 CHANGELOG.md         |  8 ++++++++
 README.md            | 13 +++++++------
 docs/output.md       |  2 +-
 docs/usage.md        |  8 ++++----
 nextflow.config      |  2 +-
 nextflow_schema.json |  8 ++++----
 6 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d8833b1..54fa4fe 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,14 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [[3.3.0](https://github.com/CFIA-NCFAD/nf-flu/releases/tag/3.3.0)] - 2023-07-11
+
+This release migrates to more recently updated Influenza virus sequences, since the [NCBI Influenza DB FTP data](https://ftp.ncbi.nih.gov/genomes/INFLUENZA/) was last updated on 2020-10-13. By default, nf-flu now uses all Orthomyxoviridae virus sequences and metadata parsed from the daily updated NCBI Viruses [`AllNucleotide.fa`](https://ftp.ncbi.nlm.nih.gov/genomes/Viruses/AllNucleotide/) and [`AllNuclMetadata.csv.gz`](https://ftp.ncbi.nlm.nih.gov/genomes/Viruses/AllNuclMetadata/AllNuclMetadata.csv.gz), which have been uploaded to [Figshare](https://figshare.com/articles/dataset/2023-06-14_-_NCBI_Viruses_-_Orthomyxoviridae/23608782) as Zstd compressed files. nf-flu no longer uses the [influenza.fna.gz](https://ftp.ncbi.nih.gov/genomes/INFLUENZA/influenza.fna.gz) and [genomeset.dat.gz](https://ftp.ncbi.nih.gov/genomes/INFLUENZA/genomeset.dat.gz) files for Influenza sequences and metadata, respectively.
+
+### Fixes
+
+* More up-to-date Influenza sequences database used by default (#24)
+
 ## [[3.2.1](https://github.com/CFIA-NCFAD/nf-flu/releases/tag/3.2.1)] - 2023-07-07
 
 ### Fixes
diff --git a/README.md b/README.md
index 473efca..9e77960 100644
--- a/README.md
+++ b/README.md
@@ -17,13 +17,14 @@ After reference sequence selection, the pipeline performs read mapping to each r
 
 ## Pipeline summary
 
-1. Download latest [NCBI Influenza DB][] sequences and metadata (or use user-specified files)
-2. Merge reads of re-sequenced samples ([`cat`](http://www.linfo.org/cat.html)) (if needed)
+1. Download latest [NCBI Orthomyxoviridae sequences](https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Undef&id=11308&lvl=3&keep=1&srchmode=1&unlock) and metadata (parsed from [NCBI Viruses FTP data](https://ftp.ncbi.nlm.nih.gov/genomes/Viruses/AllNucleotide/)).
+2. Merge reads of re-sequenced samples ([`cat`](http://www.linfo.org/cat.html)) (if needed).
 3. Assembly of Influenza gene segments with [IRMA][] using the built-in FLU module
-4. Nucleotide [BLAST][] search against [NCBI Influenza DB][]
-5. Automatically select top match references for segments
-6. H/N subtype prediction and Excel XLSX report generation based on BLAST results
-7. Perform Variant calling and genome assembly for all segments.
+4. Nucleotide [BLAST][] search against [Influenza sequences from NCBI](https://ftp.ncbi.nlm.nih.gov/genomes/Viruses/AllNucleotide/).
+5. H/N subtype prediction and Excel XLSX report generation based on BLAST results.
+6. Automatically select top match reference sequences for segments
+7. Read mapping, variant calling and consensus sequence generation for each segment against top reference sequence based on BLAST results.
+8. MultiQC report generation.
 
 ## Quick Start
 
diff --git a/docs/output.md b/docs/output.md
index d0a904e..7f69321 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -78,7 +78,7 @@ The primary output from [IRMA][] are the consensus sequences for gene segments,
 
 </details>
 
-Nucleotide [BLAST](https://blast.ncbi.nlm.nih.gov/Blast.cgi) (`blastn`) is used to query [IRMA][] assembled gene segment sequences against the [NCBI Influenza DB][] sequences (and optionally, against user-specified sequences (`--ref_db`) to predict the H and N subtype of each sample if possible (i.e. if segments 4 (hemagglutinin) and/or 6 (neuraminidase) were assembled) and to determine the closest matching reference sequence for each segment for reference mapped assembly.
+Nucleotide [BLAST](https://blast.ncbi.nlm.nih.gov/Blast.cgi) (`blastn`) is used to query [IRMA][] assembled gene segment sequences against [Influenza sequences from NCBI](https://ftp.ncbi.nlm.nih.gov/genomes/Viruses/AllNucleotide/) (and optionally, against user-specified sequences (`--ref_db`)) to predict the H and N subtype of each sample if possible (i.e. if segments 4 (hemagglutinin) and/or 6 (neuraminidase) were assembled) and to determine the closest matching reference sequence for each segment for reference mapped assembly.
 
 ### Coverage Plots
 
diff --git a/docs/usage.md b/docs/usage.md
index eef8cee..d59ce7f 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -261,17 +261,17 @@ Maximum of top blastn result reported
 
 - Optional
 - Type: string
-- Default: `https://ftp.ncbi.nih.gov/genomes/INFLUENZA/influenza.fna.gz`
+- Default: `https://api.figshare.com/v2/file/download/41415330`
 
-Path/URL to NCBI Influenza DB sequences FASTA file.
+Path/URL to Zstandard compressed NCBI Influenza virus sequences FASTA file.
 
 #### `--ncbi_influenza_metadata`
 
 - Optional
 - Type: string
-- Default: `https://ftp.ncbi.nih.gov/genomes/INFLUENZA/genomeset.dat.gz`
+- Default: `https://api.figshare.com/v2/file/download/41415333`
 
-Path/URL to NCBI Influenza DB metadata file.
+Path/URL to Zstandard compressed NCBI Influenza virus sequences metadata CSV file.
 
 ### Generic options
 
diff --git a/nextflow.config b/nextflow.config
index ea61c7f..2f4bd39 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -153,7 +153,7 @@ manifest {
   description     = 'Influenza A virus genome assembly pipeline'
   homePage        = 'https://github.com/CFIA-NCFAD/nf-flu'
   author          = 'Peter Kruczkiewicz, Hai Nguyen'
-  version         = '3.2.1'
+  version         = '3.3.0'
   nextflowVersion = '>=21.10'
   mainScript      = 'main.nf'
   doi             = '10.5281/zenodo.7011213'
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 32515d3..58b0e32 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -213,14 +213,14 @@
                 },
                 "ncbi_influenza_fasta": {
                     "type": "string",
-                    "default": "https://ftp.ncbi.nih.gov/genomes/INFLUENZA/influenza.fna.gz",
-                    "description": "Path/URL to NCBI Influenza DB sequences FASTA file.",
+                    "default": "https://api.figshare.com/v2/file/download/41415330",
+                    "description": "Path/URL to Zstandard compressed NCBI Influenza virus sequences FASTA file.",
                     "fa_icon": "fas fa-file-alt"
                 },
                 "ncbi_influenza_metadata": {
                     "type": "string",
-                    "default": "https://ftp.ncbi.nih.gov/genomes/INFLUENZA/genomeset.dat.gz",
-                    "description": "Path/URL to NCBI Influenza DB metadata file.",
+                    "default": "https://api.figshare.com/v2/file/download/41415333",
+                    "description": "Path/URL to Zstandard compressed NCBI Influenza virus sequences metadata CSV file.",
                     "fa_icon": "fas fa-file-csv"
                 }
             },