Merge branch 'CW-2780' into 'dev'

Use CATEGORICAL as type for sample_name in seq_summary. Closes CW-2780. See merge request epi2melabs/workflows/wf-artic!151
epi2me-labs · Sep 19, 2023 · 9ff9343 · 9ff9343
2 parents a1130b7 + b6b608c
commit 9ff9343
Show file tree

Hide file tree

Showing 2 changed files with 18 additions and 2 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [unreleased]
+### Fixed
+- Reporting of sequence summaries crashing with `TypeError`
+
 ## [v0.3.31]
 ### Changed
 - `--update_data` default is now `true` to get latest lineage data from Pangolin and Nextclade

diff --git a/bin/workflow_glue/report.py b/bin/workflow_glue/report.py
@@ -11,10 +11,14 @@
 from bokeh.models import Panel, Range1d, Tabs
 import numpy as np
 import pandas as pd
+from pandas.api import types as pd_types
 import pysam
 
 from .util import get_named_logger, wf_parser  # noqa: ABS101
 
+# Define categorical types
+CATEGORICAL = pd_types.CategoricalDtype(ordered=True)
+
 
 def read_files(summaries, **kwargs):
     """Read a set of files and join to single dataframe."""
@@ -47,7 +51,11 @@ def output_json(df, consensus_fasta, fastcat_stats):
             newdf[x].values.tolist() for x in newdf.columns)))
         all_json[sample] = final
     final_json = {'data': []}
-    seq_summary = pd.read_csv(fastcat_stats, delimiter="\t")
+    seq_summary = pd.read_csv(
+        fastcat_stats,
+        delimiter="\t",
+        dtype={"sample_name": CATEGORICAL}
+        )
     readcounts = seq_summary['sample_name'].value_counts().to_dict()
     # parse the consensus fasta to get extra info required
     with pysam.FastxFile(consensus_fasta) as fh:
@@ -93,7 +101,11 @@ def main(args):
 This section displays basic QC metrics indicating read data quality.
 ''')
     # read length summary
-    seq_summary = pd.read_csv(args.fastcat_stats, delimiter="\t")
+    seq_summary = pd.read_csv(
+        args.fastcat_stats,
+        delimiter="\t",
+        dtype={"sample_name": CATEGORICAL}
+        )
     total_bases = seq_summary['read_length'].sum()
     mean_length = total_bases / len(seq_summary)
     median_length = np.median(seq_summary['read_length'])