Skip to content

Commit

Permalink
sort subtyping report results in same order as input samplesheet.csv
Browse files Browse the repository at this point in the history
  • Loading branch information
peterk87 committed Aug 17, 2023
1 parent d54128d commit c661dfc
Show file tree
Hide file tree
Showing 5 changed files with 104 additions and 47 deletions.
128 changes: 87 additions & 41 deletions bin/parse_influenza_blast_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,16 @@

import logging
import re
import sys
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import click
import numpy as np
import pandas as pd
import polars as pl
from rich.logging import RichHandler
from typing import Dict, List, Optional, Tuple

LOG_FORMAT = "%(asctime)s %(levelname)s: %(message)s [in %(filename)s:%(lineno)d]"
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
Expand Down Expand Up @@ -375,6 +377,7 @@ def find_h_or_n_type(df_merge, seg, is_iav):
@click.option('--min-aln-length', default=50, help="Min BLAST alignment length threshold")
@click.option("--get-top-ref", default=False, help="Get top ref accession id from ncbi database.")
@click.option("--sample-name", default="", help="Sample Name.")
@click.option("--samplesheet", default="", help="samplesheet.csv to get order of samples.")
@click.argument("blast_results", nargs=-1)
def report(
flu_metadata,
Expand All @@ -384,9 +387,20 @@ def report(
pident_threshold,
min_aln_length,
get_top_ref,
sample_name
sample_name,
samplesheet,
):
init_logging()
if not blast_results:
logging.error("No BLAST results files specified!")
sys.exit(1)

ordered_samples: Optional[List[str]] = None
if samplesheet:
samplesheet_path = Path(samplesheet)
if samplesheet_path.resolve().exists():
ordered_samples = pl.read_csv(samplesheet_path)['sample'].to_list()
logging.info(f"Using samplesheet to order samples: {ordered_samples}")

logging.info(f'Parsing Influenza metadata file "{flu_metadata}"')

Expand All @@ -412,72 +426,104 @@ def report(
for blast_result in blast_results
]

if not get_top_ref:
if get_top_ref:
df_top_seg_matches, subtype_results_summary = results[0]
write_top_segment_matches(df_top_seg_matches, subtype_results_summary, sample_name)
else:
dfs_blast = []
all_subtype_results = {}
for parsed_result in results:
if parsed_result is None:
continue
df_blast, subtype_results_summary = parsed_result
if df_blast is not None:
dfs_blast.append(df_blast.to_pandas())
dfs_blast.append(df_blast)
sample = subtype_results_summary["sample"]
all_subtype_results[sample] = subtype_results_summary
df_all_blast = pd.concat(dfs_blast).rename(
columns=dict(BLAST_RESULTS_REPORT_COLUMNS)
)
df_all_blast = pl.concat(dfs_blast, how='vertical')
df_subtype_results = pd.DataFrame(all_subtype_results).transpose()
ordered_sample_to_idx = {sample: idx for idx, sample in enumerate(ordered_samples)} if ordered_samples else None

cols = pd.Series(SUBTYPE_RESULTS_SUMMARY_COLUMNS)
cols = cols[cols.isin(df_subtype_results.columns)]
df_subtype_predictions = df_subtype_results[cols].rename(
columns=SUBTYPE_RESULTS_SUMMARY_FINAL_NAMES
)
df_subtype_results = df_subtype_results[cols]

if ordered_samples and ordered_sample_to_idx:
df_subtype_results = df_subtype_results.sort_values(
'sample',
ascending=True,
key=lambda x: x.map(ordered_sample_to_idx)
)
else:
df_subtype_results = df_subtype_results.sort_values('sample', ascending=True)
cols = pd.Series(H_COLUMNS)
cols = cols[cols.isin(df_subtype_results.columns)]
df_H = df_subtype_results[cols].rename(columns=SUBTYPE_RESULTS_SUMMARY_FINAL_NAMES)
df_H = df_subtype_results[cols]
cols = pd.Series(N_COLUMNS)
cols = cols[cols.isin(df_subtype_results.columns)]
df_N = df_subtype_results[cols].rename(columns=SUBTYPE_RESULTS_SUMMARY_FINAL_NAMES)
df_N = df_subtype_results[cols]

# cast all categorical columns to string so that they can be sorted/ordered in a sensible way
df_all_blast = df_all_blast.with_columns(
pl.col(pl.Categorical).cast(pl.Utf8),
)
# apply custom sort order to sample column if samplesheet is provided
sample_col = pl.col('sample').map_dict(
ordered_sample_to_idx) if ordered_samples and ordered_sample_to_idx else pl.col('sample')
# Sort by sample names, segment numbers and bitscore
df_all_blast = df_all_blast.sort(
sample_col,
pl.col('sample_segment'),
pl.col('bitscore'),
descending=[False, False, True]
)
df_all_blast_pandas: pd.DataFrame = df_all_blast.to_pandas()
# Convert segment number to segment name (1 -> "1_PB2")
df_all_blast["Sample Genome Segment Number"] = df_all_blast["Sample Genome Segment Number"]. \
df_all_blast_pandas["sample_segment"] = df_all_blast_pandas["sample_segment"]. \
apply(lambda x: SEGMENT_NAMES[int(x)])
# Sort by sample names, segment numbers and bitscore
df_all_blast = df_all_blast.sort_values(
["Sample", "Sample Genome Segment Number", "BLASTN Bitscore"],
ascending=[True, True, False]
# Rename columns to more human-readable names
df_all_blast_pandas.rename(
columns=dict(BLAST_RESULTS_REPORT_COLUMNS)
)
df_subtype_predictions = df_subtype_results[cols].rename(
columns=SUBTYPE_RESULTS_SUMMARY_FINAL_NAMES
)
df_H = df_H.rename(columns=SUBTYPE_RESULTS_SUMMARY_FINAL_NAMES)
df_N = df_N.rename(columns=SUBTYPE_RESULTS_SUMMARY_FINAL_NAMES)
# Write each dataframe to a separate sheet in the excel report
write_excel(
[
("Subtype Predictions", df_subtype_predictions),
("Top Segment Matches", df_all_blast),
("Top Segment Matches", df_all_blast_pandas),
("H Segment Results", df_H),
("N Segment Results", df_N),
],
output_dest=excel_report,
)
else:
df_blast, subtype_results_summary = results[0]
df_blast = df_blast.rename(mapping=dict(BLAST_RESULTS_REPORT_COLUMNS))
df_ref_id = df_blast.select(
pl.col([
'Sample',
'Sample Genome Segment Number',
'Reference NCBI Accession',
'BLASTN Bitscore',
'Reference Sequence ID'
])
)
df_ref_id = df_ref_id.with_columns(
pl.when(pl.col("Reference NCBI Accession").is_null())
.then(pl.col("Reference Sequence ID"))
.otherwise(pl.col("Reference NCBI Accession"))
.str.strip()
.alias('Reference NCBI Accession')
)
df_ref_id = df_ref_id.with_columns(
pl.col("Sample Genome Segment Number").apply(lambda x: SEGMENT_NAMES[int(x)])
.alias("Sample Genome Segment Number"))
df_ref_id.write_csv(sample_name + ".topsegments.csv", separator=",", has_header=True)


def write_top_segment_matches(df_top_seg_matches, subtype_results_summary, sample_name):
    """Write a sample's top matching reference segments to "<sample_name>.topsegments.csv".

    Renames the BLAST result columns to their report names, keeps only the
    identifying columns, fills in a usable reference accession for every row
    and converts segment numbers to segment names before writing the CSV.

    Args:
        df_top_seg_matches: polars DataFrame of top segment BLAST matches.
        subtype_results_summary: subtype summary dict.
            NOTE(review): currently unused here — kept for interface stability.
        sample_name: sample name used as the output file prefix.
    """
    renamed = df_top_seg_matches.rename(mapping=dict(BLAST_RESULTS_REPORT_COLUMNS))
    report_columns = [
        'Sample',
        'Sample Genome Segment Number',
        'Reference NCBI Accession',
        'BLASTN Bitscore',
        'Reference Sequence ID',
    ]
    top_segments = renamed.select(pl.col(report_columns))
    # Fall back to the reference sequence ID when no NCBI accession is present,
    # stripping surrounding whitespace either way.
    accession = (
        pl.when(pl.col("Reference NCBI Accession").is_null())
        .then(pl.col("Reference Sequence ID"))
        .otherwise(pl.col("Reference NCBI Accession"))
        .str.strip()
        .alias('Reference NCBI Accession')
    )
    top_segments = top_segments.with_columns(accession)
    # Replace the numeric segment identifier with its name, e.g. 1 -> "1_PB2".
    segment_name = (
        pl.col("Sample Genome Segment Number")
        .apply(lambda x: SEGMENT_NAMES[int(x)])
        .alias("Sample Genome Segment Number")
    )
    top_segments = top_segments.with_columns(segment_name)
    top_segments.write_csv(sample_name + ".topsegments.csv", separator=",", has_header=True)


def read_refseq_metadata(flu_metadata):
Expand Down
3 changes: 0 additions & 3 deletions modules/local/misc.nf
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,3 @@ process CAT_CONSENSUS {
END_VERSIONS
"""
}



2 changes: 2 additions & 0 deletions modules/local/subtyping_report.nf
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ process SUBTYPING_REPORT {
input:
path(genomeset)
path(blastn_results)
path(samplesheet)

output:
path('nf-flu-subtyping-report.xlsx'), emit: report
Expand All @@ -38,6 +39,7 @@ process SUBTYPING_REPORT {
--top ${params.max_top_blastn} \\
--excel-report nf-flu-subtyping-report.xlsx \\
--pident-threshold $params.pident_threshold \\
--samplesheet $samplesheet \\
$blastn_results
ln -s .command.log parse_influenza_blast_results.log
Expand Down
6 changes: 5 additions & 1 deletion workflows/illumina.nf
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,11 @@ workflow ILLUMINA {
ch_versions = ch_versions.mix(BLAST_BLASTN.out.versions.first().ifEmpty(null))

ch_blast = BLAST_BLASTN.out.txt.collect({ it[1] })
SUBTYPING_REPORT(ZSTD_DECOMPRESS_CSV.out.file, ch_blast)
SUBTYPING_REPORT(
ZSTD_DECOMPRESS_CSV.out.file,
ch_blast,
CHECK_SAMPLE_SHEET.out
)
ch_versions = ch_versions.mix(SUBTYPING_REPORT.out.versions)

SOFTWARE_VERSIONS(ch_versions.unique().collectFile(name: 'collated_versions.yml'))
Expand Down
12 changes: 10 additions & 2 deletions workflows/nanopore.nf
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,11 @@ workflow NANOPORE {
//Generate subtype prediction report
if (!params.skip_irma_subtyping_report){
ch_blast_irma = BLAST_BLASTN_IRMA.out.txt.collect({ it[1] })
SUBTYPING_REPORT_IRMA_CONSENSUS(ZSTD_DECOMPRESS_CSV.out.file, ch_blast_irma)
SUBTYPING_REPORT_IRMA_CONSENSUS(
ZSTD_DECOMPRESS_CSV.out.file,
ch_blast_irma,
CHECK_SAMPLE_SHEET.out
)
}

// Prepare top ncbi accession id for each segment of each sample (id which has top bitscore)
Expand Down Expand Up @@ -244,7 +248,11 @@ workflow NANOPORE {
ch_versions = ch_versions.mix(BLAST_BLASTN_CONSENSUS.out.versions)

ch_blastn_consensus = BLAST_BLASTN_CONSENSUS.out.txt.collect({ it[1] })
SUBTYPING_REPORT_BCF_CONSENSUS(ZSTD_DECOMPRESS_CSV.out.file, ch_blastn_consensus)
SUBTYPING_REPORT_BCF_CONSENSUS(
ZSTD_DECOMPRESS_CSV.out.file,
ch_blastn_consensus,
CHECK_SAMPLE_SHEET.out
)
ch_versions = ch_versions.mix(SUBTYPING_REPORT_BCF_CONSENSUS.out.versions)

if (params.ref_db){
Expand Down

0 comments on commit c661dfc

Please sign in to comment.