Skip to content

Commit

Permalink
sort subtyping report results in same order as input samplesheet.csv
Browse files Browse the repository at this point in the history
  • Loading branch information
peterk87 committed Aug 17, 2023
1 parent d54128d commit c661dfc
Show file tree
Hide file tree
Showing 5 changed files with 104 additions and 47 deletions.
128 changes: 87 additions & 41 deletions bin/parse_influenza_blast_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,16 @@

import logging
import re
import sys
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import click
import numpy as np
import pandas as pd
import polars as pl
from rich.logging import RichHandler
from typing import Dict, List, Optional, Tuple

LOG_FORMAT = "%(asctime)s %(levelname)s: %(message)s [in %(filename)s:%(lineno)d]"
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
Expand Down Expand Up @@ -375,6 +377,7 @@ def find_h_or_n_type(df_merge, seg, is_iav):
@click.option('--min-aln-length', default=50, help="Min BLAST alignment length threshold")
@click.option("--get-top-ref", default=False, help="Get top ref accession id from ncbi database.")
@click.option("--sample-name", default="", help="Sample Name.")
@click.option("--samplesheet", default="", help="samplesheet.csv to get order of samples.")
@click.argument("blast_results", nargs=-1)
def report(
flu_metadata,
Expand All @@ -384,9 +387,20 @@ def report(
pident_threshold,
min_aln_length,
get_top_ref,
sample_name
sample_name,
samplesheet,
):
init_logging()
if not blast_results:
logging.error("No BLAST results files specified!")
sys.exit(1)

ordered_samples: Optional[List[str]] = None
if samplesheet:
samplesheet_path = Path(samplesheet)
if samplesheet_path.resolve().exists():
ordered_samples = pl.read_csv(samplesheet_path)['sample'].to_list()
logging.info(f"Using samplesheet to order samples: {ordered_samples}")

logging.info(f'Parsing Influenza metadata file "{flu_metadata}"')

Expand All @@ -412,72 +426,104 @@ def report(
for blast_result in blast_results
]

if not get_top_ref:
if get_top_ref:
df_top_seg_matches, subtype_results_summary = results[0]
write_top_segment_matches(df_top_seg_matches, subtype_results_summary, sample_name)
else:
dfs_blast = []
all_subtype_results = {}
for parsed_result in results:
if parsed_result is None:
continue
df_blast, subtype_results_summary = parsed_result
if df_blast is not None:
dfs_blast.append(df_blast.to_pandas())
dfs_blast.append(df_blast)
sample = subtype_results_summary["sample"]
all_subtype_results[sample] = subtype_results_summary
df_all_blast = pd.concat(dfs_blast).rename(
columns=dict(BLAST_RESULTS_REPORT_COLUMNS)
)
df_all_blast = pl.concat(dfs_blast, how='vertical')
df_subtype_results = pd.DataFrame(all_subtype_results).transpose()
ordered_sample_to_idx = {sample: idx for idx, sample in enumerate(ordered_samples)} if ordered_samples else None

cols = pd.Series(SUBTYPE_RESULTS_SUMMARY_COLUMNS)
cols = cols[cols.isin(df_subtype_results.columns)]
df_subtype_predictions = df_subtype_results[cols].rename(
columns=SUBTYPE_RESULTS_SUMMARY_FINAL_NAMES
)
df_subtype_results = df_subtype_results[cols]

if ordered_samples and ordered_sample_to_idx:
df_subtype_results = df_subtype_results.sort_values(
'sample',
ascending=True,
key=lambda x: x.map(ordered_sample_to_idx)
)
else:
df_subtype_results = df_subtype_results.sort_values('sample', ascending=True)
cols = pd.Series(H_COLUMNS)
cols = cols[cols.isin(df_subtype_results.columns)]
df_H = df_subtype_results[cols].rename(columns=SUBTYPE_RESULTS_SUMMARY_FINAL_NAMES)
df_H = df_subtype_results[cols]
cols = pd.Series(N_COLUMNS)
cols = cols[cols.isin(df_subtype_results.columns)]
df_N = df_subtype_results[cols].rename(columns=SUBTYPE_RESULTS_SUMMARY_FINAL_NAMES)
df_N = df_subtype_results[cols]

# cast all categorical columns to string so that they can be sorted/ordered in a sensible way
df_all_blast = df_all_blast.with_columns(
pl.col(pl.Categorical).cast(pl.Utf8),
)
# apply custom sort order to sample column if samplesheet is provided
sample_col = pl.col('sample').map_dict(
ordered_sample_to_idx) if ordered_samples and ordered_sample_to_idx else pl.col('sample')
# Sort by sample names, segment numbers and bitscore
df_all_blast = df_all_blast.sort(
sample_col,
pl.col('sample_segment'),
pl.col('bitscore'),
descending=[False, False, True]
)
df_all_blast_pandas: pd.DataFrame = df_all_blast.to_pandas()
# Convert segment number to segment name (1 -> "1_PB2")
df_all_blast["Sample Genome Segment Number"] = df_all_blast["Sample Genome Segment Number"]. \
df_all_blast_pandas["sample_segment"] = df_all_blast_pandas["sample_segment"]. \
apply(lambda x: SEGMENT_NAMES[int(x)])
# Sort by sample names, segment numbers and bitscore
df_all_blast = df_all_blast.sort_values(
["Sample", "Sample Genome Segment Number", "BLASTN Bitscore"],
ascending=[True, True, False]
# Rename columns to more human-readable names
df_all_blast_pandas.rename(
columns=dict(BLAST_RESULTS_REPORT_COLUMNS)
)
df_subtype_predictions = df_subtype_results[cols].rename(
columns=SUBTYPE_RESULTS_SUMMARY_FINAL_NAMES
)
df_H = df_H.rename(columns=SUBTYPE_RESULTS_SUMMARY_FINAL_NAMES)
df_N = df_N.rename(columns=SUBTYPE_RESULTS_SUMMARY_FINAL_NAMES)
# Write each dataframe to a separate sheet in the excel report
write_excel(
[
("Subtype Predictions", df_subtype_predictions),
("Top Segment Matches", df_all_blast),
("Top Segment Matches", df_all_blast_pandas),
("H Segment Results", df_H),
("N Segment Results", df_N),
],
output_dest=excel_report,
)
else:
df_blast, subtype_results_summary = results[0]
df_blast = df_blast.rename(mapping=dict(BLAST_RESULTS_REPORT_COLUMNS))
df_ref_id = df_blast.select(
pl.col([
'Sample',
'Sample Genome Segment Number',
'Reference NCBI Accession',
'BLASTN Bitscore',
'Reference Sequence ID'
])
)
df_ref_id = df_ref_id.with_columns(
pl.when(pl.col("Reference NCBI Accession").is_null())
.then(pl.col("Reference Sequence ID"))
.otherwise(pl.col("Reference NCBI Accession"))
.str.strip()
.alias('Reference NCBI Accession')
)
df_ref_id = df_ref_id.with_columns(
pl.col("Sample Genome Segment Number").apply(lambda x: SEGMENT_NAMES[int(x)])
.alias("Sample Genome Segment Number"))
df_ref_id.write_csv(sample_name + ".topsegments.csv", separator=",", has_header=True)


def write_top_segment_matches(df_top_seg_matches, subtype_results_summary, sample_name):
    """Write a sample's top matching reference segments to "<sample_name>.topsegments.csv".

    Renames the BLAST result columns to their report names, keeps only the
    identifying columns, fills in a usable reference accession for every row
    and converts segment numbers to segment names before writing the CSV.

    Args:
        df_top_seg_matches: polars DataFrame of top segment BLAST matches.
        subtype_results_summary: subtype summary dict.
            NOTE(review): currently unused here — kept for interface stability.
        sample_name: sample name used as the output file prefix.
    """
    renamed = df_top_seg_matches.rename(mapping=dict(BLAST_RESULTS_REPORT_COLUMNS))
    report_columns = [
        'Sample',
        'Sample Genome Segment Number',
        'Reference NCBI Accession',
        'BLASTN Bitscore',
        'Reference Sequence ID',
    ]
    top_segments = renamed.select(pl.col(report_columns))
    # Fall back to the reference sequence ID when no NCBI accession is present,
    # stripping surrounding whitespace either way.
    accession = (
        pl.when(pl.col("Reference NCBI Accession").is_null())
        .then(pl.col("Reference Sequence ID"))
        .otherwise(pl.col("Reference NCBI Accession"))
        .str.strip()
        .alias('Reference NCBI Accession')
    )
    top_segments = top_segments.with_columns(accession)
    # Replace the numeric segment identifier with its name, e.g. 1 -> "1_PB2".
    segment_name = (
        pl.col("Sample Genome Segment Number")
        .apply(lambda x: SEGMENT_NAMES[int(x)])
        .alias("Sample Genome Segment Number")
    )
    top_segments = top_segments.with_columns(segment_name)
    top_segments.write_csv(sample_name + ".topsegments.csv", separator=",", has_header=True)


def read_refseq_metadata(flu_metadata):
Expand Down
3 changes: 0 additions & 3 deletions modules/local/misc.nf
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,3 @@ process CAT_CONSENSUS {
END_VERSIONS
"""
}



2 changes: 2 additions & 0 deletions modules/local/subtyping_report.nf
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ process SUBTYPING_REPORT {
input:
path(genomeset)
path(blastn_results)
path(samplesheet)

output:
path('nf-flu-subtyping-report.xlsx'), emit: report
Expand All @@ -38,6 +39,7 @@ process SUBTYPING_REPORT {
--top ${params.max_top_blastn} \\
--excel-report nf-flu-subtyping-report.xlsx \\
--pident-threshold $params.pident_threshold \\
--samplesheet $samplesheet \\
$blastn_results
ln -s .command.log parse_influenza_blast_results.log
Expand Down
6 changes: 5 additions & 1 deletion workflows/illumina.nf
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,11 @@ workflow ILLUMINA {
ch_versions = ch_versions.mix(BLAST_BLASTN.out.versions.first().ifEmpty(null))

ch_blast = BLAST_BLASTN.out.txt.collect({ it[1] })
SUBTYPING_REPORT(ZSTD_DECOMPRESS_CSV.out.file, ch_blast)
SUBTYPING_REPORT(
ZSTD_DECOMPRESS_CSV.out.file,
ch_blast,
CHECK_SAMPLE_SHEET.out
)
ch_versions = ch_versions.mix(SUBTYPING_REPORT.out.versions)

SOFTWARE_VERSIONS(ch_versions.unique().collectFile(name: 'collated_versions.yml'))
Expand Down
12 changes: 10 additions & 2 deletions workflows/nanopore.nf
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,11 @@ workflow NANOPORE {
//Generate subtype prediction report
if (!params.skip_irma_subtyping_report){
ch_blast_irma = BLAST_BLASTN_IRMA.out.txt.collect({ it[1] })
SUBTYPING_REPORT_IRMA_CONSENSUS(ZSTD_DECOMPRESS_CSV.out.file, ch_blast_irma)
SUBTYPING_REPORT_IRMA_CONSENSUS(
ZSTD_DECOMPRESS_CSV.out.file,
ch_blast_irma,
CHECK_SAMPLE_SHEET.out
)
}

// Prepare top ncbi accession id for each segment of each sample (id which has top bitscore)
Expand Down Expand Up @@ -244,7 +248,11 @@ workflow NANOPORE {
ch_versions = ch_versions.mix(BLAST_BLASTN_CONSENSUS.out.versions)

ch_blastn_consensus = BLAST_BLASTN_CONSENSUS.out.txt.collect({ it[1] })
SUBTYPING_REPORT_BCF_CONSENSUS(ZSTD_DECOMPRESS_CSV.out.file, ch_blastn_consensus)
SUBTYPING_REPORT_BCF_CONSENSUS(
ZSTD_DECOMPRESS_CSV.out.file,
ch_blastn_consensus,
CHECK_SAMPLE_SHEET.out
)
ch_versions = ch_versions.mix(SUBTYPING_REPORT_BCF_CONSENSUS.out.versions)

if (params.ref_db){
Expand Down

0 comments on commit c661dfc

Please sign in to comment.