Merge branch 'update-template' into 'dev'
Template updates [CW-1564][CW-2303][CW-2855]

See merge request epi2melabs/workflows/wf-artic!153
SamStudio8 committed Oct 24, 2023
2 parents 9ff9343 + 997d362 commit 168191d
Showing 8 changed files with 1,011 additions and 577 deletions.
9 changes: 9 additions & 0 deletions .pre-commit-config.yaml
@@ -17,6 +17,14 @@ repos:
pass_filenames: false
additional_dependencies:
- epi2melabs
- id: build_models
name: build_models
entry: datamodel-codegen --strict-nullable --base-class workflow_glue.results_schema_helpers.BaseModel --use-schema-description --disable-timestamp --input results_schema.yml --input-file-type openapi --output bin/workflow_glue/results_schema.py
language: python
files: 'results_schema.yml'
pass_filenames: false
additional_dependencies:
- datamodel-code-generator
- repo: https://github.com/pycqa/flake8
rev: 5.0.4
hooks:
@@ -37,4 +45,5 @@ repos:
"--import-order-style=google",
"--statistics",
"--max-line-length=88",
"--extend-exclude=bin/workflow_glue/results_schema.py",
]
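The new `build_models` hook regenerates `bin/workflow_glue/results_schema.py` from `results_schema.yml` whenever the schema changes, which is also why flake8 now excludes the generated file. Via `--base-class`, every generated pydantic model inherits from `workflow_glue.results_schema_helpers.BaseModel`. That helper module is not part of this diff, so the following is only a sketch of what it might contain; the settings shown are assumptions:

```python
# Hypothetical sketch of workflow_glue/results_schema_helpers.py; the module
# is referenced by --base-class above but is not shown in this diff.
from pydantic import BaseModel as PydanticBaseModel


class BaseModel(PydanticBaseModel):
    """Shared base class for the models generated into results_schema.py."""

    class Config:
        # assumed settings: reject fields not declared in results_schema.yml
        # so workflow outputs always match the schema
        extra = "forbid"
        validate_assignment = True
```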
58 changes: 58 additions & 0 deletions bin/workflow_glue/check_bam_headers_in_dir.py
@@ -0,0 +1,58 @@
"""Check (u)BAM files for `@SQ` lines whether they are the same in all headers."""

from pathlib import Path
import sys

import pysam

from .util import get_named_logger, wf_parser # noqa: ABS101


def get_sq_lines(xam_file):
"""Extract the `@SQ` lines from the header of a XAM file."""
return pysam.AlignmentFile(xam_file, check_sq=False).header["SQ"]


def main(args):
"""Run the entry point."""
logger = get_named_logger("checkBamHdr")

if not args.input_path.is_dir():
raise ValueError(f"Input path '{args.input_path}' must be a directory.")

target_files = list(args.input_path.glob("*"))
if not target_files:
raise ValueError(f"No files found in input directory '{args.input_path}'.")
# Loop over target files and check if there are `@SQ` lines in all headers or not.
# Set `is_unaligned` accordingly. If there are mixed headers (either with some files
# containing `@SQ` lines and some not or with different files containing different
# `@SQ` lines), set `mixed_headers` to `True`.
first_sq_lines = None
mixed_headers = False
for xam_file in target_files:
sq_lines = get_sq_lines(xam_file)
if first_sq_lines is None:
# this is the first file
first_sq_lines = sq_lines
else:
# this is a subsequent file; check with the first `@SQ` lines
if sq_lines != first_sq_lines:
mixed_headers = True
break

# we set `is_unaligned` to `True` if there were no mixed headers and the last file
# didn't have `@SQ` lines (as we can then be sure that none of the files did)
is_unaligned = not mixed_headers and not sq_lines
# write `is_unaligned` and `mixed_headers` out so that they can be set as env.
# variables
sys.stdout.write(
f"IS_UNALIGNED={int(is_unaligned)};MIXED_HEADERS={int(mixed_headers)}"
)
logger.info(f"Checked (u)BAM headers in '{args.input_path}'.")


def argparser():
"""Argument parser for entrypoint."""
parser = wf_parser("check_bam_headers")
parser.add_argument("input_path", type=Path, help="Path to target directory")
return parser
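The script reports its two findings as a single `KEY=value;KEY=value` line on stdout so that the caller can capture them as environment variables. A hedged sketch of one way to consume that output from Python; the module invocation below is an assumption, as the diff does not show how the workflow actually calls the script:

```python
# One way a caller might consume the script's stdout; the real workflow
# presumably eval's it in a shell/Nextflow task instead. The invocation
# below is an assumption, not the workflow's actual call.
import subprocess

proc = subprocess.run(
    ["python", "-m", "workflow_glue", "check_bam_headers", "path/to/bams"],
    capture_output=True, text=True, check=True,
)
# stdout looks like: "IS_UNALIGNED=0;MIXED_HEADERS=1"
flags = dict(pair.split("=") for pair in proc.stdout.strip().split(";"))
if flags["MIXED_HEADERS"] == "1":
    raise ValueError("(u)BAM files in the directory have inconsistent headers.")
is_unaligned = flags["IS_UNALIGNED"] == "1"
```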
108 changes: 44 additions & 64 deletions bin/workflow_glue/report.py
@@ -3,13 +3,13 @@

import json

from aplanat import annot, bars, hist, lines, report
from aplanat import bars, lines, report
from aplanat.components import bcfstats, nextclade
from aplanat.components import simple as scomponents
from aplanat.components.fastcat import read_length_plot, read_quality_plot
from aplanat.util import Colors
from bokeh.layouts import gridplot, layout
from bokeh.models import Panel, Range1d, Tabs
import numpy as np
import pandas as pd
from pandas.api import types as pd_types
import pysam
@@ -28,7 +28,7 @@ def read_files(summaries, **kwargs):
return pd.concat(dfs)


def output_json(df, consensus_fasta, fastcat_stats):
def output_json(df, consensus_fasta, readcounts):
"""Read depth stats df and create JSON output."""
grouped_by_sample = df.groupby('sample_name')
all_json = {}
@@ -51,12 +51,6 @@ def output_json(df, consensus_fasta, fastcat_stats):
newdf[x].values.tolist() for x in newdf.columns)))
all_json[sample] = final
final_json = {'data': []}
seq_summary = pd.read_csv(
fastcat_stats,
delimiter="\t",
dtype={"sample_name": CATEGORICAL}
)
readcounts = seq_summary['sample_name'].value_counts().to_dict()
# parse the consensus fasta to get extra info required
with pysam.FastxFile(consensus_fasta) as fh:
for entry in fh:
@@ -101,72 +95,58 @@ def main(args):
This section displays basic QC metrics indicating read data quality.
''')
# read length summary
seq_summary = pd.read_csv(
args.fastcat_stats,
delimiter="\t",
dtype={"sample_name": CATEGORICAL}
tabs = {}
sample_readcounts = {}
sample_goodreadcounts = {}
for summary_fn in args.fastcat_stats:
df_sample = pd.read_csv(summary_fn, sep="\t")
sample_id = df_sample['sample_name'].iloc[0]
rlp = read_length_plot(
df_sample,
min_len=args.min_len,
max_len=args.max_len,
xlim=(0, 2000),
)
total_bases = seq_summary['read_length'].sum()
mean_length = total_bases / len(seq_summary)
median_length = np.median(seq_summary['read_length'])
datas = [seq_summary['read_length']]
length_hist = hist.histogram(
datas, colors=[Colors.cerulean], binwidth=50,
title="Read length distribution.",
x_axis_label='Read Length / bases',
y_axis_label='Number of reads',
xlim=(0, 2000))
length_hist = annot.marker_vline(
length_hist, args.min_len,
label="Min: {}".format(args.min_len), text_baseline='bottom',
color='grey')
length_hist = annot.marker_vline(
length_hist, args.max_len,
label="Max: {}".format(args.max_len), text_baseline='top')
length_hist = annot.subtitle(
length_hist,
"Mean: {:.0f}. Median: {:.0f}".format(
mean_length, median_length))

datas = [seq_summary['mean_quality']]
mean_q, median_q = np.mean(datas[0]), np.median(datas[0])
q_hist = hist.histogram(
datas, colors=[Colors.cerulean], bins=100,
title="Read quality score",
x_axis_label="Quality score",
y_axis_label="Number of reads",
xlim=(4, 25))
q_hist = annot.subtitle(
q_hist,
"Mean: {:.0f}. Median: {:.0f}".format(
mean_q, median_q))
rqp = read_quality_plot(df_sample)
grid = gridplot(
[rlp, rqp], ncols=2, sizing_mode="stretch_width")
tabs[sample_id] = Panel(child=grid, title=sample_id)
# count total reads for output_json
sample_readcounts[sample_id] = df_sample.shape[0]
# count "good reads" for additional bar plot
good_reads = df_sample.loc[
(df_sample['read_length'] > args.min_len)
& (df_sample['read_length'] < args.max_len)
]
sample_goodreadcounts[sample_id] = good_reads.shape[0]

barcode_keys = sorted(sample_goodreadcounts)

# barcode count plot
good_reads = seq_summary.loc[
(seq_summary['read_length'] > args.min_len)
& (seq_summary['read_length'] < args.max_len)]
barcode_counts = (
pd.DataFrame(good_reads['sample_name'].value_counts())
.sort_index()
.reset_index()
.rename(
columns={'index': 'sample', 'sample_name': 'count'})
)

bc_counts = bars.simple_bar(
barcode_counts['sample'].astype(str), barcode_counts['count'],
colors=[Colors.cerulean]*len(barcode_counts),
barcode_keys,
[sample_goodreadcounts.get(x) for x in barcode_keys],
colors=[Colors.cerulean]*len(sample_goodreadcounts),
title=(
'Number of reads per barcode '
'(filtered by {} < length < {})'.format(
args.min_len, args.max_len)),
plot_width=None
)
bc_counts.xaxis.major_label_orientation = 3.14/2

section = report_doc.add_section()
section.markdown("""
### Sequence summaries""")
section.plot(
layout(
[[length_hist, q_hist], [bc_counts]],
sizing_mode="stretch_width"))
[
Tabs(tabs=[tabs.get(x) for x in barcode_keys]),
[bc_counts],
],
sizing_mode="stretch_width"
)
)

section = report_doc.add_section()
section.markdown("""
@@ -215,7 +195,7 @@ def main(args):
# depth summary by amplicon pool
df = read_files(
args.depths, sep="\t", converters={'sample_name': str})
epi2me_json = output_json(df, args.consensus_fasta, args.fastcat_stats)
epi2me_json = output_json(df, args.consensus_fasta, sample_readcounts)
json_object = json.dumps(epi2me_json, indent=4, separators=(',', ':'))
json_file = open("artic.json", "a")
json_file.write(json_object)
@@ -378,7 +358,7 @@ def argparser():
"--depths", nargs='+', required=True,
help="Depth summary files")
parser.add_argument(
"--fastcat_stats", required=True,
"--fastcat_stats", nargs='+', required=True,
help="fastcat summary file")
parser.add_argument(
"--nextclade",
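The reworked `main()` replaces the pooled length/quality histograms with one `Panel` per sample, collected into bokeh `Tabs`. For reference, a minimal self-contained sketch of that pattern, using the bokeh 2.x API (where `Panel` and `Tabs` live in `bokeh.models`) and placeholder figures instead of the real fastcat plots:

```python
# Minimal sketch of the per-sample tab pattern used in main() above, with
# placeholder figures standing in for the fastcat read-length/quality plots.
from bokeh.layouts import gridplot
from bokeh.models import Panel, Tabs
from bokeh.plotting import figure, output_file, save

tabs = {}
for sample_id in ["barcode01", "barcode02"]:  # stand-in sample names
    rlp = figure(title=f"{sample_id}: read length", plot_height=250)
    rqp = figure(title=f"{sample_id}: read quality", plot_height=250)
    rlp.line([0, 1, 2], [1, 3, 2])
    rqp.line([0, 1, 2], [2, 1, 3])
    grid = gridplot([rlp, rqp], ncols=2, sizing_mode="stretch_width")
    tabs[sample_id] = Panel(child=grid, title=sample_id)

# render one tab per sample, sorted by sample name as in the report
output_file("tabs_demo.html")
save(Tabs(tabs=[tabs[k] for k in sorted(tabs)]))
```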