Merge branch 'vcfstats' into 'dev'
Vcfstats

See merge request epi2melabs/workflow-containers/wf-artic!4
cjw85 committed Mar 9, 2021
2 parents 1a3db0c + 4f4abcf commit 71aadb3
Showing 4 changed files with 70 additions and 51 deletions.
13 changes: 11 additions & 2 deletions README.md
@@ -104,7 +104,7 @@ argument to `nextflow run`:
```
# run the pipeline with the test data
OUTPUT=my_artic_output
nextflow run epi2me-labs/wf-artif \
nextflow run epi2me-labs/wf-artic \
-w ${OUTPUT}/workspace \
-profile standard \
--fastq test_data/sars-samples-demultiplexed/
@@ -130,7 +130,7 @@ in the command above.
### Configuration and tuning

> This section provides some minimal guidance for changing common options, see
> the [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html) for further details.***
> the [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html) for further details.
The default settings for the workflow are described in the configuration file `nextflow.config`
found within the git repository. The default configuration defines an *executor* that will
@@ -233,3 +233,12 @@ In order to run the workflow with this new image it is required to give
nextflow run epi2me-labs/wf-artic \
--wfversion latest
```


## Useful links

* [medaka](https://www.github.com/nanoporetech/medaka)
* [artic](https://github.com/artic-network/fieldbioinformatics)
* [nextflow](https://www.nextflow.io/)
* [docker](https://www.docker.com/products/docker-desktop)
* [conda](https://docs.conda.io/en/latest/miniconda.html)
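
Read together, the two README hunks above amount to an invocation along the following lines. This is a sketch assembled from the visible context only; any options beyond those shown (and any continuation of the test-data command cut off at the hunk boundary) are not part of this commit.

```
# sketch assembled from the README hunks in this commit; not the full option list
OUTPUT=my_artic_output
nextflow run epi2me-labs/wf-artic \
    -w ${OUTPUT}/workspace \
    -profile standard \
    --fastq test_data/sars-samples-demultiplexed/ \
    --wfversion latest
```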
68 changes: 37 additions & 31 deletions bin/report.py
@@ -2,14 +2,16 @@

import argparse
import glob
import numpy as np
import pandas as pd
import os

from bokeh.layouts import gridplot, layout
from bokeh.models import Panel, Tabs
import aplanat
from aplanat import annot, bars, gridplot, hist, lines, points, report

from aplanat.util import Colors
from aplanat.components import bcfstats
from bokeh.layouts import gridplot, layout
from bokeh.models import Panel, Tabs
import numpy as np
import pandas as pd

def read_files(summaries):
dfs = list()
@@ -24,31 +26,29 @@ def main():
parser.add_argument("output", help="Report output filename")
parser.add_argument("--depths", nargs='+', required=True, help="Depth summary files")
parser.add_argument("--summaries", nargs='+', required=True, help="Sequencing summary files")
parser.add_argument("--bcftools_stats", nargs='+', required=True, help="Outputs from bcftools stats")
parser.add_argument("--min_len", default=300, type=int, help="Minimum read length")
parser.add_argument("--max_len", default=700, type=int, help="Maximum read length")
args = parser.parse_args()

report_doc = report.HTMLReport(
"SARS-CoV-2 ARTIC Sequencing report",
"Results generated through the wf-artic nextflow workflow provided by Oxford Nanopore Technologies")
section = report_doc.add_section()

report_doc.markdown('''
section.markdown('''
### Read Quality control
This section displays basic QC metrics indicating read data quality.
''')

np_blue = '#0084A9'
np_dark_grey = '#455560'
np_light_blue = '#90C6E7'

# read length summary
seq_summary = read_files(args.summaries)
total_bases = seq_summary['sequence_length_template'].sum()
mean_length = total_bases / len(seq_summary)
median_length = np.median(seq_summary['sequence_length_template'])
datas = [seq_summary['sequence_length_template']]
length_hist = hist.histogram(
datas, colors=[np_blue], bins=100,
datas, colors=[Colors.cerulean], bins=100,
title="Read length distribution.",
x_axis_label='Read Length / bases',
y_axis_label='Number of reads',
@@ -67,7 +67,7 @@ def main():
datas = [seq_summary['mean_qscore_template']]
mean_q, median_q = np.mean(datas[0]), np.median(datas[0])
q_hist = hist.histogram(
datas, colors=[np_blue], bins=100,
datas, colors=[Colors.cerulean], bins=100,
title="Read quality score",
x_axis_label="Quality score",
y_axis_label="Number of reads",
@@ -90,16 +90,25 @@
)

bc_counts = bars.simple_bar(
barcode_counts['sample'].astype(str), barcode_counts['count'], colors=[np_blue]*len(barcode_counts),
barcode_counts['sample'].astype(str), barcode_counts['count'], colors=[Colors.cerulean]*len(barcode_counts),
title='Number of reads per barcode (filtered by {} < length < {})'.format(args.min_len, args.max_len),
plot_width=None
)
bc_counts.xaxis.major_label_orientation = 3.14/2
report_doc.plot(
section.plot(
layout(
[[length_hist, q_hist], [bc_counts]],
sizing_mode="stretch_width"))

section = report_doc.add_section()
section.markdown('''
### Genome coverage
Plots below indicate depth of coverage from data used within the Artic analysis,
coloured by amplicon pool. For adequate variant calling, depth should be at least
30X in any region. Pool-1 reads are shown in light blue and Pool-2 reads in dark grey
(and similarly for forward and reverse reads, respectively).
''')

# depth summary by amplicon pool
df = read_files(args.depths)
plots_pool = list()
@@ -116,7 +125,7 @@
ys = [df.loc[(pset == i) & bc]['depth'] for i in (1,2)]

plot = points.points(
xs, ys, colors=[np_light_blue, np_dark_grey],
xs, ys, colors=[Colors.light_cornflower_blue, Colors.feldgrau],
title="{}: {:.0f}X, {:.1f}% > {}X".format(
sample, depth.mean(), depth_thresh, depth_lim),
height=200, width=400,
@@ -129,7 +138,7 @@
xs = [data['pos'], data['pos']]
ys = [data['depth_fwd'], data['depth_rev']]
plot = points.points(
xs, ys, colors=[np_light_blue, np_dark_grey],
xs, ys, colors=[Colors.light_cornflower_blue, Colors.feldgrau],
title="{}: {:.0f}X, {:.1f}% > {}X".format(
sample, depth.mean(), depth_thresh, depth_lim),
height=200, width=400,
@@ -142,28 +151,25 @@
tab2 = Panel(
child=gridplot(plots_orient, ncols=3), title="By read orientation")
cover_panel = Tabs(tabs=[tab1, tab2])
section.plot(cover_panel)

report_doc.markdown('''
### Genome coverage
Plots below indicate depth of coverage from data used within the Artic analysis
coloured by amplicon pool. For adequate variant calling depth should be at least
30X in any region. Pool-1 reads are shown in light-blue, Pool-2 reads are dark grey
(a similarly for forward and reverse reads respectively).
''')
report_doc.plot(cover_panel)
# canned VCF stats report component
section = report_doc.add_section()
bcfstats.full_report(args.bcftools_stats, report=section)

# nextclade data
with open(args.nextclade, encoding='utf8') as fh:
nc = fh.read()
nextclade = report.NextClade(nc)
report_doc.markdown('''
section = report_doc.add_section()
section.markdown('''
### NextClade analysis
The following view is produced by the [nextclade](https://clades.nextstrain.org/) software.
''')
report_doc.plot(nextclade)
with open(args.nextclade, encoding='utf8') as fh:
nc = fh.read()
nextclade = report.NextClade(nc)
section.plot(nextclade)

# Footer section
report_doc.markdown('''
section = report_doc.add_section()
section.markdown('''
### About
**Oxford Nanopore Technologies products are not intended for use for health assessment
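
For orientation, the pattern `report.py` now follows is: build an `HTMLReport`, add one section per logical block, and let the canned `bcfstats` component render the `bcftools stats` output. Below is a minimal sketch using only the calls visible in this diff; the file names and the final `write` call are assumptions for illustration.

```
# Minimal sketch of the section-based aplanat pattern adopted above
# (requires aplanat >= 0.3.1, as pinned in environment.yaml).
from aplanat import report
from aplanat.components import bcfstats

report_doc = report.HTMLReport(
    "Example report",
    "Demonstrates per-section composition with a canned bcftools stats panel.")

# each logical block of the report lives in its own section
section = report_doc.add_section()
section.markdown("### Variants\nSummary of variants called per sample.")

# render `bcftools stats` output into this section; the list mirrors the
# files passed to report.py via --bcftools_stats (name here is illustrative)
bcfstats.full_report(["barcode01.pass.named.stats"], report=section)

# writing out the document is assumed to follow the usual aplanat pattern
report_doc.write("example_report.html")
```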
2 changes: 1 addition & 1 deletion environment.yaml
@@ -5,7 +5,7 @@ channels:
- conda-forge
- defaults
dependencies:
- aplanat >=0.2.9
- aplanat >=0.3.1
- nextclade-cli >=0.13.0
- np-artic >=1.3.0=py_2
- pomoxis >=0.3.6
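
The pin moves from `aplanat >=0.2.9` to `>=0.3.1`, presumably to pick up the `bcfstats` component imported in `report.py` above. If you maintain the conda environment directly rather than through the container, something like the following refreshes it (the environment name is an assumption):

```
conda env update -n wf-artic -f environment.yaml
```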
38 changes: 21 additions & 17 deletions main.nf
@@ -41,9 +41,9 @@ process preArticQC {
input:
tuple file(directory), val(sample_name)
output:
file "seq_summary.txt"
file "${sample_name}.stats"
"""
seq_summary.py $directory seq_summary.txt --sample_name $sample_name
seq_summary.py $directory ${sample_name}.stats --sample_name $sample_name
"""
}

@@ -55,15 +55,16 @@ process runArtic {
tuple file(directory), val(sample_name)
file "primer_schemes"
output:
file("${sample_name}.consensus.fasta")
file "${sample_name}.consensus.fasta"
tuple file("${sample_name}.pass.named.vcf.gz"), file("${sample_name}.pass.named.vcf.gz.tbi")
file("${sample_name}.depth.txt")

file "${sample_name}.depth.txt"
file "${sample_name}.pass.named.stats"
"""
run_artic.sh \
$sample_name $directory $params._min_len $params._max_len \
$params.medaka_model $params.full_scheme_name \
$task.cpus
${sample_name} ${directory} ${params._min_len} ${params._max_len} \
${params.medaka_model} ${params.full_scheme_name} \
${task.cpus}
bcftools stats ${sample_name}.pass.named.vcf.gz > ${sample_name}.pass.named.stats
"""
}

@@ -72,15 +73,17 @@ process report {
label "artic"
cpus 1
input:
file "depths_*.txt"
file "read_summary_*.txt"
file "depth_stats/*"
file "read_stats/*"
file "nextclade.json"
file "vcf_stats/*"
output:
file "summary_report.html"
"""
report.py nextclade.json summary_report.html \
--min_len $params._min_len --max_len $params._max_len \
--depths depths_*.txt --summaries read_summary_*.txt
--depths depth_stats/* --summaries read_stats/* \
--bcftools_stats vcf_stats/*
"""
}

@@ -104,11 +107,10 @@
input:
tuple file(vcfs), file(tbis)
output:
file "all_variants.vcf.gz"
file "all_variants.vcf.gz.tbi"
tuple file("all_variants.vcf.gz"), file("all_variants.vcf.gz.tbi")
"""
bcftools merge -o all_variants.vcf.gz -O z *.vcf.gz
bcftools index -t all_variants.vcf.gz
bcftools index -t all_variants.vcf.gz
"""
}

@@ -166,9 +168,11 @@ workflow pipeline {
clades = nextclade(all_consensus, reference, primers)
// report
html_doc = report(
runArtic.out[2].collect(), read_summaries.collect(), clades.collect())

results = all_consensus.concat(all_variants, html_doc)
runArtic.out[2].collect(),
read_summaries.collect(),
clades.collect(),
runArtic.out[3].collect())
results = all_consensus.concat(all_variants[0].flatten(), html_doc)
emit:
results
}
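
In shell terms, the new statistics plumbing reduces to the commands below, paraphrased from the `runArtic` and `allVariants` scripts in this diff; sample file names are illustrative.

```
# per sample (runArtic): summarise the pass VCF for the report
bcftools stats barcode01.pass.named.vcf.gz > barcode01.pass.named.stats

# across samples (allVariants): merge and index the per-sample VCFs
bcftools merge -o all_variants.vcf.gz -O z *.vcf.gz
bcftools index -t all_variants.vcf.gz
```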
