From 4f4abcf6fe0c9f2f3b30aef325faebc7bdbfa11b Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Tue, 9 Mar 2021 20:46:58 +0000 Subject: [PATCH] Vcfstats --- README.md | 13 +++++++-- bin/report.py | 68 ++++++++++++++++++++++++++---------------------- environment.yaml | 2 +- main.nf | 38 +++++++++++++++------------ 4 files changed, 70 insertions(+), 51 deletions(-) diff --git a/README.md b/README.md index f9def33..6e65034 100644 --- a/README.md +++ b/README.md @@ -104,7 +104,7 @@ argument to `nextflow run`: ``` # run the pipeline with the test data OUTPUT=my_artic_output -nextflow run epi2me-labs/wf-artif \ +nextflow run epi2me-labs/wf-artic \ -w ${OUTPUT}/workspace -profile standard --fastq test_data/sars-samples-demultiplexed/ @@ -130,7 +130,7 @@ in the command above. ### Configuration and tuning > This section provides some minimal guidance for changing common options, see -> the [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html) for further details.*** +> the [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html) for further details. The default settings for the workflow are described in the configuration file `nextflow.config` found within the git repository. The default configuration defines an *executor* that will @@ -233,3 +233,12 @@ In order to run the workflow with this new image it is required to give nextflow run epi2me-labs/wf-artic \ --wfversion latest ``` + + +## Useful links + +* [medaka](https://www.github.com/nanoporetech/medaka) +* [artic](https://github.com/artic-network/fieldbioinformatics) +* [nextflow](https://www.nextflow.io/) +* [docker](https://www.docker.com/products/docker-desktop) +* [conda](https://docs.conda.io/en/latest/miniconda.html) diff --git a/bin/report.py b/bin/report.py index fe6bead..0f28775 100755 --- a/bin/report.py +++ b/bin/report.py @@ -2,14 +2,16 @@ import argparse import glob -import numpy as np -import pandas as pd +import os -from bokeh.layouts import gridplot, layout -from bokeh.models import Panel, Tabs import aplanat from aplanat import annot, bars, gridplot, hist, lines, points, report - +from aplanat.util import Colors +from aplanat.components import bcfstats +from bokeh.layouts import gridplot, layout +from bokeh.models import Panel, Tabs +import numpy as np +import pandas as pd def read_files(summaries): dfs = list() @@ -24,6 +26,7 @@ def main(): parser.add_argument("output", help="Report output filename") parser.add_argument("--depths", nargs='+', required=True, help="Depth summary files") parser.add_argument("--summaries", nargs='+', required=True, help="Sequencing summary files") + parser.add_argument("--bcftools_stats", nargs='+', required=True, help="Outputs from bcftools stats") parser.add_argument("--min_len", default=300, type=int, help="Minimum read length") parser.add_argument("--max_len", default=700, type=int, help="Maximum read length") args = parser.parse_args() @@ -31,16 +34,13 @@ def main(): report_doc = report.HTMLReport( "SARS-CoV-2 ARTIC Sequencing report", "Results generated through the wf-artic nextflow workflow provided by Oxford Nanopore Technologies") + section = report_doc.add_section() - report_doc.markdown(''' + section.markdown(''' ### Read Quality control This section displays basic QC metrics indicating read data quality. ''') - np_blue = '#0084A9' - np_dark_grey = '#455560' - np_light_blue = '#90C6E7' - # read length summary seq_summary = read_files(args.summaries) total_bases = seq_summary['sequence_length_template'].sum() @@ -48,7 +48,7 @@ def main(): median_length = np.median(seq_summary['sequence_length_template']) datas = [seq_summary['sequence_length_template']] length_hist = hist.histogram( - datas, colors=[np_blue], bins=100, + datas, colors=[Colors.cerulean], bins=100, title="Read length distribution.", x_axis_label='Read Length / bases', y_axis_label='Number of reads', @@ -67,7 +67,7 @@ def main(): datas = [seq_summary['mean_qscore_template']] mean_q, median_q = np.mean(datas[0]), np.median(datas[0]) q_hist = hist.histogram( - datas, colors=[np_blue], bins=100, + datas, colors=[Colors.cerulean], bins=100, title="Read quality score", x_axis_label="Quality score", y_axis_label="Number of reads", @@ -90,16 +90,25 @@ def main(): ) bc_counts = bars.simple_bar( - barcode_counts['sample'].astype(str), barcode_counts['count'], colors=[np_blue]*len(barcode_counts), + barcode_counts['sample'].astype(str), barcode_counts['count'], colors=[Colors.cerulean]*len(barcode_counts), title='Number of reads per barcode (filtered by {} < length < {})'.format(args.min_len, args.max_len), plot_width=None ) bc_counts.xaxis.major_label_orientation = 3.14/2 - report_doc.plot( + section.plot( layout( [[length_hist, q_hist], [bc_counts]], sizing_mode="stretch_width")) + section = report_doc.add_section() + section.markdown(''' +### Genome coverage +Plots below indicate depth of coverage from data used within the Artic analysis +coloured by amplicon pool. For adequate variant calling depth should be at least +30X in any region. Pool-1 reads are shown in light-blue, Pool-2 reads are dark grey +(a similarly for forward and reverse reads respectively). +''') + # depth summary by amplicon pool df = read_files(args.depths) plots_pool = list() @@ -116,7 +125,7 @@ def main(): ys = [df.loc[(pset == i) & bc]['depth'] for i in (1,2)] plot = points.points( - xs, ys, colors=[np_light_blue, np_dark_grey], + xs, ys, colors=[Colors.light_cornflower_blue, Colors.feldgrau], title="{}: {:.0f}X, {:.1f}% > {}X".format( sample, depth.mean(), depth_thresh, depth_lim), height=200, width=400, @@ -129,7 +138,7 @@ def main(): xs = [data['pos'], data['pos']] ys = [data['depth_fwd'], data['depth_rev']] plot = points.points( - xs, ys, colors=[np_light_blue, np_dark_grey], + xs, ys, colors=[Colors.light_cornflower_blue, Colors.feldgrau], title="{}: {:.0f}X, {:.1f}% > {}X".format( sample, depth.mean(), depth_thresh, depth_lim), height=200, width=400, @@ -142,28 +151,25 @@ def main(): tab2 = Panel( child=gridplot(plots_orient, ncols=3), title="By read orientation") cover_panel = Tabs(tabs=[tab1, tab2]) + section.plot(cover_panel) - report_doc.markdown(''' -### Genome coverage -Plots below indicate depth of coverage from data used within the Artic analysis -coloured by amplicon pool. For adequate variant calling depth should be at least -30X in any region. Pool-1 reads are shown in light-blue, Pool-2 reads are dark grey -(a similarly for forward and reverse reads respectively). -''') - report_doc.plot(cover_panel) + # canned VCF stats report component + section = report_doc.add_section() + bcfstats.full_report(args.bcftools_stats, report=section) - # nextclade data - with open(args.nextclade, encoding='utf8') as fh: - nc = fh.read() - nextclade = report.NextClade(nc) - report_doc.markdown(''' + section = report_doc.add_section() + section.markdown(''' ### NextClade analysis The following view is produced by the [nextclade](https://clades.nextstrain.org/) software. ''') - report_doc.plot(nextclade) + with open(args.nextclade, encoding='utf8') as fh: + nc = fh.read() + nextclade = report.NextClade(nc) + section.plot(nextclade) # Footer section - report_doc.markdown(''' + section = report_doc.add_section() + section.markdown(''' ### About **Oxford Nanopore Technologies products are not intended for use for health assessment diff --git a/environment.yaml b/environment.yaml index ef31a5d..bd71141 100644 --- a/environment.yaml +++ b/environment.yaml @@ -5,7 +5,7 @@ channels: - conda-forge - defaults dependencies: - - aplanat >=0.2.9 + - aplanat >=0.3.1 - nextclade-cli >=0.13.0 - np-artic >=1.3.0=py_2 - pomoxis >=0.3.6 diff --git a/main.nf b/main.nf index dceec5a..3491d68 100644 --- a/main.nf +++ b/main.nf @@ -41,9 +41,9 @@ process preArticQC { input: tuple file(directory), val(sample_name) output: - file "seq_summary.txt" + file "${sample_name}.stats" """ - seq_summary.py $directory seq_summary.txt --sample_name $sample_name + seq_summary.py $directory ${sample_name}.stats --sample_name $sample_name """ } @@ -55,15 +55,16 @@ process runArtic { tuple file(directory), val(sample_name) file "primer_schemes" output: - file("${sample_name}.consensus.fasta") + file "${sample_name}.consensus.fasta" tuple file("${sample_name}.pass.named.vcf.gz"), file("${sample_name}.pass.named.vcf.gz.tbi") - file("${sample_name}.depth.txt") - + file "${sample_name}.depth.txt" + file "${sample_name}.pass.named.stats" """ run_artic.sh \ - $sample_name $directory $params._min_len $params._max_len \ - $params.medaka_model $params.full_scheme_name \ - $task.cpus + ${sample_name} ${directory} ${params._min_len} ${params._max_len} \ + ${params.medaka_model} ${params.full_scheme_name} \ + ${task.cpus} + bcftools stats ${sample_name}.pass.named.vcf.gz > ${sample_name}.pass.named.stats """ } @@ -72,15 +73,17 @@ process report { label "artic" cpus 1 input: - file "depths_*.txt" - file "read_summary_*.txt" + file "depth_stats/*" + file "read_stats/*" file "nextclade.json" + file "vcf_stats/*" output: file "summary_report.html" """ report.py nextclade.json summary_report.html \ --min_len $params._min_len --max_len $params._max_len \ - --depths depths_*.txt --summaries read_summary_*.txt + --depths depth_stats/* --summaries read_stats/* \ + --bcftools_stats vcf_stats/* """ } @@ -104,11 +107,10 @@ process allVariants { input: tuple file(vcfs), file(tbis) output: - file "all_variants.vcf.gz" - file "all_variants.vcf.gz.tbi" + tuple file("all_variants.vcf.gz"), file("all_variants.vcf.gz.tbi") """ bcftools merge -o all_variants.vcf.gz -O z *.vcf.gz - bcftools index -t all_variants.vcf.gz + bcftools index -t all_variants.vcf.gz """ } @@ -166,9 +168,11 @@ workflow pipeline { clades = nextclade(all_consensus, reference, primers) // report html_doc = report( - runArtic.out[2].collect(), read_summaries.collect(), clades.collect()) - - results = all_consensus.concat(all_variants, html_doc) + runArtic.out[2].collect(), + read_summaries.collect(), + clades.collect(), + runArtic.out[3].collect()) + results = all_consensus.concat(all_variants[0].flatten(), html_doc) emit: results }