MinIONQC.R

#!/usr/bin/env Rscript

# MinIONQC version 1.4.1
# Copyright (C) 2017 onwards Robert Lanfear
#
# For license see https://github.com/roblanf/minion_qc/blob/master/LICENSE


# supress warnings
options(warn=-1)

suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(viridis))
suppressPackageStartupMessages(library(plyr))
suppressPackageStartupMessages(library(reshape2))
suppressPackageStartupMessages(library(readr))
suppressPackageStartupMessages(library(yaml))
suppressPackageStartupMessages(library(scales))
suppressPackageStartupMessages(library(parallel))
suppressPackageStartupMessages(library(futile.logger))
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(optparse))


# option parsing #

parser <- OptionParser()

parser <- add_option(parser, 
                     opt_str = c("-i", "--input"), 
                     type = "character",
                     dest = 'input.file',
                     help="Input file or directory (required). Either a full path to a sequence_summary.txt file, or a full path to a directory containing one or more such files. In the latter case the directory is searched recursively."
                     )

parser <- add_option(parser, 
                     opt_str = c("-o", "--outputdirectory"), 
                     type = "character",
                     dest = 'output.dir',
                     default=NA,
                     help="Output directory (optional, default is the same as the input directory). If a single sequencing_summary.txt file is passed as input, then the output directory will contain just the plots associated with that file. If a directory containing more than one sequencing_summary.txt files is passed as input, then the plots will be put into sub-directories that have the same names as the parent directories of each sequencing_summary.txt file"
                     )

parser <- add_option(parser, 
                     opt_str = c("-q", "--qscore_cutoff"), 
                     type="double", 
                     default=7.0,
                     dest = 'q',
                     help="The cutoff value for the mean Q score of a read (default 7). Used to create separate plots for reads above and below this threshold"
                     )

parser <- add_option(parser, 
                     opt_str = c("-p", "--processors"), 
                     type="integer", 
                     default=1,
                     dest = 'cores',
                     help="Number of processors to use for the anlaysis (default 1). Only helps when you are analysing more than one sequencing_summary.txt file at a time"
                     )

parser <- add_option(parser, 
                     opt_str = c("-s", "--smallfigures"), 
                     type = "logical",
                     default = FALSE,
                     dest = 'smallfig',
                     help="TRUE or FALSE (the default). When true, MinIONQC will output smaller figures, e.g. suitable for publications or presentations. The default is to produce larger figures optimised for display on screen. Some figures just require small text, and cannot be effectively resized."
                     )

parser <- add_option(parser, 
                     opt_str = c("-c", "--combined-only"), 
                     type = "logical",
                     default = FALSE,
                     dest = 'combined_only',
                     help="TRUE or FALSE (the default). When true, MinIONQC will only produce the combined report, it will not produce individual reports for each flowcell."
                     )

parser <- add_option(parser, 
                     opt_str = c("-f", "--format"), 
                     type = "character",
                     default = 'png',
                     dest = 'plot_format',
                     help="A string in quotes, to set the output format of the plots. 'png' (the default) or any of 'pdf', 'ps', 'jpeg', 'tiff', 'bmp' are supported. 'png' is recommended and is thoroughly tested. The 'pdf' option may be useful if you have a system without X11 installed, because PNG files require X11 but PDF files do not. Other options are there for convenience."
					 )

parser <- add_option(parser, 
					opt_str = c("-m", "--muxes"), 
					type = "numeric",
					default = '0',
					dest = 'muxscan',
					help="The value for mux scan used in MinKnow."
)

parser <- add_option(parser, 
					opt_str = c("-a", "--add-stat"), 
					type = "logical",
					default = 'FALSE',
					dest = 'plot_stat',
					help="TRUE or FALSE (the default). When true, MinIONQC will add some basic statistical values on plots."
)

opt = parse_args(parser)

if(exists("test.file") == FALSE){
    test.file = c(1, 2, 3) # dummy variable
}

if (length(opt$input.file)==1) {
    input.file = opt$input.file
} else if (length(test.file)==1) {
    input.file = test.file # specifically for testing the script
    flog.info(paste("Using test file", test.file))
} else {
    stop("Input file parameter must be supplied via -i or --input. See script usage (--help) or readme for help: https://github.com/roblanf/minion_qc")
}

q = opt$q
cores = opt$cores
smallfig = opt$smallfig
combined_only = opt$combined_only
plot_stat = opt$plot_stat
mux_int = opt$muxscan

if (opt$plot_format %in% c('png', 'pdf', 'ps', 'jpeg', 'tiff', 'bmp')){
    plot_format = opt$plot_format
}else {
    flog.error("The plot format passed via the -f option must be one of 'png', 'pdf', 'ps', 'jpeg', 'tiff', or 'bmp'. Please check and try again")
    stop()
}

p1m = 1.0 


# this is how we label the reads at least as good as q
q_title = paste("Q>=", q, sep="")


# build the map for R9.5
p1 = data.frame(channel=33:64, row=rep(1:4, each=8), col=rep(1:8, 4))
p2 = data.frame(channel=481:512, row=rep(5:8, each=8), col=rep(1:8, 4))
p3 = data.frame(channel=417:448, row=rep(9:12, each=8), col=rep(1:8, 4))
p4 = data.frame(channel=353:384, row=rep(13:16, each=8), col=rep(1:8, 4))
p5 = data.frame(channel=289:320, row=rep(17:20, each=8), col=rep(1:8, 4))
p6 = data.frame(channel=225:256, row=rep(21:24, each=8), col=rep(1:8, 4))
p7 = data.frame(channel=161:192, row=rep(25:28, each=8), col=rep(1:8, 4))
p8 = data.frame(channel=97:128, row=rep(29:32, each=8), col=rep(1:8, 4))

q1 = data.frame(channel=1:32, row=rep(1:4, each=8), col=rep(16:9, 4))
q2 = data.frame(channel=449:480, row=rep(5:8, each=8), col=rep(16:9, 4))
q3 = data.frame(channel=385:416, row=rep(9:12, each=8), col=rep(16:9, 4))
q4 = data.frame(channel=321:352, row=rep(13:16, each=8), col=rep(16:9, 4))
q5 = data.frame(channel=257:288, row=rep(17:20, each=8), col=rep(16:9, 4))
q6 = data.frame(channel=193:224, row=rep(21:24, each=8), col=rep(16:9, 4))
q7 = data.frame(channel=129:160, row=rep(25:28, each=8), col=rep(16:9, 4))
q8 = data.frame(channel=65:96, row=rep(29:32, each=8), col=rep(16:9, 4))

map = rbind(p1, p2, p3, p4, p5, p6, p7, p8, q1, q2, q3, q4, q5, q6, q7, q8)

add_cols <- function(d, min.q){
    # take a sequencing sumamry file (d), and a minimum Q value you are interested in (min.q)
    # return the same data frame with the following columns added 
        # cumulative.bases and cumulative.bases.time
        # hour of run
        # reads.per.hour
    
    d = subset(d, mean_qscore_template >= min.q)
    
    if(nrow(d)==0){ 
        flog.error(paste("There are no reads with a mean Q score higher than your cutoff of ", min.q, ". Please choose a lower cutoff and try again.", sep = ""))
        quit()
    }
    
    if(max(d$channel)<=512){
        d = merge(d, map, by="channel")
    }else{
        # thanks to Matt Loose. Code adapted from: https://github.com/mattloose/flowcellvis/blob/master/flowcellgif.py
        block = floor((d$channel-1)/250)
        remainder = (d$channel-1)%%250
        d$row = floor(remainder/10) + 1 # +1 because R is not zero indexed
        d$col = remainder%%10 + block*10 + 1 # +1 because R is not zero indexed 
    }
    
    d = d[with(d, order(as.numeric(start_time))), ] # sort by start time
    d$cumulative.bases.time = cumsum(as.numeric(d$sequence_length_template))

    d = d[with(d, order(-sequence_length_template)), ] # sort by read length
    d$cumulative.bases = cumsum(as.numeric(d$sequence_length_template))
    d$hour = d$start_time %/% 3600
    
    # add the reads generated for each hour
    reads.per.hour = as.data.frame(table(d$hour))
    names(reads.per.hour) = c("hour", "reads_per_hour")
    reads.per.hour$hour = as.numeric(as.character(reads.per.hour$hour))
    d = merge(d, reads.per.hour, by = c("hour"))    
    return(d)
}


load_summary <- function(filepath, min.q){
    # load a sequencing summary and add some info
    # min.q is a vector of length 2 defining 2 levels of min.q to have
    # by default the lowest value is -Inf, i.e. includes all reads. The 
    # other value in min.q is set by the user at the command line
    
    suppressWarnings({
        d = read_tsv(filepath, col_types = cols_only(channel = 'i', 
                                                num_events_template = 'i', 
                                                sequence_length_template = 'i', 
                                                mean_qscore_template = 'n',
                                                sequence_length_2d = 'i',
                                                mean_qscore_2d = 'n',
                                                start_time = 'n',
                                                calibration_strand_genome_template = 'c'))
    })
    
    problem_rows = problems(d)$row
    
    if(length(problem_rows>0)){

        flog.warn("There were problems in parsing your input file with the following rows: ")
        flog.warn(problem_rows)
        flog.warn("These rows will be ignored for further analysis, please check your input file")
        d = d[-c(problem_rows),]
        
    }
    
    if(max(d$channel)<=512){
        flog.info("MinION flowcell detected")
    }else{
        flog.info("PromethION flowcell detected")
    }
    
    # remove the control sequence from directRNA runs
    if("calibration_strand_genome_template" %in% names(d)){
        d = subset(d, calibration_strand_genome_template != "YHR174W")
    }

    if("sequence_length_2d" %in% names(d)){
        # it's a 1D2 or 2D run
        d$sequence_length_template = as.numeric(as.character(d$sequence_length_2d))
        d$mean_qscore_template = as.numeric(as.character(d$mean_qscore_2d))
        d$num_events_template = NA
        d$start_time = as.numeric(as.character(d$start_time))
        
    }else{
        d$sequence_length_template = as.numeric(as.character(d$sequence_length_template))
        d$mean_qscore_template = as.numeric(as.character(d$mean_qscore_template))
        d$num_events_template = as.numeric(as.character(d$num_events_template))
        d$start_time = as.numeric(as.character(d$start_time))
    }
        
    d$events_per_base = d$num_events_template/d$sequence_length_template

    flowcell = basename(dirname(filepath))
    
    # add columns for all the reads
    d1 = add_cols(d, min.q[1])
    d1$Q_cutoff = "All reads"
    
    # add columns for just the reads that pass the user Q threshold
    d2 = add_cols(d, min.q[2])
    d2$Q_cutoff = q_title
    
    # bind those two together into one data frame
    d = as.data.frame(rbindlist(list(d1, d2)))

    # name the flowcell (useful for analyses with >1 flowcell)
    d$flowcell = flowcell
    
    # make sure this is a factor
    d$Q_cutoff = as.factor(d$Q_cutoff)
    
    keep = c("hour", "start_time", "channel", "sequence_length_template", "mean_qscore_template", "row", "col", "cumulative.bases", "cumulative.bases.time", "reads_per_hour", "Q_cutoff", "flowcell", "events_per_base")
    dk = d[, which(names(d) %in% keep)]
        
    return(dk)
}

reads.gt <- function(d, len){
    # return the number of reads in data frame d
    # that are at least as long as length len
    return(length(which(d$sequence_length_template>=len)))
}

bases.gt <- function(d, len){
    # return the number of bases contained in reads from 
    # data frame d
    # that are at least as long as length len
    reads = subset(d, sequence_length_template >= len)
    return(sum(as.numeric(reads$sequence_length_template)))
}

log10_minor_break = function (...){
    # function to add minor breaks to a log10 graph
    # hat-tip: https://stackoverflow.com/questions/30179442/plotting-minor-breaks-on-a-log-scale-with-ggplot
    function(x) {
        minx         = floor(min(log10(x), na.rm=T))-1;
        maxx         = ceiling(max(log10(x), na.rm=T))+1;
        n_major      = maxx-minx+1;
        major_breaks = seq(minx, maxx, by=1)
        minor_breaks = 
            rep(log10(seq(1, 9, by=1)), times = n_major)+
            rep(major_breaks, each = 9)
        return(10^(minor_breaks))
    }
}


log10_major_break = function (...){
    # function to add major breaks to a log10 graph
    # hat-tip: https://stackoverflow.com/questions/30179442/plotting-minor-breaks-on-a-log-scale-with-ggplot
    function(x) {
        minx         = floor(min(log10(x), na.rm=T))-1;
        maxx         = ceiling(max(log10(x), na.rm=T))+1;
        n_major      = maxx-minx+1;
        major_breaks = seq(minx, maxx, by=1)
        return(10^(major_breaks))
    }
}


binSearch <- function(min, max, df, t = 100000) {
    # binary search algorithm, thanks to https://stackoverflow.com/questions/46292438/optimising-a-calculation-on-every-cumulative-subset-of-a-vector-in-r/46303384#46303384
    # the aim is to return the number of reads in a dataset (df)
    # that comprise the largest subset of reads with an N50 of t
    # we use this to calculte the number of 'ultra long' reads
    # which are defined as those with N50 > 100KB
    mid = floor(mean(c(min, max)))
    if (mid == min) {
        if (df$sequence_length_template[min(which(df$cumulative.bases>df$cumulative.bases[min]/2))] < t) {
            return(min - 1)
        } else {
            return(max - 1)
        }
    }
    
    n = df$sequence_length_template[min(which(df$cumulative.bases>df$cumulative.bases[mid]/2))]
    if (n >= t) {
        return(binSearch(mid, max, df))
    } else {
        return(binSearch(min, mid, df))
    }
}


summary.stats <- function(d, Q_cutoff="All reads"){
    # Calculate summary stats for a single value of min.q
    
    rows = which(as.character(d$Q_cutoff)==Q_cutoff)
    d = d[rows,]
    d = d[with(d, order(-sequence_length_template)), ] # sort by read length, just in case
    
    total.bases = sum(as.numeric(d$sequence_length_template))
    total.reads = nrow(d)
    N50.length = d$sequence_length_template[min(which(d$cumulative.bases > (total.bases/2)))]
    mean.length = round(mean(as.numeric(d$sequence_length_template)), digits = 1)
    median.length = round(median(as.numeric(d$sequence_length_template)), digits = 1)
    max.length = max(as.numeric(d$sequence_length_template))
    mean.q = round(mean(d$mean_qscore_template), digits = 1)
    median.q = round(median(d$mean_qscore_template), digits = 1)
    
    #calculate ultra-long reads and bases (max amount of data with N50>100KB)
    ultra.reads = binSearch(1, nrow(d), d, t = 100000)    
    if(ultra.reads>=1){
        ultra.gigabases = sum(as.numeric(d$sequence_length_template[1:ultra.reads]))/1000000000
    }else{
        ultra.gigabases = 0
    }
        
    reads = list(
                reads.gt(d, 10000), 
                reads.gt(d, 20000), 
                reads.gt(d, 50000),
                reads.gt(d, 100000),
                reads.gt(d, 200000),
                reads.gt(d, 500000),
                reads.gt(d, 1000000),
                ultra.reads)
    names(reads) = c(">10kb", ">20kb", ">50kb", ">100kb", ">200kb", ">500kb", ">1m", "ultralong")

    bases = list(
                bases.gt(d, 10000)/1000000000, 
                bases.gt(d, 20000)/1000000000, 
                bases.gt(d, 50000)/1000000000,
                bases.gt(d, 100000)/1000000000,
                bases.gt(d, 200000)/1000000000,
                bases.gt(d, 500000)/1000000000,
                bases.gt(d, 1000000)/1000000000,
                ultra.gigabases)
    names(bases) = c(">10kb", ">20kb", ">50kb", ">100kb", ">200kb", ">500kb", ">1m", "ultralong")
    
    return(list('total.gigabases' = total.bases/1000000000,
                'total.reads' = total.reads,
                'N50.length' = N50.length, 
                'mean.length' = mean.length, 
                'median.length' = median.length,
                'max.length' = max.length,
                'mean.q' = mean.q,
                'median.q' = median.q,
                'reads' = reads,
                'gigabases' = bases
                ))
}

channel.summary <- function(d){
    # calculate summaries of what happened in each of the channels 
    # of a flowcell
    
    a = ddply(d, .(channel), 
              summarize, 
              total.bases = sum(sequence_length_template), 
              total.reads = sum(which(sequence_length_template>=0)), 
              mean.read.length = mean(sequence_length_template), 
              median.read.length = median(sequence_length_template),
              row = mean(row),
              col = mean(col))
    b = melt(a, id.vars = c("channel", "row", "col"))
    return(b)    
}


single.flowcell <- function(input.file, output.dir, q=7, base.dir = NA){
    # wrapper function to analyse data from a single flowcell
    # input.file is a sequencing_summary.txt file from a 1D run
    # output.dir is the output directory into which to write results
    # q is the cutoff used for Q values, set by the user
    # base.dir is the base directory if and only if the user supplied a base directory
    # we use base.dir to name flowcells in a sensible way
    flog.info(paste("Loading input file:", input.file))
    d = load_summary(input.file, min.q=c(-Inf, q))

    flowcell = unique(d$flowcell)

    
    # output goes with the sequencing summary file unless otherwise specified
    if(is.na(opt$output.dir)){ 
        output.dir = file.path(dirname(input.file))
    } else {
        # the user supplied an output dir
        output.dir = file.path(opt$output.dir, flowcell)
    }
    
    flog.info(paste(sep = "", flowcell, ": creating output directory:", output.dir))
    dir.create(output.dir, recursive = TRUE)
    out.txt = file.path(output.dir, "summary.yaml")
    
    flog.info(paste(sep = "", flowcell, ": summarising input file for flowcell"))
    all.reads.summary = summary.stats(d, Q_cutoff = "All reads")
    q10.reads.summary = summary.stats(d, Q_cutoff = q_title)
    
    summary = list("input file" = input.file,
                   "All reads" = all.reads.summary,
                   cutoff = q10.reads.summary,
                   "notes" = 'ultralong reads refers to the largest set of reads with N50>100KB')
    
    names(summary)[3] = q_title
    
    write(as.yaml(summary), out.txt)
    
    if (mux_int == 0) {
    	mux_int = max(d$hour)
    }
    muxes = seq(from = 0, to = max(d$hour), by = mux_int)

    # set up variable sizes
    if(smallfig == TRUE){ p1m = 0.5 }else{ p1m = 1.0 }
    if(smallfig == TRUE){ p2m = 0.6 }else{ p2m = 1.0 }
	
	
    # make plots
    flog.info(paste(sep = "", flowcell, ": plotting length histogram"))
	p1 = ggplot(d, aes(x = sequence_length_template, fill = Q_cutoff)) +
			geom_histogram(bins = 300) +
			scale_x_log10(minor_breaks=log10_minor_break(), breaks = log10_major_break()) +
			facet_wrap(~Q_cutoff, ncol = 1, scales = "free_y") +
			theme(text = element_text(size = 15)) +
			xlab("Read length (bases)") +
			ylab("Number of reads") +
			guides(fill=FALSE) + scale_fill_viridis(discrete = TRUE, begin = 0.25, end = 0.75)
	
	# Add N50 and mean length on p1
	if (plot_stat) {	
		# Extract values to display
		annotation_df = data.frame(
				Q_cutoff = c("All reads", q_title),
				n50 = c(all.reads.summary$N50.length, q10.reads.summary$N50.length),
				mean = c(all.reads.summary$mean.length, q10.reads.summary$mean.length),
				n50_math = c(
						if(all.reads.summary$N50.length > all.reads.summary$mean.length){"+"} else {"-"},
						if(all.reads.summary$N50.length > q10.reads.summary$mean.length){"+"} else {"-"}
				),
				mean_math = c(
						if(all.reads.summary$N50.length > all.reads.summary$mean.length){"-"} else {"+"},
						if(all.reads.summary$N50.length > q10.reads.summary$mean.length){"-"} else {"+"}
				)
		)
		
		# Set up labels and their location on p1
		annotation_text_df = data.frame(
				Q_cutoff = c("All reads", q_title),
				n50_label = c(paste("N50", annotation_df[1,2], sep="\n"), paste("N50", annotation_df[2,2], sep="\n")),
				n50_x = c(eval(parse(text=paste(annotation_df[1,"n50"], annotation_df[1,"n50_math"] , annotation_df[1,"n50"], "*", 0.4))), eval(parse(text=paste(annotation_df[2,"n50"],annotation_df[2, "n50_math"] ,annotation_df[2,"n50"], "*", 0.4)))),
				mean_label = c(paste("Mean", annotation_df[1,3], sep="\n"), paste("Mean", annotation_df[2,3], sep="\n")),
				mean_x =  c(eval(parse(text=paste(annotation_df[1,"mean"], annotation_df[1, "mean_math"] , annotation_df[1,"mean"], "*", 0.4))), eval(parse(text=paste(annotation_df[2,"mean"],annotation_df[2, "mean_math"] ,annotation_df[2,"mean"], "*", 0.4))))
		)
		
		# Add lines on p1
		ylim=max(ggplot_build(p1)$data[[1]]$count)
		p1 = p1 +
			geom_vline(data=annotation_df, aes(xintercept = n50), color="black") +
			geom_vline(data=annotation_df, aes(xintercept = mean), color="black") +
			geom_text(data = annotation_text_df, mapping = aes(x=n50_x, y=ylim*0.85, label = n50_label)) +
			geom_text(data = annotation_text_df, mapping = aes(x=mean_x, y=ylim*0.85, label = mean_label))
	}
	
    suppressMessages(ggsave(filename = file.path(output.dir, paste("length_histogram.", plot_format, sep="")), device=plot_format, width = p1m*960/75, height = p1m*960/75, plot = p1)) #

    flog.info(paste(sep = "", flowcell, ": plotting mean Q score histogram"))
    p2 = ggplot(d, aes(x = mean_qscore_template, fill = Q_cutoff)) + 
        geom_histogram(bins = 300) + 
        facet_wrap(~Q_cutoff, ncol = 1, scales = "free_y") + 
        theme(text = element_text(size = 15)) +
        xlab("Mean Q score of read") +
        ylab("Number of reads") +
        guides(fill=FALSE) + scale_fill_viridis(discrete = TRUE, begin = 0.25, end = 0.75)

	# Add mean Qscore on p2
	if (plot_stat) {
		# Extract values to display
		meanQ_df = data.frame(
				Q_cutoff = c("All reads", q_title),
				mean = c(all.reads.summary$mean.q, q10.reads.summary$mean.q),
				mean_label_x = c(all.reads.summary$mean.q*0.95, q10.reads.summary$mean.q*0.95),
				mean_label = c(paste("Mean\n", all.reads.summary$mean.q), paste("Mean\n", q10.reads.summary$mean.q))
		)
		
		# Add line on p2
		ylim_meanQ=max(ggplot_build(p2)$data[[1]]$count)
		p2 = p2 +
			geom_vline(data=meanQ_df, aes(xintercept = mean), color="black") +
			geom_text(data = meanQ_df, mapping = aes(x=mean_label_x, y=ylim_meanQ*0.95, label = mean_label))
	}
	
    suppressMessages(ggsave(filename = file.path(output.dir, paste("q_histogram.", plot_format, sep="")), device=plot_format, width = p1m*960/75, height = p1m*960/75, plot = p2)) #
    
    if(max(d$channel)<=512){
        # only do this for minion, not promethion
	    flog.info(paste(sep = "", flowcell, ": plotting flowcell overview"))
        p3 = ggplot(subset(d, Q_cutoff=="All reads"), aes(x=start_time/3600, y=sequence_length_template, colour = mean_qscore_template)) + 
            geom_point(size=1.5, alpha=0.35) + 
            scale_colour_viridis() + 
            labs(colour='Q')  + 
            scale_y_log10() + 
            facet_grid(row~col) +
            theme(panel.spacing = unit(0.5, "lines")) +
            xlab("Hours into run") +
            ylab("Read length") +
            theme(text = element_text(size = 40), axis.text.x = element_text(size=12), axis.text.y = element_text(size=12), legend.text=element_text(size=18), legend.title=element_text(size=24))
    
            suppressMessages(ggsave(filename = file.path(output.dir,  paste("flowcell_overview.", plot_format, sep="")), device=plot_format, width = 2000/75, height = 1920/75, plot = p3))
    }

    flog.info(paste(sep = "", flowcell, ": plotting flowcell yield over time"))
    p5 = ggplot(d, aes(x=start_time/3600, y=cumulative.bases.time/1000000000, colour = Q_cutoff)) + 
        geom_vline(xintercept = muxes, colour = 'red', linetype = 'dashed', alpha = 0.5) +
        geom_line(size = 1) + 
        xlab("Hours into run") +
        ylab("Total yield in gigabases") +
        scale_colour_viridis(discrete = TRUE, begin = 0.25, end = 0.75, guide = guide_legend(title = "Reads")) +
        theme(text = element_text(size = 15))
    suppressMessages(ggsave(filename = file.path(output.dir, paste("yield_over_time.", plot_format, sep="")), device=plot_format, width = p1m*960/75, height = p1m*480/75, plot = p5)) #


    flog.info(paste(sep = "", flowcell, ": plotting flowcell yield by read length"))
    p6 = ggplot(d, aes(x=sequence_length_template, y=cumulative.bases/1000000000, colour = Q_cutoff)) + 
        geom_line(size = 1) + 
        xlab("Minimum read length (bases)") +
        ylab("Total yield in gigabases") +
        scale_colour_viridis(discrete = TRUE, begin = 0.25, end = 0.75, guide = guide_legend(title = "Reads")) +
        theme(text = element_text(size = 15))
    xmax = max(d$sequence_length_template[which(d$cumulative.bases > 0.01 * max(d$cumulative.bases))])
    p6 = p6 + scale_x_continuous(limits = c(0, xmax))
    suppressMessages(ggsave(filename = file.path(output.dir, paste("yield_by_length.", plot_format, sep="")), device=plot_format, width = p1m*960/75, height = p1m*480/75, plot = p6)) #
    
    flog.info(paste(sep = "", flowcell, ": plotting sequence length over time"))
    p7 = ggplot(d, aes(x=start_time/3600, y=sequence_length_template, colour = Q_cutoff, group = Q_cutoff)) + 
        geom_vline(xintercept = muxes, colour = 'red', linetype = 'dashed', alpha = 0.5) +
        theme(text = element_text(size = 15)) +
        geom_smooth() + 
        xlab("Hours into run") + 
        ylab("Mean read length (bases)") + 
        scale_colour_viridis(discrete = TRUE, begin = 0.25, end = 0.75, guide = guide_legend(title = "Reads")) +
        ylim(0, NA)
    suppressMessages(ggsave(filename = file.path(output.dir, paste("length_by_hour.", plot_format, sep="")), device=plot_format, width = p1m*960/75, height = p1m*480/75, plot = p7)) #
    
    flog.info(paste(sep = "", flowcell, ": plotting Q score over time"))
    p8 = ggplot(d, aes(x=start_time/3600, y=mean_qscore_template, colour = Q_cutoff, group = Q_cutoff)) + 
        geom_vline(xintercept = muxes, colour = 'red', linetype = 'dashed', alpha = 0.5) +
        theme(text = element_text(size = 15)) +
        geom_smooth() + 
        xlab("Hours into run") + 
        ylab("Mean Q score") + 
        scale_colour_viridis(discrete = TRUE, begin = 0.25, end = 0.75, guide = guide_legend(title = "Reads")) +
        ylim(0, NA)
    suppressMessages(ggsave(filename = file.path(output.dir, paste("q_by_hour.", plot_format, sep="")), device=plot_format, width = p1m*960/75, height = p1m*480/75, plot = p8)) #
    
    flog.info(paste(sep = "", flowcell, ": plotting reads per hour"))
    f = d[c("hour", "reads_per_hour", "Q_cutoff")]
    f = f[!duplicated(f),]
    g = subset(f, Q_cutoff=="All reads")
    h = subset(f, Q_cutoff==q_title)
    max = max(f$hour)
    # all of this is just to fill in hours with no reads recorded
    all = 0:max
    add.g = all[which(all %in% g$hour == FALSE)]
    if(length(add.g)>0){
        add.g = data.frame(hour = add.g, reads_per_hour = 0, Q_cutoff = "All reads")
        g = rbind(g, add.g)
    }
    add.h = all[which(all %in% h$hour == FALSE)]
    if(length(add.h)>0){
        add.h = data.frame(hour = add.h, reads_per_hour = 0, Q_cutoff = q_title)
        h = rbind(h, add.h)
    }
    i = rbind(g, h)
    i$Q_cutoff = as.character(i$Q_cutoff)
    i$Q_cutoff[which(i$Q_cutoff==q_title)] = paste("Q>=", q, sep="")
    p9 = ggplot(i, aes(x=hour, y=reads_per_hour, colour = Q_cutoff, group = Q_cutoff)) + 
        geom_vline(xintercept = muxes, colour = 'red', linetype = 'dashed', alpha = 0.5) +
        theme(text = element_text(size = 15)) +
        geom_point() +
        geom_line() +
        xlab("Hours into run") + 
        ylab("Number of reads per hour") + 
        ylim(0, NA) + 
        scale_colour_viridis(discrete = TRUE, begin = 0.25, end = 0.75, guide = guide_legend(title = "Reads"))
    suppressMessages(ggsave(filename = file.path(output.dir, paste("reads_per_hour.", plot_format, sep="")), device=plot_format, width = p1m*960/75, height = p1m*480/75, plot = p9)) #
    
    if(max(d$channel)<=512){
        # minion
        flog.info(paste(sep = "", flowcell, ": plotting read length vs. q score scatterplot"))
        p10 = ggplot(subset(d, Q_cutoff=="All reads"), aes(x = sequence_length_template, y = mean_qscore_template, colour = events_per_base)) + 
            geom_point(alpha=0.05, size = 0.4) + 
            scale_x_log10(minor_breaks=log10_minor_break(), breaks = log10_major_break()) + 
            labs(colour='Events per base\n(log scale)\n')  + 
            theme(text = element_text(size = 15)) +
            xlab("Read length (bases)") +
            ylab("Mean Q score of read")
    }else{
        # promethion
        p10 = ggplot(subset(d, Q_cutoff=="All reads"), aes(x = sequence_length_template, y = mean_qscore_template, colour = events_per_base)) + 
            geom_bin2d() + 
            scale_x_log10(minor_breaks=log10_minor_break(), breaks = log10_major_break()) + 
            theme(text = element_text(size = 15)) +
            scale_fill_viridis() +
            xlab("Read length (bases)") +
            ylab("Mean Q score of read")
    }
    
    
    if(max(d$events_per_base, na.rm=T)>0){
        # a catch for 1D2 runs which don't have events per base
        p10 = p10 + scale_colour_viridis(trans = "log", labels = scientific, option = 'inferno') 
    }
    # we keep it a bit wider, because the legend takes up a fair bit of the plot space
    suppressMessages(ggsave(filename = file.path(output.dir, paste("length_vs_q.", plot_format, sep="")), device=plot_format, width = p2m*960/75, height = p1m*960/75, plot = p10)) #
    
    flog.info(paste(sep = "", flowcell, ": plotting flowcell channels summary histograms"))
    c = channel.summary(subset(d, Q_cutoff=="All reads"))
    c10 = channel.summary(subset(d, Q_cutoff==q_title))
    c$Q_cutoff = "All reads"
    c10$Q_cutoff = q_title
    cc = rbind(c, c10)
    cc$variable = as.character(cc$variable)
    cc$variable[which(cc$variable=="total.bases")] = "Number of bases per channel"
    cc$variable[which(cc$variable=="total.reads")] = "Number of reads per channel"
    cc$variable[which(cc$variable=="mean.read.length")] = "Mean read length per channel"
    cc$variable[which(cc$variable=="median.read.length")] = "Median read length per channel"
    p11 = ggplot(cc, aes(x = value, fill = Q_cutoff)) + geom_histogram(bins = 30) + 
        facet_grid(Q_cutoff~variable, scales = "free_x") + 
        theme(text = element_text(size = 15), axis.text.x = element_text(angle = 60, hjust = 1)) +
        guides(fill=FALSE) +
        scale_fill_viridis(discrete = TRUE, begin = 0.25, end = 0.75) +
        guides(fill=FALSE)
    suppressMessages(ggsave(filename = file.path(output.dir, paste("channel_summary.", plot_format, sep="")), device=plot_format, width = 960/75, height = 480/75, plot = p11)) 
    

    flog.info(paste(sep = "", flowcell, ": plotting physical overview of output per channel"))
    if(max(d$channel)<=512){
        # minion
        cols = 2
    }else{
        # promethion
        cols = 1
    }


    p12 = ggplot(subset(cc, variable == "Number of bases per channel"), aes(x = as.numeric(col), y = as.numeric(row))) + 
        geom_tile(aes(fill = value/1000000000), colour="white", size=0.25) +
        facet_wrap(~Q_cutoff, ncol = cols) + 
        theme(text = element_text(size = 15), 
                plot.background=element_blank(), 
                panel.border=element_blank(),
                panel.background = element_blank(),
                panel.grid.major = element_blank(), 
                panel.grid.minor = element_blank()) +
        scale_fill_viridis(name = "GB/channel") +
        scale_y_continuous(trans = "reverse", expand=c(0,0)) +
        scale_x_continuous(expand=c(0,0)) +
        coord_fixed() +
        labs(x="channel column",y="channel row")

    if(max(d$channel)<=512){
        # minion
        suppressMessages(ggsave(filename = file.path(output.dir, paste("gb_per_channel_overview.", plot_format, sep="")), device=plot_format, width = 960/150, height = 480/75, plot = p12)) 
    }else{
        # promethion
        suppressMessages(ggsave(filename = file.path(output.dir, paste("gb_per_channel_overview.", plot_format, sep="")), device=plot_format, width = 960/75, height = 480/75, plot = p12))         
    }
    
    return(d)
}

combined.flowcell <- function(d, output.dir, q=8){
    # function to analyse combined data from multiple flowcells
    # useful for getting an overall impression of the combined data

    flog.info("Creating output directory")
    out.txt = file.path(output.dir, "summary.yaml")
    
    # write summaries
    flog.info(paste("Summarising combined data from all flowcells, saving to:", out.txt))

    # tidy up and remove added stuff
    drops = c("cumulative.bases", "hour", "reads.per.hour")
    d = d[ , !(names(d) %in% drops)]
    
    d1 = subset(d, Q_cutoff == "All reads")    
    d1 = d1[with(d1, order(-sequence_length_template)), ] # sort by read length
    d1$cumulative.bases = cumsum(as.numeric(d1$sequence_length_template))
        
    d2 = subset(d, Q_cutoff == q_title)    
    d2 = d2[with(d2, order(-sequence_length_template)), ] # sort by read length
    d2$cumulative.bases = cumsum(as.numeric(d2$sequence_length_template))

    d1$Q_cutoff = as.factor(d1$Q_cutoff)
    d2$Q_cutoff = as.factor(d2$Q_cutoff)
    
    all.reads.summary = summary.stats(d1, Q_cutoff = "All reads")
    q10.reads.summary = summary.stats(d2, Q_cutoff = q_title)
    
    summary = list("input file" = input.file,
                   "All reads" = all.reads.summary,
                   cutoff = q10.reads.summary,
                   "notes" = 'ultralong reads refers to the largest set of reads with N50>100KB')
    
    names(summary)[3] = q_title
    
    write(as.yaml(summary), out.txt)

    d = rbind(d1, d2)
    d$Q_cutoff = as.factor(d$Q_cutoff)
    d1 = 0
    d2 = 0
        
    # set up variable sizes
    if(smallfig == TRUE){ p1m = 0.5 }else{ p1m = 1.0 }

    # make plots
    flog.info("Plotting combined length histogram")
    p1 = ggplot(d, aes(x = sequence_length_template, fill = Q_cutoff)) + 
        geom_histogram(bins = 300) + 
        scale_x_log10(minor_breaks=log10_minor_break(), breaks = log10_major_break()) + 
        facet_wrap(~Q_cutoff, ncol = 1, scales = "free_y") + 
        theme(text = element_text(size = 15)) +
        xlab("Read length (bases)") +
        ylab("Number of reads") +
        guides(fill=FALSE) + scale_fill_viridis(discrete = TRUE, begin = 0.25, end = 0.75)
    suppressMessages(ggsave(filename = file.path(output.dir, paste("combined_length_histogram.", plot_format, sep="")), device=plot_format, width = p1m*960/75, height = p1m*960/75, plot = p1))
    
    flog.info("Plotting combined mean Q score histogram")
    p2 = ggplot(d, aes(x = mean_qscore_template, fill = Q_cutoff)) + 
        geom_histogram(bins = 300) + 
        facet_wrap(~Q_cutoff, ncol = 1, scales = "free_y") + 
        theme(text = element_text(size = 15)) +
        xlab("Mean Q score of read") +
        ylab("Number of reads") +
        guides(fill=FALSE) + scale_fill_viridis(discrete = TRUE, begin = 0.25, end = 0.75)
    suppressMessages(ggsave(filename = file.path(output.dir, paste("combined_q_histogram.", plot_format, sep="")), device=plot_format, width = p1m*960/75, height = p1m*960/75, plot = p2))
        
    flog.info("Plotting combined yield by length")
    p4 = ggplot(d, aes(x=sequence_length_template, y=cumulative.bases/1000000000, colour = Q_cutoff)) + 
        geom_line(size = 1) + 
        xlab("Minimum read length (bases)") +
        ylab("Total yield in gigabases") +
        scale_colour_viridis(discrete = TRUE, begin = 0.25, end = 0.75, guide = guide_legend(title = "Reads")) +
        theme(text = element_text(size = 15))
    xmax = max(d$sequence_length_template[which(d$cumulative.bases > 0.01 * max(d$cumulative.bases))])
    p4 = p4 + scale_x_continuous(limits = c(0, xmax))
    
    suppressMessages(ggsave(filename = file.path(output.dir, "combined_yield_by_length.png"), device=plot_format, width = p1m*960/75, height = p1m*480/75, plot = p4))
    
}


multi.flowcell = function(input.file, output.base, q){
    # wrapper function to allow parallelisation of single-flowcell 
    # analyses when >1 flowcell is analysed in one run
    
    d = single.flowcell(input.file, output.dir, q)
    return(d)
    
}


multi.plots = function(dm, output.dir){
    # function to plot data from multiple flowcells,
    # where the data is not combined (as in combined.flowcell() )
    # but instead just uses multiple lines on each plot.
    
    muxes = seq(from = 0, to = max(dm$hour), by = 8)
    
    # set up variable sizes
    if(smallfig == TRUE){ p1m = 0.5 }else{ p1m = 1.0 }

    # make plots
    flog.info("Plotting length distributions")
    p1 = ggplot(dm, aes(x = sequence_length_template)) + 
        geom_line(stat="density", aes(colour = flowcell), size = 1) +
        scale_x_log10(minor_breaks=log10_minor_break(), breaks = log10_major_break()) + 
        facet_wrap(~Q_cutoff, ncol = 1, scales = "free_y") + 
        theme(text = element_text(size = 15)) +
        xlab("Read length (bases)") +
        ylab("Density")
    suppressMessages(ggsave(filename = file.path(output.dir, paste("length_distributions.", plot_format, sep="")), device=plot_format, width = p1m*960/75, height = p1m*960/75, plot = p1)) #
    
    flog.info("Plotting mean Q score distributions")
    p2 = ggplot(dm, aes(x = mean_qscore_template)) + 
        geom_line(stat="density", aes(colour = flowcell), size = 1) +
        facet_wrap(~Q_cutoff, ncol = 1, scales = "free_y") + 
        theme(text = element_text(size = 15)) +
        xlab("Mean Q score of read") +
        ylab("Density")
    suppressMessages(ggsave(filename = file.path(output.dir, paste("q_distributions.", plot_format, sep="")), device=plot_format, width = p1m*960/75, height = p1m*960/75, plot = p2)) #


    flog.info("Plotting flowcell yield over time")
    p5 = ggplot(dm, aes(x=start_time/3600, y=cumulative.bases.time/1000000000, colour = flowcell)) + 
        geom_vline(xintercept = muxes, colour = 'red', linetype = 'dashed', alpha = 0.5) +
        geom_line(size = 1) + 
        xlab("Hours into run") +
        ylab("Total yield in gigabases") +
        facet_wrap(~Q_cutoff, ncol = 1, scales = "free_y") +
        theme(text = element_text(size = 15))
    suppressMessages(ggsave(filename = file.path(output.dir, paste("yield_over_time.", plot_format, sep="")), device=plot_format, width = p1m*960/75, height = p1m*960/75, plot = p5)) #

        
    flog.info("Plotting flowcell yield by length")
    p6 = ggplot(dm, aes(x=sequence_length_template, y=cumulative.bases/1000000000, colour = flowcell)) + 
        geom_line(size = 1) + 
        xlab("Minimum read length (bases)") +
        ylab("Total yield in gigabases") +
        theme(text = element_text(size = 15)) + 
        facet_wrap(~Q_cutoff, ncol = 1, scales = "free_y")
    xmax = max(dm$sequence_length_template[which(dm$cumulative.bases > 0.01 * max(dm$cumulative.bases))])
    p6 = p6 + scale_x_continuous(limits = c(0, xmax))
    suppressMessages(ggsave(filename = file.path(output.dir, paste("yield_by_length.", plot_format, sep="")), device=plot_format, width = p1m*960/75, height = p1m*960/75, plot = p6)) #
    

    flog.info("Plotting sequence length over time")
    p7 = ggplot(dm, aes(x=start_time/3600, y=sequence_length_template, colour = flowcell)) + 
        geom_vline(xintercept = muxes, colour = 'red', linetype = 'dashed', alpha = 0.5) +
        theme(text = element_text(size = 15)) +
        geom_smooth() + 
        xlab("Hours into run") + 
        ylab("Mean read length (bases)") + 
        ylim(0, NA) +
        facet_wrap(~Q_cutoff, ncol = 1, scales = "free_y")
    suppressMessages(ggsave(filename = file.path(output.dir, paste("length_by_hour.", plot_format, sep="")), device=plot_format, width = p1m*960/75, height = p1m*960/75, plot = p7))
    

    flog.info("Plotting Q score over time")
    p8 = ggplot(dm, aes(x=start_time/3600, y=mean_qscore_template, colour = flowcell)) + 
        geom_vline(xintercept = muxes, colour = 'red', linetype = 'dashed', alpha = 0.5) +
        theme(text = element_text(size = 15)) +
        geom_smooth() + 
        xlab("Hours into run") + 
        ylab("Mean Q score") + 
        facet_wrap(~Q_cutoff, ncol = 1, scales = "free_y")
    suppressMessages(ggsave(filename = file.path(output.dir, paste("q_by_hour.", plot_format, sep="")), device=plot_format, width = p1m*960/75, height = p1m*960/75, plot = p8))    
    
}


# Choose how to act depending on whether we have a single input file or mulitple input files

if(file_test("-f", input.file)==TRUE & length(test.file)>1){
    # if it's an existing file (not a folder) just run one analysis
    d = single.flowcell(input.file, output.dir, q)
}else if(file_test("-d", input.file)==TRUE & length(test.file)>1){
    # it's a directory, recursively analyse all sequencing_summary.txt files
  
    # get a list of all sequencing_summary.txt files, recursively
    summaries = list.files(path = input.file, pattern = "sequencing_summary.txt", recursive = TRUE, full.names = TRUE)

    flog.info("")
    flog.info("**** Analysing the following files ****")
    flog.info(summaries)

    
    # if the user passes a directory with only one sequencing_summary.txt file...
    if(length(summaries) == 1){
        d = single.flowcell(summaries[1], output.dir, q)
        flog.info('**** Analysis complete ****')
    }else{
        # analyse each one and keep the returns in a list
        if(combined_only == FALSE){
            results = mclapply(summaries, multi.flowcell, output.dir, q, mc.cores = cores)
        }else{
            results = mclapply(summaries, load_summary, min.q = c(-Inf, q), mc.cores = cores)
        }
        
        # rbind that list
        flog.info('**** Analysing data from all flowcells combined ****')
        dm = as.data.frame(rbindlist(results))

        # now do the single plot on ALL the output
        if(is.na(opt$output.dir)){ 
            combined.output = file.path(input.file, "combinedQC")
        } else {
            # the user supplied an output dir
            combined.output = file.path(opt$output.dir, "combinedQC")
        }
        
        flog.info(paste("Plots from the combined output will be saved in", combined.output))
        dir.create(combined.output, recursive = TRUE)
        combined.flowcell(dm, combined.output, q)
        multi.plots(dm, combined.output)
        flog.info('**** Analysis complete ****')
        flog.info('If you use MinIONQC in your published work, please cite:')
        flog.info('R Lanfear, M Schalamun, D Kainer, W Wang, B Schwessinger (2018). MinIONQC: fast and simple quality control for MinION sequencing data, Bioinformatics, bty654')
        flog.info('https://doi.org/10.1093/bioinformatics/bty654')
    }
    
}else{
    #WTF
    flog.warn(paste("Couldn't find a sequencing summary file in your input which was: ", 
        input.file, 
        "\nThe input must be either a sequencing_summary.txt file, or a directory containing one or more such files"))
}