diff --git a/DESCRIPTION b/DESCRIPTION index 64d2855..3b63fd2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,7 +3,7 @@ Maintainer: Stefan Dentro License: GPL-3 Type: Package Title: Battenberg subclonal copy number caller -Version: 2.2.7 +Version: 2.2.8 Authors@R: c(person("David", "Wedge", role=c("aut"), email="dw9@sanger.ac.uk"), person("Peter", "Van Loo", role=c("aut")), person("Stefan","Dentro", email="sd11@sanger.ac.uk", role=c("aut", "cre")), @@ -23,7 +23,10 @@ Imports: ggplot2, readr, gtools, - gridExtra + gridExtra, + doParallel, + parallel, + foreach URL: https://github.com/Wedge-Oxford/battenberg LazyLoad: yes License: file LICENSE diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..4d5576a --- /dev/null +++ b/Dockerfile @@ -0,0 +1,50 @@ +#FROM r-base +FROM ubuntu:16.04 + +# Add dependencies +RUN apt-get update && apt-get install -y libxml2 libxml2-dev libcurl4-gnutls-dev r-cran-rgl git libssl-dev curl + +RUN mkdir /tmp/downloads + +RUN curl -sSL -o tmp.tar.gz --retry 10 https://github.com/samtools/htslib/archive/1.2.1.tar.gz && \ + mkdir /tmp/downloads/htslib && \ + tar -C /tmp/downloads/htslib --strip-components 1 -zxf tmp.tar.gz && \ + make -C /tmp/downloads/htslib && \ + rm -f /tmp/downloads/tmp.tar.gz + +ENV HTSLIB /tmp/downloads/htslib + +RUN curl -sSL -o tmp.tar.gz --retry 10 https://github.com/cancerit/alleleCount/archive/v2.1.2.tar.gz && \ + mkdir /tmp/downloads/alleleCount && \ + tar -C /tmp/downloads/alleleCount --strip-components 1 -zxf tmp.tar.gz && \ + cd /tmp/downloads/alleleCount/c && \ + mkdir bin && \ + make && \ + cp /tmp/downloads/alleleCount/c/bin/alleleCounter /usr/local/bin/. && \ + cd /tmp/downloads && \ + rm -rf /tmp/downloads/alleleCount /tmp/downloads/tmp.tar.gz + +RUN curl -sSL -o tmp.tar.gz --retry 10 https://mathgen.stats.ox.ac.uk/impute/impute_v2.3.2_x86_64_static.tgz && \ + mkdir /tmp/downloads/impute2 && \ + tar -C /tmp/downloads/impute2 --strip-components 1 -zxf tmp.tar.gz && \ + cp /tmp/downloads/impute2/impute2 /usr/local/bin && \ + rm -rf /tmp/downloads/impute2 /tmp/downloads/tmp.tar.gz + +RUN R -q -e 'source("http://bioconductor.org/biocLite.R"); biocLite(c("devtools","RColorBrewer","ggplot2","gridExtra","readr","doParallel","foreach"))' +RUN R -q -e 'devtools::install_github("Crick-CancerGenomics/ascat/ASCAT")' +RUN R -q -e 'devtools::install_github("Wedge-Oxford/battenberg")' + +RUN curl -sSL -o battenberg_wgs.R https://raw.githubusercontent.com/Wedge-Oxford/battenberg/master/inst/example/battenberg_wgs.R +# modify paths to reference files +RUN cat battenberg_wgs.R | \ + sed 's|IMPUTEINFOFILE = \".*|IMPUTEINFOFILE = \"/opt/battenberg_reference/1000genomes_2012_v3_impute/impute_info.txt\"|' | \ + sed 's|G1000PREFIX = \".*|G1000PREFIX = \"/opt/battenberg_reference/1000genomes_2012_v3_loci/1000genomesAlleles2012_chr\"|' | \ + sed 's|G1000PREFIX_AC = \".*|G1000PREFIX_AC = \"/opt/battenberg_reference/1000genomes_2012_v3_loci/1000genomesloci2012_chr\"|' | \ + sed 's|GCCORRECTPREFIX = \".*|GCCORRECTPREFIX = \"/opt/battenberg_reference/1000genomes_2012_v3_gcContent/1000_genomes_GC_corr_chr_\"|' | \ + sed 's|PROBLEMLOCI = \".*|PROBLEMLOCI = \"/opt/battenberg_reference/battenberg_problem_loci/probloci_270415.txt.gz\"|' > /usr/local/bin/battenberg_wgs.R && rm -f battenberg_wgs.R + +#RUN curl -sSL -o battenberg_snp6.R https://raw.githubusercontent.com/Wedge-Oxford/battenberg/master/inst/example/battenberg_snp6.R +#RUN cat battenberg_snp6.R | \ +# sed 's|IMPUTEINFOFILE = \".*|IMPUTEINFOFILE = 
\"/opt/battenberg_reference/1000genomes_2012_v3_impute/impute_info.txt\"|' | \ +# sed 's|G1000PREFIX = \".*|G1000PREFIX = \"/opt/battenberg_reference/1000genomes_2012_v3_loci/1000genomesAlleles2012_chr\"|' | \ +# sed 's|SNP6_REF_INFO_FILE = \".*|SNP6_REF_INFO_FILE = \"/opt/battenberg_reference/battenberg_snp6/snp6_ref_info_file.txt\"|' > /usr/local/bin/battenberg_snp6.R && rm -f battenberg_wgs.R diff --git a/NAMESPACE b/NAMESPACE index e11517c..d715f18 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,6 +4,7 @@ export(plot.haplotype.data) export(GetChromosomeBAFs) export(GetChromosomeBAFs_SNP6) export(allele_ratio_plot) +export(battenberg) export(calc_psi_t) export(calc_rho_psi_refit) export(callSubclones) @@ -24,10 +25,13 @@ export(getBAFsAndLogRs) export(infer_gender_birdseed) export(make_posthoc_plots) export(parse.imputeinfofile) +export(prepare_snp6) +export(prepare_wgs) export(read_table_generic) export(run.impute) export(runASCAT) export(run_clonal_ASCAT) +export(run_haplotyping) export(segment.baf.phased) export(segment.baf.phased.sv) export(squaresplot) @@ -43,7 +47,12 @@ importFrom(ASCAT,ascat.plotNonRounded) importFrom(ASCAT,ascat.plotSunrise) importFrom(ASCAT,make_segments) importFrom(RColorBrewer,brewer.pal) +importFrom(doParallel,registerDoParallel) +importFrom(foreach,"%dopar%") +importFrom(foreach,foreach) importFrom(gridExtra,arrangeGrob) importFrom(gridExtra,grid.arrange) importFrom(gtools,mixedsort) +importFrom(parallel,makeCluster) +importFrom(parallel,stopCluster) importFrom(readr,read_table) diff --git a/R/Battenberg-package.R b/R/Battenberg-package.R index fd3c32a..0445e5d 100644 --- a/R/Battenberg-package.R +++ b/R/Battenberg-package.R @@ -4,4 +4,7 @@ #' @importFrom gridExtra grid.arrange arrangeGrob #' @importFrom ASCAT make_segments ascat.plotSunrise ascat.plotAscatProfile ascat.plotNonRounded #' @importFrom gtools mixedsort +#' @importFrom parallel makeCluster stopCluster +#' @importFrom doParallel registerDoParallel +#' @importFrom foreach foreach %dopar% NULL diff --git a/R/battenberg.R b/R/battenberg.R new file mode 100644 index 0000000..d27d457 --- /dev/null +++ b/R/battenberg.R @@ -0,0 +1,230 @@ + +#' Run the Battenberg pipeline +#' +#' @param tumourname Tumour identifier, this is used as a prefix for the output files. If allele counts are supplied separately, they are expected to have this identifier as prefix. +#' @param normalname Matched normal identifier, this is used as a prefix for the output files. If allele counts are supplied separately, they are expected to have this identifier as prefix. 
+#' @param tumour_data_file A BAM or CEL file for the tumour +#' @param normal_data_file A BAM or CEL file for the normal +#' @param imputeinfofile Full path to a Battenberg impute info file with pointers to Impute2 reference data +#' @param g1000prefix Full prefix path to 1000 Genomes SNP loci data, as part of the Battenberg reference data +#' @param problemloci Full path to a problem loci file that contains SNP loci that should be filtered out +#' @param gccorrectprefix Full prefix path to GC content files, as part of the Battenberg reference data, not required for SNP6 data (Default: NA) +#' @param g1000allelesprefix Full prefix path to 1000 Genomes SNP alleles data, as part of the Battenberg reference data, not required for SNP6 data (Default: NA) +#' @param ismale A boolean set to TRUE if the donor is male, set to FALSE if female, not required for SNP6 data (Default: NA) +#' @param data_type String that contains either wgs or snp6 depending on the supplied input data (Default: wgs) +#' @param impute_exe Pointer to the Impute2 executable (Default: impute2, i.e. expected in $PATH) +#' @param allelecounter_exe Pointer to the alleleCounter executable (Default: alleleCounter, i.e. expected in $PATH) +#' @param nthreads The number of concurrent processes to use while running the Battenberg pipeline (Default: 8) +#' @param platform_gamma Platform scaling factor, suggestions are set to 1 for wgs and to 0.55 for snp6 (Default: 1) +#' @param phasing_gamma Gamma parameter used when correcting phasing mistakes (Default: 1) +#' @param segmentation_gamma The gamma parameter controls the size of the penalty of starting a new segment during segmentation. It is therefore the key parameter for controlling the number of segments (Default: 10) +#' @param segmentation_kmin Kmin represents the minimum number of probes/SNPs that a segment should consist of (Default: 3) +#' @param phasing_kmin Kmin used when correcting for phasing mistakes (Default: 3) +#' @param clonality_dist_metric Distance metric to use when choosing purity/ploidy combinations (Default: 0) +#' @param ascat_dist_metric Distance metric to use when choosing purity/ploidy combinations (Default: 1) +#' @param min_ploidy Minimum ploidy to be considered (Default: 1.6) +#' @param max_ploidy Maximum ploidy to be considered (Default: 4.8) +#' @param min_rho Minimum purity to be considered (Default: 0.1) +#' @param min_goodness Minimum goodness of fit required for a purity/ploidy combination to be accepted as a solution (Default: 0.63) +#' @param uninformative_BAF_threshold The threshold beyond which BAF becomes uninformative (Default: 0.51) +#' @param min_normal_depth Minimum depth required in the matched normal for a SNP to be considered as part of the wgs analysis (Default: 10) +#' @param min_base_qual Minimum base quality required for a read to be counted when allele counting (Default: 20) +#' @param min_map_qual Minimum mapping quality required for a read to be counted when allele counting (Default: 35) +#' @param calc_seg_baf_option Sets way to calculate BAF per segment: 1=mean, 2=median (Default: 1) +#' @param skip_allele_counting Provide TRUE when allele counting can be skipped (i.e. 
it's already done) (Default: FALSE) +#' @param skip_preprocessing Provide TRUE when preprocessing is already complete (Default: FALSE) +#' @param skip_phasing Provide TRUE when phasing is already complete (Default: FALSE) +#' @param snp6_reference_info_file Reference files for the SNP6 pipeline only (Default: NA) +#' @param apt.probeset.genotype.exe Helper tool for extracting data from CEL files, SNP6 pipeline only (Default: apt-probeset-genotype) +#' @param apt.probeset.summarize.exe Helper tool for extracting data from CEL files, SNP6 pipeline only (Default: apt-probeset-summarize) +#' @param norm.geno.clust.exe Helper tool for extracting data from CEL files, SNP6 pipeline only (Default: normalize_affy_geno_cluster.pl) +#' @param birdseed_report_file Sex inference output file, SNP6 pipeline only (Default: birdseed.report.txt) +#' @param heterozygousFilter Legacy option to set a heterozygous SNP filter, SNP6 pipeline only (Default: "none") +#' @author sd11 +#' @export +battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file, imputeinfofile, g1000prefix, problemloci, + gccorrectprefix=NA, g1000allelesprefix=NA, ismale=NA, data_type="wgs", impute_exe="impute2", allelecounter_exe="alleleCounter", nthreads=8, platform_gamma=1, phasing_gamma=1, + segmentation_gamma=10, segmentation_kmin=3, phasing_kmin=1, clonality_dist_metric=0, ascat_dist_metric=1, min_ploidy=1.6, + max_ploidy=4.8, min_rho=0.1, min_goodness=0.63, uninformative_BAF_threshold=0.51, min_normal_depth=10, min_base_qual=20, + min_map_qual=35, calc_seg_baf_option=1, skip_allele_counting=F, skip_preprocessing=F, skip_phasing=F, + snp6_reference_info_file=NA, apt.probeset.genotype.exe="apt-probeset-genotype", apt.probeset.summarize.exe="apt-probeset-summarize", + norm.geno.clust.exe="normalize_affy_geno_cluster.pl", birdseed_report_file="birdseed.report.txt", heterozygousFilter="none") { + + requireNamespace("foreach") + requireNamespace("doParallel") + requireNamespace("parallel") + + if (data_type=="wgs" & is.na(ismale)) { + print("Please provide a boolean denoting whether this sample represents a male donor") + q(save="no", status=1) + } + + if (data_type=="wgs" & is.na(g1000allelesprefix)) { + print("Please provide a path to 1000 Genomes allele reference files") + q(save="no", status=1) + } + + if (data_type=="wgs" & is.na(gccorrectprefix)) { + print("Please provide a path to GC content reference files") + q(save="no", status=1) + } + + if (data_type=="wgs" | data_type=="WGS") { + chrom_names = get.chrom.names(imputeinfofile, ismale) + logr_file = paste(tumourname, "_mutantLogR_gcCorrected.tab", sep="") + allelecounts_file = paste(tumourname, "_alleleCounts.tab", sep="") + } else if (data_type=="snp6" | data_type=="SNP6") { + chrom_names = get.chrom.names(imputeinfofile, TRUE) + logr_file = paste(tumourname, "_mutantLogR.tab", sep="") + allelecounts_file = NULL + } + + if (!skip_preprocessing) { + if (data_type=="wgs" | data_type=="WGS") { + # Setup for parallel computing + clp = parallel::makeCluster(nthreads) + doParallel::registerDoParallel(clp) + + prepare_wgs(chrom_names=chrom_names, + tumourbam=tumour_data_file, + normalbam=normal_data_file, + tumourname=tumourname, + normalname=normalname, + g1000allelesprefix=g1000allelesprefix, + g1000prefix=g1000prefix, + gccorrectprefix=gccorrectprefix, + min_base_qual=min_base_qual, + min_map_qual=min_map_qual, + allelecounter_exe=allelecounter_exe, + min_normal_depth=min_normal_depth, + nthreads=nthreads, + skip_allele_counting=skip_allele_counting) + + # 
Kill the threads + parallel::stopCluster(clp) + + } else if (data_type=="snp6" | data_type=="SNP6") { + + prepare_snp6(tumour_cel_file=tumour_data_file, + normal_cel_file=normal_data_file, + tumourname=tumourname, + chrom_names=chrom_names, + snp6_reference_info_file=snp6_reference_info_file, + apt.probeset.genotype.exe=apt.probeset.genotype.exe, + apt.probeset.summarize.exe=apt.probeset.summarize.exe, + norm.geno.clust.exe=norm.geno.clust.exe, + birdseed_report_file=birdseed_report_file) + + } else { + print("Unknown data type provided, please provide wgs or snp6") + q(save="no", status=1) + } + } + + if (data_type=="snp6" | data_type=="SNP6") { + # Infer what the gender is - WGS requires it to be specified + gender = infer_gender_birdseed(birdseed_report_file) + ismale = gender == "male" + } + + if (!skip_phasing) { + # Setup for parallel computing + clp = parallel::makeCluster(nthreads) + doParallel::registerDoParallel(clp) + + # Reconstruct haplotypes + # mclapply(1:length(chrom_names), function(chrom) { + foreach::foreach (chrom=1:length(chrom_names)) %dopar% { + print(chrom) + + run_haplotyping(chrom=chrom, + tumourname=tumourname, + normalname=normalname, + ismale=ismale, + imputeinfofile=imputeinfofile, + problemloci=problemloci, + impute_exe=impute_exe, + min_normal_depth=min_normal_depth, + chrom_names=chrom_names, + snp6_reference_info_file=snp6_reference_info_file, + heterozygousFilter=heterozygousFilter) + }#, mc.cores=nthreads) + + # Kill the threads as from here its all single core + parallel::stopCluster(clp) + + # Combine all the BAF output into a single file + combine.baf.files(inputfile.prefix=paste(tumourname, "_chr", sep=""), + inputfile.postfix="_heterozygousMutBAFs_haplotyped.txt", + outputfile=paste(tumourname, "_heterozygousMutBAFs_haplotyped.txt", sep=""), + no.chrs=length(chrom_names)) + } + + # Segment the phased and haplotyped BAF data + segment.baf.phased(samplename=tumourname, + inputfile=paste(tumourname, "_heterozygousMutBAFs_haplotyped.txt", sep=""), + outputfile=paste(tumourname, ".BAFsegmented.txt", sep=""), + gamma=segmentation_gamma, + phasegamma=phasing_gamma, + kmin=segmentation_kmin, + phasekmin=phasing_kmin, + calc_seg_baf_option=calc_seg_baf_option) + + # Fit a clonal copy number profile + fit.copy.number(samplename=tumourname, + outputfile.prefix=paste(tumourname, "_", sep=""), + inputfile.baf.segmented=paste(tumourname, ".BAFsegmented.txt", sep=""), + inputfile.baf=paste(tumourname,"_mutantBAF.tab", sep=""), + inputfile.logr=logr_file, + dist_choice=clonality_dist_metric, + ascat_dist_choice=ascat_dist_metric, + min.ploidy=min_ploidy, + max.ploidy=max_ploidy, + min.rho=min_rho, + min.goodness=min_goodness, + uninformative_BAF_threshold=uninformative_BAF_threshold, + gamma_param=platform_gamma, + use_preset_rho_psi=F, + preset_rho=NA, + preset_psi=NA, + read_depth=30) + + # Go over all segments, determine which segements are a mixture of two states and fit a second CN state + callSubclones(sample.name=tumourname, + baf.segmented.file=paste(tumourname, ".BAFsegmented.txt", sep=""), + logr.file=logr_file, + rho.psi.file=paste(tumourname, "_rho_and_psi.txt",sep=""), + output.file=paste(tumourname,"_subclones.txt", sep=""), + output.figures.prefix=paste(tumourname,"_subclones_chr", sep=""), + output.gw.figures.prefix=paste(tumourname,"_BattenbergProfile", sep=""), + masking_output_file=paste(tumourname, "_segment_masking_details.txt", sep=""), + sv_breakpoints_file="NA", + chr_names=chrom_names, + gamma=platform_gamma, + segmentation.gamma=NA, + 
siglevel=0.05, + maxdist=0.01, + noperms=1000, + calc_seg_baf_option=calc_seg_baf_option) + + # Make some post-hoc plots + make_posthoc_plots(samplename=tumourname, + logr_file=logr_file, + subclones_file=paste(tumourname, "_subclones.txt", sep=""), + rho_psi_file=paste(tumourname, "_rho_and_psi.txt", sep=""), + bafsegmented_file=paste(tumourname, ".BAFsegmented.txt", sep=""), + logrsegmented_file=paste(tumourname, ".logRsegmented.txt", sep=""), + allelecounts_file=allelecounts_file) + + # Save refit suggestions for a future rerun + cnfit_to_refit_suggestions(samplename=tumourname, + subclones_file=paste(tumourname, "_subclones.txt", sep=""), + rho_psi_file=paste(tumourname, "_rho_and_psi.txt", sep=""), + gamma_param=platform_gamma) + + + + + +} + diff --git a/R/clonal_ascat.R b/R/clonal_ascat.R index 7de439b..a7c96a5 100755 --- a/R/clonal_ascat.R +++ b/R/clonal_ascat.R @@ -298,14 +298,11 @@ calc_LogR_Pvalue <-function( LogR, maxdist_LogR, LogR_level ) # kjd 27-2-2014 } -#' Helper function to estimate rho from a given copy number state and it's BAF and LogR +#' Helper function to estimate rho from a given copy number state and it's BAF. The LogR is not used. #' @noRd estimate_rho <-function( LogR_value, BAFreq_value, nA_value, nB_value ) # kjd 10-3-2014 { - temp_value = BAFreq_value * ( 2 - nA_value - nB_value ) - temp_value = nB_value - 1 - temp_value - rho_value = ( ( 2 * BAFreq_value ) - 1 ) / temp_value - + rho_value = (2*BAFreq_value-1)/(2*BAFreq_value-BAFreq_value*(nA_value+nB_value)-1+nA_value) return( rho_value ) } diff --git a/R/fitcopynumber.R b/R/fitcopynumber.R index 2041685..99614d0 100644 --- a/R/fitcopynumber.R +++ b/R/fitcopynumber.R @@ -57,13 +57,13 @@ fit.copy.number = function(samplename, outputfile.prefix, inputfile.baf.segmente raw.BAF.data = raw.BAF.data[!is.na(raw.BAF.data[,3]),] raw.logR.data = raw.logR.data[!is.na(raw.logR.data[,3]),] - # Chromosome names are sometimes 'chr1', etc. - if(length(grep("chr",raw.BAF.data[1,1]))>0){ - raw.BAF.data[,1] = gsub("chr","",raw.BAF.data[,1]) - } - if(length(grep("chr",raw.logR.data[1,1]))>0){ - raw.logR.data[,1] = gsub("chr","",raw.logR.data[,1]) - } + ## Chromosome names are sometimes 'chr1', etc. + #if(length(grep("chr",raw.BAF.data[1,1]))>0){ + # raw.BAF.data[,1] = gsub("chr","",raw.BAF.data[,1]) + #} + #if(length(grep("chr",raw.logR.data[1,1]))>0){ + # raw.logR.data[,1] = gsub("chr","",raw.logR.data[,1]) + #} BAF.data = NULL logR.data = NULL @@ -222,9 +222,9 @@ callSubclones = function(sample.name, baf.segmented.file, logr.file, rho.psi.fil } # Chromosome names are sometimes 'chr1', etc. 
- if(length(grep("chr",LogRvals[1,1]))>0){ - LogRvals[,1] = gsub("chr","",LogRvals[,1]) - } + #if(length(grep("chr",LogRvals[1,1]))>0){ +# LogRvals[,1] = gsub("chr","",LogRvals[,1]) + #} ctrans = c(1:length(chr_names)) names(ctrans) = chr_names @@ -267,7 +267,7 @@ callSubclones = function(sample.name, baf.segmented.file, logr.file, rho.psi.fil ################################################################################################ # Collapse the BAFsegmented into breakpoints to be used in plotting segment_breakpoints = collapse_bafsegmented_to_segments(BAFvals) - if (!is.null(sv_breakpoints_file) & !sv_breakpoints_file=="NA") { + if (!is.null(sv_breakpoints_file) & !ifelse(is.null(sv_breakpoints_file), TRUE, sv_breakpoints_file=="NA") & !ifelse(is.null(sv_breakpoints_file), TRUE, is.na(sv_breakpoints_file))) { svs = read.table(sv_breakpoints_file, header=T, stringsAsFactors=F) } @@ -277,7 +277,7 @@ callSubclones = function(sample.name, baf.segmented.file, logr.file, rho.psi.fil #if no points to plot, skip if (length(pos)==0) { next } - if (!is.null(sv_breakpoints_file) & !sv_breakpoints_file=="NA") { + if (!is.null(sv_breakpoints_file) & !ifelse(is.null(sv_breakpoints_file), TRUE, sv_breakpoints_file=="NA") & !ifelse(is.null(sv_breakpoints_file), TRUE, is.na(sv_breakpoints_file))) { svs_pos = svs[svs$chromosome==chr,]$position / 1000000 } else { svs_pos = NULL @@ -609,7 +609,9 @@ merge_segments = function(subclones, bafsegmented, logR, rho, psi, platform_gamm # Perform t-test on the BAFphased if (sum(!is.na(bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i-1] & bafsegmented$Position>=subclones$startpos[i-1] & bafsegmented$Position<=subclones$endpos[i-1]])) > 10 & - sum(!is.na(bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i] & bafsegmented$Position>=subclones$startpos[i] & bafsegmented$Position<=subclones$endpos[i]])) > 10) { + sum(!is.na(bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i] & bafsegmented$Position>=subclones$startpos[i] & bafsegmented$Position<=subclones$endpos[i]])) > 10 & + sum(!is.na(logR[logR$Chromosome==subclones$chr[i-1] & logR$Position>=subclones$startpos[i-1] & logR$Position<=subclones$endpos[i-1], 3])) > 10 & + sum(!is.na(logR[logR$Chromosome==subclones$chr[i] & logR$Position>=subclones$startpos[i] & logR$Position<=subclones$endpos[i], 3])) > 10) { baf_significant = t.test(bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i-1] & bafsegmented$Position>=subclones$startpos[i-1] & bafsegmented$Position<=subclones$endpos[i-1]], bafsegmented$BAFphased[bafsegmented$Chromosome==subclones$chr[i] & bafsegmented$Position>=subclones$startpos[i] & bafsegmented$Position<=subclones$endpos[i]])$p.value < 0.05 logr_significant = t.test(logR[logR$Chromosome==subclones$chr[i-1] & logR$Position>=subclones$startpos[i-1] & logR$Position<=subclones$endpos[i-1], 3], diff --git a/R/impute.R b/R/impute.R index 01e542a..29356a4 100644 --- a/R/impute.R +++ b/R/impute.R @@ -112,3 +112,97 @@ combine.impute.output = function(inputfile.prefix, outputfile, is.male, imputein impute.output = concatenateImputeFiles(inputfile.prefix, all.boundaries) write.table(impute.output, file=outputfile, row.names=F, col.names=F, quote=F, sep=" ") } + + +#' Construct haplotypes for a chromosome +#' +#' This function takes preprocessed data and performs haplotype reconstruction. 
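For reference, a hedged sketch of driving this per-chromosome step standalone, mirroring the foreach loop used by the new battenberg() wrapper. It assumes the per-chromosome allele count files from the preprocessing step are already in the working directory; the reference paths, sample identifiers and thread count below are placeholders, not values taken from this changeset.

```
library(Battenberg)
library(doParallel)
library(foreach)

# Placeholder reference files and sample names - substitute your own
imputeinfofile = "/opt/battenberg_reference/1000genomes_2012_v3_impute/impute_info.txt"
problemloci = "/opt/battenberg_reference/battenberg_problem_loci/probloci_270415.txt.gz"
tumourname = "tumour_sample"
normalname = "normal_sample"
chrom_names = get.chrom.names(imputeinfofile, TRUE)

# Each chromosome is an independent job, so haplotyping can run in parallel
clp = parallel::makeCluster(4)
doParallel::registerDoParallel(clp)
foreach::foreach(chrom=1:length(chrom_names), .packages="Battenberg") %dopar% {
  run_haplotyping(chrom=chrom, tumourname=tumourname, normalname=normalname, ismale=TRUE,
                  imputeinfofile=imputeinfofile, problemloci=problemloci, impute_exe="impute2",
                  min_normal_depth=10, chrom_names=chrom_names)
}
parallel::stopCluster(clp)
```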
+#' +#' @param chrom The chromosome for which to reconstruct haplotypes +#' @param tumourname Identifier of the tumour, used to match data files on disk +#' @param normalname Identifier of the normal, used to match data files on disk +#' @param ismale Boolean, set to TRUE if the sample is male +#' @param imputeinfofile Full path to the imputeinfo reference file +#' @param problemloci Full path to the problematic loci reference file +#' @param impute_exe Path to the impute executable (can be found if its in $PATH) +#' @param min_normal_depth Minimal depth in the matched normal required for a SNP to be used +#' @param chrom_names A vector containing the names of chromosomes to be included +#' @param snp6_reference_info_file SNP6 only parameter Default: NA +#' @param heterozygousFilter SNP6 only parameter Default: NA +#' @author sd11 +#' @export +run_haplotyping = function(chrom, tumourname, normalname, ismale, imputeinfofile, problemloci, impute_exe, min_normal_depth, chrom_names, + snp6_reference_info_file=NA, heterozygousFilter=NA) { + + if (file.exists(paste(tumourname, "_alleleFrequencies_chr", chrom, ".txt", sep=""))) { + generate.impute.input.wgs(chrom=chrom, + tumour.allele.counts.file=paste(tumourname,"_alleleFrequencies_chr", chrom, ".txt", sep=""), + normal.allele.counts.file=paste(normalname,"_alleleFrequencies_chr", chrom, ".txt", sep=""), + output.file=paste(tumourname, "_impute_input_chr", chrom, ".txt", sep=""), + imputeinfofile=imputeinfofile, + is.male=ismale, + problemLociFile=problemloci, + useLociFile=NA) + } else { + generate.impute.input.snp6(infile.germlineBAF=paste(tumourname, "_germlineBAF.tab", sep=""), + infile.tumourBAF=paste(tumourname, "_mutantBAF.tab", sep=""), + outFileStart=paste(tumourname, "_impute_input_chr", sep=""), + chrom=chrom, + chr_names=chrom_names, + problemLociFile=problemloci, + snp6_reference_info_file=snp6_reference_info_file, + imputeinfofile=imputeinfofile, + is.male=ismale, + heterozygousFilter=heterozygousFilter) + } + + # Run impute on the files + run.impute(inputfile=paste(tumourname, "_impute_input_chr", chrom, ".txt", sep=""), + outputfile.prefix=paste(tumourname, "_impute_output_chr", chrom, ".txt", sep=""), + is.male=ismale, + imputeinfofile=imputeinfofile, + impute.exe=impute_exe, + region.size=5000000, + chrom=chrom) + + # As impute runs in windows across a chromosome we need to assemble the output + combine.impute.output(inputfile.prefix=paste(tumourname, "_impute_output_chr", chrom, ".txt", sep=""), + outputfile=paste(tumourname, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""), + is.male=ismale, + imputeinfofile=imputeinfofile, + region.size=5000000, + chrom=chrom) + + # If an allele counts file exists we assume this is a WGS sample and run the corresponding step, otherwise it must be SNP6 + print(paste(tumourname, "_alleleFrequencies_chr", chrom, ".txt", sep="")) + print(file.exists(paste(tumourname, "_alleleFrequencies_chr", chrom, ".txt", sep=""))) + if (file.exists(paste(tumourname, "_alleleFrequencies_chr", chrom, ".txt", sep=""))) { + # WGS - Transform the impute output into haplotyped BAFs + GetChromosomeBAFs(chrom=chrom, + SNP_file=paste(tumourname, "_alleleFrequencies_chr", chrom, ".txt", sep=""), + haplotypeFile=paste(tumourname, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""), + samplename=tumourname, + outfile=paste(tumourname, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""), + chr_names=chrom_names, + minCounts=min_normal_depth) + } else { + print("SNP6 get BAFs") + # SNP6 - 
Transform the impute output into haplotyped BAFs + GetChromosomeBAFs_SNP6(chrom=chrom, + alleleFreqFile=paste(tumourname, "_impute_input_chr", chrom, "_withAlleleFreq.csv", sep=""), + haplotypeFile=paste(tumourname, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""), + samplename=tumourname, + outputfile=paste(tumourname, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""), + chr_names=chrom_names) + } + + # Plot what we have until this point + plot.haplotype.data(haplotyped.baf.file=paste(tumourname, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""), + imageFileName=paste(tumourname,"_chr",chrom,"_heterozygousData.png",sep=""), + samplename=tumourname, + chrom=chrom, + chr_names=chrom_names) + + # Cleanup temp Impute output + unlink(paste(tumourname, "_impute_output_chr", chrom, "*K.txt*", sep="")) +} diff --git a/R/plotting.R b/R/plotting.R index 699c48b..fb3c947 100644 --- a/R/plotting.R +++ b/R/plotting.R @@ -484,7 +484,7 @@ allele_ratio_plot = function(samplename, bafsegmented, logrsegmented, outputfile print("Plotting..") if (platform == "WGS") { - sel = allelecounts[seq(1, nrow(allelecounts), 100),] + sel = seq(1, nrow(allelecounts), 100) } else { sel = rep(T, nrow(allelecounts)) } diff --git a/R/prepare_SNP6.R b/R/prepare_SNP6.R index ce9512b..76e4eae 100644 --- a/R/prepare_SNP6.R +++ b/R/prepare_SNP6.R @@ -421,3 +421,47 @@ infer_gender_birdseed = function(birdseed_report_file) { z = read.table(birdseed_report_file, header=T) return(as.character(z$em.cluster.chrX.het.contrast_gender)) } + + +#' Prepare SNP6 data for haplotype construction +#' +#' This function performs part of the Battenberg SNP6 pipeline: Extract BAF and logR from the CEL files +#' and performing GC content correction. +#' +#' @param tumour_cel_file Full path to a CEL file containing the tumour raw data +#' @param normal_cel_file Full path to a CEL file containing the normal raw data +#' @param tumourname Identifier to be used for tumour output files +#' @param chrom_names A vector containing the names of chromosomes to be included +#' @param snp6_reference_info_file Full path to the SNP6 reference info file +#' @param apt.probeset.genotype.exe Full path to the apt.probeset.genotype executable (Default: expected in $PATH) +#' @param apt.probeset.summarize.exe Full path to the apt.probeset.summarize executable (Default: expected in $PATH) +#' @param norm.geno.clust.exe Full path to the norm.geno.clust.exe executable (Default: expected in $PATH) +#' @param birdseed_report_file Name of the birdseed output file. This is a temp output file of one of the internally called functions of which the name cannot be defined. Don't change this parameter. (Default: birdseed.report.txt) +#' @author sd11 +#' @export +prepare_snp6 = function(tumour_cel_file, normal_cel_file, tumourname, chrom_names, + snp6_reference_info_file, apt.probeset.genotype.exe="apt-probeset-genotype", + apt.probeset.summarize.exe="apt-probeset-summarize", norm.geno.clust.exe="normalize_affy_geno_cluster.pl", + birdseed_report_file="birdseed.report.txt") { + + # Extract the LogR and BAF from both tumour and normal cel files. 
+ cel2baf.logr(normal_cel_file=normal_cel_file, + tumour_cel_file=tumour_cel_file, + output_file=paste(tumourname, "_lrr_baf.txt", sep=""), + snp6_reference_info_file=snp6_reference_info_file, + apt.probeset.genotype.exe=apt.probeset.genotype.exe, + apt.probeset.summarize.exe=apt.probeset.summarize.exe, + norm.geno.clust.exe=norm.geno.clust.exe) + + gc.correct(samplename=tumourname, + infile.logr.baf=paste(tumourname, "_lrr_baf.txt", sep=""), + outfile.tumor.LogR=paste(tumourname, "_mutantLogR.tab", sep=""), + outfile.tumor.BAF=paste(tumourname, "_mutantBAF.tab", sep=""), + outfile.normal.LogR=paste(tumourname, "_germlineLogR.tab", sep=""), + outfile.normal.BAF=paste(tumourname, "_germlineBAF.tab", sep=""), + outfile.probeBAF=paste(tumourname, "_probeBAF.txt", sep=""), + snp6_reference_info_file=snp6_reference_info_file, + birdseed_report_file=birdseed_report_file, + chr_names=chrom_names) + +} diff --git a/R/prepare_wgs.R b/R/prepare_wgs.R index 61e56bb..7db9779 100644 --- a/R/prepare_wgs.R +++ b/R/prepare_wgs.R @@ -35,7 +35,7 @@ getAlleleCounts = function(bam.file, output.file, g1000.loci, min.base.qual=20, #' @param g1000file.prefix Prefix to where 1000 Genomes reference files can be found. #' @param minCounts Integer, minimum depth required for a SNP to be included (optional, default=NA). #' @param samplename String, name of the sample (optional, default=sample1). -#' @param seed A seed to be set +#' @param seed A seed to be set for when randomising the alleles. #' @author dw9, sd11 #' @export getBAFsAndLogRs = function(tumourAlleleCountsFile.prefix, normalAlleleCountsFile.prefix, figuresFile.prefix, BAFnormalFile, BAFmutantFile, logRnormalFile, logRmutantFile, combinedAlleleCountsFile, chr_names, g1000file.prefix, minCounts=NA, samplename="sample1", seed=as.integer(Sys.time())) { @@ -44,11 +44,19 @@ getBAFsAndLogRs = function(tumourAlleleCountsFile.prefix, normalAlleleCountsFile input_data = concatenateAlleleCountFiles(tumourAlleleCountsFile.prefix, ".txt", length(chr_names)) normal_input_data = concatenateAlleleCountFiles(normalAlleleCountsFile.prefix, ".txt", length(chr_names)) - allele_data = concatenateG1000SnpFiles(g1000file.prefix, ".txt", length(chr_names)) + allele_data = concatenateG1000SnpFiles(g1000file.prefix, ".txt", length(chr_names), chr_names) - #save(file=paste(samplename, "_BAFLogR_concatenated_input.RData", sep=""), input_data, normal_input_data, allele_data) - #load(paste(samplename, "_BAFLogR_concatenated_input.RData", sep="")) + # Synchronise all the data frames + chrpos_allele = paste(allele_data[,1], "_", allele_data[,2], sep="") + chrpos_normal = paste(normal_input_data[,1], "_", normal_input_data[,2], sep="") + chrpos_tumour = paste(input_data[,1], "_", input_data[,2], sep="") + matched_data = Reduce(intersect, list(chrpos_allele, chrpos_normal, chrpos_tumour)) + + allele_data = allele_data[chrpos_allele %in% matched_data,] + normal_input_data = normal_input_data[chrpos_tumour %in% matched_data,] + input_data = input_data[chrpos_tumour %in% matched_data,] + # Clean up and reduce amount of unneeded data names(input_data)[1] = "CHR" names(normal_input_data)[1] = "CHR" @@ -57,11 +65,11 @@ getBAFsAndLogRs = function(tumourAlleleCountsFile.prefix, normalAlleleCountsFile # Obtain depth for both alleles for tumour and normal len = nrow(normal_data) - normCount1 = normal_data[cbind(1:len,allele_data[,2])] - normCount2 = normal_data[cbind(1:len,allele_data[,3])] + normCount1 = normal_data[cbind(1:len,allele_data[,3])] + normCount2 = 
normal_data[cbind(1:len,allele_data[,4])] totalNormal = normCount1 + normCount2 - mutCount1 = mutant_data[cbind(1:len,allele_data[,2])] - mutCount2 = mutant_data[cbind(1:len,allele_data[,3])] + mutCount1 = mutant_data[cbind(1:len,allele_data[,3])] + mutCount2 = mutant_data[cbind(1:len,allele_data[,4])] totalMutant = mutCount1 + mutCount2 # Clean up a few unused variables to save some memory @@ -76,10 +84,6 @@ getBAFsAndLogRs = function(tumourAlleleCountsFile.prefix, normalAlleleCountsFile totalNormal = totalNormal[indices] totalMutant = totalMutant[indices] - # These variables are cleaned - #normal_data = normal_data[indices,] - #mutant_data = mutant_data[indices,] - normCount1 = normCount1[indices] normCount2 = normCount2[indices] mutCount1 = mutCount1[indices] @@ -118,7 +122,6 @@ getBAFsAndLogRs = function(tumourAlleleCountsFile.prefix, normalAlleleCountsFile write.table(alleleCounts, file=combinedAlleleCountsFile, row.names=F, quote=F, sep="\t") # Plot the raw data using ASCAT - #ascat.bc = ascat.loadData(logRmutantFile, BAFmutantFile, logRnormalFile, BAFnormalFile) # Manually create an ASCAT object, which saves reading in the above files again SNPpos = germline.BAF[,c("Chromosome", "Position")] ch = list() @@ -136,7 +139,7 @@ getBAFsAndLogRs = function(tumourAlleleCountsFile.prefix, normalAlleleCountsFile Tumor_LogR_segmented=NULL, Tumor_BAF_segmented=NULL, Tumor_counts=NULL, Germline_counts=NULL, SNPpos=tumor.LogR[,1:2], chrs=chr_names, samples=c(samplename), chrom=split_genome(tumor.LogR[,1:2]), ch=ch) - # TODO: On PD7422a this produces different plots than before (see streak on chrom 14) + ASCAT::ascat.plotRawData(ascat.bc) #, parentDir=figuresFile.prefix) } @@ -248,6 +251,7 @@ gc.correct.wgs = function(Tumour_LogR_file, outfile, correlations_outfile, gc_co Tumor_LogR = read.table(Tumour_LogR_file, stringsAsFactors=F, header=T) + print("Processing GC content data") GC_data = list() Tumor_LogR_new = list() for (chrindex in 1:length(chrom_names)) { @@ -263,15 +267,15 @@ gc.correct.wgs = function(Tumour_LogR_file, outfile, correlations_outfile, gc_co Tumor_LogR = do.call(rbind, Tumor_LogR_new) rm(Tumor_LogR_new) GC_data = do.call(rbind, GC_data) - + flag_nona = is.finite(Tumor_LogR[,3]) corr = cor(GC_data[flag_nona, 3:ncol(GC_data)], Tumor_LogR[flag_nona,3], use="complete.obs") length = nrow(Tumor_LogR) corr = apply(corr, 1, function(x) sum(abs(x*length))/sum(length)) index_1M = c(which(names(corr)=="X1M"), which(names(corr)=="X1Mb")) - maxGCcol_short = which(corr[1:(index_1M-1)]==max(corr[1:(index_1M-1)])) - maxGCcol_long = which(corr[index_1M:length(corr)]==max(corr[index_1M:length(corr)])) + maxGCcol_short = which(corr[1:(index_1M-1)]==max(corr[1:(index_1M-1)]))[1] + maxGCcol_long = which(corr[index_1M:length(corr)]==max(corr[index_1M:length(corr)]))[1] maxGCcol_long = maxGCcol_long+(index_1M-1) cat("weighted correlation: ",paste(names(corr),format(corr,digits=2), ";"),"\n") @@ -315,3 +319,71 @@ gc.correct.wgs = function(Tumour_LogR_file, outfile, correlations_outfile, gc_co write.table(corr, file=gsub(".txt", "_afterCorrection.txt", correlations_outfile), sep="\t", quote=F, row.names=F) } } + +#' Prepare WGS data for haplotype construction +#' +#' This function performs part of the Battenberg WGS pipeline: Counting alleles, constructing BAF and logR +#' and performing GC content correction. 
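As a point of reference, a minimal sketch of invoking this preprocessing step on its own; as in battenberg(), a parallel backend must be registered first because the allele counting runs inside a foreach %dopar% loop. The BAM paths, sample names and reference prefixes below are placeholders (named per the parameter documentation that follows), not values taken from this changeset.

```
library(Battenberg)
library(doParallel)
library(foreach)

# Placeholder inputs - point the prefixes at the per-chromosome 1000 Genomes
# and GC content reference files from the Battenberg reference bundle
imputeinfofile = "/opt/battenberg_reference/1000genomes_2012_v3_impute/impute_info.txt"
chrom_names = get.chrom.names(imputeinfofile, TRUE)

clp = parallel::makeCluster(4)
doParallel::registerDoParallel(clp)

prepare_wgs(chrom_names=chrom_names,
            tumourbam="tumour.bam", normalbam="normal.bam",
            tumourname="tumour_sample", normalname="normal_sample",
            g1000allelesprefix="/opt/battenberg_reference/1000G_alleles_chr",
            g1000prefix="/opt/battenberg_reference/1000G_snp_chr",
            gccorrectprefix="/opt/battenberg_reference/1000_genomes_GC_corr_chr_",
            min_base_qual=20, min_map_qual=35,
            allelecounter_exe="alleleCounter", min_normal_depth=10,
            nthreads=4, skip_allele_counting=FALSE)

parallel::stopCluster(clp)
```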
+#' +#' @param chrom_names A vector containing the names of chromosomes to be included +#' @param tumourbam Full path to the tumour BAM file +#' @param normalbam Full path to the normal BAM file +#' @param tumourname Identifier to be used for tumour output files +#' @param normalname Identifier to be used for normal output files +#' @param g1000allelesprefix Prefix path to the 1000 Genomes alleles reference files +#' @param g1000prefix Prefix path to the 1000 Genomes SNP reference files +#' @param gccorrectprefix Prefix path to GC content reference data +#' @param min_base_qual Minimum base quality required for a read to be counted +#' @param min_map_qual Minimum mapping quality required for a read to be counted +#' @param allelecounter_exe Path to the allele counter executable (can be found in $PATH) +#' @param min_normal_depth Minimum depth required in the normal for a SNP to be included +#' @param nthreads The number of parallel processes to run +#' @param skip_allele_counting Flag, set to TRUE if allele counting is already complete (files are expected in the working directory on disk) +#' @author sd11 +#' @export +prepare_wgs = function(chrom_names, tumourbam, normalbam, tumourname, normalname, g1000allelesprefix, g1000prefix, gccorrectprefix, + min_base_qual, min_map_qual, allelecounter_exe, min_normal_depth, nthreads, skip_allele_counting) { + + requireNamespace("foreach") + requireNamespace("doParallel") + requireNamespace("parallel") + + if (!skip_allele_counting) { + # Obtain allele counts for 1000 Genomes locations for both tumour and normal + foreach::foreach(i=1:length(chrom_names)) %dopar% { + getAlleleCounts(bam.file=tumourbam, + output.file=paste(tumourname,"_alleleFrequencies_chr", i, ".txt", sep=""), + g1000.loci=paste(g1000allelesprefix, i, ".txt", sep=""), + min.base.qual=min_base_qual, + min.map.qual=min_map_qual, + allelecounter.exe=allelecounter_exe) + + getAlleleCounts(bam.file=normalbam, + output.file=paste(normalname,"_alleleFrequencies_chr", i, ".txt", sep=""), + g1000.loci=paste(g1000allelesprefix, i, ".txt", sep=""), + min.base.qual=min_base_qual, + min.map.qual=min_map_qual, + allelecounter.exe=allelecounter_exe) + } + } + + # Obtain BAF and LogR from the raw allele counts + getBAFsAndLogRs(tumourAlleleCountsFile.prefix=paste(tumourname,"_alleleFrequencies_chr", sep=""), + normalAlleleCountsFile.prefix=paste(normalname,"_alleleFrequencies_chr", sep=""), + figuresFile.prefix=paste(tumourname, "_", sep=''), + BAFnormalFile=paste(tumourname,"_normalBAF.tab", sep=""), + BAFmutantFile=paste(tumourname,"_mutantBAF.tab", sep=""), + logRnormalFile=paste(tumourname,"_normalLogR.tab", sep=""), + logRmutantFile=paste(tumourname,"_mutantLogR.tab", sep=""), + combinedAlleleCountsFile=paste(tumourname,"_alleleCounts.tab", sep=""), + chr_names=chrom_names, + g1000file.prefix=g1000prefix, + minCounts=min_normal_depth, + samplename=tumourname) + # Perform GC correction + gc.correct.wgs(Tumour_LogR_file=paste(tumourname,"_mutantLogR.tab", sep=""), + outfile=paste(tumourname,"_mutantLogR_gcCorrected.tab", sep=""), + correlations_outfile=paste(tumourname, "_GCwindowCorrelations.txt", sep=""), + gc_content_file_prefix=gccorrectprefix, + chrom_names=chrom_names) +} diff --git a/R/util.R b/R/util.R index f0c8c31..be77422 100644 --- a/R/util.R +++ b/R/util.R @@ -88,16 +88,17 @@ concatenateAlleleCountFiles = function(inputStart, inputEnd, no.chrs) { #' Function to concatenate 1000 Genomes SNP reference files #' @noRd -concatenateG1000SnpFiles = function(inputStart, inputEnd, no.chrs) { 
- infiles = c() +concatenateG1000SnpFiles = function(inputStart, inputEnd, no.chrs, chr_names) { + data = list() for(chrom in 1:no.chrs) { filename = paste(inputStart, chrom, inputEnd, sep="") # Only add files that exist and have data if(file.exists(filename) && file.info(filename)$size>0) { - infiles = c(infiles, filename) + # infiles = c(infiles, filename) + data[[chrom]] = cbind(chromosome=chr_names[chrom], read_table_generic(filename)) } } - return(as.data.frame(do.call(rbind, lapply(infiles, FUN=function(x) { read_table_generic(x) })))) + return(as.data.frame(do.call(rbind, data))) } ######################################################################################## diff --git a/README.md b/README.md index 37c4262..b6c9d70 100644 --- a/README.md +++ b/README.md @@ -10,32 +10,81 @@ Please visit the [cgpBattenberg page](https://github.com/cancerit/cgpBattenberg) The instructions below will install the latest stable Battenberg version. Please take this approach only when you'd like to do something not covered by cgpBattenberg. -### Prerequisites +### Standalone + +#### Prerequisites Installing from Github requires devtools and Battenberg requires readr, RColorBrewer and ASCAT. The pipeline requires doParallel. From the command line run: - > R -q -e 'source("http://bioconductor.org/biocLite.R"); biocLite(c("devtools", "readr", "doParallel", "ggplot2", "RColorBrewer", "gridExtra", "gtools"));' - - > R -q -e 'devtools::install_github("Crick-CancerGenomics/ascat/ASCAT")' +``` +R -q -e 'source("http://bioconductor.org/biocLite.R"); biocLite(c("devtools", "readr", "doParallel", "ggplot2", "RColorBrewer", "gridExtra", "gtools"));' +R -q -e 'devtools::install_github("Crick-CancerGenomics/ascat/ASCAT")' +``` -### Installation from Github +#### Installation from Github To install Battenberg, run the following from the command line: - > R -q -e 'devtools::install_github("Wedge-Oxford/battenberg")' +``` +R -q -e 'devtools::install_github("Wedge-Oxford/battenberg")' +``` -### Required reference files +#### Required reference files Battenberg requires a number of reference files that should be downloaded. - * ftp://ftp.sanger.ac.uk/pub/teams/113/Battenberg/battenberg_impute_1000G_v1.tar.gz + * ftp://ftp.sanger.ac.uk/pub/teams/113/Battenberg/battenberg_1000genomesloci2012_v3.tar.gz + * ftp://ftp.sanger.ac.uk/pub/teams/113/Battenberg/battenberg_impute_1000G_v3.tar.gz * ftp://ftp.sanger.ac.uk/pub/teams/113/Battenberg/probloci_270415.txt.gz * ftp://ftp.sanger.ac.uk/pub/teams/113/Battenberg/battenberg_wgs_gc_correction_1000g_v3.tar.gz * ftp://ftp.sanger.ac.uk/pub/teams/113/Battenberg/battenberg_snp6_exe.tgz (SNP6 only) * ftp://ftp.sanger.ac.uk/pub/teams/113/Battenberg/battenberg_snp6_ref.tgz (SNP6 only) -### Pipeline +#### Pipeline Go into inst/example for example WGS and SNP6 R-only pipelines. - \ No newline at end of file +### Docker - experimental + +Battenberg can be run inside a Docker container. Please follow the instructions below. + +#### Installation + +``` +wget https://raw.githubusercontent.com/Wedge-Oxford/battenberg/dev/Dockerfile +docker build -t battenberg:2.2.8 . +``` + +#### Download reference data + +To do + +#### Run interactively + +These commands run the Battenberg pipeline within a Docker container in interactive mode. 
This command assumes the data is available locally in `$PWD/data/pcawg/HCC1143_ds` and the reference files have been placed in `$PWD/battenberg_reference` + +``` +docker run -it -v `pwd`/data/pcawg/HCC1143_ds:/mnt/battenberg/ -v `pwd`/battenberg_reference:/opt/battenberg_reference battenberg:2.2.8 +``` + +Within the Docker terminal run the pipeline, in this case on the ICGC PCAWG testing data available [here](https://s3-eu-west-1.amazonaws.com/wtsi-pancancer/testdata/HCC1143_ds.tar). + +``` +R CMD BATCH '--no-restore-data --no-save --args HCC1143 HCC1143_BL /mnt/battenberg/HCC1143_BL.bam /mnt/battenberg/HCC1143.bam FALSE /mnt/battenberg/' /usr/local/bin/battenberg_wgs.R /mnt/battenberg/battenberg.Rout +``` + +### Building a release + +In RStudio: In the Build tab, click Check Package + +Then open the NAMESPACE file and edit: + +``` +S3method(plot,haplotype.data) +``` + +to: + +``` +export(plot.haplotype.data) +``` diff --git a/inst/example/battenberg_snp6.R b/inst/example/battenberg_snp6.R index b2ffc39..59a2fdb 100644 --- a/inst/example/battenberg_snp6.R +++ b/inst/example/battenberg_snp6.R @@ -4,8 +4,10 @@ NORMALCEL = toString(args[2]) TUMOURCEL = toString(args[3]) RUN_DIR = toString(args[4]) +NORMALNAME = NA +SKIP_PREPROCESSING = F +SKIP_PHASING = F library(Battenberg) -library(doParallel) ############################################################################### # 2017-10-03 @@ -14,7 +16,7 @@ library(doParallel) ############################################################################### # Parallelism parameters -NTHREADS = 6 +NTHREADS = 8 # General static IMPUTEINFOFILE = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_impute/impute_info.txt" @@ -33,6 +35,8 @@ BIRDSEED_REPORT_FILE = "birdseed.report.txt" # No control over the name of this PLATFORM_GAMMA = 0.55 PHASING_GAMMA = 1 SEGMENTATION_GAMMA = 10 +SEGMENTATIIN_KMIN = 3 +PHASING_KMIN = 1 CLONALITY_DIST_METRIC = 0 ASCAT_DIST_METRIC = 1 MIN_PLOIDY = 1.6 #1.6 @@ -47,156 +51,194 @@ HETEROZYGOUSFILTER = "none" # Change to work directory and load the chromosome information setwd(RUN_DIR) -chrom_names = get.chrom.names(IMPUTEINFOFILE, TRUE) -# Setup for parallel computing -clp = makeCluster(NTHREADS) -registerDoParallel(clp) - -# Extract the LogR and BAF from both tumour and normal cel files. 
-cel2baf.logr(normal_cel_file=NORMALCEL, - tumour_cel_file=TUMOURCEL, - output_file=paste(TUMOURNAME, "_lrr_baf.txt", sep=""), - snp6_reference_info_file=SNP6_REF_INFO_FILE, - apt.probeset.genotype.exe=APT_PROBESET_GENOTYPE_EXE, - apt.probeset.summarize.exe=APT_PROBESET_SUMMARIZE_EXE, - norm.geno.clust.exe=NORM_GENO_CLUST_EXE) - -gc.correct(samplename=TUMOURNAME, - infile.logr.baf=paste(TUMOURNAME, "_lrr_baf.txt", sep=""), - outfile.tumor.LogR=paste(TUMOURNAME, "_mutantLogR.tab", sep=""), - outfile.tumor.BAF=paste(TUMOURNAME, "_mutantBAF.tab", sep=""), - outfile.normal.LogR=paste(TUMOURNAME, "_germlineLogR.tab", sep=""), - outfile.normal.BAF=paste(TUMOURNAME, "_germlineBAF.tab", sep=""), - outfile.probeBAF=paste(TUMOURNAME, "_probeBAF.txt", sep=""), - snp6_reference_info_file=SNP6_REF_INFO_FILE, +battenberg(tumourname=TUMOURNAME, + normalname=NORMALNAME, + tumour_data_file=TUMOURCEL, + normal_data_file=NORMALCEL, + imputeinfofile=IMPUTEINFOFILE, + g1000prefix=G1000PREFIX, + problemloci=PROBLEMLOCI, + data_type="snp6", + impute_exe=IMPUTE_EXE, + nthreads=NTHREADS, + platform_gamma=PLATFORM_GAMMA, + phasing_gamma=PHASING_GAMMA, + segmentation_gamma=SEGMENTATION_GAMMA, + segmentation_kmin=SEGMENTATIIN_KMIN, + phasing_kmin=PHASING_KMIN, + clonality_dist_metric=CLONALITY_DIST_METRIC, + ascat_dist_metric=ASCAT_DIST_METRIC, + min_ploidy=MIN_PLOIDY, + max_ploidy=MAX_PLOIDY, + min_rho=MIN_RHO, + min_goodness=MIN_GOODNESS_OF_FIT, + uninformative_BAF_threshold=BALANCED_THRESHOLD, + calc_seg_baf_option=CALC_SEG_BAF_OPTION, + skip_preprocessing=SKIP_PREPROCESSING, + skip_phasing=SKIP_PHASING, + snp6_reference_info_file=SNP6_REF_INFO_FILE, + apt.probeset.genotype.exe=APT_PROBESET_GENOTYPE_EXE, + apt.probeset.summarize.exe=APT_PROBESET_SUMMARIZE_EXE, + norm.geno.clust.exe=NORM_GENO_CLUST_EXE, birdseed_report_file=BIRDSEED_REPORT_FILE, - chr_names=chrom_names) - -# Infer what the gender is -gender = infer_gender_birdseed(BIRDSEED_REPORT_FILE) -is_male = gender == "male" -chrom_names = get.chrom.names(IMPUTEINFOFILE, is_male) - -foreach(chrom=1:length(chrom_names), .export=c("generate.impute.input.snp6","run.impute","combine.impute.output","GetChromosomeBAFs_SNP6","plot.haplotype.data")) %dopar% { - - # Transform input into a format that Impute2 takes - generate.impute.input.snp6(infile.germlineBAF=paste(TUMOURNAME, "_germlineBAF.tab", sep=""), - infile.tumourBAF=paste(TUMOURNAME, "_mutantBAF.tab", sep=""), - outFileStart=paste(TUMOURNAME, "_impute_input_chr", sep=""), - chrom=chrom, - chr_names=chrom_names, - problemLociFile=PROBLEMLOCI, - snp6_reference_info_file=SNP6_REF_INFO_FILE, - imputeinfofile=IMPUTEINFOFILE, - is.male=is_male, - heterozygousFilter=HETEROZYGOUSFILTER) - - # Run impute on the files - run.impute(inputfile=paste(TUMOURNAME, "_impute_input_chr", chrom, ".txt", sep=""), - outputfile.prefix=paste(TUMOURNAME, "_impute_output_chr", chrom, ".txt", sep=""), - is.male=is_male, - imputeinfofile=IMPUTEINFOFILE, - chrom=chrom, - impute.exe=IMPUTE_EXE) - - # As impute runs in windows across a chromosome we need to assemble the output - combine.impute.output(inputfile.prefix=paste(TUMOURNAME, "_impute_output_chr", chrom, ".txt", sep=""), - outputfile=paste(TUMOURNAME, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""), - is.male=is_male, - imputeinfofile=IMPUTEINFOFILE, - region.size=5000000, - chrom=chrom) - - # Transform the impute output into haplotyped BAFs - GetChromosomeBAFs_SNP6(chrom=chrom, - alleleFreqFile=paste(TUMOURNAME, "_impute_input_chr", chrom, "_withAlleleFreq.csv", 
sep=""), - haplotypeFile=paste(TUMOURNAME, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""), - samplename=TUMOURNAME, - outputfile=paste(TUMOURNAME, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""), - chr_names=chrom_names) - - # Plot what we have until this point - plot.haplotype.data(haplotyped.baf.file=paste(TUMOURNAME, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""), - imageFileName=paste(TUMOURNAME,"_chr",chrom_names[chrom],"_heterozygousData.png",sep=""), - samplename=TUMOURNAME, - chrom=chrom, - chr_names=chrom_names) - - # Cleanup temp Impute output - unlink(paste(TUMOURNAME, "_impute_output_chr", chrom, ".txt_*K.txt*", sep="")) -} - -# Kill the threads as from here its all single core -stopCluster(clp) - -# Combine all the BAF output into a single file -combine.baf.files(inputfile.prefix=paste(TUMOURNAME, "_chr", sep=""), - inputfile.postfix="_heterozygousMutBAFs_haplotyped.txt", - outputfile=paste(TUMOURNAME, "_heterozygousMutBAFs_haplotyped.txt", sep=""), - no.chrs=length(chrom_names)) - -# Segment the phased and haplotyped BAF data -# Call segment.baf.phased.sv when SVs are available - not relevant for SNP6 data -segment.baf.phased(samplename=TUMOURNAME, - inputfile=paste(TUMOURNAME, "_heterozygousMutBAFs_haplotyped.txt", sep=""), - outputfile=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), - gamma=SEGMENTATION_GAMMA, - phasegamma=PHASING_GAMMA, - kmin=3, - phasekmin=1, - calc_seg_baf_option=CALC_SEG_BAF_OPTION) - -# Fit a clonal copy number profile -fit.copy.number(samplename=TUMOURNAME, - outputfile.prefix=paste(TUMOURNAME, "_", sep=""), - inputfile.baf.segmented=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), - inputfile.baf=paste(TUMOURNAME,"_mutantBAF.tab", sep=""), - inputfile.logr=paste(TUMOURNAME,"_mutantLogR.tab", sep=""), - dist_choice=CLONALITY_DIST_METRIC, - ascat_dist_choice=ASCAT_DIST_METRIC, - min.ploidy=MIN_PLOIDY, - max.ploidy=MAX_PLOIDY, - min.rho=MIN_RHO, - max.rho=MAX_RHO, - min.goodness=MIN_GOODNESS_OF_FIT, - uninformative_BAF_threshold=BALANCED_THRESHOLD, - gamma_param=PLATFORM_GAMMA, - use_preset_rho_psi=F, - preset_rho=NA, - preset_psi=NA, - read_depth=30) - -# Go over all segments, determine which segements are a mixture of two states and fit a second CN state -callSubclones(sample.name=TUMOURNAME, - baf.segmented.file=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), - logr.file=paste(TUMOURNAME,"_mutantLogR.tab", sep=""), - rho.psi.file=paste(TUMOURNAME, "_rho_and_psi.txt",sep=""), - output.file=paste(TUMOURNAME,"_subclones.txt", sep=""), - output.figures.prefix=paste(TUMOURNAME,"_subclones_chr", sep=""), - output.gw.figures.prefix=paste(TUMOURNAME,"_BattenbergProfile", sep=""), - masking_output_file=paste(TUMOURNAME,"_segment_masking_details.txt", sep=""), - chr_names=chrom_names, - gamma=PLATFORM_GAMMA, - segmentation.gamma=NA, - siglevel=0.05, - maxdist=0.01, - noperms=1000, - max_allowed_state=250, - sv_breakpoints_file="NA", - calc_seg_baf_option=CALC_SEG_BAF_OPTION) - -# Make some post-hoc plots -make_posthoc_plots(samplename=TUMOURNAME, - logr_file=paste(TUMOURNAME, "_mutantLogR.tab", sep=""), - subclones_file=paste(TUMOURNAME, "_subclones.txt", sep=""), - rho_psi_file=paste(TUMOURNAME, "_rho_and_psi.txt", sep=""), - bafsegmented_file=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), - logrsegmented_file=paste(TUMOURNAME, ".logRsegmented.txt", sep=""), - allelecounts_file=NULL) - -# Save refit suggestions for a future rerun -cnfit_to_refit_suggestions(samplename=TUMOURNAME, - subclones_file=paste(TUMOURNAME, 
"_subclones.txt", sep=""), - rho_psi_file=paste(TUMOURNAME, "_rho_and_psi.txt", sep=""), - gamma_param=PLATFORM_GAMMA) + heterozygousFilter=HETEROZYGOUSFILTER) + + + + +# # Change to work directory and load the chromosome information +# setwd(RUN_DIR) +# chrom_names = get.chrom.names(IMPUTEINFOFILE, TRUE) +# +# # Setup for parallel computing +# clp = makeCluster(NTHREADS) +# registerDoParallel(clp) +# +# # Extract the LogR and BAF from both tumour and normal cel files. +# cel2baf.logr(normal_cel_file=NORMALCEL, +# tumour_cel_file=TUMOURCEL, +# output_file=paste(TUMOURNAME, "_lrr_baf.txt", sep=""), +# snp6_reference_info_file=SNP6_REF_INFO_FILE, +# apt.probeset.genotype.exe=APT_PROBESET_GENOTYPE_EXE, +# apt.probeset.summarize.exe=APT_PROBESET_SUMMARIZE_EXE, +# norm.geno.clust.exe=NORM_GENO_CLUST_EXE) +# +# gc.correct(samplename=TUMOURNAME, +# infile.logr.baf=paste(TUMOURNAME, "_lrr_baf.txt", sep=""), +# outfile.tumor.LogR=paste(TUMOURNAME, "_mutantLogR.tab", sep=""), +# outfile.tumor.BAF=paste(TUMOURNAME, "_mutantBAF.tab", sep=""), +# outfile.normal.LogR=paste(TUMOURNAME, "_germlineLogR.tab", sep=""), +# outfile.normal.BAF=paste(TUMOURNAME, "_germlineBAF.tab", sep=""), +# outfile.probeBAF=paste(TUMOURNAME, "_probeBAF.txt", sep=""), +# snp6_reference_info_file=SNP6_REF_INFO_FILE, +# birdseed_report_file=BIRDSEED_REPORT_FILE, +# chr_names=chrom_names) +# +# # Infer what the gender is +# gender = infer_gender_birdseed(BIRDSEED_REPORT_FILE) +# is_male = gender == "male" +# chrom_names = get.chrom.names(IMPUTEINFOFILE, is_male) +# +# foreach(chrom=1:length(chrom_names), .export=c("generate.impute.input.snp6","run.impute","combine.impute.output","GetChromosomeBAFs_SNP6","plot.haplotype.data")) %dopar% { +# +# # Transform input into a format that Impute2 takes +# generate.impute.input.snp6(infile.germlineBAF=paste(TUMOURNAME, "_germlineBAF.tab", sep=""), +# infile.tumourBAF=paste(TUMOURNAME, "_mutantBAF.tab", sep=""), +# outFileStart=paste(TUMOURNAME, "_impute_input_chr", sep=""), +# chrom=chrom, +# chr_names=chrom_names, +# problemLociFile=PROBLEMLOCI, +# snp6_reference_info_file=SNP6_REF_INFO_FILE, +# imputeinfofile=IMPUTEINFOFILE, +# is.male=is_male, +# heterozygousFilter=HETEROZYGOUSFILTER) +# +# # Run impute on the files +# run.impute(inputfile=paste(TUMOURNAME, "_impute_input_chr", chrom, ".txt", sep=""), +# outputfile.prefix=paste(TUMOURNAME, "_impute_output_chr", chrom, ".txt", sep=""), +# is.male=is_male, +# imputeinfofile=IMPUTEINFOFILE, +# chrom=chrom, +# impute.exe=IMPUTE_EXE) +# +# # As impute runs in windows across a chromosome we need to assemble the output +# combine.impute.output(inputfile.prefix=paste(TUMOURNAME, "_impute_output_chr", chrom, ".txt", sep=""), +# outputfile=paste(TUMOURNAME, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""), +# is.male=is_male, +# imputeinfofile=IMPUTEINFOFILE, +# region.size=5000000, +# chrom=chrom) +# +# # Transform the impute output into haplotyped BAFs +# GetChromosomeBAFs_SNP6(chrom=chrom, +# alleleFreqFile=paste(TUMOURNAME, "_impute_input_chr", chrom, "_withAlleleFreq.csv", sep=""), +# haplotypeFile=paste(TUMOURNAME, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""), +# samplename=TUMOURNAME, +# outputfile=paste(TUMOURNAME, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""), +# chr_names=chrom_names) +# +# # Plot what we have until this point +# plot.haplotype.data(haplotyped.baf.file=paste(TUMOURNAME, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""), +# 
imageFileName=paste(TUMOURNAME,"_chr",chrom_names[chrom],"_heterozygousData.png",sep=""), +# samplename=TUMOURNAME, +# chrom=chrom, +# chr_names=chrom_names) +# +# # Cleanup temp Impute output +# unlink(paste(TUMOURNAME, "_impute_output_chr", chrom, ".txt_*K.txt*", sep="")) +# } +# +# # Kill the threads as from here its all single core +# stopCluster(clp) +# +# # Combine all the BAF output into a single file +# combine.baf.files(inputfile.prefix=paste(TUMOURNAME, "_chr", sep=""), +# inputfile.postfix="_heterozygousMutBAFs_haplotyped.txt", +# outputfile=paste(TUMOURNAME, "_heterozygousMutBAFs_haplotyped.txt", sep=""), +# no.chrs=length(chrom_names)) +# +# # Segment the phased and haplotyped BAF data +# # Call segment.baf.phased.sv when SVs are available - not relevant for SNP6 data +# segment.baf.phased(samplename=TUMOURNAME, +# inputfile=paste(TUMOURNAME, "_heterozygousMutBAFs_haplotyped.txt", sep=""), +# outputfile=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), +# gamma=SEGMENTATION_GAMMA, +# phasegamma=PHASING_GAMMA, +# kmin=3, +# phasekmin=1, +# calc_seg_baf_option=CALC_SEG_BAF_OPTION) +# +# # Fit a clonal copy number profile +# fit.copy.number(samplename=TUMOURNAME, +# outputfile.prefix=paste(TUMOURNAME, "_", sep=""), +# inputfile.baf.segmented=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), +# inputfile.baf=paste(TUMOURNAME,"_mutantBAF.tab", sep=""), +# inputfile.logr=paste(TUMOURNAME,"_mutantLogR.tab", sep=""), +# dist_choice=CLONALITY_DIST_METRIC, +# ascat_dist_choice=ASCAT_DIST_METRIC, +# min.ploidy=MIN_PLOIDY, +# max.ploidy=MAX_PLOIDY, +# min.rho=MIN_RHO, +# max.rho=MAX_RHO, +# min.goodness=MIN_GOODNESS_OF_FIT, +# uninformative_BAF_threshold=BALANCED_THRESHOLD, +# gamma_param=PLATFORM_GAMMA, +# use_preset_rho_psi=F, +# preset_rho=NA, +# preset_psi=NA, +# read_depth=30) +# +# # Go over all segments, determine which segements are a mixture of two states and fit a second CN state +# callSubclones(sample.name=TUMOURNAME, +# baf.segmented.file=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), +# logr.file=paste(TUMOURNAME,"_mutantLogR.tab", sep=""), +# rho.psi.file=paste(TUMOURNAME, "_rho_and_psi.txt",sep=""), +# output.file=paste(TUMOURNAME,"_subclones.txt", sep=""), +# output.figures.prefix=paste(TUMOURNAME,"_subclones_chr", sep=""), +# output.gw.figures.prefix=paste(TUMOURNAME,"_BattenbergProfile", sep=""), +# masking_output_file=paste(TUMOURNAME,"_segment_masking_details.txt", sep=""), +# chr_names=chrom_names, +# gamma=PLATFORM_GAMMA, +# segmentation.gamma=NA, +# siglevel=0.05, +# maxdist=0.01, +# noperms=1000, +# max_allowed_state=250, +# sv_breakpoints_file="NA", +# calc_seg_baf_option=CALC_SEG_BAF_OPTION) +# +# # Make some post-hoc plots +# make_posthoc_plots(samplename=TUMOURNAME, +# logr_file=paste(TUMOURNAME, "_mutantLogR.tab", sep=""), +# subclones_file=paste(TUMOURNAME, "_subclones.txt", sep=""), +# rho_psi_file=paste(TUMOURNAME, "_rho_and_psi.txt", sep=""), +# bafsegmented_file=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), +# logrsegmented_file=paste(TUMOURNAME, ".logRsegmented.txt", sep=""), +# allelecounts_file=NULL) +# +# # Save refit suggestions for a future rerun +# cnfit_to_refit_suggestions(samplename=TUMOURNAME, +# subclones_file=paste(TUMOURNAME, "_subclones.txt", sep=""), +# rho_psi_file=paste(TUMOURNAME, "_rho_and_psi.txt", sep=""), +# gamma_param=PLATFORM_GAMMA) diff --git a/inst/example/battenberg_wgs.R b/inst/example/battenberg_wgs.R index 97daeaa..8fc2cdc 100644 --- a/inst/example/battenberg_wgs.R +++ b/inst/example/battenberg_wgs.R @@ -6,17 +6,20 @@ 
TUMOURBAM = toString(args[4]) IS.MALE = as.logical(args[5]) RUN_DIR = toString(args[6]) +SKIP_ALLELECOUNTING = F +SKIP_PREPROCESSING = F +SKIP_PHASING = F + library(Battenberg) -library(doParallel) ############################################################################### -# 2017-10-03 -# A pure R Battenberg v2.2.7 WGS pipeline implementation. +# 2017-11-05 +# A pure R Battenberg v2.2.8 WGS pipeline implementation. # sd11@sanger.ac.uk ############################################################################### # Parallelism parameters -NTHREADS = 6 +NTHREADS = 8 # General static IMPUTEINFOFILE = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_impute_v3/impute_info.txt" @@ -29,6 +32,8 @@ IMPUTE_EXE = "impute2" PLATFORM_GAMMA = 1 PHASING_GAMMA = 1 SEGMENTATION_GAMMA = 10 +SEGMENTATION_KMIN = 3 +PHASING_KMIN = 1 CLONALITY_DIST_METRIC = 0 ASCAT_DIST_METRIC = 1 MIN_PLOIDY = 1.6 @@ -45,168 +50,208 @@ CALC_SEG_BAF_OPTION = 1 ALLELECOUNTER = "alleleCounter" PROBLEMLOCI = "/lustre/scratch116/casm/team113/sd11/reference/GenomeFiles/battenberg_probloci/probloci_270415.txt.gz" -chrom_names = get.chrom.names(IMPUTEINFOFILE, IS.MALE) - -# Setup for parallel computing -clp = makeCluster(NTHREADS) -registerDoParallel(clp) - -# Obtain allele counts for 1000 Genomes locations for both tumour and normal -foreach(i=1:length(chrom_names), .export=c("getAlleleCounts")) %dopar% { - getAlleleCounts(bam.file=TUMOURBAM, - output.file=paste(TUMOURNAME,"_alleleFrequencies_chr", i, ".txt", sep=""), - g1000.loci=paste(G1000PREFIX_AC, i, ".txt", sep=""), - min.base.qual=MIN_BASE_QUAL, - min.map.qual=MIN_MAP_QUAL, - allelecounter.exe=ALLELECOUNTER) - - getAlleleCounts(bam.file=NORMALBAM, - output.file=paste(NORMALNAME,"_alleleFrequencies_chr", i, ".txt", sep=""), - g1000.loci=paste(G1000PREFIX_AC, i, ".txt", sep=""), - min.base.qual=MIN_BASE_QUAL, - min.map.qual=MIN_MAP_QUAL, - allelecounter.exe=ALLELECOUNTER) - -} -# Obtain BAF and LogR from the raw allele counts -getBAFsAndLogRs(tumourAlleleCountsFile.prefix=paste(TUMOURNAME,"_alleleFrequencies_chr", sep=""), - normalAlleleCountsFile.prefix=paste(NORMALNAME,"_alleleFrequencies_chr", sep=""), - figuresFile.prefix=paste(TUMOURNAME, "_", sep=''), - BAFnormalFile=paste(TUMOURNAME,"_normalBAF.tab", sep=""), - BAFmutantFile=paste(TUMOURNAME,"_mutantBAF.tab", sep=""), - logRnormalFile=paste(TUMOURNAME,"_normalLogR.tab", sep=""), - logRmutantFile=paste(TUMOURNAME,"_mutantLogR.tab", sep=""), - combinedAlleleCountsFile=paste(TUMOURNAME,"_alleleCounts.tab", sep=""), - chr_names=chrom_names, - g1000file.prefix=G1000PREFIX, - minCounts=MIN_NORMAL_DEPTH, - samplename=TUMOURNAME) -# Perform GC correction -gc.correct.wgs(Tumour_LogR_file=paste(TUMOURNAME,"_mutantLogR.tab", sep=""), - outfile=paste(TUMOURNAME,"_mutantLogR_gcCorrected.tab", sep=""), - correlations_outfile=paste(TUMOURNAME, "_GCwindowCorrelations.txt", sep=""), - gc_content_file_prefix=GCCORRECTPREFIX, - chrom_names=chrom_names) - - -# These steps are independent and can be run in parallel. This could be done through multi-threading or splitting these up into separate jobs on a cluster.
-foreach(chrom=1:length(chrom_names), .export=c("generate.impute.input.wgs","run.impute","combine.impute.output","GetChromosomeBAFs","plot.haplotype.data")) %dopar% { - print(chrom) - # Transform the allele counts into something that impute understands - generate.impute.input.wgs(chrom=chrom, - tumour.allele.counts.file=paste(TUMOURNAME,"_alleleFrequencies_chr", chrom, ".txt", sep=""), - normal.allele.counts.file=paste(NORMALNAME,"_alleleFrequencies_chr", chrom, ".txt", sep=""), - output.file=paste(TUMOURNAME, "_impute_input_chr", chrom, ".txt", sep=""), - imputeinfofile=IMPUTEINFOFILE, - is.male=IS.MALE, - problemLociFile=PROBLEMLOCI, - useLociFile=NA) - - # Run impute on the files - run.impute(inputfile=paste(TUMOURNAME, "_impute_input_chr", chrom, ".txt", sep=""), - outputfile.prefix=paste(TUMOURNAME, "_impute_output_chr", chrom, ".txt", sep=""), - is.male=IS.MALE, - imputeinfofile=IMPUTEINFOFILE, - impute.exe=IMPUTE_EXE, - region.size=5000000, - chrom=chrom) - - # As impute runs in windows across a chromosome we need to assemble the output - combine.impute.output(inputfile.prefix=paste(TUMOURNAME, "_impute_output_chr", chrom, ".txt", sep=""), - outputfile=paste(TUMOURNAME, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""), - is.male=IS.MALE, - imputeinfofile=IMPUTEINFOFILE, - region.size=5000000, - chrom=chrom) - - # Transform the impute output into haplotyped BAFs - GetChromosomeBAFs(chrom=chrom, - SNP_file=paste(TUMOURNAME, "_alleleFrequencies_chr", chrom, ".txt", sep=""), - haplotypeFile=paste(TUMOURNAME, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""), - samplename=TUMOURNAME, - outfile=paste(TUMOURNAME, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""), - chr_names=chrom_names, - minCounts=MIN_NORMAL_DEPTH) - - # Plot what we have until this point - plot.haplotype.data(haplotyped.baf.file=paste(TUMOURNAME, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""), - imageFileName=paste(TUMOURNAME,"_chr",chrom,"_heterozygousData.png",sep=""), - samplename=TUMOURNAME, - chrom=chrom, - chr_names=chrom_names) - - # Cleanup temp Impute output - unlink(paste(TUMOURNAME, "_impute_output_chr", chrom, "*K.txt*", sep="")) -} - -# Kill the threads as from here its all single core -stopCluster(clp) - -# Combine all the BAF output into a single file -combine.baf.files(inputfile.prefix=paste(TUMOURNAME, "_chr", sep=""), - inputfile.postfix="_heterozygousMutBAFs_haplotyped.txt", - outputfile=paste(TUMOURNAME, "_heterozygousMutBAFs_haplotyped.txt", sep=""), - no.chrs=length(chrom_names)) - -# Segment the phased and haplotyped BAF data -segment.baf.phased(samplename=TUMOURNAME, - inputfile=paste(TUMOURNAME, "_heterozygousMutBAFs_haplotyped.txt", sep=""), - outputfile=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), - gamma=SEGMENTATION_GAMMA, - phasegamma=PHASING_GAMMA, - kmin=3, - phasekmin=1, - calc_seg_baf_option=CALC_SEG_BAF_OPTION) +# Change to work directory and load the chromosome information +setwd(RUN_DIR) -# Fit a clonal copy number profile -fit.copy.number(samplename=TUMOURNAME, - outputfile.prefix=paste(TUMOURNAME, "_", sep=""), - inputfile.baf.segmented=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), - inputfile.baf=paste(TUMOURNAME,"_mutantBAF.tab", sep=""), - inputfile.logr=paste(TUMOURNAME,"_mutantLogR_gcCorrected.tab", sep=""), - dist_choice=CLONALITY_DIST_METRIC, - ascat_dist_choice=ASCAT_DIST_METRIC, - min.ploidy=MIN_PLOIDY, - max.ploidy=MAX_PLOIDY, - min.rho=MIN_RHO, - min.goodness=MIN_GOODNESS_OF_FIT, - 
uninformative_BAF_threshold=BALANCED_THRESHOLD, - gamma_param=PLATFORM_GAMMA, - use_preset_rho_psi=F, - preset_rho=NA, - preset_psi=NA, - read_depth=30) +battenberg(tumourname=TUMOURNAME, + normalname=NORMALNAME, + tumour_data_file=TUMOURBAM, + normal_data_file=NORMALBAM, + ismale=IS.MALE, + imputeinfofile=IMPUTEINFOFILE, + g1000prefix=G1000PREFIX, + g1000allelesprefix=G1000PREFIX_AC, + gccorrectprefix=GCCORRECTPREFIX, + problemloci=PROBLEMLOCI, + data_type="wgs", + impute_exe=IMPUTE_EXE, + allelecounter_exe=ALLELECOUNTER, + nthreads=NTHREADS, + platform_gamma=PLATFORM_GAMMA, + phasing_gamma=PHASING_GAMMA, + segmentation_gamma=SEGMENTATION_GAMMA, + segmentation_kmin=SEGMENTATION_KMIN, + phasing_kmin=PHASING_KMIN, + clonality_dist_metric=CLONALITY_DIST_METRIC, + ascat_dist_metric=ASCAT_DIST_METRIC, + min_ploidy=MIN_PLOIDY, + max_ploidy=MAX_PLOIDY, + min_rho=MIN_RHO, + min_goodness=MIN_GOODNESS_OF_FIT, + uninformative_BAF_threshold=BALANCED_THRESHOLD, + min_normal_depth=MIN_NORMAL_DEPTH, + min_base_qual=MIN_BASE_QUAL, + min_map_qual=MIN_MAP_QUAL, + calc_seg_baf_option=CALC_SEG_BAF_OPTION, + skip_allele_counting=SKIP_ALLELECOUNTING, + skip_preprocessing=SKIP_PREPROCESSING, + skip_phasing=SKIP_PHASING) -# Go over all segments, determine which segements are a mixture of two states and fit a second CN state -callSubclones(sample.name=TUMOURNAME, - baf.segmented.file=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), - logr.file=paste(TUMOURNAME,"_mutantLogR_gcCorrected.tab", sep=""), - rho.psi.file=paste(TUMOURNAME, "_rho_and_psi.txt",sep=""), - output.file=paste(TUMOURNAME,"_subclones.txt", sep=""), - output.figures.prefix=paste(TUMOURNAME,"_subclones_chr", sep=""), - output.gw.figures.prefix=paste(TUMOURNAME,"_BattenbergProfile", sep=""), - masking_output_file=paste(TUMOURNAME, "_segment_masking_details.txt", sep=""), - sv_breakpoints_file="NA", - chr_names=chrom_names, - gamma=PLATFORM_GAMMA, - segmentation.gamma=NA, - siglevel=0.05, - maxdist=0.01, - noperms=1000, - calc_seg_baf_option=CALC_SEG_BAF_OPTION) -# Make some post-hoc plots -make_posthoc_plots(samplename=TUMOURNAME, - logr_file=paste(TUMOURNAME, "_mutantLogR_gcCorrected.tab", sep=""), - subclones_file=paste(TUMOURNAME, "_subclones.txt", sep=""), - rho_psi_file=paste(TUMOURNAME, "_rho_and_psi.txt", sep=""), - bafsegmented_file=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), - logrsegmented_file=paste(TUMOURNAME, ".logRsegmented.txt", sep=""), - allelecounts_file=paste(TUMOURNAME, "_alleleCounts.tab", sep="")) -# Save refit suggestions for a future rerun -cnfit_to_refit_suggestions(samplename=TUMOURNAME, - subclones_file=paste(TUMOURNAME, "_subclones.txt", sep=""), - rho_psi_file=paste(TUMOURNAME, "_rho_and_psi.txt", sep=""), - gamma_param=PLATFORM_GAMMA) +# chrom_names = get.chrom.names(IMPUTEINFOFILE, IS.MALE) +# +# # Setup for parallel computing +# clp = makeCluster(NTHREADS) +# registerDoParallel(clp) +# +# # Obtain allele counts for 1000 Genomes locations for both tumour and normal +# foreach(i=1:length(chrom_names), .export=c("getAlleleCounts")) %dopar% { +# getAlleleCounts(bam.file=TUMOURBAM, +# output.file=paste(TUMOURNAME,"_alleleFrequencies_chr", i, ".txt", sep=""), +# g1000.loci=paste(G1000PREFIX_AC, i, ".txt", sep=""), +# min.base.qual=MIN_BASE_QUAL, +# min.map.qual=MIN_MAP_QUAL, +# allelecounter.exe=ALLELECOUNTER) +# +# getAlleleCounts(bam.file=NORMALBAM, +# output.file=paste(NORMALNAME,"_alleleFrequencies_chr", i, ".txt", sep=""), +# g1000.loci=paste(G1000PREFIX_AC, i, ".txt", sep=""), +# min.base.qual=MIN_BASE_QUAL, +#
min.map.qual=MIN_MAP_QUAL, +# allelecounter.exe=ALLELECOUNTER) +# +# } +# # Obtain BAF and LogR from the raw allele counts +# getBAFsAndLogRs(tumourAlleleCountsFile.prefix=paste(TUMOURNAME,"_alleleFrequencies_chr", sep=""), +# normalAlleleCountsFile.prefix=paste(NORMALNAME,"_alleleFrequencies_chr", sep=""), +# figuresFile.prefix=paste(TUMOURNAME, "_", sep=''), +# BAFnormalFile=paste(TUMOURNAME,"_normalBAF.tab", sep=""), +# BAFmutantFile=paste(TUMOURNAME,"_mutantBAF.tab", sep=""), +# logRnormalFile=paste(TUMOURNAME,"_normalLogR.tab", sep=""), +# logRmutantFile=paste(TUMOURNAME,"_mutantLogR.tab", sep=""), +# combinedAlleleCountsFile=paste(TUMOURNAME,"_alleleCounts.tab", sep=""), +# chr_names=chrom_names, +# g1000file.prefix=G1000PREFIX, +# minCounts=MIN_NORMAL_DEPTH, +# samplename=TUMOURNAME) +# # Perform GC correction +# gc.correct.wgs(Tumour_LogR_file=paste(TUMOURNAME,"_mutantLogR.tab", sep=""), +# outfile=paste(TUMOURNAME,"_mutantLogR_gcCorrected.tab", sep=""), +# correlations_outfile=paste(TUMOURNAME, "_GCwindowCorrelations.txt", sep=""), +# gc_content_file_prefix=GCCORRECTPREFIX, +# chrom_names=chrom_names) +# +# +# # These steps are independent and can be run in parallel. This could be done through multi-threading or splitting these up into separate jobs on a cluster. +# foreach(chrom=1:length(chrom_names), .export=c("generate.impute.input.wgs","run.impute","combine.impute.output","GetChromosomeBAFs","plot.haplotype.data")) %dopar% { +# print(chrom) +# # Transform the allele counts into something that impute understands +# generate.impute.input.wgs(chrom=chrom, +# tumour.allele.counts.file=paste(TUMOURNAME,"_alleleFrequencies_chr", chrom, ".txt", sep=""), +# normal.allele.counts.file=paste(NORMALNAME,"_alleleFrequencies_chr", chrom, ".txt", sep=""), +# output.file=paste(TUMOURNAME, "_impute_input_chr", chrom, ".txt", sep=""), +# imputeinfofile=IMPUTEINFOFILE, +# is.male=IS.MALE, +# problemLociFile=PROBLEMLOCI, +# useLociFile=NA) +# +# # Run impute on the files +# run.impute(inputfile=paste(TUMOURNAME, "_impute_input_chr", chrom, ".txt", sep=""), +# outputfile.prefix=paste(TUMOURNAME, "_impute_output_chr", chrom, ".txt", sep=""), +# is.male=IS.MALE, +# imputeinfofile=IMPUTEINFOFILE, +# impute.exe=IMPUTE_EXE, +# region.size=5000000, +# chrom=chrom) +# +# # As impute runs in windows across a chromosome we need to assemble the output +# combine.impute.output(inputfile.prefix=paste(TUMOURNAME, "_impute_output_chr", chrom, ".txt", sep=""), +# outputfile=paste(TUMOURNAME, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""), +# is.male=IS.MALE, +# imputeinfofile=IMPUTEINFOFILE, +# region.size=5000000, +# chrom=chrom) +# +# # Transform the impute output into haplotyped BAFs +# GetChromosomeBAFs(chrom=chrom, +# SNP_file=paste(TUMOURNAME, "_alleleFrequencies_chr", chrom, ".txt", sep=""), +# haplotypeFile=paste(TUMOURNAME, "_impute_output_chr", chrom, "_allHaplotypeInfo.txt", sep=""), +# samplename=TUMOURNAME, +# outfile=paste(TUMOURNAME, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""), +# chr_names=chrom_names, +# minCounts=MIN_NORMAL_DEPTH) +# +# # Plot what we have until this point +# plot.haplotype.data(haplotyped.baf.file=paste(TUMOURNAME, "_chr", chrom, "_heterozygousMutBAFs_haplotyped.txt", sep=""), +# imageFileName=paste(TUMOURNAME,"_chr",chrom,"_heterozygousData.png",sep=""), +# samplename=TUMOURNAME, +# chrom=chrom, +# chr_names=chrom_names) +# +# # Cleanup temp Impute output +# unlink(paste(TUMOURNAME, "_impute_output_chr", chrom, "*K.txt*", sep="")) +# } +# +# # 
Kill the threads as from here its all single core +# stopCluster(clp) +# +# # Combine all the BAF output into a single file +# combine.baf.files(inputfile.prefix=paste(TUMOURNAME, "_chr", sep=""), +# inputfile.postfix="_heterozygousMutBAFs_haplotyped.txt", +# outputfile=paste(TUMOURNAME, "_heterozygousMutBAFs_haplotyped.txt", sep=""), +# no.chrs=length(chrom_names)) +# +# # Segment the phased and haplotyped BAF data +# segment.baf.phased(samplename=TUMOURNAME, +# inputfile=paste(TUMOURNAME, "_heterozygousMutBAFs_haplotyped.txt", sep=""), +# outputfile=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), +# gamma=SEGMENTATION_GAMMA, +# phasegamma=PHASING_GAMMA, +# kmin=3, +# phasekmin=1, +# calc_seg_baf_option=CALC_SEG_BAF_OPTION) +# +# # Fit a clonal copy number profile +# fit.copy.number(samplename=TUMOURNAME, +# outputfile.prefix=paste(TUMOURNAME, "_", sep=""), +# inputfile.baf.segmented=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), +# inputfile.baf=paste(TUMOURNAME,"_mutantBAF.tab", sep=""), +# inputfile.logr=paste(TUMOURNAME,"_mutantLogR_gcCorrected.tab", sep=""), +# dist_choice=CLONALITY_DIST_METRIC, +# ascat_dist_choice=ASCAT_DIST_METRIC, +# min.ploidy=MIN_PLOIDY, +# max.ploidy=MAX_PLOIDY, +# min.rho=MIN_RHO, +# min.goodness=MIN_GOODNESS_OF_FIT, +# uninformative_BAF_threshold=BALANCED_THRESHOLD, +# gamma_param=PLATFORM_GAMMA, +# use_preset_rho_psi=F, +# preset_rho=NA, +# preset_psi=NA, +# read_depth=30) +# +# # Go over all segments, determine which segements are a mixture of two states and fit a second CN state +# callSubclones(sample.name=TUMOURNAME, +# baf.segmented.file=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), +# logr.file=paste(TUMOURNAME,"_mutantLogR_gcCorrected.tab", sep=""), +# rho.psi.file=paste(TUMOURNAME, "_rho_and_psi.txt",sep=""), +# output.file=paste(TUMOURNAME,"_subclones.txt", sep=""), +# output.figures.prefix=paste(TUMOURNAME,"_subclones_chr", sep=""), +# output.gw.figures.prefix=paste(TUMOURNAME,"_BattenbergProfile", sep=""), +# masking_output_file=paste(TUMOURNAME, "_segment_masking_details.txt", sep=""), +# sv_breakpoints_file="NA", +# chr_names=chrom_names, +# gamma=PLATFORM_GAMMA, +# segmentation.gamma=NA, +# siglevel=0.05, +# maxdist=0.01, +# noperms=1000, +# calc_seg_baf_option=CALC_SEG_BAF_OPTION) +# +# # Make some post-hoc plots +# make_posthoc_plots(samplename=TUMOURNAME, +# logr_file=paste(TUMOURNAME, "_mutantLogR_gcCorrected.tab", sep=""), +# subclones_file=paste(TUMOURNAME, "_subclones.txt", sep=""), +# rho_psi_file=paste(TUMOURNAME, "_rho_and_psi.txt", sep=""), +# bafsegmented_file=paste(TUMOURNAME, ".BAFsegmented.txt", sep=""), +# logrsegmented_file=paste(TUMOURNAME, ".logRsegmented.txt", sep=""), +# allelecounts_file=paste(TUMOURNAME, "_alleleCounts.tab", sep="")) +# +# # Save refit suggestions for a future rerun +# cnfit_to_refit_suggestions(samplename=TUMOURNAME, +# subclones_file=paste(TUMOURNAME, "_subclones.txt", sep=""), +# rho_psi_file=paste(TUMOURNAME, "_rho_and_psi.txt", sep=""), +# gamma_param=PLATFORM_GAMMA) +# diff --git a/man/battenberg.Rd b/man/battenberg.Rd new file mode 100644 index 0000000..a8d75d9 --- /dev/null +++ b/man/battenberg.Rd @@ -0,0 +1,108 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/battenberg.R +\name{battenberg} +\alias{battenberg} +\title{Run the Battenberg pipeline} +\usage{ +battenberg(tumourname, normalname, tumour_data_file, normal_data_file, + imputeinfofile, g1000prefix, problemloci, gccorrectprefix = NA, + g1000allelesprefix = NA, ismale = NA, data_type = "wgs", + 
impute_exe = "impute2", allelecounter_exe = "alleleCounter", + nthreads = 8, platform_gamma = 1, phasing_gamma = 1, + segmentation_gamma = 10, segmentation_kmin = 3, phasing_kmin = 1, + clonality_dist_metric = 0, ascat_dist_metric = 1, min_ploidy = 1.6, + max_ploidy = 4.8, min_rho = 0.1, min_goodness = 0.63, + uninformative_BAF_threshold = 0.51, min_normal_depth = 10, + min_base_qual = 20, min_map_qual = 35, calc_seg_baf_option = 1, + skip_allele_counting = F, skip_preprocessing = F, skip_phasing = F, + snp6_reference_info_file = NA, + apt.probeset.genotype.exe = "apt-probeset-genotype", + apt.probeset.summarize.exe = "apt-probeset-summarize", + norm.geno.clust.exe = "normalize_affy_geno_cluster.pl", + birdseed_report_file = "birdseed.report.txt", heterozygousFilter = "none") +} +\arguments{ +\item{tumourname}{Tumour identifier, this is used as a prefix for the output files. If allele counts are supplied separately, they are expected to have this identifier as prefix.} + +\item{normalname}{Matched normal identifier, this is used as a prefix for the output files. If allele counts are supplied separately, they are expected to have this identifier as prefix.} + +\item{tumour_data_file}{A BAM or CEL file for the tumour} + +\item{normal_data_file}{A BAM or CEL file for the normal} + +\item{imputeinfofile}{Full path to a Battenberg impute info file with pointers to Impute2 reference data} + +\item{g1000prefix}{Full prefix path to 1000 Genomes SNP loci data, as part of the Battenberg reference data} + +\item{problemloci}{Full path to a problem loci file that contains SNP loci that should be filtered out} + +\item{gccorrectprefix}{Full prefix path to GC content files, as part of the Battenberg reference data, not required for SNP6 data (Default: NA)} + +\item{g1000allelesprefix}{Full prefix path to 1000 Genomes SNP alleles data, as part of the Battenberg reference data, not required for SNP6 data (Default: NA)} + +\item{ismale}{A boolean set to TRUE if the donor is male, set to FALSE if female, not required for SNP6 data (Default: NA)} + +\item{data_type}{String that contains either wgs or snp6 depending on the supplied input data (Default: wgs)} + +\item{impute_exe}{Pointer to the Impute2 executable (Default: impute2, i.e. expected in $PATH)} + +\item{allelecounter_exe}{Pointer to the alleleCounter executable (Default: alleleCounter, i.e. expected in $PATH)} + +\item{nthreads}{The number of concurrent processes to use while running the Battenberg pipeline (Default: 8)} + +\item{platform_gamma}{Platform scaling factor, suggestions are set to 1 for wgs and to 0.55 for snp6 (Default: 1)} + +\item{phasing_gamma}{Gamma parameter used when correcting phasing mistakes (Default: 1)} + +\item{segmentation_gamma}{The gamma parameter controls the size of the penalty of starting a new segment during segmentation. 
It is therefore the key parameter for controlling the number of segments (Default: 10)} + +\item{segmentation_kmin}{Kmin represents the minimum number of probes/SNPs that a segment should consist of (Default: 3)} + +\item{phasing_kmin}{Kmin used when correcting for phasing mistakes (Default: 1)} + +\item{clonality_dist_metric}{Distance metric to use when choosing purity/ploidy combinations (Default: 0)} + +\item{ascat_dist_metric}{Distance metric to use when choosing purity/ploidy combinations (Default: 1)} + +\item{min_ploidy}{Minimum ploidy to be considered (Default: 1.6)} + +\item{max_ploidy}{Maximum ploidy to be considered (Default: 4.8)} + +\item{min_rho}{Minimum purity to be considered (Default: 0.1)} + +\item{min_goodness}{Minimum goodness of fit required for a purity/ploidy combination to be accepted as a solution (Default: 0.63)} + +\item{uninformative_BAF_threshold}{The threshold beyond which BAF becomes uninformative (Default: 0.51)} + +\item{min_normal_depth}{Minimum depth required in the matched normal for a SNP to be considered as part of the wgs analysis (Default: 10)} + +\item{min_base_qual}{Minimum base quality required for a read to be counted when allele counting (Default: 20)} + +\item{min_map_qual}{Minimum mapping quality required for a read to be counted when allele counting (Default: 35)} + +\item{calc_seg_baf_option}{Sets the way BAF is calculated per segment: 1=mean, 2=median (Default: 1)} + +\item{skip_allele_counting}{Provide TRUE when allele counting can be skipped (i.e. it has already been done) (Default: FALSE)} + +\item{skip_preprocessing}{Provide TRUE when preprocessing is already complete (Default: FALSE)} + +\item{skip_phasing}{Provide TRUE when phasing is already complete (Default: FALSE)} + +\item{snp6_reference_info_file}{Reference files for the SNP6 pipeline only (Default: NA)} + +\item{apt.probeset.genotype.exe}{Helper tool for extracting data from CEL files, SNP6 pipeline only (Default: apt-probeset-genotype)} + +\item{apt.probeset.summarize.exe}{Helper tool for extracting data from CEL files, SNP6 pipeline only (Default: apt-probeset-summarize)} + +\item{norm.geno.clust.exe}{Helper tool for extracting data from CEL files, SNP6 pipeline only (Default: normalize_affy_geno_cluster.pl)} + +\item{birdseed_report_file}{Sex inference output file, SNP6 pipeline only (Default: birdseed.report.txt)} + +\item{heterozygousFilter}{Legacy option to set a heterozygous SNP filter, SNP6 pipeline only (Default: "none")} +} +\description{ +Run the Battenberg pipeline +} +\author{ +sd11 +} diff --git a/man/getBAFsAndLogRs.Rd b/man/getBAFsAndLogRs.Rd index 2ffea31..1e36064 100644 --- a/man/getBAFsAndLogRs.Rd +++ b/man/getBAFsAndLogRs.Rd @@ -34,7 +34,7 @@ getBAFsAndLogRs(tumourAlleleCountsFile.prefix, normalAlleleCountsFile.prefix, \item{samplename}{String, name of the sample (optional, default=sample1).} -\item{seed}{A seed to be set} +\item{seed}{A seed to be set when randomising the alleles.} } \description{ Obtain BAF and LogR from the allele counts diff --git a/man/prepare_snp6.Rd b/man/prepare_snp6.Rd new file mode 100644 index 0000000..dfa23bc --- /dev/null +++ b/man/prepare_snp6.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/prepare_SNP6.R +\name{prepare_snp6} +\alias{prepare_snp6} +\title{Prepare SNP6 data for haplotype construction} +\usage{ +prepare_snp6(tumour_cel_file, normal_cel_file, tumourname, chrom_names, + snp6_reference_info_file, + apt.probeset.genotype.exe = "apt-probeset-genotype", +
apt.probeset.summarize.exe = "apt-probeset-summarize", + norm.geno.clust.exe = "normalize_affy_geno_cluster.pl", + birdseed_report_file = "birdseed.report.txt") +} +\arguments{ +\item{tumour_cel_file}{Full path to a CEL file containing the tumour raw data} + +\item{normal_cel_file}{Full path to a CEL file containing the normal raw data} + +\item{tumourname}{Identifier to be used for tumour output files} + +\item{chrom_names}{A vector containing the names of chromosomes to be included} + +\item{snp6_reference_info_file}{Full path to the SNP6 reference info file} + +\item{apt.probeset.genotype.exe}{Full path to the apt.probeset.genotype executable (Default: expected in $PATH)} + +\item{apt.probeset.summarize.exe}{Full path to the apt.probeset.summarize executable (Default: expected in $PATH)} + +\item{norm.geno.clust.exe}{Full path to the norm.geno.clust.exe executable (Default: expected in $PATH)} + +\item{birdseed_report_file}{Name of the birdseed output file. This is a temporary output file of one of the internally called functions and its name cannot be changed, so do not modify this parameter. (Default: birdseed.report.txt)} +} +\description{ +This function performs part of the Battenberg SNP6 pipeline: extracting BAF and logR from the CEL files +and performing GC content correction. +} +\author{ +sd11 +} diff --git a/man/prepare_wgs.Rd b/man/prepare_wgs.Rd new file mode 100644 index 0000000..638d030 --- /dev/null +++ b/man/prepare_wgs.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/prepare_wgs.R +\name{prepare_wgs} +\alias{prepare_wgs} +\title{Prepare WGS data for haplotype construction} +\usage{ +prepare_wgs(chrom_names, tumourbam, normalbam, tumourname, normalname, + g1000allelesprefix, g1000prefix, gccorrectprefix, min_base_qual, min_map_qual, + allelecounter_exe, min_normal_depth, nthreads, skip_allele_counting) +} +\arguments{ +\item{chrom_names}{A vector containing the names of chromosomes to be included} + +\item{tumourbam}{Full path to the tumour BAM file} + +\item{normalbam}{Full path to the normal BAM file} + +\item{tumourname}{Identifier to be used for tumour output files} + +\item{normalname}{Identifier to be used for normal output files} + +\item{g1000allelesprefix}{Prefix path to the 1000 Genomes alleles reference files} + +\item{g1000prefix}{Prefix path to the 1000 Genomes SNP reference files} + +\item{gccorrectprefix}{Prefix path to GC content reference data} + +\item{min_base_qual}{Minimum base quality required for a read to be counted} + +\item{min_map_qual}{Minimum mapping quality required for a read to be counted} + +\item{allelecounter_exe}{Path to the allele counter executable (can be found in $PATH)} + +\item{min_normal_depth}{Minimum depth required in the normal for a SNP to be included} + +\item{nthreads}{The number of parallel processes to run} + +\item{skip_allele_counting}{Flag, set to TRUE if allele counting is already complete (files are expected in the working directory on disk)} +} +\description{ +This function performs part of the Battenberg WGS pipeline: counting alleles, constructing BAF and logR +and performing GC content correction.
+} +\author{ +sd11 +} diff --git a/man/run_haplotyping.Rd b/man/run_haplotyping.Rd new file mode 100644 index 0000000..3e727eb --- /dev/null +++ b/man/run_haplotyping.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/impute.R +\name{run_haplotyping} +\alias{run_haplotyping} +\title{Construct haplotypes for a chromosome} +\usage{ +run_haplotyping(chrom, tumourname, normalname, ismale, imputeinfofile, + problemloci, impute_exe, min_normal_depth, chrom_names, + snp6_reference_info_file = NA, heterozygousFilter = NA) +} +\arguments{ +\item{chrom}{The chromosome for which to reconstruct haplotypes} + +\item{tumourname}{Identifier of the tumour, used to match data files on disk} + +\item{normalname}{Identifier of the normal, used to match data files on disk} + +\item{ismale}{Boolean, set to TRUE if the sample is male} + +\item{imputeinfofile}{Full path to the imputeinfo reference file} + +\item{problemloci}{Full path to the problematic loci reference file} + +\item{impute_exe}{Path to the impute executable (can be found if it is in $PATH)} + +\item{min_normal_depth}{Minimum depth in the matched normal required for a SNP to be used} + +\item{chrom_names}{A vector containing the names of chromosomes to be included} + +\item{snp6_reference_info_file}{SNP6-only parameter (Default: NA)} + +\item{heterozygousFilter}{SNP6-only parameter (Default: NA)} +} +\description{ +This function takes preprocessed data and performs haplotype reconstruction. +} +\author{ +sd11 +}
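
To make the new entry point concrete, the following is a minimal sketch of how battenberg() might be invoked for a WGS tumour/normal pair once the package is installed. The sample identifiers, BAM locations and reference paths (/path/to/...) are placeholders rather than files shipped with this change; every other argument is left at the defaults documented in man/battenberg.Rd.

library(Battenberg)

# Hypothetical sample identifiers and placeholder paths; substitute real data and
# the locations of the Battenberg reference bundle before running.
battenberg(tumourname="Tumour1",
           normalname="Normal1",
           tumour_data_file="/path/to/Tumour1.bam",
           normal_data_file="/path/to/Normal1.bam",
           ismale=TRUE,
           imputeinfofile="/path/to/impute_info.txt",
           g1000prefix="/path/to/1000genomes_loci_chr",           # 1000 Genomes SNP loci files
           g1000allelesprefix="/path/to/1000genomes_alleles_chr", # 1000 Genomes SNP alleles files
           gccorrectprefix="/path/to/1000_genomes_GC_corr_chr_",
           problemloci="/path/to/probloci_270415.txt.gz",
           data_type="wgs",
           impute_exe="impute2",
           allelecounter_exe="alleleCounter",
           nthreads=8)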
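
The new SKIP_ALLELECOUNTING, SKIP_PREPROCESSING and SKIP_PHASING switches in battenberg_wgs.R map onto the skip_allele_counting, skip_preprocessing and skip_phasing arguments. One plausible use, sketched below with the same placeholder inputs as above, is restarting in a working directory where allele counting and phasing have already completed, so that only the downstream segmentation and copy number fitting are redone.

library(Battenberg)

# Placeholder inputs from the previous sketch; intermediate files from the earlier
# run are assumed to be present in the current working directory.
shared_args = list(tumourname="Tumour1",
                   normalname="Normal1",
                   tumour_data_file="/path/to/Tumour1.bam",
                   normal_data_file="/path/to/Normal1.bam",
                   ismale=TRUE,
                   imputeinfofile="/path/to/impute_info.txt",
                   g1000prefix="/path/to/1000genomes_loci_chr",
                   g1000allelesprefix="/path/to/1000genomes_alleles_chr",
                   gccorrectprefix="/path/to/1000_genomes_GC_corr_chr_",
                   problemloci="/path/to/probloci_270415.txt.gz",
                   data_type="wgs")

# Skip the completed stages and rerun only the fitting steps
do.call(battenberg, c(shared_args,
                      list(skip_allele_counting=TRUE,
                           skip_preprocessing=TRUE,
                           skip_phasing=TRUE)))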
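
For SNP6 input the same wrapper is used with data_type="snp6" and the array-specific arguments documented in man/battenberg.Rd; according to that documentation, gccorrectprefix, g1000allelesprefix and ismale can stay at NA because GC correction uses the SNP6 reference info file and the donor sex is inferred from the birdseed report. The sketch below follows that description; all paths are again placeholders and platform_gamma is set to 0.55 as suggested in the parameter documentation.

library(Battenberg)

# Placeholder CEL files and SNP6 reference data; the apt-* helper tools are
# assumed to be available in $PATH, per the documented defaults.
battenberg(tumourname="Tumour2",
           normalname="Normal2",
           tumour_data_file="/path/to/Tumour2.CEL",
           normal_data_file="/path/to/Normal2.CEL",
           imputeinfofile="/path/to/impute_info.txt",
           g1000prefix="/path/to/1000genomes_loci_chr",
           problemloci="/path/to/probloci_270415.txt.gz",
           data_type="snp6",
           snp6_reference_info_file="/path/to/snp6_ref_info_file.txt",
           birdseed_report_file="birdseed.report.txt",
           platform_gamma=0.55,
           heterozygousFilter="none")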
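
Finally, the per-chromosome phasing loop that the example scripts used to spell out is now wrapped by the exported run_haplotyping() function, and the parallel/doParallel/foreach imports added to the NAMESPACE are what the wrapper uses to run it concurrently. Assuming prepare_wgs() has already produced the allele counts and BAF/logR files for a sample in the working directory, a manual parallel phasing pass might look roughly like the sketch below; reference paths and sample names are placeholders.

library(Battenberg)
library(parallel)
library(doParallel)
library(foreach)

# Placeholder inputs; prepare_wgs() is assumed to have been run for this sample.
tumourname = "Tumour1"
normalname = "Normal1"
ismale = TRUE
imputeinfofile = "/path/to/impute_info.txt"
problemloci = "/path/to/probloci_270415.txt.gz"
nthreads = 8

chrom_names = get.chrom.names(imputeinfofile, ismale)

# Start a worker pool, phase every chromosome independently, then shut the pool down
clp = makeCluster(nthreads)
registerDoParallel(clp)
foreach(chrom=1:length(chrom_names), .packages="Battenberg") %dopar% {
  run_haplotyping(chrom=chrom,
                  tumourname=tumourname,
                  normalname=normalname,
                  ismale=ismale,
                  imputeinfofile=imputeinfofile,
                  problemloci=problemloci,
                  impute_exe="impute2",
                  min_normal_depth=10,
                  chrom_names=chrom_names)
}
stopCluster(clp)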