Merge branch 'dev'

Wedge-lab · Mar 20, 2018 · 89323a2 · 89323a2
2 parents 5528172 + 73cdd9d
commit 89323a2
Show file tree

Hide file tree

Showing 20 changed files with 1,243 additions and 370 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -3,7 +3,7 @@ Maintainer: Stefan Dentro <sd11@sanger.ac.uk>
 License: GPL-3
 Type: Package
 Title: Battenberg subclonal copy number caller
-Version: 2.2.7
+Version: 2.2.8
 Authors@R: c(person("David", "Wedge", role=c("aut"), email="dw9@sanger.ac.uk"),
     person("Peter", "Van Loo", role=c("aut")),
     person("Stefan","Dentro", email="sd11@sanger.ac.uk", role=c("aut", "cre")),
@@ -23,7 +23,10 @@ Imports:
   ggplot2,
   readr,
   gtools,
-  gridExtra
+  gridExtra,
+  doParallel,
+  parallel,
+  foreach
 URL: https://github.com/Wedge-Oxford/battenberg
 LazyLoad: yes
 License: file LICENSE

diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,50 @@
+#FROM r-base
+FROM ubuntu:16.04
+
+# Add dependencies
+RUN apt-get update && apt-get install -y libxml2 libxml2-dev libcurl4-gnutls-dev r-cran-rgl git libssl-dev curl
+
+RUN mkdir /tmp/downloads
+
+RUN curl -sSL -o tmp.tar.gz --retry 10 https://github.com/samtools/htslib/archive/1.2.1.tar.gz && \
+    mkdir /tmp/downloads/htslib && \
+    tar -C /tmp/downloads/htslib --strip-components 1 -zxf tmp.tar.gz && \
+    make -C /tmp/downloads/htslib && \
+    rm -f /tmp/downloads/tmp.tar.gz
+
+ENV HTSLIB /tmp/downloads/htslib
+
+RUN curl -sSL -o tmp.tar.gz --retry 10 https://github.com/cancerit/alleleCount/archive/v2.1.2.tar.gz && \
+    mkdir /tmp/downloads/alleleCount && \
+    tar -C /tmp/downloads/alleleCount --strip-components 1 -zxf tmp.tar.gz && \
+    cd /tmp/downloads/alleleCount/c && \
+    mkdir bin && \
+    make && \
+    cp /tmp/downloads/alleleCount/c/bin/alleleCounter /usr/local/bin/. && \
+    cd /tmp/downloads && \
+    rm -rf /tmp/downloads/alleleCount /tmp/downloads/tmp.tar.gz
+
+RUN curl -sSL -o tmp.tar.gz --retry 10 https://mathgen.stats.ox.ac.uk/impute/impute_v2.3.2_x86_64_static.tgz && \
+    mkdir /tmp/downloads/impute2 && \
+    tar -C /tmp/downloads/impute2 --strip-components 1 -zxf tmp.tar.gz && \
+    cp /tmp/downloads/impute2/impute2 /usr/local/bin && \
+    rm -rf /tmp/downloads/impute2 /tmp/downloads/tmp.tar.gz
+
+RUN R -q -e 'source("http://bioconductor.org/biocLite.R"); biocLite(c("devtools","RColorBrewer","ggplot2","gridExtra","readr","doParallel","foreach"))'
+RUN R -q -e 'devtools::install_github("Crick-CancerGenomics/ascat/ASCAT")'
+RUN R -q -e 'devtools::install_github("Wedge-Oxford/battenberg")'
+
+RUN curl -sSL -o battenberg_wgs.R https://raw.githubusercontent.com/Wedge-Oxford/battenberg/master/inst/example/battenberg_wgs.R
+# modify paths to reference files
+RUN cat battenberg_wgs.R | \
+    sed 's|IMPUTEINFOFILE = \".*|IMPUTEINFOFILE = \"/opt/battenberg_reference/1000genomes_2012_v3_impute/impute_info.txt\"|' | \
+    sed 's|G1000PREFIX = \".*|G1000PREFIX = \"/opt/battenberg_reference/1000genomes_2012_v3_loci/1000genomesAlleles2012_chr\"|' | \
+    sed 's|G1000PREFIX_AC = \".*|G1000PREFIX_AC = \"/opt/battenberg_reference/1000genomes_2012_v3_loci/1000genomesloci2012_chr\"|' | \
+    sed 's|GCCORRECTPREFIX = \".*|GCCORRECTPREFIX = \"/opt/battenberg_reference/1000genomes_2012_v3_gcContent/1000_genomes_GC_corr_chr_\"|' | \
+    sed 's|PROBLEMLOCI = \".*|PROBLEMLOCI = \"/opt/battenberg_reference/battenberg_problem_loci/probloci_270415.txt.gz\"|' > /usr/local/bin/battenberg_wgs.R && rm -f battenberg_wgs.R
+
+#RUN curl -sSL -o battenberg_snp6.R https://raw.githubusercontent.com/Wedge-Oxford/battenberg/master/inst/example/battenberg_snp6.R
+#RUN cat battenberg_snp6.R | \
+#    sed 's|IMPUTEINFOFILE = \".*|IMPUTEINFOFILE = \"/opt/battenberg_reference/1000genomes_2012_v3_impute/impute_info.txt\"|' | \
+#    sed 's|G1000PREFIX = \".*|G1000PREFIX = \"/opt/battenberg_reference/1000genomes_2012_v3_loci/1000genomesAlleles2012_chr\"|' | \
+#    sed 's|SNP6_REF_INFO_FILE = \".*|SNP6_REF_INFO_FILE = \"/opt/battenberg_reference/battenberg_snp6/snp6_ref_info_file.txt\"|' > /usr/local/bin/battenberg_snp6.R && rm -f battenberg_wgs.R
diff --git a/NAMESPACE b/NAMESPACE
@@ -4,6 +4,7 @@ export(plot.haplotype.data)
 export(GetChromosomeBAFs)
 export(GetChromosomeBAFs_SNP6)
 export(allele_ratio_plot)
+export(battenberg)
 export(calc_psi_t)
 export(calc_rho_psi_refit)
 export(callSubclones)
@@ -24,10 +25,13 @@ export(getBAFsAndLogRs)
 export(infer_gender_birdseed)
 export(make_posthoc_plots)
 export(parse.imputeinfofile)
+export(prepare_snp6)
+export(prepare_wgs)
 export(read_table_generic)
 export(run.impute)
 export(runASCAT)
 export(run_clonal_ASCAT)
+export(run_haplotyping)
 export(segment.baf.phased)
 export(segment.baf.phased.sv)
 export(squaresplot)
@@ -43,7 +47,12 @@ importFrom(ASCAT,ascat.plotNonRounded)
 importFrom(ASCAT,ascat.plotSunrise)
 importFrom(ASCAT,make_segments)
 importFrom(RColorBrewer,brewer.pal)
+importFrom(doParallel,registerDoParallel)
+importFrom(foreach,"%dopar%")
+importFrom(foreach,foreach)
 importFrom(gridExtra,arrangeGrob)
 importFrom(gridExtra,grid.arrange)
 importFrom(gtools,mixedsort)
+importFrom(parallel,makeCluster)
+importFrom(parallel,stopCluster)
 importFrom(readr,read_table)
diff --git a/R/Battenberg-package.R b/R/Battenberg-package.R
@@ -4,4 +4,7 @@
 #' @importFrom gridExtra grid.arrange arrangeGrob
 #' @importFrom ASCAT make_segments ascat.plotSunrise ascat.plotAscatProfile ascat.plotNonRounded
 #' @importFrom gtools mixedsort
+#' @importFrom parallel makeCluster stopCluster
+#' @importFrom doParallel registerDoParallel
+#' @importFrom foreach foreach %dopar%
 NULL
diff --git a/R/battenberg.R b/R/battenberg.R
@@ -0,0 +1,230 @@
+
+#' Run the Battenberg pipeline
+#' 
+#' @param tumourname Tumour identifier, this is used as a prefix for the output files. If allele counts are supplied separately, they are expected to have this identifier as prefix.
+#' @param normalname Matched normal identifier, this is used as a prefix for the output files. If allele counts are supplied separately, they are expected to have this identifier as prefix.
+#' @param tumour_data_file A BAM or CEL file for the tumour
+#' @param normal_data_file A BAM or CEL file for the normal
+#' @param imputeinfofile Full path to a Battenberg impute info file with pointers to Impute2 reference data
+#' @param g1000prefix Full prefix path to 1000 Genomes SNP loci data, as part of the Battenberg reference data
+#' @param problemloci Full path to a problem loci file that contains SNP loci that should be filtered out
+#' @param gccorrectprefix Full prefix path to GC content files, as part of the Battenberg reference data, not required for SNP6 data (Default: NA)
+#' @param g1000allelesprefix Full prefix path to 1000 Genomes SNP alleles data, as part of the Battenberg reference data, not required for SNP6 data (Default: NA)
+#' @param ismale A boolean set to TRUE if the donor is male, set to FALSE if female, not required for SNP6 data (Default: NA)
+#' @param data_type String that contains either wgs or snp6 depending on the supplied input data (Default: wgs)
+#' @param impute_exe Pointer to the Impute2 executable (Default: impute2, i.e. expected in $PATH)
+#' @param allelecounter_exe Pointer to the alleleCounter executable (Default: alleleCounter, i.e. expected in $PATH)
+#' @param nthreads The number of concurrent processes to use while running the Battenberg pipeline (Default: 8)
+#' @param platform_gamma Platform scaling factor, suggestions are set to 1 for wgs and to 0.55 for snp6 (Default: 1)
+#' @param phasing_gamma Gamma parameter used when correcting phasing mistakes (Default: 1)
+#' @param segmentation_gamma The gamma parameter controls the size of the penalty of starting a new segment during segmentation. It is therefore the key parameter for controlling the number of segments (Default: 10)
+#' @param segmentation_kmin Kmin represents the minimum number of probes/SNPs that a segment should consist of (Default: 3)
+#' @param phasing_kmin Kmin used when correcting for phasing mistakes (Default: 3)
+#' @param clonality_dist_metric  Distance metric to use when choosing purity/ploidy combinations (Default: 0)
+#' @param ascat_dist_metric Distance metric to use when choosing purity/ploidy combinations (Default: 1)
+#' @param min_ploidy Minimum ploidy to be considered (Default: 1.6)
+#' @param max_ploidy Maximum ploidy to be considered (Default: 4.8)
+#' @param min_rho Minimum purity to be considered (Default: 0.1)
+#' @param min_goodness Minimum goodness of fit required for a purity/ploidy combination to be accepted as a solution (Default: 0.63)
+#' @param uninformative_BAF_threshold The threshold beyond which BAF becomes uninformative (Default: 0.51)
+#' @param min_normal_depth Minimum depth required in the matched normal for a SNP to be considered as part of the wgs analysis (Default: 10)
+#' @param min_base_qual Minimum base quality required for a read to be counted when allele counting (Default: 20)
+#' @param min_map_qual Minimum mapping quality required for a read to be counted when allele counting (Default: 35)
+#' @param calc_seg_baf_option Sets way to calculate BAF per segment: 1=mean, 2=median (Default: 1)
+#' @param skip_allele_counting Provide TRUE when allele counting can be skipped (i.e. its already done) (Default: FALSE)
+#' @param skip_preprocessing Provide TRUE when preprocessing is already complete (Default: FALSE)
+#' @param skip_phasing  Provide TRUE when phasing is already complete (Default: FALSE)
+#' @param snp6_reference_info_file Reference files for the SNP6 pipeline only (Default: NA)
+#' @param apt.probeset.genotype.exe Helper tool for extracting data from CEL files, SNP6 pipeline only (Default: apt-probeset-genotype)
+#' @param apt.probeset.summarize.exe  Helper tool for extracting data from CEL files, SNP6 pipeline only (Default: apt-probeset-summarize)
+#' @param norm.geno.clust.exe  Helper tool for extracting data from CEL files, SNP6 pipeline only (Default: normalize_affy_geno_cluster.pl)
+#' @param birdseed_report_file Sex inference output file, SNP6 pipeline only (Default: birdseed.report.txt)
+#' @param heterozygousFilter Legacy option to set a heterozygous SNP filter, SNP6 pipeline only (Default: "none")
+#' @author sd11
+#' @export
+battenberg = function(tumourname, normalname, tumour_data_file, normal_data_file, imputeinfofile, g1000prefix, problemloci, 
+                      gccorrectprefix=NA, g1000allelesprefix=NA, ismale=NA, data_type="wgs", impute_exe="impute2", allelecounter_exe="alleleCounter", nthreads=8, platform_gamma=1, phasing_gamma=1,
+                      segmentation_gamma=10, segmentation_kmin=3, phasing_kmin=1, clonality_dist_metric=0, ascat_dist_metric=1, min_ploidy=1.6,
+                      max_ploidy=4.8, min_rho=0.1, min_goodness=0.63, uninformative_BAF_threshold=0.51, min_normal_depth=10, min_base_qual=20, 
+                      min_map_qual=35, calc_seg_baf_option=1, skip_allele_counting=F, skip_preprocessing=F, skip_phasing=F,
+                      snp6_reference_info_file=NA, apt.probeset.genotype.exe="apt-probeset-genotype", apt.probeset.summarize.exe="apt-probeset-summarize", 
+                      norm.geno.clust.exe="normalize_affy_geno_cluster.pl", birdseed_report_file="birdseed.report.txt", heterozygousFilter="none") {
+
+  requireNamespace("foreach")
+  requireNamespace("doParallel")
+  requireNamespace("parallel")
+
+  if (data_type=="wgs" & is.na(ismale)) {
+    print("Please provide a boolean denominator whether this sample represents a male donor")
+    q(save="no", status=1)
+  }
+
+  if (data_type=="wgs" & is.na(g1000allelesprefix)) {
+    print("Please provide a path to 1000 Genomes allele reference files")
+    q(save="no", status=1)
+  }
+
+  if (data_type=="wgs" & is.na(gccorrectprefix)) {
+    print("Please provide a path to GC content reference files")
+    q(save="no", status=1)
+  }
+
+  if (data_type=="wgs" | data_type=="WGS") {
+    chrom_names = get.chrom.names(imputeinfofile, ismale)
+    logr_file = paste(tumourname, "_mutantLogR_gcCorrected.tab", sep="")
+    allelecounts_file = paste(tumourname, "_alleleCounts.tab", sep="")
+  } else if (data_type=="snp6" | data_type=="SNP6") {
+    chrom_names = get.chrom.names(imputeinfofile, TRUE)
+    logr_file = paste(tumourname, "_mutantLogR.tab", sep="")
+    allelecounts_file = NULL
+  }
+
+  if (!skip_preprocessing) {
+    if (data_type=="wgs" | data_type=="WGS") {
+      # Setup for parallel computing
+      clp = parallel::makeCluster(nthreads)
+      doParallel::registerDoParallel(clp)
+
+      prepare_wgs(chrom_names=chrom_names, 
+                  tumourbam=tumour_data_file, 
+                  normalbam=normal_data_file, 
+                  tumourname=tumourname, 
+                  normalname=normalname, 
+                  g1000allelesprefix=g1000allelesprefix, 
+                  g1000prefix=g1000prefix, 
+                  gccorrectprefix=gccorrectprefix, 
+                  min_base_qual=min_base_qual, 
+                  min_map_qual=min_map_qual, 
+                  allelecounter_exe=allelecounter_exe, 
+                  min_normal_depth=min_normal_depth, 
+                  nthreads=nthreads,
+                  skip_allele_counting=skip_allele_counting)
+
+      # Kill the threads
+      parallel::stopCluster(clp)
+
+    } else if (data_type=="snp6" | data_type=="SNP6") {
+
+      prepare_snp6(tumour_cel_file=tumour_data_file, 
+                   normal_cel_file=normal_data_file, 
+                   tumourname=tumourname, 
+                   chrom_names=chrom_names, 
+                   snp6_reference_info_file=snp6_reference_info_file, 
+                   apt.probeset.genotype.exe=apt.probeset.genotype.exe, 
+                   apt.probeset.summarize.exe=apt.probeset.summarize.exe,
+                   norm.geno.clust.exe=norm.geno.clust.exe, 
+                   birdseed_report_file=birdseed_report_file)
+
+    } else {
+      print("Unknown data type provided, please provide wgs or snp6")
+      q(save="no", status=1)
+    }
+  }
+
+  if (data_type=="snp6" | data_type=="SNP6") {
+    # Infer what the gender is - WGS requires it to be specified
+    gender = infer_gender_birdseed(birdseed_report_file)
+    ismale = gender == "male"
+  }
+
+  if (!skip_phasing) {
+    # Setup for parallel computing
+    clp = parallel::makeCluster(nthreads)
+    doParallel::registerDoParallel(clp)
+
+    # Reconstruct haplotypes 
+    # mclapply(1:length(chrom_names), function(chrom) {
+    foreach::foreach (chrom=1:length(chrom_names)) %dopar% {
+      print(chrom)
+
+      run_haplotyping(chrom=chrom, 
+                      tumourname=tumourname, 
+                      normalname=normalname, 
+                      ismale=ismale, 
+                      imputeinfofile=imputeinfofile, 
+                      problemloci=problemloci, 
+                      impute_exe=impute_exe, 
+                      min_normal_depth=min_normal_depth,
+  		                chrom_names=chrom_names,
+  		                snp6_reference_info_file=snp6_reference_info_file,
+  		                heterozygousFilter=heterozygousFilter)
+    }#, mc.cores=nthreads)
+
+    # Kill the threads as from here its all single core
+    parallel::stopCluster(clp)
+
+    # Combine all the BAF output into a single file
+    combine.baf.files(inputfile.prefix=paste(tumourname, "_chr", sep=""), 
+                      inputfile.postfix="_heterozygousMutBAFs_haplotyped.txt", 
+                      outputfile=paste(tumourname, "_heterozygousMutBAFs_haplotyped.txt", sep=""),
+                      no.chrs=length(chrom_names))
+  }
+
+  # Segment the phased and haplotyped BAF data
+  segment.baf.phased(samplename=tumourname,
+                     inputfile=paste(tumourname, "_heterozygousMutBAFs_haplotyped.txt", sep=""), 
+                     outputfile=paste(tumourname, ".BAFsegmented.txt", sep=""),
+                     gamma=segmentation_gamma,
+                     phasegamma=phasing_gamma,
+                     kmin=segmentation_kmin,
+                     phasekmin=phasing_kmin,
+                     calc_seg_baf_option=calc_seg_baf_option)
+
+  # Fit a clonal copy number profile
+  fit.copy.number(samplename=tumourname,
+                  outputfile.prefix=paste(tumourname, "_", sep=""),
+                  inputfile.baf.segmented=paste(tumourname, ".BAFsegmented.txt", sep=""), 
+                  inputfile.baf=paste(tumourname,"_mutantBAF.tab", sep=""), 
+                  inputfile.logr=logr_file, 
+                  dist_choice=clonality_dist_metric, 
+                  ascat_dist_choice=ascat_dist_metric, 
+                  min.ploidy=min_ploidy, 
+                  max.ploidy=max_ploidy, 
+                  min.rho=min_rho, 
+                  min.goodness=min_goodness, 
+                  uninformative_BAF_threshold=uninformative_BAF_threshold, 
+                  gamma_param=platform_gamma, 
+                  use_preset_rho_psi=F, 
+                  preset_rho=NA, 
+                  preset_psi=NA, 
+                  read_depth=30)
+
+  # Go over all segments, determine which segements are a mixture of two states and fit a second CN state
+  callSubclones(sample.name=tumourname, 
+                baf.segmented.file=paste(tumourname, ".BAFsegmented.txt", sep=""), 
+                logr.file=logr_file, 
+                rho.psi.file=paste(tumourname, "_rho_and_psi.txt",sep=""), 
+                output.file=paste(tumourname,"_subclones.txt", sep=""), 
+                output.figures.prefix=paste(tumourname,"_subclones_chr", sep=""), 
+                output.gw.figures.prefix=paste(tumourname,"_BattenbergProfile", sep=""),
+                masking_output_file=paste(tumourname, "_segment_masking_details.txt", sep=""),
+                sv_breakpoints_file="NA",
+                chr_names=chrom_names, 
+                gamma=platform_gamma, 
+                segmentation.gamma=NA, 
+                siglevel=0.05, 
+                maxdist=0.01, 
+                noperms=1000,
+                calc_seg_baf_option=calc_seg_baf_option)
+
+  # Make some post-hoc plots
+  make_posthoc_plots(samplename=tumourname, 
+                     logr_file=logr_file, 
+                     subclones_file=paste(tumourname, "_subclones.txt", sep=""), 
+                     rho_psi_file=paste(tumourname, "_rho_and_psi.txt", sep=""), 
+                     bafsegmented_file=paste(tumourname, ".BAFsegmented.txt", sep=""), 
+                     logrsegmented_file=paste(tumourname, ".logRsegmented.txt", sep=""), 
+                     allelecounts_file=allelecounts_file)
+
+  # Save refit suggestions for a future rerun
+  cnfit_to_refit_suggestions(samplename=tumourname, 
+                             subclones_file=paste(tumourname, "_subclones.txt", sep=""),
+                             rho_psi_file=paste(tumourname, "_rho_and_psi.txt", sep=""),
+                             gamma_param=platform_gamma)
+
+
+
+
+
+}
+
diff --git a/R/clonal_ascat.R b/R/clonal_ascat.R
@@ -298,14 +298,11 @@ calc_LogR_Pvalue <-function( LogR, maxdist_LogR, LogR_level ) # kjd 27-2-2014
 
 }
 
-#' Helper function to estimate rho from a given copy number state and it's BAF and LogR
+#' Helper function to estimate rho from a given copy number state and it's BAF. The LogR is not used.
 #' @noRd
 estimate_rho <-function( LogR_value, BAFreq_value, nA_value, nB_value ) # kjd 10-3-2014
 {
-  temp_value = BAFreq_value * ( 2 - nA_value - nB_value )
-  temp_value = nB_value - 1 - temp_value
-  rho_value =  ( ( 2 * BAFreq_value ) - 1 ) / temp_value
-
+  rho_value = (2*BAFreq_value-1)/(2*BAFreq_value-BAFreq_value*(nA_value+nB_value)-1+nA_value)
   return( rho_value )
 
 }