diff --git a/DNAcopy_1.48.0.tar.gz b/DNAcopy_1.48.0.tar.gz new file mode 100644 index 0000000..240b17f Binary files /dev/null and b/DNAcopy_1.48.0.tar.gz differ diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..e08bffb --- /dev/null +++ b/Dockerfile @@ -0,0 +1,52 @@ +FROM ubuntu:14.04 + +MAINTAINER Jeltje van Baren, jeltje.van.baren@gmail.com + +# create a working directory and work from there +RUN mkdir /tmp/install +WORKDIR /tmp/install + +RUN apt-get update && apt-get install -y \ + gcc \ + make \ + zlib1g-dev \ + git \ + wget \ + python-numpy \ + default-jre \ + r-base \ + bc + +# DNAcopy version keeps changing so deprecated this: +# R and DNAcopy package (move to R library location) +#RUN apt-get install -y r-base +#RUN wget http://www.bioconductor.org/packages/release/bioc/src/contrib/DNAcopy_1.40.0.tar.gz +# instead: +COPY ./DNAcopy_1.48.0.tar.gz ./ +RUN R CMD INSTALL DNAcopy_1.48.0.tar.gz + +# Samtools 0.1.18 - note: 0.1.19 and 1.1 do NOT work, VarScan copynumber dies on the mpileup +RUN wget http://downloads.sourceforge.net/project/samtools/samtools/0.1.18/samtools-0.1.18.tar.bz2 +RUN tar -xvf samtools-0.1.18.tar.bz2 +# the make command generates a lot of warnings, none of them relevant to the final samtools code, hence 2>/dev/null +RUN (cd samtools-0.1.18/ && make DFLAGS='-D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -D_USE_KNETFILE -D_CURSES_LIB=0' LIBCURSES='' 2>/dev/null && mv samtools /usr/local/bin) + +# get varscan +RUN wget -O /usr/local/bin/VarScan.jar https://github.com/dkoboldt/varscan/releases/download/2.4.2/VarScan.v2.4.2.jar + +# Move wrapper and helper scripts to same location +ADD ./run_varscan /usr/local/bin/ +ADD ./separateArms.py /usr/local/bin/ +ADD ./basicDNAcopy.R /usr/local/bin/ +ADD ./meanLogRatioByChromosome.py /usr/local/bin/ + +# Set WORKDIR to /data -- predefined mount location. +RUN mkdir /data +WORKDIR /data + +# And clean up +RUN apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/install + +ENTRYPOINT ["bash", "/usr/local/bin/run_varscan"] + + diff --git a/Dockstore.cwl b/Dockstore.cwl new file mode 100644 index 0000000..cfc819c --- /dev/null +++ b/Dockstore.cwl @@ -0,0 +1,68 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.0 +class: CommandLineTool + +id: Varscan2 +label: Varscan2 workflow + +baseCommand: ['-s', './'] + +description: | + A Docker container for a Varscan2 workflow. See the [github repo](https://github.com/Jeltje/varscan2) for more information. + +requirements: + - class: DockerRequirement + dockerPull: quay.io/ucsc_cgl/varscan_cnv + +inputs: + genome: + type: File + doc: Genome fasta + format: http://edamontology.org/format_1929 + inputBinding: + prefix: -i + + centromeres: + type: File + doc: Centromere bed file + format: http://edamontology.org/format_3003 + inputBinding: + prefix: -b + + targets: + type: File + doc: Exome Targets bed file + format: http://edamontology.org/format_3003 + inputBinding: + prefix: -w + + control_bam_input: + type: File + doc: The control exome BAM file used as input, it must be sorted. + format: http://edamontology.org/format_2572 + inputBinding: + prefix: -c + + tumor_bam_input: + type: File + doc: The tumor exome BAM file used as input, it must be sorted. + format: http://edamontology.org/format_2572 + inputBinding: + prefix: -t + + sample_id: + type: string? + default: mypatient + doc: sample ID to use in output + inputBinding: + prefix: -q + + +stdout: output.cnv + +outputs: + - id: output + type: stdout + + diff --git a/Dockstore.json b/Dockstore.json new file mode 100644 index 0000000..841106a --- /dev/null +++ b/Dockstore.json @@ -0,0 +1,26 @@ +{ + "output": { + "path": "/tmp/varscan.cnv", + "class": "File" + }, + "genome": { + "path": "http://hgwdev.cse.ucsc.edu/~jeltje/public_data/genome.fa.gz", + "class": "File" + }, + "control_bam_input": { + "path":"https://dcc.icgc.org/api/v1/download?fn=/PCAWG/reference_data/data_for_testing/HCC1143_ds/HCC1143_BL.bam", + "class": "File" + }, + "centromeres": { + "path": "http://hgwdev.cse.ucsc.edu/~jeltje/public_data/centromeres.bed", + "class": "File" + }, + "tumor_bam_input": { + "path":"https://dcc.icgc.org/api/v1/download?fn=/PCAWG/reference_data/data_for_testing/HCC1143_ds/HCC1143_BL.bam", + "class": "File" + }, + "targets": { + "path": "http://hgwdev.cse.ucsc.edu/~jeltje/public_data/targets.bed", + "class": "File" + } +} diff --git a/README.md b/README.md index 55a1b8c..aa453fa 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,123 @@ -# dockstore_tool_varscan_cnv -Varscan copynumber variation caller workflow +# Varscan2 + +**This repository contains code to create a docker implementation of the Varscan2.4.2 copynumber variation (CNV) caller.** + +Varscan2 was developed by Dan Koboldt (see References below). It can be used to detect copy number variation (CNV) in sample pairs, usually exomes from a tumor and control from one patient. + +The Varscan2 executable (https://github.com/dkoboldt/varscan.git) combines several tools. It is meant to be run in a pipeline, during which different tools are called in sequence. For details on Varscan, see http://dkoboldt.github.io/varscan/ + + +This repository ONLY contains a pipeline for Varscan2 **copynumber variation**. If you want to run other Varscan tools, please use Varscan2 directly. This docker container contains a wrapper script that uses Varscan tools and other programs *with specific parameters*. These may not be the perfect parameters for your particular samples. See below for the full list of pipeline steps. + +**Inputs** to the program are a tumor/control pair of BAM files, a genome fasta file and several [bed format](https://genome.ucsc.edu/FAQ/FAQformat#format1) helper files (see below). The program will sort the bam files by coordinate if it finds this hasn't been done. +**Output** is a file with chromosome segments that are scored for amplification or deletion. + +To get per-gene output, these scores must be mapped to an annotation, for example using [this program] (https://github.com/Jeltje/cnvtogenes) + +## The code + +The Varscan wrapper script runs the following: + +1. samtools flagstat on each bam file +2. samtools mpileup on both bam files +3. Determine unique mapped read ratio +4. Varscan copynumber +5. Remove low coverage regions +6. Varscan copyCaller +7. Calculate median for recentering +8. Varscan copyCaller recenter +9. Separate chromosome arms +10. DNAcopy (CBS) +11. Merge chromosome arms + +The chromosome arms are separated before the Circular Binary Segmentation (CBS) step to avoid making calls across centromeres. + +## Getting the docker container + +The latest Varscan docker image can be downloaded directly from quay.io using +`docker pull quay.io/jeltje/varscan2` + +Alternatively, you can build from the github repo: +``` +git clone https://github.com/jeltje/varscan2.git +cd varscan2 +docker build -t jeltje/varscan2 . +``` + +## Running the docker container + +For details on running docker containers in general, see the excellent tutorial at https://docs.docker.com/ + +To see a usage statement, run + +``` +docker run jeltje/varscan2 -h +``` + +### Example input: + +``` +docker run -v /path/to/input/files:/data jeltje/varscan2 -c normal.bam -t tumor.bam -q sampleid -i genome.fa -b centromeres.bed -w targets.bed -s tmpdir > varscan.cnv + +``` + +where + +`normal.bam` and `tumor.bam` are BAM format files of exome reads aligned to the genome. + +`sampleid` is an identifier for the patient. This will be used in the output. + +`genome.fa` is a fasta file containing the genome that was used to create the BAM files. This file can be gzipped (and must end with `.gz` if so). A samtools indexed `.fai` will be created if it is not present in the same directory as this file (for details see Other Considerations, below). + +`centromeres.bed` is a [bed format file](https://genome.ucsc.edu/FAQ/FAQformat#format1) containing centromere locations. This list is used to remove centromeres from the CBS calls. + +`targets.bed` is a list of exome targets in bed format. This is used as a 'whitelist' of genome regions so that off target alignments will not be used for analysis + +`tmpdir` is a directory for temporary output files. If you set option -d, these files will be kept + +Keep in mind that all these file locations must be with respect to your `/path/to/input/files`. + +Centromeres for hg19 are provided ind the `/data` directory + +> You can find centromere locations for genomes via +> http://genome.ucsc.edu/cgi-bin/hgTables +> Using the following selections: +> - group: Mapping and Sequencing +> - track:gap +> - filter - goes to new page, look for 'type does match' and type centromere, submit +> - output format: bed +> Submit, on the next page just press Get Bed +*SPECIAL NOTE* In newer genome versions there is a separate `centromere` track, but it may show multiple overlapping centromeres for the same chromosome. These must be reduced to a single centromere per chromosome, or the program will not work correctly. + +## Output + +Output is written to `STDOUT` and uses the following format: +``` +sampleID chrom loc.start loc.end num.mark seg.mean + +``` + +To get amplified or deleted segments from this file, a threshold must be applied. This is often set to `0.25/-0.25`, +and with a minimum number of 10 markers per segment. + + +## Other considerations + +Tumor and control really must be from the same patient and processed in the same experiment. Batch effects are strong in exome experiments and using the wrong control renders Varscan output meaningless. + +The genome file will be unzipped and/or indexed using `samtools faidx` if necessary. +To save time and index a genome, run +``` + samtools faidx +``` +This creates an index named genome.fa.fai +The genome and the index must be in the same directory, and the genome file (not the index) is the input to run_varscan + +The whitelist is a bed format file with the exome targets used in the experiment. It ensures that Varscan only uses target regions for its analysis and not any off target read matches. It is important to use the real list of exome targets. For meaningful results do not use a generic list. + + +## References + +Koboldt DC, Zhang Q, Larson DE, Shen D, McLellan MD, Lin L, Miller CA, Mardis ER, Ding L, Wilson RK. +VarScan 2: somatic mutation and copy number alteration discovery in cancer by exome sequencing. +Genome Res. 2012 Mar;22(3):568-76. doi: 10.1101/gr.129684.111. diff --git a/basicDNAcopy.R b/basicDNAcopy.R new file mode 100755 index 0000000..f2de372 --- /dev/null +++ b/basicDNAcopy.R @@ -0,0 +1,24 @@ +library(DNAcopy) + +args <- commandArgs(TRUE) + +# arguments are input and SD + +# Alway use the same random seed for reproducible results +set.seed(0xcafe) # cafe is a hex number + +cn <- read.table(args[1],header=TRUE) +sd <- as.double(args[2]) +CNA.object <-CNA(genomdat = cn$adjusted_log_ratio, chrom = cn$chrom, maploc = cn$chr_stop, + data.type = 'logratio', sampleid = "sample") + +smoothed.CNA.object <- smooth.CNA(CNA.object) + +segment.smoothed.CNA.object <- segment(smoothed.CNA.object, undo.splits="sdundo", undo.SD=sd, verbose=1) +p.segment.smoothed.CNA.object <- segments.p(segment.smoothed.CNA.object) + +outfile <- paste(args[1], "SD", sd, "dnacopy.out", sep=".") + +write.table(p.segment.smoothed.CNA.object[,1:6], file=outfile, quote=F, row.names=F, sep="\t") + +detach(package:DNAcopy) diff --git a/meanLogRatioByChromosome.py b/meanLogRatioByChromosome.py new file mode 100755 index 0000000..e6eecfa --- /dev/null +++ b/meanLogRatioByChromosome.py @@ -0,0 +1,64 @@ +#!/usr/bin/python + +import sys, os, re, argparse +from numpy import * + +parser = argparse.ArgumentParser(description="Calculates log average per chromosome") +parser.add_argument('cpcalled', type=str,help="copyCalled output") + +class Chrom(object): + """Holds fragment scores by chromosomes """ + def __init__(self, chrom, score, endpos): + self.frags = [] + self.chrom = chrom + self.add(chrom, score, endpos) + def add(self, chrom, score, endpos): + if self.chrom == chrom: + self.frags.append(score) + self.lastfrag = endpos + return True + else: + return False + def stats(self): + self.mean = mean(self.frags) + self.std = std(self.frags) + + +if len(sys.argv)==1: + parser.print_help() + sys.exit(1) +args = parser.parse_args() + +# Main +chromTable = [] # holds chrom objects +curChrom = Chrom('empty', 0, 0) +f = open(args.cpcalled,'r') +for line in f: + line = line.strip() + fields = line.split("\t") + chr = fields[0] + if chr == "chrom": + continue + score = float(fields[6]) + endpos = int(fields[2]) + if not (curChrom.add(chr, score, endpos)): + curChrom = Chrom(chr, score, endpos) + chromTable.append(curChrom) +f.close() + +if len(chromTable) < 3: + print >>sys.stderr, "ERROR: Please enter whole genome file" + sys.exit(1) + +means = [] +for chr in chromTable: + if chr.lastfrag < 60000000: # skip MT, GL, Y +# if len(chr.frags) < 3000: # skip MT, GL, Y + continue + chr.stats() + means.append(chr.mean) + +med = len(means)/2 # this rounds, which is perfect +means.sort() +print "%.2f" % mean([means[med-1], means[med], means[med+1]]) + diff --git a/run_varscan b/run_varscan new file mode 100755 index 0000000..59a2329 --- /dev/null +++ b/run_varscan @@ -0,0 +1,337 @@ +#! /bin/bash + +print_usage(){ +>&2 cat < -t -q -i -b -w -s + Wrapper script for Varscan2 + Runs the following steps: + 1. samtools flagstat on each bam file + 2. samtools mpileup on both bam files + 3. determine unique mapped read ratio + 4. VarScan copynumber + 5. Remove low coverage regions + 6. VarScan copyCaller + 7. calculate median for recentering + 8. VarScan copyCaller recenter + 9. Separate chromosome arms + 10. DNAcopy + 11. Merge chromosome arms + +OPTIONS: + -h Show this message + -t tumor bam file + -c control bam file + -q optional sample ID, eg TCGA-001-4-2018 + -i genome that was used to create the bam files + -b centromere locations (bed format) + -w exome whitelist (bed format) + -s directory for temporary files. The script creates a directory varscan.N inside. + -d do not delete temporary output + -n instead of creating a new temporary directory, use this one + +EOF +} + +cBam='False' +tBam='False' +sampleid="sample" +idx='False' +cent='False' +white='False' +scratch= +prevDir= +cleanup=true + +while getopts "ht:c:q:i:b:w:s:n:a:d" OPTION +do + case $OPTION in + h) + print_usage + exit + ;; + t) + tBam=$OPTARG + ;; + c) + cBam=$OPTARG + ;; + q) + sampleid=$OPTARG + ;; + i) + idx=$OPTARG + ;; + b) + cent=$OPTARG + ;; + w) + white=$OPTARG + ;; + s) + scratch=$OPTARG + ;; + d) + cleanup=false + ;; + n) + prevDir=$OPTARG + ;; + ?) + print_usage + exit + ;; + esac +done + +graceful_death() { + >&2 echo "ERROR: Cannot finish $0 because $1"; + exit 1 +} + +# Check if all file arguments have been given and are valid +file_check() { + if [ $1 == 'False' ]; then + print_usage + graceful_death "some input arguments are missing" + fi + if [[ ! -e "$1" ]]; then + print_usage + graceful_death "can't find $1" + fi +} + +for i in $cBam $tBam $idx $cent $white; do + file_check $i +done + +# Sanity check +tmpdir= +if [[ -z "$prevDir" ]] && [[ -z $scratch ]]; then + graceful_death "Please give either the -n OR -s option" +fi + +# select correct temp dir +if [[ -z "$prevDir" ]]; then + if [ ! -d "$scratch" ]; then + graceful_death "cannot find scratch output dir $scratch" + fi + tmpExt=$RANDOM + tmpdir="$scratch/varscan.$tmpExt" + mkdir -p $tmpdir +else + if [ ! -d "$prevDir" ]; then + graceful_death "cannot find previous run directory $prevDir" + fi + tmpdir=$prevDir +fi +>&2 echo "Output files will be stored in $tmpdir" + +# Make sure all inputs are for the same genome +>&2 echo "Checking inputs..." + +# unzip genome, if necessary +genofa="$tmpdir/genome.fa" +if [[ "$idx" == *.gz ]]; then + zcat $idx > $genofa +else + ln -s $idx $genofa +fi +# index genome +>&2 echo "indexing $genofa" +samtools faidx $genofa + + +# Check that the bam files are sorted and index them +issort(){ + didsort=$(samtools view -H $1 | grep -m 1 ^@HD) + if [[ "$didsort" != *SO:coordinate ]]; then + >&2 echo "Sorting $1 to $2..." + samtools sort $1 $2 + else + ln -s $1 $2.bam + fi + >&2 echo "indexing $2.bam..." + samtools index $2.bam +} +issort $tBam "$tmpdir/t" +tBam="$tmpdir/t.bam" +issort $cBam "$tmpdir/c" +cBam="$tmpdir/c.bam" + +contain(){ + if [[ $(comm -23 $1 $2) ]]; then + graceful_death "some or all of the chromosomes in the input $3 cannot be found in the genome fasta; please make sure all your inputs are for the same genome version" + fi +} + +samtools view -H $tBam | grep ^@SQ | cut -f2 | cut -f2 -d':' | sort > $tmpdir/bam.chrs +cut -f1 $cent | sort > $tmpdir/cent.chrs +cut -f1 $white | sort -u > $tmpdir/target.chrs +grep '>' $genofa | sed 's/\s.*//' | sed 's/>//' | sort > $tmpdir/geno.chrs + +# all bam chrs, and all bed chromosomes should be in the genome (but not vice versa) +contain $tmpdir/bam.chrs $tmpdir/geno.chrs $tBam +contain $tmpdir/cent.chrs $tmpdir/geno.chrs $cent +contain $tmpdir/target.chrs $tmpdir/geno.chrs $white + +# checks if a file exists and has more than one line in it +# several programs in this wrapper will output a single line if they fail +exists(){ + if [ -e "$1" ] + then + ct=$(head -n 2 $1 | wc -l | cut -f1 -d' ') + if [ "$ct" -eq "2" ]; then + return 0 + else + return 1 + fi + else + return 1 + fi +} + +# runOrDie gets its variables directly from MAIN +runOrDie(){ + if exists "$outfile" ; then + return 0 # nothing to be done + fi + for file in $infile; do + ext=$(echo $file | sed "s/.*\.//"); + [ "$ext" == "bam" ] && continue # do not check bam files again + if ! exists "$file" && [ -z $DEBUG ]; then + graceful_death "cannot run $cmd: missing or corrupted $infile" + fi + done + >&2 echo $cmd + if [[ -z $DEBUG ]]; then + date >&2 + eval $cmd + if ! exists "$outfile" ; then + graceful_death "failed to find $outfile" + fi + fi +} + + + + +# correct version of samtools? +cmd="samtools 2>&1 | grep Version | cut -f2 -d' '" +sVersion=$(eval $cmd) +if [ $sVersion != "0.1.18" ]; then + graceful_death "wrong samtools version: expected 0.1.18, got $sVersion" +fi + +# find location of run script so we can get the other necessary scripts +DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) +DNAcopy=$DIR/basicDNAcopy.R +findDelta=$DIR/meanLogRatioByChromosome.py +separateArms=$DIR/separateArms.py +varScan="nice java -Xmx2048m -jar $DIR/VarScan.jar" + +########## MAIN ################ + +# Samtools flagstat +infile="$cBam" +outfile="$tmpdir/control.flagstat" +cmd="samtools flagstat $infile > $outfile" +runOrDie + +infile=$tBam +outfile="$tmpdir/tumor.flagstat" +cmd="samtools flagstat $infile > $outfile" +runOrDie + +# Samtools mpileup +infile="$genofa $cBam $tBam" +outfile="$tmpdir/mpileup" +cmd="samtools mpileup -q 1 -B -l $white -f $infile > $outfile" +runOrDie + + +ntest=$(head -n 100000 $tmpdir/mpileup | cut -f3 | grep -c N) +if [ "$ntest" -eq "100000" ]; then + graceful_death "it looks like the chromosome names in your bam files don't match the ones in the input genome" +fi + +# Varscan copynumber +# must calculate data ratio from flagstat output +# also must move to output dir to run this because varscan doesn't parse the output name +dratio= +if exists $tmpdir/control.flagstat && exists $tmpdir/tumor.flagstat ; then + cnum=$(grep -m 1 mapped $tmpdir/control.flagstat | cut -f1 -d' ') + tnum=$(grep -m 1 mapped $tmpdir/tumor.flagstat | cut -f1 -d' ') + dratio=$(echo "scale=2;$cnum/$tnum" | bc) +fi +if [[ -z $dratio ]] && [ -z $DEBUG ]; then + graceful_death "could not determine data ratio from $tmpdir/control.flagstat and $tmpdir/tumor.flagstat" +fi + +pushd $tmpdir > /dev/null +vOptions='--min-segment-size 100 --mpileup 1' +dr="--data-ratio $dratio" # .88 works instead of 0.88 +infile="mpileup" +outfile="output.copynumber" +cmd="$varScan copynumber $infile output $vOptions $dr" # output is base name, copynumber gets added as extension +runOrDie +pushd > /dev/null + +# From the output, filter any segments for which the tumor coverage is less than 10 +# and the control coverage is less than 20 +awk -v x=10 '$6 >= x' $tmpdir/output.copynumber | \ +awk -v x=20 '$5 >= x' > $tmpdir/output.copynumber.cov + + +# Varscan copycaller +infile="$tmpdir/output.copynumber.cov" +outfile="$tmpdir/copyCalled" +ccOptions="--output-file $outfile --output-homdel-file $outfile.homdel" +cmd="$varScan copyCaller $infile $ccOptions" +runOrDie + +# Calculate recenter amount +infile="$tmpdir/copyCalled" +delta=$($findDelta $infile) +if [ -z "$delta" ]; then + graceful_death "Could not find chr average, please make sure your bamfiles cover all chromosomes (samtools idxstats file.bam)" +fi + +# Rerun copycaller +infile="$tmpdir/output.copynumber.cov" +outfile="$tmpdir/copyCalled.recenter" +ccOptions="--output-file $outfile --output-homdel-file $outfile.homdel" + +cmp=$(awk -v delta=$delta 'END{if (delta < -0.2) {print "lt"} else {if (delta > 0.2) {print "gt"} else {print "eq"}}}' < /dev/null) +if [[ "$cmp" == "lt" ]]; then + rd=$(echo $delta | sed 's/-//') + cmd="$varScan copyCaller $infile $ccOptions --recenter-down $rd" + runOrDie +elif [[ "$cmp" == "gt" ]]; then + cmd="$varScan copyCaller $infile $ccOptions --recenter-up $delta" + runOrDie +else + ln -s copyCalled $tmpdir/copyCalled.recenter +fi + +# add p and q to chromosome arms +infile="$tmpdir/copyCalled.recenter" +outfile="$tmpdir/copyCalled.recenter.sep" +cmd="$separateArms $infile $cent > $outfile" +runOrDie + +# Circular binary segmentation +infile="$tmpdir/copyCalled.recenter.sep" +outfile="$tmpdir/copyCalled.recenter.sep.SD.2.5.dnacopy.out" +cmd="Rscript $DNAcopy $infile 2.5 >/dev/null" +runOrDie + +# remove the arms and print to stdout +sed 's/\.[pq] / /' $tmpdir/copyCalled.recenter.sep.SD.2.5.dnacopy.out | \ + sed "s/^sample/$sampleid/" + +# clean up +if $cleanup; then + rm $tmpdir/* + rmdir $tmpdir +fi + diff --git a/separateArms.py b/separateArms.py new file mode 100755 index 0000000..1c800b1 --- /dev/null +++ b/separateArms.py @@ -0,0 +1,79 @@ +#!/usr/bin/python + +import sys, os, re, argparse + +parser = argparse.ArgumentParser(description="Divide input copycaller file into p and q segments using an input bed file with centromere coordinates") +parser.add_argument('input', type=str,help="Tab separated copycaller file") +parser.add_argument('centro', type=str,help="Centromere bed file") +# optional argument +#parser.add_argument('-t', dest='targetfile', type=str, default=False, +# help="Input file with target genes and scores") +# optional flag +#parser.add_argument('-d', '--debug', help="Optional debugging output", action='store_true') + +def myfunction(line): + """DESCRIBE ME""" + line = line[:-1] + +class cent(object): + """centromere objects by chromosome""" + def __init__(self, line): + [id, start, end] = line.split("\t") + self.name = id + self.p = int(start) + self.q = int(end) + +def getChrom(id, centList, cur): + if cur and id == cur.name: + return cur + for i in centList: + if id == i.name: + return i + return False + +if len(sys.argv)==1: + parser.print_help() + sys.exit(1) +args = parser.parse_args() + +# Main + +centros=[] +f = open(args.centro,'r') +for line in f: + line = line.strip() + centobj = cent(line) + centros.append(centobj) +f.close + +curchrom = False +f = open(args.input,'r') +for line in f: + line = line.strip() + fields = line.split("\t") + if fields[0] == 'chrom': + print line + continue + id = fields[0] + start = int(fields[1]) + end = int(fields[2]) + curchrom = getChrom(id, centros, curchrom) +# print >>sys.stderr, id, curchrom +# if (not curchrom) or (not curchrom.name == fields[0]): +# curchrom = False +# for c in centros: +# if c.name == id: +# curchrom = c +# print >>sys.stderr, "setting", curchrom +# break + # centromere free chromosomes + if not curchrom: + print line + # print only if segment does not overlap centromere + elif end < curchrom.p: + fields[0] = ('.').join([fields[0], 'p']) + print ("\t").join(fields) + elif start > curchrom.q: + fields[0] = ('.').join([fields[0], 'q']) + print ("\t").join(fields) +f.close diff --git a/src/cbsToBed.py b/src/cbsToBed.py new file mode 100755 index 0000000..4dbd260 --- /dev/null +++ b/src/cbsToBed.py @@ -0,0 +1,72 @@ +#!/usr/bin/python + +import sys, os, re, getopt +usage = sys.argv[0]+""" + +Create bed file based on Varscan CBS output with +segment scores converted to colors (red for amplified, blue for deleted) + +Option: + -n Don't create bed header + -c cutoff for amplification or deletion (default 0.25) + +""" + + +# Main +# read in command line and options +try: + opts, args = getopt.getopt(sys.argv[1:], "fdc:hbn") +except getopt.GetoptError: + + # print help information and exit: + print usage + print "ERROR did not recognize input\n" + sys.exit(2) + +makeBed = True +head = True +val = 0.25 +for o, a in opts: +# if o == "-d": +# doNotDelete = True + if o == "-n": + head = False + if o == "-c": + val = float(a) + if o == "-h": + print usage + sys.exit() + + +if len(args) != 1: + sys.exit(usage) + +# Run program + +f = open(args[0],'r') +counter = 0 + +if makeBed and head: + print 'track name=%s description="%s" itemRgb="On"'% (args[0], args[0]) +for line in f: + line = line.strip() + fields = line.split("\t") + if fields[0] == 'ID': + continue + counter+=1 + id=('.').join(['mrg', str(counter)]) + qualifier = "neutral" + rgb='0,0,0' + if(float(fields[5]) < -val): + rgb='0,0,255' # blue + elif(float(fields[5]) > val): + rgb='255,0,0' # red + else: + rgb='0,0,0' # black + chr = ("").join(["chr", fields[1]]) + outstring = ("\t").join([chr, fields[2], fields[3], id, '0', '.', fields[2], fields[2], rgb ]) + print outstring +f.close() + +