From 40198649e205301292c569bb9ee4b1434b37b921 Mon Sep 17 00:00:00 2001 From: Jeremiah Wala Date: Thu, 18 Jun 2015 12:43:26 -0400 Subject: [PATCH] lots of stuff --- source.dot | 467 ++++++++++++++++++++++++ src/AlignedContig.cpp | 296 +++++++++------ src/BWAWrapper.cpp | 2 +- src/BamWalker.cpp | 119 ++++-- src/BreakPoint.cpp | 104 ++++-- src/DiscordantCluster.cpp | 284 +++++++++++--- src/GenomicRegion.cpp | 14 +- src/GenomicRegionCollection.cpp | 69 +++- src/HTSTools.cpp | 138 ------- src/Makefile | 24 +- src/Makefile.am | 4 +- src/Makefile.in | 20 +- src/MiniRules.cpp | 6 +- src/RealignTool.cpp | 10 +- src/SeqRead.cpp | 1 - src/SnowTools/AlignedContig.h | 36 +- src/SnowTools/BamRead.h | 63 +++- src/SnowTools/BamWalker.h | 53 ++- src/SnowTools/BreakPoint.h | 12 +- src/SnowTools/DiscordantCluster.h | 97 +++-- src/SnowTools/GenomicRegion.h | 8 +- src/SnowTools/GenomicRegionCollection.h | 12 +- src/SnowTools/HTSTools.h | 91 ----- src/SnowTools/MiniRules.h | 2 - src/SnowTools/Read.h | 14 - src/VFilterTool.cpp | 199 ---------- src/VFilterTool.h | 7 - src/VariantBamWalker.cpp | 50 --- src/VariantBamWalker.h | 26 -- src/config.status | 4 +- src/configure | 25 +- src/configure.ac | 15 +- src/source.dot | 62 ++++ 33 files changed, 1435 insertions(+), 899 deletions(-) create mode 100644 source.dot delete mode 100644 src/HTSTools.cpp delete mode 100644 src/SeqRead.cpp delete mode 100644 src/SnowTools/HTSTools.h delete mode 100644 src/SnowTools/Read.h delete mode 100644 src/VFilterTool.cpp delete mode 100644 src/VFilterTool.h delete mode 100644 src/VariantBamWalker.cpp delete mode 100644 src/VariantBamWalker.h create mode 100644 src/source.dot diff --git a/source.dot b/source.dot new file mode 100644 index 000000000..df82f7a3b --- /dev/null +++ b/source.dot @@ -0,0 +1,467 @@ +digraph "source tree" { + overlap=scale; + size="8,10"; + ratio="fill"; + fontsize="16"; + fontname="Helvetica"; + clusterrank="local"; + "cram.h" -> "cram_codecs.h" + "bwtsw2_aux.c" -> "bwt_lite.h" + "bwt_gen.c" -> "QSufSort.h" + "bwtsw2_chain.c" -> "ksort.h" + "pattern.c" -> "reader.h" + "bwt_gen.c" -> "malloc_wrap.h" + "BWTIntervalCache.cpp" -> "BWTIntervalCache.h" + "tabix.c" -> "config.h" + "node.c" -> "node.h" + "DiscordantCluster.cpp" -> "DiscordantCluster.h" + "SAReader.h" -> "Occurrence.h" + "SparseGapArray.h" -> "GapArray.h" + "bwtsw2_main.c" -> "bwtsw2.h" + "Occurrence.cpp" -> "Occurrence.h" + "bwa.c" -> "bntseq.h" + "synced_bcf_reader.h" -> "tbx.h" + "tbx.h" -> "hts.h" + "VFilterTool.cpp" -> "GenomicRegion.h" + "BWTCARopebwt.cpp" -> "BWTCARopebwt.h" + "kstring.c" -> "malloc_wrap.h" + "VFilterTool.cpp" -> "SnowUtils.h" + "vcf.c" -> "bgzf.h" + "kopen.c" -> "malloc_wrap.h" + "sam.c" -> "bgzf.h" + "SampledSuffixArray.cpp" -> "SampledSuffixArray.h" + "bwamem.c" -> "bntseq.h" + "multifast.c" -> "multifast.h" + "bwaseqio.c" -> "kseq.h" + "synced_bcf_reader.h" -> "vcf.h" + "bwtsw2_chain.c" -> "bwtsw2.h" + "BWTTraverse.cpp" -> "BWTTraverse.h" + "BWTCARopebwt.cpp" -> "BWTWriterBinary.h" + "bwamem.c" -> "bwamem.h" + "SuffixArray.cpp" -> "SAWriter.h" + "hts.c" -> "hfile.h" + "SuffixArray.cpp" -> "SAReader.h" + "bgzf.c" -> "hfile.h" + "bwamem.c" -> "ksort.h" + "tabix.c" -> "regidx.h" + "fastmap.c" -> "bntseq.h" + "regidx.c" -> "regidx.h" + "SuffixCompare.h" -> "STCommon.h" + "BWTReader.cpp" -> "BWTReaderBinary.h" + "vcf.c" -> "khash_str2int.h" + "RLBWT.h" -> "FMMarkers.h" + "bwamem_extra.c" -> "bntseq.h" + "bwa.c" -> "bwa.h" + "QuickBWT.h" -> "BWT.h" + "utils.c" -> "kseq.h" + "SBWT.h" -> "STCommon.h" + "cram.h" -> "cram_index.h" + "SuffixArray.cpp" -> "SuffixArray.h" + "BWTAlgorithms.h" -> "BWTIndexSet.h" + "bwase.c" -> "kstring.h" + "multifast.c" -> "walker.h" + "bgzf.c" -> "khash.h" + "kseq.h" -> "malloc_wrap.h" + "bwtsw2_pair.c" -> "ksw.h" + "AlignedContig.cpp" -> "BamWalker.h" + "kvec.h" -> "malloc_wrap.h" + "vcf_sweep.c" -> "vcf_sweep.h" + "bwa.c" -> "ksw.h" + "cram.h" -> "cram_stats.h" + "SAWriter.h" -> "STCommon.h" + "GapArray.h" -> "BWT.h" + "bwase.c" -> "ksw.h" + "Histogram.cpp" -> "Histogram.h" + "BWTReaderAscii.h" -> "Occurrence.h" + "SAReader.h" -> "STCommon.h" + "vcfutils.h" -> "vcf.h" + "cram.h" -> "cram_structs.h" + "bwamem.c" -> "ksw.h" + "SBWT.h" -> "HitData.h" + "vcf_sweep.h" -> "hts.h" + "hfile_net.c" -> "hfile_internal.h" + "pemerge.c" -> "malloc_wrap.h" + "SACAInducedCopying.h" -> "SuffixArray.h" + "example.c" -> "kseq.h" + "bgzf.c" -> "config.h" + "BWTReaderBinary.cpp" -> "SBWT.h" + "bwaseqio.c" -> "bamlite.h" + "bwtsw2_pair.c" -> "malloc_wrap.h" + "hfile_irods.c" -> "config.h" + "bwtsw2_main.c" -> "utils.h" + "bwamem.c" -> "kbtree.h" + "utils.c" -> "utils.h" + "utils.c" -> "ksort.h" + "BWTDiskConstruction.cpp" -> "BWTCABauerCoxRosone.h" + "hts.c" -> "hts_internal.h" + "PopulationIndex.cpp" -> "PopulationIndex.h" + "bwamem_extra.c" -> "bwamem.h" + "string_alloc.c" -> "string_alloc.h" + "bwtsw2_aux.c" -> "bntseq.h" + "BWTWriterBinary.h" -> "STCommon.h" + "vcf.h" -> "hts.h" + "CoverageTool.cpp" -> "CoverageTool.h" + "hts.c" -> "config.h" + "bwtsw2_aux.c" -> "kstring.h" + "bwa.h" -> "bwt.h" + "bntseq.c" -> "malloc_wrap.h" + "bwt.c" -> "bwt.h" + "synced_bcf_reader.c" -> "kseq.h" + "BWTIntervalCache.h" -> "BWTInterval.h" + "back.BreakPoint.cpp" -> "gzstream.h" + "RLBWT.h" -> "STCommon.h" + "BWTCABauerCoxRosone.cpp" -> "BWTWriterBinary.h" + "bwa.c" -> "kseq.h" + "bwamem.h" -> "bwa.h" + "SuffixArray.cpp" -> "SuffixCompare.h" + "bwase.h" -> "bntseq.h" + "example.c" -> "bwamem.h" + "BWTReaderBinary.h" -> "BWTReader.h" + "bwtsw2_aux.c" -> "utils.h" + "InverseSuffixArray.cpp" -> "InverseSuffixArray.h" + "tbx.c" -> "config.h" + "BWTWriterAscii.cpp" -> "SBWT.h" + "RLBWT.cpp" -> "BWTReader.h" + "kbtree.h" -> "malloc_wrap.h" + "faidx.c" -> "config.h" + "hfile_net.c" -> "knetfile.h" + "node.h" -> "actypes.h" + "BWTIndexSet.h" -> "SampledSuffixArray.h" + "BWTReaderAscii.cpp" -> "RLBWT.h" + "bgzf.c" -> "bgzf.h" + "BWT.h" -> "RLBWT.h" + "synced_bcf_reader.c" -> "khash_str2int.h" + "VariantBamWalker.cpp" -> "VariantBamWalker.h" + "BWTWriter.h" -> "STCommon.h" + "bwtgap.h" -> "bwt.h" + "bwase.c" -> "utils.h" + "bwt_lite.c" -> "bwt_lite.h" + "vcfutils.c" -> "config.h" + "bwt_lite.c" -> "malloc_wrap.h" + "BreakPoint.cpp" -> "BreakPoint.h" + "knetfile.c" -> "knetfile.h" + "BWTWriterBinary.h" -> "BWTWriter.h" + "bwtsw2_aux.c" -> "bwa.h" + "bgzip.c" -> "hts.h" + "vcfutils.c" -> "vcfutils.h" + "SAWriter.h" -> "Occurrence.h" + "BWTReaderAscii.h" -> "STCommon.h" + "bwamem.c" -> "kstring.h" + "htsfile.c" -> "hfile.h" + "hts.c" -> "bgzf.h" + "ksw.c" -> "ksw.h" + "bwtaln.c" -> "bwtgap.h" + "htsfile.c" -> "config.h" + "BWTWriterAscii.h" -> "Occurrence.h" + "BWTTraverse.h" -> "BWT.h" + "hts.c" -> "hts.h" + "bwtsw2_core.c" -> "bwt.h" + "bwamem_extra.c" -> "kstring.h" + "regidx.c" -> "hts.h" + "BWTReaderBinary.h" -> "RLBWT.h" + "bwamem.c" -> "malloc_wrap.h" + "sam.c" -> "config.h" + "BWTIntervalCache.cpp" -> "BWTAlgorithms.h" + "ksw.c" -> "malloc_wrap.h" + "bwt.c" -> "malloc_wrap.h" + "GenomicRegion.cpp" -> "gzstream.h" + "reader.c" -> "reader.h" + "BWTWriterBinary.h" -> "BWTReaderBinary.h" + "BWTReader.cpp" -> "BWTReaderAscii.h" + "BWTReaderAscii.cpp" -> "SBWT.h" + "bwa.c" -> "utils.h" + "BWTDiskConstruction.cpp" -> "RankProcess.h" + "bwtaln.c" -> "bwa.h" + "GapArray.cpp" -> "GapArray.h" + "htsfile.c" -> "vcf.h" + "SampledSuffixArray.h" -> "BWT.h" + "bwtsw2_pair.c" -> "utils.h" + "vcf.h" -> "kstring.h" + "BWTWriterBinary.h" -> "Occurrence.h" + "BWTIndexSet.h" -> "PopulationIndex.h" + "InverseSuffixArray.h" -> "SuffixArray.h" + "QuickBWT.cpp" -> "QuickBWT.h" + "hfile_irods.c" -> "hfile_internal.h" + "ahocorasick.c" -> "ahocorasick.h" + "bwtindex.c" -> "utils.h" + "kstring.c" -> "kstring.h" + "bwtsw2_main.c" -> "bwt.h" + "kstring.h" -> "malloc_wrap.h" + "tbx.c" -> "khash.h" + "vcf.c" -> "kseq.h" + "pemerge.c" -> "utils.h" + "bwtsw2_core.c" -> "bwt_lite.h" + "bwamem_pair.c" -> "kstring.h" + "bwtsw2_pair.c" -> "bntseq.h" + "BWTIntervalCache.h" -> "BWT.h" + "BWTDiskConstruction.h" -> "BWT.h" + "bwt.c" -> "kvec.h" + "bwape.c" -> "bwtaln.h" + "gChain.cpp" -> "gChain.h" + "BWTCARopebwt.cpp" -> "BWTWriterAscii.h" + "VariantBamWalker.h" -> "BamStats.h" + "BWTWriter.cpp" -> "BWTWriter.h" + "HitData.h" -> "STCommon.h" + "QSufSort.c" -> "QSufSort.h" + "khash.h" -> "malloc_wrap.h" + "tabix.c" -> "tbx.h" + "BWTIndexSet.h" -> "BWTIntervalCache.h" + "bwtsw2_core.c" -> "malloc_wrap.h" + "VariantBamWalker.h" -> "BamWalker.h" + "ksw.c" -> "kseq.h" + "main.c" -> "utils.h" + "bntseq.c" -> "kseq.h" + "SBWT.cpp" -> "BWTReaderBinary.h" + "BWTDiskConstruction.cpp" -> "SAReader.h" + "RLBWT.cpp" -> "BWTWriter.h" + "bwa.c" -> "kstring.h" + "bgzip.c" -> "config.h" + "BWTWriterAscii.h" -> "SuffixArray.h" + "bwase.h" -> "bwtaln.h" + "BWTWriter.cpp" -> "BWTWriterAscii.h" + "ksort.h" -> "malloc_wrap.h" + "Histogram.h" -> "IntervalTree.h" + "bwa.c" -> "malloc_wrap.h" + "bwtsw2_aux.c" -> "ksort.h" + "sam.c" -> "khash.h" + "bwa.c" -> "kvec.h" + "vcf.c" -> "vcf.h" + "bwtgap.c" -> "bwtgap.h" + "BamWalker.cpp" -> "BamWalker.h" + "STCommon.h" -> "STGlobals.h" + "bwtsw2_aux.c" -> "bwtsw2.h" + "BWTWriter.cpp" -> "BWTWriterBinary.h" + "SuffixArray.h" -> "STCommon.h" + "hts.c" -> "ksort.h" + "RealignTool.cpp" -> "BamWalker.h" + "bwtsw2_aux.c" -> "ksw.h" + "bwtsw2_aux.c" -> "malloc_wrap.h" + "tbx.c" -> "tbx.h" + "bwamem_pair.c" -> "bwamem.h" + "SBWT.cpp" -> "SBWT.h" + "bwt.c" -> "utils.h" + "BWTWriter.h" -> "BWTReader.h" + "SBWT.h" -> "Occurrence.h" + "bamlite.c" -> "bamlite.h" + "bwtaln.c" -> "malloc_wrap.h" + "SampledSuffixArray.h" -> "SuffixArray.h" + "back.BreakPoint.cpp" -> "BreakPoint.h" + "back.AlignedContig.h" -> "BreakPoint.h" + "bgzf.c" -> "hts.h" + "bwamem_pair.c" -> "ksw.h" + "bwape.c" -> "bwase.h" + "BWTReaderBinary.cpp" -> "RLBWT.h" + "BWTWriter.h" -> "SuffixArray.h" + "BWTWriterAscii.cpp" -> "RLBWT.h" + "SampledSuffixArray.cpp" -> "SAWriter.h" + "SBWT.cpp" -> "BWTWriterAscii.h" + "SBWT.cpp" -> "BWTReaderAscii.h" + "bamlite.h" -> "malloc_wrap.h" + "RLBWT.cpp" -> "RLBWT.h" + "bwaseqio.c" -> "malloc_wrap.h" + "hfile_internal.h" -> "hfile.h" + "vcf_sweep.h" -> "vcf.h" + "htsfile.c" -> "hts.h" + "bwa.h" -> "bntseq.h" + "BWTWriterBinary.h" -> "RLBWT.h" + "bwtsw2_main.c" -> "bwa.h" + "BWAWrapper.cpp" -> "BWAWrapper.h" + "hfile.c" -> "hfile.h" + "kfunc.c" -> "kfunc.h" + "SAReader.cpp" -> "SuffixArray.h" + "bwamem.h" -> "bwt.h" + "BWTDiskConstruction.cpp" -> "BWTWriter.h" + "bwase.c" -> "bwase.h" + "tabix.c" -> "vcf.h" + "synced_bcf_reader.c" -> "config.h" + "BWTWriterAscii.h" -> "BWTWriter.h" + "faidx.c" -> "bgzf.h" + "bwtaln.c" -> "utils.h" + "fastmap.c" -> "bwa.h" + "SuffixArray.cpp" -> "SACAInducedCopying.h" + "bwape.c" -> "utils.h" + "VFilterTool.cpp" -> "VariantBamWalker.h" + "bwtindex.c" -> "bwt.h" + "fastmap.c" -> "kseq.h" + "VFilterTool.cpp" -> "GenomicRegionCollection.h" + "BWTWriter.h" -> "Occurrence.h" + "sam.c" -> "hfile.h" + "bwtsw2_core.c" -> "kvec.h" + "bntseq.c" -> "khash.h" + "malloc_wrap.c" -> "malloc_wrap.h" + "bwape.c" -> "bntseq.h" + "SnowTools.cpp" -> "CoverageTool.h" + "ahocorasick.h" -> "actypes.h" + "bwamem_pair.c" -> "utils.h" + "fastmap.c" -> "utils.h" + "bwamem.c" -> "kvec.h" + "RealignTool.cpp" -> "AlignedContig.h" + "vcf.c" -> "khash.h" + "bwamem_pair.c" -> "malloc_wrap.h" + "BWTCABauerCoxRosone.h" -> "SAWriter.h" + "SBWT.h" -> "BWTReader.h" + "vcf.c" -> "hfile.h" + "RLBWT.h" -> "SuffixArray.h" + "HTSTools.cpp" -> "HTSTools.h" + "vcf.c" -> "kstring.h" + "bwape.c" -> "malloc_wrap.h" + "BreakPoint.cpp" -> "gzstream.h" + "bwtsw2.h" -> "bntseq.h" + "MiniRules.cpp" -> "MiniRules.h" + "gzstream.C" -> "gzstream.h" + "BWTTraverse.h" -> "BWTAlgorithms.h" + "SAReader.cpp" -> "SAReader.h" + "vcf.c" -> "tbx.h" + "BWTReader.cpp" -> "BWTReader.h" + "strmm.c" -> "strmm.h" + "sam.h" -> "hts.h" + "BWTDiskConstruction.h" -> "SuffixArray.h" + "bwape.c" -> "kvec.h" + "pemerge.c" -> "bwa.h" + "sam.c" -> "cram.h" + "pattern.c" -> "pattern.h" + "vcf_sweep.c" -> "config.h" + "BWTDiskConstruction.cpp" -> "GapArray.h" + "GenomicRegion.cpp" -> "GenomicRegion.h" + "tabix.c" -> "kseq.h" + "bwtsw2_core.c" -> "bwtsw2.h" + "bwtindex.c" -> "bntseq.h" + "RealignTool.cpp" -> "BWAWrapper.h" + "bntseq.c" -> "utils.h" + "bwtsw2.h" -> "bwt_lite.h" + "bntseq.c" -> "bntseq.h" + "RankProcess.h" -> "GapArray.h" + "SuffixArray.cpp" -> "InverseSuffixArray.h" + "hts_internal.h" -> "hts.h" + "regidx.c" -> "config.h" + "pemerge.c" -> "kseq.h" + "BWTReader.h" -> "Occurrence.h" + "bwape.c" -> "bwa.h" + "SnowUtils.cpp" -> "SnowUtils.h" + "tabix.c" -> "bgzf.h" + "tbx.c" -> "bgzf.h" + "VFilterTool.cpp" -> "VFilterTool.h" + "bwamem.h" -> "bntseq.h" + "ahocorasick.c" -> "node.h" + "BWTWriterBinary.cpp" -> "BWTWriterBinary.h" + "BWTWriterAscii.h" -> "STCommon.h" + "htsfile.c" -> "sam.h" + "synced_bcf_reader.h" -> "hts.h" + "pemerge.c" -> "ksw.h" + "fastmap.c" -> "bwamem.h" + "hts.c" -> "version.h" + "GenomicRegionCollection.cpp" -> "gzstream.h" + "regidx.c" -> "kstring.h" + "md5.c" -> "config.h" + "RankProcess.h" -> "BWT.h" + "RLBWT.h" -> "RLUnit.h" + "bwaseqio.c" -> "bwtaln.h" + "bwtsw2_aux.c" -> "kseq.h" + "RLBWT.h" -> "Occurrence.h" + "hts.c" -> "khash.h" + "SACAInducedCopying.cpp" -> "SuffixCompare.h" + "cram.h" -> "cram_decode.h" + "BWTCABauerCoxRosone.cpp" -> "BWTCABauerCoxRosone.h" + "BWTIndexSet.h" -> "BWT.h" + "sam.c" -> "sam.h" + "SnowTools.cpp" -> "RealignTool.h" + "cram.h" -> "cram_encode.h" + "tabix.c" -> "hts.h" + "hfile.c" -> "config.h" + "bwase.c" -> "bntseq.h" + "Occurrence.h" -> "STCommon.h" + "bwtaln.c" -> "bwtaln.h" + "BWTReaderAscii.h" -> "BWTReader.h" + "sam.c" -> "kseq.h" + "bwtsw2.h" -> "bwt.h" + "RealignTool.cpp" -> "RealignTool.h" + "RankProcess.cpp" -> "RankProcess.h" + "BWT.h" -> "SBWT.h" + "bwtsw2_pair.c" -> "bwtsw2.h" + "bwtsw2_pair.c" -> "bwt.h" + "SAWriter.h" -> "SAReader.h" + "hts.c" -> "kseq.h" + "example3.cpp" -> "AhoCorasickPlus.h" + "BWTWriterBinary.cpp" -> "RLBWT.h" + "regidx.c" -> "kseq.h" + "md5.c" -> "hts.h" + "STCommon.cpp" -> "STCommon.h" + "SBWT.h" -> "SuffixArray.h" + "BamStats.cpp" -> "BamStats.h" + "bwamem_extra.c" -> "bwa.h" + "BWTReader.h" -> "STCommon.h" + "Occurrence.cpp" -> "SBWT.h" + "bamlite.c" -> "malloc_wrap.h" + "InverseSuffixArray.h" -> "STCommon.h" + "SuffixArray.cpp" -> "BWTWriter.h" + "pattern.c" -> "strmm.h" + "bwtgap.h" -> "bwtaln.h" + "BWTCABauerCoxRosone.h" -> "BWTWriterBinary.h" + "bwtsw2_chain.c" -> "malloc_wrap.h" + "SampledSuffixArray.cpp" -> "SAReader.h" + "bwtgap.c" -> "malloc_wrap.h" + "bwtsw2_pair.c" -> "kstring.h" + "fastmap.c" -> "kvec.h" + "walker.c" -> "walker.h" + "bwtsw2_core.c" -> "khash.h" + "bwtsw2_core.c" -> "ksort.h" + "faidx.c" -> "khash.h" + "sam.c" -> "kstring.h" + "SuffixCompare.cpp" -> "SuffixCompare.h" + "vcf_sweep.c" -> "bgzf.h" + "VFilterTool.cpp" -> "gzstream.h" + "multifast.c" -> "pattern.h" + "GapArray.cpp" -> "SparseGapArray.h" + "BWTAlgorithms.h" -> "BWTInterval.h" + "synced_bcf_reader.c" -> "synced_bcf_reader.h" + "BWTReaderBinary.cpp" -> "BWTReaderBinary.h" + "AlignedContig.cpp" -> "AlignedContig.h" + "pattern.c" -> "multifast.h" + "regidx.c" -> "khash_str2int.h" + "bwtaln.h" -> "bwt.h" + "main.c" -> "kstring.h" + "AhoCorasickPlus.cpp" -> "AhoCorasickPlus.h" + "bwape.c" -> "khash.h" + "RLBWT.h" -> "HitData.h" + "BWTAlgorithms.h" -> "STCommon.h" + "hts.c" -> "cram.h" + "bwtindex.c" -> "malloc_wrap.h" + "faidx.c" -> "faidx.h" + "bwtgap.c" -> "bwtaln.h" + "BWTWriterBinary.cpp" -> "SBWT.h" + "hfile.h" -> "hts_defs.h" + "BWTReaderBinary.h" -> "STCommon.h" + "bwase.c" -> "malloc_wrap.h" + "pemerge.c" -> "kstring.h" + "BWTReaderAscii.cpp" -> "BWTReaderAscii.h" + "hfile.c" -> "hfile_internal.h" + "SAWriter.cpp" -> "SAWriter.h" + "bwase.c" -> "bwa.h" + "RLBWT.h" -> "BWTReader.h" + "tabix.c" -> "sam.h" + "cram.h" -> "cram_io.h" + "bwase.h" -> "bwt.h" + "BWTDiskConstruction.cpp" -> "BWTDiskConstruction.h" + "SACAInducedCopying.cpp" -> "SACAInducedCopying.h" + "hfile_net.c" -> "config.h" + "RealignTool.cpp" -> "BamRead.h" + "bwase.c" -> "bwtaln.h" + "faidx.c" -> "hfile.h" + "BWTCARopebwt.cpp" -> "SAWriter.h" + "is.c" -> "malloc_wrap.h" + "bwamem.c" -> "utils.h" + "bgzip.c" -> "bgzf.h" + "BWTReaderBinary.h" -> "Occurrence.h" + "BWTAlgorithms.cpp" -> "BWTAlgorithms.h" + "bwape.c" -> "ksw.h" + "bwashm.c" -> "bwa.h" + "faidx.h" -> "hts_defs.h" + "SAWriter.cpp" -> "SuffixArray.h" + "BWTDiskConstruction.cpp" -> "SAWriter.h" + "bwaseqio.c" -> "utils.h" + "BamRead.cpp" -> "BamRead.h" + "bwamem_pair.c" -> "kvec.h" + "BWTWriterAscii.cpp" -> "BWTWriterAscii.h" + "vcf.c" -> "config.h" + "BWTDiskConstruction.cpp" -> "BWTReader.h" +} diff --git a/src/AlignedContig.cpp b/src/AlignedContig.cpp index 23aa083f3..d5798062c 100644 --- a/src/AlignedContig.cpp +++ b/src/AlignedContig.cpp @@ -6,7 +6,7 @@ namespace SnowTools { - void AlignedContig::addAlignment(const BamRead &align) { + /*void AlignedContig::addAlignment(const BamRead &align) { AlignmentFragment tal(align); m_frag_v.push_back(tal); @@ -14,7 +14,8 @@ namespace SnowTools { if (m_frag_v.size() > 1) sort(m_frag_v.begin(), m_frag_v.end()); -} + }*/ + AlignedContig::AlignedContig(const BamReadVector& bav) { if (!bav.size()) @@ -28,6 +29,7 @@ namespace SnowTools { // make the individual alignments and add for (auto& i : bav) { m_frag_v.push_back(AlignmentFragment(i)); + m_frag_v.back().num_align = bav.size(); } // sort fragments by order on fwd-strand contig @@ -45,8 +47,8 @@ void AlignedContig::printContigFasta(std::ofstream& os) const { void AlignedContig::blacklist(GRC &grv) { - if (m_skip) - return; + //if (m_skip) + // return; // loop through the indel breaks and blacklist for (auto& i : m_frag_v) @@ -57,11 +59,8 @@ void AlignedContig::blacklist(GRC &grv) { void AlignedContig::splitCoverage() { - if (m_skip) - return; - - // require that alignReadsToContig already ran - assert(m_tried_align_reads); + //if (m_skip) + // return; for (auto& i : m_frag_v) { for (auto& j : i.m_indel_breaks) { @@ -86,7 +85,9 @@ std::ostream& operator<<(std::ostream& out, const AlignedContig &ac) { // print the global breakpoint if (!ac.m_global_bp.isEmpty()) - out << "Global BP: " << ac.m_global_bp << std::endl; + out << "Global BP: " << ac.m_global_bp << + " ins_aginst_contig " << ac.insertion_against_contig_read_count << + " del_against_contig " << ac.deletion_against_contig_read_count << std::endl; // print the multi-map breakpoints for (auto& i : ac.m_local_breaks) @@ -97,7 +98,8 @@ std::ostream& operator<<(std::ostream& out, const AlignedContig &ac) { for (auto& i : ac.m_frag_v) for (auto& j : i.getIndelBreaks()) if (!j.isEmpty()) - out << "Indel: " << j << " -- " << ac.getContigName() << std::endl; + out << "Indel: " << j << " -- " << ac.getContigName() << " ins_a_contig " << ac.insertion_against_contig_read_count << + " del_a_contig " << ac.deletion_against_contig_read_count << std::endl; // print the AlignmentFragments alignments for (auto& i : ac.m_frag_v) @@ -111,58 +113,69 @@ std::ostream& operator<<(std::ostream& out, const AlignedContig &ac) { } } - // print the contig base-pairs + // print the contig base-pairs out << ac.getSequence() << " " << ac.getContigName() << std::endl; - - PlottedReadVector plot_vec; - - // print out the individual reads - for (auto& i : ac.m_bamreads) { - - int pos = -1; - int aln = -1; - int dum= 0; - std::string seq = i.QualityTrimmedSequence(4, dum); - std::string sr = i.GetZTag("SR"); - - // reverse complement if need be - int32_t rc = i.GetIntTag("RC"); - if (rc) - SnowTools::rcomplement(seq); - - // get the more complex tags (since there can be multiple annotations per tag) - std::vector posvec = i.GetSmartIntTag("SL"); // start positions ON CONTIG - std::vector alnvec = i.GetSmartIntTag("TS"); // start positions ON READ - std::vector cnvec = i.GetSmartStringTag("CN"); - assert(cnvec.size() == posvec.size()); - size_t kk = 0; - for (; kk < cnvec.size(); kk++) - if (cnvec[kk] == ac.getContigName()) { - pos = posvec[kk]; - aln = alnvec[kk]; - break; - } - - // trim the sequence if it hangs off the end - //if (i.GetZTag("SR") == "t147_D0BK6ACXX111110:6:2101:5352:156091") { - // std::cerr << "i.PositionEnd() " << i.PositionEnd() << " i.Position " << i.Position() << " len " << ac.getSequence().length() << std::endl; - // exit(1); - // } - if (aln > 0) - seq = seq.substr(aln, seq.length() - aln); - - if ( (pos + seq.length() ) > ac.getSequence().length()) - seq = seq.substr(0, ac.getSequence().length() - pos); - + + PlottedReadVector plot_vec; - assert(kk != cnvec.size()); // assure that we found something - pos = abs(pos); - int padlen = ac.getSequence().size() - pos - seq.size() + 5; - padlen = std::max(5, padlen); + // print out the individual reads + for (auto& i : ac.m_bamreads) { + + int pos = -1; + int aln = -1; + int rc = 0; + int dum= 0; + std::string this_cig; + std::string seq = i.QualityTrimmedSequence(4, dum); + std::string sr = i.GetZTag("SR"); + + // reverse complement if need be + //int32_t rc = i.GetIntTag("RC"); + //if (rc/* && false*/) + // SnowTools::rcomplement(seq); + + // get the more complex tags (since there can be multiple annotations per tag) + std::vector posvec = i.GetSmartIntTag("SL"); // start positions ON CONTIG + std::vector alnvec = i.GetSmartIntTag("TS"); // start positions ON READ + std::vector rcvec = i.GetSmartIntTag("RC"); // read reverse complmented relative to contig + std::vector cigvec = i.GetSmartStringTag("SC"); // read against contig CIGAR + std::vector cnvec = i.GetSmartStringTag("CN"); + + assert(cnvec.size() == posvec.size()); + size_t kk = 0; + for (; kk < cnvec.size(); kk++) + if (cnvec[kk] == ac.getContigName()) { + pos = posvec[kk]; + aln = alnvec[kk]; + rc = rcvec[kk]; + this_cig = cigvec[kk]; + break; + } + + // reverse complement if need be + if (rc) + SnowTools::rcomplement(seq); + + // trim the sequence if it hangs off the end + //if (i.GetZTag("SR") == "t147_D0BK6ACXX111110:6:2101:5352:156091") { + // std::cerr << "i.PositionEnd() " << i.PositionEnd() << " i.Position " << i.Position() << " len " << ac.getSequence().length() << std::endl; + // exit(1); + // } + if (aln > 0) + seq = seq.substr(aln, seq.length() - aln); + + if ( (pos + seq.length() ) > ac.getSequence().length()) + seq = seq.substr(0, ac.getSequence().length() - pos); + + + assert(kk != cnvec.size()); // assure that we found something + pos = abs(pos); + int padlen = ac.getSequence().size() - pos - seq.size() + 5; + padlen = std::max(5, padlen); std::stringstream rstream; assert(pos < 1e4 && padlen < 1e4); // bug, need to check - rstream << sr << "--" << (i.ChrID()+1) << ":" << i.Position(); + rstream << sr << "--" << (i.ChrID()+1) << ":" << i.Position() << " r2c CIGAR: " << this_cig; plot_vec.push_back({pos, seq, rstream.str()}); } @@ -198,8 +211,8 @@ std::ostream& operator<<(std::ostream& out, const AlignedContig &ac) { void AlignedContig::setMultiMapBreakPairs() { - if (m_skip) - return; + //if (m_skip) + // return; // if single mapped contig, nothing to do here if (m_frag_v.size() == 1) @@ -208,6 +221,8 @@ void AlignedContig::setMultiMapBreakPairs() { // initialize the breakpoint, fill with basic info BreakPoint bp; bp.seq = getSequence(); + bp.num_align = m_frag_v.size(); + assert(bp.num_align > 0); //bp.num_frag_v = m_frag_v.size(); bp.cname = getContigName(); @@ -371,10 +386,6 @@ void AlignedContig::setMultiMapBreakPairs() { SnowTools::USeqVector v = {{getContigName(), getSequence()}}; bw.constructIndex(v); - // debug - SnowTools::BamWalker walk("/seq/picard_aggregation/G14856/TCGA-05-4432-01A-01D-1931-08/v2/TCGA-05-4432-01A-01D-1931-08.bam"); - walk.setStdout(); - // align the reads for (auto& i : bav) { @@ -389,18 +400,32 @@ void AlignedContig::setMultiMapBreakPairs() { if (brv.size() == 0) continue; - + + bool length_pass = (brv[0].PositionEnd() - brv[0].Position()) >= 40; + bool mapq_pass = brv[0].MapQuality(); + int ins_bases = brv[0].MaxInsertionBases(); + int del_bases = brv[0].MaxDeletionBases(); + // store read2contig alignment info in this read - if ( (brv[0].PositionEnd() - brv[0].Position()) > 80 && brv[0].MapQuality() == 60 && brv[0].MaxInsertionBases() == 0 && brv[0].MaxDeletionBases() == 0) + if ( length_pass && mapq_pass && ins_bases == 0 && del_bases == 0) { if (brv[0].ReverseFlag()) - i.AddIntTag("RC",1); + i.SmartAddTag("RC","1"); + else + i.SmartAddTag("RC","0"); i.SmartAddTag("SL", std::to_string(brv[0].Position())); + i.SmartAddTag("SE", std::to_string(brv[0].PositionEnd())); i.SmartAddTag("TS", std::to_string(brv[0].AlignmentPosition())); + i.SmartAddTag("TE", std::to_string(brv[0].AlignmentEndPosition())); + i.SmartAddTag("SC", brv[0].CigarString()); i.SmartAddTag("CN", getContigName()); m_bamreads.push_back(i); } + else if (ins_bases && mapq_pass && length_pass) + ++insertion_against_contig_read_count; + else if (del_bases && mapq_pass && length_pass) + ++deletion_against_contig_read_count; } @@ -435,7 +460,30 @@ std::string AlignedContig::printDiscordantClusters() const { } */ -AlignmentFragment::AlignmentFragment(const BamRead &talign) { + bool AlignedContig::checkLocal(const GenomicRegion& window) + { + bool has_loc = false; + for (auto& i : m_frag_v) + if (i.checkLocal(window)) + has_loc = true; + return has_loc; + + } + + bool AlignmentFragment::checkLocal(const GenomicRegion& window) + { + // make a regino for this frag + GenomicRegion gfrag(m_align.ChrID(), m_align.Position(), m_align.PositionEnd()); + + if (window.getOverlap(gfrag)) { + local = true; + return true; + } + + return false; + } + + AlignmentFragment::AlignmentFragment(const BamRead &talign) { m_align = talign; @@ -491,8 +539,10 @@ AlignmentFragment::AlignmentFragment(const BamRead &talign) { // parse right away to see if there are indels on this alignment BreakPoint bp; size_t fail_safe_count = 0; - while (parseIndelBreak(bp) && fail_safe_count++ < 100) + while (parseIndelBreak(bp) && fail_safe_count++ < 100) { m_indel_breaks.push_back(bp); + assert(bp.num_align == 1); + } assert(fail_safe_count != 100); @@ -517,10 +567,30 @@ std::ostream& operator<<(std::ostream &out, const AlignmentFragment &c) { else if (j.Type == 'S' || j.Type == 'H') out << std::string(j.Length, '.'); } - + // print contig and genome breaks - out << " C[" << c.break1 << "," << c.break2 << "] G[" << c.gbreak1 << "," << c.gbreak2 << "]"; - + out << "\tC[" << c.break1 << "," << c.break2 << "] G[" << c.gbreak1 << "," << c.gbreak2 << "]"; + + // add local info + out << "\tLocal: " << c.local << "\tAligned to: " << (c.m_align.ChrID()+1) << ":" << c.m_align.Position() << "(" << (c.m_align.ReverseFlag() ? "-" : "+") << ") CIG: " << c.m_align.CigarString() << " MAPQ: " << c.m_align.MapQuality(); + + // print the del if there is one TODO + /* std::string del_string = std::string(c.m_align.Length(), ' '); + int dpos = 0; + bool has_del = false; + for (auto& i : c.m_align.GetCigar()) { + if (i.Type != 'D') + dpos += i.Length; + if (i.Type == 'D' && dpos != 0) { + del_string[dpos-1] = '|'; + del_string[dpos] = '|'; + has_del = true; + } + } + if (has_del) + out << del_string << std::endl; + */ + // print the info /* out << " " << c.m_align.RefID + 1 << ":" << c.align.Position << " MAPQ: " << c.align.MapQuality << " OrientedCigar: " << BamToolsUtils::cigarToString(c.align.CigarData) @@ -535,23 +605,33 @@ std::ostream& operator<<(std::ostream &out, const AlignmentFragment &c) { } + void AlignedContig::checkAgainstCigarMatches(const CigarMap& nmap, const CigarMap& tmap) { + + for (auto& i : m_frag_v) + i.indelCigarMatches(nmap, tmap); + + } + void AlignmentFragment::indelCigarMatches(const CigarMap &nmap, const CigarMap &tmap) { + // loop through the indel breakpoints for (auto& i : m_indel_breaks) { - assert(i.isindel); + assert(i.isindel); // make sure we only call on indels if (i.getSpan() <= 0) { std::cerr << "weird span detected " << i.getSpan(); std::cerr << i << std::endl; std::cerr << *this << std::endl; } - //assert(i.getSpan() > 0); - + + // get the hash string in same formate as cigar map (eg. pos_3D) std::string st = i.getHashString(); + // check if this breakpoint from assembly is in the cigarmap CigarMap::const_iterator ffn = nmap.find(st); CigarMap::const_iterator fft = tmap.find(st); + // if it is, add it if (ffn != nmap.end()) i.ncigar = ffn->second; if (fft != tmap.end()) @@ -589,14 +669,12 @@ bool AlignedContig::isWorse(const AlignedContig &ac) const { } bool AlignmentFragment::parseIndelBreak(BreakPoint &bp) { - - // reject out of hand if not in interval - if (!local) - return false; - - //cout << "parsing indel break" << std::endl; - assert(m_cigar.size()); + // make sure we have a non-zero cigar + if (m_cigar.size() == 0) { + std::cerr << "CIGAR of length 0 on " << *this << std::endl; + return false; + } // reject if it has small matches, could get confused. Fix later for (auto& i : m_cigar) @@ -604,27 +682,11 @@ bool AlignmentFragment::parseIndelBreak(BreakPoint &bp) { return false; // reject if first alignment is I or D - for (auto& i : m_cigar) { - if (i.Type == 'D' || i.Type == 'I') { - std::cerr << "rejcting cigar for starting in I or D" << std::endl; - return false; - } - if (i.Type == 'M') - break; + if (m_cigar[0].Type == 'I' || m_cigar[0].Type == 'D' || m_cigar[m_cigar.size()-1].Type == 'D' || m_cigar[m_cigar.size()-1].Type == 'I') { + std::cerr << "rejcting cigar for starting in I or D" << std::endl; + return false; } - // reject if last alignment is I or D - /* Cigar tmpcig = m_cigar; - //BamToolsUtils::flipCigar(tmpcig); - for (auto& i : tmpcig) { - if (i.Type == 'D' || i.Type == 'I') { - cerr << "rejcting cigar for ending in I or D" << std::endl; - return false; - } - if (i.Type == 'M') - break; - }*/ - // use next available largest D / I size_t loc = 0; // keep track of which cigar field for (auto& i : m_cigar) { @@ -642,21 +704,20 @@ bool AlignmentFragment::parseIndelBreak(BreakPoint &bp) { } } - // we made it to the end, no more indels to report - //if (idx == align.CigarData.size()) - // return false; - // we made it to the end, no more indels to report if (loc == m_cigar.size()) return false; // clear out the old bp just in case - bp.insertion = ""; - bp.homology = ""; - - //bp.seq = m_seq; + bp = BreakPoint(); bp.isindel = true; + // set the number of alignments for this bp + bp.num_align = 1; + + //bp.insertion = ""; + //bp.homology = ""; + int curr = 0; int gcurrlen = -1; @@ -669,8 +730,8 @@ bool AlignmentFragment::parseIndelBreak(BreakPoint &bp) { bp.cpos1 = -1; bp.cpos2 = -1; - //bp.num_frag_v = 1; - //bp.cname = m_name; + + bp.cname = m_align.Qname(); bp.mapq1 = m_align.MapQuality(); bp.mapq2 = m_align.MapQuality(); bp.gr1.strand = '+'; @@ -806,11 +867,22 @@ bool AlignedContig::hasVariant() const { return true; for (auto& i : m_frag_v) - if (i.m_indel_breaks.size()) + if (i.local && i.m_indel_breaks.size()) return true; return false; } + void AlignedContig::addDiscordantCluster(DiscordantClusterMap& dmap) + { + + // loop through the breaks and compare with the map + for (auto& i : m_local_breaks) + i.__combine_with_discordant_cluster(dmap); + m_global_bp.__combine_with_discordant_cluster(dmap); + + } + + } diff --git a/src/BWAWrapper.cpp b/src/BWAWrapper.cpp index 01135c3f1..36ae773d4 100644 --- a/src/BWAWrapper.cpp +++ b/src/BWAWrapper.cpp @@ -81,7 +81,7 @@ namespace SnowTools { mem_alnreg_v ar; ar = mem_align1(memopt, idx->bwt, idx->bns, idx->pac, seq.length(), seq.c_str()); // get all the hits - + #ifdef DEBUG_BWATOOLS std::cout << "num hits: " << ar.n << std::endl; //std::cout << __print_bns() << std::endl; diff --git a/src/BamWalker.cpp b/src/BamWalker.cpp index fc6b22324..5cc67eeae 100644 --- a/src/BamWalker.cpp +++ b/src/BamWalker.cpp @@ -4,6 +4,22 @@ using namespace SnowTools; +struct idx_delete { + void operator()(void* x) { hts_idx_destroy((hts_idx_t*)x); } +}; + +struct hts_itr_delete { + void operator()(void* x) { hts_itr_destroy((hts_itr_t*)x); } +}; + +struct bgzf_delete { + void operator()(void* x) { bgzf_close((BGZF*)x); } +}; + +struct bam_hdr_delete { + void operator()(void* x) { bam_hdr_destroy((bam_hdr_t*)x); } +}; + // set the bam region bool BamWalker::__set_region(const GenomicRegion& gp) { @@ -14,13 +30,10 @@ bool BamWalker::__set_region(const GenomicRegion& gp) { #endif //HTS set region - if (!idx) { - idx = hts_idx_load(m_in.c_str(), HTS_FMT_BAI); -#ifdef DEBUG_WALKER - std::cerr << "loading the index for "<< m_in << " and has value " << idx << std::endl; -#endif - } - hts_itr = sam_itr_queryi(idx, gp.chr, gp.pos1, gp.pos2); + if (!idx) + idx = std::shared_ptr(hts_idx_load(m_in.c_str(), HTS_FMT_BAI), idx_delete()); + hts_itr = std::shared_ptr(sam_itr_queryi(idx.get(), gp.chr, gp.pos1, gp.pos2), hts_itr_delete()); + if (!hts_itr) { std::cerr << "Error: Failed to set region: " << gp << std::endl; return false; @@ -34,21 +47,32 @@ void BamWalker::setBamWalkerRegion(const GenomicRegion& g) m_region.clear(); m_region.push_back(g); m_region_idx = 0; // rewind it - __set_region(g); + __check_regions_blacklist(); // sets m_region + if (m_region.size()) + __set_region(m_region[0]); + else + m_region.push_back(GenomicRegion(-1,-1,-1)); } void BamWalker::setBamWalkerRegions(const GenomicRegionVector& grv) { - assert(grv.size()); + if (grv.size() == 0) + { + std::cerr << "Warning: Trying to set an empty bam region" << std::endl; + return; + } m_region = grv; m_region_idx = 0; // rewind it - //m_region.rewind(); - __set_region(grv[0]); + __check_regions_blacklist(); // sets m_region + if (m_region.size()) + __set_region(m_region[0]); + else + m_region.push_back(GenomicRegion(-1,-1,-1)); } void BamWalker::__close_read_bam() { - if (fp) + /* if (fp) bgzf_close(fp); if (br) bam_hdr_destroy(br); @@ -61,11 +85,12 @@ void BamWalker::__close_read_bam() { br = NULL; hts_itr = NULL; idx = NULL; - + */ } void BamWalker::__close_write_bam() { + if (fop) sam_close(fop); fop = NULL; @@ -101,7 +126,6 @@ BamWalker::BamWalker(const std::string& in) : m_in(in) // open for reading if (!__open_BAM_for_reading()) throw 20; - } bool BamWalker::__open_BAM_for_reading() @@ -110,19 +134,18 @@ bool BamWalker::__open_BAM_for_reading() assert(m_in.length()); // HTS open the reader - const char rflag = 'r'; - fp = bgzf_open(m_in.c_str(), &rflag); + fp = std::shared_ptr(bgzf_open(m_in.c_str(), "r"), bgzf_delete()); if (!fp) { std::cerr << "Error using HTS reader on opening " << m_in << std::endl; exit(EXIT_FAILURE); - } - br = bam_hdr_read(fp); + } + + br = std::shared_ptr(bam_hdr_read(fp.get()), bam_hdr_delete()); if (!br) { std::cerr << "Error using HTS reader on opening " << m_in << std::endl; return false; - //exit(EXIT_FAILURE); } return true; @@ -146,7 +169,7 @@ bool BamWalker::__open_BAM_for_writing() // hts write the header if (m_print_header) - sam_hdr_write(fop, br); + sam_hdr_write(fop, br.get()); return true; @@ -172,7 +195,7 @@ void BamWalker::SetMiniRulesCollection(const std::string& rules) { // construct the minirules - m_mr = MiniRulesCollection(rules, br); + m_mr = MiniRulesCollection(rules, br.get()); // check that it worked if (!m_mr.size()) { @@ -201,7 +224,7 @@ bool BamWalker::GetNextRead(BamRead& r, bool& rule) bam1_t* b = bam_init1(); if (hts_itr == 0) { - if (bam_read1(fp, b) < 0) { + if (bam_read1(fp.get(), b) < 0) { #ifdef DEBUG_WALKER std::cerr << "ended reading on null hts_itr" << std::endl; @@ -213,12 +236,9 @@ bool BamWalker::GetNextRead(BamRead& r, bool& rule) int32_t valid; if (hts_itr == 0) { - //std::cerr << "debug snowtools intialized read FROM WHOLE " << std::endl; - valid = bam_read1(fp, b); + valid = bam_read1(fp.get(), b); } else { - //std::cerr << "debug snowtools intialized read FROM HTS_ITR_NEXT. fp " << fp << " hts_itr " << hts_itr << " b " << b << std::endl; - valid = hts_itr_next(fp, hts_itr, b, dum); - //std::cerr << "debug snowtools DONE intialized read FROM HTS_ITR_NEXT" << std::endl; + valid = hts_itr_next(fp.get(), hts_itr.get(), b, dum); } if (valid <= 0) { // read not found @@ -239,7 +259,7 @@ bool BamWalker::GetNextRead(BamRead& r, bool& rule) } // next region exists, try it __set_region(m_region[m_region_idx]); - valid = hts_itr_next(fp, hts_itr, b, dum); + valid = hts_itr_next(fp.get(), hts_itr.get(), b, dum); } while (valid <= 0); // keep trying regions until works } @@ -248,6 +268,17 @@ bool BamWalker::GetNextRead(BamRead& r, bool& rule) // check if it passed the rules rule = m_mr.isValid(r); + ++m_num_reads_seen; + if (rule) + ++m_num_reads_kept; + + // hit the limit, no more raeding + if (m_limit >= 0 && m_num_reads_seen > m_limit) + return false; + if (m_keep_limit >= 0 && m_num_reads_kept > m_keep_limit) + return false; + + return true; } @@ -260,7 +291,7 @@ void BamWalker::WriteAlignment(BamRead &r) for (auto& i : m_tag_list) r.RemoveTag(i.c_str()); - sam_write1(fop, br, r.raw()); + sam_write1(fop, br.get(), r.raw()); } std::ostream& SnowTools::operator<<(std::ostream& out, const BamWalker& b) @@ -306,3 +337,33 @@ std::string BamWalker::displayMiniRulesCollection() const ss << m_mr; return ss.str(); } + +void BamWalker::__check_regions_blacklist() +{ + + //std::cerr << "BLACKLIST size " << blacklist.size() << std::endl; + + // check if it overlaps with blacklist + if (blacklist.size()) + { + GRC regs; + for (auto& i : m_region) + regs.add(i); + //std::cerr << "regs " << std::endl; + //for (auto& i : regs) + //std::cerr << "REG " << i << std::endl; + GRC out = regs.complement(blacklist); + //for (auto& i : blacklist) + //std::cerr << "BL: " << i << std::endl; + m_region = out.asGenomicRegionVector(); + //for (auto& i : m_region) + //std::cerr << "TRIMMED REGION " << i << std::endl; + } + +} + +void BamWalker::addBlacklist(GRC& bl) +{ + blacklist = bl; +} + diff --git a/src/BreakPoint.cpp b/src/BreakPoint.cpp index 689ee8838..e7e455000 100644 --- a/src/BreakPoint.cpp +++ b/src/BreakPoint.cpp @@ -3,9 +3,10 @@ #include #include "SnowTools/gzstream.h" +//#include "SnowTools/HTSTools.h" //#include "SnowTools/SnowToolsCommon.h" -#define SPLIT_BUFF 12 +#define SPLIT_BUFF 8 namespace opt { @@ -105,7 +106,7 @@ namespace SnowTools { // set the evidence string bool isdisc = (dc.tcount + dc.ncount) != 0; //bool issplit = (tsplit + nsplit) != 0; - bool issplit = dc.contig != ""; + bool issplit = (tsplit + nsplit) != 0;//dc.m_contig != ""; //if (discovar) // evidence = "DSCVR"; if (num_align == 1) @@ -118,6 +119,9 @@ namespace SnowTools { evidence = "ASSMB"; else if (num_align > 2) evidence = "COMPL"; + + if (!evidence.length()) + std::cerr << "num_align " << num_align << " isdisc " << isdisc << " issplit " << issplit << std::endl; assert(evidence.length()); confidence = ""; @@ -270,26 +274,29 @@ namespace SnowTools { } // make a breakpoint from a discordant cluster - BreakPoint::BreakPoint(DiscordantCluster tdc) { + BreakPoint::BreakPoint(const DiscordantCluster& tdc) { + num_align = 0; dc = tdc; - gr1.pos1 = (tdc.reg1.strand) ? tdc.reg1.pos2 : tdc.reg1.pos1; + + gr1.pos1 = (tdc.m_reg1.strand) ? tdc.m_reg1.pos2 : tdc.m_reg1.pos1; gr1.pos2 = gr1.pos1; - gr2.pos1 = (tdc.reg2.strand) ? tdc.reg2.pos2 : tdc.reg2.pos1; + gr2.pos1 = (tdc.m_reg2.strand) ? tdc.m_reg2.pos2 : tdc.m_reg2.pos1; gr2.pos2 = gr2.pos1; - gr1.chr = tdc.reg1.chr; - gr2.chr = tdc.reg2.chr; - cname = tdc.cluster; - gr1.strand = tdc.reg1.strand; - gr2.strand = tdc.reg2.strand; + gr1.chr = tdc.m_reg1.chr; + gr2.chr = tdc.m_reg2.chr; + gr1.strand = tdc.m_reg1.strand; + gr2.strand = tdc.m_reg2.strand; - mapq1 = tdc.getMeanMapq(false); - mapq2 = tdc.getMeanMapq(true); // mate + mapq1 = tdc.getMeanMapq(false); + mapq2 = tdc.getMeanMapq(true); // mate + cname = tdc.toRegionString(); + } bool BreakPoint::hasDiscordant() const { - return !dc.reg1.isEmpty(); + return !dc.m_reg1.isEmpty(); } @@ -309,7 +316,7 @@ namespace SnowTools { bool BreakPoint::operator==(const BreakPoint &bp) const { - return (gr1.chr == bp.gr1.chr && gr1.pos1 == bp.gr1.pos1 && gr2.pos1 == bp.gr2.pos1); + return (gr1 == bp.gr1 && gr2 == bp.gr2); //gr1.chr == bp.gr1.chr && gr1.pos1 == bp.gr1.pos1 && gr2.pos1 == bp.gr2.pos1); } @@ -393,7 +400,17 @@ namespace SnowTools { if (bp.gr1.chr < 24) { GenomicRegion gr = bp.gr1; gr.pad(20); - std::string seqr = getRefSequence(gr, findex); + + // get the reference seqeuence for this piece + int len; + std::string chrstring = GenomicRegion::chrToString(gr.chr); + char * seq = faidx_fetch_seq(findex, const_cast(chrstring.c_str()), gr.pos1-1, gr.pos2-1, &len); + if (!seq) { + std::cerr << "faidx_fetch_seq fail" << std::endl; + } + std::string seqr = std::string(seq); + + // define repeats std::vector repr = {"AAAAAAAA", "TTTTTTTT", "CCCCCCCC", "GGGGGGGG", "TATATATATATATATA", "ATATATATATATATAT", "GCGCGCGCGCGCGCGC", "CGCGCGCGCGCGCGCG", @@ -536,7 +553,6 @@ namespace SnowTools { for (auto& j : bav) { std::string sr = j.GetZTag("SR"); - //r_get_Z_tag(j, "SR", sr); assert(sr.at(0) == 't' || sr.at(0) == 'n'); bool tumor_read = (sr.at(0) == 't'); @@ -545,16 +561,20 @@ namespace SnowTools { //r_get_trimmed_seq(j, seq); assert(seq.length() > 0); - std::string qname = j.GetZTag("CN"); //GetStringTag(j, "CN").back(); - int pos = j.GetSmartIntTag("AL").back(); + std::string qname = j.GetSmartStringTag("CN").back(); //GetStringTag(j, "CN").back(); + int pos = j.GetSmartIntTag("SL").back(); + int te = 0; + try { + te = j.GetSmartIntTag("SE").back(); + } catch (...) { + std::cerr << "error grabbing SE tag for tag " << j.GetZTag("SE") << std::endl; + } if (qname != cname) std::cerr << "qname " << qname << "cname " << cname << std::endl; - assert(qname == cname); - if (qname == cname) { // make sure we're comparing the right alignment - int rightend = pos + seq.length(); + int rightend = te; //seq.length(); int leftend = pos; bool issplit1 = (leftend <= leftbreak1) && (rightend >= rightbreak1); bool issplit2 = (leftend <= leftbreak2) && (rightend >= rightbreak2); @@ -711,7 +731,7 @@ namespace SnowTools { << " T/N split: " << tsplit << "/" << nsplit << " T/N cigar: " << tcigar << "/" << ncigar << " T/N AF " << af_t << "/" << af_n; else - ss << ">>>> Somatic STRUCTURAL VAR at " << gr1 << " to " << gr2 << " contig " << cname + ss << ">>>> Somatic STRUCTURAL VAR at " << gr1.pointString() << " to " << gr2.pointString() << " SPAN " << getSpan() << " contig " << cname << " T/N split: " << tsplit << "/" << nsplit << " T/N discordant: " << dc.tcount << "/" << dc.ncount << " evidence " << evidence; @@ -729,4 +749,44 @@ namespace SnowTools { blacklist = true; } + + void BreakPoint::__combine_with_discordant_cluster(DiscordantClusterMap& dmap) + { + int PAD = 400; + GenomicRegion bp1 = gr1; + GenomicRegion bp2 = gr2; + bp1.pad(PAD); + bp2.pad(PAD); + + + for (auto& d : dmap) + { + bool bp1reg1 = bp1.getOverlap(d.second.m_reg1) > 0; + bool bp2reg2 = bp2.getOverlap(d.second.m_reg2) > 0; + + //debug + bool bp1reg2 = bp1.getOverlap(d.second.m_reg2) != 0; + bool bp2reg1 = bp2.getOverlap(d.second.m_reg1) != 0; + + bool pass = bp1reg1 && bp2reg2; + + //debug + if (cname=="c_1_6524299_6527299_3") + std::cerr << " HERE " << d.first << " " << d.second << " pass " << pass << std::endl; + + if (pass) + { + // check that we haven't already added a cluster to this breakpoint + // if so, chose the one with more tumor support + if (dc.isEmpty()) { + dc = d.second; + } else if (dc.tcount < d.second.tcount) { + dc = d.second; + } + + } + } + + } + } diff --git a/src/DiscordantCluster.cpp b/src/DiscordantCluster.cpp index 3e8b14e39..1007e736e 100644 --- a/src/DiscordantCluster.cpp +++ b/src/DiscordantCluster.cpp @@ -1,6 +1,10 @@ #include "SnowTools/DiscordantCluster.h" #include +#include + +#define DISC_PAD 400 +#define MIN_PER_CLUST 2 namespace SnowTools { @@ -10,8 +14,69 @@ namespace SnowTools { qnames.insert(std::pair(name, true)); return; } + + DiscordantClusterMap DiscordantCluster::clusterReads(const BamReadVector& bav, const GenomicRegion& interval) { + + // remove any reads that are not present twice or have sufficient isize + std::unordered_map tmp_map; + for (auto& i : bav) { + std::string tt = i.Qname(); + if (tmp_map.count(tt) ==0) + tmp_map[tt] = 1; + else + tmp_map[tt]++; + } + + BamReadVector bav_dd; + for (auto& r : bav) { + + // is the read even discordant? + bool disc_r = (abs(r.InsertSize()) >= 600) || (r.MateChrID() != r.ChrID()); + if (tmp_map[r.Qname()] == 2 && disc_r) + bav_dd.push_back(r); + + } + + // sort by position + std::sort(bav_dd.begin(), bav_dd.end(), BamReadSort::ByReadPosition()); + + // clear the tmp map. Now we want to use it to store if we already clustered read + tmp_map.clear(); + + BamReadClusterVector fwd, rev, fwdfwd, revrev, fwdrev, revfwd; + std::pair fwd_info, rev_info; // refid, pos + fwd_info = {-1,-1}; + rev_info = {-1,-1}; + + // make the fwd and reverse READ clusters. dont consider mate yet + __cluster_reads(bav_dd, fwd, rev); + + // within the forward read clusters, cluster mates on fwd and rev + __cluster_mate_reads(fwd, fwdfwd, fwdrev); + + // within the reverse read clusters, cluster mates on fwd and rev + __cluster_mate_reads(rev, revfwd, revrev); + + // we have the reads in their clusters. Just convert to discordant reads clusters + DiscordantClusterMap dd; + __convertToDiscordantCluster(dd, fwdfwd, bav_dd); + __convertToDiscordantCluster(dd, fwdrev, bav_dd); + __convertToDiscordantCluster(dd, revfwd, bav_dd); + __convertToDiscordantCluster(dd, revrev, bav_dd); + + // remove clusters that dont overlap with the window + DiscordantClusterMap dd_clean; + for (auto& i : dd) { + if (i.second.m_reg1.getOverlap(interval) > 0 || i.second.m_reg2.getOverlap(interval)) + dd_clean[i.first] = i.second; + } + + return dd_clean; + + } - DiscordantCluster::DiscordantCluster(BamReadVector &this_reads, BamReadVector &all_reads) { + // this reads is reads in the cluster. all_reads is big pile where all the clusters came from + DiscordantCluster::DiscordantCluster(const BamReadVector& this_reads, const BamReadVector& all_reads) { if (this_reads.size() == 0) return; @@ -22,76 +87,68 @@ namespace SnowTools { bool rev = this_reads[0].ReverseFlag(); bool mrev = this_reads[0].MateReverseFlag(); - //bool rev = r_is_rstrand(this_reads[0]); //this_reads[0]->IsReverseStrand(); - //bool mrev = r_is_mrstrand(this_reads[0]); //this_reads[0]->IsMateReverseStrand(); + // the ID is just the first reads Qname + m_id = this_reads[0].Qname(); + assert(m_id.length()); - //id = r_qname(this_reads[0]); //this_reads[0]->Name; - id = this_reads[0].Qname(); - - //cout << "rev " << rev << " mrev " << mrev << endl; for (auto& i : this_reads) { - //cout << "flag " << r_flag(i) << " frev " << r_is_rstrand(i) << " mrev " << r_is_mrstrand(i) << endl; - //assert(rev == r_is_rstrand(i) && mrev == r_is_mrstrand(i)); //i->IsReverseStrand() && mrev == i->IsMateReverseStrand()); + // double check that we did the clustering correctly. All read orientations should be same + assert(rev == i.ReverseFlag() && mrev == i.MateReverseFlag()); + + // add the read to the read map std::string tmp = i.GetZTag("SR"); - //r_get_Z_tag(i, "SR", tmp); - - assert(id.length()); - i.AddZTag("DC",id); - //r_add_Z_tag(i, "DC", id); - assert(tmp.length()); reads[tmp] = i; + + // set the qname map std::string qn = i.Qname(); qnames[qn] = true; - if (qn < id) - id = qn; + + // the ID is the lexographically lowest qname + if (qn < m_id) + m_id = qn; if (tmp.at(0) == 't') ++tcount; else ++ncount; } + // loop through the big stack of reads and find the mates addMateReads(all_reads); assert(reads.size() == mates.size()); assert(reads.size() > 0); - // set the mapq - reads_mapq = getMeanMapq(false); - mates_mapq = getMeanMapq(true); - // set the regions - reg1 = SnowTools::GenomicRegion(-1,500000000,-1); // read region + m_reg1 = SnowTools::GenomicRegion(-1,500000000,-1); // read region for (auto& i : reads) { - reg1.strand = i.second.ReverseFlag() ? '-' : '+'; //r_strand(i.second) == '+'; //(!i.second->IsReverseStrand()) ? '+' : '-'; - reg1.chr = i.second.ChrID(); //r_id(i.second); //i.second->RefID; - if (i.second.Position() < reg1.pos1) - reg1.pos1 = i.second.Position(); //r_pos(i.second); //i.second->Position; + m_reg1.strand = i.second.ReverseFlag() ? '-' : '+'; //r_strand(i.second) == '+'; //(!i.second->IsReverseStrand()) ? '+' : '-'; + m_reg1.chr = i.second.ChrID(); //r_id(i.second); //i.second->RefID; + if (i.second.Position() < m_reg1.pos1) + m_reg1.pos1 = i.second.Position(); //r_pos(i.second); //i.second->Position; int endpos = i.second.PositionEnd(); //r_endpos(i.second); - if (endpos > reg1.pos2) - reg1.pos2 = endpos; + if (endpos > m_reg1.pos2) + m_reg1.pos2 = endpos; } - reg2 = SnowTools::GenomicRegion(-1,500000000,-1); // mate region + m_reg2 = SnowTools::GenomicRegion(-1,500000000,-1); // mate region for (auto& i : mates) { - reg2.strand = i.second.ReverseFlag() ? '-' : '+'; //r_strand(i.second) == '+'; //(!i.second->IsReverseStrand()) ? '+' : '-'; - reg2.chr = i.second.ChrID(); //i.second->RefID; - if (i.second.Position() < reg2.pos1) - reg2.pos1 = i.second.Position(); + m_reg2.strand = i.second.ReverseFlag() ? '-' : '+'; //r_strand(i.second) == '+'; //(!i.second->IsReverseStrand()) ? '+' : '-'; + m_reg2.chr = i.second.ChrID(); //i.second->RefID; + if (i.second.Position() < m_reg2.pos1) + m_reg2.pos1 = i.second.Position(); int endpos = i.second.PositionEnd(); - if (endpos > reg2.pos2) - reg2.pos2 = endpos; + if (endpos > m_reg2.pos2) + m_reg2.pos2 = endpos; } - cluster = toRegionString(); - } - void DiscordantCluster::addMateReads(BamReadVector &bav) + void DiscordantCluster::addMateReads(const BamReadVector& bav) { for (auto& i : bav) { @@ -99,9 +156,6 @@ namespace SnowTools { if (qnames.count(i.Qname())) { std::string tmp = i.GetZTag("SR"); - i.AddZTag("DC", id); - //r_get_Z_tag(i, "SR", tmp); - //r_add_Z_tag(i, "DC", id); if (reads.count(tmp) == 0) // only add if this is a mate read mates[tmp] = i; } @@ -142,14 +196,12 @@ namespace SnowTools { std::string DiscordantCluster::toRegionString() const { - //int pos1 = (reg1.strand == '+') ? reg1.pos2 : reg1.pos1; - //int pos2 = (reg2.strand == '+') ? reg2.pos2 : reg2.pos1; - int pos1 = reg1.strand ? reg1.pos2 : reg1.pos1; - int pos2 = reg2.strand ? reg2.pos2 : reg2.pos1; + int pos1 = (m_reg1.strand == '+') ? m_reg1.pos2 : m_reg1.pos1; + int pos2 = (m_reg2.strand == '+') ? m_reg2.pos2 : m_reg2.pos1; std::stringstream ss; - ss << reg1.chr+1 << ":" << pos1 << "(" << reg1.strand << ")" << "-" << - reg2.chr+1 << ":" << pos2 << "(" << reg2.strand << ")"; + ss << m_reg1.chr+1 << ":" << pos1 << "(" << m_reg1.strand << ")" << "-" << + m_reg2.chr+1 << ":" << pos2 << "(" << m_reg2.strand << ")"; return ss.str(); } @@ -160,7 +212,7 @@ namespace SnowTools { out << dc.toRegionString() << " Tcount: " << dc.tcount << " Ncount: " << dc.ncount << " Mean MAPQ: " - << dc.reads_mapq << " Mean Mate MAPQ: " << dc.mates_mapq; + << dc.getMeanMapq(false) << " Mean Mate MAPQ: " << dc.getMeanMapq(true); /*for (auto& i : dc.reads) { std::string tmp; i.second->GetTag("SR",tmp); @@ -183,9 +235,6 @@ namespace SnowTools { for (auto& i : reads) { std::string tmp = i.second.GetZTag("SR"); - //r_get_Z_tag(i.second, "SR", tmp); - - //i.second->GetTag("SR", tmp); reads_string += tmp + ","; } @@ -197,10 +246,10 @@ namespace SnowTools { } std::stringstream out; - out << reg1.chr+1 << sep << reg1.pos1 << sep << reg1.strand << sep - << reg2.chr+1 << sep << reg2.pos1 << sep << reg2.strand << sep - << tcount << sep << ncount << sep << reads_mapq << sep - << mates_mapq << sep << reads_string; + out << m_reg1.chr+1 << sep << m_reg1.pos1 << sep << m_reg1.strand << sep + << m_reg2.chr+1 << sep << m_reg2.pos1 << sep << m_reg2.strand << sep + << tcount << sep << ncount << sep << getMeanMapq(false) << sep + << getMeanMapq(true) << sep << reads_string; return (out.str()); @@ -209,12 +258,133 @@ namespace SnowTools { // define how to sort theses bool DiscordantCluster::operator<(const DiscordantCluster &b) const { - if (reg1.chr < b.reg1.chr) + if (m_reg1.chr < b.m_reg1.chr) return true; - if (reg1.pos1 < b.reg1.pos1) + if (m_reg1.pos1 < b.m_reg1.pos1) return true; return false; } + + /** + * Cluster reads by alignment position + * + * Checks whether a read belongs to a cluster. If so, adds it. If not, ends + * and stores cluster, adds a new one. + * + * @param cvec Stores the vector of clusters, which themselves are vectors of read pointers + * @param clust The current cluster that is being added to + * @param a Read to add to cluster + * @param mate Flag to specify if we should cluster on mate position instead of read position + * @return Description of the return value + */ + bool DiscordantCluster::__add_read_to_cluster(BamReadClusterVector &cvec, BamReadVector &clust, const BamRead &a, bool mate) { + + // get the position of the previous read. If none, we're starting a new one so make a dummy + std::pair last_info; + if (clust.size() == 0) + last_info = {-1, -1}; + else if (mate) + last_info = {clust.back().MateChrID(), clust.back().MatePosition()}; + else + last_info = {clust.back().ChrID(), clust.back().Position()}; + + // get the position of the current read + std::pair this_info; + if (mate) + this_info = {a.MateChrID(), a.MatePosition()}; + else + this_info = {a.ChrID(), a.Position()}; + + // check if this read is close enough to the last + if ( (this_info.first == last_info.first) && (this_info.second - last_info.second) <= DISC_PAD ) { + // read belongs in current cluster, so add + clust.push_back(a); + last_info = this_info; + return true; + + // read does not belong to cluster. close this cluster and add to cvec + } else { + + // if enough supporting reads, add as a cluster + if (clust.size() >= MIN_PER_CLUST) + cvec.push_back(clust); + + // clear this cluster and start a new one + clust.clear(); + clust.push_back(a); + + return false; + } + } + + void DiscordantCluster::__cluster_mate_reads(BamReadClusterVector& brcv, BamReadClusterVector& fwd, BamReadClusterVector& rev) + { + // loop through the clusters, and cluster within clusters based on mate read + for (auto& v : brcv) + { + BamReadVector this_fwd, this_rev; + std::sort(v.begin(), v.end(), BamReadSort::ByMatePosition()); + for (auto& r : v) + { + // forward clustering + if (!r.MateReverseFlag()) + __add_read_to_cluster(fwd, this_fwd, r, true); + // reverse clustering + else + __add_read_to_cluster(rev, this_rev, r, true); + + } + // finish the last clusters + if (this_fwd.size() > 0) + fwd.push_back(this_fwd); + if (this_rev.size() > 0) + rev.push_back(this_rev); + } // finish main cluster loop + } + + void DiscordantCluster::__cluster_reads(const BamReadVector& brv, BamReadClusterVector& fwd, BamReadClusterVector& rev) + { + + // hold the current cluster + BamReadVector this_fwd, this_rev; + + std::set tmp_set; + + // cluster in the READ direction, separately for fwd and rev + for (auto& i : brv) { + std::string qq = i.Qname(); + + // only cluster if not seen before (e.g. left-most is READ, right most is MATE) + if (i.PairMappedFlag() && tmp_set.count(qq) == 0) { + + tmp_set.insert(qq); + + // forward clustering + if (!i.ReverseFlag()) + __add_read_to_cluster(fwd, this_fwd, i, false); + // reverse clustering + else + __add_read_to_cluster(rev, this_rev, i, false); + } + } + // finish the last clusters + if (this_fwd.size() > 0) + fwd.push_back(this_fwd); + if (this_rev.size() > 0) + rev.push_back(this_rev); + + } + + void DiscordantCluster::__convertToDiscordantCluster(DiscordantClusterMap &dd, const BamReadClusterVector& cvec, const BamReadVector& bav) { + + for (auto& v : cvec) { + if (v.size() > 1) { + DiscordantCluster d(v, bav); /// slow but works + dd[d.m_id] = d; + } + } + } + } diff --git a/src/GenomicRegion.cpp b/src/GenomicRegion.cpp index 5507f09dc..e44c349dd 100644 --- a/src/GenomicRegion.cpp +++ b/src/GenomicRegion.cpp @@ -92,6 +92,12 @@ std::string GenomicRegion::toString() const { return out.str(); } + std::string GenomicRegion::pointString() const { + std::stringstream out; + out << chrToString(chr) << ":" << SnowTools::AddCommas(pos1) << "(" << strand << ")"; + return out.str(); + } + void GenomicRegion::pad(int32_t pad) { if (pad > pos1) pos1 = 1; @@ -213,7 +219,7 @@ std::string GenomicRegion::chrToString(int32_t ref) { // checks whether a GenomicRegion is empty bool GenomicRegion::isEmpty() const { - return chr == 0 && pos1 == 0 && pos2 == 0; + return chr == -1 && pos1 == 0 && pos2 == 0; } @@ -298,9 +304,9 @@ void GenomicRegion::random() { break; } - if (!found) - std::cerr << "GenomicRegion: error, could not find matching chr in header for chr string " << tchr << std::endl; + //debug turn this back on + //if (!found) + // std::cerr << "GenomicRegion: error, could not find matching chr in header for chr string " << tchr << std::endl; } - } diff --git a/src/GenomicRegionCollection.cpp b/src/GenomicRegionCollection.cpp index f6af8c337..52f3232e3 100644 --- a/src/GenomicRegionCollection.cpp +++ b/src/GenomicRegionCollection.cpp @@ -4,6 +4,9 @@ #include #include #include "SnowTools/gzstream.h" +#include + +#include //#define DEBUG_OVERLAPS 1 @@ -259,8 +262,8 @@ GenomicRegionCollection::GenomicRegionCollection(int width, int ovlp, const T // undefined otherwise assert(width > ovlp); - uint32_t start = gr.pos1; - uint32_t end = gr.pos1 + width; + int32_t start = gr.pos1; + int32_t end = gr.pos1 + width; // region is smaller than width if ( end >= gr.pos2 ) { @@ -347,7 +350,7 @@ const T& GenomicRegionCollection::at(size_t i) const } template -GenomicRegionCollection GenomicRegionCollection::findOverlaps(GenomicRegionCollection& subject, std::vector& query_id, std::vector& subject_id, bool ignore_strand) +GenomicRegionCollection GenomicRegionCollection::findOverlaps(GenomicRegionCollection& subject, std::vector& query_id, std::vector& subject_id, bool ignore_strand) { GenomicRegionCollection output; @@ -355,13 +358,10 @@ GenomicRegionCollection GenomicRegionCollection::findOverlaps( // loop through the query GRanges (this) and overlap with subject for (size_t i = 0; i < m_grv.size(); ++i) { - - //debug - if (m_grv[i].chr != 0) - continue; + // which chr (if any) are common between query and subject + GenomicIntervalTreeMap::iterator ff = subject.m_tree.find(m_grv[i].chr); GenomicIntervalVector giv; - GenomicIntervalTreeMap::iterator ff = subject.m_tree.find(m_grv[i].chr); #ifdef DEBUG_OVERLAPS std::cerr << "TRYING OVERLAP ON QUERY " << m_grv[i] << std::endl; @@ -373,7 +373,6 @@ GenomicRegionCollection GenomicRegionCollection::findOverlaps( ff->second.findOverlapping(m_grv[i].pos1, m_grv[i].pos2, giv); #ifdef DEBUG_OVERLAPS - //debug std::cerr << "ff->second.intervals.size() " << ff->second.intervals.size() << std::endl; for (auto& k : ff->second.intervals) std::cerr << " intervals " << k.start << " to " << k.stop << " value " << k.value << std::endl; @@ -388,7 +387,7 @@ GenomicRegionCollection GenomicRegionCollection::findOverlaps( #ifdef DEBUG_OVERLAPS std::cerr << "find overlaps hit " << j.start << " " << j.stop << " -- " << j.value << std::endl; #endif - output.add(T(m_grv[i].chr, std::max(j.start, m_grv[i].pos1), std::min(j.stop, m_grv[i].pos2))); + output.add(T(m_grv[i].chr, std::max(static_cast(j.start), m_grv[i].pos1), std::min(static_cast(j.stop), m_grv[i].pos2))); } } } @@ -412,6 +411,56 @@ GRC GenomicRegionCollection::intersection(GRC& subject, bool ignore_strand /* return out; } +template +GRC GenomicRegionCollection::complement(GRC& subject, bool ignore_strand) +{ + + GRC out; + // get this - that + + using namespace boost::icl; + typedef interval_set TIntervalSet; + + // flatten each + subject.mergeOverlappingIntervals(); + this->mergeOverlappingIntervals(); + + // get list of THAT chr to go through + std::set chr; + for (auto& i : subject) + if (!chr.count(i.chr)) + chr.insert(i.chr); + + // the THIS events that have chr not in THAT, automatically get added + for (auto& i : m_grv) + if (!chr.count(i.chr)) + out.add(i); + + // loop through the chromosomes of THAT to subtract + for (auto& c : chr) + { + TIntervalSet intSetA; + TIntervalSet intSetB; + + // build up the intervals for THIS + for (auto& i : m_grv) + if (i.chr == c) + intSetA += discrete_interval::closed(i.pos1, i.pos2); + // build up the intervals for THAT + for (auto& i : subject) + if (i.chr == c) + intSetB += discrete_interval::closed(i.pos1, i.pos2); + + TIntervalSet intSetD = intSetA - intSetB; + + for (auto& i : intSetD) + out.add(GenomicRegion(c, first(i), upper(i))); + } + + return out; + +} + template void GenomicRegionCollection::pad(int v) { diff --git a/src/HTSTools.cpp b/src/HTSTools.cpp deleted file mode 100644 index 0b5e965ad..000000000 --- a/src/HTSTools.cpp +++ /dev/null @@ -1,138 +0,0 @@ -#include "SnowTools/HTSTools.h" -#include -#include -#include - -namespace SnowTools { - -// Trim the sequence by removing low quality bases from either end -int32_t qualityTrimRead(int qualTrim, int32_t &startpoint, Read &r) { - - int endpoint = -1; //seq.length(); - startpoint = 0; - int i = 0; - - uint8_t * qual = bam_get_qual(r.get()); - - // get the start point (loop forward) - while(i < r->core.l_qseq) { - int ps = qual[i]; - if (ps >= qualTrim) { - startpoint = i; - break; - } - i++; - } - - // get the end point (loop backwards) - i = r->core.l_qseq - 1; //seq.length() - 1; - while(i >= 0) { - - int ps = qual[i]; - - if (ps >= qualTrim) { //ps >= qualTrim) { - endpoint = i + 1; // endpoint is one past edge - break; - } - i--; - } - - // check that they aren't all bad - if (startpoint == 0 && endpoint == -1) - return 0; - - - return (endpoint - startpoint); - -} - -void removeAllTags(Read& a) -{ - size_t keep = (a->core.n_cigar<<2) + a->core.l_qname + ((a->core.l_qseq + 1)>>1) + a->core.l_qseq; - a->data = (uint8_t*)realloc(a->data, keep); // free the end, which has aux data - a->l_data = keep; -} - -// get a string tag that might be separted by "x" -std::vector GetStringTag(const Read& a, const std::string tag) { - - std::vector out; - std::string tmp; - - r_get_Z_tag(a, tag.c_str(), tmp); - assert(tmp.length()); - - if (tmp.find("x") != std::string::npos) { - std::istringstream iss(tmp); - std::string line; - while (std::getline(iss, line, 'x')) { - out.push_back(line); - } - } else { - out.push_back(tmp); - } - - assert(out.size()); - return out; - -} - -// add a tag that might already be there, separete by 'x' -void SmartAddTag(Read &a, const std::string tag, const std::string val) { - - std::string tmp; - r_get_Z_tag(a, tag.c_str(), tmp); - - if (tmp.length()) { - tmp += "x" + val; - r_remove_tag(a, tag.c_str()); - r_add_Z_tag(a, tag.c_str(), tmp); - } else { // normal with no x - r_add_Z_tag(a, tag.c_str(), val); - } - -} - -// get an integer tag that might be separted by "x" -std::vector GetIntTag(const Read& a, const std::string tag) { - - std::vector out; - std::string tmp; - - r_get_Z_tag(a, tag.c_str(), tmp); - assert(tmp.length()); - - if (tmp.find("x") != std::string::npos) { - std::istringstream iss(tmp); - std::string line; - while (std::getline(iss, line, 'x')) { - try { out.push_back(stoi(line)); } catch (...) { std::cerr << "Failed to read parsed int tag " << tag << " for value " << tmp << " with line " << line << std::endl; std::exit(EXIT_FAILURE); } - } - } else { - try { out.push_back(stoi(tmp)); } catch (...) { std::cerr << "Failed to read int tag " << tag << " for value " << tmp << std::endl; std::exit(EXIT_FAILURE); } - } - - assert(out.size()); - return out; - -} - -// return a sequence from the reference -std::string getRefSequence(const GenomicRegion &gr, faidx_t * fi) { - - int len; - std::string chrstring = GenomicRegion::chrToString(gr.chr); - char * seq = faidx_fetch_seq(fi, const_cast(chrstring.c_str()), gr.pos1-1, gr.pos2-1, &len); - - if (seq) { - return std::string(seq); - } else { - //cout << "Failed to get reference sequence at " << gr << endl; - return "LOAD_FAIL"; - } - -} - - -} - diff --git a/src/Makefile b/src/Makefile index ed4c421ef..fb99bc31d 100644 --- a/src/Makefile +++ b/src/Makefile @@ -62,7 +62,6 @@ am_libsnowtools_a_OBJECTS = libsnowtools_a-GenomicRegion.$(OBJEXT) \ libsnowtools_a-gzstream.$(OBJEXT) \ libsnowtools_a-SnowUtils.$(OBJEXT) \ libsnowtools_a-gChain.$(OBJEXT) \ - libsnowtools_a-HTSTools.$(OBJEXT) \ libsnowtools_a-BamRead.$(OBJEXT) \ libsnowtools_a-gzstream.$(OBJEXT) \ libsnowtools_a-BWAWrapper.$(OBJEXT) \ @@ -124,7 +123,7 @@ ACLOCAL = ${SHELL} /xchip/gistic/Jeremiah/GIT/SnowTools/src/missing --run acloca AMDEP_FALSE = # AMDEP_TRUE = AMTAR = ${SHELL} /xchip/gistic/Jeremiah/GIT/SnowTools/src/missing --run tar -AM_CXXFLAGS = -g -Wall -Wextra -Wno-unknown-pragmas -std=c++11 +AM_CXXFLAGS = -g -Wno-unknown-pragmas -std=c++11 AUTOCONF = ${SHELL} /xchip/gistic/Jeremiah/GIT/SnowTools/src/missing --run autoconf AUTOHEADER = ${SHELL} /xchip/gistic/Jeremiah/GIT/SnowTools/src/missing --run autoheader AUTOMAKE = ${SHELL} /xchip/gistic/Jeremiah/GIT/SnowTools/src/missing --run automake-1.9 @@ -132,7 +131,7 @@ AWK = gawk CC = gcc CCDEPMODE = depmode=gcc3 CFLAGS = -O3 -CPPFLAGS = +CPPFLAGS = -I/xchip/gistic/Jeremiah/boost_1_58_0 CXX = g++ CXXCPP = g++ -E CXXDEPMODE = depmode=gcc3 @@ -201,7 +200,7 @@ sharedstatedir = ${prefix}/com sysconfdir = ${prefix}/etc target_alias = AUTOMAKE_OPTIONS = foreign -SUBDIRS = bwa htslib +SUBDIRS = bwa htslib noinst_LIBRARIES = libsnowtools.a libsnowtools_a_CPPFLAGS = \ -I$(top_srcdir)/htslib \ @@ -210,7 +209,7 @@ libsnowtools_a_CPPFLAGS = \ libsnowtools_a_LIBADD = $(top_srcdir)/bwa/libbwa.a $(top_srcdir)/htslib/libhts.a libsnowtools_a_SOURCES = GenomicRegion.cpp Histogram.cpp \ BamWalker.cpp BamStats.cpp MiniRules.cpp gzstream.C SnowUtils.cpp \ - gChain.cpp HTSTools.cpp BamRead.cpp gzstream.C BWAWrapper.cpp BamRead.cpp \ + gChain.cpp BamRead.cpp gzstream.C BWAWrapper.cpp BamRead.cpp \ AlignedContig.cpp \ BreakPoint.cpp DiscordantCluster.cpp \ bwa/is.c bwa/bwtindex.c \ @@ -310,7 +309,6 @@ include ./$(DEPDIR)/libsnowtools_a-BamWalker.Po include ./$(DEPDIR)/libsnowtools_a-BreakPoint.Po include ./$(DEPDIR)/libsnowtools_a-DiscordantCluster.Po include ./$(DEPDIR)/libsnowtools_a-GenomicRegion.Po -include ./$(DEPDIR)/libsnowtools_a-HTSTools.Po include ./$(DEPDIR)/libsnowtools_a-Histogram.Po include ./$(DEPDIR)/libsnowtools_a-MiniRules.Po include ./$(DEPDIR)/libsnowtools_a-QSufSort.Po @@ -454,20 +452,6 @@ libsnowtools_a-gChain.obj: gChain.cpp # DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) \ # $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libsnowtools_a_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o libsnowtools_a-gChain.obj `if test -f 'gChain.cpp'; then $(CYGPATH_W) 'gChain.cpp'; else $(CYGPATH_W) '$(srcdir)/gChain.cpp'; fi` -libsnowtools_a-HTSTools.o: HTSTools.cpp - if $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libsnowtools_a_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT libsnowtools_a-HTSTools.o -MD -MP -MF "$(DEPDIR)/libsnowtools_a-HTSTools.Tpo" -c -o libsnowtools_a-HTSTools.o `test -f 'HTSTools.cpp' || echo '$(srcdir)/'`HTSTools.cpp; \ - then mv -f "$(DEPDIR)/libsnowtools_a-HTSTools.Tpo" "$(DEPDIR)/libsnowtools_a-HTSTools.Po"; else rm -f "$(DEPDIR)/libsnowtools_a-HTSTools.Tpo"; exit 1; fi -# source='HTSTools.cpp' object='libsnowtools_a-HTSTools.o' libtool=no \ -# DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) \ -# $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libsnowtools_a_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o libsnowtools_a-HTSTools.o `test -f 'HTSTools.cpp' || echo '$(srcdir)/'`HTSTools.cpp - -libsnowtools_a-HTSTools.obj: HTSTools.cpp - if $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libsnowtools_a_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT libsnowtools_a-HTSTools.obj -MD -MP -MF "$(DEPDIR)/libsnowtools_a-HTSTools.Tpo" -c -o libsnowtools_a-HTSTools.obj `if test -f 'HTSTools.cpp'; then $(CYGPATH_W) 'HTSTools.cpp'; else $(CYGPATH_W) '$(srcdir)/HTSTools.cpp'; fi`; \ - then mv -f "$(DEPDIR)/libsnowtools_a-HTSTools.Tpo" "$(DEPDIR)/libsnowtools_a-HTSTools.Po"; else rm -f "$(DEPDIR)/libsnowtools_a-HTSTools.Tpo"; exit 1; fi -# source='HTSTools.cpp' object='libsnowtools_a-HTSTools.obj' libtool=no \ -# DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) \ -# $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libsnowtools_a_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o libsnowtools_a-HTSTools.obj `if test -f 'HTSTools.cpp'; then $(CYGPATH_W) 'HTSTools.cpp'; else $(CYGPATH_W) '$(srcdir)/HTSTools.cpp'; fi` - libsnowtools_a-BamRead.o: BamRead.cpp if $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libsnowtools_a_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT libsnowtools_a-BamRead.o -MD -MP -MF "$(DEPDIR)/libsnowtools_a-BamRead.Tpo" -c -o libsnowtools_a-BamRead.o `test -f 'BamRead.cpp' || echo '$(srcdir)/'`BamRead.cpp; \ then mv -f "$(DEPDIR)/libsnowtools_a-BamRead.Tpo" "$(DEPDIR)/libsnowtools_a-BamRead.Po"; else rm -f "$(DEPDIR)/libsnowtools_a-BamRead.Tpo"; exit 1; fi diff --git a/src/Makefile.am b/src/Makefile.am index f00dc34e4..e5ae43a0d 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,5 +1,5 @@ AUTOMAKE_OPTIONS = foreign -SUBDIRS = bwa htslib +SUBDIRS = bwa htslib noinst_LIBRARIES = libsnowtools.a @@ -12,7 +12,7 @@ libsnowtools_a_LIBADD = $(top_srcdir)/bwa/libbwa.a $(top_srcdir)/htslib/libhts.a libsnowtools_a_SOURCES = GenomicRegion.cpp Histogram.cpp \ BamWalker.cpp BamStats.cpp MiniRules.cpp gzstream.C SnowUtils.cpp \ - gChain.cpp HTSTools.cpp BamRead.cpp gzstream.C BWAWrapper.cpp BamRead.cpp \ + gChain.cpp BamRead.cpp gzstream.C BWAWrapper.cpp BamRead.cpp \ AlignedContig.cpp \ BreakPoint.cpp DiscordantCluster.cpp \ bwa/is.c bwa/bwtindex.c \ diff --git a/src/Makefile.in b/src/Makefile.in index c7e069520..59dbfc451 100644 --- a/src/Makefile.in +++ b/src/Makefile.in @@ -62,7 +62,6 @@ am_libsnowtools_a_OBJECTS = libsnowtools_a-GenomicRegion.$(OBJEXT) \ libsnowtools_a-gzstream.$(OBJEXT) \ libsnowtools_a-SnowUtils.$(OBJEXT) \ libsnowtools_a-gChain.$(OBJEXT) \ - libsnowtools_a-HTSTools.$(OBJEXT) \ libsnowtools_a-BamRead.$(OBJEXT) \ libsnowtools_a-gzstream.$(OBJEXT) \ libsnowtools_a-BWAWrapper.$(OBJEXT) \ @@ -201,7 +200,7 @@ sharedstatedir = @sharedstatedir@ sysconfdir = @sysconfdir@ target_alias = @target_alias@ AUTOMAKE_OPTIONS = foreign -SUBDIRS = bwa htslib +SUBDIRS = bwa htslib noinst_LIBRARIES = libsnowtools.a libsnowtools_a_CPPFLAGS = \ -I$(top_srcdir)/htslib \ @@ -210,7 +209,7 @@ libsnowtools_a_CPPFLAGS = \ libsnowtools_a_LIBADD = $(top_srcdir)/bwa/libbwa.a $(top_srcdir)/htslib/libhts.a libsnowtools_a_SOURCES = GenomicRegion.cpp Histogram.cpp \ BamWalker.cpp BamStats.cpp MiniRules.cpp gzstream.C SnowUtils.cpp \ - gChain.cpp HTSTools.cpp BamRead.cpp gzstream.C BWAWrapper.cpp BamRead.cpp \ + gChain.cpp BamRead.cpp gzstream.C BWAWrapper.cpp BamRead.cpp \ AlignedContig.cpp \ BreakPoint.cpp DiscordantCluster.cpp \ bwa/is.c bwa/bwtindex.c \ @@ -310,7 +309,6 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libsnowtools_a-BreakPoint.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libsnowtools_a-DiscordantCluster.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libsnowtools_a-GenomicRegion.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libsnowtools_a-HTSTools.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libsnowtools_a-Histogram.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libsnowtools_a-MiniRules.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libsnowtools_a-QSufSort.Po@am__quote@ @@ -454,20 +452,6 @@ libsnowtools_a-gChain.obj: gChain.cpp @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libsnowtools_a_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o libsnowtools_a-gChain.obj `if test -f 'gChain.cpp'; then $(CYGPATH_W) 'gChain.cpp'; else $(CYGPATH_W) '$(srcdir)/gChain.cpp'; fi` -libsnowtools_a-HTSTools.o: HTSTools.cpp -@am__fastdepCXX_TRUE@ if $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libsnowtools_a_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT libsnowtools_a-HTSTools.o -MD -MP -MF "$(DEPDIR)/libsnowtools_a-HTSTools.Tpo" -c -o libsnowtools_a-HTSTools.o `test -f 'HTSTools.cpp' || echo '$(srcdir)/'`HTSTools.cpp; \ -@am__fastdepCXX_TRUE@ then mv -f "$(DEPDIR)/libsnowtools_a-HTSTools.Tpo" "$(DEPDIR)/libsnowtools_a-HTSTools.Po"; else rm -f "$(DEPDIR)/libsnowtools_a-HTSTools.Tpo"; exit 1; fi -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='HTSTools.cpp' object='libsnowtools_a-HTSTools.o' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libsnowtools_a_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o libsnowtools_a-HTSTools.o `test -f 'HTSTools.cpp' || echo '$(srcdir)/'`HTSTools.cpp - -libsnowtools_a-HTSTools.obj: HTSTools.cpp -@am__fastdepCXX_TRUE@ if $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libsnowtools_a_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT libsnowtools_a-HTSTools.obj -MD -MP -MF "$(DEPDIR)/libsnowtools_a-HTSTools.Tpo" -c -o libsnowtools_a-HTSTools.obj `if test -f 'HTSTools.cpp'; then $(CYGPATH_W) 'HTSTools.cpp'; else $(CYGPATH_W) '$(srcdir)/HTSTools.cpp'; fi`; \ -@am__fastdepCXX_TRUE@ then mv -f "$(DEPDIR)/libsnowtools_a-HTSTools.Tpo" "$(DEPDIR)/libsnowtools_a-HTSTools.Po"; else rm -f "$(DEPDIR)/libsnowtools_a-HTSTools.Tpo"; exit 1; fi -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='HTSTools.cpp' object='libsnowtools_a-HTSTools.obj' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libsnowtools_a_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o libsnowtools_a-HTSTools.obj `if test -f 'HTSTools.cpp'; then $(CYGPATH_W) 'HTSTools.cpp'; else $(CYGPATH_W) '$(srcdir)/HTSTools.cpp'; fi` - libsnowtools_a-BamRead.o: BamRead.cpp @am__fastdepCXX_TRUE@ if $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libsnowtools_a_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT libsnowtools_a-BamRead.o -MD -MP -MF "$(DEPDIR)/libsnowtools_a-BamRead.Tpo" -c -o libsnowtools_a-BamRead.o `test -f 'BamRead.cpp' || echo '$(srcdir)/'`BamRead.cpp; \ @am__fastdepCXX_TRUE@ then mv -f "$(DEPDIR)/libsnowtools_a-BamRead.Tpo" "$(DEPDIR)/libsnowtools_a-BamRead.Po"; else rm -f "$(DEPDIR)/libsnowtools_a-BamRead.Tpo"; exit 1; fi diff --git a/src/MiniRules.cpp b/src/MiniRules.cpp index 09e23abbb..81df0eff9 100644 --- a/src/MiniRules.cpp +++ b/src/MiniRules.cpp @@ -682,12 +682,12 @@ void Range::parseRuleLine(std::string line) { return false; } #endif - + return true; } - bool FlagRule::isValid(BamRead &r) { - +bool FlagRule::isValid(BamRead &r) { + if (isEvery()) return true; diff --git a/src/RealignTool.cpp b/src/RealignTool.cpp index a3f28a792..39dca1afb 100644 --- a/src/RealignTool.cpp +++ b/src/RealignTool.cpp @@ -3,8 +3,6 @@ #include #include -#include "SnowTools/BWAWrapper.h" -#include "SnowTools/BamRead.h" #include "SnowTools/AlignedContig.h" #include "SnowTools/BamWalker.h" @@ -31,15 +29,17 @@ void runRealign(int argc, char** argv) }; SnowTools::BWAWrapper w; - w.constructIndex(v); + //w.constructIndex(v); w.retrieveIndex(opt::refgenome); - std::string test_seq = v[0].seq + v[1].seq; - + //std::string test_seq = v[0].seq + v[1].seq; + std::string test_seq = "CTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCTA"; + std::string name = "TP53_MYC"; SnowTools::BamReadVector brv; w.alignSingleSequence(test_seq, name, brv, false); + return; SnowTools::AlignedContig ac(brv); diff --git a/src/SeqRead.cpp b/src/SeqRead.cpp deleted file mode 100644 index b17e9c204..000000000 --- a/src/SeqRead.cpp +++ /dev/null @@ -1 +0,0 @@ -#include "SnowTools/SeqRead.h" diff --git a/src/SnowTools/AlignedContig.h b/src/SnowTools/AlignedContig.h index eaf559ae2..c3e891d14 100644 --- a/src/SnowTools/AlignedContig.h +++ b/src/SnowTools/AlignedContig.h @@ -3,11 +3,9 @@ #include -#include "SnowTools/GenomicRegion.h" -#include "SnowTools/GenomicRegionCollection.h" -#include "SnowTools/SnowUtils.h" #include "SnowTools/BamRead.h" #include "SnowTools/BreakPoint.h" +#include "SnowTools/DiscordantCluster.h" #include "SnowTools/BWAWrapper.h" namespace SnowTools { @@ -50,6 +48,12 @@ namespace SnowTools { * @param const CigarMap reference containing hash with key=chr_breakpos_indeltype, val=normal count */ void indelCigarMatches(const CigarMap &nmap, const CigarMap &tmap); + + /** Check whether the alignment fragement overlaps with the given windows + * + * This function is used to + */ + bool checkLocal(const GenomicRegion& window); const BPVec& getIndelBreaks() const { return m_indel_breaks; } @@ -73,7 +77,8 @@ namespace SnowTools { bool local = false; /**< boolean to note whether this fragment aligns to same location is was assembled from */ AlignedContig * c; // link to the parent aligned contigs - + + int num_align = 0; }; //! vector of AlignmentFragment objects @@ -106,6 +111,10 @@ namespace SnowTools { * @param const reference to another AlignedContig * @return bool bool returning true iff this contig has identical info has better MAPQ, or equal MAPQ but longer */ bool isWorse(const AlignedContig &ac) const; + + /*! @function Loop through all the alignment framgents and their indel breaks and check against cigar database + */ + void checkAgainstCigarMatches(const CigarMap& nmap, const CigarMap& tmap); /*! @function @abstract Get whether the query is on the reverse strand @@ -114,8 +123,8 @@ namespace SnowTools { */ void addAlignment(const BamRead &align); - //! add a discordant cluster that maps to same regions as this contig - void addDiscordantCluster(DiscordantCluster dc) { m_dc.push_back(dc); } + //! Loop through fragments and check if they overlap with window (and set local flag). Return TRUE if local found + bool checkLocal(const GenomicRegion& window); /*! @function loop through the vector of DiscordantCluster objects * associated with this contig and print @@ -185,13 +194,6 @@ namespace SnowTools { */ void alignReads(BamReadVector &bav); - /*! @function align reads to contig and modify their tags to show contig mapping - * Currently this function will attempt a SmithWaterman alignment for all reads - * that don't have an exact mapping to the contig. - * @param bav Vector of read smart pointers to align. Modifies their SW tag - */ - void alignReadsToContigs(ReadVec &bav); - //! return the contig sequence as it came off the assembler std::string getSequence() const { assert(m_seq.length()); return m_seq; } @@ -208,6 +210,8 @@ namespace SnowTools { */ bool hasVariant() const; + bool hasLocal() const { for (auto& i : m_frag_v) if (i.local) return true; return false; } + /*! @function retrieves all of the breakpoints by combining indels with global mutli-map break * @return vector of ind */ @@ -215,6 +219,10 @@ namespace SnowTools { std::vector getAllBreakPointPointers() const ; + void addDiscordantCluster(DiscordantClusterMap& dmap); + + int insertion_against_contig_read_count = 0; + int deletion_against_contig_read_count = 0; private: @@ -234,8 +242,6 @@ namespace SnowTools { bool m_hasvariant = false; // flag to specify whether this alignment has some potential varaint (eg indel) - bool m_tried_align_reads = false; // flag to specify whether we tried to align reads. - std::string m_seq = ""; // sequence of contig as it came off of assembler std::vector m_dc; // collection of all discordant clusters that map to same location as this contig diff --git a/src/SnowTools/BamRead.h b/src/SnowTools/BamRead.h index 629f69cc3..68da2374a 100644 --- a/src/SnowTools/BamRead.h +++ b/src/SnowTools/BamRead.h @@ -6,8 +6,21 @@ #include #include #include +#include +#include -#include "SnowTools/HTSTools.h" +#include "htslib/hts.h" +#include "htslib/sam.h" +#include "htslib/bgzf.h" +#include "htslib/kstring.h" +#include "htslib/faidx.h" + +static const char BASES[16] = {' ', 'A', 'C', ' ', + 'G', ' ', ' ', ' ', + 'T', ' ', ' ', ' ', + ' ', ' ', ' ', 'N'}; + +//#include "SnowTools/HTSTools.h" namespace SnowTools { @@ -81,7 +94,7 @@ class BamRead { inline bool MateMappedFlag() const { return (b->core.flag&BAM_FMUNMAP) == 0; } /** BamRead mate is mapped */ - inline bool PairMappedFlag() const { return ((!b->core.flag&BAM_FMUNMAP) && (!b->core.flag&BAM_FUNMAP)); } + inline bool PairMappedFlag() const { return !(b->core.flag&BAM_FMUNMAP) && !(b->core.flag&BAM_FUNMAP); } /** Count the total number of N bases in this sequence */ int32_t CountNBases() const; @@ -199,7 +212,7 @@ class BamRead { uint32_t* c = bam_get_cigar(b); int32_t p = 0; for (int32_t i = 0; i < b->core.n_cigar; ++i) { - if ( (c[i] & BAM_CSOFT_CLIP) || (c[i] & BAM_CHARD_CLIP)) + if ( (bam_cigar_opchr(c[i]) == 'S') || (bam_cigar_opchr(c[i]) == 'H')) p += bam_cigar_oplen(c[i]); else // not a clip, so stop counting break; @@ -207,12 +220,26 @@ class BamRead { return p; } + /** Get the end of the alignment on the read, by removing soft-clips + */ + inline int32_t AlignmentEndPosition() const { + uint32_t* c = bam_get_cigar(b); + int32_t p = 0; + for (int32_t i = b->core.n_cigar - 1; i >= 0; --i) { // loop from the end + if ( (bam_cigar_opchr(c[i]) == 'S') || (bam_cigar_opchr(c[i]) == 'H')) + p += bam_cigar_oplen(c[i]); + else // not a clip, so stop counting + break; + } + return (b->core.l_qseq - p); + } + /** Get the number of soft clipped bases */ inline int32_t NumSoftClip() const { int32_t p = 0; uint32_t* c = bam_get_cigar(b); for (int32_t i = 0; i < b->core.n_cigar; ++i) - if (c[i] & BAM_CSOFT_CLIP) + if (bam_cigar_opchr(c[i]) == 'S') p += bam_cigar_oplen(c[i]); return p; } @@ -221,8 +248,8 @@ class BamRead { inline int32_t NumHardClip() const { int32_t p = 0; uint32_t* c = bam_get_cigar(b); - for (int32_t i = 0; i < b->core.n_cigar; ++i) - if (c[i] & BAM_CHARD_CLIP) + for (int32_t i = 0; i < b->core.n_cigar; ++i) + if (bam_cigar_opchr(c[i]) == 'H') p += bam_cigar_oplen(c[i]); return p; } @@ -233,7 +260,7 @@ class BamRead { int32_t p = 0; uint32_t* c = bam_get_cigar(b); for (int32_t i = 0; i < b->core.n_cigar; ++i) - if ( (c[i] & BAM_CSOFT_CLIP) || (c[i] & BAM_CHARD_CLIP) ) + if ( (bam_cigar_opchr(c[i]) == 'S') || (bam_cigar_opchr(c[i]) == 'H') ) p += bam_cigar_oplen(c[i]); return p; } @@ -364,6 +391,28 @@ class BamRead { }; typedef std::vector BamReadVector; + + typedef std::vector BamReadClusterVector; + + namespace BamReadSort { + + // sort by read position + struct ByReadPosition + { + bool operator()( const BamRead& lx, const BamRead& rx ) const { + return (lx.ChrID() < rx.ChrID()) || (lx.ChrID() == rx.ChrID() && lx.Position() < rx.Position()); + } + }; + + // sort by read mate position + struct ByMatePosition + { + bool operator()( const BamRead& lx, const BamRead& rx ) const { + return (lx.MateChrID() < rx.MateChrID()) || (lx.MateChrID() == rx.MateChrID() && lx.MatePosition() < rx.MatePosition()); + } + }; + +} } #endif diff --git a/src/SnowTools/BamWalker.h b/src/SnowTools/BamWalker.h index d5d35c027..19d23af8a 100644 --- a/src/SnowTools/BamWalker.h +++ b/src/SnowTools/BamWalker.h @@ -2,14 +2,9 @@ #define SNOWTOOLS_BAM_WALKER_H__ #include +#include #include "SnowTools/MiniRules.h" -#include "SnowTools/GenomicRegion.h" -#include "SnowTools/GenomicRegionCollection.h" - -#include "SnowTools/HTSTools.h" -#include "SnowTools/SnowUtils.h" -#include "SnowTools/BamRead.h" // Phred score transformations inline int char2phred(char b) { @@ -83,8 +78,8 @@ class BamWalker { * calls required within HTSlib to close a BAM or SAM file. */ ~BamWalker() { - __close_read_bam(); - __close_write_bam(); + __close_read_bam(); + __close_write_bam(); } /** Set a part of the BAM to walk. @@ -130,7 +125,7 @@ class BamWalker { /** Pass a MiniRules script to the BAM. * -4 * This will call the constructor of MiniRulesCollection, and + * This will call the constructor of MiniRulesCollection, and * parse the provides rules and add it as a rule set to this BamWalker. * @param rules A string of rules, or a file pointing to a rules script */ @@ -150,6 +145,10 @@ class BamWalker { */ bool GetNextRead(BamRead &r, bool& rule); + /** Set regions to not be read from + */ + void addBlacklist(GRC& bl); + /** Write an alignment to the output BAM file * @param r The BamRead to save */ @@ -213,13 +212,25 @@ class BamWalker { std::string displayMiniRulesCollection() const; /** Return a pointer to the BAM header */ - bam_hdr_t * header() const { return br; }; + bam_hdr_t * header() const { return br.get(); }; + /** Set the limit for total number of reads seen */ + void setReadLimit(int lim) { m_limit = lim; m_num_reads_seen = 0; } + + /** Set the limit for total number of reads kept */ + void setReadKeepLimit(int lim) { m_keep_limit = lim; m_num_reads_kept = 0; } + protected: // for stdout mode, print header? bool m_print_header = false; + // limit to number of reads to be read in + int m_limit = -1; + int m_keep_limit = -1; + + void __check_regions_blacklist(); + // point index to this region of bam bool __set_region(const GenomicRegion& gp); @@ -242,18 +253,30 @@ class BamWalker { bool m_strip_all_tags = false; // hts - BGZF * fp = 0; - hts_idx_t * idx = 0; // hts_idx_load(bamfile.c_str(), HTS_FMT_BAI); - hts_itr_t * hts_itr = 0; // sam_itr_queryi(idx, 3, 60000, 80000); - bam_hdr_t * br = 0; - + std::shared_ptr fp; + //BGZF * fp = 0; + //hts_idx_t * idx = 0; // hts_idx_load(bamfile.c_str(), HTS_FMT_BAI); + std::shared_ptr idx; + std::shared_ptr hts_itr; + //hts_itr_t * hts_itr = 0; // sam_itr_queryi(idx, 3, 60000, 80000); + std::shared_ptr br; + //bam_hdr_t * br = 0; + + //std::shared_ptr fop; htsFile* fop = 0; // which tags to strip std::vector m_tag_list; + // blacklist + GRC blacklist; + // bool m_verbose = false; + + // read counter + int m_num_reads_seen = 0; + int m_num_reads_kept = 0; }; diff --git a/src/SnowTools/BreakPoint.h b/src/SnowTools/BreakPoint.h index a8b69d90d..a795996b6 100644 --- a/src/SnowTools/BreakPoint.h +++ b/src/SnowTools/BreakPoint.h @@ -31,7 +31,7 @@ void runRefilterBreakpoints(int argc, char** argv); * */ void parseBreakOptions(int argc, char** argv); - + struct BreakPoint { static std::string header() { @@ -131,7 +131,7 @@ struct BreakPoint { /** Construct a breakpoint from a cluster of discordant reads */ - BreakPoint(DiscordantCluster tdc); + BreakPoint(const DiscordantCluster& tdc); BreakPoint() { @@ -153,6 +153,8 @@ struct BreakPoint { std::string toPrintString() const; static void readPON(std::string &file, std::unique_ptr &pmap); + + void __combine_with_discordant_cluster(DiscordantClusterMap& dmap); /*! @function determine if the breakpoint has split read support * @param reference to a vector of read smart pointers that have been aligned to a contig @@ -236,7 +238,11 @@ struct BreakPoint { // define how to sort these bool operator < (const BreakPoint& bp) const { - return (gr1 < bp.gr1); // || (gr1 == gr2 && nsplit1 > bp.nsplit1) + return (gr1 < bp.gr1) || (gr1 == bp.gr1 && gr2 < bp.gr2) || + (gr1 == bp.gr1 && gr2 == bp.gr2 && nsplit > bp.nsplit) || // more read support should go first, do to property of std::unique + (gr1 == bp.gr1 && gr2 == bp.gr2 && tsplit > bp.tsplit) || + (gr1 == bp.gr1 && gr2 == bp.gr2 && dc.ncount > bp.dc.ncount) || + (gr1 == bp.gr1 && gr2 == bp.gr2 && dc.tcount > bp.dc.tcount); //(bp.gr1.ref == refID1 && bp.pos1 > pos1) || // low pos is first // (bp.refID1 == refID1 && bp.pos1 == pos1 && bp.pos2 == pos2 && nsplit1 > bp.nsplit1) || // if same, check nsplit // (bp.refID1 == refID1 && bp.pos1 == pos1 && bp.pos2 == pos2 && nsplit1 == bp.nsplit1 && tsplit1 > bp.tsplit1); // if also same, check tsplit diff --git a/src/SnowTools/DiscordantCluster.h b/src/SnowTools/DiscordantCluster.h index 0611b1c27..83dad858f 100644 --- a/src/SnowTools/DiscordantCluster.h +++ b/src/SnowTools/DiscordantCluster.h @@ -1,84 +1,105 @@ -#ifndef DISCORDANT_CLUSTER_H -#define DISCORDANT_CLUSTER_H +#ifndef DISCORDANT_CLUSTER_H__ +#define DISCORDANT_CLUSTER_H__ #include #include #include #include +#include #include "SnowTools/GenomicRegion.h" #include "SnowTools/BamRead.h" namespace SnowTools { + + //class DiscordantCluster; /** Class to hold clusters of discordant reads */ class DiscordantCluster { + + friend class BreakPoint; public: /** Create an empty cluster */ - DiscordantCluster() {} - - DiscordantCluster(BamReadVector &this_reads, BamReadVector &all_reads); - DiscordantCluster(std::string tcluster); - DiscordantCluster(std::string tcluster, SnowTools::GenomicRegion gr1, SnowTools::GenomicRegion gr2) - { - cluster = tcluster; - reg1 = gr1; - reg2 = gr2; - } - - - std::string cluster; - std::string id; - size_t tcount = 0; - size_t ncount = 0; - std::unordered_map qnames; // TODO get rid of it - std::unordered_map reads; - std::unordered_map mates; - - double reads_mapq; - double mates_mapq; - std::vector mapq; // TODO remoe - std::string contig = ""; + DiscordantCluster() { m_reg1 = GenomicRegion(); m_reg2 = GenomicRegion(); assert(m_reg1.isEmpty()); } + + /** Make a cluster from a set of reads (pre-clustered) and look up a larger set to find + * their mates + * @param this_reads Pre-clustered set of discordant reads (but not their mates) + * @param all_reads A pile of reads to search for mates + */ + DiscordantCluster(const BamReadVector& this_reads, const BamReadVector& all_reads); + /** Is this discordant cluster empty? */ + bool isEmpty() const { assert(m_reg1.isEmpty() == m_reg2.isEmpty()); return m_reg1.isEmpty(); } + /** Return a string representing the output file header */ static std::string header() { return "chr1\tpos1\tstrand1\tchr2\tpos2\tstrand2\ttcount\tncount\tmapq1\tmapq2\treads"; } - GenomicRegion reg1; - GenomicRegion reg2; - - void addMateReads(BamReadVector &bav); + void addMateReads(const BamReadVector& bav); // return the mean mapping quality for this cluster double getMeanMapq(bool mate) const; double getMeanMapq() const; + + /** Return the discordant cluster as a string with just coordinates */ std::string toRegionString() const; - // add the read names supporting this cluster + /** Add the read names supporting this cluster */ void addRead(std::string name); - // define how to print this to stdout + /** Print this with region string and read counts and mapq */ friend std::ostream& operator<<(std::ostream& out, const DiscordantCluster& dc); - // define how to print to file + /** Return as a string for writing to a file */ std::string toFileString(bool with_read_names = false) const; - // define how these are to be sorted + /** Sort by coordinate */ bool operator < (const DiscordantCluster& b) const; + + /** Has associated assembly contig */ + bool hasAssociatedAssemblyContig() const { return m_contig.length(); } + + static std::unordered_map clusterReads(const BamReadVector& bav, const GenomicRegion& interval); + + static bool __add_read_to_cluster(BamReadClusterVector &cvec, BamReadVector &clust, const BamRead &a, bool mate); + + static void __cluster_reads(const BamReadVector& brv, BamReadClusterVector& fwd, BamReadClusterVector& rev); + + static void __cluster_mate_reads(BamReadClusterVector& brcv, BamReadClusterVector& fwd, BamReadClusterVector& rev); + + static void __convertToDiscordantCluster(std::unordered_map &dd, const BamReadClusterVector& cvec, const BamReadVector& bav); + + + size_t tcount = 0; + size_t ncount = 0; + + private: + + GenomicRegion m_reg1; + GenomicRegion m_reg2; - + std::string m_id; + + std::unordered_map qnames; // TODO get rid of it + std::unordered_map reads; + std::unordered_map mates; + + std::string m_contig = ""; }; //! vector of AlignmentFragment objects - typedef std::unordered_map DMap; - //! vector of AlignmentFragment objects - typedef std::vector DVec; + typedef std::vector DiscordantClusterVector; + + //! Store a set of DiscordantCluster objects, indexed by the "id" field + typedef std::unordered_map DiscordantClusterMap; + } diff --git a/src/SnowTools/GenomicRegion.h b/src/SnowTools/GenomicRegion.h index 032179a50..92bd88e89 100644 --- a/src/SnowTools/GenomicRegion.h +++ b/src/SnowTools/GenomicRegion.h @@ -6,10 +6,12 @@ #include #include #include +#include #include "SnowTools/SnowUtils.h" #include "SnowTools/SnowToolsCommon.h" -#include "SnowTools/HTSTools.h" + +#include "htslib/sam.h" /** */ @@ -48,6 +50,10 @@ class GenomicRegion { */ GenomicRegion(const std::string& reg, bam_hdr_t* h); + /** Return a string representation of just the first base-pair + */ + std::string pointString() const; + static int32_t chrToNumber(std::string ref); static std::string chrToString(int32_t ref); diff --git a/src/SnowTools/GenomicRegionCollection.h b/src/SnowTools/GenomicRegionCollection.h index 67b9c7318..bc9cc5abc 100644 --- a/src/SnowTools/GenomicRegionCollection.h +++ b/src/SnowTools/GenomicRegionCollection.h @@ -16,9 +16,9 @@ namespace SnowTools { -typedef Interval GenomicInterval; +typedef Interval GenomicInterval; typedef std::unordered_map > GenomicIntervalMap; -typedef IntervalTree GenomicIntervalTree; +typedef IntervalTree GenomicIntervalTree; typedef std::unordered_map GenomicIntervalTreeMap; typedef std::vector GenomicIntervalVector; @@ -132,7 +132,11 @@ class GenomicRegionCollection { size_t countContained(const T &gr); - GenomicRegionCollection findOverlaps(GenomicRegionCollection &subject, std::vector& query_id, std::vector& subject_id, bool ignore_strand = false); + GenomicRegionCollection findOverlaps(GenomicRegionCollection &subject, std::vector& query_id, std::vector& subject_id, bool ignore_strand = false); + + /** The total amount spanned by this collection + */ + int width() const { int wid = 0; for (auto& i : m_grv) wid += i.width(); return wid; } /** Increase the left and right ends of each contained GenomicRegion by * the pad value. @@ -182,6 +186,8 @@ class GenomicRegionCollection { GenomicRegionCollection intersection(GenomicRegionCollection& subject, bool ignore_strand = false); + GenomicRegionCollection complement(GenomicRegionCollection& subject, bool ignore_strand = false); + std::vector m_grv; private: diff --git a/src/SnowTools/HTSTools.h b/src/SnowTools/HTSTools.h deleted file mode 100644 index 3d120590d..000000000 --- a/src/SnowTools/HTSTools.h +++ /dev/null @@ -1,91 +0,0 @@ -#ifndef SNOWTOOLS_HTSTOOLS_H__ -#define SNOWTOOLS_HTSTOOLS_H__ - -#include -#include -#include -#include -#include - -#include "htslib/hts.h" -#include "htslib/sam.h" -#include "htslib/bgzf.h" -#include "htslib/kstring.h" -#include "htslib/faidx.h" - -#include "SnowTools/GenomicRegion.h" - -static const char BASES[16] = {' ', 'A', 'C', ' ', - 'G', ' ', ' ', ' ', - 'T', ' ', ' ', ' ', - ' ', ' ', ' ', 'N'}; - -struct free_delete { - void operator()(void* x) { bam_destroy1((bam1_t*)x); } -}; - -typedef std::shared_ptr Read; -typedef std::vector ReadVec; - -#define r_is_rev(b) (((b)->core.flag&BAM_FREVERSE) != 0) -#define r_is_mrev(b) (((b)->core.flag&BAM_FMREVERSE) != 0) -#define r_pos(b) ((b)->core.pos) -#define r_mpos(b) ((b)->core.mpos) -#define r_id(b) ((b)->core.tid) -#define r_mid(b) ((b)->core.mtid) -#define r_is_mapped(b) (!((b)->core.flag&BAM_FUNMAP)) -#define r_is_mmapped(b) (!((b)->core.flag&BAM_FMUNMAP)) -#define r_mapq(b) ((b)->core.qual) -#define r_cig_size(b) ((b)->core.n_cigar) -#define r_is_first(b) ((b)->core.flag&BAM_FREAD1) -#define r_qname(b) (std::string(bam_get_qname((b).get()))) -#define r_remove_tag(b, t) do { uint8_t *p = bam_aux_get((b).get(), t); if (p) bam_aux_del((b).get(), p); } while(0) -#define r_flag(b) ((b)->core.flag) -#define r_is_pmapped(b) (!((b)->core.flag&BAM_FUNMAP) && !((b)->core.flag&BAM_FMUNMAP) && ((b)->core.flag&BAM_FPAIRED)) -#define r_length(b) ((b)->core.l_qseq) -#define r_isize(b) ((b)->core.isize) -#define r_seq(b, s) do { uint8_t * p = bam_get_seq(b); char c[(b)->core.l_qseq+1]; for (int ww = 0; ww < (b)->core.l_qseq; ww++) { c[ww] = BASES[bam_seqi(p, ww)]; } c[(b)->core.l_qseq] = '\0'; s = std::string(c); } while(0) -#define r_get_clip(b, p) do { uint32_t* c = bam_get_cigar(b); for (int ww = 0; ww < (b)->core.n_cigar; ww++) if (c[ww] & BAM_CSOFT_CLIP || c[ww] & BAM_CHARD_CLIP) p += bam_cigar_oplen(c[ww]); } while(0) - -#define r_get_Z_tag(b, t, v) do { uint8_t* p = bam_aux_get((b).get(), t); if (p) v = std::string(bam_aux2Z(p)); else v = std::string(); } while(0) -#define r_get_SR(b, v) do { uint8_t* p = bam_aux_get((b).get(), "SR"); assert(p); v = bam_aux2Z(p); } while(0) -#define r_get_int32_tag(b, t, v) do { uint8_t* p = bam_aux_get((b).get(), t); if (p) v = bam_aux2i(p); else v = 0; } while(0) - -#define r_cig_type(b, i) (bam_cigar_opchr(bam_get_cigar((b).get())[i])) -#define r_cig_len(b, i) (bam_cigar_oplen(bam_get_cigar((b).get())[i])) - -#define r_add_Z_tag(b, t, v) bam_aux_append((b).get(), t, 'Z', (v).length()+1, (uint8_t*)(v).c_str()) -#define r_add_int32_tag(b, t, v) bam_aux_append((b).get(), t, 'i', 4, (uint8_t*)&(v)) -#define r_count_nbases(b, n) do { uint8_t* p = bam_get_seq(b); for (int ww = 0; ww < (b)->core.l_qseq; ww++) if (bam_seqi(p,ww) == 15) n++; } while(0) -#define r_count_sub_nbases(b, n, s, e) do { uint8_t* p = bam_get_seq(b); for (int ww = (s); ww < (e); ww++) if (bam_seqi(p,ww) == 15) n++; } while(0) - -#define r_is_rstrand(b) (((b)->core.flag&BAM_FREVERSE) > 0) -#define r_is_mrstrand(b) (((b)->core.flag&BAM_FMREVERSE) > 0) -#define r_is_dup(b) (((b)->core.flag&BAM_FDUP) > 0) -#define r_is_primary(b) (!((b)->core.flag&BAM_FSECONDARY)) -#define r_is_qc_fail(b) (((b)->core.flag&BAM_FQCFAIL) > 0) -#define r_strand(b) ((b)->core.flag&BAM_FREVERSE ? '-' : '+') -#define r_endpos(b) (bam_endpos((b).get())) - -#define r_get_trimmed_seq(b, s) do { int32_t ts, tl; r_get_int32_tag(b, "TS", ts); r_get_int32_tag(b, "TL", tl); r_seq(b, s); if (tl != 0 && tl != r_length(b)) s = s.substr(ts, tl); } while (0) - -namespace SnowTools { - - class GenomicRegion; - - int32_t qualityTrimRead(int qualTrim, int32_t &startpoint, Read &r); - - std::vector GetStringTag(const Read& a, const std::string tag); - - void SmartAddTag(Read &a, const std::string tag, const std::string val); - - std::vector GetIntTag(const Read& a, const std::string tag); - - //void rcomplement(std::string& a); - - void removeAllTags(Read& a); - - std::string getRefSequence(const GenomicRegion &gr, faidx_t *fi); -} - -#endif diff --git a/src/SnowTools/MiniRules.h b/src/SnowTools/MiniRules.h index 2ca998bd0..83763a599 100644 --- a/src/SnowTools/MiniRules.h +++ b/src/SnowTools/MiniRules.h @@ -6,9 +6,7 @@ #include #include -#include "SnowTools/GenomicRegion.h" #include "SnowTools/GenomicRegionCollection.h" -#include "SnowTools/SnowUtils.h" #include "SnowTools/BamRead.h" //#define HAVE_AHOCORASICK_AHOCORASICK_H 1 diff --git a/src/SnowTools/Read.h b/src/SnowTools/Read.h deleted file mode 100644 index 915a0b3ca..000000000 --- a/src/SnowTools/Read.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef SNOWTOOLS_READ_H__ -#define SNOWTOOLS_READ_H__ - -#include "SnowTools/HTSTools.h" - -class BamRead { - - - private: - - std::shared_ptr b; - - -}; diff --git a/src/VFilterTool.cpp b/src/VFilterTool.cpp deleted file mode 100644 index d2a9c840f..000000000 --- a/src/VFilterTool.cpp +++ /dev/null @@ -1,199 +0,0 @@ -#include "VFilterTool.h" - -#include -#include -#include - -#include "SnowTools/gzstream.h" -#include "SnowTools/SnowUtils.h" -#include "SnowTools/GenomicRegionCollection.h" -#include "SnowTools/GenomicRegion.h" - -#include "VariantBamWalker.h" - -using SnowTools::GenomicRegion; -using SnowTools::GenomicRegionCollection; -using SnowTools::GRC; - -static const char *VARIANT_BAM_USAGE_MESSAGE = -"Usage: vfilter -i -o [OPTIONS] \n\n" -" Description: Process a BAM file for use with rearrangement variant callers by removing proper pairs and bad regions\n" -"\n" -" General options\n" -" -v, --verbose Display more verbose output\n" -" -h, --help Display this help and exit\n" -" -i, --input-bam BAM file to filter\n" -" -o, --output-bam Output BAM file\n" -" -g, --region Regions (e.g. myvcf.vcf or WG for whole genome) or newline seperated subsequence file. Applied in same order as -r for multiple\n" -" -r, --rules A script for the rules. If specified multiple times, will be applied in same order as -g\n" -" -f, --rules-script A file (script) for the rules and regions/sequences. If specified, ignores -r and -g flags.\n" -" -k, --regions-file BED file of regions to process. Default: Whole genome\n" -" Optional Input\n" -"\n"; - -namespace opt { - - static std::string bam; - static std::string out; - static size_t verbose = 1; - static std::string rules = ""; - static std::string proc_regions = ""; - static std::string refgenome = SnowTools::REFHG19; -} - -static const char* shortopts = "hv:qji:o:r:c:k:g:c:f:T:"; -static const struct option longopts[] = { - { "help", no_argument, NULL, 'h' }, - { "verbose", required_argument, NULL, 'v' }, - { "input-bam", required_argument, NULL, 'i' }, - { "output-bam", required_argument, NULL, 'o' }, - { "qc-only", no_argument, NULL, 'q' }, - { "rules", required_argument, NULL, 'r' }, - { "reference", required_argument, NULL, 'T' }, - { "rules-script", required_argument, NULL, 'f' }, - { "region", required_argument, NULL, 'g' }, - { "regions-file", required_argument, NULL, 'k' }, - { NULL, 0, NULL, 0 } -}; - -// forward declare -void parseVarOptions(int argc, char** argv); - -void runVFilter(int argc, char** argv) { - - // parse the command line - parseVarOptions(argc, argv); - - if (opt::verbose > 0) { - std::cout << "Input BAM: " << opt::bam << std::endl; - std::cout << "Output BAM: " << opt::out << std::endl; - std::cout << "Input rules and regions: " << opt::rules << std::endl; - std::cout << "Input regions mask file: " << opt::proc_regions << std::endl; - } - - // set which regions to run - GRC grv_proc_regions; - if (opt::proc_regions.length()) - grv_proc_regions.regionFileToGRV(opt::proc_regions); - - // setup the walker - VariantBamWalker walk(opt::bam, opt::refgenome); - - // make the mini rules collection from the rules file - // this also calls function to parse the BED files - walk.SetMiniRulesCollection(opt::rules); - - // set the regions to run - SnowTools::GRC rules_rg = walk.GetMiniRulesCollection().getAllRegions(); - if (grv_proc_regions.size() && rules_rg.size()) // intersect rules regions with mask regions - rules_rg = rules_rg.intersection(grv_proc_regions); - else if (grv_proc_regions.size()) - rules_rg = grv_proc_regions; // rules is whole genome, so just make mask instead - - if (rules_rg.size()) - walk.setBamWalkerRegions(rules_rg.asGenomicRegionVector()); - - // print out some info - if (opt::verbose > 0) - std::cout << walk << std::endl;; - - // walk the BAM and process - walk.writeVariantBam(); - - // make a bed file - //if (opt::verbose > 0) - // std::cout << "...sending merged regions to BED file" << std::endl; - //mr->sendToBed("merged_rules.bed"); - - //index it - std::cout << "...writing the index file" << std::endl; - walk.MakeIndex(); - - return; -} - -void parseVarOptions(int argc, char** argv) { - - bool die = false; - - if (argc < 2) - die = true; - - for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) { - std::istringstream arg(optarg != NULL ? optarg : ""); - switch (c) { - case 'h': die = true; break; - case 'v': arg >> opt::verbose; break; - //case 't': opt::twopass = true; break; - case 'i': arg >> opt::bam; break; - case 'o': arg >> opt::out; break; - case 'g': - { - std::string tmp; - arg >> tmp; - if (tmp.length() == 0) - break; - if (opt::rules.length()) - opt::rules += "%"; - opt::rules += "region@" + tmp; - } - break; - case 'c': // call stats hack - { - std::string tmp; - arg >> tmp; - if (tmp.length() == 0) - break; - if (opt::rules.length()) - opt::rules += "%"; - opt::rules += "region@" + tmp + ";mate"; - } - break; - - case 'r': - { - std::string tmp; - arg >> tmp; - if (tmp.length() == 0) - break; - if (opt::rules.length()) - opt::rules += "%"; - else - opt::rules = "region@WG%"; // need to specify a region - opt::rules += tmp; - } - break; - case 'k': arg >> opt::proc_regions; break; - case 'f': - { - std::string file; - arg >> file; - - std::string line; - igzstream iss(file.c_str()); - while(std::getline(iss, line, '\n')) { - if (opt::rules.length() && line.length()) - opt::rules += "%"; - if (line.length()) - opt::rules += line; - } - } - break; - } - } - - if (opt::bam == "") - die = true; - if (opt::out == "") - die = true; - - // dont stop the run for bad bams for quality checking only - //opt::perc_limit = opt::qc_only ? 101 : opt::perc_limit; - - // something went wrong, kill - if (die) { - std::cout << "\n" << VARIANT_BAM_USAGE_MESSAGE; - exit(1); - } - -} diff --git a/src/VFilterTool.h b/src/VFilterTool.h deleted file mode 100644 index 1ca3fce02..000000000 --- a/src/VFilterTool.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef SNOWTOOLS_VFILTER_TOOL_H__ -#define SNOWTOOLS_VFILTER_TOOL_H__ - -void runVFilter(int argc, char** argv); - - -#endif diff --git a/src/VariantBamWalker.cpp b/src/VariantBamWalker.cpp deleted file mode 100644 index 463465a8a..000000000 --- a/src/VariantBamWalker.cpp +++ /dev/null @@ -1,50 +0,0 @@ -#include "VariantBamWalker.h" - -using SnowTools::ReadCount; - -void VariantBamWalker::writeVariantBam() -{ - - // start the timer - clock_gettime(CLOCK_MONOTONIC, &start); - - ReadCount rc_main; - Read r; - std::cout << "...starting run through BAM" << std::endl; - std::string rule; - - while (GetNextRead(r, rule)) - { - - TrackSeenRead(r); - // read is valid - if (rule.length()) { - ++rc_main.keep; - WriteAlignment(r); - } - - if (++rc_main.total % 1000000 == 0) - printMessage(rc_main, r); - - } -} - -void VariantBamWalker::TrackSeenRead(Read &r) -{ - m_stats.addRead(r); -} - -void VariantBamWalker::printMessage(const ReadCount &rc_main, const Read &r) const -{ - - char buffer[100]; - std::string posstring = SnowTools::AddCommas(r_pos(r)); - std::sprintf (buffer, "Reading read %11s at position %2s:%-11s. Kept %11s (%2d%%) [running count across whole BAM]", - rc_main.totalString().c_str(), SnowTools::GenomicRegion::chrToString(r_id(r)).c_str(), posstring.c_str(), - rc_main.keepString().c_str(), rc_main.percent()); - - std::printf ("%s | ",buffer); - SnowTools::displayRuntime(start); - std::cout << std::endl; - -} diff --git a/src/VariantBamWalker.h b/src/VariantBamWalker.h deleted file mode 100644 index 763ba03e2..000000000 --- a/src/VariantBamWalker.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef VARIANT_VARIANT_BAM_WALKER_H__ -#define VARIANT_VARIANT_BAM_WALKER_H__ - -#include "SnowTools/BamWalker.h" -#include "SnowTools/BamStats.h" - -class VariantBamWalker: public SnowTools::BamWalker -{ - public: - - VariantBamWalker() {} - - VariantBamWalker(const std::string in) : SnowTools::BamWalker(in) {} - - void writeVariantBam(); - - void TrackSeenRead(Read &r); - - void printMessage(const SnowTools::ReadCount &rc_main, const Read &r) const; - - private: - - SnowTools::BamStats m_stats; - -}; -#endif diff --git a/src/config.status b/src/config.status index 4b28b39fb..f4849cdf6 100755 --- a/src/config.status +++ b/src/config.status @@ -499,7 +499,7 @@ s,@am__untar@,${AMTAR} xf -,;t t s,@CXX@,g++,;t t s,@CXXFLAGS@,-O3,;t t s,@LDFLAGS@,,;t t -s,@CPPFLAGS@, ,;t t +s,@CPPFLAGS@, -I/xchip/gistic/Jeremiah/boost_1_58_0,;t t s,@ac_ct_CXX@,g++,;t t s,@EXEEXT@,,;t t s,@OBJEXT@,o,;t t @@ -522,7 +522,7 @@ s,@RANLIB@,ranlib,;t t s,@ac_ct_RANLIB@,ranlib,;t t s,@CXXCPP@,g++ -E,;t t s,@EGREP@,grep -E,;t t -s,@AM_CXXFLAGS@,-g -Wall -Wextra -Wno-unknown-pragmas -std=c++11,;t t +s,@AM_CXXFLAGS@,-g -Wno-unknown-pragmas -std=c++11,;t t s,@LIBOBJS@,,;t t s,@LTLIBOBJS@,,;t t CEOF diff --git a/src/configure b/src/configure index 451e26b17..f2cfd98d0 100755 --- a/src/configure +++ b/src/configure @@ -862,6 +862,12 @@ Optional Features: --enable-development Turn on development options, like failing compilation on warnings +Optional Packages: + --with-PACKAGE[=ARG] use PACKAGE [ARG=yes] + --without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no) + --with-boost=PATH specify directory containing the bamtools library + (http://github.com/pezmaster31/bamtools) + Some influential environment variables: CXX C++ compiler command CXXFLAGS C++ compiler flags @@ -4307,6 +4313,20 @@ _ACEOF fi + +# Check whether --with-boost or --without-boost was given. +if test "${with_boost+set}" = set; then + withval="$with_boost" + +fi; +if test "$with_boost" -a -d "$with_boost"; then + boost_include="-I$with_boost" +else + boostr=/xchip/gistic/Jeremiah/boost_1_58_0 + boost_include="-I$boostr" +fi + + # Check for the bamtools library pathz # Bamtools has two different path formats # If it is built in place the files will be in {path}/lib/ and {path}/include/ @@ -4363,14 +4383,15 @@ if test "$enable_development"; then fi # Set compiler flags. -AM_CXXFLAGS="-g -Wall -Wextra $fail_on_warning -Wno-unknown-pragmas -std=c++11" +##AC_SUBST(AM_CXXFLAGS, "-g -Wall -Wextra $fail_on_warning -Wno-unknown-pragmas -std=c++11") +AM_CXXFLAGS="-g $fail_on_warning -Wno-unknown-pragmas -std=c++11" CXXFLAGS="-O3" CFLAGS="-O3" #AC_SUBST(CPPFLAGS, "$CPPFLAGS $htslib_include $aho_include") -CPPFLAGS="$CPPFLAGS " +CPPFLAGS="$CPPFLAGS $boost_include" CPPFLAGS="$CPPFLAGS" diff --git a/src/configure.ac b/src/configure.ac index c512e0122..0f758778b 100644 --- a/src/configure.ac +++ b/src/configure.ac @@ -20,6 +20,16 @@ AC_CHECK_HEADER([zlib.h]) AC_SEARCH_LIBS([gzopen],[z],,[AC_MSG_ERROR([libz not found, please install zlib (http://www.zlib.net/)])]) AC_SEARCH_LIBS([clock_gettime], [rt], [AC_DEFINE([HAVE_CLOCK_GETTIME], [1], [clock_getttime found])], ) +AC_ARG_WITH(boost, AS_HELP_STRING([--with-boost=PATH], + [specify directory containing the bamtools library (http://github.com/pezmaster31/bamtools)])) +if test "$with_boost" -a -d "$with_boost"; then + boost_include="-I$with_boost" +else + boostr=/xchip/gistic/Jeremiah/boost_1_58_0 + boost_include="-I$boostr" +fi + + # Check for the bamtools library pathz # Bamtools has two different path formats # If it is built in place the files will be in {path}/lib/ and {path}/include/ @@ -73,11 +83,12 @@ if test "$enable_development"; then fi # Set compiler flags. -AC_SUBST(AM_CXXFLAGS, "-g -Wall -Wextra $fail_on_warning -Wno-unknown-pragmas -std=c++11") +##AC_SUBST(AM_CXXFLAGS, "-g -Wall -Wextra $fail_on_warning -Wno-unknown-pragmas -std=c++11") +AC_SUBST(AM_CXXFLAGS, "-g $fail_on_warning -Wno-unknown-pragmas -std=c++11") AC_SUBST(CXXFLAGS, "-O3") AC_SUBST(CFLAGS, "-O3") #AC_SUBST(CPPFLAGS, "$CPPFLAGS $htslib_include $aho_include") -AC_SUBST(CPPFLAGS, "$CPPFLAGS ") +AC_SUBST(CPPFLAGS, "$CPPFLAGS $boost_include") AC_SUBST(CPPFLAGS, "$CPPFLAGS") AC_SUBST(LDFLAGS, "$LDFLAGS") diff --git a/src/source.dot b/src/source.dot new file mode 100644 index 000000000..0ced3c11f --- /dev/null +++ b/src/source.dot @@ -0,0 +1,62 @@ +digraph "source tree" { + overlap=scale; + size="8,10"; + ratio="fill"; + fontsize="16"; + fontname="Helvetica"; + clusterrank="local"; + "MiniRules" -> "SnowUtils" + "MiniRules" -> "GenomicRegionCollection" + "Histogram" -> "IntervalTree" + "SnowTools" -> "CoverageTool" + "RealignTool" -> "BamWalker" + "gChain" -> "GenomicRegionCollection" + "BamWalker" -> "GenomicRegionCollection" + "MiniRules" -> "BamRead" + "VFilterTool" -> "GenomicRegion" + "VariantBamWalker" -> "BamWalker" + "GenomicRegion" -> "HTSTools" + "BWAWrapper" -> "BamRead" + "BreakPoint" -> "GenomicRegionCollection" + "SnowUtils" -> "GenomicRegion" + "GenomicRegion" -> "SnowUtils" + "MiniRules" -> "GenomicRegion" + "AlignedContig" -> "GenomicRegionCollection" + "DiscordantCluster" -> "GenomicRegion" + "BamWalker" -> "GenomicRegion" + "AlignedContig" -> "GenomicRegion" + "GenomicRegionCollection" -> "IntervalTree" + "Read" -> "HTSTools" + "BreakPoint" -> "DiscordantCluster" + "GenomicRegion" -> "gzstream" + "BamWalker" -> "SnowUtils" + "RealignTool" -> "BamRead" + "BamStats" -> "BamRead" + "AlignedContig" -> "BamRead" + "AlignedContig" -> "BreakPoint" + "VFilterTool" -> "SnowUtils" + "BamStats" -> "Histogram" + "AlignedContig" -> "SnowUtils" + "GenomicRegionCollection" -> "GenomicRegion" + "VFilterTool" -> "GenomicRegionCollection" + "BreakPoint" -> "GenomicRegion" + "AlignedContig" -> "BamWalker" + "BamWalker" -> "BamRead" + "BamWalker" -> "HTSTools" + "HTSTools" -> "GenomicRegion" + "BreakPoint" -> "gzstream" + "RealignTool" -> "BWAWrapper" + "SnowUtils" -> "gzstream" + "VFilterTool" -> "gzstream" + "BreakPoint" -> "BamRead" + "VFilterTool" -> "VariantBamWalker" + "BamWalker" -> "MiniRules" + "SnowTools" -> "RealignTool" + "GenomicRegionCollection" -> "gzstream" + "GenomicRegion" -> "SnowToolsCommon" + "DiscordantCluster" -> "BamRead" + "VariantBamWalker" -> "BamStats" + "BamRead" -> "HTSTools" + "RealignTool" -> "AlignedContig" + "AlignedContig" -> "BWAWrapper" +}