From 8aa2fa88f94716acdea8adee5eceb6fa01d121ef Mon Sep 17 00:00:00 2001 From: Jeremiah Wala Date: Fri, 15 May 2015 19:13:54 -0400 Subject: [PATCH] added back SnowTools --- src/SnowTools/AlignedContig.h | 294 +++++++++++++++ src/SnowTools/BWAWrapper.h | 101 ++++++ src/SnowTools/BamRead.h | 142 ++++++++ src/SnowTools/BamStats.h | 85 +++++ src/SnowTools/BamWalker.h | 259 +++++++++++++ src/SnowTools/GenomicRegion.h | 104 ++++++ src/SnowTools/GenomicRegionCollection.h | 200 ++++++++++ src/SnowTools/HTSTools.h | 85 +++++ src/SnowTools/Histogram.h | 168 +++++++++ src/SnowTools/IntervalTree.h | 214 +++++++++++ src/SnowTools/MiniRules.h | 464 ++++++++++++++++++++++++ src/SnowTools/Read.h | 14 + src/SnowTools/SnowTools.h | 5 + src/SnowTools/SnowToolsCommon.h | 106 ++++++ src/SnowTools/SnowUtils.h | 136 +++++++ src/SnowTools/gChain.h | 32 ++ src/SnowTools/gzstream.h | 128 +++++++ 17 files changed, 2537 insertions(+) create mode 100644 src/SnowTools/AlignedContig.h create mode 100644 src/SnowTools/BWAWrapper.h create mode 100644 src/SnowTools/BamRead.h create mode 100644 src/SnowTools/BamStats.h create mode 100644 src/SnowTools/BamWalker.h create mode 100644 src/SnowTools/GenomicRegion.h create mode 100644 src/SnowTools/GenomicRegionCollection.h create mode 100644 src/SnowTools/HTSTools.h create mode 100644 src/SnowTools/Histogram.h create mode 100644 src/SnowTools/IntervalTree.h create mode 100644 src/SnowTools/MiniRules.h create mode 100644 src/SnowTools/Read.h create mode 100644 src/SnowTools/SnowTools.h create mode 100644 src/SnowTools/SnowToolsCommon.h create mode 100644 src/SnowTools/SnowUtils.h create mode 100644 src/SnowTools/gChain.h create mode 100644 src/SnowTools/gzstream.h diff --git a/src/SnowTools/AlignedContig.h b/src/SnowTools/AlignedContig.h new file mode 100644 index 000000000..23c6b2d92 --- /dev/null +++ b/src/SnowTools/AlignedContig.h @@ -0,0 +1,294 @@ +#ifndef ALIGNED_CONTIG_H +#define ALIGNED_CONTIG_H + +#include + +#include "SnowTools/GenomicRegion.h" +#include "SnowTools/GenomicRegionCollection.h" +#include "SnowTools/SnowUtils.h" + +#include "BamToolsUtils.h" +#include "BreakPoint.h" + +using SnowTools::GRC; + +typedef std::vector CigarOpVec; +typedef std::unordered_map CigarMap; + +class AlignedContig; +typedef std::unordered_map ContigMap; +typedef std::vector AlignedContigVec; + +/*! This class contains a single alignment fragment from a contig to + * the reference. For a multi-part mapping of a contig to the reference, + * an object of this class represents just a single fragment from that alignment. + */ +struct AlignmentFragment { + + friend AlignedContig; + + /*! Construct an AlignmentFragment from a BWA alignment + * @param const reference to a BamAlignment + * @param const reference to a GenomicRegion window where this contig was assembled from + */ + AlignmentFragment(const BamTools::BamAlignment &talign, const SnowTools::GenomicRegion &window, const CigarMap &nmap, const CigarMap &tmap); + + //! sort AlignmentFragment objects by start position + bool operator < (const AlignmentFragment& str) const { return (start < str.start); } + + //! print the AlignmentFragment + friend std::ostream& operator<<(std::ostream &out, const AlignmentFragment& c); + + /*! @function + * @abstract Parse an alignment frag for a breakpoint + * @param reference to a BreakPoint to be created + * @return boolean informing whether there was a remaining indel break + */ + bool parseIndelBreak(BreakPoint &bp); + + /*! @function check if breakpoints match any cigar strings direct from the reads + * This is important in case the assembly missed a read (esp normal) that indicates + * that there is an indel. Basically this function says that if assembly calls a breakpoint + * and it agrees with a read alignment breakpoint, combine the info. + * @param const CigarMap reference containing hash with key=chr_breakpos_indeltype, val=tumor count + * @param const CigarMap reference containing hash with key=chr_breakpos_indeltype, val=normal count + */ + void indelCigarMatches(const CigarMap &nmap, const CigarMap &tmap); + + BPVec indel_breaks; /**< indel variants on this alignment */ + + CigarOpVec cigar; /**< cigar oriented to assembled orientation */ + + private: + + BamTools::BamAlignment align; /**< BWA alignment to reference */ + + size_t idx = 0; // index of the cigar where the last indel was taken from + + int break1 = -1; // 0-based breakpoint 1 on contig + int break2 = -1; /**< 0-based breakpoint 2 on contig */ + int gbreak1 = -1; /**< 0-based breakpoint 1 on reference chr */ + int gbreak2 = -1; /**< 0-based breakpoint 1 on reference chr */ + + size_t start; /**< the start position of this alignment on the reference. */ + + bool local = false; /**< boolean to note whether this fragment aligns to same location is was assembled from */ + + std::string m_name; // name of the entire contig + + std::string m_seq; // sequence of the entire contig + +}; + +// define a way to order the contigs by start +/*struct AlignmentOrdering { + inline bool operator() (const AlignmentFragment& struct1, const AlignmentFragment& struct2) { + return (struct1.start < struct2.start); + } + };*/ + +//! vector of AlignmentFragment objects +typedef std::vector AlignmentFragmentVector; + +/*! Contains the mapping of an aligned contig to the reference genome, + * along with pointer to all of the reads aligned to this contig, and a + * store of all of the breakpoints associated with this contig + */ +class AlignedContig { + + friend AlignmentFragment; + + public: + + /*! Constructor which parses an alignment record from BWA (a potentially multi-line SAM record) + * @param const reference to a string representing a SAM alignment (contains newlines if multi-part alignment) + * @param const pointer to a BamReader, which is used to convert chr ids to strings (e.g. X, Y) + * @param const reference to a SnowTools::GenomicRegion window specifying where in the reference this contig was assembled from. + */ + AlignedContig(const std::string &sam, const BamTools::BamReader * reader, const SnowTools::GenomicRegion &twindow, + const CigarMap &nmap, const CigarMap &tmap); + + std::string samrecord; /**< the original SAM record */ + + SnowTools::GenomicRegion window; /**< reference window from where this contig was assembled */ + + /*! @function Determine if this contig has identical breaks and is better than another. + * @param const reference to another AlignedContig + * @return bool bool returning true iff this contig has identical info has better MAPQ, or equal MAPQ but longer */ + bool isWorse(const AlignedContig &ac) const; + + /*! @function + @abstract Get whether the query is on the reverse strand + @param b pointer to an alignment + @return boolean true if query is on the reverse strand + */ + void addAlignment(const BamTools::BamAlignment &align, const SnowTools::GenomicRegion &window, + const CigarMap &nmap, const CigarMap &tmap); + + //! add a discordant cluster that maps to same regions as this contig + void addDiscordantCluster(DiscordantCluster dc) { m_dc.push_back(dc); } + + /*! @function loop through the vector of DiscordantCluster objects + * associated with this contig and print + */ + std::string printDiscordantClusters() const; + + //! return the name of the contig + std::string getContigName() const { assert(m_align.size()); return m_align[0].align.Name; } + + /*! @function get the maximum mapping quality from all alignments + * @return int max mapq + */ + int getMaxMapq() const { + int m = -1; + for (auto& i : m_align) + if (i.align.MapQuality > m) + m = i.align.MapQuality; + return m; + } + + /*! @function get the minimum mapping quality from all alignments + * @return int min mapq + */ + int getMinMapq() const { + int m = 1000; + for (auto& i : m_align) + if (i.align.MapQuality < m) + m = i.align.MapQuality; + return m; + } + + /*! @function loop through all of the breakpoints and + * calculate the split read support for each. Requires + * alignedReads to have been run first (will error if not run). + */ + void splitCoverage(); + + /*! Checks if any of the indel breaks are in a blacklist. If so, mark the + * breakpoints of the indels for skipping. That is, hasMinmal() will return false; + * @param grv The blaclist regions + */ + void blacklist(GRC& grv); + + /*! @function if this is a Discovar contig, extract + * the tumor and normal read support + * @param reference to int to fill for normal support + * @param reference to int to fill for tumor support + * @return boolean reporting if this is a discovar name + */ + bool parseDiscovarName(size_t &tumor, size_t &normal); + + /*! @function dump the contigs to a fasta + * @param ostream to write to + */ + void printContigFasta(std::ofstream &os) const; + + /*! @function set the breakpoints on the reference by combining multi-mapped contigs + */ + void setMultiMapBreakPairs(); + + /*! @function align reads to contig and modify their tags to show contig mapping + * Currently this function will attempt a SmithWaterman alignment for all reads + * that don't have an exact mapping to the contig. + * @param bav Vector of read smart pointers to align. Modifies their SW tag + */ + void alignReadsToContigs(ReadVec &bav); + + //! return the contig sequence as it came off the assembler + std::string getSequence() const { assert(m_seq.length()); return m_seq; } + + //! detemine if the contig contains a subsequence + bool hasSubSequence(const std::string& subseq) const { + return (m_seq.find(subseq) != std::string::npos); + } + + //! print this contig + friend std::ostream& operator<<(std::ostream &out, const AlignedContig &ac); + + /*! @function query if this contig contains a potential variant (indel or multi-map) + * @return true if there is multimapping or an indel + */ + bool hasVariant() const; + + /*! @function retrieves all of the breakpoints by combining indels with global mutli-map break + * @return vector of ind + */ + std::vector getAllBreakPoints() const; + std::vector getAllBreakPointPointers() const ; + + std::vector m_local_breaks; // store all of the multi-map BreakPoints for this contigs + + BreakPoint m_global_bp; // store the single spanning BreakPoing for this contig e + + ReadVec m_bamreads; // store smart pointers to all of the reads that align to this contig + + bool m_skip = false; // flag to specify that we should minimally process and simply dump to contigs_all.sam + + AlignmentFragmentVector m_align; // store all of the individual alignment fragments + + private: + + bool m_hasvariant = false; // flag to specify whether this alignment has some potential varaint (eg indel) + + bool m_tried_align_reads = false; // flag to specify whether we tried to align reads. + + std::string m_seq = ""; // sequence of contig as it came off of assembler + + std::vector m_dc; // collection of all discordant clusters that map to same location as this contig + +}; + +struct PlottedRead { + + int pos; + std::string seq; + std::string info; + + bool operator<(const PlottedRead& pr) const { + return (pos < pr.pos); + } + +}; + +typedef std::vector PlottedReadVector; + +struct PlottedReadLine { + + std::vector read_vec; + int available = 0; + int contig_len = 0; + + void addRead(PlottedRead *r) { + read_vec.push_back(r); + available = r->pos + r->seq.length() + 5; + } + + bool readFits(PlottedRead &r) { + return (r.pos >= available); + } + + friend std::ostream& operator<<(std::ostream& out, const PlottedReadLine &r) { + int last_loc = 0; + for (auto& i : r.read_vec) { + assert(i->pos - last_loc >= 0); + out << std::string(i->pos - last_loc, ' ') << i->seq; + last_loc = i->pos + i->seq.length(); + } + int name_buff = r.contig_len - last_loc; + assert(name_buff < 10000); + out << std::string(max(name_buff, 5), ' '); + for (auto& i : r.read_vec) { // add the data + out << i->info << ","; + } + return out; + } + +}; + +typedef std::vector PlottedReadLineVector; + + + +#endif + + diff --git a/src/SnowTools/BWAWrapper.h b/src/SnowTools/BWAWrapper.h new file mode 100644 index 000000000..8514b5cf8 --- /dev/null +++ b/src/SnowTools/BWAWrapper.h @@ -0,0 +1,101 @@ +#ifndef SNOWTOOLS_BWAWRAPPER_H__ +#define SNOWTOOLS_BWAWRAPPER_H__ + +#include +#include +#include +#include +#include + +#include "SnowTools/BamRead.h" + +#define MEM_F_SOFTCLIP 0x200 + +extern "C" { + #include "bwa/bwa.h" + #include "bwa/bntseq.h" + #include "bwa/kseq.h" + #include + #include "bwa/utils.h" + #include "bwa/bwamem.h" + //#include "bwa/is.c" + int is_bwt(ubyte_t *T, int n); + +} + +KSEQ_DECLARE(gzFile) + +using SnowTools::BamRead; + +namespace SnowTools { + + /** Structure to hold unaligned sequence (name and bases) + */ + struct USeq { + std::string name; + std::string seq; + }; + typedef std::vector USeqVector; + +/** Calls BWA-MEM on sequence queries and returns aligned reads, all in memory + */ +class BWAWrapper { + + public: + + /** Create a new BWAWrapper + */ + BWAWrapper() { + memopt = mem_opt_init(); + memopt->flag |= MEM_F_SOFTCLIP; + } + + ~BWAWrapper() { + + if (idx) + bwa_idx_destroy(idx); + if (memopt) + free(memopt); + } + + void alignSingleSequence(const std::string& seq); + + /** Construct a new bwa index for this object. + */ + void constructIndex(const USeqVector& v); + + /** Retrieve a bwa index object from disk + */ + void retrieveIndex(const std::string& file); + + /** Dump the stored index to files + */ + void writeIndexToFiles(const std::string& index_name); + + private: + + mem_opt_t * memopt; + + bwaidx_t* idx = 0; + + // overwrite the bwa bwt_pac2pwt function + bwt_t *__bwt_pac2bwt(const uint8_t *pac, int bwt_seq_lenr); + + // add an anns (chr annotation structure) + bntann1_t* __add_to_anns(const std::string& name, const std::string& seq, bntann1_t * ann, size_t offset); + + // overwrite the bwa-mem add1 function, which takes a sequence and adds to pac + uint8_t* __add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_pac, int *m_seqs, int *m_holes, bntamb1_t **q); + + // make the pac structure (2-bit encoded packed sequence) + uint8_t* __make_pac(const USeqVector& v, bool for_only, bool write_file); + + void __write_pac_to_file(const std::string& file); + + std::string __print_bns(); +}; + +} + + +#endif diff --git a/src/SnowTools/BamRead.h b/src/SnowTools/BamRead.h new file mode 100644 index 000000000..50cf217d4 --- /dev/null +++ b/src/SnowTools/BamRead.h @@ -0,0 +1,142 @@ +#ifndef SNOWTOOLS_READ_H__ +#define SNOWTOOLS_READ_H__ + +#include + +#include "SnowTools/HTSTools.h" +#include + +namespace SnowTools { + +/** Class to store and interact with an HTSLib bam1_t read. + * + * HTSLibrary reads are stored in the bam1_t struct. Memory allocation + * is taken care of by bam1_t init, and deallocation by destroy_bam1. This + * class is a C++ interface that automatically takes care of memory management + * for these C allocs/deallocs. The only member of BamRead is a bam1_t object. + * Alloc/dealloc is taken care of by the constructor and destructor. + */ +class BamRead { + + public: + + /** Construct an empty BamRead by calling bam_init1() + */ + BamRead(); + + /** Destory a BamRead by calling bam_destroy1(b) + */ + ~BamRead(); + + /** BamRead is aligned on reverse strand */ + inline bool ReverseFlag() const { return (b->core.flag&BAM_FREVERSE) != 0; } + + /** Mate of BamRead is aligned on reverse strand */ + inline bool MateReverseFlag() const { return (b->core.flag&BAM_FMREVERSE) != 0; } + + /** Get the alignment position */ + inline int32_t Position() const { return b->core.pos; } + + /** Get the alignment position of the mate */ + inline int32_t MatePosition() const { return b->core.mpos; } + + /** Get the chromosome ID of the read */ + inline int32_t ChrID() const { return b->core.tid; } + + /** Get the chrosome ID of the mate read */ + inline int32_t MateChrID() const { return b->core.mtid; } + + /** Check if this read is mapped*/ + inline bool MappedFlag() const { return !(b->core.flag&BAM_FUNMAP); } + + /** Check if this reads mate is mapped*/ + inline bool MateMappedFlag() const { return !(b->core.flag&BAM_FMUNMAP); } + + /** Get the mapping quality */ + inline int32_t MapQuality() const { return b->core.qual; } + + /** Get the number of cigar fields */ + inline int32_t CigarSize() const { return b->core.n_cigar; } + + /** Check if this read is first in pair */ + inline bool FirstFlag() const { return (b->core.flag&BAM_FREAD1); } + + /** Get the qname of this read as a string */ + inline std::string Qname() const { return std::string(bam_get_qname(b)); } + + /** Get the qname of this read as a char array */ + inline char* QnameChar() const { return bam_get_qname(b); } + + /** Get the full alignment flag for this read */ + inline int32_t AlignmentFlag() const { return b->core.flag; } + + /** Get the insert size for this read */ + inline int32_t InsertSize() const { return b->core.isize; } + + /** Get the number of query bases of this read (aka length) */ + inline int32_t Length() const { return b->core.l_qseq; } + + void SetMapQuality(int32_t m) { b->core.qual = m; } + + /** Set the query name */ + void SetQname(std::string n); + + /** Set the sequence name */ + void SetSequence(std::string s); + + friend std::ostream& operator<<(std::ostream& out, const BamRead &r); + + /** Get the sequence of this read as a string */ + inline std::string Sequence() const { + uint8_t * p = bam_get_seq(b); + char c[b->core.l_qseq+1]; // need +1 for \0 at end + for (int32_t i = 0; i < b->core.l_qseq; ++i) + c[i] = BASES[bam_seqi(p, i)]; + c[b->core.l_qseq] = '\0'; // null terminate for proper std::string construciton + return std::string(c); + } + + /** Get the number of soft clipped bases */ + inline int32_t NumSoftClip() const { + int32_t p = 0; + uint32_t* c = bam_get_cigar(b); + for (int32_t i = 0; i < b->core.n_cigar; ++i) + if (c[i] & BAM_CSOFT_CLIP) + p += bam_cigar_oplen(c[i]); + return p; + } + + /** Get the number of clipped bases (hard clipped and soft clipped) */ + inline int32_t NumClip() const { + int32_t p = 0; + uint32_t* c = bam_get_cigar(b); + for (int32_t i = 0; i < b->core.n_cigar; ++i) + if ( (c[i] & BAM_CSOFT_CLIP) || (c[i] & BAM_CHARD_CLIP) ) + p += bam_cigar_oplen(c[i]); + return p; + } + + /** Get a string (Z) tag + * @param tag Name of the tag. e.g. "XP" + * @return The value stored in the tag. Returns empty string if it does not exist. + */ + inline std::string GetZTag(const std::string& tag) const { + uint8_t* p = bam_aux_get(b,tag.c_str()); + if (!p) + return ""; + return std::string(bam_aux2Z(p)); + + } + + //std::string toSam(bam_hdr_t* h) const; + + private: + + //bam_hdr_h *hdr; + bam1_t * b; + + +}; + +} +#endif diff --git a/src/SnowTools/BamStats.h b/src/SnowTools/BamStats.h new file mode 100644 index 000000000..2b65911d9 --- /dev/null +++ b/src/SnowTools/BamStats.h @@ -0,0 +1,85 @@ +#ifndef SNOWTOOLS_BAMSTATS_H__ +#define SNOWTOOLS_BAMSTATS_H__ + +#include +#include +#include + +#include "SnowTools/Histogram.h" +#include "SnowTools/HTSTools.h" + +namespace SnowTools{ + + /** Store information pertaining to a given read group * + * + * This class will collect statistics on number of: read, supplementary reads, unmapped reads, qcfail reads, duplicate reads. + * It will also create Histogram objects to store counts of: mapq, nm, isize, clip, mean phred score, length + */ +class BamReadGroup { + + friend class BamStats; + + public: + + /** Construct an empty BamReadGroup */ + BamReadGroup() {} + + /** Construct an empty BamReadGroup for the specified read group + * @param name Name of the read group + */ + BamReadGroup(const std::string& name); + + /** Display basic information about this read group + */ + friend std::ostream& operator<<(std::ostream& out, const BamReadGroup& rg); + + /** Add a BamRead to this read group */ + void addRead(Read &r); + + private: + + size_t reads; + size_t supp; + size_t unmap; + size_t qcfail; + size_t duplicate; + + Histogram mapq; + Histogram nm; + Histogram isize; + Histogram clip; + Histogram phred; + Histogram len; + + std::string m_name; + +}; + +/** Class to store statistics on a BAM file. + * + * BamStats currently stores a map of BamReadGroup objects. Bam statistics + * are collected then on a read-group basis, but can be output in aggregate. See + * BamReadGroup for description of relevant BAM statistics. + */ +class BamStats +{ + + public: + + /** Loop through the BamReadGroup objections and print them */ + friend std::ostream& operator<<(std::ostream& out, const BamStats& qc); + + /** Add a read by finding which read group it belongs to and calling the + * addRead function for that BamReadGroup. + */ + void addRead(Read &r); + + private: + + std::unordered_map m_group_map; + +}; + +} + +#endif diff --git a/src/SnowTools/BamWalker.h b/src/SnowTools/BamWalker.h new file mode 100644 index 000000000..6b56c4ccb --- /dev/null +++ b/src/SnowTools/BamWalker.h @@ -0,0 +1,259 @@ +#ifndef SNOWTOOLS_BAM_WALKER_H__ +#define SNOWTOOLS_BAM_WALKER_H__ + +#include + +#include "SnowTools/MiniRules.h" +#include "SnowTools/GenomicRegion.h" +#include "SnowTools/GenomicRegionCollection.h" + +#include "SnowTools/HTSTools.h" +#include "SnowTools/SnowUtils.h" + +// Phred score transformations +inline int char2phred(char b) { + uint8_t v = b; + assert(v >= 33); + return v - 33; +} + +// from samtools +inline char *samfaipath(const char *fn_ref) +{ + char *fn_list = 0; + if (fn_ref == 0) return 0; + fn_list = (char*)calloc(strlen(fn_ref) + 5, 1); + strcat(strcpy(fn_list, fn_ref), ".fai"); + if (access(fn_list, R_OK) == -1) { // fn_list is unreadable + std::cerr << "ERROR: Cannot read the index file for CRAM read/write" << std::endl; + } + return fn_list; +} + +namespace SnowTools { + +/////////////// +// Hold read counts +////////////// +struct ReadCount { + + int keep = 0; + int total = 0; + + int percent () const { + int perc = SnowTools::percentCalc(keep, total); + return perc; + } + + std::string totalString() const { + return SnowTools::AddCommas(total); + } + + std::string keepString() const { + return SnowTools::AddCommas(keep); + } + +}; + +/** Walk along a BAM or along BAM regions and stream in/out reads + */ +class BamWalker { + + public: + + /** Construct a new BamWalker for streaming data in and streaming + * out to a new BAM file. + * + * This constructor will open the BAM file and read the header. + * It will also open the out BAM file. + */ + //BamWalker(const std::string& in, const std::string& out); + + /** Construct a new BamWalker for reading a BAM + */ + BamWalker(const std::string& in); + + /** Construct an empty BamWalker */ + BamWalker() {} + + /** Destroy a BamWalker and close all connections to the BAM + * + * Calling the destructor will take care of all of the C-style dealloc + * calls required within HTSlib to close a BAM or SAM file. + */ + ~BamWalker() { + __close_read_bam(); + __close_write_bam(); + } + + /** Set a part of the BAM to walk. + * + * This will set the BAM pointer to the given region. + * @param gp Location to point the BAM to + */ + void setBamWalkerRegion(const GenomicRegion& gp); + + /** Set up multiple regions. Overwrites current regions. + * + * This will set the BAM pointer to the first element of the + * input list. + * @param grv Set of location to point BAM to + */ + void setBamWalkerRegions(const GenomicRegionVector& grv); + + /** Create the index file for the output bam in BAI format. + * + * This will make a call to HTSlib bam_index_build for the output file. + */ + void MakeIndex(); + + /** Print a run-time message to stdout. + * + * Prints a message about all of the reads that have been visited, and informaiton + * about the current read + */ + void printRuntimeMessage(const ReadCount &rc_main, const Read &r) const; + + /** Print out some basic info about this walker, + * including Minz0iRules + */ + friend std::ostream& operator<<(std::ostream& out, const BamWalker& b); + + /** Open a BAM file for streaming in + */ + bool OpenReadBam(const std::string& bam); + + /** Open a BAM file for streaming out + */ + bool OpenWriteBam(const std::string& bam); + + /** Pass a MiniRules script to the BAM. + * + * This will call the constructor of MiniRulesCollection, and + * parse the provides rules and add it as a rule set to this BamWalker. + * @param rules A string of rules, or a file pointing to a rules script + */ + void SetMiniRulesCollection(const std::string& rules); + + /** Explicitly provide a MiniRulesCollection to this BamWalker + */ + void SetMiniRulesCollection(const MiniRulesCollection& mr) { m_mr = mr; } + + /** Retrieve the next read from the BAM. + * + * If a MiniRulesCollection is defined for this BAM + * will grab the next valid read. + * r Read to fill with data + * rule bool identifying if this read passed the rules + * @return true if the next read is available + */ + bool GetNextRead(Read &r, bool& rule); + + /** Write an alignment to the output BAM file + * @param r The BamRead to save + */ + void WriteAlignment(Read &r); + + /** Return the MiniRulesCollection object used by this BamWalker + */ + const MiniRulesCollection& GetMiniRulesCollection() const { return m_mr; } + + /** Send the counts for passed rules to a file */ + void MiniRulesToFile(const std::string& file) const { m_mr.countsToFile(file); } + + /** Set the BamWalker to count reads for all rules */ + void setCountAllRules() { m_mr.m_fall_through = true; } + + + std::string m_in; + std::string m_out; + MiniRulesCollection m_mr; + + /** Set a flag to say if we should print reads to stdout + */ + void setStdout() { fop = sam_open("-", "w"); } + + /** Set a flag to say if we should print reads to CRAM format + */ + void setCram(const std::string& out, const std::string& ref) { + m_out = out; + fop = sam_open(m_out.c_str(), "wc"); + if (!fop) { + std::cerr << "!!!\n!!!\n!!!\nCannot open CRAM file for writing. Will try BAM next. File: " << m_out << std::endl; + return; + } + + // need to open reference for CRAM writing + char* fn_list = samfaipath(ref.c_str()); + if (fn_list) { + if (hts_set_fai_filename(fop, fn_list) != 0) { + fprintf(stderr, "Failed to use reference \"%s\".\n", fn_list); + } + } else { + std::cerr << "Failed to get the reference for CRAM compression" << std::endl; + } + m_print_header = true; + } + + void setPrintHeader() { + m_print_header = true; + } + + /** Set the output bam to remove all alignment tags */ + void setStripAllTags() { m_strip_all_tags = true; } + + /** Set a list of tags to strip */ + void setStripTags(const std::string& list); + + /** Set to have verbose actions */ + void setVerbose() { m_verbose = true; } + + /** Return the MiniRulesCollection as a string */ + std::string displayMiniRulesCollection() const; + + protected: + + // for stdout mode, print header? + bool m_print_header = false; + + // point index to this region of bam + bool __set_region(const GenomicRegion& gp); + + struct timespec start; + + //open bam, true if success + bool __open_BAM_for_reading(); + + // close this bam file and delete all pointers + void __close_read_bam(); + void __close_write_bam(); + + // open m_out, true if success + bool __open_BAM_for_writing(); + + // define the regions to walk + size_t m_region_idx = 0; + GenomicRegionVector m_region; + + bool m_strip_all_tags = false; + + // hts + BGZF * fp = 0; + hts_idx_t * idx = 0; // hts_idx_load(bamfile.c_str(), HTS_FMT_BAI); + hts_itr_t * hts_itr = 0; // sam_itr_queryi(idx, 3, 60000, 80000); + bam_hdr_t * br = 0; + + htsFile* fop = 0; + + // which tags to strip + std::vector m_tag_list; + + // + bool m_verbose = false; +}; + + +} +#endif + + diff --git a/src/SnowTools/GenomicRegion.h b/src/SnowTools/GenomicRegion.h new file mode 100644 index 000000000..18465b723 --- /dev/null +++ b/src/SnowTools/GenomicRegion.h @@ -0,0 +1,104 @@ +#ifndef SNOWTOOLS_GENOMIC_REGION_H__ +#define SNOWTOOLS_GENOMIC_REGION_H__ + +#include +#include +#include +#include +#include + +#include "SnowTools/SnowUtils.h" +#include "SnowTools/SnowToolsCommon.h" +#include "SnowTools/HTSTools.h" + +/** + */ +namespace SnowTools { +class GenomicRegion { + + template friend class GenomicRegionCollection; + + public: + + /** Construct an "empty" GenomicRegion at (chr -1), pos 0, width = 1 + */ + GenomicRegion() : chr(-1), pos1(0), pos2(0) {}; + + /** Construct a GenomicRegion at a specific start and end location + * @param t_chr Chromosome id (chr1 = 0, etc) + * @param t_pos1 Start position + * @param t_pos2 End position + * @param strand true for positive, false for negative + */ + GenomicRegion(int32_t t_chr, uint32_t t_pos1, uint32_t t_pos2, bool t_strand = true); + + /** Construct a GenomicRegion from a string + */ + GenomicRegion(std::string t_chr, std::string t_pos1, std::string t_pos2); + + /** Construct a GenomicRegion from a samtools style region string. + * + * This calls the samtools-like parser, which accepts in form "chr7:10,000-11,100". + * Note that this requires that a pointer to the BAM header be provided as well + * to convert the text representation of the chr to the id number. + */ + GenomicRegion(const std::string& reg, bam_hdr_t* h); + + static int32_t chrToNumber(std::string ref); + static std::string chrToString(int32_t ref); + + static uint32_t posToBigPos(int32_t refid, uint32_t pos); + + /** Randomize the position of this GenomicRegion on the genome + * + * Creates a GenomicRegion with pos1 = pos2. Simulates a random value + * with val <= genome_size_XY and then converts to GenomicRegion + */ + void random(); + + /** Check if the GenomicRegion is empty (aka chr -1) + */ + bool isEmpty() const; + + /** Find the distance between two GenomicRegion objects + */ + int distance(const GenomicRegion &gr) const; + + // define how these are to be sorted + bool operator < (const GenomicRegion& b) const; + bool operator==(const GenomicRegion& b) const; + bool operator<=(const GenomicRegion &b) const; + + // determine if something overlaps with centromere + int centromereOverlap() const; + + // check if there is an overlap + int getOverlap(const GenomicRegion gr) const; + + friend std::ostream& operator<<(std::ostream& out, const GenomicRegion& gr); + std::string toString() const; + void pad(uint32_t pad); + + int width() const; + + int32_t chr = 0; + uint32_t pos1 = 0; + uint32_t pos2 = 0; + bool strand = true; // true = pos, false = neg + + private: + + + //char strand = '*'; + //std::string id; + //int mapq = 0; + +}; + +typedef std::vector GenomicRegionVector; +typedef GenomicRegion GR; + +} + + +#endif diff --git a/src/SnowTools/GenomicRegionCollection.h b/src/SnowTools/GenomicRegionCollection.h new file mode 100644 index 000000000..aa4621b81 --- /dev/null +++ b/src/SnowTools/GenomicRegionCollection.h @@ -0,0 +1,200 @@ +#ifndef SWAP_GENOMIC_REGION_COLLECTION_H__ +#define SWAP_GENOMIC_REGION_COLLECTION_H__ + +#include +#include +#include +#include +#include + +#include "SnowTools/IntervalTree.h" +#include "SnowTools/GenomicRegion.h" + + +/** Class to store vector of intervals on the genome + */ + +namespace SnowTools { + +typedef Interval GenomicInterval; +typedef std::unordered_map > GenomicIntervalMap; +typedef IntervalTree GenomicIntervalTree; +typedef std::unordered_map GenomicIntervalTreeMap; +typedef std::vector GenomicIntervalVector; + +template +class GenomicRegionCollection { + + public: + + /** Construct an empty GenomicRegionCollection + */ + GenomicRegionCollection() {}; + + /** Construct from a plain vector of GenomicRegion objects + */ + GenomicRegionCollection(std::vector& vec); + + /** Construct from a single GenomicRegion + */ + GenomicRegionCollection(const T& gr); + + /** Construct a GenomicRegionCollection with overlapping intervals + * + * @param width Desired bin width + * @param ovlp Amount that the bins should overlap + * @param gr GenomicRegion to divide into smaller overlapping bins + */ + GenomicRegionCollection(int width, int ovlp, const T &gr); + + /** Read in a MuTect call-stats file and adds to GenomicRegionCollection object. + * + * Reads a MuTect call-stats file and imports only + * lines with KEEP marked. + * @param file Path to call-stats file + * @param pad Amount to pad intervals by + */ + void readMuTect(const std::string &file, int pad = 0); + + /** Read in a BED file and adds to GenomicRegionCollection object + * @param file Path to BED file + * @param pad Amount to pad intervals by + */ + void readBEDfile(const std::string &file, int pad = 0); + + /** Read in a VCF file and adds to GenomicRegionCollection object + * @param file Path to BED file + * @param pad Amount to pad intervals by + */ + void readVCFfile(const std::string &file, int pad = 0); + + /** Read in a text file (can be gzipped) and add to GenomicRegionCollection + * + * This function will automatically detect which file type is being input: + * -- ends in .vcf -> readVCFfile + * -- ends in .bed -> readBEDfile + * -- header contains "MuTect" -> readMuTect + * The values are appended to existing vector of GenomicRegion objects + * @param file Text file to read and store intervals + * @param pad Amount to pad the intervals by (calls GenomicRegion::pad) + */ + void regionFileToGRV(const std::string &file, int pad = 0); + + /** Fill in the GenomicIntervalTreeMap stored in this object. + * + * A GenomicIntervalTreeMap is an unordered_map of GenomicIntervalTrees for + * each chromosome. A GenomicIntervalTree is an interval tree on the ranges + * defined by the genomic interval, with cargo set at the same GenomicRegion object. + */ + void createTreeMap(); + + /** Send the GenomicRegionCollection to a BED file + * @param file Output file + */ + void sendToBED(const std::string file); + + /** Reduces the GenomicRegion objects to minimal set + */ + void mergeOverlappingIntervals(); + + /** Return the number of GenomicRegions stored + */ + size_t size() const { return m_grv.size(); } + + /** Add a new GenomicRegion to end + */ + void add(const T& g) { m_grv.push_back(g); createTreeMap(); } + + /** Is this object empty? + */ + bool empty() const { return !m_grv.size(); } + + /** Clear out all of the GenomicRegion objects + */ + void clear() { m_grv.clear(); m_tree.clear(); idx = 0; } + + /** Retrieve a GenomicRegion at given index. + * + * Note that this does not move the idx iterator, which is + * used to loop through all the regions. Throws an exception + * if the index is out of bounds. + * @return GenomicRegion pointed to by index i + */ + const T& at(size_t i) const; + + /** Find overlaps between this vector and input GenomicRegion. + * + * Requires that the GenomicIntervalTreeMap have been created first + * @param gr Region to test + * @return Number of overlapping elements in this GenomicRegionCollection + */ + size_t findOverlapping(const T &gr); + + size_t countContained(const T &gr); + + GenomicRegionCollection findOverlaps(GenomicRegionCollection &subject, std::vector& query_id, std::vector& subject_id, bool ignore_strand = false); + + /** Increase the left and right ends of each contained GenomicRegion by + * the pad value. + * @param v Amount to pad each end by. Result is increase in width by 2*pad. + */ + void pad(int v); + + /** Set the i'th GenomicRegion */ + T& operator[](size_t i) { return m_grv[i]; } + + /** Retreive the i'th GenomicRegion */ + const T& operator[](size_t i) const { return m_grv[i]; } + + /** Add two GenomicRegionCollection objects together + */ + void concat(const GenomicRegionCollection& g); + + /** Output the GenomicRegionCollection to a BED format + * + * Currently just outputs chr pos pos2 with no header. + * @return BED formated string reprsentation + */ + std::string sendToBED() const; + + /** Fill the next GenomicRegion object. + * @return false if add end of vector + */ + bool getNextGenomicRegion(T& gr); + + /** Rewind the element pointer to the first GenomicRegion + */ + void rewind() { idx = 0; } + + GenomicRegionVector asGenomicRegionVector() const { + GenomicRegionVector gg; + for (auto& i : m_grv) + gg.push_back(GenomicRegion(i.chr, i.pos1, i.pos2, i.strand)); + return gg; + } + + typename std::vector::iterator begin() { return m_grv.begin(); } + + typename std::vector::iterator end() { return m_grv.end(); } + + // always construct this object any time m_grv is modifed + GenomicIntervalTreeMap m_tree; + + GenomicRegionCollection intersection(GenomicRegionCollection& subject, bool ignore_strand = false); + + std::vector m_grv; + + private: + + // index for current GenomicRegion + size_t idx = 0; + +}; + +typedef GenomicRegionCollection GRC; + +} + +#include "GenomicRegionCollection.cpp" + +#endif diff --git a/src/SnowTools/HTSTools.h b/src/SnowTools/HTSTools.h new file mode 100644 index 000000000..4bc299b3f --- /dev/null +++ b/src/SnowTools/HTSTools.h @@ -0,0 +1,85 @@ +#ifndef SNOWTOOLS_HTSTOOLS_H__ +#define SNOWTOOLS_HTSTOOLS_H__ + +#include +#include +#include +#include +#include + +#include "htslib/hts.h" +#include "htslib/sam.h" +#include "htslib/bgzf.h" +#include "htslib/kstring.h" + + +static const char BASES[16] = {' ', 'A', 'C', ' ', + 'G', ' ', ' ', ' ', + 'T', ' ', ' ', ' ', + ' ', ' ', ' ', 'N'}; + +struct free_delete { + void operator()(void* x) { bam_destroy1((bam1_t*)x); } +}; + +typedef std::shared_ptr Read; +typedef std::vector ReadVec; + +#define r_is_rev(b) (((b)->core.flag&BAM_FREVERSE) != 0) +#define r_is_mrev(b) (((b)->core.flag&BAM_FMREVERSE) != 0) +#define r_pos(b) ((b)->core.pos) +#define r_mpos(b) ((b)->core.mpos) +#define r_id(b) ((b)->core.tid) +#define r_mid(b) ((b)->core.mtid) +#define r_is_mapped(b) (!((b)->core.flag&BAM_FUNMAP)) +#define r_is_mmapped(b) (!((b)->core.flag&BAM_FMUNMAP)) +#define r_mapq(b) ((b)->core.qual) +#define r_cig_size(b) ((b)->core.n_cigar) +#define r_is_first(b) ((b)->core.flag&BAM_FREAD1) +#define r_qname(b) (std::string(bam_get_qname((b).get()))) +#define r_remove_tag(b, t) do { uint8_t *p = bam_aux_get((b).get(), t); if (p) bam_aux_del((b).get(), p); } while(0) +#define r_flag(b) ((b)->core.flag) +#define r_is_pmapped(b) (!((b)->core.flag&BAM_FUNMAP) && !((b)->core.flag&BAM_FMUNMAP) && ((b)->core.flag&BAM_FPAIRED)) +#define r_length(b) ((b)->core.l_qseq) +#define r_isize(b) ((b)->core.isize) +#define r_seq(b, s) do { uint8_t * p = bam_get_seq(b); char c[(b)->core.l_qseq+1]; for (int ww = 0; ww < (b)->core.l_qseq; ww++) { c[ww] = BASES[bam_seqi(p, ww)]; } c[(b)->core.l_qseq] = '\0'; s = std::string(c); } while(0) +#define r_get_clip(b, p) do { uint32_t* c = bam_get_cigar(b); for (int ww = 0; ww < (b)->core.n_cigar; ww++) if (c[ww] & BAM_CSOFT_CLIP || c[ww] & BAM_CHARD_CLIP) p += bam_cigar_oplen(c[ww]); } while(0) + +#define r_get_Z_tag(b, t, v) do { uint8_t* p = bam_aux_get((b).get(), t); if (p) v = std::string(bam_aux2Z(p)); else v = std::string(); } while(0) +#define r_get_SR(b, v) do { uint8_t* p = bam_aux_get((b).get(), "SR"); assert(p); v = bam_aux2Z(p); } while(0) +#define r_get_int32_tag(b, t, v) do { uint8_t* p = bam_aux_get((b).get(), t); if (p) v = bam_aux2i(p); else v = 0; } while(0) + +#define r_cig_type(b, i) (bam_cigar_opchr(bam_get_cigar((b).get())[i])) +#define r_cig_len(b, i) (bam_cigar_oplen(bam_get_cigar((b).get())[i])) + +#define r_add_Z_tag(b, t, v) bam_aux_append((b).get(), t, 'Z', (v).length()+1, (uint8_t*)(v).c_str()) +#define r_add_int32_tag(b, t, v) bam_aux_append((b).get(), t, 'i', 4, (uint8_t*)&(v)) +#define r_count_nbases(b, n) do { uint8_t* p = bam_get_seq(b); for (int ww = 0; ww < (b)->core.l_qseq; ww++) if (bam_seqi(p,ww) == 15) n++; } while(0) +#define r_count_sub_nbases(b, n, s, e) do { uint8_t* p = bam_get_seq(b); for (int ww = (s); ww < (e); ww++) if (bam_seqi(p,ww) == 15) n++; } while(0) + +#define r_is_rstrand(b) (((b)->core.flag&BAM_FREVERSE) > 0) +#define r_is_mrstrand(b) (((b)->core.flag&BAM_FMREVERSE) > 0) +#define r_is_dup(b) (((b)->core.flag&BAM_FDUP) > 0) +#define r_is_primary(b) (!((b)->core.flag&BAM_FSECONDARY)) +#define r_is_qc_fail(b) (((b)->core.flag&BAM_FQCFAIL) > 0) +#define r_strand(b) ((b)->core.flag&BAM_FREVERSE ? '-' : '+') +#define r_endpos(b) (bam_endpos((b).get())) + +#define r_get_trimmed_seq(b, s) do { int32_t ts, tl; r_get_int32_tag(b, "TS", ts); r_get_int32_tag(b, "TL", tl); r_seq(b, s); if (tl != 0 && tl != r_length(b)) s = s.substr(ts, tl); } while (0) + + namespace SnowTools { + + int32_t qualityTrimRead(int qualTrim, int32_t &startpoint, Read &r); + + std::vector GetStringTag(const Read& a, const std::string tag); + + void SmartAddTag(Read &a, const std::string tag, const std::string val); + + std::vector GetIntTag(const Read& a, const std::string tag); + + void rcomplement(std::string& a); + + void removeAllTags(Read& a); + } + +#endif diff --git a/src/SnowTools/Histogram.h b/src/SnowTools/Histogram.h new file mode 100644 index 000000000..f945e6439 --- /dev/null +++ b/src/SnowTools/Histogram.h @@ -0,0 +1,168 @@ +#ifndef SNOWTOOLS_HISTOGRAM_H__ +#define SNOWTOOLS_HISTOGRAM_H__ + +#include +#include +#include +#include +#include +#include + +#include "IntervalTree.h" + +class Bin; + +typedef Interval BinInterval; +typedef IntervalTree BinIntervalTree; +typedef std::vector BinIntervalVector; + +typedef uint32_t S; + +#define INTERCHR 250000000 + +namespace SnowTools { + + class Matrix; + class Histogram; + +/** Stores one bin in a Histogram + */ +class Bin { + + friend class Histogram; + + public: + + /** Construct a new object with 0 count and range [0,1] + */ + Bin() : m_count(0) + { + bounds = {0,1}; + } + + /** Output the bin in format "start range, end range, count" + */ + friend std::ostream& operator<<(std::ostream &out, const Bin &b) { + out << b.bounds.first << "," << b.bounds.second << "," << b.m_count; + return out; + } + + /** Return the number of counts in this histogram bin + */ + size_t getCount() const { return m_count; } + + /** Check if a value fits within the range of this bin + * @param dist Distance value to check if its in this range + * @return true if the value is within the range + */ + + /** Check if this bin contains a value + * @param elem Value to check if its in this range + * @return true if the value is within the range + */ + bool contains(const int32_t& elem) const; + + /** Define bin comparison operator by location of left bound, then right */ + bool operator < (const Bin& b) const; + + /** Decrement the histogram bin by one. + * Note that this is the prefix version only + */ + Bin& operator--(); + + /** Increment the histogram bin by one. + * Note that this is the prefix version only + */ + Bin& operator++(); + + private: + size_t m_count; + std::pair bounds; //@! was"bin"; +}; + +/** Class to store histogram of numeric values. + * + * The bins of the Histogram are not uniformly spaced, and their ranges determined + * by partitioning the spans it tablulates into uniform quantiles when initialized + * by Histogram::initialSpans(). As elements are added and removed this initial bin + * definition remains constant. + */ +class Histogram { + + private: + + std::vector m_ind; + + public: + + std::vector m_bins; + /** Construct an empty histogram + */ + Histogram() {} + + /** Construct a new histogram with bins spaced evenly + */ + Histogram(const int32_t& start, const int32_t& end, const uint32_t& width); + + friend std::ostream& operator<<(std::ostream &out, const Histogram &h) { + for (auto& i : h.m_bins) + out << i << std::endl; + return out; + } + + /** Return iterator to the fist bin + */ + std::vector::iterator begin() { return m_ind.begin(); } + + /** Return iterator to the last bin + */ + std::vector::iterator end() { return m_ind.end(); } + + /** initialize histogram from a vector of spans + */ + void initialSpans(size_t num_bins, std::vector* pspanv, size_t min_bin_width = 0); + + /** Add an element to the histogram + * @param elem Length of event to add + */ + void addElem(const int32_t &elem); + + /** Remove a span from the histogram + * @param span Length of event to remove + */ + void removeElem(const int32_t &elem); + + /** Output to CSV file like: bin_start,bin_end,count + */ + void toCSV(std::ofstream &fs); + + /** Return the total number of elements in the Histogram + */ + size_t totalCount() const { + size_t tot = 0; + for (auto& i : m_bins) + tot += i.getCount(); + return tot; + } + + /** Get count for a histogram bin + * @param i Bin index + * @return number of events in histogram bin + */ + int32_t binCount(size_t i) { return m_bins[i].getCount(); } + + /** Get number of bins in histogram + * @return Number of bins in histogram + */ + size_t numBins() { return m_bins.size(); } + + /** Find bin corresponding to a span + * @param elem Event length + * @return Bin containing event length + */ + size_t retrieveBinID(const int32_t& elem) const; + +}; + +} +#endif diff --git a/src/SnowTools/IntervalTree.h b/src/SnowTools/IntervalTree.h new file mode 100644 index 000000000..7b5c27c80 --- /dev/null +++ b/src/SnowTools/IntervalTree.h @@ -0,0 +1,214 @@ +#ifndef __INTERVAL_TREE_H +#define __INTERVAL_TREE_H + +#include +#include +#include + +template + class Interval { + public: + K start; + K stop; + T value; + Interval(K s, K e, const T& v) + : start(s) + , stop(e) + , value(v) + { } +}; + +template + K intervalStart(const Interval& i) { + return i.start; +} + +template + K intervalStop(const Interval& i) { + return i.stop; +} + +template + std::ostream& operator<<(std::ostream& out, Interval& i) { + out << "Interval(" << i.start << ", " << i.stop << "): " << i.value; + return out; +} + +template + class IntervalStartSorter { + public: + bool operator() (const Interval& a, const Interval& b) { + return a.start < b.start; + } +}; + +template + class IntervalTree { + + public: + typedef Interval interval; + typedef std::vector intervalVector; + typedef IntervalTree intervalTree; + + intervalVector intervals; + intervalTree* left; + intervalTree* right; + K center; + + IntervalTree(void) + : left(NULL) + , right(NULL) + , center(0) + { } + + IntervalTree(const intervalTree& other) + : left(NULL) + , right(NULL) + { + center = other.center; + intervals = other.intervals; + if (other.left) { + left = new intervalTree(*other.left); + } + if (other.right) { + right = new intervalTree(*other.right); + } + } + + IntervalTree& operator=(const intervalTree& other) { + center = other.center; + intervals = other.intervals; + if (other.left) { + left = new intervalTree(*other.left); + } else { + if (left) delete left; + left = NULL; + } + if (other.right) { + right = new intervalTree(*other.right); + } else { + if (right) delete right; + right = NULL; + } + return *this; + } + + IntervalTree( + intervalVector& ivals, + std::size_t depth = 16, + std::size_t minbucket = 64, + K leftextent = 0, + K rightextent = 0, + std::size_t maxbucket = 512 + ) + : left(NULL) + , right(NULL) + { + + --depth; + IntervalStartSorter intervalStartSorter; + if (depth == 0 || (ivals.size() < minbucket && ivals.size() < maxbucket)) { + std::sort(ivals.begin(), ivals.end(), intervalStartSorter); + intervals = ivals; + } else { + if (leftextent == 0 && rightextent == 0) { + // sort intervals by start + std::sort(ivals.begin(), ivals.end(), intervalStartSorter); + } + + K leftp = 0; + K rightp = 0; + K centerp = 0; + + if (leftextent || rightextent) { + leftp = leftextent; + rightp = rightextent; + } else { + leftp = ivals.front().start; + std::vector stops; + stops.resize(ivals.size()); + transform(ivals.begin(), ivals.end(), stops.begin(), intervalStop); + rightp = *max_element(stops.begin(), stops.end()); + } + + //centerp = ( leftp + rightp ) / 2; + centerp = ivals.at(ivals.size() / 2).start; + center = centerp; + + intervalVector lefts; + intervalVector rights; + + for (typename intervalVector::iterator i = ivals.begin(); i != ivals.end(); ++i) { + interval& interval = *i; + if (interval.stop < center) { + lefts.push_back(interval); + } else if (interval.start > center) { + rights.push_back(interval); + } else { + intervals.push_back(interval); + } + } + + if (!lefts.empty()) { + left = new intervalTree(lefts, depth, minbucket, leftp, centerp); + } + if (!rights.empty()) { + right = new intervalTree(rights, depth, minbucket, centerp, rightp); + } + } + } + + void findOverlapping(K start, K stop, intervalVector& overlapping) const { + if (!intervals.empty() && ! (stop < intervals.front().start)) { + for (typename intervalVector::const_iterator i = intervals.begin(); i != intervals.end(); ++i) { + const interval& interval = *i; + if (interval.stop >= start && interval.start <= stop) { + overlapping.push_back(interval); + } + } + } + + if (left && start <= center) { + left->findOverlapping(start, stop, overlapping); + } + + if (right && stop >= center) { + right->findOverlapping(start, stop, overlapping); + } + + } + + void findContained(K start, K stop, intervalVector& contained) const { + if (!intervals.empty() && ! (stop < intervals.front().start)) { + for (typename intervalVector::const_iterator i = intervals.begin(); i != intervals.end(); ++i) { + const interval& interval = *i; + if (interval.start >= start && interval.stop <= stop) { + contained.push_back(interval); + } + } + } + + if (left && start <= center) { + left->findContained(start, stop, contained); + } + + if (right && stop >= center) { + right->findContained(start, stop, contained); + } + + } + + ~IntervalTree(void) { + // traverse the left and right + // delete them all the way down + if (left) { + delete left; + } + if (right) { + delete right; + } + } + +}; + +#endif diff --git a/src/SnowTools/MiniRules.h b/src/SnowTools/MiniRules.h new file mode 100644 index 000000000..35b11ad27 --- /dev/null +++ b/src/SnowTools/MiniRules.h @@ -0,0 +1,464 @@ +#ifndef MINI_RULES_H +#define MINI_RULES_H + +#include +#include +#include +#include + +#include "SnowTools/GenomicRegion.h" +#include "SnowTools/GenomicRegionCollection.h" +#include "SnowTools/SnowUtils.h" +#include "SnowTools/HTSTools.h" + +#define HAVE_AHOCORASICK_AHOCORASICK_H 1 +#ifdef HAVE_AHOCORASICK_AHOCORASICK_H + +#include "ahocorasick/ahocorasick.h" +#include + +// custom deleter for aho-corasick +struct atm_free_delete { + void operator()(void* x) { free((AC_AUTOMATA_t*)x); } +}; +typedef std::unique_ptr atm_ptr; +#endif + +namespace SnowTools { + +/** Stores a rule for a single alignment flag. + * + * Rules for alignment flags can be one of three states: + * - NA - All flag values are valid + * - Off - Flag is valid if OFF + * - On - Flag is valid if ON + */ +class Flag { + + public: + + /** Construct a new Flag with NA rule + */ + Flag() : on(false), off(false), na(true) {} + + /** Set the flag to NA + */ + void setNA() { on = false; off = false; na = true; } + + /** Set the flag to ON + */ + void setOn() { on = true; off = false; na = false; } + + /** Set the flag to OFF + */ + void setOff() { on = false; off = true; na = false; } + + /** Query if the flag is NA + */ + bool isNA() const { return na; } + + /** Query if the flag is ON + */ + bool isOn() const { return on; } + + /** Query if the flag is OFF + */ + bool isOff() const { return off; } + + /** Set the values of this flag from an input line. + * + * This takes as input a rule string (e.g. !hardclip) + * and parses it to set the rule. The type of rule it + * will look for is specified by the regex (e.g. !?hardclip) + * @return Returns true if the regex was successfully parsed + */ + bool parseRuleLine(std::string &val, std::regex ®); + + private: + + bool on; + bool off; + bool na; + +}; + +/** Hold a range of valid numeric values (e.g. isize). + * + * Can optionally invert the range to make rule the complement of the range + * (eg isize NOT in [300,600] + */ +struct Range { + + /** Construct a Range from 0 to 0, non-inverted + */ + Range() : min(0), max(0), inverted(false), pattern("") {} + + /** Construct a Range from mn to mx, inclusive + */ + Range(int mn, int mx, int in, std::string p) : min(mn), max(mx), inverted(in), pattern(p) {} + + int min; + int max; + bool inverted; + std::string pattern; + bool every = true; + bool none = false; + + bool isValid(int val) { + if (every) + return true; + if (none) + return true; + if (!inverted) + return (val >= min && val <= max); + else + return (val < min || val > max); + } + + void parseRuleLine(std::string line); + + friend std::ostream& operator<<(std::ostream &out, const Range &r); + + // set that this ranges accepts everything + void setEvery() { + every = true; + none = false; + } + + // set that this range accepts nothing + void setNone() { + every = false; + none = true; + } + + // return if this range accepts all values + bool isEvery() const { return every; } + + // return if this range accepts no values + bool isNone() const { return none; } + + +}; + +// a container to hold boolean rules based mostly on alignment flag +struct FlagRule { + + FlagRule() { + dup = Flag(); + supp = Flag(); + qcfail = Flag(); + hardclip = Flag(); + fwd_strand = Flag(); + rev_strand = Flag(); + mate_fwd_strand = Flag(); + mate_rev_strand = Flag(); + mapped = Flag(); + mate_mapped = Flag(); + ff = Flag(); + fr = Flag(); + rf = Flag(); + rr = Flag(); + ic = Flag(); + paired = Flag(); + } + + + /** + * if inv is true, then if flag rule is ON and read is ON, return FALSE + */ + /*bool inline flagCheck(Flag &f, bam1_t *b, int bamflag, bool inv) { + + if (!f.isNA()) { + bool val = (b->core.flag & bamflag); + if ( (f.isOff() && val) || (f.isOn() && !val)) + return inv ? false : true; + } + return true; + }*/ + + Flag dup, supp, qcfail, hardclip, fwd_strand, rev_strand, + mate_fwd_strand, mate_rev_strand, mapped, mate_mapped, ff, fr, rf, rr, ic, paired; + + bool na = true; + void parseRuleLine(std::string line); + + // ask whether a read passes the rule + bool isValid(Read &r); + + friend std::ostream& operator<<(std::ostream &out, const FlagRule &fr); + + // set every flag to NA (most permissive) + void setEvery() { + dup.setOn(); + supp.setOn(); + qcfail.setOn(); + hardclip.setOn(); + fwd_strand.setOn(); + rev_strand.setOn(); + mate_fwd_strand.setOn(); + mate_rev_strand.setOn(); + mapped.setOn(); + mate_mapped.setOn(); + ff.setOn(); + fr.setOn(); + rf.setOn(); + rr.setOn(); + ic.setOn(); + paired.setOn(); + na = true; + } + + // set every flag to OFF everythign off) + void setNone() { + dup.setOff(); + supp.setOff(); + qcfail.setOff(); + hardclip.setOff(); + fwd_strand.setOff(); + rev_strand.setOff(); + mate_fwd_strand.setOff(); + mate_rev_strand.setOff(); + mapped.setOff(); + mate_mapped.setOff(); + ff.setOff(); + fr.setOff(); + rf.setOff(); + rr.setOff(); + ic.setOff(); + paired.setOff(); + } + + + // ask if every flag is set to NA (most permissive) + bool isEvery() const { return na; } + +}; + +// +class AbstractRule { + + public: + + AbstractRule() {} + ~AbstractRule() {} + + std::string name = ""; + Range isize = {-1, -1, true, "isize"}; // include all + Range mapq = {-1, -1, true, "mapq"}; + Range len = {-1, -1, true, "length"}; + Range clip = {-1, -1, true, "clip"}; + Range phred = {-1, -1, true, "phred"}; + Range nm = {-1, -1, true, "nm"}; + Range nbases = {-1,-1,true, "nbases"}; + Range ins = {-1,-1,true, "ins"}; + Range del = {-1,-1,true, "del"}; + Range xp = {-1,-1,true, "xp"}; + std::unordered_map orientation; + + std::string atm_file = ""; + bool atm_inv = false; + size_t atm_count = 0; + + std::string id; + + // how many reads pass this rule? + size_t m_count = 0; + +#ifdef HAVE_AHOCORASICK_AHOCORASICK_H + //atm_ptr atm; + AC_AUTOMATA_t * atm = 0; +#endif + + uint32_t subsam_seed = 999; + double subsam_frac = 1; + + bool none = false; + // set to true if you want a read to belong to the region if its mate does + //bool mate = false; + + FlagRule fr; + + bool isValid(Read &r); + + void parseRuleLine(std::string line); + + void parseSubLine(std::string line); + +#ifdef HAVE_AHOCORASICK_AHOCORASICK_H + bool ahomatch(Read &r); + + bool ahomatch(const char * seq, unsigned len); +#endif + + void parseSeqLine(std::string line); + + friend std::ostream& operator<<(std::ostream &out, const AbstractRule &fr); + + void setEvery() { + isize.setEvery(); + mapq.setEvery(); + len.setEvery(); + clip.setEvery(); + phred.setEvery(); + nm.setEvery(); + nbases.setEvery(); + fr.setEvery(); + ins.setEvery(); + del.setEvery(); + xp.setEvery(); + atm_file = ""; + subsam_frac = 1; + } + + void setNone() { + isize.setNone(); + mapq.setNone(); + len.setNone(); + clip.setNone(); + phred.setNone(); + nm.setNone(); + nbases.setNone(); + fr.setNone(); + del.setNone(); + xp.setNone(); + ins.setNone(); + none = true; + } + + // return if this rule accepts all reads + bool isEvery() const { + return ins.isEvery() && del.isEvery() && isize.isEvery() && mapq.isEvery() && len.isEvery() && clip.isEvery() && phred.isEvery() && nm.isEvery() && nbases.isEvery() && fr.isEvery() && (atm_file.length() == 0) && (subsam_frac >= 1) && xp.isEvery(); + } + + // return if this rule accepts no reads + bool isNone() const { + return none; + //return isize.isNone() && mapq.isNone() && len.isNone() && clip.isNone() && phred.isNone() && nm.isNone() && fr.isNone(); + } + + +}; + +class MiniRulesCollection; + +/** Define a set of rules for creating a variant bam. The syntax is: + all@!isize:[0,800],mapq:[0,60] + region@REGION_FILE + rule1@isize:[0,800],mapq:[0,60] + rule2@!isize[0,800]:mapq[0,60],:ardclip:supplementary:duplicate:qcfail + rule3@ + + A file of NA indicates that the rule should be applied genome-wide. + The ordering of the lines sets the hierarchical rule. For instance, a rule on line 2 will be applied + before a rule on line 3 for all regions that are the union of regions in level 3 and below. + + e.g. Level 3 region file has region chr1 100 1000 + Level 2 region file has region chr1 150 1200 + The union of these will produce a new region chr1 100 1200, with level 2 +*/ +class MiniRules { + + friend class MiniRulesCollection; + + public: + MiniRules() {} + ~MiniRules() {} + + std::string id; + + bool isValid(Read &r); + + void setRegionFromFile(const std::string& file); + + bool isReadOverlappingRegion(Read &r); + + friend std::ostream& operator<<(std::ostream& out, const MiniRules &mr); + + size_t size() const { + return m_abstract_rules.size(); + } + + void parseDiscordantShortcut(const std::string& line, const AbstractRule& ar); + + bool m_whole_genome = false; + + int m_width = 0; + + std::string m_region_file; + //private: + + GRC m_grv; + + int m_level = -1; + + int pad = 0; // how much should we pad the region? + + std::vector m_abstract_rules; + + // rule applies to mate too + bool m_applies_to_mate = false; + + // pointer to its containing MiniRulesCollection + MiniRulesCollection * mrc; + + // how many reads pass this MiniRule + size_t m_count = 0; + +}; + +// a hierarchy of mini rules to operate on +class MiniRulesCollection { + + public: + + /** Construct an empty MiniRulesCollection + * that will pass all reads + */ + MiniRulesCollection() {} + + MiniRulesCollection(std::string file, bam_hdr_t *b); + + bool isValid(Read &r); + + friend std::ostream& operator<<(std::ostream& out, const MiniRulesCollection &mr); + + void sendToBed(std::string file); + + // check if we should do the whole genome + bool hasWholeGenome() const { + for (auto it : m_regions) + if (it.m_whole_genome) + return true; + return false; + } + + size_t m_count = 0; // passed + size_t m_count_seen = 0; // tested + + GRC getAllRegions() const; + + size_t size() const { return m_regions.size(); } + + size_t numRules() const { + size_t num = 0; + for (auto& it : m_regions) + num += it.size(); + return num; + } + + std::vector m_regions; + + bam_hdr_t * h;// in case we need to convert from text chr to id chr + + void countsToFile(const std::string& file) const; + + // should we keep checking rules, even it passed? (useful for counting) + bool m_fall_through = false; + + private: + +}; + +} + +#endif diff --git a/src/SnowTools/Read.h b/src/SnowTools/Read.h new file mode 100644 index 000000000..915a0b3ca --- /dev/null +++ b/src/SnowTools/Read.h @@ -0,0 +1,14 @@ +#ifndef SNOWTOOLS_READ_H__ +#define SNOWTOOLS_READ_H__ + +#include "SnowTools/HTSTools.h" + +class BamRead { + + + private: + + std::shared_ptr b; + + +}; diff --git a/src/SnowTools/SnowTools.h b/src/SnowTools/SnowTools.h new file mode 100644 index 000000000..e7aed03a1 --- /dev/null +++ b/src/SnowTools/SnowTools.h @@ -0,0 +1,5 @@ +#ifndef SNOWTOOLS_SNOWTOOLS_H__ +#define SNOWTOOLS_SNOWTOOLS_H__ + + +#endif diff --git a/src/SnowTools/SnowToolsCommon.h b/src/SnowTools/SnowToolsCommon.h new file mode 100644 index 000000000..fddaac17d --- /dev/null +++ b/src/SnowTools/SnowToolsCommon.h @@ -0,0 +1,106 @@ +#ifndef SNOWTOOLS_COMMON_H__ +#define SNOWTOOLS_COMMON_H__ + +/*! \mainpage Snowtools 1.0 + * + * \section intro_sec Introduction + * + * SnowTools is a C++ package for querying BAM and SAM files, performing + * BWA-MEM operations in memory, and performing advanced filtering of + * reads using a hierarchy of rules. At its core, SnowTools is a C++ interface + * to the C htslib project, which provides the code for core BAM/SAM operations. + * (https://github.com/samtools/htslib) + * + * SnowTools is built to be extendable. See (...) for examples of how to take advantage of C++ + * class extensions to build off of the SnowTools base functionality. + * + * SnowTools is available as a stand-alone executable, which wrap a number of tools. + * However, certain functionality provided in an optimal form by SamTools or + * BamTools are intentially left out of SnowTools. In cases where improvements or alternative + * approaches are useful, the functionally is partially duplicated (e.g. count). + * + * As noted above, there are many overlaps between this project at the BamTools project from Derek Barnett + * (https://github.com/pezmaster31/bamtools). As such, SnowTools is provided as a + * a supplemental library that may be more suited to your individual needs that BamTools, or vice-versa. To + * aid developers in deciding which package is right for them, I have put together a small list of + * similarities and differences between these two packages + * + * + * \subsection similaries SnowTools/BamTools similarity + * - Provide read/write access to BAM files + * - Classes containing individual reads and ways to interact with them (e.g. edit tags) + * - Available as an API and a command-line versoin + * \subsection differenes SnowTools/BamTools differences + * - Sort/index functionality is independently implemented in BamTools. In SnowTools, the Samtools + * sort and index functions are called directly. + * - BamTools stores quality scores and sequences as strings. In SnowTools, the HTSlib native bam1_t format + * is used instead. This format has a lower memory footprint by using only 4 bits per base, rather than 8. + * Conversion to C++ style std::string is provided as a function and can be done on the fly. + * - BamTools provides the BamMultiReader class for reading multiple BAM files at once, while + * SnowTools does not currently support this functionality. + * - BamTools provides the following CLI tools that are not supported in SnowTools + * - convert - Convert between BAM and other formats + * - header - Print the BAM header + * - index - Index a BAM file + * - merge - Merge muliple BAM files + * - random - Select random alignments from BAM file + * - resolve - Resolve paired-end reads IsProperPair flag + * - revert - Remove duplicate marks and restore original base qualities + * - sort - Sorts a BAM file + * - split - Split a BAM file on user-specified property + * \section install_sec Installation + * \code + * git clone https://github.com/broadinstitute/isva.git + * cd isva/SnowTools/src + * ./configure + * make + * \endcode + * + * I have successfully compiled with GCC-4.9. + * + * \section support Support + * This code is being developed and maintained by Jeremiah Wala (jwala@broadinstitute.org) + * + * \section attribution Attributions + * - Jeremiah Wala - Harvard MD-PhD candidate, Bioinformatics and Integrative Genomics + * - Steve Schumacher - Research Scientist, Dana Farber Cancer Institute + * - Cheng-Zhong Zhang - Research Scientist, Broad Institute + * - Marcin Imielinski - Asst Prof, Cornell University + */ + +#include +#include +#include + +namespace SnowTools { + + static const std::vector CHR_NAME {"1", "2", "3", "4", "5", "6", "7", "8", "9", + "10", "11", "12", "13", "14", "15", "16", "17", + "18", "19", "20", "21", "22", "X", "Y", "M"}; + static const std::vector CHR_NAME_NUM {"1", "2", "3", "4", "5", "6", "7", "8", "9", + "10", "11", "12", "13", "14", "15", "16", "17", + "18", "19", "20", "21", "22", "23", "24"}; + + static const int CHR_LEN [25] = {249250621, 243199373, 198022430, 191154276, //1-4 + 180915260, 171115067, //5-6 + 159138663, 146364022, 141213431, 135534747, 135006516, 133851895, //7-12 + 115169878, 107349540, 102531392, 90354753, 81195210, 78077248, //13-18 + 59128983, 63025520, 48129895, 51304566, 155270560, 59373566, //19-24 + 16571}; //25 + + static const uint32_t CHR_CLEN [25] = {0, 249250621, 492449994, 690472424, 881626700, 1062541960, 1233657027, + 1392795690,1539159712,1680373143,1815907890,1950914406,2084766301, + 2199936179, 2307285719, 2409817111, 2500171864, 2581367074, 2659444322, + 2718573305, 2781598825, 2829728720, 2881033286, 3036303846, 3095677412}; + + static const uint32_t genome_size_XY = 3095677411; + + static std::string const REFHG19 = "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta"; + + static const int NONCENT_CHR [44] = {1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10, + 11,11,12,12,13,14,15,16,16,17,17,18,18,19,19,20,20,21,21,22,23,23,24,24}; + + +} + +#endif diff --git a/src/SnowTools/SnowUtils.h b/src/SnowTools/SnowUtils.h new file mode 100644 index 000000000..833ed4975 --- /dev/null +++ b/src/SnowTools/SnowUtils.h @@ -0,0 +1,136 @@ +#ifndef SNOWUTILS_H +#define SNOWUTILS_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "SnowTools/gzstream.h" +#include "SnowTools/GenomicRegion.h" + +namespace SnowTools { + + /** Check if a file is readable and exists + * @param name Name of a file to test + * @return File is readable and exists + */ + inline bool read_access_test (const std::string& name) { + return (access (name.c_str(), R_OK) == 0); + } + + /** Format a number to include commas + * @param data Number to format + * @param Formatted number + */ + template + std::string AddCommas(T data) { + std::stringstream ss; + ss << data; + std::string s = ss.str(); + if (s.length() > 3) + for (int i = s.length()-3; i > 0; i -= 3) + s.insert(i,","); + return s; + } + + inline std::string displayRuntime(const timespec start) { + +#ifndef __APPLE__ + struct timespec finish; + clock_gettime(CLOCK_MONOTONIC, &finish); + double elapsed = (finish.tv_sec - start.tv_sec); + int t = clock()/CLOCKS_PER_SEC; + int min = (int)std::floor(elapsed / 60.0); + int sec = (int)(elapsed-min*60); + char buffer[100]; + sprintf (buffer, "CPU: %4dm%02ds Wall: %4dm%02ds", + (int)floor( ((double)t) /60.0), t % 60, min, sec); + buffer[99] = '\0'; + return std::string(buffer); +#else + return "--- time not configured for apple\n"; + std::cerr << start << std::endl; +#endif + } + + /** Deprecated + */ + inline void rcomplement(std::string &a) { + + std::reverse(&a[0], &a[a.size()]); + std::string::iterator it = a.begin(); + for (; it != a.end(); it++) + if (*it == 'A') + *it = 'T'; + else if (*it == 'T') + *it = 'A'; + else if (*it == 'C') + *it = 'G'; + else + *it = 'C'; + } + + // calculate the percentage + template inline int percentCalc(T numer, T denom) { + if (denom <= 0) + return 0; + int perc = static_cast(floor((float)numer / (float)denom * 100.0)); + return perc; + } + + // remove the last character from a string + inline std::string cutLastChar(std::string in) { + if (in.length() == 0) + return in; + else + return in.substr(0, in.length() - 1); + } + + // remove substrings from a string + inline std::string scrubString(std::string toscrub, std::string toremove) { + std::string::size_type i = toscrub.find(toremove); + while (i != std::string::npos) { + toscrub.erase(i, toremove.length()); + i = toscrub.find(toremove); + } + return toscrub; + } + + void genRandomVals(uint32_t &i1, uint32_t &i2, const uint32_t &max); + + void genRandomValue(uint32_t &i, const uint32_t &max); + + /*! @function Loops through a text file to count the number of lines. + * @param file The file to count + * @param exclude String which, if present in line, causes line to not be counted. + * @param include String which must be present in the line to be counted. + * @return Number of valid lines in file + */ + inline size_t countLines(const std::string &file, const std::string &exclude = "", const std::string &include = "") { + + //open the file + igzstream inFile(file.c_str()); + if (!inFile) + return 0; + + // loop through the file + size_t count = 0; + std::string dum; + while (std::getline(inFile, dum)) { + if (!include.length() || (dum.find(include) != std::string::npos)) // file must have include + if (!exclude.length() || (dum.find(exclude) == std::string::npos)) // and not have exclude + count++; + + } + + return count; + } + +} + +#endif diff --git a/src/SnowTools/gChain.h b/src/SnowTools/gChain.h new file mode 100644 index 000000000..fb6576e41 --- /dev/null +++ b/src/SnowTools/gChain.h @@ -0,0 +1,32 @@ +#ifndef SNOWTOOLS_GCHAIN_H__ +#define SNOWTOOLS_GCHAIN_H__ + +#include "SnowTools/GenomicRegionCollection.h" + +namespace SnowTools { + +template +class gChain { + + public: + + gChain() {} + + gChain(const GenomicRegionCollection& x, const GenomicRegionCollection& y); + + GenomicRegionCollection lift(GenomicRegionCollection& g); + + bool checkValidity() const; + + private: + + GenomicRegionCollectionm_galx; + GenomicRegionCollection m_galy; + + std::vector m_pad_left; + std::vector m_pad_right; + +}; + +} +#endif diff --git a/src/SnowTools/gzstream.h b/src/SnowTools/gzstream.h new file mode 100644 index 000000000..3c74fe054 --- /dev/null +++ b/src/SnowTools/gzstream.h @@ -0,0 +1,128 @@ +// ============================================================================ +// gzstream, C++ iostream classes wrapping the zlib compression library. +// Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// +// See src/COPYING for the full terms of the license. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// ============================================================================ +// +// File : gzstream.h +// Revision : $Revision: 1.5 $ +// Revision_date : $Date: 2002/04/26 23:30:15 $ +// Author(s) : Deepak Bandyopadhyay, Lutz Kettner +// Modified : Jared Simpson 2010/04/16. Converted license notification to +// GPLv3 to be compatible with the rest of the code base as +// allowed by LGPLv2.1. No other changes were made. +// +// +// Standard streambuf implementation following Nicolai Josuttis, "The +// Standard C++ Library". +// ============================================================================ + +#ifndef GZSTREAM_H +#define GZSTREAM_H 1 + +// standard C++ with new header file names and std:: namespace +#include +#include +#include + +#ifdef GZSTREAM_NAMESPACE +namespace GZSTREAM_NAMESPACE { +#endif + +// ---------------------------------------------------------------------------- +// Internal classes to implement gzstream. See below for user classes. +// ---------------------------------------------------------------------------- + +class gzstreambuf : public std::streambuf { +private: + static const int bufferSize = 47+256; // size of data buff + // totals 512 bytes under g++ for igzstream at the end. + + gzFile file; // file handle for compressed file + char buffer[bufferSize]; // data buffer + char opened; // open/close state of stream + int mode; // I/O mode + + int flush_buffer(); +public: + gzstreambuf() : opened(0) { + setp( buffer, buffer + (bufferSize-1)); + setg( buffer + 4, // beginning of putback area + buffer + 4, // read position + buffer + 4); // end position + // ASSERT: both input & output capabilities will not be used together + } + int is_open() { return opened; } + gzstreambuf* open( const char* name, int open_mode); + gzstreambuf* close(); + ~gzstreambuf() { close(); } + + virtual int overflow( int c = EOF); + virtual int underflow(); + virtual int sync(); +}; + +class gzstreambase : virtual public std::ios { +protected: + gzstreambuf buf; +public: + gzstreambase() { init(&buf); } + gzstreambase( const char* name, int open_mode); + ~gzstreambase(); + void open( const char* name, int open_mode); + void close(); + gzstreambuf* rdbuf() { return &buf; } +}; + +// ---------------------------------------------------------------------------- +// User classes. Use igzstream and ogzstream analogously to ifstream and +// ofstream respectively. They read and write files based on the gz* +// function interface of the zlib. Files are compatible with gzip compression. +// ---------------------------------------------------------------------------- + +class igzstream : public gzstreambase, public std::istream { +public: + igzstream() : std::istream( &buf) {} + igzstream( const char* name, int open_mode = std::ios::in) + : gzstreambase( name, open_mode), std::istream( &buf) {} + gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); } + void open( const char* name, int open_mode = std::ios::in) { + gzstreambase::open( name, open_mode); + } +}; + +class ogzstream : public gzstreambase, public std::ostream { +public: + ogzstream() : std::ostream( &buf) {} + ogzstream( const char* name, int mode = std::ios::out) + : gzstreambase( name, mode), std::ostream( &buf) {} + gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); } + void open( const char* name, int open_mode = std::ios::out) { + gzstreambase::open( name, open_mode); + } +}; + +#ifdef GZSTREAM_NAMESPACE +} // namespace GZSTREAM_NAMESPACE +#endif + +#endif // GZSTREAM_H +// ============================================================================ +// EOF // +