From 02b0ea8382a3fd2545286fcde10098f231a511c3 Mon Sep 17 00:00:00 2001 From: Mihaela Pertea Date: Tue, 21 Jan 2020 14:28:01 -0500 Subject: [PATCH 1/3] Quick bug fix to 2.1.0, which had GBitVec going out of bounds --- rlink.cpp | 55 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/rlink.cpp b/rlink.cpp index 5764075..91402e9 100644 --- a/rlink.cpp +++ b/rlink.cpp @@ -3015,33 +3015,36 @@ int create_graph(int refstart,int s,int g,CBundle *bundle,GPVec& bn } } - if(ichild.Count()) { // might need to be linked to sink - bool addsink=true; - for(int j=0;jparent.Count();j++) { - if(sink->parent[j]==i) { - addsink=false; - break; - } - } - if(addsink) { - if(!icov) icov=get_cov(1,no2gnode[s][g][i]->start-refstart,no2gnode[s][g][i]->end-refstart,bpcov)/no2gnode[s][g][i]->len(); - //fprintf(stderr,"consider inode=%d with cov=%f\n",i,icov); - float chcov=0; // all children coverages - for(int j=0;jchild.Count();j++) { - int c=no2gnode[s][g][i]->child[j]; - chcov+=get_cov(1,no2gnode[s][g][c]->start-refstart,no2gnode[s][g][c]->end-refstart,bpcov)/no2gnode[s][g][c]->len(); + if(ichild.Count()) { // might need to be linked to sink + bool addsink=true; + for(int j=0;jparent.Count();j++) { + if(sink->parent[j]==i) { + addsink=false; + break; + } } - if(chcovparent.Add(i); // prevnode is the parent of sink - float tmp=i; - futuretr.Add(tmp); - futuretr.cAdd(-1.0); - tmp=(icov-chcov)/DROP; - futuretr.Add(tmp); - //fprintf(stderr,"Add link to sink from node %d with abundance %f\n",i,tmp); - edgeno++; + if(addsink) { + if(!icov) icov=get_cov(1,no2gnode[s][g][i]->start-refstart,no2gnode[s][g][i]->end-refstart,bpcov)/no2gnode[s][g][i]->len(); + //fprintf(stderr,"consider inode=%d with cov=%f\n",i,icov); + float chcov=0; // all children coverages + for(int j=0;jchild.Count();j++) { + int c=no2gnode[s][g][i]->child[j]; + chcov+=get_cov(1,no2gnode[s][g][c]->start-refstart,no2gnode[s][g][c]->end-refstart,bpcov)/no2gnode[s][g][c]->len(); + } + if(chcovparent.Add(i); // prevnode is the parent of sink + float tmp=i; + futuretr.Add(tmp); + futuretr.cAdd(-1.0); + tmp=(icov-chcov)/DROP; + futuretr.Add(tmp); + //fprintf(stderr,"Add link to sink from node %d with abundance %f\n",i,tmp); + edgeno++; + } } } + else edgeno++; // node has no children so it might get linked to sink in traverse_dfs function } } @@ -13104,7 +13107,7 @@ void count_good_junctions(BundleData* bdata) { } } rd.juncs[i-1]->nreads-=rd.read_count; - if(rd.juncs[i-1]->nreadsstrand=0; + //if(rd.juncs[i-1]->nreadsstrand=0; // this approach removes a perfectly valid (guide) junction which might not be the desired effect rd.juncs[i-1]=nullj; } @@ -13129,7 +13132,7 @@ void count_good_junctions(BundleData* bdata) { } } rd.juncs[i-1]->nreads-=rd.read_count; - if(rd.juncs[i-1]->nreadsstrand=0; + //if(rd.juncs[i-1]->nreadsstrand=0; // this approach removes a perfectly valid (guide) junction which might not be the desired effect rd.juncs[i-1]=nullj; } else { From 82a2e0e8e6660c650ec986608d7e6bd8ab875ae6 Mon Sep 17 00:00:00 2001 From: Geo Pertea Date: Tue, 21 Jan 2020 21:56:32 -0500 Subject: [PATCH 2/3] fixed memory leak in rlink.cpp --- rlink.cpp | 5 ++++- run_tests.sh | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/rlink.cpp b/rlink.cpp index 91402e9..94d01da 100644 --- a/rlink.cpp +++ b/rlink.cpp @@ -9383,7 +9383,10 @@ void process_refguides(int gno,int edgeno,GIntHash& gpos,int& lastgpos,GPVe } } if(ng) { - for(int g=0;g Date: Thu, 23 Jan 2020 16:47:39 -0500 Subject: [PATCH 3/3] gclib update/sync --- gclib/GBase.h | 9 ++++++- gclib/gff.cpp | 72 +++++++++++++++++++++++++++++++++++++++------------ gclib/gff.h | 4 ++- 3 files changed, 67 insertions(+), 18 deletions(-) diff --git a/gclib/GBase.h b/gclib/GBase.h index 2a247fd..d0f0369 100644 --- a/gclib/GBase.h +++ b/gclib/GBase.h @@ -1,6 +1,6 @@ #ifndef G_BASE_DEFINED #define G_BASE_DEFINED -#define GCLIB_VERSION "0.11.3" +#define GCLIB_VERSION "0.11.4" #ifdef HAVE_CONFIG_H #include "config.h" @@ -356,6 +356,13 @@ class GSeg { return (s->start<=start && s->end>=end); } + bool equals(GSeg& d){ + return (start==d.start && end==d.end); + } + bool equals(GSeg* d){ + return (start==d->start && end==d->end); + } + //fuzzy coordinate matching: bool coordMatch(GSeg* s, uint fuzz=0) { //caller must check for s!=NULL if (fuzz==0) return (start==s->start && end==s->end); diff --git a/gclib/gff.cpp b/gclib/gff.cpp index 5aa705a..677ef51 100644 --- a/gclib/gff.cpp +++ b/gclib/gff.cpp @@ -2124,13 +2124,16 @@ GffObj* GffObj::finalize(GffReader* gfr) { if (reduceAttributes) { //for each attribute of the 1st exon, if it has the //same value for all other exons, move it to transcript level - if (reduceExonAttrs(exons) && gfr->showWarnings()) { - GMessage("Info: duplicate exon attributes reduced for %s\n", gffID); - } + //bool reduced=reduceExonAttrs(exons); + reduceExonAttrs(exons); + //if (gfr->showWarnings() && reduced) + // GMessage("Info: duplicate exon attributes reduced for %s\n", gffID); //do the same for CDS segments, if any if (cdss!=NULL && cdss->Count()>0 && (*cdss)[0]->attrs!=NULL) { - if (reduceExonAttrs(*cdss) && gfr->showWarnings()) - GMessage("Info: duplicate CDS attributes reduced for %s\n", gffID); + //reduced= + reduceExonAttrs(*cdss); + //if (gfr->showWarnings() && reduced) + // GMessage("Info: duplicate CDS attributes reduced for %s\n", gffID); } } //merge close exons if requested @@ -3118,23 +3121,60 @@ void GffObj::getCDSegs(GVec& cds) { } -//-- transcript overlap classification functions +//-- transcript match/overlap classification functions + + +char transcriptMatch(GffObj& a, GffObj& b, int& ovlen) { + //return '=' if exact exon match, '~' if intron-chain match (or 80% overlap for single-exon) + // or 0 otherwise + int imax=a.exons.Count()-1; + int jmax=b.exons.Count()-1; + ovlen=0; + if (imax!=jmax) return false; //different number of exons, cannot match + if (imax==0) //single-exon mRNAs + return (singleExonTMatch(a,b,ovlen)); + if ( a.exons[imax]->startend || + b.exons[jmax]->startend ) + return 0; //intron chains do not overlap at all + //check intron overlaps + ovlen=a.exons[0]->end-(GMAX(a.start,b.start))+1; + ovlen+=(GMIN(a.end,b.end))-a.exons.Last()->start; + for (int i=1;i<=imax;i++) { + if (ilen(); + if ((a.exons[i-1]->end!=b.exons[i-1]->end) || + (a.exons[i]->start!=b.exons[i]->start)) { + return 0; //intron mismatch + } + } + //--- full intron chain match: + if (a.exons[0]->start==b.exons[0]->start && + a.exons.Last()->end==b.exons.Last()->end) + return '='; + return '~'; +} + -bool singleExonTMatch(GffObj& m, GffObj& r, int& ovlen) { - //if (m.exons.Count()>1 || r.exons.Count()>1..) +char singleExonTMatch(GffObj& m, GffObj& r, int& ovlen) { + //return '=' if exact match, '~' if the overlap is >=80% of the longer sequence length + // return 0 if there is no overlap GSeg mseg(m.start, m.end); ovlen=mseg.overlapLen(r.start,r.end); + if (ovlen<=0) return 0; // fuzzy matching for single-exon transcripts: // matching = overlap is at least 80% of the length of the longer transcript // *OR* in case of reverse containment (reference contained in m) // it's also considered "matching" if the overlap is at least 80% of // the reference len AND at least 70% of the query len + if (m.start==r.start && m.end==r.end) return '='; if (m.covlen>r.covlen) { - return ( (ovlen >= m.covlen*0.8) || - (ovlen >= r.covlen*0.8 && ovlen >= m.covlen* 0.7 )); + if ( (ovlen >= m.covlen*0.8) || + (ovlen >= r.covlen*0.8 && ovlen >= m.covlen* 0.7 ) ) //allow also some fuzzy reverse containment - } else - return (ovlen >= r.covlen*0.8); + return '~'; + } else { + if (ovlen >= r.covlen*0.8) return '~'; + } + return 0; } //formerly in gffcompare @@ -3148,10 +3188,10 @@ char getOvlCode(GffObj& m, GffObj& r, int& ovlen, bool strictMatch) { GSeg mseg(m.start, m.end); if (jmax==0) { //also single-exon ref //ovlen=mseg.overlapLen(r.start,r.end); - if (singleExonTMatch(m, r, ovlen)) { - if (strictMatch) return (r.exons[0]->start==m.exons[0]->start && - r.exons[0]->end==m.exons[0]->end) ? '=' : '~'; - else return '='; + char eqcode=0; + if ((eqcode=singleExonTMatch(m, r, ovlen))>0) { + if (strictMatch) return eqcode; + else return '='; } if (m.covlen= m.covlen*0.8) return 'c'; } // fuzzy containment diff --git a/gclib/gff.h b/gclib/gff.h index 4c47a4d..d18e865 100644 --- a/gclib/gff.h +++ b/gclib/gff.h @@ -60,7 +60,9 @@ int classcode_rank(char c); //returns priority value for class codes char getOvlCode(GffObj& m, GffObj& r, int& ovlen, bool strictMatch=false); //returns: class code -bool singleExonTMatch(GffObj& m, GffObj& r, int& ovlen); //single-exon fuzzy transcript match +char transcriptMatch(GffObj& a, GffObj& b, int& ovlen); //generic transcript match test +// -- return '=', '~' or 0 +char singleExonTMatch(GffObj& m, GffObj& r, int& ovlen); //single-exon transcript match test //--- // -- tracking exon/CDS segments from local mRNA to genome coordinates