Skip to content

Commit

Permalink
gclib update/sync
Browse files Browse the repository at this point in the history
  • Loading branch information
gpertea committed Jan 23, 2020
1 parent 82a2e0e commit d8c07c1
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 18 deletions.
9 changes: 8 additions & 1 deletion gclib/GBase.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#ifndef G_BASE_DEFINED
#define G_BASE_DEFINED
#define GCLIB_VERSION "0.11.3"
#define GCLIB_VERSION "0.11.4"

#ifdef HAVE_CONFIG_H
#include "config.h"
Expand Down Expand Up @@ -356,6 +356,13 @@ class GSeg {
return (s->start<=start && s->end>=end);
}

bool equals(GSeg& d){
return (start==d.start && end==d.end);
}
bool equals(GSeg* d){
return (start==d->start && end==d->end);
}

//fuzzy coordinate matching:
bool coordMatch(GSeg* s, uint fuzz=0) { //caller must check for s!=NULL
if (fuzz==0) return (start==s->start && end==s->end);
Expand Down
72 changes: 56 additions & 16 deletions gclib/gff.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2124,13 +2124,16 @@ GffObj* GffObj::finalize(GffReader* gfr) {
if (reduceAttributes) {
//for each attribute of the 1st exon, if it has the
//same value for all other exons, move it to transcript level
if (reduceExonAttrs(exons) && gfr->showWarnings()) {
GMessage("Info: duplicate exon attributes reduced for %s\n", gffID);
}
//bool reduced=reduceExonAttrs(exons);
reduceExonAttrs(exons);
//if (gfr->showWarnings() && reduced)
// GMessage("Info: duplicate exon attributes reduced for %s\n", gffID);
//do the same for CDS segments, if any
if (cdss!=NULL && cdss->Count()>0 && (*cdss)[0]->attrs!=NULL) {
if (reduceExonAttrs(*cdss) && gfr->showWarnings())
GMessage("Info: duplicate CDS attributes reduced for %s\n", gffID);
//reduced=
reduceExonAttrs(*cdss);
//if (gfr->showWarnings() && reduced)
// GMessage("Info: duplicate CDS attributes reduced for %s\n", gffID);
}
}
//merge close exons if requested
Expand Down Expand Up @@ -3118,23 +3121,60 @@ void GffObj::getCDSegs(GVec<GffExon>& cds) {

}

//-- transcript overlap classification functions
//-- transcript match/overlap classification functions


char transcriptMatch(GffObj& a, GffObj& b, int& ovlen) {
//return '=' if exact exon match, '~' if intron-chain match (or 80% overlap for single-exon)
// or 0 otherwise
int imax=a.exons.Count()-1;
int jmax=b.exons.Count()-1;
ovlen=0;
if (imax!=jmax) return false; //different number of exons, cannot match
if (imax==0) //single-exon mRNAs
return (singleExonTMatch(a,b,ovlen));
if ( a.exons[imax]->start<b.exons[0]->end ||
b.exons[jmax]->start<a.exons[0]->end )
return 0; //intron chains do not overlap at all
//check intron overlaps
ovlen=a.exons[0]->end-(GMAX(a.start,b.start))+1;
ovlen+=(GMIN(a.end,b.end))-a.exons.Last()->start;
for (int i=1;i<=imax;i++) {
if (i<imax) ovlen+=a.exons[i]->len();
if ((a.exons[i-1]->end!=b.exons[i-1]->end) ||
(a.exons[i]->start!=b.exons[i]->start)) {
return 0; //intron mismatch
}
}
//--- full intron chain match:
if (a.exons[0]->start==b.exons[0]->start &&
a.exons.Last()->end==b.exons.Last()->end)
return '=';
return '~';
}


bool singleExonTMatch(GffObj& m, GffObj& r, int& ovlen) {
//if (m.exons.Count()>1 || r.exons.Count()>1..)
char singleExonTMatch(GffObj& m, GffObj& r, int& ovlen) {
//return '=' if exact match, '~' if the overlap is >=80% of the longer sequence length
// return 0 if there is no overlap
GSeg mseg(m.start, m.end);
ovlen=mseg.overlapLen(r.start,r.end);
if (ovlen<=0) return 0;
// fuzzy matching for single-exon transcripts:
// matching = overlap is at least 80% of the length of the longer transcript
// *OR* in case of reverse containment (reference contained in m)
// it's also considered "matching" if the overlap is at least 80% of
// the reference len AND at least 70% of the query len
if (m.start==r.start && m.end==r.end) return '=';
if (m.covlen>r.covlen) {
return ( (ovlen >= m.covlen*0.8) ||
(ovlen >= r.covlen*0.8 && ovlen >= m.covlen* 0.7 ));
if ( (ovlen >= m.covlen*0.8) ||
(ovlen >= r.covlen*0.8 && ovlen >= m.covlen* 0.7 ) )
//allow also some fuzzy reverse containment
} else
return (ovlen >= r.covlen*0.8);
return '~';
} else {
if (ovlen >= r.covlen*0.8) return '~';
}
return 0;
}

//formerly in gffcompare
Expand All @@ -3148,10 +3188,10 @@ char getOvlCode(GffObj& m, GffObj& r, int& ovlen, bool strictMatch) {
GSeg mseg(m.start, m.end);
if (jmax==0) { //also single-exon ref
//ovlen=mseg.overlapLen(r.start,r.end);
if (singleExonTMatch(m, r, ovlen)) {
if (strictMatch) return (r.exons[0]->start==m.exons[0]->start &&
r.exons[0]->end==m.exons[0]->end) ? '=' : '~';
else return '=';
char eqcode=0;
if ((eqcode=singleExonTMatch(m, r, ovlen))>0) {
if (strictMatch) return eqcode;
else return '=';
}
if (m.covlen<r.covlen)
{ if (ovlen >= m.covlen*0.8) return 'c'; } // fuzzy containment
Expand Down
4 changes: 3 additions & 1 deletion gclib/gff.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,9 @@ int classcode_rank(char c); //returns priority value for class codes

char getOvlCode(GffObj& m, GffObj& r, int& ovlen, bool strictMatch=false); //returns: class code

bool singleExonTMatch(GffObj& m, GffObj& r, int& ovlen); //single-exon fuzzy transcript match
char transcriptMatch(GffObj& a, GffObj& b, int& ovlen); //generic transcript match test
// -- return '=', '~' or 0
char singleExonTMatch(GffObj& m, GffObj& r, int& ovlen); //single-exon transcript match test

//---
// -- tracking exon/CDS segments from local mRNA to genome coordinates
Expand Down

0 comments on commit d8c07c1

Please sign in to comment.