3 end quality, inframecount

zhpn1024 · Nov 15, 2020 · 82fb35b · 82fb35b
1 parent e2375a1
commit 82fb35b
Show file tree

Hide file tree

Showing 4 changed files with 42 additions and 18 deletions.
diff --git a/src/run/predict.py b/src/run/predict.py
@@ -45,6 +45,7 @@ def set_parser(parser):
   parser.add_argument("--seq", action="store_true", help="Report ORF sequences")
   parser.add_argument("--aaseq", action="store_true", help="Report amino acid sequences")
   parser.add_argument("--blocks", action="store_true", help="Report all exon block positions for predicted ORFs")
+  parser.add_argument("--inframecount", action="store_true", help="Report the sum of all counts at the in-frame positions in the ORF")
   # Reads filters
   parser.add_argument("--maxNH", type=int, default=5, help="Max NH value allowed for bam alignments (default: 5)")
   parser.add_argument("--minMapQ", type=float, default=1, help="Min MapQ value required for bam alignments (default: 1)")
@@ -71,8 +72,9 @@ def run(args):
   global tisbampaths, tisoffdict, ribobampaths, riboffdict, genomefapath, compatible, compatiblemis
   global minaalen, enrichtest, slp, paras, verbose, alt, title, tis2ribo, gfilter
   global tpth, fpth, minpth, fspth, framebest, framelocalbest, longest, transprofile, TIS_types #fspth
-  global paired, seq, aaseq, blocks # showtime
+  global paired, seq, aaseq, blocks, inframecount # showtime
   paired, seq, aaseq, blocks = args.paired, args.seq, args.aaseq, args.blocks
+  inframecount = args.inframecount
   ribo.maxNH, ribo.minMapQ, ribo.secondary = args.maxNH, args.minMapQ, args.secondary
   tisbampaths = args.tisbampaths
   ribobampaths = args.ribobampaths
@@ -282,6 +284,7 @@ class MyPool(multiprocessing.pool.Pool):
   if seq : s += '\tSeq'
   if aaseq : s += '\tAASeq'
   if blocks: s += '\tBlocks'
+  if inframecount: s += '\tInFrameCount'
   s += '\n'
   outfile.write(s)
 
@@ -302,6 +305,7 @@ class MyPool(multiprocessing.pool.Pool):
     if seq : s += '\t' + e.sq
     if aaseq : s += '\t' + e.aa
     if blocks: s += '\t' + e.blocks
+    if inframecount: s += '\t{}'.format(e.inframecount)
     s += '\n'
     if allout is not None : allout.write(s)
     if e.q <= args.fsqth : outfile.write(s) # "%s\t%d\n" % (e, e.length)) #, e.sq))
@@ -422,6 +426,7 @@ def _pred_gene(ps): ### trans
         if tp is not None and fsp > fspth : continue
         has_stop = tsq[stop-3:stop] in orf.cstop
         e = getResult(t, tis, stop, cds1, cds2, tsq, [ip, ttis.cnts[tis], tp, rp, 'N', fsp], has_stop)
+        if inframecount: e.inframecount = sum(tribo.cnts[tis:stop:3])
         es.append(e)
 
     else : #all possible ORFs
@@ -461,6 +466,7 @@ def _pred_gene(ps): ### trans
           fsp, fss = stat.fisher_method([tps[i], rps[i]]) #
           if fsp > fspth : continue
           e = getResult(t, tis, o.stop, cds1, cds2, tsq, [ip, ttis.cnts[tis], tps[i], rps[i], rst[i], fsp], o.has_stop_codon)
+          if inframecount: e.inframecount = sum(tribo.cnts[tis:o.stop:3])
           #tistype = tisType(tis, o.stop, cds1, cds2)
           #orfstr = '{}\t{}\t{}'.format(tsq[tis:tis+3],tis,o.stop)
           #tid = "%s\t%s\t%s\t%s\t%s:%d-%d:%s\t%s\t%s" % (t.gid, t.id, t.symbol, t.genetype, t.chr, t.genome_pos(tis), t.genome_pos(o.stop), t.strand, orfstr, tistype)
@@ -481,7 +487,7 @@ def _pred_gene(ps): ### trans
 def getResult(t, tis, stop, cds1, cds2, tsq, values, has_stop = True):
   tistype = tisType(tis, stop, cds1, cds2)
   orfstr = '{}\t{}\t{}'.format(tsq[tis:tis+3], tis, stop)
-  gtis, gstop = t.genome_pos(tis), t.genome_pos(stop)
+  gtis, gstop = t.genome_pos(tis, bias=1), t.genome_pos(stop, bias=0)
   if t.strand != '-' : genomestr = '{}:{}-{}:{}'.format(t.chr, gtis, gstop, t.strand)
   else : genomestr = '{}:{}-{}:{}'.format(t.chr, gstop, gtis, t.strand)
   tid = "%s\t%s\t%s\t%s\t%s\t%s\t%s" % (t.gid, t.id, t.symbol, t.genetype, genomestr, orfstr, TIS_types[tistype])

diff --git a/src/run/quality.py b/src/run/quality.py
@@ -23,6 +23,7 @@ def set_parser(parser):
   parser.add_argument("--bins", type=int, default=20, help="Bins for cds profile (default: 20)")
   parser.add_argument("--nom0", action="store_true", help="Do not consider reads with mismatch at position 0 as a new group")
   parser.add_argument("--th", type=float, default=0.5, help="Threshold for quality (default: 0.5)")
+  parser.add_argument("--end3", action="store_true", help="Plot 3' end profile")
   # Reads filters
   parser.add_argument("--maxNH", type=int, default=1, help="Max NH value allowed for bam alignments (default: 1)")
   parser.add_argument("--minMapQ", type=float, default=1, help="Min MapQ value required for bam alignments (default: 1)")
@@ -217,12 +218,14 @@ def frange(start=0,stop=1,step=1):
     lst.append(i)
     i += step
   return lst
-def stdisplot(ax, x, y, lab = '', hali = 'left', vali = 'top', frame = 3, f0 = 0, color = ['r','g','b'], size='medium', tcol = 'k'):
+def stdisplot(ax, x, y, lab = '', hali = 'left', vali = 'top', frame = 3, f0 = 0, color = ['r','g','b'], size='medium', tcol = 'k', shift = 0):
   for i in range(frame):
     i0 = (f0 + i) % frame
-    ax.bar(x[i0::frame], y[i0::frame], width = 0.6, color = color[i], edgecolor = color[i],alpha=0.6, align='edge')
+    xs = [n + shift for n in x[i0::frame]]
+    ax.bar(xs, y[i0::frame], width = 0.6, color = color[i], edgecolor = color[i],alpha=0.6, align='edge')
   if hali == 'left' : tx = min(x)
   else : tx = max(x)
+  tx += shift
   ty = nearest(max(y+[1]), up=False) ##
   ax.text(tx ,max(y+[1])*0.7 , lab,horizontalalignment=hali,verticalalignment=vali, size=size, color = tcol)
   ax.spines['right'].set_visible(False)
@@ -267,6 +270,9 @@ def plot4(gs, i, l, disf, dis1, dis2, disc, offdict, args, start = 0):
     if m0 : ax0.yaxis.set_label_coords(-0.9, 0.4)
     else : ax0.yaxis.set_label_coords(-0.4, 0.4)
 
+  shift = 0
+  if args.end3: shift = l
+
   if args.tis : 
     use, tisframe, tistxt, mp = ribo.TISquality(dis1[l], dis = args.dis, threshold = args.th)
     if tisframe != frame : use = False
@@ -284,16 +290,16 @@ def plot4(gs, i, l, disf, dis1, dis2, disc, offdict, args, start = 0):
   if use : tcol = fbcols[frame]
   ax1 = plot.subplot(gs[i, start+1:start+3])
   x = list(range(*args.dis))
-  stdisplot(ax1, x, dis1[l], lab = txt, hali = 'left', f0 = -args.dis[0], color=fbcols, tcol = tcol)
+  stdisplot(ax1, x, dis1[l], lab = txt, hali = 'left', f0 = -args.dis[0], color=fbcols, tcol = tcol, shift = shift)
 
   if use and not args.tis : 
-    plot.hlines(th, -18, -6, color=tcol, linestyles ='dashed')
+    plot.hlines(th, -18+shift, -6+shift, color=tcol, linestyles ='dashed')
   if args.tis : 
     #mp = ribo.getTIS(dis1[l], [d1,d2])
-    ax1.text(mp+1 ,max(dis1[l]+[1])*0.7 , tistxt, horizontalalignment='left',verticalalignment='top', size='medium', color = tcol)
+    ax1.text(mp+1+shift, max(dis1[l]+[1])*0.7 , tistxt, horizontalalignment='left',verticalalignment='top', size='medium', color = tcol)
   ax2 = plot.subplot(gs[i, start+3:start+5])
 
-  stdisplot(ax2, x, dis2[l], lab = '', hali = 'right', f0 = -args.dis[0], color=fbcols, tcol = tcol)
+  stdisplot(ax2, x, dis2[l], lab = '', hali = 'right', f0 = -args.dis[0], color=fbcols, tcol = tcol, shift = shift)
   ax3 = plot.subplot(gs[i, start+5])
   cdsplot(ax3, disc[l])
   if args.tis : 

diff --git a/src/zbio/gtf.py b/src/zbio/gtf.py
@@ -4,8 +4,10 @@
 '''
 
 def add_chr(chr):
-  if chr.isdigit() or chr in ('X','Y','M',): chr = 'chr' + chr
-  elif chr == 'MT' : chr = 'chrM' 
+  if len(chr) <= 3 and (chr.isdigit() or chr in ('X','Y',)): chr = 'chr' + chr
+  elif chr in ('M','MT','MtDNA','mitochondrion_genome','Mito') : chr = 'chrM' 
+  elif len(chr) == 2 and chr[0].isdigit(): chr = 'chr' + chr
+  elif chr in ('I','II','III','IV','V','VI','VII','VIII','IX','XI','XII','XIII','XIV','XV','XVI'): chr = 'chr' + chr
   return chr
 def rm_chr(chr):
   if chr[3:].isdigit() or chr in ('chrX','chrY'): chr = chr[3:]
@@ -102,10 +104,15 @@ def tid(self):
     try : return self.tid_c
     except : 
       self.tid_c = self.attr('transcript_id')
-      if self.gff and self.tid_c == '':
-        p = self.attr('Parent')
-        if p.startswith('rna-'): self.tid_c = p[4:]
-        if p.startswith('gene-'): self.tid_c = p[5:]
+      if self.gff: # and self.tid_c == '':
+        if self.lst[2] in ('transcript', 'mRNA', 'tRNA', 'rRNA'):
+          self.tid_c = self.id
+        else:
+          p = self.attr('Parent')
+          if p.startswith('rna-'): self.tid_c = p[4:]
+          elif p.startswith('gene-'): self.tid_c = p[5:]
+          elif p.startswith('rna'): self.tid_c = p
+          elif p.startswith('gene'): self.tid_c = p
       return self.tid_c
   @property
   def symbol(self):
@@ -119,6 +126,7 @@ def id(self):
     try: return self._id
     except:
       self._id = self.attr('exon_id')
+      if self._id == '': self._id = self.attr('ID')
       return self._id
   @id.setter
   def id(self, value):
@@ -250,7 +258,8 @@ class gtfTrans(Exon):
   '''
   def __init__(self, lst, gff = False, addchr = False): 
     Exon.__init__(self, lst, gff, addchr)
-    self.id = self.tid
+    if self.tid != '': self.id = self.tid
+    elif self.id != '': self.tid_c = self.id
     #self.type = lst[1]
     self.exons = []
     self.cds = []
@@ -673,7 +682,7 @@ def gtfgene_iter(fin, filt = [], gff = False, addchr = False, chrs = None, verbo
       genes[g.id] = g
       genes2[g.id] = g
       #if g.id not in gidlist: gidlist.append(g.id)
-    elif lst[2] == 'transcript':
+    elif lst[2] in ('transcript', 'mRNA', 'tRNA', 'rRNA'):
       t = gtfTrans(lst, gff, addchr)
       if t.gid not in genes:
         g = gtfGene(lst, gff, addchr)

diff --git a/src/zbio/io.py b/src/zbio/io.py
@@ -9,8 +9,11 @@ def splitIter(filePath, sep = '\t', gz = False, skip = 0, title = None, encoding
     if filePath.split('.')[-1].lower() == 'gz' : gz = True
     if gz :
       import gzip
-      infile = gzip.open(filePath, 'rt', encoding=encoding)
-    else : infile = open(filePath, 'r', encoding=encoding)
+      try: infile = gzip.open(filePath, 'rt', encoding=encoding)
+      except: infile = gzip.open(filePath)
+    else :
+      try: infile = open(filePath, 'r', encoding=encoding)
+      except: infile = open(filePath)
   else : infile = filePath
   for i in range(skip):
     l = next(infile)