diff --git a/lib/biocode/gff.py b/lib/biocode/gff.py
index d540366..c431d85 100644
--- a/lib/biocode/gff.py
+++ b/lib/biocode/gff.py
@@ -344,6 +344,12 @@ def get_gff3_features(gff3_file, assemblies=None):
             parent_feat.add_exon(exon)
             features[feat_id] = exon
 
+        elif cols[2] == 'intron':
+            intron = biocode.things.Intron(id=feat_id, parent=parent_feat)
+            intron.locate_on(target=current_assembly, fmin=rfmin, fmin_partial=fmin_partial, fmax=rfmax, fmax_partial=fmax_partial, strand=rstrand)
+            parent_feat.add_intron(intron)
+            features[feat_id] = intron
+
         elif cols[2] == 'CDS':
             if phase == '.':
                 phase = 0
@@ -729,6 +735,23 @@ def print_biogene( gene=None, fh=None, source=None, on=None ):
             columns[8] = build_column_9( id=exon.id, parent=RNA.id, other=exon_annot_atts )
             fh.write( "\t".join(columns) + "\n" )
 
+        ## handle introns for this RNA
+        for intron in sorted(RNA.introns( on )):
+            intron_loc = intron.location_on( on )
+
+            if intron_loc is None:
+                raise Exception("ERROR: Expected intron {0} to be located on {1} but it wasn't".format(intron.id, on.id))
+
+            intron_partiality_string = _partiality_string(intron_loc)
+            intron_annot_atts = dict()
+            if intron_partiality_string is not None:
+                intron_annot_atts['Partial'] = intron_partiality_string
+
+            columns[2] = 'intron'
+            columns[3:5] = [str(intron_loc.fmin + 1), str(intron_loc.fmax)]
+            columns[8] = build_column_9( id=intron.id, parent=RNA.id, other=intron_annot_atts )
+            fh.write( "\t".join(columns) + "\n" )
+
         # are there polypeptides?
         for polypeptide in sorted(RNA.polypeptides()):
             if len(polypeptide.locations) == 0:
diff --git a/lib/biocode/things.py b/lib/biocode/things.py
index 37270ce..b8912ab 100644
--- a/lib/biocode/things.py
+++ b/lib/biocode/things.py
@@ -1,6 +1,7 @@
 import itertools
 import sys
+import re
 import uuid
 
 #from biocode import utils, gff, tbl
 
@@ -842,9 +843,10 @@ class Intron( LocatableThing ):
     removed from within the transcript by splicing together the sequences (exons) on either side of it."
     '''
 
-    def __init__( self, id=None, locations=None, length=None ):
+    def __init__( self, id=None, locations=None, parent=None, length=None ):
         super().__init__(locations)
         self.id = id
+        self.parent = parent
         self.length = length
 
 
@@ -1029,6 +1031,7 @@ def __init__( self, id=None, locations=None, parent=None, locus_tag=None, childr
 
         ## initialize any types needed
         self.children = _initialize_type_list(self.children, 'exon')
+        self.children = _initialize_type_list(self.children, 'intron')
         self.children = _initialize_type_list(self.children, 'CDS')
         self.children = _initialize_type_list(self.children, 'polypeptide')
         self.children = _initialize_type_list(self.children, 'UTR')
@@ -1044,6 +1047,10 @@ def add_exon(self, exon):
         exon.parent = self
         self.children['exon'].append(exon)
 
+    def add_intron(self, intron):
+        intron.parent = self
+        self.children['intron'].append(intron)
+
     def add_five_prime_UTR(self, utr):
         utr.parent = self
         self.children['UTR'].append(utr)
@@ -1207,32 +1214,42 @@ def has_introns( self ):
             return False
 
     def introns(self, on=None):
-        '''
-        Dynamically generates Intron objects in order for the current RNA. The coordinates of the
-        generated introns depend on the object passed via the 'on' argument
-        '''
-        if on is None:
-            raise Exception("ERROR: the introns() method requires a passed molecule using the 'on' argument")
-
-        mol_on = on
-
-        intron_objs = list()
-        last_exon = None
-        last_exon_loc = None
-
-        for exon in sorted(self.exons()):
-            exon_loc = exon.location_on( mol_on )
-
-            if last_exon is not None:
-                intron_id = uuid.uuid4()
-                intron = Intron( id=intron_id )
-                intron.locate_on( target=mol_on, fmin=last_exon_loc.fmax, fmax=exon_loc.fmin, strand=exon_loc.strand )
-                intron_objs.append( intron )
-
-            last_exon = exon
-            last_exon_loc = exon_loc
-
-        return intron_objs
+        '''
+        Dynamically generates Intron objects, in order, for the current RNA.
+
+        Each intron spans the gap between two consecutive exons, so the coordinates of
+        the generated introns depend on the object passed via the 'on' argument.  Introns
+        stored with add_intron() are not consulted here.
+
+        Intron IDs are made by swapping the 'exonN' token in the ID of the exon after the
+        intron for 'intronM' (g1.t1.exon2 -> g1.t1.intron1).  When the exon ID has no such
+        token the ID falls back to '<RNA id>.intronM' so it cannot collide with the exon ID.
+        '''
+        if on is None:
+            raise Exception("ERROR: the introns() method requires a passed molecule using the 'on' argument")
+
+        intron_objs = list()
+        last_exon = None
+        last_exon_loc = None
+
+        for exon in sorted(self.exons()):
+            exon_loc = exon.location_on( on )
+
+            if last_exon is not None:
+                intron_num = len(intron_objs) + 1
+                ## match the whole exon number; 'exon[0-9]' left the trailing digits of exon10+ in the ID
+                intron_id, swapped = re.subn(r'exon[0-9]+', 'intron{0}'.format(intron_num), str(exon.id))
+                if swapped == 0:
+                    intron_id = '{0}.intron{1}'.format(self.id, intron_num)
+
+                intron = Intron( id=intron_id, parent=self )
+                intron.locate_on( target=on, fmin=last_exon_loc.fmax, fmax=exon_loc.fmin, strand=exon_loc.strand )
+                intron_objs.append( intron )
+
+            last_exon = exon
+            last_exon_loc = exon_loc
+
+        return intron_objs
 
     def polypeptides(self):
         return self.children['polypeptide']