Skip to content

Commit

Permalink
Drop transcripts with duplicate transcript_id tags
Browse files Browse the repository at this point in the history
  • Loading branch information
mdshw5 authored Nov 2, 2023
1 parent b353a93 commit 078cd51
Showing 1 changed file with 5 additions and 0 deletions.
5 changes: 5 additions & 0 deletions pisces/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,7 @@ def features_to_string(features, fasta_in, masked=True, strand=True):
with tqdm(
total=db.count_features_of_type('gene'),
unit='gene') as pbar:
transcripts_to_index = set()
for gene in db.features_of_type('gene'):
try:
if options["gtf_type_tag"] == True:
Expand Down Expand Up @@ -344,6 +345,9 @@ def features_to_string(features, fasta_in, masked=True, strand=True):
transcript['transcript_id']
except KeyError:
continue # transcript id is missing (ex TRNAV-CAC gene in RefSeq)
if transcript['transcript_id'][0] in transcripts_to_index:
logging.warn("Duplicate transcript %s will be dropped!", transcript['transcript_id'][0])
continue
# Write entry in the transcripts to genes table
gene2tx.write("{txp}\t{gene}\n".format(
gene=gene['gene_id'][0],
Expand All @@ -356,6 +360,7 @@ def features_to_string(features, fasta_in, masked=True, strand=True):
masked=options["masked"])
transcripts_fasta.write('>' + transcript['transcript_id'][0] + '\n')
transcripts_fasta.write(fa_seq + '\n')
transcripts_to_index.add(transcript['transcript_id'][0])

exons = db.children(gene, featuretype='exon', order_by='start')
merged_exons = db.merge(exons, merge_criteria=(mc.seqid, mc.feature_type, mc.overlap_any_inclusive))
Expand Down

0 comments on commit 078cd51

Please sign in to comment.