-
Notifications
You must be signed in to change notification settings - Fork 0
/
id_to_gene3.py
39 lines (30 loc) · 1.38 KB
/
id_to_gene3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import argparse
import os
import warnings

import numpy as np
import pandas as pd
warnings.filterwarnings('ignore')

# Column names assigned to the (header-less) gene annotation BED file.
GENE_COLUMNS = ['chr', 'start', 'end', 'gene_id', 'score', 'gene_name']

# (output column, source column, dtype) for each field of the written BED file.
BED_FIELDS = (
    ('chrom', 'chr', str),
    ('chromStart', 'start', int),
    ('chromEnd', 'end', int),
    ('geneID', 'gene_id', str),
    ('score', 'score', int),
    ('geneName', 'gene_name', str),
)


def read_ids(path):
    """Return the gene IDs listed one per line in the text file *path*.

    Surrounding whitespace is removed from every line and blank lines are
    skipped, so stray spaces or a trailing empty line cannot prevent a match.
    The file is closed as soon as it has been read.
    """
    with open(path, 'r') as handle:
        stripped = (line.strip() for line in handle)
        return [gene_id for gene_id in stripped if gene_id]


def select_genes(genes, gene_ids):
    """Return the rows of *genes* whose ``gene_id`` is in *gene_ids*.

    *genes* must provide the columns named in ``GENE_COLUMNS``. The result
    holds the same six fields, renamed and cast as described by
    ``BED_FIELDS``, in that order.
    """
    overlaps = genes[genes['gene_id'].isin(gene_ids)]
    return pd.DataFrame(
        {out: overlaps[src].astype(kind) for out, src, kind in BED_FIELDS}
    )


def bed_output_path(ids_path):
    """Return *ids_path* with its final extension replaced by ``.bed``.

    Only the last extension is replaced, so dots in directory names or
    earlier in the file name are preserved.
    """
    return os.path.splitext(ids_path)[0] + ".bed"


def main(argv=None):
    """Write a BED file of the annotated genes named in an ID text file.

    *argv* is the list of command-line arguments; ``None`` means
    ``sys.argv[1:]``. The output is written next to the ID file, with the
    ``.bed`` extension, as tab-separated text without header or index.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-t", "--txtfile", default='test.txt', help="path to id txt file to get data")
    parser.add_argument("-g", "--genes", default="gtf_genes.bed", help="path to the gene annotation bed file")
    args = parser.parse_args(argv)

    gene_ids = read_ids(args.txtfile)
    genes = pd.read_csv(args.genes, sep="\t", names=GENE_COLUMNS)
    bed = select_genes(genes, gene_ids)
    bed.to_csv(bed_output_path(args.txtfile), header=None, index=None, sep="\t", mode='w')
    print('Done!')


if __name__ == "__main__":
    main()