-
Notifications
You must be signed in to change notification settings - Fork 0
/
build_index.py
executable file
·149 lines (105 loc) · 4.7 KB
/
build_index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/usr/bin/env python
import pandas as pd
import numpy as np
from tqdm import tqdm
from utils import *
from argparse import ArgumentParser
from os import path
parser = ArgumentParser(description="Significance testing for methylation.")
parser.add_argument('-c', '--covered', help="DataFrame dump containing total call counts", required=True)
parser.add_argument('-g', '--genome', help="Genome in fasta format.", required=True)
parser.add_argument('-gff', '--gff', help="Genome annotation in gff format.", required=True)
#parser.add_argument('-i', '--index', help="DataFrame with the strand and context data", required=True)
parser.add_argument('--chr_prefix', help="Seqid prefix for chromosomal sequences", default = 'NC_0376')
args= parser.parse_args()
covered = pd.read_pickle(args.covered)
genome = load_genome(args.genome, upper=False)
index = covered.index.to_frame()
strand = np.empty(len(index), dtype=bool)
context = np.empty(len(index), dtype='U3')
masked = np.empty(len(index), dtype=bool)
for i, (seqid, pos) in tqdm(enumerate(index.itertuples(index=False)), total= len(index)) :
masked[i] = genome[seqid][pos].islower()
if genome[seqid][pos] in ('C','c') :
strand[i]=True
context[i] = str(genome[seqid][pos:pos+3].upper())
else :
strand[i]=False
context[i] = str(genome[seqid][pos-2:pos+1].reverse_complement().upper())
index['strand'] = strand
index['masked'] = masked
index['context'] = context
index['nuclear'] = index.seqid.apply(lambda seqid: seqid.startswith(args.chr_prefix)).astype(bool)
index['di_context'] = index.context.apply(lambda x: x[:2]).astype('category')
db = load_annotation(args.gff)
gene_accession = dict()
for gene in db.all_features(featuretype='gene') :
for label in gene.attributes['Dbxref'] :
if label.startswith('GeneID') :
gene_accession[gene.id] = label.split(':')[1]
# Choosing one mrna per gene
print(f"Choosing a representative mrna per gene..")
# Note that protein coding genes should be all genes
# if we are using canon-gff parsed version of the annotation
protein_coding_genes = list(
filter(
lambda gene: gene.attributes["gene_biotype"][0] == "protein_coding",
db.all_features(featuretype="gene"),
)
)
representative_mrnas = list()
for gene in protein_coding_genes:
mrnas = list(db.children(gene, featuretype="mRNA"))
if len(mrnas) == 1:
representative_mrnas.append(mrnas[0])
else:
# for multiple transcripts, pick longest translated product
len_longest_mrna = 0
for mrna in mrnas:
mrna_len = db.children_bp(mrna, child_featuretype="CDS")
if mrna_len > len_longest_mrna:
longest_mrna = mrna
len_longest_mrna = mrna_len
representative_mrnas.append(longest_mrna)
# Build dict of feature positions for fast search
feature_pos = dict()
for mrna in representative_mrnas :
if mrna.seqid not in feature_pos :
feature_pos[mrna.seqid] = dict()
GeneID = gene_accession[mrna.attributes['Parent'][0]]
gene_pos = (mrna.start, mrna.end)
exon_pos = [(exon.start, exon.end) for exon in db.children(mrna, featuretype='exon') ]
feature_pos[mrna.seqid][gene_pos] = (GeneID, exon_pos)
# Function to search in the feature pos index
def pos_infeature( postuple, feature_pos) :
seqid, pos = postuple
# offset from 0 based pos to 1 based annotation
pos +=1
if seqid not in feature_pos :
return (None, None)
for gene_pos in feature_pos[seqid] :
if gene_pos[0] <= pos <= gene_pos[1] :
GeneID, exon_list = feature_pos[seqid][gene_pos]
# in gene
for i, exon_pos in enumerate(exon_list, start=1):
if exon_pos[0] <= pos <= exon_pos[1] :
# in exon
return (GeneID, i)
# in gene, but not in exon
return (GeneID, None)
return (None, None)
# Note that we are neglecting the case where the position could be inexon in ANOTHER gene
# There should not be too many of these.
df = pd.DataFrame([pos_infeature(pos, feature_pos) for pos in tqdm(index.index, total=len(index)) ],
index = index.index ,
columns=['ingene', 'inexon'], dtype='category')
index = index.join(df)
index.drop(['seqid', 'pos'], axis=1, inplace=True)
index.reset_index(inplace=True)
index.seqid = index.seqid.astype('category')
if index.pos.max() < 2**32:
index.pos = index.pos.astype('uint32')
index.context = index.context.astype('category')
index.di_context = index.di_context.astype('category')
index.strand = index.strand.astype(bool)
index.to_pickle('index.pkl')