-
Notifications
You must be signed in to change notification settings - Fork 0
/
PlotDiversityTargetsFlankingSites.py
283 lines (248 loc) · 10.5 KB
/
PlotDiversityTargetsFlankingSites.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
# -*- coding: utf-8 -*-
"""
Created on Fri May 13 13:55:12 2016
@author: RJovelin
"""
# use this script to generate a graph comparing diversity at mirna target sites with adjacent windows
# load matplotlib, change backend when X server is not available
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
# load custome modules
from manipulate_sequences import *
from sliding_windows import *
from sites_with_coverage import *
from miRNA_target import *
from divergence import *
from genomic_coordinates import *
# load scipy and numpy
import numpy as np
from scipy import stats
import math
# convert genome fasta to dict
genome = convert_fasta('../Genome_Files/noamb_356_v1_4.txt')
print('converted genome fasta to dict')
# get the allele counts for all sites with coverage, exclude sites with sample size < 10
chromo_sites = get_non_coding_snps('../SNP_files/', 10)
print('got allele counts at all sites')
# get mirna target coordinates
# {gene: [[chromo, start, end, orientation, seed, N_mirnas, site_type, conservation, utr]]}
target_coord = get_miRNA_target_coord('Cremanei_miRNA_sites.txt')
print('got miRNA target coordinates')
# get the set of valid transcripts
valid_transcripts = get_valid_transcripts('../Genome_Files/unique_transcripts.txt')
print('made list of valid transcripts')
# remove genes that are not not valid (ie to keep a single transcript per gene)
to_remove = [gene for gene in target_coord if gene not in valid_transcripts]
for gene in to_remove:
del target_coord[gene]
print('deleted {0} non-valid genes'.format(len(to_remove)))
print('filtered non-valid transcripts')
# compute diversity for all mirna targets
targets_theta = []
# loop over gene in target cood
for gene in target_coord:
# loop over targets in gene
for i in range(len(target_coord[gene])):
# get chromo
chromo = target_coord[gene][i][0]
# get start, end position
start = target_coord[gene][i][1]
end = target_coord[gene][i][2]
# get conservation
conservation = target_coord[gene][i][7]
# get utr type
utr = target_coord[gene][i][8]
# check that chromo is in chromo_sites
if chromo in chromo_sites:
# compute theta, accepting 0 missing sites
theta = compute_theta_non_coding(chromo_sites, chromo, start, end, 0)
# check that theta is defined
if theta != 'NA':
# add to list
targets_theta.append(theta)
print('computed theta for miRNA targets')
# create a dict for upstream and downstream flanking sites
# {window_position : [list of thetas across all regions at that position]}
upstream_pos, downstream_pos = {}, {}
# compute theta for each non-overlapping windows upstream of target sites
# loop over targets
# compute theta for each upstream region
for gene in target_coord:
# loop over target on gene
for i in range(len(target_coord[gene])):
# get target coordinates
chromo = target_coord[gene][i][0]
start = target_coord[gene][i][1]
end = target_coord[gene][i][2]
orientation = target_coord[gene][i][3]
# check that chromo in chromo-sites
if chromo in chromo_sites:
# determine the length of the target sites
target_size = end - start
upstream_size = target_size * 3
# compute upstream coordinates
# take 3 windows on each side of the target
if orientation == '+':
up_end = start
if start - upstream_size < 0:
up_start = 0
else:
up_start = start - upstream_size
elif orientation == '-':
up_start = end
if up_start + upstream_size < len(genome[chromo]):
up_end = up_start + upstream_size
else:
up_end = up_start + upstream_size
# compute theta for each non-verlapping window
theta_windows = sequence_sliding_window(chromo, up_start, up_end, orientation, True, target_size, target_size, chromo_sites, 0)
# add theta at each position in the upstream dict
for j in theta_windows:
# check if j in dict
if j in upstream_pos:
# add theta if theta is defined
if len(theta_windows[j]) != 0:
upstream_pos[j].append(theta_windows[j][0])
else:
# initiate list value and add theta if theta is defined
upstream_pos[j] = []
if len(theta_windows[j]) != 0:
upstream_pos[j].append(theta_windows[j][0])
print('computed sliding windows for upstream')
# loop over targets
for gene in target_coord:
# loop over target on gene
for i in range(len(target_coord[gene])):
# get target coordinates
chromo = target_coord[gene][i][0]
start = target_coord[gene][i][1]
end = target_coord[gene][i][2]
orientation = target_coord[gene][i][3]
# check that chromo in chromo-sites
if chromo in chromo_sites:
# determine the length of the target sites
target_size = end - start
downstream_size = target_size * 3
# compute downstream coordinates
# take 3 windows on each side of the target
if orientation == '+':
down_start = end
if down_start + downstream_size < len(genome[chromo]):
down_end = down_start + downstream_size
else:
down_end = len(genome[chromo])
elif orientation == '-':
down_end = start
if start - downstream_size < 0:
down_start = 0
else:
down_start = start - downstream_size
theta_windows = sequence_sliding_window(chromo, down_start, down_end, orientation, False, target_size, target_size, chromo_sites, 0)
# add theta at each position in the downstream dict
for j in theta_windows:
# check if j is key
if j in downstream_pos:
# add theta if theta is defined
if len(theta_windows[j]) != 0:
downstream_pos[j].append(theta_windows[j][0])
else:
# initiate list value and add theta if theta is defined
downstream_pos[j] = []
if len(theta_windows[j]) != 0:
downstream_pos[j].append(theta_windows[j][0])
print('computed sliding windows for downstream')
# create a list of lists of diversity values ordered according to position around target sites
alldata = []
# lower index in upstream_pos corresponds to end the upstream region
# need to invert positions
# create a list of keys in upstream, reverse sorted
up_pos = [i for i in upstream_pos]
up_pos.sort()
up_pos.reverse()
down_pos = [i for i in downstream_pos]
down_pos.sort()
for i in up_pos:
alldata.append(upstream_pos[i])
alldata.append(targets_theta)
for i in down_pos:
alldata.append(downstream_pos[i])
# create figure
fig = plt.figure(1, figsize = (4, 3))
# add a plot to figure (1 row, 1 column, 1st plot)
ax = fig.add_subplot(1, 1, 1)
width = 0.8
ind = np.arange(len(alldata))
Means = [np.mean(i) for i in alldata]
SEM = []
for i in alldata:
SEM.append(np.std(i) / math.sqrt(len(i)))
# use a bar plot
graph = ax.bar(ind, Means, width, yerr = SEM,
color = ['#99d8c9', '#99d8c9', '#99d8c9', '#2ca25f','#99d8c9', '#99d8c9', '#99d8c9'],
linewidth = 1.5, error_kw=dict(elinewidth=1.5, ecolor='black', markeredgewidth = 1.5))
ax.margins(0.05)
xvals = [i + 0.5 for i in range(len(alldata))]
# Set a buffer around the edge of the x-axis
#plt.xlim([-0.5, len(alldata) + 0.5])
# zom in by setting up y limits
plt.ylim([0.020, 0.025])
# do not show ticks
plt.tick_params(
axis='both', # changes apply to the x-axis and y-axis (other option : x, y)
which='both', # both major and minor ticks are affected
bottom='off', # ticks along the bottom edge are off
top='off', # ticks along the top edge are off
right = 'off',
left = 'off',
labelbottom= 'off', # labels along the bottom edge are off
colors = 'black',
labelsize = 10,
direction = 'out')
# Set the tick labels font name
for label in ax.get_yticklabels():
label.set_fontname('Arial')
# write label for x and y axis
ax.set_ylabel('Nucleotide polymorphism', color = 'black', size = 10, ha = 'center', fontname = 'Arial')
ax.set_xlabel('\nUpstream\t\tTargets\t\tDownstream', color = 'black', size = 10, ha = 'center', fontname = 'Arial')
# add labels to x-ticks, rotate and align right
#ax.set_xticklabels(site_types, ha = 'center', size = 10, fontname = 'Arial')
# remove lines around the frame
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_visible(True)
# offset the spines
for spine in ax.spines.values():
spine.set_position(('outward', 5))
# add a light grey horizontal grid to the plot, semi-transparent,
ax.yaxis.grid(True, linestyle='--', which='major', color='lightgrey', alpha=0.5)
# hide these grids behind plot objects
ax.set_axisbelow(True)
# annotate figure to add significance
# get the x and y coordinates
y_min, y_max = 0.020, 0.025
# compare upstream and downstream bins to targets
for i in range(len(list(ind))):
# do not compare targets with targets
if i != 3:
# get the P value of Wilcoxon rank sum test
Pval = stats.ranksums(alldata[i], alldata[3])[1]
# get stars for significance
if Pval > 0.05:
P = 'N.S.'
elif Pval < 0.05 and Pval > 0.01:
P = '*'
elif Pval < 0.01 and Pval > 0.001:
P = '**'
elif Pval < 0.001:
P = '***'
# add stars for significance
if P == 'N.S.':
ax.text(i + width/2, y_max + abs(y_max - y_min)*0.03, P, horizontalalignment='center',
verticalalignment='center', color = 'grey', fontname = 'Arial', size = 6)
else:
ax.text(i + width/2, y_max + abs(y_max - y_min)*0.01, P, horizontalalignment='center',
verticalalignment='center', color = 'grey', fontname = 'Arial')
# save figure
fig.savefig('DiversityTargetsFlankingSites.pdf', bbox_inches = 'tight')