-
Notifications
You must be signed in to change notification settings - Fork 0
/
PlotChromoWideGeneRepeatDiversity.py
296 lines (255 loc) · 9.59 KB
/
PlotChromoWideGeneRepeatDiversity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 20 13:17:35 2015
@author: RJovelin
"""
# use this script to plot nucleotide diversity with repeat or gene density for
# linkage groups LG1, LG2 or LG4
# usage python3 PlotChromoWideGeneRepeatDiversity.py [options]
#- [repeats/genes] : plot gene or repeat density along with nucleotide diversity
#- [LG1/LG2/LG4] : chromo to plot
#- [PB, noPB] : if diversity is computed using Ontario strains only (noPB) ot with all strains (PB)
# use Agg backend on server without X server
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
from matplotlib import rc
rc('mathtext', default='regular')
# import modules
import numpy as np
from scipy import stats
import math
import sys
# import custom modules
from manipulate_sequences import *
from miRNA_target import *
from repeats_TEs import *
from sliding_windows import *
from sites_with_coverage import *
from divergence import *
# get density to plot with nucleotide diversity
density = sys.argv[1]
# get chromo
LG = sys.argv[2]
# get corresponding chromos
if LG == 'LG1':
chromosome = 'linkage_group_1'
elif LG == 'LG2':
chromosome = 'linkage_group_2'
elif LG == 'LG4':
chromosome = 'linkage_group_4'
# choose strains to compute diversity
strains = sys.argv[3]
# convert genome fasta to dict
genome = convert_fasta('../Genome_Files/noamb_356_v1_4.txt')
print('genome converted to fasta dict')
# get the gene coordinates {gene1 : [chromo, start, end, sense]}
genes_coord = get_genes_coordinates('../Genome_Files/356_10172014.gff3')
print('got gene coordinates')
# get the set of valid transcripts
valid_transcripts = get_valid_transcripts('../Genome_Files/unique_transcripts.txt')
print('got the set of valid transcripts')
# get a dict with {chromo: [start position (0-based) of the valid transcripts]}
genes_start = {}
for gene in genes_coord:
if gene in valid_transcripts:
chromo = genes_coord[gene][0]
start = genes_coord[gene][1] -1
# check if chromo in genes_start
if chromo in genes_start:
genes_start[chromo].append(start)
else:
genes_start[chromo] = [start]
# sort positions
for chromo in genes_start:
genes_start[chromo].sort()
print('got the genes\'s first positions')
# get the coordinates of the repeats
# {repeat : [[chromo, start, end, orientation], [chromo, start, end, orientation]]}
repeats_coord = get_repeats_coord('../Genome_Files/356_v1_4.fasta.out', False)
print('got the repeat coordinates')
# create a dict with {chromo: [list of repeat start positions 0-based]}
repeats_start = {}
# loop over repeat family
for fam in repeats_coord:
# loop over instances of repeat
for i in range(len(repeats_coord[fam])):
# get chromo
chromo = repeats_coord[fam][i][0]
# get start position
start = repeats_coord[fam][i][1]
# check if chromo in dict
if chromo in repeats_start:
# add start position to list
repeats_start[chromo].append(start)
else:
# initiate list
repeats_start[chromo] = [start]
# sort positions
for chromo in repeats_start:
repeats_start[chromo].sort()
print('got the repeats\'s first positions')
# create list with count of gene o repeat in 50000 bp windows
range_counts = [0] * (len(genome[chromosome]) // 50000)
# check if gene or repeat density is recorded
if density == 'repeats':
for start in repeats_start[chromosome]:
which_range = start // 50000
if which_range == len(range_counts):
which_range -= 1
# count repeats
range_counts[which_range] += 1
elif density == 'genes':
for start in genes_start[chromosome]:
which_range = start // 50000
if which_range == len(range_counts):
which_range -= 1
# counts genes
range_counts[which_range] += 1
# create a list with the position of each window interval
positions = [i for i in range(len(range_counts))]
print('position', len(positions))
print(LG, len(genome[chromosome]))
print('interval length', len(genome[chromosome]) // 50000)
# create figure
fig = plt.figure(1, figsize = (4, 1))
# add a plot to figure (1 row, 1 column, 1 plot)
ax1 = fig.add_subplot(1, 1, 1)
# plot the repeat of gene density per window
graph1 = ax1.plot(positions, range_counts, linewidth = 1, color = '#b2df8a')
# restrict the x and y axis to the range of data
if density == 'genes':
ax1.set_ylim([0,25])
elif density == 'repeats':
ax1.set_ylim([0, 200])
# set y axis label
if density == 'genes':
YaxisText = 'Gene density'
elif density == 'repeats':
YaxisText = 'Repeat density'
ax1.set_ylabel(YaxisText, size = 10, ha = 'center', fontname = 'Arial')
# determine tick position on x axis
xpos = [j for j in range(0, len(positions) + 50, 50)]
# convert interval windows numbers to genomic positions
xtext = list(map(lambda x : (x * 50000) / 1000000, xpos))
xtext = list(map(lambda x : str(x), xtext))
# set up tick positions and labels
plt.xticks(xpos, xtext, fontsize = 10, fontname = 'Arial')
plt.yticks(fontsize = 0)
# set x axis label
ax1.set_xlabel('Position along linkage group (Mb)', size = 10, ha = 'center', fontname = 'Arial')
# do not show lines around figure, keep bottow line
ax1.spines["top"].set_visible(False)
ax1.spines["bottom"].set_visible(True)
ax1.spines["right"].set_visible(False)
ax1.spines["left"].set_visible(False)
# offset the spines
for spine in ax1.spines.values():
spine.set_position(('outward', 5))
# add a light grey horizontal grid to the plot, semi-transparent,
ax1.yaxis.grid(True, linestyle='--', which='major', color='lightgrey', alpha=0.5, linewidth = 0.5)
# hide these grids behind plot objects
ax1.set_axisbelow(True)
# compute theta per 50 Kb window
# get the allele counts for all sites with coverage, keep all sites
if strains == 'noPB':
chromo_sites = get_non_coding_snps('../SNP_files/', 0)
elif strains == 'PB':
# compute theta per 50 Kb window in all strains
chromo_sites = get_all_strains_snps('../PB_ON_SNP_files/', 0)
print('got allele counts')
# create a list of starting positions of each window on chromo
LG_positions = [i for i in range(0, len(genome[chromosome]), 50000)]
print('chromo positions', len(LG_positions))
# create a dict to store theta for each widow interval
diversity = {}
# set up interval counter
j = 0
# loop over start position in LG:
for i in LG_positions:
# compute theta per window
# use a large threshold > window length to accept any number of missing site
theta = compute_theta_non_coding(chromo_sites, chromosome, i, i + 50000, 100000)
diversity[j] = theta
# update j
j += 1
print('computed diversity')
# create a list of positions for which diversity is calculated, sort positions
divpos = [i for i in diversity]
divpos.sort()
# create parallel list with theta values
polymorphism = [diversity[i] for i in divpos]
print('positions diversity windows', len(divpos))
assert positions == divpos[:-1]
# add another graph on top of previous one
ax2 = ax1.twinx()
# plot theta per window
graph2 = ax2.plot(positions, polymorphism[:-1], linewidth = 1, color = '#1f78b4')
# set y axis label
ax2.set_ylabel('Polymorphism', size = 10, ha = 'center', fontname = 'Arial')
# do not show lines around figure, keep bottow line
ax2.spines["top"].set_visible(False)
ax2.spines["bottom"].set_visible(False)
ax2.spines["right"].set_visible(False)
ax2.spines["left"].set_visible(False)
# do not show ticks for 2nd graph
ax2.tick_params(
axis='both', # changes apply to the x-axis and y-axis (other option : x, y)
which='both', # both major and minor ticks are affected
bottom='off', # ticks along the bottom edge are off
top='off', # ticks along the top edge are off
right = 'off',
left = 'off',
labelbottom='off', # labels along the bottom edge are off
colors = 'black',
labelsize = 10,
direction = 'out') # ticks are outside the frame when bottom = 'on
# do not show ticks on 1st graph
ax1.tick_params(
axis='x', # changes apply to the x-axis and y-axis (other option : x, y)
which='both', # both major and minor ticks are affected
bottom='on', # ticks along the bottom edge are off
top='off', # ticks along the top edge are off
right = 'off',
left = 'off',
labelbottom='on', # labels along the bottom edge are off
colors = 'black',
labelsize = 10,
direction = 'out') # ticks are outside the frame when bottom = 'on
#
#
# do not show ticks
ax1.tick_params(
axis='y', # changes apply to the x-axis and y-axis (other option : x, y)
which='both', # both major and minor ticks are affected
bottom='off', # ticks along the bottom edge are off
top='off', # ticks along the top edge are off
right = 'off',
left = 'off',
labelbottom='off', # labels along the bottom edge are off
colors = 'black',
labelsize = 10,
direction = 'out') # ticks are outside the frame when bottom = 'on
for label in ax2.get_yticklabels():
label.set_fontname('Arial')
# add lines
lns = graph1+graph2
# get labels
if density == 'genes':
labs = ['Genes', 'Diversity']
elif density == 'repeats':
labs = ['Repeats', 'Diversity']
# plot legend
ax2.legend(lns, labs, loc=2, fontsize = 8, frameon = False)
# save figure
if density == 'genes':
data = 'GeneDensity'
elif density == 'repeats':
data = 'RepeatDensity'
if strains == 'noPB':
snps = 'KSRPXSnps'
elif strains == 'PB':
snps = 'KSRPXPBSnps'
outputfile = 'PlotDiversity' + data + snps + LG
print(outputfile)
fig.savefig(outputfile + '.pdf', bbox_inches = 'tight')