-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpeaksanno.py
executable file
·373 lines (307 loc) · 18.1 KB
/
peaksanno.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
#!/usr/bin/env python3
'''
======================== PEAKS ANNOTATION ======================
USAGE
All inclusive script for Annotation of ChIP-Seq peaks.
This is included in the SEASEQ pipeline.
```
python PEAKSANNO.py -g <GTF/GFF/GFF3> -c <CHROM SIZES> -p <MACS PEAKS bed> -s <MACS SUMMITS bed>
```
The script input requirements are :
- GTF : GTF/GFF/GFF3 having either gene or transcript (prefered) annotation
- CHROM SIZES : chrom.sizes file
- MACS PEAKS bed : MACS peaks bed file
- MACS SUMMITS bed : MACS peaks summit file
Output files provided are :
1) TSS nearest the center of peaks in __*centerofpeaks_closest.regions.txt*__
2) peaks overlapping genes regions in __*peaks_within_genebody.regions.txt*__
3) peaks overlapping promoters in __*peaks_within_promoter.regions.txt*__
4) peaks overlapping windows in __*peaks_within_window.regions.txt*__
5) peaks identified in previous overlapping regions and comparison of
all regions in __*peaks_compared_regions.peaks.txt*__
6) genes identified in previous overlapping regions and comparison of
all regions in __*peaks_compared_regions.genes.txt*__
7) distribution graphs in __*peaks_compared_regions.distribution.pdf*__
Definitions of annotation regions:
- TSS: TSS (transcription start site)
- promoters: 1kb +/- TSS
- window: 10kb upstream --> 3kb downstream of genebody
- genebody: 1kb upstream of TSS --> TES (transcription end site)
Dependencies of script:
* bedtools
* python3
==================================================================
'''
import os
import re
import argparse
import subprocess
import matplotlib.pyplot as plt
from pathlib import Path
import collections
from collections import defaultdict
if not os.path.exists('annotation'):
os.makedirs('annotation')
def parse_genelocations(chromz, results, up_dist, down_dist, tss=True):
""" Parse genomic regions
Args:
chromz (dict) : Chromosomal sizes
results (string) : Individual gene coordinates
up_dist (int) : genomic distance upstream / +
down_dist (int) : genomic distance downstream / -
tss (boolean) : TSS or entire gene (true if tss)
"""
lines = results.split("\t")
lines[3], lines[4] = int(lines[3]), int(lines[4])
if lines[6] == "+":
end_region = 4 if tss else 3
end_position = lines[end_region] + down_dist
start_position = lines[3] - up_dist
elif lines[6] == "-":
end_region = 3 if tss else 4
start_position = lines[end_region] - down_dist
end_position = lines[4] + up_dist
start_position = max(start_position, 1)
end_position = min(end_position, int(chromz[lines[0]]))
return "{0}\t{1}\t{2}\t{3}\n".format("\t".join(lines[0:3]), start_position, end_position, "\t".join(lines[5:]))
def gtf_to_genes(chrom_sizes_file, gtf_file):
""" GTF to GENES
Args:
chrom_sizes_file (file) : Chromosomal sizes file
gtf_file (file) : GTF file
"""
#read ucsc chrom sizes, gtf
chrom_sizes_dict = {line.split('\t')[0]: line.split('\t')[1].rstrip("\n") for line in open(chrom_sizes_file)}
haschr = any(chrom.startswith('chr') for chrom in chrom_sizes_dict) #check 'chr' prefix in gtf file
feature_dict = {line.split("\t")[2]: line.split("\t")[2] for line in open(gtf_file) if not line.startswith('#')}
feature = "transcript" if 'transcript' in feature_dict else "gene"
print("NOTE :\tFeature type used is '%s'" % feature)
#parsing gene annotation file, GFF, GFF3, GTF
with open(gtf_file, 'r') as gtf_input:
for gtf_col in gtf_input:
if not gtf_col.startswith('#'):
gtf_col.split("\t")[0] = "chr" + gtf_col.split("\t")[0] if haschr and not gtf_col.split("\t")[0].startswith('chr') else gtf_col.split("\t")[0][3:] if not haschr and gtf_col.split("\t")[0].startswith('chr') else gtf_col.split("\t")[0]
if gtf_col.split("\t")[2] == feature:
record = gtf_col.split("\t")[8].split(';')
if gtf_file.split('.')[-1] in ['gff', 'gff3']:
transcript_id = [s for s in record if "transcript_id" in s][0] if re.search(";transcript_id", gtf_col.split("\t")[8]) else record[0]
gene_name = [s for s in record if "gene_name" in s][0] if re.search(";gene_name", gtf_col.split("\t")[8]) else record[0]
transcript_id = re.sub('[\"\;]', '', transcript_id.split('=')[-1]) #clean transcript_id
gene_name = re.sub('[\"\;]', '', gene_name.split('=')[-1]) #clean gene_name
elif gtf_file.split('.')[-1] == 'gtf':
transcript_id = [s for s in record if "transcript_id " in s][0] if re.search("; transcript_id", gtf_col.split("\t")[8]) else record[0]
gene_name = [s for s in record if "gene_name " in s][0] if re.search("; gene_name", gtf_col.split("\t")[8]) else record[0]
transcript_id = re.sub('[\"\;]', '', transcript_id.split(' ')[-1]) #clean transcript_id
gene_name = re.sub('[\"\;]', '', gene_name.split(' ')[-1]) #clean gene_name
results = "{0}\t{1}\t{2}\t{3}".format(gtf_col.split("\t")[0], "\t".join(gtf_col.split("\t")[1:8]), gene_name, transcript_id)
try:
if chrom_sizes_dict[gtf_col.split("\t")[0]]:
with open('annotation/original_genes.gff', 'w') as original_output:
original_output.write(results + "\n")
final_output = parse_genelocations(chrom_sizes_dict, results, 1000, 1000, False)
for file_name, up, down, tss in [('promoter-region_genes.gff', 10000, 3000, True), ('window-region_genes.gff', 1000, 1000, False), ('TSS-region_genes.gff', 0, 0, False), ('genebody-region_genes.gff', 1000, 0, True)]:
with open(f'annotation/{file_name}', 'w') as file:
file.write(final_output)
except KeyError:
continue
def annotate_regions(region_file, output_file, macs_summits_file=None):
""" Annotate Peaks based on genomic features
Args:
region_file (file) : Region file
output_file (string) : Output file name
macs_summits_file (file) : MACS summits file
"""
#initialize chromosomal positions based on gff provided
opt_chrom, opt_start, opt_stop, opt_peakid, opt_score, opt_none, opt_gene, opt_transcript = 0, 1, 2, 3, 4, 6, 8, 9
summits_dict = {}
if macs_summits_file: #read summits file
with open(macs_summits_file, 'r') as summits_input:
summits_dict = {line.strip().split('\t')[opt_peakid]: (line.strip().split('\t')[opt_start], line.strip().split('\t')[opt_stop]) for line in summits_input}
#read intersectbed file
with open(region_file, 'r') as intersect_input, open(output_file, 'w') as final_output:
peaks_dict, peaks_score, gene_dict, transcript_dict, peak_region = {}, {}, {}, {}, []
for line in intersect_input:
line = line.strip().split('\t')
if int(line[opt_none]) != -1:
string_identifier = tuple(line[i] for i in [opt_chrom, opt_start, opt_stop])
peaks_dict[string_identifier] = line[opt_peakid]
peaks_score[line[opt_peakid]] = line[opt_score]
peak_region.append(string_identifier)
gene_dict[string_identifier] = gene_dict.get(string_identifier, "") + line[opt_gene]
transcript_dict[string_identifier] = transcript_dict.get(string_identifier, "") + line[opt_transcript]
unique_peak_region = list(set(peak_region))
header = "peak_id\tchrom\tpeak_start\tpeak_end\tsummit_start\tsummit_end\tgene_name\ttranscript_id\tpeak_score\n" if macs_summits_file else "peak_id\tchrom\tpeak_start\tpeak_end\tgene_name\ttranscript_id\tpeak_score\n"
final_output.write(header)
for chromregion in unique_peak_region:
peakid = peaks_dict[chromregion]
if macs_summits_file:
final_output.write("%s\t%s\t%s\t%s\t%s\t%s\n" %(peakid, '\t'.join(chromregion), '\t'.join(summits_dict[peakid]),','.join(unique(gene_dict[chromregion])),','.join(unique(transcript_dict[chromregion])),peaks_score[peakid]))
else:
final_output.write("%s\t%s\t%s\t%s\t%s\n" %(peakid, '\t'.join(chromregion), ','.join(unique(gene_dict[chromregion])), ','.join(unique(transcript_dict[chromregion])), peaks_score[peakid]))
def unique(list1):
""" Get unique list
Args:
list1 (list) : list of variable separated by comma ','
"""
return list(set(list1.split(",")))
def include_genes(macs_peaks_file, file_name):
""" Annotate Genes in peak regions
Args:
macs_peaks_file (file) : MACS peaks file
output_file (string) : Output file name
"""
peaks_chr_dict, peaks_count, gene_names_dict, region_dict, peaks_id_dict, region_names_list, master_generegion_dict, barcoderegions_dict, barcodepeaks_id_dict, master_peaksid_dict = {}, 0, {}, defaultdict(dict), defaultdict(dict), [], defaultdict(dict), {}, {}, defaultdict(dict)
#reading peaks
peaks_chr_dict = {line.split('\t')[3]: line.split('\t')[0:3] for line in open(macs_peaks_file, 'r')}
peaks_count = sum(1 for _ in open(macs_peaks_file, 'r'))
#reading gtf gene names
gene_names_dict = {(line.split('\t')[0],line.split('\t')[1]): line.split('\t')[0] for line in open("tempPrefix.genes.names", 'r')}
#reading the genomic regions files
for region_prefix in ("promoter", "genebody", "window", "closest"):
region = region_prefix + ".tempPrefix"
region_name = Path(region).stem.upper()
region_names_list.append(region_name)
with open(region, 'r') as region_input:
for line in region_input:
line = line.strip().split('\t')
region_dict[region_name][(line[8], line[9])] = line[8:10]
peaks_id_dict[region_name][line[3]] = line[0:3]
peaks_id_list = peaks_chr_dict.keys()
peaks_id_list = sorted(peaks_id_list, key=lambda item:
(int(item[10:]) if item[10:].isdigit()
else print(item[10:], type(item[10:])), item))
#order regions and gene names
region_names_list.sort()
orderedgene_names_dict = collections.OrderedDict(sorted(gene_names_dict.items()))
#barcoding the identified regions
for eachgene in orderedgene_names_dict:
for eachkey in region_names_list:
found = 1 if eachgene in region_dict[eachkey] else 0
master_generegion_dict[eachkey][eachgene] = found
barcoderegions_dict[eachgene] = barcoderegions_dict.get(eachgene, "") + str(found)
for eachpeaks_id in peaks_id_list:
for eachkey in region_names_list:
found = 1 if eachpeaks_id in peaks_id_dict[eachkey] else 0
master_peaksid_dict[eachkey][eachpeaks_id] = found
barcodepeaks_id_dict[eachpeaks_id] = barcodepeaks_id_dict.get(eachpeaks_id, "") + str(found)
#write to output file
gene_file_name = file_name.replace('.txt', '.genes.txt')
with open(gene_file_name, 'w') as final_output:
final_output.write("GeneName\tTranscript_ID\tBARCODE\t%s\tEVER\n" % ('\t'.join(region_names_list)))
for eachgene in orderedgene_names_dict:
string_identifier = [barcoderegions_dict[eachgene]]
found_sum = sum(master_generegion_dict[region][eachgene] for region in region_names_list)
string_identifier.extend(str(master_generegion_dict[region][eachgene]) for region in region_names_list)
string_identifier.append("1" if found_sum >= 1 else "0")
final_output.write("%s\t%s\n" % ('\t'.join(eachgene), '\t'.join(string_identifier)))
peaks_file_name = file_name.replace('.txt', '.peaks.txt')
pdf_file_name = file_name.replace('.txt', '.distribution.pdf')
with open(peaks_file_name, 'w') as final_output_peaks:
final_output_peaks.write("chrom\tstart\tstop\tpeak_id\tBARCODE\t%s\n" %('\t'.join(region_names_list)))
for eachpeaks_id in peaks_id_list:
string_identifier = [barcodepeaks_id_dict[eachpeaks_id]]
string_identifier.extend(str(master_peaksid_dict[region][eachpeaks_id]) for region in region_names_list)
final_output_peaks.write("%s\t%s\t%s\n" %('\t'.join(peaks_chr_dict[eachpeaks_id]), eachpeaks_id, '\t'.join(string_identifier)))
region_count_dict = {region: round(len(peaks_id_dict[region]) * 100 / peaks_count) for region in ("PROMOTER", "GENEBODY", "WINDOW")}
rects = plt.bar(region_count_dict.keys(), region_count_dict.values(), width=0.4)
axes = plt.gca()
axes.set_ylim([0, 100])
peaks_title = Path(macs_peaks_file).stem
plt.title(peaks_title.split('.sorted')[0])
plt.ylabel("% of peaks")
#label the bars
xpos = 'center'
ha = {'center': 'center', 'right': 'left', 'left': 'right'}
offset = {'center': 0.5, 'right': 0.57, 'left': 0.43} # x_txt = x + w*off
for rect in rects:
height = rect.get_height()
plt.text(rect.get_x() + rect.get_width()*offset[xpos], 1.01*height,
'{}%'.format(height), ha=ha[xpos], va='bottom')
#save figure
plt.savefig(pdf_file_name)
for eachregion in ("PROMOTER", "GENEBODY", "WINDOW"):
print("Total number of %s = %d" %(eachregion, len(peaks_id_dict[eachregion])))
print("Percentage of %s = %.3f" %(eachregion, len(peaks_id_dict[eachregion])*100/peaks_count))
def main():
'''
Generate genomic coordinates of all promoters, window, genebody, TSS
Definitions: promoters: +/- 1kb of TSS (transcription start site)
window: 10kb upstream --> 3kb downstream of genebody
genebody: 1kb upstream of TSS --> TTS/TES (transcription termination/end site)
TSS: TSS (transcription start site)
Annotate Peaks based on genomic features listed above
Comparison master files for all genes/transcripts
and their genomic regions within peaks called.
Genomic regions are specified based on regions provided,
default expectation are promoter, genebody, window, closest
'''
parser = argparse.ArgumentParser()
parser.add_argument("-g", "--gtf", dest="gtf", required=True, help="Enter .gtf/gff file to be processed.")
parser.add_argument("-c", "--chrom", dest="chrom_sizes", required=True, help="Enter ucsc chrom sizes file to be processed.")
parser.add_argument("-p", "--peaks", dest="macs_peaks", required=True, help="Enter MACS peaks bed file.")
parser.add_argument("-s", "--summit", dest="macs_summits", required=False, help="Enter MACS summit bed file.")
options = parser.parse_args()
# Stage 1
# Extract genomic regions of interest
gtf_to_genes(options.chrom_sizes, options.gtf)
#Peaks filecheck
checkcolumns = sum(1 for _ in open(options.macs_peaks, 'r'))
if checkcolumns <= 0: sys.exit("ERROR :\tNo peaks were identified\n")
# Turn 4 column bed to 5 (for SICER peaks: include unique id)
with open(options.macs_peaks, 'r') as peaks_file:
numberofcolumns = len(peaks_file.readline().strip().split('\t'))
fifthcol_value = next(peaks_file).split('\t')[4].strip() #FDRisland from SICER produce a fifth empty column
new_macs_peaks = options.macs_peaks
if numberofcolumns == 4 or len(fifthcol_value) < 1:
new_macs_peaks = options.macs_peaks + ".tempPrefix"
# command = "cat " + options.macs_peaks + ' | awk -F\\\\t ' + "'" + \
# '{print $1 "\\t" $2 "\\t" $3 "\\t" $1":"$2"-"$3 "\\t" $4}' + \
# "' > tempPrefix-new.macs_peaks.bed"
# print(command)
# os.system(command)
macs_peaks_output = open(new_macs_peaks, 'w')
macs_peaks_input = open(options.macs_peaks,'r')
macs_peaks_count = 1
for line in macs_peaks_input:
line = line.strip().split('\t')
new_peaks_id = "SICERpeak_" + str(macs_peaks_count)
results = ("{0}\t{1}\t{2}\n".format("\t".join(line[0:3]), new_peaks_id, line[3]))
macs_peaks_output.write(results)
macs_peaks_count += 1
macs_peaks_output.close()
# Convert TSS to compatible bedtools (closestBed) format and obtain center of peaks coordinate
command = "cut -f1,4,5,9,10 annotation/TSS-region_genes.gff | sort -k1,1 -k2,2n " + \
"> tempPrefix-sorted.TSS; " + \
"cat " + new_macs_peaks + ' | awk -F\\\\t ' + "'" + \
'{print $1 "\\t" int(($2+$3)/2) "\\t" int(($2+$3)/2) "\\t" $4 "\\t" $5}' + \
"' | sort -k1,1 -k2,2n > tempPrefix-centerofpeaks.bed;" + \
"closestBed -t all -a tempPrefix-centerofpeaks.bed -b tempPrefix-sorted.TSS" + \
"> closest.tempPrefix"
os.system(command)
# Stage 2
# Annotate peaks based on previously extracted genomic coordinates
for region in ("promoter", "genebody", "window"):
command = 'intersectBed -b ' + new_macs_peaks + ' -a annotation/' + region + \
'-region_genes.gff -wb | cut -f1,4,5,9,10,11,12,13,14,15 |' + "awk -F\\\\t '" + \
'{print $6 "\\t" $7 "\\t" $8 "\\t" $9 "\\t" $10 "\\t" $1 "\\t" $2 "\\t" $3 ' + \
'"\\t" $4 "\\t" $5}' + \
"' > " + region + ".tempPrefix"
os.system(command)
# Convert annotated regions to tab delimited file and include summit location
for region in ("promoter", "genebody", "window", "closest"):
region_file = region + ".tempPrefix"
output_file = "peaks_within_" + region + ".regions.txt"
if options.macs_summits:
annotate_regions(region_file, output_file, options.macs_summits)
else:
annotate_regions(region_file, output_file)
command = "mv peaks_within_closest.regions.txt centerofpeaks_closest.regions.txt; \
cut -f 9,10 annotation/original_genes.gff > tempPrefix.genes.names"
os.system(command)
# Stage 3
# Binary matrix of all genes
include_genes(new_macs_peaks, "peaks_compared_regions.txt")
command = "rm -rf *tempPrefix*"
os.system(command)
if __name__ == "__main__":
main()