-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
372 lines (319 loc) · 15.8 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
from result_dictionary_maker import *
from drawer import *
from asciithing import *
from create_html_output import *
import pandas as pd
import argparse
import textwrap
# Argparse - the recommended command-line parsing module in the Python standard library.
# Help text shown by `-h`. The NAME FORMAT section must list the tags in the
# same order as the tag -> field mapping used when the FASTA headers are built
# (1 accession, 2 domain name, 3 length, 4 InterPro accession, 5 start, 6 stop).
description_message = '''\
-----------------------------------------------------------------
IF NO OPTION IS SELECTED, THE PROGRAM WILL RUN IN [ASSISTED MODE]
-----------------------------------------------------------------
DESCRIPTION
Cuterle is a bioinformatic tool.
It returns an output file containing every domain annotated by InterProScan.
Pfam or SMART analysis are choosen by which method has more matches.
LIST OF OUTPUT FILE
extracted_domains.fasta - contains every domains extracted
[optional] domains_list.csv - contains the table's raw data (domain_name,count)
[optional] domains_view[seq_name].jpg - schematic domains draw FOR EACH sequence
NAME FORMAT
The name for every sequence added to extracted_domains.fasta is [>1,2,3,4,5,6]
1 - Protein accession (e.g. P51587)
2 - InterPro annotations - description (e.g. [BRCA2 repeat])
3 - Length of the domain
4 - InterPro accession (e.g. [IPR002035])
5 - Start location of the domain
6 - End location of the domain
It is possible to CHANGE the order for every tag;
e.g. [-nf 1] or [-nf 1,2,3,4] or [-nf 5,4,3,2,2,2,1]
DO NOT USE SPACE between the number!
------------------------------------------
'''
# Build the command-line parser; the raw formatter keeps the line breaks of the
# long description exactly as written.
cuterle_parser = argparse.ArgumentParser(
    usage="%(prog)s [options]",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description=textwrap.dedent(description_message),
)

# One (flag, keyword arguments) entry per option, registered in display order.
_ARGUMENT_SPECS = (
    ("-m", {
        "help": "Enable the manual mode. -tsv and -fasta argument are requested",
        "action": "store_true",
    }),
    ("-tsv", {
        "help": "Input file containing the tsv file output from InterPro",
        "type": str,
        "metavar": "file.tsv",
    }),
    ("-fasta", {
        "help": "Input file containing the fasta sequences",
        "type": str,
        "metavar": "file.fasta",
    }),
    ("-a", {
        "help": "Prior choice between 'Pfam' and 'SMART'. Read the documentation.",
        "type": str,
        "metavar": "Pfam or SMART",
    }),
    ("-nf", {
        "help": "Name format. Read the documentation. Default format: [1,2,3,4,5,6]",
        "type": str,
    }),
    ("-accession", {
        "help": "InterPro annotations - accession ((e.g. IPR002093)",
        "type": str,
    }),
    ("-draw_image", {
        "help": "FOR EACH sequences create a ~.jpg file reporting sequence+domains",
        "action": "store_true",
    }),
)
for _flag, _flag_options in _ARGUMENT_SPECS:
    cuterle_parser.add_argument(_flag, **_flag_options)

# Parse the command line and expose every option under the name the rest of
# the script uses.
cuterle_options = cuterle_parser.parse_args()
(
    manual_mode,
    tsv_file,
    fasta_file,
    prior_choice,
    name_format,
    accession,
    draw_choice,
) = (
    cuterle_options.m,
    cuterle_options.tsv,
    cuterle_options.fasta,
    cuterle_options.a,
    cuterle_options.nf,
    cuterle_options.accession,
    cuterle_options.draw_image,
)
# *********************************************************************************************
# *********************************************************************************************
# Get the tsv file.
# Manual mode: the path comes from -tsv and the run is aborted if it is missing
# or rejected by existence_file_check.
# Assisted mode: print the banner and keep asking until a valid file is typed.
if manual_mode:
    # Test for a missing option first so the helper never receives None.
    if tsv_file is None:
        print("You have not selected any tsv file!")
        # Non-zero status: the run failed (the interactive exit() helper reports 0).
        raise SystemExit(1)
    if not existence_file_check(tsv_file, "*.tsv"):
        print(f"{tsv_file} doesn't exist or doesn't has the correct format!")
        raise SystemExit(1)
else:
    print(logo)
    print(separator)
    print("Welcome, this is CUTERLE, \n"
          "a bioinformatic tool which return an output file containing every domain annotated by InterProScan\n"
          "via Pfam or SMART analysis from the list of protein submitted.")
    print(separator)
    print("The first file requested is the one with ~.tsv extension which contains the domains coordinates.")
    print("In the current folder you have the following *.tsv files:")
    print_file_in_the_folder("*.tsv")
    while True:
        tsv_file = input("Type the file name using this format -> file.tsv : ")
        if existence_file_check(tsv_file, "*.tsv"):
            break
        print(f"{tsv_file} doesn't exist or doesn't has .tsv extension. Retry.")
# Get the fasta file.
# Manual mode: the path comes from -fasta and the run is aborted if it is
# missing or rejected by existence_file_check.
# Assisted mode: keep asking until a valid file is typed.
if manual_mode:
    # Test for a missing option first so the helper never receives None.
    if fasta_file is None:
        print("You have not selected any fasta file!")
        # Non-zero status: the run failed (the interactive exit() helper reports 0).
        raise SystemExit(1)
    if not existence_file_check(fasta_file, "*.fasta"):
        print(f"{fasta_file} doesn't exist or doesn't has the correct format!")
        raise SystemExit(1)
else:
    print(separator)
    print("The second file requested is the one with ~.fasta extension which contains the sequence list.")
    print("Remember to use THE SAME fasta file used to get the tsv one.")
    print("In the current folder you have the following *.fasta files:")
    print_file_in_the_folder("*.fasta")
    while True:
        fasta_file = input("Type the file name using this format -> file.fasta : ")
        if existence_file_check(fasta_file, "*.fasta"):
            break
        print(f"{fasta_file} doesn't exist or doesn't has .fasta extension. Retry.")
# Output folder for this run, as returned by i_counter().
folder_name = i_counter()

# Let check_column_name inspect the tsv file before pandas reads it
# (presumably it adds the column header when missing - see that helper).
check_column_name(tsv_file)

# Every protein name found in the fasta file.
protein_list = protein_list_maker(fasta_file)

# Load the InterProScan table into a dataframe.
dataframe_tsv = pd.read_table(tsv_file)

# Build the per-protein results.
domain_order = "Increasing"
result_dictionary = result_dictionary_maker(
    protein_list, dataframe_tsv, prior_choice, fasta_file, domain_order
)

# Tally the extracted domains by the analysis that produced them.
pfam_counter = 0
smart_counter = 0
for record_key in result_dictionary:
    record = result_dictionary[record_key]
    analysis_used = record["Analysis_used"]
    if analysis_used == "Pfam":
        pfam_counter = pfam_counter + len(record["Extracted_domains"])
    elif analysis_used == "SMART":
        smart_counter = smart_counter + len(record["Extracted_domains"])

# Grand total across both analyses.
smart_plus_pfam = pfam_counter + smart_counter

# Rows of the summary table.
table_list = create_table_row_list(result_dictionary)
# Decide which domains will be exported. `domain_to_save` ends up holding the
# InterPro accessions that the saving step compares against each domain's
# "IP_ACCESSION" value.
if manual_mode:
    # Manual mode always writes the summary table and the HTML report.
    with open(f"{folder_name}/domains_list.csv", "w") as domain_csv:
        for everyrow in table_list:
            domain_csv.write(f"{everyrow[0]},{everyrow[1]},{everyrow[2]}\n")
    create_html_output(folder_name, fasta_file, tsv_file, result_dictionary, table_list)
    save_choice_list = []
    domain_to_save = []
    if accession:
        # Keep only the accessions requested through -accession. The previous
        # code appended whole table rows here, which can never equal an
        # accession string, so nothing was exported.
        save_choice_list = [item.strip() for item in accession.split(",") if item.strip()]
        domain_to_save = list(save_choice_list)
    else:
        # No filter given: export every distinct accession that was extracted.
        for everyprotein in protein_list:
            for everydomain in result_dictionary[everyprotein]["Extracted_domains"]:
                ip_accession = everydomain["IP_ACCESSION"]
                if ip_accession not in domain_to_save:
                    domain_to_save.append(ip_accession)
else:
    print(separator)
    printing_table(table_list)
    print(f"{smart_plus_pfam} domains have been found: {pfam_counter} by Pfam and {smart_counter} by SMART.")
    while True:
        table_choice = input("Do you want to save this table as ~.csv file? y/n ")
        if table_choice == "y":
            with open(f"{folder_name}/domains_list.csv", "w") as domain_csv:
                for everyrow in table_list:
                    domain_csv.write(f"{everyrow[0]},{everyrow[1]},{everyrow[2]}\n")
            print(f"You can find your results in domains_list.csv inside {folder_name}")
            break
        elif table_choice == "n":
            break
        else:
            print("Strange way to type 'y' or 'n'. Retry.\n")
    print("\nWhich domains do you want to save?")
    print("- Save all the domains -> 'all'")
    print("- Choose by index -> e.g. single 'index,1' or multiple 'index,1,3,4'")
    print("- None -> 'none' ")
    print("DO NOT use space. If you have some doubt, go back to the readme.")
    domain_to_save = []
    while True:
        save_choice_input = input("Write here your choice: ")
        save_choice_list = save_choice_input.split(",")
        save_choice = save_choice_list[0]
        if save_choice == "index":
            # Validate the whole selection before storing anything, so a bad
            # entry neither crashes the program nor leaves a partial selection.
            try:
                chosen_rows = [int(everychoice) for everychoice in save_choice_list
                               if everychoice != "index"]
            except ValueError:
                chosen_rows = []
            if not chosen_rows or any(row < 0 or row >= len(table_list) for row in chosen_rows):
                print(f"Invalid index: use whole numbers from 0 to {len(table_list) - 1}. Retry.\n")
                continue
            # The first column of a table row is the value used as accession.
            domain_to_save.extend(table_list[row][0] for row in chosen_rows)
            break
        elif save_choice == "all":
            for everyelement in table_list:
                domain_to_save.append(everyelement[0])
            break
        elif save_choice == "none":
            # Nothing to export: end the run normally.
            raise SystemExit(0)
        else:
            print("You forgot to add 'index' or 'accession', or made some typos. Retry.\n")
# # *********************************************************************************************
# # SAVING THE DOMAINS
# # *********************************************************************************************
# Append one FASTA record to extracted_domains.fasta for every extracted domain
# whose InterPro accession is listed in domain_to_save.

# Label printed in front of each header field; tag "1" (protein accession) has none.
header_labels = {
    "1": "",
    "2": "DOMAIN-NAME: ",
    "3": "DOMAIN-LENGTH: ",
    "4": "IP-ACCESSION: ",
    "5": "START: ",
    "6": "STOP: ",
}

# Resolve the header layout once: it does not depend on the domain.
if name_format is None:
    name_format = "1,2,3,4,5,6"
# Unknown tags are ignored, as before.
name_format_choosen = [tag.strip() for tag in name_format.split(",")]
name_format_choosen = [tag for tag in name_format_choosen if tag in header_labels]
if not name_format_choosen:
    # No usable tag at all would give a record without a header: use the default layout.
    name_format_choosen = list(header_labels)

fasta_records = []
for everyprotein in protein_list:
    for everydomain in result_dictionary[everyprotein]["Extracted_domains"]:
        ip_accession = everydomain["IP_ACCESSION"]
        if ip_accession not in domain_to_save:
            continue
        start_location = everydomain["START"]
        stop_location = everydomain["STOP"]
        protein_sequence = result_dictionary[everyprotein]["Sequence"]
        # NOTE(review): InterProScan coordinates are 1-based and inclusive; unless
        # START is shifted upstream this slice drops the first residue - confirm
        # against result_dictionary_maker.
        domain_sequence = protein_sequence[start_location:stop_location]
        name_format_dict = {
            "1": everyprotein,
            "2": everydomain["DOMAIN_NAME"],
            "3": everydomain["LENGTH"],
            "4": ip_accession,
            "5": start_location,
            "6": stop_location,
        }
        # Header layout: "> [field] - [LABEL: field] - ..." without a trailing separator.
        name_format_string_final = ">" + " -".join(
            f" [{header_labels[tag]}{name_format_dict[tag]}]" for tag in name_format_choosen
        )
        fasta_records.append(f"{name_format_string_final}\n{domain_sequence}\n\n")

# Append mode creates the file when it does not exist yet, so no fallback to
# "w" is needed; the file is only touched when there is something to write.
if fasta_records:
    with open(f"{folder_name}/extracted_domains.fasta", "a") as file_output:
        file_output.writelines(fasta_records)

# Counter for the domains saved.
domain_saved = len(fasta_records)
if not manual_mode:
    print(f"\n{domain_saved} domains had been saved in extracted_domains.fasta inside {folder_name}")
# # *********************************************************************************************
# # ASK FOR DRAW EVERY SEQUENCES (SEE DRAW.PY)
# # *********************************************************************************************
# First work out whether the drawings are wanted, then draw from a single place.
# Manual mode follows the -draw_image flag and uses the "Increasing" domain order;
# assisted mode asks the user and uses the "Decreasing" order.
if manual_mode:
    drawing_requested = bool(draw_choice)
    drawing_order = "Increasing"
else:
    drawing_requested = False
    drawing_order = "Decreasing"
    print(separator)
    while True:
        wanna_draw = input(f"Are you interested in create a new file.image for EACH sequence in {fasta_file}? y/n ")
        if wanna_draw == "n":
            break
        if wanna_draw != "y":
            print("Strange way to type 'y' or 'n'. Retry.\n")
            continue
        # The answer is "y": small inputs are drawn straight away.
        if not seq_in_fastafile_count(fasta_file) > 5:
            drawing_requested = True
            break
        # More than 5 sequences: ask for an explicit confirmation first.
        print("\n# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #")
        print(f"WARNING! In the {fasta_file} there are more than 5 sequences")
        print("# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #")
        while True:
            choice = input("\nAre you sure to continue? y/n ")
            if choice in ("y", "n"):
                drawing_requested = choice == "y"
                break
            print("Strange way to type 'y' or 'n'. Retry.\n")
        break

if drawing_requested:
    # Rebuild the results in the order required by the drawer, then draw.
    domain_order = drawing_order
    result_dictionary = result_dictionary_maker(
        protein_list, dataframe_tsv, prior_choice, fasta_file, domain_order
    )
    sequences_drawer(protein_list, table_list, result_dictionary, folder_name)

if not manual_mode:
    print(separator)
# *********************************************************************************************
# BYE MESSAGE
# *********************************************************************************************
# Only the assisted (interactive) mode says goodbye; manual mode stays silent.
if not manual_mode:
    print("Have a productive day!")
    print(separator)