-
Notifications
You must be signed in to change notification settings - Fork 0
/
cdd_funfam.py
171 lines (133 loc) · 7.55 KB
/
cdd_funfam.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
#!/usr/bin/env python3
"""
@author T. Paysan-Lafosse
@brief This script generates a list of CDD entries overlapping with CATH-Funfams
@arguments [CONFIG_FILE]: file containing database connection and files location (see config.ini)
"""
import argparse
import os
import re
from configparser import ConfigParser
import traceback
import cx_Oracle
from statistics import mean
import go_comparison.funfam2gomapping as f2go
def get_overlap(start1, end1, start2, end2):
totalRange = max(end1, end2) - min(start1, start2)
sumOfRanges = (end1 - start1) + (end2 - start2)
overlappingInterval = 0
if sumOfRanges > totalRange: # means they overlap
overlappingInterval = min(end1, end2) - max(start1, start2)
return overlappingInterval
def get_total_cdd_matches(connectString):
try:
# subprocess.run(["source", "~oracle/ora112setup.sh"])
con = cx_Oracle.connect(connectString)
cur = con.cursor()
except:
stackTrace = traceback.format_exc()
print(stackTrace)
request = """
select method_ac, count(distinct protein_ac)
from match
where dbcode='J'
group by method_ac
"""
cur.execute(request)
cdd_matches = dict(cur.fetchall())
cur.close()
con.close()
return cdd_matches
def find_overlaping(user, password, schema, outputdir, outfile_one_match, outfile_multi_match, funfam2gofile):
connectString = "".join([user, "/", password, "@", schema])
list_cdd = dict()
list_funfam = set()
cdd_matches = get_total_cdd_matches(connectString)
try:
# subprocess.run(["source", "~oracle/ora112setup.sh"])
connection = cx_Oracle.connect(connectString)
cursor = connection.cursor()
except:
stackTrace = traceback.format_exc()
print(stackTrace)
#get funfam2go list from file
funfam2go = dict()
if os.path.exists(funfam2gofile) and os.path.getsize(funfam2gofile) > 0:
funfam2go = f2go.get_mapping_from_file(funfam2gofile)
request="""
SELECT m.protein_ac, m.method_ac, m.pos_from, m.pos_to, fm.method_ac, fm.pos_from, fm.pos_to
FROM INTERPRO.MATCH m
JOIN INTERPRO.FEATURE_MATCH fm on m.protein_ac=fm.protein_ac
LEFT JOIN INTERPRO.ENTRY2METHOD e2m on m.method_ac=e2m.method_ac
LEFT JOIN INTERPRO.METHOD_COMMENT m2c on m2c.method_ac=e2m.method_ac
WHERE m.DBCODE='J' AND fm.dbcode='f'
AND e2m.method_ac is null
AND m2c.method_ac is null
"""
cursor.execute(request)
for row in cursor:
protein, cdd, cdd_start, cdd_end, funfam, funfam_start, funfam_end = row
cdd_len = int(cdd_end)-int(cdd_start)
funfam_len = int(funfam_end)-int(funfam_start)
len_overlap = get_overlap(int(cdd_start), int(cdd_end), int(funfam_start), int(funfam_end))
if funfam not in funfam2go:
list_funfam.add((outputdir, funfam))
try:
list_cdd[cdd][funfam]["protein"].append(protein)
list_cdd[cdd][funfam]["cdd_doms"].append(cdd_len)
list_cdd[cdd][funfam]["funfam_doms"].append(funfam_len)
except KeyError:
try:
list_cdd[cdd][funfam] = {"protein":[protein], "overlap":[], "cdd_doms":[cdd_len], "funfam_doms":[funfam_len]}
except KeyError:
list_cdd[cdd] = {funfam: {"protein":[protein], "overlap":[], "cdd_doms":[cdd_len], "funfam_doms":[funfam_len]}}
if len_overlap > cdd_len/2 and len_overlap > funfam_len/2: #overlap should be at least half the length of the CDD and funfam domains
list_cdd[cdd][funfam]["overlap"].append(len_overlap)
cursor.close()
connection.close()
#search funfam GO terms and save results in file for subsequent runs
if len(list_funfam) > 0:
funfam2go.update(f2go.get_list_funfam2go(list(list_funfam)))
f2go.save_in_file(funfam2gofile, funfam2go)
with open(outfile_one_match, "w") as f1, open(outfile_multi_match, "w") as fm:
header = "cdd; link to cdd website; funfam; total number of CDD/UniProt matches; total number of UniProt matches for the CDD/funfam pair; number of UniProt matches that overlap for the CDD/funfam pair; percentage of CDD/UniProt matches vs overlap; percentage of proteins overlapping; average overlap length of CDD/funfam; average length of the CDD domain; average length of the funfam domain; comments\n"
# header = "cdd, funfam, total number of CDD/UniProt matches, total number of UniProt matches for the CDD/funfam pair, number of UniProt matches that overlap for the CDD/funfam pair, percentage of CDD/UniProt matches vs overlap, percentage of proteins overlapping, average overlap length of CDD/funfam, average length of the CDD domain, average length of the funfam domain, comments\n"
f1.write(header)
fm.write(header)
for cdd, funfams in list_cdd.items():
count_overlap = 0
text = ""
prontolink = f'=HYPERLINK("http://pronto.ebi.ac.uk:5000/signature/{cdd}","{cdd}")'
cddlink = f'=HYPERLINK("https://www.ncbi.nlm.nih.gov/Structure/cdd/cddsrv.cgi?uid={cdd}","ncbi_{cdd}")'
for funfam, content in funfams.items():
percent_cdd_match_overlap = round(len(content['overlap'])*100/int(cdd_matches[cdd]), 2)
percentoverlap = round(len(content['overlap'])*100/len(content['protein']), 2)
nb_funfam2go = len(funfam2go[funfam]) if funfam in funfam2go else 0
if len(content["overlap"]) > 0 and percentoverlap >= 50 and percent_cdd_match_overlap >= 50 and nb_funfam2go > 0 and nb_funfam2go < 5:
count_overlap += 1
gene3d = re.match(r'G3DSA:((\d+\.)+\d+):FF:0+([1-9]\d*)', funfam).group(1)
funfamid = re.match(r'G3DSA:((\d+\.)+\d+):FF:0+([1-9]\d*)', funfam).group(3)
funfamlink = f'=HYPERLINK("http://www.cathdb.info/version/latest/superfamily/{gene3d}/funfam/{funfamid}","{funfam}")'
text+=f"{prontolink};{cddlink};{funfamlink};{cdd_matches[cdd]};{len(content['protein'])};{len(content['overlap'])};{percent_cdd_match_overlap};{percentoverlap};{round(mean(content['overlap']),2)};{round(mean(content['cdd_doms']),2)};{round(mean(content['funfam_doms']),2)};\n"
# text+=f"{cdd},{funfam},{percent_cdd_match_overlap},{len(content['protein'])},{len(content['overlap'])},{percent_cdd_match_overlap},{percentoverlap},{round(mean(content['overlap']),2)},{round(mean(content['cdd_doms']),2)},{round(mean(content['funfam_doms']),2)},\n"
if count_overlap == 1:
f1.write(text)
else:
fm.write(text)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("config", metavar="FILE", help="configuration file")
args = parser.parse_args()
if not os.path.isfile(args.config):
parser.error(f"Cannot open '{args.config}': " f"no such file or directory")
config = ConfigParser()
config.read(args.config)
outputdir = config["dir"]["outputdir"]
outfile_one_match = os.path.join(outputdir, "list_cdd_funfam_one_match_test.csv")
outfile_multi_match = os.path.join(outputdir, "list_cdd_funfam_multi_match_test.csv")
funfam2gofile = os.path.join(outputdir, "funfam2go.csv")
# database connection values
user = config["database"]["user"]
password = config["database"]["password"]
schema = config["database"]["schema"]
find_overlaping(user, password, schema, outputdir, outfile_one_match, outfile_multi_match, funfam2gofile)