-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathprot-update-tcdb.py
100 lines (92 loc) · 2.89 KB
/
prot-update-tcdb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
from sys import *
import argparse
import csv, os, json
# --- Command-line interface and run configuration ---------------------------
parser = argparse.ArgumentParser()
parser.add_argument("-s", "--output_qs", help="output to QS",
                    action="store_true")
parser.add_argument("-q", "--query", help="perform SPARQL query",
                    action="store_true")
args = parser.parse_args()

# QS: when set, matched entries are printed as "QS" lines instead of being
# submitted (presumably QuickStatements syntax -- confirm).
QS = args.output_qs
# dontquery: when True, the existing "<script>.json" file is reused instead of
# running the query again.
dontquery = not args.query

# Base name of this script without its extension; the companion files
# "<script>.rq" and "<script>.json" are derived from it.  splitext() is used
# so that a name without a ".py" suffix is not truncated by three characters.
script = os.path.splitext(os.path.basename(argv[0]))[0]

# Date written as the P813 value in the reference of every submitted claim.
ndate = '2020-04-29'
# The same date in the timestamp form that is compared against the 'refdate'
# values of the query results and used in the QS output.
newd = ndate + 'T00:00:00Z'
# Maximum number of matched TCDB entries handled in a single run.
LIMIT = 100000
# Optionally refresh the cached query results: feed "<script>.rq" to the
# `wd sparql` command and redirect its output into "<script>.json".
if not dontquery:
    print('performing query...')
    ret = os.popen('wd sparql {}.rq >{}.json'.format(script, script))
    # close() waits for the command to finish; it returns None on success and
    # the exit status otherwise.
    status = ret.close()
    if status is not None:
        # An explicit exception replaces the former bare `raise`, which only
        # produced "No active exception to reraise" without any context.
        raise RuntimeError(
            "'wd sparql {}.rq' failed with exit status {}".format(
                script, status))
# Load the query results from "<script>.json" and index them:
#   unips: UniProt ID ('u')  -> item ('p')
#   tcdbs: item ('p')        -> list of (t, stmt, refdate) tuples, one per
#                               result row that carries a non-empty 't' value
#   dups:  UniProt IDs that were seen with more than one distinct item
with open('{}.json'.format(script)) as file:
    jol = json.load(file)

unips = {}
tcdbs = {}
dups = set()
for d in jol:
    uid = d.get('u')
    it = d.get('p')
    git = unips.get(uid)
    if git is None:
        # First time this UniProt ID is seen: remember its item.
        unips[uid] = it
    elif git != it:
        # Same UniProt ID on a different item: mark it as ambiguous and do
        # not record anything else from this row.
        dups.add(uid)
        continue
    t = d.get('t')
    if not t:
        # Row has no existing 't' value for this item.
        continue
    tcdbs.setdefault(it, []).append((t, d.get('stmt'), d.get('refdate')))
# Remove every UniProt ID that was flagged as mapping to more than one item,
# then report how many were removed.
for dup_id in dups:
    del unips[dup_id]
print('ignoring {} items with duplicate UniProt IDs'.format(len(dups)))
def _submit_claim(claim):
    """Write *claim* as JSON to 't.json' and submit it with ``wd ee t.json``.

    The JSON payload and the command's output are echoed to stdout, and
    'ERROR' is printed when the command reports a non-zero exit status.
    """
    payload = json.dumps(claim)
    with open('t.json', 'w') as f:
        f.write(payload)
    print(payload, flush=True)
    ret = os.popen('wd ee t.json')
    print(ret.read())
    # close() returns None only when the command exited successfully.
    if ret.close() is not None:
        print('ERROR')


# Walk the '>' header lines of 'tcdb.txt'.  The third '|'-separated field is
# looked up as a UniProt ID in `unips`; the fourth field, up to its first
# space, is the value written to P7260.
# NOTE(review): presumably headers look like ">db|name|<UniProt ID>|<TC id>
# description" -- confirm against the actual file.
ctr = 0
with open('tcdb.txt') as tcdb_file:
    for line in tcdb_file:
        l = line.rstrip()
        # startswith() also copes with blank lines, where l[0] would raise.
        if not l.startswith('>'):
            continue
        ll = l.split('|')
        if len(ll) < 4:
            print('skipping malformed header: {}'.format(l), file=stderr)
            continue
        u = ll[2].rstrip()
        # split() keeps the whole field when it contains no space; slicing at
        # find(' ') == -1 used to cut off the last character.
        t = ll[3].split(' ', 1)[0]
        qit = unips.get(u)
        if qit is None:
            # UniProt ID unknown (or ambiguous and therefore removed).
            continue
        ctr += 1
        if ctr > LIMIT:
            break
        if QS:
            print('{}|P7260|"{}"|S248|Q142667|S813|+{}/11'.format(qit, t, newd))
            continue
        oldsets = tcdbs.get(qit)
        if oldsets is None:
            # The item has no P7260 statement yet: create one.
            _submit_claim(
                {"id": qit, "claims": {"P7260": [{"value": t,
                    "references": {"P248": "Q142667", "P813": ndate}}]}})
            continue
        for origt, stmt, refd in oldsets:
            if refd == newd:
                # Reference already carries the current date; nothing to do.
                continue
            # Rewrite the existing statement (addressed by its id) with the
            # current value and a fresh reference.
            _submit_claim(
                {"id": qit, "claims": {"P7260": [{"id": stmt, "value": t,
                    "references": {"P248": "Q142667", "P813": ndate}}]}})