forked from datashinobi/Sequence-Graph-transform
-
Notifications
You must be signed in to change notification settings - Fork 0
/
sgt.py
83 lines (48 loc) · 1.8 KB
/
sgt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import numpy as np
def getpositions(S, V):
    '''
    Locate, for every value of V that occurs in S, where it occurs in S.

    S: the sequence to search; expected to be a NumPy array so that
       ``S == value`` compares element by element
    V: iterable of candidate values (the value space)
    return: list of ``(value, np.where(S == value))`` tuples, in the order
            the values appear in V; values of V absent from S are left out.
            The second item is the tuple of index arrays returned by np.where,
            e.g. for S = array([1, 2, 1, 3]) and V = array([1, 2, 4]):
            [(1, (array([0, 2]),)), (2, (array([1]),))]
    '''
    found = []
    for value in V:
        # Values that never show up in the sequence get no entry at all.
        if value not in S:
            continue
        found.append((value, np.where(S == value)))
    return found
def sgt(S, V, ls, k =1):
    '''
    Extract Sequence Graph Transform features (algorithm 2).

    For every ordered pair of symbols (u, v) taken from V, every occurrence
    of u in S is paired with every later occurrence of v. Entry [i, j] of the
    result (i, j being the indices of u, v within V) is
        (sum(exp(-k * gap)) / count) ** (1 / k)
    where gap is the distance between the two positions and count is the
    number of such pairs (divided by the sequence length when ls is set).

    S: sequence, 1-D array-like of symbols
    V: 1-D array-like with the domain of all values; its order fixes the
       row/column order of the result
    ls: is length sensitive; when truthy the pair counts are divided by the
        number of elements of S that belong to V
    k: hyperparameter defaults to 1 for supervised learning typically
       selected from {1, 5, 10}
    return: sgt matrix of shape (len(V), len(V)); rows and columns of symbols
            that do not occur in S, and entries with no (u, v) pair, are 0
    '''
    S = np.asarray(S)
    V = np.asarray(V)
    size = V.shape[0]

    # Positions of each symbol of V inside S, computed once per symbol
    # instead of being searched for again on every loop iteration.
    occurrences = [np.flatnonzero(S == symbol) for symbol in V]

    W0 = np.zeros((size, size))
    Wk = np.zeros((size, size))
    length = 0
    for i, pos_u in enumerate(occurrences):
        if pos_u.size == 0:
            # Symbol absent from S: move to the next element. (A `break`
            # here would wrongly drop every symbol that follows it in V.)
            continue
        length += pos_u.size
        for j, pos_v in enumerate(occurrences):
            if pos_v.size == 0:
                continue
            # All (position of u, position of v) combinations; keep only
            # those where v comes strictly after u.
            gaps = pos_v[np.newaxis, :] - pos_u[:, np.newaxis]
            gaps = gaps[gaps > 0]
            W0[i, j] = gaps.size
            Wk[i, j] = np.sum(np.exp(-k * gaps))

    # Skip the normalisation when nothing matched, otherwise 0/0 would turn
    # the whole matrix into NaN.
    if ls and length:
        W0 /= length

    # Entries without any pair have Wk == 0 as well; leave them at 0 rather
    # than dividing by zero.
    ratio = np.divide(Wk, W0, out=np.zeros_like(Wk), where=W0 != 0)
    return np.power(ratio, 1.0 / k)