-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathS_events.py
executable file
·153 lines (130 loc) · 4.62 KB
/
S_events.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
from comparison_sol import str_path
from comparison_sol import count_solutions
from comparison_sol import remove_duplicate
#from overlap_lists import redundant
import pandas as pd
import subprocess
import shlex
import os
import time
import signal
# These functions are from short_reads_pipeline.py
def find_edges(LUs):
sr_edges = []
c = 0 # c = counter
for i in range(len(LUs)):
edge = str(c) + "_" + str(c + 1)
c = c + 2
sr_edges.append(edge)
#print("sr_edges =", sr_edges)
Dic_LUs_edges = dict(zip(LUs, sr_edges))
#print("Dic_LUs_edges =", Dic_LUs_edges)
return Dic_LUs_edges
# this invert (reverse) a node. For example, 2/1 becomes 1/2
def invert_node(node):
pos_l = node.find("/") # find the position of / in the node
first = node[:pos_l]
second = node[pos_l+1:]
#print(first, second)
inv_node = second + "/" + first
return inv_node
# remove duplicates in the list of nodes
def reduce_nodes(nodes):
new_nodes = []
for i in nodes:
if i not in new_nodes and invert_node(i) not in new_nodes:
new_nodes.append(i)
return new_nodes
# this make the smaller LU first in the node. For example, 2/1 becomes 1/2
def sort_nodes(nodes):
new_nodes = []
for n in nodes:
pos_l = n.find("/") # find the position of / in the node
first = n[:pos_l]
second = n[pos_l + 1:]
if int(first) <= int(second):
new_nodes.append(n)
else:
new_nodes.append(invert_node(n))
return new_nodes
# Find the Nodes or the LoxP junctions
def find_nodes_and_edges(path, debug=False):
# Find the edges in the reference
#path_abs = [abs(i) for i in path]
MAX = abs(max(path, key=abs)) #this find the biggest number by absolute value
ref = range(1, MAX+1)
ref_edges = find_edges(ref)
# Convert the path in the edges
path_edges = []
for LU in path:
if LU >= 0:
edge = ref_edges[LU]
else:
# if the LU is negative invert the edge
pos_ = ref_edges[-LU].find("_")
edge = ref_edges[-LU][pos_ + 1:] + "_" + ref_edges[-LU][:pos_]
path_edges.append(edge)
# Count the edges
Dic_edges_counter = {}
for LU in ref_edges:
Dic_edges_counter[ref_edges[LU]] = 0
for edge in path_edges:
try:
Dic_edges_counter[edge] = Dic_edges_counter[edge] + 1
except:
# it means that the LU is inverted (negative) and that the edge is inverted
pos_ = edge.find("_")
edge = edge[pos_ + 1:] + "_" + edge[:pos_]
Dic_edges_counter[edge] = Dic_edges_counter[edge] + 1
# Find the nodes
path_nodes = []
first = "0"
for i in path_edges:
# find position of "_"
pos_ = i.find("_")
second = i[:pos_]
node = first + "/" + second
path_nodes.append(node)
first = i[pos_+1:]
# remove first element
path_nodes.pop(0)
# remove duplicates in the list of nodes
new_path_nodes = reduce_nodes(path_nodes)
new_path_nodes = sort_nodes(new_path_nodes) # I am not sure this is essential, maybe can be removed
if debug == True:
print("ref_edges =", ref_edges)
print("path_edges =", path_edges)
print("Dic_edges_counter =", Dic_edges_counter)
print("path_nodes =", path_nodes)
return Dic_edges_counter, new_path_nodes,
###############################################
# These are new functions
# Find novel loxP junctions.
def find_novel_loxP_junctions(path, ref=[], debug=False):
if ref == []:
# Find the reference
#path_abs = [abs(i) for i in path]
MAX = abs(max(path, key=abs)) #this find the biggest number by absolute value
ref = range(1, MAX+1)
# find loxP_junctions in the reference
nodes_ref = find_nodes_and_edges(ref, debug=False)[1]
# find loxP_junctions in the SCRaMbLEd chromosome
nodes_SCRaMbLEd = find_nodes_and_edges(syn, debug=False)[1]
# subtract loxP_junctions in the SCRaMbLEd chromosome to the loxP_junctions in the reference
novel_nodes = []
for N in nodes_SCRaMbLEd:
if N not in nodes_ref:
novel_nodes.append(N)
if debug:
print(find_nodes_and_edges(ref))
print("nodes_ref =", nodes_ref)
print("nodes_SCRaMbLEd =", nodes_SCRaMbLEd)
print("novel_nodes =", nodes_SCRaMbLEd)
return novel_nodes
# Classifying recombination events
# test the code
if __name__ == "__main__":
ref = [1,2,3,4,5]
syn = [1,2,5,-3,2]
#print(find_nodes_and_edges(syn, debug=False))
print(find_novel_loxP_junctions(syn, ref=[], debug=True))