-
Notifications
You must be signed in to change notification settings - Fork 0
/
db_join_interval.py
104 lines (90 loc) · 3.08 KB
/
db_join_interval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!python
# join hole intervals
# check manual for usage and important details
# v1.0 12/2020 paulo.ernesto
'''
usage: $0 left_db*csv,xlsx left_hid:left_db left_from:left_db left_to:left_db right_db*csv,xlsx right_hid:right_db right_from:right_db right_to:right_db output*csv
'''
import sys, os.path
import numpy as np
import pandas as pd
# import modules from a pyz (zip) file with same name as scripts
sys.path.append(os.path.splitext(sys.argv[0])[0] + '.pyz')
from _gui import usage_gui, pd_load_dataframe, pd_save_dataframe
from db_create_from_to import pd_create_from_to
import portion
import time
def interval_consolidate(dl):
''' remove redundant intervals '''
cl = dl.copy()
for d0 in dl:
for d1 in dl:
if d0 != d1 and d1 in cl and d0.contains(d1):
cl.remove(d1)
return cl
def pd_join_interval(dfs, v_lut):
hl = dict()
vl = []
for i in range(2):
if not v_lut[i]['from']:
pd_create_from_to(dfs[i], v_lut[i]['hid'], v_lut[i]['to'], True)
v_lut[i]['from'] = 'from'
dfs[i].set_index(v_lut[i]['hid'], False, False, True)
dfs[i].set_index(pd.RangeIndex(0, len(dfs[i])), False, True, True)
# first step: construct a comprehensive list of intervals
# where any of the inputs touches
for row, df in dfs[i].iterrows():
hid = df[v_lut[i]['hid']]
if hid not in hl:
hl[hid] = set()
d = portion.closedopen(df[v_lut[i]['from']], df[v_lut[i]['to']])
if not d.empty:
hl[hid].add(d)
# add variables one by one since we have to preserve order anyway
for c in dfs[i].columns:
if c not in vl:
vl.append(c)
t = time.time()
odf = pd.DataFrame(columns=vl)
for hid, dl in hl.items():
print("#", hid)
c0 = min(dl).lower
c1 = None
while True:
r = pd.Series(None, vl, 'object', (hid,c0))
cs = portion.singleton(c0)
for i in range(len(dfs)):
if hid in dfs[i].index:
for row_i, row_d in dfs[i].loc[hid].iterrows():
d = portion.closedopen(row_d[v_lut[i]['from']], row_d[v_lut[i]['to']])
if d.overlaps(cs):
r.update(row_d)
if c1 is None or c1 > d.upper:
c1 = d.upper
if c1 is None:
break
r['from'] = c0
r['to'] = c1
c0 = c1
c1 = None
odf = odf.append(r)
print("pd_join_interval profile time", time.time() - t)
return odf
def db_join_interval(left_db, left_hid, left_from, left_to, right_db, right_hid, right_from, right_to, output):
v_lut = [{},{}]
v_lut[0]['hid'] = left_hid or 'DHID'
v_lut[1]['hid'] = right_hid or 'DHID'
v_lut[0]['from'] = left_from
v_lut[1]['from'] = right_from
v_lut[0]['to'] = left_to
v_lut[1]['to'] = right_to
dfs = [pd_load_dataframe(left_db), pd_load_dataframe(right_db)]
odf = pd_join_interval(dfs, v_lut)
if output:
odf.reset_index(drop=True, inplace=True)
pd_save_dataframe(odf, output)
else:
print(odf.to_string())
main = db_join_interval
if __name__=="__main__":
usage_gui(__doc__)