-
Notifications
You must be signed in to change notification settings - Fork 5
/
clean_db.py
96 lines (87 loc) · 3.19 KB
/
clean_db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# -*- coding: utf-8 -*-
import json
import numpy as np
import pandas as pd
def db_to_dataframe(filename):
    """
    Load the json file obtained through create_db into a pandas dataframe.

    The file is expected to hold a mapping of record names to records
    (each record being a mapping of field name to value). Every record
    becomes one row; the record names are discarded and replaced by a
    positional 0..n-1 index.

    Parameters
    ----------
    filename : string
        The full path to the json file, complete with extension.

    Returns
    -------
    d : DataFrame
        The dataframe containing the elements stored in the file, one row
        per top-level entry, indexed 0..n-1.

    Example
    -------
    >>> import json
    >>> import os
    >>> d = {'first': {'a': 1, 'b': 2}, 'second': {'a': 7, 'b': 14}}
    >>> with open('a.json', 'w') as file: json.dump(d, file)
    >>> f = db_to_dataframe('a.json')
    >>> f
       a   b
    0  1   2
    1  7  14
    >>> os.remove('a.json')
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(filename) as handle:
        records = json.load(handle)
    # Top-level keys arrive as columns; transpose so each record is a row.
    frame = pd.DataFrame(records).T
    # Replace the record-name labels with a plain positional index.
    return frame.reset_index(drop=True)
def _columns_present(frame, labels):
    """Return the labels that are columns of frame, de-duplicated, in the order given."""
    return [label for label in dict.fromkeys(labels) if label in frame.columns]


def clean_db(db, drop_columns=('rhythm',),
             drop_na_in=('weight', 'tpr', 'madRR', 'medianRR', 'opt_delay',
                         'afib', 'age', 'sex'),
             drop_zeroes_in=('weight', 'age'),
             quality_threshold=None,
             reset_index=True
             ):
    """
    Clean the 'cardio' dataframe by removing the 'bad' elements.

    The steps run in this order: quality filter, column removal, NaN row
    removal, non-positive row removal, index reset. Labels listed in
    drop_columns, drop_na_in or drop_zeroes_in that are not columns of the
    dataframe are silently ignored. The input dataframe is not modified.

    Parameters
    ----------
    db : pandas DataFrame
        The dataframe containing the cardio data extracted through
        create_db.
    drop_columns : sequence of strings, optional (default: ('rhythm',))
        The column labels that must be dropped from the dataframe.
        If None, it is skipped.
    drop_na_in : sequence of strings, optional (default: ('weight', 'tpr',
        'madRR', 'medianRR', 'opt_delay', 'afib', 'age', 'sex'))
        The subset of labels for which rows holding NaN values are dropped.
        If None, it is skipped.
    drop_zeroes_in : sequence of strings, optional (default: ('weight', 'age'))
        The subset of labels for which zero values bear no meaning. Only
        rows whose value, converted to float, is strictly greater than 0
        are kept, so negative and NaN values in these columns are dropped
        as well. If None, it is skipped.
    quality_threshold : float, optional (default: None)
        Only rows whose 'quality' is strictly below this threshold are
        kept. BEWARE: if it is not None, the label 'quality' must be
        present!
    reset_index : bool (default: True)
        If true, reset the index to 0..n-1 so as to be able to call the
        elements by row.

    Returns
    -------
    newdb : pandas DataFrame
        The cleaned dataframe.

    Raises
    ------
    KeyError
        If quality_threshold is given but db has no 'quality' column.

    Example
    -------
    >>> import numpy as np
    >>> import pandas as pd
    >>> d = {}
    >>> names = ['first', 'second', 'third', 'fourth']
    >>> v1, v2 = [0, 0, 56, 66], [20, np.nan, 40, 56]
    >>> for name, w, a in zip(names, v1, v2):
    ...     d[name] = {'rhythm': np.nan, 'weight': w, 'age': a}
    >>> d = pd.DataFrame(d).T
    >>> clean_db(d)
       weight   age
    0    56.0  40.0
    1    66.0  56.0
    """
    newdb = db

    if quality_threshold is not None:
        newdb = newdb[newdb['quality'] < quality_threshold]

    if drop_columns is not None:
        to_drop = _columns_present(newdb, drop_columns)
        if to_drop:
            newdb = newdb.drop(columns=to_drop)

    if drop_na_in is not None:
        na_columns = _columns_present(newdb, drop_na_in)
        if na_columns:
            newdb = newdb.dropna(subset=na_columns)

    if drop_zeroes_in is not None:
        # Filter one column at a time so the float conversion only ever
        # sees rows that survived the previous filters.
        for column in _columns_present(newdb, drop_zeroes_in):
            newdb = newdb[newdb[column].astype(float) > 0]

    if reset_index:
        newdb = newdb.reset_index(drop=True)
    return newdb