-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_prep.py
116 lines (102 loc) · 5.3 KB
/
data_prep.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import pandas as pd
import numpy as np
from datetime import datetime
import glob
import os
def _elan_time_to_timedelta(text):
    """Convert an ELAN 'HH:MM:SS.fff' time string into a pandas Timedelta."""
    parsed = datetime.strptime(text, "%H:%M:%S.%f")
    return pd.Timedelta(hours=parsed.hour, minutes=parsed.minute, seconds=parsed.second,
                        microseconds=parsed.microsecond)


def extract_elan_labels(data, subject, path):
    """Write a participant's exported ELAN annotations into their column of a sensor DataFrame.

    For every annotation, the start and end time are each matched to the
    closest entry of ``data['timestamps']``; all rows between the two matches
    (both included) are set to the annotation's label.

    Parameters
    ----------
    data : Pandas DataFrame
        Sensor data. Must contain a 'timestamps' column that ``pd.to_timedelta``
        can convert, and should already contain a column named ``subject``.
    subject : str
        Name of the column the annotations are written to (the participant identifier)
    path : str
        Path to the ELAN export: a tab-separated file without header row and
        nine fields per line (layer, unused, start, start[ms], end, end[ms],
        length, length[ms], label), with times formatted as 'HH:MM:SS.fff'

    Returns
    -------
    Pandas DataFrame
        ``data`` with its index reset to 0..n-1 and the ``subject`` column
        overwritten wherever an annotation applies. If ``subject`` is not a
        column of ``data`` a message is printed and the re-indexed data is
        returned without any labels applied.
    """
    timestamps = pd.to_timedelta(data['timestamps'])
    # After the reset, index labels equal row positions, so the positions
    # returned by argmin() below can be used directly as .loc labels.
    data = data.reset_index(drop=True)

    labels = pd.read_csv(path, delimiter='\t', index_col=0, header=None, lineterminator='\n').reset_index()
    labels.columns = ['layer', 'drop1', 'start', 'start[ms]', 'end', 'end[ms]', 'length', 'length[ms]', 'label']
    labels = labels.drop(['drop1'], axis=1)

    if subject not in data.columns:
        # Stay best-effort (no exception), but say what actually went wrong.
        print("error: column '{}' not found in data; annotations from '{}' were not applied".format(subject, path))
        return data

    for start_text, end_text, label in zip(labels['start'], labels['end'], labels['label']):
        # Position of the sample whose timestamp is closest to each boundary.
        start_index = (timestamps - _elan_time_to_timedelta(start_text)).abs().argmin()
        end_index = (timestamps - _elan_time_to_timedelta(end_text)).abs().argmin()
        # One .loc call on the frame itself: chained `data[subject].loc[...] = ...`
        # assigns to a temporary and is lost under pandas Copy-on-Write.
        data.loc[start_index:end_index, subject] = label

    return data
def _annotation_column_name(path):
    """Return the annotation file's name up to its first dot; used as the DataFrame column name."""
    # basename() first, so dots or platform-specific separators in the
    # directory part of the path cannot leak into the column name.
    return os.path.basename(path).split('.')[0]


def _sorted_annotation_files(annotations, dataset, file_type, extension):
    """Return the sorted annotation files of one dataset ('wear' or 'wetlab') with the given extension."""
    return sorted(glob.glob(os.path.join(annotations, dataset + "/*" + file_type + extension)))


def _append_mad_annotations(data, annotation_paths):
    """Add one 'null_class'-initialised column per MAD-GUI CSV file and fill in its annotated ranges."""
    for path in annotation_paths:
        column = _annotation_column_name(path)
        data[column] = "null_class"
        labels = pd.read_csv(path)
        for start, end, description in zip(labels['start'], labels['end'], labels['description']):
            # The slice drops the first two and last three characters of the
            # stored description - presumably list-style wrapping written by
            # MAD-GUI; TODO confirm against an exported file.
            # start/end are used as (inclusive) index labels of `data`.
            data.loc[start:end, column] = description[2:-3]
    return data


def _append_elan_annotations(data, annotation_paths):
    """Add one 'null_class'-initialised column per ELAN export and fill it via extract_elan_labels."""
    for path in annotation_paths:
        column = _annotation_column_name(path)
        data[column] = "null_class"
        data = extract_elan_labels(data, column, path)
    return data


def get_gui_label_timestamps(file_type, raw_data_folder, annotations):
    """Appends each participants annotations to sensor dataframe for both ELAN player and MAD-GUI annotations.

    The sensor files ``wetlab<file_type>.csv`` and ``wear<file_type>.csv`` are
    read from ``raw_data_folder``. For every annotation file found under
    ``<annotations>/wetlab`` and ``<annotations>/wear`` a new column, named
    after the file (without extension), is added to the matching DataFrame.
    Samples not covered by an annotation, as well as missing values in the
    sensor files, are set to 'null_class'.

    Parameters
    ----------
    file_type : str
        Annotation file type, i.e. which part of dataset was annotated. Used
        as file name suffix of both the sensor and the annotation files.
    raw_data_folder : str
        Folder location of sensor data
    annotations: str
        Folder containing the 'wear' and 'wetlab' annotation sub-folders;
        '*.csv' files are read as MAD-GUI annotations (columns 'start',
        'end', 'description'), '*.txt' files as ELAN exports.

    Returns
    -------
    tuple of Pandas DataFrame
        Wetlab and WEAR sensor dataframes (in that order) with appended
        annotations of each participant as separate columns and the index
        reset to 0..n-1. The ground truth column is named
        'groundtruth' + file_type.
    """
    gt = "groundtruth" + file_type

    wetlab_data = pd.read_csv(
        os.path.join(raw_data_folder, "wetlab" + file_type + ".csv"),
        names=['timestamps', 'sbj', 'acc_x', 'acc_y', 'acc_z', gt, 'groundtruth1'],
        index_col=0,
    ).drop('groundtruth1', axis=1)
    wetlab_data.fillna('null_class', inplace=True)

    wear_data = pd.read_csv(
        os.path.join(raw_data_folder, "wear" + file_type + ".csv"),
        index_col=0,
        names=['timestamps', 'right_arm_acc_x', 'right_arm_acc_y', 'right_arm_acc_z', 'labels'],
    ).rename({'labels': gt}, axis=1)
    wear_data.fillna('null_class', inplace=True)

    # MAD-GUI annotations (.csv) are applied before the ELAN ones (.txt),
    # because the ELAN step resets the DataFrame index.
    wetlab_data = _append_mad_annotations(
        wetlab_data, _sorted_annotation_files(annotations, "wetlab", file_type, ".csv"))
    wear_data = _append_mad_annotations(
        wear_data, _sorted_annotation_files(annotations, "wear", file_type, ".csv"))

    wetlab_data = _append_elan_annotations(
        wetlab_data, _sorted_annotation_files(annotations, "wetlab", file_type, ".txt"))
    wear_data = _append_elan_annotations(
        wear_data, _sorted_annotation_files(annotations, "wear", file_type, ".txt"))

    return wetlab_data.reset_index(drop=True), wear_data.reset_index(drop=True)