-
Notifications
You must be signed in to change notification settings - Fork 4
/
fish_helper.py
66 lines (49 loc) · 2.12 KB
/
fish_helper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import gzip
import io
import json
import os
from pathlib import Path

import numpy as np
import pandas as pd
class DatasetContainer:
    """Container bundling a pandas dataframe with optional metadata.

    The dataframe is stored in ``data`` and the metadata dictionary (which
    might be needed in certain use cases, such as validation) in ``meta``.

    The container is initialized with either a pandas dataset (and metadata)
    or a path to a file previously written by :meth:`save`. If both a dataset
    and a path are provided, the path is ignored.
    """

    def __init__(self, dataset: pd.DataFrame = None, metadata: dict = None, path_of_dataset: str = None):
        """Create the container from an in-memory dataset or from a file.

        Args:
            dataset: Dataframe to store. Takes precedence over
                ``path_of_dataset`` when both are given.
            metadata: Metadata to store next to the dataframe. Not used when
                the container is populated from ``path_of_dataset``, because
                ``load`` reads the metadata from the file.
            path_of_dataset: Path of a file written by :meth:`save`. Only
                used when ``dataset`` is ``None``.
        """
        super().__init__()
        if dataset is None and path_of_dataset is not None:
            # load() assigns both self.data and self.meta from the file.
            self.load(path_of_dataset)
        else:
            self.data: pd.DataFrame = dataset
            self.meta: dict = metadata

    def save(self, path: str):
        """Write the dataframe and metadata to ``path`` as gzip-compressed JSON.

        Missing parent directories of ``path`` are created. The file holds a
        JSON object with two string fields: ``'data'`` (the dataframe
        serialized with ``DataFrame.to_json``) and ``'meta'`` (the metadata
        serialized with ``json.dumps``).

        Args:
            path: Destination file path.

        Raises:
            ValueError: If ``path`` is ``None``.
        """
        if path is None:
            raise ValueError('No path was provided for the save function.')

        path = Path(path).resolve()
        path.parent.mkdir(parents=True, exist_ok=True)

        container_dict = {
            'data': self.data.to_json(),
            'meta': json.dumps(self.meta),
        }

        # Save the file in zipped (gzip) format.
        with gzip.open(path, 'wb') as fout:
            fout.write(json.dumps(container_dict).encode('utf-8'))

    def load(self, path: str):
        """Populate the container from a file written by :meth:`save`.

        If the stored metadata contains a ``"dtypes"`` entry, it is passed to
        ``pd.read_json`` as ``dtype``; otherwise pandas infers the dtypes.
        Columns whose first value is a list are converted element-wise to
        numpy arrays.

        Args:
            path: Path of the gzip-compressed JSON file to read.

        Returns:
            The container itself, so calls can be chained.

        Raises:
            ValueError: If ``path`` is ``None``.
        """
        if path is None:
            raise ValueError('No path was provided for the load function.')

        with gzip.open(path, 'rb') as fin:
            container_json = json.loads(fin.read().decode('utf-8'))

        self.meta = json.loads(container_json['meta'])
        if self.meta is not None and "dtypes" in self.meta:
            dtype = self.meta["dtypes"]
        else:
            dtype = True

        # Wrap the payload in a file-like object: passing a literal JSON
        # string to read_json is deprecated in recent pandas versions.
        self.data = pd.read_json(io.StringIO(container_json['data']), dtype=dtype)

        # A dataframe without rows has no first element to inspect.
        if self.data.empty:
            return self

        for column in self.data:
            # JSON has no array type, so array-valued cells come back as
            # Python lists; the first row decides whether to convert a column.
            if isinstance(self.data[column].iloc[0], list):
                self.data[column] = self.data[column].apply(lambda x: np.array(x))
        return self