Skip to content

Commit

Permalink
Merge pull request #135 from george0st/change
Browse files Browse the repository at this point in the history
Switch to standard CSV sep and decimal
  • Loading branch information
george0st authored Apr 1, 2024
2 parents 8c864e0 + ffe7ce7 commit 185de1a
Show file tree
Hide file tree
Showing 21 changed files with 639 additions and 593 deletions.
6 changes: 4 additions & 2 deletions 01-model/model.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
{
"name": "Financial services",
"name": "Model setting",
"description": "The machine learning meta-model with synthetic data (useful for MLOps/feature store), part of the quality gate concept.",
"kind": "model",
"spec": {
"version": "0.1.9"
"version": "0.1.9",
"CSV_SEPARATOR": ",",
"CSV_DECIMAL": "."
}
}
Binary file modified 02-data/01-size-100/01-basic-party.csv.gz
Binary file not shown.
Binary file modified 02-data/01-size-100/02-basic-contact.csv.gz
Binary file not shown.
Binary file modified 02-data/01-size-100/03-basic-relation.csv.gz
Binary file not shown.
Binary file modified 02-data/01-size-100/04-basic-account.csv.gz
Binary file not shown.
Binary file modified 02-data/01-size-100/05-basic-transaction.csv.gz
Binary file not shown.
Binary file modified 02-data/01-size-100/06-basic-event.csv.gz
Binary file not shown.
Binary file modified 02-data/01-size-100/07-basic-communication.csv.gz
Binary file not shown.
Binary file modified 02-data/02-size-1K/01-basic-party.csv.gz
Binary file not shown.
Binary file modified 02-data/02-size-1K/02-basic-contact.csv.gz
Binary file not shown.
Binary file modified 02-data/02-size-1K/03-basic-relation.csv.gz
Binary file not shown.
Binary file modified 02-data/02-size-1K/04-basic-account.csv.gz
Binary file not shown.
Binary file modified 02-data/02-size-1K/05-basic-transaction.csv.gz
Binary file not shown.
Binary file modified 02-data/02-size-1K/06-basic-event.csv.gz
Binary file not shown.
Binary file modified 02-data/02-size-1K/07-basic-communication.csv.gz
Binary file not shown.
586 changes: 285 additions & 301 deletions 03-test/01-size-100.json

Large diffs are not rendered by default.

582 changes: 299 additions & 283 deletions 03-test/02-size-1k.json

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions docs/structure.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,10 @@

## Generated data
- [Basic rules for generated data](./rules.md), cardinality, constraints, etc.
- Data is in format CSV/GZ format (with expected future parquet support)
- A few details: header=True, encoding="utf-8", sep=";", decimal=","
- Data is in CSV/GZ format (future parquet support is expected)
- A few details:
- header=True, encoding="utf-8", sep=",", decimal="."
- NOTE: the 'sep' and 'decimal' values are configured in the json file '01-model/model.json'



16 changes: 11 additions & 5 deletions generator/base_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import numpy as np
import os
from generator.base import Base
from generator.setup import Setup


class BaseData(Base):
Expand Down Expand Up @@ -39,6 +40,8 @@ def save(self, path, append: bool, dir: str, compress: bool):
if not os.path.exists(path):
os.makedirs(path)

setup=Setup()

# print(f"Creating: {'APPEND' if append else 'WRITE'}, name: '{self.name}', dir: '{dir}'...")
df=pd.DataFrame(self.model)
if compress:
Expand All @@ -48,11 +51,14 @@ def save(self, path, append: bool, dir: str, compress: bool):
index=False,
mode="a" if append else "w",
encoding='utf-8',
sep=";",
decimal=",",
sep=setup.csv_separator,
decimal=setup.csv_decimal,
compression=compression_opts)

# df.to_parquet(os.path.join(path,f"{self.name}.parquet"),
# compression: CompressionOptions = "infer",


# df.to_parquet(os.path.join(path,f"{self.name}.parquet"),
# engine="pyarrow",
# compression='gzip',
# # header=False if append else True,
Expand All @@ -65,8 +71,8 @@ def save(self, path, append: bool, dir: str, compress: bool):
index=False,
mode="a" if append else "w",
encoding='utf-8',
sep=";",
decimal=",")
sep=setup.csv_separator,
decimal=setup.csv_decimal)

# df.to_parquet(os.path.join(path,f"{self.name}.parquet"),
# engine='fastparquet',
Expand Down
31 changes: 31 additions & 0 deletions generator/setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import json
import math
import uuid
import os
import numpy as np

class Singleton(type):
    """Metaclass that hands out one shared instance per class.

    The first call to a class using this metaclass constructs the instance
    with the supplied arguments; every later call returns that same object
    and its arguments are not used.
    """

    # Maps each class using this metaclass to its single instance.
    _instances = {}

    def __call__(cls, *args, **kwargs):
        """Return the cached instance of *cls*, constructing it on first use."""
        registry = cls._instances
        if cls in registry:
            return registry[cls]

        # First request for this class: build it once and remember it.
        registry[cls] = super().__call__(*args, **kwargs)
        return registry[cls]


class Setup(metaclass=Singleton):
    """Shared access to the settings stored in ``<model_path>/model.json``.

    The class uses the ``Singleton`` metaclass, so the JSON file is read only
    when the first instance is created; later ``Setup(...)`` calls return that
    same instance and their arguments are not used.

    Args:
        model_path: Directory that contains ``model.json``. Defaults to
            ``"01-model"`` so that a bare ``Setup()`` call works even when it
            happens to be the first one in the process.

    Raises:
        OSError: If ``model.json`` cannot be opened.
        json.JSONDecodeError: If ``model.json`` is not valid JSON.
    """

    # Standard CSV dialect, used when model.json does not define the keys.
    _DEFAULT_CSV_SEPARATOR = ","
    _DEFAULT_CSV_DECIMAL = "."

    def __init__(self, model_path="01-model"):
        self._model_setting = {}

        # Explicit encoding keeps JSON decoding independent of the OS locale.
        settings_file = os.path.join(model_path, "model.json")
        with open(settings_file, "r", encoding="utf-8") as json_file:
            setting = json.load(json_file)

        # Tolerate a model file without the "spec" section.
        self._model_setting = setting.get("spec", {})

    @property
    def csv_separator(self):
        """Field separator for CSV output (``CSV_SEPARATOR`` in the spec)."""
        return self._model_setting.get(
            "CSV_SEPARATOR", self._DEFAULT_CSV_SEPARATOR
        )

    @property
    def csv_decimal(self):
        """Decimal mark for CSV output (``CSV_DECIMAL`` in the spec)."""
        return self._model_setting.get(
            "CSV_DECIMAL", self._DEFAULT_CSV_DECIMAL
        )
5 changes: 5 additions & 0 deletions generator/synthetic_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,15 @@
from generator.base_data import BaseData
from generator.base_test import BaseTest
from generator.data_hint import DataHint
from generator.setup import Setup

class SyntheticData:

def __init__(self, model_path="01-model", output_path="02-data", test_path="03-test"):

# init setup singleton
Setup(model_path)

self._model_path=model_path
self._output_path=output_path
self._test_path=test_path
Expand Down

0 comments on commit 185de1a

Please sign in to comment.