-
Notifications
You must be signed in to change notification settings - Fork 4
/
train
109 lines (87 loc) · 3.52 KB
/
train
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env python
import json
from pathlib import Path
import pandas as pd
from imblearn.combine import SMOTETomek
from pycaret.classification import create_model, save_model, setup, save_config
from utils import *
class TrainConfig:
    """Filesystem locations read and written by this training script.

    Every path is derived from ``ROOT_DIR``. Note that evaluating the class
    body also creates ``OUT_MODEL_DIR`` (including missing parents), so the
    output directory exists as soon as this module is loaded.
    """

    # Base directory that all other paths hang off.
    ROOT_DIR = Path("/opt/ml")

    # Input side: data directory, optional hyperparameter JSON, training CSV.
    INPUT_DATA_DIR = ROOT_DIR.joinpath("input/data")
    IN_PARAM_JSON = ROOT_DIR.joinpath("input/config/hyperparameters.json")
    IN_TRAINING_DIR = INPUT_DATA_DIR.joinpath("train")
    IN_TRAIN_CSV = IN_TRAINING_DIR.joinpath("train.csv")

    # Output side: where the fitted model and experiment config are saved.
    OUT_MODEL_DIR = ROOT_DIR.joinpath("model")
    OUT_MODEL_DIR.mkdir(parents=True, exist_ok=True)
def inspect_input():
    """Log the contents of the input data directory and its training subdirectory."""
    logger.info("Start inspect_input")
    # Same listing-and-logging step for both directories, in the original order.
    for directory in (TrainConfig.INPUT_DATA_DIR, TrainConfig.IN_TRAINING_DIR):
        entries = list_dir(directory)
        logger.info(f"{directory.as_posix()}: {entries}")
def train(random_seed=17, *args, **kwargs):
    """Fit a LightGBM classifier with PyCaret and save the model and config.

    Reads ``TrainConfig.IN_TRAIN_CSV``, configures a PyCaret classification
    experiment on the ``"target"`` column, creates a ``"lightgbm"`` model and
    writes both the model and the experiment config under
    ``TrainConfig.OUT_MODEL_DIR``.

    Args:
        random_seed: Seed forwarded to PyCaret as ``session_id``. It is
            coerced with ``int()`` because the script entry point may pass a
            float. Defaults to 17.
        *args: Accepted for call compatibility; unused.
        **kwargs: Accepted for call compatibility; unused.
    """
    logger.info("Start train")
    df = pd.read_csv(TrainConfig.IN_TRAIN_CSV)

    # Previously session_id was hard-coded to 17 and random_seed was ignored.
    session_id = int(random_seed)

    logger.info("Pycaret setup")
    setup(
        data=df,
        train_size=0.9,  # split training and hold-out test set
        target="target",
        session_id=session_id,
        use_gpu=False,
        data_split_shuffle=True,
        data_split_stratify=True,
        fold_strategy="stratifiedkfold",
        fold=5,
        imputation_type="simple",  # there's a small number of missing values
        numeric_imputation="mean",  # use mean to replace missing values
        fix_imbalance=True,
        fix_imbalance_method=SMOTETomek(),  # imblearn's method
        remove_outliers=False,
        outliers_threshold=0.05,  # percentage of outliers in distribution's tail
        normalize=True,
        normalize_method="zscore",  # z=(x-u)/s
        transformation=False,
        transformation_method="yeo-johnson",
        feature_interaction=False,  # create a*b features
        feature_ratio=False,  # create a/b features
        interaction_threshold=0.01,  # threshold feature importance
        polynomial_features=False,
        polynomial_degree=2,
        polynomial_threshold=0.1,
        trigonometry_features=False,  # create tan(x), sin(x), cos(x)
        feature_selection=True,  # use tree models to compute feature importance
        feature_selection_threshold=0.8,
        remove_multicollinearity=True,  # drop highly correlated features
        multicollinearity_threshold=0.9,
        pca=True,  # reduce feature space dimension
        pca_method="linear",
        pca_components=0.99,  # % components to keep
        silent=True,  # no asking enter
    )

    # NOTE(review): the log line says "finalize_model" but the call below is
    # create_model; the message is kept unchanged so log consumers still match.
    logger.info("Pycaret finalize_model")
    model1 = create_model("lightgbm")

    logger.info("Pycaret save_model")
    model_path = TrainConfig.OUT_MODEL_DIR / "final-model"
    save_model(model1, model_path.as_posix())

    logger.info("Pycaret save_config")
    config_path = TrainConfig.OUT_MODEL_DIR / "final-config"
    save_config(config_path.as_posix())
def inspect_output():
    """Log the contents of the model output directory."""
    logger.info("Start inspect_output")
    out_dir = TrainConfig.OUT_MODEL_DIR
    produced = list_dir(out_dir)
    logger.info(f"{out_dir.as_posix()}: {produced}")
if __name__ == "__main__":
    # Script entry point: load optional hyperparameters, then run the
    # inspect-input -> train -> inspect-output sequence and exit with status 0.

    # Imported here so the explicit exit below does not depend on `sys`
    # happening to be re-exported by `from utils import *`.
    import sys

    params = {}
    if TrainConfig.IN_PARAM_JSON.exists():
        with open(TrainConfig.IN_PARAM_JSON.as_posix(), "r") as tc:
            params = json.load(tc)
    logger.info(f"params: {params}")

    args = {}
    random_seed = params.get("random_seed", None)
    if random_seed is not None:
        # The seed is meant to be an integer session id. Going through float()
        # first also accepts values such as "17" or "17.0" (assumption: the
        # JSON may carry the value as a string - confirm against the producer).
        args["random_seed"] = int(float(random_seed))

    inspect_input()
    train(**args)
    inspect_output()
    sys.exit(0)