-
Notifications
You must be signed in to change notification settings - Fork 22
/
train_model.py
executable file
·80 lines (67 loc) · 2.63 KB
/
train_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/env python3
"""Script to train XGBoost models."""
import os
from datetime import datetime as dt
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
import joblib
from constants import Constants
from version import __version__
__author__ = "Bogdan Kirilenko, 2020."
__email__ = "bogdan.kirilenko@senckenberg.de"
__credits__ = ["Michael Hiller", "Virag Sharma", "David Jebb"]
def train_on(x, y, save_to, name=None):
    """Fit an XGBoost classifier on the given data and save it to disk.

    The classifier is cross-validated with a shuffled 5-fold stratified
    split, then fitted on the complete (x, y) set and dumped with joblib.
    The cross-validation scores are used only for the printed report.

    Args:
        x: feature table; ``x.columns`` is read for the report, so a
            DataFrame-like object is expected when ``name`` is given.
        y: target labels; values equal to 1 and 0 are counted as
            positives and negatives in the report.
        save_to: destination path handed to ``joblib.dump``.
        name: optional model label; when truthy, a training summary
            is printed.
    """
    # one set of hyperparameters serves both single- and multi-exon models
    hyperparams = {"n_estimators": 50, "max_depth": 3, "learning_rate": 0.1}
    classifier = xgb.XGBClassifier(**hyperparams)

    # cross-validate first, then fit on the whole training set
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=777)
    cv_scores = cross_val_score(classifier, x, y, cv=folds)
    classifier.fit(x, y)

    if name:  # verbose report, only for named models
        labels = list(y)
        print(f"{name} model: ")
        print(f"Training on {len(x)} samples")
        print(f"Positives: {labels.count(1)}; Negatives: {labels.count(0)}")
        print(f"Using features: {x.columns}")
        # mean and standard deviation of the fold scores, as percentages
        mean_pct = cv_scores.mean() * 100
        std_pct = cv_scores.std() * 100
        print("Accuracy: {0:.3f} {1:.3f}".format(mean_pct, std_pct))

    # persist the fitted model
    joblib.dump(classifier, save_to)
    print(f"Model saved to: {save_to}")
# Script body: define where the input and output files are.
# The training data is stored in the repository, next to this script.
t0 = dt.now()
file_location = os.path.dirname(__file__)
models_dir = "models"
train_tsv = "train.tsv"
se_model_dat = "se_model.dat"
me_model_dat = "me_model.dat"

# every file lives in <script dir>/<models_dir>/
train_set, se_model_path, me_model_path = (
    os.path.join(file_location, models_dir, file_name)
    for file_name in (train_tsv, se_model_dat, me_model_dat)
)

# read the tab-separated training table (first row is the header)
df = pd.read_csv(train_set, sep="\t", header=0)
print(f"Training dataset size: {len(df)}")

# separate the rows by the "single_exon" flag:
# 1 -> single-exon frame, 0 -> multi-exon frame
single_exon_flag = df["single_exon"]
df_se = df[single_exon_flag == 1]
df_me = df[single_exon_flag == 0]

# build X and y per model; each model uses its own feature set
X_se = df_se.copy()[Constants.SE_MODEL_FEATURES]
y_se = df_se["y"]
X_me = df_me.copy()[Constants.ME_MODEL_FEATURES]
y_me = df_me["y"]
print(f"Single exon train length: {len(X_se)}; multi exon: {len(X_me)}")

# fit both models and write them to disk
train_on(X_se, y_se, se_model_path, name="Single exon")
train_on(X_me, y_me, me_model_path, name="Multi exon")
print(f"Done in {dt.now() - t0}")