-
Notifications
You must be signed in to change notification settings - Fork 0
/
train.py
64 lines (48 loc) · 2.1 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import pandas as pd
import numpy as np
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold, GridSearchCV
from sklearn.metrics import r2_score
# Load the preprocessed dataset from disk.
df = pd.read_csv("preprocessed_data.csv")

# Restrict the frame to the model inputs plus the regression target.
feature_columns = [
    'batting_team',
    'bowling_team',
    'venue',
    'toss_winner',
    'toss_decision',
    'wickets',
]
target_column = 'runs_total'
train_data = df[feature_columns + [target_column]]

# Detach the target; what remains in train_data is the feature matrix.
y = train_data.pop(target_column)
X = train_data

# Hold out 20% of the rows as a validation set (named X_test / y_test);
# the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Build the feature transformer: one-hot encode the listed categorical
# columns and pass every remaining column through unchanged.
#
# handle_unknown='ignore' makes transform() emit an all-zero encoding for a
# category that was not present when the encoder was fitted (for example a
# value that only occurs in the held-out split or at inference time) instead
# of raising ValueError.
#
# NOTE(review): 'toss_winner' is selected as a feature earlier in this script
# but is not listed here, so it reaches the model via remainder='passthrough'.
# Confirm it is numeric in preprocessed_data.csv; if it holds team names it
# should be added to the encoded columns.
ct = ColumnTransformer(
    [
        (
            'ohe',
            OneHotEncoder(handle_unknown='ignore'),
            ['batting_team', 'bowling_team', 'venue', 'toss_decision'],
        ),
    ],
    remainder='passthrough',
)

# Fit the transformer on the training split only, then encode that split.
X_train = ct.fit_transform(X_train)
# Work on copies of the (already encoded) training features and the target.
X_train_copy = X_train.copy()
y_train_copy = y_train.copy()

# Single-step pipeline wrapping an SGD-based linear regressor.
sgd_reg_pipeline = Pipeline(steps=[
    ('reg', SGDRegressor(random_state=42)),
])
sgd_reg_pipeline.fit(X_train_copy, y_train_copy)

# Encode the held-out rows with the fitted transformer and score them.
X_test_copy = ct.transform(X_test.copy())
y_pred = sgd_reg_pipeline.predict(X_test_copy)
holdout_r2 = r2_score(y_test, y_pred)
print("R2 Score:", holdout_r2)

# 5-fold cross-validated R2 on the training split.
scores = cross_val_score(
    sgd_reg_pipeline,
    X_train_copy,
    y_train_copy,
    cv=5,
    scoring="r2",
)
print("Using cross validation:")
for label, value in (
    ("Minimum Score:", np.min(scores)),
    ("Maximum Score:", np.max(scores)),
    ("Average Score:", np.average(scores)),
):
    print(label, value)
# Persist the fitted regressor pipeline and the fitted column transformer so
# both can be reloaded together later (the model expects encoded input).
for artifact, path in (
    (sgd_reg_pipeline, "model.joblib"),
    (ct, "transformer.joblib"),
):
    joblib.dump(artifact, path)