diff --git a/src/03.silver/dota/models/pre_match/predict.py b/src/03.silver/dota/models/pre_match/predict.py
index b4907b3..65cbf0f 100644
--- a/src/03.silver/dota/models/pre_match/predict.py
+++ b/src/03.silver/dota/models/pre_match/predict.py
@@ -90,7 +90,7 @@
 
 # COMMAND ----------
 
-radiant_prob, dire_prob = model.predict_proba(df_predict[df_predict.columns[3:]])[0]*100
+dire_prob, radiant_prob = model.predict_proba(df_predict[model.feature_names_in_])[0]*100
 
 df_dashboard = spark.createDataFrame(
     pd.DataFrame(
@@ -104,3 +104,7 @@
 )
 
 df_dashboard.display()
+
+# COMMAND ----------
+
+df_predict[model.feature_names_in_]
diff --git a/src/03.silver/dota/models/pre_match/target.sql b/src/03.silver/dota/models/pre_match/target.sql
index c73fa17..1e9faaf 100644
--- a/src/03.silver/dota/models/pre_match/target.sql
+++ b/src/03.silver/dota/models/pre_match/target.sql
@@ -7,6 +7,6 @@
 FROM silver.dota.matches
 
 WHERE dtMatchDay >= '2018-01-01'
-AND dtMatchDay < '2023-08-24'
+AND dtMatchDay < '2023-08-29'
 AND idDireTeam IS NOT NULL
 AND idRadiantTeam IS NOT NULL
\ No newline at end of file
diff --git a/src/03.silver/dota/models/pre_match/training.py b/src/03.silver/dota/models/pre_match/training.py
index 4ae30dc..1b19bfb 100644
--- a/src/03.silver/dota/models/pre_match/training.py
+++ b/src/03.silver/dota/models/pre_match/training.py
@@ -1,5 +1,6 @@
 # Databricks notebook source
 # DBTITLE 1,Imports
+
 from databricks import feature_store
 
 import sys
@@ -9,15 +10,21 @@
 import dbtools
 
 import pandas as pd
+import numpy as np
+
+pd.set_option('display.max_rows', 1000)
 
 from sklearn import model_selection
 from sklearn import ensemble
 from sklearn import pipeline
 from sklearn import tree
 from sklearn import metrics
+from sklearn import preprocessing
 
 from feature_engine import encoding
 from feature_engine import imputation
+from feature_engine import creation
+from feature_engine import selection
 
 import lightgbm as lgb
 
@@ -29,7 +36,6 @@
 # COMMAND ----------
 
 # DBTITLE 1,Lookups e Target
-
 query = dbtools.import_query("target.sql")
 
 df = spark.sql(query)
@@ -63,16 +69,14 @@
 )
 
 training_df = (training_set.load_df()
-               .filter('nrFrequency180Radiant > 10 and nrFrequency180Dire > 10')
+               .filter('avgFrequency180Radiant > 10 and avgFrequency180Dire > 10')
+               .filter('minFrequency30Radiant > 0 and minFrequency30Dire > 0')
               .toPandas())
 
 # COMMAND ----------
 
 # DBTITLE 1,Modelagem
-to_remove = set(['descTeamNameRadiant', 'descTeamTagRadiant',
-                 'descTeamTagDire','descTeamNameDire'])
-
-features = list(set(training_df.columns[4:-1]) - to_remove)
+features = training_df.columns[4:-1]
 target = 'flRadiantWin'
 
 X_train, X_test, y_train, y_test = model_selection.train_test_split(training_df[features],
@@ -82,8 +86,8 @@
 
 # COMMAND ----------
 
-print("Tamanho base de treino:", X_train.shape[0])
-print("Tamanho base de teste:", X_test.shape[0])
+print("Tamanho base de treino:", X_train.shape[0], "| Taxa resposta:", y_train.mean())
+print("Tamanho base de teste:", X_test.shape[0], "| Taxa resposta:", y_test.mean())
 
 # COMMAND ----------
 
@@ -94,17 +98,25 @@
     mlflow.sklearn.autolog()
 
     missing_0 = imputation.ArbitraryNumberImputer(arbitrary_number=0,
-                                                  variables=X_test.columns.tolist())
+                                                  variables=X_train.columns.tolist())
+
+    min_max = preprocessing.MinMaxScaler(feature_range=(1,2)).set_output(transform="pandas")
 
     model = lgb.LGBMClassifier(n_jobs=-1, random_state=42)
 
-    params = {"min_child_samples":[900,1000],
-              "learning_rate":[0.01],
-              "n_estimators":[1000],
-              "subsample":[0.9],
-              "max_depth":[15]}
+    params = {
+        "learning_rate":[0.1, 0.01],
+        "n_estimators":[500,1000],
+        "min_child_samples":[250,400,800],
+        "num_leaves": [10,20,30,50,100,200,500]
+    }
 
-    grid = model_selection.GridSearchCV(model, cv=3, param_grid=params, scoring='roc_auc', verbose=3)
+    grid = model_selection.GridSearchCV(model,
+                                        cv=3,
+                                        param_grid=params,
+                                        scoring='roc_auc',
+                                        verbose=3,
+                                        n_jobs=1)
 
     model_pipe = pipeline.Pipeline(
         [('imputer', missing_0),
@@ -152,3 +164,7 @@
 scikitplot.metrics.plot_lift_curve(y_true=y_test, y_probas=proba_test)
 plt.show()
+
+# COMMAND ----------
+
+