Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mlj table #262

Merged
merged 11 commits into from
Oct 27, 2023
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "EvoTrees"
uuid = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5"
authors = ["jeremiedb <jeremie.db@evovest.com>"]
version = "0.16.4"
version = "0.16.5"

[deps]
BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0"
Expand Down
16 changes: 8 additions & 8 deletions benchmarks/regressor-df.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ using EvoTrees
using DataFrames
using BenchmarkTools
using Random: seed!
import CUDA
# import CUDA

nobs = Int(1e6)
num_feat = Int(100)
Expand Down Expand Up @@ -45,13 +45,13 @@ params_xgb = Dict(
:max_bin => 64,
)

# dtrain = DMatrix(x_train, y_train)
# watchlist = Dict("train" => DMatrix(x_train, y_train))
# @time m_xgb = xgboost(dtrain; watchlist, nthread=nthread, verbosity=0, eval_metric=metric_xgb, params_xgb...);
# # @btime m_xgb = xgboost($dtrain; watchlist, nthread=nthread, verbosity=0, eval_metric=metric_xgb, params_xgb...);
# @info "xgboost predict:"
# @time pred_xgb = XGBoost.predict(m_xgb, x_train);
# # @btime XGBoost.predict($m_xgb, $x_train);
dtrain = DMatrix(x_train, y_train)
watchlist = Dict("train" => DMatrix(x_train, y_train))
@time m_xgb = xgboost(dtrain; watchlist, nthread=nthread, verbosity=0, eval_metric=metric_xgb, params_xgb...);
# @btime m_xgb = xgboost($dtrain; watchlist, nthread=nthread, verbosity=0, eval_metric=metric_xgb, params_xgb...);
@info "xgboost predict:"
@time pred_xgb = XGBoost.predict(m_xgb, x_train);
# @btime XGBoost.predict($m_xgb, $x_train);

@info "EvoTrees"
dtrain = DataFrame(x_train, :auto)
Expand Down
116 changes: 116 additions & 0 deletions benchmarks/regressor-mlj.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
# Benchmark setup for EvoTrees via the MLJ / Tables interface.
# Generates a large synthetic regression problem and selects the loss/metric pair.
using Revise
using Statistics
using StatsBase: sample
using EvoTrees
using DataFrames
using BenchmarkTools
using Random: seed!
import CUDA
using MLJ

# Problem size: 2M observations x 100 features (Float64).
nobs = Int(2e6)
num_feat = Int(100)
nrounds = 200
T = Float64
nthread = Base.Threads.nthreads()
@info "testing with: $nobs observations | $num_feat features. nthread: $nthread"
seed!(123)
# Synthetic uniform features and target — content is irrelevant for timing.
x_train = rand(T, nobs, num_feat)
y_train = rand(T, size(x_train, 1))

@info nthread
# Pick the EvoTrees loss symbol and the evaluation metric from a string switch.
loss = "mse"
if loss == "mse"
loss_evo = :mse
metric_evo = :mae
elseif loss == "logloss"
loss_evo = :logloss
metric_evo = :logloss
end

@info "EvoTrees"
# Wrap the feature matrix in a DataFrame for the Tables.jl-based API.
dtrain = DataFrame(x_train, :auto)
# The target must be a column of the training table, referenced by name:
# the later calls (EvoTrees.init / fit_evotree) pass `target_name`, which
# would otherwise be undefined (UndefVarError). Keep these two lines active.
dtrain.y .= y_train
target_name = "y"
verbosity = 0

# Regressor hyper-parameters shared by the CPU and GPU runs below.
params_evo = EvoTreeRegressor(
    loss=loss_evo,
    nrounds=nrounds,
    alpha=0.5,
    lambda=0.0,
    gamma=0.0,
    eta=0.05,
    max_depth=6,
    min_weight=1.0,
    rowsample=0.5,
    colsample=0.5,
    nbins=64,
    rng=123,
)

# --- CPU benchmark: MLJ IteratedModel wrapper, direct init, and fit_evotree ---
@info "EvoTrees CPU"
device = "cpu"

# Wrap the regressor in MLJ's IteratedModel: trains in steps of 5 rounds,
# stopping on Patience(200) or after NumberLimit(40) iteration checks,
# scoring with rmse on a 50/50 holdout. retrain=false keeps the holdout model.
iterated_model = IteratedModel(
model=params_evo,
resampling=Holdout(; fraction_train=0.5),
measures=rmse,
controls=[Step(5),
Patience(200),
NumberLimit(40)],
retrain=false)

mach = machine(iterated_model, dtrain, y_train)
@time fit!(mach);

# Time the internal model/cache initialization on its own (table path).
@info "init"
@time m_df, cache_df = EvoTrees.init(params_evo, dtrain; target_name);

# @info "train - no eval"
# @time m_evo_df = fit_evotree(params_evo, dtrain; target_name, device, verbosity, print_every_n=100);
# @time m_evo_df = fit_evotree(params_evo, dtrain; target_name, device, verbosity, print_every_n=100);

# Train with an eval set; run twice so the second @time excludes compilation.
@info "train - eval"
@time m_evo = fit_evotree(params_evo, dtrain; target_name, deval=dtrain, metric=metric_evo, device, verbosity, print_every_n=100);
@time m_evo = fit_evotree(params_evo, dtrain; target_name, deval=dtrain, metric=metric_evo, device, verbosity, print_every_n=100);
# @time m_evo = fit_evotree(params_evo, dtrain; target_name, device);
# @btime fit_evotree($params_evo, $dtrain; target_name, deval=dtrain, metric=metric_evo, device, verbosity, print_every_n=100);
@info "predict"
@time pred_evo = m_evo(dtrain);
@btime m_evo($dtrain);

# --- GPU benchmark: same training/prediction calls with device="gpu" ---
# Requires a working CUDA setup (see `import CUDA` above).
@info "EvoTrees GPU"
device = "gpu"
@info "train"
# Run twice so the second @time excludes compilation overhead.
@time m_evo = fit_evotree(params_evo, dtrain; target_name, deval=dtrain, metric=metric_evo, device, verbosity, print_every_n=100);
@time m_evo = fit_evotree(params_evo, dtrain; target_name, deval=dtrain, metric=metric_evo, device, verbosity, print_every_n=100);
# @btime m_evo = fit_evotree($params_evo, $dtrain; target_name, device);
# @btime fit_evotree($params_evo, $dtrain; target_name, deval=dtrain, metric=metric_evo, device, verbosity, print_every_n=100);
@info "predict"
@time pred_evo = m_evo(dtrain; device);
@btime m_evo($dtrain; device);


using MLJBase
using MLJModels
using Tables

EvoTreeBooster = @load EvoTreeRegressor
booster = EvoTreeBooster()

X, y = make_regression(1000, 5)

# this works:
mach = machine(booster, X, y) |> fit!

# this doesn't
X, y = make_regression(1_000_000, 100);
@time X = DataFrame(X);
@time X = Tables.rowtable(X);
@time X = Tables.columntable(X);

mach = machine(booster, X, y) |> fit!

schema = Tables.schema(dtrain)
schema.names
8 changes: 4 additions & 4 deletions benchmarks/regressor.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,15 @@ using XGBoost
using EvoTrees
using BenchmarkTools
using Random: seed!
import CUDA
# import CUDA

### v.0.15.1
# desktop | 1e6 | depth 11 | cpu: 37.2s
# desktop | 10e6 | depth 11 | cpu

### perf depth
# desktop | 1e6 | depth 11 | cpu: 28s gpu: 73 sec | xgboost: 26s
# desktop | 10e6 | depth 11 | cpu 205s gpu: 109 sec | xgboost 260s
### v0.16.5
# desktop | 1e6 | depth 11 | cpu: 31s gpu: 50 sec | xgboost cpu: 26s
# desktop | 10e6 | depth 11 | cpu 200s gpu: 80 sec | xgboost cpu: 267s

#threads
# laptop depth 6: 12.717845 seconds (2.08 M allocations: 466.228 MiB)
Expand Down
1 change: 0 additions & 1 deletion experiments/MLJ.jl
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@ config = EvoTreeClassifier(
gamma = 0.0,
nbins = 32,
nrounds = 200,
device = "cpu"
)
model = fit_evotree(config; x_train, y_train);
model = fit_evotree(config; x_train, y_train, x_eval = x_train, y_eval = y_train, metric=:mlogloss, print_every_n=10, early_stopping_rounds=25);
Expand Down
129 changes: 0 additions & 129 deletions experiments/depth-debug.jl

This file was deleted.

Loading
Loading