diff --git a/Project.toml b/Project.toml index c6e1e298..f96d0415 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "EvoTrees" uuid = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5" authors = ["jeremiedb "] -version = "0.16.3" +version = "0.16.4" [deps] BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" diff --git a/experiments/hist/perf-gpu.jl b/experiments/hist/perf-gpu.jl new file mode 100644 index 00000000..d7b094b6 --- /dev/null +++ b/experiments/hist/perf-gpu.jl @@ -0,0 +1,202 @@ +using Revise +using CUDA +using StatsBase: sample +using BenchmarkTools +using Base.Threads: @threads +using Random: seed! + +""" + hist_kernel! +""" +function hist_kernel!(h∇::CuDeviceArray{T,3}, ∇::CuDeviceMatrix{S}, x_bin, is, js) where {T,S} + tix, tiy, k = threadIdx().z, threadIdx().y, threadIdx().x + bdx, bdy = blockDim().z, blockDim().y + bix, biy = blockIdx().z, blockIdx().y + gdx = gridDim().z + + j = tiy + bdy * (biy - 1) + if j <= length(js) + jdx = js[j] + i_max = length(is) + niter = cld(i_max, bdx * gdx) + @inbounds for iter = 1:niter + i = tix + bdx * (bix - 1) + bdx * gdx * (iter - 1) + if i <= i_max + @inbounds idx = is[i] + @inbounds bin = x_bin[idx, jdx] + hid = Base._to_linear_index(h∇, k, bin, jdx) + CUDA.atomic_add!(pointer(h∇, hid), T(∇[k, idx])) + end + end + end + sync_threads() + return nothing +end + +function update_hist_gpu!(h, h∇, ∇, x_bin, is, js, jsc) + kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js) + config = launch_configuration(kernel.fun) + max_threads = config.threads ÷ 4 + max_blocks = config.blocks * 4 + k = size(h∇, 1) + ty = max(1, min(length(js), fld(max_threads, k))) + tx = min(64, max(1, min(length(is), fld(max_threads, k * ty)))) + threads = (k, ty, tx) + by = cld(length(js), ty) + bx = min(cld(max_blocks, by), cld(length(is), tx)) + blocks = (1, by, bx) + h∇ .= 0 + kernel(h∇, ∇, x_bin, is, js; threads, blocks) + CUDA.synchronize() + CUDA.@sync for j in jsc + nbins = size(h[j], 2) + copyto!(h[j], view(h∇, :, 1:nbins, j)) + end + return nothing +end + +function update_hist_gpu1!(h, h∇, ∇, x_bin, is, js, jsc) + kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js) + config = launch_configuration(kernel.fun) + max_threads = config.threads ÷ 4 + max_blocks = config.blocks * 4 + k = size(h∇, 1) + ty = max(1, min(length(js), fld(max_threads, k))) + tx = min(64, max(1, min(length(is), fld(max_threads, k * ty)))) + threads = (k, ty, tx) + by = cld(length(js), ty) + bx = min(cld(max_blocks, by), cld(length(is), tx)) + blocks = (1, by, bx) + h∇ .= 0 + kernel(h∇, ∇, x_bin, is, js; threads, blocks) + CUDA.synchronize() + return nothing +end + +function update_hist_gpu2!(h, h∇_cpu, h∇, ∇, x_bin, is, js, jsc) + kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js) + config = launch_configuration(kernel.fun) + max_threads = config.threads ÷ 4 + max_blocks = config.blocks * 4 + k = size(h∇, 1) + ty = max(1, min(length(js), fld(max_threads, k))) + tx = min(64, max(1, min(length(is), fld(max_threads, k * ty)))) + threads = (k, ty, tx) + by = cld(length(js), ty) + bx = min(cld(max_blocks, by), cld(length(is), tx)) + blocks = (1, by, bx) + h∇ .= 0 + kernel(h∇, ∇, x_bin, is, js; threads, blocks) + copyto!(h∇_cpu, h∇) + CUDA.synchronize() + return nothing +end + + +function update_hist_gpu3!(h, h∇_cpu, h∇, ∇, x_bin, is, js, jsc) + kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js) + config = launch_configuration(kernel.fun) + max_threads = config.threads ÷ 4 + max_blocks = config.blocks * 4 + k = size(h∇, 1) + ty = max(1, 
min(length(js), fld(max_threads, k))) + tx = min(64, max(1, min(length(is), fld(max_threads, k * ty)))) + threads = (k, ty, tx) + by = cld(length(js), ty) + bx = min(cld(max_blocks, by), cld(length(is), tx)) + blocks = (1, by, bx) + h∇ .= 0 + kernel(h∇, ∇, x_bin, is, js; threads, blocks) + # CUDA.synchronize() + copyto!(h∇_cpu, h∇) + # CUDA.synchronize() + @threads for j in jsc + nbins = size(h[j], 2) + @views h[j] .= h∇_cpu[:, 1:nbins, j] + # h[j] .= h∇_cpu[:, 1:nbins, j] + end + return nothing +end + + +seed!(123) +nbins = 32 +nfeats = 100 +nobs = Int(1e6) +x_bin = UInt8.(rand(1:nbins, nobs, nfeats)); +∇ = rand(Float32, 3, nobs); +h∇ = [zeros(Float32, 3, nbins) for n in 1:nfeats] +rowsample = 0.5 +colsample = 0.5 +is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true) +js = sample(1:nfeats, Int(round(rowsample * nfeats)), replace=false, ordered=true) + +∇_gpu = CuArray(∇) +x_bin_gpu = CuArray(x_bin) +h∇_cpu = zeros(Float32, 3, nbins, nfeats) +h∇_gpu = CuArray(h∇_cpu) +is_gpu = CuArray(is) +js_gpu = CuArray(js) + +CUDA.allowscalar(false) +CUDA.@time update_hist_gpu!(h∇, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js) +# ref without copy to cpu: ~same +# ref 10K: 875.100 μs (168 allocations: 7.08 KiB) +# ref 100K: 1.236 ms (215 allocations: 9.91 KiB) +# ref 1M: 6.138 ms (227 allocations: 12.00 KiB) +# ref 10M: 67.075 ms (235 allocations: 13.38 KiB) + +# with copy +# CUDA v4 1M: 2.903 ms (124 allocations: 6.98 KiB) +# CUDA v5 1M: 3.542 ms (848 allocations: 37.14 KiB) +@btime update_hist_gpu!(h∇, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js) + +# without copy +# CUDA v4 1M: 2.599 ms (74 allocations: 4.64 KiB) +# CUDA v5 1M: 2.274 ms (48 allocations: 2.77 KiB) +@btime update_hist_gpu1!(h∇, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js) + +# without single array copy +# CUDA v4 1M: +# CUDA v5 1M: 2.447 ms (48 allocations: 2.77 KiB) +@btime update_hist_gpu2!(h∇, h∇_cpu, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js) + +# without single array copy +# CUDA v4 1M: +# CUDA v5 1M: 2.442 ms (48 allocations: 2.77 KiB) +@btime update_hist_gpu3!(h∇, h∇_cpu, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js) + + +using CUDA, BenchmarkTools +function gpu_copy!(h, h∇, jsc) + CUDA.@sync for j in jsc + nbins = size(h[j], 2) + copyto!(h[j], view(h∇, :, 1:nbins, j)) + end + return nothing +end + +h∇ = [zeros(Float32, 3, 32) for n in 1:100]; +h∇_gpu = CUDA.zeros(Float32, 3, 32, 100); +js = 1:100 + +# CUDA v4: 534.480 μs (100 allocations: 4.69 KiB) +# CUDA v5: 1.203 ms (1600 allocations: 68.75 KiB) +@btime gpu_copy!(h∇, h∇_gpu, js) + + +function gpu_copy2!(h, h∇, jsc) + for j in jsc + nbins = size(h[j], 2) + @async copyto!(h[j], view(h∇, :, 1:nbins, j)) + end + return nothing +end + +h∇ = [zeros(Float32, 3, 32) for n in 1:100]; +h∇_gpu = CUDA.zeros(Float32, 3, 32, 100); +js = 1:100 + +# CUDA v4: 534.480 μs (100 allocations: 4.69 KiB) +# CUDA v5: 1.203 ms (1600 allocations: 68.75 KiB) +@btime gpu_copy2!(h∇, h∇_gpu, js) diff --git a/experiments/readme_plots-df-cpu.jl b/experiments/readme_plots-df-cpu.jl index 878c6d7f..8e7a8df7 100644 --- a/experiments/readme_plots-df-cpu.jl +++ b/experiments/readme_plots-df-cpu.jl @@ -121,535 +121,3 @@ plot!( linewidth=1.5, label="Linear - C", ) - -# logistic / cross-entropy -params1 = EvoTreeRegressor( - loss=:logistic, - nrounds=200, - nbins=64, - lambda=0.1, - gamma=0.1, - eta=0.05, - max_depth=6, - min_weight=1.0, - rowsample=0.5, - colsample=1.0, -) -@time model = fit_evotree( - params1, - dtrain; - fnames=["x_num", "x_cat"], - target_name="y", - deval, - 
metric=:logloss, - print_every_n=25, - early_stopping_rounds=20, - verbosity=0 -); -# 218.040 ms (123372 allocations: 34.71 MiB) -# @btime model = fit_evotree($params1, $X_train, $Y_train, X_eval = $X_eval, Y_eval = $Y_eval) -plot( - dtrain.x_num, - dtrain.y, - msize=0.5, - mcolor="darkgray", - mswidth=0, - background_color=RGB(1, 1, 1), - seriestype=:scatter, - xaxis=("feature"), - yaxis=("target"), - legend=true, - label="", -) -dinfer = dtrain[dtrain.x_cat.=="A", :] -pred = model(dinfer) -x_perm = sortperm(dinfer.x_num) -plot!( - dinfer.x_num[x_perm], - pred[x_perm], - color="lightblue", - linewidth=1.5, - label="Linear - A", -) -dinfer = dtrain[dtrain.x_cat.=="B", :] -pred = model(dinfer); -x_perm = sortperm(dinfer.x_num) -plot!( - dinfer.x_num[x_perm], - pred[x_perm], - color="blue", - linewidth=1.5, - label="Linear - B", -) -dinfer = dtrain[dtrain.x_cat.=="C", :] -pred = model(dinfer); -x_perm = sortperm(dinfer.x_num) -plot!( - dinfer.x_num[x_perm], - pred[x_perm], - color="navy", - linewidth=1.5, - label="Linear - C", -) - -# L1 -params1 = EvoTreeRegressor( - loss=:L1, - alpha=0.5, - nrounds=500, - nbins=64, - lambda=0.0, - gamma=0.0, - eta=0.1, - max_depth=6, - min_weight=1.0, - rowsample=0.5, - colsample=1.0, -) -@time model = fit_evotree( - params1; - x_train, - y_train, - x_eval, - y_eval, - print_every_n=25, - early_stopping_rounds=50, - metric=:mae -); -@time pred_train_L1 = predict(model, x_train) -@time pred_eval_L1 = predict(model, x_eval) -sqrt(mean((pred_train_L1 .- y_train) .^ 2)) - -x_perm = sortperm(x_train[:, 1]) -plot( - x_train, - y_train, - msize=0.5, - mcolor="darkgray", - mswidth=0, - background_color=RGB(1, 1, 1), - seriestype=:scatter, - xaxis=("feature"), - yaxis=("target"), - legend=true, - label="", -) -plot!( - x_train[:, 1][x_perm], - pred_train_linear[x_perm], - color="navy", - linewidth=1.5, - label="Linear", -) -plot!( - x_train[:, 1][x_perm], - pred_train_linear_w[x_perm], - color="lightblue", - linewidth=1.5, - label="LinearW", -) -plot!( - x_train[:, 1][x_perm], - pred_train_logistic[x_perm], - color="darkred", - linewidth=1.5, - label="Logistic", -) -plot!( - x_train[:, 1][x_perm], - pred_train_L1[x_perm], - color="darkgreen", - linewidth=1.5, - label="L1", -) -savefig("figures/regression_sinus.png") - -# Poisson -params1 = EvoTreeCount( - loss=:poisson, - nrounds=500, - nbins=64, - lambda=0.1, - gamma=0.1, - eta=0.1, - max_depth=6, - min_weight=1.0, - rowsample=0.5, - colsample=1.0, -) -@time model = fit_evotree( - params1; - x_train, - y_train, - x_eval, - y_eval, - print_every_n=25, - early_stopping_rounds=50, - metric=:poisson -); -@time pred_train_poisson = predict(model, x_train); -sqrt(mean((pred_train_poisson .- y_train) .^ 2)) - -# Gamma -params1 = EvoTreeRegressor( - loss=:gamma, - nrounds=500, - nbins=64, - lambda=0.1, - gamma=0.1, - eta=0.02, - max_depth=6, - min_weight=1.0, - rowsample=0.5, - colsample=1.0, -) -@time model = fit_evotree( - params1; - x_train, - y_train, - x_eval, - y_eval, - print_every_n=25, - early_stopping_rounds=50, - metric=:gamma -); -@time pred_train_gamma = predict(model, x_train); -sqrt(mean((pred_train_gamma .- y_train) .^ 2)) - -# Tweedie -params1 = EvoTreeRegressor( - loss=:tweedie, - nrounds=500, - nbins=64, - lambda=0.5, - gamma=0.1, - eta=0.1, - max_depth=6, - min_weight=1.0, - rowsample=0.5, - colsample=1.0, -) -@time model = fit_evotree( - params1; - x_train, - y_train, - x_eval, - y_eval, - print_every_n=25, - early_stopping_rounds=50, - metric=:tweedie -); -@time pred_train_tweedie = predict(model, 
x_train); -sqrt(mean((pred_train_tweedie .- y_train) .^ 2)) - -x_perm = sortperm(x_train[:, 1]) -plot( - x_train, - y_train, - msize=0.5, - mcolor="darkgray", - mswidth=0, - background_color=RGB(1, 1, 1), - seriestype=:scatter, - xaxis=("feature"), - yaxis=("target"), - legend=true, - label="", -) -plot!( - x_train[:, 1][x_perm], - pred_train_poisson[x_perm], - color="navy", - linewidth=1.5, - label="Poisson", -) -plot!( - x_train[:, 1][x_perm], - pred_train_gamma[x_perm], - color="lightblue", - linewidth=1.5, - label="Gamma", -) -plot!( - x_train[:, 1][x_perm], - pred_train_tweedie[x_perm], - color="darkred", - linewidth=1.5, - label="Tweedie", -) -savefig("figures/regression_sinus2.png") - - -############################### -## Quantiles -############################### -# q50 -params1 = EvoTreeRegressor( - loss=:quantile, - alpha=0.5, - nrounds=500, - nbins=64, - lambda=0.1, - gamma=0.0, - eta=0.05, - max_depth=6, - min_weight=1.0, - rowsample=0.5, - colsample=1.0, -) -@time model = fit_evotree( - params1; - x_train, - y_train, - x_eval, - y_eval, - print_every_n=25, - early_stopping_rounds=50, - metric=:mae -); -# 116.822 ms (74496 allocations: 36.41 MiB) for 100 iterations -# @btime model = grow_gbtree($X_train, $Y_train, $params1, X_eval = $X_eval, Y_eval = $Y_eval) -@time pred_train_q50 = predict(model, x_train) -sum(pred_train_q50 .< y_train) / length(y_train) - -# q20 -params1 = EvoTreeRegressor( - loss=:quantile, - alpha=0.2, - nrounds=300, - nbins=64, - lambda=0.1, - gamma=0.0, - eta=0.05, - max_depth=6, - min_weight=1.0, - rowsample=0.5, - colsample=1.0, -) -@time model = fit_evotree(params1; x_train, y_train, x_eval, y_eval, print_every_n=25); -@time pred_train_q20 = predict(model, x_train) -sum(pred_train_q20 .< y_train) / length(y_train) - -# q80 -params1 = EvoTreeRegressor( - loss=:quantile, - alpha=0.8, - nrounds=300, - nbins=64, - lambda=0.1, - gamma=0.0, - eta=0.05, - max_depth=6, - min_weight=1.0, - rowsample=0.5, - colsample=1.0, -) -@time model = fit_evotree(params1; x_train, y_train, x_eval, y_eval, print_every_n=25) -@time pred_train_q80 = predict(model, x_train) -sum(pred_train_q80 .< y_train) / length(y_train) - -x_perm = sortperm(x_train[:, 1]) -plot( - x_train, - y_train, - ms=0.5, - mcolor="darkgray", - mswidth=0, - background_color=RGB(1, 1, 1), - seriestype=:scatter, - xaxis=("feature"), - yaxis=("target"), - legend=true, - label="", -) -plot!( - x_train[:, 1][x_perm], - pred_train_q50[x_perm], - color="navy", - linewidth=1.5, - label="Median", -) -plot!( - x_train[:, 1][x_perm], - pred_train_q20[x_perm], - color="darkred", - linewidth=1.5, - label="Q20", -) -plot!( - x_train[:, 1][x_perm], - pred_train_q80[x_perm], - color="darkgreen", - linewidth=1.5, - label="Q80", -) -savefig("figures/quantiles_sinus.png") - - -############################### -## gaussian -############################### -params1 = EvoTreeMLE( - loss=:gaussian, - nrounds=500, - nbins=64, - lambda=0.1, - gamma=0.1, - eta=0.05, - max_depth=6, - min_weight=10.0, - rowsample=1.0, - colsample=1.0, - rng=123, -) - -@time model = fit_evotree( - params1; - x_train, - y_train, - x_eval, - y_eval, - print_every_n=25, - early_stopping_rounds=50, - metric=:gaussian -); -# @time model = fit_evotree(params1, X_train, Y_train, print_every_n = 10); -@time pred_train = EvoTrees.predict(model, x_train); -# @btime pred_train = EvoTrees.predict(model, X_train); - -pred_gauss = - [Distributions.Normal(pred_train[i, 1], pred_train[i, 2]) for i in axes(pred_train, 1)] -pred_q80 = quantile.(pred_gauss, 0.8) 
-pred_q20 = quantile.(pred_gauss, 0.2) - -mean(y_train .< pred_q80) -mean(y_train .< pred_q20) - -x_perm = sortperm(x_train[:, 1]) -plot( - x_train[:, 1], - y_train, - ms=0.5, - mcolor="darkgray", - mswidth=0, - background_color=RGB(1, 1, 1), - seriestype=:scatter, - xaxis=("feature"), - yaxis=("target"), - legend=true, - label="", -) -plot!( - x_train[:, 1][x_perm], - pred_train[x_perm, 1], - color="navy", - linewidth=1.5, - label="mu", -) -plot!( - x_train[:, 1][x_perm], - pred_train[x_perm, 2], - color="darkred", - linewidth=1.5, - label="sigma", -) -plot!( - x_train[:, 1][x_perm], - pred_q20[x_perm, 1], - color="darkgreen", - linewidth=1.5, - label="q20", -) -plot!( - x_train[:, 1][x_perm], - pred_q80[x_perm, 1], - color="darkgreen", - linewidth=1.5, - label="q80", -) -savefig("figures/gaussian-sinus.png") - - -############################### -## Logistic -############################### -params1 = EvoTrees.EvoTreeMLE( - loss=:logistic, - nrounds=500, - nbins=64, - lambda=1.0, - gamma=0.1, - eta=0.03, - max_depth=6, - min_weight=1.0, - rowsample=1.0, - colsample=1.0, - rng=123, -) - -@time model = fit_evotree( - params1; - x_train, - y_train, - x_eval, - y_eval, - print_every_n=25, - early_stopping_rounds=50, - metric=:logistic_mle -); -# @time model = fit_evotree(params1, X_train, Y_train, print_every_n = 10); -@time pred_train = EvoTrees.predict(model, x_train); -# @btime pred_train = EvoTrees.predict(model, X_train); - -pred_logistic = [ - Distributions.Logistic(pred_train[i, 1], pred_train[i, 2]) for i in axes(pred_train, 1) -] -pred_q80 = quantile.(pred_logistic, 0.8) -pred_q20 = quantile.(pred_logistic, 0.2) - -mean(y_train .< pred_q80) -mean(y_train .< pred_q20) - -x_perm = sortperm(x_train[:, 1]) -plot( - x_train[:, 1], - y_train, - ms=0.5, - mcolor="darkgray", - mswidth=0, - background_color=RGB(1, 1, 1), - seriestype=:scatter, - xaxis=("feature"), - yaxis=("target"), - legend=true, - label="", -) -plot!( - x_train[:, 1][x_perm], - pred_train[x_perm, 1], - color="navy", - linewidth=1.5, - label="mu", -) -plot!( - x_train[:, 1][x_perm], - pred_train[x_perm, 2], - color="darkred", - linewidth=1.5, - label="s", -) -plot!( - x_train[:, 1][x_perm], - pred_q20[x_perm, 1], - color="darkgreen", - linewidth=1.5, - label="q20", -) -plot!( - x_train[:, 1][x_perm], - pred_q80[x_perm, 1], - color="darkgreen", - linewidth=1.5, - label="q80", -) -savefig("figures/logistic-sinus.png") diff --git a/experiments/readme_plots-df-gpu.jl b/experiments/readme_plots-df-gpu.jl index bff06ae4..0e623a28 100644 --- a/experiments/readme_plots-df-gpu.jl +++ b/experiments/readme_plots-df-gpu.jl @@ -5,6 +5,7 @@ using StatsBase: sample, quantile using Distributions using Random using Plots +using CUDA using EvoTrees using DataFrames using CategoricalArrays @@ -124,495 +125,3 @@ plot!( linewidth=1.5, label="Linear - C", ) - -# logistic / cross-entropy -params1 = EvoTreeRegressor( - loss=:logistic, - nrounds=200, - nbins=64, - lambda=0.1, - gamma=0.1, - eta=0.05, - max_depth=6, - min_weight=1.0, - rowsample=0.5, - colsample=1.0, -) - -@time model = fit_evotree( - params1; - x_train, - y_train, - x_eval, - y_eval, - print_every_n=25, - early_stopping_rounds=50, - metric=:logloss -); -# 218.040 ms (123372 allocations: 34.71 MiB) -# @btime model = fit_evotree($params1, $X_train, $Y_train, X_eval = $X_eval, Y_eval = $Y_eval) -@time pred_train_logistic = predict(model, x_train); -@time pred_eval_logistic = predict(model, x_eval) -sqrt(mean((pred_train_logistic .- y_train) .^ 2)) - -# L1 -params1 = EvoTreeRegressor( 
- loss=:L1, - alpha=0.5, - nrounds=500, - nbins=64, - lambda=0.0, - gamma=0.0, - eta=0.1, - max_depth=6, - min_weight=1.0, - rowsample=0.5, - colsample=1.0, -) -@time model = fit_evotree( - params1; - x_train, - y_train, - x_eval, - y_eval, - print_every_n=25, - early_stopping_rounds=50, - metric=:mae -); -@time pred_train_L1 = predict(model, x_train) -@time pred_eval_L1 = predict(model, x_eval) -sqrt(mean((pred_train_L1 .- y_train) .^ 2)) - -x_perm = sortperm(x_train[:, 1]) -plot( - x_train, - y_train, - msize=0.5, - mcolor="darkgray", - mswidth=0, - background_color=RGB(1, 1, 1), - seriestype=:scatter, - xaxis=("feature"), - yaxis=("target"), - legend=true, - label="", -) -plot!( - x_train[:, 1][x_perm], - pred_train_linear[x_perm], - color="navy", - linewidth=1.5, - label="Linear", -) -plot!( - x_train[:, 1][x_perm], - pred_train_linear_w[x_perm], - color="lightblue", - linewidth=1.5, - label="LinearW", -) -plot!( - x_train[:, 1][x_perm], - pred_train_logistic[x_perm], - color="darkred", - linewidth=1.5, - label="Logistic", -) -plot!( - x_train[:, 1][x_perm], - pred_train_L1[x_perm], - color="darkgreen", - linewidth=1.5, - label="L1", -) -savefig("figures/regression_sinus.png") - -# Poisson -params1 = EvoTreeCount( - loss=:poisson, - nrounds=500, - nbins=64, - lambda=0.1, - gamma=0.1, - eta=0.1, - max_depth=6, - min_weight=1.0, - rowsample=0.5, - colsample=1.0, -) -@time model = fit_evotree( - params1; - x_train, - y_train, - x_eval, - y_eval, - print_every_n=25, - early_stopping_rounds=50, - metric=:poisson -); -@time pred_train_poisson = predict(model, x_train); -sqrt(mean((pred_train_poisson .- y_train) .^ 2)) - -# Gamma -params1 = EvoTreeRegressor( - loss=:gamma, - nrounds=500, - nbins=64, - lambda=0.1, - gamma=0.1, - eta=0.02, - max_depth=6, - min_weight=1.0, - rowsample=0.5, - colsample=1.0, -) -@time model = fit_evotree( - params1; - x_train, - y_train, - x_eval, - y_eval, - print_every_n=25, - early_stopping_rounds=50, - metric=:gamma -); -@time pred_train_gamma = predict(model, x_train); -sqrt(mean((pred_train_gamma .- y_train) .^ 2)) - -# Tweedie -params1 = EvoTreeRegressor( - loss=:tweedie, - nrounds=500, - nbins=64, - lambda=0.5, - gamma=0.1, - eta=0.1, - max_depth=6, - min_weight=1.0, - rowsample=0.5, - colsample=1.0, -) -@time model = fit_evotree( - params1; - x_train, - y_train, - x_eval, - y_eval, - print_every_n=25, - early_stopping_rounds=50, - metric=:tweedie -); -@time pred_train_tweedie = predict(model, x_train); -sqrt(mean((pred_train_tweedie .- y_train) .^ 2)) - -x_perm = sortperm(x_train[:, 1]) -plot( - x_train, - y_train, - msize=0.5, - mcolor="darkgray", - mswidth=0, - background_color=RGB(1, 1, 1), - seriestype=:scatter, - xaxis=("feature"), - yaxis=("target"), - legend=true, - label="", -) -plot!( - x_train[:, 1][x_perm], - pred_train_poisson[x_perm], - color="navy", - linewidth=1.5, - label="Poisson", -) -plot!( - x_train[:, 1][x_perm], - pred_train_gamma[x_perm], - color="lightblue", - linewidth=1.5, - label="Gamma", -) -plot!( - x_train[:, 1][x_perm], - pred_train_tweedie[x_perm], - color="darkred", - linewidth=1.5, - label="Tweedie", -) -savefig("figures/regression_sinus2.png") - - -############################### -## Quantiles -############################### -# q50 -params1 = EvoTreeRegressor( - loss=:quantile, - alpha=0.5, - nrounds=500, - nbins=64, - lambda=0.1, - gamma=0.0, - eta=0.05, - max_depth=6, - min_weight=1.0, - rowsample=0.5, - colsample=1.0, -) -@time model = fit_evotree( - params1; - x_train, - y_train, - x_eval, - y_eval, - 
print_every_n=25, - early_stopping_rounds=50, - metric=:mae -); -# 116.822 ms (74496 allocations: 36.41 MiB) for 100 iterations -# @btime model = grow_gbtree($X_train, $Y_train, $params1, X_eval = $X_eval, Y_eval = $Y_eval) -@time pred_train_q50 = predict(model, x_train) -sum(pred_train_q50 .< y_train) / length(y_train) - -# q20 -params1 = EvoTreeRegressor( - loss=:quantile, - alpha=0.2, - nrounds=300, - nbins=64, - lambda=0.1, - gamma=0.0, - eta=0.05, - max_depth=6, - min_weight=1.0, - rowsample=0.5, - colsample=1.0, -) -@time model = fit_evotree(params1; x_train, y_train, x_eval, y_eval, print_every_n=25); -@time pred_train_q20 = predict(model, x_train) -sum(pred_train_q20 .< y_train) / length(y_train) - -# q80 -params1 = EvoTreeRegressor( - loss=:quantile, - alpha=0.8, - nrounds=300, - nbins=64, - lambda=0.1, - gamma=0.0, - eta=0.05, - max_depth=6, - min_weight=1.0, - rowsample=0.5, - colsample=1.0, -) -@time model = fit_evotree(params1; x_train, y_train, x_eval, y_eval, print_every_n=25) -@time pred_train_q80 = predict(model, x_train) -sum(pred_train_q80 .< y_train) / length(y_train) - -x_perm = sortperm(x_train[:, 1]) -plot( - x_train, - y_train, - ms=0.5, - mcolor="darkgray", - mswidth=0, - background_color=RGB(1, 1, 1), - seriestype=:scatter, - xaxis=("feature"), - yaxis=("target"), - legend=true, - label="", -) -plot!( - x_train[:, 1][x_perm], - pred_train_q50[x_perm], - color="navy", - linewidth=1.5, - label="Median", -) -plot!( - x_train[:, 1][x_perm], - pred_train_q20[x_perm], - color="darkred", - linewidth=1.5, - label="Q20", -) -plot!( - x_train[:, 1][x_perm], - pred_train_q80[x_perm], - color="darkgreen", - linewidth=1.5, - label="Q80", -) -savefig("figures/quantiles_sinus.png") - - -############################### -## gaussian -############################### -params1 = EvoTreeMLE( - loss=:gaussian, - nrounds=500, - nbins=64, - lambda=0.1, - gamma=0.1, - eta=0.05, - max_depth=6, - min_weight=10.0, - rowsample=1.0, - colsample=1.0, - rng=123, -) - -@time model = fit_evotree( - params1; - x_train, - y_train, - x_eval, - y_eval, - print_every_n=25, - early_stopping_rounds=50, - metric=:gaussian -); -# @time model = fit_evotree(params1, X_train, Y_train, print_every_n = 10); -@time pred_train = EvoTrees.predict(model, x_train); -# @btime pred_train = EvoTrees.predict(model, X_train); - -pred_gauss = - [Distributions.Normal(pred_train[i, 1], pred_train[i, 2]) for i in axes(pred_train, 1)] -pred_q80 = quantile.(pred_gauss, 0.8) -pred_q20 = quantile.(pred_gauss, 0.2) - -mean(y_train .< pred_q80) -mean(y_train .< pred_q20) - -x_perm = sortperm(x_train[:, 1]) -plot( - x_train[:, 1], - y_train, - ms=0.5, - mcolor="darkgray", - mswidth=0, - background_color=RGB(1, 1, 1), - seriestype=:scatter, - xaxis=("feature"), - yaxis=("target"), - legend=true, - label="", -) -plot!( - x_train[:, 1][x_perm], - pred_train[x_perm, 1], - color="navy", - linewidth=1.5, - label="mu", -) -plot!( - x_train[:, 1][x_perm], - pred_train[x_perm, 2], - color="darkred", - linewidth=1.5, - label="sigma", -) -plot!( - x_train[:, 1][x_perm], - pred_q20[x_perm, 1], - color="darkgreen", - linewidth=1.5, - label="q20", -) -plot!( - x_train[:, 1][x_perm], - pred_q80[x_perm, 1], - color="darkgreen", - linewidth=1.5, - label="q80", -) -savefig("figures/gaussian-sinus.png") - - -############################### -## Logistic -############################### -params1 = EvoTrees.EvoTreeMLE( - loss=:logistic, - nrounds=500, - nbins=64, - lambda=1.0, - gamma=0.1, - eta=0.03, - max_depth=6, - min_weight=1.0, - rowsample=1.0, - 
colsample=1.0, - rng=123, -) - -@time model = fit_evotree( - params1; - x_train, - y_train, - x_eval, - y_eval, - print_every_n=25, - early_stopping_rounds=50, - metric=:logistic_mle -); -# @time model = fit_evotree(params1, X_train, Y_train, print_every_n = 10); -@time pred_train = EvoTrees.predict(model, x_train); -# @btime pred_train = EvoTrees.predict(model, X_train); - -pred_logistic = [ - Distributions.Logistic(pred_train[i, 1], pred_train[i, 2]) for i in axes(pred_train, 1) -] -pred_q80 = quantile.(pred_logistic, 0.8) -pred_q20 = quantile.(pred_logistic, 0.2) - -mean(y_train .< pred_q80) -mean(y_train .< pred_q20) - -x_perm = sortperm(x_train[:, 1]) -plot( - x_train[:, 1], - y_train, - ms=0.5, - mcolor="darkgray", - mswidth=0, - background_color=RGB(1, 1, 1), - seriestype=:scatter, - xaxis=("feature"), - yaxis=("target"), - legend=true, - label="", -) -plot!( - x_train[:, 1][x_perm], - pred_train[x_perm, 1], - color="navy", - linewidth=1.5, - label="mu", -) -plot!( - x_train[:, 1][x_perm], - pred_train[x_perm, 2], - color="darkred", - linewidth=1.5, - label="s", -) -plot!( - x_train[:, 1][x_perm], - pred_q20[x_perm, 1], - color="darkgreen", - linewidth=1.5, - label="q20", -) -plot!( - x_train[:, 1][x_perm], - pred_q80[x_perm, 1], - color="darkgreen", - linewidth=1.5, - label="q80", -) -savefig("figures/logistic-sinus.png") diff --git a/experiments/readme_plots_gpu.jl b/experiments/readme_plots_gpu.jl index 33b89c92..18fdcf6c 100644 --- a/experiments/readme_plots_gpu.jl +++ b/experiments/readme_plots_gpu.jl @@ -5,12 +5,13 @@ using Distributions using Random using Plots using Revise +using CUDA using EvoTrees using EvoTrees: predict, sigmoid, logit # using ProfileView # prepare a dataset -tree_type = "binary" +tree_type = "binary" # binary/oblivious device = "gpu" Random.seed!(123) diff --git a/ext/EvoTreesCUDAExt/eval.jl b/ext/EvoTreesCUDAExt/eval.jl index 6ea10153..f041c18e 100644 --- a/ext/EvoTreesCUDAExt/eval.jl +++ b/ext/EvoTreesCUDAExt/eval.jl @@ -46,7 +46,7 @@ end function eval_logloss_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y::CuDeviceVector{T}, w::CuDeviceVector{T}) where {T<:AbstractFloat} i = threadIdx().x + (blockIdx().x - 1) * blockDim().x if i <= length(y) - @inbounds pred = sigmoid(p[1, i]) + @inbounds pred = EvoTrees.sigmoid(p[1, i]) @inbounds eval[i] = w[i] * (-y[i] * log(pred) + (y[i] - 1) * log(1 - pred)) end return nothing diff --git a/ext/EvoTreesCUDAExt/fit-utils.jl b/ext/EvoTreesCUDAExt/fit-utils.jl index 31bc1f55..2533f789 100644 --- a/ext/EvoTreesCUDAExt/fit-utils.jl +++ b/ext/EvoTreesCUDAExt/fit-utils.jl @@ -23,7 +23,7 @@ function hist_kernel!(h∇::CuDeviceArray{T,3}, ∇::CuDeviceMatrix{S}, x_bin, i return nothing end -function update_hist_gpu!(h, h∇, ∇, x_bin, is, js, jsc) +function update_hist_gpu!(h, h∇_cpu, h∇, ∇, x_bin, is, js, jsc) kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js) config = launch_configuration(kernel.fun) max_threads = config.threads ÷ 4 @@ -37,10 +37,10 @@ function update_hist_gpu!(h, h∇, ∇, x_bin, is, js, jsc) blocks = (1, by, bx) h∇ .= 0 kernel(h∇, ∇, x_bin, is, js; threads, blocks) - CUDA.synchronize() - CUDA.@sync for j in jsc + copyto!(h∇_cpu, h∇) + Threads.@threads for j in jsc nbins = size(h[j], 2) - copyto!(h[j], view(h∇, :, 1:nbins, j)) + @views h[j] .= h∇_cpu[:, 1:nbins, j] end return nothing end diff --git a/ext/EvoTreesCUDAExt/fit.jl b/ext/EvoTreesCUDAExt/fit.jl index f0e32cf0..1f454254 100644 --- a/ext/EvoTreesCUDAExt/fit.jl +++ b/ext/EvoTreesCUDAExt/fit.jl @@ -21,6 +21,7 @@ function 
EvoTrees.grow_evotree!(evotree::EvoTree{L,K}, cache, params::EvoTrees.E cache.out, cache.left, cache.right, + cache.h∇_cpu, cache.h∇, cache.x_bin, cache.feattypes, @@ -43,6 +44,7 @@ function grow_tree!( out, left, right, + h∇_cpu::Array{Float64,3}, h∇::CuArray{Float64,3}, x_bin::CuMatrix, feattypes::Vector{Bool}, @@ -87,7 +89,7 @@ function grow_tree!( end end else - update_hist_gpu!(nodes[n].h, h∇, ∇, x_bin, nodes[n].is, jsg, js) + update_hist_gpu!(nodes[n].h, h∇_cpu, h∇, ∇, x_bin, nodes[n].is, jsg, js) end end Threads.@threads for n ∈ sort(n_current) @@ -160,6 +162,7 @@ function grow_otree!( out, left, right, + h∇_cpu::Array{Float64,3}, h∇::CuArray{Float64,3}, x_bin::CuMatrix, feattypes::Vector{Bool}, @@ -214,7 +217,7 @@ function grow_otree!( end end else - update_hist_gpu!(nodes[n].h, h∇, ∇, x_bin, nodes[n].is, jsg, js) + update_hist_gpu!(nodes[n].h, h∇_cpu, h∇, ∇, x_bin, nodes[n].is, jsg, js) end end Threads.@threads for n ∈ n_current diff --git a/ext/EvoTreesCUDAExt/init.jl b/ext/EvoTreesCUDAExt/init.jl index 6a8dfcda..3a0b24c9 100644 --- a/ext/EvoTreesCUDAExt/init.jl +++ b/ext/EvoTreesCUDAExt/init.jl @@ -63,7 +63,8 @@ function EvoTrees.init_core(params::EvoTrees.EvoTypes{L}, ::Type{<:EvoTrees.GPU} !isnothing(offset) && (pred .+= CuArray(offset')) # initialize gradients - h∇ = CUDA.zeros(Float64, 2 * K + 1, maximum(featbins), length(featbins)) + h∇_cpu = zeros(Float64, 2 * K + 1, maximum(featbins), length(featbins)) + h∇ = CuArray(h∇_cpu) ∇ = CUDA.zeros(T, 2 * K + 1, nobs) @assert (length(y) == length(w) && minimum(w) > 0) ∇[end, :] .= w @@ -117,6 +118,7 @@ function EvoTrees.init_core(params::EvoTrees.EvoTypes{L}, ::Type{<:EvoTrees.GPU} right=right, ∇=∇, h∇=h∇, + h∇_cpu=h∇_cpu, fnames=fnames, edges=edges, featbins=featbins, diff --git a/figures/gaussian-sinus-oblivious-gpu.png b/figures/gaussian-sinus-oblivious-gpu.png index abde7cdd..c62c1267 100644 Binary files a/figures/gaussian-sinus-oblivious-gpu.png and b/figures/gaussian-sinus-oblivious-gpu.png differ diff --git a/figures/regression-sinus-oblivious-gpu.png b/figures/regression-sinus-oblivious-gpu.png index 44644279..eb60970d 100644 Binary files a/figures/regression-sinus-oblivious-gpu.png and b/figures/regression-sinus-oblivious-gpu.png differ
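
The core change in ext/EvoTreesCUDAExt/fit-utils.jl replaces the per-feature device-to-host copies (one copyto! per feature under CUDA.@sync, which the gpu_copy! benchmarks in experiments/hist/perf-gpu.jl show regressing sharply under CUDA.jl v5) with a single bulk copy of the whole histogram buffer followed by threaded slicing on the CPU. Below is a minimal standalone sketch of that pattern; the helper name copy_hist_bulk! and the Float32 (3, 32, 100) shapes are illustrative only, taken from the perf-gpu.jl benchmark setup rather than from the package API.

using CUDA
using Base.Threads: @threads

# h∇_gpu holds the full histogram as (2K+1, nbins, nfeats) on the device;
# h[j] is the per-feature CPU matrix that the tree-growing code reads.
function copy_hist_bulk!(h, h∇_cpu, h∇_gpu, js)
    copyto!(h∇_cpu, h∇_gpu)   # one device-to-host transfer for the whole buffer
    @threads for j in js      # then slice per-feature views on the CPU
        nbins = size(h[j], 2)
        @views h[j] .= h∇_cpu[:, 1:nbins, j]
    end
    return nothing
end

# usage mirroring the benchmark setup in perf-gpu.jl
h = [zeros(Float32, 3, 32) for _ in 1:100]
h∇_cpu = zeros(Float32, 3, 32, 100)
h∇_gpu = CUDA.zeros(Float32, 3, 32, 100)
copy_hist_bulk!(h, h∇_cpu, h∇_gpu, 1:100)

Trading many small transfers for one large transfer plus CPU threads is what the init.jl hunk enables by preallocating h∇_cpu alongside the CuArray and threading it through grow_tree!/grow_otree!.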