diff --git a/Project.toml b/Project.toml index f96d0415..195f96e0 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "EvoTrees" uuid = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5" authors = ["jeremiedb "] -version = "0.16.4" +version = "0.16.5" [deps] BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" diff --git a/benchmarks/regressor-df.jl b/benchmarks/regressor-df.jl index 19a23017..4e9cceb8 100644 --- a/benchmarks/regressor-df.jl +++ b/benchmarks/regressor-df.jl @@ -6,7 +6,7 @@ using EvoTrees using DataFrames using BenchmarkTools using Random: seed! -import CUDA +# import CUDA nobs = Int(1e6) num_feat = Int(100) @@ -45,13 +45,13 @@ params_xgb = Dict( :max_bin => 64, ) -# dtrain = DMatrix(x_train, y_train) -# watchlist = Dict("train" => DMatrix(x_train, y_train)) -# @time m_xgb = xgboost(dtrain; watchlist, nthread=nthread, verbosity=0, eval_metric=metric_xgb, params_xgb...); -# # @btime m_xgb = xgboost($dtrain; watchlist, nthread=nthread, verbosity=0, eval_metric=metric_xgb, params_xgb...); -# @info "xgboost predict:" -# @time pred_xgb = XGBoost.predict(m_xgb, x_train); -# # @btime XGBoost.predict($m_xgb, $x_train); +dtrain = DMatrix(x_train, y_train) +watchlist = Dict("train" => DMatrix(x_train, y_train)) +@time m_xgb = xgboost(dtrain; watchlist, nthread=nthread, verbosity=0, eval_metric=metric_xgb, params_xgb...); +# @btime m_xgb = xgboost($dtrain; watchlist, nthread=nthread, verbosity=0, eval_metric=metric_xgb, params_xgb...); +@info "xgboost predict:" +@time pred_xgb = XGBoost.predict(m_xgb, x_train); +# @btime XGBoost.predict($m_xgb, $x_train); @info "EvoTrees" dtrain = DataFrame(x_train, :auto) diff --git a/benchmarks/regressor-mlj.jl b/benchmarks/regressor-mlj.jl new file mode 100644 index 00000000..29035c40 --- /dev/null +++ b/benchmarks/regressor-mlj.jl @@ -0,0 +1,116 @@ +using Revise +using Statistics +using StatsBase: sample +using EvoTrees +using DataFrames +using BenchmarkTools +using Random: seed! +import CUDA +using MLJ + +nobs = Int(2e6) +num_feat = Int(100) +nrounds = 200 +T = Float64 +nthread = Base.Threads.nthreads() +@info "testing with: $nobs observations | $num_feat features. 
nthread: $nthread"
+seed!(123)
+x_train = rand(T, nobs, num_feat)
+y_train = rand(T, size(x_train, 1))
+
+@info nthread
+loss = "mse"
+if loss == "mse"
+    loss_evo = :mse
+    metric_evo = :mae
+elseif loss == "logloss"
+    loss_evo = :logloss
+    metric_evo = :logloss
+end
+
+@info "EvoTrees"
+dtrain = DataFrame(x_train, :auto)
+dtrain.y .= y_train
+target_name = "y"
+verbosity = 0
+
+params_evo = EvoTreeRegressor(
+    loss=loss_evo,
+    nrounds=nrounds,
+    alpha=0.5,
+    lambda=0.0,
+    gamma=0.0,
+    eta=0.05,
+    max_depth=6,
+    min_weight=1.0,
+    rowsample=0.5,
+    colsample=0.5,
+    nbins=64,
+    rng=123,
+)
+
+@info "EvoTrees CPU"
+device = "cpu"
+
+iterated_model = IteratedModel(
+    model=params_evo,
+    resampling=Holdout(; fraction_train=0.5),
+    measure=rmse,
+    controls=[Step(5),
+        Patience(200),
+        NumberLimit(40)],
+    retrain=false)
+
+mach = machine(iterated_model, dtrain, y_train)
+@time fit!(mach);
+
+@info "init"
+@time m_df, cache_df = EvoTrees.init(params_evo, dtrain; target_name);
+
+# @info "train - no eval"
+# @time m_evo_df = fit_evotree(params_evo, dtrain; target_name, device, verbosity, print_every_n=100);
+# @time m_evo_df = fit_evotree(params_evo, dtrain; target_name, device, verbosity, print_every_n=100);
+
+@info "train - eval"
+@time m_evo = fit_evotree(params_evo, dtrain; target_name, deval=dtrain, metric=metric_evo, device, verbosity, print_every_n=100);
+@time m_evo = fit_evotree(params_evo, dtrain; target_name, deval=dtrain, metric=metric_evo, device, verbosity, print_every_n=100);
+# @time m_evo = fit_evotree(params_evo, dtrain; target_name, device);
+# @btime fit_evotree($params_evo, $dtrain; target_name, deval=dtrain, metric=metric_evo, device, verbosity, print_every_n=100);
+@info "predict"
+@time pred_evo = m_evo(dtrain);
+@btime m_evo($dtrain);
+
+@info "EvoTrees GPU"
+device = "gpu"
+@info "train"
+@time m_evo = fit_evotree(params_evo, dtrain; target_name, deval=dtrain, metric=metric_evo, device, verbosity, print_every_n=100);
+@time m_evo = fit_evotree(params_evo, dtrain; target_name, deval=dtrain, metric=metric_evo, device, verbosity, print_every_n=100);
+# @btime m_evo = fit_evotree($params_evo, $dtrain; target_name, device);
+# @btime fit_evotree($params_evo, $dtrain; target_name, deval=dtrain, metric=metric_evo, device, verbosity, print_every_n=100);
+@info "predict"
+@time pred_evo = m_evo(dtrain; device);
+@btime m_evo($dtrain; device);
+
+
+using MLJBase
+using MLJModels
+using Tables
+
+EvoTreeBooster = @load EvoTreeRegressor
+booster = EvoTreeBooster()
+
+X, y = make_regression(1000, 5)
+
+# this works:
+mach = machine(booster, X, y) |> fit!
+
+# this doesn't:
+X, y = make_regression(1_000_000, 100);
+@time X = DataFrame(X);
+@time X = Tables.rowtable(X);
+@time X = Tables.columntable(X);
+
+mach = machine(booster, X, y) |> fit!
+
+schema = Tables.schema(dtrain)
+schema.names
\ No newline at end of file
diff --git a/benchmarks/regressor.jl b/benchmarks/regressor.jl
index d38804b6..cb32da27 100644
--- a/benchmarks/regressor.jl
+++ b/benchmarks/regressor.jl
@@ -6,15 +6,15 @@ using XGBoost
 using EvoTrees
 using BenchmarkTools
 using Random: seed!
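# A minimal, self-contained sketch of the IteratedModel early-stopping pattern used in
# benchmarks/regressor-mlj.jl above. Names such as `booster` and `itmodel` are
# illustrative only; the kwargs and controls are MLJ's standard iteration API:
using MLJ
using EvoTrees
X, y = make_regression(10_000, 10)
booster = EvoTreeRegressor(eta=0.05, max_depth=6, nbins=64, rng=123)
itmodel = IteratedModel(
    model=booster,
    resampling=Holdout(; fraction_train=0.8),
    measure=rmse,               # out-of-sample metric driving the stopping controls
    controls=[Step(5),          # grow the ensemble 5 rounds per control cycle
        Patience(10),           # stop after 10 consecutive non-improving steps
        NumberLimit(40)],       # hard cap on the number of control cycles
    retrain=false)
mach = machine(itmodel, X, y)
fit!(mach)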
-import CUDA +# import CUDA ### v.0.15.1 # desktop | 1e6 | depth 11 | cpu: 37.2s # desktop | 10e6 | depth 11 | cpu -### perf depth -# desktop | 1e6 | depth 11 | cpu: 28s gpu: 73 sec | xgboost: 26s -# desktop | 10e6 | depth 11 | cpu 205s gpu: 109 sec | xgboost 260s +### v0.16.5 +# desktop | 1e6 | depth 11 | cpu: 31s gpu: 50 sec | xgboost cpu: 26s +# desktop | 10e6 | depth 11 | cpu 200s gpu: 80 sec | xgboost cpu: 267s #threads # laptop depth 6: 12.717845 seconds (2.08 M allocations: 466.228 MiB) diff --git a/experiments/MLJ.jl b/experiments/MLJ.jl index 503ff474..505c508f 100644 --- a/experiments/MLJ.jl +++ b/experiments/MLJ.jl @@ -68,7 +68,6 @@ config = EvoTreeClassifier( gamma = 0.0, nbins = 32, nrounds = 200, - device = "cpu" ) model = fit_evotree(config; x_train, y_train); model = fit_evotree(config; x_train, y_train, x_eval = x_train, y_eval = y_train, metric=:mlogloss, print_every_n=10, early_stopping_rounds=25); diff --git a/experiments/depth-debug.jl b/experiments/depth-debug.jl deleted file mode 100644 index 4a8df1eb..00000000 --- a/experiments/depth-debug.jl +++ /dev/null @@ -1,129 +0,0 @@ -using Statistics -using StatsBase:sample -using Base.Threads:@threads -using BenchmarkTools -using Revise -using EvoTrees -using Profile - -nobs = Int(1e6) -num_feat = Int(100) -nrounds = 200 -nthread = Base.Threads.nthreads() -x_train = rand(nobs, num_feat) -y_train = rand(size(x_train, 1)) - -config = EvoTreeRegressor(; - loss=:mse, - nrounds=200, - lambda=0.0, - gamma=0.0, - eta=0.05, - max_depth=10, - min_weight=1.0, - rowsample=0.5, - colsample=0.5, - nbins=64, - tree_type="binary", - rng=123 -) - -################################ -# high-level -################################ -_device = EvoTrees.GPU -@time EvoTrees.fit_evotree(config; x_train, y_train, device = "gpu") - -@time m, cache = EvoTrees.init(config, x_train, y_train); -@time EvoTrees.grow_evotree!(m, cache, config) -@btime EvoTrees.grow_evotree!(m, cache, config) - -Profile.clear() -# Profile.init() -Profile.init(n = 10^5, delay = 0.01) -# @profile m, cache = EvoTrees.init(config, x_train, y_train); -@profile EvoTrees.grow_evotree!(m, cache, config) -Profile.print() - -################################ -# mid-level -################################ -@time m, cache = EvoTrees.init(config, x_train, y_train); -@time EvoTrees.grow_evotree!(m, cache, config) -# compute gradients -@time m, cache = EvoTrees.init(config, x_train, y_train); -@time EvoTrees.update_grads!(cache.βˆ‡, cache.pred, cache.y, config) -# subsample rows -@time cache.nodes[1].is = EvoTrees.subsample(cache.is_in, cache.is_out, cache.mask, config.rowsample, config.rng) -# subsample cols -EvoTrees.sample!(config.rng, cache.js_, cache.js, replace=false, ordered=true) -L = EvoTrees._get_struct_loss(m) -# instantiate a tree then grow it -tree = EvoTrees.Tree{L,1}(config.max_depth) -grow! = config.tree_type == "oblivious" ? EvoTrees.grow_otree! : EvoTrees.grow_tree! 
-@time EvoTrees.grow_tree!( - tree, - cache.nodes, - config, - cache.βˆ‡, - cache.edges, - cache.js, - cache.out, - cache.left, - cache.right, - cache.x_bin, - cache.feattypes, - cache.monotone_constraints -) - -using ProfileView -ProfileView.@profview EvoTrees.grow_tree!( - tree, - cache.nodes, - config, - cache.βˆ‡, - cache.edges, - cache.js, - cache.out, - cache.left, - cache.right, - cache.x_bin, - cache.feattypes, - cache.monotone_constraints -) - -################################ -# end mid-level -################################ - - -@time m_evo = grow_tree!(params_evo; x_train, y_train, device, print_every_n=100); - -@info "train - no eval" -@time m_evo = fit_evotree(params_evo; x_train, y_train, device, print_every_n=100); - - -offset = 0 -feat = 15 -cond_bin = 32 -@time l, r = split_set_threads!(out, left, right, 𝑖, X_bin, feat, cond_bin, offset, 2^14); -@btime split_set_threads!($out, $left, $right, $𝑖, $X_bin, $feat, $cond_bin, $offset, 2^14); -@code_warntype split_set_1!(left, right, 𝑖, X_bin, feat, cond_bin, offset) - -offset = 0 -feat = 15 -cond_bin = 32 -lid1, rid1 = split_set_threads!(out, left, right, 𝑖, X_bin, feat, cond_bin, offset) -offset = 0 -feat = 14 -cond_bin = 12 -lid2, rid2 = split_set_threads!(out, left, right, lid1, X_bin, feat, cond_bin, offset) -offset = + length(lid1) -feat = 14 -cond_bin = 12 -lid3, rid3 = split_set_threads!(out, left, right, rid1, X_bin, feat, cond_bin, offset) - -lid1_ = deepcopy(lid1) - - - diff --git a/experiments/hist/hist_cpu.jl b/experiments/hist/hist_cpu.jl index 3208afb4..0c70ad7f 100644 --- a/experiments/hist/hist_cpu.jl +++ b/experiments/hist/hist_cpu.jl @@ -35,497 +35,7 @@ hβˆ‡ = [zeros(Float64, 3, nbins) for n in 1:nfeats] is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true) js = sample(1:nfeats, Int(round(rowsample * nfeats)), replace=false, ordered=true) -# 6.886 ms (97 allocations: 10.67 KiB) -# 9.624 ms (97 allocations: 10.67 KiB) +# laptop: 6.886 ms (97 allocations: 10.67 KiB) +# desktop: 3.451 ms (61 allocations: 6.52 KiB) @time hist_cpu!(hβˆ‡, βˆ‡, x_bin, is, js) @btime hist_cpu!($hβˆ‡, $βˆ‡, $x_bin, $is, $js) - -# base kernel -function kernel1!(h, x, id) - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - if i <= size(id, 1) && j <= size(id, 2) - @inbounds k = Base._to_linear_index(h, id[i, j], j) - # @inbounds k = id[i,j] + 32 * (j-1) - @inbounds CUDA.atomic_add!(pointer(h, k), x[i, j]) - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu1!(h::AbstractMatrix{T}, x::AbstractMatrix{T}, id::AbstractMatrix{S}; MAX_THREADS=256) where {T,S} - # thread_i = min(MAX_THREADS, size(id, 1)) - # thread_j = min(MAX_THREADS Γ· thread_i, size(id, 2)) - thread_j = min(MAX_THREADS, size(id, 2)) - thread_i = min(MAX_THREADS Γ· thread_j, size(id, 1)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (size(id, 1), size(id, 2)) .Γ· threads) - # println("threads:", threads) - # println("blocks:", blocks) - @cuda blocks = blocks threads = threads kernel1!(h, x, id) - return -end - -nbins = 32 -ncol = 100 -items = Int32(1e6) -hist = zeros(Float32, nbins, ncol) -Ξ΄ = rand(Float32, items, ncol) -idx = UInt8.(rand(1:nbins, items, ncol)) -# idx = Int64.(rand(1:nbins, items, ncol)) - -hist_gpu = CuArray(hist) -Ξ΄_gpu = CuArray(Ξ΄) -idx_gpu = CuArray(idx) - -hist .- Array(hist_gpu) -sum(hist) - sum(Array(hist_gpu)) - -CUDA.@time hist_gpu1!(hist_gpu, Ξ΄_gpu, idx_gpu, MAX_THREADS=1024) 
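# The removed GPU kernels above all perform the same accumulation; a serial CPU
# reference is handy for checking the GPU result. A sketch only — `hist_cpu_ref!` is a
# hypothetical helper matching the `hist`/`Ξ΄`/`idx` shapes defined above:
function hist_cpu_ref!(h::Matrix{T}, x::Matrix{T}, id::Matrix{<:Integer}) where {T}
    fill!(h, 0)
    for j in axes(id, 2), i in axes(id, 1)
        @inbounds h[id[i, j], j] += x[i, j]   # same add as the kernel, serial and race-free
    end
    return h
end
# hist_cpu_ref!(hist, Ξ΄, idx) should match Array(hist_gpu) up to Float32 summation order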
-@time hist_cpu!(hist, Ξ΄, idx) -@btime hist_cpu!($hist, $Ξ΄, $idx) -@btime CUDA.@sync hist_gpu1!($hist_gpu, $Ξ΄_gpu, $idx_gpu, MAX_THREADS=1024) -# test on view -CUDA.@time hist_gpu1!(hist_gpu, view(Ξ΄_gpu, 1:itemsΓ·2, 1:ncolΓ·2), view(idx_gpu, 1:itemsΓ·2, 1:ncolΓ·2), MAX_THREADS=1024) - -size(Ξ΄_gpu) -size(view(Ξ΄_gpu, 1:itemsΓ·2, 1:ncolΓ·2)) - -############################################################## -## Build histogram from a subsample idx -# base kernel -function kernel2!(h::CuDeviceMatrix{T}, x::CuDeviceMatrix{T}, id, 𝑖, 𝑗) where {T<:AbstractFloat} - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - if i <= length(𝑖) && j <= length(𝑗) - @inbounds k = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 𝑗[j]) - @inbounds CUDA.atomic_add!(pointer(h, k), x[𝑖[i], 𝑗[j]]) - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu2!(h::CuMatrix{T}, x::CuMatrix{T}, id, 𝑖, 𝑗; MAX_THREADS=1024) where {T<:AbstractFloat} - thread_j = min(MAX_THREADS, length(𝑗)) - thread_i = min(MAX_THREADS Γ· thread_j, length(𝑖)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (length(𝑖), length(𝑗)) ./ threads) - # println("threads:", threads) - # println("blocks:", blocks) - CUDA.@sync begin - @cuda blocks = blocks threads = threads kernel2!(h, x, id, 𝑖, 𝑗) - end - return -end - -hist = zeros(Float32, nbins, ncol) -Ξ΄ = rand(Float32, items, ncol) -idx = rand(1:nbins, items, ncol) -𝑖 = sample(1:items, items Γ· 2, replace=false, ordered=true) -𝑗 = sample(1:ncol, ncol Γ· 2, replace=false, ordered=true) -hist_gpu = CuArray(hist) -Ξ΄_gpu = CuArray(Ξ΄) -idx_gpu = CuArray(idx) -𝑖_gpu = CuArray(𝑖) -𝑗_gpu = CuArray(𝑗) - -CUDA.@time hist_gpu2!(hist_gpu, Ξ΄_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024) -@btime hist_gpu2!($hist_gpu, $Ξ΄_gpu, $idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024) - - -############################################# -# test for SVector - basic test - success! 
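# The "block built along the cols first" launch comment above amounts to a small
# integer-arithmetic rule. A runnable sketch of it — `launch_geometry` is a hypothetical
# name, not an EvoTrees API:
function launch_geometry(ni::Int, nj::Int; max_threads::Int=1024)
    tj = min(max_threads, nj)        # fill the thread budget along columns first
    ti = min(max_threads Γ· tj, ni)   # remaining budget goes to rows
    threads = (ti, tj)
    blocks = ceil.(Int, (ni, nj) ./ threads)
    return threads, blocks
end
# e.g. launch_geometry(500_000, 50) == ((20, 50), (25000, 1))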
-function kernel!(x, y) - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - if i <= length(x) - # @inbounds x[i] += y[i] - k = Base._to_linear_index(x, i) - CUDA.atomic_add!(pointer(x, k), y[i]) - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu!(x, y; MAX_THREADS=1024) - thread_i = min(MAX_THREADS, length(x)) - threads = (thread_i) - blocks = ceil.(Int, length(x) .Γ· threads) - CUDA.@sync begin - @cuda blocks = blocks threads = threads kernel!(x, y) - end - return -end - -x = rand(SVector{2,Float32}, Int(1e7)) -y = rand(SVector{2,Float32}, Int(1e7)) -x = rand(Float32, Int(1e7)) -y = rand(Float32, Int(1e7)) - -x_gpu = CuArray(x) -y_gpu = CuArray(y) - -CuArrays.@time hist_gpu!(x_gpu, y_gpu) -@btime hist_gpu!($x_gpu, $y_gpu) - - -############################################# -# test for SVector - real test -function kernelS2!(h, x, id) - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - if i <= size(id, 1) && j <= size(id, 2) - @inbounds k = Base._to_linear_index(h, id[i, j], j) - # @inbounds k = id[i,j] + 32 * (j-1) - # @inbounds CUDAnative.atomic_add!(pointer(h, k), x[i,j]) - # h[id[i,j],j] += x[i,j] - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpuS2!(h, x, id; MAX_THREADS=256) where {T} - thread_j = min(MAX_THREADS, size(id, 2)) - thread_i = min(MAX_THREADS Γ· thread_j, size(id, 1)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (size(id, 1), size(id, 2)) .Γ· threads) - println("threads:", threads) - println("blocks:", blocks) - CUDA.@sync begin - @cuda blocks = blocks threads = threads kernelS2!(h, x, id) - end - return -end - -hist = zeros(SVector{2,Float32}, nbins, ncol) -Ξ΄ = rand(SVector{2,Float32}, items, ncol) -idx = rand(1:nbins, items, ncol) -hist_gpu = CuArray(hist) -Ξ΄_gpu = CuArray(Ξ΄) -idx_gpu = CuArray(idx) - -CuArrays.@time hist_gpuS2!(hist_gpu, Ξ΄_gpu, idx_gpu) -@btime hist_gpuS2!($hist_gpu, $Ξ΄_gpu, $idx_gpu) - - -############################################################## -## Build histogram from a subsample idx -# accumulate all gradient single pass -function kernel3!(h, x, id, 𝑖, 𝑗) - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - if i <= length(𝑖) && j <= length(𝑗) - @inbounds k = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 𝑗[j], 1) - @inbounds CUDAnative.atomic_add!(pointer(h, k), x[𝑖[i], 𝑗[j], 1]) - @inbounds k = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 𝑗[j], 2) - @inbounds CUDAnative.atomic_add!(pointer(h, k), x[𝑖[i], 𝑗[j], 2]) - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu3!(h, x, id, 𝑖, 𝑗; MAX_THREADS=1024) - thread_j = min(MAX_THREADS, length(𝑗)) - thread_i = min(MAX_THREADS Γ· thread_j, length(𝑖)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (length(𝑖), length(𝑗)) ./ threads) - # println("threads:", threads) - # println("blocks:", blocks) - CuArrays.@sync begin - @cuda blocks = blocks threads = threads kernel3!(h, x, id, 𝑖, 𝑗) - end - return -end - -hist = zeros(Float32, nbins, ncol, 2) -Ξ΄ = rand(Float32, items, ncol, 2) -idx = rand(1:nbins, items, ncol) -𝑖 = sample(1:items, items Γ· 2, replace=false, ordered=true) -𝑗 = sample(1:ncol, ncol Γ· 2, replace=false, ordered=true) -hist_gpu = CuArray(hist) -Ξ΄_gpu = CuArray(Ξ΄) -idx_gpu = CuArray(idx) -𝑖_gpu = CuArray(𝑖) -𝑗_gpu = CuArray(𝑗) - 
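# CPU analogue of kernel3! above (single pass over both gradient planes); a sketch under
# the same shapes (h: nbins x ncol x 2, x: items x ncol x 2), useful as a reference:
function hist_cpu3!(h::Array{T,3}, x::Array{T,3}, id, 𝑖, 𝑗) where {T}
    fill!(h, 0)
    for j in 𝑗, i in 𝑖
        bin = id[i, j]
        @inbounds h[bin, j, 1] += x[i, j, 1]
        @inbounds h[bin, j, 2] += x[i, j, 2]
    end
    return h
end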
-CuArrays.@time hist_gpu3!(hist_gpu, Ξ΄_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024) -@btime hist_gpu3!($hist_gpu, $Ξ΄_gpu, $idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024) - - - - -# accumulate in shared memory histograms -function kernel2!(h::CuDeviceMatrix{T}, x::CuDeviceMatrix{T}, id, nbins) where {T<:AbstractFloat} - tid = threadIdx().x - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - - # shared memory on block of length nbins - # To Do: nbins cannot be passed as argument - dynamic shared memory generate kernel through macro - # shared = CUDAnative.@cuStaticSharedMem(T, 32) - # fill!(shared, 0) - # sync_threads() - - # accumulate in per block histogram - # Why is the atomic add on shared much longer than atomic on h in global mem? - if i <= size(id, 1) && j <= size(id, 2) - # should be the legit way to go - 70ms - # @inbounds CUDAnative.atomic_add!(pointer(shared, id[i,j]), x[i,j]) - - # unsafe (collisions) on shared mem: 3.0ms - # @inbounds shared[id[i,j]] = x[i,j] - - # unsafe (collisions) add on global memory - 3.6ms - # @inbounds h[id[i,j],j] += x[i,j] - - # atomic add on global hist - 3.2ms - @inbounds k = id[i, j] + nbins * (j - 1) - @inbounds CUDA.atomic_add!(pointer(h, k), x[i, j]) - end - # sync_threads() - - # if blockIdx().x == 1 - # if tid <= nbins - # CUDA.atomic_add!(pointer(h,tid), shared[tid]) - # end - # end - return -end - -# shared memory - -function hist_gpu2!(h::CuMatrix{T}, x::CuMatrix{T}, id::CuMatrix{Int}, nbins; MAX_THREADS=256) where {T<:AbstractFloat} - # thread_i = min(MAX_THREADS, size(id, 1)) - # thread_j = min(MAX_THREADS Γ· thread_i, size(id, 2)) - thread_j = min(MAX_THREADS, size(id, 2)) - thread_i = min(MAX_THREADS Γ· thread_j, size(id, 1)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (size(id, 1), size(id, 2)) ./ threads) - CUDA.@sync begin - @cuda blocks = blocks threads = threads kernel2!(h, x, id, nbins) - end - return h -end - -CuArrays.@time hist_gpu2!(hist_gpu, Ξ΄_gpu, idx_gpu, 32, MAX_THREADS=1024) -@btime hist_gpu2!($hist_gpu, $Ξ΄_gpu, $idx_gpu, 32, MAX_THREADS=1024) -@device_code_warntype hist_gpu2!(hist_gpu, Ξ΄_gpu, idx_gpu, 32, MAX_THREADS=1024) - - - - -###################################### -# Appoach 1 -###################################### -# GPU - apply along the features axis -function kernel!(h::CuDeviceMatrix{T}, x::CuDeviceVector{T}, id, 𝑖, 𝑗) where {T<:AbstractFloat} - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - if i <= length(𝑖) && j <= length(𝑗) - @inbounds k = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 𝑗[j]) - @inbounds CUDA.atomic_add!(pointer(h, k), x[𝑖[i]]) - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu!(h::CuMatrix{T}, x::CuVector{T}, id, 𝑖, 𝑗; MAX_THREADS=1024) where {T<:AbstractFloat} - thread_j = min(MAX_THREADS, length(𝑗)) - thread_i = min(MAX_THREADS Γ· thread_j, length(𝑖)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (length(𝑖), length(𝑗)) ./ threads) - @cuda blocks = blocks threads = threads kernel!(h, x, id, 𝑖, 𝑗) - return -end - -hist = zeros(Float32, nbins, ncol) -Ξ΄ = rand(Float32, items) -idx = rand(1:nbins, items, ncol) -𝑖 = sample(1:items, items Γ· 2, replace=false, ordered=true) -𝑗 = sample(1:ncol, ncol Γ· 2, replace=false, ordered=true) -hist_gpu = CuArray(hist) -Ξ΄_gpu = CuArray(Ξ΄) -idx_gpu = CuArray(idx) -𝑖_gpu = CuArray(𝑖) -𝑗_gpu = CuArray(𝑗) - -CUDA.@time hist_gpu!(hist_gpu, Ξ΄_gpu, 
idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024) -@btime hist_gpu!($hist_gpu, $Ξ΄_gpu, $idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024) - - -###################################### -# Appoach 2 - Loop for assigning command grad to appropriate bin per column -# Idea: exploit the fact that there's a single grad per row: take that grad and add it to each column bin -###################################### -# GPU - apply along the features axis -function kernel!(h::CuDeviceMatrix{T}, x::CuDeviceVector{T}, id, 𝑖, 𝑗) where {T<:AbstractFloat} - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - if i <= length(𝑖) && j <= length(𝑗) - @inbounds k = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 𝑗[j]) - @inbounds CUDAnative.atomic_add!(pointer(h, k), x[𝑖[i]]) - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu!(h::CuMatrix{T}, x::CuVector{T}, id::CuMatrix{UInt8}, 𝑖, 𝑗; MAX_THREADS=1024) where {T<:AbstractFloat} - thread_j = min(MAX_THREADS, length(𝑗)) - thread_i = min(MAX_THREADS Γ· thread_j, length(𝑖)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (length(𝑖), length(𝑗)) ./ threads) - @cuda blocks = blocks threads = threads kernel!(h, x, id, 𝑖, 𝑗) - return -end - - - - -using CUDA -using BenchmarkTools -N1 = Int(2^12) -x1 = rand(Float32, N1); -x2 = rand(Float32, N1); -x1g = CuArray(x1); -x2g = CuArray(x2); - -function dot_atomic!(x, y, z) - idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x - if idx <= length(x) - CUDA.atomic_add!(pointer(z, 1), x[idx] * y[idx]) - end - return nothing -end - -function bench_dot_atomic!(x, y, z, threads) - numblocks = ceil(Int, N1 / threads) - @cuda threads = threads blocks = numblocks dot_atomic!(x, y, z) -end - -threads = 512 -numblocks = ceil(Int, N1 / threads) -z0 = CUDA.zeros(1) -# @cuda threads=gthreads blocks=numblocks dot_atomic!(x1g, x2g, z0) -@btime CUDA.@sync bench_dot_atomic!($x1g, $x2g, $z0, threads) -# 17.323 ms (50 allocations: 1.67 KiB) - -function dot_share!(x::CuDeviceVector{T}, y::CuDeviceVector{T}, z::CuDeviceVector{T}) where {T} - - tid = threadIdx().x - idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x - shared = CUDA.@cuStaticSharedMem(T, 64) - fill!(shared, 0) - sync_threads() - - if idx <= length(x) - @inbounds shared[tid] = x[idx] * y[idx] - end - sync_threads() - - i = blockDim().x Γ· 2 - while i > 0 - if tid <= i - @inbounds shared[tid] += shared[tid+i] # valid non atomic operation - # CUDA.atomic_add!(pointer(shared, tid), shared[tid+i]) # invalid op - results in error - end - sync_threads() - i Γ·= 2 - end - if tid == 1 - CUDA.atomic_add!(pointer(z, 1), shared[1]) - end - return nothing -end - -function bench_dot_share!(x, y, z, threads, numblocks) - CUDA.@sync @cuda threads = threads blocks = numblocks dot_share!(x, y, z) - return z -end - -function wrap_share(x, y, threads) - numblocks = ceil(Int, N1 / threads) - z = CUDA.zeros(1) - x = bench_dot_share!(x, y, z, threads, numblocks) - return x -end - -threads = 64 -numblocks = ceil(Int, N1 / threads) -z = CUDA.zeros(1) -@cuda threads = threads blocks = numblocks dot_share!(x1g, x2g, z) -@time CUDA.@sync wrap_share(x1g, x2g, threads) -@btime CUDA.@sync wrap_share($x1g, $x2g, threads) -x = CUDA.@sync wrap_share(x1g, x2g, threads) -x1g' * x2g -@btime x1g' * x2g -@btime x1' * x2 - - -function dot_share2!(x::CuDeviceVector{T}, z::CuDeviceVector{T}) where {T} - - tid = threadIdx().x - bid = blockIdx().x - idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x - 
shared = CUDA.@cuStaticSharedMem(T, 128) - fill!(shared, 0.0f0) - sync_threads() - - # if idx <= length(x) - # shared[tid] = x[idx] * y[idx] - # end - # sync_threads() - - i = blockDim().x Γ· 2 - while i > 0 - # if tid <= i - if tid == 1 - # shared[tid] += shared[tid + i] # valid non atomic operation - CUDA.atomic_add!(pointer(shared, tid), shared[tid+1]) # invalid op - results in error - end - sync_threads() - i Γ·= 2 - end - if tid == 1 - z[bid] += shared[1] - end - return nothing -end - -function bench_dot_share2!(x, z, threads, numblocks) - CUDA.@sync @cuda threads = threads blocks = numblocks dot_share2!(x, z) - return sum(z) -end - -function wrap_share2(x, threads) - numblocks = ceil(Int, N1 / threads) - z = CUDA.zeros(numblocks) - # x = bench_dot_share2!(x, z, threads, numblocks) - CUDA.@sync @cuda threads = threads blocks = numblocks dot_share2!(x, z) - return (x) -end - -threads = 128 -numblocks = ceil(Int, N1 / threads) -z = CUDA.zeros(numblocks) -sum(z) -x1g' * x2g -CUDA.@sync @cuda threads = threads blocks = numblocks dot_share2!(x1g, x2g, z) -@btime CUDA.@sync wrap_share2($x1g, $x2g, threads) -x = CUDA.@sync wrap_share2(x1g, x2g, threads) - -@btime x1g' * x2g - - -using CUDA -function kernel(x) - shared = @cuStaticSharedMem(Float32, 2) - fill!(shared, 1.0f0) - sync_threads() - # @atomic shared[threadIdx().x] += 0f0 - tid = threadIdx().x - CUDA.atomic_add!(pointer(shared, tid), shared[tid+1]) - CUDA.atomic_add!(pointer(x, 1), shared[1]) - return -end - -x = CUDA.zeros(1) -@cuda kernel(x) -synchronize() \ No newline at end of file diff --git a/experiments/hist/hist_gpu share-v2.jl b/experiments/hist/hist_gpu share-v2.jl index 7ec98e55..bee557b3 100644 --- a/experiments/hist/hist_gpu share-v2.jl +++ b/experiments/hist/hist_gpu share-v2.jl @@ -1,6 +1,6 @@ using Revise using CUDA -using StaticArrays +# using StaticArrays using StatsBase: sample using BenchmarkTools @@ -10,10 +10,6 @@ using BenchmarkTools # - each block build histogram for many features -> (k, j) # - ################################################ - -function agg_share() -end - # base kernel function kernel_share_1!(h::CuDeviceArray{T,3}, βˆ‡, x_bin, is) where {T} @@ -69,19 +65,19 @@ function hist_share_1!(h::AbstractArray{T,3}, x::AbstractMatrix{T}, id::Abstract return end -nbins = 32 +nbins = 64 nfeats = 100 -nobs = Int32(1e6) +nobs = Int(1e6) hist = zeros(Float32, 3, nbins, ncol) -Ξ΄ = rand(Float32, items, 3) +βˆ‡ = rand(Float32, nobs, 3) # idx = Int64.(rand(1:nbins, items, ncol)) -idx = UInt8.(rand(1:nbins, items, ncol)) +is = UInt8.(rand(1:nbins, nobs, ncol)) hist_gpu = CuArray(hist) -Ξ΄_gpu = CuArray(Ξ΄) +βˆ‡_gpu = CuArray(βˆ‡) idx_gpu = CuArray(idx) -@time hist_share_1!(hist, Ξ΄, idx) -@btime hist_share_1!(hist, Ξ΄, idx) -@CUDA.time hist_share_1!(hist_gpu, Ξ΄_gpu, idx_gpu, MAX_THREADS=128) -@btime CUDA.@sync hist_share_1!($hist_gpu, $Ξ΄_gpu, $idx_gpu, MAX_THREADS=128) +@time hist_share_1!(hist, βˆ‡, idx) +@btime hist_share_1!(hist, βˆ‡, idx) +@CUDA.time hist_share_1!(hist_gpu, βˆ‡_gpu, idx_gpu, MAX_THREADS=128) +@btime CUDA.@sync hist_share_1!($hist_gpu, $βˆ‡_gpu, $idx_gpu, MAX_THREADS=128) diff --git a/experiments/hist/hist_gpu share.jl b/experiments/hist/hist_gpu share.jl index ae90f161..4477274f 100644 --- a/experiments/hist/hist_gpu share.jl +++ b/experiments/hist/hist_gpu share.jl @@ -1,872 +1,88 @@ using Revise using CUDA -using StaticArrays using StatsBase: sample using BenchmarkTools -function hist_cpu_1!(hist, Ξ΄, idx) - Threads.@threads for j in 1:size(idx, 2) - for i in 1:size(idx, 1) - @inbounds 
hist[idx[i,j], j] += Ξ΄[i,1] - end - end - return -end - -function hist_cpu_2!(h1::Matrix{T}, h2::Matrix{T}, hw::Matrix{T}, - Ξ΄1::Vector{T}, Ξ΄2::Vector{T}, w::Vector{T}, idx::Matrix{UInt8}) where {T} - Threads.@threads for j in 1:size(idx, 2) - @inbounds for i in 1:size(idx, 1) - @inbounds h1[idx[i,j], j] += Ξ΄1[i] - @inbounds h2[idx[i,j], j] += Ξ΄2[i] - @inbounds hw[idx[i,j], j] += w[i] - end - end - return -end - - -function hist_cpu_3!(h1::Matrix{T}, h2::Matrix{T}, hw::Matrix{T}, - Ξ΄1::Vector{T}, Ξ΄2::Vector{T}, 𝑀::Vector{T}, idx::Matrix{UInt8}, 𝑖, 𝑗) where {T} - - @inbounds Threads.@threads for j in 𝑗 - @inbounds for i in 𝑖 - h1[idx[i,j], j] += Ξ΄1[i] - h2[idx[i,j], j] += Ξ΄2[i] - hw[idx[i,j], j] += 𝑀[i] - end - end - return -end - # base kernel -function kernel_s4!(h::CuDeviceArray{T,3}, x::CuDeviceMatrix{T}, xid::CuDeviceMatrix{S}) where {T,S} - +function kernel_s4!(h::CuDeviceArray{T,3}, βˆ‡::CuDeviceMatrix{T}, x_bin::CuDeviceMatrix{S}) where {T,S} + nbins = size(h, 2) - it, jt = threadIdx().x, threadIdx().y + it, jt, kt = threadIdx().x, threadIdx().y, threadIdx().z ib, jb = blockIdx().x, blockIdx().y id, jd = blockDim().x, blockDim().y ig, jg = gridDim().x, gridDim().y j = jt + (jb - 1) * jd - - shared = @cuDynamicSharedMem(T, 3 * nbins) - fill!(shared, 0) - sync_threads() - - i_tot = size(x, 1) - iter = 0 - while iter * id * ig < i_tot - i = it + id * (ib - 1) + iter * id * ig - if i <= size(xid, 1) && j <= size(xid, 2) - # depends on shared to be assigned to a single feature - k = 3 * (xid[i, j] - 1) - @inbounds CUDA.atomic_add!(pointer(shared, k + 1), x[i, 1]) - @inbounds CUDA.atomic_add!(pointer(shared, k + 2), x[i, 2]) - @inbounds CUDA.atomic_add!(pointer(shared, k + 3), x[i, 3]) - end - iter += 1 - end - sync_threads() - # loop to cover cases where nbins > nthreads - for iter in 1:(nbins - 1) Γ· id + 1 - bin_id = it + id * (iter - 1) - if bin_id <= nbins - @inbounds k = Base._to_linear_index(h, 1, bin_id, j) - @inbounds CUDA.atomic_add!(pointer(h, k), shared[3 * (bin_id - 1) + 1]) - @inbounds CUDA.atomic_add!(pointer(h, k + 1), shared[3 * (bin_id - 1) + 2]) - @inbounds CUDA.atomic_add!(pointer(h, k + 2), shared[3 * (bin_id - 1) + 3]) - end - end - # sync_threads() - return nothing -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu_s4!(h::AbstractArray{T,3}, x::AbstractMatrix{T}, id::AbstractMatrix{S}; MAX_THREADS=256) where {T,S} - thread_i = min(MAX_THREADS, size(id, 1)) - thread_j = 1 - threads = (thread_i, thread_j) - blocks = ceil.(Int, (16, size(id, 2))) - fill!(h, 0) - @cuda blocks = blocks threads = threads shmem = sizeof(T) * size(h, 2) * 3 kernel_s4!(h, x, id) - return -end - -# function hist_cpu_2!(hist, Ξ΄, idx) -# Threads.@threads for j in 1:size(idx, 2) -# for i in 1:size(idx, 1) -# bin = idx[i,j] -# @inbounds hist[1, bin, j] += Ξ΄[i,1] -# @inbounds hist[2, bin, j] += Ξ΄[i,2] -# @inbounds hist[3, bin, j] += Ξ΄[i,3] -# end -# end -# return -# end - -nbins = 64 -ncol = 100 -items = Int32(1e6) -hist = zeros(Float32, 3, nbins, ncol) -Ξ΄ = rand(Float32, items, 3) -# idx = Int64.(rand(1:nbins, items, ncol)) -idx = UInt8.(rand(1:nbins, items, ncol)) - -hist_gpu = CuArray(hist) -Ξ΄_gpu = CuArray(Ξ΄) -idx_gpu = CuArray(idx) - -@time hist_cpu_2!(hist, Ξ΄, idx) -@btime hist_cpu_2!(hist, Ξ΄, idx) -@CUDA.time hist_gpu_s4!(hist_gpu, Ξ΄_gpu, idx_gpu, MAX_THREADS=128) -@btime CUDA.@sync hist_gpu_s4!($hist_gpu, $Ξ΄_gpu, $idx_gpu, MAX_THREADS=128) - - -# base kernel -function kernel_s5!(hΞ΄1::CuDeviceArray{T,3}, 
hΞ΄2::CuDeviceArray{T,3}, h𝑀::CuDeviceMatrix{T}, Ξ΄1::CuDeviceMatrix{T}, Ξ΄2::CuDeviceMatrix{T}, 𝑀::CuDeviceVector{T}, xid::CuDeviceMatrix{S}) where {T,S} - - nbins = size(h𝑀, 1) - it, jt = threadIdx().x, threadIdx().y - ib, jb = blockIdx().x, blockIdx().y - id, jd = blockDim().x, blockDim().y - ig, jg = gridDim().x, gridDim().y - j = jt + (jb - 1) * jd - shared = @cuDynamicSharedMem(T, 3 * nbins) fill!(shared, 0) sync_threads() - i_tot = size(𝑀, 1) + i_tot = size(x_bin, 1) iter = 0 while iter * id * ig < i_tot i = it + id * (ib - 1) + iter * id * ig - if i <= size(xid, 1) && j <= size(xid, 2) + if i <= size(x_bin, 1) && j <= size(x_bin, 2) # depends on shared to be assigned to a single feature - k = 3 * (xid[i, j] - 1) - @inbounds CUDA.atomic_add!(pointer(shared, k + 1), Ξ΄1[i, 1]) - @inbounds CUDA.atomic_add!(pointer(shared, k + 2), Ξ΄2[i, 1]) - @inbounds CUDA.atomic_add!(pointer(shared, k + 3), 𝑀[i]) + k = 3 * (x_bin[i, j] - 1) + @inbounds CUDA.atomic_add!(pointer(shared, k + kt), βˆ‡[i, kt]) + # @inbounds CUDA.atomic_add!(pointer(shared, k + 1), βˆ‡[i, 1]) + # @inbounds CUDA.atomic_add!(pointer(shared, k + 2), βˆ‡[i, 2]) + # @inbounds CUDA.atomic_add!(pointer(shared, k + 3), βˆ‡[i, 3]) end iter += 1 end sync_threads() # loop to cover cases where nbins > nthreads - for iter in 1:(nbins - 1) Γ· id + 1 + for iter in 1:(nbins-1)Γ·id+1 bin_id = it + id * (iter - 1) if bin_id <= nbins - @inbounds k = Base._to_linear_index(hΞ΄1, 1, bin_id, j) - @inbounds CUDA.atomic_add!(pointer(hΞ΄1, k), shared[3 * (bin_id - 1) + 1]) - @inbounds CUDA.atomic_add!(pointer(hΞ΄2, k), shared[3 * (bin_id - 1) + 2]) - @inbounds CUDA.atomic_add!(pointer(h𝑀, k), shared[3 * (bin_id - 1) + 3]) + @inbounds k = Base._to_linear_index(h, 1, bin_id, j) - 1 + @inbounds CUDA.atomic_add!(pointer(h, k + kt), shared[3*(bin_id-1)+kt]) + # @inbounds CUDA.atomic_add!(pointer(h, k), shared[3*(bin_id-1)+1]) + # @inbounds CUDA.atomic_add!(pointer(h, k + 1), shared[3*(bin_id-1)+2]) + # @inbounds CUDA.atomic_add!(pointer(h, k + 2), shared[3*(bin_id-1)+3]) end end - # sync_threads() - return nothing -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu_s5!(hΞ΄1::AbstractArray{T,3}, hΞ΄2::AbstractArray{T,3}, h𝑀::AbstractMatrix{T}, - Ξ΄1::AbstractMatrix{T}, Ξ΄2::AbstractMatrix{T}, 𝑀::AbstractVector{T}, - id::AbstractMatrix{S}; MAX_THREADS=256) where {T,S} - thread_i = min(MAX_THREADS, size(id, 1)) - thread_j = 1 - threads = (thread_i, thread_j) - blocks = ceil.(Int, (16, size(id, 2))) - fill!(hΞ΄1, 0) - fill!(hΞ΄2, 0) - fill!(h𝑀, 0) - @cuda blocks = blocks threads = threads shmem = sizeof(T) * size(h𝑀, 1) * 3 kernel_s5!(hΞ΄1, hΞ΄2, h𝑀, Ξ΄1, Ξ΄2, 𝑀, id) - return -end - -nbins = 32 -ncol = 100 -items = Int32(1e6) - -Ξ΄1 = CUDA.rand(Float32, items, 1) -Ξ΄2 = CUDA.rand(Float32, items, 1) -𝑀 = CUDA.rand(Float32, items) -idx = UInt8.(rand(1:nbins, items, ncol)) - -hΞ΄1 = CUDA.zeros(1, nbins, ncol) -hΞ΄2 = CUDA.zeros(1, nbins, ncol) -h𝑀 = CUDA.zeros(nbins, ncol) -idx_gpu = CuArray(idx) - -@CUDA.time hist_gpu_s5!(hΞ΄1, hΞ΄2, h𝑀, Ξ΄1, Ξ΄2, 𝑀, idx_gpu, MAX_THREADS=128) -@btime CUDA.@sync hist_gpu_s5!($hΞ΄1, $hΞ΄2, $h𝑀, $Ξ΄1, $Ξ΄2, $𝑀, $idx_gpu, MAX_THREADS=128) - - -# base kernel -function kernel_s6!(hΞ΄1::CuDeviceArray{T,3}, hΞ΄2::CuDeviceArray{T,3}, h𝑀::CuDeviceMatrix{T}, Ξ΄1::CuDeviceMatrix{T}, Ξ΄2::CuDeviceMatrix{T}, 𝑀::CuDeviceVector{T}, xid::CuDeviceMatrix{S}, 𝑖, 𝑗) where {T,S} - - nbins = size(h𝑀, 1) - it, jt = threadIdx().x, threadIdx().y - ib, jb = blockIdx().x, blockIdx().y - id, jd = 
blockDim().x, blockDim().y - ig, jg = gridDim().x, gridDim().y - j = jt + (jb - 1) * jd - - shared = @cuDynamicSharedMem(T, 3 * nbins) - fill!(shared, 0) sync_threads() - - i_tot = size(𝑀, 1) - iter = 0 - while iter * id * ig < i_tot - i = it + id * (ib - 1) + iter * id * ig - if i <= length(𝑖) && j <= length(𝑗) - # depends on shared to be assigned to a single feature - i_idx = 𝑖[i] - k = 3 * (xid[i_idx, 𝑗[j]] - 1) - @inbounds CUDA.atomic_add!(pointer(shared, k + 1), Ξ΄1[i_idx, 1]) - @inbounds CUDA.atomic_add!(pointer(shared, k + 2), Ξ΄2[i_idx, 1]) - @inbounds CUDA.atomic_add!(pointer(shared, k + 3), 𝑀[i_idx]) - end - iter += 1 - end - sync_threads() - # loop to cover cases where nbins > nthreads - for iter in 1:(nbins - 1) Γ· id + 1 - bin_id = it + id * (iter - 1) - if bin_id <= nbins - @inbounds k = Base._to_linear_index(hΞ΄1, 1, bin_id, 𝑗[j]) - @inbounds CUDA.atomic_add!(pointer(hΞ΄1, k), shared[3 * (bin_id - 1) + 1]) - @inbounds CUDA.atomic_add!(pointer(hΞ΄2, k), shared[3 * (bin_id - 1) + 2]) - @inbounds CUDA.atomic_add!(pointer(h𝑀, k), shared[3 * (bin_id - 1) + 3]) - end - end - # sync_threads() return nothing end # base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu_s6!(hΞ΄1::AbstractArray{T,3}, hΞ΄2::AbstractArray{T,3}, h𝑀::AbstractMatrix{T}, - Ξ΄1::AbstractMatrix{T}, Ξ΄2::AbstractMatrix{T}, 𝑀::AbstractVector{T}, - id::AbstractMatrix{S}, 𝑖, 𝑗; MAX_THREADS=256) where {T,S} - thread_i = min(MAX_THREADS, size(id, 1)) +function hist_gpu_s4!(h::AbstractArray{T,3}, βˆ‡::AbstractMatrix{T}, x_bin::AbstractMatrix{S}; MAX_THREADS=256) where {T,S} + thread_i = min(MAX_THREADS, size(x_bin, 1)) thread_j = 1 - threads = (thread_i, thread_j) - blocks = ceil.(Int, (16, size(id, 2))) - fill!(hΞ΄1, 0) - fill!(hΞ΄2, 0) - fill!(h𝑀, 0) - @cuda blocks = blocks threads = threads shmem = sizeof(T) * size(h𝑀, 1) * 3 kernel_s6!(hΞ΄1, hΞ΄2, h𝑀, Ξ΄1, Ξ΄2, 𝑀, id, 𝑖, 𝑗) - return -end - -nbins = 32 -ncol = 100 -items = Int32(1e6) - -Ξ΄1 = CUDA.rand(Float32, items, 1) -Ξ΄2 = CUDA.rand(Float32, items, 1) -𝑀 = CUDA.rand(Float32, items) -idx = UInt8.(rand(1:nbins, items, ncol)) - -hΞ΄1 = CUDA.zeros(1, nbins, ncol) -hΞ΄2 = CUDA.zeros(1, nbins, ncol) -h𝑀 = CUDA.zeros(nbins, ncol) -idx_gpu = CuArray(idx) - -𝑖 = CuArray(UInt32.(1:items)) -𝑗 = CuArray(UInt32.(1:ncol)) - -hΞ΄1_cpu = reshape(Array(hΞ΄1), size(hΞ΄1)[2:3]) -hΞ΄2_cpu = reshape(Array(hΞ΄2), size(hΞ΄2)[2:3]) -h𝑀_cpu = Array(h𝑀) -Ξ΄1_cpu = reshape(Array(Ξ΄1), :) -Ξ΄2_cpu = reshape(Array(Ξ΄2), :) -𝑀_cpu = Array(𝑀) -𝑖_cpu = Array(𝑖) -𝑗_cpu = Array(𝑗) -idx_cpu = Array(idx_gpu) - -@time hist_cpu_3!(hΞ΄1_cpu, hΞ΄2_cpu, h𝑀_cpu, Ξ΄1_cpu, Ξ΄2_cpu, 𝑀_cpu, idx_cpu, 𝑖_cpu, 𝑗_cpu) -@btime hist_cpu_3!(hΞ΄1_cpu, hΞ΄2_cpu, h𝑀_cpu, Ξ΄1_cpu, Ξ΄2_cpu, 𝑀_cpu, idx_cpu, 𝑖_cpu, 𝑗_cpu) - -@CUDA.time hist_gpu_s6!(hΞ΄1, hΞ΄2, h𝑀, Ξ΄1, Ξ΄2, 𝑀, idx_gpu, 𝑖, 𝑗, MAX_THREADS=128) -@btime CUDA.@sync hist_gpu_s6!($hΞ΄1, $hΞ΄2, $h𝑀, $Ξ΄1, $Ξ΄2, $𝑀, $idx_gpu, $𝑖, $𝑗, MAX_THREADS=128) - - - -function update_hist_cpu!(hist_Ξ΄::Matrix{SVector{L,T}}, hist_δ²::Matrix{SVector{L,T}}, hist_𝑀::Matrix{SVector{1,T}}, - Ξ΄::Vector{SVector{L,T}}, δ²::Vector{SVector{L,T}}, 𝑀::Vector{SVector{1,T}}, - X_bin, 𝑖, 𝑗) where {L,T,S} - - hist_Ξ΄ .*= 0.0 - hist_δ² .*= 0.0 - hist_𝑀 .*= 0.0 - - @inbounds Threads.@threads for j in 𝑗 - @inbounds for i in 𝑖 - hist_Ξ΄[X_bin[i,j], j] += Ξ΄[i] - hist_δ²[X_bin[i,j], j] += δ²[i] - hist_𝑀[X_bin[i,j], j] += 𝑀[i] - end - end -end - -hΞ΄1_cpu = SVector.(hΞ΄1_cpu) -hΞ΄2_cpu = SVector.(hΞ΄2_cpu) -h𝑀_cpu = SVector.(h𝑀_cpu) -Ξ΄1_cpu = 
SVector.(Ξ΄1_cpu) -Ξ΄2_cpu = SVector.(Ξ΄2_cpu) -𝑀_cpu = SVector.(𝑀_cpu) -# 𝑖_cpu = Array(𝑖) -# 𝑗_cpu = Array(𝑗) -# idx_cpu = Array(idx_gpu) - -@time update_hist_cpu!(hΞ΄1_cpu, hΞ΄2_cpu, h𝑀_cpu, Ξ΄1_cpu, Ξ΄2_cpu, 𝑀_cpu, idx_cpu, 𝑖_cpu, 𝑗_cpu) -@btime update_hist_cpu!(hΞ΄1_cpu, hΞ΄2_cpu, h𝑀_cpu, Ξ΄1_cpu, Ξ΄2_cpu, 𝑀_cpu, idx_cpu, 𝑖_cpu, 𝑗_cpu) -@btime update_hist_cpu!($hΞ΄1_cpu, $hΞ΄2_cpu, $h𝑀_cpu, $Ξ΄1_cpu, $Ξ΄2_cpu, $𝑀_cpu, $idx_cpu, $𝑖_cpu, $𝑗_cpu) - - -############################################################## -## Build histogram from a subsample idx -# base kernel -function kernel2!(h::CuDeviceMatrix{T}, x::CuDeviceMatrix{T}, id, 𝑖, 𝑗) where {T <: AbstractFloat} - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - if i <= length(𝑖) && j <= length(𝑗) - @inbounds k = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 𝑗[j]) - @inbounds CUDA.atomic_add!(pointer(h, k), x[𝑖[i], 𝑗[j]]) - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu2!(h::CuMatrix{T}, x::CuMatrix{T}, id, 𝑖, 𝑗; MAX_THREADS=1024) where {T <: AbstractFloat} - thread_j = min(MAX_THREADS, length(𝑗)) - thread_i = min(MAX_THREADS Γ· thread_j, length(𝑖)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (length(𝑖), length(𝑗)) ./ threads) - # println("threads:", threads) - # println("blocks:", blocks) - CUDA.@sync begin - @cuda blocks = blocks threads = threads kernel2!(h, x, id, 𝑖, 𝑗) - end - return -end - -hist = zeros(Float32, nbins, ncol) -Ξ΄ = rand(Float32, items, ncol) -idx = rand(1:nbins, items, ncol) -𝑖 = sample(1:items, items Γ· 2, replace=false, ordered=true) -𝑗 = sample(1:ncol, ncol Γ· 2, replace=false, ordered=true) -hist_gpu = CuArray(hist) -Ξ΄_gpu = CuArray(Ξ΄) -idx_gpu = CuArray(idx) -𝑖_gpu = CuArray(𝑖) -𝑗_gpu = CuArray(𝑗) - -@CUDA.time hist_gpu2!(hist_gpu, Ξ΄_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024) -@btime hist_gpu2!($hist_gpu, $Ξ΄_gpu, $idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024) - - -############################################# -# test for SVector - basic test - success! 
-function kernel!(x, y) - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - if i <= length(x) - # @inbounds x[i] += y[i] - k = Base._to_linear_index(x, i) - CUDA.atomic_add!(pointer(x, k), y[i]) - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu!(x, y; MAX_THREADS=1024) - thread_i = min(MAX_THREADS, length(x)) - threads = (thread_i) - blocks = ceil.(Int, length(x) .Γ· threads) - CUDA.@sync begin - @cuda blocks = blocks threads = threads kernel!(x, y) - end - return -end - -x = rand(SVector{2,Float32}, Int(1e7)) -y = rand(SVector{2,Float32}, Int(1e7)) -x = rand(Float32, Int(1e7)) -y = rand(Float32, Int(1e7)) - -x_gpu = CuArray(x) -y_gpu = CuArray(y) - -@CuArrays.time hist_gpu!(x_gpu, y_gpu) -@btime hist_gpu!($x_gpu, $y_gpu) - - -############################################# -# test for SVector - real test -function kernelS2!(h, x, id) - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - if i <= size(id, 1) && j <= size(id, 2) - @inbounds k = Base._to_linear_index(h, id[i,j], j) - # @inbounds k = id[i,j] + 32 * (j-1) - # @inbounds CUDAnative.atomic_add!(pointer(h, k), x[i,j]) - # h[id[i,j],j] += x[i,j] - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpuS2!(h, x, id; MAX_THREADS=256) where {T} - thread_j = min(MAX_THREADS, size(id, 2)) - thread_i = min(MAX_THREADS Γ· thread_j, size(id, 1)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (size(id, 1), size(id, 2)) .Γ· threads) - println("threads:", threads) - println("blocks:", blocks) - CUDA.@sync begin - @cuda blocks = blocks threads = threads kernelS2!(h, x, id) - end - return -end - -hist = zeros(SVector{2,Float32}, nbins, ncol) -Ξ΄ = rand(SVector{2,Float32}, items, ncol) -idx = rand(1:nbins, items, ncol) -hist_gpu = CuArray(hist) -Ξ΄_gpu = CuArray(Ξ΄) -idx_gpu = CuArray(idx) - -@CuArrays.time hist_gpuS2!(hist_gpu, Ξ΄_gpu, idx_gpu) -@btime hist_gpuS2!($hist_gpu, $Ξ΄_gpu, $idx_gpu) - - -############################################################## -## Build histogram from a subsample idx -# accumulate all gradient single pass -function kernel3!(h, x, id, 𝑖, 𝑗) - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - if i <= length(𝑖) && j <= length(𝑗) - @inbounds k = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 𝑗[j], 1) - @inbounds CUDAnative.atomic_add!(pointer(h, k), x[𝑖[i], 𝑗[j], 1]) - @inbounds k = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 𝑗[j], 2) - @inbounds CUDAnative.atomic_add!(pointer(h, k), x[𝑖[i], 𝑗[j], 2]) - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu3!(h, x, id, 𝑖, 𝑗; MAX_THREADS=1024) - thread_j = min(MAX_THREADS, length(𝑗)) - thread_i = min(MAX_THREADS Γ· thread_j, length(𝑖)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (length(𝑖), length(𝑗)) ./ threads) - # println("threads:", threads) - # println("blocks:", blocks) - CuArrays.@sync begin - @cuda blocks = blocks threads = threads kernel3!(h, x, id, 𝑖, 𝑗) - end - return -end - -hist = zeros(Float32, nbins, ncol, 2) -Ξ΄ = rand(Float32, items, ncol, 2) -idx = rand(1:nbins, items, ncol) -𝑖 = sample(1:items, items Γ· 2, replace=false, ordered=true) -𝑗 = sample(1:ncol, ncol Γ· 2, replace=false, ordered=true) -hist_gpu = CuArray(hist) -Ξ΄_gpu = CuArray(Ξ΄) -idx_gpu = CuArray(idx) -𝑖_gpu = CuArray(𝑖) -𝑗_gpu = CuArray(𝑗) - 
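# The CPU baselines deleted above (hist_cpu_2!/hist_cpu_3!) all rely on one trick:
# threading over columns means no two threads ever write to the same histogram column,
# so no atomics are needed. A compact sketch of that pattern, under the shapes assumed
# above (one gradient value per row, bins stored as UInt8):
using Base.Threads: @threads
function hist_cpu_threaded!(h::Matrix{T}, x::Vector{T}, id::Matrix{UInt8}) where {T}
    fill!(h, 0)
    @threads for j in axes(id, 2)        # one column per task: writes never collide
        for i in axes(id, 1)
            @inbounds h[id[i, j], j] += x[i]
        end
    end
    return h
end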
-@CuArrays.time hist_gpu3!(hist_gpu, Ξ΄_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024) -@btime hist_gpu3!($hist_gpu, $Ξ΄_gpu, $idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024) - - - - -# accumulate in shared memory histograms -function kernel2!(h::CuDeviceMatrix{T}, x::CuDeviceMatrix{T}, id, nbins) where {T <: AbstractFloat} - tid = threadIdx().x - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - - # shared memory on block of length nbins - # To Do: nbins cannot be passed as argument - dynamic shared memory generate kernel through macro - # shared = CUDAnative.@cuStaticSharedMem(T, 32) - # fill!(shared, 0) - # sync_threads() - - # accumulate in per block histogram - # Why is the atomic add on shared much longer than atomic on h in global mem? - if i <= size(id, 1) && j <= size(id, 2) - # should be the legit way to go - 70ms - # @inbounds CUDAnative.atomic_add!(pointer(shared, id[i,j]), x[i,j]) - - # unsafe (collisions) on shared mem: 3.0ms - # @inbounds shared[id[i,j]] = x[i,j] - - # unsafe (collisions) add on global memory - 3.6ms - # @inbounds h[id[i,j],j] += x[i,j] - - # atomic add on global hist - 3.2ms - @inbounds k = id[i,j] + nbins * (j - 1) - @inbounds CUDA.atomic_add!(pointer(h, k), x[i,j]) - end - # sync_threads() - - # if blockIdx().x == 1 - # if tid <= nbins - # CUDA.atomic_add!(pointer(h,tid), shared[tid]) - # end - # end - return -end - -# shared memory - -function hist_gpu2!(h::CuMatrix{T}, x::CuMatrix{T}, id::CuMatrix{Int}, nbins; MAX_THREADS=256) where {T <: AbstractFloat} - # thread_i = min(MAX_THREADS, size(id, 1)) - # thread_j = min(MAX_THREADS Γ· thread_i, size(id, 2)) - thread_j = min(MAX_THREADS, size(id, 2)) - thread_i = min(MAX_THREADS Γ· thread_j, size(id, 1)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (size(id, 1), size(id, 2)) ./ threads) - CUDA.@sync begin - @cuda blocks = blocks threads = threads kernel2!(h, x, id, nbins) - end - return h -end - -@CuArrays.time hist_gpu2!(hist_gpu, Ξ΄_gpu, idx_gpu, 32, MAX_THREADS=1024) -@btime hist_gpu2!($hist_gpu, $Ξ΄_gpu, $idx_gpu, 32, MAX_THREADS=1024) -@device_code_warntype hist_gpu2!(hist_gpu, Ξ΄_gpu, idx_gpu, 32, MAX_THREADS=1024) - - - - -###################################### -# Appoach 1 -###################################### -# GPU - apply along the features axis -function kernel!(h::CuDeviceMatrix{T}, x::CuDeviceVector{T}, id, 𝑖, 𝑗) where {T <: AbstractFloat} - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - if i <= length(𝑖) && j <= length(𝑗) - @inbounds k = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 𝑗[j]) - @inbounds CUDA.atomic_add!(pointer(h, k), x[𝑖[i]]) - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu!(h::CuMatrix{T}, x::CuVector{T}, id, 𝑖, 𝑗; MAX_THREADS=1024) where {T <: AbstractFloat} - thread_j = min(MAX_THREADS, length(𝑗)) - thread_i = min(MAX_THREADS Γ· thread_j, length(𝑖)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (length(𝑖), length(𝑗)) ./ threads) - @cuda blocks = blocks threads = threads kernel!(h, x, id, 𝑖, 𝑗) - return -end - -hist = zeros(Float32, nbins, ncol) -Ξ΄ = rand(Float32, items) -idx = rand(1:nbins, items, ncol) -𝑖 = sample(1:items, items Γ· 2, replace=false, ordered=true) -𝑗 = sample(1:ncol, ncol Γ· 2, replace=false, ordered=true) -hist_gpu = CuArray(hist) -Ξ΄_gpu = CuArray(Ξ΄) -idx_gpu = CuArray(idx) -𝑖_gpu = CuArray(𝑖) -𝑗_gpu = CuArray(𝑗) - -@CUDA.time hist_gpu!(hist_gpu, 
Ξ΄_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024) -@btime hist_gpu!($hist_gpu, $Ξ΄_gpu, $idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024) - - -###################################### -# Appoach 2 - Loop for assigning command grad to appropriate bin per column -# Idea: exploit the fact that there's a single grad per row: take that grad and add it to each column bin -###################################### -# GPU - apply along the features axis -function kernel!(h::CuDeviceMatrix{T}, x::CuDeviceVector{T}, id, 𝑖, 𝑗) where {T <: AbstractFloat} - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - if i <= length(𝑖) && j <= length(𝑗) - @inbounds k = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 𝑗[j]) - @inbounds CUDAnative.atomic_add!(pointer(h, k), x[𝑖[i]]) - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu!(h::CuMatrix{T}, x::CuVector{T}, id::CuMatrix{UInt8}, 𝑖, 𝑗; MAX_THREADS=1024) where {T <: AbstractFloat} - thread_j = min(MAX_THREADS, length(𝑗)) - thread_i = min(MAX_THREADS Γ· thread_j, length(𝑖)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (length(𝑖), length(𝑗)) ./ threads) - @cuda blocks = blocks threads = threads kernel!(h, x, id, 𝑖, 𝑗) - return -end - - - - -using CUDA -using BenchmarkTools -N1 = Int(2^12) -x1 = rand(Float32, N1); -x2 = rand(Float32, N1); -x1g = CuArray(x1); -x2g = CuArray(x2); - -function dot_atomic!(x, y, z) - idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x - if idx <= length(x) - CUDA.atomic_add!(pointer(z, 1), x[idx] * y[idx]) - end - return nothing -end - -function bench_dot_atomic!(x, y, z, threads) - numblocks = ceil(Int, N1 / threads) - @cuda threads = threads blocks = numblocks dot_atomic!(x, y, z) -end - -threads = 512 -numblocks = ceil(Int, N1 / threads) -z0 = CUDA.zeros(1) -# @cuda threads=gthreads blocks=numblocks dot_atomic!(x1g, x2g, z0) -@btime CUDA.@sync bench_dot_atomic!($x1g, $x2g, $z0, threads) -# 17.323 ms (50 allocations: 1.67 KiB) - -function dot_share!(x::CuDeviceVector{T}, y::CuDeviceVector{T}, z::CuDeviceVector{T}) where {T} - - tid = threadIdx().x - idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x - shared = CUDA.@cuStaticSharedMem(T, 64) - fill!(shared, 0) - sync_threads() - - if idx <= length(x) - @inbounds shared[tid] = x[idx] * y[idx] - end - sync_threads() - - i = blockDim().x Γ· 2 - while i > 0 - if tid <= i - @inbounds shared[tid] += shared[tid + i] # valid non atomic operation - # CUDA.atomic_add!(pointer(shared, tid), shared[tid+i]) # invalid op - results in error - end - sync_threads() - i Γ·= 2 - end - if tid == 1 - CUDA.atomic_add!(pointer(z, 1), shared[1]) - end - return nothing -end - -function bench_dot_share!(x, y, z, threads, numblocks) - CUDA.@sync @cuda threads = threads blocks = numblocks dot_share!(x, y, z) - return z -end - -function wrap_share(x, y, threads) - numblocks = ceil(Int, N1 / threads) - z = CUDA.zeros(1) - x = bench_dot_share!(x, y, z, threads, numblocks) - return x -end - -threads = 64 -numblocks = ceil(Int, N1 / threads) -z = CUDA.zeros(1) -@cuda threads = threads blocks = numblocks dot_share!(x1g, x2g, z) -@time CUDA.@sync wrap_share(x1g, x2g, threads) -@btime CUDA.@sync wrap_share($x1g, $x2g, threads) -x = CUDA.@sync wrap_share(x1g, x2g, threads) -x1g' * x2g -@btime x1g' * x2g -@btime x1' * x2 - - -function dot_share2!(x::CuDeviceVector{T}, z::CuDeviceVector{T}) where {T} - - tid = threadIdx().x - bid = blockIdx().x - idx = (blockIdx().x - 1) * blockDim().x + 
threadIdx().x - shared = CUDA.@cuStaticSharedMem(T, 128) - fill!(shared, 0f0) - sync_threads() - - # if idx <= length(x) - # shared[tid] = x[idx] * y[idx] - # end - # sync_threads() - - i = blockDim().x Γ· 2 - while i > 0 - # if tid <= i - if tid == 1 - # shared[tid] += shared[tid + i] # valid non atomic operation - CUDA.atomic_add!(pointer(shared, tid), shared[tid + 1]) # invalid op - results in error - end - sync_threads() - i Γ·= 2 - end - if tid == 1 - z[bid] += shared[1] - end - return nothing -end - -function bench_dot_share2!(x, z, threads, numblocks) - CUDA.@sync @cuda threads = threads blocks = numblocks dot_share2!(x, z) - return sum(z) -end - -function wrap_share2(x, threads) - numblocks = ceil(Int, N1 / threads) - z = CUDA.zeros(numblocks) - # x = bench_dot_share2!(x, z, threads, numblocks) - CUDA.@sync @cuda threads = threads blocks = numblocks dot_share2!(x, z) - return(x) -end - -threads = 128 -numblocks = ceil(Int, N1 / threads) -z = CUDA.zeros(numblocks) -sum(z) -x1g' * x2g -CUDA.@sync @cuda threads = threads blocks = numblocks dot_share2!(x1g, x2g, z) -@btime CUDA.@sync wrap_share2($x1g, $x2g, threads) -x = CUDA.@sync wrap_share2(x1g, x2g, threads) - -@btime x1g' * x2g - - -using CUDA -function kernel(x) - shared = @cuStaticSharedMem(Float32, 2) - fill!(shared, 1f0) - sync_threads() - # @atomic shared[threadIdx().x] += 0f0 - tid = threadIdx().x - CUDA.atomic_add!(pointer(shared, tid), shared[tid + 1]) - CUDA.atomic_add!(pointer(x, 1), shared[1]) - return -end - -x = CUDA.zeros(1) -@cuda kernel(x) -synchronize() - - -using CUDA -function kernel2(x, y) - tid = threadIdx().x - shared = @cuStaticSharedMem(Float32, 4) - fill!(shared, 1f0) - sync_threads() - i = Int32(2) - if i > 0 - CUDA.atomic_add!(pointer(shared, tid), shared[tid + 1]) - sync_threads() - # i Γ·= 2 - end - sync_threads() - CUDA.atomic_add!(pointer(x, 1), shared[1]) - return -end - -x = CUDA.zeros(4) -y = CUDA.zeros(1) -@cuda threads = 2 kernel2(x, y) -synchronize() - - - -using CUDA -function kernel1(x, y) - tid = threadIdx().x - shared = @cuStaticSharedMem(Float32, 4) - fill!(shared, 1f0) - sync_threads() - i = Int32(2) - if i > 0 - CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2]) - sync_threads() - i Γ·= 2 - end - CUDA.atomic_add!(pointer(x, 1), shared[1]) - return -end - -x = CUDA.zeros(4) -y = CUDA.zeros(1) -@cuda threads = 2 kernel1(x, y) -x -synchronize() - - - -using CUDA -function kernel2(x, y) - tid = threadIdx().x - shared = @cuStaticSharedMem(Float32, 4) - fill!(shared, 1f0) - sync_threads() - i = Int32(2) - while i > 0 - CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2]) - sync_threads() - i Γ·= 2 - end - sync_threads() - CUDA.atomic_add!(pointer(x, 1), shared[1]) - return -end - -x = CUDA.zeros(4) -y = CUDA.zeros(1) -@cuda threads = 2 kernel2(x, y) -x -synchronize() - - -using CUDA -function kernel3(x) - tid = threadIdx().x - shared = @cuStaticSharedMem(Float32, 4) - fill!(shared, 1f0) - sync_threads() - CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2]) - sync_threads() - CUDA.atomic_add!(pointer(x, 1), shared[1]) - return -end - -x = CUDA.zeros(4) -@cuda threads = 2 kernel3(x) -x -synchronize() - -using CUDA -function kernel4(x) - tid = threadIdx().x - shared = @cuStaticSharedMem(Float32, 4) - fill!(shared, 1f0) - sync_threads() - CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2]) - sync_threads() - CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2]) - sync_threads() - CUDA.atomic_add!(pointer(x, 1), shared[1]) + thread_k = 3 + threads = (thread_i, thread_j, thread_k) 
+    blocks = ceil.(Int, (16, size(x_bin, 2)))
+    fill!(h, 0)
+    @cuda blocks = blocks threads = threads shmem = sizeof(T) * size(h, 2) * 3 kernel_s4!(h, βˆ‡, x_bin)
+    CUDA.synchronize()
     return
 end
-x = CUDA.zeros(4)
-@cuda threads = 2 kernel4(x)
-x
-synchronize()
\ No newline at end of file
+nbins = 64
+nfeats = 100
+nobs = Int(1e6)
+h = [zeros(Float32, 3, nbins) for feat in 1:nfeats];
+x_bin = UInt8.(rand(1:nbins, nobs, nfeats));
+βˆ‡_cpu = rand(Float32, nobs, 3);
+hβˆ‡_cpu = zeros(Float32, 3, nbins, nfeats)
+rowsample = 0.5
+colsample = 0.5
+is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true)
+js = sample(1:nfeats, Int(round(colsample * nfeats)), replace=false, ordered=true)
+
+βˆ‡_gpu = CuArray(βˆ‡_cpu)
+x_bin_gpu = CuArray(x_bin)
+hβˆ‡_gpu = CuArray(hβˆ‡_cpu)
+is_gpu = CuArray(is)
+js_gpu = CuArray(js)
+
+@time hist_gpu_s4!(hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu)
+CUDA.@time hist_gpu_s4!(hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu)
+# desktop | 1K: 41.102 ΞΌs (24 allocations: 1.66 KiB)
+# desktop | 10K: 59.142 ΞΌs (109 allocations: 9.09 KiB)
+# desktop | 100K: 251.850 ΞΌs (109 allocations: 9.09 KiB)
+# desktop | 1M: 2.203 ms (23 allocations: 1.33 KiB)
+# desktop | 10M: 25.557 ms (110 allocations: 9.11 KiB)
+@btime hist_gpu_s4!(hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu)
diff --git a/experiments/hist/hist_gpu-debug-convert.jl b/experiments/hist/hist_gpu-debug-convert.jl
deleted file mode 100644
index a92bc34f..00000000
--- a/experiments/hist/hist_gpu-debug-convert.jl
+++ /dev/null
@@ -1,26 +0,0 @@
-using Revise
-using CUDA
-using StatsBase: sample
-using BenchmarkTools
-using Random: seed!
-
-function convert_kernel!(x, y)
-    i = threadIdx().x
-    if i <= length(x)
-        val = Float64(y[i])
-        CUDA.atomic_add!(pointer(x, i), val)
-        # CUDA.atomic_add!(pointer(x, i), y[i])
-    end
-    sync_threads()
-    return nothing
-end
-
-function convert_gpu!(x, y)
-    @cuda threads = length(x) blocks = (1,) convert_kernel!(x, y)
-    CUDA.synchronize()
-    return nothing
-end
-
-x = CUDA.zeros(Float32, 3)
-y = CUDA.rand(Float32, 3)
-convert_gpu!(x, y)
diff --git a/experiments/hist/hist_gpu-df-debug.jl b/experiments/hist/hist_gpu-df-debug.jl
deleted file mode 100644
index e0e3450a..00000000
--- a/experiments/hist/hist_gpu-df-debug.jl
+++ /dev/null
@@ -1,71 +0,0 @@
-using Revise
-using CUDA
-using StatsBase: sample
-using BenchmarkTools
-using Random: seed!
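# Quick agreement check for hist_gpu_s4! above against a serial pass; a sketch using the
# benchmark arrays (`hist_cpu_s4!` is a hypothetical reference, not part of the PR):
function hist_cpu_s4!(h::Array{T,3}, βˆ‡::Matrix{T}, x_bin::Matrix{UInt8}) where {T}
    fill!(h, 0)
    for j in axes(x_bin, 2), i in axes(x_bin, 1)
        bin = x_bin[i, j]
        for k in 1:3
            @inbounds h[k, bin, j] += βˆ‡[i, k]   # mirrors the kernel's shared-memory layout
        end
    end
    return h
end
hist_cpu_s4!(hβˆ‡_cpu, βˆ‡_cpu, x_bin)
hist_gpu_s4!(hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu)
isapprox(hβˆ‡_cpu, Array(hβˆ‡_gpu); rtol=1e-3)   # tolerance covers atomic summation order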
- -function hist_kernel!(hβˆ‡::CuDeviceArray, βˆ‡::CuDeviceMatrix, x_bin::CuDeviceMatrix, is::CuDeviceVector, js::CuDeviceVector) - k, tiy, tiz = threadIdx().x, threadIdx().y, threadIdx().z - bdx, bdy = blockDim().z, blockDim().y - bix, biy = blockIdx().x, blockIdx().y - gdx = gridDim().x - - j = tiy + bdy * (biy - 1) - if j <= length(js) - jdx = js[j] - i_max = length(is) - niter = cld(i_max, bdx * gdx) - @inbounds for iter = 1:niter - i = tiz + bdx * (bix - 1) + bdx * gdx * (iter - 1) - if i <= i_max - @inbounds idx = is[i] - @inbounds bin = x_bin[idx, jdx] - hid = Base._to_linear_index(hβˆ‡, k, bin, jdx) - CUDA.atomic_add!(pointer(hβˆ‡, hid), βˆ‡[k, idx]) - end - end - end - sync_threads() - return nothing -end - -function update_hist_gpu!(hβˆ‡, βˆ‡, x_bin, is, js) - # kernel = @cuda launch = false hist_kernel!(hβˆ‡, βˆ‡, x_bin, is, js) - # config = launch_configuration(kernel.fun) - # max_threads = config.threads Γ· 4 - # max_blocks = config.blocks * 4 - max_threads = 256 - max_blocks = 16 - tx = size(hβˆ‡, 1) - ty = max(1, min(length(js), fld(max_threads, tx))) - tz = max(1, min(length(is), fld(max_threads, tx * ty))) - threads = (tx, ty, tz) - by = cld(length(js), ty) - bz = min(cld(max_blocks, by), cld(length(is), tz)) - blocks = (bz, by, 1) - hβˆ‡ .= 0 - @info "threads blocks" threads blocks - @cuda threads=threads blocks=blocks hist_kernel!(hβˆ‡, βˆ‡, x_bin, is, js) - CUDA.synchronize() - return nothing -end - -seed!(123) -nbins = 32 -nfeats = 2 -nobs = Int(1e5) -x_bin = UInt8.(rand(1:nbins, nobs, nfeats)); -βˆ‡ = rand(Float32, 3, nobs); -hβˆ‡ = zeros(Float32, 3, nbins, nfeats) -rowsample = 0.5 -is = 1:nobs -js = 1:nfeats - -βˆ‡_gpu = CuArray(βˆ‡) -x_bin_gpu = CuArray(x_bin) -hβˆ‡_gpu = CuArray(hβˆ‡) -is_gpu = CuArray(is) -js_gpu = CuArray(js) - -CUDA.allowscalar(false) -update_hist_gpu!(hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js_gpu) diff --git a/experiments/hist/hist_gpu-df-ori copy.jl b/experiments/hist/hist_gpu-df-ori copy.jl deleted file mode 100644 index 7728191f..00000000 --- a/experiments/hist/hist_gpu-df-ori copy.jl +++ /dev/null @@ -1,75 +0,0 @@ -using Revise -using CUDA -using StatsBase: sample -using BenchmarkTools - -""" - hist_kernel! 
-""" -function hist_kernel!(hβˆ‡, βˆ‡, x_bin, is, js) - tix, tiy = threadIdx().x, threadIdx().y - bdx, bdy = blockDim().x, blockDim().y - bix, biy = blockIdx().x, blockIdx().y - gdx = gridDim().x - - j = tiy + bdy * (biy - 1) - if j <= length(js) - jdx = js[j] - i_max = length(is) - niter = cld(i_max, bdx * gdx) - @inbounds for iter = 1:niter - i = tix + bdx * (bix - 1) + bdx * gdx * (iter - 1) - if i <= i_max - @inbounds idx = is[i] - @inbounds bin = x_bin[idx, jdx] - hid = Base._to_linear_index(hβˆ‡, 1, bin, jdx) - CUDA.atomic_add!(pointer(hβˆ‡, hid), βˆ‡[1, idx]) - CUDA.atomic_add!(pointer(hβˆ‡, hid+1), βˆ‡[2, idx]) - CUDA.atomic_add!(pointer(hβˆ‡, hid+2), βˆ‡[3, idx]) - end - end - end - sync_threads() - return nothing -end - -function update_hist_gpu!(hβˆ‡, βˆ‡, x_bin, is, js) - kernel = @cuda launch = false hist_kernel!(hβˆ‡, βˆ‡, x_bin, is, js) - config = launch_configuration(kernel.fun) - # @info "config.blocks" config.blocks - max_threads = config.threads - max_blocks = config.blocks - ty = max(1, min(length(js), max_threads)) - tx = max(1, min(length(is), fld(max_threads, ty))) - threads = (tx, ty, 1) - by = cld(length(js), ty) - bx = min(cld(max_blocks, by), cld(length(is), tx)) - blocks = (bx, by, 1) - kernel(hβˆ‡, βˆ‡, x_bin, is, js; threads, blocks) - CUDA.synchronize() - return nothing -end - -nbins = 32 -nfeats = 100 -nobs = Int(1e6) -hist = zeros(Float32, nbins, nfeats); -x_bin = UInt8.(rand(1:nbins, nobs, nfeats)); -βˆ‡ = rand(Float32, 3, nobs); -hβˆ‡ = rand(Float32, 3, nbins, nfeats) -rowsample = 0.5 -colsample = 0.5 -is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true) -js = sample(1:nfeats, Int(round(rowsample * nfeats)), replace=false, ordered=true) - -hist_gpu = CuArray(hist) -βˆ‡_gpu = CuArray(βˆ‡) -x_bin_gpu = CuArray(x_bin) -hβˆ‡_gpu = CuArray(hβˆ‡) -is_gpu = CuArray(is) -js_gpu = CuArray(js) - -@time update_hist_gpu!(hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js_gpu) -CUDA.@time update_hist_gpu!(hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js_gpu) -@time CUDA.@sync update_hist_gpu!(hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js_gpu) -@btime update_hist_gpu!(hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js_gpu) \ No newline at end of file diff --git a/experiments/hist/hist_gpu-df-ori.jl b/experiments/hist/hist_gpu-df-ori.jl deleted file mode 100644 index 47981bc8..00000000 --- a/experiments/hist/hist_gpu-df-ori.jl +++ /dev/null @@ -1,217 +0,0 @@ -using Revise -using CUDA -using StatsBase: sample -using BenchmarkTools -using Random: seed! - -""" - hist_kernel! 
-""" -function hist_kernel!(hβˆ‡, βˆ‡, x_bin, is, js) - tix, tiy, k = threadIdx().x, threadIdx().y, threadIdx().z - bdx, bdy = blockDim().x, blockDim().y - bix, biy = blockIdx().x, blockIdx().y - gdx = gridDim().x - - j = tiy + bdy * (biy - 1) - if j <= length(js) - jdx = js[j] - i_max = length(is) - niter = cld(i_max, bdx * gdx) - @inbounds for iter = 1:niter - i = tix + bdx * (bix - 1) + bdx * gdx * (iter - 1) - if i <= i_max - @inbounds idx = is[i] - @inbounds bin = x_bin[idx, jdx] - hid = Base._to_linear_index(hβˆ‡, k, bin, jdx) - CUDA.atomic_add!(pointer(hβˆ‡, hid), βˆ‡[k, idx]) - end - end - end - sync_threads() - return nothing -end - -function update_hist_gpu!(hβˆ‡, βˆ‡, x_bin, is, js) - kernel = @cuda launch = false hist_kernel!(hβˆ‡, βˆ‡, x_bin, is, js) - config = launch_configuration(kernel.fun) - # @info "config.blocks" config.blocks - max_threads = config.threads Γ· 4 - max_blocks = config.blocks * 4 - @assert size(hβˆ‡, 1) <= max_threads "number of classes cannot be larger than 31 on GPU" - tz = min(64, size(hβˆ‡, 1)) - ty = max(1, min(length(js), fld(max_threads, tz))) - tx = max(1, min(length(is), fld(max_threads, tz * ty))) - threads = (tx, ty, tz) - by = cld(length(js), ty) - bx = min(cld(max_blocks, by), cld(length(is), tx)) - blocks = (bx, by, 1) - kernel(hβˆ‡, βˆ‡, x_bin, is, js; threads, blocks) - CUDA.synchronize() - return nothing -end - -seed!(123) -nbins = 32 -nfeats = 100 -nobs = Int(1e6) - -x_bin = UInt8.(rand(1:nbins, nobs, nfeats)); -βˆ‡ = rand(Float32, 3, nobs); -hβˆ‡ = zeros(Float32, 3, nbins, nfeats) -rowsample = 0.5 -colsample = 0.5 -is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true) -js = sample(1:nfeats, Int(round(rowsample * nfeats)), replace=false, ordered=true) - -x_bin_gpu = CuArray(x_bin) -βˆ‡_gpu = CuArray(βˆ‡) -hβˆ‡_gpu = CuArray(hβˆ‡) -is_gpu = CuArray(is) -js_gpu = CuArray(js) - -@time update_hist_gpu!(hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js_gpu) -CUDA.@time update_hist_gpu!(hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js_gpu) -@time CUDA.@sync update_hist_gpu!(hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js_gpu) -@btime update_hist_gpu!(hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js_gpu) - - -""" - hist_kernel! 
- revserse k and tix thread order -""" -function hist_kernel!(hβˆ‡, βˆ‡, x_bin, is, js) - tix, tiy, k = threadIdx().z, threadIdx().y, threadIdx().x - bdx, bdy = blockDim().z, blockDim().y - bix, biy = blockIdx().z, blockIdx().y - gdx = gridDim().z - - j = tiy + bdy * (biy - 1) - if j <= length(js) - jdx = js[j] - i_max = length(is) - niter = cld(i_max, bdx * gdx) - @inbounds for iter = 1:niter - i = tix + bdx * (bix - 1) + bdx * gdx * (iter - 1) - if i <= i_max - @inbounds idx = is[i] - @inbounds bin = x_bin[idx, jdx] - hid = Base._to_linear_index(hβˆ‡, k, bin, jdx) - CUDA.atomic_add!(pointer(hβˆ‡, hid), βˆ‡[k, idx]) - end - end - end - sync_threads() - return nothing -end - -function update_hist_gpu!(hβˆ‡, βˆ‡, x_bin, is, js) - kernel = @cuda launch = false hist_kernel!(hβˆ‡, βˆ‡, x_bin, is, js) - config = launch_configuration(kernel.fun) - # @info "config.blocks" config.blocks - max_threads = config.threads Γ· 4 - max_blocks = config.blocks * 4 - tz = size(hβˆ‡, 1) - ty = max(1, min(length(js), fld(max_threads, tz))) - tx = max(1, min(length(is), fld(max_threads, tz * ty))) - threads = (tz, ty, tx) - by = cld(length(js), ty) - bx = min(cld(max_blocks, by), cld(length(is), tx)) - blocks = (1, by, bx) - kernel(hβˆ‡, βˆ‡, x_bin, is, js; threads, blocks) - CUDA.synchronize() - return nothing -end - -nbins = 32 -nfeats = 100 -nobs = Int(1e6) -x_bin = UInt8.(rand(1:nbins, nobs, nfeats)); -βˆ‡ = rand(Float32, 3, nobs); -hβˆ‡ = zeros(Float32, 3, nbins, nfeats) -rowsample = 0.5 -colsample = 0.5 -is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true) -js = sample(1:nfeats, Int(round(rowsample * nfeats)), replace=false, ordered=true) - -x_bin_gpu = CuArray(x_bin) -βˆ‡_gpu = CuArray(βˆ‡) -hβˆ‡_gpu = CuArray(hβˆ‡) -is_gpu = CuArray(is) -js_gpu = CuArray(js) - -@time update_hist_gpu!(hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js_gpu) -CUDA.@time update_hist_gpu!(hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js_gpu) -@time CUDA.@sync update_hist_gpu!(hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js_gpu) -@btime update_hist_gpu!(hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js_gpu) - - - -""" - revserse tix and tij thread order - 2.4 ms, no significant speed difference with previous k, j i order -""" -function hist_kernel!(hβˆ‡, βˆ‡, x_bin, is, js) - k, tix, tiy = threadIdx().x, threadIdx().y, threadIdx().z - bdx, bdy = blockDim().y, blockDim().z - bix, biy = blockIdx().y, blockIdx().z - gdx = gridDim().y - - j = tiy + bdy * (biy - 1) - if j <= length(js) - jdx = js[j] - i_max = length(is) - niter = cld(i_max, bdx * gdx) - @inbounds for iter = 1:niter - i = tix + bdx * (bix - 1) + bdx * gdx * (iter - 1) - if i <= i_max - @inbounds idx = is[i] - @inbounds bin = x_bin[idx, jdx] - hid = Base._to_linear_index(hβˆ‡, k, bin, jdx) - CUDA.atomic_add!(pointer(hβˆ‡, hid), βˆ‡[k, idx]) - end - end - end - sync_threads() - return nothing -end - -function update_hist_gpu!(hβˆ‡, βˆ‡, x_bin, is, js) - kernel = @cuda launch = false hist_kernel!(hβˆ‡, βˆ‡, x_bin, is, js) - config = launch_configuration(kernel.fun) - # @info "config.blocks" config.blocks - max_threads = config.threads Γ· 4 - max_blocks = config.blocks * 4 - tk = size(hβˆ‡, 1) - tj = max(1, min(length(js), fld(max_threads, tk))) - ti = max(1, min(length(is), fld(max_threads, tk * tj))) - threads = (tk, ti, tj) - bj = cld(length(js), tj) - bi = min(cld(max_blocks, bj), cld(length(is), ti)) - blocks = (1, bi, bj) - kernel(hβˆ‡, βˆ‡, x_bin, is, js; threads, blocks) - CUDA.synchronize() - return nothing -end - -nbins = 32 -nfeats = 100 -nobs = Int(1e6) -x_bin = 
UInt8.(rand(1:nbins, nobs, nfeats)); -βˆ‡ = rand(Float32, 3, nobs); -hβˆ‡ = zeros(Float32, 3, nbins, nfeats) -rowsample = 0.5 -colsample = 0.5 -is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true) -js = sample(1:nfeats, Int(round(rowsample * nfeats)), replace=false, ordered=true) - -x_bin_gpu = CuArray(x_bin) -βˆ‡_gpu = CuArray(βˆ‡) -hβˆ‡_gpu = CuArray(hβˆ‡) -is_gpu = CuArray(is) -js_gpu = CuArray(js) - -@time update_hist_gpu!(hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js_gpu) -CUDA.@time update_hist_gpu!(hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js_gpu) -@time CUDA.@sync update_hist_gpu!(hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js_gpu) -@btime update_hist_gpu!(hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js_gpu) diff --git a/experiments/hist/hist_gpu-df-ori2.jl b/experiments/hist/hist_gpu-df-ori2.jl deleted file mode 100644 index 4d861dfd..00000000 --- a/experiments/hist/hist_gpu-df-ori2.jl +++ /dev/null @@ -1,133 +0,0 @@ -using Revise -using CUDA -using StatsBase: sample -using BenchmarkTools -using Random: seed! - -""" - hist_kernel! -""" -function hist_kernel!(hβˆ‡, βˆ‡, x_bin, is, js) - tix, tiy, k = threadIdx().x, threadIdx().y, threadIdx().z - bdx, bdy = blockDim().x, blockDim().y - bix, biy = blockIdx().x, blockIdx().y - gdx = gridDim().x - - j = tiy + bdy * (biy - 1) - if j <= length(js) - jdx = js[j] - i_max = length(is) - niter = cld(i_max, bdx * gdx) - @inbounds for iter = 1:niter - i = tix + bdx * (bix - 1) + bdx * gdx * (iter - 1) - if i <= i_max - @inbounds idx = is[i] - @inbounds bin = x_bin[idx, jdx] - hid = Base._to_linear_index(hβˆ‡, k, bin, jdx) - CUDA.atomic_add!(pointer(hβˆ‡, hid), βˆ‡[k, idx]) - end - end - end - sync_threads() - return nothing -end - -function update_hist_gpu2!(h, hβˆ‡, βˆ‡, x_bin, is, js, jsc) - kernel = @cuda launch = false hist_kernel!(hβˆ‡, βˆ‡, x_bin, is, js) - config = launch_configuration(kernel.fun) - # @info "config.threads" config.threads - # @info "config.blocks" config.blocks - max_threads = config.threads Γ· 4 - max_blocks = config.blocks * 4 - @assert size(hβˆ‡, 1) <= max_threads "number of classes cannot be larger than 31 on GPU" - tz = min(64, size(hβˆ‡, 1)) - ty = max(1, min(length(js), fld(max_threads, tz))) - tx = max(1, min(length(is), fld(max_threads, tz * ty))) - threads = (tx, ty, tz) - by = cld(length(js), ty) - bx = min(cld(max_blocks, by), cld(length(is), tx)) - blocks = (bx, by, 1) - hβˆ‡ .= 0 - kernel(hβˆ‡, βˆ‡, x_bin, is, js; threads, blocks) - CUDA.synchronize() - # @inbounds for j in jsc - # copyto!(h[j], view(hβˆ‡, :, :, j)) - # end - @inbounds for j in jsc - copyto!(h[j], view(hβˆ‡, :, :, j)) - end - # CUDA.synchronize() - return nothing -end - -seed!(123) -nbins = 32 -nfeats = 100 -nobs = Int(1e6) -x_bin = UInt8.(rand(1:nbins, nobs, nfeats)); -βˆ‡ = rand(Float32, 3, nobs); -hβˆ‡ = zeros(Float32, 3, nbins, nfeats) -h = [hβˆ‡[:,:,j] for j in axes(hβˆ‡, 3)] -rowsample = 0.5 -colsample = 0.5 -is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true) -js = sample(1:nfeats, Int(round(colsample * nfeats)), replace=false, ordered=true) - -βˆ‡_gpu = CuArray(βˆ‡) -x_bin_gpu = CuArray(x_bin) -hβˆ‡_gpu = CuArray(hβˆ‡) -is_gpu = CuArray(is) -js_gpu = CuArray(js) - -CUDA.allowscalar(false) -@time update_hist_gpu2!(h, hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js_gpu, js) -CUDA.@time update_hist_gpu2!(h, hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js_gpu, js) -@time CUDA.@sync update_hist_gpu2!(h, hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js_gpu, js) -@btime update_hist_gpu2!(h, hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, 
js_gpu, js) - - - - -seed!(123) -nbins = 32 -nfeats = 100 -nobs = Int(1e6) -x_bin = UInt8.(rand(1:nbins, nobs, nfeats)); -βˆ‡ = rand(Float32, 3, nobs); -hβˆ‡ = zeros(Float32, 3, nbins, nfeats) -h = [hβˆ‡[:,:,j] for j in axes(hβˆ‡, 3)] -rowsample = 0.5 -colsample = 0.5 -is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true) -js = sample(1:nfeats, Int(round(colsample * nfeats)), replace=false, ordered=true) - -βˆ‡_gpu = CuArray(βˆ‡) -x_bin_gpu = CuArray(x_bin) -hβˆ‡_gpu = CuArray(hβˆ‡) -is_gpu = CuArray(is) -js_gpu = CuArray(js) - -EvoTrees.update_hist_gpu!(h, hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js_gpu, js) - - - -seed!(123) -nbins = 32 -nfeats = 1 -nobs = Int(1e6) -x_bin = UInt8.(rand(1:nbins, nobs, nfeats)); -βˆ‡ = rand(Float32, 3, nobs); -hβˆ‡ = zeros(Float32, 3, nbins, nfeats) -h = [hβˆ‡[:,:,j] for j in axes(hβˆ‡, 3)] -rowsample = 0.5 -colsample = 1.0 -is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true) -js = sample(1:nfeats, Int(round(colsample * nfeats)), replace=false, ordered=true) - -βˆ‡_gpu = CuArray(βˆ‡) -x_bin_gpu = CuArray(x_bin) -hβˆ‡_gpu = CuArray(hβˆ‡) -is_gpu = CuArray(is) -js_gpu = CuArray(js) - -EvoTrees.update_hist_gpu!(h, hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js_gpu, js) diff --git a/experiments/hist/hist_gpu-df-vec.jl b/experiments/hist/hist_gpu-df-vec.jl deleted file mode 100644 index 5d36503d..00000000 --- a/experiments/hist/hist_gpu-df-vec.jl +++ /dev/null @@ -1,122 +0,0 @@ -using Revise -using CUDA -using StatsBase: sample -using BenchmarkTools -using Base.Threads: @threads -using Random: seed! - -""" - hist_kernel! -""" -function hist_kernel_vec!(hβˆ‡, βˆ‡, x_bin, is) - tix, k = threadIdx().x, threadIdx().y - bdx = blockDim().x - bix = blockIdx().x - gdx = gridDim().x - - i_max = length(is) - niter = cld(i_max, bdx * gdx) - @inbounds for iter in 1:niter - i = tix + bdx * (bix - 1) + bdx * gdx * (iter - 1) - if i <= i_max - @inbounds idx = is[i] - @inbounds bin = x_bin[idx] - hid = Base._to_linear_index(hβˆ‡, k, bin) - CUDA.atomic_add!(pointer(hβˆ‡, hid), βˆ‡[k, idx]) - end - end - CUDA.sync_threads() - return nothing -end - -function update_hist_gpu_vec!(hβˆ‡, βˆ‡, x_bin, is, js) - kernel = @cuda launch = false hist_kernel_vec!(hβˆ‡[js[1]], βˆ‡, view(x_bin, :, js[1]), is) - config = launch_configuration(kernel.fun) - max_threads = config.threads - max_blocks = config.blocks - ty = size(hβˆ‡[1], 1) - tx = max(1, min(length(is), fld(max_threads, ty))) - threads = (tx, ty, 1) - bx = min(max_blocks, cld(length(is), tx)) - blocks = (bx, 1, 1) - @sync for j in js - @async hβˆ‡[j] .= 0 - end - CUDA.synchronize() - @sync for j in js - @async kernel(hβˆ‡[j], βˆ‡, view(x_bin, :, j), is; threads, blocks) - end - CUDA.synchronize() - return nothing -end - -seed!(123) -nbins = 32 -nfeats = 100 -nobs = Int(1e6) -x_bin = UInt8.(rand(1:nbins, nobs, nfeats)); -βˆ‡ = rand(Float32, 3, nobs); -hβˆ‡ = [zeros(Float32, 3, nbins) for n in 1:nfeats] -rowsample = 0.5 -colsample = 0.5 -is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true) -js = sample(1:nfeats, Int(round(rowsample * nfeats)), replace=false, ordered=true) - -βˆ‡_gpu = CuArray(βˆ‡) -x_bin_gpu = CuArray(x_bin) -hβˆ‡_gpu = CuArray.(hβˆ‡) -is_gpu = CuArray(is) -js_gpu = CuArray(js) - -CUDA.allowscalar(false) -@time update_hist_gpu_vec!(hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js) -@time CUDA.@sync update_hist_gpu_vec!(hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js) -@btime CUDA.@sync update_hist_gpu_vec!(hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js) - -function 
update_hist_gpu_cpu!(h, hβˆ‡, βˆ‡, x_bin, is, js) - kernel = @cuda launch = false hist_kernel_vec!(hβˆ‡[js[1]], βˆ‡, view(x_bin, :, js[1]), is) - config = launch_configuration(kernel.fun) - @info "config.threads" config.threads - @info "config.blocks" config.blocks - max_threads = config.threads - max_blocks = config.blocks - ty = size(hβˆ‡[1], 1) - tx = max(1, min(length(is), fld(max_threads, ty))) - threads = (tx, ty, 1) - bx = min(max_blocks, cld(length(is), tx)) - blocks = (bx, 1, 1) - CUDA.@sync for j in js - @async hβˆ‡[j] .= 0 - end - CUDA.synchronize() - CUDA.@sync for j in js - # @async kernel(hβˆ‡[j], βˆ‡, view(x_bin, :, j), is; threads, blocks) - kernel(hβˆ‡[j], βˆ‡, view(x_bin, :, j), is; threads, blocks) - end - CUDA.synchronize() - return nothing -end -function copy_gpu_cpu!(h, hβˆ‡, js) - for j in js - # @info "j" j - copyto!(h[j], hβˆ‡[j]) - end - CUDA.synchronize() - return nothing -end -function combine!(h, hβˆ‡, βˆ‡, x_bin, is, js) - CUDA.@sync update_hist_gpu_cpu!(h, hβˆ‡, βˆ‡, x_bin, is, js) - CUDA.synchronize() - CUDA.@sync copy_gpu_cpu!(h, hβˆ‡, js) - CUDA.synchronize() - return nothing -end -@time CUDA.@sync update_hist_gpu_cpu!(hβˆ‡, hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js) -@time CUDA.@sync copy_gpu_cpu!(hβˆ‡, hβˆ‡_gpu, js) - -@btime CUDA.@sync update_hist_gpu_cpu!(hβˆ‡, hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js) -@btime CUDA.@sync copy_gpu_cpu!(hβˆ‡, hβˆ‡_gpu, js) - -@time combine!(hβˆ‡, hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js) -@time CUDA.@sync update_hist_gpu_cpu!(hβˆ‡, hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js) -@btime CUDA.@sync update_hist_gpu_cpu!($hβˆ‡, $hβˆ‡_gpu, $βˆ‡_gpu, $x_bin_gpu, $is_gpu, $js) diff --git a/experiments/hist/hist_gpu-ref.jl b/experiments/hist/hist_gpu-ref.jl new file mode 100644 index 00000000..a56263a1 --- /dev/null +++ b/experiments/hist/hist_gpu-ref.jl @@ -0,0 +1,83 @@ +using Revise +using CUDA +using StatsBase: sample +using BenchmarkTools + +""" + hist_kernel! 
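+
+ Reference scatter-add histogram kernel: threadIdx().x spans the K gradient
+ stats, thread/block y spans the sampled features js, and thread/block z
+ strides over chunks of the sampled observations is; each (k, bin, feat)
+ cell of hβˆ‡ is accumulated with CUDA.atomic_add!, casting βˆ‡ to eltype T.
+
+ A minimal CPU sketch of the same accumulation, assuming the script
+ variables defined below, to cross-check hβˆ‡_gpu once update_hist_gpu!
+ has run:
+
+ hβˆ‡_ref = zeros(Float32, size(hβˆ‡_cpu))
+ for j in js, i in is
+ bin = x_bin[i, j]
+ for k in axes(βˆ‡_cpu, 1)
+ hβˆ‡_ref[k, bin, j] += βˆ‡_cpu[k, i]
+ end
+ end
+ hβˆ‡_ref β‰ˆ Array(hβˆ‡_gpu)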
+""" +function hist_kernel!(hβˆ‡::CuDeviceArray{T,3}, βˆ‡::CuDeviceMatrix{S}, x_bin, is, js) where {T,S} + tix, tiy, k = threadIdx().z, threadIdx().y, threadIdx().x + bdx, bdy = blockDim().z, blockDim().y + bix, biy = blockIdx().z, blockIdx().y + gdx = gridDim().z + + j = tiy + bdy * (biy - 1) + if j <= length(js) + jdx = js[j] + i_max = length(is) + niter = cld(i_max, bdx * gdx) + @inbounds for iter = 1:niter + i = tix + bdx * (bix - 1) + bdx * gdx * (iter - 1) + if i <= i_max + @inbounds idx = is[i] + @inbounds bin = x_bin[idx, jdx] + hid = Base._to_linear_index(hβˆ‡, k, bin, jdx) + CUDA.atomic_add!(pointer(hβˆ‡, hid), T(βˆ‡[k, idx])) + end + end + end + sync_threads() + return nothing +end + +function update_hist_gpu!(h, hβˆ‡_cpu, hβˆ‡, βˆ‡, x_bin, is, js, jsc) + kernel = @cuda launch = false hist_kernel!(hβˆ‡, βˆ‡, x_bin, is, js) + config = launch_configuration(kernel.fun) + max_threads = config.threads + max_blocks = config.blocks + k = size(hβˆ‡, 1) + ty = max(1, min(length(js), fld(max_threads, k))) + tx = max(1, min(length(is), fld(max_threads, k * ty))) + threads = (k, ty, tx) + by = cld(length(js), ty) + bx = min(cld(max_blocks, by), cld(length(is), tx)) + blocks = (1, by, bx) + hβˆ‡ .= 0 + kernel(hβˆ‡, βˆ‡, x_bin, is, js; threads, blocks) + CUDA.synchronize() + copyto!(hβˆ‡_cpu, hβˆ‡) + Threads.@threads for j in jsc + nbins = size(h[j], 2) + @views h[j] .= hβˆ‡_cpu[:, 1:nbins, j] + end + return nothing +end + +nbins = 64 +nfeats = 100 +nobs = Int(1e3) +h = [zeros(Float32, 3, nbins) for feat in 1:nfeats]; +x_bin = UInt8.(rand(1:nbins, nobs, nfeats)); +βˆ‡_cpu = rand(Float32, 3, nobs); +hβˆ‡_cpu = rand(Float32, 3, nbins, nfeats) +rowsample = 0.5 +colsample = 0.5 +is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true) +js = sample(1:nfeats, Int(round(rowsample * nfeats)), replace=false, ordered=true) + +hist_gpu = CuArray(hist) +βˆ‡_gpu = CuArray(βˆ‡_cpu) +x_bin_gpu = CuArray(x_bin) +hβˆ‡_gpu = CuArray(hβˆ‡_cpu) +is_gpu = CuArray(is) +js_gpu = CuArray(js) + +@time update_hist_gpu!(h, hβˆ‡_cpu, hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js_gpu, js) +CUDA.@time update_hist_gpu!(h, hβˆ‡_cpu, hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js_gpu, js) +# desktop | 1K: 46.332 ΞΌs (109 allocations: 9.09 KiB) +# desktop | 10K: 59.142 ΞΌs (109 allocations: 9.09 KiB) +# desktop | 100K: 251.850 ΞΌs (109 allocations: 9.09 KiB) +# desktop | 1M: 2.328 ms (110 allocations: 9.11 KiB) +# desktop | 10M: 25.557 ms (110 allocations: 9.11 KiB) +@btime update_hist_gpu!(h, hβˆ‡_cpu, hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js_gpu, js) diff --git a/experiments/hist/perf-gpu.jl b/experiments/hist/perf-gpu.jl deleted file mode 100644 index d7b094b6..00000000 --- a/experiments/hist/perf-gpu.jl +++ /dev/null @@ -1,202 +0,0 @@ -using Revise -using CUDA -using StatsBase: sample -using BenchmarkTools -using Base.Threads: @threads -using Random: seed! - -""" - hist_kernel! 
-""" -function hist_kernel!(hβˆ‡::CuDeviceArray{T,3}, βˆ‡::CuDeviceMatrix{S}, x_bin, is, js) where {T,S} - tix, tiy, k = threadIdx().z, threadIdx().y, threadIdx().x - bdx, bdy = blockDim().z, blockDim().y - bix, biy = blockIdx().z, blockIdx().y - gdx = gridDim().z - - j = tiy + bdy * (biy - 1) - if j <= length(js) - jdx = js[j] - i_max = length(is) - niter = cld(i_max, bdx * gdx) - @inbounds for iter = 1:niter - i = tix + bdx * (bix - 1) + bdx * gdx * (iter - 1) - if i <= i_max - @inbounds idx = is[i] - @inbounds bin = x_bin[idx, jdx] - hid = Base._to_linear_index(hβˆ‡, k, bin, jdx) - CUDA.atomic_add!(pointer(hβˆ‡, hid), T(βˆ‡[k, idx])) - end - end - end - sync_threads() - return nothing -end - -function update_hist_gpu!(h, hβˆ‡, βˆ‡, x_bin, is, js, jsc) - kernel = @cuda launch = false hist_kernel!(hβˆ‡, βˆ‡, x_bin, is, js) - config = launch_configuration(kernel.fun) - max_threads = config.threads Γ· 4 - max_blocks = config.blocks * 4 - k = size(hβˆ‡, 1) - ty = max(1, min(length(js), fld(max_threads, k))) - tx = min(64, max(1, min(length(is), fld(max_threads, k * ty)))) - threads = (k, ty, tx) - by = cld(length(js), ty) - bx = min(cld(max_blocks, by), cld(length(is), tx)) - blocks = (1, by, bx) - hβˆ‡ .= 0 - kernel(hβˆ‡, βˆ‡, x_bin, is, js; threads, blocks) - CUDA.synchronize() - CUDA.@sync for j in jsc - nbins = size(h[j], 2) - copyto!(h[j], view(hβˆ‡, :, 1:nbins, j)) - end - return nothing -end - -function update_hist_gpu1!(h, hβˆ‡, βˆ‡, x_bin, is, js, jsc) - kernel = @cuda launch = false hist_kernel!(hβˆ‡, βˆ‡, x_bin, is, js) - config = launch_configuration(kernel.fun) - max_threads = config.threads Γ· 4 - max_blocks = config.blocks * 4 - k = size(hβˆ‡, 1) - ty = max(1, min(length(js), fld(max_threads, k))) - tx = min(64, max(1, min(length(is), fld(max_threads, k * ty)))) - threads = (k, ty, tx) - by = cld(length(js), ty) - bx = min(cld(max_blocks, by), cld(length(is), tx)) - blocks = (1, by, bx) - hβˆ‡ .= 0 - kernel(hβˆ‡, βˆ‡, x_bin, is, js; threads, blocks) - CUDA.synchronize() - return nothing -end - -function update_hist_gpu2!(h, hβˆ‡_cpu, hβˆ‡, βˆ‡, x_bin, is, js, jsc) - kernel = @cuda launch = false hist_kernel!(hβˆ‡, βˆ‡, x_bin, is, js) - config = launch_configuration(kernel.fun) - max_threads = config.threads Γ· 4 - max_blocks = config.blocks * 4 - k = size(hβˆ‡, 1) - ty = max(1, min(length(js), fld(max_threads, k))) - tx = min(64, max(1, min(length(is), fld(max_threads, k * ty)))) - threads = (k, ty, tx) - by = cld(length(js), ty) - bx = min(cld(max_blocks, by), cld(length(is), tx)) - blocks = (1, by, bx) - hβˆ‡ .= 0 - kernel(hβˆ‡, βˆ‡, x_bin, is, js; threads, blocks) - copyto!(hβˆ‡_cpu, hβˆ‡) - CUDA.synchronize() - return nothing -end - - -function update_hist_gpu3!(h, hβˆ‡_cpu, hβˆ‡, βˆ‡, x_bin, is, js, jsc) - kernel = @cuda launch = false hist_kernel!(hβˆ‡, βˆ‡, x_bin, is, js) - config = launch_configuration(kernel.fun) - max_threads = config.threads Γ· 4 - max_blocks = config.blocks * 4 - k = size(hβˆ‡, 1) - ty = max(1, min(length(js), fld(max_threads, k))) - tx = min(64, max(1, min(length(is), fld(max_threads, k * ty)))) - threads = (k, ty, tx) - by = cld(length(js), ty) - bx = min(cld(max_blocks, by), cld(length(is), tx)) - blocks = (1, by, bx) - hβˆ‡ .= 0 - kernel(hβˆ‡, βˆ‡, x_bin, is, js; threads, blocks) - # CUDA.synchronize() - copyto!(hβˆ‡_cpu, hβˆ‡) - # CUDA.synchronize() - @threads for j in jsc - nbins = size(h[j], 2) - @views h[j] .= hβˆ‡_cpu[:, 1:nbins, j] - # h[j] .= hβˆ‡_cpu[:, 1:nbins, j] - end - return nothing -end - - -seed!(123) -nbins = 32 -nfeats = 100 
-nobs = Int(1e6) -x_bin = UInt8.(rand(1:nbins, nobs, nfeats)); -βˆ‡ = rand(Float32, 3, nobs); -hβˆ‡ = [zeros(Float32, 3, nbins) for n in 1:nfeats] -rowsample = 0.5 -colsample = 0.5 -is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true) -js = sample(1:nfeats, Int(round(rowsample * nfeats)), replace=false, ordered=true) - -βˆ‡_gpu = CuArray(βˆ‡) -x_bin_gpu = CuArray(x_bin) -hβˆ‡_cpu = zeros(Float32, 3, nbins, nfeats) -hβˆ‡_gpu = CuArray(hβˆ‡_cpu) -is_gpu = CuArray(is) -js_gpu = CuArray(js) - -CUDA.allowscalar(false) -CUDA.@time update_hist_gpu!(hβˆ‡, hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js_gpu, js) -# ref without copy to cpu: ~same -# ref 10K: 875.100 ΞΌs (168 allocations: 7.08 KiB) -# ref 100K: 1.236 ms (215 allocations: 9.91 KiB) -# ref 1M: 6.138 ms (227 allocations: 12.00 KiB) -# ref 10M: 67.075 ms (235 allocations: 13.38 KiB) - -# with copy -# CUDA v4 1M: 2.903 ms (124 allocations: 6.98 KiB) -# CUDA v5 1M: 3.542 ms (848 allocations: 37.14 KiB) -@btime update_hist_gpu!(hβˆ‡, hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js_gpu, js) - -# without copy -# CUDA v4 1M: 2.599 ms (74 allocations: 4.64 KiB) -# CUDA v5 1M: 2.274 ms (48 allocations: 2.77 KiB) -@btime update_hist_gpu1!(hβˆ‡, hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js_gpu, js) - -# without single array copy -# CUDA v4 1M: -# CUDA v5 1M: 2.447 ms (48 allocations: 2.77 KiB) -@btime update_hist_gpu2!(hβˆ‡, hβˆ‡_cpu, hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js_gpu, js) - -# without single array copy -# CUDA v4 1M: -# CUDA v5 1M: 2.442 ms (48 allocations: 2.77 KiB) -@btime update_hist_gpu3!(hβˆ‡, hβˆ‡_cpu, hβˆ‡_gpu, βˆ‡_gpu, x_bin_gpu, is_gpu, js_gpu, js) - - -using CUDA, BenchmarkTools -function gpu_copy!(h, hβˆ‡, jsc) - CUDA.@sync for j in jsc - nbins = size(h[j], 2) - copyto!(h[j], view(hβˆ‡, :, 1:nbins, j)) - end - return nothing -end - -hβˆ‡ = [zeros(Float32, 3, 32) for n in 1:100]; -hβˆ‡_gpu = CUDA.zeros(Float32, 3, 32, 100); -js = 1:100 - -# CUDA v4: 534.480 ΞΌs (100 allocations: 4.69 KiB) -# CUDA v5: 1.203 ms (1600 allocations: 68.75 KiB) -@btime gpu_copy!(hβˆ‡, hβˆ‡_gpu, js) - - -function gpu_copy2!(h, hβˆ‡, jsc) - for j in jsc - nbins = size(h[j], 2) - @async copyto!(h[j], view(hβˆ‡, :, 1:nbins, j)) - end - return nothing -end - -hβˆ‡ = [zeros(Float32, 3, 32) for n in 1:100]; -hβˆ‡_gpu = CUDA.zeros(Float32, 3, 32, 100); -js = 1:100 - -# CUDA v4: 534.480 ΞΌs (100 allocations: 4.69 KiB) -# CUDA v5: 1.203 ms (1600 allocations: 68.75 KiB) -@btime gpu_copy2!(hβˆ‡, hβˆ‡_gpu, js) diff --git a/experiments/hist_v1.jl b/experiments/hist_v1.jl deleted file mode 100644 index 80fd250b..00000000 --- a/experiments/hist_v1.jl +++ /dev/null @@ -1,306 +0,0 @@ -using Statistics -using StatsBase:sample -using Base.Threads:@threads -using BenchmarkTools -using EvoTrees -using SIMD -using LoopVectorization - -n = Int(1e6) -nvars = 100 -nbins = 64 -𝑖 = collect(1:n) -𝑗 = collect(1:nvars) -Ξ΄ = rand(n) -δ² = rand(n) -𝑀 = rand(n) - -hist_Ξ΄ = zeros(nbins, nvars) -hist_δ² = zeros(nbins, nvars) -hist_𝑀 = zeros(nbins, nvars) -X_bin = reshape(sample(UInt8.(1:nbins), n * nvars), n, nvars) - -function iter_1(X_bin, hist_Ξ΄, Ξ΄, 𝑖, 𝑗) - # hist_Ξ΄ .*= 0.0 - @inbounds @threads for j in 𝑗 - @inbounds for i in 𝑖 - hist_Ξ΄[X_bin[i,j], j] += Ξ΄[i] - end - end -end - -@time iter_1(X_bin, hist_Ξ΄, Ξ΄, 𝑖, 𝑗) -@btime iter_1(X_bin, hist_Ξ΄, Ξ΄, 𝑖, 𝑗) - -function iter_1A(X_bin, hist_Ξ΄, Ξ΄, 𝑖, 𝑗) - # hist_Ξ΄ .*= 0.0 - @inbounds @threads for j in 𝑗 - @inbounds for i in 𝑖 - hist_Ξ΄[X_bin[i,j], j] += Ξ΄[i] - end - end -end - -@time iter_1A(X_bin, hist_Ξ΄, Ξ΄, 𝑖, 
𝑗) -@btime iter_1A(X_bin, hist_Ξ΄, Ξ΄, 𝑖, 𝑗) - -### 3 features in seperate hists -function iter_1B(X_bin, hist_Ξ΄, hist_δ², hist_𝑀, Ξ΄, δ², 𝑀, 𝑖, 𝑗) - # hist_Ξ΄ .= 0.0 - # hist_δ² .= 0.0 - # hist_𝑀 .= 0.0 - @inbounds @threads for j in 𝑗 - @inbounds for i in 𝑖 - hist_Ξ΄[X_bin[i,j], j] += Ξ΄[i] - hist_δ²[X_bin[i,j], j] += δ²[i] - hist_𝑀[X_bin[i,j], j] += 𝑀[i] - end - end -end - -@time iter_1B(X_bin, hist_Ξ΄, hist_δ², hist_𝑀, Ξ΄, δ², 𝑀, 𝑖, 𝑗) -@btime iter_1B($X_bin, $hist_Ξ΄, $hist_δ², $hist_𝑀, $Ξ΄, $δ², $𝑀, $𝑖, $𝑗) - -𝑖2 = sample(𝑖, 500000, replace=false, ordered=true) -𝑗2 = sample(𝑗, 50, replace=false, ordered=true) -@time iter_1B(X_bin, hist_Ξ΄, hist_δ², hist_𝑀, Ξ΄, δ², 𝑀, 𝑖2, 𝑗2) -@btime iter_1B($X_bin, $hist_Ξ΄, $hist_δ², $hist_𝑀, $Ξ΄, $δ², $𝑀, $𝑖2, $𝑗2) - -### 3 features in common hists -hist_δ𝑀 = zeros(3, nbins, nvars) -function iter_2(X_bin, hist_δ𝑀, Ξ΄, δ², 𝑀, 𝑖, 𝑗) - # hist_δ𝑀 .= 0.0 - @inbounds @threads for j in 𝑗 - @inbounds @simd for i in 𝑖 - hist_δ𝑀[1, X_bin[i,j], j] += Ξ΄[i] - hist_δ𝑀[2, X_bin[i,j], j] += δ²[i] - hist_δ𝑀[3, X_bin[i,j], j] += 𝑀[i] - end - end -end - -@time iter_2(X_bin, hist_δ𝑀, Ξ΄, δ², 𝑀, 𝑖, 𝑗) -@btime iter_2($X_bin, $hist_δ𝑀, $Ξ΄, $δ², $𝑀, $𝑖, $𝑗) - -### 3 features in common hists - gradients/weight in single matrix -hist_δ𝑀 = zeros(3, nbins, nvars) -δ𝑀 = rand(3, n) - -function iter_3(X_bin, hist_δ𝑀, δ𝑀, 𝑖, 𝑗) - # hist_δ𝑀 .= 0.0 - @inbounds @threads for j in 𝑗 - @inbounds for i in 𝑖 - hist_δ𝑀[1, X_bin[i,j], j] += δ𝑀[1,i] - hist_δ𝑀[2, X_bin[i,j], j] += δ𝑀[2,i] - hist_δ𝑀[3, X_bin[i,j], j] += δ𝑀[3,i] - end - end -end - -@time iter_3(X_bin, hist_δ𝑀, δ𝑀, 𝑖, 𝑗) -@btime iter_3($X_bin, $hist_δ𝑀, $δ𝑀, $𝑖, $𝑗) - -# 39.849 ms (81 allocations: 8.22 KiB) -function iter_3B(X_bin, hist_δ𝑀, δ𝑀, 𝑖, 𝑗) - @inbounds @threads for j in 𝑗 - @inbounds @simd for i in 𝑖 - hist_δ𝑀[1, X_bin[i,j], j] += δ𝑀[1,i] - hist_δ𝑀[2, X_bin[i,j], j] += δ𝑀[2,i] - hist_δ𝑀[3, X_bin[i,j], j] += δ𝑀[3,i] - end - end -end - -@time iter_3B(X_bin, hist_δ𝑀, δ𝑀, 𝑖, 𝑗) -@btime iter_3B($X_bin, $hist_δ𝑀, $δ𝑀, $𝑖, $𝑗) - -# 38.027 ms (81 allocations: 8.22 KiB) -function iter_3C(X_bin, hist_δ𝑀, δ𝑀, 𝑖, 𝑗) - @inbounds @threads for j in 𝑗 - @inbounds for chunk in 1:10000 - @inbounds for iter in 1:100 - i = iter + 100 * (chunk - 1) - hist_δ𝑀[1, X_bin[i,j], j] += δ𝑀[1,i] - hist_δ𝑀[2, X_bin[i,j], j] += δ𝑀[2,i] - hist_δ𝑀[3, X_bin[i,j], j] += δ𝑀[3,i] - end - end - end -end - -@time iter_3C(X_bin, hist_δ𝑀, δ𝑀, 𝑖, 𝑗) -@btime iter_3C($X_bin, $hist_δ𝑀, $δ𝑀, $𝑖, $𝑗) - - -### 3 features in common hists - vector of matrix hists - gradients/weight in single matrix -hist_δ𝑀_vec = [zeros(3, nbins) for j in 1:nvars] -δ𝑀 = rand(3, n) - -# 33.762 ms (81 allocations: 8.22 KiB) -function iter_4(X_bin, hist_δ𝑀_vec, δ𝑀, 𝑖, 𝑗) - # [hist_δ𝑀_vec[j] .= 0.0 for j in 𝑗] - @inbounds @threads for j in 𝑗 - @inbounds @simd for i in 𝑖 - hist_δ𝑀_vec[j][1, X_bin[i,j]] += δ𝑀[1,i] - hist_δ𝑀_vec[j][2, X_bin[i,j]] += δ𝑀[2,i] - hist_δ𝑀_vec[j][3, X_bin[i,j]] += δ𝑀[3,i] - end - end -end - -@time iter_4(X_bin, hist_δ𝑀_vec, δ𝑀, 𝑖, 𝑗) -@btime iter_4($X_bin, $hist_δ𝑀_vec, $δ𝑀, $𝑖, $𝑗) - - -### 3 features in common hists - vector of matrix hists - gradients/weight in single matrix -hist_δ𝑀_vec = [zeros(3, nbins) for j in 1:nvars] -δ𝑀 = rand(n, 3) - -# 40.005 ms (81 allocations: 8.22 KiB) -function iter_4B(X_bin, hist_δ𝑀_vec, δ𝑀, 𝑖, 𝑗) - # [hist_δ𝑀_vec[j] .= 0.0 for j in 𝑗] - @inbounds @threads for j in 𝑗 - @inbounds @simd for i in 𝑖 - hist_δ𝑀_vec[j][1, X_bin[i,j]] += δ𝑀[i,1] - hist_δ𝑀_vec[j][2, X_bin[i,j]] += δ𝑀[i,2] - hist_δ𝑀_vec[j][3, X_bin[i,j]] += δ𝑀[i,3] - end 
- end -end - -@time iter_4B(X_bin, hist_δ𝑀_vec, δ𝑀, 𝑖, 𝑗); -@btime iter_4B($X_bin, $hist_δ𝑀_vec, $δ𝑀, $𝑖, $𝑗); - -### 3 features in common hists - vector of vec hists - gradients/weight in single vector -hist_δ𝑀_vec = [zeros(3 * nbins) for j in 1:nvars] -δ𝑀 = rand(3 * n) - - -function iter_5(X_bin, hist_δ𝑀_vec, δ𝑀, 𝑖, 𝑗) - # [hist_δ𝑀_vec[j] .= 0.0 for j in 𝑗] - @inbounds @threads for j in 𝑗 - @inbounds @simd for i in 𝑖 - # @inbounds if mask[i] - id = 3 * i - 2 - hid = 3 * X_bin[i,j] - 2 - hist_δ𝑀_vec[j][hid] += δ𝑀[id] - hist_δ𝑀_vec[j][hid + 1] += δ𝑀[id + 1] - hist_δ𝑀_vec[j][hid + 2] += δ𝑀[id + 2] - # end - end - end -end - -@time iter_5(X_bin, hist_δ𝑀_vec, δ𝑀, 𝑖, 𝑗) -@btime iter_5($X_bin, $hist_δ𝑀_vec, $δ𝑀, $𝑖, $𝑗) - -# 𝑖2 = sample(𝑖, 900000, replace=false, ordered=true) -# 𝑖3 = view(𝑖2, 100001:650000) -using Random -𝑖2 = sample(𝑖, 500000, replace=false, ordered=true) -𝑗2 = sample(𝑗, 50, replace=false, ordered=true) -@time iter_5(X_bin, hist_δ𝑀_vec, δ𝑀, 𝑖2, 𝑗2) -@btime iter_5($X_bin, $hist_δ𝑀_vec, $δ𝑀, $𝑖2, $𝑗2) - -### 3 features in common hists - vector of vec hists - gradients/weight in single vector - explicit loop -hist_δ𝑀_vec = [zeros(3 * nbins) for j in 1:nvars] -δ𝑀 = rand(3, n) - -function iter_5B(X_bin, hist_δ𝑀_vec, δ𝑀, 𝑖, 𝑗) - @inbounds @threads for j in 𝑗 - @inbounds @simd for i in 𝑖 - hid = 3 * X_bin[i,j] - 2 - hist_δ𝑀_vec[j][hid] += δ𝑀[1, i] - hist_δ𝑀_vec[j][hid + 1] += δ𝑀[2, i] - hist_δ𝑀_vec[j][hid + 2] += δ𝑀[3, i] - end - end -end - -@time iter_5B(X_bin, hist_δ𝑀_vec, δ𝑀, 𝑖, 𝑗) -@btime iter_5B($X_bin, $hist_δ𝑀_vec, $δ𝑀, $𝑖, $𝑗) - -### 3 features in common hists - vector of vec hists - gradients/weight in single vector - explicit loop -hist_δ𝑀_vec = [zeros(3 * nbins) for j in 1:nvars] -δ𝑀 = rand(n, 3) - -function iter_5C(X_bin, hist_δ𝑀_vec, δ𝑀, 𝑖, 𝑗) - @inbounds @threads for j in 𝑗 - @inbounds @simd for i in 𝑖 - hid = 3 * X_bin[i,j] - 2 - hist_δ𝑀_vec[j][hid] += δ𝑀[i,1] - hist_δ𝑀_vec[j][hid + 1] += δ𝑀[i,2] - hist_δ𝑀_vec[j][hid + 2] += δ𝑀[i,3] - end - end -end - -@time iter_5C(X_bin, hist_δ𝑀_vec, δ𝑀, 𝑖, 𝑗) -@btime iter_5C($X_bin, $hist_δ𝑀_vec, $δ𝑀, $𝑖, $𝑗) - -function iter_6(X_bin, hist_δ𝑀_vec, δ𝑀, 𝑖, 𝑗, K) - # [hist_δ𝑀_vec[j] .= 0.0 for j in 𝑗] - @inbounds @threads for j in 𝑗 - @inbounds @simd for i in 𝑖 - id = 3 * i - 2 - hid = 3 * X_bin[i,j] - 3 - for k in 1:K - hist_δ𝑀_vec[j][hid + k] += δ𝑀[id + k] - end - end - end -end - -K = 3 -@time iter_6(X_bin, hist_δ𝑀_vec, δ𝑀, 𝑖, 𝑗, K) -@btime iter_6($X_bin, $hist_δ𝑀_vec, $δ𝑀, $𝑖, $𝑗, $K) - -### 3 features in common hists - vector of vec hists - gradients/weight in single vector - with node assignations -hist_δ𝑀_vec = [[zeros(3 * nbins) for n in 1:16] for j in 1:nvars] -δ𝑀 = rand(3 * n) -𝑛 = sample(1:16, n) - -function iter_7(X_bin, hist_δ𝑀_vec::Vector{Vector{Vector{T}}}, δ𝑀::Vector{T}, 𝑖, 𝑗, 𝑛) where T - # [hist_δ𝑀_vec[j][n] .= 0.0 for n in 𝑛] - @inbounds @threads for j in 𝑗 - @inbounds @simd for i in 𝑖 - id = 3 * i - 2 - hid = 3 * X_bin[i,j] - 2 - n = 𝑛[i] - - hist_δ𝑀_vec[j][n][hid] += δ𝑀[id] - hist_δ𝑀_vec[j][n][hid + 1] += δ𝑀[id + 1] - hist_δ𝑀_vec[j][n][hid + 2] += δ𝑀[id + 2] - end - end -end - -@time iter_7(X_bin, hist_δ𝑀_vec, δ𝑀, 𝑖, 𝑗, 𝑛) -@btime iter_7($X_bin, $hist_δ𝑀_vec, $δ𝑀, $𝑖, $𝑗, $𝑛) - - - -using StatsBase:sample -using BenchmarkTools - -n_obs = Int(1e6) -n_vars = 100 -n_bins = 64 -K = 3 -𝑖 = collect(1:n_obs) -Ξ΄ = rand(K, n_obs) -hist = zeros(K, n_bins, n_vars); -X_bin = sample(UInt8.(1:n_bins), n_obs * n_vars); -X_bin = reshape(X_bin, n_obs, n_vars); -𝑖_sample = sample(𝑖, Int(n_obs / 2), ordered=true) - -function 
iter_1(X_bin, hist, Ξ΄, 𝑖) - hist .= 0.0 - @inbounds @simd for i in 𝑖 - @inbounds for k in 1:3 - hist[k, X_bin[i,1], 1] += Ξ΄[k, i] - end - end -end - -@btime iter_1($X_bin, $hist, $Ξ΄, $𝑖_sample) \ No newline at end of file diff --git a/experiments/hist_v2.jl b/experiments/hist_v2.jl deleted file mode 100644 index f107b1ca..00000000 --- a/experiments/hist_v2.jl +++ /dev/null @@ -1,66 +0,0 @@ -# using Statistics -using StatsBase:sample -# using Base.Threads:@threads -using BenchmarkTools - -n_obs = Int(1e6) -n_vars = 100 -n_bins = 64 -K = 3 -𝑖 = collect(1:n_obs) -Ξ΄ = rand(n_obs, K) -hist = zeros(K, n_bins, n_vars); -X_bin = sample(UInt8.(1:n_bins), n_obs * n_vars); -X_bin = reshape(X_bin, n_obs, n_vars); - -function iter_1(X_bin, hist, Ξ΄, 𝑖) - hist .= 0.0 - @inbounds for i in 𝑖 - @inbounds for k in 1:3 - hist[k, X_bin[i,1], 1] += Ξ΄[i,k] - end - end -end - -𝑖_sample = sample(𝑖, Int(n_obs / 2), ordered=true) - -@time iter_1(X_bin, hist, Ξ΄, 𝑖_sample) -@btime iter_1($X_bin, $hist, $Ξ΄, $𝑖_sample) - - - -function iter_2(X_bin, hist, Ξ΄, 𝑖) - hist .= 0.0 - @inbounds @simd for i in CartesianIndices(𝑖) - @inbounds @simd for k in 1:3 - hist[k, X_bin[𝑖[i],1], 1] += Ξ΄[𝑖[i],k] - end - end -end - -𝑖_sample = sample(𝑖, Int(n_obs / 2), ordered=true) - -@time iter_2(X_bin, hist, Ξ΄, 𝑖_sample) -@btime iter_2($X_bin, $hist, $Ξ΄, $𝑖_sample) - - - -# slower -Ξ΄ = rand(K, n_obs) -hist = zeros(K, n_bins, n_vars); -X_bin = sample(UInt8.(1:n_bins), n_obs * n_vars); -X_bin = reshape(X_bin, n_obs, n_vars); - -function iter_1(X_bin, hist, Ξ΄, 𝑖) - hist .= 0.0 - @inbounds for i in 𝑖 - @inbounds for k in 1:3 - hist[k, X_bin[i,1], 1] += Ξ΄[k,i] - end - end -end - -𝑖_sample = sample(𝑖, Int(n_obs / 2), ordered=true) - -@time iter_1(X_bin, hist, Ξ΄, 𝑖_sample) -@btime iter_1($X_bin, $hist, $Ξ΄, $𝑖_sample) diff --git a/experiments/logistic_tests.jl b/experiments/logistic_tests.jl deleted file mode 100644 index d4f507de..00000000 --- a/experiments/logistic_tests.jl +++ /dev/null @@ -1,142 +0,0 @@ -using Statistics -using StatsBase: sample, sample! 
-using EvoTrees -using BenchmarkTools -using CUDA - -# prepare a dataset -features = rand(Int(1.25e4), 100) -# features = rand(100, 10) -X = features -Y = rand(size(X, 1)) -𝑖 = collect(1:size(X, 1)) - -# train-eval split -𝑖_sample = sample(𝑖, size(𝑖, 1), replace=false) -train_size = 0.8 -𝑖_train = 𝑖_sample[1:floor(Int, train_size * size(𝑖, 1))] -𝑖_eval = 𝑖_sample[floor(Int, train_size * size(𝑖, 1))+1:end] - -x_train, x_eval = X[𝑖_train, :], X[𝑖_eval, :] -y_train, y_eval = Y[𝑖_train], Y[𝑖_eval] - - -########################### -# Tree CPU -########################### -params_c = EvoTrees.EvoTreeMLE(T=Float32, - loss=:logistic, - nrounds=200, - lambda=0.1, gamma=0.0, eta=0.1, - max_depth=5, min_weight=1.0, - rowsample=0.5, colsample=0.5, nbins=16); - -model_c, cache_c = EvoTrees.init_evotree(params_c, x_train, y_train); -EvoTrees.grow_evotree!(model_c, cache_c) -p = model_c(x_train) -sort(p[:,1]) -sort(p[:,2]) - -# initialize from cache -params_c = model_c.params -x_size = size(cache_c.X_bin) - -# select random rows and cols -sample!(params_c.rng, cache_c.𝑖_, cache_c.nodes[1].𝑖, replace=false, ordered=true); -sample!(params_c.rng, cache_c.𝑗_, cache_c.𝑗, replace=false, ordered=true); -# @btime sample!(params_c.rng, cache_c.𝑖_, cache_c.nodes[1].𝑖, replace=false, ordered=true); -# @btime sample!(params_c.rng, cache_c.𝑗_, cache_c.𝑗, replace=false, ordered=true); - -𝑖 = cache_c.nodes[1].𝑖 -𝑗 = cache_c.𝑗 - -# build a new tree -# 897.800 ΞΌs (6 allocations: 736 bytes) -get_loss_type(m::EvoTreeGaussian{L,T,S}) where {L,T,S} = L -get_loss_type(m::EvoTrees.EvoTreeLogistic{L,T,S}) where {L,T,S} = L - -L = get_loss_type(params_c) -@time EvoTrees.update_grads!(L, cache_c.δ𝑀, cache_c.pred, cache_c.Y; alpha=params_c.alpha) -cache_c.δ𝑀 - -sort(cache_c.δ𝑀[1, :]) -sort(cache_c.δ𝑀[2, :]) -sort(cache_c.δ𝑀[3, :]) -sort(cache_c.δ𝑀[4, :]) - -p = collect(-3:0.5:3) -y = collect(-3:0.5:3) - -function get_grads(p, y) - grad = zeros(length(p), length(y)) - for i in eachindex(p) - for j in eachindex(y) - # alternate from 1 - # grad[i, j] = -(exp(-2s) * (u - y) * (u - y + exp(s) * sinh(exp(-s) * (u - y)))) / (1 + cosh(exp(-s) * (u - y))) - grad[i, j] = (exp(-2 * p[i]) * (0.0 - y[j]) * (0.0 - y[j] + exp(p[i]) * sinh(exp(-p[i]) * (0.0 - y[j])))) / (1 + cosh(exp(-p[i]) * (0.0 - y[j]))) - end - end - return grad -end - -grads = get_grads(p, y) -heatmap(grads) -# @btime EvoTrees.update_grads!($params_c.loss, $cache_c.δ𝑀, $cache_c.pred_cpu, $cache_c.Y_cpu, $params_c.Ξ±) -# βˆ‘ = vec(sum(cache_c.Ξ΄[𝑖,:], dims=1)) -# gain = EvoTrees.get_gain(params_c.loss, βˆ‘, params_c.Ξ») -# assign a root and grow tree -# train_nodes[1] = EvoTrees.TrainNode(UInt32(0), UInt32(1), βˆ‘, gain) - -# 62.530 ms (7229 allocations: 17.43 MiB) -# 1.25e5: 9.187 ms (7358 allocations: 2.46 MiB) -tree = EvoTrees.Tree(params_c.max_depth, model_c.K, zero(typeof(params_c.Ξ»))) -@time EvoTrees.grow_tree!(tree, cache_c.nodes, params_c, cache_c.δ𝑀, cache_c.edges, cache_c.𝑗, cache_c.left, cache_c.left, cache_c.right, cache_c.X_bin, cache_c.K) -@btime EvoTrees.grow_tree!($EvoTrees.Tree(params_c.max_depth, model_c.K, zero(typeof(params_c.Ξ»))), $cache_c.nodes, $params_c, $cache_c.δ𝑀, $cache_c.edges, $cache_c.𝑗, $cache_c.left, $cache_c.left, $cache_c.right, $cache_c.X_bin, $cache_c.K) - -@time EvoTrees.grow_tree!(EvoTrees.Tree(params_c.max_depth, model_c.K, params_c.Ξ»), params_c, cache_c.Ξ΄, cache_c.hist, cache_c.histL, cache_c.histR, cache_c.gains, cache_c.edges, 𝑖, 𝑗, 𝑛, cache_c.X_bin); -@btime EvoTrees.grow_tree!(EvoTrees.Tree($params_c.max_depth, $model_c.K, 
$params_c.Ξ»), $params_c, $cache_c.Ξ΄, $cache_c.hist, $cache_c.histL, $cache_c.histR, $cache_c.gains, $cache_c.edges, $𝑖, $𝑗, $𝑛, $cache_c.X_bin); -@code_warntype EvoTrees.grow_tree!(EvoTrees.Tree(params_c.max_depth, model_c.K, params_c.Ξ»), params_c, cache_c.Ξ΄, cache_c.hist, cache_c.histL, cache_c.histR, cache_c.gains, cache_c.edges, 𝑖, 𝑗, 𝑛, cache_c.X_bin); - -# push!(model_c.trees, tree) -# 1.883 ms (83 allocations: 13.77 KiB) -@btime EvoTrees.predict!(model_c.params.loss, cache_c.pred_cpu, tree, cache_c.X, model_c.K) - -δ𝑀, K, edges, X_bin, nodes, out, left, right = cache_c.δ𝑀, cache_c.K, cache_c.edges, cache_c.X_bin, cache_c.nodes, cache_c.out, cache_c.left, cache_c.right; - -# 9.613 ms (81 allocations: 13.55 KiB) -# 1.25e5: 899.200 ΞΌs (81 allocations: 8.22 KiB) -@time EvoTrees.update_hist!(params_c.loss, nodes[1].h, δ𝑀, X_bin, 𝑖, 𝑗, K) -@btime EvoTrees.update_hist!($params_c.loss, $nodes[1].h, $δ𝑀, $X_bin, $𝑖, $𝑗, $K) -@btime EvoTrees.update_hist!($nodes[1].h, $δ𝑀, $X_bin, $nodes[1].𝑖, $𝑗) -@code_warntype EvoTrees.update_hist!(hist, Ξ΄, X_bin, 𝑖, 𝑗, 𝑛) - -j = 1 -# 8.399 ΞΌs (80 allocations: 13.42 KiB) -n = 1 -nodes[1].βˆ‘ .= vec(sum(δ𝑀[:, 𝑖], dims=2)) -EvoTrees.update_gains!(params_c.loss, nodes[n], 𝑗, params_c, K) -nodes[1].gains -# findmax(nodes[1].gains) #1.25e5: 36.500 ΞΌs (81 allocations: 8.22 KiB) -@btime EvoTrees.update_gains!($params_c.loss, $nodes[n], $𝑗, $params_c, $K) -@code_warntype EvoTrees.update_gains!(params_c.loss, nodes[n], 𝑗, params_c, K) - -#1.25e5: 14.100 ΞΌs (1 allocation: 32 bytes) -best = findmax(nodes[n].gains) -@btime best = findmax(nodes[n].gains) -@btime best = findmax(view(nodes[n].gains, :, 𝑗)) - -tree.cond_bin[n] = best[2][1] -tree.feat[n] = best[2][2] - -Int.(tree.cond_bin[n]) -# tree.cond_bin[n] = 32 - -# 204.900 ΞΌs (1 allocation: 96 bytes) -offset = 0 -@time EvoTrees.split_set!(left, right, 𝑖, X_bin, tree.feat[n], tree.cond_bin[n], offset) -@btime EvoTrees.split_set!($left, $right, $𝑖, $X_bin, $tree.feat[n], $tree.cond_bin[n], $offset) -@code_warntype EvoTrees.split_set!(left, right, 𝑖, X_bin, tree.feat[n], tree.cond_bin[n]) - -# 1.25e5: 227.200 ΞΌs (22 allocations: 1.44 KiB) -@time EvoTrees.split_set_threads!(out, left, right, 𝑖, X_bin, tree.feat[n], tree.cond_bin[n], offset) -@btime EvoTrees.split_set_threads!($out, $left, $right, $𝑖, $X_bin, $tree.feat[n], $tree.cond_bin[n], $offset, Int(2e15)) diff --git a/experiments/loss_gpu.jl b/experiments/loss_gpu.jl deleted file mode 100644 index 37cacad8..00000000 --- a/experiments/loss_gpu.jl +++ /dev/null @@ -1,38 +0,0 @@ -using CUDAnative -using CuArrays -using StaticArrays -using BenchmarkTools - -N = Int(1e6) -pred = rand(SVector{1, Float32}, N) -target = rand(Float32, N) -Ξ΄ = zeros(SVector{1, Float32}, N) -δ² = zeros(SVector{1, Float32}, N) - -pred_g = CuArray(rand(Float32, N)) -target_g = CuArray(rand(Float32, N)) -Ξ΄_g = CuArray(zeros(Float32, N)) -δ²_g = CuArray(zeros(Float32, N)) - -pred = Array(pred_g) -target = Array(target_g) -Ξ΄ = Array(Ξ΄_g) -δ² = Array(δ²_g) - -# linear -function update_grads!(pred::Vector{SVector{L,T}}, target::AbstractVector{T}, Ξ΄::Vector{SVector{L,T}}, δ²::Vector{SVector{L,T}}) where {T <: AbstractFloat, L} - @inbounds for i in eachindex(Ξ΄) - Ξ΄[i] = SVector(2 * (pred[i][1] - target[i])) - δ²[i] = SVector(2) - end -end - -# linear -function update_grads_gpu!(pred::AbstractVector{T}, target::AbstractVector{T}, Ξ΄::AbstractVector{T}, δ²::AbstractVector{T}, 𝑀::AbstractVector{T}) where {T <: AbstractFloat} - @. Ξ΄ = 2f0 * (pred - target) * 𝑀 - @. 
δ² = 2f0 * 𝑀 - return -end - -@time update_grads!(pred, target, Ξ΄, δ²) -CuArrays.@time update_grads_gpu!(pred_g, target_g, Ξ΄_g, δ²_g) diff --git a/experiments/shuffling.jl b/experiments/shuffling.jl deleted file mode 100644 index 6d11a93e..00000000 --- a/experiments/shuffling.jl +++ /dev/null @@ -1,185 +0,0 @@ -using DataFrames -using Distributions -using EvoTrees -using LinearAlgebra -using GLM -using Random - -Ξ΄ = 1.0e-6 -b = fill(1.0 - Ξ΄, 3, 3) + Ξ΄ * I -z = zeros(3, 3) -y = fill(0.5, 3) -dist = MvNormal([ - b z z 0.8*y - z b z y - z z b 1.2*y - 0.8*y' y' 1.2*y' 1.0]) -Random.seed!(1) -mat = rand(dist, 10_000); -df = DataFrame(transpose(mat), [string.("x", 1:9); "y"]); -target_name = "y" - -################################# -# Tables API -################################# -config = EvoTreeRegressor(rng=123) -m1 = fit_evotree(config, - df; - target_name="y", - verbosity=0); -EvoTrees.importance(m1) - -config = EvoTreeRegressor(rng=124) -m2 = fit_evotree(config, - df; - target_name="y", - verbosity=0); -EvoTrees.importance(m2) - -################################# -# Tables API - GPU -################################# -config = EvoTreeRegressor(rowsample=0.5, rng=123) -m1 = fit_evotree(config, - df; - device="gpu", - target_name="y", - verbosity=0); -EvoTrees.importance(m1) - -config = EvoTreeRegressor(rowsample=0.5, rng=124) -m2 = fit_evotree(config, - df; - target_name="y", - verbosity=0); -EvoTrees.importance(m2) - - -# permuted tables doesn't return the same result - numerical rounding error? -df2 = df[!, 10:-1:1] -config = EvoTreeRegressor() -m3 = fit_evotree(config, - df2; - target_name="y", - verbosity=0); -EvoTrees.importance(m3) - -# manual check on col permutations -config = EvoTreeRegressor(max_depth=4) -m1, cache1 = EvoTrees.init(config, df; target_name); -EvoTrees.grow_evotree!(m1, cache1, config, EvoTrees.CPU) -EvoTrees.importance(m1) - -df2 = df[!, 10:-1:1]; -config = EvoTreeRegressor(max_depth=4) -m2, cache2 = EvoTrees.init(config, df2; target_name); -EvoTrees.grow_evotree!(m2, cache2, config, EvoTrees.CPU) -EvoTrees.importance(m2) - -all(cache1.x_bin .== cache2.x_bin[:, 9:-1:1]) -all(cache1.edges .== cache2.edges[9:-1:1]) -m1.trees[2] -m2.trees[2] - -m1.trees[2].feat -m2.trees[2].feat - -Int.(m1.trees[2].cond_bin) -Int.(m2.trees[2].cond_bin) - - -config = EvoTreeRegressor(nrounds=100, eta=0.05, colsample=1.0) -m3 = fit_evotree(config, - df; - target_name="y", - verbosity=0); -EvoTrees.importance(m3) - -################################# -# Tables API -################################# -config = EvoTreeRegressor(colsample=0.5, rng=123) -m1 = fit_evotree(config, - df; - target_name="y", - verbosity=0); -EvoTrees.importance(m1) - -m2 = fit_evotree(config, - df; - target_name="y", - verbosity=0); -EvoTrees.importance(m2) - -################################# -# Matrix API -################################# -x_train = Matrix(mat[1:9, :]') -y_train = mat[10, :] - -config = EvoTreeRegressor() -m1 = fit_evotree(config; - x_train, - y_train, - verbosity=0); -EvoTrees.importance(m1) - -m2 = fit_evotree(config; - x_train, - y_train, - verbosity=0); -EvoTrees.importance(m2) - -using GLM -x_train = Matrix(mat[1:9, :]') -y_train = mat[10, :] -lm(x_train, y_train) - -################################# -# Matrix debug API -################################# -x_train = Matrix(mat[1:9, :]') -y_train = mat[10, :] - -config = EvoTreeRegressor() -m1, cache1 = EvoTrees.init(config, x_train, y_train); -EvoTrees.grow_evotree!(m1, cache1, config, EvoTrees.CPU) -EvoTrees.importance(m1) - -m2, 
cache2 = EvoTrees.init(config, x_train, y_train); -EvoTrees.grow_evotree!(m2, cache2, config, EvoTrees.CPU) -EvoTrees.importance(m2) - - -using BenchmarkTools -using Random: rand! -using CUDA -nobs = 1_000_000 -rng1 = Random.MersenneTwister(123) -rng2 = Random.Xoshiro(123) -rng3 = Random.TaskLocalRNG() - -is_in = zeros(UInt32, nobs); -is_out = zeros(UInt32, nobs); -mask = zeros(UInt8, nobs); - -@btime EvoTrees.subsample(is_in, is_out, mask, 0.5, rng1); -@btime EvoTrees.get_rand!(rng1, mask); - -@btime EvoTrees.subsample(is_in, is_out, mask, 0.5, rng2); -@btime EvoTrees.get_rand!(rng2, mask); - -@btime EvoTrees.subsample(is_in, is_out, mask, 0.5, rng3); -@btime EvoTrees.get_rand!(rng3, mask); - -get_rand2!(rng, mask) = rand!(rng, mask) -@btime get_rand2!(rng1, mask); -@btime get_rand2!(rng2, mask); -@btime get_rand2!(rng3, mask); - -mask = CUDA.zeros(UInt8, nobs); -CUDA.@time EvoTrees.get_rand_gpu!(mask); -CUDA.@time CUDA.rand(UInt8, nobs); -# @btime EvoTrees.get_rand_gpu!(mask); -# @btime CUDA.rand(UInt8, nobs); - diff --git a/experiments/update_set_v1.jl b/experiments/update_set_v1.jl deleted file mode 100644 index 70c2fa7c..00000000 --- a/experiments/update_set_v1.jl +++ /dev/null @@ -1,277 +0,0 @@ -using Statistics -using StatsBase:sample -using Base.Threads:@threads -using BenchmarkTools -using Revise -using EvoTrees - -n_obs = Int(1e6) -n_vars = 100 -n_bins = 255 -𝑖 = collect(1:n_obs); -𝑗 = collect(1:n_vars); -Ξ΄ = rand(n_obs); -δ² = rand(n_obs); - -hist_Ξ΄ = zeros(n_bins, n_vars); -hist_δ² = zeros(n_bins, n_vars); -X_bin = rand(UInt8, n_obs, n_vars); - -# split row ids into left and right based on best split condition -function update_set_1(set, best, x_bin) - left = similar(set) - right = similar(set) - left_count = 0 - right_count = 0 - @inbounds for i in set - if x_bin[i] <= best - left_count += 1 - left[left_count] = i - else - right_count += 1 - right[right_count] = i - end - end - resize!(left, left_count) - resize!(right, right_count) - return left, right -end - -@time update_set_1(𝑖, 16, X_bin[:,1]); -@btime update_set_1($𝑖, 16, $X_bin[:,1]); -@btime update_set_1($𝑖, 64, $X_bin[:,1]); -@btime update_set_1($𝑖, 128, $X_bin[:,1]); -@btime update_set_1($𝑖, 240, $X_bin[:,1]); - -# add a leaf id update - to indicate to which leaf the set is associated -function update_set_2!(leaf_vec::Vector{T}, set, best_feat, best_cond, x_bin, depth::T) where {T} - @inbounds for i in set - left_id = leaf_vec[i] + 2^depth - right_id = left_id + 1 - x_bin[i, best_feat[leaf_vec[i]]] <= best_cond[leaf_vec[i]] ? 
leaf_vec[i] = left_id : leaf_vec[i] = right_id - end -end - -leaf_vec = ones(UInt16, n_obs); -leaf_id = 0 -depth = UInt16(1) -depth = 1 -best_feat = UInt16.(sample(1:100, 100000)) -best_cond = rand(UInt16, 100000); - -@time update_set_2!(leaf_vec, 𝑖, best_feat, best_cond, X_bin, depth); -@btime update_set_2!($leaf_vec, $𝑖, $best_feat, $best_cond, $X_bin, $depth); -Int.(leaf_vec) - - -# split row ids into left and right based on best split condition -function split_set_1!(left::V, right::V, 𝑖, X_bin::Matrix{S}, feat, cond_bin, offset) where {S,V} - - left_count = 0 - right_count = 0 - - @inbounds for i in 1:length(𝑖) - @inbounds if X_bin[𝑖[i], feat] <= cond_bin - left_count += 1 - left[offset + left_count] = 𝑖[i] - else - right[offset + length(𝑖) - right_count] = 𝑖[i] - right_count += 1 - end - end - # return (left[1:left_count], right[1:right_count]) - return (view(left, (offset + 1):(offset + left_count)), view(right, (offset + length(𝑖)):-1:(offset + left_count + 1))) - # return nothing -end - -n = Int(1e6) -nvars = 100 -nbins = 64 -𝑖 = collect(1:n) -𝑗 = collect(1:nvars) -X_bin = reshape(sample(UInt8.(1:nbins), n * nvars), n, nvars) -left = similar(𝑖) -right = similar(𝑖) - -𝑖 = sample(𝑖, Int(5e5), replace=false, ordered=true) - -offset = 0 -feat = 15 -cond_bin = 32 -@time lid2, rid2 = split_set_1!(left, right, 𝑖, X_bin, feat, cond_bin, offset) -@btime split_set_1!($left, $right, $𝑖, $X_bin, $feat, $cond_bin, $offset) -@code_warntype split_set_1!(left, right, 𝑖, X_bin, feat, cond_bin, offset) - -offset = 0 -feat = 15 -cond_bin = 32 -lid1, rid1 = split_set_1!(left, right, 𝑖, X_bin, feat, cond_bin, offset) -offset = 0 -feat = 14 -cond_bin = 12 -lid2, rid2 = split_set_1!(left, right, lid1, X_bin, feat, cond_bin, offset) -offset = + length(lid1) -feat = 14 -cond_bin = 12 -lid3, rid3 = split_set_1!(left, right, rid1, X_bin, feat, cond_bin, offset) - -lid1_ = deepcopy(lid1) - -𝑖 -unique(vcat(lid1, rid1)) -unique(vcat(lid1)) -unique(vcat(rid1)) -unique(sort(vcat(lid2, rid2))) -unique(sort(vcat(lid3, rid3))) -unique(sort(vcat(lid2, rid2, lid3, rid3))) - -# split row ids into left and right based on best split condition -function split_set_2!(left, right, 𝑖, x_bin, feat, cond_bin) - - left_count = 0 - right_count = 0 - - @inbounds for i in 1:length(𝑖) - if x_bin[i] <= cond_bin - left_count += 1 - left[left_count] = 𝑖[i] - else - right_count += 1 - right[right_count] = 𝑖[i] - end - end - # return (left[1:left_count], right[1:right_count]) - return (view(left, 1:left_count), view(right, 1:right_count)) - # return nothing -end - -n = Int(1e6) -nvars = 100 -nbins = 64 -𝑖 = collect(1:n) -𝑗 = collect(1:nvars) -X_bin = reshape(sample(UInt8.(1:nbins), n * nvars), n, nvars) -left = similar(𝑖) -right = similar(𝑖) - -feat = 15 -cond_bin = 32 -@time left, right = split_set_2!(left, right, 𝑖, X_bin[:,feat], feat, cond_bin) -@btime split_set_2!($left, $right, $𝑖, $X_bin[:,feat], $feat, $cond_bin) -@btime split_set_2!($left, $right, $𝑖, $view(X_bin, :, feat), $feat, $cond_bin) - - - -# function split_set_bool!(child_bool::AbstractVector{Bool}, 𝑖, X_bin::Matrix{S}, feat, cond_bin, offset) where {S} -# left_count = 0 -# right_count = 0 -# @inbounds for i in eachindex(𝑖) -# child_bool[i + offset] = X_bin[𝑖[i], feat] <= cond_bin -# end -# # return (view(𝑖, child_bool[offset + 1:offset + length(𝑖)]), view(𝑖, .!child_bool[offset + 1:offset + length(𝑖)])) -# # return (view(𝑖, view(child_bool, (offset + 1):(offset + length(𝑖)))), view(𝑖, view(child_bool, (offset + 1):(offset + length(𝑖))))) -# # return (view(𝑖, 
child_bool), view(𝑖, child_bool)) -# return view(𝑖, child_bool) -# end - -function split_set_chunk!(left, right, block, bid, X_bin, feat, cond_bin, offset, chunk_size, lefts, rights, bsizes) - left_count = 0 - right_count = 0 - @inbounds for i in eachindex(block) - @inbounds if X_bin[block[i], feat] <= cond_bin - left_count += 1 - left[offset + chunk_size * (bid - 1) + left_count] = block[i] - else - right_count += 1 - right[offset + chunk_size * (bid - 1) + right_count] = block[i] - # right[offset + chunk_size * (bid - 1) + length(block) - right_count] = block[i] - # right_count += 1 - end - end - lefts[bid] = left_count - rights[bid] = right_count - bsizes[bid] = length(block) - return nothing -end - -function split_set_threads!(out, left, right, 𝑖, X_bin::Matrix{S}, feat, cond_bin, offset, chunk_size=2^14) where {S} - - left_count = 0 - right_count = 0 - iter = Iterators.partition(𝑖, chunk_size) - nblocks = length(iter) - lefts = zeros(Int, nblocks) - rights = zeros(Int, nblocks) - bsizes = zeros(Int, nblocks) - - @sync for (bid, block) in enumerate(iter) - Threads.@spawn split_set_chunk!(left, right, block, bid, X_bin, feat, cond_bin, offset, chunk_size, lefts, rights, bsizes) - end - - left_sum = sum(lefts) - left_cum = 0 - right_cum = 0 - @inbounds for bid in 1:nblocks - view(out, offset + left_cum + 1:offset + left_cum + lefts[bid]) .= view(left, offset + chunk_size * (bid - 1) + 1:offset + chunk_size * (bid - 1) + lefts[bid]) - view(out, offset + left_sum + right_cum + 1:offset + left_sum + right_cum + rights[bid]) .= view(right, offset + chunk_size * (bid - 1) + 1:offset + chunk_size * (bid - 1) + rights[bid]) - - # out[offset + left_cum + 1:offset + left_cum + lefts[bid]] .= left[offset + chunk_size * (bid - 1) + 1:offset + chunk_size * (bid - 1) + lefts[bid]] - # out[offset + left_sum + right_cum + 1:offset + left_sum + right_cum + rights[bid]] .= right[offset + chunk_size * (bid - 1) + 1:offset + chunk_size * (bid - 1) + rights[bid]] - - # view(right, offset + right_cum + 1:offset + right_cum + rights[bid]) .= view(right, offset + chunk_size * (bid - 1) + 1:offset + chunk_size * (bid - 1) + rights[bid]) - # view(right, offset + length(𝑖) - right_cum:-1:offset + length(𝑖) - right_cum - rights[bid] + 1) .= view(right, offset + chunk_size * (bid - 1) + bsizes[bid]:-1:offset + chunk_size * (bid - 1) + lefts[bid]+1) - left_cum += lefts[bid] - right_cum += rights[bid] - end - - # right_cum = 0 - # @inbounds for bid in nblocks:-1:1 - # # view(right, offset + right_cum + 1:offset + right_cum + rights[bid]) .= view(right, offset + chunk_size * (bid - 1) + 1:offset + chunk_size * (bid - 1) + rights[bid]) - # view(right, offset + length(𝑖) - right_cum:-1:offset + length(𝑖) - right_cum - rights[bid] + 1) .= view(right, offset + chunk_size * (bid - 1) + lefts[bid] + 1:offset + chunk_size * (bid - 1) + bsizes[bid]) - # # right_cum += rights[bid] - # end - - return (view(out, offset + 1:offset + sum(lefts)), view(out, offset + sum(lefts)+1:offset + length(𝑖))) - # return (lefts, rights, out) - # return (view(left, offset + 1:offset + sum(lefts)), view(right, offset + length(𝑖):-1:offset + sum(lefts) + 1)) - # return (left[offset + 1:offset + sum(lefts)], right[offset + 1:offset + sum(rights)]) -end - -iter = Iterators.partition(rand(5), 3) -for i in enumerate(iter) - println(i) -end - -n = Int(1e6) -nvars = 100 -nbins = 64 -𝑖 = collect(1:n); -𝑗 = collect(1:nvars); -X_bin = reshape(sample(UInt8.(1:nbins), n * nvars), n, nvars); -𝑖 = sample(𝑖, Int(5e5), replace=false, ordered=true); -child_bool 
= zeros(Bool, length(𝑖)); -left = similar(𝑖) -right = similar(𝑖) -out = similar(𝑖) - -offset = 0 -feat = 15 -cond_bin = 32 -@time l, r = split_set_threads!(out, left, right, 𝑖, X_bin, feat, cond_bin, offset, 2^14); -@btime split_set_threads!($out, $left, $right, $𝑖, $X_bin, $feat, $cond_bin, $offset, 2^14); -@code_warntype split_set_1!(left, right, 𝑖, X_bin, feat, cond_bin, offset) - -offset = 0 -feat = 15 -cond_bin = 32 -lid1, rid1 = split_set_threads!(out, left, right, 𝑖, X_bin, feat, cond_bin, offset) -offset = 0 -feat = 14 -cond_bin = 12 -lid2, rid2 = split_set_threads!(out, left, right, lid1, X_bin, feat, cond_bin, offset) -offset = + length(lid1) -feat = 14 -cond_bin = 12 -lid3, rid3 = split_set_threads!(out, left, right, rid1, X_bin, feat, cond_bin, offset) - -lid1_ = deepcopy(lid1) diff --git a/experiments/update_set_v2.jl b/experiments/update_set_v2.jl deleted file mode 100644 index 43f3c2ca..00000000 --- a/experiments/update_set_v2.jl +++ /dev/null @@ -1,101 +0,0 @@ -using Statistics -using StatsBase:sample -using Base.Threads:@threads -using BenchmarkTools -using Revise -using EvoTrees - -n_obs = Int(1e6) -n_vars = 100 -n_bins = 255 -𝑖 = collect(1:n_obs); -𝑗 = collect(1:n_vars); -X_bin = rand(UInt8, n_obs, n_vars); - -function split_set_chunk!(left, right, block, bid, X_bin, feat, cond_bin, offset, chunk_size, lefts, rights, bsizes) - left_count = 0 - right_count = 0 - @inbounds for i in eachindex(block) - @inbounds if X_bin[block[i], feat] <= cond_bin - left_count += 1 - left[offset + chunk_size * (bid - 1) + left_count] = block[i] - else - right_count += 1 - right[offset + chunk_size * (bid - 1) + right_count] = block[i] - # right[offset + chunk_size * (bid - 1) + length(block) - right_count] = block[i] - # right_count += 1 - end - end - lefts[bid] = left_count - rights[bid] = right_count - bsizes[bid] = length(block) - return nothing -end - -function split_set_threads!(out, left, right, 𝑖, X_bin::Matrix{S}, feat, cond_bin, offset, chunk_size=2^14) where {S} - - left_count = 0 - right_count = 0 - iter = Iterators.partition(𝑖, chunk_size) - nblocks = length(iter) - lefts = zeros(Int, nblocks) - rights = zeros(Int, nblocks) - bsizes = zeros(Int, nblocks) - - @sync for (bid, block) in enumerate(iter) - Threads.@spawn split_set_chunk!(left, right, block, bid, X_bin, feat, cond_bin, offset, chunk_size, lefts, rights, bsizes) - end - - left_sum = sum(lefts) - left_cum = 0 - right_cum = 0 - @inbounds for bid in 1:nblocks - view(out, offset + left_cum + 1:offset + left_cum + lefts[bid]) .= view(left, offset + chunk_size * (bid - 1) + 1:offset + chunk_size * (bid - 1) + lefts[bid]) - view(out, offset + left_sum + right_cum + 1:offset + left_sum + right_cum + rights[bid]) .= view(right, offset + chunk_size * (bid - 1) + 1:offset + chunk_size * (bid - 1) + rights[bid]) - left_cum += lefts[bid] - right_cum += rights[bid] - end - return (view(out, offset + 1:offset + sum(lefts)), view(out, offset + sum(lefts)+1:offset + length(𝑖))) -end - -iter = Iterators.partition(rand(5), 3) -for i in enumerate(iter) - println(i) -end - -n = Int(1e6) -nvars = 100 -nbins = 64 -𝑖 = collect(1:n); -𝑗 = collect(1:nvars); -X_bin = reshape(sample(UInt8.(1:nbins), n * nvars), n, nvars); -𝑖 = sample(𝑖, Int(5e5), replace=false, ordered=true); -child_bool = zeros(Bool, length(𝑖)); -left = similar(𝑖) -right = similar(𝑖) -out = similar(𝑖) - -offset = 0 -feat = 15 -cond_bin = 32 -@time l, r = split_set_threads!(out, left, right, 𝑖, X_bin, feat, cond_bin, offset, 2^14); -@btime split_set_threads!($out, $left, $right, 
$𝑖, $X_bin, $feat, $cond_bin, $offset, 2^14); -@code_warntype split_set_1!(left, right, 𝑖, X_bin, feat, cond_bin, offset) - -offset = 0 -feat = 15 -cond_bin = 32 -lid1, rid1 = split_set_threads!(out, left, right, 𝑖, X_bin, feat, cond_bin, offset) -offset = 0 -feat = 14 -cond_bin = 12 -lid2, rid2 = split_set_threads!(out, left, right, lid1, X_bin, feat, cond_bin, offset) -offset = + length(lid1) -feat = 14 -cond_bin = 12 -lid3, rid3 = split_set_threads!(out, left, right, rid1, X_bin, feat, cond_bin, offset) - -lid1_ = deepcopy(lid1) - - - diff --git a/ext/EvoTreesCUDAExt/fit-utils.jl b/ext/EvoTreesCUDAExt/fit-utils.jl index 2533f789..02de1b1c 100644 --- a/ext/EvoTreesCUDAExt/fit-utils.jl +++ b/ext/EvoTreesCUDAExt/fit-utils.jl @@ -26,17 +26,19 @@ end function update_hist_gpu!(h, hβˆ‡_cpu, hβˆ‡, βˆ‡, x_bin, is, js, jsc) kernel = @cuda launch = false hist_kernel!(hβˆ‡, βˆ‡, x_bin, is, js) config = launch_configuration(kernel.fun) - max_threads = config.threads Γ· 4 - max_blocks = config.blocks * 4 + max_threads = config.threads + max_blocks = config.blocks k = size(hβˆ‡, 1) ty = max(1, min(length(js), fld(max_threads, k))) tx = min(64, max(1, min(length(is), fld(max_threads, k * ty)))) threads = (k, ty, tx) + max_blocks = min(65535, max_blocks * fld(max_threads, prod(threads))) by = cld(length(js), ty) bx = min(cld(max_blocks, by), cld(length(is), tx)) blocks = (1, by, bx) hβˆ‡ .= 0 kernel(hβˆ‡, βˆ‡, x_bin, is, js; threads, blocks) + CUDA.synchronize() copyto!(hβˆ‡_cpu, hβˆ‡) Threads.@threads for j in jsc nbins = size(h[j], 2) @@ -45,58 +47,6 @@ function update_hist_gpu!(h, hβˆ‡_cpu, hβˆ‡, βˆ‡, x_bin, is, js, jsc) return nothing end -function hist_kernel_vec!(hβˆ‡, βˆ‡, x_bin, is) - tix, k = threadIdx().x, threadIdx().y - bdx = blockDim().x - bix = blockIdx().x - gdx = gridDim().x - - i_max = length(is) - niter = cld(i_max, bdx * gdx) - @inbounds for iter in 1:niter - i = tix + bdx * (bix - 1) + bdx * gdx * (iter - 1) - if i <= i_max - idx = is[i] - bin = x_bin[idx] - hid = Base._to_linear_index(hβˆ‡, k, bin) - CUDA.atomic_add!(pointer(hβˆ‡, hid), βˆ‡[k, idx]) - end - end - # CUDA.sync_threads() - return nothing -end -function update_hist_gpu_vec!(h, hβˆ‡, βˆ‡, x_bin, is, js::Vector) - kernel = @cuda launch = false hist_kernel_vec!(hβˆ‡[js[1]], βˆ‡, view(x_bin, :, js[1]), is) - config = launch_configuration(kernel.fun) - max_threads = config.threads - max_blocks = config.blocks - @assert size(hβˆ‡[js[1]], 1) <= max_threads "number of classes cannot be larger than 31 on GPU" - ty = min(64, size(hβˆ‡[js[1]], 1)) - tx = max(1, min(length(is), fld(max_threads, ty))) - threads = (tx, ty, 1) - bx = min(max_blocks, cld(length(is), tx)) - blocks = (bx, 1, 1) - # @sync for j in js - # @async hβˆ‡[j] .= 0 - # end - for j in js - hβˆ‡[j] .= 0 - h[j] .= 0 - end - CUDA.synchronize() - # @info "hist" max_blocks length(is) threads blocks - @sync for j in js - @async kernel(hβˆ‡[j], βˆ‡, view(x_bin, :, j), is; threads, blocks) - # kernel(hβˆ‡[j], βˆ‡, view(x_bin, :, j), is; threads, blocks) - end - CUDA.synchronize() - for j in js - copyto!(h[j], hβˆ‡[j]) - end - CUDA.synchronize() - return nothing -end - # Multi-threads split_set! # Take a view into left and right placeholders. Right ids are assigned at the end of the length of the current node set. 
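The scheme implemented by the kernels below is a two-pass chunked partition: each chunk of the node's index set first counts and stages its left/right ids without any coordination, then the staged pieces are stitched into the shared `out` buffer using the per-chunk counts as write offsets, so left ids land contiguously at the front and right ids immediately behind them. A minimal single-threaded sketch of that idea (all names here are illustrative placeholders, not EvoTrees internals):

```julia
# Single-threaded reduction of the two-pass split pattern that the kernels
# below implement in parallel. Placeholder names, for exposition only.
function split_two_pass(is::Vector{Int}, goes_left::Vector{Bool}; chunk_size::Int=4)
    chunks = collect(Iterators.partition(eachindex(is), chunk_size))
    staged = [(Int[], Int[]) for _ in chunks]
    # pass 1: each chunk stages its own left/right ids independently
    for (bid, chunk) in enumerate(chunks)
        for i in chunk
            push!(staged[bid][goes_left[i] ? 1 : 2], is[i])
        end
    end
    # pass 2: stitch staged pieces into `out`, using per-chunk counts as offsets
    out = similar(is)
    sum_lefts = sum(length(s[1]) for s in staged)
    lpos, rpos = 0, 0
    for (l, r) in staged
        out[lpos+1:lpos+length(l)] .= l
        out[sum_lefts+rpos+1:sum_lefts+rpos+length(r)] .= r
        lpos += length(l)
        rpos += length(r)
    end
    return view(out, 1:sum_lefts), view(out, sum_lefts+1:length(is))
end

# e.g. split_two_pass(collect(1:10), isodd.(1:10)) groups odd ids before even ids
```
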
function split_chunk_kernel!( diff --git a/ext/EvoTreesCUDAExt/predict.jl b/ext/EvoTreesCUDAExt/predict.jl index 6ae81645..e983dd4e 100644 --- a/ext/EvoTreesCUDAExt/predict.jl +++ b/ext/EvoTreesCUDAExt/predict.jl @@ -149,17 +149,19 @@ function EvoTrees.predict!( pred .= max.(T(-15), pred .- maximum(pred, dims=1)) end -# prediction from single tree - assign each observation to its final leaf +# prediction for EvoTree model function predict( m::EvoTree{L,K}, data, ::Type{<:EvoTrees.GPU}; ntree_limit=length(m.trees)) where {L,K} - pred = CUDA.zeros(K, size(data, 1)) + Tables.istable(data) ? data = Tables.columntable(data) : nothing ntrees = length(m.trees) ntree_limit > ntrees && error("ntree_limit is larger than number of trees $ntrees.") x_bin = CuArray(EvoTrees.binarize(data; fnames=m.info[:fnames], edges=m.info[:edges])) + nobs = size(x_bin, 1) + pred = CUDA.zeros(K, nobs) feattypes = CuArray(m.info[:feattypes]) for i = 1:ntree_limit EvoTrees.predict!(pred, m.trees[i], x_bin, feattypes) diff --git a/figures/gaussian-sinus-binary.png b/figures/gaussian-sinus-binary.png index 991b99f6..ef5b99a7 100644 Binary files a/figures/gaussian-sinus-binary.png and b/figures/gaussian-sinus-binary.png differ diff --git a/figures/logistic-sinus-binary.png b/figures/logistic-sinus-binary.png index b353ed8d..0a1d9807 100644 Binary files a/figures/logistic-sinus-binary.png and b/figures/logistic-sinus-binary.png differ diff --git a/figures/quantiles-sinus-binary.png b/figures/quantiles-sinus-binary.png index 127e4eed..35bc9c33 100644 Binary files a/figures/quantiles-sinus-binary.png and b/figures/quantiles-sinus-binary.png differ diff --git a/figures/regression-sinus-binary-gpu.png b/figures/regression-sinus-binary-gpu.png index 1be5632a..8a4fce16 100644 Binary files a/figures/regression-sinus-binary-gpu.png and b/figures/regression-sinus-binary-gpu.png differ diff --git a/figures/regression-sinus-binary.png b/figures/regression-sinus-binary.png index c81dc366..7da5c892 100644 Binary files a/figures/regression-sinus-binary.png and b/figures/regression-sinus-binary.png differ diff --git a/figures/regression-sinus-oblivious-gpu.png b/figures/regression-sinus-oblivious-gpu.png index eb60970d..c1726951 100644 Binary files a/figures/regression-sinus-oblivious-gpu.png and b/figures/regression-sinus-oblivious-gpu.png differ diff --git a/figures/regression-sinus-oblivious.png b/figures/regression-sinus-oblivious.png index 358d6749..fc332dfd 100644 Binary files a/figures/regression-sinus-oblivious.png and b/figures/regression-sinus-oblivious.png differ diff --git a/figures/regression-sinus2-binary.png b/figures/regression-sinus2-binary.png index 964652e9..475e6e8c 100644 Binary files a/figures/regression-sinus2-binary.png and b/figures/regression-sinus2-binary.png differ diff --git a/figures/regression-sinus2-oblivious.png b/figures/regression-sinus2-oblivious.png index 2a1eb75f..8bdebc95 100644 Binary files a/figures/regression-sinus2-oblivious.png and b/figures/regression-sinus2-oblivious.png differ diff --git a/src/MLJ.jl b/src/MLJ.jl index bdf1ad8f..4711f98c 100644 --- a/src/MLJ.jl +++ b/src/MLJ.jl @@ -1,9 +1,15 @@ function MMI.fit(model::EvoTypes, verbosity::Int, A, y, w=nothing) - fitresult, cache = init(model, A.matrix, y; w_train=w) + + A = isa(A, AbstractMatrix) ? Tables.columntable(Tables.table(A)) : Tables.columntable(A) + nobs = Tables.DataAPI.nrow(A) + fnames = Tables.schema(A).names + w = isnothing(w) ? 
device_ones(CPU, Float32, nobs) : Vector{Float32}(w) + fitresult, cache = init_core(model, CPU, A, fnames, y, w, nothing) + while cache[:info][:nrounds] < model.nrounds grow_evotree!(fitresult, cache, model) end - report = (features=A.names,) + report = (features=cache[:fnames],) return fitresult, cache, report end @@ -12,25 +18,6 @@ function okay_to_continue(model, fitresult, cache) all(_get_struct_loss(model) .== _get_struct_loss(fitresult)) end -# Generate names to be used by feature_importances in the report -MMI.reformat(::EvoTypes, X, y, w) = - ((matrix=MMI.matrix(X), names=[name for name ∈ schema(X).names]), y, w) -MMI.reformat(::EvoTypes, X, y) = - ((matrix=MMI.matrix(X), names=[name for name ∈ schema(X).names]), y) -MMI.reformat(::EvoTypes, X) = - ((matrix=MMI.matrix(X), names=[name for name ∈ schema(X).names]),) -MMI.reformat(::EvoTypes, X::AbstractMatrix, y, w) = - ((matrix=X, names=["feat_$i" for i = 1:size(X, 2)]), y, w) -MMI.reformat(::EvoTypes, X::AbstractMatrix, y) = - ((matrix=X, names=["feat_$i" for i = 1:size(X, 2)]), y) -MMI.reformat(::EvoTypes, X::AbstractMatrix) = - ((matrix=X, names=["feat_$i" for i = 1:size(X, 2)]),) -MMI.selectrows(::EvoTypes, I, A, y, w) = - ((matrix=view(A.matrix, I, :), names=A.names), view(y, I), view(w, I)) -MMI.selectrows(::EvoTypes, I, A, y) = - ((matrix=view(A.matrix, I, :), names=A.names), view(y, I)) -MMI.selectrows(::EvoTypes, I, A) = ((matrix=view(A.matrix, I, :), names=A.names),) - # For EarlyStopping.jl support MMI.iteration_parameter(::Type{<:EvoTypes}) = :nrounds @@ -47,7 +34,7 @@ function MMI.update( while cache[:info][:nrounds] < model.nrounds grow_evotree!(fitresult, cache, model) end - report = (features=A.names,) + report = (features=cache[:fnames],) else fitresult, cache, report = fit(model, verbosity, A, y, w) end @@ -55,32 +42,32 @@ function MMI.update( end function predict(::EvoTreeRegressor, fitresult, A) - pred = vec(predict(fitresult, A.matrix)) + pred = predict(fitresult, A) return pred end function predict(::EvoTreeClassifier, fitresult, A) - pred = predict(fitresult, A.matrix) + pred = predict(fitresult, A) return MMI.UnivariateFinite(fitresult.info[:target_levels], pred, pool=missing) end function predict(::EvoTreeCount, fitresult, A) - Ξ»s = vec(predict(fitresult, A.matrix)) + Ξ»s = predict(fitresult, A) return [Distributions.Poisson(Ξ») for Ξ» ∈ Ξ»s] end function predict(::EvoTreeGaussian, fitresult, A) - pred = predict(fitresult, A.matrix) + pred = predict(fitresult, A) return [Distributions.Normal(pred[i, 1], pred[i, 2]) for i in axes(pred, 1)] end function predict(::EvoTreeMLE{L}, fitresult, A) where {L<:GaussianMLE} - pred = predict(fitresult, A.matrix) + pred = predict(fitresult, A) return [Distributions.Normal(pred[i, 1], pred[i, 2]) for i in axes(pred, 1)] end function predict(::EvoTreeMLE{L}, fitresult, A) where {L<:LogisticMLE} - pred = predict(fitresult, A.matrix) + pred = predict(fitresult, A) return [Distributions.Logistic(pred[i, 1], pred[i, 2]) for i in axes(pred, 1)] end @@ -109,7 +96,7 @@ MMI.metadata_pkg.( MMI.metadata_model( EvoTreeRegressor, input_scitype=Union{ - MMI.Table(MMI.Continuous, MMI.Count, MMI.OrderedFactor), + MMI.Table(MMI.Continuous, MMI.Count, MMI.OrderedFactor, MMI.Multiclass), AbstractMatrix{MMI.Continuous}, }, target_scitype=AbstractVector{<:MMI.Continuous}, @@ -120,7 +107,7 @@ MMI.metadata_model( MMI.metadata_model( EvoTreeClassifier, input_scitype=Union{ - MMI.Table(MMI.Continuous, MMI.Count, MMI.OrderedFactor), + MMI.Table(MMI.Continuous, MMI.Count, MMI.OrderedFactor, 
MMI.Multiclass), AbstractMatrix{MMI.Continuous}, }, target_scitype=AbstractVector{<:MMI.Finite}, @@ -131,7 +118,7 @@ MMI.metadata_model( MMI.metadata_model( EvoTreeCount, input_scitype=Union{ - MMI.Table(MMI.Continuous, MMI.Count, MMI.OrderedFactor), + MMI.Table(MMI.Continuous, MMI.Count, MMI.OrderedFactor, MMI.Multiclass), AbstractMatrix{MMI.Continuous}, }, target_scitype=AbstractVector{<:MMI.Count}, @@ -142,7 +129,7 @@ MMI.metadata_model( MMI.metadata_model( EvoTreeGaussian, input_scitype=Union{ - MMI.Table(MMI.Continuous, MMI.Count, MMI.OrderedFactor), + MMI.Table(MMI.Continuous, MMI.Count, MMI.OrderedFactor, MMI.Multiclass), AbstractMatrix{MMI.Continuous}, }, target_scitype=AbstractVector{<:MMI.Continuous}, @@ -153,7 +140,7 @@ MMI.metadata_model( MMI.metadata_model( EvoTreeMLE, input_scitype=Union{ - MMI.Table(MMI.Continuous, MMI.Count, MMI.OrderedFactor), + MMI.Table(MMI.Continuous, MMI.Count, MMI.OrderedFactor, MMI.Multiclass), AbstractMatrix{MMI.Continuous}, }, target_scitype=AbstractVector{<:MMI.Continuous}, @@ -214,7 +201,7 @@ model = fit_evotree(config; x_train, y_train, kwargs...) ## Inference -Predictions are obtained using [`predict`](@ref) which returns a `Matrix` of size `[nobs, 1]`: +Predictions are obtained using [`predict`](@ref) which returns a `Vector` of length `nobs`: ```julia EvoTrees.predict(model, X) @@ -456,7 +443,7 @@ model = fit_evotree(config; x_train, y_train, kwargs...) ## Inference -Predictions are obtained using [`predict`](@ref) which returns a `Matrix` of size `[nobs, 1]`: +Predictions are obtained using [`predict`](@ref) which returns a `Vector` of length `nobs`: ```julia EvoTrees.predict(model, X) diff --git a/src/fit-utils.jl b/src/fit-utils.jl index f0fb0b93..8ea83790 100644 --- a/src/fit-utils.jl +++ b/src/fit-utils.jl @@ -24,7 +24,7 @@ function get_edges(X::AbstractMatrix{T}; fnames, nbins, rng=Random.MersenneTwist end function get_edges(df; fnames, nbins, rng=Random.MersenneTwister()) - _nobs = length(Tables.getcolumn(df, 1)) + _nobs = Tables.DataAPI.nrow(df) nobs = min(_nobs, 1000 * nbins) idx = sample(rng, 1:_nobs, nobs, replace=false, ordered=true) edges = Vector{Any}([Vector{eltype(Tables.getcolumn(df, col))}() for col in fnames]) @@ -175,45 +175,40 @@ function split_set_threads!( lefts = zeros(Int, nblocks) rights = zeros(Int, nblocks) - @sync begin - for bid = 1:nblocks - @spawn begin - lefts[bid], rights[bid] = split_set_chunk!( - left, - right, - is, - bid, - nblocks, - x_bin, - feat, - cond_bin, - feattype, - offset, - chunk_size, - ) - end - end + @threads for bid = 1:nblocks + lefts[bid], rights[bid] = split_set_chunk!( + left, + right, + is, + bid, + nblocks, + x_bin, + feat, + cond_bin, + feattype, + offset, + chunk_size, + ) end sum_lefts = sum(lefts) cumsum_lefts = cumsum(lefts) cumsum_rights = cumsum(rights) - @sync begin - for bid = 1:nblocks - @spawn split_views_kernel!( - out, - left, - right, - bid, - offset, - chunk_size, - lefts, - rights, - sum_lefts, - cumsum_lefts, - cumsum_rights, - ) - end + + @threads for bid = 1:nblocks + split_views_kernel!( + out, + left, + right, + bid, + offset, + chunk_size, + lefts, + rights, + sum_lefts, + cumsum_lefts, + cumsum_rights, + ) end return ( diff --git a/src/fit.jl b/src/fit.jl index feb07d7a..f6be2e68 100644 --- a/src/fit.jl +++ b/src/fit.jl @@ -365,6 +365,7 @@ function fit_evotree( ) where {L} @assert Tables.istable(dtrain) "fit_evotree(params, dtrain) only accepts Tables compatible input for `dtrain` (ex: named tuples, DataFrames...)" + dtrain = Tables.columntable(dtrain) verbosity 
== 1 && @info params @assert string(device) ∈ ["cpu", "gpu"] _device = string(device) == "cpu" ? CPU : GPU @@ -379,6 +380,7 @@ function fit_evotree( @warn "To track eval metric in logger, both `metric` and `deval` must be provided." end if logging_flag + deval = Tables.columntable(deval) cb = CallBack(params, m, deval, _device; target_name, w_name, offset_name, metric) logger = init_logger(; metric, maximise=is_maximise(cb.feval), early_stopping_rounds) cb(logger, 0, m.trees[end]) diff --git a/src/predict.jl b/src/predict.jl index 300ebe66..9c753b20 100644 --- a/src/predict.jl +++ b/src/predict.jl @@ -86,10 +86,11 @@ function predict( ::Type{<:Device}=CPU; ntree_limit=length(m.trees)) where {L,K} + Tables.istable(data) ? data = Tables.columntable(data) : nothing ntrees = length(m.trees) ntree_limit > ntrees && error("ntree_limit is larger than number of trees $ntrees.") x_bin = binarize(data; fnames=m.info[:fnames], edges=m.info[:edges]) - nobs = Tables.istable(data) ? length(Tables.getcolumn(data, 1)) : size(data, 1) + nobs = size(x_bin, 1) pred = zeros(Float32, K, nobs) for i = 1:ntree_limit predict!(pred, m.trees[i], x_bin, m.info[:feattypes]) diff --git a/test/MLJ.jl b/test/MLJ.jl index 906503cf..18219342 100644 --- a/test/MLJ.jl +++ b/test/MLJ.jl @@ -162,7 +162,6 @@ tree_model = EvoTreeCount( ) X = MLJBase.table(X) -X = MLJBase.matrix(X) # typeof(X) mach = machine(tree_model, X, Y) @@ -367,6 +366,8 @@ tree_model = EvoTreeRegressor( nbins=32, ) +X = MLJBase.table(X) + # typeof(X) mach = machine(tree_model, X, Y, W) train, test = partition(eachindex(Y), 0.8, shuffle=true); # 70:30 split @@ -376,3 +377,23 @@ mach.model.nrounds += 10 fit!(mach, rows=train, verbosity=1) report(mach) + +@testset "MLJ - rowtables - EvoTreeRegressor" begin + X, y = make_regression(1000, 5) + X = Tables.rowtable(X) + booster = EvoTreeRegressor() + # smoke tests: + mach = machine(booster, X, y) |> fit! + fit!(mach) + predict(mach, X) +end + +@testset "MLJ - matrix - EvoTreeRegressor" begin + X, y = make_regression(1000, 5) + X = Tables.matrix(X) + booster = EvoTreeRegressor() + # smoke tests: + mach = machine(booster, X, y) |> fit! + fit!(mach) + predict(mach, X) +end diff --git a/test/runtests.jl b/test/runtests.jl index a4228a49..e043aa38 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,6 +2,7 @@ using Statistics using EvoTrees using EvoTrees: predict using CategoricalArrays +using Tables using Random using Test
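
A common thread in the src/fit.jl, src/predict.jl, and src/MLJ.jl hunks above is normalizing whatever the caller passes (DataFrame, row table, named tuple, or raw matrix) to a Tables.jl column table once, up front, so the rest of the pipeline can rely on `Tables.DataAPI.nrow` and uniform column access instead of measuring the first column. A minimal sketch of that pattern, assuming a hypothetical helper name (`normalize_input` is not part of this PR):

```julia
using Tables

# Normalize any Tables.jl-compatible input, or a plain matrix, to a column
# table (a NamedTuple of column vectors). Hypothetical helper for exposition.
function normalize_input(data)
    data = data isa AbstractMatrix ? Tables.table(data) : data
    return Tables.istable(data) ? Tables.columntable(data) : data
end

X = rand(100, 5)
ct = normalize_input(X)
Tables.DataAPI.nrow(ct)   # 100, without touching a particular column
Tables.schema(ct).names   # (:Column1, :Column2, :Column3, :Column4, :Column5)
```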