diff --git a/Project.toml b/Project.toml index 51145a5e..195f96e0 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ -authors = ["jeremiedb "] name = "EvoTrees" uuid = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5" -version = "0.16.4" +authors = ["jeremiedb "] +version = "0.16.5" [deps] BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" @@ -16,6 +16,12 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" +[weakdeps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" + +[extensions] +EvoTreesCUDAExt = "CUDA" + [compat] BSON = "0.3" CUDA = "3.0, 4.0, 5.0" @@ -28,9 +34,6 @@ StatsBase = "0.32, 0.33, 0.34" Tables = "1.9" julia = "1.6" -[extensions] -EvoTreesCUDAExt = "CUDA" - [extras] CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" @@ -42,6 +45,3 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] docs = ["Documenter"] test = ["CUDA", "DataFrames", "Test", "MLJBase", "MLJTestInterface"] - -[weakdeps] -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" diff --git a/benchmarks/regressor.jl b/benchmarks/regressor.jl index d38804b6..5e20a326 100644 --- a/benchmarks/regressor.jl +++ b/benchmarks/regressor.jl @@ -12,9 +12,9 @@ import CUDA # desktop | 1e6 | depth 11 | cpu: 37.2s # desktop | 10e6 | depth 11 | cpu -### perf depth -# desktop | 1e6 | depth 11 | cpu: 28s gpu: 73 sec | xgboost: 26s -# desktop | 10e6 | depth 11 | cpu 205s gpu: 109 sec | xgboost 260s +### v0.16.5 +# desktop | 1e6 | depth 11 | cpu: 31s gpu: 50 sec | xgboost cpu: 26s +# desktop | 10e6 | depth 11 | cpu 200s gpu: 80 sec | xgboost cpu: 267s #threads # laptop depth 6: 12.717845 seconds (2.08 M allocations: 466.228 MiB) diff --git a/experiments/depth-debug.jl b/experiments/depth-debug.jl deleted file mode 100644 index 4a8df1eb..00000000 --- a/experiments/depth-debug.jl +++ /dev/null @@ -1,129 +0,0 @@ -using Statistics -using StatsBase:sample -using Base.Threads:@threads -using BenchmarkTools -using Revise -using EvoTrees -using Profile - -nobs = Int(1e6) -num_feat = Int(100) -nrounds = 200 -nthread = Base.Threads.nthreads() -x_train = rand(nobs, num_feat) -y_train = rand(size(x_train, 1)) - -config = EvoTreeRegressor(; - loss=:mse, - nrounds=200, - lambda=0.0, - gamma=0.0, - eta=0.05, - max_depth=10, - min_weight=1.0, - rowsample=0.5, - colsample=0.5, - nbins=64, - tree_type="binary", - rng=123 -) - -################################ -# high-level -################################ -_device = EvoTrees.GPU -@time EvoTrees.fit_evotree(config; x_train, y_train, device = "gpu") - -@time m, cache = EvoTrees.init(config, x_train, y_train); -@time EvoTrees.grow_evotree!(m, cache, config) -@btime EvoTrees.grow_evotree!(m, cache, config) - -Profile.clear() -# Profile.init() -Profile.init(n = 10^5, delay = 0.01) -# @profile m, cache = EvoTrees.init(config, x_train, y_train); -@profile EvoTrees.grow_evotree!(m, cache, config) -Profile.print() - -################################ -# mid-level -################################ -@time m, cache = EvoTrees.init(config, x_train, y_train); -@time EvoTrees.grow_evotree!(m, cache, config) -# compute gradients -@time m, cache = EvoTrees.init(config, x_train, y_train); -@time EvoTrees.update_grads!(cache.∇, cache.pred, cache.y, config) -# subsample rows -@time cache.nodes[1].is = EvoTrees.subsample(cache.is_in, cache.is_out, cache.mask, config.rowsample, config.rng) -# subsample cols -EvoTrees.sample!(config.rng, cache.js_, cache.js, replace=false, ordered=true) 
-L = EvoTrees._get_struct_loss(m) -# instantiate a tree then grow it -tree = EvoTrees.Tree{L,1}(config.max_depth) -grow! = config.tree_type == "oblivious" ? EvoTrees.grow_otree! : EvoTrees.grow_tree! -@time EvoTrees.grow_tree!( - tree, - cache.nodes, - config, - cache.∇, - cache.edges, - cache.js, - cache.out, - cache.left, - cache.right, - cache.x_bin, - cache.feattypes, - cache.monotone_constraints -) - -using ProfileView -ProfileView.@profview EvoTrees.grow_tree!( - tree, - cache.nodes, - config, - cache.∇, - cache.edges, - cache.js, - cache.out, - cache.left, - cache.right, - cache.x_bin, - cache.feattypes, - cache.monotone_constraints -) - -################################ -# end mid-level -################################ - - -@time m_evo = grow_tree!(params_evo; x_train, y_train, device, print_every_n=100); - -@info "train - no eval" -@time m_evo = fit_evotree(params_evo; x_train, y_train, device, print_every_n=100); - - -offset = 0 -feat = 15 -cond_bin = 32 -@time l, r = split_set_threads!(out, left, right, 𝑖, X_bin, feat, cond_bin, offset, 2^14); -@btime split_set_threads!($out, $left, $right, $𝑖, $X_bin, $feat, $cond_bin, $offset, 2^14); -@code_warntype split_set_1!(left, right, 𝑖, X_bin, feat, cond_bin, offset) - -offset = 0 -feat = 15 -cond_bin = 32 -lid1, rid1 = split_set_threads!(out, left, right, 𝑖, X_bin, feat, cond_bin, offset) -offset = 0 -feat = 14 -cond_bin = 12 -lid2, rid2 = split_set_threads!(out, left, right, lid1, X_bin, feat, cond_bin, offset) -offset = + length(lid1) -feat = 14 -cond_bin = 12 -lid3, rid3 = split_set_threads!(out, left, right, rid1, X_bin, feat, cond_bin, offset) - -lid1_ = deepcopy(lid1) - - - diff --git a/experiments/hist/hist_cpu.jl b/experiments/hist/hist_cpu.jl index 3208afb4..0c70ad7f 100644 --- a/experiments/hist/hist_cpu.jl +++ b/experiments/hist/hist_cpu.jl @@ -35,497 +35,7 @@ h∇ = [zeros(Float64, 3, nbins) for n in 1:nfeats] is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true) js = sample(1:nfeats, Int(round(rowsample * nfeats)), replace=false, ordered=true) -# 6.886 ms (97 allocations: 10.67 KiB) -# 9.624 ms (97 allocations: 10.67 KiB) +# laptop: 6.886 ms (97 allocations: 10.67 KiB) +# desktop: 3.451 ms (61 allocations: 6.52 KiB) @time hist_cpu!(h∇, ∇, x_bin, is, js) @btime hist_cpu!($h∇, $∇, $x_bin, $is, $js) - -# base kernel -function kernel1!(h, x, id) - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - if i <= size(id, 1) && j <= size(id, 2) - @inbounds k = Base._to_linear_index(h, id[i, j], j) - # @inbounds k = id[i,j] + 32 * (j-1) - @inbounds CUDA.atomic_add!(pointer(h, k), x[i, j]) - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu1!(h::AbstractMatrix{T}, x::AbstractMatrix{T}, id::AbstractMatrix{S}; MAX_THREADS=256) where {T,S} - # thread_i = min(MAX_THREADS, size(id, 1)) - # thread_j = min(MAX_THREADS ÷ thread_i, size(id, 2)) - thread_j = min(MAX_THREADS, size(id, 2)) - thread_i = min(MAX_THREADS ÷ thread_j, size(id, 1)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (size(id, 1), size(id, 2)) .÷ threads) - # println("threads:", threads) - # println("blocks:", blocks) - @cuda blocks = blocks threads = threads kernel1!(h, x, id) - return -end - -nbins = 32 -ncol = 100 -items = Int32(1e6) -hist = zeros(Float32, nbins, ncol) -ÎŽ = rand(Float32, items, ncol) -idx = UInt8.(rand(1:nbins, items, ncol)) -# idx = Int64.(rand(1:nbins, items, ncol)) - -hist_gpu = CuArray(hist) 
-ÎŽ_gpu = CuArray(ÎŽ) -idx_gpu = CuArray(idx) - -hist .- Array(hist_gpu) -sum(hist) - sum(Array(hist_gpu)) - -CUDA.@time hist_gpu1!(hist_gpu, ÎŽ_gpu, idx_gpu, MAX_THREADS=1024) -@time hist_cpu!(hist, ÎŽ, idx) -@btime hist_cpu!($hist, $ÎŽ, $idx) -@btime CUDA.@sync hist_gpu1!($hist_gpu, $ÎŽ_gpu, $idx_gpu, MAX_THREADS=1024) -# test on view -CUDA.@time hist_gpu1!(hist_gpu, view(ÎŽ_gpu, 1:items÷2, 1:ncol÷2), view(idx_gpu, 1:items÷2, 1:ncol÷2), MAX_THREADS=1024) - -size(ÎŽ_gpu) -size(view(ÎŽ_gpu, 1:items÷2, 1:ncol÷2)) - -############################################################## -## Build histogram from a subsample idx -# base kernel -function kernel2!(h::CuDeviceMatrix{T}, x::CuDeviceMatrix{T}, id, 𝑖, 𝑗) where {T<:AbstractFloat} - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - if i <= length(𝑖) && j <= length(𝑗) - @inbounds k = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 𝑗[j]) - @inbounds CUDA.atomic_add!(pointer(h, k), x[𝑖[i], 𝑗[j]]) - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu2!(h::CuMatrix{T}, x::CuMatrix{T}, id, 𝑖, 𝑗; MAX_THREADS=1024) where {T<:AbstractFloat} - thread_j = min(MAX_THREADS, length(𝑗)) - thread_i = min(MAX_THREADS ÷ thread_j, length(𝑖)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (length(𝑖), length(𝑗)) ./ threads) - # println("threads:", threads) - # println("blocks:", blocks) - CUDA.@sync begin - @cuda blocks = blocks threads = threads kernel2!(h, x, id, 𝑖, 𝑗) - end - return -end - -hist = zeros(Float32, nbins, ncol) -ÎŽ = rand(Float32, items, ncol) -idx = rand(1:nbins, items, ncol) -𝑖 = sample(1:items, items ÷ 2, replace=false, ordered=true) -𝑗 = sample(1:ncol, ncol ÷ 2, replace=false, ordered=true) -hist_gpu = CuArray(hist) -ÎŽ_gpu = CuArray(ÎŽ) -idx_gpu = CuArray(idx) -𝑖_gpu = CuArray(𝑖) -𝑗_gpu = CuArray(𝑗) - -CUDA.@time hist_gpu2!(hist_gpu, ÎŽ_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024) -@btime hist_gpu2!($hist_gpu, $ÎŽ_gpu, $idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024) - - -############################################# -# test for SVector - basic test - success! 
-function kernel!(x, y) - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - if i <= length(x) - # @inbounds x[i] += y[i] - k = Base._to_linear_index(x, i) - CUDA.atomic_add!(pointer(x, k), y[i]) - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu!(x, y; MAX_THREADS=1024) - thread_i = min(MAX_THREADS, length(x)) - threads = (thread_i) - blocks = ceil.(Int, length(x) .÷ threads) - CUDA.@sync begin - @cuda blocks = blocks threads = threads kernel!(x, y) - end - return -end - -x = rand(SVector{2,Float32}, Int(1e7)) -y = rand(SVector{2,Float32}, Int(1e7)) -x = rand(Float32, Int(1e7)) -y = rand(Float32, Int(1e7)) - -x_gpu = CuArray(x) -y_gpu = CuArray(y) - -CuArrays.@time hist_gpu!(x_gpu, y_gpu) -@btime hist_gpu!($x_gpu, $y_gpu) - - -############################################# -# test for SVector - real test -function kernelS2!(h, x, id) - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - if i <= size(id, 1) && j <= size(id, 2) - @inbounds k = Base._to_linear_index(h, id[i, j], j) - # @inbounds k = id[i,j] + 32 * (j-1) - # @inbounds CUDAnative.atomic_add!(pointer(h, k), x[i,j]) - # h[id[i,j],j] += x[i,j] - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpuS2!(h, x, id; MAX_THREADS=256) where {T} - thread_j = min(MAX_THREADS, size(id, 2)) - thread_i = min(MAX_THREADS ÷ thread_j, size(id, 1)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (size(id, 1), size(id, 2)) .÷ threads) - println("threads:", threads) - println("blocks:", blocks) - CUDA.@sync begin - @cuda blocks = blocks threads = threads kernelS2!(h, x, id) - end - return -end - -hist = zeros(SVector{2,Float32}, nbins, ncol) -ÎŽ = rand(SVector{2,Float32}, items, ncol) -idx = rand(1:nbins, items, ncol) -hist_gpu = CuArray(hist) -ÎŽ_gpu = CuArray(ÎŽ) -idx_gpu = CuArray(idx) - -CuArrays.@time hist_gpuS2!(hist_gpu, ÎŽ_gpu, idx_gpu) -@btime hist_gpuS2!($hist_gpu, $ÎŽ_gpu, $idx_gpu) - - -############################################################## -## Build histogram from a subsample idx -# accumulate all gradient single pass -function kernel3!(h, x, id, 𝑖, 𝑗) - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - if i <= length(𝑖) && j <= length(𝑗) - @inbounds k = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 𝑗[j], 1) - @inbounds CUDAnative.atomic_add!(pointer(h, k), x[𝑖[i], 𝑗[j], 1]) - @inbounds k = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 𝑗[j], 2) - @inbounds CUDAnative.atomic_add!(pointer(h, k), x[𝑖[i], 𝑗[j], 2]) - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu3!(h, x, id, 𝑖, 𝑗; MAX_THREADS=1024) - thread_j = min(MAX_THREADS, length(𝑗)) - thread_i = min(MAX_THREADS ÷ thread_j, length(𝑖)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (length(𝑖), length(𝑗)) ./ threads) - # println("threads:", threads) - # println("blocks:", blocks) - CuArrays.@sync begin - @cuda blocks = blocks threads = threads kernel3!(h, x, id, 𝑖, 𝑗) - end - return -end - -hist = zeros(Float32, nbins, ncol, 2) -ÎŽ = rand(Float32, items, ncol, 2) -idx = rand(1:nbins, items, ncol) -𝑖 = sample(1:items, items ÷ 2, replace=false, ordered=true) -𝑗 = sample(1:ncol, ncol ÷ 2, replace=false, ordered=true) -hist_gpu = CuArray(hist) -ÎŽ_gpu = CuArray(ÎŽ) -idx_gpu = CuArray(idx) -𝑖_gpu = CuArray(𝑖) -𝑗_gpu = CuArray(𝑗) - -CuArrays.@time 
hist_gpu3!(hist_gpu, ÎŽ_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024) -@btime hist_gpu3!($hist_gpu, $ÎŽ_gpu, $idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024) - - - - -# accumulate in shared memory histograms -function kernel2!(h::CuDeviceMatrix{T}, x::CuDeviceMatrix{T}, id, nbins) where {T<:AbstractFloat} - tid = threadIdx().x - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - - # shared memory on block of length nbins - # To Do: nbins cannot be passed as argument - dynamic shared memory generate kernel through macro - # shared = CUDAnative.@cuStaticSharedMem(T, 32) - # fill!(shared, 0) - # sync_threads() - - # accumulate in per block histogram - # Why is the atomic add on shared much longer than atomic on h in global mem? - if i <= size(id, 1) && j <= size(id, 2) - # should be the legit way to go - 70ms - # @inbounds CUDAnative.atomic_add!(pointer(shared, id[i,j]), x[i,j]) - - # unsafe (collisions) on shared mem: 3.0ms - # @inbounds shared[id[i,j]] = x[i,j] - - # unsafe (collisions) add on global memory - 3.6ms - # @inbounds h[id[i,j],j] += x[i,j] - - # atomic add on global hist - 3.2ms - @inbounds k = id[i, j] + nbins * (j - 1) - @inbounds CUDA.atomic_add!(pointer(h, k), x[i, j]) - end - # sync_threads() - - # if blockIdx().x == 1 - # if tid <= nbins - # CUDA.atomic_add!(pointer(h,tid), shared[tid]) - # end - # end - return -end - -# shared memory - -function hist_gpu2!(h::CuMatrix{T}, x::CuMatrix{T}, id::CuMatrix{Int}, nbins; MAX_THREADS=256) where {T<:AbstractFloat} - # thread_i = min(MAX_THREADS, size(id, 1)) - # thread_j = min(MAX_THREADS ÷ thread_i, size(id, 2)) - thread_j = min(MAX_THREADS, size(id, 2)) - thread_i = min(MAX_THREADS ÷ thread_j, size(id, 1)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (size(id, 1), size(id, 2)) ./ threads) - CUDA.@sync begin - @cuda blocks = blocks threads = threads kernel2!(h, x, id, nbins) - end - return h -end - -CuArrays.@time hist_gpu2!(hist_gpu, ÎŽ_gpu, idx_gpu, 32, MAX_THREADS=1024) -@btime hist_gpu2!($hist_gpu, $ÎŽ_gpu, $idx_gpu, 32, MAX_THREADS=1024) -@device_code_warntype hist_gpu2!(hist_gpu, ÎŽ_gpu, idx_gpu, 32, MAX_THREADS=1024) - - - - -###################################### -# Appoach 1 -###################################### -# GPU - apply along the features axis -function kernel!(h::CuDeviceMatrix{T}, x::CuDeviceVector{T}, id, 𝑖, 𝑗) where {T<:AbstractFloat} - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - if i <= length(𝑖) && j <= length(𝑗) - @inbounds k = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 𝑗[j]) - @inbounds CUDA.atomic_add!(pointer(h, k), x[𝑖[i]]) - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu!(h::CuMatrix{T}, x::CuVector{T}, id, 𝑖, 𝑗; MAX_THREADS=1024) where {T<:AbstractFloat} - thread_j = min(MAX_THREADS, length(𝑗)) - thread_i = min(MAX_THREADS ÷ thread_j, length(𝑖)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (length(𝑖), length(𝑗)) ./ threads) - @cuda blocks = blocks threads = threads kernel!(h, x, id, 𝑖, 𝑗) - return -end - -hist = zeros(Float32, nbins, ncol) -ÎŽ = rand(Float32, items) -idx = rand(1:nbins, items, ncol) -𝑖 = sample(1:items, items ÷ 2, replace=false, ordered=true) -𝑗 = sample(1:ncol, ncol ÷ 2, replace=false, ordered=true) -hist_gpu = CuArray(hist) -ÎŽ_gpu = CuArray(ÎŽ) -idx_gpu = CuArray(idx) -𝑖_gpu = CuArray(𝑖) -𝑗_gpu = CuArray(𝑗) - -CUDA.@time hist_gpu!(hist_gpu, ÎŽ_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu, 
MAX_THREADS=1024) -@btime hist_gpu!($hist_gpu, $ÎŽ_gpu, $idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024) - - -###################################### -# Appoach 2 - Loop for assigning command grad to appropriate bin per column -# Idea: exploit the fact that there's a single grad per row: take that grad and add it to each column bin -###################################### -# GPU - apply along the features axis -function kernel!(h::CuDeviceMatrix{T}, x::CuDeviceVector{T}, id, 𝑖, 𝑗) where {T<:AbstractFloat} - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - if i <= length(𝑖) && j <= length(𝑗) - @inbounds k = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 𝑗[j]) - @inbounds CUDAnative.atomic_add!(pointer(h, k), x[𝑖[i]]) - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu!(h::CuMatrix{T}, x::CuVector{T}, id::CuMatrix{UInt8}, 𝑖, 𝑗; MAX_THREADS=1024) where {T<:AbstractFloat} - thread_j = min(MAX_THREADS, length(𝑗)) - thread_i = min(MAX_THREADS ÷ thread_j, length(𝑖)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (length(𝑖), length(𝑗)) ./ threads) - @cuda blocks = blocks threads = threads kernel!(h, x, id, 𝑖, 𝑗) - return -end - - - - -using CUDA -using BenchmarkTools -N1 = Int(2^12) -x1 = rand(Float32, N1); -x2 = rand(Float32, N1); -x1g = CuArray(x1); -x2g = CuArray(x2); - -function dot_atomic!(x, y, z) - idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x - if idx <= length(x) - CUDA.atomic_add!(pointer(z, 1), x[idx] * y[idx]) - end - return nothing -end - -function bench_dot_atomic!(x, y, z, threads) - numblocks = ceil(Int, N1 / threads) - @cuda threads = threads blocks = numblocks dot_atomic!(x, y, z) -end - -threads = 512 -numblocks = ceil(Int, N1 / threads) -z0 = CUDA.zeros(1) -# @cuda threads=gthreads blocks=numblocks dot_atomic!(x1g, x2g, z0) -@btime CUDA.@sync bench_dot_atomic!($x1g, $x2g, $z0, threads) -# 17.323 ms (50 allocations: 1.67 KiB) - -function dot_share!(x::CuDeviceVector{T}, y::CuDeviceVector{T}, z::CuDeviceVector{T}) where {T} - - tid = threadIdx().x - idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x - shared = CUDA.@cuStaticSharedMem(T, 64) - fill!(shared, 0) - sync_threads() - - if idx <= length(x) - @inbounds shared[tid] = x[idx] * y[idx] - end - sync_threads() - - i = blockDim().x ÷ 2 - while i > 0 - if tid <= i - @inbounds shared[tid] += shared[tid+i] # valid non atomic operation - # CUDA.atomic_add!(pointer(shared, tid), shared[tid+i]) # invalid op - results in error - end - sync_threads() - i ÷= 2 - end - if tid == 1 - CUDA.atomic_add!(pointer(z, 1), shared[1]) - end - return nothing -end - -function bench_dot_share!(x, y, z, threads, numblocks) - CUDA.@sync @cuda threads = threads blocks = numblocks dot_share!(x, y, z) - return z -end - -function wrap_share(x, y, threads) - numblocks = ceil(Int, N1 / threads) - z = CUDA.zeros(1) - x = bench_dot_share!(x, y, z, threads, numblocks) - return x -end - -threads = 64 -numblocks = ceil(Int, N1 / threads) -z = CUDA.zeros(1) -@cuda threads = threads blocks = numblocks dot_share!(x1g, x2g, z) -@time CUDA.@sync wrap_share(x1g, x2g, threads) -@btime CUDA.@sync wrap_share($x1g, $x2g, threads) -x = CUDA.@sync wrap_share(x1g, x2g, threads) -x1g' * x2g -@btime x1g' * x2g -@btime x1' * x2 - - -function dot_share2!(x::CuDeviceVector{T}, z::CuDeviceVector{T}) where {T} - - tid = threadIdx().x - bid = blockIdx().x - idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x - shared = 
CUDA.@cuStaticSharedMem(T, 128) - fill!(shared, 0.0f0) - sync_threads() - - # if idx <= length(x) - # shared[tid] = x[idx] * y[idx] - # end - # sync_threads() - - i = blockDim().x ÷ 2 - while i > 0 - # if tid <= i - if tid == 1 - # shared[tid] += shared[tid + i] # valid non atomic operation - CUDA.atomic_add!(pointer(shared, tid), shared[tid+1]) # invalid op - results in error - end - sync_threads() - i ÷= 2 - end - if tid == 1 - z[bid] += shared[1] - end - return nothing -end - -function bench_dot_share2!(x, z, threads, numblocks) - CUDA.@sync @cuda threads = threads blocks = numblocks dot_share2!(x, z) - return sum(z) -end - -function wrap_share2(x, threads) - numblocks = ceil(Int, N1 / threads) - z = CUDA.zeros(numblocks) - # x = bench_dot_share2!(x, z, threads, numblocks) - CUDA.@sync @cuda threads = threads blocks = numblocks dot_share2!(x, z) - return (x) -end - -threads = 128 -numblocks = ceil(Int, N1 / threads) -z = CUDA.zeros(numblocks) -sum(z) -x1g' * x2g -CUDA.@sync @cuda threads = threads blocks = numblocks dot_share2!(x1g, x2g, z) -@btime CUDA.@sync wrap_share2($x1g, $x2g, threads) -x = CUDA.@sync wrap_share2(x1g, x2g, threads) - -@btime x1g' * x2g - - -using CUDA -function kernel(x) - shared = @cuStaticSharedMem(Float32, 2) - fill!(shared, 1.0f0) - sync_threads() - # @atomic shared[threadIdx().x] += 0f0 - tid = threadIdx().x - CUDA.atomic_add!(pointer(shared, tid), shared[tid+1]) - CUDA.atomic_add!(pointer(x, 1), shared[1]) - return -end - -x = CUDA.zeros(1) -@cuda kernel(x) -synchronize() \ No newline at end of file diff --git a/experiments/hist/hist_gpu share.jl b/experiments/hist/hist_gpu share.jl index 9c1176eb..4477274f 100644 --- a/experiments/hist/hist_gpu share.jl +++ b/experiments/hist/hist_gpu share.jl @@ -1,837 +1,88 @@ using Revise using CUDA -using StaticArrays using StatsBase: sample using BenchmarkTools # base kernel -function kernel_s4!(h::CuDeviceArray{T,3}, x::CuDeviceMatrix{T}, xid::CuDeviceMatrix{S}) where {T,S} - +function kernel_s4!(h::CuDeviceArray{T,3}, ∇::CuDeviceMatrix{T}, x_bin::CuDeviceMatrix{S}) where {T,S} + nbins = size(h, 2) - it, jt = threadIdx().x, threadIdx().y + it, jt, kt = threadIdx().x, threadIdx().y, threadIdx().z ib, jb = blockIdx().x, blockIdx().y id, jd = blockDim().x, blockDim().y ig, jg = gridDim().x, gridDim().y j = jt + (jb - 1) * jd - - shared = @cuDynamicSharedMem(T, 3 * nbins) - fill!(shared, 0) - sync_threads() - i_tot = size(x, 1) - iter = 0 - while iter * id * ig < i_tot - i = it + id * (ib - 1) + iter * id * ig - if i <= size(xid, 1) && j <= size(xid, 2) - # depends on shared to be assigned to a single feature - k = 3 * (xid[i, j] - 1) - @inbounds CUDA.atomic_add!(pointer(shared, k + 1), x[i, 1]) - @inbounds CUDA.atomic_add!(pointer(shared, k + 2), x[i, 2]) - @inbounds CUDA.atomic_add!(pointer(shared, k + 3), x[i, 3]) - end - iter += 1 - end - sync_threads() - # loop to cover cases where nbins > nthreads - for iter in 1:(nbins - 1) ÷ id + 1 - bin_id = it + id * (iter - 1) - if bin_id <= nbins - @inbounds k = Base._to_linear_index(h, 1, bin_id, j) - @inbounds CUDA.atomic_add!(pointer(h, k), shared[3 * (bin_id - 1) + 1]) - @inbounds CUDA.atomic_add!(pointer(h, k + 1), shared[3 * (bin_id - 1) + 2]) - @inbounds CUDA.atomic_add!(pointer(h, k + 2), shared[3 * (bin_id - 1) + 3]) - end - end - # sync_threads() - return nothing -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu_s4!(h::AbstractArray{T,3}, x::AbstractMatrix{T}, id::AbstractMatrix{S}; MAX_THREADS=256) where {T,S} 
-    thread_i = min(MAX_THREADS, size(id, 1))
-    thread_j = 1
-    threads = (thread_i, thread_j)
-    blocks = ceil.(Int, (16, size(id, 2)))
-    fill!(h, 0)
-    @cuda blocks = blocks threads = threads shmem = sizeof(T) * size(h, 2) * 3 kernel_s4!(h, x, id)
-    return
-end
-
-# function hist_cpu_2!(hist, Ύ, idx)
-#     Threads.@threads for j in 1:size(idx, 2)
-#         for i in 1:size(idx, 1)
-#             bin = idx[i,j]
-#             @inbounds hist[1, bin, j] += Ύ[i,1]
-#             @inbounds hist[2, bin, j] += Ύ[i,2]
-#             @inbounds hist[3, bin, j] += Ύ[i,3]
-#         end
-#     end
-#     return
-# end
-
-nbins = 64
-ncol = 100
-items = Int32(1e6)
-hist = zeros(Float32, 3, nbins, ncol)
-Ύ = rand(Float32, items, 3)
-# idx = Int64.(rand(1:nbins, items, ncol))
-idx = UInt8.(rand(1:nbins, items, ncol))
-
-hist_gpu = CuArray(hist)
-Ύ_gpu = CuArray(Ύ)
-idx_gpu = CuArray(idx)
-
-@time hist_cpu_2!(hist, Ύ, idx)
-@btime hist_cpu_2!(hist, Ύ, idx)
-
-@CUDA.time hist_gpu_s4!(hist_gpu, Ύ_gpu, idx_gpu, MAX_THREADS=128)
-@btime CUDA.@sync hist_gpu_s4!($hist_gpu, $Ύ_gpu, $idx_gpu, MAX_THREADS=128)
-
-
-# base kernel
-function kernel_s5!(hΎ1::CuDeviceArray{T,3}, hΎ2::CuDeviceArray{T,3}, h𝑤::CuDeviceMatrix{T}, Ύ1::CuDeviceMatrix{T}, Ύ2::CuDeviceMatrix{T}, 𝑤::CuDeviceVector{T}, xid::CuDeviceMatrix{S}) where {T,S}
-
-    nbins = size(h𝑤, 1)
-    it, jt = threadIdx().x, threadIdx().y
-    ib, jb = blockIdx().x, blockIdx().y
-    id, jd = blockDim().x, blockDim().y
-    ig, jg = gridDim().x, gridDim().y
-    j = jt + (jb - 1) * jd
-
     shared = @cuDynamicSharedMem(T, 3 * nbins)
     fill!(shared, 0)
     sync_threads()
 
-    i_tot = size(𝑤, 1)
+    i_tot = size(x_bin, 1)
     iter = 0
     while iter * id * ig < i_tot
         i = it + id * (ib - 1) + iter * id * ig
-        if i <= size(xid, 1) && j <= size(xid, 2)
+        if i <= size(x_bin, 1) && j <= size(x_bin, 2)
             # depends on shared to be assigned to a single feature
-            k = 3 * (xid[i, j] - 1)
-            @inbounds CUDA.atomic_add!(pointer(shared, k + 1), Ύ1[i, 1])
-            @inbounds CUDA.atomic_add!(pointer(shared, k + 2), Ύ2[i, 1])
-            @inbounds CUDA.atomic_add!(pointer(shared, k + 3), 𝑤[i])
+            k = 3 * (x_bin[i, j] - 1)
+            @inbounds CUDA.atomic_add!(pointer(shared, k + kt), ∇[kt, i]) # ∇ is laid out (3, nobs): component kt of observation i
+            # @inbounds CUDA.atomic_add!(pointer(shared, k + 1), ∇[1, i])
+            # @inbounds CUDA.atomic_add!(pointer(shared, k + 2), ∇[2, i])
+            # @inbounds CUDA.atomic_add!(pointer(shared, k + 3), ∇[3, i])
         end
         iter += 1
     end
     sync_threads()
     # loop to cover cases where nbins > nthreads
-    for iter in 1:(nbins - 1) ÷ id + 1
+    for iter in 1:(nbins-1)÷id+1
         bin_id = it + id * (iter - 1)
         if bin_id <= nbins
-            @inbounds k = Base._to_linear_index(hΎ1, 1, bin_id, j)
-            @inbounds CUDA.atomic_add!(pointer(hΎ1, k), shared[3 * (bin_id - 1) + 1])
-            @inbounds CUDA.atomic_add!(pointer(hΎ2, k), shared[3 * (bin_id - 1) + 2])
-            @inbounds CUDA.atomic_add!(pointer(h𝑤, k), shared[3 * (bin_id - 1) + 3])
+            @inbounds k = Base._to_linear_index(h, 1, bin_id, j) - 1
+            @inbounds CUDA.atomic_add!(pointer(h, k + kt), shared[3*(bin_id-1)+kt])
+            # @inbounds CUDA.atomic_add!(pointer(h, k), shared[3*(bin_id-1)+1])
+            # @inbounds CUDA.atomic_add!(pointer(h, k + 1), shared[3*(bin_id-1)+2])
+            # @inbounds CUDA.atomic_add!(pointer(h, k + 2), shared[3*(bin_id-1)+3])
         end
     end
-    # sync_threads()
-    return nothing
-end
-
-# base approach - block built along the cols first, the rows (limit collisions)
-function hist_gpu_s5!(hΎ1::AbstractArray{T,3}, hΎ2::AbstractArray{T,3}, h𝑤::AbstractMatrix{T},
-    Ύ1::AbstractMatrix{T}, Ύ2::AbstractMatrix{T}, 𝑤::AbstractVector{T},
-    id::AbstractMatrix{S}; MAX_THREADS=256) where {T,S}
-    thread_i = min(MAX_THREADS, size(id, 1))
-    thread_j = 1
-    threads = (thread_i, thread_j)
-    blocks = ceil.(Int, (16, size(id, 2)))
-    fill!(hΎ1, 0)
-    fill!(hΎ2, 0)
-    fill!(h𝑤, 0)
-    @cuda blocks = blocks threads = threads shmem = sizeof(T) * size(h𝑤, 1) * 3 kernel_s5!(hΎ1, hΎ2, h𝑤, Ύ1, Ύ2, 𝑤, id)
-    return
-end
-
-nbins = 32
-ncol = 100
-items = Int32(1e6)
-
-Ύ1 = CUDA.rand(Float32, items, 1)
-Ύ2 = CUDA.rand(Float32, items, 1)
-𝑤 = CUDA.rand(Float32, items)
-idx = UInt8.(rand(1:nbins, items, ncol))
-
-hΎ1 = CUDA.zeros(1, nbins, ncol)
-hΎ2 = CUDA.zeros(1, nbins, ncol)
-h𝑤 = CUDA.zeros(nbins, ncol)
-idx_gpu = CuArray(idx)
-
-@CUDA.time hist_gpu_s5!(hΎ1, hΎ2, h𝑤, Ύ1, Ύ2, 𝑤, idx_gpu, MAX_THREADS=128)
-@btime CUDA.@sync hist_gpu_s5!($hΎ1, $hΎ2, $h𝑤, $Ύ1, $Ύ2, $𝑤, $idx_gpu, MAX_THREADS=128)
-
-
-# base kernel
-function kernel_s6!(hΎ1::CuDeviceArray{T,3}, hΎ2::CuDeviceArray{T,3}, h𝑤::CuDeviceMatrix{T}, Ύ1::CuDeviceMatrix{T}, Ύ2::CuDeviceMatrix{T}, 𝑤::CuDeviceVector{T}, xid::CuDeviceMatrix{S}, 𝑖, 𝑗) where {T,S}
-
-    nbins = size(h𝑤, 1)
-    it, jt = threadIdx().x, threadIdx().y
-    ib, jb = blockIdx().x, blockIdx().y
-    id, jd = blockDim().x, blockDim().y
-    ig, jg = gridDim().x, gridDim().y
-    j = jt + (jb - 1) * jd
-
-    shared = @cuDynamicSharedMem(T, 3 * nbins)
-    fill!(shared, 0)
     sync_threads()
-
-    i_tot = size(𝑤, 1)
-    iter = 0
-    while iter * id * ig < i_tot
-        i = it + id * (ib - 1) + iter * id * ig
-        if i <= length(𝑖) && j <= length(𝑗)
-            # depends on shared to be assigned to a single feature
-            i_idx = 𝑖[i]
-            k = 3 * (xid[i_idx, 𝑗[j]] - 1)
-            @inbounds CUDA.atomic_add!(pointer(shared, k + 1), Ύ1[i_idx, 1])
-            @inbounds CUDA.atomic_add!(pointer(shared, k + 2), Ύ2[i_idx, 1])
-            @inbounds CUDA.atomic_add!(pointer(shared, k + 3), 𝑤[i_idx])
-        end
-        iter += 1
-    end
-    sync_threads()
-    # loop to cover cases where nbins > nthreads
-    for iter in 1:(nbins - 1) ÷ id + 1
-        bin_id = it + id * (iter - 1)
-        if bin_id <= nbins
-            @inbounds k = Base._to_linear_index(hΎ1, 1, bin_id, 𝑗[j])
-            @inbounds CUDA.atomic_add!(pointer(hΎ1, k), shared[3 * (bin_id - 1) + 1])
-            @inbounds CUDA.atomic_add!(pointer(hΎ2, k), shared[3 * (bin_id - 1) + 2])
-            @inbounds CUDA.atomic_add!(pointer(h𝑤, k), shared[3 * (bin_id - 1) + 3])
-        end
-    end
-    # sync_threads()
     return nothing
 end
 
 # base approach - block built along the cols first, the rows (limit collisions)
-function hist_gpu_s6!(hΎ1::AbstractArray{T,3}, hΎ2::AbstractArray{T,3}, h𝑤::AbstractMatrix{T},
-    Ύ1::AbstractMatrix{T}, Ύ2::AbstractMatrix{T}, 𝑤::AbstractVector{T},
-    id::AbstractMatrix{S}, 𝑖, 𝑗; MAX_THREADS=256) where {T,S}
-    thread_i = min(MAX_THREADS, size(id, 1))
+function hist_gpu_s4!(h::AbstractArray{T,3}, ∇::AbstractMatrix{T}, x_bin::AbstractMatrix{S}; MAX_THREADS=256) where {T,S}
+    thread_i = min(MAX_THREADS, size(x_bin, 1))
     thread_j = 1
-    threads = (thread_i, thread_j)
-    blocks = ceil.(Int, (16, size(id, 2)))
-    fill!(hΎ1, 0)
-    fill!(hΎ2, 0)
-    fill!(h𝑤, 0)
-    @cuda blocks = blocks threads = threads shmem = sizeof(T) * size(h𝑤, 1) * 3 kernel_s6!(hΎ1, hΎ2, h𝑤, Ύ1, Ύ2, 𝑤, id, 𝑖, 𝑗)
-    return
-end
-
-nbins = 32
-ncol = 100
-items = Int32(1e6)
-
-Ύ1 = CUDA.rand(Float32, items, 1)
-Ύ2 = CUDA.rand(Float32, items, 1)
-𝑤 = CUDA.rand(Float32, items)
-idx = UInt8.(rand(1:nbins, items, ncol))
-
-hΎ1 = CUDA.zeros(1, nbins, ncol)
-hΎ2 = CUDA.zeros(1, nbins, ncol)
-h𝑤 = CUDA.zeros(nbins, ncol)
-idx_gpu = CuArray(idx)
-
-𝑖 = CuArray(UInt32.(1:items))
-𝑗 = CuArray(UInt32.(1:ncol))
-
-hΎ1_cpu = reshape(Array(hΎ1), size(hΎ1)[2:3])
-hΎ2_cpu = reshape(Array(hΎ2), size(hΎ2)[2:3])
-h𝑤_cpu = Array(h𝑤)
-Ύ1_cpu = reshape(Array(Ύ1), :)
-Ύ2_cpu = reshape(Array(Ύ2), :)
-𝑤_cpu = Array(𝑤)
-𝑖_cpu = Array(𝑖)
-𝑗_cpu = Array(𝑗)
-idx_cpu = Array(idx_gpu)
-
-@time hist_cpu_3!(hΎ1_cpu, hΎ2_cpu, h𝑤_cpu, Ύ1_cpu, Ύ2_cpu, 𝑤_cpu, idx_cpu, 𝑖_cpu, 𝑗_cpu)
-@btime hist_cpu_3!(hΎ1_cpu, hΎ2_cpu, h𝑤_cpu, Ύ1_cpu, Ύ2_cpu, 𝑤_cpu, idx_cpu, 𝑖_cpu, 𝑗_cpu)
-
-@CUDA.time hist_gpu_s6!(hΎ1, hΎ2, h𝑤, Ύ1, Ύ2, 𝑤, idx_gpu, 𝑖, 𝑗, MAX_THREADS=128)
-@btime CUDA.@sync hist_gpu_s6!($hΎ1, $hΎ2, $h𝑤, $Ύ1, $Ύ2, $𝑤, $idx_gpu, $𝑖, $𝑗, MAX_THREADS=128)
-
-
-
-function update_hist_cpu!(hist_Ύ::Matrix{SVector{L,T}}, hist_Ύ²::Matrix{SVector{L,T}}, hist_𝑤::Matrix{SVector{1,T}},
-    Ύ::Vector{SVector{L,T}}, Ύ²::Vector{SVector{L,T}}, 𝑤::Vector{SVector{1,T}},
-    X_bin, 𝑖, 𝑗) where {L,T,S}
-
-    hist_Ύ .*= 0.0
-    hist_Ύ² .*= 0.0
-    hist_𝑤 .*= 0.0
-
-    @inbounds Threads.@threads for j in 𝑗
-        @inbounds for i in 𝑖
-            hist_Ύ[X_bin[i,j], j] += Ύ[i]
-            hist_Ύ²[X_bin[i,j], j] += Ύ²[i]
-            hist_𝑤[X_bin[i,j], j] += 𝑤[i]
-        end
-    end
-end
-
-hΎ1_cpu = SVector.(hΎ1_cpu)
-hΎ2_cpu = SVector.(hΎ2_cpu)
-h𝑤_cpu = SVector.(h𝑤_cpu)
-Ύ1_cpu = SVector.(Ύ1_cpu)
-Ύ2_cpu = SVector.(Ύ2_cpu)
-𝑤_cpu = SVector.(𝑤_cpu)
-# 𝑖_cpu = Array(𝑖)
-# 𝑗_cpu = Array(𝑗)
-# idx_cpu = Array(idx_gpu)
-
-@time update_hist_cpu!(hΎ1_cpu, hΎ2_cpu, h𝑤_cpu, Ύ1_cpu, Ύ2_cpu, 𝑤_cpu, idx_cpu, 𝑖_cpu, 𝑗_cpu)
-@btime update_hist_cpu!(hΎ1_cpu, hΎ2_cpu, h𝑤_cpu, Ύ1_cpu, Ύ2_cpu, 𝑤_cpu, idx_cpu, 𝑖_cpu, 𝑗_cpu)
-@btime update_hist_cpu!($hΎ1_cpu, $hΎ2_cpu, $h𝑤_cpu, $Ύ1_cpu, $Ύ2_cpu, $𝑤_cpu, $idx_cpu, $𝑖_cpu, $𝑗_cpu)
-
-
-##############################################################
-## Build histogram from a subsample idx
-# base kernel
-function kernel2!(h::CuDeviceMatrix{T}, x::CuDeviceMatrix{T}, id, 𝑖, 𝑗) where {T <: AbstractFloat}
-    i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
-    j = threadIdx().y + (blockIdx().y - 1) * blockDim().y
-    if i <= length(𝑖) && j <= length(𝑗)
-        @inbounds k = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 𝑗[j])
-        @inbounds CUDA.atomic_add!(pointer(h, k), x[𝑖[i], 𝑗[j]])
-    end
-    return
-end
-
-# base approach - block built along the cols first, the rows (limit collisions)
-function hist_gpu2!(h::CuMatrix{T}, x::CuMatrix{T}, id, 𝑖, 𝑗; MAX_THREADS=1024) where {T <: AbstractFloat}
-    thread_j = min(MAX_THREADS, length(𝑗))
-    thread_i = min(MAX_THREADS ÷ thread_j, length(𝑖))
-    threads = (thread_i, thread_j)
-    blocks = ceil.(Int, (length(𝑖), length(𝑗)) ./ threads)
-    # println("threads:", threads)
-    # println("blocks:", blocks)
-    CUDA.@sync begin
-        @cuda blocks = blocks threads = threads kernel2!(h, x, id, 𝑖, 𝑗)
-    end
-    return
-end
-
-hist = zeros(Float32, nbins, ncol)
-Ύ = rand(Float32, items, ncol)
-idx = rand(1:nbins, items, ncol)
-𝑖 = sample(1:items, items ÷ 2, replace=false, ordered=true)
-𝑗 = sample(1:ncol, ncol ÷ 2, replace=false, ordered=true)
-hist_gpu = CuArray(hist)
-Ύ_gpu = CuArray(Ύ)
-idx_gpu = CuArray(idx)
-𝑖_gpu = CuArray(𝑖)
-𝑗_gpu = CuArray(𝑗)
-
-@CUDA.time hist_gpu2!(hist_gpu, Ύ_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024)
-@btime hist_gpu2!($hist_gpu, $Ύ_gpu, $idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024)
-
-
-#############################################
-# test for SVector - basic test - success!
-function kernel!(x, y) - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - if i <= length(x) - # @inbounds x[i] += y[i] - k = Base._to_linear_index(x, i) - CUDA.atomic_add!(pointer(x, k), y[i]) - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu!(x, y; MAX_THREADS=1024) - thread_i = min(MAX_THREADS, length(x)) - threads = (thread_i) - blocks = ceil.(Int, length(x) .÷ threads) - CUDA.@sync begin - @cuda blocks = blocks threads = threads kernel!(x, y) - end - return -end - -x = rand(SVector{2,Float32}, Int(1e7)) -y = rand(SVector{2,Float32}, Int(1e7)) -x = rand(Float32, Int(1e7)) -y = rand(Float32, Int(1e7)) - -x_gpu = CuArray(x) -y_gpu = CuArray(y) - -@CuArrays.time hist_gpu!(x_gpu, y_gpu) -@btime hist_gpu!($x_gpu, $y_gpu) - - -############################################# -# test for SVector - real test -function kernelS2!(h, x, id) - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - if i <= size(id, 1) && j <= size(id, 2) - @inbounds k = Base._to_linear_index(h, id[i,j], j) - # @inbounds k = id[i,j] + 32 * (j-1) - # @inbounds CUDAnative.atomic_add!(pointer(h, k), x[i,j]) - # h[id[i,j],j] += x[i,j] - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpuS2!(h, x, id; MAX_THREADS=256) where {T} - thread_j = min(MAX_THREADS, size(id, 2)) - thread_i = min(MAX_THREADS ÷ thread_j, size(id, 1)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (size(id, 1), size(id, 2)) .÷ threads) - println("threads:", threads) - println("blocks:", blocks) - CUDA.@sync begin - @cuda blocks = blocks threads = threads kernelS2!(h, x, id) - end - return -end - -hist = zeros(SVector{2,Float32}, nbins, ncol) -ÎŽ = rand(SVector{2,Float32}, items, ncol) -idx = rand(1:nbins, items, ncol) -hist_gpu = CuArray(hist) -ÎŽ_gpu = CuArray(ÎŽ) -idx_gpu = CuArray(idx) - -@CuArrays.time hist_gpuS2!(hist_gpu, ÎŽ_gpu, idx_gpu) -@btime hist_gpuS2!($hist_gpu, $ÎŽ_gpu, $idx_gpu) - - -############################################################## -## Build histogram from a subsample idx -# accumulate all gradient single pass -function kernel3!(h, x, id, 𝑖, 𝑗) - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - if i <= length(𝑖) && j <= length(𝑗) - @inbounds k = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 𝑗[j], 1) - @inbounds CUDAnative.atomic_add!(pointer(h, k), x[𝑖[i], 𝑗[j], 1]) - @inbounds k = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 𝑗[j], 2) - @inbounds CUDAnative.atomic_add!(pointer(h, k), x[𝑖[i], 𝑗[j], 2]) - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu3!(h, x, id, 𝑖, 𝑗; MAX_THREADS=1024) - thread_j = min(MAX_THREADS, length(𝑗)) - thread_i = min(MAX_THREADS ÷ thread_j, length(𝑖)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (length(𝑖), length(𝑗)) ./ threads) - # println("threads:", threads) - # println("blocks:", blocks) - CuArrays.@sync begin - @cuda blocks = blocks threads = threads kernel3!(h, x, id, 𝑖, 𝑗) - end - return -end - -hist = zeros(Float32, nbins, ncol, 2) -ÎŽ = rand(Float32, items, ncol, 2) -idx = rand(1:nbins, items, ncol) -𝑖 = sample(1:items, items ÷ 2, replace=false, ordered=true) -𝑗 = sample(1:ncol, ncol ÷ 2, replace=false, ordered=true) -hist_gpu = CuArray(hist) -ÎŽ_gpu = CuArray(ÎŽ) -idx_gpu = CuArray(idx) -𝑖_gpu = CuArray(𝑖) -𝑗_gpu = CuArray(𝑗) - -@CuArrays.time 
hist_gpu3!(hist_gpu, ÎŽ_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024) -@btime hist_gpu3!($hist_gpu, $ÎŽ_gpu, $idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024) - - - - -# accumulate in shared memory histograms -function kernel2!(h::CuDeviceMatrix{T}, x::CuDeviceMatrix{T}, id, nbins) where {T <: AbstractFloat} - tid = threadIdx().x - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - - # shared memory on block of length nbins - # To Do: nbins cannot be passed as argument - dynamic shared memory generate kernel through macro - # shared = CUDAnative.@cuStaticSharedMem(T, 32) - # fill!(shared, 0) - # sync_threads() - - # accumulate in per block histogram - # Why is the atomic add on shared much longer than atomic on h in global mem? - if i <= size(id, 1) && j <= size(id, 2) - # should be the legit way to go - 70ms - # @inbounds CUDAnative.atomic_add!(pointer(shared, id[i,j]), x[i,j]) - - # unsafe (collisions) on shared mem: 3.0ms - # @inbounds shared[id[i,j]] = x[i,j] - - # unsafe (collisions) add on global memory - 3.6ms - # @inbounds h[id[i,j],j] += x[i,j] - - # atomic add on global hist - 3.2ms - @inbounds k = id[i,j] + nbins * (j - 1) - @inbounds CUDA.atomic_add!(pointer(h, k), x[i,j]) - end - # sync_threads() - - # if blockIdx().x == 1 - # if tid <= nbins - # CUDA.atomic_add!(pointer(h,tid), shared[tid]) - # end - # end - return -end - -# shared memory - -function hist_gpu2!(h::CuMatrix{T}, x::CuMatrix{T}, id::CuMatrix{Int}, nbins; MAX_THREADS=256) where {T <: AbstractFloat} - # thread_i = min(MAX_THREADS, size(id, 1)) - # thread_j = min(MAX_THREADS ÷ thread_i, size(id, 2)) - thread_j = min(MAX_THREADS, size(id, 2)) - thread_i = min(MAX_THREADS ÷ thread_j, size(id, 1)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (size(id, 1), size(id, 2)) ./ threads) - CUDA.@sync begin - @cuda blocks = blocks threads = threads kernel2!(h, x, id, nbins) - end - return h -end - -@CuArrays.time hist_gpu2!(hist_gpu, ÎŽ_gpu, idx_gpu, 32, MAX_THREADS=1024) -@btime hist_gpu2!($hist_gpu, $ÎŽ_gpu, $idx_gpu, 32, MAX_THREADS=1024) -@device_code_warntype hist_gpu2!(hist_gpu, ÎŽ_gpu, idx_gpu, 32, MAX_THREADS=1024) - - - - -###################################### -# Appoach 1 -###################################### -# GPU - apply along the features axis -function kernel!(h::CuDeviceMatrix{T}, x::CuDeviceVector{T}, id, 𝑖, 𝑗) where {T <: AbstractFloat} - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - if i <= length(𝑖) && j <= length(𝑗) - @inbounds k = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 𝑗[j]) - @inbounds CUDA.atomic_add!(pointer(h, k), x[𝑖[i]]) - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu!(h::CuMatrix{T}, x::CuVector{T}, id, 𝑖, 𝑗; MAX_THREADS=1024) where {T <: AbstractFloat} - thread_j = min(MAX_THREADS, length(𝑗)) - thread_i = min(MAX_THREADS ÷ thread_j, length(𝑖)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (length(𝑖), length(𝑗)) ./ threads) - @cuda blocks = blocks threads = threads kernel!(h, x, id, 𝑖, 𝑗) - return -end - -hist = zeros(Float32, nbins, ncol) -ÎŽ = rand(Float32, items) -idx = rand(1:nbins, items, ncol) -𝑖 = sample(1:items, items ÷ 2, replace=false, ordered=true) -𝑗 = sample(1:ncol, ncol ÷ 2, replace=false, ordered=true) -hist_gpu = CuArray(hist) -ÎŽ_gpu = CuArray(ÎŽ) -idx_gpu = CuArray(idx) -𝑖_gpu = CuArray(𝑖) -𝑗_gpu = CuArray(𝑗) - -@CUDA.time hist_gpu!(hist_gpu, ÎŽ_gpu, idx_gpu, 𝑖_gpu, 
𝑗_gpu, MAX_THREADS=1024) -@btime hist_gpu!($hist_gpu, $ÎŽ_gpu, $idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024) - - -###################################### -# Appoach 2 - Loop for assigning command grad to appropriate bin per column -# Idea: exploit the fact that there's a single grad per row: take that grad and add it to each column bin -###################################### -# GPU - apply along the features axis -function kernel!(h::CuDeviceMatrix{T}, x::CuDeviceVector{T}, id, 𝑖, 𝑗) where {T <: AbstractFloat} - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - if i <= length(𝑖) && j <= length(𝑗) - @inbounds k = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 𝑗[j]) - @inbounds CUDAnative.atomic_add!(pointer(h, k), x[𝑖[i]]) - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu!(h::CuMatrix{T}, x::CuVector{T}, id::CuMatrix{UInt8}, 𝑖, 𝑗; MAX_THREADS=1024) where {T <: AbstractFloat} - thread_j = min(MAX_THREADS, length(𝑗)) - thread_i = min(MAX_THREADS ÷ thread_j, length(𝑖)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (length(𝑖), length(𝑗)) ./ threads) - @cuda blocks = blocks threads = threads kernel!(h, x, id, 𝑖, 𝑗) - return -end - - - - -using CUDA -using BenchmarkTools -N1 = Int(2^12) -x1 = rand(Float32, N1); -x2 = rand(Float32, N1); -x1g = CuArray(x1); -x2g = CuArray(x2); - -function dot_atomic!(x, y, z) - idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x - if idx <= length(x) - CUDA.atomic_add!(pointer(z, 1), x[idx] * y[idx]) - end - return nothing -end - -function bench_dot_atomic!(x, y, z, threads) - numblocks = ceil(Int, N1 / threads) - @cuda threads = threads blocks = numblocks dot_atomic!(x, y, z) -end - -threads = 512 -numblocks = ceil(Int, N1 / threads) -z0 = CUDA.zeros(1) -# @cuda threads=gthreads blocks=numblocks dot_atomic!(x1g, x2g, z0) -@btime CUDA.@sync bench_dot_atomic!($x1g, $x2g, $z0, threads) -# 17.323 ms (50 allocations: 1.67 KiB) - -function dot_share!(x::CuDeviceVector{T}, y::CuDeviceVector{T}, z::CuDeviceVector{T}) where {T} - - tid = threadIdx().x - idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x - shared = CUDA.@cuStaticSharedMem(T, 64) - fill!(shared, 0) - sync_threads() - - if idx <= length(x) - @inbounds shared[tid] = x[idx] * y[idx] - end - sync_threads() - - i = blockDim().x ÷ 2 - while i > 0 - if tid <= i - @inbounds shared[tid] += shared[tid + i] # valid non atomic operation - # CUDA.atomic_add!(pointer(shared, tid), shared[tid+i]) # invalid op - results in error - end - sync_threads() - i ÷= 2 - end - if tid == 1 - CUDA.atomic_add!(pointer(z, 1), shared[1]) - end - return nothing -end - -function bench_dot_share!(x, y, z, threads, numblocks) - CUDA.@sync @cuda threads = threads blocks = numblocks dot_share!(x, y, z) - return z -end - -function wrap_share(x, y, threads) - numblocks = ceil(Int, N1 / threads) - z = CUDA.zeros(1) - x = bench_dot_share!(x, y, z, threads, numblocks) - return x -end - -threads = 64 -numblocks = ceil(Int, N1 / threads) -z = CUDA.zeros(1) -@cuda threads = threads blocks = numblocks dot_share!(x1g, x2g, z) -@time CUDA.@sync wrap_share(x1g, x2g, threads) -@btime CUDA.@sync wrap_share($x1g, $x2g, threads) -x = CUDA.@sync wrap_share(x1g, x2g, threads) -x1g' * x2g -@btime x1g' * x2g -@btime x1' * x2 - - -function dot_share2!(x::CuDeviceVector{T}, z::CuDeviceVector{T}) where {T} - - tid = threadIdx().x - bid = blockIdx().x - idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x - shared = 
CUDA.@cuStaticSharedMem(T, 128) - fill!(shared, 0f0) - sync_threads() - - # if idx <= length(x) - # shared[tid] = x[idx] * y[idx] - # end - # sync_threads() - - i = blockDim().x ÷ 2 - while i > 0 - # if tid <= i - if tid == 1 - # shared[tid] += shared[tid + i] # valid non atomic operation - CUDA.atomic_add!(pointer(shared, tid), shared[tid + 1]) # invalid op - results in error - end - sync_threads() - i ÷= 2 - end - if tid == 1 - z[bid] += shared[1] - end - return nothing -end - -function bench_dot_share2!(x, z, threads, numblocks) - CUDA.@sync @cuda threads = threads blocks = numblocks dot_share2!(x, z) - return sum(z) -end - -function wrap_share2(x, threads) - numblocks = ceil(Int, N1 / threads) - z = CUDA.zeros(numblocks) - # x = bench_dot_share2!(x, z, threads, numblocks) - CUDA.@sync @cuda threads = threads blocks = numblocks dot_share2!(x, z) - return(x) -end - -threads = 128 -numblocks = ceil(Int, N1 / threads) -z = CUDA.zeros(numblocks) -sum(z) -x1g' * x2g -CUDA.@sync @cuda threads = threads blocks = numblocks dot_share2!(x1g, x2g, z) -@btime CUDA.@sync wrap_share2($x1g, $x2g, threads) -x = CUDA.@sync wrap_share2(x1g, x2g, threads) - -@btime x1g' * x2g - - -using CUDA -function kernel(x) - shared = @cuStaticSharedMem(Float32, 2) - fill!(shared, 1f0) - sync_threads() - # @atomic shared[threadIdx().x] += 0f0 - tid = threadIdx().x - CUDA.atomic_add!(pointer(shared, tid), shared[tid + 1]) - CUDA.atomic_add!(pointer(x, 1), shared[1]) - return -end - -x = CUDA.zeros(1) -@cuda kernel(x) -synchronize() - - -using CUDA -function kernel2(x, y) - tid = threadIdx().x - shared = @cuStaticSharedMem(Float32, 4) - fill!(shared, 1f0) - sync_threads() - i = Int32(2) - if i > 0 - CUDA.atomic_add!(pointer(shared, tid), shared[tid + 1]) - sync_threads() - # i ÷= 2 - end - sync_threads() - CUDA.atomic_add!(pointer(x, 1), shared[1]) - return -end - -x = CUDA.zeros(4) -y = CUDA.zeros(1) -@cuda threads = 2 kernel2(x, y) -synchronize() - - - -using CUDA -function kernel1(x, y) - tid = threadIdx().x - shared = @cuStaticSharedMem(Float32, 4) - fill!(shared, 1f0) - sync_threads() - i = Int32(2) - if i > 0 - CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2]) - sync_threads() - i ÷= 2 - end - CUDA.atomic_add!(pointer(x, 1), shared[1]) - return -end - -x = CUDA.zeros(4) -y = CUDA.zeros(1) -@cuda threads = 2 kernel1(x, y) -x -synchronize() - - - -using CUDA -function kernel2(x, y) - tid = threadIdx().x - shared = @cuStaticSharedMem(Float32, 4) - fill!(shared, 1f0) - sync_threads() - i = Int32(2) - while i > 0 - CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2]) - sync_threads() - i ÷= 2 - end - sync_threads() - CUDA.atomic_add!(pointer(x, 1), shared[1]) - return -end - -x = CUDA.zeros(4) -y = CUDA.zeros(1) -@cuda threads = 2 kernel2(x, y) -x -synchronize() - - -using CUDA -function kernel3(x) - tid = threadIdx().x - shared = @cuStaticSharedMem(Float32, 4) - fill!(shared, 1f0) - sync_threads() - CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2]) - sync_threads() - CUDA.atomic_add!(pointer(x, 1), shared[1]) - return -end - -x = CUDA.zeros(4) -@cuda threads = 2 kernel3(x) -x -synchronize() - -using CUDA -function kernel4(x) - tid = threadIdx().x - shared = @cuStaticSharedMem(Float32, 4) - fill!(shared, 1f0) - sync_threads() - CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2]) - sync_threads() - CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2]) - sync_threads() - CUDA.atomic_add!(pointer(x, 1), shared[1]) + thread_k = 3 + threads = (thread_i, thread_j, thread_k) + blocks = ceil.(Int, (16, 
size(x_bin, 2)))
+    fill!(h, 0)
+    @cuda blocks = blocks threads = threads shmem = sizeof(T) * size(h, 2) * 3 kernel_s4!(h, ∇, x_bin)
+    CUDA.synchronize()
     return
 end
 
-x = CUDA.zeros(4)
-@cuda threads = 2 kernel4(x)
-x
-synchronize()
\ No newline at end of file
+nbins = 64
+nfeats = 100
+nobs = Int(1e6)
+h = [zeros(Float32, 3, nbins) for feat in 1:nfeats];
+x_bin = UInt8.(rand(1:nbins, nobs, nfeats));
+∇_cpu = rand(Float32, 3, nobs);
+h∇_cpu = zeros(Float32, 3, nbins, nfeats)
+rowsample = 0.5
+colsample = 0.5
+is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true)
+js = sample(1:nfeats, Int(round(colsample * nfeats)), replace=false, ordered=true)
+
+∇_gpu = CuArray(∇_cpu)
+x_bin_gpu = CuArray(x_bin)
+h∇_gpu = CuArray(h∇_cpu)
+is_gpu = CuArray(is)
+js_gpu = CuArray(js)
+
+@time hist_gpu_s4!(h∇_gpu, ∇_gpu, x_bin_gpu)
+CUDA.@time hist_gpu_s4!(h∇_gpu, ∇_gpu, x_bin_gpu)
+# desktop | 1K: 41.102 Όs (24 allocations: 1.66 KiB)
+# desktop | 10K: 59.142 Όs (109 allocations: 9.09 KiB)
+# desktop | 100K: 251.850 Όs (109 allocations: 9.09 KiB)
+# desktop | 1M: 2.203 ms (23 allocations: 1.33 KiB)
+# desktop | 10M: 25.557 ms (110 allocations: 9.11 KiB)
+@btime hist_gpu_s4!(h∇_gpu, ∇_gpu, x_bin_gpu)
diff --git a/experiments/hist/hist_gpu-debug-convert.jl b/experiments/hist/hist_gpu-debug-convert.jl
deleted file mode 100644
index a92bc34f..00000000
--- a/experiments/hist/hist_gpu-debug-convert.jl
+++ /dev/null
@@ -1,26 +0,0 @@
-using Revise
-using CUDA
-using StatsBase: sample
-using BenchmarkTools
-using Random: seed!
-
-function convert_kernel!(x, y)
-    i = threadIdx().x
-    if i <= length(x)
-        val = Float64(y[i])
-        CUDA.atomic_add!(pointer(x, i), val)
-        # CUDA.atomic_add!(pointer(x, i), y[i])
-    end
-    sync_threads()
-    return nothing
-end
-
-function convert_gpu!(x, y)
-    @cuda threads = length(x) blocks = (1,) convert_kernel!(x, y)
-    CUDA.synchronize()
-    return nothing
-end
-
-x = CUDA.zeros(Float32, 3)
-y = CUDA.rand(Float32, 3)
-convert_gpu!(x, y)
diff --git a/experiments/hist/hist_gpu-df-debug.jl b/experiments/hist/hist_gpu-df-debug.jl
deleted file mode 100644
index e0e3450a..00000000
--- a/experiments/hist/hist_gpu-df-debug.jl
+++ /dev/null
@@ -1,71 +0,0 @@
-using Revise
-using CUDA
-using StatsBase: sample
-using BenchmarkTools
-using Random: seed!
- -function hist_kernel!(h∇::CuDeviceArray, ∇::CuDeviceMatrix, x_bin::CuDeviceMatrix, is::CuDeviceVector, js::CuDeviceVector) - k, tiy, tiz = threadIdx().x, threadIdx().y, threadIdx().z - bdx, bdy = blockDim().z, blockDim().y - bix, biy = blockIdx().x, blockIdx().y - gdx = gridDim().x - - j = tiy + bdy * (biy - 1) - if j <= length(js) - jdx = js[j] - i_max = length(is) - niter = cld(i_max, bdx * gdx) - @inbounds for iter = 1:niter - i = tiz + bdx * (bix - 1) + bdx * gdx * (iter - 1) - if i <= i_max - @inbounds idx = is[i] - @inbounds bin = x_bin[idx, jdx] - hid = Base._to_linear_index(h∇, k, bin, jdx) - CUDA.atomic_add!(pointer(h∇, hid), ∇[k, idx]) - end - end - end - sync_threads() - return nothing -end - -function update_hist_gpu!(h∇, ∇, x_bin, is, js) - # kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js) - # config = launch_configuration(kernel.fun) - # max_threads = config.threads ÷ 4 - # max_blocks = config.blocks * 4 - max_threads = 256 - max_blocks = 16 - tx = size(h∇, 1) - ty = max(1, min(length(js), fld(max_threads, tx))) - tz = max(1, min(length(is), fld(max_threads, tx * ty))) - threads = (tx, ty, tz) - by = cld(length(js), ty) - bz = min(cld(max_blocks, by), cld(length(is), tz)) - blocks = (bz, by, 1) - h∇ .= 0 - @info "threads blocks" threads blocks - @cuda threads=threads blocks=blocks hist_kernel!(h∇, ∇, x_bin, is, js) - CUDA.synchronize() - return nothing -end - -seed!(123) -nbins = 32 -nfeats = 2 -nobs = Int(1e5) -x_bin = UInt8.(rand(1:nbins, nobs, nfeats)); -∇ = rand(Float32, 3, nobs); -h∇ = zeros(Float32, 3, nbins, nfeats) -rowsample = 0.5 -is = 1:nobs -js = 1:nfeats - -∇_gpu = CuArray(∇) -x_bin_gpu = CuArray(x_bin) -h∇_gpu = CuArray(h∇) -is_gpu = CuArray(is) -js_gpu = CuArray(js) - -CUDA.allowscalar(false) -update_hist_gpu!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu) diff --git a/experiments/hist/hist_gpu-df-ori copy.jl b/experiments/hist/hist_gpu-df-ori copy.jl deleted file mode 100644 index 7728191f..00000000 --- a/experiments/hist/hist_gpu-df-ori copy.jl +++ /dev/null @@ -1,75 +0,0 @@ -using Revise -using CUDA -using StatsBase: sample -using BenchmarkTools - -""" - hist_kernel! 
-""" -function hist_kernel!(h∇, ∇, x_bin, is, js) - tix, tiy = threadIdx().x, threadIdx().y - bdx, bdy = blockDim().x, blockDim().y - bix, biy = blockIdx().x, blockIdx().y - gdx = gridDim().x - - j = tiy + bdy * (biy - 1) - if j <= length(js) - jdx = js[j] - i_max = length(is) - niter = cld(i_max, bdx * gdx) - @inbounds for iter = 1:niter - i = tix + bdx * (bix - 1) + bdx * gdx * (iter - 1) - if i <= i_max - @inbounds idx = is[i] - @inbounds bin = x_bin[idx, jdx] - hid = Base._to_linear_index(h∇, 1, bin, jdx) - CUDA.atomic_add!(pointer(h∇, hid), ∇[1, idx]) - CUDA.atomic_add!(pointer(h∇, hid+1), ∇[2, idx]) - CUDA.atomic_add!(pointer(h∇, hid+2), ∇[3, idx]) - end - end - end - sync_threads() - return nothing -end - -function update_hist_gpu!(h∇, ∇, x_bin, is, js) - kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js) - config = launch_configuration(kernel.fun) - # @info "config.blocks" config.blocks - max_threads = config.threads - max_blocks = config.blocks - ty = max(1, min(length(js), max_threads)) - tx = max(1, min(length(is), fld(max_threads, ty))) - threads = (tx, ty, 1) - by = cld(length(js), ty) - bx = min(cld(max_blocks, by), cld(length(is), tx)) - blocks = (bx, by, 1) - kernel(h∇, ∇, x_bin, is, js; threads, blocks) - CUDA.synchronize() - return nothing -end - -nbins = 32 -nfeats = 100 -nobs = Int(1e6) -hist = zeros(Float32, nbins, nfeats); -x_bin = UInt8.(rand(1:nbins, nobs, nfeats)); -∇ = rand(Float32, 3, nobs); -h∇ = rand(Float32, 3, nbins, nfeats) -rowsample = 0.5 -colsample = 0.5 -is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true) -js = sample(1:nfeats, Int(round(rowsample * nfeats)), replace=false, ordered=true) - -hist_gpu = CuArray(hist) -∇_gpu = CuArray(∇) -x_bin_gpu = CuArray(x_bin) -h∇_gpu = CuArray(h∇) -is_gpu = CuArray(is) -js_gpu = CuArray(js) - -@time update_hist_gpu!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu) -CUDA.@time update_hist_gpu!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu) -@time CUDA.@sync update_hist_gpu!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu) -@btime update_hist_gpu!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu) \ No newline at end of file diff --git a/experiments/hist/hist_gpu-df-ori.jl b/experiments/hist/hist_gpu-df-ori.jl deleted file mode 100644 index 47981bc8..00000000 --- a/experiments/hist/hist_gpu-df-ori.jl +++ /dev/null @@ -1,217 +0,0 @@ -using Revise -using CUDA -using StatsBase: sample -using BenchmarkTools -using Random: seed! - -""" - hist_kernel! 
-""" -function hist_kernel!(h∇, ∇, x_bin, is, js) - tix, tiy, k = threadIdx().x, threadIdx().y, threadIdx().z - bdx, bdy = blockDim().x, blockDim().y - bix, biy = blockIdx().x, blockIdx().y - gdx = gridDim().x - - j = tiy + bdy * (biy - 1) - if j <= length(js) - jdx = js[j] - i_max = length(is) - niter = cld(i_max, bdx * gdx) - @inbounds for iter = 1:niter - i = tix + bdx * (bix - 1) + bdx * gdx * (iter - 1) - if i <= i_max - @inbounds idx = is[i] - @inbounds bin = x_bin[idx, jdx] - hid = Base._to_linear_index(h∇, k, bin, jdx) - CUDA.atomic_add!(pointer(h∇, hid), ∇[k, idx]) - end - end - end - sync_threads() - return nothing -end - -function update_hist_gpu!(h∇, ∇, x_bin, is, js) - kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js) - config = launch_configuration(kernel.fun) - # @info "config.blocks" config.blocks - max_threads = config.threads ÷ 4 - max_blocks = config.blocks * 4 - @assert size(h∇, 1) <= max_threads "number of classes cannot be larger than 31 on GPU" - tz = min(64, size(h∇, 1)) - ty = max(1, min(length(js), fld(max_threads, tz))) - tx = max(1, min(length(is), fld(max_threads, tz * ty))) - threads = (tx, ty, tz) - by = cld(length(js), ty) - bx = min(cld(max_blocks, by), cld(length(is), tx)) - blocks = (bx, by, 1) - kernel(h∇, ∇, x_bin, is, js; threads, blocks) - CUDA.synchronize() - return nothing -end - -seed!(123) -nbins = 32 -nfeats = 100 -nobs = Int(1e6) - -x_bin = UInt8.(rand(1:nbins, nobs, nfeats)); -∇ = rand(Float32, 3, nobs); -h∇ = zeros(Float32, 3, nbins, nfeats) -rowsample = 0.5 -colsample = 0.5 -is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true) -js = sample(1:nfeats, Int(round(rowsample * nfeats)), replace=false, ordered=true) - -x_bin_gpu = CuArray(x_bin) -∇_gpu = CuArray(∇) -h∇_gpu = CuArray(h∇) -is_gpu = CuArray(is) -js_gpu = CuArray(js) - -@time update_hist_gpu!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu) -CUDA.@time update_hist_gpu!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu) -@time CUDA.@sync update_hist_gpu!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu) -@btime update_hist_gpu!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu) - - -""" - hist_kernel! 
- revserse k and tix thread order -""" -function hist_kernel!(h∇, ∇, x_bin, is, js) - tix, tiy, k = threadIdx().z, threadIdx().y, threadIdx().x - bdx, bdy = blockDim().z, blockDim().y - bix, biy = blockIdx().z, blockIdx().y - gdx = gridDim().z - - j = tiy + bdy * (biy - 1) - if j <= length(js) - jdx = js[j] - i_max = length(is) - niter = cld(i_max, bdx * gdx) - @inbounds for iter = 1:niter - i = tix + bdx * (bix - 1) + bdx * gdx * (iter - 1) - if i <= i_max - @inbounds idx = is[i] - @inbounds bin = x_bin[idx, jdx] - hid = Base._to_linear_index(h∇, k, bin, jdx) - CUDA.atomic_add!(pointer(h∇, hid), ∇[k, idx]) - end - end - end - sync_threads() - return nothing -end - -function update_hist_gpu!(h∇, ∇, x_bin, is, js) - kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js) - config = launch_configuration(kernel.fun) - # @info "config.blocks" config.blocks - max_threads = config.threads ÷ 4 - max_blocks = config.blocks * 4 - tz = size(h∇, 1) - ty = max(1, min(length(js), fld(max_threads, tz))) - tx = max(1, min(length(is), fld(max_threads, tz * ty))) - threads = (tz, ty, tx) - by = cld(length(js), ty) - bx = min(cld(max_blocks, by), cld(length(is), tx)) - blocks = (1, by, bx) - kernel(h∇, ∇, x_bin, is, js; threads, blocks) - CUDA.synchronize() - return nothing -end - -nbins = 32 -nfeats = 100 -nobs = Int(1e6) -x_bin = UInt8.(rand(1:nbins, nobs, nfeats)); -∇ = rand(Float32, 3, nobs); -h∇ = zeros(Float32, 3, nbins, nfeats) -rowsample = 0.5 -colsample = 0.5 -is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true) -js = sample(1:nfeats, Int(round(rowsample * nfeats)), replace=false, ordered=true) - -x_bin_gpu = CuArray(x_bin) -∇_gpu = CuArray(∇) -h∇_gpu = CuArray(h∇) -is_gpu = CuArray(is) -js_gpu = CuArray(js) - -@time update_hist_gpu!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu) -CUDA.@time update_hist_gpu!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu) -@time CUDA.@sync update_hist_gpu!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu) -@btime update_hist_gpu!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu) - - - -""" - revserse tix and tij thread order - 2.4 ms, no significant speed difference with previous k, j i order -""" -function hist_kernel!(h∇, ∇, x_bin, is, js) - k, tix, tiy = threadIdx().x, threadIdx().y, threadIdx().z - bdx, bdy = blockDim().y, blockDim().z - bix, biy = blockIdx().y, blockIdx().z - gdx = gridDim().y - - j = tiy + bdy * (biy - 1) - if j <= length(js) - jdx = js[j] - i_max = length(is) - niter = cld(i_max, bdx * gdx) - @inbounds for iter = 1:niter - i = tix + bdx * (bix - 1) + bdx * gdx * (iter - 1) - if i <= i_max - @inbounds idx = is[i] - @inbounds bin = x_bin[idx, jdx] - hid = Base._to_linear_index(h∇, k, bin, jdx) - CUDA.atomic_add!(pointer(h∇, hid), ∇[k, idx]) - end - end - end - sync_threads() - return nothing -end - -function update_hist_gpu!(h∇, ∇, x_bin, is, js) - kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js) - config = launch_configuration(kernel.fun) - # @info "config.blocks" config.blocks - max_threads = config.threads ÷ 4 - max_blocks = config.blocks * 4 - tk = size(h∇, 1) - tj = max(1, min(length(js), fld(max_threads, tk))) - ti = max(1, min(length(is), fld(max_threads, tk * tj))) - threads = (tk, ti, tj) - bj = cld(length(js), tj) - bi = min(cld(max_blocks, bj), cld(length(is), ti)) - blocks = (1, bi, bj) - kernel(h∇, ∇, x_bin, is, js; threads, blocks) - CUDA.synchronize() - return nothing -end - -nbins = 32 -nfeats = 100 -nobs = Int(1e6) -x_bin = UInt8.(rand(1:nbins, nobs, nfeats)); -∇ = rand(Float32, 3, nobs); -h∇ = 
zeros(Float32, 3, nbins, nfeats) -rowsample = 0.5 -colsample = 0.5 -is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true) -js = sample(1:nfeats, Int(round(rowsample * nfeats)), replace=false, ordered=true) - -x_bin_gpu = CuArray(x_bin) -∇_gpu = CuArray(∇) -h∇_gpu = CuArray(h∇) -is_gpu = CuArray(is) -js_gpu = CuArray(js) - -@time update_hist_gpu!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu) -CUDA.@time update_hist_gpu!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu) -@time CUDA.@sync update_hist_gpu!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu) -@btime update_hist_gpu!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu) diff --git a/experiments/hist/hist_gpu-df-ori2.jl b/experiments/hist/hist_gpu-df-ori2.jl deleted file mode 100644 index 4d861dfd..00000000 --- a/experiments/hist/hist_gpu-df-ori2.jl +++ /dev/null @@ -1,133 +0,0 @@ -using Revise -using CUDA -using StatsBase: sample -using BenchmarkTools -using Random: seed! - -""" - hist_kernel! -""" -function hist_kernel!(h∇, ∇, x_bin, is, js) - tix, tiy, k = threadIdx().x, threadIdx().y, threadIdx().z - bdx, bdy = blockDim().x, blockDim().y - bix, biy = blockIdx().x, blockIdx().y - gdx = gridDim().x - - j = tiy + bdy * (biy - 1) - if j <= length(js) - jdx = js[j] - i_max = length(is) - niter = cld(i_max, bdx * gdx) - @inbounds for iter = 1:niter - i = tix + bdx * (bix - 1) + bdx * gdx * (iter - 1) - if i <= i_max - @inbounds idx = is[i] - @inbounds bin = x_bin[idx, jdx] - hid = Base._to_linear_index(h∇, k, bin, jdx) - CUDA.atomic_add!(pointer(h∇, hid), ∇[k, idx]) - end - end - end - sync_threads() - return nothing -end - -function update_hist_gpu2!(h, h∇, ∇, x_bin, is, js, jsc) - kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js) - config = launch_configuration(kernel.fun) - # @info "config.threads" config.threads - # @info "config.blocks" config.blocks - max_threads = config.threads ÷ 4 - max_blocks = config.blocks * 4 - @assert size(h∇, 1) <= max_threads "number of classes cannot be larger than 31 on GPU" - tz = min(64, size(h∇, 1)) - ty = max(1, min(length(js), fld(max_threads, tz))) - tx = max(1, min(length(is), fld(max_threads, tz * ty))) - threads = (tx, ty, tz) - by = cld(length(js), ty) - bx = min(cld(max_blocks, by), cld(length(is), tx)) - blocks = (bx, by, 1) - h∇ .= 0 - kernel(h∇, ∇, x_bin, is, js; threads, blocks) - CUDA.synchronize() - # @inbounds for j in jsc - # copyto!(h[j], view(h∇, :, :, j)) - # end - @inbounds for j in jsc - copyto!(h[j], view(h∇, :, :, j)) - end - # CUDA.synchronize() - return nothing -end - -seed!(123) -nbins = 32 -nfeats = 100 -nobs = Int(1e6) -x_bin = UInt8.(rand(1:nbins, nobs, nfeats)); -∇ = rand(Float32, 3, nobs); -h∇ = zeros(Float32, 3, nbins, nfeats) -h = [h∇[:,:,j] for j in axes(h∇, 3)] -rowsample = 0.5 -colsample = 0.5 -is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true) -js = sample(1:nfeats, Int(round(colsample * nfeats)), replace=false, ordered=true) - -∇_gpu = CuArray(∇) -x_bin_gpu = CuArray(x_bin) -h∇_gpu = CuArray(h∇) -is_gpu = CuArray(is) -js_gpu = CuArray(js) - -CUDA.allowscalar(false) -@time update_hist_gpu2!(h, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js) -CUDA.@time update_hist_gpu2!(h, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js) -@time CUDA.@sync update_hist_gpu2!(h, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js) -@btime update_hist_gpu2!(h, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js) - - - - -seed!(123) -nbins = 32 -nfeats = 100 -nobs = Int(1e6) -x_bin = UInt8.(rand(1:nbins, nobs, nfeats)); -∇ = rand(Float32, 3, nobs); -h∇ = 
zeros(Float32, 3, nbins, nfeats) -h = [h∇[:,:,j] for j in axes(h∇, 3)] -rowsample = 0.5 -colsample = 0.5 -is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true) -js = sample(1:nfeats, Int(round(colsample * nfeats)), replace=false, ordered=true) - -∇_gpu = CuArray(∇) -x_bin_gpu = CuArray(x_bin) -h∇_gpu = CuArray(h∇) -is_gpu = CuArray(is) -js_gpu = CuArray(js) - -EvoTrees.update_hist_gpu!(h, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js) - - - -seed!(123) -nbins = 32 -nfeats = 1 -nobs = Int(1e6) -x_bin = UInt8.(rand(1:nbins, nobs, nfeats)); -∇ = rand(Float32, 3, nobs); -h∇ = zeros(Float32, 3, nbins, nfeats) -h = [h∇[:,:,j] for j in axes(h∇, 3)] -rowsample = 0.5 -colsample = 1.0 -is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true) -js = sample(1:nfeats, Int(round(colsample * nfeats)), replace=false, ordered=true) - -∇_gpu = CuArray(∇) -x_bin_gpu = CuArray(x_bin) -h∇_gpu = CuArray(h∇) -is_gpu = CuArray(is) -js_gpu = CuArray(js) - -EvoTrees.update_hist_gpu!(h, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js) diff --git a/experiments/hist/hist_gpu-df-vec.jl b/experiments/hist/hist_gpu-df-vec.jl deleted file mode 100644 index 5d36503d..00000000 --- a/experiments/hist/hist_gpu-df-vec.jl +++ /dev/null @@ -1,122 +0,0 @@ -using Revise -using CUDA -using StatsBase: sample -using BenchmarkTools -using Base.Threads: @threads -using Random: seed! - -""" - hist_kernel! -""" -function hist_kernel_vec!(h∇, ∇, x_bin, is) - tix, k = threadIdx().x, threadIdx().y - bdx = blockDim().x - bix = blockIdx().x - gdx = gridDim().x - - i_max = length(is) - niter = cld(i_max, bdx * gdx) - @inbounds for iter in 1:niter - i = tix + bdx * (bix - 1) + bdx * gdx * (iter - 1) - if i <= i_max - @inbounds idx = is[i] - @inbounds bin = x_bin[idx] - hid = Base._to_linear_index(h∇, k, bin) - CUDA.atomic_add!(pointer(h∇, hid), ∇[k, idx]) - end - end - CUDA.sync_threads() - return nothing -end - -function update_hist_gpu_vec!(h∇, ∇, x_bin, is, js) - kernel = @cuda launch = false hist_kernel_vec!(h∇[js[1]], ∇, view(x_bin, :, js[1]), is) - config = launch_configuration(kernel.fun) - max_threads = config.threads - max_blocks = config.blocks - ty = size(h∇[1], 1) - tx = max(1, min(length(is), fld(max_threads, ty))) - threads = (tx, ty, 1) - bx = min(max_blocks, cld(length(is), tx)) - blocks = (bx, 1, 1) - @sync for j in js - @async h∇[j] .= 0 - end - CUDA.synchronize() - @sync for j in js - @async kernel(h∇[j], ∇, view(x_bin, :, j), is; threads, blocks) - end - CUDA.synchronize() - return nothing -end - -seed!(123) -nbins = 32 -nfeats = 100 -nobs = Int(1e6) -x_bin = UInt8.(rand(1:nbins, nobs, nfeats)); -∇ = rand(Float32, 3, nobs); -h∇ = [zeros(Float32, 3, nbins) for n in 1:nfeats] -rowsample = 0.5 -colsample = 0.5 -is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true) -js = sample(1:nfeats, Int(round(rowsample * nfeats)), replace=false, ordered=true) - -∇_gpu = CuArray(∇) -x_bin_gpu = CuArray(x_bin) -h∇_gpu = CuArray.(h∇) -is_gpu = CuArray(is) -js_gpu = CuArray(js) - -CUDA.allowscalar(false) -@time update_hist_gpu_vec!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js) -@time CUDA.@sync update_hist_gpu_vec!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js) -@btime CUDA.@sync update_hist_gpu_vec!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js) - -function update_hist_gpu_cpu!(h, h∇, ∇, x_bin, is, js) - kernel = @cuda launch = false hist_kernel_vec!(h∇[js[1]], ∇, view(x_bin, :, js[1]), is) - config = launch_configuration(kernel.fun) - @info "config.threads" config.threads - @info "config.blocks" 
config.blocks
-    max_threads = config.threads
-    max_blocks = config.blocks
-    ty = size(h∇[1], 1)
-    tx = max(1, min(length(is), fld(max_threads, ty)))
-    threads = (tx, ty, 1)
-    bx = min(max_blocks, cld(length(is), tx))
-    blocks = (bx, 1, 1)
-    CUDA.@sync for j in js
-        @async h∇[j] .= 0
-    end
-    CUDA.synchronize()
-    CUDA.@sync for j in js
-        # @async kernel(h∇[j], ∇, view(x_bin, :, j), is; threads, blocks)
-        kernel(h∇[j], ∇, view(x_bin, :, j), is; threads, blocks)
-    end
-    CUDA.synchronize()
-    return nothing
-end
-function copy_gpu_cpu!(h, h∇, js)
-    for j in js
-        # @info "j" j
-        copyto!(h[j], h∇[j])
-    end
-    CUDA.synchronize()
-    return nothing
-end
-function combine!(h, h∇, ∇, x_bin, is, js)
-    CUDA.@sync update_hist_gpu_cpu!(h, h∇, ∇, x_bin, is, js)
-    CUDA.synchronize()
-    CUDA.@sync copy_gpu_cpu!(h, h∇, js)
-    CUDA.synchronize()
-    return nothing
-end
-@time CUDA.@sync update_hist_gpu_cpu!(h∇, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js)
-@time CUDA.@sync copy_gpu_cpu!(h∇, h∇_gpu, js)
-
-@btime CUDA.@sync update_hist_gpu_cpu!(h∇, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js)
-@btime CUDA.@sync copy_gpu_cpu!(h∇, h∇_gpu, js)
-
-@time combine!(h∇, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js)
-@time CUDA.@sync update_hist_gpu_cpu!(h∇, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js)
-@btime CUDA.@sync update_hist_gpu_cpu!($h∇, $h∇_gpu, $∇_gpu, $x_bin_gpu, $is_gpu, $js)
diff --git a/experiments/hist/hist_gpu-ref.jl b/experiments/hist/hist_gpu-ref.jl
new file mode 100644
index 00000000..a56263a1
--- /dev/null
+++ b/experiments/hist/hist_gpu-ref.jl
@@ -0,0 +1,83 @@
+using Revise
+using CUDA
+using StatsBase: sample
+using BenchmarkTools
+
+"""
+    hist_kernel!
+Accumulate the sampled gradients ∇ into the histogram h∇ with atomic adds; threadIdx().x spans the 3 gradient statistics.
+"""
+function hist_kernel!(h∇::CuDeviceArray{T,3}, ∇::CuDeviceMatrix{S}, x_bin, is, js) where {T,S}
+    tix, tiy, k = threadIdx().z, threadIdx().y, threadIdx().x
+    bdx, bdy = blockDim().z, blockDim().y
+    bix, biy = blockIdx().z, blockIdx().y
+    gdx = gridDim().z
+
+    j = tiy + bdy * (biy - 1)
+    if j <= length(js)
+        jdx = js[j]
+        i_max = length(is)
+        niter = cld(i_max, bdx * gdx)
+        @inbounds for iter = 1:niter
+            i = tix + bdx * (bix - 1) + bdx * gdx * (iter - 1)
+            if i <= i_max
+                @inbounds idx = is[i]
+                @inbounds bin = x_bin[idx, jdx]
+                hid = Base._to_linear_index(h∇, k, bin, jdx)
+                CUDA.atomic_add!(pointer(h∇, hid), T(∇[k, idx]))
+            end
+        end
+    end
+    sync_threads()
+    return nothing
+end
+
+function update_hist_gpu!(h, h∇_cpu, h∇, ∇, x_bin, is, js, jsc)
+    kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js)
+    config = launch_configuration(kernel.fun)
+    max_threads = config.threads
+    max_blocks = config.blocks
+    k = size(h∇, 1)
+    ty = max(1, min(length(js), fld(max_threads, k)))
+    tx = max(1, min(length(is), fld(max_threads, k * ty)))
+    threads = (k, ty, tx)
+    by = cld(length(js), ty)
+    bx = min(cld(max_blocks, by), cld(length(is), tx))
+    blocks = (1, by, bx)
+    h∇ .= 0
+    kernel(h∇, ∇, x_bin, is, js; threads, blocks)
+    CUDA.synchronize()
+    copyto!(h∇_cpu, h∇)
+    Threads.@threads for j in jsc
+        nbins = size(h[j], 2)
+        @views h[j] .= h∇_cpu[:, 1:nbins, j]
+    end
+    return nothing
+end
+
+nbins = 64
+nfeats = 100
+nobs = Int(1e3)
+h = [zeros(Float32, 3, nbins) for feat in 1:nfeats];
+x_bin = UInt8.(rand(1:nbins, nobs, nfeats));
+∇_cpu = rand(Float32, 3, nobs);
+h∇_cpu = rand(Float32, 3, nbins, nfeats)
+rowsample = 0.5
+colsample = 0.5
+is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true)
+js = sample(1:nfeats, Int(round(colsample * nfeats)), replace=false, ordered=true)
+
+∇_gpu = CuArray(∇_cpu)
+x_bin_gpu = CuArray(x_bin)
+h∇_gpu = CuArray(h∇_cpu)
+is_gpu = CuArray(is)
+js_gpu = CuArray(js)
+
+@time update_hist_gpu!(h, h∇_cpu, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js)
+CUDA.@time update_hist_gpu!(h, h∇_cpu, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js)
+# desktop | 1K: 46.332 Όs (109 allocations: 9.09 KiB)
+# desktop | 10K: 59.142 Όs (109 allocations: 9.09 KiB)
+# desktop | 100K: 251.850 Όs (109 allocations: 9.09 KiB)
+# desktop | 1M: 2.328 ms (110 allocations: 9.11 KiB)
+# desktop | 10M: 25.557 ms (110 allocations: 9.11 KiB)
+@btime update_hist_gpu!(h, h∇_cpu, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js)
diff --git a/experiments/hist/perf-gpu.jl b/experiments/hist/perf-gpu.jl
deleted file mode 100644
index d7b094b6..00000000
--- a/experiments/hist/perf-gpu.jl
+++ /dev/null
@@ -1,202 +0,0 @@
-using Revise
-using CUDA
-using StatsBase: sample
-using BenchmarkTools
-using Base.Threads: @threads
-using Random: seed!
-
-"""
-    hist_kernel!
-"""
-function hist_kernel!(h∇::CuDeviceArray{T,3}, ∇::CuDeviceMatrix{S}, x_bin, is, js) where {T,S}
-    tix, tiy, k = threadIdx().z, threadIdx().y, threadIdx().x
-    bdx, bdy = blockDim().z, blockDim().y
-    bix, biy = blockIdx().z, blockIdx().y
-    gdx = gridDim().z
-
-    j = tiy + bdy * (biy - 1)
-    if j <= length(js)
-        jdx = js[j]
-        i_max = length(is)
-        niter = cld(i_max, bdx * gdx)
-        @inbounds for iter = 1:niter
-            i = tix + bdx * (bix - 1) + bdx * gdx * (iter - 1)
-            if i <= i_max
-                @inbounds idx = is[i]
-                @inbounds bin = x_bin[idx, jdx]
-                hid = Base._to_linear_index(h∇, k, bin, jdx)
-                CUDA.atomic_add!(pointer(h∇, hid), T(∇[k, idx]))
-            end
-        end
-    end
-    sync_threads()
-    return nothing
-end
-
-function update_hist_gpu!(h, h∇, ∇, x_bin, is, js, jsc)
-    kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js)
-    config = launch_configuration(kernel.fun)
-    max_threads = config.threads ÷ 4
-    max_blocks = config.blocks * 4
-    k = size(h∇, 1)
-    ty = max(1, min(length(js), fld(max_threads, k)))
-    tx = min(64, max(1, min(length(is), fld(max_threads, k * ty))))
-    threads = (k, ty, tx)
-    by = cld(length(js), ty)
-    bx = min(cld(max_blocks, by), cld(length(is), tx))
-    blocks = (1, by, bx)
-    h∇ .= 0
-    kernel(h∇, ∇, x_bin, is, js; threads, blocks)
-    CUDA.synchronize()
-    CUDA.@sync for j in jsc
-        nbins = size(h[j], 2)
-        copyto!(h[j], view(h∇, :, 1:nbins, j))
-    end
-    return nothing
-end
-
-function update_hist_gpu1!(h, h∇, ∇, x_bin, is, js, jsc)
-    kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js)
-    config = launch_configuration(kernel.fun)
-    max_threads = config.threads ÷ 4
-    max_blocks = config.blocks * 4
-    k = size(h∇, 1)
-    ty = max(1, min(length(js), fld(max_threads, k)))
-    tx = min(64, max(1, min(length(is), fld(max_threads, k * ty))))
-    threads = (k, ty, tx)
-    by = cld(length(js), ty)
-    bx = min(cld(max_blocks, by), cld(length(is), tx))
-    blocks = (1, by, bx)
-    h∇ .= 0
-    kernel(h∇, ∇, x_bin, is, js; threads, blocks)
-    CUDA.synchronize()
-    return nothing
-end
-
-function update_hist_gpu2!(h, h∇_cpu, h∇, ∇, x_bin, is, js, jsc)
-    kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js)
-    config = launch_configuration(kernel.fun)
-    max_threads = config.threads ÷ 4
-    max_blocks = config.blocks * 4
-    k = size(h∇, 1)
-    ty = max(1, min(length(js), fld(max_threads, k)))
-    tx = min(64, max(1, min(length(is), fld(max_threads, k * ty))))
-    threads = (k, ty, tx)
-    by = cld(length(js), ty)
-    bx = min(cld(max_blocks, by), cld(length(is), tx))
-    blocks = (1, by, bx)
-    h∇ .= 0
-    kernel(h∇, ∇, x_bin, is, js; threads, blocks)
-    copyto!(h∇_cpu, h∇)
-    CUDA.synchronize()
-    return nothing
-end - - -function update_hist_gpu3!(h, h∇_cpu, h∇, ∇, x_bin, is, js, jsc) - kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js) - config = launch_configuration(kernel.fun) - max_threads = config.threads ÷ 4 - max_blocks = config.blocks * 4 - k = size(h∇, 1) - ty = max(1, min(length(js), fld(max_threads, k))) - tx = min(64, max(1, min(length(is), fld(max_threads, k * ty)))) - threads = (k, ty, tx) - by = cld(length(js), ty) - bx = min(cld(max_blocks, by), cld(length(is), tx)) - blocks = (1, by, bx) - h∇ .= 0 - kernel(h∇, ∇, x_bin, is, js; threads, blocks) - # CUDA.synchronize() - copyto!(h∇_cpu, h∇) - # CUDA.synchronize() - @threads for j in jsc - nbins = size(h[j], 2) - @views h[j] .= h∇_cpu[:, 1:nbins, j] - # h[j] .= h∇_cpu[:, 1:nbins, j] - end - return nothing -end - - -seed!(123) -nbins = 32 -nfeats = 100 -nobs = Int(1e6) -x_bin = UInt8.(rand(1:nbins, nobs, nfeats)); -∇ = rand(Float32, 3, nobs); -h∇ = [zeros(Float32, 3, nbins) for n in 1:nfeats] -rowsample = 0.5 -colsample = 0.5 -is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true) -js = sample(1:nfeats, Int(round(rowsample * nfeats)), replace=false, ordered=true) - -∇_gpu = CuArray(∇) -x_bin_gpu = CuArray(x_bin) -h∇_cpu = zeros(Float32, 3, nbins, nfeats) -h∇_gpu = CuArray(h∇_cpu) -is_gpu = CuArray(is) -js_gpu = CuArray(js) - -CUDA.allowscalar(false) -CUDA.@time update_hist_gpu!(h∇, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js) -# ref without copy to cpu: ~same -# ref 10K: 875.100 ÎŒs (168 allocations: 7.08 KiB) -# ref 100K: 1.236 ms (215 allocations: 9.91 KiB) -# ref 1M: 6.138 ms (227 allocations: 12.00 KiB) -# ref 10M: 67.075 ms (235 allocations: 13.38 KiB) - -# with copy -# CUDA v4 1M: 2.903 ms (124 allocations: 6.98 KiB) -# CUDA v5 1M: 3.542 ms (848 allocations: 37.14 KiB) -@btime update_hist_gpu!(h∇, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js) - -# without copy -# CUDA v4 1M: 2.599 ms (74 allocations: 4.64 KiB) -# CUDA v5 1M: 2.274 ms (48 allocations: 2.77 KiB) -@btime update_hist_gpu1!(h∇, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js) - -# without single array copy -# CUDA v4 1M: -# CUDA v5 1M: 2.447 ms (48 allocations: 2.77 KiB) -@btime update_hist_gpu2!(h∇, h∇_cpu, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js) - -# without single array copy -# CUDA v4 1M: -# CUDA v5 1M: 2.442 ms (48 allocations: 2.77 KiB) -@btime update_hist_gpu3!(h∇, h∇_cpu, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js) - - -using CUDA, BenchmarkTools -function gpu_copy!(h, h∇, jsc) - CUDA.@sync for j in jsc - nbins = size(h[j], 2) - copyto!(h[j], view(h∇, :, 1:nbins, j)) - end - return nothing -end - -h∇ = [zeros(Float32, 3, 32) for n in 1:100]; -h∇_gpu = CUDA.zeros(Float32, 3, 32, 100); -js = 1:100 - -# CUDA v4: 534.480 ÎŒs (100 allocations: 4.69 KiB) -# CUDA v5: 1.203 ms (1600 allocations: 68.75 KiB) -@btime gpu_copy!(h∇, h∇_gpu, js) - - -function gpu_copy2!(h, h∇, jsc) - for j in jsc - nbins = size(h[j], 2) - @async copyto!(h[j], view(h∇, :, 1:nbins, j)) - end - return nothing -end - -h∇ = [zeros(Float32, 3, 32) for n in 1:100]; -h∇_gpu = CUDA.zeros(Float32, 3, 32, 100); -js = 1:100 - -# CUDA v4: 534.480 ÎŒs (100 allocations: 4.69 KiB) -# CUDA v5: 1.203 ms (1600 allocations: 68.75 KiB) -@btime gpu_copy2!(h∇, h∇_gpu, js) diff --git a/experiments/hist_v1.jl b/experiments/hist_v1.jl deleted file mode 100644 index 80fd250b..00000000 --- a/experiments/hist_v1.jl +++ /dev/null @@ -1,306 +0,0 @@ -using Statistics -using StatsBase:sample -using Base.Threads:@threads -using BenchmarkTools -using EvoTrees -using SIMD -using 
LoopVectorization - -n = Int(1e6) -nvars = 100 -nbins = 64 -𝑖 = collect(1:n) -𝑗 = collect(1:nvars) -ÎŽ = rand(n) -Ύ² = rand(n) -𝑀 = rand(n) - -hist_ÎŽ = zeros(nbins, nvars) -hist_Ύ² = zeros(nbins, nvars) -hist_𝑀 = zeros(nbins, nvars) -X_bin = reshape(sample(UInt8.(1:nbins), n * nvars), n, nvars) - -function iter_1(X_bin, hist_ÎŽ, ÎŽ, 𝑖, 𝑗) - # hist_ÎŽ .*= 0.0 - @inbounds @threads for j in 𝑗 - @inbounds for i in 𝑖 - hist_ÎŽ[X_bin[i,j], j] += ÎŽ[i] - end - end -end - -@time iter_1(X_bin, hist_ÎŽ, ÎŽ, 𝑖, 𝑗) -@btime iter_1(X_bin, hist_ÎŽ, ÎŽ, 𝑖, 𝑗) - -function iter_1A(X_bin, hist_ÎŽ, ÎŽ, 𝑖, 𝑗) - # hist_ÎŽ .*= 0.0 - @inbounds @threads for j in 𝑗 - @inbounds for i in 𝑖 - hist_ÎŽ[X_bin[i,j], j] += ÎŽ[i] - end - end -end - -@time iter_1A(X_bin, hist_ÎŽ, ÎŽ, 𝑖, 𝑗) -@btime iter_1A(X_bin, hist_ÎŽ, ÎŽ, 𝑖, 𝑗) - -### 3 features in seperate hists -function iter_1B(X_bin, hist_ÎŽ, hist_Ύ², hist_𝑀, ÎŽ, Ύ², 𝑀, 𝑖, 𝑗) - # hist_ÎŽ .= 0.0 - # hist_Ύ² .= 0.0 - # hist_𝑀 .= 0.0 - @inbounds @threads for j in 𝑗 - @inbounds for i in 𝑖 - hist_ÎŽ[X_bin[i,j], j] += ÎŽ[i] - hist_Ύ²[X_bin[i,j], j] += Ύ²[i] - hist_𝑀[X_bin[i,j], j] += 𝑀[i] - end - end -end - -@time iter_1B(X_bin, hist_ÎŽ, hist_Ύ², hist_𝑀, ÎŽ, Ύ², 𝑀, 𝑖, 𝑗) -@btime iter_1B($X_bin, $hist_ÎŽ, $hist_Ύ², $hist_𝑀, $ÎŽ, $Ύ², $𝑀, $𝑖, $𝑗) - -𝑖2 = sample(𝑖, 500000, replace=false, ordered=true) -𝑗2 = sample(𝑗, 50, replace=false, ordered=true) -@time iter_1B(X_bin, hist_ÎŽ, hist_Ύ², hist_𝑀, ÎŽ, Ύ², 𝑀, 𝑖2, 𝑗2) -@btime iter_1B($X_bin, $hist_ÎŽ, $hist_Ύ², $hist_𝑀, $ÎŽ, $Ύ², $𝑀, $𝑖2, $𝑗2) - -### 3 features in common hists -hist_Ύ𝑀 = zeros(3, nbins, nvars) -function iter_2(X_bin, hist_Ύ𝑀, ÎŽ, Ύ², 𝑀, 𝑖, 𝑗) - # hist_Ύ𝑀 .= 0.0 - @inbounds @threads for j in 𝑗 - @inbounds @simd for i in 𝑖 - hist_Ύ𝑀[1, X_bin[i,j], j] += ÎŽ[i] - hist_Ύ𝑀[2, X_bin[i,j], j] += Ύ²[i] - hist_Ύ𝑀[3, X_bin[i,j], j] += 𝑀[i] - end - end -end - -@time iter_2(X_bin, hist_Ύ𝑀, ÎŽ, Ύ², 𝑀, 𝑖, 𝑗) -@btime iter_2($X_bin, $hist_Ύ𝑀, $ÎŽ, $Ύ², $𝑀, $𝑖, $𝑗) - -### 3 features in common hists - gradients/weight in single matrix -hist_Ύ𝑀 = zeros(3, nbins, nvars) -Ύ𝑀 = rand(3, n) - -function iter_3(X_bin, hist_Ύ𝑀, Ύ𝑀, 𝑖, 𝑗) - # hist_Ύ𝑀 .= 0.0 - @inbounds @threads for j in 𝑗 - @inbounds for i in 𝑖 - hist_Ύ𝑀[1, X_bin[i,j], j] += Ύ𝑀[1,i] - hist_Ύ𝑀[2, X_bin[i,j], j] += Ύ𝑀[2,i] - hist_Ύ𝑀[3, X_bin[i,j], j] += Ύ𝑀[3,i] - end - end -end - -@time iter_3(X_bin, hist_Ύ𝑀, Ύ𝑀, 𝑖, 𝑗) -@btime iter_3($X_bin, $hist_Ύ𝑀, $Ύ𝑀, $𝑖, $𝑗) - -# 39.849 ms (81 allocations: 8.22 KiB) -function iter_3B(X_bin, hist_Ύ𝑀, Ύ𝑀, 𝑖, 𝑗) - @inbounds @threads for j in 𝑗 - @inbounds @simd for i in 𝑖 - hist_Ύ𝑀[1, X_bin[i,j], j] += Ύ𝑀[1,i] - hist_Ύ𝑀[2, X_bin[i,j], j] += Ύ𝑀[2,i] - hist_Ύ𝑀[3, X_bin[i,j], j] += Ύ𝑀[3,i] - end - end -end - -@time iter_3B(X_bin, hist_Ύ𝑀, Ύ𝑀, 𝑖, 𝑗) -@btime iter_3B($X_bin, $hist_Ύ𝑀, $Ύ𝑀, $𝑖, $𝑗) - -# 38.027 ms (81 allocations: 8.22 KiB) -function iter_3C(X_bin, hist_Ύ𝑀, Ύ𝑀, 𝑖, 𝑗) - @inbounds @threads for j in 𝑗 - @inbounds for chunk in 1:10000 - @inbounds for iter in 1:100 - i = iter + 100 * (chunk - 1) - hist_Ύ𝑀[1, X_bin[i,j], j] += Ύ𝑀[1,i] - hist_Ύ𝑀[2, X_bin[i,j], j] += Ύ𝑀[2,i] - hist_Ύ𝑀[3, X_bin[i,j], j] += Ύ𝑀[3,i] - end - end - end -end - -@time iter_3C(X_bin, hist_Ύ𝑀, Ύ𝑀, 𝑖, 𝑗) -@btime iter_3C($X_bin, $hist_Ύ𝑀, $Ύ𝑀, $𝑖, $𝑗) - - -### 3 features in common hists - vector of matrix hists - gradients/weight in single matrix -hist_Ύ𝑀_vec = [zeros(3, nbins) for j in 1:nvars] -Ύ𝑀 = rand(3, n) - -# 33.762 ms (81 allocations: 8.22 KiB) -function iter_4(X_bin, hist_Ύ𝑀_vec, Ύ𝑀, 𝑖, 𝑗) - # [hist_Ύ𝑀_vec[j] .= 0.0 for j in 𝑗] - @inbounds @threads for j 
in 𝑗 - @inbounds @simd for i in 𝑖 - hist_Ύ𝑀_vec[j][1, X_bin[i,j]] += Ύ𝑀[1,i] - hist_Ύ𝑀_vec[j][2, X_bin[i,j]] += Ύ𝑀[2,i] - hist_Ύ𝑀_vec[j][3, X_bin[i,j]] += Ύ𝑀[3,i] - end - end -end - -@time iter_4(X_bin, hist_Ύ𝑀_vec, Ύ𝑀, 𝑖, 𝑗) -@btime iter_4($X_bin, $hist_Ύ𝑀_vec, $Ύ𝑀, $𝑖, $𝑗) - - -### 3 features in common hists - vector of matrix hists - gradients/weight in single matrix -hist_Ύ𝑀_vec = [zeros(3, nbins) for j in 1:nvars] -Ύ𝑀 = rand(n, 3) - -# 40.005 ms (81 allocations: 8.22 KiB) -function iter_4B(X_bin, hist_Ύ𝑀_vec, Ύ𝑀, 𝑖, 𝑗) - # [hist_Ύ𝑀_vec[j] .= 0.0 for j in 𝑗] - @inbounds @threads for j in 𝑗 - @inbounds @simd for i in 𝑖 - hist_Ύ𝑀_vec[j][1, X_bin[i,j]] += Ύ𝑀[i,1] - hist_Ύ𝑀_vec[j][2, X_bin[i,j]] += Ύ𝑀[i,2] - hist_Ύ𝑀_vec[j][3, X_bin[i,j]] += Ύ𝑀[i,3] - end - end -end - -@time iter_4B(X_bin, hist_Ύ𝑀_vec, Ύ𝑀, 𝑖, 𝑗); -@btime iter_4B($X_bin, $hist_Ύ𝑀_vec, $Ύ𝑀, $𝑖, $𝑗); - -### 3 features in common hists - vector of vec hists - gradients/weight in single vector -hist_Ύ𝑀_vec = [zeros(3 * nbins) for j in 1:nvars] -Ύ𝑀 = rand(3 * n) - - -function iter_5(X_bin, hist_Ύ𝑀_vec, Ύ𝑀, 𝑖, 𝑗) - # [hist_Ύ𝑀_vec[j] .= 0.0 for j in 𝑗] - @inbounds @threads for j in 𝑗 - @inbounds @simd for i in 𝑖 - # @inbounds if mask[i] - id = 3 * i - 2 - hid = 3 * X_bin[i,j] - 2 - hist_Ύ𝑀_vec[j][hid] += Ύ𝑀[id] - hist_Ύ𝑀_vec[j][hid + 1] += Ύ𝑀[id + 1] - hist_Ύ𝑀_vec[j][hid + 2] += Ύ𝑀[id + 2] - # end - end - end -end - -@time iter_5(X_bin, hist_Ύ𝑀_vec, Ύ𝑀, 𝑖, 𝑗) -@btime iter_5($X_bin, $hist_Ύ𝑀_vec, $Ύ𝑀, $𝑖, $𝑗) - -# 𝑖2 = sample(𝑖, 900000, replace=false, ordered=true) -# 𝑖3 = view(𝑖2, 100001:650000) -using Random -𝑖2 = sample(𝑖, 500000, replace=false, ordered=true) -𝑗2 = sample(𝑗, 50, replace=false, ordered=true) -@time iter_5(X_bin, hist_Ύ𝑀_vec, Ύ𝑀, 𝑖2, 𝑗2) -@btime iter_5($X_bin, $hist_Ύ𝑀_vec, $Ύ𝑀, $𝑖2, $𝑗2) - -### 3 features in common hists - vector of vec hists - gradients/weight in single vector - explicit loop -hist_Ύ𝑀_vec = [zeros(3 * nbins) for j in 1:nvars] -Ύ𝑀 = rand(3, n) - -function iter_5B(X_bin, hist_Ύ𝑀_vec, Ύ𝑀, 𝑖, 𝑗) - @inbounds @threads for j in 𝑗 - @inbounds @simd for i in 𝑖 - hid = 3 * X_bin[i,j] - 2 - hist_Ύ𝑀_vec[j][hid] += Ύ𝑀[1, i] - hist_Ύ𝑀_vec[j][hid + 1] += Ύ𝑀[2, i] - hist_Ύ𝑀_vec[j][hid + 2] += Ύ𝑀[3, i] - end - end -end - -@time iter_5B(X_bin, hist_Ύ𝑀_vec, Ύ𝑀, 𝑖, 𝑗) -@btime iter_5B($X_bin, $hist_Ύ𝑀_vec, $Ύ𝑀, $𝑖, $𝑗) - -### 3 features in common hists - vector of vec hists - gradients/weight in single vector - explicit loop -hist_Ύ𝑀_vec = [zeros(3 * nbins) for j in 1:nvars] -Ύ𝑀 = rand(n, 3) - -function iter_5C(X_bin, hist_Ύ𝑀_vec, Ύ𝑀, 𝑖, 𝑗) - @inbounds @threads for j in 𝑗 - @inbounds @simd for i in 𝑖 - hid = 3 * X_bin[i,j] - 2 - hist_Ύ𝑀_vec[j][hid] += Ύ𝑀[i,1] - hist_Ύ𝑀_vec[j][hid + 1] += Ύ𝑀[i,2] - hist_Ύ𝑀_vec[j][hid + 2] += Ύ𝑀[i,3] - end - end -end - -@time iter_5C(X_bin, hist_Ύ𝑀_vec, Ύ𝑀, 𝑖, 𝑗) -@btime iter_5C($X_bin, $hist_Ύ𝑀_vec, $Ύ𝑀, $𝑖, $𝑗) - -function iter_6(X_bin, hist_Ύ𝑀_vec, Ύ𝑀, 𝑖, 𝑗, K) - # [hist_Ύ𝑀_vec[j] .= 0.0 for j in 𝑗] - @inbounds @threads for j in 𝑗 - @inbounds @simd for i in 𝑖 - id = 3 * i - 2 - hid = 3 * X_bin[i,j] - 3 - for k in 1:K - hist_Ύ𝑀_vec[j][hid + k] += Ύ𝑀[id + k] - end - end - end -end - -K = 3 -@time iter_6(X_bin, hist_Ύ𝑀_vec, Ύ𝑀, 𝑖, 𝑗, K) -@btime iter_6($X_bin, $hist_Ύ𝑀_vec, $Ύ𝑀, $𝑖, $𝑗, $K) - -### 3 features in common hists - vector of vec hists - gradients/weight in single vector - with node assignations -hist_Ύ𝑀_vec = [[zeros(3 * nbins) for n in 1:16] for j in 1:nvars] -Ύ𝑀 = rand(3 * n) -𝑛 = sample(1:16, n) - -function iter_7(X_bin, hist_Ύ𝑀_vec::Vector{Vector{Vector{T}}}, 
Ύ𝑀::Vector{T}, 𝑖, 𝑗, 𝑛) where T - # [hist_Ύ𝑀_vec[j][n] .= 0.0 for n in 𝑛] - @inbounds @threads for j in 𝑗 - @inbounds @simd for i in 𝑖 - id = 3 * i - 2 - hid = 3 * X_bin[i,j] - 2 - n = 𝑛[i] - - hist_Ύ𝑀_vec[j][n][hid] += Ύ𝑀[id] - hist_Ύ𝑀_vec[j][n][hid + 1] += Ύ𝑀[id + 1] - hist_Ύ𝑀_vec[j][n][hid + 2] += Ύ𝑀[id + 2] - end - end -end - -@time iter_7(X_bin, hist_Ύ𝑀_vec, Ύ𝑀, 𝑖, 𝑗, 𝑛) -@btime iter_7($X_bin, $hist_Ύ𝑀_vec, $Ύ𝑀, $𝑖, $𝑗, $𝑛) - - - -using StatsBase:sample -using BenchmarkTools - -n_obs = Int(1e6) -n_vars = 100 -n_bins = 64 -K = 3 -𝑖 = collect(1:n_obs) -ÎŽ = rand(K, n_obs) -hist = zeros(K, n_bins, n_vars); -X_bin = sample(UInt8.(1:n_bins), n_obs * n_vars); -X_bin = reshape(X_bin, n_obs, n_vars); -𝑖_sample = sample(𝑖, Int(n_obs / 2), ordered=true) - -function iter_1(X_bin, hist, ÎŽ, 𝑖) - hist .= 0.0 - @inbounds @simd for i in 𝑖 - @inbounds for k in 1:3 - hist[k, X_bin[i,1], 1] += ÎŽ[k, i] - end - end -end - -@btime iter_1($X_bin, $hist, $ÎŽ, $𝑖_sample) \ No newline at end of file diff --git a/experiments/hist_v2.jl b/experiments/hist_v2.jl deleted file mode 100644 index f107b1ca..00000000 --- a/experiments/hist_v2.jl +++ /dev/null @@ -1,66 +0,0 @@ -# using Statistics -using StatsBase:sample -# using Base.Threads:@threads -using BenchmarkTools - -n_obs = Int(1e6) -n_vars = 100 -n_bins = 64 -K = 3 -𝑖 = collect(1:n_obs) -ÎŽ = rand(n_obs, K) -hist = zeros(K, n_bins, n_vars); -X_bin = sample(UInt8.(1:n_bins), n_obs * n_vars); -X_bin = reshape(X_bin, n_obs, n_vars); - -function iter_1(X_bin, hist, ÎŽ, 𝑖) - hist .= 0.0 - @inbounds for i in 𝑖 - @inbounds for k in 1:3 - hist[k, X_bin[i,1], 1] += ÎŽ[i,k] - end - end -end - -𝑖_sample = sample(𝑖, Int(n_obs / 2), ordered=true) - -@time iter_1(X_bin, hist, ÎŽ, 𝑖_sample) -@btime iter_1($X_bin, $hist, $ÎŽ, $𝑖_sample) - - - -function iter_2(X_bin, hist, ÎŽ, 𝑖) - hist .= 0.0 - @inbounds @simd for i in CartesianIndices(𝑖) - @inbounds @simd for k in 1:3 - hist[k, X_bin[𝑖[i],1], 1] += ÎŽ[𝑖[i],k] - end - end -end - -𝑖_sample = sample(𝑖, Int(n_obs / 2), ordered=true) - -@time iter_2(X_bin, hist, ÎŽ, 𝑖_sample) -@btime iter_2($X_bin, $hist, $ÎŽ, $𝑖_sample) - - - -# slower -ÎŽ = rand(K, n_obs) -hist = zeros(K, n_bins, n_vars); -X_bin = sample(UInt8.(1:n_bins), n_obs * n_vars); -X_bin = reshape(X_bin, n_obs, n_vars); - -function iter_1(X_bin, hist, ÎŽ, 𝑖) - hist .= 0.0 - @inbounds for i in 𝑖 - @inbounds for k in 1:3 - hist[k, X_bin[i,1], 1] += ÎŽ[k,i] - end - end -end - -𝑖_sample = sample(𝑖, Int(n_obs / 2), ordered=true) - -@time iter_1(X_bin, hist, ÎŽ, 𝑖_sample) -@btime iter_1($X_bin, $hist, $ÎŽ, $𝑖_sample) diff --git a/experiments/logistic_tests.jl b/experiments/logistic_tests.jl deleted file mode 100644 index d4f507de..00000000 --- a/experiments/logistic_tests.jl +++ /dev/null @@ -1,142 +0,0 @@ -using Statistics -using StatsBase: sample, sample! 
-using EvoTrees -using BenchmarkTools -using CUDA - -# prepare a dataset -features = rand(Int(1.25e4), 100) -# features = rand(100, 10) -X = features -Y = rand(size(X, 1)) -𝑖 = collect(1:size(X, 1)) - -# train-eval split -𝑖_sample = sample(𝑖, size(𝑖, 1), replace=false) -train_size = 0.8 -𝑖_train = 𝑖_sample[1:floor(Int, train_size * size(𝑖, 1))] -𝑖_eval = 𝑖_sample[floor(Int, train_size * size(𝑖, 1))+1:end] - -x_train, x_eval = X[𝑖_train, :], X[𝑖_eval, :] -y_train, y_eval = Y[𝑖_train], Y[𝑖_eval] - - -########################### -# Tree CPU -########################### -params_c = EvoTrees.EvoTreeMLE(T=Float32, - loss=:logistic, - nrounds=200, - lambda=0.1, gamma=0.0, eta=0.1, - max_depth=5, min_weight=1.0, - rowsample=0.5, colsample=0.5, nbins=16); - -model_c, cache_c = EvoTrees.init_evotree(params_c, x_train, y_train); -EvoTrees.grow_evotree!(model_c, cache_c) -p = model_c(x_train) -sort(p[:,1]) -sort(p[:,2]) - -# initialize from cache -params_c = model_c.params -x_size = size(cache_c.X_bin) - -# select random rows and cols -sample!(params_c.rng, cache_c.𝑖_, cache_c.nodes[1].𝑖, replace=false, ordered=true); -sample!(params_c.rng, cache_c.𝑗_, cache_c.𝑗, replace=false, ordered=true); -# @btime sample!(params_c.rng, cache_c.𝑖_, cache_c.nodes[1].𝑖, replace=false, ordered=true); -# @btime sample!(params_c.rng, cache_c.𝑗_, cache_c.𝑗, replace=false, ordered=true); - -𝑖 = cache_c.nodes[1].𝑖 -𝑗 = cache_c.𝑗 - -# build a new tree -# 897.800 ÎŒs (6 allocations: 736 bytes) -get_loss_type(m::EvoTreeGaussian{L,T,S}) where {L,T,S} = L -get_loss_type(m::EvoTrees.EvoTreeLogistic{L,T,S}) where {L,T,S} = L - -L = get_loss_type(params_c) -@time EvoTrees.update_grads!(L, cache_c.Ύ𝑀, cache_c.pred, cache_c.Y; alpha=params_c.alpha) -cache_c.Ύ𝑀 - -sort(cache_c.Ύ𝑀[1, :]) -sort(cache_c.Ύ𝑀[2, :]) -sort(cache_c.Ύ𝑀[3, :]) -sort(cache_c.Ύ𝑀[4, :]) - -p = collect(-3:0.5:3) -y = collect(-3:0.5:3) - -function get_grads(p, y) - grad = zeros(length(p), length(y)) - for i in eachindex(p) - for j in eachindex(y) - # alternate from 1 - # grad[i, j] = -(exp(-2s) * (u - y) * (u - y + exp(s) * sinh(exp(-s) * (u - y)))) / (1 + cosh(exp(-s) * (u - y))) - grad[i, j] = (exp(-2 * p[i]) * (0.0 - y[j]) * (0.0 - y[j] + exp(p[i]) * sinh(exp(-p[i]) * (0.0 - y[j])))) / (1 + cosh(exp(-p[i]) * (0.0 - y[j]))) - end - end - return grad -end - -grads = get_grads(p, y) -heatmap(grads) -# @btime EvoTrees.update_grads!($params_c.loss, $cache_c.Ύ𝑀, $cache_c.pred_cpu, $cache_c.Y_cpu, $params_c.α) -# ∑ = vec(sum(cache_c.ÎŽ[𝑖,:], dims=1)) -# gain = EvoTrees.get_gain(params_c.loss, ∑, params_c.λ) -# assign a root and grow tree -# train_nodes[1] = EvoTrees.TrainNode(UInt32(0), UInt32(1), ∑, gain) - -# 62.530 ms (7229 allocations: 17.43 MiB) -# 1.25e5: 9.187 ms (7358 allocations: 2.46 MiB) -tree = EvoTrees.Tree(params_c.max_depth, model_c.K, zero(typeof(params_c.λ))) -@time EvoTrees.grow_tree!(tree, cache_c.nodes, params_c, cache_c.Ύ𝑀, cache_c.edges, cache_c.𝑗, cache_c.left, cache_c.left, cache_c.right, cache_c.X_bin, cache_c.K) -@btime EvoTrees.grow_tree!($EvoTrees.Tree(params_c.max_depth, model_c.K, zero(typeof(params_c.λ))), $cache_c.nodes, $params_c, $cache_c.Ύ𝑀, $cache_c.edges, $cache_c.𝑗, $cache_c.left, $cache_c.left, $cache_c.right, $cache_c.X_bin, $cache_c.K) - -@time EvoTrees.grow_tree!(EvoTrees.Tree(params_c.max_depth, model_c.K, params_c.λ), params_c, cache_c.ÎŽ, cache_c.hist, cache_c.histL, cache_c.histR, cache_c.gains, cache_c.edges, 𝑖, 𝑗, 𝑛, cache_c.X_bin); -@btime EvoTrees.grow_tree!(EvoTrees.Tree($params_c.max_depth, $model_c.K, $params_c.λ), 
$params_c, $cache_c.ÎŽ, $cache_c.hist, $cache_c.histL, $cache_c.histR, $cache_c.gains, $cache_c.edges, $𝑖, $𝑗, $𝑛, $cache_c.X_bin); -@code_warntype EvoTrees.grow_tree!(EvoTrees.Tree(params_c.max_depth, model_c.K, params_c.λ), params_c, cache_c.ÎŽ, cache_c.hist, cache_c.histL, cache_c.histR, cache_c.gains, cache_c.edges, 𝑖, 𝑗, 𝑛, cache_c.X_bin); - -# push!(model_c.trees, tree) -# 1.883 ms (83 allocations: 13.77 KiB) -@btime EvoTrees.predict!(model_c.params.loss, cache_c.pred_cpu, tree, cache_c.X, model_c.K) - -Ύ𝑀, K, edges, X_bin, nodes, out, left, right = cache_c.Ύ𝑀, cache_c.K, cache_c.edges, cache_c.X_bin, cache_c.nodes, cache_c.out, cache_c.left, cache_c.right; - -# 9.613 ms (81 allocations: 13.55 KiB) -# 1.25e5: 899.200 ÎŒs (81 allocations: 8.22 KiB) -@time EvoTrees.update_hist!(params_c.loss, nodes[1].h, Ύ𝑀, X_bin, 𝑖, 𝑗, K) -@btime EvoTrees.update_hist!($params_c.loss, $nodes[1].h, $Ύ𝑀, $X_bin, $𝑖, $𝑗, $K) -@btime EvoTrees.update_hist!($nodes[1].h, $Ύ𝑀, $X_bin, $nodes[1].𝑖, $𝑗) -@code_warntype EvoTrees.update_hist!(hist, ÎŽ, X_bin, 𝑖, 𝑗, 𝑛) - -j = 1 -# 8.399 ÎŒs (80 allocations: 13.42 KiB) -n = 1 -nodes[1].∑ .= vec(sum(Ύ𝑀[:, 𝑖], dims=2)) -EvoTrees.update_gains!(params_c.loss, nodes[n], 𝑗, params_c, K) -nodes[1].gains -# findmax(nodes[1].gains) #1.25e5: 36.500 ÎŒs (81 allocations: 8.22 KiB) -@btime EvoTrees.update_gains!($params_c.loss, $nodes[n], $𝑗, $params_c, $K) -@code_warntype EvoTrees.update_gains!(params_c.loss, nodes[n], 𝑗, params_c, K) - -#1.25e5: 14.100 ÎŒs (1 allocation: 32 bytes) -best = findmax(nodes[n].gains) -@btime best = findmax(nodes[n].gains) -@btime best = findmax(view(nodes[n].gains, :, 𝑗)) - -tree.cond_bin[n] = best[2][1] -tree.feat[n] = best[2][2] - -Int.(tree.cond_bin[n]) -# tree.cond_bin[n] = 32 - -# 204.900 ÎŒs (1 allocation: 96 bytes) -offset = 0 -@time EvoTrees.split_set!(left, right, 𝑖, X_bin, tree.feat[n], tree.cond_bin[n], offset) -@btime EvoTrees.split_set!($left, $right, $𝑖, $X_bin, $tree.feat[n], $tree.cond_bin[n], $offset) -@code_warntype EvoTrees.split_set!(left, right, 𝑖, X_bin, tree.feat[n], tree.cond_bin[n]) - -# 1.25e5: 227.200 ÎŒs (22 allocations: 1.44 KiB) -@time EvoTrees.split_set_threads!(out, left, right, 𝑖, X_bin, tree.feat[n], tree.cond_bin[n], offset) -@btime EvoTrees.split_set_threads!($out, $left, $right, $𝑖, $X_bin, $tree.feat[n], $tree.cond_bin[n], $offset, Int(2e15)) diff --git a/experiments/loss_gpu.jl b/experiments/loss_gpu.jl deleted file mode 100644 index 37cacad8..00000000 --- a/experiments/loss_gpu.jl +++ /dev/null @@ -1,38 +0,0 @@ -using CUDAnative -using CuArrays -using StaticArrays -using BenchmarkTools - -N = Int(1e6) -pred = rand(SVector{1, Float32}, N) -target = rand(Float32, N) -ÎŽ = zeros(SVector{1, Float32}, N) -Ύ² = zeros(SVector{1, Float32}, N) - -pred_g = CuArray(rand(Float32, N)) -target_g = CuArray(rand(Float32, N)) -ÎŽ_g = CuArray(zeros(Float32, N)) -Ύ²_g = CuArray(zeros(Float32, N)) - -pred = Array(pred_g) -target = Array(target_g) -ÎŽ = Array(ÎŽ_g) -Ύ² = Array(Ύ²_g) - -# linear -function update_grads!(pred::Vector{SVector{L,T}}, target::AbstractVector{T}, ÎŽ::Vector{SVector{L,T}}, Ύ²::Vector{SVector{L,T}}) where {T <: AbstractFloat, L} - @inbounds for i in eachindex(ÎŽ) - ÎŽ[i] = SVector(2 * (pred[i][1] - target[i])) - Ύ²[i] = SVector(2) - end -end - -# linear -function update_grads_gpu!(pred::AbstractVector{T}, target::AbstractVector{T}, ÎŽ::AbstractVector{T}, Ύ²::AbstractVector{T}, 𝑀::AbstractVector{T}) where {T <: AbstractFloat} - @. ÎŽ = 2f0 * (pred - target) * 𝑀 - @. 
Ύ² = 2f0 * 𝑀 - return -end - -@time update_grads!(pred, target, ÎŽ, Ύ²) -CuArrays.@time update_grads_gpu!(pred_g, target_g, ÎŽ_g, Ύ²_g) diff --git a/experiments/shuffling.jl b/experiments/shuffling.jl deleted file mode 100644 index 6d11a93e..00000000 --- a/experiments/shuffling.jl +++ /dev/null @@ -1,185 +0,0 @@ -using DataFrames -using Distributions -using EvoTrees -using LinearAlgebra -using GLM -using Random - -ÎŽ = 1.0e-6 -b = fill(1.0 - ÎŽ, 3, 3) + ÎŽ * I -z = zeros(3, 3) -y = fill(0.5, 3) -dist = MvNormal([ - b z z 0.8*y - z b z y - z z b 1.2*y - 0.8*y' y' 1.2*y' 1.0]) -Random.seed!(1) -mat = rand(dist, 10_000); -df = DataFrame(transpose(mat), [string.("x", 1:9); "y"]); -target_name = "y" - -################################# -# Tables API -################################# -config = EvoTreeRegressor(rng=123) -m1 = fit_evotree(config, - df; - target_name="y", - verbosity=0); -EvoTrees.importance(m1) - -config = EvoTreeRegressor(rng=124) -m2 = fit_evotree(config, - df; - target_name="y", - verbosity=0); -EvoTrees.importance(m2) - -################################# -# Tables API - GPU -################################# -config = EvoTreeRegressor(rowsample=0.5, rng=123) -m1 = fit_evotree(config, - df; - device="gpu", - target_name="y", - verbosity=0); -EvoTrees.importance(m1) - -config = EvoTreeRegressor(rowsample=0.5, rng=124) -m2 = fit_evotree(config, - df; - target_name="y", - verbosity=0); -EvoTrees.importance(m2) - - -# permuted tables doesn't return the same result - numerical rounding error? -df2 = df[!, 10:-1:1] -config = EvoTreeRegressor() -m3 = fit_evotree(config, - df2; - target_name="y", - verbosity=0); -EvoTrees.importance(m3) - -# manual check on col permutations -config = EvoTreeRegressor(max_depth=4) -m1, cache1 = EvoTrees.init(config, df; target_name); -EvoTrees.grow_evotree!(m1, cache1, config, EvoTrees.CPU) -EvoTrees.importance(m1) - -df2 = df[!, 10:-1:1]; -config = EvoTreeRegressor(max_depth=4) -m2, cache2 = EvoTrees.init(config, df2; target_name); -EvoTrees.grow_evotree!(m2, cache2, config, EvoTrees.CPU) -EvoTrees.importance(m2) - -all(cache1.x_bin .== cache2.x_bin[:, 9:-1:1]) -all(cache1.edges .== cache2.edges[9:-1:1]) -m1.trees[2] -m2.trees[2] - -m1.trees[2].feat -m2.trees[2].feat - -Int.(m1.trees[2].cond_bin) -Int.(m2.trees[2].cond_bin) - - -config = EvoTreeRegressor(nrounds=100, eta=0.05, colsample=1.0) -m3 = fit_evotree(config, - df; - target_name="y", - verbosity=0); -EvoTrees.importance(m3) - -################################# -# Tables API -################################# -config = EvoTreeRegressor(colsample=0.5, rng=123) -m1 = fit_evotree(config, - df; - target_name="y", - verbosity=0); -EvoTrees.importance(m1) - -m2 = fit_evotree(config, - df; - target_name="y", - verbosity=0); -EvoTrees.importance(m2) - -################################# -# Matrix API -################################# -x_train = Matrix(mat[1:9, :]') -y_train = mat[10, :] - -config = EvoTreeRegressor() -m1 = fit_evotree(config; - x_train, - y_train, - verbosity=0); -EvoTrees.importance(m1) - -m2 = fit_evotree(config; - x_train, - y_train, - verbosity=0); -EvoTrees.importance(m2) - -using GLM -x_train = Matrix(mat[1:9, :]') -y_train = mat[10, :] -lm(x_train, y_train) - -################################# -# Matrix debug API -################################# -x_train = Matrix(mat[1:9, :]') -y_train = mat[10, :] - -config = EvoTreeRegressor() -m1, cache1 = EvoTrees.init(config, x_train, y_train); -EvoTrees.grow_evotree!(m1, cache1, config, EvoTrees.CPU) -EvoTrees.importance(m1) - -m2, 
cache2 = EvoTrees.init(config, x_train, y_train); -EvoTrees.grow_evotree!(m2, cache2, config, EvoTrees.CPU) -EvoTrees.importance(m2) - - -using BenchmarkTools -using Random: rand! -using CUDA -nobs = 1_000_000 -rng1 = Random.MersenneTwister(123) -rng2 = Random.Xoshiro(123) -rng3 = Random.TaskLocalRNG() - -is_in = zeros(UInt32, nobs); -is_out = zeros(UInt32, nobs); -mask = zeros(UInt8, nobs); - -@btime EvoTrees.subsample(is_in, is_out, mask, 0.5, rng1); -@btime EvoTrees.get_rand!(rng1, mask); - -@btime EvoTrees.subsample(is_in, is_out, mask, 0.5, rng2); -@btime EvoTrees.get_rand!(rng2, mask); - -@btime EvoTrees.subsample(is_in, is_out, mask, 0.5, rng3); -@btime EvoTrees.get_rand!(rng3, mask); - -get_rand2!(rng, mask) = rand!(rng, mask) -@btime get_rand2!(rng1, mask); -@btime get_rand2!(rng2, mask); -@btime get_rand2!(rng3, mask); - -mask = CUDA.zeros(UInt8, nobs); -CUDA.@time EvoTrees.get_rand_gpu!(mask); -CUDA.@time CUDA.rand(UInt8, nobs); -# @btime EvoTrees.get_rand_gpu!(mask); -# @btime CUDA.rand(UInt8, nobs); - diff --git a/experiments/update_set_v1.jl b/experiments/update_set_v1.jl deleted file mode 100644 index 70c2fa7c..00000000 --- a/experiments/update_set_v1.jl +++ /dev/null @@ -1,277 +0,0 @@ -using Statistics -using StatsBase:sample -using Base.Threads:@threads -using BenchmarkTools -using Revise -using EvoTrees - -n_obs = Int(1e6) -n_vars = 100 -n_bins = 255 -𝑖 = collect(1:n_obs); -𝑗 = collect(1:n_vars); -ÎŽ = rand(n_obs); -Ύ² = rand(n_obs); - -hist_ÎŽ = zeros(n_bins, n_vars); -hist_Ύ² = zeros(n_bins, n_vars); -X_bin = rand(UInt8, n_obs, n_vars); - -# split row ids into left and right based on best split condition -function update_set_1(set, best, x_bin) - left = similar(set) - right = similar(set) - left_count = 0 - right_count = 0 - @inbounds for i in set - if x_bin[i] <= best - left_count += 1 - left[left_count] = i - else - right_count += 1 - right[right_count] = i - end - end - resize!(left, left_count) - resize!(right, right_count) - return left, right -end - -@time update_set_1(𝑖, 16, X_bin[:,1]); -@btime update_set_1($𝑖, 16, $X_bin[:,1]); -@btime update_set_1($𝑖, 64, $X_bin[:,1]); -@btime update_set_1($𝑖, 128, $X_bin[:,1]); -@btime update_set_1($𝑖, 240, $X_bin[:,1]); - -# add a leaf id update - to indicate to which leaf the set is associated -function update_set_2!(leaf_vec::Vector{T}, set, best_feat, best_cond, x_bin, depth::T) where {T} - @inbounds for i in set - left_id = leaf_vec[i] + 2^depth - right_id = left_id + 1 - x_bin[i, best_feat[leaf_vec[i]]] <= best_cond[leaf_vec[i]] ? 
leaf_vec[i] = left_id : leaf_vec[i] = right_id - end -end - -leaf_vec = ones(UInt16, n_obs); -leaf_id = 0 -depth = UInt16(1) -depth = 1 -best_feat = UInt16.(sample(1:100, 100000)) -best_cond = rand(UInt16, 100000); - -@time update_set_2!(leaf_vec, 𝑖, best_feat, best_cond, X_bin, depth); -@btime update_set_2!($leaf_vec, $𝑖, $best_feat, $best_cond, $X_bin, $depth); -Int.(leaf_vec) - - -# split row ids into left and right based on best split condition -function split_set_1!(left::V, right::V, 𝑖, X_bin::Matrix{S}, feat, cond_bin, offset) where {S,V} - - left_count = 0 - right_count = 0 - - @inbounds for i in 1:length(𝑖) - @inbounds if X_bin[𝑖[i], feat] <= cond_bin - left_count += 1 - left[offset + left_count] = 𝑖[i] - else - right[offset + length(𝑖) - right_count] = 𝑖[i] - right_count += 1 - end - end - # return (left[1:left_count], right[1:right_count]) - return (view(left, (offset + 1):(offset + left_count)), view(right, (offset + length(𝑖)):-1:(offset + left_count + 1))) - # return nothing -end - -n = Int(1e6) -nvars = 100 -nbins = 64 -𝑖 = collect(1:n) -𝑗 = collect(1:nvars) -X_bin = reshape(sample(UInt8.(1:nbins), n * nvars), n, nvars) -left = similar(𝑖) -right = similar(𝑖) - -𝑖 = sample(𝑖, Int(5e5), replace=false, ordered=true) - -offset = 0 -feat = 15 -cond_bin = 32 -@time lid2, rid2 = split_set_1!(left, right, 𝑖, X_bin, feat, cond_bin, offset) -@btime split_set_1!($left, $right, $𝑖, $X_bin, $feat, $cond_bin, $offset) -@code_warntype split_set_1!(left, right, 𝑖, X_bin, feat, cond_bin, offset) - -offset = 0 -feat = 15 -cond_bin = 32 -lid1, rid1 = split_set_1!(left, right, 𝑖, X_bin, feat, cond_bin, offset) -offset = 0 -feat = 14 -cond_bin = 12 -lid2, rid2 = split_set_1!(left, right, lid1, X_bin, feat, cond_bin, offset) -offset = + length(lid1) -feat = 14 -cond_bin = 12 -lid3, rid3 = split_set_1!(left, right, rid1, X_bin, feat, cond_bin, offset) - -lid1_ = deepcopy(lid1) - -𝑖 -unique(vcat(lid1, rid1)) -unique(vcat(lid1)) -unique(vcat(rid1)) -unique(sort(vcat(lid2, rid2))) -unique(sort(vcat(lid3, rid3))) -unique(sort(vcat(lid2, rid2, lid3, rid3))) - -# split row ids into left and right based on best split condition -function split_set_2!(left, right, 𝑖, x_bin, feat, cond_bin) - - left_count = 0 - right_count = 0 - - @inbounds for i in 1:length(𝑖) - if x_bin[i] <= cond_bin - left_count += 1 - left[left_count] = 𝑖[i] - else - right_count += 1 - right[right_count] = 𝑖[i] - end - end - # return (left[1:left_count], right[1:right_count]) - return (view(left, 1:left_count), view(right, 1:right_count)) - # return nothing -end - -n = Int(1e6) -nvars = 100 -nbins = 64 -𝑖 = collect(1:n) -𝑗 = collect(1:nvars) -X_bin = reshape(sample(UInt8.(1:nbins), n * nvars), n, nvars) -left = similar(𝑖) -right = similar(𝑖) - -feat = 15 -cond_bin = 32 -@time left, right = split_set_2!(left, right, 𝑖, X_bin[:,feat], feat, cond_bin) -@btime split_set_2!($left, $right, $𝑖, $X_bin[:,feat], $feat, $cond_bin) -@btime split_set_2!($left, $right, $𝑖, $view(X_bin, :, feat), $feat, $cond_bin) - - - -# function split_set_bool!(child_bool::AbstractVector{Bool}, 𝑖, X_bin::Matrix{S}, feat, cond_bin, offset) where {S} -# left_count = 0 -# right_count = 0 -# @inbounds for i in eachindex(𝑖) -# child_bool[i + offset] = X_bin[𝑖[i], feat] <= cond_bin -# end -# # return (view(𝑖, child_bool[offset + 1:offset + length(𝑖)]), view(𝑖, .!child_bool[offset + 1:offset + length(𝑖)])) -# # return (view(𝑖, view(child_bool, (offset + 1):(offset + length(𝑖)))), view(𝑖, view(child_bool, (offset + 1):(offset + length(𝑖))))) -# # return (view(𝑖, 
child_bool), view(𝑖, child_bool)) -# return view(𝑖, child_bool) -# end - -function split_set_chunk!(left, right, block, bid, X_bin, feat, cond_bin, offset, chunk_size, lefts, rights, bsizes) - left_count = 0 - right_count = 0 - @inbounds for i in eachindex(block) - @inbounds if X_bin[block[i], feat] <= cond_bin - left_count += 1 - left[offset + chunk_size * (bid - 1) + left_count] = block[i] - else - right_count += 1 - right[offset + chunk_size * (bid - 1) + right_count] = block[i] - # right[offset + chunk_size * (bid - 1) + length(block) - right_count] = block[i] - # right_count += 1 - end - end - lefts[bid] = left_count - rights[bid] = right_count - bsizes[bid] = length(block) - return nothing -end - -function split_set_threads!(out, left, right, 𝑖, X_bin::Matrix{S}, feat, cond_bin, offset, chunk_size=2^14) where {S} - - left_count = 0 - right_count = 0 - iter = Iterators.partition(𝑖, chunk_size) - nblocks = length(iter) - lefts = zeros(Int, nblocks) - rights = zeros(Int, nblocks) - bsizes = zeros(Int, nblocks) - - @sync for (bid, block) in enumerate(iter) - Threads.@spawn split_set_chunk!(left, right, block, bid, X_bin, feat, cond_bin, offset, chunk_size, lefts, rights, bsizes) - end - - left_sum = sum(lefts) - left_cum = 0 - right_cum = 0 - @inbounds for bid in 1:nblocks - view(out, offset + left_cum + 1:offset + left_cum + lefts[bid]) .= view(left, offset + chunk_size * (bid - 1) + 1:offset + chunk_size * (bid - 1) + lefts[bid]) - view(out, offset + left_sum + right_cum + 1:offset + left_sum + right_cum + rights[bid]) .= view(right, offset + chunk_size * (bid - 1) + 1:offset + chunk_size * (bid - 1) + rights[bid]) - - # out[offset + left_cum + 1:offset + left_cum + lefts[bid]] .= left[offset + chunk_size * (bid - 1) + 1:offset + chunk_size * (bid - 1) + lefts[bid]] - # out[offset + left_sum + right_cum + 1:offset + left_sum + right_cum + rights[bid]] .= right[offset + chunk_size * (bid - 1) + 1:offset + chunk_size * (bid - 1) + rights[bid]] - - # view(right, offset + right_cum + 1:offset + right_cum + rights[bid]) .= view(right, offset + chunk_size * (bid - 1) + 1:offset + chunk_size * (bid - 1) + rights[bid]) - # view(right, offset + length(𝑖) - right_cum:-1:offset + length(𝑖) - right_cum - rights[bid] + 1) .= view(right, offset + chunk_size * (bid - 1) + bsizes[bid]:-1:offset + chunk_size * (bid - 1) + lefts[bid]+1) - left_cum += lefts[bid] - right_cum += rights[bid] - end - - # right_cum = 0 - # @inbounds for bid in nblocks:-1:1 - # # view(right, offset + right_cum + 1:offset + right_cum + rights[bid]) .= view(right, offset + chunk_size * (bid - 1) + 1:offset + chunk_size * (bid - 1) + rights[bid]) - # view(right, offset + length(𝑖) - right_cum:-1:offset + length(𝑖) - right_cum - rights[bid] + 1) .= view(right, offset + chunk_size * (bid - 1) + lefts[bid] + 1:offset + chunk_size * (bid - 1) + bsizes[bid]) - # # right_cum += rights[bid] - # end - - return (view(out, offset + 1:offset + sum(lefts)), view(out, offset + sum(lefts)+1:offset + length(𝑖))) - # return (lefts, rights, out) - # return (view(left, offset + 1:offset + sum(lefts)), view(right, offset + length(𝑖):-1:offset + sum(lefts) + 1)) - # return (left[offset + 1:offset + sum(lefts)], right[offset + 1:offset + sum(rights)]) -end - -iter = Iterators.partition(rand(5), 3) -for i in enumerate(iter) - println(i) -end - -n = Int(1e6) -nvars = 100 -nbins = 64 -𝑖 = collect(1:n); -𝑗 = collect(1:nvars); -X_bin = reshape(sample(UInt8.(1:nbins), n * nvars), n, nvars); -𝑖 = sample(𝑖, Int(5e5), replace=false, ordered=true); -child_bool 
= zeros(Bool, length(𝑖)); -left = similar(𝑖) -right = similar(𝑖) -out = similar(𝑖) - -offset = 0 -feat = 15 -cond_bin = 32 -@time l, r = split_set_threads!(out, left, right, 𝑖, X_bin, feat, cond_bin, offset, 2^14); -@btime split_set_threads!($out, $left, $right, $𝑖, $X_bin, $feat, $cond_bin, $offset, 2^14); -@code_warntype split_set_1!(left, right, 𝑖, X_bin, feat, cond_bin, offset) - -offset = 0 -feat = 15 -cond_bin = 32 -lid1, rid1 = split_set_threads!(out, left, right, 𝑖, X_bin, feat, cond_bin, offset) -offset = 0 -feat = 14 -cond_bin = 12 -lid2, rid2 = split_set_threads!(out, left, right, lid1, X_bin, feat, cond_bin, offset) -offset = + length(lid1) -feat = 14 -cond_bin = 12 -lid3, rid3 = split_set_threads!(out, left, right, rid1, X_bin, feat, cond_bin, offset) - -lid1_ = deepcopy(lid1) diff --git a/experiments/update_set_v2.jl b/experiments/update_set_v2.jl deleted file mode 100644 index 43f3c2ca..00000000 --- a/experiments/update_set_v2.jl +++ /dev/null @@ -1,101 +0,0 @@ -using Statistics -using StatsBase:sample -using Base.Threads:@threads -using BenchmarkTools -using Revise -using EvoTrees - -n_obs = Int(1e6) -n_vars = 100 -n_bins = 255 -𝑖 = collect(1:n_obs); -𝑗 = collect(1:n_vars); -X_bin = rand(UInt8, n_obs, n_vars); - -function split_set_chunk!(left, right, block, bid, X_bin, feat, cond_bin, offset, chunk_size, lefts, rights, bsizes) - left_count = 0 - right_count = 0 - @inbounds for i in eachindex(block) - @inbounds if X_bin[block[i], feat] <= cond_bin - left_count += 1 - left[offset + chunk_size * (bid - 1) + left_count] = block[i] - else - right_count += 1 - right[offset + chunk_size * (bid - 1) + right_count] = block[i] - # right[offset + chunk_size * (bid - 1) + length(block) - right_count] = block[i] - # right_count += 1 - end - end - lefts[bid] = left_count - rights[bid] = right_count - bsizes[bid] = length(block) - return nothing -end - -function split_set_threads!(out, left, right, 𝑖, X_bin::Matrix{S}, feat, cond_bin, offset, chunk_size=2^14) where {S} - - left_count = 0 - right_count = 0 - iter = Iterators.partition(𝑖, chunk_size) - nblocks = length(iter) - lefts = zeros(Int, nblocks) - rights = zeros(Int, nblocks) - bsizes = zeros(Int, nblocks) - - @sync for (bid, block) in enumerate(iter) - Threads.@spawn split_set_chunk!(left, right, block, bid, X_bin, feat, cond_bin, offset, chunk_size, lefts, rights, bsizes) - end - - left_sum = sum(lefts) - left_cum = 0 - right_cum = 0 - @inbounds for bid in 1:nblocks - view(out, offset + left_cum + 1:offset + left_cum + lefts[bid]) .= view(left, offset + chunk_size * (bid - 1) + 1:offset + chunk_size * (bid - 1) + lefts[bid]) - view(out, offset + left_sum + right_cum + 1:offset + left_sum + right_cum + rights[bid]) .= view(right, offset + chunk_size * (bid - 1) + 1:offset + chunk_size * (bid - 1) + rights[bid]) - left_cum += lefts[bid] - right_cum += rights[bid] - end - return (view(out, offset + 1:offset + sum(lefts)), view(out, offset + sum(lefts)+1:offset + length(𝑖))) -end - -iter = Iterators.partition(rand(5), 3) -for i in enumerate(iter) - println(i) -end - -n = Int(1e6) -nvars = 100 -nbins = 64 -𝑖 = collect(1:n); -𝑗 = collect(1:nvars); -X_bin = reshape(sample(UInt8.(1:nbins), n * nvars), n, nvars); -𝑖 = sample(𝑖, Int(5e5), replace=false, ordered=true); -child_bool = zeros(Bool, length(𝑖)); -left = similar(𝑖) -right = similar(𝑖) -out = similar(𝑖) - -offset = 0 -feat = 15 -cond_bin = 32 -@time l, r = split_set_threads!(out, left, right, 𝑖, X_bin, feat, cond_bin, offset, 2^14); -@btime split_set_threads!($out, $left, $right, 
$𝑖, $X_bin, $feat, $cond_bin, $offset, 2^14); -@code_warntype split_set_1!(left, right, 𝑖, X_bin, feat, cond_bin, offset) - -offset = 0 -feat = 15 -cond_bin = 32 -lid1, rid1 = split_set_threads!(out, left, right, 𝑖, X_bin, feat, cond_bin, offset) -offset = 0 -feat = 14 -cond_bin = 12 -lid2, rid2 = split_set_threads!(out, left, right, lid1, X_bin, feat, cond_bin, offset) -offset = + length(lid1) -feat = 14 -cond_bin = 12 -lid3, rid3 = split_set_threads!(out, left, right, rid1, X_bin, feat, cond_bin, offset) - -lid1_ = deepcopy(lid1) - - - diff --git a/ext/EvoTreesCUDAExt/fit-utils.jl b/ext/EvoTreesCUDAExt/fit-utils.jl index 177e4fde..5dd6ae86 100644 --- a/ext/EvoTreesCUDAExt/fit-utils.jl +++ b/ext/EvoTreesCUDAExt/fit-utils.jl @@ -26,17 +26,18 @@ end function update_hist_gpu!(h, h∇_cpu, h∇, ∇, x_bin, is, js, jsc) kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js) config = launch_configuration(kernel.fun) - max_threads = config.threads ÷ 4 - max_blocks = config.blocks * 4 + max_threads = config.threads + max_blocks = config.blocks k = size(h∇, 1) ty = max(1, min(length(js), fld(max_threads, k))) - tx = min(64, max(1, min(length(is), fld(max_threads, k * ty)))) + tx = max(1, min(length(is), fld(max_threads, k * ty))) threads = (k, ty, tx) by = cld(length(js), ty) bx = min(cld(max_blocks, by), cld(length(is), tx)) blocks = (1, by, bx) h∇ .= 0 kernel(h∇, ∇, x_bin, is, js; threads, blocks) + CUDA.synchronize() copyto!(h∇_cpu, h∇) Threads.@threads for j in jsc nbins = size(h[j], 2)
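
################################################################################
# Editor's note (appended, not part of the patch): the ext/EvoTreesCUDAExt/fit-utils.jl
# hunk above lifts the previous launch-budget haircut (config.threads ÷ 4,
# config.blocks * 4, and the 64-thread cap on tx) and adds an explicit
# CUDA.synchronize() between the kernel launch and the device-to-host copyto!,
# presumably so the host copy cannot start before the histogram kernel has finished.
# Below is a minimal validation sketch, assuming a working CUDA device and that
# hist_kernel! and update_hist_gpu! from experiments/hist/hist_gpu-ref.jl are in
# scope; hist_cpu_ref! is an illustrative helper, not part of the EvoTrees API.
################################################################################

using CUDA
using StatsBase: sample

# naive CPU reference: accumulate the 3 gradient statistics per (bin, feature) cell
function hist_cpu_ref!(h∇, ∇, x_bin, is, js)
    for j in js, i in is
        bin = x_bin[i, j]
        for k in axes(∇, 1)
            h∇[k, bin, j] += ∇[k, i]
        end
    end
    return nothing
end

nbins, nfeats, nobs = 32, 10, 10_000
x_bin = UInt8.(rand(1:nbins, nobs, nfeats))
∇ = rand(Float32, 3, nobs)
is = sample(1:nobs, nobs ÷ 2, replace=false, ordered=true)
js = sample(1:nfeats, nfeats ÷ 2, replace=false, ordered=true)

h∇_ref = zeros(Float32, 3, nbins, nfeats)
hist_cpu_ref!(h∇_ref, ∇, x_bin, is, js)

h = [zeros(Float32, 3, nbins) for _ in 1:nfeats]
h∇_cpu = zeros(Float32, 3, nbins, nfeats)
update_hist_gpu!(h, h∇_cpu, CuArray(h∇_cpu), CuArray(∇), CuArray(x_bin), CuArray(is), CuArray(js), js)

# atomic float adds reorder the summation, so compare with isapprox
@assert h∇_cpu ≈ h∇_ref

# The 3-d launch maps threadIdx().x to the 3 gradient statistics, so the three
# atomic_add!s of one observation hit contiguous (k, bin, feat) slots of h∇.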