diff --git a/LocalPreferences.toml b/LocalPreferences.toml
deleted file mode 100644
index f5f4c10..0000000
--- a/LocalPreferences.toml
+++ /dev/null
@@ -1,4 +0,0 @@
-[CUDA]
-# whether to use a nonblocking synchronization mechanism,
-# making it possible to do use cooperative multitasking.
-nonblocking_synchronization = false
\ No newline at end of file
diff --git a/benchmarks/regressor.jl b/benchmarks/regressor.jl
index e7514f3..e0d35b9 100644
--- a/benchmarks/regressor.jl
+++ b/benchmarks/regressor.jl
@@ -19,7 +19,7 @@ import CUDA
 #threads
 # laptop depth 6: 12.717845 seconds (2.08 M allocations: 466.228 MiB)
 
-nobs = Int(1e6)
+nobs = Int(10e6)
 num_feat = Int(100)
 nrounds = 200
 max_depth = 6
@@ -55,7 +55,7 @@ params_xgb = Dict(
     :print_every_n => 5,
     :subsample => 0.5,
     :colsample_bytree => 0.5,
-    :tree_method => "hist", # hist/gpu_hist
+    :tree_method => "gpu_hist", # hist/gpu_hist
     :max_bin => 64,
 )
 
diff --git a/experiments/hist/perf-gpu.jl b/experiments/hist/perf-gpu.jl
index 9ccf1bc..d7b094b 100644
--- a/experiments/hist/perf-gpu.jl
+++ b/experiments/hist/perf-gpu.jl
@@ -48,13 +48,77 @@ function update_hist_gpu!(h, h∇, ∇, x_bin, is, js, jsc)
     h∇ .= 0
     kernel(h∇, ∇, x_bin, is, js; threads, blocks)
     CUDA.synchronize()
-    # CUDA.@sync for j in jsc
-    #     nbins = size(h[j], 2)
-    #     copyto!(h[j], view(h∇, :, 1:nbins, j))
-    # end
+    CUDA.@sync for j in jsc
+        nbins = size(h[j], 2)
+        copyto!(h[j], view(h∇, :, 1:nbins, j))
+    end
     return nothing
 end
 
+function update_hist_gpu1!(h, h∇, ∇, x_bin, is, js, jsc)
+    kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js)
+    config = launch_configuration(kernel.fun)
+    max_threads = config.threads ÷ 4
+    max_blocks = config.blocks * 4
+    k = size(h∇, 1)
+    ty = max(1, min(length(js), fld(max_threads, k)))
+    tx = min(64, max(1, min(length(is), fld(max_threads, k * ty))))
+    threads = (k, ty, tx)
+    by = cld(length(js), ty)
+    bx = min(cld(max_blocks, by), cld(length(is), tx))
+    blocks = (1, by, bx)
+    h∇ .= 0
+    kernel(h∇, ∇, x_bin, is, js; threads, blocks)
+    CUDA.synchronize()
+    return nothing
+end
+
+function update_hist_gpu2!(h, h∇_cpu, h∇, ∇, x_bin, is, js, jsc)
+    kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js)
+    config = launch_configuration(kernel.fun)
+    max_threads = config.threads ÷ 4
+    max_blocks = config.blocks * 4
+    k = size(h∇, 1)
+    ty = max(1, min(length(js), fld(max_threads, k)))
+    tx = min(64, max(1, min(length(is), fld(max_threads, k * ty))))
+    threads = (k, ty, tx)
+    by = cld(length(js), ty)
+    bx = min(cld(max_blocks, by), cld(length(is), tx))
+    blocks = (1, by, bx)
+    h∇ .= 0
+    kernel(h∇, ∇, x_bin, is, js; threads, blocks)
+    copyto!(h∇_cpu, h∇)
+    CUDA.synchronize()
+    return nothing
+end
+
+
+function update_hist_gpu3!(h, h∇_cpu, h∇, ∇, x_bin, is, js, jsc)
+    kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js)
+    config = launch_configuration(kernel.fun)
+    max_threads = config.threads ÷ 4
+    max_blocks = config.blocks * 4
+    k = size(h∇, 1)
+    ty = max(1, min(length(js), fld(max_threads, k)))
+    tx = min(64, max(1, min(length(is), fld(max_threads, k * ty))))
+    threads = (k, ty, tx)
+    by = cld(length(js), ty)
+    bx = min(cld(max_blocks, by), cld(length(is), tx))
+    blocks = (1, by, bx)
+    h∇ .= 0
+    kernel(h∇, ∇, x_bin, is, js; threads, blocks)
+    # CUDA.synchronize()
+    copyto!(h∇_cpu, h∇)
+    # CUDA.synchronize()
+    @threads for j in jsc
+        nbins = size(h[j], 2)
+        @views h[j] .= h∇_cpu[:, 1:nbins, j]
+        # h[j] .= h∇_cpu[:, 1:nbins, j]
+    end
+    return nothing
+end
+
+
 seed!(123)
 nbins = 32
 nfeats = 100
@@ -69,7 +133,8 @@ js = sample(1:nfeats, Int(round(rowsample * nfeats)), replace=false, ordered=true)
 
 ∇_gpu = CuArray(∇)
 x_bin_gpu = CuArray(x_bin)
-h∇_gpu = CUDA.zeros(Float32, 3, nbins, nfeats)
+h∇_cpu = zeros(Float32, 3, nbins, nfeats)
+h∇_gpu = CuArray(h∇_cpu)
 is_gpu = CuArray(is)
 js_gpu = CuArray(js)
 
@@ -89,10 +154,20 @@ CUDA.@time update_hist_gpu!(h∇, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js)
 
 # without copy
 # CUDA v4 1M: 2.599 ms (74 allocations: 4.64 KiB)
 # CUDA v5 1M: 2.274 ms (48 allocations: 2.77 KiB)
-@btime update_hist_gpu!(h∇, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js)
+@btime update_hist_gpu1!(h∇, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js)
+
+# with single bulk array copy
+# CUDA v4 1M:
+# CUDA v5 1M: 2.447 ms (48 allocations: 2.77 KiB)
+@btime update_hist_gpu2!(h∇, h∇_cpu, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js)
+
+# with single bulk array copy + threaded CPU per-feature copy
+# CUDA v4 1M:
+# CUDA v5 1M: 2.442 ms (48 allocations: 2.77 KiB)
+@btime update_hist_gpu3!(h∇, h∇_cpu, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js)
 
-using CUDA, BenchmarkTools
+using CUDA, BenchmarkTools
 function gpu_copy!(h, h∇, jsc)
     CUDA.@sync for j in jsc
         nbins = size(h[j], 2)
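Side note, not part of the patch: the three update_hist_gpu*! variants above differ only in how the device histogram reaches the per-feature CPU arrays: no transfer at all (gpu1), one bulk device-to-host copy (gpu2), and bulk copy plus threaded CPU-side slicing (gpu3). A minimal standalone sketch of the gpu3 pattern, with illustrative sizes and names mirroring the diff:

using CUDA, Base.Threads

nbins, nfeats, k = 32, 100, 3
h = [zeros(Float32, k, nbins) for _ in 1:nfeats]  # per-feature CPU histograms
h∇_cpu = zeros(Float32, k, nbins, nfeats)         # host staging buffer
h∇ = CUDA.rand(Float32, k, nbins, nfeats)         # stands in for the kernel output

copyto!(h∇_cpu, h∇)          # one synchronizing device-to-host transfer
@threads for j in 1:nfeats   # per-feature copies become cheap CPU work
    nb = size(h[j], 2)
    @views h[j] .= h∇_cpu[:, 1:nb, j]
end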
diff --git a/ext/EvoTreesCUDAExt/fit-utils.jl b/ext/EvoTreesCUDAExt/fit-utils.jl
index 31bc1f5..2533f78 100644
--- a/ext/EvoTreesCUDAExt/fit-utils.jl
+++ b/ext/EvoTreesCUDAExt/fit-utils.jl
@@ -23,7 +23,7 @@ function hist_kernel!(h∇::CuDeviceArray{T,3}, ∇::CuDeviceMatrix{S}, x_bin, is, js) where {T,S}
     return nothing
 end
 
-function update_hist_gpu!(h, h∇, ∇, x_bin, is, js, jsc)
+function update_hist_gpu!(h, h∇_cpu, h∇, ∇, x_bin, is, js, jsc)
     kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js)
     config = launch_configuration(kernel.fun)
     max_threads = config.threads ÷ 4
@@ -37,10 +37,10 @@ function update_hist_gpu!(h, h∇, ∇, x_bin, is, js, jsc)
     blocks = (1, by, bx)
     h∇ .= 0
     kernel(h∇, ∇, x_bin, is, js; threads, blocks)
-    CUDA.synchronize()
-    CUDA.@sync for j in jsc
+    copyto!(h∇_cpu, h∇)
+    Threads.@threads for j in jsc
         nbins = size(h[j], 2)
-        copyto!(h[j], view(h∇, :, 1:nbins, j))
+        @views h[j] .= h∇_cpu[:, 1:nbins, j]
     end
     return nothing
 end
diff --git a/ext/EvoTreesCUDAExt/fit.jl b/ext/EvoTreesCUDAExt/fit.jl
index f0e32cf..e4f4e33 100644
--- a/ext/EvoTreesCUDAExt/fit.jl
+++ b/ext/EvoTreesCUDAExt/fit.jl
@@ -21,6 +21,7 @@ function EvoTrees.grow_evotree!(evotree::EvoTree{L,K}, cache, params::EvoTrees.EvoTypes{L}
         cache.out,
         cache.left,
         cache.right,
+        cache.h∇_cpu,
         cache.h∇,
         cache.x_bin,
         cache.feattypes,
@@ -43,6 +44,7 @@ function grow_tree!(
     out,
     left,
     right,
+    h∇_cpu::Array{Float64,3},
     h∇::CuArray{Float64,3},
     x_bin::CuMatrix,
     feattypes::Vector{Bool},
@@ -87,7 +89,7 @@ function grow_tree!(
                 end
             end
         else
-            update_hist_gpu!(nodes[n].h, h∇, ∇, x_bin, nodes[n].is, jsg, js)
+            update_hist_gpu!(nodes[n].h, h∇_cpu, h∇, ∇, x_bin, nodes[n].is, jsg, js)
         end
     end
     Threads.@threads for n ∈ sort(n_current)
@@ -214,7 +216,7 @@ function grow_otree!(
                 end
             end
         else
-            update_hist_gpu!(nodes[n].h, h∇, ∇, x_bin, nodes[n].is, jsg, js)
+            update_hist_gpu!(nodes[n].h, h∇_cpu, h∇, ∇, x_bin, nodes[n].is, jsg, js)
        end
     end
     Threads.@threads for n ∈ n_current
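For reference, the threads/blocks arithmetic shared by all of these functions packs gradient components, sampled features, and sampled observations into a 3-D launch. A worked example under assumed occupancy numbers (1024 threads and 80 blocks reported by launch_configuration) and the 1M-row, rowsample = 0.5 setup of the experiment script; these concrete values are illustrative assumptions:

max_threads = 1024 ÷ 4   # budget a quarter of the reported thread capacity
max_blocks = 80 * 4      # and four times the reported block count
k, njs, nis = 3, 50, 500_000                              # ∇ rows, features, observations
ty = max(1, min(njs, fld(max_threads, k)))                # 50
tx = min(64, max(1, min(nis, fld(max_threads, k * ty))))  # 1
threads = (k, ty, tx)                        # (3, 50, 1): 150 threads per block
by = cld(njs, ty)                            # 1
bx = min(cld(max_blocks, by), cld(nis, tx))  # 320
blocks = (1, by, bx)                         # observations spread along the last grid dim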
diff --git a/ext/EvoTreesCUDAExt/init.jl b/ext/EvoTreesCUDAExt/init.jl
index 6a8dfcd..3a0b24c 100644
--- a/ext/EvoTreesCUDAExt/init.jl
+++ b/ext/EvoTreesCUDAExt/init.jl
@@ -63,7 +63,8 @@ function EvoTrees.init_core(params::EvoTrees.EvoTypes{L}, ::Type{<:EvoTrees.GPU}
     !isnothing(offset) && (pred .+= CuArray(offset'))
 
     # initialize gradients
-    h∇ = CUDA.zeros(Float64, 2 * K + 1, maximum(featbins), length(featbins))
+    h∇_cpu = zeros(Float64, 2 * K + 1, maximum(featbins), length(featbins))
+    h∇ = CuArray(h∇_cpu)
     ∇ = CUDA.zeros(T, 2 * K + 1, nobs)
     @assert (length(y) == length(w) && minimum(w) > 0)
     ∇[end, :] .= w
@@ -117,6 +118,7 @@ function EvoTrees.init_core(params::EvoTrees.EvoTypes{L}, ::Type{<:EvoTrees.GPU}
         right=right,
         ∇=∇,
         h∇=h∇,
+        h∇_cpu=h∇_cpu,
         fnames=fnames,
         edges=edges,
         featbins=featbins,
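Net effect of the init.jl change: the histogram buffer now exists as a host/device pair that travels together in the fit cache, so every histogram update can finish with one bulk transfer. A minimal sketch of that pairing; the K and featbins values here are illustrative assumptions, not from the patch:

using CUDA

K = 1                     # single-output model
featbins = fill(32, 100)  # bins per feature
h∇_cpu = zeros(Float64, 2 * K + 1, maximum(featbins), length(featbins))
h∇ = CuArray(h∇_cpu)      # device mirror, zero-initialized from the host buffer

# Both ends ride in the cache so update_hist_gpu! can land the kernel output
# with a single bulk copyto!(h∇_cpu, h∇) instead of one copy per feature.
cache = (; h∇, h∇_cpu)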