fix GPU perf regression on CUDA v5
jeremiedb committed Oct 12, 2023
1 parent fd78817 commit c805754
Showing 6 changed files with 95 additions and 20 deletions.
4 changes: 0 additions & 4 deletions LocalPreferences.toml

This file was deleted.

4 changes: 2 additions & 2 deletions benchmarks/regressor.jl
@@ -19,7 +19,7 @@ import CUDA
#threads
# laptop depth 6: 12.717845 seconds (2.08 M allocations: 466.228 MiB)

nobs = Int(1e6)
nobs = Int(10e6)
num_feat = Int(100)
nrounds = 200
max_depth = 6
@@ -55,7 +55,7 @@ params_xgb = Dict(
:print_every_n => 5,
:subsample => 0.5,
:colsample_bytree => 0.5,
:tree_method => "hist", # hist/gpu_hist
:tree_method => "gpu_hist", # hist/gpu_hist
:max_bin => 64,
)
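
The benchmark now exercises XGBoost's GPU path (`gpu_hist`) on 10M rows so the two libraries are compared on equal footing. For orientation, a Symbol-keyed Dict like `params_xgb` splats directly into keyword arguments; a hypothetical call-site sketch (the real invocation sits in an elided part of this script, and `x_train`/`y_train`/`nrounds` are assumed from its setup):

```julia
using XGBoost

# Sketch only: a Dict{Symbol,Any} splats into keyword arguments,
# so :tree_method => "gpu_hist" reaches the booster unchanged.
dtrain = DMatrix(x_train, y_train)
m_xgb = xgboost(dtrain; num_round=nrounds, params_xgb...)
```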

89 changes: 82 additions & 7 deletions experiments/hist/perf-gpu.jl
@@ -48,13 +48,77 @@ function update_hist_gpu!(h, h∇, ∇, x_bin, is, js, jsc)
h∇ .= 0
kernel(h∇, ∇, x_bin, is, js; threads, blocks)
CUDA.synchronize()
# CUDA.@sync for j in jsc
# nbins = size(h[j], 2)
# copyto!(h[j], view(h∇, :, 1:nbins, j))
# end
CUDA.@sync for j in jsc
nbins = size(h[j], 2)
copyto!(h[j], view(h∇, :, 1:nbins, j))
end
return nothing
end
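
This baseline is where the CUDA.jl v5 regression was suspected: the `CUDA.@sync for j in jsc` loop issues one small device-to-host `copyto!` per sampled feature, paying transfer and synchronization overhead once per feature. The numbered variants below isolate the pieces: `gpu1` drops the copies entirely, `gpu2` replaces them with a single bulk transfer, and `gpu3` adds a threaded CPU-side scatter. A toy illustration of the two transfer patterns (standalone sketch; names and sizes are illustrative, not from this repo):

```julia
using CUDA

d  = CUDA.rand(Float32, 3, 32, 100)          # device histogram (k, nbins, nfeats)
h  = [zeros(Float32, 3, 32) for _ in 1:100]  # per-feature host histograms
hc = zeros(Float32, 3, 32, 100)              # host mirror for the bulk pattern

# (a) per-feature: 100 small D→H transfers, each carrying its own overhead
for j in 1:100
    copyto!(h[j], view(d, :, :, j))
end

# (b) bulk: one D→H transfer, then cheap host-side slicing
copyto!(hc, d)
for j in 1:100
    @views h[j] .= hc[:, :, j]
end
```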

function update_hist_gpu1!(h, h∇, ∇, x_bin, is, js, jsc)
kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js)
config = launch_configuration(kernel.fun)
max_threads = config.threads ÷ 4
max_blocks = config.blocks * 4
k = size(h∇, 1)
ty = max(1, min(length(js), fld(max_threads, k)))
tx = min(64, max(1, min(length(is), fld(max_threads, k * ty))))
threads = (k, ty, tx)
by = cld(length(js), ty)
bx = min(cld(max_blocks, by), cld(length(is), tx))
blocks = (1, by, bx)
h∇ .= 0
kernel(h∇, ∇, x_bin, is, js; threads, blocks)
CUDA.synchronize()
return nothing
end
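
To make the launch-shape arithmetic concrete, a worked example with assumed occupancy numbers (`launch_configuration` returns hardware-dependent values, so these are illustrative only):

```julia
# Suppose config.threads = 1024 and config.blocks = 16 on a given GPU,
# with k = 3 (grad, hess, weight), length(js) = 50, length(is) = 1_000_000:
max_threads = 1024 ÷ 4                                   # 256
max_blocks  = 16 * 4                                     # 64
ty = max(1, min(50, fld(256, 3)))                        # 50 → one column per feature
tx = min(64, max(1, min(1_000_000, fld(256, 3 * 50))))   # 1
# threads = (3, 50, 1): 150 threads per block
# by = cld(50, 50) = 1; bx = min(cld(64, 1), cld(1_000_000, 1)) = 64
# blocks = (1, 1, 64): observation-level parallelism comes from bx
```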

function update_hist_gpu2!(h, h∇_cpu, h∇, ∇, x_bin, is, js, jsc)
kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js)
config = launch_configuration(kernel.fun)
max_threads = config.threads ÷ 4
max_blocks = config.blocks * 4
k = size(h∇, 1)
ty = max(1, min(length(js), fld(max_threads, k)))
tx = min(64, max(1, min(length(is), fld(max_threads, k * ty))))
threads = (k, ty, tx)
by = cld(length(js), ty)
bx = min(cld(max_blocks, by), cld(length(is), tx))
blocks = (1, by, bx)
h∇ .= 0
kernel(h∇, ∇, x_bin, is, js; threads, blocks)
copyto!(h∇_cpu, h∇)
CUDA.synchronize()
return nothing
end


function update_hist_gpu3!(h, h∇_cpu, h∇, ∇, x_bin, is, js, jsc)
kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js)
config = launch_configuration(kernel.fun)
max_threads = config.threads ÷ 4
max_blocks = config.blocks * 4
k = size(h∇, 1)
ty = max(1, min(length(js), fld(max_threads, k)))
tx = min(64, max(1, min(length(is), fld(max_threads, k * ty))))
threads = (k, ty, tx)
by = cld(length(js), ty)
bx = min(cld(max_blocks, by), cld(length(is), tx))
blocks = (1, by, bx)
h∇ .= 0
kernel(h∇, ∇, x_bin, is, js; threads, blocks)
# CUDA.synchronize()
copyto!(h∇_cpu, h∇)
# CUDA.synchronize()
@threads for j in jsc
nbins = size(h[j], 2)
@views h[j] .= h∇_cpu[:, 1:nbins, j]
# h[j] .= h∇_cpu[:, 1:nbins, j]
end
return nothing
end
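
`update_hist_gpu3!` is the candidate fix: one bulk transfer into a pre-allocated host mirror, then a multi-threaded CPU scatter into the per-feature histograms. A cheap sanity check is to confirm it matches the original per-feature copies (sketch, assuming the arrays set up below; the script-level `h∇` here is the vector of host histograms passed as `h`):

```julia
# Both variants should leave identical per-feature histograms in h∇.
update_hist_gpu!(h∇, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js)
h_ref = deepcopy(h∇)
update_hist_gpu3!(h∇, h∇_cpu, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js)
@assert all(h_ref[j] ≈ h∇[j] for j in js)
```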


seed!(123)
nbins = 32
nfeats = 100
@@ -69,7 +133,8 @@ js = sample(1:nfeats, Int(round(rowsample * nfeats)), replace=false, ordered=true)

∇_gpu = CuArray(∇)
x_bin_gpu = CuArray(x_bin)
h∇_gpu = CUDA.zeros(Float32, 3, nbins, nfeats)
h∇_cpu = zeros(Float32, 3, nbins, nfeats)
h∇_gpu = CuArray(h∇_cpu)
is_gpu = CuArray(is)
js_gpu = CuArray(js)

@@ -89,10 +154,20 @@ CUDA.@time update_hist_gpu!(h∇, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js)
# without copy
# CUDA v4 1M: 2.599 ms (74 allocations: 4.64 KiB)
# CUDA v5 1M: 2.274 ms (48 allocations: 2.77 KiB)
@btime update_hist_gpu!(h∇, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js)
@btime update_hist_gpu1!(h∇, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js)

# with a single bulk array copy
# CUDA v4 1M:
# CUDA v5 1M: 2.447 ms (48 allocations: 2.77 KiB)
@btime update_hist_gpu2!(h∇, h∇_cpu, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js)

# with a single bulk array copy + threaded CPU scatter
# CUDA v4 1M:
# CUDA v5 1M: 2.442 ms (48 allocations: 2.77 KiB)
@btime update_hist_gpu3!(h∇, h∇_cpu, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js)
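
Read together, the three timings localize the regression (arithmetic on the numbers quoted above; the CUDA v4 baselines for the last two variants were left blank):

```julia
# gpu1, kernel only:                  2.274 ms
# gpu2, kernel + one bulk D→H copy:   2.447 ms → copy + sync ≈ 0.17 ms
# gpu3, gpu2 + threaded CPU scatter:  2.442 ms → the scatter is ~free
# The bulk transfer moves 3 × 32 × 100 Float32 = 38_400 bytes ≈ 37.5 KiB,
# so its ~0.17 ms cost is dominated by latency/sync, not bandwidth.
```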


using CUDA, BenchmarkTools
function gpu_copy!(h, h∇, jsc)
CUDA.@sync for j in jsc
nbins = size(h[j], 2)
8 changes: 4 additions & 4 deletions ext/EvoTreesCUDAExt/fit-utils.jl
@@ -23,7 +23,7 @@ function hist_kernel!(h∇::CuDeviceArray{T,3}, ∇::CuDeviceMatrix{S}, x_bin, is, js)
return nothing
end

function update_hist_gpu!(h, h∇, ∇, x_bin, is, js, jsc)
function update_hist_gpu!(h, h∇_cpu, h∇, ∇, x_bin, is, js, jsc)
kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js)
config = launch_configuration(kernel.fun)
max_threads = config.threads ÷ 4
@@ -37,10 +37,10 @@ function update_hist_gpu!(h, h∇, ∇, x_bin, is, js, jsc)
blocks = (1, by, bx)
h∇ .= 0
kernel(h∇, ∇, x_bin, is, js; threads, blocks)
CUDA.synchronize()
CUDA.@sync for j in jsc
copyto!(h∇_cpu, h∇)
Threads.@threads for j in jsc
nbins = size(h[j], 2)
copyto!(h[j], view(h∇, :, 1:nbins, j))
@views h[j] .= h∇_cpu[:, 1:nbins, j]
end
return nothing
end
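
This is the `gpu3` strategy from the experiment above promoted into the extension: the per-feature synchronizing `copyto!` loop becomes one bulk `copyto!(h∇_cpu, h∇)` (itself synchronizing for device-to-host transfers) followed by a threaded host-side scatter. A minimal standalone sketch for measuring the two patterns against each other (assumed toy sizes, not from the repo; requires `julia -t` > 1 for the scatter to parallelize):

```julia
using CUDA, BenchmarkTools, Base.Threads

function many_small_copies!(h, d)
    CUDA.@sync for j in eachindex(h)
        copyto!(h[j], view(d, :, :, j))   # one D→H transfer per feature
    end
end

function bulk_copy_scatter!(h, hc, d)
    copyto!(hc, d)                        # single synchronizing D→H transfer
    @threads for j in eachindex(h)
        @views h[j] .= hc[:, :, j]        # slicing stays on the CPU
    end
end

d  = CUDA.rand(Float64, 5, 32, 100)
h  = [zeros(Float64, 5, 32) for _ in 1:100]
hc = zeros(Float64, 5, 32, 100)

@btime many_small_copies!($h, $d)
@btime bulk_copy_scatter!($h, $hc, $d)
```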
6 changes: 4 additions & 2 deletions ext/EvoTreesCUDAExt/fit.jl
@@ -21,6 +21,7 @@ function EvoTrees.grow_evotree!(evotree::EvoTree{L,K}, cache, params::EvoTrees.EvoTypes{L}
cache.out,
cache.left,
cache.right,
cache.h∇_cpu,
cache.h∇,
cache.x_bin,
cache.feattypes,
@@ -43,6 +43,7 @@ function grow_tree!(
out,
left,
right,
h∇_cpu::Array{Float64,3},
h∇::CuArray{Float64,3},
x_bin::CuMatrix,
feattypes::Vector{Bool},
@@ -87,7 +89,7 @@ function grow_tree!(
end
end
else
update_hist_gpu!(nodes[n].h, h∇, ∇, x_bin, nodes[n].is, jsg, js)
update_hist_gpu!(nodes[n].h, h∇_cpu, h∇, ∇, x_bin, nodes[n].is, jsg, js)
end
end
Threads.@threads for n ∈ sort(n_current)
@@ -214,7 +216,7 @@ function grow_otree!(
end
end
else
update_hist_gpu!(nodes[n].h, h∇, ∇, x_bin, nodes[n].is, jsg, js)
update_hist_gpu!(nodes[n].h, h∇_cpu, h∇, ∇, x_bin, nodes[n].is, jsg, js)
end
end
Threads.@threads for n ∈ n_current
4 changes: 3 additions & 1 deletion ext/EvoTreesCUDAExt/init.jl
@@ -63,7 +63,8 @@ function EvoTrees.init_core(params::EvoTrees.EvoTypes{L}, ::Type{<:EvoTrees.GPU}
!isnothing(offset) && (pred .+= CuArray(offset'))

# initialize gradients
h∇ = CUDA.zeros(Float64, 2 * K + 1, maximum(featbins), length(featbins))
h∇_cpu = zeros(Float64, 2 * K + 1, maximum(featbins), length(featbins))
h∇ = CuArray(h∇_cpu)
∇ = CUDA.zeros(T, 2 * K + 1, nobs)
@assert (length(y) == length(w) && minimum(w) > 0)
∇[end, :] .= w
@@ -117,6 +118,7 @@ function EvoTrees.init_core(params::EvoTrees.EvoTypes{L}, ::Type{<:EvoTrees.GPU}
right=right,
∇=∇,
h∇=h∇,
h∇_cpu=h∇_cpu,
fnames=fnames,
edges=edges,
featbins=featbins,
