fix GPU perf regression on CUDA v5
jeremiedb committed Oct 12, 2023
1 parent fd78817 commit c805754
Showing 6 changed files with 95 additions and 20 deletions.
4 changes: 0 additions & 4 deletions LocalPreferences.toml

This file was deleted.

4 changes: 2 additions & 2 deletions benchmarks/regressor.jl
@@ -19,7 +19,7 @@ import CUDA
#threads
# laptop depth 6: 12.717845 seconds (2.08 M allocations: 466.228 MiB)

nobs = Int(1e6)
nobs = Int(10e6)
num_feat = Int(100)
nrounds = 200
max_depth = 6
@@ -55,7 +55,7 @@ params_xgb = Dict(
:print_every_n => 5,
:subsample => 0.5,
:colsample_bytree => 0.5,
:tree_method => "hist", # hist/gpu_hist
:tree_method => "gpu_hist", # hist/gpu_hist
:max_bin => 64,
)
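
The benchmark now exercises XGBoost's GPU path (`gpu_hist`) on 10M rows so the two libraries are compared on equal footing. For orientation, a Symbol-keyed Dict like `params_xgb` splats directly into keyword arguments; a hypothetical call-site sketch (the real invocation sits in an elided part of this script, and `x_train`/`y_train`/`nrounds` are assumed from its setup):

```julia
using XGBoost

# Sketch only: a Dict{Symbol,Any} splats into keyword arguments,
# so :tree_method => "gpu_hist" reaches the booster unchanged.
dtrain = DMatrix(x_train, y_train)
m_xgb = xgboost(dtrain; num_round=nrounds, params_xgb...)
```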

89 changes: 82 additions & 7 deletions experiments/hist/perf-gpu.jl
@@ -48,13 +48,77 @@ function update_hist_gpu!(h, h∇, ∇, x_bin, is, js, jsc)
h∇ .= 0
kernel(h∇, ∇, x_bin, is, js; threads, blocks)
CUDA.synchronize()
# CUDA.@sync for j in jsc
# nbins = size(h[j], 2)
# copyto!(h[j], view(h∇, :, 1:nbins, j))
# end
CUDA.@sync for j in jsc
nbins = size(h[j], 2)
copyto!(h[j], view(h∇, :, 1:nbins, j))
end
return nothing
end
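
This baseline is where the CUDA.jl v5 regression was suspected: the `CUDA.@sync for j in jsc` loop issues one small device-to-host `copyto!` per sampled feature, paying transfer and synchronization overhead once per feature. The numbered variants below isolate the pieces: `gpu1` drops the copies entirely, `gpu2` replaces them with a single bulk transfer, and `gpu3` adds a threaded CPU-side scatter. A toy illustration of the two transfer patterns (standalone sketch; names and sizes are illustrative, not from this repo):

```julia
using CUDA

d  = CUDA.rand(Float32, 3, 32, 100)          # device histogram (k, nbins, nfeats)
h  = [zeros(Float32, 3, 32) for _ in 1:100]  # per-feature host histograms
hc = zeros(Float32, 3, 32, 100)              # host mirror for the bulk pattern

# (a) per-feature: 100 small D→H transfers, each carrying its own overhead
for j in 1:100
    copyto!(h[j], view(d, :, :, j))
end

# (b) bulk: one D→H transfer, then cheap host-side slicing
copyto!(hc, d)
for j in 1:100
    @views h[j] .= hc[:, :, j]
end
```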

function update_hist_gpu1!(h, h∇, ∇, x_bin, is, js, jsc)
kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js)
config = launch_configuration(kernel.fun)
max_threads = config.threads ÷ 4
max_blocks = config.blocks * 4
k = size(h∇, 1)
ty = max(1, min(length(js), fld(max_threads, k)))
tx = min(64, max(1, min(length(is), fld(max_threads, k * ty))))
threads = (k, ty, tx)
by = cld(length(js), ty)
bx = min(cld(max_blocks, by), cld(length(is), tx))
blocks = (1, by, bx)
h∇ .= 0
kernel(h∇, ∇, x_bin, is, js; threads, blocks)
CUDA.synchronize()
return nothing
end
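
To make the launch-shape arithmetic concrete, a worked example with assumed occupancy numbers (`launch_configuration` returns hardware-dependent values, so these are illustrative only):

```julia
# Suppose config.threads = 1024 and config.blocks = 16 on a given GPU,
# with k = 3 (grad, hess, weight), length(js) = 50, length(is) = 1_000_000:
max_threads = 1024 ÷ 4                                   # 256
max_blocks  = 16 * 4                                     # 64
ty = max(1, min(50, fld(256, 3)))                        # 50 → one column per feature
tx = min(64, max(1, min(1_000_000, fld(256, 3 * 50))))   # 1
# threads = (3, 50, 1): 150 threads per block
# by = cld(50, 50) = 1; bx = min(cld(64, 1), cld(1_000_000, 1)) = 64
# blocks = (1, 1, 64): observation-level parallelism comes from bx
```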

function update_hist_gpu2!(h, h∇_cpu, h∇, ∇, x_bin, is, js, jsc)
kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js)
config = launch_configuration(kernel.fun)
max_threads = config.threads ÷ 4
max_blocks = config.blocks * 4
k = size(h∇, 1)
ty = max(1, min(length(js), fld(max_threads, k)))
tx = min(64, max(1, min(length(is), fld(max_threads, k * ty))))
threads = (k, ty, tx)
by = cld(length(js), ty)
bx = min(cld(max_blocks, by), cld(length(is), tx))
blocks = (1, by, bx)
h∇ .= 0
kernel(h∇, ∇, x_bin, is, js; threads, blocks)
copyto!(h∇_cpu, h∇)
CUDA.synchronize()
return nothing
end


function update_hist_gpu3!(h, h∇_cpu, h∇, ∇, x_bin, is, js, jsc)
kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js)
config = launch_configuration(kernel.fun)
max_threads = config.threads ÷ 4
max_blocks = config.blocks * 4
k = size(h∇, 1)
ty = max(1, min(length(js), fld(max_threads, k)))
tx = min(64, max(1, min(length(is), fld(max_threads, k * ty))))
threads = (k, ty, tx)
by = cld(length(js), ty)
bx = min(cld(max_blocks, by), cld(length(is), tx))
blocks = (1, by, bx)
h∇ .= 0
kernel(h∇, ∇, x_bin, is, js; threads, blocks)
# CUDA.synchronize()
copyto!(h∇_cpu, h∇)
# CUDA.synchronize()
@threads for j in jsc
nbins = size(h[j], 2)
@views h[j] .= h∇_cpu[:, 1:nbins, j]
# h[j] .= h∇_cpu[:, 1:nbins, j]
end
return nothing
end
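
`update_hist_gpu3!` is the candidate fix: one bulk transfer into a pre-allocated host mirror, then a multi-threaded CPU scatter into the per-feature histograms. A cheap sanity check is to confirm it matches the original per-feature copies (sketch, assuming the arrays set up below; the script-level `h∇` here is the vector of host histograms passed as `h`):

```julia
# Both variants should leave identical per-feature histograms in h∇.
update_hist_gpu!(h∇, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js)
h_ref = deepcopy(h∇)
update_hist_gpu3!(h∇, h∇_cpu, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js)
@assert all(h_ref[j] ≈ h∇[j] for j in js)
```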


seed!(123)
nbins = 32
nfeats = 100
@@ -69,7 +133,8 @@ js = sample(1:nfeats, Int(round(rowsample * nfeats)), replace=false, ordered=true)

∇_gpu = CuArray(∇)
x_bin_gpu = CuArray(x_bin)
h∇_gpu = CUDA.zeros(Float32, 3, nbins, nfeats)
h∇_cpu = zeros(Float32, 3, nbins, nfeats)
h∇_gpu = CuArray(h∇_cpu)
is_gpu = CuArray(is)
js_gpu = CuArray(js)

@@ -89,10 +154,20 @@ CUDA.@time update_hist_gpu!(h∇, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js)
# without copy
# CUDA v4 1M: 2.599 ms (74 allocations: 4.64 KiB)
# CUDA v5 1M: 2.274 ms (48 allocations: 2.77 KiB)
@btime update_hist_gpu!(h∇, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js)
@btime update_hist_gpu1!(h∇, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js)

# with a single bulk array copy
# CUDA v4 1M:
# CUDA v5 1M: 2.447 ms (48 allocations: 2.77 KiB)
@btime update_hist_gpu2!(h∇, h∇_cpu, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js)

# with a single bulk array copy + threaded CPU scatter
# CUDA v4 1M:
# CUDA v5 1M: 2.442 ms (48 allocations: 2.77 KiB)
@btime update_hist_gpu3!(h∇, h∇_cpu, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js)
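
Read together, the three timings localize the regression (arithmetic on the numbers quoted above; the CUDA v4 baselines for the last two variants were left blank):

```julia
# gpu1, kernel only:                  2.274 ms
# gpu2, kernel + one bulk D→H copy:   2.447 ms → copy + sync ≈ 0.17 ms
# gpu3, gpu2 + threaded CPU scatter:  2.442 ms → the scatter is ~free
# The bulk transfer moves 3 × 32 × 100 Float32 = 38_400 bytes ≈ 37.5 KiB,
# so its ~0.17 ms cost is dominated by latency/sync, not bandwidth.
```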


using CUDA, BenchmarkTools
function gpu_copy!(h, h∇, jsc)
CUDA.@sync for j in jsc
nbins = size(h[j], 2)
8 changes: 4 additions & 4 deletions ext/EvoTreesCUDAExt/fit-utils.jl
@@ -23,7 +23,7 @@ function hist_kernel!(h∇::CuDeviceArray{T,3}, ∇::CuDeviceMatrix{S}, x_bin, is, js)
return nothing
end

function update_hist_gpu!(h, h∇, ∇, x_bin, is, js, jsc)
function update_hist_gpu!(h, h∇_cpu, h∇, ∇, x_bin, is, js, jsc)
kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js)
config = launch_configuration(kernel.fun)
max_threads = config.threads ÷ 4
@@ -37,10 +37,10 @@ function update_hist_gpu!(h, h∇, ∇, x_bin, is, js, jsc)
blocks = (1, by, bx)
h∇ .= 0
kernel(h∇, ∇, x_bin, is, js; threads, blocks)
CUDA.synchronize()
CUDA.@sync for j in jsc
copyto!(h∇_cpu, h∇)
Threads.@threads for j in jsc
nbins = size(h[j], 2)
copyto!(h[j], view(h∇, :, 1:nbins, j))
@views h[j] .= h∇_cpu[:, 1:nbins, j]
end
return nothing
end
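
This is the `gpu3` strategy from the experiment above promoted into the extension: the per-feature synchronizing `copyto!` loop becomes one bulk `copyto!(h∇_cpu, h∇)` (itself synchronizing for device-to-host transfers) followed by a threaded host-side scatter. A minimal standalone sketch for measuring the two patterns against each other (assumed toy sizes, not from the repo; requires `julia -t` > 1 for the scatter to parallelize):

```julia
using CUDA, BenchmarkTools, Base.Threads

function many_small_copies!(h, d)
    CUDA.@sync for j in eachindex(h)
        copyto!(h[j], view(d, :, :, j))   # one D→H transfer per feature
    end
end

function bulk_copy_scatter!(h, hc, d)
    copyto!(hc, d)                        # single synchronizing D→H transfer
    @threads for j in eachindex(h)
        @views h[j] .= hc[:, :, j]        # slicing stays on the CPU
    end
end

d  = CUDA.rand(Float64, 5, 32, 100)
h  = [zeros(Float64, 5, 32) for _ in 1:100]
hc = zeros(Float64, 5, 32, 100)

@btime many_small_copies!($h, $d)
@btime bulk_copy_scatter!($h, $hc, $d)
```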
6 changes: 4 additions & 2 deletions ext/EvoTreesCUDAExt/fit.jl
@@ -21,6 +21,7 @@ function EvoTrees.grow_evotree!(evotree::EvoTree{L,K}, cache, params::EvoTrees.EvoTypes{L}
cache.out,
cache.left,
cache.right,
cache.h∇_cpu,
cache.h∇,
cache.x_bin,
cache.feattypes,
@@ -43,6 +43,7 @@ function grow_tree!(
out,
left,
right,
h∇_cpu::Array{Float64,3},
h∇::CuArray{Float64,3},
x_bin::CuMatrix,
feattypes::Vector{Bool},
@@ -87,7 +89,7 @@ function grow_tree!(
end
end
else
update_hist_gpu!(nodes[n].h, h∇, ∇, x_bin, nodes[n].is, jsg, js)
update_hist_gpu!(nodes[n].h, h∇_cpu, h∇, ∇, x_bin, nodes[n].is, jsg, js)
end
end
Threads.@threads for n ∈ sort(n_current)
@@ -214,7 +216,7 @@ function grow_otree!(
end
end
else
update_hist_gpu!(nodes[n].h, h∇, ∇, x_bin, nodes[n].is, jsg, js)
update_hist_gpu!(nodes[n].h, h∇_cpu, h∇, ∇, x_bin, nodes[n].is, jsg, js)
end
end
Threads.@threads for n ∈ n_current
4 changes: 3 additions & 1 deletion ext/EvoTreesCUDAExt/init.jl
@@ -63,7 +63,8 @@ function EvoTrees.init_core(params::EvoTrees.EvoTypes{L}, ::Type{<:EvoTrees.GPU}
!isnothing(offset) && (pred .+= CuArray(offset'))

# initialize gradients
h∇ = CUDA.zeros(Float64, 2 * K + 1, maximum(featbins), length(featbins))
h∇_cpu = zeros(Float64, 2 * K + 1, maximum(featbins), length(featbins))
h∇ = CuArray(h∇_cpu)
∇ = CUDA.zeros(T, 2 * K + 1, nobs)
@assert (length(y) == length(w) && minimum(w) > 0)
∇[end, :] .= w
@@ -117,6 +118,7 @@ function EvoTrees.init_core(params::EvoTrees.EvoTypes{L}, ::Type{<:EvoTrees.GPU}
right=right,
∇=∇,
h∇=h∇,
h∇_cpu=h∇_cpu,
fnames=fnames,
edges=edges,
featbins=featbins,
