From cbc6c0aca071f5af33c8153763f0d7bf1918e534 Mon Sep 17 00:00:00 2001
From: jeremie
Date: Mon, 16 Oct 2023 21:58:20 -0400
Subject: [PATCH] integrate MLJ wrapper with Tables API

---
 experiments/hist/hist_gpu share-v2.jl | 14 +++-----
 ext/EvoTreesCUDAExt/fit-utils.jl      | 52 ---------------------------
 2 files changed, 5 insertions(+), 61 deletions(-)

diff --git a/experiments/hist/hist_gpu share-v2.jl b/experiments/hist/hist_gpu share-v2.jl
index c9d2557..bee557b 100644
--- a/experiments/hist/hist_gpu share-v2.jl
+++ b/experiments/hist/hist_gpu share-v2.jl
@@ -1,6 +1,6 @@
 using Revise
 using CUDA
-using StaticArrays
+# using StaticArrays
 using StatsBase: sample
 using BenchmarkTools

@@ -10,10 +10,6 @@ using BenchmarkTools
 # - each block build histogram for many features -> (k, j)
 # -
 ################################################
-
-function agg_share()
-end
-
 # base kernel
 function kernel_share_1!(h::CuDeviceArray{T,3}, ∇, x_bin, is) where {T}
@@ -71,14 +67,14 @@ end
 nbins = 64
 nfeats = 100
-nobs = Int32(1e6)
+nobs = Int(1e6)
 hist = zeros(Float32, 3, nbins, ncol)
-∇ = rand(Float32, items, 3)
+∇ = rand(Float32, nobs, 3)
 # idx = Int64.(rand(1:nbins, items, ncol))
-idx = UInt8.(rand(1:nbins, items, ncol))
+is = UInt8.(rand(1:nbins, nobs, ncol))
 hist_gpu = CuArray(hist)
-∇_gpu = CuArray(δ)
+∇_gpu = CuArray(∇)
 idx_gpu = CuArray(idx)

 @time hist_share_1!(hist, ∇, idx)
diff --git a/ext/EvoTreesCUDAExt/fit-utils.jl b/ext/EvoTreesCUDAExt/fit-utils.jl
index 2533f78..177e4fd 100644
--- a/ext/EvoTreesCUDAExt/fit-utils.jl
+++ b/ext/EvoTreesCUDAExt/fit-utils.jl
@@ -45,58 +45,6 @@ function update_hist_gpu!(h, h∇_cpu, h∇, ∇, x_bin, is, js, jsc)
     return nothing
 end

-function hist_kernel_vec!(h∇, ∇, x_bin, is)
-    tix, k = threadIdx().x, threadIdx().y
-    bdx = blockDim().x
-    bix = blockIdx().x
-    gdx = gridDim().x
-
-    i_max = length(is)
-    niter = cld(i_max, bdx * gdx)
-    @inbounds for iter in 1:niter
-        i = tix + bdx * (bix - 1) + bdx * gdx * (iter - 1)
-        if i <= i_max
-            idx = is[i]
-            bin = x_bin[idx]
-            hid = Base._to_linear_index(h∇, k, bin)
-            CUDA.atomic_add!(pointer(h∇, hid), ∇[k, idx])
-        end
-    end
-    # CUDA.sync_threads()
-    return nothing
-end
-function update_hist_gpu_vec!(h, h∇, ∇, x_bin, is, js::Vector)
-    kernel = @cuda launch = false hist_kernel_vec!(h∇[js[1]], ∇, view(x_bin, :, js[1]), is)
-    config = launch_configuration(kernel.fun)
-    max_threads = config.threads
-    max_blocks = config.blocks
-    @assert size(h∇[js[1]], 1) <= max_threads "number of classes cannot be larger than 31 on GPU"
-    ty = min(64, size(h∇[js[1]], 1))
-    tx = max(1, min(length(is), fld(max_threads, ty)))
-    threads = (tx, ty, 1)
-    bx = min(max_blocks, cld(length(is), tx))
-    blocks = (bx, 1, 1)
-    # @sync for j in js
-    #     @async h∇[j] .= 0
-    # end
-    for j in js
-        h∇[j] .= 0
-        h[j] .= 0
-    end
-    CUDA.synchronize()
-    # @info "hist" max_blocks length(is) threads blocks
-    @sync for j in js
-        @async kernel(h∇[j], ∇, view(x_bin, :, j), is; threads, blocks)
-        # kernel(h∇[j], ∇, view(x_bin, :, j), is; threads, blocks)
-    end
-    CUDA.synchronize()
-    for j in js
-        copyto!(h[j], h∇[j])
-    end
-    CUDA.synchronize()
-    return nothing
-end
-
 # Multi-threads split_set!
 # Take a view into left and right placeholders. Right ids are assigned at the end of the length of the current node set.
 function split_chunk_kernel!(
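
Note on the deleted code path (commentary, not part of the patch): the removed update_hist_gpu_vec! launched one hist_kernel_vec! per feature, zeroing and copying each h∇[j] separately, whereas the retained update_hist_gpu! accumulates the whole (k, bin, feature) histogram in a single launch, per the "each block build histogram for many features -> (k, j)" comment. The CPU reference below is a minimal sketch of what either path computes, useful for validating GPU results on a small random problem; the name hist_cpu! and the layouts (∇ as K x nobs, x_bin as nobs x nfeats, h as K x nbins x nfeats) are assumptions for illustration, not the EvoTrees API.

# CPU reference sketch (hypothetical helper): for each sampled row i and
# selected feature j, add the K gradient stats ∇[k, i] into bin x_bin[i, j].
function hist_cpu!(h::Array{T,3}, ∇::Matrix{T}, x_bin::Matrix{UInt8}, is, js) where {T}
    fill!(h, zero(T))
    @inbounds for j in js, i in is
        bin = x_bin[i, j]
        for k in axes(∇, 1)
            h[k, bin, j] += ∇[k, i]
        end
    end
    return nothing
end

Similarly, the split_chunk_kernel! comment kept in the second hunk describes left ids filling the node's index set from the front while right ids are written from the back of the same length. A single-threaded sketch of that convention (hypothetical helper, assuming `left` and `right` are preallocated to length(is)):

# Partition a node's row indices by the split condition; lefts grow from the
# front of `left`, rights are written backwards from position length(is) in `right`.
function split_set_cpu!(left, right, is, x_bin, feat, cond_bin)
    nl, nr = 0, 0
    @inbounds for i in is
        if x_bin[i, feat] <= cond_bin
            nl += 1
            left[nl] = i
        else
            right[length(is)-nr] = i
            nr += 1
        end
    end
    return view(left, 1:nl), view(right, (length(is)-nr+1):length(is))
end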