From cbc6c0aca071f5af33c8153763f0d7bf1918e534 Mon Sep 17 00:00:00 2001
From: jeremie
Date: Mon, 16 Oct 2023 21:58:20 -0400
Subject: [PATCH] integrate MLJ wrapper with Tables API

---
 experiments/hist/hist_gpu share-v2.jl | 14 +++-----
 ext/EvoTreesCUDAExt/fit-utils.jl      | 52 ---------------------------
 2 files changed, 5 insertions(+), 61 deletions(-)

diff --git a/experiments/hist/hist_gpu share-v2.jl b/experiments/hist/hist_gpu share-v2.jl
index c9d2557..bee557b 100644
--- a/experiments/hist/hist_gpu share-v2.jl
+++ b/experiments/hist/hist_gpu share-v2.jl
@@ -1,6 +1,6 @@
 using Revise
 using CUDA
-using StaticArrays
+# using StaticArrays
 using StatsBase: sample
 using BenchmarkTools

@@ -10,10 +10,6 @@ using BenchmarkTools
 # - each block build histogram for many features -> (k, j)
 # -
 ################################################
-
-function agg_share()
-end
-
 # base kernel
 function kernel_share_1!(h::CuDeviceArray{T,3}, ∇, x_bin, is) where {T}
@@ -71,14 +67,14 @@ end
 nbins = 64
 nfeats = 100
-nobs = Int32(1e6)
+nobs = Int(1e6)
 hist = zeros(Float32, 3, nbins, ncol)
-∇ = rand(Float32, items, 3)
+∇ = rand(Float32, nobs, 3)
 # idx = Int64.(rand(1:nbins, items, ncol))
-idx = UInt8.(rand(1:nbins, items, ncol))
+is = UInt8.(rand(1:nbins, nobs, ncol))
 hist_gpu = CuArray(hist)
-∇_gpu = CuArray(δ)
+∇_gpu = CuArray(∇)
 idx_gpu = CuArray(idx)

 @time hist_share_1!(hist, ∇, idx)
diff --git a/ext/EvoTreesCUDAExt/fit-utils.jl b/ext/EvoTreesCUDAExt/fit-utils.jl
index 2533f78..177e4fd 100644
--- a/ext/EvoTreesCUDAExt/fit-utils.jl
+++ b/ext/EvoTreesCUDAExt/fit-utils.jl
@@ -45,58 +45,6 @@ function update_hist_gpu!(h, h∇_cpu, h∇, ∇, x_bin, is, js, jsc)
     return nothing
 end

-function hist_kernel_vec!(h∇, ∇, x_bin, is)
-    tix, k = threadIdx().x, threadIdx().y
-    bdx = blockDim().x
-    bix = blockIdx().x
-    gdx = gridDim().x
-
-    i_max = length(is)
-    niter = cld(i_max, bdx * gdx)
-    @inbounds for iter in 1:niter
-        i = tix + bdx * (bix - 1) + bdx * gdx * (iter - 1)
-        if i <= i_max
-            idx = is[i]
-            bin = x_bin[idx]
-            hid = Base._to_linear_index(h∇, k, bin)
-            CUDA.atomic_add!(pointer(h∇, hid), ∇[k, idx])
-        end
-    end
-    # CUDA.sync_threads()
-    return nothing
-end
-function update_hist_gpu_vec!(h, h∇, ∇, x_bin, is, js::Vector)
-    kernel = @cuda launch = false hist_kernel_vec!(h∇[js[1]], ∇, view(x_bin, :, js[1]), is)
-    config = launch_configuration(kernel.fun)
-    max_threads = config.threads
-    max_blocks = config.blocks
-    @assert size(h∇[js[1]], 1) <= max_threads "number of classes cannot be larger than 31 on GPU"
-    ty = min(64, size(h∇[js[1]], 1))
-    tx = max(1, min(length(is), fld(max_threads, ty)))
-    threads = (tx, ty, 1)
-    bx = min(max_blocks, cld(length(is), tx))
-    blocks = (bx, 1, 1)
-    # @sync for j in js
-    #     @async h∇[j] .= 0
-    # end
-    for j in js
-        h∇[j] .= 0
-        h[j] .= 0
-    end
-    CUDA.synchronize()
-    # @info "hist" max_blocks length(is) threads blocks
-    @sync for j in js
-        @async kernel(h∇[j], ∇, view(x_bin, :, j), is; threads, blocks)
-        # kernel(h∇[j], ∇, view(x_bin, :, j), is; threads, blocks)
-    end
-    CUDA.synchronize()
-    for j in js
-        copyto!(h[j], h∇[j])
-    end
-    CUDA.synchronize()
-    return nothing
-end
-
 # Multi-threads split_set!
 # Take a view into left and right placeholders. Right ids are assigned at the end of the length of the current node set.
 function split_chunk_kernel!(
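
Note on the deleted code path (commentary, not part of the patch): the removed update_hist_gpu_vec! launched one hist_kernel_vec! per feature, zeroing and copying each h∇[j] separately, whereas the retained update_hist_gpu! accumulates the whole (k, bin, feature) histogram in a single launch, per the "each block build histogram for many features -> (k, j)" comment. The CPU reference below is a minimal sketch of what either path computes, useful for validating GPU results on a small random problem; the name hist_cpu! and the layouts (∇ as K x nobs, x_bin as nobs x nfeats, h as K x nbins x nfeats) are assumptions for illustration, not the EvoTrees API.

# CPU reference sketch (hypothetical helper): for each sampled row i and
# selected feature j, add the K gradient stats ∇[k, i] into bin x_bin[i, j].
function hist_cpu!(h::Array{T,3}, ∇::Matrix{T}, x_bin::Matrix{UInt8}, is, js) where {T}
    fill!(h, zero(T))
    @inbounds for j in js, i in is
        bin = x_bin[i, j]
        for k in axes(∇, 1)
            h[k, bin, j] += ∇[k, i]
        end
    end
    return nothing
end

Similarly, the split_chunk_kernel! comment kept in the second hunk describes left ids filling the node's index set from the front while right ids are written from the back of the same length. A single-threaded sketch of that convention (hypothetical helper, assuming `left` and `right` are preallocated to length(is)):

# Partition a node's row indices by the split condition; lefts grow from the
# front of `left`, rights are written backwards from position length(is) in `right`.
function split_set_cpu!(left, right, is, x_bin, feat, cond_bin)
    nl, nr = 0, 0
    @inbounds for i in is
        if x_bin[i, feat] <= cond_bin
            nl += 1
            left[nl] = i
        else
            right[length(is)-nr] = i
            nr += 1
        end
    end
    return view(left, 1:nl), view(right, (length(is)-nr+1):length(is))
end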