diff --git a/Project.toml b/Project.toml index 51145a5e..195f96e0 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ -authors = ["jeremiedb "] name = "EvoTrees" uuid = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5" -version = "0.16.4" +authors = ["jeremiedb "] +version = "0.16.5" [deps] BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" @@ -16,6 +16,12 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" +[weakdeps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" + +[extensions] +EvoTreesCUDAExt = "CUDA" + [compat] BSON = "0.3" CUDA = "3.0, 4.0, 5.0" @@ -28,9 +34,6 @@ StatsBase = "0.32, 0.33, 0.34" Tables = "1.9" julia = "1.6" -[extensions] -EvoTreesCUDAExt = "CUDA" - [extras] CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" @@ -42,6 +45,3 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] docs = ["Documenter"] test = ["CUDA", "DataFrames", "Test", "MLJBase", "MLJTestInterface"] - -[weakdeps] -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" diff --git a/benchmarks/regressor.jl b/benchmarks/regressor.jl index d38804b6..5e20a326 100644 --- a/benchmarks/regressor.jl +++ b/benchmarks/regressor.jl @@ -12,9 +12,9 @@ import CUDA # desktop | 1e6 | depth 11 | cpu: 37.2s # desktop | 10e6 | depth 11 | cpu -### perf depth -# desktop | 1e6 | depth 11 | cpu: 28s gpu: 73 sec | xgboost: 26s -# desktop | 10e6 | depth 11 | cpu 205s gpu: 109 sec | xgboost 260s +### v0.16.5 +# desktop | 1e6 | depth 11 | cpu: 31s gpu: 50 sec | xgboost cpu: 26s +# desktop | 10e6 | depth 11 | cpu 200s gpu: 80 sec | xgboost cpu: 267s #threads # laptop depth 6: 12.717845 seconds (2.08 M allocations: 466.228 MiB) diff --git a/experiments/depth-debug.jl b/experiments/depth-debug.jl deleted file mode 100644 index 4a8df1eb..00000000 --- a/experiments/depth-debug.jl +++ /dev/null @@ -1,129 +0,0 @@ -using Statistics -using StatsBase:sample -using Base.Threads:@threads -using BenchmarkTools -using Revise -using EvoTrees -using Profile - -nobs = Int(1e6) -num_feat = Int(100) -nrounds = 200 -nthread = Base.Threads.nthreads() -x_train = rand(nobs, num_feat) -y_train = rand(size(x_train, 1)) - -config = EvoTreeRegressor(; - loss=:mse, - nrounds=200, - lambda=0.0, - gamma=0.0, - eta=0.05, - max_depth=10, - min_weight=1.0, - rowsample=0.5, - colsample=0.5, - nbins=64, - tree_type="binary", - rng=123 -) - -################################ -# high-level -################################ -_device = EvoTrees.GPU -@time EvoTrees.fit_evotree(config; x_train, y_train, device = "gpu") - -@time m, cache = EvoTrees.init(config, x_train, y_train); -@time EvoTrees.grow_evotree!(m, cache, config) -@btime EvoTrees.grow_evotree!(m, cache, config) - -Profile.clear() -# Profile.init() -Profile.init(n = 10^5, delay = 0.01) -# @profile m, cache = EvoTrees.init(config, x_train, y_train); -@profile EvoTrees.grow_evotree!(m, cache, config) -Profile.print() - -################################ -# mid-level -################################ -@time m, cache = EvoTrees.init(config, x_train, y_train); -@time EvoTrees.grow_evotree!(m, cache, config) -# compute gradients -@time m, cache = EvoTrees.init(config, x_train, y_train); -@time EvoTrees.update_grads!(cache.∇, cache.pred, cache.y, config) -# subsample rows -@time cache.nodes[1].is = EvoTrees.subsample(cache.is_in, cache.is_out, cache.mask, config.rowsample, config.rng) -# subsample cols -EvoTrees.sample!(config.rng, cache.js_, cache.js, replace=false, ordered=true) 
-L = EvoTrees._get_struct_loss(m) -# instantiate a tree then grow it -tree = EvoTrees.Tree{L,1}(config.max_depth) -grow! = config.tree_type == "oblivious" ? EvoTrees.grow_otree! : EvoTrees.grow_tree! -@time EvoTrees.grow_tree!( - tree, - cache.nodes, - config, - cache.∇, - cache.edges, - cache.js, - cache.out, - cache.left, - cache.right, - cache.x_bin, - cache.feattypes, - cache.monotone_constraints -) - -using ProfileView -ProfileView.@profview EvoTrees.grow_tree!( - tree, - cache.nodes, - config, - cache.∇, - cache.edges, - cache.js, - cache.out, - cache.left, - cache.right, - cache.x_bin, - cache.feattypes, - cache.monotone_constraints -) - -################################ -# end mid-level -################################ - - -@time m_evo = grow_tree!(params_evo; x_train, y_train, device, print_every_n=100); - -@info "train - no eval" -@time m_evo = fit_evotree(params_evo; x_train, y_train, device, print_every_n=100); - - -offset = 0 -feat = 15 -cond_bin = 32 -@time l, r = split_set_threads!(out, left, right, 𝑖, X_bin, feat, cond_bin, offset, 2^14); -@btime split_set_threads!($out, $left, $right, $𝑖, $X_bin, $feat, $cond_bin, $offset, 2^14); -@code_warntype split_set_1!(left, right, 𝑖, X_bin, feat, cond_bin, offset) - -offset = 0 -feat = 15 -cond_bin = 32 -lid1, rid1 = split_set_threads!(out, left, right, 𝑖, X_bin, feat, cond_bin, offset) -offset = 0 -feat = 14 -cond_bin = 12 -lid2, rid2 = split_set_threads!(out, left, right, lid1, X_bin, feat, cond_bin, offset) -offset = + length(lid1) -feat = 14 -cond_bin = 12 -lid3, rid3 = split_set_threads!(out, left, right, rid1, X_bin, feat, cond_bin, offset) - -lid1_ = deepcopy(lid1) - - - diff --git a/experiments/hist/hist_cpu.jl b/experiments/hist/hist_cpu.jl index 3208afb4..0c70ad7f 100644 --- a/experiments/hist/hist_cpu.jl +++ b/experiments/hist/hist_cpu.jl @@ -35,497 +35,7 @@ h∇ = [zeros(Float64, 3, nbins) for n in 1:nfeats] is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true) js = sample(1:nfeats, Int(round(rowsample * nfeats)), replace=false, ordered=true) -# 6.886 ms (97 allocations: 10.67 KiB) -# 9.624 ms (97 allocations: 10.67 KiB) +# laptop: 6.886 ms (97 allocations: 10.67 KiB) +# desktop: 3.451 ms (61 allocations: 6.52 KiB) @time hist_cpu!(h∇, ∇, x_bin, is, js) @btime hist_cpu!($h∇, $∇, $x_bin, $is, $js) - -# base kernel -function kernel1!(h, x, id) - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - if i <= size(id, 1) && j <= size(id, 2) - @inbounds k = Base._to_linear_index(h, id[i, j], j) - # @inbounds k = id[i,j] + 32 * (j-1) - @inbounds CUDA.atomic_add!(pointer(h, k), x[i, j]) - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu1!(h::AbstractMatrix{T}, x::AbstractMatrix{T}, id::AbstractMatrix{S}; MAX_THREADS=256) where {T,S} - # thread_i = min(MAX_THREADS, size(id, 1)) - # thread_j = min(MAX_THREADS ÷ thread_i, size(id, 2)) - thread_j = min(MAX_THREADS, size(id, 2)) - thread_i = min(MAX_THREADS ÷ thread_j, size(id, 1)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (size(id, 1), size(id, 2)) .÷ threads) - # println("threads:", threads) - # println("blocks:", blocks) - @cuda blocks = blocks threads = threads kernel1!(h, x, id) - return -end - -nbins = 32 -ncol = 100 -items = Int32(1e6) -hist = zeros(Float32, nbins, ncol) -ÎŽ = rand(Float32, items, ncol) -idx = UInt8.(rand(1:nbins, items, ncol)) -# idx = Int64.(rand(1:nbins, items, ncol)) - -hist_gpu = CuArray(hist) 
-ÎŽ_gpu = CuArray(ÎŽ) -idx_gpu = CuArray(idx) - -hist .- Array(hist_gpu) -sum(hist) - sum(Array(hist_gpu)) - -CUDA.@time hist_gpu1!(hist_gpu, ÎŽ_gpu, idx_gpu, MAX_THREADS=1024) -@time hist_cpu!(hist, ÎŽ, idx) -@btime hist_cpu!($hist, $ÎŽ, $idx) -@btime CUDA.@sync hist_gpu1!($hist_gpu, $ÎŽ_gpu, $idx_gpu, MAX_THREADS=1024) -# test on view -CUDA.@time hist_gpu1!(hist_gpu, view(ÎŽ_gpu, 1:items÷2, 1:ncol÷2), view(idx_gpu, 1:items÷2, 1:ncol÷2), MAX_THREADS=1024) - -size(ÎŽ_gpu) -size(view(ÎŽ_gpu, 1:items÷2, 1:ncol÷2)) - -############################################################## -## Build histogram from a subsample idx -# base kernel -function kernel2!(h::CuDeviceMatrix{T}, x::CuDeviceMatrix{T}, id, 𝑖, 𝑗) where {T<:AbstractFloat} - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - if i <= length(𝑖) && j <= length(𝑗) - @inbounds k = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 𝑗[j]) - @inbounds CUDA.atomic_add!(pointer(h, k), x[𝑖[i], 𝑗[j]]) - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu2!(h::CuMatrix{T}, x::CuMatrix{T}, id, 𝑖, 𝑗; MAX_THREADS=1024) where {T<:AbstractFloat} - thread_j = min(MAX_THREADS, length(𝑗)) - thread_i = min(MAX_THREADS ÷ thread_j, length(𝑖)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (length(𝑖), length(𝑗)) ./ threads) - # println("threads:", threads) - # println("blocks:", blocks) - CUDA.@sync begin - @cuda blocks = blocks threads = threads kernel2!(h, x, id, 𝑖, 𝑗) - end - return -end - -hist = zeros(Float32, nbins, ncol) -ÎŽ = rand(Float32, items, ncol) -idx = rand(1:nbins, items, ncol) -𝑖 = sample(1:items, items ÷ 2, replace=false, ordered=true) -𝑗 = sample(1:ncol, ncol ÷ 2, replace=false, ordered=true) -hist_gpu = CuArray(hist) -ÎŽ_gpu = CuArray(ÎŽ) -idx_gpu = CuArray(idx) -𝑖_gpu = CuArray(𝑖) -𝑗_gpu = CuArray(𝑗) - -CUDA.@time hist_gpu2!(hist_gpu, ÎŽ_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024) -@btime hist_gpu2!($hist_gpu, $ÎŽ_gpu, $idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024) - - -############################################# -# test for SVector - basic test - success! 
-function kernel!(x, y) - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - if i <= length(x) - # @inbounds x[i] += y[i] - k = Base._to_linear_index(x, i) - CUDA.atomic_add!(pointer(x, k), y[i]) - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu!(x, y; MAX_THREADS=1024) - thread_i = min(MAX_THREADS, length(x)) - threads = (thread_i) - blocks = ceil.(Int, length(x) .÷ threads) - CUDA.@sync begin - @cuda blocks = blocks threads = threads kernel!(x, y) - end - return -end - -x = rand(SVector{2,Float32}, Int(1e7)) -y = rand(SVector{2,Float32}, Int(1e7)) -x = rand(Float32, Int(1e7)) -y = rand(Float32, Int(1e7)) - -x_gpu = CuArray(x) -y_gpu = CuArray(y) - -CuArrays.@time hist_gpu!(x_gpu, y_gpu) -@btime hist_gpu!($x_gpu, $y_gpu) - - -############################################# -# test for SVector - real test -function kernelS2!(h, x, id) - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - if i <= size(id, 1) && j <= size(id, 2) - @inbounds k = Base._to_linear_index(h, id[i, j], j) - # @inbounds k = id[i,j] + 32 * (j-1) - # @inbounds CUDAnative.atomic_add!(pointer(h, k), x[i,j]) - # h[id[i,j],j] += x[i,j] - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpuS2!(h, x, id; MAX_THREADS=256) where {T} - thread_j = min(MAX_THREADS, size(id, 2)) - thread_i = min(MAX_THREADS ÷ thread_j, size(id, 1)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (size(id, 1), size(id, 2)) .÷ threads) - println("threads:", threads) - println("blocks:", blocks) - CUDA.@sync begin - @cuda blocks = blocks threads = threads kernelS2!(h, x, id) - end - return -end - -hist = zeros(SVector{2,Float32}, nbins, ncol) -ÎŽ = rand(SVector{2,Float32}, items, ncol) -idx = rand(1:nbins, items, ncol) -hist_gpu = CuArray(hist) -ÎŽ_gpu = CuArray(ÎŽ) -idx_gpu = CuArray(idx) - -CuArrays.@time hist_gpuS2!(hist_gpu, ÎŽ_gpu, idx_gpu) -@btime hist_gpuS2!($hist_gpu, $ÎŽ_gpu, $idx_gpu) - - -############################################################## -## Build histogram from a subsample idx -# accumulate all gradient single pass -function kernel3!(h, x, id, 𝑖, 𝑗) - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - if i <= length(𝑖) && j <= length(𝑗) - @inbounds k = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 𝑗[j], 1) - @inbounds CUDAnative.atomic_add!(pointer(h, k), x[𝑖[i], 𝑗[j], 1]) - @inbounds k = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 𝑗[j], 2) - @inbounds CUDAnative.atomic_add!(pointer(h, k), x[𝑖[i], 𝑗[j], 2]) - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu3!(h, x, id, 𝑖, 𝑗; MAX_THREADS=1024) - thread_j = min(MAX_THREADS, length(𝑗)) - thread_i = min(MAX_THREADS ÷ thread_j, length(𝑖)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (length(𝑖), length(𝑗)) ./ threads) - # println("threads:", threads) - # println("blocks:", blocks) - CuArrays.@sync begin - @cuda blocks = blocks threads = threads kernel3!(h, x, id, 𝑖, 𝑗) - end - return -end - -hist = zeros(Float32, nbins, ncol, 2) -ÎŽ = rand(Float32, items, ncol, 2) -idx = rand(1:nbins, items, ncol) -𝑖 = sample(1:items, items ÷ 2, replace=false, ordered=true) -𝑗 = sample(1:ncol, ncol ÷ 2, replace=false, ordered=true) -hist_gpu = CuArray(hist) -ÎŽ_gpu = CuArray(ÎŽ) -idx_gpu = CuArray(idx) -𝑖_gpu = CuArray(𝑖) -𝑗_gpu = CuArray(𝑗) - -CuArrays.@time 
hist_gpu3!(hist_gpu, ÎŽ_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024) -@btime hist_gpu3!($hist_gpu, $ÎŽ_gpu, $idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024) - - - - -# accumulate in shared memory histograms -function kernel2!(h::CuDeviceMatrix{T}, x::CuDeviceMatrix{T}, id, nbins) where {T<:AbstractFloat} - tid = threadIdx().x - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - - # shared memory on block of length nbins - # To Do: nbins cannot be passed as argument - dynamic shared memory generate kernel through macro - # shared = CUDAnative.@cuStaticSharedMem(T, 32) - # fill!(shared, 0) - # sync_threads() - - # accumulate in per block histogram - # Why is the atomic add on shared much longer than atomic on h in global mem? - if i <= size(id, 1) && j <= size(id, 2) - # should be the legit way to go - 70ms - # @inbounds CUDAnative.atomic_add!(pointer(shared, id[i,j]), x[i,j]) - - # unsafe (collisions) on shared mem: 3.0ms - # @inbounds shared[id[i,j]] = x[i,j] - - # unsafe (collisions) add on global memory - 3.6ms - # @inbounds h[id[i,j],j] += x[i,j] - - # atomic add on global hist - 3.2ms - @inbounds k = id[i, j] + nbins * (j - 1) - @inbounds CUDA.atomic_add!(pointer(h, k), x[i, j]) - end - # sync_threads() - - # if blockIdx().x == 1 - # if tid <= nbins - # CUDA.atomic_add!(pointer(h,tid), shared[tid]) - # end - # end - return -end - -# shared memory - -function hist_gpu2!(h::CuMatrix{T}, x::CuMatrix{T}, id::CuMatrix{Int}, nbins; MAX_THREADS=256) where {T<:AbstractFloat} - # thread_i = min(MAX_THREADS, size(id, 1)) - # thread_j = min(MAX_THREADS ÷ thread_i, size(id, 2)) - thread_j = min(MAX_THREADS, size(id, 2)) - thread_i = min(MAX_THREADS ÷ thread_j, size(id, 1)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (size(id, 1), size(id, 2)) ./ threads) - CUDA.@sync begin - @cuda blocks = blocks threads = threads kernel2!(h, x, id, nbins) - end - return h -end - -CuArrays.@time hist_gpu2!(hist_gpu, ÎŽ_gpu, idx_gpu, 32, MAX_THREADS=1024) -@btime hist_gpu2!($hist_gpu, $ÎŽ_gpu, $idx_gpu, 32, MAX_THREADS=1024) -@device_code_warntype hist_gpu2!(hist_gpu, ÎŽ_gpu, idx_gpu, 32, MAX_THREADS=1024) - - - - -###################################### -# Appoach 1 -###################################### -# GPU - apply along the features axis -function kernel!(h::CuDeviceMatrix{T}, x::CuDeviceVector{T}, id, 𝑖, 𝑗) where {T<:AbstractFloat} - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - if i <= length(𝑖) && j <= length(𝑗) - @inbounds k = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 𝑗[j]) - @inbounds CUDA.atomic_add!(pointer(h, k), x[𝑖[i]]) - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu!(h::CuMatrix{T}, x::CuVector{T}, id, 𝑖, 𝑗; MAX_THREADS=1024) where {T<:AbstractFloat} - thread_j = min(MAX_THREADS, length(𝑗)) - thread_i = min(MAX_THREADS ÷ thread_j, length(𝑖)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (length(𝑖), length(𝑗)) ./ threads) - @cuda blocks = blocks threads = threads kernel!(h, x, id, 𝑖, 𝑗) - return -end - -hist = zeros(Float32, nbins, ncol) -ÎŽ = rand(Float32, items) -idx = rand(1:nbins, items, ncol) -𝑖 = sample(1:items, items ÷ 2, replace=false, ordered=true) -𝑗 = sample(1:ncol, ncol ÷ 2, replace=false, ordered=true) -hist_gpu = CuArray(hist) -ÎŽ_gpu = CuArray(ÎŽ) -idx_gpu = CuArray(idx) -𝑖_gpu = CuArray(𝑖) -𝑗_gpu = CuArray(𝑗) - -CUDA.@time hist_gpu!(hist_gpu, ÎŽ_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu, 
MAX_THREADS=1024) -@btime hist_gpu!($hist_gpu, $ÎŽ_gpu, $idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024) - - -###################################### -# Appoach 2 - Loop for assigning command grad to appropriate bin per column -# Idea: exploit the fact that there's a single grad per row: take that grad and add it to each column bin -###################################### -# GPU - apply along the features axis -function kernel!(h::CuDeviceMatrix{T}, x::CuDeviceVector{T}, id, 𝑖, 𝑗) where {T<:AbstractFloat} - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - if i <= length(𝑖) && j <= length(𝑗) - @inbounds k = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 𝑗[j]) - @inbounds CUDAnative.atomic_add!(pointer(h, k), x[𝑖[i]]) - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu!(h::CuMatrix{T}, x::CuVector{T}, id::CuMatrix{UInt8}, 𝑖, 𝑗; MAX_THREADS=1024) where {T<:AbstractFloat} - thread_j = min(MAX_THREADS, length(𝑗)) - thread_i = min(MAX_THREADS ÷ thread_j, length(𝑖)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (length(𝑖), length(𝑗)) ./ threads) - @cuda blocks = blocks threads = threads kernel!(h, x, id, 𝑖, 𝑗) - return -end - - - - -using CUDA -using BenchmarkTools -N1 = Int(2^12) -x1 = rand(Float32, N1); -x2 = rand(Float32, N1); -x1g = CuArray(x1); -x2g = CuArray(x2); - -function dot_atomic!(x, y, z) - idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x - if idx <= length(x) - CUDA.atomic_add!(pointer(z, 1), x[idx] * y[idx]) - end - return nothing -end - -function bench_dot_atomic!(x, y, z, threads) - numblocks = ceil(Int, N1 / threads) - @cuda threads = threads blocks = numblocks dot_atomic!(x, y, z) -end - -threads = 512 -numblocks = ceil(Int, N1 / threads) -z0 = CUDA.zeros(1) -# @cuda threads=gthreads blocks=numblocks dot_atomic!(x1g, x2g, z0) -@btime CUDA.@sync bench_dot_atomic!($x1g, $x2g, $z0, threads) -# 17.323 ms (50 allocations: 1.67 KiB) - -function dot_share!(x::CuDeviceVector{T}, y::CuDeviceVector{T}, z::CuDeviceVector{T}) where {T} - - tid = threadIdx().x - idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x - shared = CUDA.@cuStaticSharedMem(T, 64) - fill!(shared, 0) - sync_threads() - - if idx <= length(x) - @inbounds shared[tid] = x[idx] * y[idx] - end - sync_threads() - - i = blockDim().x ÷ 2 - while i > 0 - if tid <= i - @inbounds shared[tid] += shared[tid+i] # valid non atomic operation - # CUDA.atomic_add!(pointer(shared, tid), shared[tid+i]) # invalid op - results in error - end - sync_threads() - i ÷= 2 - end - if tid == 1 - CUDA.atomic_add!(pointer(z, 1), shared[1]) - end - return nothing -end - -function bench_dot_share!(x, y, z, threads, numblocks) - CUDA.@sync @cuda threads = threads blocks = numblocks dot_share!(x, y, z) - return z -end - -function wrap_share(x, y, threads) - numblocks = ceil(Int, N1 / threads) - z = CUDA.zeros(1) - x = bench_dot_share!(x, y, z, threads, numblocks) - return x -end - -threads = 64 -numblocks = ceil(Int, N1 / threads) -z = CUDA.zeros(1) -@cuda threads = threads blocks = numblocks dot_share!(x1g, x2g, z) -@time CUDA.@sync wrap_share(x1g, x2g, threads) -@btime CUDA.@sync wrap_share($x1g, $x2g, threads) -x = CUDA.@sync wrap_share(x1g, x2g, threads) -x1g' * x2g -@btime x1g' * x2g -@btime x1' * x2 - - -function dot_share2!(x::CuDeviceVector{T}, z::CuDeviceVector{T}) where {T} - - tid = threadIdx().x - bid = blockIdx().x - idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x - shared = 
CUDA.@cuStaticSharedMem(T, 128) - fill!(shared, 0.0f0) - sync_threads() - - # if idx <= length(x) - # shared[tid] = x[idx] * y[idx] - # end - # sync_threads() - - i = blockDim().x ÷ 2 - while i > 0 - # if tid <= i - if tid == 1 - # shared[tid] += shared[tid + i] # valid non atomic operation - CUDA.atomic_add!(pointer(shared, tid), shared[tid+1]) # invalid op - results in error - end - sync_threads() - i ÷= 2 - end - if tid == 1 - z[bid] += shared[1] - end - return nothing -end - -function bench_dot_share2!(x, z, threads, numblocks) - CUDA.@sync @cuda threads = threads blocks = numblocks dot_share2!(x, z) - return sum(z) -end - -function wrap_share2(x, threads) - numblocks = ceil(Int, N1 / threads) - z = CUDA.zeros(numblocks) - # x = bench_dot_share2!(x, z, threads, numblocks) - CUDA.@sync @cuda threads = threads blocks = numblocks dot_share2!(x, z) - return (x) -end - -threads = 128 -numblocks = ceil(Int, N1 / threads) -z = CUDA.zeros(numblocks) -sum(z) -x1g' * x2g -CUDA.@sync @cuda threads = threads blocks = numblocks dot_share2!(x1g, x2g, z) -@btime CUDA.@sync wrap_share2($x1g, $x2g, threads) -x = CUDA.@sync wrap_share2(x1g, x2g, threads) - -@btime x1g' * x2g - - -using CUDA -function kernel(x) - shared = @cuStaticSharedMem(Float32, 2) - fill!(shared, 1.0f0) - sync_threads() - # @atomic shared[threadIdx().x] += 0f0 - tid = threadIdx().x - CUDA.atomic_add!(pointer(shared, tid), shared[tid+1]) - CUDA.atomic_add!(pointer(x, 1), shared[1]) - return -end - -x = CUDA.zeros(1) -@cuda kernel(x) -synchronize() \ No newline at end of file diff --git a/experiments/hist/hist_gpu share.jl b/experiments/hist/hist_gpu share.jl index 9c1176eb..4477274f 100644 --- a/experiments/hist/hist_gpu share.jl +++ b/experiments/hist/hist_gpu share.jl @@ -1,837 +1,88 @@ using Revise using CUDA -using StaticArrays using StatsBase: sample using BenchmarkTools # base kernel -function kernel_s4!(h::CuDeviceArray{T,3}, x::CuDeviceMatrix{T}, xid::CuDeviceMatrix{S}) where {T,S} - +function kernel_s4!(h::CuDeviceArray{T,3}, ∇::CuDeviceMatrix{T}, x_bin::CuDeviceMatrix{S}) where {T,S} + nbins = size(h, 2) - it, jt = threadIdx().x, threadIdx().y + it, jt, kt = threadIdx().x, threadIdx().y, threadIdx().z ib, jb = blockIdx().x, blockIdx().y id, jd = blockDim().x, blockDim().y ig, jg = gridDim().x, gridDim().y j = jt + (jb - 1) * jd - - shared = @cuDynamicSharedMem(T, 3 * nbins) - fill!(shared, 0) - sync_threads() - i_tot = size(x, 1) - iter = 0 - while iter * id * ig < i_tot - i = it + id * (ib - 1) + iter * id * ig - if i <= size(xid, 1) && j <= size(xid, 2) - # depends on shared to be assigned to a single feature - k = 3 * (xid[i, j] - 1) - @inbounds CUDA.atomic_add!(pointer(shared, k + 1), x[i, 1]) - @inbounds CUDA.atomic_add!(pointer(shared, k + 2), x[i, 2]) - @inbounds CUDA.atomic_add!(pointer(shared, k + 3), x[i, 3]) - end - iter += 1 - end - sync_threads() - # loop to cover cases where nbins > nthreads - for iter in 1:(nbins - 1) ÷ id + 1 - bin_id = it + id * (iter - 1) - if bin_id <= nbins - @inbounds k = Base._to_linear_index(h, 1, bin_id, j) - @inbounds CUDA.atomic_add!(pointer(h, k), shared[3 * (bin_id - 1) + 1]) - @inbounds CUDA.atomic_add!(pointer(h, k + 1), shared[3 * (bin_id - 1) + 2]) - @inbounds CUDA.atomic_add!(pointer(h, k + 2), shared[3 * (bin_id - 1) + 3]) - end - end - # sync_threads() - return nothing -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu_s4!(h::AbstractArray{T,3}, x::AbstractMatrix{T}, id::AbstractMatrix{S}; MAX_THREADS=256) where {T,S} 
-    thread_i = min(MAX_THREADS, size(id, 1))
-    thread_j = 1
-    threads = (thread_i, thread_j)
-    blocks = ceil.(Int, (16, size(id, 2)))
-    fill!(h, 0)
-    @cuda blocks = blocks threads = threads shmem = sizeof(T) * size(h, 2) * 3 kernel_s4!(h, x, id)
-    return
-end
-
-# function hist_cpu_2!(hist, Ύ, idx)
-#     Threads.@threads for j in 1:size(idx, 2)
-#         for i in 1:size(idx, 1)
-#             bin = idx[i,j]
-#             @inbounds hist[1, bin, j] += Ύ[i,1]
-#             @inbounds hist[2, bin, j] += Ύ[i,2]
-#             @inbounds hist[3, bin, j] += Ύ[i,3]
-#         end
-#     end
-#     return
-# end
-
-nbins = 64
-ncol = 100
-items = Int32(1e6)
-hist = zeros(Float32, 3, nbins, ncol)
-Ύ = rand(Float32, items, 3)
-# idx = Int64.(rand(1:nbins, items, ncol))
-idx = UInt8.(rand(1:nbins, items, ncol))
-
-hist_gpu = CuArray(hist)
-Ύ_gpu = CuArray(Ύ)
-idx_gpu = CuArray(idx)
-
-@time hist_cpu_2!(hist, Ύ, idx)
-@btime hist_cpu_2!(hist, Ύ, idx)
-
-@CUDA.time hist_gpu_s4!(hist_gpu, Ύ_gpu, idx_gpu, MAX_THREADS=128)
-@btime CUDA.@sync hist_gpu_s4!($hist_gpu, $Ύ_gpu, $idx_gpu, MAX_THREADS=128)
-
-
-# base kernel
-function kernel_s5!(hΎ1::CuDeviceArray{T,3}, hΎ2::CuDeviceArray{T,3}, h𝑤::CuDeviceMatrix{T}, Ύ1::CuDeviceMatrix{T}, Ύ2::CuDeviceMatrix{T}, 𝑤::CuDeviceVector{T}, xid::CuDeviceMatrix{S}) where {T,S}
-
-    nbins = size(h𝑤, 1)
-    it, jt = threadIdx().x, threadIdx().y
-    ib, jb = blockIdx().x, blockIdx().y
-    id, jd = blockDim().x, blockDim().y
-    ig, jg = gridDim().x, gridDim().y
-    j = jt + (jb - 1) * jd
-
     shared = @cuDynamicSharedMem(T, 3 * nbins)
     fill!(shared, 0)
     sync_threads()
 
-    i_tot = size(𝑤, 1)
+    i_tot = size(x_bin, 1)
     iter = 0
     while iter * id * ig < i_tot
         i = it + id * (ib - 1) + iter * id * ig
-        if i <= size(xid, 1) && j <= size(xid, 2)
+        if i <= size(x_bin, 1) && j <= size(x_bin, 2)
             # depends on shared to be assigned to a single feature
-            k = 3 * (xid[i, j] - 1)
-            @inbounds CUDA.atomic_add!(pointer(shared, k + 1), Ύ1[i, 1])
-            @inbounds CUDA.atomic_add!(pointer(shared, k + 2), Ύ2[i, 1])
-            @inbounds CUDA.atomic_add!(pointer(shared, k + 3), 𝑤[i])
+            k = 3 * (x_bin[i, j] - 1)
+            @inbounds CUDA.atomic_add!(pointer(shared, k + kt), ∇[kt, i]) # ∇ is laid out (3, nobs): component kt of observation i
+            # @inbounds CUDA.atomic_add!(pointer(shared, k + 1), ∇[1, i])
+            # @inbounds CUDA.atomic_add!(pointer(shared, k + 2), ∇[2, i])
+            # @inbounds CUDA.atomic_add!(pointer(shared, k + 3), ∇[3, i])
         end
         iter += 1
     end
     sync_threads()
     # loop to cover cases where nbins > nthreads
-    for iter in 1:(nbins - 1) ÷ id + 1
+    for iter in 1:(nbins-1)÷id+1
         bin_id = it + id * (iter - 1)
         if bin_id <= nbins
-            @inbounds k = Base._to_linear_index(hΎ1, 1, bin_id, j)
-            @inbounds CUDA.atomic_add!(pointer(hΎ1, k), shared[3 * (bin_id - 1) + 1])
-            @inbounds CUDA.atomic_add!(pointer(hΎ2, k), shared[3 * (bin_id - 1) + 2])
-            @inbounds CUDA.atomic_add!(pointer(h𝑤, k), shared[3 * (bin_id - 1) + 3])
+            @inbounds k = Base._to_linear_index(h, 1, bin_id, j) - 1
+            @inbounds CUDA.atomic_add!(pointer(h, k + kt), shared[3*(bin_id-1)+kt])
+            # @inbounds CUDA.atomic_add!(pointer(h, k), shared[3*(bin_id-1)+1])
+            # @inbounds CUDA.atomic_add!(pointer(h, k + 1), shared[3*(bin_id-1)+2])
+            # @inbounds CUDA.atomic_add!(pointer(h, k + 2), shared[3*(bin_id-1)+3])
         end
     end
-    # sync_threads()
-    return nothing
-end
-
-# base approach - block built along the cols first, the rows (limit collisions)
-function hist_gpu_s5!(hΎ1::AbstractArray{T,3}, hΎ2::AbstractArray{T,3}, h𝑤::AbstractMatrix{T},
-    Ύ1::AbstractMatrix{T}, Ύ2::AbstractMatrix{T}, 𝑤::AbstractVector{T},
-    id::AbstractMatrix{S}; MAX_THREADS=256) where {T,S}
-    thread_i = min(MAX_THREADS, size(id, 1))
-    thread_j = 1
-    threads = (thread_i, thread_j)
-    blocks = ceil.(Int, (16, size(id, 2)))
-    fill!(hΎ1, 0)
-    fill!(hΎ2, 0)
-    fill!(h𝑤, 0)
-    @cuda blocks = blocks threads = threads shmem = sizeof(T) * size(h𝑤, 1) * 3 kernel_s5!(hΎ1, hΎ2, h𝑤, Ύ1, Ύ2, 𝑤, id)
-    return
-end
-
-nbins = 32
-ncol = 100
-items = Int32(1e6)
-
-Ύ1 = CUDA.rand(Float32, items, 1)
-Ύ2 = CUDA.rand(Float32, items, 1)
-𝑤 = CUDA.rand(Float32, items)
-idx = UInt8.(rand(1:nbins, items, ncol))
-
-hΎ1 = CUDA.zeros(1, nbins, ncol)
-hΎ2 = CUDA.zeros(1, nbins, ncol)
-h𝑤 = CUDA.zeros(nbins, ncol)
-idx_gpu = CuArray(idx)
-
-@CUDA.time hist_gpu_s5!(hΎ1, hΎ2, h𝑤, Ύ1, Ύ2, 𝑤, idx_gpu, MAX_THREADS=128)
-@btime CUDA.@sync hist_gpu_s5!($hΎ1, $hΎ2, $h𝑤, $Ύ1, $Ύ2, $𝑤, $idx_gpu, MAX_THREADS=128)
-
-
-# base kernel
-function kernel_s6!(hΎ1::CuDeviceArray{T,3}, hΎ2::CuDeviceArray{T,3}, h𝑤::CuDeviceMatrix{T}, Ύ1::CuDeviceMatrix{T}, Ύ2::CuDeviceMatrix{T}, 𝑤::CuDeviceVector{T}, xid::CuDeviceMatrix{S}, 𝑖, 𝑗) where {T,S}
-
-    nbins = size(h𝑤, 1)
-    it, jt = threadIdx().x, threadIdx().y
-    ib, jb = blockIdx().x, blockIdx().y
-    id, jd = blockDim().x, blockDim().y
-    ig, jg = gridDim().x, gridDim().y
-    j = jt + (jb - 1) * jd
-
-    shared = @cuDynamicSharedMem(T, 3 * nbins)
-    fill!(shared, 0)
     sync_threads()
-
-    i_tot = size(𝑤, 1)
-    iter = 0
-    while iter * id * ig < i_tot
-        i = it + id * (ib - 1) + iter * id * ig
-        if i <= length(𝑖) && j <= length(𝑗)
-            # depends on shared to be assigned to a single feature
-            i_idx = 𝑖[i]
-            k = 3 * (xid[i_idx, 𝑗[j]] - 1)
-            @inbounds CUDA.atomic_add!(pointer(shared, k + 1), Ύ1[i_idx, 1])
-            @inbounds CUDA.atomic_add!(pointer(shared, k + 2), Ύ2[i_idx, 1])
-            @inbounds CUDA.atomic_add!(pointer(shared, k + 3), 𝑤[i_idx])
-        end
-        iter += 1
-    end
-    sync_threads()
-    # loop to cover cases where nbins > nthreads
-    for iter in 1:(nbins - 1) ÷ id + 1
-        bin_id = it + id * (iter - 1)
-        if bin_id <= nbins
-            @inbounds k = Base._to_linear_index(hΎ1, 1, bin_id, 𝑗[j])
-            @inbounds CUDA.atomic_add!(pointer(hΎ1, k), shared[3 * (bin_id - 1) + 1])
-            @inbounds CUDA.atomic_add!(pointer(hΎ2, k), shared[3 * (bin_id - 1) + 2])
-            @inbounds CUDA.atomic_add!(pointer(h𝑤, k), shared[3 * (bin_id - 1) + 3])
-        end
-    end
-    # sync_threads()
     return nothing
 end
 
 # base approach - block built along the cols first, the rows (limit collisions)
-function hist_gpu_s6!(hΎ1::AbstractArray{T,3}, hΎ2::AbstractArray{T,3}, h𝑤::AbstractMatrix{T},
-    Ύ1::AbstractMatrix{T}, Ύ2::AbstractMatrix{T}, 𝑤::AbstractVector{T},
-    id::AbstractMatrix{S}, 𝑖, 𝑗; MAX_THREADS=256) where {T,S}
-    thread_i = min(MAX_THREADS, size(id, 1))
+function hist_gpu_s4!(h::AbstractArray{T,3}, ∇::AbstractMatrix{T}, x_bin::AbstractMatrix{S}; MAX_THREADS=256) where {T,S}
+    thread_i = min(MAX_THREADS, size(x_bin, 1))
     thread_j = 1
-    threads = (thread_i, thread_j)
-    blocks = ceil.(Int, (16, size(id, 2)))
-    fill!(hΎ1, 0)
-    fill!(hΎ2, 0)
-    fill!(h𝑤, 0)
-    @cuda blocks = blocks threads = threads shmem = sizeof(T) * size(h𝑤, 1) * 3 kernel_s6!(hΎ1, hΎ2, h𝑤, Ύ1, Ύ2, 𝑤, id, 𝑖, 𝑗)
-    return
-end
-
-nbins = 32
-ncol = 100
-items = Int32(1e6)
-
-Ύ1 = CUDA.rand(Float32, items, 1)
-Ύ2 = CUDA.rand(Float32, items, 1)
-𝑤 = CUDA.rand(Float32, items)
-idx = UInt8.(rand(1:nbins, items, ncol))
-
-hΎ1 = CUDA.zeros(1, nbins, ncol)
-hΎ2 = CUDA.zeros(1, nbins, ncol)
-h𝑤 = CUDA.zeros(nbins, ncol)
-idx_gpu = CuArray(idx)
-
-𝑖 = CuArray(UInt32.(1:items))
-𝑗 = CuArray(UInt32.(1:ncol))
-
-hΎ1_cpu = reshape(Array(hΎ1), size(hΎ1)[2:3])
-hΎ2_cpu = reshape(Array(hΎ2), size(hΎ2)[2:3])
-h𝑤_cpu = Array(h𝑤)
-Ύ1_cpu = reshape(Array(Ύ1), :)
-Ύ2_cpu = reshape(Array(Ύ2), :)
-𝑤_cpu = Array(𝑤)
-𝑖_cpu = Array(𝑖)
-𝑗_cpu = Array(𝑗)
-idx_cpu = Array(idx_gpu)
-
-@time hist_cpu_3!(hΎ1_cpu, hΎ2_cpu, h𝑤_cpu, Ύ1_cpu, Ύ2_cpu, 𝑤_cpu, idx_cpu, 𝑖_cpu, 𝑗_cpu)
-@btime hist_cpu_3!(hΎ1_cpu, hΎ2_cpu, h𝑤_cpu, Ύ1_cpu, Ύ2_cpu, 𝑤_cpu, idx_cpu, 𝑖_cpu, 𝑗_cpu)
-
-@CUDA.time hist_gpu_s6!(hΎ1, hΎ2, h𝑤, Ύ1, Ύ2, 𝑤, idx_gpu, 𝑖, 𝑗, MAX_THREADS=128)
-@btime CUDA.@sync hist_gpu_s6!($hΎ1, $hΎ2, $h𝑤, $Ύ1, $Ύ2, $𝑤, $idx_gpu, $𝑖, $𝑗, MAX_THREADS=128)
-
-
-
-function update_hist_cpu!(hist_Ύ::Matrix{SVector{L,T}}, hist_Ύ²::Matrix{SVector{L,T}}, hist_𝑤::Matrix{SVector{1,T}},
-    Ύ::Vector{SVector{L,T}}, Ύ²::Vector{SVector{L,T}}, 𝑤::Vector{SVector{1,T}},
-    X_bin, 𝑖, 𝑗) where {L,T,S}
-
-    hist_Ύ .*= 0.0
-    hist_Ύ² .*= 0.0
-    hist_𝑤 .*= 0.0
-
-    @inbounds Threads.@threads for j in 𝑗
-        @inbounds for i in 𝑖
-            hist_Ύ[X_bin[i,j], j] += Ύ[i]
-            hist_Ύ²[X_bin[i,j], j] += Ύ²[i]
-            hist_𝑤[X_bin[i,j], j] += 𝑤[i]
-        end
-    end
-end
-
-hΎ1_cpu = SVector.(hΎ1_cpu)
-hΎ2_cpu = SVector.(hΎ2_cpu)
-h𝑤_cpu = SVector.(h𝑤_cpu)
-Ύ1_cpu = SVector.(Ύ1_cpu)
-Ύ2_cpu = SVector.(Ύ2_cpu)
-𝑤_cpu = SVector.(𝑤_cpu)
-# 𝑖_cpu = Array(𝑖)
-# 𝑗_cpu = Array(𝑗)
-# idx_cpu = Array(idx_gpu)
-
-@time update_hist_cpu!(hΎ1_cpu, hΎ2_cpu, h𝑤_cpu, Ύ1_cpu, Ύ2_cpu, 𝑤_cpu, idx_cpu, 𝑖_cpu, 𝑗_cpu)
-@btime update_hist_cpu!(hΎ1_cpu, hΎ2_cpu, h𝑤_cpu, Ύ1_cpu, Ύ2_cpu, 𝑤_cpu, idx_cpu, 𝑖_cpu, 𝑗_cpu)
-@btime update_hist_cpu!($hΎ1_cpu, $hΎ2_cpu, $h𝑤_cpu, $Ύ1_cpu, $Ύ2_cpu, $𝑤_cpu, $idx_cpu, $𝑖_cpu, $𝑗_cpu)
-
-
-##############################################################
-## Build histogram from a subsample idx
-# base kernel
-function kernel2!(h::CuDeviceMatrix{T}, x::CuDeviceMatrix{T}, id, 𝑖, 𝑗) where {T <: AbstractFloat}
-    i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
-    j = threadIdx().y + (blockIdx().y - 1) * blockDim().y
-    if i <= length(𝑖) && j <= length(𝑗)
-        @inbounds k = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 𝑗[j])
-        @inbounds CUDA.atomic_add!(pointer(h, k), x[𝑖[i], 𝑗[j]])
-    end
-    return
-end
-
-# base approach - block built along the cols first, the rows (limit collisions)
-function hist_gpu2!(h::CuMatrix{T}, x::CuMatrix{T}, id, 𝑖, 𝑗; MAX_THREADS=1024) where {T <: AbstractFloat}
-    thread_j = min(MAX_THREADS, length(𝑗))
-    thread_i = min(MAX_THREADS ÷ thread_j, length(𝑖))
-    threads = (thread_i, thread_j)
-    blocks = ceil.(Int, (length(𝑖), length(𝑗)) ./ threads)
-    # println("threads:", threads)
-    # println("blocks:", blocks)
-    CUDA.@sync begin
-        @cuda blocks = blocks threads = threads kernel2!(h, x, id, 𝑖, 𝑗)
-    end
-    return
-end
-
-hist = zeros(Float32, nbins, ncol)
-Ύ = rand(Float32, items, ncol)
-idx = rand(1:nbins, items, ncol)
-𝑖 = sample(1:items, items ÷ 2, replace=false, ordered=true)
-𝑗 = sample(1:ncol, ncol ÷ 2, replace=false, ordered=true)
-hist_gpu = CuArray(hist)
-Ύ_gpu = CuArray(Ύ)
-idx_gpu = CuArray(idx)
-𝑖_gpu = CuArray(𝑖)
-𝑗_gpu = CuArray(𝑗)
-
-@CUDA.time hist_gpu2!(hist_gpu, Ύ_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024)
-@btime hist_gpu2!($hist_gpu, $Ύ_gpu, $idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024)
-
-
-#############################################
-# test for SVector - basic test - success!
-function kernel!(x, y) - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - if i <= length(x) - # @inbounds x[i] += y[i] - k = Base._to_linear_index(x, i) - CUDA.atomic_add!(pointer(x, k), y[i]) - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu!(x, y; MAX_THREADS=1024) - thread_i = min(MAX_THREADS, length(x)) - threads = (thread_i) - blocks = ceil.(Int, length(x) .÷ threads) - CUDA.@sync begin - @cuda blocks = blocks threads = threads kernel!(x, y) - end - return -end - -x = rand(SVector{2,Float32}, Int(1e7)) -y = rand(SVector{2,Float32}, Int(1e7)) -x = rand(Float32, Int(1e7)) -y = rand(Float32, Int(1e7)) - -x_gpu = CuArray(x) -y_gpu = CuArray(y) - -@CuArrays.time hist_gpu!(x_gpu, y_gpu) -@btime hist_gpu!($x_gpu, $y_gpu) - - -############################################# -# test for SVector - real test -function kernelS2!(h, x, id) - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - if i <= size(id, 1) && j <= size(id, 2) - @inbounds k = Base._to_linear_index(h, id[i,j], j) - # @inbounds k = id[i,j] + 32 * (j-1) - # @inbounds CUDAnative.atomic_add!(pointer(h, k), x[i,j]) - # h[id[i,j],j] += x[i,j] - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpuS2!(h, x, id; MAX_THREADS=256) where {T} - thread_j = min(MAX_THREADS, size(id, 2)) - thread_i = min(MAX_THREADS ÷ thread_j, size(id, 1)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (size(id, 1), size(id, 2)) .÷ threads) - println("threads:", threads) - println("blocks:", blocks) - CUDA.@sync begin - @cuda blocks = blocks threads = threads kernelS2!(h, x, id) - end - return -end - -hist = zeros(SVector{2,Float32}, nbins, ncol) -ÎŽ = rand(SVector{2,Float32}, items, ncol) -idx = rand(1:nbins, items, ncol) -hist_gpu = CuArray(hist) -ÎŽ_gpu = CuArray(ÎŽ) -idx_gpu = CuArray(idx) - -@CuArrays.time hist_gpuS2!(hist_gpu, ÎŽ_gpu, idx_gpu) -@btime hist_gpuS2!($hist_gpu, $ÎŽ_gpu, $idx_gpu) - - -############################################################## -## Build histogram from a subsample idx -# accumulate all gradient single pass -function kernel3!(h, x, id, 𝑖, 𝑗) - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - if i <= length(𝑖) && j <= length(𝑗) - @inbounds k = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 𝑗[j], 1) - @inbounds CUDAnative.atomic_add!(pointer(h, k), x[𝑖[i], 𝑗[j], 1]) - @inbounds k = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 𝑗[j], 2) - @inbounds CUDAnative.atomic_add!(pointer(h, k), x[𝑖[i], 𝑗[j], 2]) - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu3!(h, x, id, 𝑖, 𝑗; MAX_THREADS=1024) - thread_j = min(MAX_THREADS, length(𝑗)) - thread_i = min(MAX_THREADS ÷ thread_j, length(𝑖)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (length(𝑖), length(𝑗)) ./ threads) - # println("threads:", threads) - # println("blocks:", blocks) - CuArrays.@sync begin - @cuda blocks = blocks threads = threads kernel3!(h, x, id, 𝑖, 𝑗) - end - return -end - -hist = zeros(Float32, nbins, ncol, 2) -ÎŽ = rand(Float32, items, ncol, 2) -idx = rand(1:nbins, items, ncol) -𝑖 = sample(1:items, items ÷ 2, replace=false, ordered=true) -𝑗 = sample(1:ncol, ncol ÷ 2, replace=false, ordered=true) -hist_gpu = CuArray(hist) -ÎŽ_gpu = CuArray(ÎŽ) -idx_gpu = CuArray(idx) -𝑖_gpu = CuArray(𝑖) -𝑗_gpu = CuArray(𝑗) - -@CuArrays.time 
hist_gpu3!(hist_gpu, ÎŽ_gpu, idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024) -@btime hist_gpu3!($hist_gpu, $ÎŽ_gpu, $idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024) - - - - -# accumulate in shared memory histograms -function kernel2!(h::CuDeviceMatrix{T}, x::CuDeviceMatrix{T}, id, nbins) where {T <: AbstractFloat} - tid = threadIdx().x - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - - # shared memory on block of length nbins - # To Do: nbins cannot be passed as argument - dynamic shared memory generate kernel through macro - # shared = CUDAnative.@cuStaticSharedMem(T, 32) - # fill!(shared, 0) - # sync_threads() - - # accumulate in per block histogram - # Why is the atomic add on shared much longer than atomic on h in global mem? - if i <= size(id, 1) && j <= size(id, 2) - # should be the legit way to go - 70ms - # @inbounds CUDAnative.atomic_add!(pointer(shared, id[i,j]), x[i,j]) - - # unsafe (collisions) on shared mem: 3.0ms - # @inbounds shared[id[i,j]] = x[i,j] - - # unsafe (collisions) add on global memory - 3.6ms - # @inbounds h[id[i,j],j] += x[i,j] - - # atomic add on global hist - 3.2ms - @inbounds k = id[i,j] + nbins * (j - 1) - @inbounds CUDA.atomic_add!(pointer(h, k), x[i,j]) - end - # sync_threads() - - # if blockIdx().x == 1 - # if tid <= nbins - # CUDA.atomic_add!(pointer(h,tid), shared[tid]) - # end - # end - return -end - -# shared memory - -function hist_gpu2!(h::CuMatrix{T}, x::CuMatrix{T}, id::CuMatrix{Int}, nbins; MAX_THREADS=256) where {T <: AbstractFloat} - # thread_i = min(MAX_THREADS, size(id, 1)) - # thread_j = min(MAX_THREADS ÷ thread_i, size(id, 2)) - thread_j = min(MAX_THREADS, size(id, 2)) - thread_i = min(MAX_THREADS ÷ thread_j, size(id, 1)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (size(id, 1), size(id, 2)) ./ threads) - CUDA.@sync begin - @cuda blocks = blocks threads = threads kernel2!(h, x, id, nbins) - end - return h -end - -@CuArrays.time hist_gpu2!(hist_gpu, ÎŽ_gpu, idx_gpu, 32, MAX_THREADS=1024) -@btime hist_gpu2!($hist_gpu, $ÎŽ_gpu, $idx_gpu, 32, MAX_THREADS=1024) -@device_code_warntype hist_gpu2!(hist_gpu, ÎŽ_gpu, idx_gpu, 32, MAX_THREADS=1024) - - - - -###################################### -# Appoach 1 -###################################### -# GPU - apply along the features axis -function kernel!(h::CuDeviceMatrix{T}, x::CuDeviceVector{T}, id, 𝑖, 𝑗) where {T <: AbstractFloat} - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - if i <= length(𝑖) && j <= length(𝑗) - @inbounds k = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 𝑗[j]) - @inbounds CUDA.atomic_add!(pointer(h, k), x[𝑖[i]]) - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu!(h::CuMatrix{T}, x::CuVector{T}, id, 𝑖, 𝑗; MAX_THREADS=1024) where {T <: AbstractFloat} - thread_j = min(MAX_THREADS, length(𝑗)) - thread_i = min(MAX_THREADS ÷ thread_j, length(𝑖)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (length(𝑖), length(𝑗)) ./ threads) - @cuda blocks = blocks threads = threads kernel!(h, x, id, 𝑖, 𝑗) - return -end - -hist = zeros(Float32, nbins, ncol) -ÎŽ = rand(Float32, items) -idx = rand(1:nbins, items, ncol) -𝑖 = sample(1:items, items ÷ 2, replace=false, ordered=true) -𝑗 = sample(1:ncol, ncol ÷ 2, replace=false, ordered=true) -hist_gpu = CuArray(hist) -ÎŽ_gpu = CuArray(ÎŽ) -idx_gpu = CuArray(idx) -𝑖_gpu = CuArray(𝑖) -𝑗_gpu = CuArray(𝑗) - -@CUDA.time hist_gpu!(hist_gpu, ÎŽ_gpu, idx_gpu, 𝑖_gpu, 
𝑗_gpu, MAX_THREADS=1024) -@btime hist_gpu!($hist_gpu, $ÎŽ_gpu, $idx_gpu, 𝑖_gpu, 𝑗_gpu, MAX_THREADS=1024) - - -###################################### -# Appoach 2 - Loop for assigning command grad to appropriate bin per column -# Idea: exploit the fact that there's a single grad per row: take that grad and add it to each column bin -###################################### -# GPU - apply along the features axis -function kernel!(h::CuDeviceMatrix{T}, x::CuDeviceVector{T}, id, 𝑖, 𝑗) where {T <: AbstractFloat} - i = threadIdx().x + (blockIdx().x - 1) * blockDim().x - j = threadIdx().y + (blockIdx().y - 1) * blockDim().y - if i <= length(𝑖) && j <= length(𝑗) - @inbounds k = Base._to_linear_index(h, id[𝑖[i], 𝑗[j]], 𝑗[j]) - @inbounds CUDAnative.atomic_add!(pointer(h, k), x[𝑖[i]]) - end - return -end - -# base approach - block built along the cols first, the rows (limit collisions) -function hist_gpu!(h::CuMatrix{T}, x::CuVector{T}, id::CuMatrix{UInt8}, 𝑖, 𝑗; MAX_THREADS=1024) where {T <: AbstractFloat} - thread_j = min(MAX_THREADS, length(𝑗)) - thread_i = min(MAX_THREADS ÷ thread_j, length(𝑖)) - threads = (thread_i, thread_j) - blocks = ceil.(Int, (length(𝑖), length(𝑗)) ./ threads) - @cuda blocks = blocks threads = threads kernel!(h, x, id, 𝑖, 𝑗) - return -end - - - - -using CUDA -using BenchmarkTools -N1 = Int(2^12) -x1 = rand(Float32, N1); -x2 = rand(Float32, N1); -x1g = CuArray(x1); -x2g = CuArray(x2); - -function dot_atomic!(x, y, z) - idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x - if idx <= length(x) - CUDA.atomic_add!(pointer(z, 1), x[idx] * y[idx]) - end - return nothing -end - -function bench_dot_atomic!(x, y, z, threads) - numblocks = ceil(Int, N1 / threads) - @cuda threads = threads blocks = numblocks dot_atomic!(x, y, z) -end - -threads = 512 -numblocks = ceil(Int, N1 / threads) -z0 = CUDA.zeros(1) -# @cuda threads=gthreads blocks=numblocks dot_atomic!(x1g, x2g, z0) -@btime CUDA.@sync bench_dot_atomic!($x1g, $x2g, $z0, threads) -# 17.323 ms (50 allocations: 1.67 KiB) - -function dot_share!(x::CuDeviceVector{T}, y::CuDeviceVector{T}, z::CuDeviceVector{T}) where {T} - - tid = threadIdx().x - idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x - shared = CUDA.@cuStaticSharedMem(T, 64) - fill!(shared, 0) - sync_threads() - - if idx <= length(x) - @inbounds shared[tid] = x[idx] * y[idx] - end - sync_threads() - - i = blockDim().x ÷ 2 - while i > 0 - if tid <= i - @inbounds shared[tid] += shared[tid + i] # valid non atomic operation - # CUDA.atomic_add!(pointer(shared, tid), shared[tid+i]) # invalid op - results in error - end - sync_threads() - i ÷= 2 - end - if tid == 1 - CUDA.atomic_add!(pointer(z, 1), shared[1]) - end - return nothing -end - -function bench_dot_share!(x, y, z, threads, numblocks) - CUDA.@sync @cuda threads = threads blocks = numblocks dot_share!(x, y, z) - return z -end - -function wrap_share(x, y, threads) - numblocks = ceil(Int, N1 / threads) - z = CUDA.zeros(1) - x = bench_dot_share!(x, y, z, threads, numblocks) - return x -end - -threads = 64 -numblocks = ceil(Int, N1 / threads) -z = CUDA.zeros(1) -@cuda threads = threads blocks = numblocks dot_share!(x1g, x2g, z) -@time CUDA.@sync wrap_share(x1g, x2g, threads) -@btime CUDA.@sync wrap_share($x1g, $x2g, threads) -x = CUDA.@sync wrap_share(x1g, x2g, threads) -x1g' * x2g -@btime x1g' * x2g -@btime x1' * x2 - - -function dot_share2!(x::CuDeviceVector{T}, z::CuDeviceVector{T}) where {T} - - tid = threadIdx().x - bid = blockIdx().x - idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x - shared = 
CUDA.@cuStaticSharedMem(T, 128) - fill!(shared, 0f0) - sync_threads() - - # if idx <= length(x) - # shared[tid] = x[idx] * y[idx] - # end - # sync_threads() - - i = blockDim().x ÷ 2 - while i > 0 - # if tid <= i - if tid == 1 - # shared[tid] += shared[tid + i] # valid non atomic operation - CUDA.atomic_add!(pointer(shared, tid), shared[tid + 1]) # invalid op - results in error - end - sync_threads() - i ÷= 2 - end - if tid == 1 - z[bid] += shared[1] - end - return nothing -end - -function bench_dot_share2!(x, z, threads, numblocks) - CUDA.@sync @cuda threads = threads blocks = numblocks dot_share2!(x, z) - return sum(z) -end - -function wrap_share2(x, threads) - numblocks = ceil(Int, N1 / threads) - z = CUDA.zeros(numblocks) - # x = bench_dot_share2!(x, z, threads, numblocks) - CUDA.@sync @cuda threads = threads blocks = numblocks dot_share2!(x, z) - return(x) -end - -threads = 128 -numblocks = ceil(Int, N1 / threads) -z = CUDA.zeros(numblocks) -sum(z) -x1g' * x2g -CUDA.@sync @cuda threads = threads blocks = numblocks dot_share2!(x1g, x2g, z) -@btime CUDA.@sync wrap_share2($x1g, $x2g, threads) -x = CUDA.@sync wrap_share2(x1g, x2g, threads) - -@btime x1g' * x2g - - -using CUDA -function kernel(x) - shared = @cuStaticSharedMem(Float32, 2) - fill!(shared, 1f0) - sync_threads() - # @atomic shared[threadIdx().x] += 0f0 - tid = threadIdx().x - CUDA.atomic_add!(pointer(shared, tid), shared[tid + 1]) - CUDA.atomic_add!(pointer(x, 1), shared[1]) - return -end - -x = CUDA.zeros(1) -@cuda kernel(x) -synchronize() - - -using CUDA -function kernel2(x, y) - tid = threadIdx().x - shared = @cuStaticSharedMem(Float32, 4) - fill!(shared, 1f0) - sync_threads() - i = Int32(2) - if i > 0 - CUDA.atomic_add!(pointer(shared, tid), shared[tid + 1]) - sync_threads() - # i ÷= 2 - end - sync_threads() - CUDA.atomic_add!(pointer(x, 1), shared[1]) - return -end - -x = CUDA.zeros(4) -y = CUDA.zeros(1) -@cuda threads = 2 kernel2(x, y) -synchronize() - - - -using CUDA -function kernel1(x, y) - tid = threadIdx().x - shared = @cuStaticSharedMem(Float32, 4) - fill!(shared, 1f0) - sync_threads() - i = Int32(2) - if i > 0 - CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2]) - sync_threads() - i ÷= 2 - end - CUDA.atomic_add!(pointer(x, 1), shared[1]) - return -end - -x = CUDA.zeros(4) -y = CUDA.zeros(1) -@cuda threads = 2 kernel1(x, y) -x -synchronize() - - - -using CUDA -function kernel2(x, y) - tid = threadIdx().x - shared = @cuStaticSharedMem(Float32, 4) - fill!(shared, 1f0) - sync_threads() - i = Int32(2) - while i > 0 - CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2]) - sync_threads() - i ÷= 2 - end - sync_threads() - CUDA.atomic_add!(pointer(x, 1), shared[1]) - return -end - -x = CUDA.zeros(4) -y = CUDA.zeros(1) -@cuda threads = 2 kernel2(x, y) -x -synchronize() - - -using CUDA -function kernel3(x) - tid = threadIdx().x - shared = @cuStaticSharedMem(Float32, 4) - fill!(shared, 1f0) - sync_threads() - CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2]) - sync_threads() - CUDA.atomic_add!(pointer(x, 1), shared[1]) - return -end - -x = CUDA.zeros(4) -@cuda threads = 2 kernel3(x) -x -synchronize() - -using CUDA -function kernel4(x) - tid = threadIdx().x - shared = @cuStaticSharedMem(Float32, 4) - fill!(shared, 1f0) - sync_threads() - CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2]) - sync_threads() - CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2]) - sync_threads() - CUDA.atomic_add!(pointer(x, 1), shared[1]) + thread_k = 3 + threads = (thread_i, thread_j, thread_k) + blocks = ceil.(Int, (16, 
size(x_bin, 2)))
+    fill!(h, 0)
+    @cuda blocks = blocks threads = threads shmem = sizeof(T) * size(h, 2) * 3 kernel_s4!(h, ∇, x_bin)
+    CUDA.synchronize()
     return
 end
 
-x = CUDA.zeros(4)
-@cuda threads = 2 kernel4(x)
-x
-synchronize()
\ No newline at end of file
+nbins = 64
+nfeats = 100
+nobs = Int(1e6)
+h = [zeros(Float32, 3, nbins) for feat in 1:nfeats];
+x_bin = UInt8.(rand(1:nbins, nobs, nfeats));
+∇_cpu = rand(Float32, 3, nobs);
+h∇_cpu = zeros(Float32, 3, nbins, nfeats)
+rowsample = 0.5
+colsample = 0.5
+is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true)
+js = sample(1:nfeats, Int(round(colsample * nfeats)), replace=false, ordered=true)
+
+∇_gpu = CuArray(∇_cpu)
+x_bin_gpu = CuArray(x_bin)
+h∇_gpu = CuArray(h∇_cpu)
+is_gpu = CuArray(is)
+js_gpu = CuArray(js)
+
+@time hist_gpu_s4!(h∇_gpu, ∇_gpu, x_bin_gpu)
+CUDA.@time hist_gpu_s4!(h∇_gpu, ∇_gpu, x_bin_gpu)
+# desktop | 1K: 41.102 Όs (24 allocations: 1.66 KiB)
+# desktop | 10K: 59.142 Όs (109 allocations: 9.09 KiB)
+# desktop | 100K: 251.850 Όs (109 allocations: 9.09 KiB)
+# desktop | 1M: 2.203 ms (23 allocations: 1.33 KiB)
+# desktop | 10M: 25.557 ms (110 allocations: 9.11 KiB)
+@btime hist_gpu_s4!(h∇_gpu, ∇_gpu, x_bin_gpu)
diff --git a/experiments/hist/hist_gpu-debug-convert.jl b/experiments/hist/hist_gpu-debug-convert.jl
deleted file mode 100644
index a92bc34f..00000000
--- a/experiments/hist/hist_gpu-debug-convert.jl
+++ /dev/null
@@ -1,26 +0,0 @@
-using Revise
-using CUDA
-using StatsBase: sample
-using BenchmarkTools
-using Random: seed!
-
-function convert_kernel!(x, y)
-    i = threadIdx().x
-    if i <= length(x)
-        val = Float64(y[i])
-        CUDA.atomic_add!(pointer(x, i), val)
-        # CUDA.atomic_add!(pointer(x, i), y[i])
-    end
-    sync_threads()
-    return nothing
-end
-
-function convert_gpu!(x, y)
-    @cuda threads = length(x) blocks = (1,) convert_kernel!(x, y)
-    CUDA.synchronize()
-    return nothing
-end
-
-x = CUDA.zeros(Float32, 3)
-y = CUDA.rand(Float32, 3)
-convert_gpu!(x, y)
diff --git a/experiments/hist/hist_gpu-df-debug.jl b/experiments/hist/hist_gpu-df-debug.jl
deleted file mode 100644
index e0e3450a..00000000
--- a/experiments/hist/hist_gpu-df-debug.jl
+++ /dev/null
@@ -1,71 +0,0 @@
-using Revise
-using CUDA
-using StatsBase: sample
-using BenchmarkTools
-using Random: seed!
- -function hist_kernel!(h∇::CuDeviceArray, ∇::CuDeviceMatrix, x_bin::CuDeviceMatrix, is::CuDeviceVector, js::CuDeviceVector) - k, tiy, tiz = threadIdx().x, threadIdx().y, threadIdx().z - bdx, bdy = blockDim().z, blockDim().y - bix, biy = blockIdx().x, blockIdx().y - gdx = gridDim().x - - j = tiy + bdy * (biy - 1) - if j <= length(js) - jdx = js[j] - i_max = length(is) - niter = cld(i_max, bdx * gdx) - @inbounds for iter = 1:niter - i = tiz + bdx * (bix - 1) + bdx * gdx * (iter - 1) - if i <= i_max - @inbounds idx = is[i] - @inbounds bin = x_bin[idx, jdx] - hid = Base._to_linear_index(h∇, k, bin, jdx) - CUDA.atomic_add!(pointer(h∇, hid), ∇[k, idx]) - end - end - end - sync_threads() - return nothing -end - -function update_hist_gpu!(h∇, ∇, x_bin, is, js) - # kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js) - # config = launch_configuration(kernel.fun) - # max_threads = config.threads ÷ 4 - # max_blocks = config.blocks * 4 - max_threads = 256 - max_blocks = 16 - tx = size(h∇, 1) - ty = max(1, min(length(js), fld(max_threads, tx))) - tz = max(1, min(length(is), fld(max_threads, tx * ty))) - threads = (tx, ty, tz) - by = cld(length(js), ty) - bz = min(cld(max_blocks, by), cld(length(is), tz)) - blocks = (bz, by, 1) - h∇ .= 0 - @info "threads blocks" threads blocks - @cuda threads=threads blocks=blocks hist_kernel!(h∇, ∇, x_bin, is, js) - CUDA.synchronize() - return nothing -end - -seed!(123) -nbins = 32 -nfeats = 2 -nobs = Int(1e5) -x_bin = UInt8.(rand(1:nbins, nobs, nfeats)); -∇ = rand(Float32, 3, nobs); -h∇ = zeros(Float32, 3, nbins, nfeats) -rowsample = 0.5 -is = 1:nobs -js = 1:nfeats - -∇_gpu = CuArray(∇) -x_bin_gpu = CuArray(x_bin) -h∇_gpu = CuArray(h∇) -is_gpu = CuArray(is) -js_gpu = CuArray(js) - -CUDA.allowscalar(false) -update_hist_gpu!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu) diff --git a/experiments/hist/hist_gpu-df-ori copy.jl b/experiments/hist/hist_gpu-df-ori copy.jl deleted file mode 100644 index 7728191f..00000000 --- a/experiments/hist/hist_gpu-df-ori copy.jl +++ /dev/null @@ -1,75 +0,0 @@ -using Revise -using CUDA -using StatsBase: sample -using BenchmarkTools - -""" - hist_kernel! 
-""" -function hist_kernel!(h∇, ∇, x_bin, is, js) - tix, tiy = threadIdx().x, threadIdx().y - bdx, bdy = blockDim().x, blockDim().y - bix, biy = blockIdx().x, blockIdx().y - gdx = gridDim().x - - j = tiy + bdy * (biy - 1) - if j <= length(js) - jdx = js[j] - i_max = length(is) - niter = cld(i_max, bdx * gdx) - @inbounds for iter = 1:niter - i = tix + bdx * (bix - 1) + bdx * gdx * (iter - 1) - if i <= i_max - @inbounds idx = is[i] - @inbounds bin = x_bin[idx, jdx] - hid = Base._to_linear_index(h∇, 1, bin, jdx) - CUDA.atomic_add!(pointer(h∇, hid), ∇[1, idx]) - CUDA.atomic_add!(pointer(h∇, hid+1), ∇[2, idx]) - CUDA.atomic_add!(pointer(h∇, hid+2), ∇[3, idx]) - end - end - end - sync_threads() - return nothing -end - -function update_hist_gpu!(h∇, ∇, x_bin, is, js) - kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js) - config = launch_configuration(kernel.fun) - # @info "config.blocks" config.blocks - max_threads = config.threads - max_blocks = config.blocks - ty = max(1, min(length(js), max_threads)) - tx = max(1, min(length(is), fld(max_threads, ty))) - threads = (tx, ty, 1) - by = cld(length(js), ty) - bx = min(cld(max_blocks, by), cld(length(is), tx)) - blocks = (bx, by, 1) - kernel(h∇, ∇, x_bin, is, js; threads, blocks) - CUDA.synchronize() - return nothing -end - -nbins = 32 -nfeats = 100 -nobs = Int(1e6) -hist = zeros(Float32, nbins, nfeats); -x_bin = UInt8.(rand(1:nbins, nobs, nfeats)); -∇ = rand(Float32, 3, nobs); -h∇ = rand(Float32, 3, nbins, nfeats) -rowsample = 0.5 -colsample = 0.5 -is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true) -js = sample(1:nfeats, Int(round(rowsample * nfeats)), replace=false, ordered=true) - -hist_gpu = CuArray(hist) -∇_gpu = CuArray(∇) -x_bin_gpu = CuArray(x_bin) -h∇_gpu = CuArray(h∇) -is_gpu = CuArray(is) -js_gpu = CuArray(js) - -@time update_hist_gpu!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu) -CUDA.@time update_hist_gpu!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu) -@time CUDA.@sync update_hist_gpu!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu) -@btime update_hist_gpu!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu) \ No newline at end of file diff --git a/experiments/hist/hist_gpu-df-ori.jl b/experiments/hist/hist_gpu-df-ori.jl deleted file mode 100644 index 47981bc8..00000000 --- a/experiments/hist/hist_gpu-df-ori.jl +++ /dev/null @@ -1,217 +0,0 @@ -using Revise -using CUDA -using StatsBase: sample -using BenchmarkTools -using Random: seed! - -""" - hist_kernel! 
-""" -function hist_kernel!(h∇, ∇, x_bin, is, js) - tix, tiy, k = threadIdx().x, threadIdx().y, threadIdx().z - bdx, bdy = blockDim().x, blockDim().y - bix, biy = blockIdx().x, blockIdx().y - gdx = gridDim().x - - j = tiy + bdy * (biy - 1) - if j <= length(js) - jdx = js[j] - i_max = length(is) - niter = cld(i_max, bdx * gdx) - @inbounds for iter = 1:niter - i = tix + bdx * (bix - 1) + bdx * gdx * (iter - 1) - if i <= i_max - @inbounds idx = is[i] - @inbounds bin = x_bin[idx, jdx] - hid = Base._to_linear_index(h∇, k, bin, jdx) - CUDA.atomic_add!(pointer(h∇, hid), ∇[k, idx]) - end - end - end - sync_threads() - return nothing -end - -function update_hist_gpu!(h∇, ∇, x_bin, is, js) - kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js) - config = launch_configuration(kernel.fun) - # @info "config.blocks" config.blocks - max_threads = config.threads ÷ 4 - max_blocks = config.blocks * 4 - @assert size(h∇, 1) <= max_threads "number of classes cannot be larger than 31 on GPU" - tz = min(64, size(h∇, 1)) - ty = max(1, min(length(js), fld(max_threads, tz))) - tx = max(1, min(length(is), fld(max_threads, tz * ty))) - threads = (tx, ty, tz) - by = cld(length(js), ty) - bx = min(cld(max_blocks, by), cld(length(is), tx)) - blocks = (bx, by, 1) - kernel(h∇, ∇, x_bin, is, js; threads, blocks) - CUDA.synchronize() - return nothing -end - -seed!(123) -nbins = 32 -nfeats = 100 -nobs = Int(1e6) - -x_bin = UInt8.(rand(1:nbins, nobs, nfeats)); -∇ = rand(Float32, 3, nobs); -h∇ = zeros(Float32, 3, nbins, nfeats) -rowsample = 0.5 -colsample = 0.5 -is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true) -js = sample(1:nfeats, Int(round(rowsample * nfeats)), replace=false, ordered=true) - -x_bin_gpu = CuArray(x_bin) -∇_gpu = CuArray(∇) -h∇_gpu = CuArray(h∇) -is_gpu = CuArray(is) -js_gpu = CuArray(js) - -@time update_hist_gpu!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu) -CUDA.@time update_hist_gpu!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu) -@time CUDA.@sync update_hist_gpu!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu) -@btime update_hist_gpu!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu) - - -""" - hist_kernel! 
- revserse k and tix thread order -""" -function hist_kernel!(h∇, ∇, x_bin, is, js) - tix, tiy, k = threadIdx().z, threadIdx().y, threadIdx().x - bdx, bdy = blockDim().z, blockDim().y - bix, biy = blockIdx().z, blockIdx().y - gdx = gridDim().z - - j = tiy + bdy * (biy - 1) - if j <= length(js) - jdx = js[j] - i_max = length(is) - niter = cld(i_max, bdx * gdx) - @inbounds for iter = 1:niter - i = tix + bdx * (bix - 1) + bdx * gdx * (iter - 1) - if i <= i_max - @inbounds idx = is[i] - @inbounds bin = x_bin[idx, jdx] - hid = Base._to_linear_index(h∇, k, bin, jdx) - CUDA.atomic_add!(pointer(h∇, hid), ∇[k, idx]) - end - end - end - sync_threads() - return nothing -end - -function update_hist_gpu!(h∇, ∇, x_bin, is, js) - kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js) - config = launch_configuration(kernel.fun) - # @info "config.blocks" config.blocks - max_threads = config.threads ÷ 4 - max_blocks = config.blocks * 4 - tz = size(h∇, 1) - ty = max(1, min(length(js), fld(max_threads, tz))) - tx = max(1, min(length(is), fld(max_threads, tz * ty))) - threads = (tz, ty, tx) - by = cld(length(js), ty) - bx = min(cld(max_blocks, by), cld(length(is), tx)) - blocks = (1, by, bx) - kernel(h∇, ∇, x_bin, is, js; threads, blocks) - CUDA.synchronize() - return nothing -end - -nbins = 32 -nfeats = 100 -nobs = Int(1e6) -x_bin = UInt8.(rand(1:nbins, nobs, nfeats)); -∇ = rand(Float32, 3, nobs); -h∇ = zeros(Float32, 3, nbins, nfeats) -rowsample = 0.5 -colsample = 0.5 -is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true) -js = sample(1:nfeats, Int(round(rowsample * nfeats)), replace=false, ordered=true) - -x_bin_gpu = CuArray(x_bin) -∇_gpu = CuArray(∇) -h∇_gpu = CuArray(h∇) -is_gpu = CuArray(is) -js_gpu = CuArray(js) - -@time update_hist_gpu!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu) -CUDA.@time update_hist_gpu!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu) -@time CUDA.@sync update_hist_gpu!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu) -@btime update_hist_gpu!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu) - - - -""" - revserse tix and tij thread order - 2.4 ms, no significant speed difference with previous k, j i order -""" -function hist_kernel!(h∇, ∇, x_bin, is, js) - k, tix, tiy = threadIdx().x, threadIdx().y, threadIdx().z - bdx, bdy = blockDim().y, blockDim().z - bix, biy = blockIdx().y, blockIdx().z - gdx = gridDim().y - - j = tiy + bdy * (biy - 1) - if j <= length(js) - jdx = js[j] - i_max = length(is) - niter = cld(i_max, bdx * gdx) - @inbounds for iter = 1:niter - i = tix + bdx * (bix - 1) + bdx * gdx * (iter - 1) - if i <= i_max - @inbounds idx = is[i] - @inbounds bin = x_bin[idx, jdx] - hid = Base._to_linear_index(h∇, k, bin, jdx) - CUDA.atomic_add!(pointer(h∇, hid), ∇[k, idx]) - end - end - end - sync_threads() - return nothing -end - -function update_hist_gpu!(h∇, ∇, x_bin, is, js) - kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js) - config = launch_configuration(kernel.fun) - # @info "config.blocks" config.blocks - max_threads = config.threads ÷ 4 - max_blocks = config.blocks * 4 - tk = size(h∇, 1) - tj = max(1, min(length(js), fld(max_threads, tk))) - ti = max(1, min(length(is), fld(max_threads, tk * tj))) - threads = (tk, ti, tj) - bj = cld(length(js), tj) - bi = min(cld(max_blocks, bj), cld(length(is), ti)) - blocks = (1, bi, bj) - kernel(h∇, ∇, x_bin, is, js; threads, blocks) - CUDA.synchronize() - return nothing -end - -nbins = 32 -nfeats = 100 -nobs = Int(1e6) -x_bin = UInt8.(rand(1:nbins, nobs, nfeats)); -∇ = rand(Float32, 3, nobs); -h∇ = 
zeros(Float32, 3, nbins, nfeats) -rowsample = 0.5 -colsample = 0.5 -is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true) -js = sample(1:nfeats, Int(round(rowsample * nfeats)), replace=false, ordered=true) - -x_bin_gpu = CuArray(x_bin) -∇_gpu = CuArray(∇) -h∇_gpu = CuArray(h∇) -is_gpu = CuArray(is) -js_gpu = CuArray(js) - -@time update_hist_gpu!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu) -CUDA.@time update_hist_gpu!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu) -@time CUDA.@sync update_hist_gpu!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu) -@btime update_hist_gpu!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu) diff --git a/experiments/hist/hist_gpu-df-ori2.jl b/experiments/hist/hist_gpu-df-ori2.jl deleted file mode 100644 index 4d861dfd..00000000 --- a/experiments/hist/hist_gpu-df-ori2.jl +++ /dev/null @@ -1,133 +0,0 @@ -using Revise -using CUDA -using StatsBase: sample -using BenchmarkTools -using Random: seed! - -""" - hist_kernel! -""" -function hist_kernel!(h∇, ∇, x_bin, is, js) - tix, tiy, k = threadIdx().x, threadIdx().y, threadIdx().z - bdx, bdy = blockDim().x, blockDim().y - bix, biy = blockIdx().x, blockIdx().y - gdx = gridDim().x - - j = tiy + bdy * (biy - 1) - if j <= length(js) - jdx = js[j] - i_max = length(is) - niter = cld(i_max, bdx * gdx) - @inbounds for iter = 1:niter - i = tix + bdx * (bix - 1) + bdx * gdx * (iter - 1) - if i <= i_max - @inbounds idx = is[i] - @inbounds bin = x_bin[idx, jdx] - hid = Base._to_linear_index(h∇, k, bin, jdx) - CUDA.atomic_add!(pointer(h∇, hid), ∇[k, idx]) - end - end - end - sync_threads() - return nothing -end - -function update_hist_gpu2!(h, h∇, ∇, x_bin, is, js, jsc) - kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js) - config = launch_configuration(kernel.fun) - # @info "config.threads" config.threads - # @info "config.blocks" config.blocks - max_threads = config.threads ÷ 4 - max_blocks = config.blocks * 4 - @assert size(h∇, 1) <= max_threads "number of classes cannot be larger than 31 on GPU" - tz = min(64, size(h∇, 1)) - ty = max(1, min(length(js), fld(max_threads, tz))) - tx = max(1, min(length(is), fld(max_threads, tz * ty))) - threads = (tx, ty, tz) - by = cld(length(js), ty) - bx = min(cld(max_blocks, by), cld(length(is), tx)) - blocks = (bx, by, 1) - h∇ .= 0 - kernel(h∇, ∇, x_bin, is, js; threads, blocks) - CUDA.synchronize() - # @inbounds for j in jsc - # copyto!(h[j], view(h∇, :, :, j)) - # end - @inbounds for j in jsc - copyto!(h[j], view(h∇, :, :, j)) - end - # CUDA.synchronize() - return nothing -end - -seed!(123) -nbins = 32 -nfeats = 100 -nobs = Int(1e6) -x_bin = UInt8.(rand(1:nbins, nobs, nfeats)); -∇ = rand(Float32, 3, nobs); -h∇ = zeros(Float32, 3, nbins, nfeats) -h = [h∇[:,:,j] for j in axes(h∇, 3)] -rowsample = 0.5 -colsample = 0.5 -is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true) -js = sample(1:nfeats, Int(round(colsample * nfeats)), replace=false, ordered=true) - -∇_gpu = CuArray(∇) -x_bin_gpu = CuArray(x_bin) -h∇_gpu = CuArray(h∇) -is_gpu = CuArray(is) -js_gpu = CuArray(js) - -CUDA.allowscalar(false) -@time update_hist_gpu2!(h, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js) -CUDA.@time update_hist_gpu2!(h, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js) -@time CUDA.@sync update_hist_gpu2!(h, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js) -@btime update_hist_gpu2!(h, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js) - - - - -seed!(123) -nbins = 32 -nfeats = 100 -nobs = Int(1e6) -x_bin = UInt8.(rand(1:nbins, nobs, nfeats)); -∇ = rand(Float32, 3, nobs); -h∇ = 
zeros(Float32, 3, nbins, nfeats) -h = [h∇[:,:,j] for j in axes(h∇, 3)] -rowsample = 0.5 -colsample = 0.5 -is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true) -js = sample(1:nfeats, Int(round(colsample * nfeats)), replace=false, ordered=true) - -∇_gpu = CuArray(∇) -x_bin_gpu = CuArray(x_bin) -h∇_gpu = CuArray(h∇) -is_gpu = CuArray(is) -js_gpu = CuArray(js) - -EvoTrees.update_hist_gpu!(h, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js) - - - -seed!(123) -nbins = 32 -nfeats = 1 -nobs = Int(1e6) -x_bin = UInt8.(rand(1:nbins, nobs, nfeats)); -∇ = rand(Float32, 3, nobs); -h∇ = zeros(Float32, 3, nbins, nfeats) -h = [h∇[:,:,j] for j in axes(h∇, 3)] -rowsample = 0.5 -colsample = 1.0 -is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true) -js = sample(1:nfeats, Int(round(colsample * nfeats)), replace=false, ordered=true) - -∇_gpu = CuArray(∇) -x_bin_gpu = CuArray(x_bin) -h∇_gpu = CuArray(h∇) -is_gpu = CuArray(is) -js_gpu = CuArray(js) - -EvoTrees.update_hist_gpu!(h, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js) diff --git a/experiments/hist/hist_gpu-df-vec.jl b/experiments/hist/hist_gpu-df-vec.jl deleted file mode 100644 index 5d36503d..00000000 --- a/experiments/hist/hist_gpu-df-vec.jl +++ /dev/null @@ -1,122 +0,0 @@ -using Revise -using CUDA -using StatsBase: sample -using BenchmarkTools -using Base.Threads: @threads -using Random: seed! - -""" - hist_kernel! -""" -function hist_kernel_vec!(h∇, ∇, x_bin, is) - tix, k = threadIdx().x, threadIdx().y - bdx = blockDim().x - bix = blockIdx().x - gdx = gridDim().x - - i_max = length(is) - niter = cld(i_max, bdx * gdx) - @inbounds for iter in 1:niter - i = tix + bdx * (bix - 1) + bdx * gdx * (iter - 1) - if i <= i_max - @inbounds idx = is[i] - @inbounds bin = x_bin[idx] - hid = Base._to_linear_index(h∇, k, bin) - CUDA.atomic_add!(pointer(h∇, hid), ∇[k, idx]) - end - end - CUDA.sync_threads() - return nothing -end - -function update_hist_gpu_vec!(h∇, ∇, x_bin, is, js) - kernel = @cuda launch = false hist_kernel_vec!(h∇[js[1]], ∇, view(x_bin, :, js[1]), is) - config = launch_configuration(kernel.fun) - max_threads = config.threads - max_blocks = config.blocks - ty = size(h∇[1], 1) - tx = max(1, min(length(is), fld(max_threads, ty))) - threads = (tx, ty, 1) - bx = min(max_blocks, cld(length(is), tx)) - blocks = (bx, 1, 1) - @sync for j in js - @async h∇[j] .= 0 - end - CUDA.synchronize() - @sync for j in js - @async kernel(h∇[j], ∇, view(x_bin, :, j), is; threads, blocks) - end - CUDA.synchronize() - return nothing -end - -seed!(123) -nbins = 32 -nfeats = 100 -nobs = Int(1e6) -x_bin = UInt8.(rand(1:nbins, nobs, nfeats)); -∇ = rand(Float32, 3, nobs); -h∇ = [zeros(Float32, 3, nbins) for n in 1:nfeats] -rowsample = 0.5 -colsample = 0.5 -is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true) -js = sample(1:nfeats, Int(round(rowsample * nfeats)), replace=false, ordered=true) - -∇_gpu = CuArray(∇) -x_bin_gpu = CuArray(x_bin) -h∇_gpu = CuArray.(h∇) -is_gpu = CuArray(is) -js_gpu = CuArray(js) - -CUDA.allowscalar(false) -@time update_hist_gpu_vec!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js) -@time CUDA.@sync update_hist_gpu_vec!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js) -@btime CUDA.@sync update_hist_gpu_vec!(h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js) - -function update_hist_gpu_cpu!(h, h∇, ∇, x_bin, is, js) - kernel = @cuda launch = false hist_kernel_vec!(h∇[js[1]], ∇, view(x_bin, :, js[1]), is) - config = launch_configuration(kernel.fun) - @info "config.threads" config.threads - @info "config.blocks" 
config.blocks
-    max_threads = config.threads
-    max_blocks = config.blocks
-    ty = size(h∇[1], 1)
-    tx = max(1, min(length(is), fld(max_threads, ty)))
-    threads = (tx, ty, 1)
-    bx = min(max_blocks, cld(length(is), tx))
-    blocks = (bx, 1, 1)
-    CUDA.@sync for j in js
-        @async h∇[j] .= 0
-    end
-    CUDA.synchronize()
-    CUDA.@sync for j in js
-        # @async kernel(h∇[j], ∇, view(x_bin, :, j), is; threads, blocks)
-        kernel(h∇[j], ∇, view(x_bin, :, j), is; threads, blocks)
-    end
-    CUDA.synchronize()
-    return nothing
-end
-function copy_gpu_cpu!(h, h∇, js)
-    for j in js
-        # @info "j" j
-        copyto!(h[j], h∇[j])
-    end
-    CUDA.synchronize()
-    return nothing
-end
-function combine!(h, h∇, ∇, x_bin, is, js)
-    CUDA.@sync update_hist_gpu_cpu!(h, h∇, ∇, x_bin, is, js)
-    CUDA.synchronize()
-    CUDA.@sync copy_gpu_cpu!(h, h∇, js)
-    CUDA.synchronize()
-    return nothing
-end
-@time CUDA.@sync update_hist_gpu_cpu!(h∇, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js)
-@time CUDA.@sync copy_gpu_cpu!(h∇, h∇_gpu, js)
-
-@btime CUDA.@sync update_hist_gpu_cpu!(h∇, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js)
-@btime CUDA.@sync copy_gpu_cpu!(h∇, h∇_gpu, js)
-
-@time combine!(h∇, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js)
-@time CUDA.@sync update_hist_gpu_cpu!(h∇, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js)
-@btime CUDA.@sync update_hist_gpu_cpu!($h∇, $h∇_gpu, $∇_gpu, $x_bin_gpu, $is_gpu, $js)
diff --git a/experiments/hist/hist_gpu-ref.jl b/experiments/hist/hist_gpu-ref.jl
new file mode 100644
index 00000000..a56263a1
--- /dev/null
+++ b/experiments/hist/hist_gpu-ref.jl
@@ -0,0 +1,83 @@
+using Revise
+using CUDA
+using StatsBase: sample
+using BenchmarkTools
+
+"""
+    hist_kernel!
+Accumulate the sampled gradients ∇ into the histogram h∇ with atomic adds; threadIdx().x spans the 3 gradient statistics.
+"""
+function hist_kernel!(h∇::CuDeviceArray{T,3}, ∇::CuDeviceMatrix{S}, x_bin, is, js) where {T,S}
+    tix, tiy, k = threadIdx().z, threadIdx().y, threadIdx().x
+    bdx, bdy = blockDim().z, blockDim().y
+    bix, biy = blockIdx().z, blockIdx().y
+    gdx = gridDim().z
+
+    j = tiy + bdy * (biy - 1)
+    if j <= length(js)
+        jdx = js[j]
+        i_max = length(is)
+        niter = cld(i_max, bdx * gdx)
+        @inbounds for iter = 1:niter
+            i = tix + bdx * (bix - 1) + bdx * gdx * (iter - 1)
+            if i <= i_max
+                @inbounds idx = is[i]
+                @inbounds bin = x_bin[idx, jdx]
+                hid = Base._to_linear_index(h∇, k, bin, jdx)
+                CUDA.atomic_add!(pointer(h∇, hid), T(∇[k, idx]))
+            end
+        end
+    end
+    sync_threads()
+    return nothing
+end
+
+function update_hist_gpu!(h, h∇_cpu, h∇, ∇, x_bin, is, js, jsc)
+    kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js)
+    config = launch_configuration(kernel.fun)
+    max_threads = config.threads
+    max_blocks = config.blocks
+    k = size(h∇, 1)
+    ty = max(1, min(length(js), fld(max_threads, k)))
+    tx = max(1, min(length(is), fld(max_threads, k * ty)))
+    threads = (k, ty, tx)
+    by = cld(length(js), ty)
+    bx = min(cld(max_blocks, by), cld(length(is), tx))
+    blocks = (1, by, bx)
+    h∇ .= 0
+    kernel(h∇, ∇, x_bin, is, js; threads, blocks)
+    CUDA.synchronize()
+    copyto!(h∇_cpu, h∇)
+    Threads.@threads for j in jsc
+        nbins = size(h[j], 2)
+        @views h[j] .= h∇_cpu[:, 1:nbins, j]
+    end
+    return nothing
+end
+
+nbins = 64
+nfeats = 100
+nobs = Int(1e3)
+h = [zeros(Float32, 3, nbins) for feat in 1:nfeats];
+x_bin = UInt8.(rand(1:nbins, nobs, nfeats));
+∇_cpu = rand(Float32, 3, nobs);
+h∇_cpu = rand(Float32, 3, nbins, nfeats)
+rowsample = 0.5
+colsample = 0.5
+is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true)
+js = sample(1:nfeats, Int(round(colsample * nfeats)), replace=false, ordered=true)
+
+∇_gpu = CuArray(∇_cpu)
+x_bin_gpu = CuArray(x_bin)
+h∇_gpu = CuArray(h∇_cpu)
+is_gpu = CuArray(is)
+js_gpu = CuArray(js)
+
+@time update_hist_gpu!(h, h∇_cpu, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js)
+CUDA.@time update_hist_gpu!(h, h∇_cpu, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js)
+# desktop | 1K: 46.332 Όs (109 allocations: 9.09 KiB)
+# desktop | 10K: 59.142 Όs (109 allocations: 9.09 KiB)
+# desktop | 100K: 251.850 Όs (109 allocations: 9.09 KiB)
+# desktop | 1M: 2.328 ms (110 allocations: 9.11 KiB)
+# desktop | 10M: 25.557 ms (110 allocations: 9.11 KiB)
+@btime update_hist_gpu!(h, h∇_cpu, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js)
diff --git a/experiments/hist/perf-gpu.jl b/experiments/hist/perf-gpu.jl
deleted file mode 100644
index d7b094b6..00000000
--- a/experiments/hist/perf-gpu.jl
+++ /dev/null
@@ -1,202 +0,0 @@
-using Revise
-using CUDA
-using StatsBase: sample
-using BenchmarkTools
-using Base.Threads: @threads
-using Random: seed!
-
-"""
-    hist_kernel!
-"""
-function hist_kernel!(h∇::CuDeviceArray{T,3}, ∇::CuDeviceMatrix{S}, x_bin, is, js) where {T,S}
-    tix, tiy, k = threadIdx().z, threadIdx().y, threadIdx().x
-    bdx, bdy = blockDim().z, blockDim().y
-    bix, biy = blockIdx().z, blockIdx().y
-    gdx = gridDim().z
-
-    j = tiy + bdy * (biy - 1)
-    if j <= length(js)
-        jdx = js[j]
-        i_max = length(is)
-        niter = cld(i_max, bdx * gdx)
-        @inbounds for iter = 1:niter
-            i = tix + bdx * (bix - 1) + bdx * gdx * (iter - 1)
-            if i <= i_max
-                @inbounds idx = is[i]
-                @inbounds bin = x_bin[idx, jdx]
-                hid = Base._to_linear_index(h∇, k, bin, jdx)
-                CUDA.atomic_add!(pointer(h∇, hid), T(∇[k, idx]))
-            end
-        end
-    end
-    sync_threads()
-    return nothing
-end
-
-function update_hist_gpu!(h, h∇, ∇, x_bin, is, js, jsc)
-    kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js)
-    config = launch_configuration(kernel.fun)
-    max_threads = config.threads ÷ 4
-    max_blocks = config.blocks * 4
-    k = size(h∇, 1)
-    ty = max(1, min(length(js), fld(max_threads, k)))
-    tx = min(64, max(1, min(length(is), fld(max_threads, k * ty))))
-    threads = (k, ty, tx)
-    by = cld(length(js), ty)
-    bx = min(cld(max_blocks, by), cld(length(is), tx))
-    blocks = (1, by, bx)
-    h∇ .= 0
-    kernel(h∇, ∇, x_bin, is, js; threads, blocks)
-    CUDA.synchronize()
-    CUDA.@sync for j in jsc
-        nbins = size(h[j], 2)
-        copyto!(h[j], view(h∇, :, 1:nbins, j))
-    end
-    return nothing
-end
-
-function update_hist_gpu1!(h, h∇, ∇, x_bin, is, js, jsc)
-    kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js)
-    config = launch_configuration(kernel.fun)
-    max_threads = config.threads ÷ 4
-    max_blocks = config.blocks * 4
-    k = size(h∇, 1)
-    ty = max(1, min(length(js), fld(max_threads, k)))
-    tx = min(64, max(1, min(length(is), fld(max_threads, k * ty))))
-    threads = (k, ty, tx)
-    by = cld(length(js), ty)
-    bx = min(cld(max_blocks, by), cld(length(is), tx))
-    blocks = (1, by, bx)
-    h∇ .= 0
-    kernel(h∇, ∇, x_bin, is, js; threads, blocks)
-    CUDA.synchronize()
-    return nothing
-end
-
-function update_hist_gpu2!(h, h∇_cpu, h∇, ∇, x_bin, is, js, jsc)
-    kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js)
-    config = launch_configuration(kernel.fun)
-    max_threads = config.threads ÷ 4
-    max_blocks = config.blocks * 4
-    k = size(h∇, 1)
-    ty = max(1, min(length(js), fld(max_threads, k)))
-    tx = min(64, max(1, min(length(is), fld(max_threads, k * ty))))
-    threads = (k, ty, tx)
-    by = cld(length(js), ty)
-    bx = min(cld(max_blocks, by), cld(length(is), tx))
-    blocks = (1, by, bx)
-    h∇ .= 0
-    kernel(h∇, ∇, x_bin, is, js; threads, blocks)
-    copyto!(h∇_cpu, h∇)
-    CUDA.synchronize()
-    return nothing
-end - - -function update_hist_gpu3!(h, h∇_cpu, h∇, ∇, x_bin, is, js, jsc) - kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js) - config = launch_configuration(kernel.fun) - max_threads = config.threads ÷ 4 - max_blocks = config.blocks * 4 - k = size(h∇, 1) - ty = max(1, min(length(js), fld(max_threads, k))) - tx = min(64, max(1, min(length(is), fld(max_threads, k * ty)))) - threads = (k, ty, tx) - by = cld(length(js), ty) - bx = min(cld(max_blocks, by), cld(length(is), tx)) - blocks = (1, by, bx) - h∇ .= 0 - kernel(h∇, ∇, x_bin, is, js; threads, blocks) - # CUDA.synchronize() - copyto!(h∇_cpu, h∇) - # CUDA.synchronize() - @threads for j in jsc - nbins = size(h[j], 2) - @views h[j] .= h∇_cpu[:, 1:nbins, j] - # h[j] .= h∇_cpu[:, 1:nbins, j] - end - return nothing -end - - -seed!(123) -nbins = 32 -nfeats = 100 -nobs = Int(1e6) -x_bin = UInt8.(rand(1:nbins, nobs, nfeats)); -∇ = rand(Float32, 3, nobs); -h∇ = [zeros(Float32, 3, nbins) for n in 1:nfeats] -rowsample = 0.5 -colsample = 0.5 -is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true) -js = sample(1:nfeats, Int(round(rowsample * nfeats)), replace=false, ordered=true) - -∇_gpu = CuArray(∇) -x_bin_gpu = CuArray(x_bin) -h∇_cpu = zeros(Float32, 3, nbins, nfeats) -h∇_gpu = CuArray(h∇_cpu) -is_gpu = CuArray(is) -js_gpu = CuArray(js) - -CUDA.allowscalar(false) -CUDA.@time update_hist_gpu!(h∇, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js) -# ref without copy to cpu: ~same -# ref 10K: 875.100 ÎŒs (168 allocations: 7.08 KiB) -# ref 100K: 1.236 ms (215 allocations: 9.91 KiB) -# ref 1M: 6.138 ms (227 allocations: 12.00 KiB) -# ref 10M: 67.075 ms (235 allocations: 13.38 KiB) - -# with copy -# CUDA v4 1M: 2.903 ms (124 allocations: 6.98 KiB) -# CUDA v5 1M: 3.542 ms (848 allocations: 37.14 KiB) -@btime update_hist_gpu!(h∇, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js) - -# without copy -# CUDA v4 1M: 2.599 ms (74 allocations: 4.64 KiB) -# CUDA v5 1M: 2.274 ms (48 allocations: 2.77 KiB) -@btime update_hist_gpu1!(h∇, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js) - -# without single array copy -# CUDA v4 1M: -# CUDA v5 1M: 2.447 ms (48 allocations: 2.77 KiB) -@btime update_hist_gpu2!(h∇, h∇_cpu, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js) - -# without single array copy -# CUDA v4 1M: -# CUDA v5 1M: 2.442 ms (48 allocations: 2.77 KiB) -@btime update_hist_gpu3!(h∇, h∇_cpu, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js) - - -using CUDA, BenchmarkTools -function gpu_copy!(h, h∇, jsc) - CUDA.@sync for j in jsc - nbins = size(h[j], 2) - copyto!(h[j], view(h∇, :, 1:nbins, j)) - end - return nothing -end - -h∇ = [zeros(Float32, 3, 32) for n in 1:100]; -h∇_gpu = CUDA.zeros(Float32, 3, 32, 100); -js = 1:100 - -# CUDA v4: 534.480 ÎŒs (100 allocations: 4.69 KiB) -# CUDA v5: 1.203 ms (1600 allocations: 68.75 KiB) -@btime gpu_copy!(h∇, h∇_gpu, js) - - -function gpu_copy2!(h, h∇, jsc) - for j in jsc - nbins = size(h[j], 2) - @async copyto!(h[j], view(h∇, :, 1:nbins, j)) - end - return nothing -end - -h∇ = [zeros(Float32, 3, 32) for n in 1:100]; -h∇_gpu = CUDA.zeros(Float32, 3, 32, 100); -js = 1:100 - -# CUDA v4: 534.480 ÎŒs (100 allocations: 4.69 KiB) -# CUDA v5: 1.203 ms (1600 allocations: 68.75 KiB) -@btime gpu_copy2!(h∇, h∇_gpu, js) diff --git a/experiments/hist_v1.jl b/experiments/hist_v1.jl deleted file mode 100644 index 80fd250b..00000000 --- a/experiments/hist_v1.jl +++ /dev/null @@ -1,306 +0,0 @@ -using Statistics -using StatsBase:sample -using Base.Threads:@threads -using BenchmarkTools -using EvoTrees -using SIMD -using 
LoopVectorization - -n = Int(1e6) -nvars = 100 -nbins = 64 -𝑖 = collect(1:n) -𝑗 = collect(1:nvars) -ÎŽ = rand(n) -Ύ² = rand(n) -𝑀 = rand(n) - -hist_ÎŽ = zeros(nbins, nvars) -hist_Ύ² = zeros(nbins, nvars) -hist_𝑀 = zeros(nbins, nvars) -X_bin = reshape(sample(UInt8.(1:nbins), n * nvars), n, nvars) - -function iter_1(X_bin, hist_ÎŽ, ÎŽ, 𝑖, 𝑗) - # hist_ÎŽ .*= 0.0 - @inbounds @threads for j in 𝑗 - @inbounds for i in 𝑖 - hist_ÎŽ[X_bin[i,j], j] += ÎŽ[i] - end - end -end - -@time iter_1(X_bin, hist_ÎŽ, ÎŽ, 𝑖, 𝑗) -@btime iter_1(X_bin, hist_ÎŽ, ÎŽ, 𝑖, 𝑗) - -function iter_1A(X_bin, hist_ÎŽ, ÎŽ, 𝑖, 𝑗) - # hist_ÎŽ .*= 0.0 - @inbounds @threads for j in 𝑗 - @inbounds for i in 𝑖 - hist_ÎŽ[X_bin[i,j], j] += ÎŽ[i] - end - end -end - -@time iter_1A(X_bin, hist_ÎŽ, ÎŽ, 𝑖, 𝑗) -@btime iter_1A(X_bin, hist_ÎŽ, ÎŽ, 𝑖, 𝑗) - -### 3 features in seperate hists -function iter_1B(X_bin, hist_ÎŽ, hist_Ύ², hist_𝑀, ÎŽ, Ύ², 𝑀, 𝑖, 𝑗) - # hist_ÎŽ .= 0.0 - # hist_Ύ² .= 0.0 - # hist_𝑀 .= 0.0 - @inbounds @threads for j in 𝑗 - @inbounds for i in 𝑖 - hist_ÎŽ[X_bin[i,j], j] += ÎŽ[i] - hist_Ύ²[X_bin[i,j], j] += Ύ²[i] - hist_𝑀[X_bin[i,j], j] += 𝑀[i] - end - end -end - -@time iter_1B(X_bin, hist_ÎŽ, hist_Ύ², hist_𝑀, ÎŽ, Ύ², 𝑀, 𝑖, 𝑗) -@btime iter_1B($X_bin, $hist_ÎŽ, $hist_Ύ², $hist_𝑀, $ÎŽ, $Ύ², $𝑀, $𝑖, $𝑗) - -𝑖2 = sample(𝑖, 500000, replace=false, ordered=true) -𝑗2 = sample(𝑗, 50, replace=false, ordered=true) -@time iter_1B(X_bin, hist_ÎŽ, hist_Ύ², hist_𝑀, ÎŽ, Ύ², 𝑀, 𝑖2, 𝑗2) -@btime iter_1B($X_bin, $hist_ÎŽ, $hist_Ύ², $hist_𝑀, $ÎŽ, $Ύ², $𝑀, $𝑖2, $𝑗2) - -### 3 features in common hists -hist_Ύ𝑀 = zeros(3, nbins, nvars) -function iter_2(X_bin, hist_Ύ𝑀, ÎŽ, Ύ², 𝑀, 𝑖, 𝑗) - # hist_Ύ𝑀 .= 0.0 - @inbounds @threads for j in 𝑗 - @inbounds @simd for i in 𝑖 - hist_Ύ𝑀[1, X_bin[i,j], j] += ÎŽ[i] - hist_Ύ𝑀[2, X_bin[i,j], j] += Ύ²[i] - hist_Ύ𝑀[3, X_bin[i,j], j] += 𝑀[i] - end - end -end - -@time iter_2(X_bin, hist_Ύ𝑀, ÎŽ, Ύ², 𝑀, 𝑖, 𝑗) -@btime iter_2($X_bin, $hist_Ύ𝑀, $ÎŽ, $Ύ², $𝑀, $𝑖, $𝑗) - -### 3 features in common hists - gradients/weight in single matrix -hist_Ύ𝑀 = zeros(3, nbins, nvars) -Ύ𝑀 = rand(3, n) - -function iter_3(X_bin, hist_Ύ𝑀, Ύ𝑀, 𝑖, 𝑗) - # hist_Ύ𝑀 .= 0.0 - @inbounds @threads for j in 𝑗 - @inbounds for i in 𝑖 - hist_Ύ𝑀[1, X_bin[i,j], j] += Ύ𝑀[1,i] - hist_Ύ𝑀[2, X_bin[i,j], j] += Ύ𝑀[2,i] - hist_Ύ𝑀[3, X_bin[i,j], j] += Ύ𝑀[3,i] - end - end -end - -@time iter_3(X_bin, hist_Ύ𝑀, Ύ𝑀, 𝑖, 𝑗) -@btime iter_3($X_bin, $hist_Ύ𝑀, $Ύ𝑀, $𝑖, $𝑗) - -# 39.849 ms (81 allocations: 8.22 KiB) -function iter_3B(X_bin, hist_Ύ𝑀, Ύ𝑀, 𝑖, 𝑗) - @inbounds @threads for j in 𝑗 - @inbounds @simd for i in 𝑖 - hist_Ύ𝑀[1, X_bin[i,j], j] += Ύ𝑀[1,i] - hist_Ύ𝑀[2, X_bin[i,j], j] += Ύ𝑀[2,i] - hist_Ύ𝑀[3, X_bin[i,j], j] += Ύ𝑀[3,i] - end - end -end - -@time iter_3B(X_bin, hist_Ύ𝑀, Ύ𝑀, 𝑖, 𝑗) -@btime iter_3B($X_bin, $hist_Ύ𝑀, $Ύ𝑀, $𝑖, $𝑗) - -# 38.027 ms (81 allocations: 8.22 KiB) -function iter_3C(X_bin, hist_Ύ𝑀, Ύ𝑀, 𝑖, 𝑗) - @inbounds @threads for j in 𝑗 - @inbounds for chunk in 1:10000 - @inbounds for iter in 1:100 - i = iter + 100 * (chunk - 1) - hist_Ύ𝑀[1, X_bin[i,j], j] += Ύ𝑀[1,i] - hist_Ύ𝑀[2, X_bin[i,j], j] += Ύ𝑀[2,i] - hist_Ύ𝑀[3, X_bin[i,j], j] += Ύ𝑀[3,i] - end - end - end -end - -@time iter_3C(X_bin, hist_Ύ𝑀, Ύ𝑀, 𝑖, 𝑗) -@btime iter_3C($X_bin, $hist_Ύ𝑀, $Ύ𝑀, $𝑖, $𝑗) - - -### 3 features in common hists - vector of matrix hists - gradients/weight in single matrix -hist_Ύ𝑀_vec = [zeros(3, nbins) for j in 1:nvars] -Ύ𝑀 = rand(3, n) - -# 33.762 ms (81 allocations: 8.22 KiB) -function iter_4(X_bin, hist_Ύ𝑀_vec, Ύ𝑀, 𝑖, 𝑗) - # [hist_Ύ𝑀_vec[j] .= 0.0 for j in 𝑗] - @inbounds @threads for j 
in 𝑗 - @inbounds @simd for i in 𝑖 - hist_Ύ𝑀_vec[j][1, X_bin[i,j]] += Ύ𝑀[1,i] - hist_Ύ𝑀_vec[j][2, X_bin[i,j]] += Ύ𝑀[2,i] - hist_Ύ𝑀_vec[j][3, X_bin[i,j]] += Ύ𝑀[3,i] - end - end -end - -@time iter_4(X_bin, hist_Ύ𝑀_vec, Ύ𝑀, 𝑖, 𝑗) -@btime iter_4($X_bin, $hist_Ύ𝑀_vec, $Ύ𝑀, $𝑖, $𝑗) - - -### 3 features in common hists - vector of matrix hists - gradients/weight in single matrix -hist_Ύ𝑀_vec = [zeros(3, nbins) for j in 1:nvars] -Ύ𝑀 = rand(n, 3) - -# 40.005 ms (81 allocations: 8.22 KiB) -function iter_4B(X_bin, hist_Ύ𝑀_vec, Ύ𝑀, 𝑖, 𝑗) - # [hist_Ύ𝑀_vec[j] .= 0.0 for j in 𝑗] - @inbounds @threads for j in 𝑗 - @inbounds @simd for i in 𝑖 - hist_Ύ𝑀_vec[j][1, X_bin[i,j]] += Ύ𝑀[i,1] - hist_Ύ𝑀_vec[j][2, X_bin[i,j]] += Ύ𝑀[i,2] - hist_Ύ𝑀_vec[j][3, X_bin[i,j]] += Ύ𝑀[i,3] - end - end -end - -@time iter_4B(X_bin, hist_Ύ𝑀_vec, Ύ𝑀, 𝑖, 𝑗); -@btime iter_4B($X_bin, $hist_Ύ𝑀_vec, $Ύ𝑀, $𝑖, $𝑗); - -### 3 features in common hists - vector of vec hists - gradients/weight in single vector -hist_Ύ𝑀_vec = [zeros(3 * nbins) for j in 1:nvars] -Ύ𝑀 = rand(3 * n) - - -function iter_5(X_bin, hist_Ύ𝑀_vec, Ύ𝑀, 𝑖, 𝑗) - # [hist_Ύ𝑀_vec[j] .= 0.0 for j in 𝑗] - @inbounds @threads for j in 𝑗 - @inbounds @simd for i in 𝑖 - # @inbounds if mask[i] - id = 3 * i - 2 - hid = 3 * X_bin[i,j] - 2 - hist_Ύ𝑀_vec[j][hid] += Ύ𝑀[id] - hist_Ύ𝑀_vec[j][hid + 1] += Ύ𝑀[id + 1] - hist_Ύ𝑀_vec[j][hid + 2] += Ύ𝑀[id + 2] - # end - end - end -end - -@time iter_5(X_bin, hist_Ύ𝑀_vec, Ύ𝑀, 𝑖, 𝑗) -@btime iter_5($X_bin, $hist_Ύ𝑀_vec, $Ύ𝑀, $𝑖, $𝑗) - -# 𝑖2 = sample(𝑖, 900000, replace=false, ordered=true) -# 𝑖3 = view(𝑖2, 100001:650000) -using Random -𝑖2 = sample(𝑖, 500000, replace=false, ordered=true) -𝑗2 = sample(𝑗, 50, replace=false, ordered=true) -@time iter_5(X_bin, hist_Ύ𝑀_vec, Ύ𝑀, 𝑖2, 𝑗2) -@btime iter_5($X_bin, $hist_Ύ𝑀_vec, $Ύ𝑀, $𝑖2, $𝑗2) - -### 3 features in common hists - vector of vec hists - gradients/weight in single vector - explicit loop -hist_Ύ𝑀_vec = [zeros(3 * nbins) for j in 1:nvars] -Ύ𝑀 = rand(3, n) - -function iter_5B(X_bin, hist_Ύ𝑀_vec, Ύ𝑀, 𝑖, 𝑗) - @inbounds @threads for j in 𝑗 - @inbounds @simd for i in 𝑖 - hid = 3 * X_bin[i,j] - 2 - hist_Ύ𝑀_vec[j][hid] += Ύ𝑀[1, i] - hist_Ύ𝑀_vec[j][hid + 1] += Ύ𝑀[2, i] - hist_Ύ𝑀_vec[j][hid + 2] += Ύ𝑀[3, i] - end - end -end - -@time iter_5B(X_bin, hist_Ύ𝑀_vec, Ύ𝑀, 𝑖, 𝑗) -@btime iter_5B($X_bin, $hist_Ύ𝑀_vec, $Ύ𝑀, $𝑖, $𝑗) - -### 3 features in common hists - vector of vec hists - gradients/weight in single vector - explicit loop -hist_Ύ𝑀_vec = [zeros(3 * nbins) for j in 1:nvars] -Ύ𝑀 = rand(n, 3) - -function iter_5C(X_bin, hist_Ύ𝑀_vec, Ύ𝑀, 𝑖, 𝑗) - @inbounds @threads for j in 𝑗 - @inbounds @simd for i in 𝑖 - hid = 3 * X_bin[i,j] - 2 - hist_Ύ𝑀_vec[j][hid] += Ύ𝑀[i,1] - hist_Ύ𝑀_vec[j][hid + 1] += Ύ𝑀[i,2] - hist_Ύ𝑀_vec[j][hid + 2] += Ύ𝑀[i,3] - end - end -end - -@time iter_5C(X_bin, hist_Ύ𝑀_vec, Ύ𝑀, 𝑖, 𝑗) -@btime iter_5C($X_bin, $hist_Ύ𝑀_vec, $Ύ𝑀, $𝑖, $𝑗) - -function iter_6(X_bin, hist_Ύ𝑀_vec, Ύ𝑀, 𝑖, 𝑗, K) - # [hist_Ύ𝑀_vec[j] .= 0.0 for j in 𝑗] - @inbounds @threads for j in 𝑗 - @inbounds @simd for i in 𝑖 - id = 3 * i - 2 - hid = 3 * X_bin[i,j] - 3 - for k in 1:K - hist_Ύ𝑀_vec[j][hid + k] += Ύ𝑀[id + k] - end - end - end -end - -K = 3 -@time iter_6(X_bin, hist_Ύ𝑀_vec, Ύ𝑀, 𝑖, 𝑗, K) -@btime iter_6($X_bin, $hist_Ύ𝑀_vec, $Ύ𝑀, $𝑖, $𝑗, $K) - -### 3 features in common hists - vector of vec hists - gradients/weight in single vector - with node assignations -hist_Ύ𝑀_vec = [[zeros(3 * nbins) for n in 1:16] for j in 1:nvars] -Ύ𝑀 = rand(3 * n) -𝑛 = sample(1:16, n) - -function iter_7(X_bin, hist_Ύ𝑀_vec::Vector{Vector{Vector{T}}}, 
Ύ𝑀::Vector{T}, 𝑖, 𝑗, 𝑛) where T - # [hist_Ύ𝑀_vec[j][n] .= 0.0 for n in 𝑛] - @inbounds @threads for j in 𝑗 - @inbounds @simd for i in 𝑖 - id = 3 * i - 2 - hid = 3 * X_bin[i,j] - 2 - n = 𝑛[i] - - hist_Ύ𝑀_vec[j][n][hid] += Ύ𝑀[id] - hist_Ύ𝑀_vec[j][n][hid + 1] += Ύ𝑀[id + 1] - hist_Ύ𝑀_vec[j][n][hid + 2] += Ύ𝑀[id + 2] - end - end -end - -@time iter_7(X_bin, hist_Ύ𝑀_vec, Ύ𝑀, 𝑖, 𝑗, 𝑛) -@btime iter_7($X_bin, $hist_Ύ𝑀_vec, $Ύ𝑀, $𝑖, $𝑗, $𝑛) - - - -using StatsBase:sample -using BenchmarkTools - -n_obs = Int(1e6) -n_vars = 100 -n_bins = 64 -K = 3 -𝑖 = collect(1:n_obs) -ÎŽ = rand(K, n_obs) -hist = zeros(K, n_bins, n_vars); -X_bin = sample(UInt8.(1:n_bins), n_obs * n_vars); -X_bin = reshape(X_bin, n_obs, n_vars); -𝑖_sample = sample(𝑖, Int(n_obs / 2), ordered=true) - -function iter_1(X_bin, hist, ÎŽ, 𝑖) - hist .= 0.0 - @inbounds @simd for i in 𝑖 - @inbounds for k in 1:3 - hist[k, X_bin[i,1], 1] += ÎŽ[k, i] - end - end -end - -@btime iter_1($X_bin, $hist, $ÎŽ, $𝑖_sample) \ No newline at end of file diff --git a/experiments/hist_v2.jl b/experiments/hist_v2.jl deleted file mode 100644 index f107b1ca..00000000 --- a/experiments/hist_v2.jl +++ /dev/null @@ -1,66 +0,0 @@ -# using Statistics -using StatsBase:sample -# using Base.Threads:@threads -using BenchmarkTools - -n_obs = Int(1e6) -n_vars = 100 -n_bins = 64 -K = 3 -𝑖 = collect(1:n_obs) -ÎŽ = rand(n_obs, K) -hist = zeros(K, n_bins, n_vars); -X_bin = sample(UInt8.(1:n_bins), n_obs * n_vars); -X_bin = reshape(X_bin, n_obs, n_vars); - -function iter_1(X_bin, hist, ÎŽ, 𝑖) - hist .= 0.0 - @inbounds for i in 𝑖 - @inbounds for k in 1:3 - hist[k, X_bin[i,1], 1] += ÎŽ[i,k] - end - end -end - -𝑖_sample = sample(𝑖, Int(n_obs / 2), ordered=true) - -@time iter_1(X_bin, hist, ÎŽ, 𝑖_sample) -@btime iter_1($X_bin, $hist, $ÎŽ, $𝑖_sample) - - - -function iter_2(X_bin, hist, ÎŽ, 𝑖) - hist .= 0.0 - @inbounds @simd for i in CartesianIndices(𝑖) - @inbounds @simd for k in 1:3 - hist[k, X_bin[𝑖[i],1], 1] += ÎŽ[𝑖[i],k] - end - end -end - -𝑖_sample = sample(𝑖, Int(n_obs / 2), ordered=true) - -@time iter_2(X_bin, hist, ÎŽ, 𝑖_sample) -@btime iter_2($X_bin, $hist, $ÎŽ, $𝑖_sample) - - - -# slower -ÎŽ = rand(K, n_obs) -hist = zeros(K, n_bins, n_vars); -X_bin = sample(UInt8.(1:n_bins), n_obs * n_vars); -X_bin = reshape(X_bin, n_obs, n_vars); - -function iter_1(X_bin, hist, ÎŽ, 𝑖) - hist .= 0.0 - @inbounds for i in 𝑖 - @inbounds for k in 1:3 - hist[k, X_bin[i,1], 1] += ÎŽ[k,i] - end - end -end - -𝑖_sample = sample(𝑖, Int(n_obs / 2), ordered=true) - -@time iter_1(X_bin, hist, ÎŽ, 𝑖_sample) -@btime iter_1($X_bin, $hist, $ÎŽ, $𝑖_sample) diff --git a/experiments/logistic_tests.jl b/experiments/logistic_tests.jl deleted file mode 100644 index d4f507de..00000000 --- a/experiments/logistic_tests.jl +++ /dev/null @@ -1,142 +0,0 @@ -using Statistics -using StatsBase: sample, sample! 
-using EvoTrees -using BenchmarkTools -using CUDA - -# prepare a dataset -features = rand(Int(1.25e4), 100) -# features = rand(100, 10) -X = features -Y = rand(size(X, 1)) -𝑖 = collect(1:size(X, 1)) - -# train-eval split -𝑖_sample = sample(𝑖, size(𝑖, 1), replace=false) -train_size = 0.8 -𝑖_train = 𝑖_sample[1:floor(Int, train_size * size(𝑖, 1))] -𝑖_eval = 𝑖_sample[floor(Int, train_size * size(𝑖, 1))+1:end] - -x_train, x_eval = X[𝑖_train, :], X[𝑖_eval, :] -y_train, y_eval = Y[𝑖_train], Y[𝑖_eval] - - -########################### -# Tree CPU -########################### -params_c = EvoTrees.EvoTreeMLE(T=Float32, - loss=:logistic, - nrounds=200, - lambda=0.1, gamma=0.0, eta=0.1, - max_depth=5, min_weight=1.0, - rowsample=0.5, colsample=0.5, nbins=16); - -model_c, cache_c = EvoTrees.init_evotree(params_c, x_train, y_train); -EvoTrees.grow_evotree!(model_c, cache_c) -p = model_c(x_train) -sort(p[:,1]) -sort(p[:,2]) - -# initialize from cache -params_c = model_c.params -x_size = size(cache_c.X_bin) - -# select random rows and cols -sample!(params_c.rng, cache_c.𝑖_, cache_c.nodes[1].𝑖, replace=false, ordered=true); -sample!(params_c.rng, cache_c.𝑗_, cache_c.𝑗, replace=false, ordered=true); -# @btime sample!(params_c.rng, cache_c.𝑖_, cache_c.nodes[1].𝑖, replace=false, ordered=true); -# @btime sample!(params_c.rng, cache_c.𝑗_, cache_c.𝑗, replace=false, ordered=true); - -𝑖 = cache_c.nodes[1].𝑖 -𝑗 = cache_c.𝑗 - -# build a new tree -# 897.800 ÎŒs (6 allocations: 736 bytes) -get_loss_type(m::EvoTreeGaussian{L,T,S}) where {L,T,S} = L -get_loss_type(m::EvoTrees.EvoTreeLogistic{L,T,S}) where {L,T,S} = L - -L = get_loss_type(params_c) -@time EvoTrees.update_grads!(L, cache_c.Ύ𝑀, cache_c.pred, cache_c.Y; alpha=params_c.alpha) -cache_c.Ύ𝑀 - -sort(cache_c.Ύ𝑀[1, :]) -sort(cache_c.Ύ𝑀[2, :]) -sort(cache_c.Ύ𝑀[3, :]) -sort(cache_c.Ύ𝑀[4, :]) - -p = collect(-3:0.5:3) -y = collect(-3:0.5:3) - -function get_grads(p, y) - grad = zeros(length(p), length(y)) - for i in eachindex(p) - for j in eachindex(y) - # alternate from 1 - # grad[i, j] = -(exp(-2s) * (u - y) * (u - y + exp(s) * sinh(exp(-s) * (u - y)))) / (1 + cosh(exp(-s) * (u - y))) - grad[i, j] = (exp(-2 * p[i]) * (0.0 - y[j]) * (0.0 - y[j] + exp(p[i]) * sinh(exp(-p[i]) * (0.0 - y[j])))) / (1 + cosh(exp(-p[i]) * (0.0 - y[j]))) - end - end - return grad -end - -grads = get_grads(p, y) -heatmap(grads) -# @btime EvoTrees.update_grads!($params_c.loss, $cache_c.Ύ𝑀, $cache_c.pred_cpu, $cache_c.Y_cpu, $params_c.α) -# ∑ = vec(sum(cache_c.ÎŽ[𝑖,:], dims=1)) -# gain = EvoTrees.get_gain(params_c.loss, ∑, params_c.λ) -# assign a root and grow tree -# train_nodes[1] = EvoTrees.TrainNode(UInt32(0), UInt32(1), ∑, gain) - -# 62.530 ms (7229 allocations: 17.43 MiB) -# 1.25e5: 9.187 ms (7358 allocations: 2.46 MiB) -tree = EvoTrees.Tree(params_c.max_depth, model_c.K, zero(typeof(params_c.λ))) -@time EvoTrees.grow_tree!(tree, cache_c.nodes, params_c, cache_c.Ύ𝑀, cache_c.edges, cache_c.𝑗, cache_c.left, cache_c.left, cache_c.right, cache_c.X_bin, cache_c.K) -@btime EvoTrees.grow_tree!($EvoTrees.Tree(params_c.max_depth, model_c.K, zero(typeof(params_c.λ))), $cache_c.nodes, $params_c, $cache_c.Ύ𝑀, $cache_c.edges, $cache_c.𝑗, $cache_c.left, $cache_c.left, $cache_c.right, $cache_c.X_bin, $cache_c.K) - -@time EvoTrees.grow_tree!(EvoTrees.Tree(params_c.max_depth, model_c.K, params_c.λ), params_c, cache_c.ÎŽ, cache_c.hist, cache_c.histL, cache_c.histR, cache_c.gains, cache_c.edges, 𝑖, 𝑗, 𝑛, cache_c.X_bin); -@btime EvoTrees.grow_tree!(EvoTrees.Tree($params_c.max_depth, $model_c.K, $params_c.λ), 
$params_c, $cache_c.ÎŽ, $cache_c.hist, $cache_c.histL, $cache_c.histR, $cache_c.gains, $cache_c.edges, $𝑖, $𝑗, $𝑛, $cache_c.X_bin); -@code_warntype EvoTrees.grow_tree!(EvoTrees.Tree(params_c.max_depth, model_c.K, params_c.λ), params_c, cache_c.ÎŽ, cache_c.hist, cache_c.histL, cache_c.histR, cache_c.gains, cache_c.edges, 𝑖, 𝑗, 𝑛, cache_c.X_bin); - -# push!(model_c.trees, tree) -# 1.883 ms (83 allocations: 13.77 KiB) -@btime EvoTrees.predict!(model_c.params.loss, cache_c.pred_cpu, tree, cache_c.X, model_c.K) - -Ύ𝑀, K, edges, X_bin, nodes, out, left, right = cache_c.Ύ𝑀, cache_c.K, cache_c.edges, cache_c.X_bin, cache_c.nodes, cache_c.out, cache_c.left, cache_c.right; - -# 9.613 ms (81 allocations: 13.55 KiB) -# 1.25e5: 899.200 ÎŒs (81 allocations: 8.22 KiB) -@time EvoTrees.update_hist!(params_c.loss, nodes[1].h, Ύ𝑀, X_bin, 𝑖, 𝑗, K) -@btime EvoTrees.update_hist!($params_c.loss, $nodes[1].h, $Ύ𝑀, $X_bin, $𝑖, $𝑗, $K) -@btime EvoTrees.update_hist!($nodes[1].h, $Ύ𝑀, $X_bin, $nodes[1].𝑖, $𝑗) -@code_warntype EvoTrees.update_hist!(hist, ÎŽ, X_bin, 𝑖, 𝑗, 𝑛) - -j = 1 -# 8.399 ÎŒs (80 allocations: 13.42 KiB) -n = 1 -nodes[1].∑ .= vec(sum(Ύ𝑀[:, 𝑖], dims=2)) -EvoTrees.update_gains!(params_c.loss, nodes[n], 𝑗, params_c, K) -nodes[1].gains -# findmax(nodes[1].gains) #1.25e5: 36.500 ÎŒs (81 allocations: 8.22 KiB) -@btime EvoTrees.update_gains!($params_c.loss, $nodes[n], $𝑗, $params_c, $K) -@code_warntype EvoTrees.update_gains!(params_c.loss, nodes[n], 𝑗, params_c, K) - -#1.25e5: 14.100 ÎŒs (1 allocation: 32 bytes) -best = findmax(nodes[n].gains) -@btime best = findmax(nodes[n].gains) -@btime best = findmax(view(nodes[n].gains, :, 𝑗)) - -tree.cond_bin[n] = best[2][1] -tree.feat[n] = best[2][2] - -Int.(tree.cond_bin[n]) -# tree.cond_bin[n] = 32 - -# 204.900 ÎŒs (1 allocation: 96 bytes) -offset = 0 -@time EvoTrees.split_set!(left, right, 𝑖, X_bin, tree.feat[n], tree.cond_bin[n], offset) -@btime EvoTrees.split_set!($left, $right, $𝑖, $X_bin, $tree.feat[n], $tree.cond_bin[n], $offset) -@code_warntype EvoTrees.split_set!(left, right, 𝑖, X_bin, tree.feat[n], tree.cond_bin[n]) - -# 1.25e5: 227.200 ÎŒs (22 allocations: 1.44 KiB) -@time EvoTrees.split_set_threads!(out, left, right, 𝑖, X_bin, tree.feat[n], tree.cond_bin[n], offset) -@btime EvoTrees.split_set_threads!($out, $left, $right, $𝑖, $X_bin, $tree.feat[n], $tree.cond_bin[n], $offset, Int(2e15)) diff --git a/experiments/loss_gpu.jl b/experiments/loss_gpu.jl deleted file mode 100644 index 37cacad8..00000000 --- a/experiments/loss_gpu.jl +++ /dev/null @@ -1,38 +0,0 @@ -using CUDAnative -using CuArrays -using StaticArrays -using BenchmarkTools - -N = Int(1e6) -pred = rand(SVector{1, Float32}, N) -target = rand(Float32, N) -ÎŽ = zeros(SVector{1, Float32}, N) -Ύ² = zeros(SVector{1, Float32}, N) - -pred_g = CuArray(rand(Float32, N)) -target_g = CuArray(rand(Float32, N)) -ÎŽ_g = CuArray(zeros(Float32, N)) -Ύ²_g = CuArray(zeros(Float32, N)) - -pred = Array(pred_g) -target = Array(target_g) -ÎŽ = Array(ÎŽ_g) -Ύ² = Array(Ύ²_g) - -# linear -function update_grads!(pred::Vector{SVector{L,T}}, target::AbstractVector{T}, ÎŽ::Vector{SVector{L,T}}, Ύ²::Vector{SVector{L,T}}) where {T <: AbstractFloat, L} - @inbounds for i in eachindex(ÎŽ) - ÎŽ[i] = SVector(2 * (pred[i][1] - target[i])) - Ύ²[i] = SVector(2) - end -end - -# linear -function update_grads_gpu!(pred::AbstractVector{T}, target::AbstractVector{T}, ÎŽ::AbstractVector{T}, Ύ²::AbstractVector{T}, 𝑀::AbstractVector{T}) where {T <: AbstractFloat} - @. ÎŽ = 2f0 * (pred - target) * 𝑀 - @. 
Ύ² = 2f0 * 𝑀 - return -end - -@time update_grads!(pred, target, ÎŽ, Ύ²) -CuArrays.@time update_grads_gpu!(pred_g, target_g, ÎŽ_g, Ύ²_g) diff --git a/experiments/shuffling.jl b/experiments/shuffling.jl deleted file mode 100644 index 6d11a93e..00000000 --- a/experiments/shuffling.jl +++ /dev/null @@ -1,185 +0,0 @@ -using DataFrames -using Distributions -using EvoTrees -using LinearAlgebra -using GLM -using Random - -ÎŽ = 1.0e-6 -b = fill(1.0 - ÎŽ, 3, 3) + ÎŽ * I -z = zeros(3, 3) -y = fill(0.5, 3) -dist = MvNormal([ - b z z 0.8*y - z b z y - z z b 1.2*y - 0.8*y' y' 1.2*y' 1.0]) -Random.seed!(1) -mat = rand(dist, 10_000); -df = DataFrame(transpose(mat), [string.("x", 1:9); "y"]); -target_name = "y" - -################################# -# Tables API -################################# -config = EvoTreeRegressor(rng=123) -m1 = fit_evotree(config, - df; - target_name="y", - verbosity=0); -EvoTrees.importance(m1) - -config = EvoTreeRegressor(rng=124) -m2 = fit_evotree(config, - df; - target_name="y", - verbosity=0); -EvoTrees.importance(m2) - -################################# -# Tables API - GPU -################################# -config = EvoTreeRegressor(rowsample=0.5, rng=123) -m1 = fit_evotree(config, - df; - device="gpu", - target_name="y", - verbosity=0); -EvoTrees.importance(m1) - -config = EvoTreeRegressor(rowsample=0.5, rng=124) -m2 = fit_evotree(config, - df; - target_name="y", - verbosity=0); -EvoTrees.importance(m2) - - -# permuted tables doesn't return the same result - numerical rounding error? -df2 = df[!, 10:-1:1] -config = EvoTreeRegressor() -m3 = fit_evotree(config, - df2; - target_name="y", - verbosity=0); -EvoTrees.importance(m3) - -# manual check on col permutations -config = EvoTreeRegressor(max_depth=4) -m1, cache1 = EvoTrees.init(config, df; target_name); -EvoTrees.grow_evotree!(m1, cache1, config, EvoTrees.CPU) -EvoTrees.importance(m1) - -df2 = df[!, 10:-1:1]; -config = EvoTreeRegressor(max_depth=4) -m2, cache2 = EvoTrees.init(config, df2; target_name); -EvoTrees.grow_evotree!(m2, cache2, config, EvoTrees.CPU) -EvoTrees.importance(m2) - -all(cache1.x_bin .== cache2.x_bin[:, 9:-1:1]) -all(cache1.edges .== cache2.edges[9:-1:1]) -m1.trees[2] -m2.trees[2] - -m1.trees[2].feat -m2.trees[2].feat - -Int.(m1.trees[2].cond_bin) -Int.(m2.trees[2].cond_bin) - - -config = EvoTreeRegressor(nrounds=100, eta=0.05, colsample=1.0) -m3 = fit_evotree(config, - df; - target_name="y", - verbosity=0); -EvoTrees.importance(m3) - -################################# -# Tables API -################################# -config = EvoTreeRegressor(colsample=0.5, rng=123) -m1 = fit_evotree(config, - df; - target_name="y", - verbosity=0); -EvoTrees.importance(m1) - -m2 = fit_evotree(config, - df; - target_name="y", - verbosity=0); -EvoTrees.importance(m2) - -################################# -# Matrix API -################################# -x_train = Matrix(mat[1:9, :]') -y_train = mat[10, :] - -config = EvoTreeRegressor() -m1 = fit_evotree(config; - x_train, - y_train, - verbosity=0); -EvoTrees.importance(m1) - -m2 = fit_evotree(config; - x_train, - y_train, - verbosity=0); -EvoTrees.importance(m2) - -using GLM -x_train = Matrix(mat[1:9, :]') -y_train = mat[10, :] -lm(x_train, y_train) - -################################# -# Matrix debug API -################################# -x_train = Matrix(mat[1:9, :]') -y_train = mat[10, :] - -config = EvoTreeRegressor() -m1, cache1 = EvoTrees.init(config, x_train, y_train); -EvoTrees.grow_evotree!(m1, cache1, config, EvoTrees.CPU) -EvoTrees.importance(m1) - -m2, 
cache2 = EvoTrees.init(config, x_train, y_train); -EvoTrees.grow_evotree!(m2, cache2, config, EvoTrees.CPU) -EvoTrees.importance(m2) - - -using BenchmarkTools -using Random: rand! -using CUDA -nobs = 1_000_000 -rng1 = Random.MersenneTwister(123) -rng2 = Random.Xoshiro(123) -rng3 = Random.TaskLocalRNG() - -is_in = zeros(UInt32, nobs); -is_out = zeros(UInt32, nobs); -mask = zeros(UInt8, nobs); - -@btime EvoTrees.subsample(is_in, is_out, mask, 0.5, rng1); -@btime EvoTrees.get_rand!(rng1, mask); - -@btime EvoTrees.subsample(is_in, is_out, mask, 0.5, rng2); -@btime EvoTrees.get_rand!(rng2, mask); - -@btime EvoTrees.subsample(is_in, is_out, mask, 0.5, rng3); -@btime EvoTrees.get_rand!(rng3, mask); - -get_rand2!(rng, mask) = rand!(rng, mask) -@btime get_rand2!(rng1, mask); -@btime get_rand2!(rng2, mask); -@btime get_rand2!(rng3, mask); - -mask = CUDA.zeros(UInt8, nobs); -CUDA.@time EvoTrees.get_rand_gpu!(mask); -CUDA.@time CUDA.rand(UInt8, nobs); -# @btime EvoTrees.get_rand_gpu!(mask); -# @btime CUDA.rand(UInt8, nobs); - diff --git a/experiments/update_set_v1.jl b/experiments/update_set_v1.jl deleted file mode 100644 index 70c2fa7c..00000000 --- a/experiments/update_set_v1.jl +++ /dev/null @@ -1,277 +0,0 @@ -using Statistics -using StatsBase:sample -using Base.Threads:@threads -using BenchmarkTools -using Revise -using EvoTrees - -n_obs = Int(1e6) -n_vars = 100 -n_bins = 255 -𝑖 = collect(1:n_obs); -𝑗 = collect(1:n_vars); -ÎŽ = rand(n_obs); -Ύ² = rand(n_obs); - -hist_ÎŽ = zeros(n_bins, n_vars); -hist_Ύ² = zeros(n_bins, n_vars); -X_bin = rand(UInt8, n_obs, n_vars); - -# split row ids into left and right based on best split condition -function update_set_1(set, best, x_bin) - left = similar(set) - right = similar(set) - left_count = 0 - right_count = 0 - @inbounds for i in set - if x_bin[i] <= best - left_count += 1 - left[left_count] = i - else - right_count += 1 - right[right_count] = i - end - end - resize!(left, left_count) - resize!(right, right_count) - return left, right -end - -@time update_set_1(𝑖, 16, X_bin[:,1]); -@btime update_set_1($𝑖, 16, $X_bin[:,1]); -@btime update_set_1($𝑖, 64, $X_bin[:,1]); -@btime update_set_1($𝑖, 128, $X_bin[:,1]); -@btime update_set_1($𝑖, 240, $X_bin[:,1]); - -# add a leaf id update - to indicate to which leaf the set is associated -function update_set_2!(leaf_vec::Vector{T}, set, best_feat, best_cond, x_bin, depth::T) where {T} - @inbounds for i in set - left_id = leaf_vec[i] + 2^depth - right_id = left_id + 1 - x_bin[i, best_feat[leaf_vec[i]]] <= best_cond[leaf_vec[i]] ? 
leaf_vec[i] = left_id : leaf_vec[i] = right_id - end -end - -leaf_vec = ones(UInt16, n_obs); -leaf_id = 0 -depth = UInt16(1) -depth = 1 -best_feat = UInt16.(sample(1:100, 100000)) -best_cond = rand(UInt16, 100000); - -@time update_set_2!(leaf_vec, 𝑖, best_feat, best_cond, X_bin, depth); -@btime update_set_2!($leaf_vec, $𝑖, $best_feat, $best_cond, $X_bin, $depth); -Int.(leaf_vec) - - -# split row ids into left and right based on best split condition -function split_set_1!(left::V, right::V, 𝑖, X_bin::Matrix{S}, feat, cond_bin, offset) where {S,V} - - left_count = 0 - right_count = 0 - - @inbounds for i in 1:length(𝑖) - @inbounds if X_bin[𝑖[i], feat] <= cond_bin - left_count += 1 - left[offset + left_count] = 𝑖[i] - else - right[offset + length(𝑖) - right_count] = 𝑖[i] - right_count += 1 - end - end - # return (left[1:left_count], right[1:right_count]) - return (view(left, (offset + 1):(offset + left_count)), view(right, (offset + length(𝑖)):-1:(offset + left_count + 1))) - # return nothing -end - -n = Int(1e6) -nvars = 100 -nbins = 64 -𝑖 = collect(1:n) -𝑗 = collect(1:nvars) -X_bin = reshape(sample(UInt8.(1:nbins), n * nvars), n, nvars) -left = similar(𝑖) -right = similar(𝑖) - -𝑖 = sample(𝑖, Int(5e5), replace=false, ordered=true) - -offset = 0 -feat = 15 -cond_bin = 32 -@time lid2, rid2 = split_set_1!(left, right, 𝑖, X_bin, feat, cond_bin, offset) -@btime split_set_1!($left, $right, $𝑖, $X_bin, $feat, $cond_bin, $offset) -@code_warntype split_set_1!(left, right, 𝑖, X_bin, feat, cond_bin, offset) - -offset = 0 -feat = 15 -cond_bin = 32 -lid1, rid1 = split_set_1!(left, right, 𝑖, X_bin, feat, cond_bin, offset) -offset = 0 -feat = 14 -cond_bin = 12 -lid2, rid2 = split_set_1!(left, right, lid1, X_bin, feat, cond_bin, offset) -offset = + length(lid1) -feat = 14 -cond_bin = 12 -lid3, rid3 = split_set_1!(left, right, rid1, X_bin, feat, cond_bin, offset) - -lid1_ = deepcopy(lid1) - -𝑖 -unique(vcat(lid1, rid1)) -unique(vcat(lid1)) -unique(vcat(rid1)) -unique(sort(vcat(lid2, rid2))) -unique(sort(vcat(lid3, rid3))) -unique(sort(vcat(lid2, rid2, lid3, rid3))) - -# split row ids into left and right based on best split condition -function split_set_2!(left, right, 𝑖, x_bin, feat, cond_bin) - - left_count = 0 - right_count = 0 - - @inbounds for i in 1:length(𝑖) - if x_bin[i] <= cond_bin - left_count += 1 - left[left_count] = 𝑖[i] - else - right_count += 1 - right[right_count] = 𝑖[i] - end - end - # return (left[1:left_count], right[1:right_count]) - return (view(left, 1:left_count), view(right, 1:right_count)) - # return nothing -end - -n = Int(1e6) -nvars = 100 -nbins = 64 -𝑖 = collect(1:n) -𝑗 = collect(1:nvars) -X_bin = reshape(sample(UInt8.(1:nbins), n * nvars), n, nvars) -left = similar(𝑖) -right = similar(𝑖) - -feat = 15 -cond_bin = 32 -@time left, right = split_set_2!(left, right, 𝑖, X_bin[:,feat], feat, cond_bin) -@btime split_set_2!($left, $right, $𝑖, $X_bin[:,feat], $feat, $cond_bin) -@btime split_set_2!($left, $right, $𝑖, $view(X_bin, :, feat), $feat, $cond_bin) - - - -# function split_set_bool!(child_bool::AbstractVector{Bool}, 𝑖, X_bin::Matrix{S}, feat, cond_bin, offset) where {S} -# left_count = 0 -# right_count = 0 -# @inbounds for i in eachindex(𝑖) -# child_bool[i + offset] = X_bin[𝑖[i], feat] <= cond_bin -# end -# # return (view(𝑖, child_bool[offset + 1:offset + length(𝑖)]), view(𝑖, .!child_bool[offset + 1:offset + length(𝑖)])) -# # return (view(𝑖, view(child_bool, (offset + 1):(offset + length(𝑖)))), view(𝑖, view(child_bool, (offset + 1):(offset + length(𝑖))))) -# # return (view(𝑖, 
child_bool), view(𝑖, child_bool)) -# return view(𝑖, child_bool) -# end - -function split_set_chunk!(left, right, block, bid, X_bin, feat, cond_bin, offset, chunk_size, lefts, rights, bsizes) - left_count = 0 - right_count = 0 - @inbounds for i in eachindex(block) - @inbounds if X_bin[block[i], feat] <= cond_bin - left_count += 1 - left[offset + chunk_size * (bid - 1) + left_count] = block[i] - else - right_count += 1 - right[offset + chunk_size * (bid - 1) + right_count] = block[i] - # right[offset + chunk_size * (bid - 1) + length(block) - right_count] = block[i] - # right_count += 1 - end - end - lefts[bid] = left_count - rights[bid] = right_count - bsizes[bid] = length(block) - return nothing -end - -function split_set_threads!(out, left, right, 𝑖, X_bin::Matrix{S}, feat, cond_bin, offset, chunk_size=2^14) where {S} - - left_count = 0 - right_count = 0 - iter = Iterators.partition(𝑖, chunk_size) - nblocks = length(iter) - lefts = zeros(Int, nblocks) - rights = zeros(Int, nblocks) - bsizes = zeros(Int, nblocks) - - @sync for (bid, block) in enumerate(iter) - Threads.@spawn split_set_chunk!(left, right, block, bid, X_bin, feat, cond_bin, offset, chunk_size, lefts, rights, bsizes) - end - - left_sum = sum(lefts) - left_cum = 0 - right_cum = 0 - @inbounds for bid in 1:nblocks - view(out, offset + left_cum + 1:offset + left_cum + lefts[bid]) .= view(left, offset + chunk_size * (bid - 1) + 1:offset + chunk_size * (bid - 1) + lefts[bid]) - view(out, offset + left_sum + right_cum + 1:offset + left_sum + right_cum + rights[bid]) .= view(right, offset + chunk_size * (bid - 1) + 1:offset + chunk_size * (bid - 1) + rights[bid]) - - # out[offset + left_cum + 1:offset + left_cum + lefts[bid]] .= left[offset + chunk_size * (bid - 1) + 1:offset + chunk_size * (bid - 1) + lefts[bid]] - # out[offset + left_sum + right_cum + 1:offset + left_sum + right_cum + rights[bid]] .= right[offset + chunk_size * (bid - 1) + 1:offset + chunk_size * (bid - 1) + rights[bid]] - - # view(right, offset + right_cum + 1:offset + right_cum + rights[bid]) .= view(right, offset + chunk_size * (bid - 1) + 1:offset + chunk_size * (bid - 1) + rights[bid]) - # view(right, offset + length(𝑖) - right_cum:-1:offset + length(𝑖) - right_cum - rights[bid] + 1) .= view(right, offset + chunk_size * (bid - 1) + bsizes[bid]:-1:offset + chunk_size * (bid - 1) + lefts[bid]+1) - left_cum += lefts[bid] - right_cum += rights[bid] - end - - # right_cum = 0 - # @inbounds for bid in nblocks:-1:1 - # # view(right, offset + right_cum + 1:offset + right_cum + rights[bid]) .= view(right, offset + chunk_size * (bid - 1) + 1:offset + chunk_size * (bid - 1) + rights[bid]) - # view(right, offset + length(𝑖) - right_cum:-1:offset + length(𝑖) - right_cum - rights[bid] + 1) .= view(right, offset + chunk_size * (bid - 1) + lefts[bid] + 1:offset + chunk_size * (bid - 1) + bsizes[bid]) - # # right_cum += rights[bid] - # end - - return (view(out, offset + 1:offset + sum(lefts)), view(out, offset + sum(lefts)+1:offset + length(𝑖))) - # return (lefts, rights, out) - # return (view(left, offset + 1:offset + sum(lefts)), view(right, offset + length(𝑖):-1:offset + sum(lefts) + 1)) - # return (left[offset + 1:offset + sum(lefts)], right[offset + 1:offset + sum(rights)]) -end - -iter = Iterators.partition(rand(5), 3) -for i in enumerate(iter) - println(i) -end - -n = Int(1e6) -nvars = 100 -nbins = 64 -𝑖 = collect(1:n); -𝑗 = collect(1:nvars); -X_bin = reshape(sample(UInt8.(1:nbins), n * nvars), n, nvars); -𝑖 = sample(𝑖, Int(5e5), replace=false, ordered=true); -child_bool 
= zeros(Bool, length(𝑖)); -left = similar(𝑖) -right = similar(𝑖) -out = similar(𝑖) - -offset = 0 -feat = 15 -cond_bin = 32 -@time l, r = split_set_threads!(out, left, right, 𝑖, X_bin, feat, cond_bin, offset, 2^14); -@btime split_set_threads!($out, $left, $right, $𝑖, $X_bin, $feat, $cond_bin, $offset, 2^14); -@code_warntype split_set_1!(left, right, 𝑖, X_bin, feat, cond_bin, offset) - -offset = 0 -feat = 15 -cond_bin = 32 -lid1, rid1 = split_set_threads!(out, left, right, 𝑖, X_bin, feat, cond_bin, offset) -offset = 0 -feat = 14 -cond_bin = 12 -lid2, rid2 = split_set_threads!(out, left, right, lid1, X_bin, feat, cond_bin, offset) -offset = + length(lid1) -feat = 14 -cond_bin = 12 -lid3, rid3 = split_set_threads!(out, left, right, rid1, X_bin, feat, cond_bin, offset) - -lid1_ = deepcopy(lid1) diff --git a/experiments/update_set_v2.jl b/experiments/update_set_v2.jl deleted file mode 100644 index 43f3c2ca..00000000 --- a/experiments/update_set_v2.jl +++ /dev/null @@ -1,101 +0,0 @@ -using Statistics -using StatsBase:sample -using Base.Threads:@threads -using BenchmarkTools -using Revise -using EvoTrees - -n_obs = Int(1e6) -n_vars = 100 -n_bins = 255 -𝑖 = collect(1:n_obs); -𝑗 = collect(1:n_vars); -X_bin = rand(UInt8, n_obs, n_vars); - -function split_set_chunk!(left, right, block, bid, X_bin, feat, cond_bin, offset, chunk_size, lefts, rights, bsizes) - left_count = 0 - right_count = 0 - @inbounds for i in eachindex(block) - @inbounds if X_bin[block[i], feat] <= cond_bin - left_count += 1 - left[offset + chunk_size * (bid - 1) + left_count] = block[i] - else - right_count += 1 - right[offset + chunk_size * (bid - 1) + right_count] = block[i] - # right[offset + chunk_size * (bid - 1) + length(block) - right_count] = block[i] - # right_count += 1 - end - end - lefts[bid] = left_count - rights[bid] = right_count - bsizes[bid] = length(block) - return nothing -end - -function split_set_threads!(out, left, right, 𝑖, X_bin::Matrix{S}, feat, cond_bin, offset, chunk_size=2^14) where {S} - - left_count = 0 - right_count = 0 - iter = Iterators.partition(𝑖, chunk_size) - nblocks = length(iter) - lefts = zeros(Int, nblocks) - rights = zeros(Int, nblocks) - bsizes = zeros(Int, nblocks) - - @sync for (bid, block) in enumerate(iter) - Threads.@spawn split_set_chunk!(left, right, block, bid, X_bin, feat, cond_bin, offset, chunk_size, lefts, rights, bsizes) - end - - left_sum = sum(lefts) - left_cum = 0 - right_cum = 0 - @inbounds for bid in 1:nblocks - view(out, offset + left_cum + 1:offset + left_cum + lefts[bid]) .= view(left, offset + chunk_size * (bid - 1) + 1:offset + chunk_size * (bid - 1) + lefts[bid]) - view(out, offset + left_sum + right_cum + 1:offset + left_sum + right_cum + rights[bid]) .= view(right, offset + chunk_size * (bid - 1) + 1:offset + chunk_size * (bid - 1) + rights[bid]) - left_cum += lefts[bid] - right_cum += rights[bid] - end - return (view(out, offset + 1:offset + sum(lefts)), view(out, offset + sum(lefts)+1:offset + length(𝑖))) -end - -iter = Iterators.partition(rand(5), 3) -for i in enumerate(iter) - println(i) -end - -n = Int(1e6) -nvars = 100 -nbins = 64 -𝑖 = collect(1:n); -𝑗 = collect(1:nvars); -X_bin = reshape(sample(UInt8.(1:nbins), n * nvars), n, nvars); -𝑖 = sample(𝑖, Int(5e5), replace=false, ordered=true); -child_bool = zeros(Bool, length(𝑖)); -left = similar(𝑖) -right = similar(𝑖) -out = similar(𝑖) - -offset = 0 -feat = 15 -cond_bin = 32 -@time l, r = split_set_threads!(out, left, right, 𝑖, X_bin, feat, cond_bin, offset, 2^14); -@btime split_set_threads!($out, $left, $right, 
$𝑖, $X_bin, $feat, $cond_bin, $offset, 2^14); -@code_warntype split_set_1!(left, right, 𝑖, X_bin, feat, cond_bin, offset) - -offset = 0 -feat = 15 -cond_bin = 32 -lid1, rid1 = split_set_threads!(out, left, right, 𝑖, X_bin, feat, cond_bin, offset) -offset = 0 -feat = 14 -cond_bin = 12 -lid2, rid2 = split_set_threads!(out, left, right, lid1, X_bin, feat, cond_bin, offset) -offset = + length(lid1) -feat = 14 -cond_bin = 12 -lid3, rid3 = split_set_threads!(out, left, right, rid1, X_bin, feat, cond_bin, offset) - -lid1_ = deepcopy(lid1) - - - diff --git a/ext/EvoTreesCUDAExt/fit-utils.jl b/ext/EvoTreesCUDAExt/fit-utils.jl index 177e4fde..5dd6ae86 100644 --- a/ext/EvoTreesCUDAExt/fit-utils.jl +++ b/ext/EvoTreesCUDAExt/fit-utils.jl @@ -26,17 +26,18 @@ end function update_hist_gpu!(h, h∇_cpu, h∇, ∇, x_bin, is, js, jsc) kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js) config = launch_configuration(kernel.fun) - max_threads = config.threads ÷ 4 - max_blocks = config.blocks * 4 + max_threads = config.threads + max_blocks = config.blocks k = size(h∇, 1) ty = max(1, min(length(js), fld(max_threads, k))) - tx = min(64, max(1, min(length(is), fld(max_threads, k * ty)))) + tx = max(1, min(length(is), fld(max_threads, k * ty))) threads = (k, ty, tx) by = cld(length(js), ty) bx = min(cld(max_blocks, by), cld(length(is), tx)) blocks = (1, by, bx) h∇ .= 0 kernel(h∇, ∇, x_bin, is, js; threads, blocks) + CUDA.synchronize() copyto!(h∇_cpu, h∇) Threads.@threads for j in jsc nbins = size(h[j], 2)
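
################################################################################
# Editor's note (appended, not part of the patch): the ext/EvoTreesCUDAExt/fit-utils.jl
# hunk above lifts the previous launch-budget haircut (config.threads ÷ 4,
# config.blocks * 4, and the 64-thread cap on tx) and adds an explicit
# CUDA.synchronize() between the kernel launch and the device-to-host copyto!,
# presumably so the host copy cannot start before the histogram kernel has finished.
# Below is a minimal validation sketch, assuming a working CUDA device and that
# hist_kernel! and update_hist_gpu! from experiments/hist/hist_gpu-ref.jl are in
# scope; hist_cpu_ref! is an illustrative helper, not part of the EvoTrees API.
################################################################################

using CUDA
using StatsBase: sample

# naive CPU reference: accumulate the 3 gradient statistics per (bin, feature) cell
function hist_cpu_ref!(h∇, ∇, x_bin, is, js)
    for j in js, i in is
        bin = x_bin[i, j]
        for k in axes(∇, 1)
            h∇[k, bin, j] += ∇[k, i]
        end
    end
    return nothing
end

nbins, nfeats, nobs = 32, 10, 10_000
x_bin = UInt8.(rand(1:nbins, nobs, nfeats))
∇ = rand(Float32, 3, nobs)
is = sample(1:nobs, nobs ÷ 2, replace=false, ordered=true)
js = sample(1:nfeats, nfeats ÷ 2, replace=false, ordered=true)

h∇_ref = zeros(Float32, 3, nbins, nfeats)
hist_cpu_ref!(h∇_ref, ∇, x_bin, is, js)

h = [zeros(Float32, 3, nbins) for _ in 1:nfeats]
h∇_cpu = zeros(Float32, 3, nbins, nfeats)
update_hist_gpu!(h, h∇_cpu, CuArray(h∇_cpu), CuArray(∇), CuArray(x_bin), CuArray(is), CuArray(js), js)

# atomic float adds reorder the summation, so compare with isapprox
@assert h∇_cpu ≈ h∇_ref

# The 3-d launch maps threadIdx().x to the 3 gradient statistics, so the three
# atomic_add!s of one observation hit contiguous (k, bin, feat) slots of h∇.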