diff --git a/Project.toml b/Project.toml index c6e1e298..f96d0415 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "EvoTrees" uuid = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5" authors = ["jeremiedb "] -version = "0.16.3" +version = "0.16.4" [deps] BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" diff --git a/experiments/hist/perf-gpu.jl b/experiments/hist/perf-gpu.jl new file mode 100644 index 00000000..d7b094b6 --- /dev/null +++ b/experiments/hist/perf-gpu.jl @@ -0,0 +1,202 @@ +using Revise +using CUDA +using StatsBase: sample +using BenchmarkTools +using Base.Threads: @threads +using Random: seed! + +""" + hist_kernel! +""" +function hist_kernel!(h∇::CuDeviceArray{T,3}, ∇::CuDeviceMatrix{S}, x_bin, is, js) where {T,S} + tix, tiy, k = threadIdx().z, threadIdx().y, threadIdx().x + bdx, bdy = blockDim().z, blockDim().y + bix, biy = blockIdx().z, blockIdx().y + gdx = gridDim().z + + j = tiy + bdy * (biy - 1) + if j <= length(js) + jdx = js[j] + i_max = length(is) + niter = cld(i_max, bdx * gdx) + @inbounds for iter = 1:niter + i = tix + bdx * (bix - 1) + bdx * gdx * (iter - 1) + if i <= i_max + @inbounds idx = is[i] + @inbounds bin = x_bin[idx, jdx] + hid = Base._to_linear_index(h∇, k, bin, jdx) + CUDA.atomic_add!(pointer(h∇, hid), T(∇[k, idx])) + end + end + end + sync_threads() + return nothing +end + +function update_hist_gpu!(h, h∇, ∇, x_bin, is, js, jsc) + kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js) + config = launch_configuration(kernel.fun) + max_threads = config.threads ÷ 4 + max_blocks = config.blocks * 4 + k = size(h∇, 1) + ty = max(1, min(length(js), fld(max_threads, k))) + tx = min(64, max(1, min(length(is), fld(max_threads, k * ty)))) + threads = (k, ty, tx) + by = cld(length(js), ty) + bx = min(cld(max_blocks, by), cld(length(is), tx)) + blocks = (1, by, bx) + h∇ .= 0 + kernel(h∇, ∇, x_bin, is, js; threads, blocks) + CUDA.synchronize() + CUDA.@sync for j in jsc + nbins = size(h[j], 2) + copyto!(h[j], view(h∇, :, 1:nbins, j)) + end + return nothing +end + +function update_hist_gpu1!(h, h∇, ∇, x_bin, is, js, jsc) + kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js) + config = launch_configuration(kernel.fun) + max_threads = config.threads ÷ 4 + max_blocks = config.blocks * 4 + k = size(h∇, 1) + ty = max(1, min(length(js), fld(max_threads, k))) + tx = min(64, max(1, min(length(is), fld(max_threads, k * ty)))) + threads = (k, ty, tx) + by = cld(length(js), ty) + bx = min(cld(max_blocks, by), cld(length(is), tx)) + blocks = (1, by, bx) + h∇ .= 0 + kernel(h∇, ∇, x_bin, is, js; threads, blocks) + CUDA.synchronize() + return nothing +end + +function update_hist_gpu2!(h, h∇_cpu, h∇, ∇, x_bin, is, js, jsc) + kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js) + config = launch_configuration(kernel.fun) + max_threads = config.threads ÷ 4 + max_blocks = config.blocks * 4 + k = size(h∇, 1) + ty = max(1, min(length(js), fld(max_threads, k))) + tx = min(64, max(1, min(length(is), fld(max_threads, k * ty)))) + threads = (k, ty, tx) + by = cld(length(js), ty) + bx = min(cld(max_blocks, by), cld(length(is), tx)) + blocks = (1, by, bx) + h∇ .= 0 + kernel(h∇, ∇, x_bin, is, js; threads, blocks) + copyto!(h∇_cpu, h∇) + CUDA.synchronize() + return nothing +end + + +function update_hist_gpu3!(h, h∇_cpu, h∇, ∇, x_bin, is, js, jsc) + kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js) + config = launch_configuration(kernel.fun) + max_threads = config.threads ÷ 4 + max_blocks = config.blocks * 4 + k = size(h∇, 1) + ty = max(1, 
min(length(js), fld(max_threads, k))) + tx = min(64, max(1, min(length(is), fld(max_threads, k * ty)))) + threads = (k, ty, tx) + by = cld(length(js), ty) + bx = min(cld(max_blocks, by), cld(length(is), tx)) + blocks = (1, by, bx) + h∇ .= 0 + kernel(h∇, ∇, x_bin, is, js; threads, blocks) + # CUDA.synchronize() + copyto!(h∇_cpu, h∇) + # CUDA.synchronize() + @threads for j in jsc + nbins = size(h[j], 2) + @views h[j] .= h∇_cpu[:, 1:nbins, j] + # h[j] .= h∇_cpu[:, 1:nbins, j] + end + return nothing +end + + +seed!(123) +nbins = 32 +nfeats = 100 +nobs = Int(1e6) +x_bin = UInt8.(rand(1:nbins, nobs, nfeats)); +∇ = rand(Float32, 3, nobs); +h∇ = [zeros(Float32, 3, nbins) for n in 1:nfeats] +rowsample = 0.5 +colsample = 0.5 +is = sample(1:nobs, Int(round(rowsample * nobs)), replace=false, ordered=true) +js = sample(1:nfeats, Int(round(rowsample * nfeats)), replace=false, ordered=true) + +∇_gpu = CuArray(∇) +x_bin_gpu = CuArray(x_bin) +h∇_cpu = zeros(Float32, 3, nbins, nfeats) +h∇_gpu = CuArray(h∇_cpu) +is_gpu = CuArray(is) +js_gpu = CuArray(js) + +CUDA.allowscalar(false) +CUDA.@time update_hist_gpu!(h∇, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js) +# ref without copy to cpu: ~same +# ref 10K: 875.100 μs (168 allocations: 7.08 KiB) +# ref 100K: 1.236 ms (215 allocations: 9.91 KiB) +# ref 1M: 6.138 ms (227 allocations: 12.00 KiB) +# ref 10M: 67.075 ms (235 allocations: 13.38 KiB) + +# with copy +# CUDA v4 1M: 2.903 ms (124 allocations: 6.98 KiB) +# CUDA v5 1M: 3.542 ms (848 allocations: 37.14 KiB) +@btime update_hist_gpu!(h∇, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js) + +# without copy +# CUDA v4 1M: 2.599 ms (74 allocations: 4.64 KiB) +# CUDA v5 1M: 2.274 ms (48 allocations: 2.77 KiB) +@btime update_hist_gpu1!(h∇, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js) + +# without single array copy +# CUDA v4 1M: +# CUDA v5 1M: 2.447 ms (48 allocations: 2.77 KiB) +@btime update_hist_gpu2!(h∇, h∇_cpu, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js) + +# without single array copy +# CUDA v4 1M: +# CUDA v5 1M: 2.442 ms (48 allocations: 2.77 KiB) +@btime update_hist_gpu3!(h∇, h∇_cpu, h∇_gpu, ∇_gpu, x_bin_gpu, is_gpu, js_gpu, js) + + +using CUDA, BenchmarkTools +function gpu_copy!(h, h∇, jsc) + CUDA.@sync for j in jsc + nbins = size(h[j], 2) + copyto!(h[j], view(h∇, :, 1:nbins, j)) + end + return nothing +end + +h∇ = [zeros(Float32, 3, 32) for n in 1:100]; +h∇_gpu = CUDA.zeros(Float32, 3, 32, 100); +js = 1:100 + +# CUDA v4: 534.480 μs (100 allocations: 4.69 KiB) +# CUDA v5: 1.203 ms (1600 allocations: 68.75 KiB) +@btime gpu_copy!(h∇, h∇_gpu, js) + + +function gpu_copy2!(h, h∇, jsc) + for j in jsc + nbins = size(h[j], 2) + @async copyto!(h[j], view(h∇, :, 1:nbins, j)) + end + return nothing +end + +h∇ = [zeros(Float32, 3, 32) for n in 1:100]; +h∇_gpu = CUDA.zeros(Float32, 3, 32, 100); +js = 1:100 + +# CUDA v4: 534.480 μs (100 allocations: 4.69 KiB) +# CUDA v5: 1.203 ms (1600 allocations: 68.75 KiB) +@btime gpu_copy2!(h∇, h∇_gpu, js) diff --git a/experiments/readme_plots-df-cpu.jl b/experiments/readme_plots-df-cpu.jl index 878c6d7f..8e7a8df7 100644 --- a/experiments/readme_plots-df-cpu.jl +++ b/experiments/readme_plots-df-cpu.jl @@ -121,535 +121,3 @@ plot!( linewidth=1.5, label="Linear - C", ) - -# logistic / cross-entropy -params1 = EvoTreeRegressor( - loss=:logistic, - nrounds=200, - nbins=64, - lambda=0.1, - gamma=0.1, - eta=0.05, - max_depth=6, - min_weight=1.0, - rowsample=0.5, - colsample=1.0, -) -@time model = fit_evotree( - params1, - dtrain; - fnames=["x_num", "x_cat"], - target_name="y", - deval, - 
metric=:logloss, - print_every_n=25, - early_stopping_rounds=20, - verbosity=0 -); -# 218.040 ms (123372 allocations: 34.71 MiB) -# @btime model = fit_evotree($params1, $X_train, $Y_train, X_eval = $X_eval, Y_eval = $Y_eval) -plot( - dtrain.x_num, - dtrain.y, - msize=0.5, - mcolor="darkgray", - mswidth=0, - background_color=RGB(1, 1, 1), - seriestype=:scatter, - xaxis=("feature"), - yaxis=("target"), - legend=true, - label="", -) -dinfer = dtrain[dtrain.x_cat.=="A", :] -pred = model(dinfer) -x_perm = sortperm(dinfer.x_num) -plot!( - dinfer.x_num[x_perm], - pred[x_perm], - color="lightblue", - linewidth=1.5, - label="Linear - A", -) -dinfer = dtrain[dtrain.x_cat.=="B", :] -pred = model(dinfer); -x_perm = sortperm(dinfer.x_num) -plot!( - dinfer.x_num[x_perm], - pred[x_perm], - color="blue", - linewidth=1.5, - label="Linear - B", -) -dinfer = dtrain[dtrain.x_cat.=="C", :] -pred = model(dinfer); -x_perm = sortperm(dinfer.x_num) -plot!( - dinfer.x_num[x_perm], - pred[x_perm], - color="navy", - linewidth=1.5, - label="Linear - C", -) - -# L1 -params1 = EvoTreeRegressor( - loss=:L1, - alpha=0.5, - nrounds=500, - nbins=64, - lambda=0.0, - gamma=0.0, - eta=0.1, - max_depth=6, - min_weight=1.0, - rowsample=0.5, - colsample=1.0, -) -@time model = fit_evotree( - params1; - x_train, - y_train, - x_eval, - y_eval, - print_every_n=25, - early_stopping_rounds=50, - metric=:mae -); -@time pred_train_L1 = predict(model, x_train) -@time pred_eval_L1 = predict(model, x_eval) -sqrt(mean((pred_train_L1 .- y_train) .^ 2)) - -x_perm = sortperm(x_train[:, 1]) -plot( - x_train, - y_train, - msize=0.5, - mcolor="darkgray", - mswidth=0, - background_color=RGB(1, 1, 1), - seriestype=:scatter, - xaxis=("feature"), - yaxis=("target"), - legend=true, - label="", -) -plot!( - x_train[:, 1][x_perm], - pred_train_linear[x_perm], - color="navy", - linewidth=1.5, - label="Linear", -) -plot!( - x_train[:, 1][x_perm], - pred_train_linear_w[x_perm], - color="lightblue", - linewidth=1.5, - label="LinearW", -) -plot!( - x_train[:, 1][x_perm], - pred_train_logistic[x_perm], - color="darkred", - linewidth=1.5, - label="Logistic", -) -plot!( - x_train[:, 1][x_perm], - pred_train_L1[x_perm], - color="darkgreen", - linewidth=1.5, - label="L1", -) -savefig("figures/regression_sinus.png") - -# Poisson -params1 = EvoTreeCount( - loss=:poisson, - nrounds=500, - nbins=64, - lambda=0.1, - gamma=0.1, - eta=0.1, - max_depth=6, - min_weight=1.0, - rowsample=0.5, - colsample=1.0, -) -@time model = fit_evotree( - params1; - x_train, - y_train, - x_eval, - y_eval, - print_every_n=25, - early_stopping_rounds=50, - metric=:poisson -); -@time pred_train_poisson = predict(model, x_train); -sqrt(mean((pred_train_poisson .- y_train) .^ 2)) - -# Gamma -params1 = EvoTreeRegressor( - loss=:gamma, - nrounds=500, - nbins=64, - lambda=0.1, - gamma=0.1, - eta=0.02, - max_depth=6, - min_weight=1.0, - rowsample=0.5, - colsample=1.0, -) -@time model = fit_evotree( - params1; - x_train, - y_train, - x_eval, - y_eval, - print_every_n=25, - early_stopping_rounds=50, - metric=:gamma -); -@time pred_train_gamma = predict(model, x_train); -sqrt(mean((pred_train_gamma .- y_train) .^ 2)) - -# Tweedie -params1 = EvoTreeRegressor( - loss=:tweedie, - nrounds=500, - nbins=64, - lambda=0.5, - gamma=0.1, - eta=0.1, - max_depth=6, - min_weight=1.0, - rowsample=0.5, - colsample=1.0, -) -@time model = fit_evotree( - params1; - x_train, - y_train, - x_eval, - y_eval, - print_every_n=25, - early_stopping_rounds=50, - metric=:tweedie -); -@time pred_train_tweedie = predict(model, 
x_train); -sqrt(mean((pred_train_tweedie .- y_train) .^ 2)) - -x_perm = sortperm(x_train[:, 1]) -plot( - x_train, - y_train, - msize=0.5, - mcolor="darkgray", - mswidth=0, - background_color=RGB(1, 1, 1), - seriestype=:scatter, - xaxis=("feature"), - yaxis=("target"), - legend=true, - label="", -) -plot!( - x_train[:, 1][x_perm], - pred_train_poisson[x_perm], - color="navy", - linewidth=1.5, - label="Poisson", -) -plot!( - x_train[:, 1][x_perm], - pred_train_gamma[x_perm], - color="lightblue", - linewidth=1.5, - label="Gamma", -) -plot!( - x_train[:, 1][x_perm], - pred_train_tweedie[x_perm], - color="darkred", - linewidth=1.5, - label="Tweedie", -) -savefig("figures/regression_sinus2.png") - - -############################### -## Quantiles -############################### -# q50 -params1 = EvoTreeRegressor( - loss=:quantile, - alpha=0.5, - nrounds=500, - nbins=64, - lambda=0.1, - gamma=0.0, - eta=0.05, - max_depth=6, - min_weight=1.0, - rowsample=0.5, - colsample=1.0, -) -@time model = fit_evotree( - params1; - x_train, - y_train, - x_eval, - y_eval, - print_every_n=25, - early_stopping_rounds=50, - metric=:mae -); -# 116.822 ms (74496 allocations: 36.41 MiB) for 100 iterations -# @btime model = grow_gbtree($X_train, $Y_train, $params1, X_eval = $X_eval, Y_eval = $Y_eval) -@time pred_train_q50 = predict(model, x_train) -sum(pred_train_q50 .< y_train) / length(y_train) - -# q20 -params1 = EvoTreeRegressor( - loss=:quantile, - alpha=0.2, - nrounds=300, - nbins=64, - lambda=0.1, - gamma=0.0, - eta=0.05, - max_depth=6, - min_weight=1.0, - rowsample=0.5, - colsample=1.0, -) -@time model = fit_evotree(params1; x_train, y_train, x_eval, y_eval, print_every_n=25); -@time pred_train_q20 = predict(model, x_train) -sum(pred_train_q20 .< y_train) / length(y_train) - -# q80 -params1 = EvoTreeRegressor( - loss=:quantile, - alpha=0.8, - nrounds=300, - nbins=64, - lambda=0.1, - gamma=0.0, - eta=0.05, - max_depth=6, - min_weight=1.0, - rowsample=0.5, - colsample=1.0, -) -@time model = fit_evotree(params1; x_train, y_train, x_eval, y_eval, print_every_n=25) -@time pred_train_q80 = predict(model, x_train) -sum(pred_train_q80 .< y_train) / length(y_train) - -x_perm = sortperm(x_train[:, 1]) -plot( - x_train, - y_train, - ms=0.5, - mcolor="darkgray", - mswidth=0, - background_color=RGB(1, 1, 1), - seriestype=:scatter, - xaxis=("feature"), - yaxis=("target"), - legend=true, - label="", -) -plot!( - x_train[:, 1][x_perm], - pred_train_q50[x_perm], - color="navy", - linewidth=1.5, - label="Median", -) -plot!( - x_train[:, 1][x_perm], - pred_train_q20[x_perm], - color="darkred", - linewidth=1.5, - label="Q20", -) -plot!( - x_train[:, 1][x_perm], - pred_train_q80[x_perm], - color="darkgreen", - linewidth=1.5, - label="Q80", -) -savefig("figures/quantiles_sinus.png") - - -############################### -## gaussian -############################### -params1 = EvoTreeMLE( - loss=:gaussian, - nrounds=500, - nbins=64, - lambda=0.1, - gamma=0.1, - eta=0.05, - max_depth=6, - min_weight=10.0, - rowsample=1.0, - colsample=1.0, - rng=123, -) - -@time model = fit_evotree( - params1; - x_train, - y_train, - x_eval, - y_eval, - print_every_n=25, - early_stopping_rounds=50, - metric=:gaussian -); -# @time model = fit_evotree(params1, X_train, Y_train, print_every_n = 10); -@time pred_train = EvoTrees.predict(model, x_train); -# @btime pred_train = EvoTrees.predict(model, X_train); - -pred_gauss = - [Distributions.Normal(pred_train[i, 1], pred_train[i, 2]) for i in axes(pred_train, 1)] -pred_q80 = quantile.(pred_gauss, 0.8) 
-pred_q20 = quantile.(pred_gauss, 0.2) - -mean(y_train .< pred_q80) -mean(y_train .< pred_q20) - -x_perm = sortperm(x_train[:, 1]) -plot( - x_train[:, 1], - y_train, - ms=0.5, - mcolor="darkgray", - mswidth=0, - background_color=RGB(1, 1, 1), - seriestype=:scatter, - xaxis=("feature"), - yaxis=("target"), - legend=true, - label="", -) -plot!( - x_train[:, 1][x_perm], - pred_train[x_perm, 1], - color="navy", - linewidth=1.5, - label="mu", -) -plot!( - x_train[:, 1][x_perm], - pred_train[x_perm, 2], - color="darkred", - linewidth=1.5, - label="sigma", -) -plot!( - x_train[:, 1][x_perm], - pred_q20[x_perm, 1], - color="darkgreen", - linewidth=1.5, - label="q20", -) -plot!( - x_train[:, 1][x_perm], - pred_q80[x_perm, 1], - color="darkgreen", - linewidth=1.5, - label="q80", -) -savefig("figures/gaussian-sinus.png") - - -############################### -## Logistic -############################### -params1 = EvoTrees.EvoTreeMLE( - loss=:logistic, - nrounds=500, - nbins=64, - lambda=1.0, - gamma=0.1, - eta=0.03, - max_depth=6, - min_weight=1.0, - rowsample=1.0, - colsample=1.0, - rng=123, -) - -@time model = fit_evotree( - params1; - x_train, - y_train, - x_eval, - y_eval, - print_every_n=25, - early_stopping_rounds=50, - metric=:logistic_mle -); -# @time model = fit_evotree(params1, X_train, Y_train, print_every_n = 10); -@time pred_train = EvoTrees.predict(model, x_train); -# @btime pred_train = EvoTrees.predict(model, X_train); - -pred_logistic = [ - Distributions.Logistic(pred_train[i, 1], pred_train[i, 2]) for i in axes(pred_train, 1) -] -pred_q80 = quantile.(pred_logistic, 0.8) -pred_q20 = quantile.(pred_logistic, 0.2) - -mean(y_train .< pred_q80) -mean(y_train .< pred_q20) - -x_perm = sortperm(x_train[:, 1]) -plot( - x_train[:, 1], - y_train, - ms=0.5, - mcolor="darkgray", - mswidth=0, - background_color=RGB(1, 1, 1), - seriestype=:scatter, - xaxis=("feature"), - yaxis=("target"), - legend=true, - label="", -) -plot!( - x_train[:, 1][x_perm], - pred_train[x_perm, 1], - color="navy", - linewidth=1.5, - label="mu", -) -plot!( - x_train[:, 1][x_perm], - pred_train[x_perm, 2], - color="darkred", - linewidth=1.5, - label="s", -) -plot!( - x_train[:, 1][x_perm], - pred_q20[x_perm, 1], - color="darkgreen", - linewidth=1.5, - label="q20", -) -plot!( - x_train[:, 1][x_perm], - pred_q80[x_perm, 1], - color="darkgreen", - linewidth=1.5, - label="q80", -) -savefig("figures/logistic-sinus.png") diff --git a/experiments/readme_plots-df-gpu.jl b/experiments/readme_plots-df-gpu.jl index bff06ae4..0e623a28 100644 --- a/experiments/readme_plots-df-gpu.jl +++ b/experiments/readme_plots-df-gpu.jl @@ -5,6 +5,7 @@ using StatsBase: sample, quantile using Distributions using Random using Plots +using CUDA using EvoTrees using DataFrames using CategoricalArrays @@ -124,495 +125,3 @@ plot!( linewidth=1.5, label="Linear - C", ) - -# logistic / cross-entropy -params1 = EvoTreeRegressor( - loss=:logistic, - nrounds=200, - nbins=64, - lambda=0.1, - gamma=0.1, - eta=0.05, - max_depth=6, - min_weight=1.0, - rowsample=0.5, - colsample=1.0, -) - -@time model = fit_evotree( - params1; - x_train, - y_train, - x_eval, - y_eval, - print_every_n=25, - early_stopping_rounds=50, - metric=:logloss -); -# 218.040 ms (123372 allocations: 34.71 MiB) -# @btime model = fit_evotree($params1, $X_train, $Y_train, X_eval = $X_eval, Y_eval = $Y_eval) -@time pred_train_logistic = predict(model, x_train); -@time pred_eval_logistic = predict(model, x_eval) -sqrt(mean((pred_train_logistic .- y_train) .^ 2)) - -# L1 -params1 = EvoTreeRegressor( 
- loss=:L1, - alpha=0.5, - nrounds=500, - nbins=64, - lambda=0.0, - gamma=0.0, - eta=0.1, - max_depth=6, - min_weight=1.0, - rowsample=0.5, - colsample=1.0, -) -@time model = fit_evotree( - params1; - x_train, - y_train, - x_eval, - y_eval, - print_every_n=25, - early_stopping_rounds=50, - metric=:mae -); -@time pred_train_L1 = predict(model, x_train) -@time pred_eval_L1 = predict(model, x_eval) -sqrt(mean((pred_train_L1 .- y_train) .^ 2)) - -x_perm = sortperm(x_train[:, 1]) -plot( - x_train, - y_train, - msize=0.5, - mcolor="darkgray", - mswidth=0, - background_color=RGB(1, 1, 1), - seriestype=:scatter, - xaxis=("feature"), - yaxis=("target"), - legend=true, - label="", -) -plot!( - x_train[:, 1][x_perm], - pred_train_linear[x_perm], - color="navy", - linewidth=1.5, - label="Linear", -) -plot!( - x_train[:, 1][x_perm], - pred_train_linear_w[x_perm], - color="lightblue", - linewidth=1.5, - label="LinearW", -) -plot!( - x_train[:, 1][x_perm], - pred_train_logistic[x_perm], - color="darkred", - linewidth=1.5, - label="Logistic", -) -plot!( - x_train[:, 1][x_perm], - pred_train_L1[x_perm], - color="darkgreen", - linewidth=1.5, - label="L1", -) -savefig("figures/regression_sinus.png") - -# Poisson -params1 = EvoTreeCount( - loss=:poisson, - nrounds=500, - nbins=64, - lambda=0.1, - gamma=0.1, - eta=0.1, - max_depth=6, - min_weight=1.0, - rowsample=0.5, - colsample=1.0, -) -@time model = fit_evotree( - params1; - x_train, - y_train, - x_eval, - y_eval, - print_every_n=25, - early_stopping_rounds=50, - metric=:poisson -); -@time pred_train_poisson = predict(model, x_train); -sqrt(mean((pred_train_poisson .- y_train) .^ 2)) - -# Gamma -params1 = EvoTreeRegressor( - loss=:gamma, - nrounds=500, - nbins=64, - lambda=0.1, - gamma=0.1, - eta=0.02, - max_depth=6, - min_weight=1.0, - rowsample=0.5, - colsample=1.0, -) -@time model = fit_evotree( - params1; - x_train, - y_train, - x_eval, - y_eval, - print_every_n=25, - early_stopping_rounds=50, - metric=:gamma -); -@time pred_train_gamma = predict(model, x_train); -sqrt(mean((pred_train_gamma .- y_train) .^ 2)) - -# Tweedie -params1 = EvoTreeRegressor( - loss=:tweedie, - nrounds=500, - nbins=64, - lambda=0.5, - gamma=0.1, - eta=0.1, - max_depth=6, - min_weight=1.0, - rowsample=0.5, - colsample=1.0, -) -@time model = fit_evotree( - params1; - x_train, - y_train, - x_eval, - y_eval, - print_every_n=25, - early_stopping_rounds=50, - metric=:tweedie -); -@time pred_train_tweedie = predict(model, x_train); -sqrt(mean((pred_train_tweedie .- y_train) .^ 2)) - -x_perm = sortperm(x_train[:, 1]) -plot( - x_train, - y_train, - msize=0.5, - mcolor="darkgray", - mswidth=0, - background_color=RGB(1, 1, 1), - seriestype=:scatter, - xaxis=("feature"), - yaxis=("target"), - legend=true, - label="", -) -plot!( - x_train[:, 1][x_perm], - pred_train_poisson[x_perm], - color="navy", - linewidth=1.5, - label="Poisson", -) -plot!( - x_train[:, 1][x_perm], - pred_train_gamma[x_perm], - color="lightblue", - linewidth=1.5, - label="Gamma", -) -plot!( - x_train[:, 1][x_perm], - pred_train_tweedie[x_perm], - color="darkred", - linewidth=1.5, - label="Tweedie", -) -savefig("figures/regression_sinus2.png") - - -############################### -## Quantiles -############################### -# q50 -params1 = EvoTreeRegressor( - loss=:quantile, - alpha=0.5, - nrounds=500, - nbins=64, - lambda=0.1, - gamma=0.0, - eta=0.05, - max_depth=6, - min_weight=1.0, - rowsample=0.5, - colsample=1.0, -) -@time model = fit_evotree( - params1; - x_train, - y_train, - x_eval, - y_eval, - 
print_every_n=25, - early_stopping_rounds=50, - metric=:mae -); -# 116.822 ms (74496 allocations: 36.41 MiB) for 100 iterations -# @btime model = grow_gbtree($X_train, $Y_train, $params1, X_eval = $X_eval, Y_eval = $Y_eval) -@time pred_train_q50 = predict(model, x_train) -sum(pred_train_q50 .< y_train) / length(y_train) - -# q20 -params1 = EvoTreeRegressor( - loss=:quantile, - alpha=0.2, - nrounds=300, - nbins=64, - lambda=0.1, - gamma=0.0, - eta=0.05, - max_depth=6, - min_weight=1.0, - rowsample=0.5, - colsample=1.0, -) -@time model = fit_evotree(params1; x_train, y_train, x_eval, y_eval, print_every_n=25); -@time pred_train_q20 = predict(model, x_train) -sum(pred_train_q20 .< y_train) / length(y_train) - -# q80 -params1 = EvoTreeRegressor( - loss=:quantile, - alpha=0.8, - nrounds=300, - nbins=64, - lambda=0.1, - gamma=0.0, - eta=0.05, - max_depth=6, - min_weight=1.0, - rowsample=0.5, - colsample=1.0, -) -@time model = fit_evotree(params1; x_train, y_train, x_eval, y_eval, print_every_n=25) -@time pred_train_q80 = predict(model, x_train) -sum(pred_train_q80 .< y_train) / length(y_train) - -x_perm = sortperm(x_train[:, 1]) -plot( - x_train, - y_train, - ms=0.5, - mcolor="darkgray", - mswidth=0, - background_color=RGB(1, 1, 1), - seriestype=:scatter, - xaxis=("feature"), - yaxis=("target"), - legend=true, - label="", -) -plot!( - x_train[:, 1][x_perm], - pred_train_q50[x_perm], - color="navy", - linewidth=1.5, - label="Median", -) -plot!( - x_train[:, 1][x_perm], - pred_train_q20[x_perm], - color="darkred", - linewidth=1.5, - label="Q20", -) -plot!( - x_train[:, 1][x_perm], - pred_train_q80[x_perm], - color="darkgreen", - linewidth=1.5, - label="Q80", -) -savefig("figures/quantiles_sinus.png") - - -############################### -## gaussian -############################### -params1 = EvoTreeMLE( - loss=:gaussian, - nrounds=500, - nbins=64, - lambda=0.1, - gamma=0.1, - eta=0.05, - max_depth=6, - min_weight=10.0, - rowsample=1.0, - colsample=1.0, - rng=123, -) - -@time model = fit_evotree( - params1; - x_train, - y_train, - x_eval, - y_eval, - print_every_n=25, - early_stopping_rounds=50, - metric=:gaussian -); -# @time model = fit_evotree(params1, X_train, Y_train, print_every_n = 10); -@time pred_train = EvoTrees.predict(model, x_train); -# @btime pred_train = EvoTrees.predict(model, X_train); - -pred_gauss = - [Distributions.Normal(pred_train[i, 1], pred_train[i, 2]) for i in axes(pred_train, 1)] -pred_q80 = quantile.(pred_gauss, 0.8) -pred_q20 = quantile.(pred_gauss, 0.2) - -mean(y_train .< pred_q80) -mean(y_train .< pred_q20) - -x_perm = sortperm(x_train[:, 1]) -plot( - x_train[:, 1], - y_train, - ms=0.5, - mcolor="darkgray", - mswidth=0, - background_color=RGB(1, 1, 1), - seriestype=:scatter, - xaxis=("feature"), - yaxis=("target"), - legend=true, - label="", -) -plot!( - x_train[:, 1][x_perm], - pred_train[x_perm, 1], - color="navy", - linewidth=1.5, - label="mu", -) -plot!( - x_train[:, 1][x_perm], - pred_train[x_perm, 2], - color="darkred", - linewidth=1.5, - label="sigma", -) -plot!( - x_train[:, 1][x_perm], - pred_q20[x_perm, 1], - color="darkgreen", - linewidth=1.5, - label="q20", -) -plot!( - x_train[:, 1][x_perm], - pred_q80[x_perm, 1], - color="darkgreen", - linewidth=1.5, - label="q80", -) -savefig("figures/gaussian-sinus.png") - - -############################### -## Logistic -############################### -params1 = EvoTrees.EvoTreeMLE( - loss=:logistic, - nrounds=500, - nbins=64, - lambda=1.0, - gamma=0.1, - eta=0.03, - max_depth=6, - min_weight=1.0, - rowsample=1.0, - 
colsample=1.0, - rng=123, -) - -@time model = fit_evotree( - params1; - x_train, - y_train, - x_eval, - y_eval, - print_every_n=25, - early_stopping_rounds=50, - metric=:logistic_mle -); -# @time model = fit_evotree(params1, X_train, Y_train, print_every_n = 10); -@time pred_train = EvoTrees.predict(model, x_train); -# @btime pred_train = EvoTrees.predict(model, X_train); - -pred_logistic = [ - Distributions.Logistic(pred_train[i, 1], pred_train[i, 2]) for i in axes(pred_train, 1) -] -pred_q80 = quantile.(pred_logistic, 0.8) -pred_q20 = quantile.(pred_logistic, 0.2) - -mean(y_train .< pred_q80) -mean(y_train .< pred_q20) - -x_perm = sortperm(x_train[:, 1]) -plot( - x_train[:, 1], - y_train, - ms=0.5, - mcolor="darkgray", - mswidth=0, - background_color=RGB(1, 1, 1), - seriestype=:scatter, - xaxis=("feature"), - yaxis=("target"), - legend=true, - label="", -) -plot!( - x_train[:, 1][x_perm], - pred_train[x_perm, 1], - color="navy", - linewidth=1.5, - label="mu", -) -plot!( - x_train[:, 1][x_perm], - pred_train[x_perm, 2], - color="darkred", - linewidth=1.5, - label="s", -) -plot!( - x_train[:, 1][x_perm], - pred_q20[x_perm, 1], - color="darkgreen", - linewidth=1.5, - label="q20", -) -plot!( - x_train[:, 1][x_perm], - pred_q80[x_perm, 1], - color="darkgreen", - linewidth=1.5, - label="q80", -) -savefig("figures/logistic-sinus.png") diff --git a/experiments/readme_plots_gpu.jl b/experiments/readme_plots_gpu.jl index 33b89c92..18fdcf6c 100644 --- a/experiments/readme_plots_gpu.jl +++ b/experiments/readme_plots_gpu.jl @@ -5,12 +5,13 @@ using Distributions using Random using Plots using Revise +using CUDA using EvoTrees using EvoTrees: predict, sigmoid, logit # using ProfileView # prepare a dataset -tree_type = "binary" +tree_type = "binary" # binary/oblivious device = "gpu" Random.seed!(123) diff --git a/ext/EvoTreesCUDAExt/eval.jl b/ext/EvoTreesCUDAExt/eval.jl index 6ea10153..f041c18e 100644 --- a/ext/EvoTreesCUDAExt/eval.jl +++ b/ext/EvoTreesCUDAExt/eval.jl @@ -46,7 +46,7 @@ end function eval_logloss_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y::CuDeviceVector{T}, w::CuDeviceVector{T}) where {T<:AbstractFloat} i = threadIdx().x + (blockIdx().x - 1) * blockDim().x if i <= length(y) - @inbounds pred = sigmoid(p[1, i]) + @inbounds pred = EvoTrees.sigmoid(p[1, i]) @inbounds eval[i] = w[i] * (-y[i] * log(pred) + (y[i] - 1) * log(1 - pred)) end return nothing diff --git a/ext/EvoTreesCUDAExt/fit-utils.jl b/ext/EvoTreesCUDAExt/fit-utils.jl index 31bc1f55..2533f789 100644 --- a/ext/EvoTreesCUDAExt/fit-utils.jl +++ b/ext/EvoTreesCUDAExt/fit-utils.jl @@ -23,7 +23,7 @@ function hist_kernel!(h∇::CuDeviceArray{T,3}, ∇::CuDeviceMatrix{S}, x_bin, i return nothing end -function update_hist_gpu!(h, h∇, ∇, x_bin, is, js, jsc) +function update_hist_gpu!(h, h∇_cpu, h∇, ∇, x_bin, is, js, jsc) kernel = @cuda launch = false hist_kernel!(h∇, ∇, x_bin, is, js) config = launch_configuration(kernel.fun) max_threads = config.threads ÷ 4 @@ -37,10 +37,10 @@ function update_hist_gpu!(h, h∇, ∇, x_bin, is, js, jsc) blocks = (1, by, bx) h∇ .= 0 kernel(h∇, ∇, x_bin, is, js; threads, blocks) - CUDA.synchronize() - CUDA.@sync for j in jsc + copyto!(h∇_cpu, h∇) + Threads.@threads for j in jsc nbins = size(h[j], 2) - copyto!(h[j], view(h∇, :, 1:nbins, j)) + @views h[j] .= h∇_cpu[:, 1:nbins, j] end return nothing end diff --git a/ext/EvoTreesCUDAExt/fit.jl b/ext/EvoTreesCUDAExt/fit.jl index f0e32cf0..1f454254 100644 --- a/ext/EvoTreesCUDAExt/fit.jl +++ b/ext/EvoTreesCUDAExt/fit.jl @@ -21,6 +21,7 @@ function 
EvoTrees.grow_evotree!(evotree::EvoTree{L,K}, cache, params::EvoTrees.E cache.out, cache.left, cache.right, + cache.h∇_cpu, cache.h∇, cache.x_bin, cache.feattypes, @@ -43,6 +44,7 @@ function grow_tree!( out, left, right, + h∇_cpu::Array{Float64,3}, h∇::CuArray{Float64,3}, x_bin::CuMatrix, feattypes::Vector{Bool}, @@ -87,7 +89,7 @@ function grow_tree!( end end else - update_hist_gpu!(nodes[n].h, h∇, ∇, x_bin, nodes[n].is, jsg, js) + update_hist_gpu!(nodes[n].h, h∇_cpu, h∇, ∇, x_bin, nodes[n].is, jsg, js) end end Threads.@threads for n ∈ sort(n_current) @@ -160,6 +162,7 @@ function grow_otree!( out, left, right, + h∇_cpu::Array{Float64,3}, h∇::CuArray{Float64,3}, x_bin::CuMatrix, feattypes::Vector{Bool}, @@ -214,7 +217,7 @@ function grow_otree!( end end else - update_hist_gpu!(nodes[n].h, h∇, ∇, x_bin, nodes[n].is, jsg, js) + update_hist_gpu!(nodes[n].h, h∇_cpu, h∇, ∇, x_bin, nodes[n].is, jsg, js) end end Threads.@threads for n ∈ n_current diff --git a/ext/EvoTreesCUDAExt/init.jl b/ext/EvoTreesCUDAExt/init.jl index 6a8dfcda..3a0b24c9 100644 --- a/ext/EvoTreesCUDAExt/init.jl +++ b/ext/EvoTreesCUDAExt/init.jl @@ -63,7 +63,8 @@ function EvoTrees.init_core(params::EvoTrees.EvoTypes{L}, ::Type{<:EvoTrees.GPU} !isnothing(offset) && (pred .+= CuArray(offset')) # initialize gradients - h∇ = CUDA.zeros(Float64, 2 * K + 1, maximum(featbins), length(featbins)) + h∇_cpu = zeros(Float64, 2 * K + 1, maximum(featbins), length(featbins)) + h∇ = CuArray(h∇_cpu) ∇ = CUDA.zeros(T, 2 * K + 1, nobs) @assert (length(y) == length(w) && minimum(w) > 0) ∇[end, :] .= w @@ -117,6 +118,7 @@ function EvoTrees.init_core(params::EvoTrees.EvoTypes{L}, ::Type{<:EvoTrees.GPU} right=right, ∇=∇, h∇=h∇, + h∇_cpu=h∇_cpu, fnames=fnames, edges=edges, featbins=featbins, diff --git a/figures/gaussian-sinus-oblivious-gpu.png b/figures/gaussian-sinus-oblivious-gpu.png index abde7cdd..c62c1267 100644 Binary files a/figures/gaussian-sinus-oblivious-gpu.png and b/figures/gaussian-sinus-oblivious-gpu.png differ diff --git a/figures/regression-sinus-oblivious-gpu.png b/figures/regression-sinus-oblivious-gpu.png index 44644279..eb60970d 100644 Binary files a/figures/regression-sinus-oblivious-gpu.png and b/figures/regression-sinus-oblivious-gpu.png differ
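
The core change in ext/EvoTreesCUDAExt/fit-utils.jl replaces the per-feature device-to-host copies (one copyto! per feature under CUDA.@sync, which the gpu_copy! benchmarks in experiments/hist/perf-gpu.jl show regressing sharply under CUDA.jl v5) with a single bulk copy of the whole histogram buffer followed by threaded slicing on the CPU. Below is a minimal standalone sketch of that pattern; the helper name copy_hist_bulk! and the Float32 (3, 32, 100) shapes are illustrative only, taken from the perf-gpu.jl benchmark setup rather than from the package API.

using CUDA
using Base.Threads: @threads

# h∇_gpu holds the full histogram as (2K+1, nbins, nfeats) on the device;
# h[j] is the per-feature CPU matrix that the tree-growing code reads.
function copy_hist_bulk!(h, h∇_cpu, h∇_gpu, js)
    copyto!(h∇_cpu, h∇_gpu)   # one device-to-host transfer for the whole buffer
    @threads for j in js      # then slice per-feature views on the CPU
        nbins = size(h[j], 2)
        @views h[j] .= h∇_cpu[:, 1:nbins, j]
    end
    return nothing
end

# usage mirroring the benchmark setup in perf-gpu.jl
h = [zeros(Float32, 3, 32) for _ in 1:100]
h∇_cpu = zeros(Float32, 3, 32, 100)
h∇_gpu = CUDA.zeros(Float32, 3, 32, 100)
copy_hist_bulk!(h, h∇_cpu, h∇_gpu, 1:100)

Trading many small transfers for one large transfer plus CPU threads is what the init.jl hunk enables by preallocating h∇_cpu alongside the CuArray and threading it through grow_tree!/grow_otree!.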