Merge pull request #259 from devmotion/dw/cudaext
Create CUDA extension
jeremiedb authored Oct 10, 2023
2 parents cf6e3c0 + 6d33c71 commit 16db639
Showing 15 changed files with 170 additions and 180 deletions.
9 changes: 8 additions & 1 deletion Project.toml
@@ -16,6 +16,12 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"

[extensions]
EvoTreesCUDAExt = "CUDA"

[compat]
BSON = "0.3"
CUDA = "3.0, 4.0, 5.0"
@@ -29,6 +35,7 @@ Tables = "1.9"
julia = "1.6"

[extras]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
@@ -37,4 +44,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
docs = ["Documenter"]
test = ["DataFrames", "Test", "MLJBase", "MLJTestInterface"]
test = ["CUDA", "DataFrames", "Test", "MLJBase", "MLJTestInterface"]
6 changes: 6 additions & 0 deletions docs/src/index.md
@@ -68,6 +68,12 @@ m = fit_evotree(config, dtrain; target_name="y", fnames=["x1", "x3"]);

### GPU Acceleration

EvoTrees supports training and inference on Nvidia GPUs with [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl).
Note that on Julia ≥ 1.9, CUDA support is only enabled when CUDA.jl is installed and loaded, either by another package or explicitly with, e.g.,
```julia
using CUDA
```

If running on a CUDA-enabled machine, training and inference on the GPU can be triggered through the `device` kwarg:

```julia
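# the diff collapses the rest of this example; the lines below are a
# plausible completion assumed from the surrounding docs, not the verbatim commit
m = fit_evotree(config, dtrain; target_name="y", device="gpu");
p = m(dtrain; device="gpu");
```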
4 changes: 0 additions & 4 deletions docs/src/internals.md
@@ -19,7 +19,6 @@ EvoTrees.update_gains!
EvoTrees.predict!
EvoTrees.subsample
EvoTrees.split_set_chunk!
EvoTrees.split_chunk_kernel!
```

## Histogram
@@ -28,7 +27,4 @@ EvoTrees.split_chunk_kernel!
EvoTrees.get_edges
EvoTrees.binarize
EvoTrees.update_hist!
EvoTrees.hist_kernel!
EvoTrees.hist_kernel_vec!
EvoTrees.predict_kernel!
```
22 changes: 22 additions & 0 deletions ext/EvoTreesCUDAExt/EvoTreesCUDAExt.jl
@@ -0,0 +1,22 @@
module EvoTreesCUDAExt

using EvoTrees
using CUDA

# This should be different on CPUs and GPUs
EvoTrees.device_ones(::Type{<:EvoTrees.GPU}, ::Type{T}, n::Int) where {T} = CUDA.ones(T, n)
EvoTrees.device_array_type(::Type{<:EvoTrees.GPU}) = CuArray
function EvoTrees.post_fit_gc(::Type{<:EvoTrees.GPU})
GC.gc(true)
CUDA.reclaim()
end

include("loss.jl")
include("eval.jl")
include("predict.jl")
include("init.jl")
include("subsample.jl")
include("fit-utils.jl")
include("fit.jl")

end # module
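
These three overloads presuppose device-generic stubs in EvoTrees itself; their CPU-side counterparts would look roughly like this (an illustrative sketch, not the actual EvoTrees source):

```julia
# hypothetical CPU-side fallbacks inside EvoTrees:
abstract type Device end
abstract type CPU <: Device end
abstract type GPU <: Device end

device_ones(::Type{<:CPU}, ::Type{T}, n::Int) where {T} = ones(T, n)
device_array_type(::Type{<:CPU}) = Array
post_fit_gc(::Type{<:CPU}) = nothing  # nothing to reclaim on CPU
```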
22 changes: 11 additions & 11 deletions src/gpu/eval.jl → ext/EvoTreesCUDAExt/eval.jl
@@ -8,7 +8,7 @@ function eval_mse_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y::CuDe
end
return nothing
end
function mse(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
function EvoTrees.mse(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
threads = min(MAX_THREADS, length(y))
blocks = cld(length(y), threads)
@cuda blocks = blocks threads = threads eval_mse_kernel!(eval, p, y, w)
@@ -19,8 +19,8 @@ end
########################
# RMSE
########################
rmse(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat} =
sqrt(rmse(p, y, w; MAX_THREADS, kwargs...))
EvoTrees.rmse(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat} =
sqrt(EvoTrees.rmse(p, y, w; MAX_THREADS, kwargs...))

########################
# MAE
@@ -32,7 +32,7 @@ function eval_mae_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y::CuDe
end
return nothing
end
function mae(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
function EvoTrees.mae(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
threads = min(MAX_THREADS, length(y))
blocks = cld(length(y), threads)
@cuda blocks = blocks threads = threads eval_mae_kernel!(eval, p, y, w)
@@ -51,7 +51,7 @@ function eval_logloss_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y::
end
return nothing
end
function logloss(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
function EvoTrees.logloss(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
threads = min(MAX_THREADS, length(y))
blocks = cld(length(y), threads)
@cuda blocks = blocks threads = threads eval_logloss_kernel!(eval, p, y, w)
@@ -70,7 +70,7 @@ function eval_gaussian_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y:
end
return nothing
end
function gaussian_mle(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
function EvoTrees.gaussian_mle(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
threads = min(MAX_THREADS, length(y))
blocks = cld(length(y), threads)
@cuda blocks = blocks threads = threads eval_gaussian_kernel!(eval, p, y, w)
@@ -91,7 +91,7 @@ function eval_poisson_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y::
return nothing
end

function poisson(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
function EvoTrees.poisson(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
threads = min(MAX_THREADS, length(y))
blocks = cld(length(y), threads)
@cuda blocks = blocks threads = threads eval_poisson_kernel!(eval, p, y, w)
@@ -111,7 +111,7 @@ function eval_gamma_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y::Cu
return nothing
end

function gamma(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
function EvoTrees.gamma(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
threads = min(MAX_THREADS, length(y))
blocks = cld(length(y), threads)
@cuda blocks = blocks threads = threads eval_gamma_kernel!(eval, p, y, w)
@@ -133,7 +133,7 @@ function eval_tweedie_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y::
return nothing
end

function tweedie(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
function EvoTrees.tweedie(p::CuMatrix{T}, y::CuVector{T}, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
threads = min(MAX_THREADS, length(y))
blocks = cld(length(y), threads)
@cuda blocks = blocks threads = threads eval_tweedie_kernel!(eval, p, y, w)
@@ -158,10 +158,10 @@ function eval_mlogloss_kernel!(eval::CuDeviceVector{T}, p::CuDeviceMatrix{T}, y:
return nothing
end

function mlogloss(p::CuMatrix{T}, y::CuVector, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
function EvoTrees.mlogloss(p::CuMatrix{T}, y::CuVector, w::CuVector{T}, eval::CuVector{T}; MAX_THREADS=1024, kwargs...) where {T<:AbstractFloat}
threads = min(MAX_THREADS, length(y))
blocks = cld(length(y), threads)
@cuda blocks = blocks threads = threads eval_mlogloss_kernel!(eval, p, y, w)
CUDA.synchronize()
return sum(eval) / sum(w)
end
end
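
Every wrapper in this file repeats the same launch arithmetic: cap the threads per block at `MAX_THREADS`, then ceil-divide `length(y)` to get the block count. Factored out, the shared pattern looks roughly like this (an illustrative sketch; `launch_eval!` is not a function in this codebase):

```julia
using CUDA

function launch_eval!(kernel!, eval, p, y, w; MAX_THREADS=1024)
    threads = min(MAX_THREADS, length(y))  # one thread per observation, capped per block
    blocks = cld(length(y), threads)       # ceil-divide so every observation is covered
    @cuda blocks = blocks threads = threads kernel!(eval, p, y, w)
    CUDA.synchronize()                     # wait for the kernel before reducing
    return sum(eval) / sum(w)              # weighted mean of per-observation losses
end
```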
16 changes: 4 additions & 12 deletions src/gpu/fit-utils.jl → ext/EvoTreesCUDAExt/fit-utils.jl
@@ -1,6 +1,3 @@
"""
hist_kernel!
"""
function hist_kernel!(h∇::CuDeviceArray{T,3}, ∇::CuDeviceMatrix{S}, x_bin, is, js) where {T,S}
tix, tiy, k = threadIdx().z, threadIdx().y, threadIdx().x
bdx, bdy = blockDim().z, blockDim().y
@@ -48,9 +45,6 @@ function update_hist_gpu!(h, h∇, ∇, x_bin, is, js, jsc)
return nothing
end

"""
hist_kernel_vec!
"""
function hist_kernel_vec!(h∇, ∇, x_bin, is)
tix, k = threadIdx().x, threadIdx().y
bdx = blockDim().x
@@ -103,10 +97,8 @@ function update_hist_gpu_vec!(h, h∇, ∇, x_bin, is, js::Vector)
return nothing
end

"""
Multi-threads split_set!
Take a view into left and right placeholders. Right ids are assigned at the end of the length of the current node set.
"""
# Multi-threads split_set!
# Take a view into left and right placeholders. Right ids are assigned at the end of the length of the current node set.
function split_chunk_kernel!(
left::CuDeviceVector{S},
right::CuDeviceVector{S},
@@ -149,7 +141,7 @@ function split_chunk_kernel!
return nothing
end

function split_views_kernel!(
function EvoTrees.split_views_kernel!(
out::CuDeviceVector{S},
left::CuDeviceVector{S},
right::CuDeviceVector{S},
@@ -208,7 +200,7 @@ function split_set_threads_gpu!(out, left, right, is, x_bin, feat, cond_bin, fea
sum_lefts = sum(lefts)
cumsum_lefts = cumsum(lefts)
cumsum_rights = cumsum(rights)
@cuda blocks = nblocks threads = 1 split_views_kernel!(
@cuda blocks = nblocks threads = 1 EvoTrees.split_views_kernel!(
out,
left,
right,
50 changes: 25 additions & 25 deletions src/gpu/fit.jl → ext/EvoTreesCUDAExt/fit.jl
@@ -1,15 +1,15 @@
function grow_evotree!(evotree::EvoTree{L,K}, cache, params::EvoTypes{L}, ::Type{GPU}) where {L,K}
function EvoTrees.grow_evotree!(evotree::EvoTree{L,K}, cache, params::EvoTrees.EvoTypes{L}, ::Type{<:EvoTrees.GPU}) where {L,K}

# compute gradients
update_grads!(cache.∇, cache.pred, cache.y, params)
EvoTrees.update_grads!(cache.∇, cache.pred, cache.y, params)
# subsample rows
cache.nodes[1].is =
subsample(cache.is_in, cache.is_out, cache.mask, params.rowsample, params.rng)
EvoTrees.subsample(cache.is_in, cache.is_out, cache.mask, params.rowsample, params.rng)
# subsample cols
sample!(params.rng, cache.js_, cache.js, replace=false, ordered=true)
EvoTrees.sample!(params.rng, cache.js_, cache.js, replace=false, ordered=true)

# assign a root and grow tree
tree = Tree{L,K}(params.max_depth)
tree = EvoTrees.Tree{L,K}(params.max_depth)
grow! = params.tree_type == "oblivious" ? grow_otree! : grow_tree!
grow!(
tree,
@@ -27,16 +27,16 @@ function grow_evotree!(evotree::EvoTree{L,K}, cache, params::EvoTypes{L}, ::Type
cache.monotone_constraints,
)
push!(evotree.trees, tree)
predict!(cache.pred, tree, cache.x_bin, cache.feattypes_gpu)
EvoTrees.predict!(cache.pred, tree, cache.x_bin, cache.feattypes_gpu)
cache[:info][:nrounds] += 1
return nothing
end

# grow a single binary tree - grow through all depth
function grow_tree!(
tree::Tree{L,K},
tree::EvoTrees.Tree{L,K},
nodes::Vector{N},
params::EvoTypes{L},
params::EvoTrees.EvoTypes{L},
::CuMatrix,
edges,
js,
@@ -66,7 +66,7 @@ function grow_tree!(

# initialize summary stats
nodes[1].∑ .= Vector(vec(sum(∇[:, nodes[1].is], dims=2)))
nodes[1].gain = get_gain(params, nodes[1].∑) # should use a GPU version?
nodes[1].gain = EvoTrees.get_gain(params, nodes[1].∑) # should use a GPU version?

# grow while there are remaining active nodes
while length(n_current) > 0 && depth <= params.max_depth
@@ -90,14 +90,14 @@
update_hist_gpu!(nodes[n].h, h∇, ∇, x_bin, nodes[n].is, jsg, js)
end
end
@threads for n sort(n_current)
update_gains!(nodes[n], js, params, feattypes, monotone_constraints)
Threads.@threads for n sort(n_current)
EvoTrees.update_gains!(nodes[n], js, params, feattypes, monotone_constraints)
end
end

for n sort(n_current)
if depth == params.max_depth || nodes[n].∑[end] <= params.min_weight
pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is)
EvoTrees.pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is)
else
best = findmax(findmax.(nodes[n].gains))
best_gain = best[1][1]
@@ -126,8 +126,8 @@
nodes[n<<1].is, nodes[n<<1+1].is = _left, _right
nodes[n<<1].∑ .= nodes[n].hL[best_feat][:, best_bin]
nodes[n<<1+1].∑ .= nodes[n].hR[best_feat][:, best_bin]
nodes[n<<1].gain = get_gain(params, nodes[n<<1].∑)
nodes[n<<1+1].gain = get_gain(params, nodes[n<<1+1].∑)
nodes[n<<1].gain = EvoTrees.get_gain(params, nodes[n<<1].∑)
nodes[n<<1+1].gain = EvoTrees.get_gain(params, nodes[n<<1+1].∑)

if length(_right) >= length(_left)
push!(n_next, n << 1)
Expand All @@ -137,7 +137,7 @@ function grow_tree!(
push!(n_next, n << 1)
end
else
pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is)
EvoTrees.pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is)
end
end
end
@@ -151,9 +151,9 @@ end

# grow a single oblivious tree - grow through all depth
function grow_otree!(
tree::Tree{L,K},
tree::EvoTrees.Tree{L,K},
nodes::Vector{N},
params::EvoTypes{L},
params::EvoTrees.EvoTypes{L},
::CuMatrix,
edges,
js,
@@ -183,7 +183,7 @@ function grow_otree!(

# initialize summary stats
nodes[1].∑ .= Vector(vec(sum(∇[:, nodes[1].is], dims=2)))
nodes[1].gain = get_gain(params, nodes[1].∑) # should use a GPU version?
nodes[1].gain = EvoTrees.get_gain(params, nodes[1].∑) # should use a GPU version?

# grow while there are remaining active nodes
while length(n_current) > 0 && depth <= params.max_depth
@@ -197,7 +197,7 @@
if depth == params.max_depth || min_weight_flag
for n in n_current
# @info "length(nodes[n].is)" length(nodes[n].is) depth n
pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is)
EvoTrees.pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is)
end
else
# update histograms
@@ -217,8 +217,8 @@
update_hist_gpu!(nodes[n].h, h∇, ∇, x_bin, nodes[n].is, jsg, js)
end
end
@threads for n n_current
update_gains!(nodes[n], js, params, feattypes, monotone_constraints)
Threads.@threads for n n_current
EvoTrees.update_gains!(nodes[n], js, params, feattypes, monotone_constraints)
end

# initialize gains for node 1 in which all gains of a given depth will be accumulated
@@ -273,8 +273,8 @@
nodes[n<<1].is, nodes[n<<1+1].is = _left, _right
nodes[n<<1].∑ .= nodes[n].hL[best_feat][:, best_bin]
nodes[n<<1+1].∑ .= nodes[n].hR[best_feat][:, best_bin]
nodes[n<<1].gain = get_gain(params, nodes[n<<1].∑)
nodes[n<<1+1].gain = get_gain(params, nodes[n<<1+1].∑)
nodes[n<<1].gain = EvoTrees.get_gain(params, nodes[n<<1].∑)
nodes[n<<1+1].gain = EvoTrees.get_gain(params, nodes[n<<1+1].∑)

if length(_right) >= length(_left)
push!(n_next, n << 1)
@@ -286,7 +286,7 @@
end
else
for n in n_current
pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is)
EvoTrees.pred_leaf_cpu!(tree.pred, n, nodes[n].∑, params, ∇, nodes[n].is)
end
end
end
@@ -295,4 +295,4 @@
end # end of loop over current nodes for a given depth

return nothing
end
end
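
A recurring detail in this diff is the `EvoTrees.` qualification on function definitions: inside an extension module, an unqualified `function mse(...)` would create a new local function rather than add a method to the parent package's. A toy illustration with hypothetical module names:

```julia
module Parent
f(x::Int) = "cpu path"
end

module ParentExt
using ..Parent
# `f(x::Float64) = ...` here would define a *new* ParentExt.f;
# qualifying extends Parent.f, which is what the CUDA extension needs:
Parent.f(x::Float64) = "gpu path"
end

Parent.f(1)    # "cpu path"
Parent.f(1.0)  # "gpu path"
```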