KernelAbstractions support #147

Draft · wants to merge 9 commits into base: master
1 change: 1 addition & 0 deletions .github/workflows/CI.yml
@@ -28,6 +28,7 @@ jobs:
- NotZygote
- Zygote
steps:
- run: echo "UCX_ERROR_SIGNALS=SIGILL,SIGBUS,SIGFPE" >> "$GITHUB_ENV"
- uses: actions/checkout@v4
- uses: julia-actions/setup-julia@v2
with:
2 changes: 1 addition & 1 deletion .gitignore
@@ -2,7 +2,7 @@
*.jl.*.cov
*.jl.mem
docs/build
/Manifest.toml
*Manifest.toml
benchmark/tune.json
benchmark/results
.vscode/settings.json
7 changes: 6 additions & 1 deletion Project.toml
@@ -8,7 +8,6 @@ Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
AtomsBase = "a963bdd2-2df7-4f54-a1ee-49d51e6be12a"
AtomsCalculators = "a3e0e189-c65a-42c1-833c-339540406eb1"
BioStructures = "de9282ab-8554-53be-b2d6-f6c222edabfc"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
CellListMap = "69e1c6dd-3888-40e6-b3c8-31ac5f578864"
ChainRules = "082447d4-558c-5d27-93f4-14fc19e9eca2"
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
@@ -21,7 +20,9 @@ Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615"
FLoops = "cc61a311-1640-44b5-9fba-1b764f453329"
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
KernelDensity = "5ab0869b-81aa-558d-bb23-cbf5423bbe9b"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
@@ -39,11 +40,13 @@ UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d"

[extensions]
MollyCUDAExt = "CUDA"
MollyGLMakieExt = ["GLMakie", "Colors"]
MollyPythonCallExt = "PythonCall"

@@ -67,7 +70,9 @@ EzXML = "1"
FLoops = "0.2"
ForwardDiff = "0.10.35"
GLMakie = "0.8, 0.9, 0.10"
GPUArrays = "10"
Graphs = "1.8"
KernelAbstractions = "0.9"
KernelDensity = "0.5, 0.6"
LinearAlgebra = "1.9"
NearestNeighbors = "0.4"
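With CUDA moved from `[deps]` to `[weakdeps]` and `MollyCUDAExt = "CUDA"` declared under `[extensions]`, the CUDA-specific code becomes a package extension that is only compiled when a user loads CUDA alongside Molly. A minimal sketch of the expected behaviour under Julia 1.9+ (illustrative, not taken from this PR):

```julia
using Molly   # no CUDA code is loaded at this point
using CUDA    # triggers compilation and loading of the MollyCUDAExt extension

# The extension module can be retrieved to check that it is active
ext = Base.get_extension(Molly, :MollyCUDAExt)
ext === nothing || println("MollyCUDAExt is active")
```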
52 changes: 23 additions & 29 deletions benchmark/benchmarks.jl
@@ -62,7 +62,8 @@ const starting_velocities = [random_velocity(atom_mass, 1.0u"K") for i in 1:n_at
const starting_coords_f32 = [Float32.(c) for c in starting_coords]
const starting_velocities_f32 = [Float32.(c) for c in starting_velocities]

function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool)
function test_sim(nl::Bool, parallel::Bool, f32::Bool,
ArrayType::Type{AT}) where AT <: AbstractArray
n_atoms = 400
n_steps = 200
atom_mass = f32 ? 10.0f0u"g/mol" : 10.0u"g/mol"
@@ -72,34 +73,27 @@ function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool)
r0 = f32 ? 0.2f0u"nm" : 0.2u"nm"
bonds = [HarmonicBond(k=k, r0=r0) for i in 1:(n_atoms ÷ 2)]
specific_inter_lists = (InteractionList2Atoms(
gpu ? CuArray(Int32.(collect(1:2:n_atoms))) : Int32.(collect(1:2:n_atoms)),
gpu ? CuArray(Int32.(collect(2:2:n_atoms))) : Int32.(collect(2:2:n_atoms)),
gpu ? CuArray(bonds) : bonds,
ArrayType(Int32.(collect(1:2:n_atoms))),
ArrayType(Int32.(collect(2:2:n_atoms))),
ArrayType(bonds),
),)

neighbor_finder = NoNeighborFinder()
cutoff = DistanceCutoff(f32 ? 1.0f0u"nm" : 1.0u"nm")
pairwise_inters = (LennardJones(use_neighbors=false, cutoff=cutoff),)
if nl
neighbor_finder = DistanceNeighborFinder(
eligible=gpu ? CuArray(trues(n_atoms, n_atoms)) : trues(n_atoms, n_atoms),
eligible=ArrayType(trues(n_atoms, n_atoms)),
n_steps=10,
dist_cutoff=f32 ? 1.5f0u"nm" : 1.5u"nm",
)
pairwise_inters = (LennardJones(use_neighbors=true, cutoff=cutoff),)
end

if gpu
coords = CuArray(deepcopy(f32 ? starting_coords_f32 : starting_coords))
velocities = CuArray(deepcopy(f32 ? starting_velocities_f32 : starting_velocities))
atoms = CuArray([Atom(charge=f32 ? 0.0f0 : 0.0, mass=atom_mass, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms])
else
coords = deepcopy(f32 ? starting_coords_f32 : starting_coords)
velocities = deepcopy(f32 ? starting_velocities_f32 : starting_velocities)
atoms = [Atom(charge=f32 ? 0.0f0 : 0.0, mass=atom_mass, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms]
end
coords = ArrayType(deepcopy(f32 ? starting_coords_f32 : starting_coords))
velocities = ArrayType(deepcopy(f32 ? starting_velocities_f32 : starting_velocities))
atoms = ArrayType([Atom(charge=f32 ? 0.0f0 : 0.0, mass=atom_mass, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms])

sys = System(
atoms=atoms,
@@ -117,22 +111,22 @@ function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool)
end

runs = [
("CPU" , [false, false, false, false]),
("CPU f32" , [false, false, true , false]),
("CPU NL" , [true , false, false, false]),
("CPU f32 NL", [true , false, true , false]),
("CPU" , [false, false, false, Array]),
("CPU f32" , [false, false, true , Array]),
("CPU NL" , [true , false, false, Array]),
("CPU f32 NL", [true , false, true , Array]),
]
if run_parallel_tests
push!(runs, ("CPU parallel" , [false, true , false, false]))
push!(runs, ("CPU parallel f32" , [false, true , true , false]))
push!(runs, ("CPU parallel NL" , [true , true , false, false]))
push!(runs, ("CPU parallel f32 NL", [true , true , true , false]))
push!(runs, ("CPU parallel" , [false, true , false, Array]))
push!(runs, ("CPU parallel f32" , [false, true , true , Array]))
push!(runs, ("CPU parallel NL" , [true , true , false, Array]))
push!(runs, ("CPU parallel f32 NL", [true , true , true , Array]))
end
if run_gpu_tests
push!(runs, ("GPU" , [false, false, false, true]))
push!(runs, ("GPU f32" , [false, false, true , true]))
push!(runs, ("GPU NL" , [true , false, false, true]))
push!(runs, ("GPU f32 NL", [true , false, true , true]))
if run_cuda_tests
push!(runs, ("GPU" , [false, false, false, CuArray]))
push!(runs, ("GPU f32" , [false, false, true , CuArray]))
push!(runs, ("GPU NL" , [true , false, false, CuArray]))
push!(runs, ("GPU f32 NL", [true , false, true , CuArray]))
end

for (name, args) in runs
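With `ArrayType` replacing the previous `gpu::Bool` flag, a benchmark configuration is selected by passing the array type itself, as in the `runs` table above. A hedged sketch of the equivalent direct calls (the `CUDA.functional()` guard is added here for illustration and is not part of the benchmark script):

```julia
using CUDA

# CPU, Float32, with a neighbour list
test_sim(true, false, true, Array)

# The same system on the GPU, selected by array type rather than a Bool flag
if CUDA.functional()
    test_sim(true, false, true, CuArray)
end
```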
22 changes: 11 additions & 11 deletions benchmark/protein.jl
@@ -11,7 +11,7 @@ const data_dir = normpath(dirname(pathof(Molly)), "..", "data")
const ff_dir = joinpath(data_dir, "force_fields")
const openmm_dir = joinpath(data_dir, "openmm_6mrr")

function setup_system(gpu::Bool, f32::Bool, units::Bool)
function setup_system(ArrayType::Type{<:AbstractArray}, f32::Bool, units::Bool)
T = f32 ? Float32 : Float64
ff = MolecularForceField(
T,
@@ -27,7 +27,7 @@ function setup_system(gpu::Bool, f32::Bool, units::Bool)
sys = System(
joinpath(data_dir, "6mrr_equil.pdb"),
ff;
velocities=gpu ? CuArray(velocities) : velocities,
velocities=ArrayType(velocities),
units=units,
gpu=gpu,
dist_cutoff=(units ? dist_cutoff * u"nm" : dist_cutoff),
@@ -42,15 +42,15 @@ end

runs = [
# run_name gpu parr f32 units
("CPU 1 thread" , false, false, false, true ),
("CPU 1 thread f32" , false, false, true , true ),
("CPU 1 thread f32 nounits" , false, false, true , false),
("CPU $n_threads threads" , false, true , false, true ),
("CPU $n_threads threads f32" , false, true , true , true ),
("CPU $n_threads threads f32 nounits", false, true , true , false),
("GPU" , true , false, false, true ),
("GPU f32" , true , false, true , true ),
("GPU f32 nounits" , true , false, true , false),
("CPU 1 thread" , Array, false, false, true ),
("CPU 1 thread f32" , Array, false, true , true ),
("CPU 1 thread f32 nounits" , Array, false, true , false),
("CPU $n_threads threads" , Array, true , false, true ),
("CPU $n_threads threads f32" , Array, true , true , true ),
("CPU $n_threads threads f32 nounits", Array, true , true , false),
("GPU" , CuArray, false, false, true ),
("GPU f32" , CuArray, false, true , true ),
("GPU f32 nounits" , CuArray, false, true , false),
]

for (run_name, gpu, parallel, f32, units) in runs
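Since the `runs` table passes array *types* (`Array`, `CuArray`) rather than array instances, the corresponding argument needs a `Type{...}` annotation to dispatch correctly, which is why the signature above is written with `Type{<:AbstractArray}`. A small standalone sketch of the distinction (illustrative only):

```julia
takes_instance(A::AbstractArray) = "got an array instance of $(typeof(A))"
takes_type(::Type{AT}) where {AT <: AbstractArray} = "got an array type: $AT"

takes_type(Array)        # works: Array is passed as a type
takes_instance([1, 2])   # works: a Vector is an AbstractArray instance
# takes_instance(Array)  # MethodError: Array is a type, not an array instance
```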
177 changes: 177 additions & 0 deletions ext/MollyCUDAExt.jl
@@ -0,0 +1,177 @@
module MollyCUDAExt

using Molly
using CUDA
using ChainRulesCore
using Atomix
using KernelAbstractions

CUDA.Const(nl::Molly.NoNeighborList) = nl

# CUDA.jl kernels
const WARPSIZE = UInt32(32)

macro shfl_multiple_sync(mask, target, width, vars...)
all_lines = map(vars) do v
Expr(:(=), v,
Expr(:call, :shfl_sync,
mask, v, target, width
)
)
end
return esc(Expr(:block, all_lines...))
end

CUDA.shfl_recurse(op, x::Quantity) = op(x.val) * unit(x)
CUDA.shfl_recurse(op, x::SVector{1, C}) where C = SVector{1, C}(op(x[1]))
CUDA.shfl_recurse(op, x::SVector{2, C}) where C = SVector{2, C}(op(x[1]), op(x[2]))
CUDA.shfl_recurse(op, x::SVector{3, C}) where C = SVector{3, C}(op(x[1]), op(x[2]), op(x[3]))

function cuda_threads_blocks_pairwise(n_neighbors)
n_threads_gpu = min(n_neighbors, parse(Int, get(ENV, "MOLLY_GPUNTHREADS_PAIRWISE", "512")))
n_blocks = cld(n_neighbors, n_threads_gpu)
return n_threads_gpu, n_blocks
end

function cuda_threads_blocks_specific(n_inters)
n_threads_gpu = parse(Int, get(ENV, "MOLLY_GPUNTHREADS_SPECIFIC", "128"))
n_blocks = cld(n_inters, n_threads_gpu)
return n_threads_gpu, n_blocks
end

function Molly.pairwise_force_gpu(coords::CuArray{SVector{D, C}}, atoms, boundary,
pairwise_inters, nbs, force_units, ::Val{T}) where {D, C, T}
fs_mat = CUDA.zeros(T, D, length(atoms))

if typeof(nbs) == Molly.NoNeighborList
kernel = @cuda launch=false cuda_pairwise_force_kernel_nonl!(
fs_mat, coords, atoms, boundary, pairwise_inters, Val(D), Val(force_units))
conf = launch_configuration(kernel.fun)
threads_basic = parse(Int, get(ENV, "MOLLY_GPUNTHREADS_PAIRWISE", "512"))
nthreads = min(length(atoms), threads_basic, conf.threads)
nthreads = cld(nthreads, WARPSIZE) * WARPSIZE
n_blocks_i = cld(length(atoms), WARPSIZE)
n_blocks_j = cld(length(atoms), nthreads)
kernel(fs_mat, coords, atoms, boundary, pairwise_inters, Val(D), Val(force_units);
threads=nthreads, blocks=(n_blocks_i, n_blocks_j))
else
backend = get_backend(coords)
n_threads_gpu = Molly.gpu_threads_blocks_pairwise(length(nbs))
kernel! = Molly.pairwise_force_kernel!(backend, n_threads_gpu)
kernel!(fs_mat, coords, atoms, boundary, pairwise_inters, nbs,
Val(D), Val(force_units), ndrange = length(nbs))
end
return fs_mat
end

#=
**The no-neighbor-list pairwise force summation kernel**: This kernel calculates all the pairwise forces in a system of
`n_atoms` atoms by dividing the complete `n_atoms`×`n_atoms` matrix of interactions into small tiles. Most of the tiles
are of size `WARPSIZE`×`WARPSIZE`; when `n_atoms` is not divisible by `WARPSIZE`, the differently-sized tiles on the
edges are dealt with as a separate case. The force summation for the tiles is done in the following way:
1. `WARPSIZE`×`WARPSIZE` tiles: Each row of such a tile is assigned to a different thread in a warp, which calculates
the forces for the entire row in `WARPSIZE` steps (or `WARPSIZE - 1` steps for tiles on the diagonal of the
`n_atoms`×`n_atoms` matrix of interactions). This ordering lets data be shuffled from the `(i+1)`'th thread to the
`i`'th thread in each subsequent iteration of the force calculation in a row. If `a, b, ...` are different atoms and
`1, 2, ...` is the order in which each thread calculates the interatomic forces, then this scenario can be represented
as (taking `WARPSIZE=8`):
```
× | i j k l m n o p
--------------------
a | 1 2 3 4 5 6 7 8
b | 8 1 2 3 4 5 6 7
c | 7 8 1 2 3 4 5 6
d | 6 7 8 1 2 3 4 5
e | 5 6 7 8 1 2 3 4
f | 4 5 6 7 8 1 2 3
g | 3 4 5 6 7 8 1 2
h | 2 3 4 5 6 7 8 1
```

2. Edge tiles when `n_atoms` is not divisible by the warp size: In such cases data cannot generally be shuffled between
threads, so there is no benefit to the diagonal ordering (and it would be more complicated to implement). The
calculations are therefore done in the following order:
```
× | i j k l m n
----------------
a | 1 2 3 4 5 6
b | 1 2 3 4 5 6
c | 1 2 3 4 5 6
d | 1 2 3 4 5 6
e | 1 2 3 4 5 6
f | 1 2 3 4 5 6
g | 1 2 3 4 5 6
h | 1 2 3 4 5 6
```
=#
function cuda_pairwise_force_kernel_nonl!(forces::AbstractArray{T}, coords_var, atoms_var, boundary, inters,
::Val{D}, ::Val{F}) where {T, D, F}
coords = CUDA.Const(coords_var)
atoms = CUDA.Const(atoms_var)
n_atoms = length(atoms)

tidx = threadIdx().x
i_0_tile = (blockIdx().x - 1) * warpsize()
j_0_block = (blockIdx().y - 1) * blockDim().x
warpidx = cld(tidx, warpsize())
j_0_tile = j_0_block + (warpidx - 1) * warpsize()
i = i_0_tile + laneid()

forces_shmem = CuStaticSharedArray(T, (3, 1024))
@inbounds for dim in 1:3
forces_shmem[dim, tidx] = zero(T)
end

if i_0_tile + warpsize() > n_atoms || j_0_tile + warpsize() > n_atoms
@inbounds if i <= n_atoms
njs = min(warpsize(), n_atoms - j_0_tile)
atom_i, coord_i = atoms[i], coords[i]
for del_j in 1:njs
j = j_0_tile + del_j
if i != j
atom_j, coord_j = atoms[j], coords[j]
f = Molly.sum_pairwise_forces(inters, coord_i, coord_j, atom_i, atom_j, boundary, false, Val(F))
for dim in 1:D
forces_shmem[dim, tidx] += -ustrip(f[dim])
end
end
end

for dim in 1:D
Atomix.@atomic :monotonic forces[dim, i] += forces_shmem[dim, tidx]
end
end
else
j = j_0_tile + laneid()
tilesteps = warpsize()
if i_0_tile == j_0_tile # To not compute i-i forces
j = j_0_tile + laneid() % warpsize() + 1
tilesteps -= 1
end

atom_i, coord_i = atoms[i], coords[i]
coord_j = coords[j]
@inbounds for _ in 1:tilesteps
sync_warp()
atom_j = atoms[j]
f = Molly.sum_pairwise_forces(inters, coord_i, coord_j, atom_i, atom_j, boundary, false, Val(F))
for dim in 1:D
forces_shmem[dim, tidx] += -Molly.ustrip(f[dim])
end
@shfl_multiple_sync(FULL_MASK, laneid() + 1, warpsize(), j, coord_j)
end

@inbounds for dim in 1:D
Atomix.@atomic :monotonic forces[dim, i] += forces_shmem[dim, tidx]
end
end

return nothing
end

# CUDA specific calls for Molly
@non_differentiable CUDA.zeros(args...)
@non_differentiable cuda_threads_blocks_pairwise(args...)
@non_differentiable cuda_threads_blocks_specific(args...)

end
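The tiling scheme described in the comment above depends on each lane handing its cached `j` index and `coord_j` to the neighbouring lane every iteration via `shfl_sync`, which is what `@shfl_multiple_sync` expands to. A minimal sketch of that rotation pattern in isolation (assumes a functional CUDA.jl setup; the kernel and array names are illustrative, not part of the extension):

```julia
using CUDA

# Each of the 32 lanes starts with its own value and, on every step, takes the
# value held by the next lane up, so data rotates through the warp in the same
# way that `j` and `coord_j` rotate in `cuda_pairwise_force_kernel_nonl!`.
function rotation_demo!(out)
    lane = laneid()                  # 1-based lane index within the warp
    val = Float32(lane)              # stands in for the cached atom data
    for step in 1:32
        out[lane, step] = val
        val = shfl_sync(0xffffffff, val, laneid() + 1, warpsize())
    end
    return nothing
end

out = CUDA.zeros(Float32, 32, 32)
@cuda threads=32 rotation_demo!(out)
Array(out)   # each column is the previous column rotated by one lane
```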
2 changes: 1 addition & 1 deletion ext/MollyGLMakieExt.jl
@@ -5,8 +5,8 @@ module MollyGLMakieExt

using Molly
using GLMakie
using Colors
using Unitful
using Colors

using LinearAlgebra
