KernelAbstractions support #147

Draft · wants to merge 9 commits into base: master
1 change: 1 addition & 0 deletions .github/workflows/CI.yml
@@ -28,6 +28,7 @@ jobs:
- NotZygote
- Zygote
steps:
- run: echo "UCX_ERROR_SIGNALS=SIGILL,SIGBUS,SIGFPE" >> "$GITHUB_ENV"
- uses: actions/checkout@v4
- uses: julia-actions/setup-julia@v2
with:
2 changes: 1 addition & 1 deletion .gitignore
@@ -2,7 +2,7 @@
*.jl.*.cov
*.jl.mem
docs/build
/Manifest.toml
*Manifest.toml
benchmark/tune.json
benchmark/results
.vscode/settings.json
7 changes: 6 additions & 1 deletion Project.toml
@@ -8,7 +8,6 @@ Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
AtomsBase = "a963bdd2-2df7-4f54-a1ee-49d51e6be12a"
AtomsCalculators = "a3e0e189-c65a-42c1-833c-339540406eb1"
BioStructures = "de9282ab-8554-53be-b2d6-f6c222edabfc"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
CellListMap = "69e1c6dd-3888-40e6-b3c8-31ac5f578864"
ChainRules = "082447d4-558c-5d27-93f4-14fc19e9eca2"
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
@@ -21,7 +20,9 @@ Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615"
FLoops = "cc61a311-1640-44b5-9fba-1b764f453329"
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
KernelDensity = "5ab0869b-81aa-558d-bb23-cbf5423bbe9b"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
@@ -39,11 +40,13 @@ UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d"

[extensions]
MollyCUDAExt = "CUDA"
MollyGLMakieExt = ["GLMakie", "Colors"]
MollyPythonCallExt = "PythonCall"

@@ -67,7 +70,9 @@ EzXML = "1"
FLoops = "0.2"
ForwardDiff = "0.10.35"
GLMakie = "0.8, 0.9, 0.10"
GPUArrays = "10"
Graphs = "1.8"
KernelAbstractions = "0.9"
KernelDensity = "0.5, 0.6"
LinearAlgebra = "1.9"
NearestNeighbors = "0.4"
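With CUDA moved from `[deps]` to `[weakdeps]` and `MollyCUDAExt = "CUDA"` declared under `[extensions]`, the CUDA-specific code becomes a package extension that is only compiled when a user loads CUDA alongside Molly. A minimal sketch of the expected behaviour under Julia 1.9+ (illustrative, not taken from this PR):

```julia
using Molly   # no CUDA code is loaded at this point
using CUDA    # triggers compilation and loading of the MollyCUDAExt extension

# The extension module can be retrieved to check that it is active
ext = Base.get_extension(Molly, :MollyCUDAExt)
ext === nothing || println("MollyCUDAExt is active")
```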
52 changes: 23 additions & 29 deletions benchmark/benchmarks.jl
@@ -62,7 +62,8 @@ const starting_velocities = [random_velocity(atom_mass, 1.0u"K") for i in 1:n_at
const starting_coords_f32 = [Float32.(c) for c in starting_coords]
const starting_velocities_f32 = [Float32.(c) for c in starting_velocities]

function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool)
function test_sim(nl::Bool, parallel::Bool, f32::Bool,
ArrayType::Type{AT}) where AT <: AbstractArray
n_atoms = 400
n_steps = 200
atom_mass = f32 ? 10.0f0u"g/mol" : 10.0u"g/mol"
@@ -72,34 +73,27 @@ function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool)
r0 = f32 ? 0.2f0u"nm" : 0.2u"nm"
bonds = [HarmonicBond(k=k, r0=r0) for i in 1:(n_atoms ÷ 2)]
specific_inter_lists = (InteractionList2Atoms(
gpu ? CuArray(Int32.(collect(1:2:n_atoms))) : Int32.(collect(1:2:n_atoms)),
gpu ? CuArray(Int32.(collect(2:2:n_atoms))) : Int32.(collect(2:2:n_atoms)),
gpu ? CuArray(bonds) : bonds,
ArrayType(Int32.(collect(1:2:n_atoms))),
ArrayType(Int32.(collect(2:2:n_atoms))),
ArrayType(bonds),
),)

neighbor_finder = NoNeighborFinder()
cutoff = DistanceCutoff(f32 ? 1.0f0u"nm" : 1.0u"nm")
pairwise_inters = (LennardJones(use_neighbors=false, cutoff=cutoff),)
if nl
neighbor_finder = DistanceNeighborFinder(
eligible=gpu ? CuArray(trues(n_atoms, n_atoms)) : trues(n_atoms, n_atoms),
eligible=ArrayType(trues(n_atoms, n_atoms)),
n_steps=10,
dist_cutoff=f32 ? 1.5f0u"nm" : 1.5u"nm",
)
pairwise_inters = (LennardJones(use_neighbors=true, cutoff=cutoff),)
end

if gpu
coords = CuArray(deepcopy(f32 ? starting_coords_f32 : starting_coords))
velocities = CuArray(deepcopy(f32 ? starting_velocities_f32 : starting_velocities))
atoms = CuArray([Atom(charge=f32 ? 0.0f0 : 0.0, mass=atom_mass, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms])
else
coords = deepcopy(f32 ? starting_coords_f32 : starting_coords)
velocities = deepcopy(f32 ? starting_velocities_f32 : starting_velocities)
atoms = [Atom(charge=f32 ? 0.0f0 : 0.0, mass=atom_mass, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms]
end
coords = ArrayType(deepcopy(f32 ? starting_coords_f32 : starting_coords))
velocities = ArrayType(deepcopy(f32 ? starting_velocities_f32 : starting_velocities))
atoms = ArrayType([Atom(charge=f32 ? 0.0f0 : 0.0, mass=atom_mass, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms])

sys = System(
atoms=atoms,
@@ -117,22 +111,22 @@ function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool)
end

runs = [
("CPU" , [false, false, false, false]),
("CPU f32" , [false, false, true , false]),
("CPU NL" , [true , false, false, false]),
("CPU f32 NL", [true , false, true , false]),
("CPU" , [false, false, false, Array]),
("CPU f32" , [false, false, true , Array]),
("CPU NL" , [true , false, false, Array]),
("CPU f32 NL", [true , false, true , Array]),
]
if run_parallel_tests
push!(runs, ("CPU parallel" , [false, true , false, false]))
push!(runs, ("CPU parallel f32" , [false, true , true , false]))
push!(runs, ("CPU parallel NL" , [true , true , false, false]))
push!(runs, ("CPU parallel f32 NL", [true , true , true , false]))
push!(runs, ("CPU parallel" , [false, true , false, Array]))
push!(runs, ("CPU parallel f32" , [false, true , true , Array]))
push!(runs, ("CPU parallel NL" , [true , true , false, Array]))
push!(runs, ("CPU parallel f32 NL", [true , true , true , Array]))
end
if run_gpu_tests
push!(runs, ("GPU" , [false, false, false, true]))
push!(runs, ("GPU f32" , [false, false, true , true]))
push!(runs, ("GPU NL" , [true , false, false, true]))
push!(runs, ("GPU f32 NL", [true , false, true , true]))
if run_cuda_tests
push!(runs, ("GPU" , [false, false, false, CuArray]))
push!(runs, ("GPU f32" , [false, false, true , CuArray]))
push!(runs, ("GPU NL" , [true , false, false, CuArray]))
push!(runs, ("GPU f32 NL", [true , false, true , CuArray]))
end

for (name, args) in runs
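With `ArrayType` replacing the previous `gpu::Bool` flag, a benchmark configuration is selected by passing the array type itself, as in the `runs` table above. A hedged sketch of the equivalent direct calls (the `CUDA.functional()` guard is added here for illustration and is not part of the benchmark script):

```julia
using CUDA

# CPU, Float32, with a neighbour list
test_sim(true, false, true, Array)

# The same system on the GPU, selected by array type rather than a Bool flag
if CUDA.functional()
    test_sim(true, false, true, CuArray)
end
```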
22 changes: 11 additions & 11 deletions benchmark/protein.jl
@@ -11,7 +11,7 @@ const data_dir = normpath(dirname(pathof(Molly)), "..", "data")
const ff_dir = joinpath(data_dir, "force_fields")
const openmm_dir = joinpath(data_dir, "openmm_6mrr")

function setup_system(gpu::Bool, f32::Bool, units::Bool)
function setup_system(ArrayType::Type{<:AbstractArray}, f32::Bool, units::Bool)
T = f32 ? Float32 : Float64
ff = MolecularForceField(
T,
@@ -27,7 +27,7 @@ function setup_system(gpu::Bool, f32::Bool, units::Bool)
sys = System(
joinpath(data_dir, "6mrr_equil.pdb"),
ff;
velocities=gpu ? CuArray(velocities) : velocities,
velocities=ArrayType(velocities),
units=units,
gpu=gpu,
dist_cutoff=(units ? dist_cutoff * u"nm" : dist_cutoff),
@@ -42,15 +42,15 @@ end

runs = [
# run_name gpu parr f32 units
("CPU 1 thread" , false, false, false, true ),
("CPU 1 thread f32" , false, false, true , true ),
("CPU 1 thread f32 nounits" , false, false, true , false),
("CPU $n_threads threads" , false, true , false, true ),
("CPU $n_threads threads f32" , false, true , true , true ),
("CPU $n_threads threads f32 nounits", false, true , true , false),
("GPU" , true , false, false, true ),
("GPU f32" , true , false, true , true ),
("GPU f32 nounits" , true , false, true , false),
("CPU 1 thread" , Array, false, false, true ),
("CPU 1 thread f32" , Array, false, true , true ),
("CPU 1 thread f32 nounits" , Array, false, true , false),
("CPU $n_threads threads" , Array, true , false, true ),
("CPU $n_threads threads f32" , Array, true , true , true ),
("CPU $n_threads threads f32 nounits", Array, true , true , false),
("GPU" , CuArray, false, false, true ),
("GPU f32" , CuArray, false, true , true ),
("GPU f32 nounits" , CuArray, false, true , false),
]

for (run_name, gpu, parallel, f32, units) in runs
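Since the `runs` table passes array *types* (`Array`, `CuArray`) rather than array instances, the corresponding argument needs a `Type{...}` annotation to dispatch correctly, which is why the signature above is written with `Type{<:AbstractArray}`. A small standalone sketch of the distinction (illustrative only):

```julia
takes_instance(A::AbstractArray) = "got an array instance of $(typeof(A))"
takes_type(::Type{AT}) where {AT <: AbstractArray} = "got an array type: $AT"

takes_type(Array)        # works: Array is passed as a type
takes_instance([1, 2])   # works: a Vector is an AbstractArray instance
# takes_instance(Array)  # MethodError: Array is a type, not an array instance
```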
177 changes: 177 additions & 0 deletions ext/MollyCUDAExt.jl
@@ -0,0 +1,177 @@
module MollyCUDAExt

using Molly
using CUDA
using ChainRulesCore
using Atomix
using KernelAbstractions

CUDA.Const(nl::Molly.NoNeighborList) = nl

# CUDA.jl kernels
const WARPSIZE = UInt32(32)

macro shfl_multiple_sync(mask, target, width, vars...)
all_lines = map(vars) do v
Expr(:(=), v,
Expr(:call, :shfl_sync,
mask, v, target, width
)
)
end
return esc(Expr(:block, all_lines...))
end

CUDA.shfl_recurse(op, x::Quantity) = op(x.val) * unit(x)
CUDA.shfl_recurse(op, x::SVector{1, C}) where C = SVector{1, C}(op(x[1]))
CUDA.shfl_recurse(op, x::SVector{2, C}) where C = SVector{2, C}(op(x[1]), op(x[2]))
CUDA.shfl_recurse(op, x::SVector{3, C}) where C = SVector{3, C}(op(x[1]), op(x[2]), op(x[3]))

function cuda_threads_blocks_pairwise(n_neighbors)
n_threads_gpu = min(n_neighbors, parse(Int, get(ENV, "MOLLY_GPUNTHREADS_PAIRWISE", "512")))
n_blocks = cld(n_neighbors, n_threads_gpu)
return n_threads_gpu, n_blocks
end

function cuda_threads_blocks_specific(n_inters)
n_threads_gpu = parse(Int, get(ENV, "MOLLY_GPUNTHREADS_SPECIFIC", "128"))
n_blocks = cld(n_inters, n_threads_gpu)
return n_threads_gpu, n_blocks
end

function Molly.pairwise_force_gpu(coords::CuArray{SVector{D, C}}, atoms, boundary,
pairwise_inters, nbs, force_units, ::Val{T}) where {D, C, T}
fs_mat = CUDA.zeros(T, D, length(atoms))

if typeof(nbs) == Molly.NoNeighborList
kernel = @cuda launch=false cuda_pairwise_force_kernel_nonl!(
fs_mat, coords, atoms, boundary, pairwise_inters, Val(D), Val(force_units))
conf = launch_configuration(kernel.fun)
threads_basic = parse(Int, get(ENV, "MOLLY_GPUNTHREADS_PAIRWISE", "512"))
nthreads = min(length(atoms), threads_basic, conf.threads)
nthreads = cld(nthreads, WARPSIZE) * WARPSIZE
n_blocks_i = cld(length(atoms), WARPSIZE)
n_blocks_j = cld(length(atoms), nthreads)
kernel(fs_mat, coords, atoms, boundary, pairwise_inters, Val(D), Val(force_units);
threads=nthreads, blocks=(n_blocks_i, n_blocks_j))
else
backend = get_backend(coords)
n_threads_gpu = Molly.gpu_threads_blocks_pairwise(length(nbs))
kernel! = Molly.pairwise_force_kernel!(backend, n_threads_gpu)
kernel!(fs_mat, coords, atoms, boundary, pairwise_inters, nbs,
Val(D), Val(force_units), ndrange = length(nbs))
end
return fs_mat
end

#=
**The no-neighbor-list pairwise force summation kernel**: This kernel calculates all the pairwise forces in a system of
`n_atoms` atoms by dividing the complete `n_atoms`×`n_atoms` matrix of interactions into small tiles. Most of the tiles
are of size `WARPSIZE`×`WARPSIZE`; when `n_atoms` is not divisible by `WARPSIZE`, the differently-sized tiles on the
edges are dealt with as a separate case. The force summation for the tiles is done in the following way:
1. `WARPSIZE`×`WARPSIZE` tiles: Each row of such a tile is assigned to a different thread in a warp, which calculates
the forces for the entire row in `WARPSIZE` steps (or `WARPSIZE - 1` steps for tiles on the diagonal of the
`n_atoms`×`n_atoms` matrix of interactions). This ordering lets data be shuffled from the `(i+1)`'th thread to the
`i`'th thread in each subsequent iteration of the force calculation in a row. If `a, b, ...` are different atoms and
`1, 2, ...` is the order in which each thread calculates the interatomic forces, then this scenario can be represented
as (taking `WARPSIZE=8`):
```
× | i j k l m n o p
--------------------
a | 1 2 3 4 5 6 7 8
b | 8 1 2 3 4 5 6 7
c | 7 8 1 2 3 4 5 6
d | 6 7 8 1 2 3 4 5
e | 5 6 7 8 1 2 3 4
f | 4 5 6 7 8 1 2 3
g | 3 4 5 6 7 8 1 2
h | 2 3 4 5 6 7 8 1
```

2. Edge tiles when `n_atoms` is not divisible by the warp size: In such cases data cannot generally be shuffled between
threads, so there is no benefit to the diagonal ordering (and it would be more complicated to implement). The
calculations are therefore done in the following order:
```
× | i j k l m n
----------------
a | 1 2 3 4 5 6
b | 1 2 3 4 5 6
c | 1 2 3 4 5 6
d | 1 2 3 4 5 6
e | 1 2 3 4 5 6
f | 1 2 3 4 5 6
g | 1 2 3 4 5 6
h | 1 2 3 4 5 6
```
=#
function cuda_pairwise_force_kernel_nonl!(forces::AbstractArray{T}, coords_var, atoms_var, boundary, inters,
::Val{D}, ::Val{F}) where {T, D, F}
coords = CUDA.Const(coords_var)
atoms = CUDA.Const(atoms_var)
n_atoms = length(atoms)

tidx = threadIdx().x
i_0_tile = (blockIdx().x - 1) * warpsize()
j_0_block = (blockIdx().y - 1) * blockDim().x
warpidx = cld(tidx, warpsize())
j_0_tile = j_0_block + (warpidx - 1) * warpsize()
i = i_0_tile + laneid()

forces_shmem = CuStaticSharedArray(T, (3, 1024))
@inbounds for dim in 1:3
forces_shmem[dim, tidx] = zero(T)
end

if i_0_tile + warpsize() > n_atoms || j_0_tile + warpsize() > n_atoms
@inbounds if i <= n_atoms
njs = min(warpsize(), n_atoms - j_0_tile)
atom_i, coord_i = atoms[i], coords[i]
for del_j in 1:njs
j = j_0_tile + del_j
if i != j
atom_j, coord_j = atoms[j], coords[j]
f = Molly.sum_pairwise_forces(inters, coord_i, coord_j, atom_i, atom_j, boundary, false, Val(F))
for dim in 1:D
forces_shmem[dim, tidx] += -ustrip(f[dim])
end
end
end

for dim in 1:D
Atomix.@atomic :monotonic forces[dim, i] += forces_shmem[dim, tidx]
end
end
else
j = j_0_tile + laneid()
tilesteps = warpsize()
if i_0_tile == j_0_tile # To not compute i-i forces
j = j_0_tile + laneid() % warpsize() + 1
tilesteps -= 1
end

atom_i, coord_i = atoms[i], coords[i]
coord_j = coords[j]
@inbounds for _ in 1:tilesteps
sync_warp()
atom_j = atoms[j]
f = Molly.sum_pairwise_forces(inters, coord_i, coord_j, atom_i, atom_j, boundary, false, Val(F))
for dim in 1:D
forces_shmem[dim, tidx] += -Molly.ustrip(f[dim])
end
@shfl_multiple_sync(FULL_MASK, laneid() + 1, warpsize(), j, coord_j)
end

@inbounds for dim in 1:D
Atomix.@atomic :monotonic forces[dim, i] += forces_shmem[dim, tidx]
end
end

return nothing
end

# CUDA specific calls for Molly
@non_differentiable CUDA.zeros(args...)
@non_differentiable cuda_threads_blocks_pairwise(args...)
@non_differentiable cuda_threads_blocks_specific(args...)

end
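The tiling scheme described in the comment above depends on each lane handing its cached `j` index and `coord_j` to the neighbouring lane every iteration via `shfl_sync`, which is what `@shfl_multiple_sync` expands to. A minimal sketch of that rotation pattern in isolation (assumes a functional CUDA.jl setup; the kernel and array names are illustrative, not part of the extension):

```julia
using CUDA

# Each of the 32 lanes starts with its own value and, on every step, takes the
# value held by the next lane up, so data rotates through the warp in the same
# way that `j` and `coord_j` rotate in `cuda_pairwise_force_kernel_nonl!`.
function rotation_demo!(out)
    lane = laneid()                  # 1-based lane index within the warp
    val = Float32(lane)              # stands in for the cached atom data
    for step in 1:32
        out[lane, step] = val
        val = shfl_sync(0xffffffff, val, laneid() + 1, warpsize())
    end
    return nothing
end

out = CUDA.zeros(Float32, 32, 32)
@cuda threads=32 rotation_demo!(out)
Array(out)   # each column is the previous column rotated by one lane
```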
2 changes: 1 addition & 1 deletion ext/MollyGLMakieExt.jl
@@ -5,8 +5,8 @@ module MollyGLMakieExt

using Molly
using GLMakie
using Colors
using Unitful
using Colors

using LinearAlgebra
