Skip to content

Commit

Permalink
Merge pull request mfem#3138 from mfem/dispatch-map
Browse files Browse the repository at this point in the history
Runtime dispatch for SMEM GPU kernels
  • Loading branch information
tzanio authored Sep 26, 2024
2 parents 96ac1d9 + fb1a587 commit 0739640
Show file tree
Hide file tree
Showing 25 changed files with 1,240 additions and 1,399 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
CMakeCache.txt
CMakeFiles/

# Clangd server cache
*.cache*

# Backup files
*~

Expand Down
12 changes: 12 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,18 @@ GPU computing
- Added support for GPU-accelerated batched linear algebra (using cuBLAS,
hipBLAS, MAGMA, or native MFEM functionality) through the BatchedLinAlg class.

- A new GPU kernel dispatch mechanism was introduced. Users can instantiate
specialized kernels for specific combinations of (for example) polynomial
degree and number of quadrature points using
`DiffusionIntegrator::AddSpecialization` and
`MassIntegrator::AddSpecialization` (this functionality may be added to more
integrators in the future).

- Calls to slower fallback kernels can be reported to `mfem::err` by setting
the environment variable `MFEM_REPORT_KERNELS` to any value other than `NO`
or by explicitly calling `KernelReporter::Enable`. Users can then add
specializations for these kernels to achieve higher performance.

Miscellaneous
-------------
- Refactored the `ARKStepSolver` class (ARKODE interface) to use
Expand Down
6 changes: 3 additions & 3 deletions fem/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,6 @@ set(SRCS
qinterp/eval_by_vdim.cpp
qinterp/grad_by_nodes.cpp
qinterp/grad_by_vdim.cpp
qinterp/grad_phys_by_nodes.cpp
qinterp/grad_phys_by_vdim.cpp
qspace.cpp
quadinterpolator.cpp
quadinterpolator_face.cpp
Expand Down Expand Up @@ -192,6 +190,9 @@ set(HDRS
hybridization.hpp
intrules.hpp
intrules_cut.hpp
kernel_dispatch.hpp
kernel_reporter.hpp
kernels.hpp
ceed/interface/basis.hpp
ceed/interface/integrator.hpp
ceed/interface/interface.hpp
Expand Down Expand Up @@ -223,7 +224,6 @@ set(HDRS
nonlinearform_ext.hpp
nonlininteg.hpp
qfunction.hpp
qinterp/dispatch.hpp
qinterp/eval.hpp
qinterp/grad.hpp
qspace.hpp
Expand Down
46 changes: 46 additions & 0 deletions fem/bilininteg.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
#include "qfunction.hpp"
#include <memory>

#include "kernel_dispatch.hpp"

namespace mfem
{

Expand Down Expand Up @@ -2126,6 +2128,22 @@ class GradientIntegrator : public BilinearFormIntegrator
can be a scalar or a matrix coefficient. */
class DiffusionIntegrator: public BilinearFormIntegrator
{
public:

using ApplyKernelType = void(*)(const int, const bool, const Array<real_t>&,
const Array<real_t>&, const Array<real_t>&,
const Array<real_t>&,
const Vector&, const Vector&,
Vector&, const int, const int);

using DiagonalKernelType = void(*)(const int, const bool, const Array<real_t>&,
const Array<real_t>&, const Vector&, Vector&,
const int, const int);

MFEM_REGISTER_KERNELS(ApplyPAKernels, ApplyKernelType, (int, int, int));
MFEM_REGISTER_KERNELS(DiagonalPAKernels, DiagonalKernelType, (int, int, int));
static struct Kernels { Kernels(); } kernels;

protected:
Coefficient *Q;
VectorCoefficient *VQ;
Expand Down Expand Up @@ -2286,6 +2304,13 @@ class DiffusionIntegrator: public BilinearFormIntegrator
bool SupportsCeed() const override { return DeviceCanUseCeed(); }

Coefficient *GetCoefficient() const { return Q; }

template <int DIM, int D1D, int Q1D>
static void AddSpecialization()
{
ApplyPAKernels::Specialization<DIM,D1D,Q1D>::Add();
DiagonalPAKernels::Specialization<DIM,D1D,Q1D>::Add();
}
};

/** Class for local mass matrix assembling $a(u,v) := (Q u, v)$ */
Expand All @@ -2305,6 +2330,20 @@ class MassIntegrator: public BilinearFormIntegrator
const FaceGeometricFactors *face_geom; ///< Not owned
int dim, ne, nq, dofs1D, quad1D;

public:

using ApplyKernelType = void(*)(const int, const Array<real_t>&,
const Array<real_t>&, const Vector&,
const Vector&, Vector&, const int, const int);

using DiagonalKernelType = void(*)(const int, const Array<real_t>&,
const Vector&, Vector&, const int,
const int);

MFEM_REGISTER_KERNELS(ApplyPAKernels, ApplyKernelType, (int, int, int));
MFEM_REGISTER_KERNELS(DiagonalPAKernels, DiagonalKernelType, (int, int, int));
static struct Kernels { Kernels(); } kernels;

public:
MassIntegrator(const IntegrationRule *ir = NULL)
: BilinearFormIntegrator(ir), Q(NULL), maps(NULL), geom(NULL) { }
Expand Down Expand Up @@ -2350,6 +2389,13 @@ class MassIntegrator: public BilinearFormIntegrator
bool SupportsCeed() const override { return DeviceCanUseCeed(); }

const Coefficient *GetCoefficient() const { return Q; }

template <int DIM, int D1D, int Q1D>
static void AddSpecialization()
{
ApplyPAKernels::Specialization<DIM,D1D,Q1D>::Add();
DiagonalPAKernels::Specialization<DIM,D1D,Q1D>::Add();
}
};

/** Mass integrator $(u, v)$ restricted to the boundary of a domain */
Expand Down
141 changes: 27 additions & 114 deletions fem/integ/bilininteg_diffusion_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,33 @@
namespace mfem
{

// PA Diffusion Integrator

DiffusionIntegrator::Kernels DiffusionIntegrator::kernels;
DiffusionIntegrator::Kernels::Kernels()
{
// 2D
DiffusionIntegrator::AddSpecialization<2,2,2>();
DiffusionIntegrator::AddSpecialization<2,3,3>();
DiffusionIntegrator::AddSpecialization<2,4,4>();
DiffusionIntegrator::AddSpecialization<2,5,5>();
DiffusionIntegrator::AddSpecialization<2,6,6>();
DiffusionIntegrator::AddSpecialization<2,7,7>();
DiffusionIntegrator::AddSpecialization<2,8,8>();
DiffusionIntegrator::AddSpecialization<2,9,9>();
// 3D
DiffusionIntegrator::AddSpecialization<3,2,2>();
DiffusionIntegrator::AddSpecialization<3,2,3>();
DiffusionIntegrator::AddSpecialization<3,3,4>();
DiffusionIntegrator::AddSpecialization<3,4,5>();
DiffusionIntegrator::AddSpecialization<3,4,6>();
DiffusionIntegrator::AddSpecialization<3,5,6>();
DiffusionIntegrator::AddSpecialization<3,5,8>();
DiffusionIntegrator::AddSpecialization<3,6,7>();
DiffusionIntegrator::AddSpecialization<3,7,8>();
DiffusionIntegrator::AddSpecialization<3,8,9>();
}

namespace internal
{

Expand Down Expand Up @@ -361,121 +388,7 @@ void OccaPADiffusionSetup3D(const int D1D,
}
OccaDiffSetup3D_ker.at(id)(NE, o_W, o_J, o_C, o_op, const_c);
}
#endif // MFEM_USE_OCCA

void PADiffusionAssembleDiagonal(const int dim,
const int D1D,
const int Q1D,
const int NE,
const bool symm,
const Array<real_t> &B,
const Array<real_t> &G,
const Vector &D,
Vector &Y)
{
if (dim == 2)
{
switch ((D1D << 4 ) | Q1D)
{
case 0x22: return SmemPADiffusionDiagonal2D<2,2,8>(NE,symm,B,G,D,Y);
case 0x33: return SmemPADiffusionDiagonal2D<3,3,8>(NE,symm,B,G,D,Y);
case 0x44: return SmemPADiffusionDiagonal2D<4,4,4>(NE,symm,B,G,D,Y);
case 0x55: return SmemPADiffusionDiagonal2D<5,5,4>(NE,symm,B,G,D,Y);
case 0x66: return SmemPADiffusionDiagonal2D<6,6,2>(NE,symm,B,G,D,Y);
case 0x77: return SmemPADiffusionDiagonal2D<7,7,2>(NE,symm,B,G,D,Y);
case 0x88: return SmemPADiffusionDiagonal2D<8,8,1>(NE,symm,B,G,D,Y);
case 0x99: return SmemPADiffusionDiagonal2D<9,9,1>(NE,symm,B,G,D,Y);
default: return PADiffusionDiagonal2D(NE,symm,B,G,D,Y,D1D,Q1D);
}
}
else if (dim == 3)
{
switch ((D1D << 4 ) | Q1D)
{
case 0x22: return SmemPADiffusionDiagonal3D<2,2>(NE,symm,B,G,D,Y);
case 0x23: return SmemPADiffusionDiagonal3D<2,3>(NE,symm,B,G,D,Y);
case 0x34: return SmemPADiffusionDiagonal3D<3,4>(NE,symm,B,G,D,Y);
case 0x45: return SmemPADiffusionDiagonal3D<4,5>(NE,symm,B,G,D,Y);
case 0x46: return SmemPADiffusionDiagonal3D<4,6>(NE,symm,B,G,D,Y);
case 0x56: return SmemPADiffusionDiagonal3D<5,6>(NE,symm,B,G,D,Y);
case 0x67: return SmemPADiffusionDiagonal3D<6,7>(NE,symm,B,G,D,Y);
case 0x78: return SmemPADiffusionDiagonal3D<7,8>(NE,symm,B,G,D,Y);
case 0x89: return SmemPADiffusionDiagonal3D<8,9>(NE,symm,B,G,D,Y);
case 0x9A: return SmemPADiffusionDiagonal3D<9,10>(NE,symm,B,G,D,Y);
default: return PADiffusionDiagonal3D(NE,symm,B,G,D,Y,D1D,Q1D);
}
}
MFEM_ABORT("Unknown kernel.");
}

void PADiffusionApply(const int dim,
const int D1D,
const int Q1D,
const int NE,
const bool symm,
const Array<real_t> &B,
const Array<real_t> &G,
const Array<real_t> &Bt,
const Array<real_t> &Gt,
const Vector &D,
const Vector &X,
Vector &Y)
{
#ifdef MFEM_USE_OCCA
if (DeviceCanUseOcca())
{
if (dim == 2)
{
OccaPADiffusionApply2D(D1D,Q1D,NE,B,G,Bt,Gt,D,X,Y);
return;
}
if (dim == 3)
{
OccaPADiffusionApply3D(D1D,Q1D,NE,B,G,Bt,Gt,D,X,Y);
return;
}
MFEM_ABORT("OCCA PADiffusionApply unknown kernel!");
}
#endif // MFEM_USE_OCCA
const int id = (D1D << 4) | Q1D;

if (dim == 2)
{
switch (id)
{
case 0x22: return SmemPADiffusionApply2D<2,2,16>(NE,symm,B,G,D,X,Y);
case 0x33: return SmemPADiffusionApply2D<3,3,16>(NE,symm,B,G,D,X,Y);
case 0x44: return SmemPADiffusionApply2D<4,4,8>(NE,symm,B,G,D,X,Y);
case 0x55: return SmemPADiffusionApply2D<5,5,8>(NE,symm,B,G,D,X,Y);
case 0x66: return SmemPADiffusionApply2D<6,6,4>(NE,symm,B,G,D,X,Y);
case 0x77: return SmemPADiffusionApply2D<7,7,4>(NE,symm,B,G,D,X,Y);
case 0x88: return SmemPADiffusionApply2D<8,8,2>(NE,symm,B,G,D,X,Y);
case 0x99: return SmemPADiffusionApply2D<9,9,2>(NE,symm,B,G,D,X,Y);
default: return PADiffusionApply2D(NE,symm,B,G,Bt,Gt,D,X,Y,D1D,Q1D);
}
}

if (dim == 3)
{
switch (id)
{
case 0x22: return SmemPADiffusionApply3D<2,2>(NE,symm,B,G,D,X,Y);
case 0x23: return SmemPADiffusionApply3D<2,3>(NE,symm,B,G,D,X,Y);
case 0x34: return SmemPADiffusionApply3D<3,4>(NE,symm,B,G,D,X,Y);
case 0x45: return SmemPADiffusionApply3D<4,5>(NE,symm,B,G,D,X,Y);
case 0x46: return SmemPADiffusionApply3D<4,6>(NE,symm,B,G,D,X,Y);
case 0x56: return SmemPADiffusionApply3D<5,6>(NE,symm,B,G,D,X,Y);
case 0x58: return SmemPADiffusionApply3D<5,8>(NE,symm,B,G,D,X,Y);
case 0x67: return SmemPADiffusionApply3D<6,7>(NE,symm,B,G,D,X,Y);
case 0x78: return SmemPADiffusionApply3D<7,8>(NE,symm,B,G,D,X,Y);
case 0x89: return SmemPADiffusionApply3D<8,9>(NE,symm,B,G,D,X,Y);
default: return PADiffusionApply3D(NE,symm,B,G,Bt,Gt,D,X,Y,D1D,Q1D);
}
}
MFEM_ABORT("Unknown kernel: 0x"<<std::hex << id << std::dec);
}

#ifdef MFEM_USE_OCCA
void OccaPADiffusionApply2D(const int D1D,
const int Q1D,
const int NE,
Expand Down
Loading

0 comments on commit 0739640

Please sign in to comment.