Skip to content

Commit

Permalink
[BLAS] add cublas backend for omatadd and omatcopy (#345)
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewtbarker authored Jul 11, 2023
1 parent 1a243e0 commit f711d1a
Showing 1 changed file with 146 additions and 99 deletions.
245 changes: 146 additions & 99 deletions src/blas/backends/cublas/cublas_extensions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,27 +85,41 @@ void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose tra
throw unimplemented("blas", "gemmt", "for column_major layout");
}

void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
sycl::buffer<float, 1> &a, int64_t lda, sycl::buffer<float, 1> &b, int64_t ldb) {
throw unimplemented("blas", "omatcopy", "for column_major layout");
}

void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
sycl::buffer<double, 1> &a, int64_t lda, sycl::buffer<double, 1> &b, int64_t ldb) {
throw unimplemented("blas", "omatcopy", "for column_major layout");
}

void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<float> alpha,
sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
sycl::buffer<std::complex<float>, 1> &b, int64_t ldb) {
throw unimplemented("blas", "omatcopy", "for column_major layout");
}

void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<double> alpha,
sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
sycl::buffer<std::complex<double>, 1> &b, int64_t ldb) {
throw unimplemented("blas", "omatcopy", "for column_major layout");
}
template <typename Func, typename T>
void omatcopy(const char *func_name, Func func, sycl::queue &queue, transpose trans, int64_t m,
int64_t n, T alpha, sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &b,
int64_t ldb) {
using cuDataType = typename CudaEquivalentType<T>::Type;
overflow_check(m, n, lda, ldb);
queue.submit([&](sycl::handler &cgh) {
auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
auto b_acc = b.template get_access<sycl::access::mode::read_write>(cgh);
const int64_t logical_m = (trans == oneapi::mkl::transpose::nontrans ? m : n);
const int64_t logical_n = (trans == oneapi::mkl::transpose::nontrans ? n : m);
onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
auto handle = sc.get_handle(queue);
auto a_ = sc.get_mem<cuDataType *>(a_acc);
auto b_ = sc.get_mem<cuDataType *>(b_acc);
cublasStatus_t err;
CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(trans),
get_cublas_operation(trans), logical_m, logical_n,
(cuDataType *)&alpha, a_, lda, nullptr, nullptr, lda, b_, ldb);
});
});
}

#define OMATCOPY_LAUNCHER(TYPE, CUBLAS_ROUTINE) \
void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \
sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &b, int64_t ldb) { \
omatcopy(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, b, ldb); \
}

OMATCOPY_LAUNCHER(float, cublasSgeam)
OMATCOPY_LAUNCHER(double, cublasDgeam)
OMATCOPY_LAUNCHER(std::complex<float>, cublasCgeam)
OMATCOPY_LAUNCHER(std::complex<double>, cublasZgeam)

#undef OMATCOPY_LAUNCHER

void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
sycl::buffer<float, 1> &ab, int64_t lda, int64_t ldb) {
Expand All @@ -127,31 +141,43 @@ void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::co
throw unimplemented("blas", "imatcopy", "for column_major layout");
}

void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
float alpha, sycl::buffer<float, 1> &a, int64_t lda, float beta,
sycl::buffer<float, 1> &b, int64_t ldb, sycl::buffer<float, 1> &c, int64_t ldc) {
throw unimplemented("blas", "omatadd", "for column_major layout");
}

void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
double alpha, sycl::buffer<double, 1> &a, int64_t lda, double beta,
sycl::buffer<double, 1> &b, int64_t ldb, sycl::buffer<double, 1> &c, int64_t ldc) {
throw unimplemented("blas", "omatadd", "for column_major layout");
}

void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &b, int64_t ldb,
sycl::buffer<std::complex<float>, 1> &c, int64_t ldc) {
throw unimplemented("blas", "omatadd", "for column_major layout");
}

void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &b, int64_t ldb,
sycl::buffer<std::complex<double>, 1> &c, int64_t ldc) {
throw unimplemented("blas", "omatadd", "for column_major layout");
}
template <typename Func, typename T>
void omatadd(const char *func_name, Func func, sycl::queue &queue, transpose transa,
transpose transb, int64_t m, int64_t n, T alpha, sycl::buffer<T, 1> &a, int64_t lda,
T beta, sycl::buffer<T, 1> &b, int64_t ldb, sycl::buffer<T, 1> &c, int64_t ldc) {
using cuDataType = typename CudaEquivalentType<T>::Type;
overflow_check(m, n, lda, ldb, ldc);
queue.submit([&](sycl::handler &cgh) {
auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
auto b_acc = b.template get_access<sycl::access::mode::read>(cgh);
auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
auto handle = sc.get_handle(queue);
auto a_ = sc.get_mem<cuDataType *>(a_acc);
auto b_ = sc.get_mem<cuDataType *>(b_acc);
auto c_ = sc.get_mem<cuDataType *>(c_acc);
cublasStatus_t err;
CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(transa),
get_cublas_operation(transb), m, n, (cuDataType *)&alpha, a_,
lda, (cuDataType *)&beta, b_, ldb, c_, ldc);
});
});
}

#define OMATADD_LAUNCHER(TYPE, CUBLAS_ROUTINE) \
void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \
TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda, TYPE beta, \
sycl::buffer<TYPE, 1> &b, int64_t ldb, sycl::buffer<TYPE, 1> &c, int64_t ldc) { \
omatadd(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, transa, transb, m, n, alpha, a, lda, beta, \
b, ldb, c, ldc); \
}

OMATADD_LAUNCHER(float, cublasSgeam)
OMATADD_LAUNCHER(double, cublasDgeam)
OMATADD_LAUNCHER(std::complex<float>, cublasCgeam)
OMATADD_LAUNCHER(std::complex<double>, cublasZgeam)

#undef OMATADD_LAUNCHER

// USM APIs

Expand Down Expand Up @@ -217,31 +243,43 @@ sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transp
throw unimplemented("blas", "gemmt", "for column_major layout");
}

sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
const float *a, int64_t lda, float *b, int64_t ldb,
template <typename Func, typename T>
sycl::event omatcopy(const char *func_name, Func func, sycl::queue &queue, transpose trans,
int64_t m, int64_t n, T alpha, const T *a, int64_t lda, T *b, int64_t ldb,
const std::vector<sycl::event> &dependencies) {
throw unimplemented("blas", "omatcopy", "for column_major layout");
}

sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
const double *a, int64_t lda, double *b, int64_t ldb,
const std::vector<sycl::event> &dependencies) {
throw unimplemented("blas", "omatcopy", "for column_major layout");
}

sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
std::complex<float> *b, int64_t ldb,
const std::vector<sycl::event> &dependencies) {
throw unimplemented("blas", "omatcopy", "for column_major layout");
}

sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
std::complex<double> *b, int64_t ldb,
const std::vector<sycl::event> &dependencies) {
throw unimplemented("blas", "omatcopy", "for column_major layout");
}
using cuDataType = typename CudaEquivalentType<T>::Type;
overflow_check(m, n, lda, ldb);
auto done = queue.submit([&](sycl::handler &cgh) {
cgh.depends_on(dependencies);
const int64_t logical_m = (trans == oneapi::mkl::transpose::nontrans ? m : n);
const int64_t logical_n = (trans == oneapi::mkl::transpose::nontrans ? n : m);
onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
auto handle = sc.get_handle(queue);
auto a_ = reinterpret_cast<const cuDataType *>(a);
auto b_ = reinterpret_cast<cuDataType *>(b);
cublasStatus_t err;
CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(trans),
get_cublas_operation(trans), logical_m, logical_n,
(cuDataType *)&alpha, a_, lda, nullptr, nullptr, lda, b_, ldb);
});
});
return done;
}

#define OMATCOPY_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \
sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \
const TYPE *a, int64_t lda, TYPE *b, int64_t ldb, \
const std::vector<sycl::event> &dependencies) { \
return omatcopy(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, b, \
ldb, dependencies); \
}

OMATCOPY_LAUNCHER_USM(float, cublasSgeam)
OMATCOPY_LAUNCHER_USM(double, cublasDgeam)
OMATCOPY_LAUNCHER_USM(std::complex<float>, cublasCgeam)
OMATCOPY_LAUNCHER_USM(std::complex<double>, cublasZgeam)

#undef OMATCOPY_LAUNCHER_USM

sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
float *ab, int64_t lda, int64_t ldb,
Expand All @@ -267,35 +305,44 @@ sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
throw unimplemented("blas", "imatcopy", "for column_major layout");
}

sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
float alpha, const float *a, int64_t lda, float beta, const float *b,
int64_t ldb, float *c, int64_t ldc,
const std::vector<sycl::event> &dependencies) {
throw unimplemented("blas", "omatadd", "for column_major layout");
}

sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
double alpha, const double *a, int64_t lda, double beta, const double *b,
int64_t ldb, double *c, int64_t ldc,
const std::vector<sycl::event> &dependencies) {
throw unimplemented("blas", "omatadd", "for column_major layout");
}

sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
std::complex<float> beta, const std::complex<float> *b, int64_t ldb,
std::complex<float> *c, int64_t ldc,
const std::vector<sycl::event> &dependencies) {
throw unimplemented("blas", "omatadd", "for column_major layout");
}

sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
std::complex<double> beta, const std::complex<double> *b, int64_t ldb,
std::complex<double> *c, int64_t ldc,
const std::vector<sycl::event> &dependencies) {
throw unimplemented("blas", "omatadd", "for column_major layout");
}
template <typename Func, typename T>
inline sycl::event omatadd(const char *func_name, Func func, sycl::queue &queue, transpose transa,
transpose transb, int64_t m, int64_t n, T alpha, const T *a, int64_t lda,
T beta, const T *b, int64_t ldb, T *c, int64_t ldc,
const std::vector<sycl::event> &dependencies) {
using cuDataType = typename CudaEquivalentType<T>::Type;
overflow_check(m, n, lda, ldb, ldc);
auto done = queue.submit([&](sycl::handler &cgh) {
cgh.depends_on(dependencies);
onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
auto handle = sc.get_handle(queue);
auto a_ = reinterpret_cast<const cuDataType *>(a);
auto b_ = reinterpret_cast<const cuDataType *>(b);
auto c_ = reinterpret_cast<cuDataType *>(c);
cublasStatus_t err;
CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(transa),
get_cublas_operation(transb), m, n, (cuDataType *)&alpha, a_,
lda, (cuDataType *)&beta, b_, ldb, c_, ldc);
});
});
return done;
}

#define OMATADD_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \
sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, \
int64_t n, TYPE alpha, const TYPE *a, int64_t lda, TYPE beta, \
const TYPE *b, int64_t ldb, TYPE *c, int64_t ldc, \
const std::vector<sycl::event> &dependencies) { \
return omatadd(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, transa, transb, m, n, alpha, a, \
lda, beta, b, ldb, c, ldc, dependencies); \
}

OMATADD_LAUNCHER_USM(float, cublasSgeam)
OMATADD_LAUNCHER_USM(double, cublasDgeam)
OMATADD_LAUNCHER_USM(std::complex<float>, cublasCgeam)
OMATADD_LAUNCHER_USM(std::complex<double>, cublasZgeam)

#undef OMATADD_LAUNCHER_USM

} // namespace column_major
namespace row_major {
Expand Down

0 comments on commit f711d1a

Please sign in to comment.