Merge branch 'dev-stable' of https://github.com/QuantumPackage/qp2 into dev-stable
eginer committed Jul 12, 2024
2 parents 505d100 + f5cf674 commit d7bf334
Showing 37 changed files with 4,051 additions and 1,546 deletions.
40 changes: 33 additions & 7 deletions configure
@@ -40,14 +40,16 @@ Usage:
   $(basename $0) -c <file>
   $(basename $0) -h
   $(basename $0) -i <package>
+  $(basename $0) -g [nvidia|intel|none]

Options:
-  -c <file>    Define a COMPILATION configuration file,
-               in "${QP_ROOT}/config/".
-  -h           Print the HELP message
-  -i <package> INSTALL <package>. Use at your OWN RISK:
-               no support will be provided for the installation of
-               dependencies.
+  -c <file>    Define a COMPILATION configuration file,
+               in "${QP_ROOT}/config/".
+  -h           Print the HELP message
+  -i <package> INSTALL <package>. Use at your OWN RISK:
+               no support will be provided for the installation of
+               dependencies.
+  -g [nvidia|intel|none]  Choose GPU acceleration

Example:
./$(basename $0) -c config/gfortran.cfg
@@ -83,7 +85,7 @@ function execute () {
PACKAGES=""


while getopts "d:c:i:h" c ; do
while getopts "d:c:i:g:h" c ; do
case "$c" in
c)
case "$OPTARG" in
@@ -100,6 +102,9 @@ while getopts "d:c:i:h" c ; do
       "") help ; break;;
       *) PACKAGES="${PACKAGE} $OPTARG"
     esac;;
+    g)
+      GPU=$OPTARG;
+      break;;
h)
help
exit 0;;
@@ -109,6 +114,27 @@ while getopts "d:c:i:h" c ; do
esac
done

+# Handle GPU acceleration
+rm -f ${QP_ROOT}/src/gpu_arch
+case "$GPU" in
+  amd) # AMD
+    echo "Activating AMD GPU acceleration"
+    ln -s ${QP_ROOT}/plugins/local/gpu_amd ${QP_ROOT}/src/gpu_arch
+    ;;
+  intel) # Intel
+    echo "Activating Intel GPU acceleration (EXPERIMENTAL)"
+    ln -s ${QP_ROOT}/plugins/local/gpu_intel ${QP_ROOT}/src/gpu_arch
+    ;;
+  nvidia) # Nvidia
+    echo "Activating Nvidia GPU acceleration"
+    ln -s ${QP_ROOT}/plugins/local/gpu_nvidia ${QP_ROOT}/src/gpu_arch
+    ;;
+  *) # No Acceleration
+    echo "Disabling GPU acceleration"
+    ln -s ${QP_ROOT}/plugins/local/gpu_x86 ${QP_ROOT}/src/gpu_arch
+    ;;
+esac

# Trim leading and trailing spaces
PACKAGES=$(echo $PACKAGES | xargs)

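The new `-g` flag simply re-points the `${QP_ROOT}/src/gpu_arch` symlink at one of the plugin directories above. Note that the `case` statement also accepts `amd`, which the `-g` help line does not list, and that any unrecognized value falls back to `gpu_x86`. A usage sketch, reusing the config file from the help text's example:

```bash
# Build with the Nvidia back-end: src/gpu_arch -> plugins/local/gpu_nvidia
./configure -c config/gfortran.cfg -g nvidia
ls -l "${QP_ROOT}/src/gpu_arch"

# Return to the CPU-only implementation (any value other than
# amd/intel/nvidia selects plugins/local/gpu_x86)
./configure -g none
```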
2 changes: 2 additions & 0 deletions plugins/local/gpu_intel/LIB
@@ -0,0 +1,2 @@
-ltbb -lsycl -lmkl_sycl -lgpu -limf -lintlc -lstdc++

1 change: 1 addition & 0 deletions plugins/local/gpu_intel/NEED
@@ -0,0 +1 @@

8 changes: 8 additions & 0 deletions plugins/local/gpu_intel/README.rst
@@ -0,0 +1,8 @@
=========
gpu_intel
=========

Intel implementation of GPU routines. Uses MKL and SYCL. The source is compiled with:
```bash
icpx -fsycl gpu.cxx -c -qmkl=sequential
```
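The routines are exported behind a plain C interface, implemented in gpu.sycl below. As a rough illustration only (not part of this commit), a host-side driver could call them as follows; the prototypes are transcribed from gpu.sycl, the size arguments of the copy routines are byte counts, and the `main()` program itself is hypothetical:

```cpp
// Hypothetical driver: computes a dot product through the extern "C" layer.
#include <cstdint>
#include <cstdio>

struct blasHandle_t;  // opaque here; defined inside gpu.sycl

extern "C" {
void gpu_allocate(void** ptr, int64_t size);
void gpu_deallocate(void** ptr);
void gpu_upload(const void* cpu_ptr, void* gpu_ptr, int64_t n);  // n in bytes
void gpu_blas_create(blasHandle_t** ptr);
void gpu_blas_destroy(blasHandle_t** ptr);
void gpu_ddot(blasHandle_t* handle, int64_t n, const double* x, int64_t incx,
              const double* y, int64_t incy, double* result);
}

int main() {
  const int64_t n = 4;
  const double x[] = {1., 2., 3., 4.};
  const double y[] = {1., 1., 1., 1.};
  double *dx = nullptr, *dy = nullptr, result = 0.;
  blasHandle_t* handle = nullptr;

  gpu_allocate(reinterpret_cast<void**>(&dx), n * sizeof(double));
  gpu_allocate(reinterpret_cast<void**>(&dy), n * sizeof(double));
  gpu_upload(x, dx, n * sizeof(double));  // byte counts, per gpu.sycl
  gpu_upload(y, dy, n * sizeof(double));

  gpu_blas_create(&handle);
  gpu_ddot(handle, n, dx, 1, dy, 1, &result);  // expect 10.0
  gpu_blas_destroy(&handle);

  gpu_deallocate(reinterpret_cast<void**>(&dx));
  gpu_deallocate(reinterpret_cast<void**>(&dy));
  std::printf("ddot = %f\n", result);
  return 0;
}
```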
177 changes: 177 additions & 0 deletions plugins/local/gpu_intel/gpu.sycl
@@ -0,0 +1,177 @@
#include <CL/sycl.hpp>
#include <cassert>
#include <cstdlib>   // malloc/free used for the blasHandle_t wrapper
#include <iostream>  // std::cerr in gpu_allocate's error path
#include <limits>
#include <oneapi/mkl/blas.hpp>

extern "C" {

/* Generic functions */

/* This back-end currently exposes a single device to the caller. */
int gpu_ndevices() {
  return 1;
}

void gpu_set_device(int32_t igpu) {
  /* No-op: device selection is left to the SYCL runtime. */
}


/* Allocation functions */

void gpu_allocate(void** ptr, int64_t size) {
auto queue = sycl::queue(sycl::default_selector_v);

try {
*ptr = sycl::malloc_shared(size, queue);
assert(*ptr != nullptr);
} catch (const sycl::exception& e) {
std::cerr << "SYCL exception caught: " << e.what() << std::endl;
*ptr = nullptr; // If allocation fails, set pointer to nullptr
}
}

void gpu_deallocate(void** ptr) {
assert(*ptr != nullptr);
sycl::free(*ptr, sycl::queue(sycl::default_selector_v));
*ptr = nullptr;
}

/* Upload data from host to device */
void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {
sycl::queue queue(sycl::default_selector_v);
queue.memcpy(gpu_ptr, cpu_ptr, n).wait();
}

/* Download data from device to host */
void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {
sycl::queue queue(sycl::default_selector_v);
queue.memcpy(cpu_ptr, gpu_ptr, n).wait();
}

/* Copy data from one GPU memory location to another */
void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
sycl::queue queue(sycl::default_selector_v);
queue.memcpy(gpu_ptr_dest, gpu_ptr_src, n).wait();
}

/* Queues */

/* SYCL queue as a replacement for CUDA stream */
void gpu_stream_create(sycl::queue** ptr) {
*ptr = new sycl::queue(sycl::default_selector_v);
}

void gpu_stream_destroy(sycl::queue** ptr) {
assert(*ptr != nullptr);
delete *ptr;
*ptr = nullptr;
}

void gpu_synchronize() {
  // Note: this constructs a fresh default queue and waits on it; work
  // submitted on queues created by gpu_stream_create is not awaited here.
  sycl::queue queue(sycl::default_selector_v);
  queue.wait_and_throw();
}

/* BLAS functions */
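
/* blasHandle_t is a minimal stand-in for a cuBLAS-style handle: it just
   carries the queue on which the BLAS wrappers below run. */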

typedef struct {
sycl::queue* queue;
} blasHandle_t;

void gpu_set_stream(blasHandle_t* handle, sycl::queue* ptr) {
handle->queue = ptr;
}

void gpu_blas_create(blasHandle_t** ptr) {
*ptr = (blasHandle_t*) malloc(sizeof(blasHandle_t));
assert(*ptr != nullptr);
(*ptr)->queue = new sycl::queue(sycl::default_selector_v);
assert((*ptr)->queue != nullptr);
}

void gpu_blas_destroy(blasHandle_t** ptr) {
assert(*ptr != nullptr);
delete (*ptr)->queue;
free(*ptr);
*ptr = nullptr;
}


void gpu_ddot(blasHandle_t* handle, const int64_t n, const double* x, const int64_t incx,
const double* y, const int64_t incy, double* result) {
// Ensure input parameters are valid
assert(handle != nullptr);
assert(handle->queue != nullptr);
assert(n > 0);
assert(incx > 0);
assert(incy > 0);
assert(x != nullptr);
assert(y != nullptr);
assert(result != nullptr);

  // The USM BLAS call is asynchronous: wait so *result is valid on return.
  oneapi::mkl::blas::dot(*handle->queue, n, x, incx, y, incy, result).wait();

}

void gpu_dgemv(blasHandle_t* handle, const char* transa, const int64_t m, const int64_t n, const double* alpha,
const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) {

assert(handle != nullptr);
assert(handle->queue != nullptr);

// Validate matrix dimensions and increments to be positive
assert(m > 0 && n > 0 && lda > 0 && incx > 0 && incy > 0);
assert(a != nullptr && x != nullptr && y != nullptr && alpha != nullptr && beta != nullptr);

// Determine the operation type
oneapi::mkl::transpose transa_ = oneapi::mkl::transpose::nontrans;
if (*transa == 'T' || *transa == 't') {
transa_ = oneapi::mkl::transpose::trans;
}

// Perform DGEMV operation using oneMKL
oneapi::mkl::blas::column_major::gemv(*handle->queue, transa_, m, n, *alpha, a, lda, x, incx, *beta, y, incy);

}

void gpu_dgemm(blasHandle_t* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha,
const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) {

assert(handle != nullptr && handle->queue != nullptr);
assert(m > 0 && n > 0 && k > 0 && lda > 0 && ldb > 0 && ldc > 0);
assert(a != nullptr && b != nullptr && c != nullptr && alpha != nullptr && beta != nullptr);

// Transpose operations
auto transa_ = (*transa == 'T' || *transa == 't') ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans;
auto transb_ = (*transb == 'T' || *transb == 't') ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans;

oneapi::mkl::blas::column_major::gemm(*handle->queue, transa_, transb_, m, n, k,
*alpha, a, lda, b, ldb, *beta, c, ldc);

}


void gpu_dgeam(blasHandle_t* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha,
const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
assert(handle != nullptr && handle->queue != nullptr);
assert(m > 0 && n > 0 && lda > 0 && ldb > 0 && ldc > 0);
assert(a != nullptr && b != nullptr && c != nullptr && alpha != nullptr && beta != nullptr);

  // Determine transpose operations, and read the scalars on the host:
  // the device kernel below must not dereference host pointers.
  bool transA = (*transa == 'T' || *transa == 't');
  bool transB = (*transb == 'T' || *transb == 't');
  const double alpha_ = *alpha;
  const double beta_  = *beta;

  handle->queue->submit([&](sycl::handler& cgh) {
    cgh.parallel_for(sycl::range<2>(m, n), [=](sycl::id<2> idx) {
      const int64_t i = idx[0];  // row of C
      const int64_t j = idx[1];  // column of C
      // Column-major indexing, consistent with the gemv/gemm calls above.
      const int64_t ai = transA ? i * lda + j : j * lda + i;
      const int64_t bi = transB ? i * ldb + j : j * ldb + i;
      const int64_t ci = j * ldc + i;

      c[ci] = alpha_ * a[ai] + beta_ * b[bi];
    });
  });

}

} // extern C
1 change: 1 addition & 0 deletions plugins/local/gpu_nvidia/LIB
@@ -0,0 +1 @@
-lcudart -lcublas -lcublasLt
1 change: 1 addition & 0 deletions plugins/local/gpu_nvidia/NEED
@@ -0,0 +1 @@

5 changes: 5 additions & 0 deletions plugins/local/gpu_nvidia/README.rst
@@ -0,0 +1,5 @@
==========
gpu_nvidia
==========

Nvidia implementation of GPU routines. Uses the CUDA and cuBLAS libraries.
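As with the Intel plugin, a compile-and-link sketch: the CUDA source file of this plugin is not shown in this diff, so the name gpu.cu is an assumption; only the `-l` flags come from the LIB file above.

```bash
# Hypothetical file name; link flags taken from plugins/local/gpu_nvidia/LIB
nvcc -c gpu.cu
g++ main.o gpu.o -lcudart -lcublas -lcublasLt
```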