Merge branch 'dev-stable' of https://github.com/QuantumPackage/qp2 into dev-stable
eginer committed Jul 12, 2024
2 parents 505d100 + f5cf674 commit d7bf334
Showing 37 changed files with 4,051 additions and 1,546 deletions.
40 changes: 33 additions & 7 deletions configure
@@ -40,14 +40,16 @@ Usage:
   $(basename $0) -c <file>
   $(basename $0) -h
   $(basename $0) -i <package>
+  $(basename $0) -g [nvidia|intel|none]

Options:
-  -c <file>    Define a COMPILATION configuration file,
-               in "${QP_ROOT}/config/".
-  -h           Print the HELP message
-  -i <package> INSTALL <package>. Use at your OWN RISK:
-               no support will be provided for the installation of
-               dependencies.
+  -c <file>    Define a COMPILATION configuration file,
+               in "${QP_ROOT}/config/".
+  -h           Print the HELP message
+  -i <package> INSTALL <package>. Use at your OWN RISK:
+               no support will be provided for the installation of
+               dependencies.
+  -g [nvidia|intel|none]  Choose GPU acceleration

Example:
./$(basename $0) -c config/gfortran.cfg
@@ -83,7 +85,7 @@ function execute () {
PACKAGES=""


while getopts "d:c:i:h" c ; do
while getopts "d:c:i:g:h" c ; do
case "$c" in
c)
case "$OPTARG" in
@@ -100,6 +102,9 @@ while getopts "d:c:i:h" c ; do
       "") help ; break;;
       *) PACKAGES="${PACKAGE} $OPTARG"
     esac;;
+    g)
+      GPU=$OPTARG;
+      break;;
h)
help
exit 0;;
@@ -109,6 +114,27 @@ while getopts "d:c:i:h" c ; do
esac
done

+# Handle GPU acceleration
+rm -f ${QP_ROOT}/src/gpu_arch
+case "$GPU" in
+  amd) # AMD
+    echo "Activating AMD GPU acceleration"
+    ln -s ${QP_ROOT}/plugins/local/gpu_amd ${QP_ROOT}/src/gpu_arch
+    ;;
+  intel) # Intel
+    echo "Activating Intel GPU acceleration (EXPERIMENTAL)"
+    ln -s ${QP_ROOT}/plugins/local/gpu_intel ${QP_ROOT}/src/gpu_arch
+    ;;
+  nvidia) # Nvidia
+    echo "Activating Nvidia GPU acceleration"
+    ln -s ${QP_ROOT}/plugins/local/gpu_nvidia ${QP_ROOT}/src/gpu_arch
+    ;;
+  *) # No Acceleration
+    echo "Disabling GPU acceleration"
+    ln -s ${QP_ROOT}/plugins/local/gpu_x86 ${QP_ROOT}/src/gpu_arch
+    ;;
+esac

# Trim leading and trailing spaces
PACKAGES=$(echo $PACKAGES | xargs)

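The new `-g` flag simply re-points the `${QP_ROOT}/src/gpu_arch` symlink at one of the plugin directories above. Note that the `case` statement also accepts `amd`, which the `-g` help line does not list, and that any unrecognized value falls back to `gpu_x86`. A usage sketch, reusing the config file from the help text's example:

```bash
# Build with the Nvidia back-end: src/gpu_arch -> plugins/local/gpu_nvidia
./configure -c config/gfortran.cfg -g nvidia
ls -l "${QP_ROOT}/src/gpu_arch"

# Return to the CPU-only implementation (any value other than
# amd/intel/nvidia selects plugins/local/gpu_x86)
./configure -g none
```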
2 changes: 2 additions & 0 deletions plugins/local/gpu_intel/LIB
@@ -0,0 +1,2 @@
-ltbb -lsycl -lmkl_sycl -lgpu -limf -lintlc -lstdc++

1 change: 1 addition & 0 deletions plugins/local/gpu_intel/NEED
@@ -0,0 +1 @@

8 changes: 8 additions & 0 deletions plugins/local/gpu_intel/README.rst
@@ -0,0 +1,8 @@
=========
gpu_intel
=========

Intel implementation of GPU routines. Uses MKL and SYCL. The source is compiled with:
```bash
icpx -fsycl gpu.cxx -c -qmkl=sequential
```
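The routines are exported behind a plain C interface, implemented in gpu.sycl below. As a rough illustration only (not part of this commit), a host-side driver could call them as follows; the prototypes are transcribed from gpu.sycl, the size arguments of the copy routines are byte counts, and the `main()` program itself is hypothetical:

```cpp
// Hypothetical driver: computes a dot product through the extern "C" layer.
#include <cstdint>
#include <cstdio>

struct blasHandle_t;  // opaque here; defined inside gpu.sycl

extern "C" {
void gpu_allocate(void** ptr, int64_t size);
void gpu_deallocate(void** ptr);
void gpu_upload(const void* cpu_ptr, void* gpu_ptr, int64_t n);  // n in bytes
void gpu_blas_create(blasHandle_t** ptr);
void gpu_blas_destroy(blasHandle_t** ptr);
void gpu_ddot(blasHandle_t* handle, int64_t n, const double* x, int64_t incx,
              const double* y, int64_t incy, double* result);
}

int main() {
  const int64_t n = 4;
  const double x[] = {1., 2., 3., 4.};
  const double y[] = {1., 1., 1., 1.};
  double *dx = nullptr, *dy = nullptr, result = 0.;
  blasHandle_t* handle = nullptr;

  gpu_allocate(reinterpret_cast<void**>(&dx), n * sizeof(double));
  gpu_allocate(reinterpret_cast<void**>(&dy), n * sizeof(double));
  gpu_upload(x, dx, n * sizeof(double));  // byte counts, per gpu.sycl
  gpu_upload(y, dy, n * sizeof(double));

  gpu_blas_create(&handle);
  gpu_ddot(handle, n, dx, 1, dy, 1, &result);  // expect 10.0
  gpu_blas_destroy(&handle);

  gpu_deallocate(reinterpret_cast<void**>(&dx));
  gpu_deallocate(reinterpret_cast<void**>(&dy));
  std::printf("ddot = %f\n", result);
  return 0;
}
```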
177 changes: 177 additions & 0 deletions plugins/local/gpu_intel/gpu.sycl
@@ -0,0 +1,177 @@
#include <CL/sycl.hpp>
#include <cassert>
#include <cstdlib>   // malloc/free used for the blasHandle_t wrapper
#include <iostream>  // std::cerr in gpu_allocate's error path
#include <limits>
#include <oneapi/mkl/blas.hpp>

extern "C" {

/* Generic functions */

/* This back-end currently exposes a single device to the caller. */
int gpu_ndevices() {
  return 1;
}

void gpu_set_device(int32_t igpu) {
  /* No-op: device selection is left to the SYCL runtime. */
}


/* Allocation functions */

void gpu_allocate(void** ptr, int64_t size) {
auto queue = sycl::queue(sycl::default_selector_v);

try {
*ptr = sycl::malloc_shared(size, queue);
assert(*ptr != nullptr);
} catch (const sycl::exception& e) {
std::cerr << "SYCL exception caught: " << e.what() << std::endl;
*ptr = nullptr; // If allocation fails, set pointer to nullptr
}
}

void gpu_deallocate(void** ptr) {
assert(*ptr != nullptr);
sycl::free(*ptr, sycl::queue(sycl::default_selector_v));
*ptr = nullptr;
}

/* Upload data from host to device */
void gpu_upload(const void* cpu_ptr, void* gpu_ptr, const int64_t n) {
sycl::queue queue(sycl::default_selector_v);
queue.memcpy(gpu_ptr, cpu_ptr, n).wait();
}

/* Download data from device to host */
void gpu_download(const void* gpu_ptr, void* cpu_ptr, const int64_t n) {
sycl::queue queue(sycl::default_selector_v);
queue.memcpy(cpu_ptr, gpu_ptr, n).wait();
}

/* Copy data from one GPU memory location to another */
void gpu_copy(const void* gpu_ptr_src, void* gpu_ptr_dest, const int64_t n) {
sycl::queue queue(sycl::default_selector_v);
queue.memcpy(gpu_ptr_dest, gpu_ptr_src, n).wait();
}

/* Queues */

/* SYCL queue as a replacement for CUDA stream */
void gpu_stream_create(sycl::queue** ptr) {
*ptr = new sycl::queue(sycl::default_selector_v);
}

void gpu_stream_destroy(sycl::queue** ptr) {
assert(*ptr != nullptr);
delete *ptr;
*ptr = nullptr;
}

void gpu_synchronize() {
  // Note: this constructs a fresh default queue and waits on it; work
  // submitted on queues created by gpu_stream_create is not awaited here.
  sycl::queue queue(sycl::default_selector_v);
  queue.wait_and_throw();
}

/* BLAS functions */
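
/* blasHandle_t is a minimal stand-in for a cuBLAS-style handle: it just
   carries the queue on which the BLAS wrappers below run. */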

typedef struct {
sycl::queue* queue;
} blasHandle_t;

void gpu_set_stream(blasHandle_t* handle, sycl::queue* ptr) {
handle->queue = ptr;
}

void gpu_blas_create(blasHandle_t** ptr) {
*ptr = (blasHandle_t*) malloc(sizeof(blasHandle_t));
assert(*ptr != nullptr);
(*ptr)->queue = new sycl::queue(sycl::default_selector_v);
assert((*ptr)->queue != nullptr);
}

void gpu_blas_destroy(blasHandle_t** ptr) {
assert(*ptr != nullptr);
delete (*ptr)->queue;
free(*ptr);
*ptr = nullptr;
}


void gpu_ddot(blasHandle_t* handle, const int64_t n, const double* x, const int64_t incx,
const double* y, const int64_t incy, double* result) {
// Ensure input parameters are valid
assert(handle != nullptr);
assert(handle->queue != nullptr);
assert(n > 0);
assert(incx > 0);
assert(incy > 0);
assert(x != nullptr);
assert(y != nullptr);
assert(result != nullptr);

  // The USM BLAS call is asynchronous: wait so *result is valid on return.
  oneapi::mkl::blas::dot(*handle->queue, n, x, incx, y, incy, result).wait();

}

void gpu_dgemv(blasHandle_t* handle, const char* transa, const int64_t m, const int64_t n, const double* alpha,
const double* a, const int64_t lda, const double* x, const int64_t incx, const double* beta, double* y, const int64_t incy) {

assert(handle != nullptr);
assert(handle->queue != nullptr);

// Validate matrix dimensions and increments to be positive
assert(m > 0 && n > 0 && lda > 0 && incx > 0 && incy > 0);
assert(a != nullptr && x != nullptr && y != nullptr && alpha != nullptr && beta != nullptr);

// Determine the operation type
oneapi::mkl::transpose transa_ = oneapi::mkl::transpose::nontrans;
if (*transa == 'T' || *transa == 't') {
transa_ = oneapi::mkl::transpose::trans;
}

// Perform DGEMV operation using oneMKL
oneapi::mkl::blas::column_major::gemv(*handle->queue, transa_, m, n, *alpha, a, lda, x, incx, *beta, y, incy);

}

void gpu_dgemm(blasHandle_t* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const int64_t k, const double* alpha,
const double* a, const int64_t lda, const double* b, const int64_t ldb, const double* beta, double* c, const int64_t ldc) {

assert(handle != nullptr && handle->queue != nullptr);
assert(m > 0 && n > 0 && k > 0 && lda > 0 && ldb > 0 && ldc > 0);
assert(a != nullptr && b != nullptr && c != nullptr && alpha != nullptr && beta != nullptr);

// Transpose operations
auto transa_ = (*transa == 'T' || *transa == 't') ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans;
auto transb_ = (*transb == 'T' || *transb == 't') ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans;

oneapi::mkl::blas::column_major::gemm(*handle->queue, transa_, transb_, m, n, k,
*alpha, a, lda, b, ldb, *beta, c, ldc);

}


void gpu_dgeam(blasHandle_t* handle, const char* transa, const char* transb, const int64_t m, const int64_t n, const double* alpha,
const double* a, const int64_t lda, const double* beta, const double* b, const int64_t ldb, double* c, const int64_t ldc) {
assert(handle != nullptr && handle->queue != nullptr);
assert(m > 0 && n > 0 && lda > 0 && ldb > 0 && ldc > 0);
assert(a != nullptr && b != nullptr && c != nullptr && alpha != nullptr && beta != nullptr);

  // Determine transpose operations, and read the scalars on the host:
  // the device kernel below must not dereference host pointers.
  bool transA = (*transa == 'T' || *transa == 't');
  bool transB = (*transb == 'T' || *transb == 't');
  const double alpha_ = *alpha;
  const double beta_  = *beta;

  handle->queue->submit([&](sycl::handler& cgh) {
    cgh.parallel_for(sycl::range<2>(m, n), [=](sycl::id<2> idx) {
      const int64_t i = idx[0];  // row of C
      const int64_t j = idx[1];  // column of C
      // Column-major indexing, consistent with the gemv/gemm calls above.
      const int64_t ai = transA ? i * lda + j : j * lda + i;
      const int64_t bi = transB ? i * ldb + j : j * ldb + i;
      const int64_t ci = j * ldc + i;

      c[ci] = alpha_ * a[ai] + beta_ * b[bi];
    });
  });

}

} // extern C
1 change: 1 addition & 0 deletions plugins/local/gpu_nvidia/LIB
@@ -0,0 +1 @@
-lcudart -lcublas -lcublasLt
1 change: 1 addition & 0 deletions plugins/local/gpu_nvidia/NEED
@@ -0,0 +1 @@

5 changes: 5 additions & 0 deletions plugins/local/gpu_nvidia/README.rst
@@ -0,0 +1,5 @@
==========
gpu_nvidia
==========

Nvidia implementation of GPU routines. Uses the CUDA and cuBLAS libraries.
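As with the Intel plugin, a compile-and-link sketch: the CUDA source file of this plugin is not shown in this diff, so the name gpu.cu is an assumption; only the `-l` flags come from the LIB file above.

```bash
# Hypothetical file name; link flags taken from plugins/local/gpu_nvidia/LIB
nvcc -c gpu.cu
g++ main.o gpu.o -lcudart -lcublas -lcublasLt
```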