Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
minmingzhu committed Oct 17, 2024
1 parent 37f936a commit 5bd92e2
Show file tree
Hide file tree
Showing 25 changed files with 80 additions and 129 deletions.
9 changes: 2 additions & 7 deletions mllib-dal/src/main/native/CorrelationImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -199,8 +199,7 @@ JNIEXPORT jlong JNICALL
Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
JNIEnv *env, jobject obj, jint rank, jlong pNumTabData, jlong numRows,
jlong numCols, jint executorNum, jint executorCores,
jint computeDeviceOrdinal, jintArray gpuIdxArray, jstring ip_port,
jobject resultObj) {
jint computeDeviceOrdinal, jintArray gpuIdxArray, jobject resultObj) {
logger::println(logger::INFO,
"oneDAL (native): use DPC++ kernels; device %s",
ComputeDeviceString[computeDeviceOrdinal].c_str());
Expand Down Expand Up @@ -230,13 +229,9 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
"oneDAL (native): use GPU kernels with rankid %d",
rank);

const char *str = env->GetStringUTFChars(ip_port, nullptr);
ccl::string ccl_ip_port(str);
auto comm = createDalCommunicator(executorNum, rank, ccl_ip_port);

auto comm = getDalComm();
doCorrelationOneAPICompute(env, pNumTabData, numRows, numCols, comm,
resultObj);
env->ReleaseStringUTFChars(ip_port, str);
break;
}
#endif
Expand Down
9 changes: 2 additions & 7 deletions mllib-dal/src/main/native/DecisionForestClassifierImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -306,8 +306,7 @@ Java_com_intel_oap_mllib_classification_RandomForestClassifierDALImpl_cRFClassif
jint treeCount, jint numFeaturesPerNode, jint minObservationsLeafNode,
jint minObservationsSplitNode, jdouble minWeightFractionLeafNode,
jdouble minImpurityDecreaseSplitNode, jint maxTreeDepth, jlong seed,
jint maxBins, jboolean bootstrap, jintArray gpuIdxArray, jstring ip_port,
jobject resultObj) {
jint maxBins, jboolean bootstrap, jintArray gpuIdxArray, jobject resultObj) {
logger::println(logger::INFO, "oneDAL (native): use DPC++ kernels");

ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
Expand All @@ -317,18 +316,14 @@ Java_com_intel_oap_mllib_classification_RandomForestClassifierDALImpl_cRFClassif
"oneDAL (native): use GPU kernels with rankid %d",
rank);

const char *str = env->GetStringUTFChars(ip_port, nullptr);
ccl::string ccl_ip_port(str);
auto comm = createDalCommunicator(executorNum, rank, ccl_ip_port);

auto comm = getDalComm();
jobject hashmapObj = doRFClassifierOneAPICompute(
env, pNumTabFeature, featureRows, featureCols, pNumTabLabel,
labelCols, executorNum, computeDeviceOrdinal, classCount, treeCount,
numFeaturesPerNode, minObservationsLeafNode,
minObservationsSplitNode, minWeightFractionLeafNode,
minImpurityDecreaseSplitNode, maxTreeDepth, seed, maxBins,
bootstrap, comm, resultObj);
env->ReleaseStringUTFChars(ip_port, str);
return hashmapObj;
}
default: {
Expand Down
8 changes: 2 additions & 6 deletions mllib-dal/src/main/native/DecisionForestRegressorImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ Java_com_intel_oap_mllib_regression_RandomForestRegressorDALImpl_cRFRegressorTra
jint executorNum, jint computeDeviceOrdinal, jint treeCount,
jint numFeaturesPerNode, jint minObservationsLeafNode, jint maxTreeDepth,
jlong seed, jint maxbins, jboolean bootstrap, jintArray gpuIdxArray,
jstring ip_port, jobject resultObj) {
jobject resultObj) {
logger::println(logger::INFO,
"OneDAL (native): use DPC++ kernels; device %s",
ComputeDeviceString[computeDeviceOrdinal].c_str());
Expand All @@ -309,16 +309,12 @@ Java_com_intel_oap_mllib_regression_RandomForestRegressorDALImpl_cRFRegressorTra
"OneDAL (native): use GPU kernels with rankid %d",
rank);

const char *str = env->GetStringUTFChars(ip_port, nullptr);
ccl::string ccl_ip_port(str);
auto comm = createDalCommunicator(executorNum, rank, ccl_ip_port);

auto comm = getDalComm();
jobject hashmapObj = doRFRegressorOneAPICompute(
env, pNumTabFeature, featureRows, featureCols, pNumTabLabel,
labelCols, executorNum, computeDeviceOrdinal, treeCount,
numFeaturesPerNode, minObservationsLeafNode, maxTreeDepth, seed,
maxbins, bootstrap, comm, resultObj);
env->ReleaseStringUTFChars(ip_port, str);
return hashmapObj;
}
default: {
Expand Down
45 changes: 2 additions & 43 deletions mllib-dal/src/main/native/GPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ typedef std::shared_ptr<sycl::queue> queuePtr;
static std::mutex g_mtx;
static std::vector<sycl::queue> g_queueVector;

static std::vector<sycl::device> get_gpus() {
std::vector<sycl::device> get_gpus() {
auto platforms = sycl::platform::get_platforms();
for (auto p : platforms) {
auto devices = p.get_devices(sycl::info::device_type::gpu);
Expand All @@ -25,6 +25,7 @@ static std::vector<sycl::device> get_gpus() {
}

static int getLocalRank(ccl::communicator &comm, int size, int rank) {
const int MPI_MAX_PROCESSOR_NAME = 128;
/* Obtain local rank among nodes sharing the same host name */
char zero = static_cast<char>(0);
std::vector<char> name(MPI_MAX_PROCESSOR_NAME + 1, zero);
Expand Down Expand Up @@ -112,45 +113,3 @@ sycl::queue getQueue(const ComputeDevice device) {
}
}
}

preview::spmd::communicator<preview::spmd::device_memory_access::usm>
createDalCommunicator(const jint executorNum, const jint rank,
const ccl::string ccl_ip_port) {
auto gpus = get_gpus();

auto t1 = std::chrono::high_resolution_clock::now();

ccl::init();

auto t2 = std::chrono::high_resolution_clock::now();
auto duration =
(float)std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1)
.count();

logger::println(logger::INFO, "OneCCL singleton init took %f secs",
duration / 1000);

t1 = std::chrono::high_resolution_clock::now();

auto kvs_attr = ccl::create_kvs_attr();

kvs_attr.set<ccl::kvs_attr_id::ip_port>(ccl_ip_port);

ccl::shared_ptr_class<ccl::kvs> kvs = ccl::create_main_kvs(kvs_attr);

t2 = std::chrono::high_resolution_clock::now();
duration =
(float)std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1)
.count();
logger::println(logger::INFO, "OneCCL (native): create kvs took %f secs",
duration / 1000);
sycl::queue queue{gpus[0]};
t1 = std::chrono::high_resolution_clock::now();
auto comm = preview::spmd::make_communicator<preview::spmd::backend::ccl>(
queue, executorNum, rank, kvs);
t2 = std::chrono::high_resolution_clock::now();
duration =
(float)std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1)
.count();
return comm;
}
6 changes: 2 additions & 4 deletions mllib-dal/src/main/native/GPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,8 @@
#include <jni.h>
#include <oneapi/ccl.hpp>

#include "Communicator.hpp"

sycl::queue getAssignedGPU(const ComputeDevice device, jint *gpu_indices);

sycl::queue getQueue(const ComputeDevice device);
preview::spmd::communicator<preview::spmd::device_memory_access::usm>
createDalCommunicator(jint executorNum, jint rank, ccl::string ccl_ip_port);

std::vector<sycl::device> get_gpus();
10 changes: 2 additions & 8 deletions mllib-dal/src/main/native/KMeansImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -308,8 +308,7 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe
JNIEnv *env, jobject obj, jint rank, jlong pNumTabData, jlong numRows,
jlong numCols, jlong pNumTabCenters, jint clusterNum, jdouble tolerance,
jint iterationNum, jint executorNum, jint executorCores,
jint computeDeviceOrdinal, jintArray gpuIdxArray, jstring ip_port,
jobject resultObj) {
jint computeDeviceOrdinal, jintArray gpuIdxArray, jobject resultObj) {
logger::println(logger::INFO,
"OneDAL (native): use DPC++ kernels; device %s",
ComputeDeviceString[computeDeviceOrdinal].c_str());
Expand Down Expand Up @@ -342,16 +341,11 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe
logger::println(logger::INFO,
"OneDAL (native): use GPU kernels with rankid %d",
rank);

const char *str = env->GetStringUTFChars(ip_port, nullptr);
ccl::string ccl_ip_port(str);
auto comm = createDalCommunicator(executorNum, rank, ccl_ip_port);

auto comm = getDalComm();
ret = doKMeansOneAPICompute(env, pNumTabData, numRows, numCols,
pNumTabCenters, clusterNum, tolerance,
iterationNum, comm, resultObj);

env->ReleaseStringUTFChars(ip_port, str);
break;
}
#endif
Expand Down
9 changes: 2 additions & 7 deletions mllib-dal/src/main/native/LinearRegressionImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,7 @@ Java_com_intel_oap_mllib_regression_LinearRegressionDALImpl_cLinearRegressionTra
jlong featureCols, jlong label, jlong labelCols, jboolean fitIntercept,
jdouble regParam, jdouble elasticNetParam, jint executorNum,
jint executorCores, jint computeDeviceOrdinal, jintArray gpuIdxArray,
jstring ip_port, jobject resultObj) {
jobject resultObj) {

logger::println(logger::INFO,
"oneDAL (native): use DPC++ kernels; device %s",
Expand All @@ -280,15 +280,10 @@ Java_com_intel_oap_mllib_regression_LinearRegressionDALImpl_cLinearRegressionTra
logger::println(logger::INFO,
"oneDAL (native): use GPU kernels with rankid %d",
rank);

const char *str = env->GetStringUTFChars(ip_port, nullptr);
ccl::string ccl_ip_port(str);
auto comm = createDalCommunicator(executorNum, rank, ccl_ip_port);

auto comm = getDalComm();
resultptr = doLROneAPICompute(env, rank, comm, feature, featureRows,
featureCols, label, labelCols,
fitIntercept, executorNum, resultObj);
env->ReleaseStringUTFChars(ip_port, str);
#endif
} else {
ccl::communicator &cclComm = getComm();
Expand Down
53 changes: 46 additions & 7 deletions mllib-dal/src/main/native/OneCCL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#include "OneCCL.h"
#include "com_intel_oap_mllib_OneCCL__.h"
#include "service.h"
#include "GPU.h"

extern const size_t ccl_root = 0;

Expand All @@ -44,32 +45,40 @@ static std::vector<ccl::communicator> g_comms;
static std::vector<ccl::shared_ptr_class<ccl::kvs>> g_kvs;

// Returns the process-wide host (CPU) CCL communicator; assumes c_1init has
// already pushed one entry into g_comms — calling before init is UB (empty vector).
ccl::communicator &getComm() { return g_comms[0]; }
// Returns the shared key-value store created at init time (used to bootstrap
// CCL communicators); same precondition as getComm(): g_kvs must be non-empty.
ccl::shared_ptr_class<ccl::kvs> &getKvs() { return g_kvs[0]; }
#ifdef CPU_GPU_PROFILE
static std::vector<oneapi::dal::preview::spmd::communicator<oneapi::dal::preview::spmd::device_memory_access::usm>> g_dal_comms;
// Returns the singleton oneDAL SPMD (GPU/USM) communicator created once in
// c_1init; replaces the per-call createDalCommunicator() this commit removes.
// Precondition: g_dal_comms is non-empty (GPU build, init already done).
oneapi::dal::preview::spmd::communicator<oneapi::dal::preview::spmd::device_memory_access::usm> &getDalComm() { return g_dal_comms[0]; }

ccl::shared_ptr_class<ccl::kvs> &getKvs() { return g_kvs[0]; }
#endif
JNIEXPORT jint JNICALL Java_com_intel_oap_mllib_OneCCL_00024_c_1init(
JNIEnv *env, jobject obj, jint size, jint rank, jstring ip_port,
jobject param) {

logger::println(logger::INFO, "OneCCL (native): init");

#ifdef CPU_GPU_PROFILE
auto gpus = get_gpus();
#endif

auto t1 = std::chrono::high_resolution_clock::now();

ccl::init();
auto t2 = std::chrono::high_resolution_clock::now();

auto duration =
(float)std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1)
.count();
logger::println(logger::INFO, "OneCCL (native): init took %f secs",
duration / 1000);
const char *str = env->GetStringUTFChars(ip_port, 0);
ccl::string ccl_ip_port(str);

#ifdef CPU_ONLY_PROFILE
auto &singletonCCLInit = CCLInitSingleton::get(size, rank, ccl_ip_port);

g_kvs.push_back(singletonCCLInit.kvs);
g_comms.push_back(
ccl::create_communicator(size, rank, singletonCCLInit.kvs));
auto duration =
(float)std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1)
.count();
logger::println(logger::INFO, "OneCCL (native): init took %f secs",
duration / 1000);

rank_id = getComm().rank();
comm_size = getComm().size();
Expand All @@ -80,6 +89,33 @@ JNIEXPORT jint JNICALL Java_com_intel_oap_mllib_OneCCL_00024_c_1init(

env->SetLongField(param, fid_comm_size, comm_size);
env->SetLongField(param, fid_rank_id, rank_id);
#endif

#ifdef CPU_GPU_PROFILE
auto kvs_attr = ccl::create_kvs_attr();

kvs_attr.set<ccl::kvs_attr_id::ip_port>(ccl_ip_port);

ccl::shared_ptr_class<ccl::kvs> kvs = ccl::create_main_kvs(kvs_attr);

t2 = std::chrono::high_resolution_clock::now();
duration =
(float)std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1)
.count();
logger::println(logger::INFO, "OneCCL (native): create kvs took %f secs",
duration / 1000);
sycl::queue queue{gpus[0]};
t1 = std::chrono::high_resolution_clock::now();
auto comm = oneapi::dal::preview::spmd::make_communicator<oneapi::dal::preview::spmd::backend::ccl>(
queue, size, rank, kvs);
t2 = std::chrono::high_resolution_clock::now();
duration =
(float)std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1)
.count();
logger::println(logger::INFO, "OneCCL (native): create communicator took %f secs",
duration / 1000);
g_dal_comms.push_back(comm);
#endif
env->ReleaseStringUTFChars(ip_port, str);

return 1;
Expand All @@ -90,6 +126,9 @@ Java_com_intel_oap_mllib_OneCCL_00024_c_1cleanup(JNIEnv *env, jobject obj) {
logger::printerrln(logger::INFO, "OneCCL (native): cleanup");
g_kvs.pop_back();
g_comms.pop_back();
#ifdef CPU_GPU_PROFILE
g_dal_comms.pop_back();
#endif
}

JNIEXPORT jboolean JNICALL
Expand Down
10 changes: 9 additions & 1 deletion mllib-dal/src/main/native/OneCCL.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
#pragma once

#include <oneapi/ccl.hpp>

#include <vector>

using namespace std;

namespace ccl {
Expand All @@ -44,4 +44,12 @@ event CCL_API gather(const BufferType *sendbuf, int sendcount,

ccl::communicator &getComm();
ccl::shared_ptr_class<ccl::kvs> &getKvs();

#ifdef CPU_GPU_PROFILE
#ifndef ONEDAL_DATA_PARALLEL
#define ONEDAL_DATA_PARALLEL
#endif
#include "Communicator.hpp"
oneapi::dal::preview::spmd::communicator<oneapi::dal::preview::spmd::device_memory_access::usm> &getDalComm();
#endif
extern const size_t ccl_root;
9 changes: 2 additions & 7 deletions mllib-dal/src/main/native/PCAImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -252,8 +252,7 @@ JNIEXPORT jlong JNICALL
Java_com_intel_oap_mllib_feature_PCADALImpl_cPCATrainDAL(
JNIEnv *env, jobject obj, jint rank, jlong pNumTabData, jlong numRows,
jlong numCols, jint executorNum, jint executorCores,
jint computeDeviceOrdinal, jintArray gpuIdxArray, jstring ip_port,
jobject resultObj) {
jint computeDeviceOrdinal, jintArray gpuIdxArray, jobject resultObj) {
logger::println(logger::INFO,
"oneDAL (native): use DPC++ kernels; device %s",
ComputeDeviceString[computeDeviceOrdinal].c_str());
Expand Down Expand Up @@ -282,12 +281,8 @@ Java_com_intel_oap_mllib_feature_PCADALImpl_cPCATrainDAL(
"oneDAL (native): use GPU kernels with rankid %d",
rank);

const char *str = env->GetStringUTFChars(ip_port, nullptr);
ccl::string ccl_ip_port(str);
auto comm = createDalCommunicator(executorNum, rank, ccl_ip_port);

auto comm = getDalComm();
doPCAOneAPICompute(env, pNumTabData, numRows, numCols, comm, resultObj);
env->ReleaseStringUTFChars(ip_port, str);
break;
}
#endif
Expand Down
8 changes: 2 additions & 6 deletions mllib-dal/src/main/native/SummarizerImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -270,8 +270,7 @@ JNIEXPORT jlong JNICALL
Java_com_intel_oap_mllib_stat_SummarizerDALImpl_cSummarizerTrainDAL(
JNIEnv *env, jobject obj, jint rank, jlong pNumTabData, jlong numRows,
jlong numCols, jint executorNum, jint executorCores,
jint computeDeviceOrdinal, jintArray gpuIdxArray, jstring ip_port,
jobject resultObj) {
jint computeDeviceOrdinal, jintArray gpuIdxArray, jobject resultObj) {
logger::println(logger::INFO,
"oneDAL (native): use DPC++ kernels; device %s",
ComputeDeviceString[computeDeviceOrdinal].c_str());
Expand Down Expand Up @@ -299,13 +298,10 @@ Java_com_intel_oap_mllib_stat_SummarizerDALImpl_cSummarizerTrainDAL(
logger::println(logger::INFO,
"oneDAL (native): use GPU kernels with rankid %d",
rank);
const char *str = env->GetStringUTFChars(ip_port, nullptr);
ccl::string ccl_ip_port(str);
auto comm = createDalCommunicator(executorNum, rank, ccl_ip_port);

auto comm = getDalComm();
doSummarizerOneAPICompute(env, pNumTabData, numRows, numCols, comm,
resultObj);
env->ReleaseStringUTFChars(ip_port, str);
break;
}
#endif
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 5bd92e2

Please sign in to comment.