Skip to content

Commit

Permalink
Update and optimize code
Browse files Browse the repository at this point in the history
  • Loading branch information
minmingzhu committed Aug 19, 2024
1 parent 9c509a1 commit 00e411a
Show file tree
Hide file tree
Showing 23 changed files with 80 additions and 148 deletions.
18 changes: 6 additions & 12 deletions mllib-dal/src/main/native/CorrelationImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ JNIEXPORT jlong JNICALL
Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
JNIEnv *env, jobject obj, jint rank, jlong pNumTabData, jlong numRows, jlong numCols,
jint executorNum, jint executorCores, jint computeDeviceOrdinal,
jintArray gpuIdxArray, jobject resultObj) {
jintArray gpuIdxArray, jstring ip_port, jobject resultObj) {
logger::println(logger::INFO,
"oneDAL (native): use DPC++ kernels; device %s",
ComputeDeviceString[computeDeviceOrdinal].c_str());
Expand All @@ -225,23 +225,17 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
}
#ifdef CPU_GPU_PROFILE
case ComputeDevice::gpu: {
int nGpu = env->GetArrayLength(gpuIdxArray);
logger::println(
logger::INFO,
"oneDAL (native): use GPU kernels with %d GPU(s) rankid %d", nGpu,
rank);
"oneDAL (native): use GPU kernels with rankid %d", rank);

jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0);
const char *str = env->GetStringUTFChars(ip_port, nullptr);
ccl::string ccl_ip_port(str);
auto comm = createDalCommunicator(executorNum, rank, ccl_ip_port);

auto queue = getAssignedGPU(device, gpuIndices);

ccl::shared_ptr_class<ccl::kvs> &kvs = getKvs();
auto comm =
preview::spmd::make_communicator<preview::spmd::backend::ccl>(
queue, executorNum, rank, kvs);
doCorrelationOneAPICompute(env, pNumTabData, numRows, numCols, comm,
resultObj);
env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
env->ReleaseStringUTFChars(ip_port, str);
break;
}
#endif
Expand Down
19 changes: 6 additions & 13 deletions mllib-dal/src/main/native/DecisionForestClassifierImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -307,35 +307,28 @@ Java_com_intel_oap_mllib_classification_RandomForestClassifierDALImpl_cRFClassif
jint minObservationsSplitNode, jdouble minWeightFractionLeafNode,
jdouble minImpurityDecreaseSplitNode, jint maxTreeDepth, jlong seed,
jint maxBins, jboolean bootstrap, jintArray gpuIdxArray,
jobject resultObj) {
jstring ip_port, jobject resultObj) {
logger::println(logger::INFO, "oneDAL (native): use DPC++ kernels");

ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
switch (device) {
case ComputeDevice::gpu: {
int nGpu = env->GetArrayLength(gpuIdxArray);
logger::println(
logger::INFO,
"oneDAL (native): use GPU kernels with %d GPU(s) rankid %d", nGpu,
rank);

jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0);

ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
"oneDAL (native): use GPU kernels with rankid %d", rank);

auto queue = getAssignedGPU(device, gpuIndices);
const char *str = env->GetStringUTFChars(ip_port, nullptr);
ccl::string ccl_ip_port(str);
auto comm = createDalCommunicator(executorNum, rank, ccl_ip_port);

ccl::shared_ptr_class<ccl::kvs> &kvs = getKvs();
auto comm =
preview::spmd::make_communicator<preview::spmd::backend::ccl>(
queue, executorNum, rank, kvs);
jobject hashmapObj = doRFClassifierOneAPICompute(
env, pNumTabFeature, featureRows, featureCols, pNumTabLabel,
labelCols, executorNum, computeDeviceOrdinal, classCount, treeCount,
numFeaturesPerNode, minObservationsLeafNode,
minObservationsSplitNode, minWeightFractionLeafNode,
minImpurityDecreaseSplitNode, maxTreeDepth, seed, maxBins,
bootstrap, comm, resultObj);
env->ReleaseStringUTFChars(ip_port, str);
return hashmapObj;
}
default: {
Expand Down
17 changes: 6 additions & 11 deletions mllib-dal/src/main/native/DecisionForestRegressorImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -296,33 +296,28 @@ Java_com_intel_oap_mllib_regression_RandomForestRegressorDALImpl_cRFRegressorTra
jlong featureCols, jlong pNumTabLabel, jlong labelCols, jint executorNum,
jint computeDeviceOrdinal, jint treeCount, jint numFeaturesPerNode,
jint minObservationsLeafNode, jint maxTreeDepth, jlong seed, jint maxbins,
jboolean bootstrap, jintArray gpuIdxArray, jobject resultObj) {
jboolean bootstrap, jintArray gpuIdxArray, jstring ip_port, jobject resultObj) {
logger::println(logger::INFO,
"OneDAL (native): use DPC++ kernels; device %s",
ComputeDeviceString[computeDeviceOrdinal].c_str());

ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
switch (device) {
case ComputeDevice::gpu: {
int nGpu = env->GetArrayLength(gpuIdxArray);
logger::println(
logger::INFO,
"OneDAL (native): use GPU kernels with %d GPU(s) rankid %d", nGpu,
rank);
"OneDAL (native): use GPU kernels with rankid %d", rank);

jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0);
const char *str = env->GetStringUTFChars(ip_port, nullptr);
ccl::string ccl_ip_port(str);
auto comm = createDalCommunicator(executorNum, rank, ccl_ip_port);

auto queue = getAssignedGPU(device, gpuIndices);

ccl::shared_ptr_class<ccl::kvs> &kvs = getKvs();
auto comm =
preview::spmd::make_communicator<preview::spmd::backend::ccl>(
queue, executorNum, rank, kvs);
jobject hashmapObj = doRFRegressorOneAPICompute(
env, pNumTabFeature, featureRows, featureCols, pNumTabLabel,
labelCols, executorNum, computeDeviceOrdinal, treeCount,
numFeaturesPerNode, minObservationsLeafNode, maxTreeDepth, seed,
maxbins, bootstrap, comm, resultObj);
env->ReleaseStringUTFChars(ip_port, str);
return hashmapObj;
}
default: {
Expand Down
6 changes: 0 additions & 6 deletions mllib-dal/src/main/native/GPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ static std::vector<sycl::device> get_gpus() {
}

static int getLocalRank(ccl::communicator &comm, int size, int rank) {
const int MPI_MAX_PROCESSOR_NAME = 128;
/* Obtain local rank among nodes sharing the same host name */
char zero = static_cast<char>(0);
std::vector<char> name(MPI_MAX_PROCESSOR_NAME + 1, zero);
Expand Down Expand Up @@ -128,8 +127,6 @@ preview::spmd::communicator<preview::spmd::device_memory_access::usm> createDalC

logger::println(logger::INFO, "OneCCL singleton init took %f secs",
duration / 1000);
logger::Logger::getInstance(c_breakdown_name).printLogToFile("rankID was %d, OneCCL singleton init took %f secs.", rank, duration / 1000 );


t1 = std::chrono::high_resolution_clock::now();

Expand All @@ -145,7 +142,6 @@ preview::spmd::communicator<preview::spmd::device_memory_access::usm> createDalC
.count();
logger::println(logger::INFO, "OneCCL (native): create kvs took %f secs",
duration / 1000);
logger::Logger::getInstance(c_breakdown_name).printLogToFile("rankID was %d, OneCCL create communicator took %f secs.", rank, duration / 1000 );
sycl::queue queue{gpus[0]};
t1 = std::chrono::high_resolution_clock::now();
auto comm =
Expand All @@ -155,7 +151,5 @@ preview::spmd::communicator<preview::spmd::device_memory_access::usm> createDalC
duration =
(float)std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1)
.count();
logger::Logger::getInstance(c_breakdown_name).printLogToFile("rankID was %d, create communicator took %f secs.", rank, duration / 1000 );
return comm;
}

2 changes: 1 addition & 1 deletion mllib-dal/src/main/native/KMeansImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -308,7 +308,7 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe
JNIEnv *env, jobject obj, jint rank, jlong pNumTabData, jlong numRows, jlong numCols,
jlong pNumTabCenters, jint clusterNum, jdouble tolerance, jint iterationNum,
jint executorNum, jint executorCores, jint computeDeviceOrdinal,
jintArray gpuIdxArray, jobject resultObj) {
jintArray gpuIdxArray, jstring ip_port, jobject resultObj) {
logger::println(logger::INFO,
"OneDAL (native): use DPC++ kernels; device %s",
ComputeDeviceString[computeDeviceOrdinal].c_str());
Expand Down
50 changes: 22 additions & 28 deletions mllib-dal/src/main/native/LinearRegressionImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ ridge_regression_compute(size_t rankId, ccl::communicator &comm,

#ifdef CPU_GPU_PROFILE
static jlong doLROneAPICompute(JNIEnv *env, size_t rankId,
ccl::communicator &cclComm, sycl::queue &queue,
preview::spmd::communicator<preview::spmd::device_memory_access::usm> comm,
jlong pNumTabFeature, jlong featureRows,
jlong featureCols, jlong pNumTabLabel,
jlong labelCols, jboolean jfitIntercept,
Expand All @@ -225,9 +225,6 @@ static jlong doLROneAPICompute(JNIEnv *env, size_t rankId,
const bool isRoot = (rankId == ccl_root);
bool fitIntercept = bool(jfitIntercept);

ccl::shared_ptr_class<ccl::kvs> &kvs = getKvs();
auto comm = preview::spmd::make_communicator<preview::spmd::backend::ccl>(
queue, executorNum, rankId, kvs);
homogen_table xtrain = *reinterpret_cast<homogen_table *>(
createHomogenTableWithArrayPtr(pNumTabFeature, featureRows, featureCols,
comm.get_queue())
Expand Down Expand Up @@ -265,7 +262,7 @@ Java_com_intel_oap_mllib_regression_LinearRegressionDALImpl_cLinearRegressionTra
jlong featureCols, jlong label, jlong labelCols, jboolean fitIntercept,
jdouble regParam, jdouble elasticNetParam, jint executorNum,
jint executorCores, jint computeDeviceOrdinal, jintArray gpuIdxArray,
jobject resultObj) {
jstring ip_port, jobject resultObj) {

logger::println(logger::INFO,
"oneDAL (native): use DPC++ kernels; device %s",
Expand All @@ -280,22 +277,23 @@ Java_com_intel_oap_mllib_regression_LinearRegressionDALImpl_cLinearRegressionTra
jlong resultptr = 0L;
if (useGPU) {
#ifdef CPU_GPU_PROFILE
int nGpu = env->GetArrayLength(gpuIdxArray);
logger::println(
logger::INFO,
"oneDAL (native): use GPU kernels with %d GPU(s) rankid %d", nGpu,
rank);
"oneDAL (native): use GPU kernels with rankid %d", rank);

jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0);
int size = cclComm.size();
auto queue = getAssignedGPU(device, gpuIndices);
const char *str = env->GetStringUTFChars(ip_port, nullptr);
ccl::string ccl_ip_port(str);
auto comm = createDalCommunicator(executorNum, rank, ccl_ip_port);

resultptr = doLROneAPICompute(
env, rank, cclComm, queue, feature, featureRows, featureCols,
env, rank, comm, feature, featureRows, featureCols,
label, labelCols, fitIntercept, executorNum, resultObj);
env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
env->ReleaseStringUTFChars(ip_port, str);
#endif
} else {
ccl::communicator &cclComm = getComm();
size_t rankId = cclComm.rank();

NumericTablePtr pLabel = *((NumericTablePtr *)label);
NumericTablePtr pData = *((NumericTablePtr *)feature);

Expand All @@ -318,22 +316,18 @@ Java_com_intel_oap_mllib_regression_LinearRegressionDALImpl_cLinearRegressionTra

NumericTablePtr *coeffvectors = new NumericTablePtr(resultTable);
resultptr = (jlong)coeffvectors;
}

jlong ret = 0L;
if (rankId == ccl_root) {
// Get the class of the result object
jclass clazz = env->GetObjectClass(resultObj);
// Get Field references
jfieldID coeffNumericTableField =
env->GetFieldID(clazz, "coeffNumericTable", "J");
if (rankId == ccl_root) {
// Get the class of the result object
jclass clazz = env->GetObjectClass(resultObj);
// Get Field references
jfieldID coeffNumericTableField =
env->GetFieldID(clazz, "coeffNumericTable", "J");

env->SetLongField(resultObj, coeffNumericTableField, resultptr);
env->SetLongField(resultObj, coeffNumericTableField, resultptr);

// intercept is already in first column of coeffvectors
ret = resultptr;
} else {
ret = (jlong)0;
// intercept is already in first column of coeffvectors
resultptr = (jlong)coeffvectors;
}
}
return ret;
return resultptr;
}
52 changes: 6 additions & 46 deletions mllib-dal/src/main/native/OneCCL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,72 +55,32 @@ JNIEXPORT jint JNICALL Java_com_intel_oap_mllib_OneCCL_00024_c_1init(
auto t1 = std::chrono::high_resolution_clock::now();

ccl::init();
auto t2 = std::chrono::high_resolution_clock::now();


const char *str = env->GetStringUTFChars(ip_port, 0);
ccl::string ccl_ip_port(str);
const char *device = env->GetStringUTFChars(use_device, 0);
ccl::string ccl_ip_port(str);

auto &singletonCCLInit = CCLInitSingleton::get(size, rank, ccl_ip_port);

g_kvs.push_back(singletonCCLInit.kvs);

#ifdef CPU_ONLY_PROFILE
g_comms.push_back(
ccl::create_communicator(size, rank, singletonCCLInit.kvs));

auto t2 = std::chrono::high_resolution_clock::now();
auto duration =
(float)std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1)
.count();
logger::println(logger::INFO, "OneCCL (native): init took %f secs",
duration / 1000);
#endif

jclass cls = env->GetObjectClass(param);
jfieldID fid_comm_size = env->GetFieldID(cls, "commSize", "J");
jfieldID fid_rank_id = env->GetFieldID(cls, "rankId", "J");

env->SetLongField(param, size, comm_size);
env->SetLongField(param, rank, rank_id);
env->ReleaseStringUTFChars(ip_port, str);

return 1;
}

/*
* Class: com_intel_oap_mllib_OneCCL__
* Method: c_init
* Signature: ()I
*/
JNIEXPORT jint JNICALL
Java_com_intel_oap_mllib_OneCCL_00024_c_1initDpcpp(JNIEnv *env, jobject, jint size, jint rank, jobject param) {
logger::printerrln(logger::INFO, "OneCCL (native): init dpcpp");
auto t1 = std::chrono::high_resolution_clock::now();

ccl::init();

const char *str = env->GetStringUTFChars(ip_port, 0);
ccl::string ccl_ip_port(str);

auto &singletonCCLInit = CCLInitSingleton::get(size, rank, ccl_ip_port);

g_kvs.push_back(singletonCCLInit.kvs);


auto t2 = std::chrono::high_resolution_clock::now();
auto duration =
(float)std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1)
.count();
logger::println(logger::INFO, "OneCCL (native): init took %f secs",
duration / 1000);
rank_id = getComm().rank();
comm_size = getComm().size();

jclass cls = env->GetObjectClass(param);
jfieldID fid_comm_size = env->GetFieldID(cls, "commSize", "J");
jfieldID fid_rank_id = env->GetFieldID(cls, "rankId", "J");

env->SetLongField(param, size, comm_size);
env->SetLongField(param, rank, rank_id);
env->SetLongField(param, fid_comm_size, comm_size);
env->SetLongField(param, fid_rank_id, rank_id);
env->ReleaseStringUTFChars(ip_port, str);

return 1;
Expand Down
18 changes: 6 additions & 12 deletions mllib-dal/src/main/native/PCAImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ JNIEXPORT jlong JNICALL
Java_com_intel_oap_mllib_feature_PCADALImpl_cPCATrainDAL(
JNIEnv *env, jobject obj, jint rank, jlong pNumTabData, jlong numRows, jlong numCols,
jint executorNum, jint executorCores, jint computeDeviceOrdinal,
jintArray gpuIdxArray, jobject resultObj) {
jintArray gpuIdxArray, jstring ip_port, jobject resultObj) {
logger::println(logger::INFO,
"oneDAL (native): use DPC++ kernels; device %s",
ComputeDeviceString[computeDeviceOrdinal].c_str());
Expand All @@ -277,22 +277,16 @@ Java_com_intel_oap_mllib_feature_PCADALImpl_cPCATrainDAL(
}
#ifdef CPU_GPU_PROFILE
case ComputeDevice::gpu: {
int nGpu = env->GetArrayLength(gpuIdxArray);
logger::println(
logger::INFO,
"oneDAL (native): use GPU kernels with %d GPU(s) rankid %d", nGpu,
rank);
"oneDAL (native): use GPU kernels with rankid %d", rank);

jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0);
const char *str = env->GetStringUTFChars(ip_port, nullptr);
ccl::string ccl_ip_port(str);
auto comm = createDalCommunicator(executorNum, rank, ccl_ip_port);

auto queue = getAssignedGPU(device, gpuIndices);

ccl::shared_ptr_class<ccl::kvs> &kvs = getKvs();
auto comm =
preview::spmd::make_communicator<preview::spmd::backend::ccl>(
queue, executorNum, rank, kvs);
doPCAOneAPICompute(env, pNumTabData, numRows, numCols, comm, resultObj);
env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
env->ReleaseStringUTFChars(ip_port, str);
break;
}
#endif
Expand Down
Loading

0 comments on commit 00e411a

Please sign in to comment.