From be55d63e6dc6ac0e631ae59c5eb5b254db63607f Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 18 Sep 2023 07:20:38 +0000 Subject: [PATCH 1/9] update spark to 3.3.3 Signed-off-by: minmingzhu --- README.md | 1 + examples/scala/pom-parent.xml | 2 +- mllib-dal/pom.xml | 2 +- .../com/intel/oap/mllib/classification/NaiveBayesShim.scala | 5 +++-- .../mllib/classification/RandomForestClassifierShim.scala | 6 +++--- .../scala/com/intel/oap/mllib/clustering/KMeansShim.scala | 5 +++-- .../main/scala/com/intel/oap/mllib/feature/PCAShim.scala | 5 +++-- .../scala/com/intel/oap/mllib/recommendation/ALSShim.scala | 4 ++-- .../intel/oap/mllib/regression/LinearRegressionShim.scala | 4 ++-- .../oap/mllib/regression/RandomForestRegressorShim.scala | 5 +++-- .../scala/com/intel/oap/mllib/stat/CorrelationShim.scala | 5 +++-- .../scala/com/intel/oap/mllib/stat/SummarizerShim.scala | 5 +++-- .../classification/{spark322 => spark333}/NaiveBayes.scala | 2 +- .../{spark322 => spark333}/RandomForestClassifier.scala | 2 +- .../spark/ml/clustering/{spark322 => spark333}/KMeans.scala | 2 +- .../spark/ml/feature/{spark322 => spark333}/PCA.scala | 2 +- .../ml/recommendation/{spark322 => spark333}/ALS.scala | 2 +- .../{spark322 => spark333}/LinearRegression.scala | 2 +- .../{spark322 => spark333}/RandomForestRegressor.scala | 2 +- .../spark/ml/stat/{spark322 => spark333}/Correlation.scala | 2 +- .../mllib/stat/{spark322 => spark333}/Statistics.scala | 2 +- 21 files changed, 37 insertions(+), 30 deletions(-) rename mllib-dal/src/main/scala/org/apache/spark/ml/classification/{spark322 => spark333}/NaiveBayes.scala (99%) rename mllib-dal/src/main/scala/org/apache/spark/ml/classification/{spark322 => spark333}/RandomForestClassifier.scala (99%) rename mllib-dal/src/main/scala/org/apache/spark/ml/clustering/{spark322 => spark333}/KMeans.scala (99%) rename mllib-dal/src/main/scala/org/apache/spark/ml/feature/{spark322 => spark333}/PCA.scala (98%) rename mllib-dal/src/main/scala/org/apache/spark/ml/recommendation/{spark322 => spark333}/ALS.scala (99%) rename mllib-dal/src/main/scala/org/apache/spark/ml/regression/{spark322 => spark333}/LinearRegression.scala (99%) rename mllib-dal/src/main/scala/org/apache/spark/ml/regression/{spark322 => spark333}/RandomForestRegressor.scala (99%) rename mllib-dal/src/main/scala/org/apache/spark/ml/stat/{spark322 => spark333}/Correlation.scala (99%) rename mllib-dal/src/main/scala/org/apache/spark/mllib/stat/{spark322 => spark333}/Statistics.scala (98%) diff --git a/README.md b/README.md index 5e2e0724d..55afbf855 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,7 @@ The following runtime packages with all their dependencies should be installed i * Apache Spark 3.2.0 * Apache Spark 3.2.1 * Apache Spark 3.2.2 +* Apache Spark 3.3.3 ### Supported IntelĀ® oneAPI Toolkits diff --git a/examples/scala/pom-parent.xml b/examples/scala/pom-parent.xml index d1fec21cc..7b20fa05b 100644 --- a/examples/scala/pom-parent.xml +++ b/examples/scala/pom-parent.xml @@ -29,7 +29,7 @@ 1.6.0 2.12.15 2.12 - 3.2.2 + 3.3.3 diff --git a/mllib-dal/pom.xml b/mllib-dal/pom.xml index caedfa050..f88829542 100644 --- a/mllib-dal/pom.xml +++ b/mllib-dal/pom.xml @@ -31,7 +31,7 @@ 2.12.15 2.12 3.2.9 - 3.2.2 + 3.3.3 2023.1.0.31217 src/assembly/assembly.xml diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/classification/NaiveBayesShim.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/classification/NaiveBayesShim.scala index dff679027..522a7f6af 100644 --- 
a/mllib-dal/src/main/scala/com/intel/oap/mllib/classification/NaiveBayesShim.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/classification/NaiveBayesShim.scala @@ -20,7 +20,7 @@ import com.intel.oap.mllib.Utils import org.apache.spark.internal.Logging import org.apache.spark.ml.classification.NaiveBayesModel -import org.apache.spark.ml.classification.spark322.{NaiveBayes => NaiveBayesSpark322} +import org.apache.spark.ml.classification.spark333.{NaiveBayes => NaiveBayesSpark333} import org.apache.spark.ml.param.ParamMap import org.apache.spark.sql.Dataset import org.apache.spark.{SPARK_VERSION, SparkException} @@ -35,7 +35,8 @@ object NaiveBayesShim extends Logging { logInfo(s"Loading NaiveBayes for Spark $SPARK_VERSION") val shim = Utils.getSparkVersion() match { - case "3.1.1" | "3.1.2" | "3.1.3" | "3.2.0" | "3.2.1" | "3.2.2" => new NaiveBayesSpark322(uid) + case "3.1.1" | "3.1.2" | "3.1.3" | "3.2.0" | "3.2.1" | "3.2.2" | "3.3.3" => + new NaiveBayesSpark333(uid) case _ => throw new SparkException(s"Unsupported Spark version $SPARK_VERSION") } shim diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/classification/RandomForestClassifierShim.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/classification/RandomForestClassifierShim.scala index 94831c8d7..66c39f9d3 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/classification/RandomForestClassifierShim.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/classification/RandomForestClassifierShim.scala @@ -19,7 +19,7 @@ import com.intel.oap.mllib.Utils import org.apache.spark.internal.Logging import org.apache.spark.ml.classification.RandomForestClassificationModel import org.apache.spark.{SPARK_VERSION, SparkException} -import org.apache.spark.ml.classification.spark322.{RandomForestClassifier => RandomForestClassifier322} +import org.apache.spark.ml.classification.spark333.{RandomForestClassifier => RandomForestClassifier333} import org.apache.spark.ml.param.ParamMap import org.apache.spark.sql.Dataset @@ -33,8 +33,8 @@ object RandomForestClassifierShim extends Logging { logInfo(s"Loading RandomForestClassifier for Spark $SPARK_VERSION") val shim = Utils.getSparkVersion() match { - case "3.1.1" | "3.1.2" | "3.1.3" | "3.2.0" | "3.2.1" | "3.2.2" => - new RandomForestClassifier322(uid) + case "3.1.1" | "3.1.2" | "3.1.3" | "3.2.0" | "3.2.1" | "3.2.2" | "3.3.3" => + new RandomForestClassifier333(uid) case _ => throw new SparkException(s"Unsupported Spark version $SPARK_VERSION") } shim diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/clustering/KMeansShim.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/clustering/KMeansShim.scala index 66e16c754..8c969142b 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/clustering/KMeansShim.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/clustering/KMeansShim.scala @@ -20,7 +20,7 @@ import com.intel.oap.mllib.Utils import org.apache.spark.internal.Logging import org.apache.spark.ml.clustering.{KMeans, KMeansModel} -import org.apache.spark.ml.clustering.spark322.{KMeans => KMeansSpark322} +import org.apache.spark.ml.clustering.spark333.{KMeans => KMeansSpark333} import org.apache.spark.ml.param.ParamMap import org.apache.spark.sql.Dataset import org.apache.spark.{SPARK_VERSION, SparkException} @@ -34,7 +34,8 @@ object KMeansShim extends Logging { def create(uid: String): KMeansShim = { logInfo(s"Loading KMeans for Spark $SPARK_VERSION") val kmeans = Utils.getSparkVersion() match { - case "3.1.1" | "3.1.2" | "3.1.3" | "3.2.0" | "3.2.1" | "3.2.2" 
=> new KMeansSpark322(uid) + case "3.1.1" | "3.1.2" | "3.1.3" | "3.2.0" | "3.2.1" | "3.2.2" | "3.3.3" => + new KMeansSpark333(uid) case _ => throw new SparkException(s"Unsupported Spark version $SPARK_VERSION") } kmeans diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/feature/PCAShim.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/feature/PCAShim.scala index 0f2df5e10..4b656bf6e 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/feature/PCAShim.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/feature/PCAShim.scala @@ -20,7 +20,7 @@ import com.intel.oap.mllib.Utils import org.apache.spark.internal.Logging import org.apache.spark.ml.feature.PCAModel -import org.apache.spark.ml.feature.spark322.{PCA => PCASpark322} +import org.apache.spark.ml.feature.spark333.{PCA => PCASpark333} import org.apache.spark.ml.param.ParamMap import org.apache.spark.sql.Dataset import org.apache.spark.{SPARK_VERSION, SparkException} @@ -34,7 +34,8 @@ object PCAShim extends Logging { def create(uid: String): PCAShim = { logInfo(s"Loading PCA for Spark $SPARK_VERSION") val pca = Utils.getSparkVersion() match { - case "3.1.1" | "3.1.2" | "3.1.3" | "3.2.0" | "3.2.1" | "3.2.2" => new PCASpark322(uid) + case "3.1.1" | "3.1.2" | "3.1.3" | "3.2.0" | "3.2.1" | "3.2.2" | "3.3.3" => + new PCASpark333(uid) case _ => throw new SparkException(s"Unsupported Spark version $SPARK_VERSION") } pca diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/recommendation/ALSShim.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/recommendation/ALSShim.scala index f754df8b9..e2c50f41e 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/recommendation/ALSShim.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/recommendation/ALSShim.scala @@ -21,7 +21,7 @@ import com.intel.oap.mllib.Utils import org.apache.spark.internal.Logging import org.apache.spark.ml.recommendation.ALS.Rating import org.apache.spark.ml.recommendation.spark313.{ALS => ALSSpark313} -import org.apache.spark.ml.recommendation.spark322.{ALS => ALSSpark322} +import org.apache.spark.ml.recommendation.spark333.{ALS => ALSSpark333} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.{SPARK_VERSION, SparkException} @@ -50,7 +50,7 @@ object ALSShim extends Logging { logInfo(s"Loading ALS for Spark $SPARK_VERSION") val als = Utils.getSparkVersion() match { case "3.1.1" | "3.1.2" | "3.1.3" => new ALSSpark313() - case "3.2.0" | "3.2.1" | "3.2.2" => new ALSSpark322() + case "3.2.0" | "3.2.1" | "3.2.2" | "3.3.3" => new ALSSpark333() case _ => throw new SparkException(s"Unsupported Spark version $SPARK_VERSION") } als diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionShim.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionShim.scala index 8782d7095..fc9aea021 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionShim.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionShim.scala @@ -22,7 +22,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.regression.LinearRegressionModel import org.apache.spark.ml.regression.spark313.{LinearRegression => LinearRegressionSpark313} -import org.apache.spark.ml.regression.spark322.{LinearRegression => LinearRegressionSpark322} +import org.apache.spark.ml.regression.spark333.{LinearRegression => LinearRegressionSpark333} import org.apache.spark.sql.Dataset import 
org.apache.spark.{SPARK_VERSION, SparkException} @@ -36,7 +36,7 @@ object LinearRegressionShim extends Logging { logInfo(s"Loading ALS for Spark $SPARK_VERSION") val linearRegression = Utils.getSparkVersion() match { case "3.1.1" | "3.1.2" | "3.1.3" => new LinearRegressionSpark313(uid) - case "3.2.0" | "3.2.1" | "3.2.2" => new LinearRegressionSpark322(uid) + case "3.2.0" | "3.2.1" | "3.2.2" | "3.3.3" => new LinearRegressionSpark333(uid) case _ => throw new SparkException(s"Unsupported Spark version $SPARK_VERSION") } linearRegression diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/RandomForestRegressorShim.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/RandomForestRegressorShim.scala index 6cffe1e11..a056fd08e 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/RandomForestRegressorShim.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/RandomForestRegressorShim.scala @@ -20,7 +20,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.{SPARK_VERSION, SparkException} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.regression.RandomForestRegressionModel -import org.apache.spark.ml.regression.spark322.{RandomForestRegressor => RandomForestRegressor322} +import org.apache.spark.ml.regression.spark333.{RandomForestRegressor => RandomForestRegressor333} import org.apache.spark.sql.Dataset trait RandomForestRegressorShim extends Logging { @@ -33,7 +33,8 @@ object RandomForestRegressorShim extends Logging { logInfo(s"Loading RandomForestClassifier for Spark $SPARK_VERSION") val shim = Utils.getSparkVersion() match { - case "3.1.1" | "3.1.2" | "3.1.3" | "3.2.0" | "3.2.1" | "3.2.2" => new RandomForestRegressor322(uid) + case "3.1.1" | "3.1.2" | "3.1.3" | "3.2.0" | "3.2.1" | "3.2.2" | "3.3.3" => + new RandomForestRegressor333(uid) case _ => throw new SparkException(s"Unsupported Spark version $SPARK_VERSION") } shim diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/CorrelationShim.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/CorrelationShim.scala index 90b9f6d29..036955223 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/CorrelationShim.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/CorrelationShim.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.storage.StorageLevel import scala.reflect.ClassTag -import org.apache.spark.ml.stat.spark322.{Correlation => CorrelationSpark322} +import org.apache.spark.ml.stat.spark333.{Correlation => CorrelationSpark333} trait CorrelationShim extends Serializable with Logging { def corr(dataset: Dataset[_], column: String, method: String): DataFrame @@ -35,7 +35,8 @@ object CorrelationShim extends Logging { def create(): CorrelationShim = { logInfo(s"Loading Correlation for Spark $SPARK_VERSION") val als = Utils.getSparkVersion() match { - case "3.1.1" | "3.1.2" | "3.1.3" | "3.2.0" | "3.2.1" | "3.2.2" => new CorrelationSpark322() + case "3.1.1" | "3.1.2" | "3.1.3" | "3.2.0" | "3.2.1" | "3.2.2" | "3.3.3" => + new CorrelationSpark333() case _ => throw new SparkException(s"Unsupported Spark version $SPARK_VERSION") } als diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/SummarizerShim.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/SummarizerShim.scala index 5f3ff92df..38efa04a0 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/SummarizerShim.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/SummarizerShim.scala @@ 
-24,7 +24,7 @@ import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.stat.MultivariateStatisticalSummary import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Dataset} -import org.apache.spark.mllib.stat.spark322.{Statistics => SummarizerSpark322} +import org.apache.spark.mllib.stat.spark333.{Statistics => SummarizerSpark333} trait SummarizerShim extends Serializable with Logging { def colStats(X: RDD[Vector]): MultivariateStatisticalSummary @@ -35,7 +35,8 @@ object SummarizerShim extends Logging { def create(): SummarizerShim = { logInfo(s"Loading Summarizer for Spark $SPARK_VERSION") val summarizer = Utils.getSparkVersion() match { - case "3.1.1" | "3.1.2" | "3.1.3" | "3.2.0" | "3.2.1" | "3.2.2" => new SummarizerSpark322() + case "3.1.1" | "3.1.2" | "3.1.3" | "3.2.0" | "3.2.1" | "3.2.2" | "3.3.3" => + new SummarizerSpark333() case _ => throw new SparkException(s"Unsupported Spark version $SPARK_VERSION") } summarizer diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/classification/spark322/NaiveBayes.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/classification/spark333/NaiveBayes.scala similarity index 99% rename from mllib-dal/src/main/scala/org/apache/spark/ml/classification/spark322/NaiveBayes.scala rename to mllib-dal/src/main/scala/org/apache/spark/ml/classification/spark333/NaiveBayes.scala index d59654305..12936022e 100644 --- a/mllib-dal/src/main/scala/org/apache/spark/ml/classification/spark322/NaiveBayes.scala +++ b/mllib-dal/src/main/scala/org/apache/spark/ml/classification/spark333/NaiveBayes.scala @@ -17,7 +17,7 @@ */ // scalastyle:on -package org.apache.spark.ml.classification.spark322 +package org.apache.spark.ml.classification.spark333 import com.intel.oap.mllib.Utils import com.intel.oap.mllib.classification.{NaiveBayesDALImpl, NaiveBayesShim} diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/classification/spark322/RandomForestClassifier.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/classification/spark333/RandomForestClassifier.scala similarity index 99% rename from mllib-dal/src/main/scala/org/apache/spark/ml/classification/spark322/RandomForestClassifier.scala rename to mllib-dal/src/main/scala/org/apache/spark/ml/classification/spark333/RandomForestClassifier.scala index 6ce3e8261..9777b1413 100644 --- a/mllib-dal/src/main/scala/org/apache/spark/ml/classification/spark322/RandomForestClassifier.scala +++ b/mllib-dal/src/main/scala/org/apache/spark/ml/classification/spark333/RandomForestClassifier.scala @@ -17,7 +17,7 @@ */ // scalastyle:on -package org.apache.spark.ml.classification.spark322 +package org.apache.spark.ml.classification.spark333 import com.intel.oap.mllib.Utils import com.intel.oap.mllib.classification.{LearningNode => LearningNodeDAL, RandomForestClassifierDALImpl, RandomForestClassifierShim} diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/clustering/spark322/KMeans.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/clustering/spark333/KMeans.scala similarity index 99% rename from mllib-dal/src/main/scala/org/apache/spark/ml/clustering/spark322/KMeans.scala rename to mllib-dal/src/main/scala/org/apache/spark/ml/clustering/spark333/KMeans.scala index 8d47377ac..07b56a837 100644 --- a/mllib-dal/src/main/scala/org/apache/spark/ml/clustering/spark322/KMeans.scala +++ b/mllib-dal/src/main/scala/org/apache/spark/ml/clustering/spark333/KMeans.scala @@ -17,7 +17,7 @@ */ // scalastyle:on -package org.apache.spark.ml.clustering.spark322 +package 
org.apache.spark.ml.clustering.spark333 import com.intel.oap.mllib.Utils import com.intel.oap.mllib.clustering.{KMeansDALImpl, KMeansShim} diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/feature/spark322/PCA.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/feature/spark333/PCA.scala similarity index 98% rename from mllib-dal/src/main/scala/org/apache/spark/ml/feature/spark322/PCA.scala rename to mllib-dal/src/main/scala/org/apache/spark/ml/feature/spark333/PCA.scala index cc99b1779..e0b32fc22 100644 --- a/mllib-dal/src/main/scala/org/apache/spark/ml/feature/spark322/PCA.scala +++ b/mllib-dal/src/main/scala/org/apache/spark/ml/feature/spark333/PCA.scala @@ -17,7 +17,7 @@ */ // scalastyle:on -package org.apache.spark.ml.feature.spark322 +package org.apache.spark.ml.feature.spark333 import com.intel.oap.mllib.Utils import com.intel.oap.mllib.feature.{PCADALImpl, PCAShim} diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/recommendation/spark322/ALS.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/recommendation/spark333/ALS.scala similarity index 99% rename from mllib-dal/src/main/scala/org/apache/spark/ml/recommendation/spark322/ALS.scala rename to mllib-dal/src/main/scala/org/apache/spark/ml/recommendation/spark333/ALS.scala index 81e2d8300..2fe5cd29f 100644 --- a/mllib-dal/src/main/scala/org/apache/spark/ml/recommendation/spark322/ALS.scala +++ b/mllib-dal/src/main/scala/org/apache/spark/ml/recommendation/spark333/ALS.scala @@ -17,7 +17,7 @@ */ // scalastyle:on -package org.apache.spark.ml.recommendation.spark322 +package org.apache.spark.ml.recommendation.spark333 import com.github.fommil.netlib.BLAS.{getInstance => blas} import com.intel.oap.mllib.{Utils => DALUtils} diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/regression/spark322/LinearRegression.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/regression/spark333/LinearRegression.scala similarity index 99% rename from mllib-dal/src/main/scala/org/apache/spark/ml/regression/spark322/LinearRegression.scala rename to mllib-dal/src/main/scala/org/apache/spark/ml/regression/spark333/LinearRegression.scala index ac861b2a6..a921dfbfc 100644 --- a/mllib-dal/src/main/scala/org/apache/spark/ml/regression/spark322/LinearRegression.scala +++ b/mllib-dal/src/main/scala/org/apache/spark/ml/regression/spark333/LinearRegression.scala @@ -17,7 +17,7 @@ */ // scalastyle:on -package org.apache.spark.ml.regression.spark322 +package org.apache.spark.ml.regression.spark333 import breeze.linalg.{DenseVector => BDV} import breeze.optimize.{ diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/regression/spark322/RandomForestRegressor.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/regression/spark333/RandomForestRegressor.scala similarity index 99% rename from mllib-dal/src/main/scala/org/apache/spark/ml/regression/spark322/RandomForestRegressor.scala rename to mllib-dal/src/main/scala/org/apache/spark/ml/regression/spark333/RandomForestRegressor.scala index 2b1557fc2..479e7fdd1 100644 --- a/mllib-dal/src/main/scala/org/apache/spark/ml/regression/spark322/RandomForestRegressor.scala +++ b/mllib-dal/src/main/scala/org/apache/spark/ml/regression/spark333/RandomForestRegressor.scala @@ -17,7 +17,7 @@ */ // scalastyle:on -package org.apache.spark.ml.regression.spark322 +package org.apache.spark.ml.regression.spark333 import com.intel.oap.mllib.Utils import com.intel.oap.mllib.classification.{LearningNode => LearningNodeDAL} diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/stat/spark322/Correlation.scala 
b/mllib-dal/src/main/scala/org/apache/spark/ml/stat/spark333/Correlation.scala similarity index 99% rename from mllib-dal/src/main/scala/org/apache/spark/ml/stat/spark322/Correlation.scala rename to mllib-dal/src/main/scala/org/apache/spark/ml/stat/spark333/Correlation.scala index 1d456ea62..2fe66fe85 100644 --- a/mllib-dal/src/main/scala/org/apache/spark/ml/stat/spark322/Correlation.scala +++ b/mllib-dal/src/main/scala/org/apache/spark/ml/stat/spark333/Correlation.scala @@ -17,7 +17,7 @@ */ // scalastyle:on -package org.apache.spark.ml.stat.spark322 +package org.apache.spark.ml.stat.spark333 import com.intel.oap.mllib.Utils import com.intel.oap.mllib.stat.{CorrelationDALImpl, CorrelationShim} diff --git a/mllib-dal/src/main/scala/org/apache/spark/mllib/stat/spark322/Statistics.scala b/mllib-dal/src/main/scala/org/apache/spark/mllib/stat/spark333/Statistics.scala similarity index 98% rename from mllib-dal/src/main/scala/org/apache/spark/mllib/stat/spark322/Statistics.scala rename to mllib-dal/src/main/scala/org/apache/spark/mllib/stat/spark333/Statistics.scala index c36fd18bd..222c1139e 100644 --- a/mllib-dal/src/main/scala/org/apache/spark/mllib/stat/spark322/Statistics.scala +++ b/mllib-dal/src/main/scala/org/apache/spark/mllib/stat/spark333/Statistics.scala @@ -14,7 +14,7 @@ * limitations under the License. */ -package org.apache.spark.mllib.stat.spark322 +package org.apache.spark.mllib.stat.spark333 import com.intel.oap.mllib.Utils import com.intel.oap.mllib.stat.{SummarizerDALImpl, SummarizerShim} From 75c3a8cc1e18f4dcdec9844d4eb188a4c22fbb69 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Thu, 25 Jul 2024 10:56:02 +0800 Subject: [PATCH 2/9] remove oneccl communicator --- mllib-dal/src/main/native/CorrelationImpl.cpp | 15 ++--- .../native/DecisionForestClassifierImpl.cpp | 12 ++-- .../native/DecisionForestRegressorImpl.cpp | 13 ++-- mllib-dal/src/main/native/GPU.cpp | 16 +---- mllib-dal/src/main/native/GPU.h | 3 +- mllib-dal/src/main/native/KMeansImpl.cpp | 16 +++-- .../src/main/native/LinearRegressionImpl.cpp | 15 ++--- mllib-dal/src/main/native/OneCCL.cpp | 63 ++++++++++++++----- mllib-dal/src/main/native/PCAImpl.cpp | 16 ++--- mllib-dal/src/main/native/SummarizerImpl.cpp | 19 +++--- .../javah/com_intel_oap_mllib_OneCCL__.h | 2 +- ...sification_RandomForestClassifierDALImpl.h | 2 +- ...intel_oap_mllib_clustering_KMeansDALImpl.h | 2 +- .../com_intel_oap_mllib_feature_PCADALImpl.h | 2 +- ...mllib_regression_LinearRegressionDALImpl.h | 2 +- ..._regression_RandomForestRegressorDALImpl.h | 2 +- ..._intel_oap_mllib_stat_CorrelationDALImpl.h | 2 +- ...m_intel_oap_mllib_stat_SummarizerDALImpl.h | 2 +- .../scala/com/intel/oap/mllib/OneCCL.scala | 7 ++- .../RandomForestClassifierDALImpl.scala | 6 +- .../oap/mllib/clustering/KMeansDALImpl.scala | 6 +- .../intel/oap/mllib/feature/PCADALImpl.scala | 6 +- .../regression/LinearRegressionDALImpl.scala | 6 +- .../RandomForestRegressorDALImpl.scala | 6 +- .../oap/mllib/stat/CorrelationDALImpl.scala | 6 +- .../oap/mllib/stat/SummarizerDALImpl.scala | 6 +- 26 files changed, 129 insertions(+), 124 deletions(-) diff --git a/mllib-dal/src/main/native/CorrelationImpl.cpp b/mllib-dal/src/main/native/CorrelationImpl.cpp index f2dcac75b..a9103102f 100644 --- a/mllib-dal/src/main/native/CorrelationImpl.cpp +++ b/mllib-dal/src/main/native/CorrelationImpl.cpp @@ -197,19 +197,19 @@ static void doCorrelationOneAPICompute( JNIEXPORT jlong JNICALL Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL( - JNIEnv *env, jobject obj, jlong pNumTabData, 
jlong numRows, jlong numCols, + JNIEnv *env, jobject obj, jint rank, jlong pNumTabData, jlong numRows, jlong numCols, jint executorNum, jint executorCores, jint computeDeviceOrdinal, jintArray gpuIdxArray, jobject resultObj) { logger::println(logger::INFO, "oneDAL (native): use DPC++ kernels; device %s", ComputeDeviceString[computeDeviceOrdinal].c_str()); - ccl::communicator &cclComm = getComm(); - int rankId = cclComm.rank(); ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal); switch (device) { case ComputeDevice::host: case ComputeDevice::cpu: { + ccl::communicator &cclComm = getComm(); + int rankId = cclComm.rank(); NumericTablePtr pData = *((NumericTablePtr *)pNumTabData); // Set number of threads for oneDAL to use for each rank services::Environment::getInstance()->setNumberOfThreads(executorCores); @@ -229,19 +229,16 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL( logger::println( logger::INFO, "oneDAL (native): use GPU kernels with %d GPU(s) rankid %d", nGpu, - rankId); + rank); jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0); - int size = cclComm.size(); - - auto queue = - getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu); + auto queue = getAssignedGPU(device, gpuIndices); ccl::shared_ptr_class &kvs = getKvs(); auto comm = preview::spmd::make_communicator( - queue, size, rankId, kvs); + queue, executorNum, rank, kvs); doCorrelationOneAPICompute(env, pNumTabData, numRows, numCols, comm, resultObj); env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0); diff --git a/mllib-dal/src/main/native/DecisionForestClassifierImpl.cpp b/mllib-dal/src/main/native/DecisionForestClassifierImpl.cpp index c1d064d1b..aad8d9048 100644 --- a/mllib-dal/src/main/native/DecisionForestClassifierImpl.cpp +++ b/mllib-dal/src/main/native/DecisionForestClassifierImpl.cpp @@ -300,7 +300,7 @@ static jobject doRFClassifierOneAPICompute( */ JNIEXPORT jobject JNICALL Java_com_intel_oap_mllib_classification_RandomForestClassifierDALImpl_cRFClassifierTrainDAL( - JNIEnv *env, jobject obj, jlong pNumTabFeature, jlong featureRows, + JNIEnv *env, jobject obj, jint rank, jlong pNumTabFeature, jlong featureRows, jlong featureCols, jlong pNumTabLabel, jlong labelCols, jint executorNum, jint computeDeviceOrdinal, jint classCount, jint treeCount, jint numFeaturesPerNode, jint minObservationsLeafNode, @@ -310,8 +310,6 @@ Java_com_intel_oap_mllib_classification_RandomForestClassifierDALImpl_cRFClassif jobject resultObj) { logger::println(logger::INFO, "oneDAL (native): use DPC++ kernels"); - ccl::communicator &cclComm = getComm(); - int rankId = cclComm.rank(); ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal); switch (device) { case ComputeDevice::gpu: { @@ -319,20 +317,18 @@ Java_com_intel_oap_mllib_classification_RandomForestClassifierDALImpl_cRFClassif logger::println( logger::INFO, "oneDAL (native): use GPU kernels with %d GPU(s) rankid %d", nGpu, - rankId); + rank); jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0); - int size = cclComm.size(); ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal); - auto queue = - getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu); + auto queue = getAssignedGPU(device, gpuIndices); ccl::shared_ptr_class &kvs = getKvs(); auto comm = preview::spmd::make_communicator( - queue, size, rankId, kvs); + queue, executorNum, rank, kvs); jobject hashmapObj = doRFClassifierOneAPICompute( env, pNumTabFeature, featureRows, featureCols, pNumTabLabel, labelCols, 
executorNum, computeDeviceOrdinal, classCount, treeCount, diff --git a/mllib-dal/src/main/native/DecisionForestRegressorImpl.cpp b/mllib-dal/src/main/native/DecisionForestRegressorImpl.cpp index 7619c2879..853f736de 100644 --- a/mllib-dal/src/main/native/DecisionForestRegressorImpl.cpp +++ b/mllib-dal/src/main/native/DecisionForestRegressorImpl.cpp @@ -292,7 +292,7 @@ static jobject doRFRegressorOneAPICompute( JNIEXPORT jobject JNICALL Java_com_intel_oap_mllib_regression_RandomForestRegressorDALImpl_cRFRegressorTrainDAL( - JNIEnv *env, jobject obj, jlong pNumTabFeature, jlong featureRows, + JNIEnv *env, jobject obj, jint rank, jlong pNumTabFeature, jlong featureRows, jlong featureCols, jlong pNumTabLabel, jlong labelCols, jint executorNum, jint computeDeviceOrdinal, jint treeCount, jint numFeaturesPerNode, jint minObservationsLeafNode, jint maxTreeDepth, jlong seed, jint maxbins, @@ -301,8 +301,6 @@ Java_com_intel_oap_mllib_regression_RandomForestRegressorDALImpl_cRFRegressorTra "OneDAL (native): use DPC++ kernels; device %s", ComputeDeviceString[computeDeviceOrdinal].c_str()); - ccl::communicator &cclComm = getComm(); - int rankId = cclComm.rank(); ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal); switch (device) { case ComputeDevice::gpu: { @@ -310,19 +308,16 @@ Java_com_intel_oap_mllib_regression_RandomForestRegressorDALImpl_cRFRegressorTra logger::println( logger::INFO, "OneDAL (native): use GPU kernels with %d GPU(s) rankid %d", nGpu, - rankId); + rank); jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0); - int size = cclComm.size(); - - auto queue = - getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu); + auto queue = getAssignedGPU(device, gpuIndices); ccl::shared_ptr_class &kvs = getKvs(); auto comm = preview::spmd::make_communicator( - queue, size, rankId, kvs); + queue, executorNum, rank, kvs); jobject hashmapObj = doRFRegressorOneAPICompute( env, pNumTabFeature, featureRows, featureCols, pNumTabLabel, labelCols, executorNum, computeDeviceOrdinal, treeCount, diff --git a/mllib-dal/src/main/native/GPU.cpp b/mllib-dal/src/main/native/GPU.cpp index 4d60f9d78..9dbba24f4 100644 --- a/mllib-dal/src/main/native/GPU.cpp +++ b/mllib-dal/src/main/native/GPU.cpp @@ -66,8 +66,7 @@ static sycl::queue getSyclQueue(const sycl::device device) { } } -sycl::queue getAssignedGPU(const ComputeDevice device, ccl::communicator &comm, - int size, int rankId, jint *gpu_indices, int n_gpu) { +sycl::queue getAssignedGPU(const ComputeDevice device, int *gpu_indices) { switch (device) { case ComputeDevice::host: case ComputeDevice::cpu: { @@ -78,19 +77,8 @@ sycl::queue getAssignedGPU(const ComputeDevice device, ccl::communicator &comm, } case ComputeDevice::gpu: { logger::println(logger::INFO, "selector GPU"); - auto local_rank = getLocalRank(comm, size, rankId); auto gpus = get_gpus(); - - logger::println(logger::INFO, - "rank: %d size: %d local_rank: %d n_gpu: %d", rankId, - size, local_rank, n_gpu); - - auto gpu_selected = gpu_indices[local_rank % n_gpu]; - logger::println(logger::INFO, "GPU selected for current rank: %d", - gpu_selected); - - // In case gpu_selected index is larger than number of GPU SYCL devices - auto rank_gpu = gpus[gpu_selected % gpus.size()]; + auto rank_gpu = gpus[0]; sycl::queue q{rank_gpu}; return q; } diff --git a/mllib-dal/src/main/native/GPU.h b/mllib-dal/src/main/native/GPU.h index 818d3ddb4..f8d7c25a9 100644 --- a/mllib-dal/src/main/native/GPU.h +++ b/mllib-dal/src/main/native/GPU.h @@ -7,7 +7,6 @@ #include #include -sycl::queue 
getAssignedGPU(const ComputeDevice device, ccl::communicator &comm, - int size, int rankId, jint *gpu_indices, int n_gpu); +sycl::queue getAssignedGPU(const ComputeDevice device, jint *gpu_indices); sycl::queue getQueue(const ComputeDevice device); diff --git a/mllib-dal/src/main/native/KMeansImpl.cpp b/mllib-dal/src/main/native/KMeansImpl.cpp index a1c629612..f690c1c45 100644 --- a/mllib-dal/src/main/native/KMeansImpl.cpp +++ b/mllib-dal/src/main/native/KMeansImpl.cpp @@ -305,7 +305,7 @@ static jlong doKMeansOneAPICompute( */ JNIEXPORT jlong JNICALL Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCenters( - JNIEnv *env, jobject obj, jlong pNumTabData, jlong numRows, jlong numCols, + JNIEnv *env, jobject obj, jint rank, jlong pNumTabData, jlong numRows, jlong numCols, jlong pNumTabCenters, jint clusterNum, jdouble tolerance, jint iterationNum, jint executorNum, jint executorCores, jint computeDeviceOrdinal, jintArray gpuIdxArray, jobject resultObj) { @@ -314,12 +314,13 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe ComputeDeviceString[computeDeviceOrdinal].c_str()); jlong ret = 0L; - ccl::communicator &cclComm = getComm(); - int rankId = cclComm.rank(); + ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal); switch (device) { case ComputeDevice::host: case ComputeDevice::cpu: { + ccl::communicator &cclComm = getComm(); + int rankId = cclComm.rank(); NumericTablePtr pData = *((NumericTablePtr *)pNumTabData); NumericTablePtr centroids = *((NumericTablePtr *)pNumTabCenters); // Set number of threads for OneDAL to use for each rank @@ -341,19 +342,16 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe logger::println( logger::INFO, "OneDAL (native): use GPU kernels with %d GPU(s) rankid %d", nGpu, - rankId); + rank); jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0); - int size = cclComm.size(); - - auto queue = - getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu); + auto queue = getAssignedGPU(device, gpuIndices); ccl::shared_ptr_class &kvs = getKvs(); auto comm = preview::spmd::make_communicator( - queue, size, rankId, kvs); + queue, executorNum, rank, kvs); ret = doKMeansOneAPICompute(env, pNumTabData, numRows, numCols, pNumTabCenters, clusterNum, tolerance, iterationNum, comm, resultObj); diff --git a/mllib-dal/src/main/native/LinearRegressionImpl.cpp b/mllib-dal/src/main/native/LinearRegressionImpl.cpp index 017b7706f..ca94b54c5 100644 --- a/mllib-dal/src/main/native/LinearRegressionImpl.cpp +++ b/mllib-dal/src/main/native/LinearRegressionImpl.cpp @@ -225,10 +225,9 @@ static jlong doLROneAPICompute(JNIEnv *env, size_t rankId, const bool isRoot = (rankId == ccl_root); bool fitIntercept = bool(jfitIntercept); - int size = cclComm.size(); ccl::shared_ptr_class &kvs = getKvs(); auto comm = preview::spmd::make_communicator( - queue, size, rankId, kvs); + queue, executorNum, rankId, kvs); homogen_table xtrain = *reinterpret_cast( createHomogenTableWithArrayPtr(pNumTabFeature, featureRows, featureCols, comm.get_queue()) @@ -262,7 +261,7 @@ static jlong doLROneAPICompute(JNIEnv *env, size_t rankId, */ JNIEXPORT jlong JNICALL Java_com_intel_oap_mllib_regression_LinearRegressionDALImpl_cLinearRegressionTrainDAL( - JNIEnv *env, jobject obj, jlong feature, jlong featureRows, + JNIEnv *env, jobject obj, jint rank, jlong feature, jlong featureRows, jlong featureCols, jlong label, jlong labelCols, jboolean fitIntercept, jdouble regParam, jdouble elasticNetParam, 
jint executorNum, jint executorCores, jint computeDeviceOrdinal, jintArray gpuIdxArray, @@ -272,9 +271,6 @@ Java_com_intel_oap_mllib_regression_LinearRegressionDALImpl_cLinearRegressionTra "oneDAL (native): use DPC++ kernels; device %s", ComputeDeviceString[computeDeviceOrdinal].c_str()); - ccl::communicator &cclComm = getComm(); - size_t rankId = cclComm.rank(); - ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal); bool useGPU = false; if (device == ComputeDevice::gpu && regParam == 0) { @@ -288,15 +284,14 @@ Java_com_intel_oap_mllib_regression_LinearRegressionDALImpl_cLinearRegressionTra logger::println( logger::INFO, "oneDAL (native): use GPU kernels with %d GPU(s) rankid %d", nGpu, - rankId); + rank); jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0); int size = cclComm.size(); - auto queue = - getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu); + auto queue = getAssignedGPU(device, gpuIndices); resultptr = doLROneAPICompute( - env, rankId, cclComm, queue, feature, featureRows, featureCols, + env, rank, cclComm, queue, feature, featureRows, featureCols, label, labelCols, fitIntercept, executorNum, resultObj); env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0); #endif diff --git a/mllib-dal/src/main/native/OneCCL.cpp b/mllib-dal/src/main/native/OneCCL.cpp index 7d147d1a8..79fbeebed 100644 --- a/mllib-dal/src/main/native/OneCCL.cpp +++ b/mllib-dal/src/main/native/OneCCL.cpp @@ -32,6 +32,7 @@ #include "Logger.h" #include "OneCCL.h" #include "com_intel_oap_mllib_OneCCL__.h" +#include "service.h" extern const size_t ccl_root = 0; @@ -46,7 +47,7 @@ ccl::communicator &getComm() { return g_comms[0]; } ccl::shared_ptr_class &getKvs() { return g_kvs[0]; } JNIEXPORT jint JNICALL Java_com_intel_oap_mllib_OneCCL_00024_c_1init( - JNIEnv *env, jobject obj, jint size, jint rank, jstring ip_port, + JNIEnv *env, jobject obj, jint size, jint rank, jstring ip_port, jint computeDeviceOrdinal, jobject param) { logger::println(logger::INFO, "OneCCL (native): init"); @@ -57,29 +58,34 @@ JNIEXPORT jint JNICALL Java_com_intel_oap_mllib_OneCCL_00024_c_1init( const char *str = env->GetStringUTFChars(ip_port, 0); ccl::string ccl_ip_port(str); + const char *device = env->GetStringUTFChars(use_device, 0); + ccl::string ccl_ip_port(str); auto &singletonCCLInit = CCLInitSingleton::get(size, rank, ccl_ip_port); g_kvs.push_back(singletonCCLInit.kvs); - g_comms.push_back( - ccl::create_communicator(size, rank, singletonCCLInit.kvs)); - - auto t2 = std::chrono::high_resolution_clock::now(); - auto duration = - (float)std::chrono::duration_cast(t2 - t1) - .count(); - logger::println(logger::INFO, "OneCCL (native): init took %f secs", - duration / 1000); - - rank_id = getComm().rank(); - comm_size = getComm().size(); + ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal); + switch (device) { + case ComputeDevice::host: + case ComputeDevice::cpu: { + g_comms.push_back( + ccl::create_communicator(size, rank, singletonCCLInit.kvs)); + + auto t2 = std::chrono::high_resolution_clock::now(); + auto duration = + (float)std::chrono::duration_cast(t2 - t1) + .count(); + logger::println(logger::INFO, "OneCCL (native): init took %f secs", + duration / 1000); + break; + } jclass cls = env->GetObjectClass(param); jfieldID fid_comm_size = env->GetFieldID(cls, "commSize", "J"); jfieldID fid_rank_id = env->GetFieldID(cls, "rankId", "J"); - env->SetLongField(param, fid_comm_size, comm_size); - env->SetLongField(param, fid_rank_id, rank_id); + env->SetLongField(param, size, 
comm_size); + env->SetLongField(param, rank, rank_id); env->ReleaseStringUTFChars(ip_port, str); return 1; @@ -91,10 +97,35 @@ JNIEXPORT jint JNICALL Java_com_intel_oap_mllib_OneCCL_00024_c_1init( * Signature: ()I */ JNIEXPORT jint JNICALL -Java_com_intel_oap_mllib_OneCCL_00024_c_1initDpcpp(JNIEnv *env, jobject) { +Java_com_intel_oap_mllib_OneCCL_00024_c_1initDpcpp(JNIEnv *env, jobject, jint size, jint rank, jobject param) { logger::printerrln(logger::INFO, "OneCCL (native): init dpcpp"); + auto t1 = std::chrono::high_resolution_clock::now(); + ccl::init(); + const char *str = env->GetStringUTFChars(ip_port, 0); + ccl::string ccl_ip_port(str); + + auto &singletonCCLInit = CCLInitSingleton::get(size, rank, ccl_ip_port); + + g_kvs.push_back(singletonCCLInit.kvs); + + + auto t2 = std::chrono::high_resolution_clock::now(); + auto duration = + (float)std::chrono::duration_cast(t2 - t1) + .count(); + logger::println(logger::INFO, "OneCCL (native): init took %f secs", + duration / 1000); + + jclass cls = env->GetObjectClass(param); + jfieldID fid_comm_size = env->GetFieldID(cls, "commSize", "J"); + jfieldID fid_rank_id = env->GetFieldID(cls, "rankId", "J"); + + env->SetLongField(param, size, comm_size); + env->SetLongField(param, rank, rank_id); + env->ReleaseStringUTFChars(ip_port, str); + return 1; } diff --git a/mllib-dal/src/main/native/PCAImpl.cpp b/mllib-dal/src/main/native/PCAImpl.cpp index 0600b47d9..f2821d558 100644 --- a/mllib-dal/src/main/native/PCAImpl.cpp +++ b/mllib-dal/src/main/native/PCAImpl.cpp @@ -250,19 +250,18 @@ static void doPCAOneAPICompute( JNIEXPORT jlong JNICALL Java_com_intel_oap_mllib_feature_PCADALImpl_cPCATrainDAL( - JNIEnv *env, jobject obj, jlong pNumTabData, jlong numRows, jlong numCols, + JNIEnv *env, jobject obj, jint rank, jlong pNumTabData, jlong numRows, jlong numCols, jint executorNum, jint executorCores, jint computeDeviceOrdinal, jintArray gpuIdxArray, jobject resultObj) { logger::println(logger::INFO, "oneDAL (native): use DPC++ kernels; device %s", ComputeDeviceString[computeDeviceOrdinal].c_str()); - - ccl::communicator &cclComm = getComm(); - size_t rankId = cclComm.rank(); ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal); switch (device) { case ComputeDevice::host: case ComputeDevice::cpu: { + ccl::communicator &cclComm = getComm(); + size_t rankId = cclComm.rank(); NumericTablePtr pData = *((NumericTablePtr *)pNumTabData); // Set number of threads for oneDAL to use for each rank services::Environment::getInstance()->setNumberOfThreads(executorCores); @@ -282,19 +281,16 @@ Java_com_intel_oap_mllib_feature_PCADALImpl_cPCATrainDAL( logger::println( logger::INFO, "oneDAL (native): use GPU kernels with %d GPU(s) rankid %d", nGpu, - rankId); + rank); jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0); - int size = cclComm.size(); - - auto queue = - getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu); + auto queue = getAssignedGPU(device, gpuIndices); ccl::shared_ptr_class &kvs = getKvs(); auto comm = preview::spmd::make_communicator( - queue, size, rankId, kvs); + queue, executorNum, rank, kvs); doPCAOneAPICompute(env, pNumTabData, numRows, numCols, comm, resultObj); env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0); break; diff --git a/mllib-dal/src/main/native/SummarizerImpl.cpp b/mllib-dal/src/main/native/SummarizerImpl.cpp index 52b585dc2..9af30d939 100644 --- a/mllib-dal/src/main/native/SummarizerImpl.cpp +++ b/mllib-dal/src/main/native/SummarizerImpl.cpp @@ -268,19 +268,18 @@ static void 
doSummarizerOneAPICompute( JNIEXPORT jlong JNICALL Java_com_intel_oap_mllib_stat_SummarizerDALImpl_cSummarizerTrainDAL( - JNIEnv *env, jobject obj, jlong pNumTabData, jlong numRows, jlong numCols, + JNIEnv *env, jobject obj, jint rank, jlong pNumTabData, jlong numRows, jlong numCols, jint executorNum, jint executorCores, jint computeDeviceOrdinal, jintArray gpuIdxArray, jobject resultObj) { logger::println(logger::INFO, "oneDAL (native): use DPC++ kernels; device %s", ComputeDeviceString[computeDeviceOrdinal].c_str()); - - ccl::communicator &cclComm = getComm(); - int rankId = cclComm.rank(); ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal); switch (device) { case ComputeDevice::host: case ComputeDevice::cpu: { + ccl::communicator &cclComm = getComm(); + int rankId = cclComm.rank(); NumericTablePtr pData = *((NumericTablePtr *)pNumTabData); // Set number of threads for oneDAL to use for each rank services::Environment::getInstance()->setNumberOfThreads(executorCores); @@ -300,19 +299,15 @@ Java_com_intel_oap_mllib_stat_SummarizerDALImpl_cSummarizerTrainDAL( logger::println( logger::INFO, "oneDAL (native): use GPU kernels with %d GPU(s) rankid %d", nGpu, - rankId); + rank); jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0); - - int size = cclComm.size(); - - auto queue = - getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu); + auto queue = getAssignedGPU(device, gpuIndices); ccl::shared_ptr_class &kvs = getKvs(); auto comm = preview::spmd::make_communicator( - queue, size, rankId, kvs); + queue, executorNum, rank, kvs); doSummarizerOneAPICompute(env, pNumTabData, numRows, numCols, comm, resultObj); env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0); @@ -320,7 +315,7 @@ Java_com_intel_oap_mllib_stat_SummarizerDALImpl_cSummarizerTrainDAL( } #endif default: { - deviceError("PCA", ComputeDeviceString[computeDeviceOrdinal].c_str()); + deviceError("Summarizer", ComputeDeviceString[computeDeviceOrdinal].c_str()); } } return 0; diff --git a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_OneCCL__.h b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_OneCCL__.h index a89b7d214..4bfa1d0c3 100644 --- a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_OneCCL__.h +++ b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_OneCCL__.h @@ -45,7 +45,7 @@ JNIEXPORT jint JNICALL Java_com_intel_oap_mllib_OneCCL_00024_c_1getAvailPort * Signature: (IILjava/lang/String;Lcom/intel/oap/mllib/CCLParam;)I */ JNIEXPORT jint JNICALL Java_com_intel_oap_mllib_OneCCL_00024_c_1init - (JNIEnv *, jobject, jint, jint, jstring, jobject); + (JNIEnv *, jobject, jint, jint, jstring, jstring, jobject); /* * Class: com_intel_oap_mllib_OneCCL__ diff --git a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_classification_RandomForestClassifierDALImpl.h b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_classification_RandomForestClassifierDALImpl.h index 8c0c4ecdd..79bd6f16f 100644 --- a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_classification_RandomForestClassifierDALImpl.h +++ b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_classification_RandomForestClassifierDALImpl.h @@ -13,7 +13,7 @@ extern "C" { * Signature: (JJIIIIIIIDDIJIZ[ILcom/intel/oap/mllib/classification/RandomForestResult;)Ljava/util/HashMap; */ JNIEXPORT jobject JNICALL Java_com_intel_oap_mllib_classification_RandomForestClassifierDALImpl_cRFClassifierTrainDAL - (JNIEnv *, jobject, jlong, jlong, jlong, jlong, jlong, jint, jint, jint, jint, jint, jint, jint, jdouble, jdouble, jint, jlong, 
jint, jboolean, jintArray, jobject); + (JNIEnv *, jobject, jint, jlong, jlong, jlong, jlong, jlong, jint, jint, jint, jint, jint, jint, jint, jdouble, jdouble, jint, jlong, jint, jboolean, jintArray, jobject); #ifdef __cplusplus } diff --git a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_clustering_KMeansDALImpl.h b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_clustering_KMeansDALImpl.h index 595f69fb5..a0fc24dde 100644 --- a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_clustering_KMeansDALImpl.h +++ b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_clustering_KMeansDALImpl.h @@ -13,7 +13,7 @@ extern "C" { * Signature: (JJIDIIII[ILcom/intel/oap/mllib/clustering/KMeansResult;)J */ JNIEXPORT jlong JNICALL Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCenters - (JNIEnv *, jobject, jlong, jlong, jlong, jlong, jint, jdouble, jint, jint, jint, jint, jintArray, jobject); + (JNIEnv *, jobject, jint, jlong, jlong, jlong, jlong, jint, jdouble, jint, jint, jint, jint, jintArray, jobject); #ifdef __cplusplus } diff --git a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_feature_PCADALImpl.h b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_feature_PCADALImpl.h index 2ac220860..34646da95 100644 --- a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_feature_PCADALImpl.h +++ b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_feature_PCADALImpl.h @@ -13,7 +13,7 @@ extern "C" { * Signature: (JIII[ILcom/intel/oap/mllib/feature/PCAResult;)J */ JNIEXPORT jlong JNICALL Java_com_intel_oap_mllib_feature_PCADALImpl_cPCATrainDAL - (JNIEnv *, jobject, jlong, jlong, jlong, jint, jint, jint, jintArray, jobject); + (JNIEnv *, jobject, jint, jlong, jlong, jlong, jint, jint, jint, jintArray, jobject); #ifdef __cplusplus } diff --git a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_regression_LinearRegressionDALImpl.h b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_regression_LinearRegressionDALImpl.h index 28c7e8f42..0dc6f4e79 100644 --- a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_regression_LinearRegressionDALImpl.h +++ b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_regression_LinearRegressionDALImpl.h @@ -13,7 +13,7 @@ extern "C" { * Signature: (JJZDDIII[ILcom/intel/oap/mllib/regression/LiRResult;)J */ JNIEXPORT jlong JNICALL Java_com_intel_oap_mllib_regression_LinearRegressionDALImpl_cLinearRegressionTrainDAL - (JNIEnv *, jobject, jlong, jlong, jlong, jlong, jlong, jboolean, jdouble, jdouble, jint, jint, jint, jintArray, jobject); + (JNIEnv *, jobject, jint, jlong, jlong, jlong, jlong, jlong, jboolean, jdouble, jdouble, jint, jint, jint, jintArray, jobject); #ifdef __cplusplus } diff --git a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_regression_RandomForestRegressorDALImpl.h b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_regression_RandomForestRegressorDALImpl.h index ac457b3bf..1350d8268 100644 --- a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_regression_RandomForestRegressorDALImpl.h +++ b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_regression_RandomForestRegressorDALImpl.h @@ -13,7 +13,7 @@ extern "C" { * Signature: (JJIIIIIIJIZ[ILcom/intel/oap/mllib/classification/RandomForestResult;)Ljava/util/HashMap; */ JNIEXPORT jobject JNICALL Java_com_intel_oap_mllib_regression_RandomForestRegressorDALImpl_cRFRegressorTrainDAL - (JNIEnv *, jobject, jlong, jlong, jlong, jlong, jlong, jint, jint, jint, jint, jint, jint, jlong, jint, jboolean, jintArray, jobject); + (JNIEnv *, jobject, jint, jlong, 
jlong, jlong, jlong, jlong, jint, jint, jint, jint, jint, jint, jlong, jint, jboolean, jintArray, jobject); #ifdef __cplusplus } diff --git a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_stat_CorrelationDALImpl.h b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_stat_CorrelationDALImpl.h index 96219ae4f..494b89658 100644 --- a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_stat_CorrelationDALImpl.h +++ b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_stat_CorrelationDALImpl.h @@ -13,7 +13,7 @@ extern "C" { * Signature: (JIII[ILcom/intel/oap/mllib/stat/CorrelationResult;)J */ JNIEXPORT jlong JNICALL Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL - (JNIEnv *, jobject, jlong, jlong, jlong, jint, jint, jint, jintArray, jobject); + (JNIEnv *, jobject, jint, jlong, jlong, jlong, jint, jint, jint, jintArray, jobject); #ifdef __cplusplus } diff --git a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_stat_SummarizerDALImpl.h b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_stat_SummarizerDALImpl.h index 754a5b645..7db45743f 100644 --- a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_stat_SummarizerDALImpl.h +++ b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_stat_SummarizerDALImpl.h @@ -13,7 +13,7 @@ extern "C" { * Signature: (JIII[ILcom/intel/oap/mllib/stat/SummarizerResult;)J */ JNIEXPORT jlong JNICALL Java_com_intel_oap_mllib_stat_SummarizerDALImpl_cSummarizerTrainDAL - (JNIEnv *, jobject, jlong, jlong, jlong, jint, jint, jint, jintArray, jobject); + (JNIEnv *, jobject, jint, jlong, jlong, jlong, jint, jint, jint, jintArray, jobject); #ifdef __cplusplus } diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/OneCCL.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/OneCCL.scala index 48caebe1b..70ddef079 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/OneCCL.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/OneCCL.scala @@ -24,14 +24,14 @@ object OneCCL extends Logging { var cclParam = new CCLParam() - def init(executor_num: Int, rank: Int, ip_port: String): Unit = { + def init(executor_num: Int, rank: Int, ip_port: String, computeDeviceOrdinal: Int): Unit = { setExecutorEnv() logInfo(s"Initializing with IP_PORT: ${ip_port}") // cclParam is output from native code - c_init(executor_num, rank, ip_port, cclParam) + c_init(executor_num, rank, ip_port, computeDeviceOrdinal, cclParam) // executor number should equal to oneCCL world size assert(executor_num == cclParam.getCommSize, @@ -67,7 +67,8 @@ object OneCCL extends Logging { @native def c_getAvailPort(localIP: String): Int - @native private def c_init(size: Int, rank: Int, ip_port: String, param: CCLParam): Int + @native private def c_init(size: Int, rank: Int, ip_port: String, + computeDeviceOrdinal: Int, param: CCLParam): Int @native private def c_cleanup(): Unit } diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/classification/RandomForestClassifierDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/classification/RandomForestClassifierDALImpl.scala index 8e4d27160..d0cfa42e4 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/classification/RandomForestClassifierDALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/classification/RandomForestClassifierDALImpl.scala @@ -76,7 +76,7 @@ class RandomForestClassifierDALImpl(val uid: String, val kvsIPPort = getOneCCLIPPort(labeledPointsTables) labeledPointsTables.mapPartitionsWithIndex { (rank, table) => - OneCCL.init(executorNum, rank, kvsIPPort) + OneCCL.init(executorNum, 
rank, kvsIPPort, computeDevice.ordinal()) Iterator.empty }.count() rfcTimer.record("OneCCL Init") @@ -96,6 +96,7 @@ class RandomForestClassifierDALImpl(val uid: String, val computeStartTime = System.nanoTime() val result = new RandomForestResult val hashmap = cRFClassifierTrainDAL( + rank, feature._1, feature._2, feature._3, @@ -140,7 +141,8 @@ class RandomForestClassifierDALImpl(val uid: String, results(0) } - @native private[mllib] def cRFClassifierTrainDAL(featureTabAddr: Long, + @native private[mllib] def cRFClassifierTrainDAL(rank: Int, + featureTabAddr: Long, numRows: Long, numCols: Long, lableTabAddr: Long, diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/clustering/KMeansDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/clustering/KMeansDALImpl.scala index e194e9d22..d8752fcd3 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/clustering/KMeansDALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/clustering/KMeansDALImpl.scala @@ -53,7 +53,7 @@ class KMeansDALImpl(var nClusters: Int, val kvsIPPort = getOneCCLIPPort(coalescedTables) coalescedTables.mapPartitionsWithIndex { (rank, table) => - OneCCL.init(executorNum, rank, kvsIPPort) + OneCCL.init(executorNum, rank, kvsIPPort, computeDevice.ordinal()) Iterator.empty }.count() kmeansTimer.record("OneCCL Init") @@ -81,6 +81,7 @@ class KMeansDALImpl(var nClusters: Int, } cCentroids = cKMeansOneapiComputeWithInitCenters( + rank, tableArr, rows, columns, @@ -136,7 +137,8 @@ class KMeansDALImpl(var nClusters: Int, parentModel } - @native private[mllib] def cKMeansOneapiComputeWithInitCenters(data: Long, + @native private[mllib] def cKMeansOneapiComputeWithInitCenters( rank: Int, + data: Long, numRows: Long, numCols: Long, centers: Long, diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/feature/PCADALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/feature/PCADALImpl.scala index 0410c18a7..b9df1f6c3 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/feature/PCADALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/feature/PCADALImpl.scala @@ -60,7 +60,7 @@ class PCADALImpl(val k: Int, pcaTimer.record("Data Convertion") coalescedTables.mapPartitionsWithIndex { (rank, table) => - OneCCL.init(executorNum, rank, kvsIPPort) + OneCCL.init(executorNum, rank, kvsIPPort, computeDevice.ordinal()) Iterator.empty }.count() pcaTimer.record("OneCCL Init") @@ -79,6 +79,7 @@ class PCADALImpl(val k: Int, null } cPCATrainDAL( + rank, tableArr, rows, columns, @@ -214,7 +215,8 @@ class PCADALImpl(val k: Int, // Single entry to call Correlation PCA DAL backend with parameter K - @native private[mllib] def cPCATrainDAL(data: Long, + @native private[mllib] def cPCATrainDAL(rank: Int, + data: Long, numRows: Long, numCols: Long, executorNum: Int, diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionDALImpl.scala index 806fdb40c..a0ed680d6 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionDALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionDALImpl.scala @@ -121,7 +121,7 @@ class LinearRegressionDALImpl( val fitIntercept: Boolean, (label.toString.toLong, 0L, 0L) } - OneCCL.init(executorNum, rank, kvsIPPort) + OneCCL.init(executorNum, rank, kvsIPPort, computeDevice.ordinal()) val result = new LiRResult() val gpuIndices = if (useDevice == "GPU") { @@ -138,6 +138,7 @@ class LinearRegressionDALImpl( val 
fitIntercept: Boolean, } val cbeta = cLinearRegressionTrainDAL( + rank, featureTabAddr, featureRows, featureColumns, @@ -183,7 +184,8 @@ class LinearRegressionDALImpl( val fitIntercept: Boolean, } // Single entry to call Linear Regression DAL backend with parameters - @native private def cLinearRegressionTrainDAL(data: Long, + @native private def cLinearRegressionTrainDAL(rank: Int, + data: Long, numRows: Long, numCols: Long, label: Long, diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/RandomForestRegressorDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/RandomForestRegressorDALImpl.scala index 16fd17cdb..11e924cd6 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/RandomForestRegressorDALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/RandomForestRegressorDALImpl.scala @@ -70,7 +70,7 @@ class RandomForestRegressorDALImpl(val uid: String, val kvsIPPort = getOneCCLIPPort(labeledPointsTables) labeledPointsTables.mapPartitionsWithIndex { (rank, table) => - OneCCL.init(executorNum, rank, kvsIPPort) + OneCCL.init(executorNum, rank, kvsIPPort, computeDevice.ordinal()) Iterator.empty }.count() rfrTimer.record("OneCCL Init") @@ -91,6 +91,7 @@ class RandomForestRegressorDALImpl(val uid: String, val computeStartTime = System.nanoTime() val result = new RandomForestResult val hashmap = cRFRegressorTrainDAL( + rank, feature._1, feature._2, feature._3, @@ -141,7 +142,8 @@ class RandomForestRegressorDALImpl(val uid: String, results(0)._2 } - @native private[mllib] def cRFRegressorTrainDAL(featureTabAddr: Long, + @native private[mllib] def cRFRegressorTrainDAL(rank: Int, + featureTabAddr: Long, numRows: Long, numCols: Long, lableTabAddr: Long, diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/CorrelationDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/CorrelationDALImpl.scala index 21465e1af..73a172bb0 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/CorrelationDALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/CorrelationDALImpl.scala @@ -47,7 +47,7 @@ class CorrelationDALImpl( val kvsIPPort = getOneCCLIPPort(coalescedTables) coalescedTables.mapPartitionsWithIndex { (rank, table) => - OneCCL.init(executorNum, rank, kvsIPPort) + OneCCL.init(executorNum, rank, kvsIPPort, computeDevice.ordinal()) Iterator.empty }.count() corTimer.record("OneCCL Init") @@ -69,6 +69,7 @@ class CorrelationDALImpl( null } cCorrelationTrainDAL( + rank, tableArr, rows, columns, @@ -118,7 +119,8 @@ class CorrelationDALImpl( } - @native private[mllib] def cCorrelationTrainDAL(data: Long, + @native private[mllib] def cCorrelationTrainDAL(rank: Int, + data: Long, numRows: Long, numCols: Long, executorNum: Int, diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/SummarizerDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/SummarizerDALImpl.scala index a516962c3..3828674f2 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/SummarizerDALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/SummarizerDALImpl.scala @@ -48,7 +48,7 @@ class SummarizerDALImpl(val executorNum: Int, val kvsIPPort = getOneCCLIPPort(data) coalescedTables.mapPartitionsWithIndex { (rank, table) => - OneCCL.init(executorNum, rank, kvsIPPort) + OneCCL.init(executorNum, rank, kvsIPPort, computeDevice.ordinal()) Iterator.empty }.count() sumTimer.record("OneCCL Init") @@ -70,6 +70,7 @@ class SummarizerDALImpl(val executorNum: Int, null } cSummarizerTrainDAL( + rank, 
tableArr, rows, columns, @@ -150,7 +151,8 @@ class SummarizerDALImpl(val executorNum: Int, summary } - @native private[mllib] def cSummarizerTrainDAL(data: Long, + @native private[mllib] def cSummarizerTrainDAL(rank: Int, + data: Long, numRows: Long, numCols: Long, executorNum: Int, From 407535a988253daf19048404c8d4283c786a6992 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Thu, 15 Aug 2024 15:03:38 +0800 Subject: [PATCH 3/9] update --- mllib-dal/src/main/native/OneCCL.cpp | 29 +++++++++---------- .../scala/com/intel/oap/mllib/OneCCL.scala | 7 ++--- .../RandomForestClassifierDALImpl.scala | 2 +- .../oap/mllib/clustering/KMeansDALImpl.scala | 2 +- .../intel/oap/mllib/feature/PCADALImpl.scala | 2 +- .../regression/LinearRegressionDALImpl.scala | 2 +- .../RandomForestRegressorDALImpl.scala | 2 +- .../oap/mllib/stat/CorrelationDALImpl.scala | 2 +- .../oap/mllib/stat/SummarizerDALImpl.scala | 2 +- 9 files changed, 23 insertions(+), 27 deletions(-) diff --git a/mllib-dal/src/main/native/OneCCL.cpp b/mllib-dal/src/main/native/OneCCL.cpp index 79fbeebed..b924c6987 100644 --- a/mllib-dal/src/main/native/OneCCL.cpp +++ b/mllib-dal/src/main/native/OneCCL.cpp @@ -47,7 +47,7 @@ ccl::communicator &getComm() { return g_comms[0]; } ccl::shared_ptr_class &getKvs() { return g_kvs[0]; } JNIEXPORT jint JNICALL Java_com_intel_oap_mllib_OneCCL_00024_c_1init( - JNIEnv *env, jobject obj, jint size, jint rank, jstring ip_port, jint computeDeviceOrdinal, + JNIEnv *env, jobject obj, jint size, jint rank, jstring ip_port, jobject param) { logger::println(logger::INFO, "OneCCL (native): init"); @@ -64,21 +64,18 @@ JNIEXPORT jint JNICALL Java_com_intel_oap_mllib_OneCCL_00024_c_1init( auto &singletonCCLInit = CCLInitSingleton::get(size, rank, ccl_ip_port); g_kvs.push_back(singletonCCLInit.kvs); - ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal); - switch (device) { - case ComputeDevice::host: - case ComputeDevice::cpu: { - g_comms.push_back( - ccl::create_communicator(size, rank, singletonCCLInit.kvs)); - - auto t2 = std::chrono::high_resolution_clock::now(); - auto duration = - (float)std::chrono::duration_cast(t2 - t1) - .count(); - logger::println(logger::INFO, "OneCCL (native): init took %f secs", - duration / 1000); - break; - } + +#ifdef CPU_ONLY_PROFILE + g_comms.push_back( + ccl::create_communicator(size, rank, singletonCCLInit.kvs)); + + auto t2 = std::chrono::high_resolution_clock::now(); + auto duration = + (float)std::chrono::duration_cast(t2 - t1) + .count(); + logger::println(logger::INFO, "OneCCL (native): init took %f secs", + duration / 1000); +#endif jclass cls = env->GetObjectClass(param); jfieldID fid_comm_size = env->GetFieldID(cls, "commSize", "J"); diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/OneCCL.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/OneCCL.scala index 70ddef079..48caebe1b 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/OneCCL.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/OneCCL.scala @@ -24,14 +24,14 @@ object OneCCL extends Logging { var cclParam = new CCLParam() - def init(executor_num: Int, rank: Int, ip_port: String, computeDeviceOrdinal: Int): Unit = { + def init(executor_num: Int, rank: Int, ip_port: String): Unit = { setExecutorEnv() logInfo(s"Initializing with IP_PORT: ${ip_port}") // cclParam is output from native code - c_init(executor_num, rank, ip_port, computeDeviceOrdinal, cclParam) + c_init(executor_num, rank, ip_port, cclParam) // executor number should equal to oneCCL world size assert(executor_num == 
cclParam.getCommSize, @@ -67,8 +67,7 @@ object OneCCL extends Logging { @native def c_getAvailPort(localIP: String): Int - @native private def c_init(size: Int, rank: Int, ip_port: String, - computeDeviceOrdinal: Int, param: CCLParam): Int + @native private def c_init(size: Int, rank: Int, ip_port: String, param: CCLParam): Int @native private def c_cleanup(): Unit } diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/classification/RandomForestClassifierDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/classification/RandomForestClassifierDALImpl.scala index d0cfa42e4..6aca49f14 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/classification/RandomForestClassifierDALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/classification/RandomForestClassifierDALImpl.scala @@ -76,7 +76,7 @@ class RandomForestClassifierDALImpl(val uid: String, val kvsIPPort = getOneCCLIPPort(labeledPointsTables) labeledPointsTables.mapPartitionsWithIndex { (rank, table) => - OneCCL.init(executorNum, rank, kvsIPPort, computeDevice.ordinal()) + OneCCL.init(executorNum, rank, kvsIPPort) Iterator.empty }.count() rfcTimer.record("OneCCL Init") diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/clustering/KMeansDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/clustering/KMeansDALImpl.scala index d8752fcd3..64b2f6c7f 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/clustering/KMeansDALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/clustering/KMeansDALImpl.scala @@ -53,7 +53,7 @@ class KMeansDALImpl(var nClusters: Int, val kvsIPPort = getOneCCLIPPort(coalescedTables) coalescedTables.mapPartitionsWithIndex { (rank, table) => - OneCCL.init(executorNum, rank, kvsIPPort, computeDevice.ordinal()) + OneCCL.init(executorNum, rank, kvsIPPort) Iterator.empty }.count() kmeansTimer.record("OneCCL Init") diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/feature/PCADALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/feature/PCADALImpl.scala index b9df1f6c3..8eb9554a1 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/feature/PCADALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/feature/PCADALImpl.scala @@ -60,7 +60,7 @@ class PCADALImpl(val k: Int, pcaTimer.record("Data Convertion") coalescedTables.mapPartitionsWithIndex { (rank, table) => - OneCCL.init(executorNum, rank, kvsIPPort, computeDevice.ordinal()) + OneCCL.init(executorNum, rank, kvsIPPort) Iterator.empty }.count() pcaTimer.record("OneCCL Init") diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionDALImpl.scala index a0ed680d6..f95bc0846 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionDALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionDALImpl.scala @@ -121,7 +121,7 @@ class LinearRegressionDALImpl( val fitIntercept: Boolean, (label.toString.toLong, 0L, 0L) } - OneCCL.init(executorNum, rank, kvsIPPort, computeDevice.ordinal()) + OneCCL.init(executorNum, rank, kvsIPPort) val result = new LiRResult() val gpuIndices = if (useDevice == "GPU") { diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/RandomForestRegressorDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/RandomForestRegressorDALImpl.scala index 11e924cd6..018473a61 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/RandomForestRegressorDALImpl.scala 
+++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/RandomForestRegressorDALImpl.scala @@ -70,7 +70,7 @@ class RandomForestRegressorDALImpl(val uid: String, val kvsIPPort = getOneCCLIPPort(labeledPointsTables) labeledPointsTables.mapPartitionsWithIndex { (rank, table) => - OneCCL.init(executorNum, rank, kvsIPPort, computeDevice.ordinal()) + OneCCL.init(executorNum, rank, kvsIPPort) Iterator.empty }.count() rfrTimer.record("OneCCL Init") diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/CorrelationDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/CorrelationDALImpl.scala index 73a172bb0..e521aefe7 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/CorrelationDALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/CorrelationDALImpl.scala @@ -47,7 +47,7 @@ class CorrelationDALImpl( val kvsIPPort = getOneCCLIPPort(coalescedTables) coalescedTables.mapPartitionsWithIndex { (rank, table) => - OneCCL.init(executorNum, rank, kvsIPPort, computeDevice.ordinal()) + OneCCL.init(executorNum, rank, kvsIPPort) Iterator.empty }.count() corTimer.record("OneCCL Init") diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/SummarizerDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/SummarizerDALImpl.scala index 3828674f2..277039ab1 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/SummarizerDALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/SummarizerDALImpl.scala @@ -48,7 +48,7 @@ class SummarizerDALImpl(val executorNum: Int, val kvsIPPort = getOneCCLIPPort(data) coalescedTables.mapPartitionsWithIndex { (rank, table) => - OneCCL.init(executorNum, rank, kvsIPPort, computeDevice.ordinal()) + OneCCL.init(executorNum, rank, kvsIPPort) Iterator.empty }.count() sumTimer.record("OneCCL Init") From 330b3d87a85cee6f1710d9eadc6f1d4d0eb0e900 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Fri, 16 Aug 2024 11:16:42 +0800 Subject: [PATCH 4/9] set ZE_AFFINITY_MASK=rankId --- .../src/main/scala/com/intel/oap/mllib/OneCCL.scala | 8 ++------ .../RandomForestClassifierDALImpl.scala | 11 +++++++++++ .../intel/oap/mllib/clustering/KMeansDALImpl.scala | 10 ++++++++++ .../com/intel/oap/mllib/feature/PCADALImpl.scala | 11 +++++++++++ .../mllib/regression/LinearRegressionDALImpl.scala | 11 +++++++++++ .../regression/RandomForestRegressorDALImpl.scala | 11 +++++++++++ .../com/intel/oap/mllib/stat/CorrelationDALImpl.scala | 11 +++++++++++ .../com/intel/oap/mllib/stat/SummarizerDALImpl.scala | 11 +++++++++++ 8 files changed, 78 insertions(+), 6 deletions(-) diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/OneCCL.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/OneCCL.scala index 48caebe1b..c89c9ffd2 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/OneCCL.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/OneCCL.scala @@ -42,12 +42,8 @@ object OneCCL extends Logging { } // Run on Executor - def setExecutorEnv(): Unit = { - setEnv("CCL_ATL_TRANSPORT", "ofi") - // Set CCL_ROOT to workaround CCL_ROOT env read bug, should remove when upstream fix this - setEnv("CCL_ROOT", "/opt/intel/oneapi/ccl/latest") - // Uncomment this if you whant to debug oneCCL - // setEnv("CCL_LOG_LEVEL", "debug") + def setExecutorEnv(key: String, value: String): Unit = { + setEnv(key, value) } // Run on Executor diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/classification/RandomForestClassifierDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/classification/RandomForestClassifierDALImpl.scala 
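// ---------------------------------------------------------------------------
// Editor's note (sketch, not part of the patch): each hunk below adds the same
// warm-up job to a *DALImpl class. It reads the GPU addresses Spark scheduled
// to the task and exports ZE_AFFINITY_MASK so that every executor process only
// sees its assigned Level Zero device before oneCCL/SYCL is initialized. A
// condensed, self-contained version of that pattern follows; names other than
// TaskContext, the "gpu" resource key, and the OneCCL.setExecutorEnv call it
// stands in for are illustrative.
import org.apache.spark.TaskContext
import org.apache.spark.rdd.RDD

object AffinityMaskSketch {
  // `setEnv` stands in for OneCCL.setExecutorEnv(key, value) from this patch.
  def pinTaskToAssignedGpu(data: RDD[_], useDevice: String,
                           setEnv: (String, String) => Unit): Unit = {
    data.mapPartitionsWithIndex { (rank, _) =>
      if (useDevice == "GPU") {
        // Spark exposes the devices scheduled to this task via TaskContext resources
        val gpuIndices = TaskContext.get().resources()("gpu").addresses.map(_.toInt)
        // Level Zero reads ZE_AFFINITY_MASK at device discovery time, so it must be
        // exported before the first native call runs in this executor process
        setEnv("ZE_AFFINITY_MASK", gpuIndices.head.toString)
      }
      Iterator.empty
    }.count() // count() forces the side-effect-only job to run on every partition
  }
}
// ---------------------------------------------------------------------------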
index 6aca49f14..70479c79a 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/classification/RandomForestClassifierDALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/classification/RandomForestClassifierDALImpl.scala @@ -75,6 +75,17 @@ class RandomForestClassifierDALImpl(val uid: String, rfcTimer.record("Data Convertion") val kvsIPPort = getOneCCLIPPort(labeledPointsTables) + labeledPointsTables.mapPartitionsWithIndex { (rank, iter) => + val gpuIndices = if (useDevice == "GPU") { + val resources = TaskContext.get().resources() + resources("gpu").addresses.map(_.toInt) + } else { + null + } + OneCCL.setExecutorEnv("ZE_AFFINITY_MASK", gpuIndices(0).toString()) + Iterator.empty + }.count() + labeledPointsTables.mapPartitionsWithIndex { (rank, table) => OneCCL.init(executorNum, rank, kvsIPPort) Iterator.empty diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/clustering/KMeansDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/clustering/KMeansDALImpl.scala index 64b2f6c7f..14eb16800 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/clustering/KMeansDALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/clustering/KMeansDALImpl.scala @@ -51,6 +51,16 @@ class KMeansDALImpl(var nClusters: Int, kmeansTimer.record("Data Convertion") val kvsIPPort = getOneCCLIPPort(coalescedTables) + coalescedTables.mapPartitionsWithIndex { (rank, iter) => + val gpuIndices = if (useDevice == "GPU") { + val resources = TaskContext.get().resources() + resources("gpu").addresses.map(_.toInt) + } else { + null + } + OneCCL.setExecutorEnv("ZE_AFFINITY_MASK", gpuIndices(0).toString()) + Iterator.empty + }.count() coalescedTables.mapPartitionsWithIndex { (rank, table) => OneCCL.init(executorNum, rank, kvsIPPort) diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/feature/PCADALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/feature/PCADALImpl.scala index 8eb9554a1..7190ade3f 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/feature/PCADALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/feature/PCADALImpl.scala @@ -59,6 +59,17 @@ class PCADALImpl(val k: Int, val kvsIPPort = getOneCCLIPPort(coalescedTables) pcaTimer.record("Data Convertion") + coalescedTables.mapPartitionsWithIndex { (rank, iter) => + val gpuIndices = if (useDevice == "GPU") { + val resources = TaskContext.get().resources() + resources("gpu").addresses.map(_.toInt) + } else { + null + } + OneCCL.setExecutorEnv("ZE_AFFINITY_MASK", gpuIndices(0).toString()) + Iterator.empty + }.count() + coalescedTables.mapPartitionsWithIndex { (rank, table) => OneCCL.init(executorNum, rank, kvsIPPort) Iterator.empty diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionDALImpl.scala index f95bc0846..40b2f4423 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionDALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionDALImpl.scala @@ -106,6 +106,17 @@ class LinearRegressionDALImpl( val fitIntercept: Boolean, } lrTimer.record("Data Convertion") + labeledPointsTables.mapPartitionsWithIndex { (rank, iter) => + val gpuIndices = if (useDevice == "GPU") { + val resources = TaskContext.get().resources() + resources("gpu").addresses.map(_.toInt) + } else { + null + } + OneCCL.setExecutorEnv("ZE_AFFINITY_MASK", gpuIndices(0).toString()) + Iterator.empty + }.count() + val results = 
labeledPointsTables.mapPartitionsWithIndex { (rank, tables) => val (feature, label) = tables.next() val (featureTabAddr : Long, featureRows : Long, featureColumns : Long) = diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/RandomForestRegressorDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/RandomForestRegressorDALImpl.scala index 018473a61..77ea4c656 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/RandomForestRegressorDALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/RandomForestRegressorDALImpl.scala @@ -69,6 +69,17 @@ class RandomForestRegressorDALImpl(val uid: String, val kvsIPPort = getOneCCLIPPort(labeledPointsTables) + labeledPointsTables.mapPartitionsWithIndex { (rank, iter) => + val gpuIndices = if (useDevice == "GPU") { + val resources = TaskContext.get().resources() + resources("gpu").addresses.map(_.toInt) + } else { + null + } + OneCCL.setExecutorEnv("ZE_AFFINITY_MASK", gpuIndices(0).toString()) + Iterator.empty + }.count() + labeledPointsTables.mapPartitionsWithIndex { (rank, table) => OneCCL.init(executorNum, rank, kvsIPPort) Iterator.empty diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/CorrelationDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/CorrelationDALImpl.scala index e521aefe7..fff2d4ac5 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/CorrelationDALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/CorrelationDALImpl.scala @@ -52,6 +52,17 @@ class CorrelationDALImpl( }.count() corTimer.record("OneCCL Init") + coalescedTables.mapPartitionsWithIndex { (rank, iter) => + val gpuIndices = if (useDevice == "GPU") { + val resources = TaskContext.get().resources() + resources("gpu").addresses.map(_.toInt) + } else { + null + } + OneCCL.setExecutorEnv("ZE_AFFINITY_MASK", gpuIndices(0).toString()) + Iterator.empty + }.count() + val results = coalescedTables.mapPartitionsWithIndex { (rank, iter) => val (tableArr : Long, rows : Long, columns : Long) = if (useDevice == "GPU") { iter.next() diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/SummarizerDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/SummarizerDALImpl.scala index 277039ab1..dcde7ef91 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/SummarizerDALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/SummarizerDALImpl.scala @@ -53,6 +53,17 @@ class SummarizerDALImpl(val executorNum: Int, }.count() sumTimer.record("OneCCL Init") + coalescedTables.mapPartitionsWithIndex { (rank, iter) => + val gpuIndices = if (useDevice == "GPU") { + val resources = TaskContext.get().resources() + resources("gpu").addresses.map(_.toInt) + } else { + null + } + OneCCL.setExecutorEnv("ZE_AFFINITY_MASK", gpuIndices(0).toString()) + Iterator.empty + }.count() + val results = coalescedTables.mapPartitionsWithIndex { (rank, iter) => val (tableArr : Long, rows : Long, columns : Long) = if (useDevice == "GPU") { iter.next() From 9c509a1c06cf85b120533c53ca913581d5a6f36c Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Sun, 18 Aug 2024 09:52:43 +0800 Subject: [PATCH 5/9] optimize code --- mllib-dal/src/main/native/GPU.cpp | 46 +++++++++++++++++++ mllib-dal/src/main/native/GPU.h | 2 + mllib-dal/src/main/native/KMeansImpl.cpp | 16 ++----- .../scala/com/intel/oap/mllib/CommonJob.scala | 46 +++++++++++++++++++ .../RandomForestClassifierDALImpl.scala | 19 ++------ .../oap/mllib/clustering/KMeansDALImpl.scala | 24 +++------- 
.../intel/oap/mllib/feature/PCADALImpl.scala | 19 ++------ .../regression/LinearRegressionDALImpl.scala | 16 ++----- .../RandomForestRegressorDALImpl.scala | 19 ++------ .../oap/mllib/stat/CorrelationDALImpl.scala | 19 ++------ .../oap/mllib/stat/SummarizerDALImpl.scala | 19 ++------ .../oap/mllib/ConvertHomogenTableSuite.scala | 6 +-- .../mllib/CorrelationHomogenTableSuite.scala | 2 +- .../mllib/SummarizerHomogenTableSuite.scala | 12 ++--- .../com/intel/oap/mllib/TestCommon.scala | 12 ++--- 15 files changed, 141 insertions(+), 136 deletions(-) create mode 100644 mllib-dal/src/main/scala/com/intel/oap/mllib/CommonJob.scala diff --git a/mllib-dal/src/main/native/GPU.cpp b/mllib-dal/src/main/native/GPU.cpp index 9dbba24f4..5be0223a4 100644 --- a/mllib-dal/src/main/native/GPU.cpp +++ b/mllib-dal/src/main/native/GPU.cpp @@ -113,3 +113,49 @@ sycl::queue getQueue(const ComputeDevice device) { } } } + + +preview::spmd::communicator createDalCommunicator(const jint executorNum, const jint rank, const ccl::string ccl_ip_port){ + auto gpus = get_gpus(); + + auto t1 = std::chrono::high_resolution_clock::now(); + + ccl::init(); + + auto t2 = std::chrono::high_resolution_clock::now(); + auto duration = + (float)std::chrono::duration_cast(t2 - t1).count(); + + logger::println(logger::INFO, "OneCCL singleton init took %f secs", + duration / 1000); + logger::Logger::getInstance(c_breakdown_name).printLogToFile("rankID was %d, OneCCL singleton init took %f secs.", rank, duration / 1000 ); + + + t1 = std::chrono::high_resolution_clock::now(); + + auto kvs_attr = ccl::create_kvs_attr(); + + kvs_attr.set(ccl_ip_port); + + ccl::shared_ptr_class kvs = ccl::create_main_kvs(kvs_attr); + + t2 = std::chrono::high_resolution_clock::now(); + duration = + (float)std::chrono::duration_cast(t2 - t1) + .count(); + logger::println(logger::INFO, "OneCCL (native): create kvs took %f secs", + duration / 1000); + logger::Logger::getInstance(c_breakdown_name).printLogToFile("rankID was %d, OneCCL create communicator took %f secs.", rank, duration / 1000 ); + sycl::queue queue{gpus[0]}; + t1 = std::chrono::high_resolution_clock::now(); + auto comm = + preview::spmd::make_communicator( + queue, executorNum, rank, kvs); + t2 = std::chrono::high_resolution_clock::now(); + duration = + (float)std::chrono::duration_cast(t2 - t1) + .count(); + logger::Logger::getInstance(c_breakdown_name).printLogToFile("rankID was %d, create communicator took %f secs.", rank, duration / 1000 ); + return comm; +} + diff --git a/mllib-dal/src/main/native/GPU.h b/mllib-dal/src/main/native/GPU.h index f8d7c25a9..83b3272f0 100644 --- a/mllib-dal/src/main/native/GPU.h +++ b/mllib-dal/src/main/native/GPU.h @@ -6,7 +6,9 @@ #include #include #include +#include "Communicator.hpp" sycl::queue getAssignedGPU(const ComputeDevice device, jint *gpu_indices); sycl::queue getQueue(const ComputeDevice device); +preview::spmd::communicator createDalCommunicator(jint executorNum, jint rank, ccl::string ccl_ip_port); diff --git a/mllib-dal/src/main/native/KMeansImpl.cpp b/mllib-dal/src/main/native/KMeansImpl.cpp index f690c1c45..21ef5e218 100644 --- a/mllib-dal/src/main/native/KMeansImpl.cpp +++ b/mllib-dal/src/main/native/KMeansImpl.cpp @@ -338,25 +338,19 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe } #ifdef CPU_GPU_PROFILE case ComputeDevice::gpu: { - int nGpu = env->GetArrayLength(gpuIdxArray); logger::println( logger::INFO, - "OneDAL (native): use GPU kernels with %d GPU(s) rankid %d", nGpu, - rank); + "OneDAL (native): use GPU 
kernels with rankid %d", rank); - jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0); + const char *str = env->GetStringUTFChars(ip_port, nullptr); + ccl::string ccl_ip_port(str); + auto comm = createDalCommunicator(executorNum, rank, ccl_ip_port); - auto queue = getAssignedGPU(device, gpuIndices); - - ccl::shared_ptr_class &kvs = getKvs(); - auto comm = - preview::spmd::make_communicator( - queue, executorNum, rank, kvs); ret = doKMeansOneAPICompute(env, pNumTabData, numRows, numCols, pNumTabCenters, clusterNum, tolerance, iterationNum, comm, resultObj); - env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0); + env->ReleaseStringUTFChars(ip_port, str); break; } #endif diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/CommonJob.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/CommonJob.scala new file mode 100644 index 000000000..e3e0aab58 --- /dev/null +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/CommonJob.scala @@ -0,0 +1,46 @@ +/* + * Copyright 2020 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.intel.oap.mllib + +import org.apache.spark.TaskContext +import org.apache.spark.rdd.RDD + +object CommonJob { + + def setAffinityMask(data: RDD[_], useDevice: String): Unit = { + data.mapPartitionsWithIndex { (rank, iter) => + val gpuIndices = if (useDevice == "GPU") { + val resources = TaskContext.get().resources() + resources("gpu").addresses.map(_.toInt) + } else { + null + } + OneCCL.setExecutorEnv("ZE_AFFINITY_MASK", gpuIndices(0).toString()) + Iterator.empty + }.count() + } + + def createCCLInit(data: RDD[_], executorNum: Int, kvsIPPort: String, useDevice: String): Unit = { + if (useDevice == "CPU") { + data.mapPartitionsWithIndex { (rank, table) => + OneCCL.init(executorNum, rank, kvsIPPort) + Iterator.empty + }.count() + } + } + +} diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/classification/RandomForestClassifierDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/classification/RandomForestClassifierDALImpl.scala index 70479c79a..6a2da4ac7 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/classification/RandomForestClassifierDALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/classification/RandomForestClassifierDALImpl.scala @@ -16,7 +16,7 @@ package com.intel.oap.mllib.classification import com.intel.oap.mllib.Utils.getOneCCLIPPort -import com.intel.oap.mllib.{OneCCL, OneDAL, Utils} +import com.intel.oap.mllib.{CommonJob, OneCCL, OneDAL, Utils} import com.intel.oneapi.dal.table.Common import org.apache.spark.annotation.Since import org.apache.spark.TaskContext @@ -75,21 +75,8 @@ class RandomForestClassifierDALImpl(val uid: String, rfcTimer.record("Data Convertion") val kvsIPPort = getOneCCLIPPort(labeledPointsTables) - labeledPointsTables.mapPartitionsWithIndex { (rank, iter) => - val gpuIndices = if (useDevice == "GPU") { - val resources = TaskContext.get().resources() - resources("gpu").addresses.map(_.toInt) - } else { - null - } - 
OneCCL.setExecutorEnv("ZE_AFFINITY_MASK", gpuIndices(0).toString()) - Iterator.empty - }.count() - - labeledPointsTables.mapPartitionsWithIndex { (rank, table) => - OneCCL.init(executorNum, rank, kvsIPPort) - Iterator.empty - }.count() + CommonJob.setAffinityMask(labeledPointsTables, useDevice) + CommonJob.createCCLInit(labeledPointsTables, executorNum, kvsIPPort, useDevice) rfcTimer.record("OneCCL Init") val results = labeledPointsTables.mapPartitionsWithIndex { (rank, tables) => diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/clustering/KMeansDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/clustering/KMeansDALImpl.scala index 14eb16800..61dd1ef80 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/clustering/KMeansDALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/clustering/KMeansDALImpl.scala @@ -17,7 +17,7 @@ package com.intel.oap.mllib.clustering import com.intel.oap.mllib.Utils.getOneCCLIPPort -import com.intel.oap.mllib.{OneCCL, OneDAL, Utils} +import com.intel.oap.mllib.{CommonJob, OneCCL, OneDAL, Utils} import com.intel.oneapi.dal.table.Common import org.apache.spark.TaskContext import org.apache.spark.internal.Logging @@ -51,21 +51,9 @@ class KMeansDALImpl(var nClusters: Int, kmeansTimer.record("Data Convertion") val kvsIPPort = getOneCCLIPPort(coalescedTables) - coalescedTables.mapPartitionsWithIndex { (rank, iter) => - val gpuIndices = if (useDevice == "GPU") { - val resources = TaskContext.get().resources() - resources("gpu").addresses.map(_.toInt) - } else { - null - } - OneCCL.setExecutorEnv("ZE_AFFINITY_MASK", gpuIndices(0).toString()) - Iterator.empty - }.count() - - coalescedTables.mapPartitionsWithIndex { (rank, table) => - OneCCL.init(executorNum, rank, kvsIPPort) - Iterator.empty - }.count() + + CommonJob.setAffinityMask(coalescedTables, useDevice) + CommonJob.createCCLInit(coalescedTables, executorNum, kvsIPPort, useDevice) kmeansTimer.record("OneCCL Init") val results = coalescedTables.mapPartitionsWithIndex { (rank, iter) => @@ -118,7 +106,9 @@ class KMeansDALImpl(var nClusters: Int, } else { Iterator.empty } - OneCCL.cleanup() + if (useDevice == "CPU") { + OneCCL.cleanup() + } ret }.collect() diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/feature/PCADALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/feature/PCADALImpl.scala index 7190ade3f..071117cc0 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/feature/PCADALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/feature/PCADALImpl.scala @@ -19,7 +19,7 @@ package com.intel.oap.mllib.feature import java.nio.DoubleBuffer import com.intel.daal.data_management.data.{HomogenNumericTable, NumericTable} import com.intel.oap.mllib.Utils.getOneCCLIPPort -import com.intel.oap.mllib.{OneCCL, OneDAL, Service, Utils} +import com.intel.oap.mllib.{CommonJob, OneCCL, OneDAL, Service, Utils} import org.apache.spark.TaskContext import org.apache.spark.annotation.Since import org.apache.spark.internal.Logging @@ -59,21 +59,8 @@ class PCADALImpl(val k: Int, val kvsIPPort = getOneCCLIPPort(coalescedTables) pcaTimer.record("Data Convertion") - coalescedTables.mapPartitionsWithIndex { (rank, iter) => - val gpuIndices = if (useDevice == "GPU") { - val resources = TaskContext.get().resources() - resources("gpu").addresses.map(_.toInt) - } else { - null - } - OneCCL.setExecutorEnv("ZE_AFFINITY_MASK", gpuIndices(0).toString()) - Iterator.empty - }.count() - - coalescedTables.mapPartitionsWithIndex { (rank, table) => - OneCCL.init(executorNum, rank, kvsIPPort) 
- Iterator.empty - }.count() + CommonJob.setAffinityMask(coalescedTables, useDevice) + CommonJob.createCCLInit(coalescedTables, executorNum, kvsIPPort, useDevice) pcaTimer.record("OneCCL Init") val results = coalescedTables.mapPartitionsWithIndex { (rank, iter) => diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionDALImpl.scala index 40b2f4423..79243f988 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionDALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionDALImpl.scala @@ -17,7 +17,7 @@ package com.intel.oap.mllib.regression import com.intel.oap.mllib.Utils.getOneCCLIPPort -import com.intel.oap.mllib.{OneCCL, OneDAL, Utils} +import com.intel.oap.mllib.{CommonJob, OneCCL, OneDAL, Utils} import com.intel.oneapi.dal.table.Common import org.apache.spark.SparkException import org.apache.spark.TaskContext @@ -106,16 +106,9 @@ class LinearRegressionDALImpl( val fitIntercept: Boolean, } lrTimer.record("Data Convertion") - labeledPointsTables.mapPartitionsWithIndex { (rank, iter) => - val gpuIndices = if (useDevice == "GPU") { - val resources = TaskContext.get().resources() - resources("gpu").addresses.map(_.toInt) - } else { - null - } - OneCCL.setExecutorEnv("ZE_AFFINITY_MASK", gpuIndices(0).toString()) - Iterator.empty - }.count() + CommonJob.setAffinityMask(labeledPointsTables, useDevice) + CommonJob.createCCLInit(labeledPointsTables, executorNum, kvsIPPort, useDevice) + lrTimer.record("OneCCL Init") val results = labeledPointsTables.mapPartitionsWithIndex { (rank, tables) => val (feature, label) = tables.next() @@ -132,7 +125,6 @@ class LinearRegressionDALImpl( val fitIntercept: Boolean, (label.toString.toLong, 0L, 0L) } - OneCCL.init(executorNum, rank, kvsIPPort) val result = new LiRResult() val gpuIndices = if (useDevice == "GPU") { diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/RandomForestRegressorDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/RandomForestRegressorDALImpl.scala index 77ea4c656..100be8823 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/RandomForestRegressorDALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/RandomForestRegressorDALImpl.scala @@ -17,7 +17,7 @@ package com.intel.oap.mllib.regression import com.intel.oap.mllib.Utils.getOneCCLIPPort import com.intel.oap.mllib.classification.{LearningNode, RandomForestResult} -import com.intel.oap.mllib.{OneCCL, OneDAL, Utils} +import com.intel.oap.mllib.{CommonJob, OneCCL, OneDAL, Utils} import com.intel.oneapi.dal.table.Common import org.apache.spark.TaskContext import org.apache.spark.internal.Logging @@ -69,21 +69,8 @@ class RandomForestRegressorDALImpl(val uid: String, val kvsIPPort = getOneCCLIPPort(labeledPointsTables) - labeledPointsTables.mapPartitionsWithIndex { (rank, iter) => - val gpuIndices = if (useDevice == "GPU") { - val resources = TaskContext.get().resources() - resources("gpu").addresses.map(_.toInt) - } else { - null - } - OneCCL.setExecutorEnv("ZE_AFFINITY_MASK", gpuIndices(0).toString()) - Iterator.empty - }.count() - - labeledPointsTables.mapPartitionsWithIndex { (rank, table) => - OneCCL.init(executorNum, rank, kvsIPPort) - Iterator.empty - }.count() + CommonJob.setAffinityMask(labeledPointsTables, useDevice) + CommonJob.createCCLInit(labeledPointsTables, executorNum, kvsIPPort, useDevice) rfrTimer.record("OneCCL Init") 
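// ---------------------------------------------------------------------------
// Editor's note (sketch, not part of the patch): with CommonJob in place, every
// DAL implementation converges on the same setup/teardown sequence. A condensed
// view of that sequence, assuming the oap-mllib classes below are on the
// classpath with the signatures used in this patch; TrainingJobSketch, run and
// `train` (the per-algorithm JNI call) are illustrative names.
import com.intel.oap.mllib.{CommonJob, OneCCL, Utils}
import org.apache.spark.rdd.RDD

object TrainingJobSketch {
  def run(tables: RDD[_], executorNum: Int, useDevice: String)(train: => Unit): Unit = {
    val kvsIPPort = Utils.getOneCCLIPPort(tables)
    CommonJob.setAffinityMask(tables, useDevice)                       // export ZE_AFFINITY_MASK per task
    CommonJob.createCCLInit(tables, executorNum, kvsIPPort, useDevice) // JVM-side CCL init, CPU path only
    train                                                              // native compute job
    if (useDevice == "CPU") {
      OneCCL.cleanup() // the GPU path builds and tears down its communicator natively
    }
  }
}
// ---------------------------------------------------------------------------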
val results = labeledPointsTables.mapPartitionsWithIndex { (rank, tables) => diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/CorrelationDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/CorrelationDALImpl.scala index fff2d4ac5..04a3760bb 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/CorrelationDALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/CorrelationDALImpl.scala @@ -17,7 +17,7 @@ package com.intel.oap.mllib.stat import com.intel.oap.mllib.Utils.getOneCCLIPPort -import com.intel.oap.mllib.{OneCCL, OneDAL, Utils} +import com.intel.oap.mllib.{CommonJob, OneCCL, OneDAL, Utils} import com.intel.oneapi.dal.table.Common import org.apache.spark.TaskContext import org.apache.spark.internal.Logging @@ -46,23 +46,10 @@ class CorrelationDALImpl( val kvsIPPort = getOneCCLIPPort(coalescedTables) - coalescedTables.mapPartitionsWithIndex { (rank, table) => - OneCCL.init(executorNum, rank, kvsIPPort) - Iterator.empty - }.count() + CommonJob.setAffinityMask(coalescedTables, useDevice) + CommonJob.createCCLInit(coalescedTables, executorNum, kvsIPPort, useDevice) corTimer.record("OneCCL Init") - coalescedTables.mapPartitionsWithIndex { (rank, iter) => - val gpuIndices = if (useDevice == "GPU") { - val resources = TaskContext.get().resources() - resources("gpu").addresses.map(_.toInt) - } else { - null - } - OneCCL.setExecutorEnv("ZE_AFFINITY_MASK", gpuIndices(0).toString()) - Iterator.empty - }.count() - val results = coalescedTables.mapPartitionsWithIndex { (rank, iter) => val (tableArr : Long, rows : Long, columns : Long) = if (useDevice == "GPU") { iter.next() diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/SummarizerDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/SummarizerDALImpl.scala index dcde7ef91..c8422b097 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/SummarizerDALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/SummarizerDALImpl.scala @@ -16,7 +16,7 @@ package com.intel.oap.mllib.stat -import com.intel.oap.mllib.{OneCCL, OneDAL, Utils} +import com.intel.oap.mllib.{CommonJob, OneCCL, OneDAL, Utils} import org.apache.spark.TaskContext import org.apache.spark.internal.Logging import org.apache.spark.ml.linalg.Vector @@ -47,23 +47,10 @@ class SummarizerDALImpl(val executorNum: Int, val kvsIPPort = getOneCCLIPPort(data) - coalescedTables.mapPartitionsWithIndex { (rank, table) => - OneCCL.init(executorNum, rank, kvsIPPort) - Iterator.empty - }.count() + CommonJob.setAffinityMask(coalescedTables, useDevice) + CommonJob.createCCLInit(coalescedTables, executorNum, kvsIPPort, useDevice) sumTimer.record("OneCCL Init") - coalescedTables.mapPartitionsWithIndex { (rank, iter) => - val gpuIndices = if (useDevice == "GPU") { - val resources = TaskContext.get().resources() - resources("gpu").addresses.map(_.toInt) - } else { - null - } - OneCCL.setExecutorEnv("ZE_AFFINITY_MASK", gpuIndices(0).toString()) - Iterator.empty - }.count() - val results = coalescedTables.mapPartitionsWithIndex { (rank, iter) => val (tableArr : Long, rows : Long, columns : Long) = if (useDevice == "GPU") { iter.next() diff --git a/mllib-dal/src/test/scala/com/intel/oap/mllib/ConvertHomogenTableSuite.scala b/mllib-dal/src/test/scala/com/intel/oap/mllib/ConvertHomogenTableSuite.scala index bbb6bbe7e..3246387b3 100644 --- a/mllib-dal/src/test/scala/com/intel/oap/mllib/ConvertHomogenTableSuite.scala +++ b/mllib-dal/src/test/scala/com/intel/oap/mllib/ConvertHomogenTableSuite.scala @@ -57,7 +57,7 @@ class 
ConvertHomogenTableSuite extends FunctionsSuite with Logging { val metadata = table.getMetaData for (i <- 0 until 10) { assert(metadata.getDataType(i) == FLOAT64) - assert(metadata.getFeatureType(i) == Common.FeatureType.RATIO) + assert(metadata.getFeatureType(i) == CommonJob.FeatureType.RATIO) } assertArrayEquals(table.getDoubleData, TestCommon.convertArray(data)) @@ -75,7 +75,7 @@ class ConvertHomogenTableSuite extends FunctionsSuite with Logging { val metadata = table.getMetaData for (i <- 0 until 10) { assert(metadata.getDataType(i) == FLOAT64) - assert(metadata.getFeatureType(i) == Common.FeatureType.RATIO) + assert(metadata.getFeatureType(i) == CommonJob.FeatureType.RATIO) } assertArrayEquals(table.getDoubleData, data) @@ -105,7 +105,7 @@ class ConvertHomogenTableSuite extends FunctionsSuite with Logging { val metadata = table.getMetaData for (i <- 0 until 10) { assert(metadata.getDataType(i) == FLOAT64) - assert(metadata.getFeatureType(i) == Common.FeatureType.RATIO) + assert(metadata.getFeatureType(i) == CommonJob.FeatureType.RATIO) } assertArrayEquals(table.getDoubleData, TestCommon.convertArray(data)) diff --git a/mllib-dal/src/test/scala/com/intel/oap/mllib/CorrelationHomogenTableSuite.scala b/mllib-dal/src/test/scala/com/intel/oap/mllib/CorrelationHomogenTableSuite.scala index 34361766d..98d37a338 100644 --- a/mllib-dal/src/test/scala/com/intel/oap/mllib/CorrelationHomogenTableSuite.scala +++ b/mllib-dal/src/test/scala/com/intel/oap/mllib/CorrelationHomogenTableSuite.scala @@ -45,7 +45,7 @@ class CorrelationHomogenTableSuite extends FunctionsSuite with Logging { val correlationDAL = new CorrelationDALImpl(1, 1) val gpuIndices = Array(0) val result = new CorrelationResult() - correlationDAL.cCorrelationTrainDAL(dataTable.getcObejct(), sourceData.length, sourceData(0).length, 1, 1, Common.ComputeDevice.HOST.ordinal(), gpuIndices, result); + correlationDAL.cCorrelationTrainDAL(dataTable.getcObejct(), sourceData.length, sourceData(0).length, 1, 1, CommonJob.ComputeDevice.HOST.ordinal(), gpuIndices, result); val correlationMatrix = TestCommon.getMatrixFromTable(OneDAL.makeHomogenTable( result.getCorrelationNumericTable), TestCommon.getComputeDevice) diff --git a/mllib-dal/src/test/scala/com/intel/oap/mllib/SummarizerHomogenTableSuite.scala b/mllib-dal/src/test/scala/com/intel/oap/mllib/SummarizerHomogenTableSuite.scala index 712cccbfa..5917af2e1 100644 --- a/mllib-dal/src/test/scala/com/intel/oap/mllib/SummarizerHomogenTableSuite.scala +++ b/mllib-dal/src/test/scala/com/intel/oap/mllib/SummarizerHomogenTableSuite.scala @@ -31,15 +31,15 @@ class SummarizerHomogenTableSuite extends FunctionsSuite with Logging{ val sourceData = TestCommon.readCSV("src/test/resources/data/covcormoments_dense.csv") - val dataTable = new HomogenTable(sourceData.length, sourceData(0).length, TestCommon.convertArray(sourceData), Common.ComputeDevice.HOST); + val dataTable = new HomogenTable(sourceData.length, sourceData(0).length, TestCommon.convertArray(sourceData), CommonJob.ComputeDevice.HOST); val summarizerDAL = new SummarizerDALImpl(1, 1) val gpuIndices = Array(0) val result = new SummarizerResult() - summarizerDAL.cSummarizerTrainDAL(dataTable.getcObejct(), sourceData.length, sourceData(0).length, 1, 1, Common.ComputeDevice.HOST.ordinal(), gpuIndices, result) - val meanTable = OneDAL.homogenTable1xNToVector(OneDAL.makeHomogenTable(result.getMeanNumericTable), Common.ComputeDevice.HOST) - val varianceTable = OneDAL.homogenTable1xNToVector(OneDAL.makeHomogenTable(result.getVarianceNumericTable), 
Common.ComputeDevice.HOST) - val minimumTable = OneDAL.homogenTable1xNToVector(OneDAL.makeHomogenTable(result.getMinimumNumericTable), Common.ComputeDevice.HOST) - val maximumTable = OneDAL.homogenTable1xNToVector(OneDAL.makeHomogenTable(result.getMaximumNumericTable), Common.ComputeDevice.HOST) + summarizerDAL.cSummarizerTrainDAL(dataTable.getcObejct(), sourceData.length, sourceData(0).length, 1, 1, CommonJob.ComputeDevice.HOST.ordinal(), gpuIndices, result) + val meanTable = OneDAL.homogenTable1xNToVector(OneDAL.makeHomogenTable(result.getMeanNumericTable), CommonJob.ComputeDevice.HOST) + val varianceTable = OneDAL.homogenTable1xNToVector(OneDAL.makeHomogenTable(result.getVarianceNumericTable), CommonJob.ComputeDevice.HOST) + val minimumTable = OneDAL.homogenTable1xNToVector(OneDAL.makeHomogenTable(result.getMinimumNumericTable), CommonJob.ComputeDevice.HOST) + val maximumTable = OneDAL.homogenTable1xNToVector(OneDAL.makeHomogenTable(result.getMaximumNumericTable), CommonJob.ComputeDevice.HOST) assertArrayEquals(expectMean , meanTable.toArray, 0.000001) assertArrayEquals(expectVariance, varianceTable.toDense.values, 0.000001) diff --git a/mllib-dal/src/test/scala/com/intel/oap/mllib/TestCommon.scala b/mllib-dal/src/test/scala/com/intel/oap/mllib/TestCommon.scala index 5a2ecef27..9ae20cec4 100644 --- a/mllib-dal/src/test/scala/com/intel/oap/mllib/TestCommon.scala +++ b/mllib-dal/src/test/scala/com/intel/oap/mllib/TestCommon.scala @@ -84,7 +84,7 @@ object TestCommon { arrayDouble } def getMatrixFromTable(table: HomogenTable, - device: Common.ComputeDevice): DenseMatrix = { + device: CommonJob.ComputeDevice): DenseMatrix = { val numRows = table.getRowCount.toInt val numCols = table.getColumnCount.toInt // returned DoubleBuffer is ByteByffer, need to copy as double array @@ -97,14 +97,14 @@ object TestCommon { matrix } - def getComputeDevice: Common.ComputeDevice = { + def getComputeDevice: CommonJob.ComputeDevice = { val device = System.getProperty("computeDevice") - var computeDevice: Common.ComputeDevice = Common.ComputeDevice.HOST + var computeDevice: CommonJob.ComputeDevice = CommonJob.ComputeDevice.HOST if(device != null) { device.toUpperCase match { - case "HOST" => computeDevice = Common.ComputeDevice.HOST - case "CPU" => computeDevice = Common.ComputeDevice.CPU - case "GPU" => computeDevice = Common.ComputeDevice.GPU + case "HOST" => computeDevice = CommonJob.ComputeDevice.HOST + case "CPU" => computeDevice = CommonJob.ComputeDevice.CPU + case "GPU" => computeDevice = CommonJob.ComputeDevice.GPU case _ => "Invalid Device" } } From 00e411a94e3cfbce8287bfbecaf9d06bc3739455 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 19 Aug 2024 13:50:57 +0800 Subject: [PATCH 6/9] update optimize code --- mllib-dal/src/main/native/CorrelationImpl.cpp | 18 +++---- .../native/DecisionForestClassifierImpl.cpp | 19 +++---- .../native/DecisionForestRegressorImpl.cpp | 17 +++--- mllib-dal/src/main/native/GPU.cpp | 6 --- mllib-dal/src/main/native/KMeansImpl.cpp | 2 +- .../src/main/native/LinearRegressionImpl.cpp | 50 ++++++++---------- mllib-dal/src/main/native/OneCCL.cpp | 52 +++---------------- mllib-dal/src/main/native/PCAImpl.cpp | 18 +++---- mllib-dal/src/main/native/SummarizerImpl.cpp | 18 +++---- .../javah/com_intel_oap_mllib_OneCCL__.h | 2 +- ...intel_oap_mllib_clustering_KMeansDALImpl.h | 2 +- .../com_intel_oap_mllib_feature_PCADALImpl.h | 2 +- ...mllib_regression_LinearRegressionDALImpl.h | 2 +- ..._regression_RandomForestRegressorDALImpl.h | 2 +- 
..._intel_oap_mllib_stat_CorrelationDALImpl.h | 2 +- ...m_intel_oap_mllib_stat_SummarizerDALImpl.h | 2 +- .../RandomForestClassifierDALImpl.scala | 2 + .../oap/mllib/clustering/KMeansDALImpl.scala | 2 + .../intel/oap/mllib/feature/PCADALImpl.scala | 2 + .../regression/LinearRegressionDALImpl.scala | 2 + .../RandomForestRegressorDALImpl.scala | 2 + .../oap/mllib/stat/CorrelationDALImpl.scala | 2 + .../oap/mllib/stat/SummarizerDALImpl.scala | 2 + 23 files changed, 80 insertions(+), 148 deletions(-) diff --git a/mllib-dal/src/main/native/CorrelationImpl.cpp b/mllib-dal/src/main/native/CorrelationImpl.cpp index a9103102f..dd21890f4 100644 --- a/mllib-dal/src/main/native/CorrelationImpl.cpp +++ b/mllib-dal/src/main/native/CorrelationImpl.cpp @@ -199,7 +199,7 @@ JNIEXPORT jlong JNICALL Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL( JNIEnv *env, jobject obj, jint rank, jlong pNumTabData, jlong numRows, jlong numCols, jint executorNum, jint executorCores, jint computeDeviceOrdinal, - jintArray gpuIdxArray, jobject resultObj) { + jintArray gpuIdxArray, jstring ip_port, jobject resultObj) { logger::println(logger::INFO, "oneDAL (native): use DPC++ kernels; device %s", ComputeDeviceString[computeDeviceOrdinal].c_str()); @@ -225,23 +225,17 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL( } #ifdef CPU_GPU_PROFILE case ComputeDevice::gpu: { - int nGpu = env->GetArrayLength(gpuIdxArray); logger::println( logger::INFO, - "oneDAL (native): use GPU kernels with %d GPU(s) rankid %d", nGpu, - rank); + "oneDAL (native): use GPU kernels with rankid %d", rank); - jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0); + const char *str = env->GetStringUTFChars(ip_port, nullptr); + ccl::string ccl_ip_port(str); + auto comm = createDalCommunicator(executorNum, rank, ccl_ip_port); - auto queue = getAssignedGPU(device, gpuIndices); - - ccl::shared_ptr_class &kvs = getKvs(); - auto comm = - preview::spmd::make_communicator( - queue, executorNum, rank, kvs); doCorrelationOneAPICompute(env, pNumTabData, numRows, numCols, comm, resultObj); - env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0); + env->ReleaseStringUTFChars(ip_port, str); break; } #endif diff --git a/mllib-dal/src/main/native/DecisionForestClassifierImpl.cpp b/mllib-dal/src/main/native/DecisionForestClassifierImpl.cpp index aad8d9048..64a0782ca 100644 --- a/mllib-dal/src/main/native/DecisionForestClassifierImpl.cpp +++ b/mllib-dal/src/main/native/DecisionForestClassifierImpl.cpp @@ -307,28 +307,20 @@ Java_com_intel_oap_mllib_classification_RandomForestClassifierDALImpl_cRFClassif jint minObservationsSplitNode, jdouble minWeightFractionLeafNode, jdouble minImpurityDecreaseSplitNode, jint maxTreeDepth, jlong seed, jint maxBins, jboolean bootstrap, jintArray gpuIdxArray, - jobject resultObj) { + jstring ip_port, jobject resultObj) { logger::println(logger::INFO, "oneDAL (native): use DPC++ kernels"); ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal); switch (device) { case ComputeDevice::gpu: { - int nGpu = env->GetArrayLength(gpuIdxArray); logger::println( logger::INFO, - "oneDAL (native): use GPU kernels with %d GPU(s) rankid %d", nGpu, - rank); - - jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0); - - ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal); + "oneDAL (native): use GPU kernels with rankid %d", rank); - auto queue = getAssignedGPU(device, gpuIndices); + const char *str = env->GetStringUTFChars(ip_port, nullptr); + ccl::string 
ccl_ip_port(str); + auto comm = createDalCommunicator(executorNum, rank, ccl_ip_port); - ccl::shared_ptr_class &kvs = getKvs(); - auto comm = - preview::spmd::make_communicator( - queue, executorNum, rank, kvs); jobject hashmapObj = doRFClassifierOneAPICompute( env, pNumTabFeature, featureRows, featureCols, pNumTabLabel, labelCols, executorNum, computeDeviceOrdinal, classCount, treeCount, @@ -336,6 +328,7 @@ Java_com_intel_oap_mllib_classification_RandomForestClassifierDALImpl_cRFClassif minObservationsSplitNode, minWeightFractionLeafNode, minImpurityDecreaseSplitNode, maxTreeDepth, seed, maxBins, bootstrap, comm, resultObj); + env->ReleaseStringUTFChars(ip_port, str); return hashmapObj; } default: { diff --git a/mllib-dal/src/main/native/DecisionForestRegressorImpl.cpp b/mllib-dal/src/main/native/DecisionForestRegressorImpl.cpp index 853f736de..c757d12a1 100644 --- a/mllib-dal/src/main/native/DecisionForestRegressorImpl.cpp +++ b/mllib-dal/src/main/native/DecisionForestRegressorImpl.cpp @@ -296,7 +296,7 @@ Java_com_intel_oap_mllib_regression_RandomForestRegressorDALImpl_cRFRegressorTra jlong featureCols, jlong pNumTabLabel, jlong labelCols, jint executorNum, jint computeDeviceOrdinal, jint treeCount, jint numFeaturesPerNode, jint minObservationsLeafNode, jint maxTreeDepth, jlong seed, jint maxbins, - jboolean bootstrap, jintArray gpuIdxArray, jobject resultObj) { + jboolean bootstrap, jintArray gpuIdxArray, jstring ip_port, jobject resultObj) { logger::println(logger::INFO, "OneDAL (native): use DPC++ kernels; device %s", ComputeDeviceString[computeDeviceOrdinal].c_str()); @@ -304,25 +304,20 @@ Java_com_intel_oap_mllib_regression_RandomForestRegressorDALImpl_cRFRegressorTra ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal); switch (device) { case ComputeDevice::gpu: { - int nGpu = env->GetArrayLength(gpuIdxArray); logger::println( logger::INFO, - "OneDAL (native): use GPU kernels with %d GPU(s) rankid %d", nGpu, - rank); + "OneDAL (native): use GPU kernels with rankid %d", rank); - jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0); + const char *str = env->GetStringUTFChars(ip_port, nullptr); + ccl::string ccl_ip_port(str); + auto comm = createDalCommunicator(executorNum, rank, ccl_ip_port); - auto queue = getAssignedGPU(device, gpuIndices); - - ccl::shared_ptr_class &kvs = getKvs(); - auto comm = - preview::spmd::make_communicator( - queue, executorNum, rank, kvs); jobject hashmapObj = doRFRegressorOneAPICompute( env, pNumTabFeature, featureRows, featureCols, pNumTabLabel, labelCols, executorNum, computeDeviceOrdinal, treeCount, numFeaturesPerNode, minObservationsLeafNode, maxTreeDepth, seed, maxbins, bootstrap, comm, resultObj); + env->ReleaseStringUTFChars(ip_port, str); return hashmapObj; } default: { diff --git a/mllib-dal/src/main/native/GPU.cpp b/mllib-dal/src/main/native/GPU.cpp index 5be0223a4..2454fadef 100644 --- a/mllib-dal/src/main/native/GPU.cpp +++ b/mllib-dal/src/main/native/GPU.cpp @@ -25,7 +25,6 @@ static std::vector get_gpus() { } static int getLocalRank(ccl::communicator &comm, int size, int rank) { - const int MPI_MAX_PROCESSOR_NAME = 128; /* Obtain local rank among nodes sharing the same host name */ char zero = static_cast(0); std::vector name(MPI_MAX_PROCESSOR_NAME + 1, zero); @@ -128,8 +127,6 @@ preview::spmd::communicator createDalC logger::println(logger::INFO, "OneCCL singleton init took %f secs", duration / 1000); - logger::Logger::getInstance(c_breakdown_name).printLogToFile("rankID was %d, OneCCL singleton init took %f secs.", 
rank, duration / 1000 ); - t1 = std::chrono::high_resolution_clock::now(); @@ -145,7 +142,6 @@ preview::spmd::communicator createDalC .count(); logger::println(logger::INFO, "OneCCL (native): create kvs took %f secs", duration / 1000); - logger::Logger::getInstance(c_breakdown_name).printLogToFile("rankID was %d, OneCCL create communicator took %f secs.", rank, duration / 1000 ); sycl::queue queue{gpus[0]}; t1 = std::chrono::high_resolution_clock::now(); auto comm = @@ -155,7 +151,5 @@ preview::spmd::communicator createDalC duration = (float)std::chrono::duration_cast(t2 - t1) .count(); - logger::Logger::getInstance(c_breakdown_name).printLogToFile("rankID was %d, create communicator took %f secs.", rank, duration / 1000 ); return comm; } - diff --git a/mllib-dal/src/main/native/KMeansImpl.cpp b/mllib-dal/src/main/native/KMeansImpl.cpp index 21ef5e218..b868fd475 100644 --- a/mllib-dal/src/main/native/KMeansImpl.cpp +++ b/mllib-dal/src/main/native/KMeansImpl.cpp @@ -308,7 +308,7 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe JNIEnv *env, jobject obj, jint rank, jlong pNumTabData, jlong numRows, jlong numCols, jlong pNumTabCenters, jint clusterNum, jdouble tolerance, jint iterationNum, jint executorNum, jint executorCores, jint computeDeviceOrdinal, - jintArray gpuIdxArray, jobject resultObj) { + jintArray gpuIdxArray, jstring ip_port, jobject resultObj) { logger::println(logger::INFO, "OneDAL (native): use DPC++ kernels; device %s", ComputeDeviceString[computeDeviceOrdinal].c_str()); diff --git a/mllib-dal/src/main/native/LinearRegressionImpl.cpp b/mllib-dal/src/main/native/LinearRegressionImpl.cpp index ca94b54c5..7518a23b6 100644 --- a/mllib-dal/src/main/native/LinearRegressionImpl.cpp +++ b/mllib-dal/src/main/native/LinearRegressionImpl.cpp @@ -215,7 +215,7 @@ ridge_regression_compute(size_t rankId, ccl::communicator &comm, #ifdef CPU_GPU_PROFILE static jlong doLROneAPICompute(JNIEnv *env, size_t rankId, - ccl::communicator &cclComm, sycl::queue &queue, + preview::spmd::communicator comm, jlong pNumTabFeature, jlong featureRows, jlong featureCols, jlong pNumTabLabel, jlong labelCols, jboolean jfitIntercept, @@ -225,9 +225,6 @@ static jlong doLROneAPICompute(JNIEnv *env, size_t rankId, const bool isRoot = (rankId == ccl_root); bool fitIntercept = bool(jfitIntercept); - ccl::shared_ptr_class &kvs = getKvs(); - auto comm = preview::spmd::make_communicator( - queue, executorNum, rankId, kvs); homogen_table xtrain = *reinterpret_cast( createHomogenTableWithArrayPtr(pNumTabFeature, featureRows, featureCols, comm.get_queue()) @@ -265,7 +262,7 @@ Java_com_intel_oap_mllib_regression_LinearRegressionDALImpl_cLinearRegressionTra jlong featureCols, jlong label, jlong labelCols, jboolean fitIntercept, jdouble regParam, jdouble elasticNetParam, jint executorNum, jint executorCores, jint computeDeviceOrdinal, jintArray gpuIdxArray, - jobject resultObj) { + jstring ip_port, jobject resultObj) { logger::println(logger::INFO, "oneDAL (native): use DPC++ kernels; device %s", @@ -280,22 +277,23 @@ Java_com_intel_oap_mllib_regression_LinearRegressionDALImpl_cLinearRegressionTra jlong resultptr = 0L; if (useGPU) { #ifdef CPU_GPU_PROFILE - int nGpu = env->GetArrayLength(gpuIdxArray); logger::println( logger::INFO, - "oneDAL (native): use GPU kernels with %d GPU(s) rankid %d", nGpu, - rank); + "oneDAL (native): use GPU kernels with rankid %d", rank); - jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0); - int size = cclComm.size(); - auto queue = 
getAssignedGPU(device, gpuIndices); + const char *str = env->GetStringUTFChars(ip_port, nullptr); + ccl::string ccl_ip_port(str); + auto comm = createDalCommunicator(executorNum, rank, ccl_ip_port); resultptr = doLROneAPICompute( - env, rank, cclComm, queue, feature, featureRows, featureCols, + env, rank, comm, feature, featureRows, featureCols, label, labelCols, fitIntercept, executorNum, resultObj); - env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0); + env->ReleaseStringUTFChars(ip_port, str); #endif } else { + ccl::communicator &cclComm = getComm(); + size_t rankId = cclComm.rank(); + NumericTablePtr pLabel = *((NumericTablePtr *)label); NumericTablePtr pData = *((NumericTablePtr *)feature); @@ -318,22 +316,18 @@ Java_com_intel_oap_mllib_regression_LinearRegressionDALImpl_cLinearRegressionTra NumericTablePtr *coeffvectors = new NumericTablePtr(resultTable); resultptr = (jlong)coeffvectors; - } - - jlong ret = 0L; - if (rankId == ccl_root) { - // Get the class of the result object - jclass clazz = env->GetObjectClass(resultObj); - // Get Field references - jfieldID coeffNumericTableField = - env->GetFieldID(clazz, "coeffNumericTable", "J"); + if (rankId == ccl_root) { + // Get the class of the result object + jclass clazz = env->GetObjectClass(resultObj); + // Get Field references + jfieldID coeffNumericTableField = + env->GetFieldID(clazz, "coeffNumericTable", "J"); - env->SetLongField(resultObj, coeffNumericTableField, resultptr); + env->SetLongField(resultObj, coeffNumericTableField, resultptr); - // intercept is already in first column of coeffvectors - ret = resultptr; - } else { - ret = (jlong)0; + // intercept is already in first column of coeffvectors + resultptr = (jlong)coeffvectors; + } } - return ret; + return resultptr; } diff --git a/mllib-dal/src/main/native/OneCCL.cpp b/mllib-dal/src/main/native/OneCCL.cpp index b924c6987..988dd844a 100644 --- a/mllib-dal/src/main/native/OneCCL.cpp +++ b/mllib-dal/src/main/native/OneCCL.cpp @@ -55,72 +55,32 @@ JNIEXPORT jint JNICALL Java_com_intel_oap_mllib_OneCCL_00024_c_1init( auto t1 = std::chrono::high_resolution_clock::now(); ccl::init(); + auto t2 = std::chrono::high_resolution_clock::now(); + const char *str = env->GetStringUTFChars(ip_port, 0); ccl::string ccl_ip_port(str); - const char *device = env->GetStringUTFChars(use_device, 0); - ccl::string ccl_ip_port(str); auto &singletonCCLInit = CCLInitSingleton::get(size, rank, ccl_ip_port); g_kvs.push_back(singletonCCLInit.kvs); - -#ifdef CPU_ONLY_PROFILE g_comms.push_back( ccl::create_communicator(size, rank, singletonCCLInit.kvs)); - - auto t2 = std::chrono::high_resolution_clock::now(); auto duration = (float)std::chrono::duration_cast(t2 - t1) .count(); logger::println(logger::INFO, "OneCCL (native): init took %f secs", duration / 1000); -#endif - - jclass cls = env->GetObjectClass(param); - jfieldID fid_comm_size = env->GetFieldID(cls, "commSize", "J"); - jfieldID fid_rank_id = env->GetFieldID(cls, "rankId", "J"); - - env->SetLongField(param, size, comm_size); - env->SetLongField(param, rank, rank_id); - env->ReleaseStringUTFChars(ip_port, str); - - return 1; -} - -/* - * Class: com_intel_oap_mllib_OneCCL__ - * Method: c_init - * Signature: ()I - */ -JNIEXPORT jint JNICALL -Java_com_intel_oap_mllib_OneCCL_00024_c_1initDpcpp(JNIEnv *env, jobject, jint size, jint rank, jobject param) { - logger::printerrln(logger::INFO, "OneCCL (native): init dpcpp"); - auto t1 = std::chrono::high_resolution_clock::now(); - ccl::init(); - - const char *str = env->GetStringUTFChars(ip_port, 
0); - ccl::string ccl_ip_port(str); - - auto &singletonCCLInit = CCLInitSingleton::get(size, rank, ccl_ip_port); - - g_kvs.push_back(singletonCCLInit.kvs); - - - auto t2 = std::chrono::high_resolution_clock::now(); - auto duration = - (float)std::chrono::duration_cast(t2 - t1) - .count(); - logger::println(logger::INFO, "OneCCL (native): init took %f secs", - duration / 1000); + rank_id = getComm().rank(); + comm_size = getComm().size(); jclass cls = env->GetObjectClass(param); jfieldID fid_comm_size = env->GetFieldID(cls, "commSize", "J"); jfieldID fid_rank_id = env->GetFieldID(cls, "rankId", "J"); - env->SetLongField(param, size, comm_size); - env->SetLongField(param, rank, rank_id); + env->SetLongField(param, fid_comm_size, comm_size); + env->SetLongField(param, fid_rank_id, rank_id); env->ReleaseStringUTFChars(ip_port, str); return 1; diff --git a/mllib-dal/src/main/native/PCAImpl.cpp b/mllib-dal/src/main/native/PCAImpl.cpp index f2821d558..67949341d 100644 --- a/mllib-dal/src/main/native/PCAImpl.cpp +++ b/mllib-dal/src/main/native/PCAImpl.cpp @@ -252,7 +252,7 @@ JNIEXPORT jlong JNICALL Java_com_intel_oap_mllib_feature_PCADALImpl_cPCATrainDAL( JNIEnv *env, jobject obj, jint rank, jlong pNumTabData, jlong numRows, jlong numCols, jint executorNum, jint executorCores, jint computeDeviceOrdinal, - jintArray gpuIdxArray, jobject resultObj) { + jintArray gpuIdxArray, jstring ip_port, jobject resultObj) { logger::println(logger::INFO, "oneDAL (native): use DPC++ kernels; device %s", ComputeDeviceString[computeDeviceOrdinal].c_str()); @@ -277,22 +277,16 @@ Java_com_intel_oap_mllib_feature_PCADALImpl_cPCATrainDAL( } #ifdef CPU_GPU_PROFILE case ComputeDevice::gpu: { - int nGpu = env->GetArrayLength(gpuIdxArray); logger::println( logger::INFO, - "oneDAL (native): use GPU kernels with %d GPU(s) rankid %d", nGpu, - rank); + "oneDAL (native): use GPU kernels with rankid %d", rank); - jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0); + const char *str = env->GetStringUTFChars(ip_port, nullptr); + ccl::string ccl_ip_port(str); + auto comm = createDalCommunicator(executorNum, rank, ccl_ip_port); - auto queue = getAssignedGPU(device, gpuIndices); - - ccl::shared_ptr_class &kvs = getKvs(); - auto comm = - preview::spmd::make_communicator( - queue, executorNum, rank, kvs); doPCAOneAPICompute(env, pNumTabData, numRows, numCols, comm, resultObj); - env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0); + env->ReleaseStringUTFChars(ip_port, str); break; } #endif diff --git a/mllib-dal/src/main/native/SummarizerImpl.cpp b/mllib-dal/src/main/native/SummarizerImpl.cpp index 9af30d939..852db8b03 100644 --- a/mllib-dal/src/main/native/SummarizerImpl.cpp +++ b/mllib-dal/src/main/native/SummarizerImpl.cpp @@ -270,7 +270,7 @@ JNIEXPORT jlong JNICALL Java_com_intel_oap_mllib_stat_SummarizerDALImpl_cSummarizerTrainDAL( JNIEnv *env, jobject obj, jint rank, jlong pNumTabData, jlong numRows, jlong numCols, jint executorNum, jint executorCores, jint computeDeviceOrdinal, - jintArray gpuIdxArray, jobject resultObj) { + jintArray gpuIdxArray, jstring ip_port, jobject resultObj) { logger::println(logger::INFO, "oneDAL (native): use DPC++ kernels; device %s", ComputeDeviceString[computeDeviceOrdinal].c_str()); @@ -295,22 +295,16 @@ Java_com_intel_oap_mllib_stat_SummarizerDALImpl_cSummarizerTrainDAL( } #ifdef CPU_GPU_PROFILE case ComputeDevice::gpu: { - int nGpu = env->GetArrayLength(gpuIdxArray); logger::println( logger::INFO, - "oneDAL (native): use GPU kernels with %d GPU(s) rankid %d", nGpu, - rank); + 
"oneDAL (native): use GPU kernels with rankid %d", rank); + const char *str = env->GetStringUTFChars(ip_port, nullptr); + ccl::string ccl_ip_port(str); + auto comm = createDalCommunicator(executorNum, rank, ccl_ip_port); - jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0); - auto queue = getAssignedGPU(device, gpuIndices); - - ccl::shared_ptr_class &kvs = getKvs(); - auto comm = - preview::spmd::make_communicator( - queue, executorNum, rank, kvs); doSummarizerOneAPICompute(env, pNumTabData, numRows, numCols, comm, resultObj); - env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0); + env->ReleaseStringUTFChars(ip_port, str); break; } #endif diff --git a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_OneCCL__.h b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_OneCCL__.h index 4bfa1d0c3..a89b7d214 100644 --- a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_OneCCL__.h +++ b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_OneCCL__.h @@ -45,7 +45,7 @@ JNIEXPORT jint JNICALL Java_com_intel_oap_mllib_OneCCL_00024_c_1getAvailPort * Signature: (IILjava/lang/String;Lcom/intel/oap/mllib/CCLParam;)I */ JNIEXPORT jint JNICALL Java_com_intel_oap_mllib_OneCCL_00024_c_1init - (JNIEnv *, jobject, jint, jint, jstring, jstring, jobject); + (JNIEnv *, jobject, jint, jint, jstring, jobject); /* * Class: com_intel_oap_mllib_OneCCL__ diff --git a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_clustering_KMeansDALImpl.h b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_clustering_KMeansDALImpl.h index a0fc24dde..9a00db0a2 100644 --- a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_clustering_KMeansDALImpl.h +++ b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_clustering_KMeansDALImpl.h @@ -13,7 +13,7 @@ extern "C" { * Signature: (JJIDIIII[ILcom/intel/oap/mllib/clustering/KMeansResult;)J */ JNIEXPORT jlong JNICALL Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCenters - (JNIEnv *, jobject, jint, jlong, jlong, jlong, jlong, jint, jdouble, jint, jint, jint, jint, jintArray, jobject); + (JNIEnv *, jobject, jint, jlong, jlong, jlong, jlong, jint, jdouble, jint, jint, jint, jint, jintArray, jstring, jobject); #ifdef __cplusplus } diff --git a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_feature_PCADALImpl.h b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_feature_PCADALImpl.h index 34646da95..3f1875ca9 100644 --- a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_feature_PCADALImpl.h +++ b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_feature_PCADALImpl.h @@ -13,7 +13,7 @@ extern "C" { * Signature: (JIII[ILcom/intel/oap/mllib/feature/PCAResult;)J */ JNIEXPORT jlong JNICALL Java_com_intel_oap_mllib_feature_PCADALImpl_cPCATrainDAL - (JNIEnv *, jobject, jint, jlong, jlong, jlong, jint, jint, jint, jintArray, jobject); + (JNIEnv *, jobject, jint, jlong, jlong, jlong, jint, jint, jint, jintArray, jstring, jobject); #ifdef __cplusplus } diff --git a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_regression_LinearRegressionDALImpl.h b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_regression_LinearRegressionDALImpl.h index 0dc6f4e79..4f90f23f8 100644 --- a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_regression_LinearRegressionDALImpl.h +++ b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_regression_LinearRegressionDALImpl.h @@ -13,7 +13,7 @@ extern "C" { * Signature: (JJZDDIII[ILcom/intel/oap/mllib/regression/LiRResult;)J */ JNIEXPORT jlong JNICALL 
Java_com_intel_oap_mllib_regression_LinearRegressionDALImpl_cLinearRegressionTrainDAL - (JNIEnv *, jobject, jint, jlong, jlong, jlong, jlong, jlong, jboolean, jdouble, jdouble, jint, jint, jint, jintArray, jobject); + (JNIEnv *, jobject, jint, jlong, jlong, jlong, jlong, jlong, jboolean, jdouble, jdouble, jint, jint, jint, jintArray, jstring, jobject); #ifdef __cplusplus } diff --git a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_regression_RandomForestRegressorDALImpl.h b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_regression_RandomForestRegressorDALImpl.h index 1350d8268..7bf694a19 100644 --- a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_regression_RandomForestRegressorDALImpl.h +++ b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_regression_RandomForestRegressorDALImpl.h @@ -13,7 +13,7 @@ extern "C" { * Signature: (JJIIIIIIJIZ[ILcom/intel/oap/mllib/classification/RandomForestResult;)Ljava/util/HashMap; */ JNIEXPORT jobject JNICALL Java_com_intel_oap_mllib_regression_RandomForestRegressorDALImpl_cRFRegressorTrainDAL - (JNIEnv *, jobject, jint, jlong, jlong, jlong, jlong, jlong, jint, jint, jint, jint, jint, jint, jlong, jint, jboolean, jintArray, jobject); + (JNIEnv *, jobject, jint, jlong, jlong, jlong, jlong, jlong, jint, jint, jint, jint, jint, jint, jlong, jint, jboolean, jintArray, jstring, jobject); #ifdef __cplusplus } diff --git a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_stat_CorrelationDALImpl.h b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_stat_CorrelationDALImpl.h index 494b89658..4c404b452 100644 --- a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_stat_CorrelationDALImpl.h +++ b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_stat_CorrelationDALImpl.h @@ -13,7 +13,7 @@ extern "C" { * Signature: (JIII[ILcom/intel/oap/mllib/stat/CorrelationResult;)J */ JNIEXPORT jlong JNICALL Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL - (JNIEnv *, jobject, jint, jlong, jlong, jlong, jint, jint, jint, jintArray, jobject); + (JNIEnv *, jobject, jint, jlong, jlong, jlong, jint, jint, jint, jintArray, jstring, jobject); #ifdef __cplusplus } diff --git a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_stat_SummarizerDALImpl.h b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_stat_SummarizerDALImpl.h index 7db45743f..4261d6fdd 100644 --- a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_stat_SummarizerDALImpl.h +++ b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_stat_SummarizerDALImpl.h @@ -13,7 +13,7 @@ extern "C" { * Signature: (JIII[ILcom/intel/oap/mllib/stat/SummarizerResult;)J */ JNIEXPORT jlong JNICALL Java_com_intel_oap_mllib_stat_SummarizerDALImpl_cSummarizerTrainDAL - (JNIEnv *, jobject, jint, jlong, jlong, jlong, jint, jint, jint, jintArray, jobject); + (JNIEnv *, jobject, jint, jlong, jlong, jlong, jint, jint, jint, jintArray, jstring, jobject); #ifdef __cplusplus } diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/classification/RandomForestClassifierDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/classification/RandomForestClassifierDALImpl.scala index 6a2da4ac7..f0ac1f0b5 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/classification/RandomForestClassifierDALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/classification/RandomForestClassifierDALImpl.scala @@ -114,6 +114,7 @@ class RandomForestClassifierDALImpl(val uid: String, maxBins, bootstrap, gpuIndices, + kvsIPPort, result) val computeEndTime = System.nanoTime() @@ -159,6 +160,7 @@ class 
RandomForestClassifierDALImpl(val uid: String, maxBins: Int, bootstrap: Boolean, gpuIndices: Array[Int], + kvsIPPort: String, result: RandomForestResult): java.util.HashMap[java.lang.Integer, java.util.ArrayList[LearningNode]] } diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/clustering/KMeansDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/clustering/KMeansDALImpl.scala index 61dd1ef80..be034ca03 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/clustering/KMeansDALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/clustering/KMeansDALImpl.scala @@ -91,6 +91,7 @@ class KMeansDALImpl(var nClusters: Int, executorCores, computeDevice.ordinal(), gpuIndices, + kvsIPPort, result ) @@ -149,5 +150,6 @@ class KMeansDALImpl(var nClusters: Int, executorCores: Int, computeDeviceOrdinal: Int, gpuIndices: Array[Int], + kvsIPPort: String, result: KMeansResult): Long } diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/feature/PCADALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/feature/PCADALImpl.scala index 071117cc0..06f9039ca 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/feature/PCADALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/feature/PCADALImpl.scala @@ -85,6 +85,7 @@ class PCADALImpl(val k: Int, executorCores, computeDevice.ordinal(), gpuIndices, + kvsIPPort, result ) @@ -221,5 +222,6 @@ class PCADALImpl(val k: Int, executorCores: Int, computeDeviceOrdinal: Int, gpuIndices: Array[Int], + kvsIPPort: String, result: PCAResult): Long } diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionDALImpl.scala index 79243f988..6c45b5807 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionDALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionDALImpl.scala @@ -154,6 +154,7 @@ class LinearRegressionDALImpl( val fitIntercept: Boolean, executorCores, computeDevice.ordinal(), gpuIndices, + kvsIPPort, result ) @@ -200,6 +201,7 @@ class LinearRegressionDALImpl( val fitIntercept: Boolean, executorCores: Int, computeDeviceOrdinal: Int, gpuIndices: Array[Int], + kvsIPPort: String, result: LiRResult): Long } diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/RandomForestRegressorDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/RandomForestRegressorDALImpl.scala index 100be8823..e5742923b 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/RandomForestRegressorDALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/RandomForestRegressorDALImpl.scala @@ -105,6 +105,7 @@ class RandomForestRegressorDALImpl(val uid: String, maxbins, bootstrap, gpuIndices, + kvsIPPort, result) val computeEndTime = System.nanoTime() @@ -156,5 +157,6 @@ class RandomForestRegressorDALImpl(val uid: String, maxbins: Int, bootstrap: Boolean, gpuIndices: Array[Int], + kvsIPPort: String, result: RandomForestResult): java.util.HashMap[java.lang.Integer, java.util.ArrayList[LearningNode]] } diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/CorrelationDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/CorrelationDALImpl.scala index 04a3760bb..203c00796 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/CorrelationDALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/CorrelationDALImpl.scala @@ -75,6 +75,7 @@ class CorrelationDALImpl( executorCores, 
computeDevice.ordinal(), gpuIndices, + kvsIPPort, result ) @@ -125,5 +126,6 @@ class CorrelationDALImpl( executorCores: Int, computeDeviceOrdinal: Int, gpuIndices: Array[Int], + kvsIPPort: String, result: CorrelationResult): Long } diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/SummarizerDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/SummarizerDALImpl.scala index c8422b097..a3f65b8fd 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/SummarizerDALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/SummarizerDALImpl.scala @@ -76,6 +76,7 @@ class SummarizerDALImpl(val executorNum: Int, executorCores, computeDevice.ordinal(), gpuIndices, + kvsIPPort, result ) @@ -157,5 +158,6 @@ class SummarizerDALImpl(val executorNum: Int, executorCores: Int, computeDeviceOrdinal: Int, gpuIndices: Array[Int], + kvsIPPort: String, result: SummarizerResult): Long } From 26269a712a5ec033cea9ed24e7a8590017d68842 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Fri, 23 Aug 2024 16:42:45 +0800 Subject: [PATCH 7/9] improve communicator code --- mllib-dal/src/main/native/Communicator.hpp | 30 +++------------ mllib-dal/src/main/native/Singleton.hpp | 45 ---------------------- 2 files changed, 6 insertions(+), 69 deletions(-) delete mode 100644 mllib-dal/src/main/native/Singleton.hpp diff --git a/mllib-dal/src/main/native/Communicator.hpp b/mllib-dal/src/main/native/Communicator.hpp index bdd5072c9..548988432 100644 --- a/mllib-dal/src/main/native/Communicator.hpp +++ b/mllib-dal/src/main/native/Communicator.hpp @@ -21,7 +21,6 @@ #include "oneapi/ccl.hpp" #include "oneapi/dal/detail/ccl/communicator.hpp" -#include "Singleton.hpp" namespace de = oneapi::dal::detail; namespace oneapi::dal::preview::spmd { @@ -29,39 +28,22 @@ namespace oneapi::dal::preview::spmd { namespace backend { struct ccl {}; } // namespace backend -class ccl_info { - friend class de::singleton; - -private: - ccl_info(int size, int rankId, ccl::shared_ptr_class keyvs) { - rank = rankId; - rank_count = size; - kvs = keyvs; - } - -public: - ccl::shared_ptr_class kvs; - int rank; - int rank_count; -}; template communicator make_communicator(int size, int rank, const ccl::shared_ptr_class kvs) { - auto& info = de::singleton::get(size, rank, kvs); // integral cast - return oneapi::dal::detail::ccl_communicator{ info.kvs, - info.rank, - info.rank_count }; + return oneapi::dal::detail::ccl_communicator{ kvs, + rank, + size }; } template communicator make_communicator(sycl::queue& queue, int size, int rank, const ccl::shared_ptr_class kvs) { - auto& info = de::singleton::get(size, rank, kvs); return oneapi::dal::detail::ccl_communicator{ queue, - info.kvs, - oneapi::dal::detail::integral_cast(info.rank), - oneapi::dal::detail::integral_cast(info.rank_count) + kvs, + oneapi::dal::detail::integral_cast(rank), + oneapi::dal::detail::integral_cast(size) }; } diff --git a/mllib-dal/src/main/native/Singleton.hpp b/mllib-dal/src/main/native/Singleton.hpp deleted file mode 100644 index 1169feac4..000000000 --- a/mllib-dal/src/main/native/Singleton.hpp +++ /dev/null @@ -1,45 +0,0 @@ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
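// The Singleton.hpp removed below cached (size, rank, kvs) behind a
// std::call_once guard before handing them to make_communicator; now that the
// new Communicator.hpp passes those values straight through, the cache is no
// longer needed. A minimal, standard-C++ sketch of that removed pattern, with
// illustrative names, assuming a T constructible from the cached arguments:
#include <mutex>

template <typename T> class lazy_singleton {
  public:
    static T &get(int size, int rank) {
        static std::once_flag flag;
        // Construct the instance exactly once, then keep returning it.
        std::call_once(flag, [&] { instance(size, rank); });
        return instance(size, rank);
    }

  private:
    static T &instance(int size, int rank) {
        static T obj{size, rank}; // function-local static: built on first call
        return obj;
    }
};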
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#pragma once - -namespace oneapi::dal::detail { - -namespace v1 { - -template -class singleton { -public: - static T& get(int size, int rank, ccl::shared_ptr_class kvs) { - static std::once_flag flag; - std::call_once(flag, [size, rank, kvs] { - get_instance(size, rank, kvs); - }); - return get_instance(size, rank, kvs); - } - -private: - static T& get_instance(int size, int rank, ccl::shared_ptr_class kvs) { - static T instance{size, rank, kvs}; - return instance; - } -}; - -} // namespace v1 - -using v1::singleton; - -} // namespace oneapi::dal::detail From ce24380379febd8de3320e927334b60415d64084 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Thu, 5 Sep 2024 13:44:31 +0800 Subject: [PATCH 8/9] Store the algorithm breakdown time into a specific file --- mllib-dal/src/main/native/CorrelationImpl.cpp | 47 ++++++++--- .../native/DecisionForestClassifierImpl.cpp | 53 ++++++++++--- .../native/DecisionForestRegressorImpl.cpp | 52 +++++++++--- mllib-dal/src/main/native/GPU.cpp | 79 ++++++++++--------- mllib-dal/src/main/native/GPU.h | 5 +- mllib-dal/src/main/native/KMeansImpl.cpp | 51 ++++++++---- .../src/main/native/LinearRegressionImpl.cpp | 52 +++++++++--- mllib-dal/src/main/native/Logger.h | 39 ++++++++- mllib-dal/src/main/native/OneCCL.cpp | 3 +- mllib-dal/src/main/native/PCAImpl.cpp | 50 ++++++++---- mllib-dal/src/main/native/SummarizerImpl.cpp | 55 +++++++++---- ...sification_RandomForestClassifierDALImpl.h | 2 +- ...intel_oap_mllib_clustering_KMeansDALImpl.h | 2 +- .../com_intel_oap_mllib_feature_PCADALImpl.h | 2 +- ...mllib_regression_LinearRegressionDALImpl.h | 2 +- ..._regression_RandomForestRegressorDALImpl.h | 2 +- ..._intel_oap_mllib_stat_CorrelationDALImpl.h | 2 +- ...m_intel_oap_mllib_stat_SummarizerDALImpl.h | 2 +- .../RandomForestClassifierDALImpl.scala | 6 +- .../oap/mllib/clustering/KMeansDALImpl.scala | 6 +- .../intel/oap/mllib/feature/PCADALImpl.scala | 7 +- .../regression/LinearRegressionDALImpl.scala | 7 +- .../RandomForestRegressorDALImpl.scala | 7 +- .../oap/mllib/stat/CorrelationDALImpl.scala | 6 +- .../oap/mllib/stat/SummarizerDALImpl.scala | 6 +- 25 files changed, 390 insertions(+), 155 deletions(-) diff --git a/mllib-dal/src/main/native/CorrelationImpl.cpp b/mllib-dal/src/main/native/CorrelationImpl.cpp index dd21890f4..50d6eb3db 100644 --- a/mllib-dal/src/main/native/CorrelationImpl.cpp +++ b/mllib-dal/src/main/native/CorrelationImpl.cpp @@ -151,33 +151,49 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, size_t rankId, static void doCorrelationOneAPICompute( JNIEnv *env, jlong pNumTabData, jlong numRows, jlong numCols, preview::spmd::communicator comm, - jobject resultObj) { + std::string breakdown_name, jobject resultObj) { logger::println(logger::INFO, "oneDAL (native): GPU compute start"); const bool isRoot = (comm.get_rank() == ccl_root); + auto t1 = std::chrono::high_resolution_clock::now(); homogen_table htable = *reinterpret_cast( createHomogenTableWithArrayPtr(pNumTabData, numRows, numCols, comm.get_queue()) .get()); + auto t2 = 
std::chrono::high_resolution_clock::now(); + auto duration = + (float)std::chrono::duration_cast(t2 - t1) + .count(); + logger::println( + logger::INFO, + "Correlation batch(native): create homogen table took %f secs", + duration / 1000); + + logger::Logger::getInstance(breakdown_name) + .printLogToFile("rankID was %d, create homogen table took %f secs.", + comm.get_rank(), duration / 1000); const auto cor_desc = covariance_gpu::descriptor{}.set_result_options( covariance_gpu::result_options::cor_matrix | covariance_gpu::result_options::means); - auto t1 = std::chrono::high_resolution_clock::now(); + t1 = std::chrono::high_resolution_clock::now(); const auto result_train = preview::compute(comm, cor_desc, htable); if (isRoot) { logger::println(logger::INFO, "Mean:"); printHomegenTable(result_train.get_means()); logger::println(logger::INFO, "Correlation:"); printHomegenTable(result_train.get_cor_matrix()); - auto t2 = std::chrono::high_resolution_clock::now(); - auto duration = + t2 = std::chrono::high_resolution_clock::now(); + duration = std::chrono::duration_cast(t2 - t1) .count(); logger::println( logger::INFO, "Correlation batch(native): computing step took %d secs.", duration / 1000); + logger::Logger::getInstance(breakdown_name) + .printLogToFile("rankID was %d, training step took %f secs.", + comm.get_rank(), duration / 1000); // Return all covariance & mean jclass clazz = env->GetObjectClass(resultObj); @@ -197,9 +213,10 @@ static void doCorrelationOneAPICompute( JNIEXPORT jlong JNICALL Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL( - JNIEnv *env, jobject obj, jint rank, jlong pNumTabData, jlong numRows, jlong numCols, - jint executorNum, jint executorCores, jint computeDeviceOrdinal, - jintArray gpuIdxArray, jstring ip_port, jobject resultObj) { + JNIEnv *env, jobject obj, jint rank, jlong pNumTabData, jlong numRows, + jlong numCols, jint executorNum, jint executorCores, + jint computeDeviceOrdinal, jintArray gpuIdxArray, jstring ip_port, + jstring breakdown_name, jobject resultObj) { logger::println(logger::INFO, "oneDAL (native): use DPC++ kernels; device %s", ComputeDeviceString[computeDeviceOrdinal].c_str()); @@ -225,17 +242,21 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL( } #ifdef CPU_GPU_PROFILE case ComputeDevice::gpu: { - logger::println( - logger::INFO, - "oneDAL (native): use GPU kernels with rankid %d", rank); + logger::println(logger::INFO, + "oneDAL (native): use GPU kernels with rankid %d", + rank); const char *str = env->GetStringUTFChars(ip_port, nullptr); ccl::string ccl_ip_port(str); - auto comm = createDalCommunicator(executorNum, rank, ccl_ip_port); - + const char *cstr = env->GetStringUTFChars(breakdown_name, nullptr); + std::string c_breakdown_name(cstr); + auto comm = createDalCommunicator(executorNum, rank, ccl_ip_port, + c_breakdown_name); doCorrelationOneAPICompute(env, pNumTabData, numRows, numCols, comm, - resultObj); + c_breakdown_name, resultObj); + env->ReleaseStringUTFChars(ip_port, str); + env->ReleaseStringUTFChars(breakdown_name, cstr); break; } #endif diff --git a/mllib-dal/src/main/native/DecisionForestClassifierImpl.cpp b/mllib-dal/src/main/native/DecisionForestClassifierImpl.cpp index 64a0782ca..e44c7bd7e 100644 --- a/mllib-dal/src/main/native/DecisionForestClassifierImpl.cpp +++ b/mllib-dal/src/main/native/DecisionForestClassifierImpl.cpp @@ -216,9 +216,11 @@ static jobject doRFClassifierOneAPICompute( jdouble minImpurityDecreaseSplitNode, jint maxTreeDepth, jlong seed, jint maxBins, 
jboolean bootstrap, preview::spmd::communicator comm, - jobject resultObj) { + std::string breakdown_name, jobject resultObj) { logger::println(logger::INFO, "oneDAL (native): GPU compute start"); const bool isRoot = (comm.get_rank() == ccl_root); + + auto t1 = std::chrono::high_resolution_clock::now(); homogen_table hFeaturetable = *reinterpret_cast( createHomogenTableWithArrayPtr(pNumTabFeature, featureRows, featureCols, comm.get_queue()) @@ -227,6 +229,17 @@ static jobject doRFClassifierOneAPICompute( createHomogenTableWithArrayPtr(pNumTabLabel, featureRows, labelCols, comm.get_queue()) .get()); + auto t2 = std::chrono::high_resolution_clock::now(); + auto duration = + (float)std::chrono::duration_cast(t2 - t1) + .count(); + logger::println( + logger::INFO, + "DF Classifier (native): create feature homogen table took %f secs", + duration / 1000); + logger::Logger::getInstance(breakdown_name) + .printLogToFile("rankID was %d, create homogen table took %f secs.", + comm.get_rank(), duration / 1000); const auto df_desc = df::descriptor( + t2 - t1) + .count(); + logger::println(logger::INFO, + "DF Classifier (native): training step took %f secs.", + duration / 1000); + logger::Logger::getInstance(breakdown_name) + .printLogToFile("rankID was %d, training step took %f secs.", + comm.get_rank(), duration / 1000); // convert to java hashmap trees = collect_model(env, result_train.get_model(), classCount); @@ -300,26 +324,29 @@ static jobject doRFClassifierOneAPICompute( */ JNIEXPORT jobject JNICALL Java_com_intel_oap_mllib_classification_RandomForestClassifierDALImpl_cRFClassifierTrainDAL( - JNIEnv *env, jobject obj, jint rank, jlong pNumTabFeature, jlong featureRows, - jlong featureCols, jlong pNumTabLabel, jlong labelCols, jint executorNum, - jint computeDeviceOrdinal, jint classCount, jint treeCount, - jint numFeaturesPerNode, jint minObservationsLeafNode, + JNIEnv *env, jobject obj, jint rank, jlong pNumTabFeature, + jlong featureRows, jlong featureCols, jlong pNumTabLabel, jlong labelCols, + jint executorNum, jint computeDeviceOrdinal, jint classCount, + jint treeCount, jint numFeaturesPerNode, jint minObservationsLeafNode, jint minObservationsSplitNode, jdouble minWeightFractionLeafNode, jdouble minImpurityDecreaseSplitNode, jint maxTreeDepth, jlong seed, - jint maxBins, jboolean bootstrap, jintArray gpuIdxArray, - jstring ip_port, jobject resultObj) { + jint maxBins, jboolean bootstrap, jintArray gpuIdxArray, jstring ip_port, + jstring breakdown_name, jobject resultObj) { logger::println(logger::INFO, "oneDAL (native): use DPC++ kernels"); ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal); switch (device) { case ComputeDevice::gpu: { - logger::println( - logger::INFO, - "oneDAL (native): use GPU kernels with rankid %d", rank); + logger::println(logger::INFO, + "oneDAL (native): use GPU kernels with rankid %d", + rank); const char *str = env->GetStringUTFChars(ip_port, nullptr); ccl::string ccl_ip_port(str); - auto comm = createDalCommunicator(executorNum, rank, ccl_ip_port); + const char *cstr = env->GetStringUTFChars(breakdown_name, nullptr); + std::string c_breakdown_name(cstr); + auto comm = createDalCommunicator(executorNum, rank, ccl_ip_port, + c_breakdown_name); jobject hashmapObj = doRFClassifierOneAPICompute( env, pNumTabFeature, featureRows, featureCols, pNumTabLabel, @@ -327,8 +354,10 @@ Java_com_intel_oap_mllib_classification_RandomForestClassifierDALImpl_cRFClassif numFeaturesPerNode, minObservationsLeafNode, minObservationsSplitNode, 
minWeightFractionLeafNode, minImpurityDecreaseSplitNode, maxTreeDepth, seed, maxBins, - bootstrap, comm, resultObj); + bootstrap, comm, c_breakdown_name, resultObj); + env->ReleaseStringUTFChars(ip_port, str); + env->ReleaseStringUTFChars(breakdown_name, cstr); return hashmapObj; } default: { diff --git a/mllib-dal/src/main/native/DecisionForestRegressorImpl.cpp b/mllib-dal/src/main/native/DecisionForestRegressorImpl.cpp index c757d12a1..c0fac9370 100644 --- a/mllib-dal/src/main/native/DecisionForestRegressorImpl.cpp +++ b/mllib-dal/src/main/native/DecisionForestRegressorImpl.cpp @@ -213,9 +213,10 @@ static jobject doRFRegressorOneAPICompute( jint minObservationsLeafNode, jint maxTreeDepth, jlong seed, jint maxbins, jboolean bootstrap, preview::spmd::communicator comm, - jobject resultObj) { + std::string breakdown_name, jobject resultObj) { logger::println(logger::INFO, "OneDAL (native): GPU compute start"); const bool isRoot = (comm.get_rank() == ccl_root); + auto t1 = std::chrono::high_resolution_clock::now(); homogen_table hFeaturetable = *reinterpret_cast( createHomogenTableWithArrayPtr(pNumTabFeature, featureRows, featureCols, comm.get_queue()) @@ -224,6 +225,18 @@ static jobject doRFRegressorOneAPICompute( createHomogenTableWithArrayPtr(pNumTabLabel, featureRows, labelCols, comm.get_queue()) .get()); + auto t2 = std::chrono::high_resolution_clock::now(); + auto duration = + (float)std::chrono::duration_cast(t2 - t1) + .count(); + logger::println( + logger::INFO, + "DF Regression (native): create feature homogen table took %f secs", + duration / 1000); + logger::Logger::getInstance(breakdown_name) + .printLogToFile("rankID was %d, create homogen table took %f secs.", + comm.get_rank(), duration / 1000); + const auto df_desc = df::descriptor{} @@ -237,6 +250,7 @@ static jobject doRFRegressorOneAPICompute( df::error_metric_mode::out_of_bag_error_per_observation) .set_variable_importance_mode(df::variable_importance_mode::mdi); + t1 = std::chrono::high_resolution_clock::now(); const auto result_train = preview::train(comm, df_desc, hFeaturetable, hLabeltable); const auto result_infer = @@ -250,6 +264,16 @@ static jobject doRFRegressorOneAPICompute( logger::println(logger::INFO, "Prediction results:"); printHomegenTable(result_infer.get_responses()); + t2 = std::chrono::high_resolution_clock::now(); + duration = (float)std::chrono::duration_cast( + t2 - t1) + .count(); + logger::Logger::getInstance(breakdown_name) + .printLogToFile("rankID was %d, training step took %f secs.", + comm.get_rank(), duration / 1000); + logger::println(logger::INFO, + "DF Regression (native): training step took %f secs.", + duration / 1000); // convert c++ map to java hashmap jint statsSize = 3; // spark create VarianceCalculator needs array of // sufficient statistics @@ -292,11 +316,12 @@ static jobject doRFRegressorOneAPICompute( JNIEXPORT jobject JNICALL Java_com_intel_oap_mllib_regression_RandomForestRegressorDALImpl_cRFRegressorTrainDAL( - JNIEnv *env, jobject obj, jint rank, jlong pNumTabFeature, jlong featureRows, - jlong featureCols, jlong pNumTabLabel, jlong labelCols, jint executorNum, - jint computeDeviceOrdinal, jint treeCount, jint numFeaturesPerNode, - jint minObservationsLeafNode, jint maxTreeDepth, jlong seed, jint maxbins, - jboolean bootstrap, jintArray gpuIdxArray, jstring ip_port, jobject resultObj) { + JNIEnv *env, jobject obj, jint rank, jlong pNumTabFeature, + jlong featureRows, jlong featureCols, jlong pNumTabLabel, jlong labelCols, + jint executorNum, jint computeDeviceOrdinal, jint 
treeCount, + jint numFeaturesPerNode, jint minObservationsLeafNode, jint maxTreeDepth, + jlong seed, jint maxbins, jboolean bootstrap, jintArray gpuIdxArray, + jstring ip_port, jstring breakdown_name, jobject resultObj) { logger::println(logger::INFO, "OneDAL (native): use DPC++ kernels; device %s", ComputeDeviceString[computeDeviceOrdinal].c_str()); @@ -304,20 +329,25 @@ Java_com_intel_oap_mllib_regression_RandomForestRegressorDALImpl_cRFRegressorTra ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal); switch (device) { case ComputeDevice::gpu: { - logger::println( - logger::INFO, - "OneDAL (native): use GPU kernels with rankid %d", rank); + logger::println(logger::INFO, + "OneDAL (native): use GPU kernels with rankid %d", + rank); const char *str = env->GetStringUTFChars(ip_port, nullptr); ccl::string ccl_ip_port(str); - auto comm = createDalCommunicator(executorNum, rank, ccl_ip_port); + const char *cstr = env->GetStringUTFChars(breakdown_name, nullptr); + std::string c_breakdown_name(cstr); + auto comm = createDalCommunicator(executorNum, rank, ccl_ip_port, + c_breakdown_name); jobject hashmapObj = doRFRegressorOneAPICompute( env, pNumTabFeature, featureRows, featureCols, pNumTabLabel, labelCols, executorNum, computeDeviceOrdinal, treeCount, numFeaturesPerNode, minObservationsLeafNode, maxTreeDepth, seed, - maxbins, bootstrap, comm, resultObj); + maxbins, bootstrap, comm, c_breakdown_name, resultObj); + env->ReleaseStringUTFChars(ip_port, str); + env->ReleaseStringUTFChars(breakdown_name, cstr); return hashmapObj; } default: { diff --git a/mllib-dal/src/main/native/GPU.cpp b/mllib-dal/src/main/native/GPU.cpp index 2454fadef..ecaa42121 100644 --- a/mllib-dal/src/main/native/GPU.cpp +++ b/mllib-dal/src/main/native/GPU.cpp @@ -113,43 +113,44 @@ sycl::queue getQueue(const ComputeDevice device) { } } - -preview::spmd::communicator createDalCommunicator(const jint executorNum, const jint rank, const ccl::string ccl_ip_port){ - auto gpus = get_gpus(); - - auto t1 = std::chrono::high_resolution_clock::now(); - - ccl::init(); - - auto t2 = std::chrono::high_resolution_clock::now(); - auto duration = - (float)std::chrono::duration_cast(t2 - t1).count(); - - logger::println(logger::INFO, "OneCCL singleton init took %f secs", - duration / 1000); - - t1 = std::chrono::high_resolution_clock::now(); - - auto kvs_attr = ccl::create_kvs_attr(); - - kvs_attr.set(ccl_ip_port); - - ccl::shared_ptr_class kvs = ccl::create_main_kvs(kvs_attr); - - t2 = std::chrono::high_resolution_clock::now(); - duration = - (float)std::chrono::duration_cast(t2 - t1) - .count(); - logger::println(logger::INFO, "OneCCL (native): create kvs took %f secs", - duration / 1000); - sycl::queue queue{gpus[0]}; - t1 = std::chrono::high_resolution_clock::now(); - auto comm = - preview::spmd::make_communicator( - queue, executorNum, rank, kvs); - t2 = std::chrono::high_resolution_clock::now(); - duration = - (float)std::chrono::duration_cast(t2 - t1) - .count(); - return comm; +preview::spmd::communicator +createDalCommunicator(const jint executorNum, const jint rank, + const ccl::string ccl_ip_port) { + auto gpus = get_gpus(); + + auto t1 = std::chrono::high_resolution_clock::now(); + + ccl::init(); + + auto t2 = std::chrono::high_resolution_clock::now(); + auto duration = + (float)std::chrono::duration_cast(t2 - t1) + .count(); + + logger::println(logger::INFO, "OneCCL singleton init took %f secs", + duration / 1000); + + t1 = std::chrono::high_resolution_clock::now(); + + auto kvs_attr = ccl::create_kvs_attr(); + 
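// A self-contained sketch of the millisecond timing used around each step in
// this file ("took %f secs"); the duration_cast template argument
// (std::chrono::milliseconds) and the helper name are assumptions here.
#include <chrono>
#include <cstdio>

static float elapsed_ms(std::chrono::high_resolution_clock::time_point start,
                        std::chrono::high_resolution_clock::time_point end) {
    return (float)std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
        .count();
}

int main() {
    auto t1 = std::chrono::high_resolution_clock::now();
    // ... the step being measured, e.g. ccl::init() or KVS creation ...
    auto t2 = std::chrono::high_resolution_clock::now();
    std::printf("step took %f secs\n", elapsed_ms(t1, t2) / 1000);
    return 0;
}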
+ kvs_attr.set(ccl_ip_port); + + ccl::shared_ptr_class kvs = ccl::create_main_kvs(kvs_attr); + + t2 = std::chrono::high_resolution_clock::now(); + duration = + (float)std::chrono::duration_cast(t2 - t1) + .count(); + logger::println(logger::INFO, "OneCCL (native): create kvs took %f secs", + duration / 1000); + sycl::queue queue{gpus[0]}; + t1 = std::chrono::high_resolution_clock::now(); + auto comm = preview::spmd::make_communicator( + queue, executorNum, rank, kvs); + t2 = std::chrono::high_resolution_clock::now(); + duration = + (float)std::chrono::duration_cast(t2 - t1) + .count(); + return comm; } diff --git a/mllib-dal/src/main/native/GPU.h b/mllib-dal/src/main/native/GPU.h index 83b3272f0..1056ef22a 100644 --- a/mllib-dal/src/main/native/GPU.h +++ b/mllib-dal/src/main/native/GPU.h @@ -1,14 +1,15 @@ #pragma once +#include "Communicator.hpp" #include "service.h" #include #include #include #include #include -#include "Communicator.hpp" sycl::queue getAssignedGPU(const ComputeDevice device, jint *gpu_indices); sycl::queue getQueue(const ComputeDevice device); -preview::spmd::communicator createDalCommunicator(jint executorNum, jint rank, ccl::string ccl_ip_port); +preview::spmd::communicator +createDalCommunicator(jint executorNum, jint rank, ccl::string ccl_ip_port); diff --git a/mllib-dal/src/main/native/KMeansImpl.cpp b/mllib-dal/src/main/native/KMeansImpl.cpp index b868fd475..ab25fc3a5 100644 --- a/mllib-dal/src/main/native/KMeansImpl.cpp +++ b/mllib-dal/src/main/native/KMeansImpl.cpp @@ -246,21 +246,34 @@ static jlong doKMeansOneAPICompute( JNIEnv *env, jlong pNumTabData, jlong numRows, jlong numCols, jlong pNumTabCenters, jint clusterNum, jdouble tolerance, jint iterationNum, preview::spmd::communicator comm, - jobject resultObj) { + std::string breakdown_name, jobject resultObj) { logger::println(logger::INFO, "OneDAL (native): GPU compute start"); const bool isRoot = (comm.get_rank() == ccl_root); + auto t1 = std::chrono::high_resolution_clock::now(); homogen_table htable = *reinterpret_cast( createHomogenTableWithArrayPtr(pNumTabData, numRows, numCols, comm.get_queue()) .get()); homogen_table centroids = *reinterpret_cast(pNumTabCenters); + auto t2 = std::chrono::high_resolution_clock::now(); + auto duration = + (float)std::chrono::duration_cast(t2 - t1) + .count(); + logger::println(logger::INFO, + "KMeans (native): create homogen table took %f secs", + duration / 1000); + + logger::Logger::getInstance(breakdown_name) + .printLogToFile("rankID was %d, create homogen table took %f secs.", + comm.get_rank(), duration / 1000); + const auto kmeans_desc = kmeans_gpu::descriptor() .set_cluster_count(clusterNum) .set_max_iteration_count(iterationNum) .set_accuracy_threshold(tolerance); kmeans_gpu::train_input local_input{htable, centroids}; - auto t1 = std::chrono::high_resolution_clock::now(); + t1 = std::chrono::high_resolution_clock::now(); kmeans_gpu::train_result result_train = preview::train(comm, kmeans_desc, local_input); if (isRoot) { @@ -268,13 +281,16 @@ static jlong doKMeansOneAPICompute( result_train.get_iteration_count()); logger::println(logger::INFO, "Centroids:"); printHomegenTable(result_train.get_model().get_centroids()); - auto t2 = std::chrono::high_resolution_clock::now(); - auto duration = + t2 = std::chrono::high_resolution_clock::now(); + duration = std::chrono::duration_cast(t2 - t1) .count(); logger::println(logger::INFO, "KMeans (native): training step took %d secs", duration / 1000); + logger::Logger::getInstance(breakdown_name) + .printLogToFile("rankID 
was %d, training step took %f secs.", + comm.get_rank(), duration / 1000); // Get the class of the input object jclass clazz = env->GetObjectClass(resultObj); // Get Field references @@ -305,10 +321,11 @@ static jlong doKMeansOneAPICompute( */ JNIEXPORT jlong JNICALL Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCenters( - JNIEnv *env, jobject obj, jint rank, jlong pNumTabData, jlong numRows, jlong numCols, - jlong pNumTabCenters, jint clusterNum, jdouble tolerance, jint iterationNum, - jint executorNum, jint executorCores, jint computeDeviceOrdinal, - jintArray gpuIdxArray, jstring ip_port, jobject resultObj) { + JNIEnv *env, jobject obj, jint rank, jlong pNumTabData, jlong numRows, + jlong numCols, jlong pNumTabCenters, jint clusterNum, jdouble tolerance, + jint iterationNum, jint executorNum, jint executorCores, + jint computeDeviceOrdinal, jintArray gpuIdxArray, jstring ip_port, + jstring breakdown_name, jobject resultObj) { logger::println(logger::INFO, "OneDAL (native): use DPC++ kernels; device %s", ComputeDeviceString[computeDeviceOrdinal].c_str()); @@ -338,18 +355,22 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe } #ifdef CPU_GPU_PROFILE case ComputeDevice::gpu: { - logger::println( - logger::INFO, - "OneDAL (native): use GPU kernels with rankid %d", rank); + logger::println(logger::INFO, + "OneDAL (native): use GPU kernels with rankid %d", + rank); const char *str = env->GetStringUTFChars(ip_port, nullptr); ccl::string ccl_ip_port(str); - auto comm = createDalCommunicator(executorNum, rank, ccl_ip_port); + const char *cstr = env->GetStringUTFChars(breakdown_name, nullptr); + std::string c_breakdown_name(cstr); + auto comm = createDalCommunicator(executorNum, rank, ccl_ip_port, + c_breakdown_name); - ret = doKMeansOneAPICompute(env, pNumTabData, numRows, numCols, - pNumTabCenters, clusterNum, tolerance, - iterationNum, comm, resultObj); + ret = doKMeansOneAPICompute( + env, pNumTabData, numRows, numCols, pNumTabCenters, clusterNum, + tolerance, iterationNum, comm, c_breakdown_name, resultObj); + env->ReleaseStringUTFChars(breakdown_name, cstr); env->ReleaseStringUTFChars(ip_port, str); break; } diff --git a/mllib-dal/src/main/native/LinearRegressionImpl.cpp b/mllib-dal/src/main/native/LinearRegressionImpl.cpp index 7518a23b6..1bb96cb38 100644 --- a/mllib-dal/src/main/native/LinearRegressionImpl.cpp +++ b/mllib-dal/src/main/native/LinearRegressionImpl.cpp @@ -214,17 +214,18 @@ ridge_regression_compute(size_t rankId, ccl::communicator &comm, } #ifdef CPU_GPU_PROFILE -static jlong doLROneAPICompute(JNIEnv *env, size_t rankId, - preview::spmd::communicator comm, - jlong pNumTabFeature, jlong featureRows, - jlong featureCols, jlong pNumTabLabel, - jlong labelCols, jboolean jfitIntercept, - jint executorNum, jobject resultObj) { +static jlong doLROneAPICompute( + JNIEnv *env, size_t rankId, + preview::spmd::communicator comm, + jlong pNumTabFeature, jlong featureRows, jlong featureCols, + jlong pNumTabLabel, jlong labelCols, jboolean jfitIntercept, + jint executorNum, std::string breakdown_name, jobject resultObj) { logger::println(logger::INFO, "oneDAL (native): GPU compute start , rankid %d", rankId); const bool isRoot = (rankId == ccl_root); bool fitIntercept = bool(jfitIntercept); + auto t1 = std::chrono::high_resolution_clock::now(); homogen_table xtrain = *reinterpret_cast( createHomogenTableWithArrayPtr(pNumTabFeature, featureRows, featureCols, comm.get_queue()) @@ -233,16 +234,37 @@ static jlong 
doLROneAPICompute(JNIEnv *env, size_t rankId, createHomogenTableWithArrayPtr(pNumTabLabel, featureRows, labelCols, comm.get_queue()) .get()); + auto t2 = std::chrono::high_resolution_clock::now(); + auto duration = + (float)std::chrono::duration_cast(t2 - t1) + .count(); + logger::println( + logger::INFO, + "LinerRegression(native): create feature homogen table took %f secs", + duration / 1000); + logger::Logger::getInstance(breakdown_name) + .printLogToFile("rankID was %d, create homogen table took %f secs.", + comm.get_rank(), duration / 1000); linear_regression_gpu::train_input local_input{xtrain, ytrain}; const auto linear_regression_desc = linear_regression_gpu::descriptor(fitIntercept); - + t1 = std::chrono::high_resolution_clock::now(); linear_regression_gpu::train_result result_train = preview::train(comm, linear_regression_desc, xtrain, ytrain); if (isRoot) { HomogenTablePtr result_matrix = std::make_shared( result_train.get_model().get_betas()); + t2 = std::chrono::high_resolution_clock::now(); + duration = (float)std::chrono::duration_cast( + t2 - t1) + .count(); + logger::println(logger::INFO, + "LinerRegression(native): training step took %f secs", + duration / 1000); + logger::Logger::getInstance(breakdown_name) + .printLogToFile("rankID was %d, training step took %f secs.", + comm.get_rank(), duration / 1000); saveHomogenTablePtrToVector(result_matrix); return (jlong)result_matrix.get(); } else { @@ -277,18 +299,22 @@ Java_com_intel_oap_mllib_regression_LinearRegressionDALImpl_cLinearRegressionTra jlong resultptr = 0L; if (useGPU) { #ifdef CPU_GPU_PROFILE - logger::println( - logger::INFO, - "oneDAL (native): use GPU kernels with rankid %d", rank); + logger::println(logger::INFO, + "oneDAL (native): use GPU kernels with rankid %d", + rank); const char *str = env->GetStringUTFChars(ip_port, nullptr); ccl::string ccl_ip_port(str); - auto comm = createDalCommunicator(executorNum, rank, ccl_ip_port); + const char *cstr = env->GetStringUTFChars(breakdown_name, nullptr); + std::string c_breakdown_name(cstr); + auto comm = createDalCommunicator(executorNum, rank, ccl_ip_port, + c_breakdown_name); resultptr = doLROneAPICompute( - env, rank, comm, feature, featureRows, featureCols, - label, labelCols, fitIntercept, executorNum, resultObj); + env, rank, comm, feature, featureRows, featureCols, label, + labelCols, fitIntercept, executorNum, c_breakdown_name, resultObj); env->ReleaseStringUTFChars(ip_port, str); + env->ReleaseStringUTFChars(breakdown_name, cstr); #endif } else { ccl::communicator &cclComm = getComm(); diff --git a/mllib-dal/src/main/native/Logger.h b/mllib-dal/src/main/native/Logger.h index 84484aa80..5f21c60ba 100644 --- a/mllib-dal/src/main/native/Logger.h +++ b/mllib-dal/src/main/native/Logger.h @@ -1,10 +1,46 @@ #pragma once #include +#include #include +#include +#include +#include +#include +#include #include +namespace fs = std::filesystem; namespace logger { +std::ofstream logFile; +std::string name; + +public: +static Logger &getInstance(std::string name) { + static std::once_flag flag; + static Logger instance(name); + std::call_once(flag, [&name] { instance = Logger(name); }); + return instance; +} + +void printLogToFile(const char *format, ...); +void closeFile(); + +private: +Logger(std::string name) { + char *path = std::getenv("SPARKJOB_CONFIG_DIR"); + if (path != nullptr) { + std::cout << "SPARKJOB_CONFIG_DIR Directory: " << path << std::endl; + } else { + std::cout << "SPARKJOB_CONFIG_DIR environment variable not found." 
+ << std::endl; + } + auto filePath = fs::path(path) / fs::path(name); + std::cout << "file path: " << filePath << std::endl; + logFile.open(filePath, std::ios::out | std::ios::app); +} +}; // namespace logger + // message type for print functions enum MessageType { DEBUG = 0, @@ -24,4 +60,5 @@ int printerr(MessageType message_type, const std::string &msg); int printerr(MessageType message_type, const char *format, ...); int printerrln(MessageType message_type, const char *format, ...); int printerrln(MessageType message_type, const std::string &msg); -}; // namespace logger +} +; // namespace logger diff --git a/mllib-dal/src/main/native/OneCCL.cpp b/mllib-dal/src/main/native/OneCCL.cpp index 988dd844a..e9a164507 100644 --- a/mllib-dal/src/main/native/OneCCL.cpp +++ b/mllib-dal/src/main/native/OneCCL.cpp @@ -55,8 +55,7 @@ JNIEXPORT jint JNICALL Java_com_intel_oap_mllib_OneCCL_00024_c_1init( auto t1 = std::chrono::high_resolution_clock::now(); ccl::init(); - auto t2 = std::chrono::high_resolution_clock::now(); - + auto t2 = std::chrono::high_resolution_clock::now(); const char *str = env->GetStringUTFChars(ip_port, 0); ccl::string ccl_ip_port(str); diff --git a/mllib-dal/src/main/native/PCAImpl.cpp b/mllib-dal/src/main/native/PCAImpl.cpp index 67949341d..503001845 100644 --- a/mllib-dal/src/main/native/PCAImpl.cpp +++ b/mllib-dal/src/main/native/PCAImpl.cpp @@ -184,25 +184,33 @@ static void doPCADAALCompute(JNIEnv *env, jobject obj, size_t rankId, static void doPCAOneAPICompute( JNIEnv *env, jlong pNumTabData, jlong numRows, jlong numCols, preview::spmd::communicator comm, - jobject resultObj) { + std::string breakdown_name, jobject resultObj) { logger::println(logger::INFO, "oneDAL (native): GPU compute start"); const bool isRoot = (comm.get_rank() == ccl_root); + + auto t1 = std::chrono::high_resolution_clock::now(); homogen_table htable = *reinterpret_cast( createHomogenTableWithArrayPtr(pNumTabData, numRows, numCols, comm.get_queue()) .get()); + auto t2 = std::chrono::high_resolution_clock::now(); + auto duration = + (float)std::chrono::duration_cast(t2 - t1) + .count(); + logger::println(logger::INFO, + "PCA (native): create homogen table took %f secs", + duration / 1000); + + logger::Logger::getInstance(breakdown_name) + .printLogToFile("rankID was %d, create homogen table took %f secs.", + comm.get_rank(), duration / 1000); const auto cov_desc = covariance_gpu::descriptor{}.set_result_options( covariance_gpu::result_options::cov_matrix); - auto t1 = std::chrono::high_resolution_clock::now(); + t1 = std::chrono::high_resolution_clock::now(); const auto result = preview::compute(comm, cov_desc, htable); - auto t2 = std::chrono::high_resolution_clock::now(); - auto duration = - std::chrono::duration_cast(t2 - t1).count(); - logger::println(logger::INFO, "PCA (native): Covariance step took %d secs", - duration / 1000); if (isRoot) { using float_t = GpuAlgorithmFPType; using method_t = pca_gpu::method::precomputed; @@ -210,7 +218,6 @@ static void doPCAOneAPICompute( using descriptor_t = pca_gpu::descriptor; const auto pca_desc = descriptor_t().set_deterministic(true); - t1 = std::chrono::high_resolution_clock::now(); const auto result_train = preview::train(comm, pca_desc, result.get_cov_matrix()); t2 = std::chrono::high_resolution_clock::now(); @@ -219,6 +226,9 @@ static void doPCAOneAPICompute( .count(); logger::println(logger::INFO, "PCA (native): Eigen step took %d secs", duration / 1000); + logger::Logger::getInstance(breakdown_name) + .printLogToFile("rankID was %d, training step 
took %f secs.", + comm.get_rank(), duration / 1000); // Return all eigenvalues & eigenvectors // Get the class of the input object jclass clazz = env->GetObjectClass(resultObj); @@ -250,9 +260,10 @@ static void doPCAOneAPICompute( JNIEXPORT jlong JNICALL Java_com_intel_oap_mllib_feature_PCADALImpl_cPCATrainDAL( - JNIEnv *env, jobject obj, jint rank, jlong pNumTabData, jlong numRows, jlong numCols, - jint executorNum, jint executorCores, jint computeDeviceOrdinal, - jintArray gpuIdxArray, jstring ip_port, jobject resultObj) { + JNIEnv *env, jobject obj, jint rank, jlong pNumTabData, jlong numRows, + jlong numCols, jint executorNum, jint executorCores, + jint computeDeviceOrdinal, jintArray gpuIdxArray, jstring ip_port, + jstring breakdown_name, jobject resultObj) { logger::println(logger::INFO, "oneDAL (native): use DPC++ kernels; device %s", ComputeDeviceString[computeDeviceOrdinal].c_str()); @@ -277,15 +288,22 @@ Java_com_intel_oap_mllib_feature_PCADALImpl_cPCATrainDAL( } #ifdef CPU_GPU_PROFILE case ComputeDevice::gpu: { - logger::println( - logger::INFO, - "oneDAL (native): use GPU kernels with rankid %d", rank); + logger::println(logger::INFO, + "oneDAL (native): use GPU kernels with rankid %d", + rank); const char *str = env->GetStringUTFChars(ip_port, nullptr); ccl::string ccl_ip_port(str); - auto comm = createDalCommunicator(executorNum, rank, ccl_ip_port); + const char *cstr = env->GetStringUTFChars(breakdown_name, nullptr); + std::string c_breakdown_name(cstr); + + auto comm = createDalCommunicator(executorNum, rank, ccl_ip_port, + c_breakdown_name); + + doPCAOneAPICompute(env, pNumTabData, numRows, numCols, comm, + c_breakdown_name, resultObj); - doPCAOneAPICompute(env, pNumTabData, numRows, numCols, comm, resultObj); + env->ReleaseStringUTFChars(breakdown_name, cstr); env->ReleaseStringUTFChars(ip_port, str); break; } diff --git a/mllib-dal/src/main/native/SummarizerImpl.cpp b/mllib-dal/src/main/native/SummarizerImpl.cpp index 852db8b03..7714bbdbd 100644 --- a/mllib-dal/src/main/native/SummarizerImpl.cpp +++ b/mllib-dal/src/main/native/SummarizerImpl.cpp @@ -204,16 +204,29 @@ static void doSummarizerDAALCompute(JNIEnv *env, jobject obj, size_t rankId, static void doSummarizerOneAPICompute( JNIEnv *env, jlong pNumTabData, jlong numRows, jlong numCols, preview::spmd::communicator comm, - jobject resultObj) { + std::string breakdown_name, jobject resultObj) { logger::println(logger::INFO, "oneDAL (native): GPU compute start"); const bool isRoot = (comm.get_rank() == ccl_root); + + auto t1 = std::chrono::high_resolution_clock::now(); homogen_table htable = *reinterpret_cast( createHomogenTableWithArrayPtr(pNumTabData, numRows, numCols, comm.get_queue()) .get()); + auto t2 = std::chrono::high_resolution_clock::now(); + auto duration = + (float)std::chrono::duration_cast(t2 - t1) + .count(); + + logger::println(logger::INFO, + "Summarizer (native): create homogen table took %f secs", + duration / 1000); + logger::Logger::getInstance(breakdown_name) + .printLogToFile("rankID was %d, create homogen table took %f secs.", + comm.get_rank(), duration / 1000); const auto bs_desc = basic_statistics::descriptor{}; - auto t1 = std::chrono::high_resolution_clock::now(); + t1 = std::chrono::high_resolution_clock::now(); const auto result_train = preview::compute(comm, bs_desc, htable); if (isRoot) { logger::println(logger::INFO, "Minimum"); @@ -224,14 +237,16 @@ static void doSummarizerOneAPICompute( printHomegenTable(result_train.get_mean()); logger::println(logger::INFO, "Variation"); 
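// A minimal, self-contained sketch of the per-rank breakdown logging that
// printLogToFile performs, based on the Logger.h hunk earlier in this patch
// (getenv("SPARKJOB_CONFIG_DIR"), a filesystem path join, an append-mode
// ofstream); the class name and buffer size are illustrative assumptions.
#include <cstdarg>
#include <cstdio>
#include <cstdlib>
#include <filesystem>
#include <fstream>
#include <string>

class BreakdownLogger {
  public:
    explicit BreakdownLogger(const std::string &name) {
        const char *dir = std::getenv("SPARKJOB_CONFIG_DIR");
        std::filesystem::path file = std::filesystem::path(dir ? dir : ".") / name;
        logFile.open(file, std::ios::out | std::ios::app); // append per record
    }

    // printf-style record, e.g. ("rankID was %d, training step took %f secs.", 0, 1.5)
    void printLogToFile(const char *format, ...) {
        char buf[1024];
        va_list args;
        va_start(args, format);
        std::vsnprintf(buf, sizeof(buf), format, args);
        va_end(args);
        logFile << buf << '\n';
    }

  private:
    std::ofstream logFile;
};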
printHomegenTable(result_train.get_variance()); - auto t2 = std::chrono::high_resolution_clock::now(); - auto duration = - (float)std::chrono::duration_cast(t2 - - t1) - .count(); + t2 = std::chrono::high_resolution_clock::now(); + duration = (float)std::chrono::duration_cast( + t2 - t1) + .count(); logger::println(logger::INFO, "Summarizer (native): computing step took %d secs", duration / 1000); + logger::Logger::getInstance(breakdown_name) + .printLogToFile("rankID was %d, training step took %f secs.", + comm.get_rank(), duration / 1000); // Return all covariance & mean jclass clazz = env->GetObjectClass(resultObj); @@ -268,9 +283,10 @@ static void doSummarizerOneAPICompute( JNIEXPORT jlong JNICALL Java_com_intel_oap_mllib_stat_SummarizerDALImpl_cSummarizerTrainDAL( - JNIEnv *env, jobject obj, jint rank, jlong pNumTabData, jlong numRows, jlong numCols, - jint executorNum, jint executorCores, jint computeDeviceOrdinal, - jintArray gpuIdxArray, jstring ip_port, jobject resultObj) { + JNIEnv *env, jobject obj, jint rank, jlong pNumTabData, jlong numRows, + jlong numCols, jint executorNum, jint executorCores, + jint computeDeviceOrdinal, jintArray gpuIdxArray, jstring ip_port, + jstring breakdown_name, jobject resultObj) { logger::println(logger::INFO, "oneDAL (native): use DPC++ kernels; device %s", ComputeDeviceString[computeDeviceOrdinal].c_str()); @@ -295,21 +311,28 @@ Java_com_intel_oap_mllib_stat_SummarizerDALImpl_cSummarizerTrainDAL( } #ifdef CPU_GPU_PROFILE case ComputeDevice::gpu: { - logger::println( - logger::INFO, - "oneDAL (native): use GPU kernels with rankid %d", rank); + logger::println(logger::INFO, + "oneDAL (native): use GPU kernels with rankid %d", + rank); const char *str = env->GetStringUTFChars(ip_port, nullptr); ccl::string ccl_ip_port(str); - auto comm = createDalCommunicator(executorNum, rank, ccl_ip_port); + const char *cstr = env->GetStringUTFChars(breakdown_name, nullptr); + std::string c_breakdown_name(cstr); + + auto comm = createDalCommunicator(executorNum, rank, ccl_ip_port, + c_breakdown_name); doSummarizerOneAPICompute(env, pNumTabData, numRows, numCols, comm, - resultObj); + c_breakdown_name resultObj); + + env->ReleaseStringUTFChars(breakdown_name, cstr); env->ReleaseStringUTFChars(ip_port, str); break; } #endif default: { - deviceError("Summarizer", ComputeDeviceString[computeDeviceOrdinal].c_str()); + deviceError("Summarizer", + ComputeDeviceString[computeDeviceOrdinal].c_str()); } } return 0; diff --git a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_classification_RandomForestClassifierDALImpl.h b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_classification_RandomForestClassifierDALImpl.h index 79bd6f16f..52d9a6178 100644 --- a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_classification_RandomForestClassifierDALImpl.h +++ b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_classification_RandomForestClassifierDALImpl.h @@ -13,7 +13,7 @@ extern "C" { * Signature: (JJIIIIIIIDDIJIZ[ILcom/intel/oap/mllib/classification/RandomForestResult;)Ljava/util/HashMap; */ JNIEXPORT jobject JNICALL Java_com_intel_oap_mllib_classification_RandomForestClassifierDALImpl_cRFClassifierTrainDAL - (JNIEnv *, jobject, jint, jlong, jlong, jlong, jlong, jlong, jint, jint, jint, jint, jint, jint, jint, jdouble, jdouble, jint, jlong, jint, jboolean, jintArray, jobject); + (JNIEnv *, jobject, jint, jlong, jlong, jlong, jlong, jlong, jint, jint, jint, jint, jint, jint, jint, jdouble, jdouble, jint, jlong, jint, jboolean, jintArray, jstring, jstring, jobject); 
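// Hedged sketch (illustrative function name) of how the two extra jstring
// parameters added to these prototypes are consumed on the native side, using
// only JNI calls already present in this patch: copy the UTF-8 contents into
// C++ strings, run the compute step, then release the JNI buffers.
#include <jni.h>
#include <string>

static void readTrainParams(JNIEnv *env, jstring ip_port, jstring breakdown_name) {
    const char *ip = env->GetStringUTFChars(ip_port, nullptr);
    const char *bd = env->GetStringUTFChars(breakdown_name, nullptr);
    std::string ccl_ip_port(ip);      // rendezvous address from getOneCCLIPPort
    std::string c_breakdown_name(bd); // e.g. "RFClassifier_training_breakdown_<executorNum>"
    // ... createDalCommunicator(...) and the training call would go here ...
    env->ReleaseStringUTFChars(breakdown_name, bd);
    env->ReleaseStringUTFChars(ip_port, ip);
}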
#ifdef __cplusplus } diff --git a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_clustering_KMeansDALImpl.h b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_clustering_KMeansDALImpl.h index 9a00db0a2..9a14c535e 100644 --- a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_clustering_KMeansDALImpl.h +++ b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_clustering_KMeansDALImpl.h @@ -13,7 +13,7 @@ extern "C" { * Signature: (JJIDIIII[ILcom/intel/oap/mllib/clustering/KMeansResult;)J */ JNIEXPORT jlong JNICALL Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCenters - (JNIEnv *, jobject, jint, jlong, jlong, jlong, jlong, jint, jdouble, jint, jint, jint, jint, jintArray, jstring, jobject); + (JNIEnv *, jobject, jint, jlong, jlong, jlong, jlong, jint, jdouble, jint, jint, jint, jint, jintArray, jstring, jstring, jobject); #ifdef __cplusplus } diff --git a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_feature_PCADALImpl.h b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_feature_PCADALImpl.h index 3f1875ca9..cfde9d028 100644 --- a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_feature_PCADALImpl.h +++ b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_feature_PCADALImpl.h @@ -13,7 +13,7 @@ extern "C" { * Signature: (JIII[ILcom/intel/oap/mllib/feature/PCAResult;)J */ JNIEXPORT jlong JNICALL Java_com_intel_oap_mllib_feature_PCADALImpl_cPCATrainDAL - (JNIEnv *, jobject, jint, jlong, jlong, jlong, jint, jint, jint, jintArray, jstring, jobject); + (JNIEnv *, jobject, jint, jlong, jlong, jlong, jint, jint, jint, jintArray, jstring, jstring, jobject); #ifdef __cplusplus } diff --git a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_regression_LinearRegressionDALImpl.h b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_regression_LinearRegressionDALImpl.h index 4f90f23f8..c50ae352a 100644 --- a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_regression_LinearRegressionDALImpl.h +++ b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_regression_LinearRegressionDALImpl.h @@ -13,7 +13,7 @@ extern "C" { * Signature: (JJZDDIII[ILcom/intel/oap/mllib/regression/LiRResult;)J */ JNIEXPORT jlong JNICALL Java_com_intel_oap_mllib_regression_LinearRegressionDALImpl_cLinearRegressionTrainDAL - (JNIEnv *, jobject, jint, jlong, jlong, jlong, jlong, jlong, jboolean, jdouble, jdouble, jint, jint, jint, jintArray, jstring, jobject); + (JNIEnv *, jobject, jint, jlong, jlong, jlong, jlong, jlong, jboolean, jdouble, jdouble, jint, jint, jint, jintArray, jstring, jstring, jobject); #ifdef __cplusplus } diff --git a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_regression_RandomForestRegressorDALImpl.h b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_regression_RandomForestRegressorDALImpl.h index 7bf694a19..f1247498a 100644 --- a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_regression_RandomForestRegressorDALImpl.h +++ b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_regression_RandomForestRegressorDALImpl.h @@ -13,7 +13,7 @@ extern "C" { * Signature: (JJIIIIIIJIZ[ILcom/intel/oap/mllib/classification/RandomForestResult;)Ljava/util/HashMap; */ JNIEXPORT jobject JNICALL Java_com_intel_oap_mllib_regression_RandomForestRegressorDALImpl_cRFRegressorTrainDAL - (JNIEnv *, jobject, jint, jlong, jlong, jlong, jlong, jlong, jint, jint, jint, jint, jint, jint, jlong, jint, jboolean, jintArray, jstring, jobject); + (JNIEnv *, jobject, jint, jlong, jlong, jlong, jlong, jlong, jint, jint, jint, jint, jint, jint, jlong, jint, jboolean, 
jintArray, jstring, jstring, jobject); #ifdef __cplusplus } diff --git a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_stat_CorrelationDALImpl.h b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_stat_CorrelationDALImpl.h index 4c404b452..c549d2dde 100644 --- a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_stat_CorrelationDALImpl.h +++ b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_stat_CorrelationDALImpl.h @@ -13,7 +13,7 @@ extern "C" { * Signature: (JIII[ILcom/intel/oap/mllib/stat/CorrelationResult;)J */ JNIEXPORT jlong JNICALL Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL - (JNIEnv *, jobject, jint, jlong, jlong, jlong, jint, jint, jint, jintArray, jstring, jobject); + (JNIEnv *, jobject, jint, jlong, jlong, jlong, jint, jint, jint, jintArray, jstring, jstring, jobject); #ifdef __cplusplus } diff --git a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_stat_SummarizerDALImpl.h b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_stat_SummarizerDALImpl.h index 4261d6fdd..e4f8dcd29 100644 --- a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_stat_SummarizerDALImpl.h +++ b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_stat_SummarizerDALImpl.h @@ -13,7 +13,7 @@ extern "C" { * Signature: (JIII[ILcom/intel/oap/mllib/stat/SummarizerResult;)J */ JNIEXPORT jlong JNICALL Java_com_intel_oap_mllib_stat_SummarizerDALImpl_cSummarizerTrainDAL - (JNIEnv *, jobject, jint, jlong, jlong, jlong, jint, jint, jint, jintArray, jstring, jobject); + (JNIEnv *, jobject, jint, jlong, jlong, jlong, jint, jint, jint, jintArray, jstring, jstring, jobject); #ifdef __cplusplus } diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/classification/RandomForestClassifierDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/classification/RandomForestClassifierDALImpl.scala index f0ac1f0b5..17879dae4 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/classification/RandomForestClassifierDALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/classification/RandomForestClassifierDALImpl.scala @@ -55,7 +55,8 @@ class RandomForestClassifierDALImpl(val uid: String, logInfo(s"RandomForestClassifierDALImpl executorNum : " + executorNum) val sparkContext = labeledPoints.rdd.sparkContext - val rfcTimer = new Utils.AlgoTimeMetrics("RandomForestClassifier", sparkContext) + val metricsName = "RFClassifier_" + executorNum + val rfcTimer = new Utils.AlgoTimeMetrics(metricsName, sparkContext) val useDevice = sparkContext.getConf.get("spark.oap.mllib.device", Utils.DefaultComputeDevice) // used run Random Forest unit test val isTest = sparkContext.getConf.getBoolean("spark.oap.mllib.isTest", false) @@ -74,6 +75,7 @@ class RandomForestClassifierDALImpl(val uid: String, } rfcTimer.record("Data Convertion") val kvsIPPort = getOneCCLIPPort(labeledPointsTables) + val trainingBreakdownName = "RFClassifier_training_breakdown_" + executorNum CommonJob.setAffinityMask(labeledPointsTables, useDevice) CommonJob.createCCLInit(labeledPointsTables, executorNum, kvsIPPort, useDevice) @@ -115,6 +117,7 @@ class RandomForestClassifierDALImpl(val uid: String, bootstrap, gpuIndices, kvsIPPort, + trainingBreakdownName, result) val computeEndTime = System.nanoTime() @@ -161,6 +164,7 @@ class RandomForestClassifierDALImpl(val uid: String, bootstrap: Boolean, gpuIndices: Array[Int], kvsIPPort: String, + breakdownName: String, result: RandomForestResult): java.util.HashMap[java.lang.Integer, java.util.ArrayList[LearningNode]] } diff --git 
diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/clustering/KMeansDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/clustering/KMeansDALImpl.scala
index be034ca03..54264f708 100644
--- a/mllib-dal/src/main/scala/com/intel/oap/mllib/clustering/KMeansDALImpl.scala
+++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/clustering/KMeansDALImpl.scala
@@ -38,7 +38,8 @@ class KMeansDALImpl(var nClusters: Int,
   def train(data: RDD[Vector]): MLlibKMeansModel = {
     val sparkContext = data.sparkContext
-    val kmeansTimer = new Utils.AlgoTimeMetrics("KMeans", sparkContext)
+    val metricsName = "Kmeans_" + executorNum
+    val kmeansTimer = new Utils.AlgoTimeMetrics(metricsName, sparkContext)
     val useDevice = sparkContext.getConf.get("spark.oap.mllib.device", Utils.DefaultComputeDevice)
     val computeDevice = Common.ComputeDevice.getDeviceByName(useDevice)
     kmeansTimer.record("Preprocessing")
@@ -51,6 +52,7 @@ class KMeansDALImpl(var nClusters: Int,
     kmeansTimer.record("Data Convertion")
 
     val kvsIPPort = getOneCCLIPPort(coalescedTables)
+    val trainingBreakdownName = "Kmeans_training_breakdown_" + executorNum
 
     CommonJob.setAffinityMask(coalescedTables, useDevice)
     CommonJob.createCCLInit(coalescedTables, executorNum, kvsIPPort, useDevice)
@@ -92,6 +94,7 @@ class KMeansDALImpl(var nClusters: Int,
       computeDevice.ordinal(),
       gpuIndices,
       kvsIPPort,
+      trainingBreakdownName,
       result
     )
@@ -151,5 +154,6 @@ class KMeansDALImpl(var nClusters: Int,
       computeDeviceOrdinal: Int,
       gpuIndices: Array[Int],
       kvsIPPort: String,
+      breakdownName: String,
       result: KMeansResult): Long
 }
diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/feature/PCADALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/feature/PCADALImpl.scala
index 06f9039ca..1d8e2ba4e 100644
--- a/mllib-dal/src/main/scala/com/intel/oap/mllib/feature/PCADALImpl.scala
+++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/feature/PCADALImpl.scala
@@ -45,7 +45,8 @@ class PCADALImpl(val k: Int,
   def train(data: RDD[Vector]): PCADALModel = {
     val normalizedData = normalizeData(data)
     val sparkContext = normalizedData.sparkContext
-    val pcaTimer = new Utils.AlgoTimeMetrics("PCA", sparkContext)
+    val metricsName = "PCA_" + executorNum
+    val pcaTimer = new Utils.AlgoTimeMetrics(metricsName, sparkContext)
     val useDevice = sparkContext.getConf.get("spark.oap.mllib.device", Utils.DefaultComputeDevice)
     val computeDevice = Common.ComputeDevice.getDeviceByName(useDevice)
     pcaTimer.record("Preprocessing")
@@ -57,6 +58,8 @@ class PCADALImpl(val k: Int,
       OneDAL.coalesceVectorsToNumericTables(normalizedData, executorNum)
     }
     val kvsIPPort = getOneCCLIPPort(coalescedTables)
+    val trainingBreakdownName = "PCA_training_breakdown_" + executorNum
+
     pcaTimer.record("Data Convertion")
 
     CommonJob.setAffinityMask(coalescedTables, useDevice)
@@ -86,6 +89,7 @@ class PCADALImpl(val k: Int,
       computeDevice.ordinal(),
       gpuIndices,
       kvsIPPort,
+      trainingBreakdownName,
       result
     )
@@ -223,5 +227,6 @@ class PCADALImpl(val k: Int,
       computeDeviceOrdinal: Int,
       gpuIndices: Array[Int],
       kvsIPPort: String,
+      breakdownName: String,
       result: PCAResult): Long
 }
diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionDALImpl.scala
index 6c45b5807..ef18cc033 100644
--- a/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionDALImpl.scala
+++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionDALImpl.scala
@@ -71,13 +71,16 @@ class LinearRegressionDALImpl( val fitIntercept: Boolean,
             featuresCol: String): LinearRegressionDALModel = {
 
     val sparkContext = labeledPoints.sparkSession.sparkContext
-    val lrTimer = new Utils.AlgoTimeMetrics("LinearRegression", sparkContext)
+    val metricsName = "LinearRegression_" + executorNum
+    val lrTimer = new Utils.AlgoTimeMetrics(metricsName, sparkContext)
     val useDevice = sparkContext.getConf.get("spark.oap.mllib.device", Utils.DefaultComputeDevice)
     val computeDevice = Common.ComputeDevice.getDeviceByName(useDevice)
     val isTest = sparkContext.getConf.getBoolean("spark.oap.mllib.isTest", false)
 
     val kvsIPPort = getOneCCLIPPort(labeledPoints.rdd)
+    val trainingBreakdownName = "LinearRegression_training_breakdown_" + executorNum
+
     lrTimer.record("Preprocessing")
 
     val labeledPointsTables = if (useDevice == "GPU") {
@@ -155,6 +158,7 @@ class LinearRegressionDALImpl( val fitIntercept: Boolean,
       computeDevice.ordinal(),
       gpuIndices,
       kvsIPPort,
+      trainingBreakdownName,
       result
     )
@@ -202,6 +206,7 @@ class LinearRegressionDALImpl( val fitIntercept: Boolean,
       computeDeviceOrdinal: Int,
       gpuIndices: Array[Int],
       kvsIPPort: String,
+      breakdownName: String,
       result: LiRResult): Long
 }
diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/RandomForestRegressorDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/RandomForestRegressorDALImpl.scala
index e5742923b..c9270998e 100644
--- a/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/RandomForestRegressorDALImpl.scala
+++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/RandomForestRegressorDALImpl.scala
@@ -45,9 +45,9 @@ class RandomForestRegressorDALImpl(val uid: String,
   def train(labeledPoints: Dataset[_],
             labelCol: String,
             featuresCol: String): (util.Map[Integer, util.ArrayList[LearningNode]]) = {
-    logInfo(s"RandomForestRegressorDALImpl executorNum : " + executorNum)
     val sparkContext = labeledPoints.rdd.sparkContext
-    val rfrTimer = new Utils.AlgoTimeMetrics("RandomForestRegressor", sparkContext)
+    val metricsName = "RFRegressor_" + executorNum
+    val rfrTimer = new Utils.AlgoTimeMetrics(metricsName, sparkContext)
     val useDevice = sparkContext.getConf.get("spark.oap.mllib.device", Utils.DefaultComputeDevice)
     val computeDevice = Common.ComputeDevice.getDeviceByName(useDevice)
     // used run Random Forest unit test
@@ -68,6 +68,7 @@ class RandomForestRegressorDALImpl(val uid: String,
     rfrTimer.record("Data Convertion")
 
     val kvsIPPort = getOneCCLIPPort(labeledPointsTables)
+    val trainingBreakdownName = "RFRegressor_training_breakdown_" + executorNum
 
     CommonJob.setAffinityMask(labeledPointsTables, useDevice)
     CommonJob.createCCLInit(labeledPointsTables, executorNum, kvsIPPort, useDevice)
@@ -106,6 +107,7 @@ class RandomForestRegressorDALImpl(val uid: String,
       bootstrap,
       gpuIndices,
       kvsIPPort,
+      trainingBreakdownName,
       result)
 
     val computeEndTime = System.nanoTime()
@@ -158,5 +160,6 @@ class RandomForestRegressorDALImpl(val uid: String,
       bootstrap: Boolean,
       gpuIndices: Array[Int],
       kvsIPPort: String,
+      breakdownName: String,
       result: RandomForestResult): java.util.HashMap[java.lang.Integer,
         java.util.ArrayList[LearningNode]]
 }
diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/CorrelationDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/CorrelationDALImpl.scala
index 203c00796..de13f17de 100644
--- a/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/CorrelationDALImpl.scala
+++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/CorrelationDALImpl.scala
@@ -31,7 +31,8 @@ class CorrelationDALImpl(
   def computeCorrelationMatrix(data: RDD[Vector]): Matrix = {
     val sparkContext = data.sparkContext
-    val corTimer = new Utils.AlgoTimeMetrics("Correlation", sparkContext)
+    val metricsName = "Correlation_" + executorNum
+    val corTimer = new Utils.AlgoTimeMetrics(metricsName, sparkContext)
     val useDevice = sparkContext.getConf.get("spark.oap.mllib.device", Utils.DefaultComputeDevice)
     val computeDevice = Common.ComputeDevice.getDeviceByName(useDevice)
     corTimer.record("Preprocessing")
@@ -45,6 +46,7 @@ class CorrelationDALImpl(
     corTimer.record("Data Convertion")
 
     val kvsIPPort = getOneCCLIPPort(coalescedTables)
+    val trainingBreakdownName = "Correlation_training_breakdown_" + executorNum
 
     CommonJob.setAffinityMask(coalescedTables, useDevice)
     CommonJob.createCCLInit(coalescedTables, executorNum, kvsIPPort, useDevice)
@@ -76,6 +78,7 @@ class CorrelationDALImpl(
       computeDevice.ordinal(),
       gpuIndices,
       kvsIPPort,
+      trainingBreakdownName,
       result
     )
@@ -127,5 +130,6 @@ class CorrelationDALImpl(
       computeDeviceOrdinal: Int,
       gpuIndices: Array[Int],
       kvsIPPort: String,
+      breakdownName: String,
       result: CorrelationResult): Long
 }
diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/SummarizerDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/SummarizerDALImpl.scala
index a3f65b8fd..df47b9985 100644
--- a/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/SummarizerDALImpl.scala
+++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/stat/SummarizerDALImpl.scala
@@ -32,7 +32,8 @@ class SummarizerDALImpl(val executorNum: Int,
   def computeSummarizerMatrix(data: RDD[Vector]): Summary = {
     val sparkContext = data.sparkContext
-    val sumTimer = new Utils.AlgoTimeMetrics("Summarizer", sparkContext)
+    val metricsName = "Summarizer_" + executorNum
+    val sumTimer = new Utils.AlgoTimeMetrics(metricsName, sparkContext)
     val useDevice = sparkContext.getConf.get("spark.oap.mllib.device", Utils.DefaultComputeDevice)
     val computeDevice = Common.ComputeDevice.getDeviceByName(useDevice)
     sumTimer.record("Preprocessing")
@@ -46,6 +47,7 @@ class SummarizerDALImpl(val executorNum: Int,
     sumTimer.record("Data Convertion")
 
     val kvsIPPort = getOneCCLIPPort(data)
+    val trainingBreakdownName = "Summarizer_training_breakdown_" + executorNum
 
     CommonJob.setAffinityMask(coalescedTables, useDevice)
     CommonJob.createCCLInit(coalescedTables, executorNum, kvsIPPort, useDevice)
@@ -77,6 +79,7 @@ class SummarizerDALImpl(val executorNum: Int,
       computeDevice.ordinal(),
       gpuIndices,
       kvsIPPort,
+      trainingBreakdownName,
       result
     )
@@ -159,5 +162,6 @@ class SummarizerDALImpl(val executorNum: Int,
       computeDeviceOrdinal: Int,
       gpuIndices: Array[Int],
       kvsIPPort: String,
+      breakdownName: String,
       result: SummarizerResult): Long
 }

From caafd27c7f1617a89a7a52738cb887c845cbd8e0 Mon Sep 17 00:00:00 2001
From: minmingzhu
Date: Thu, 5 Sep 2024 13:53:45 +0800
Subject: [PATCH 9/9] update: log OneCCL init and communicator creation time
 to the per-algorithm breakdown log

---
 mllib-dal/src/main/native/GPU.cpp | 14 +++-----------
 mllib-dal/src/main/native/GPU.h   |  2 +-
 2 files changed, 4 insertions(+), 12 deletions(-)
diff --git a/mllib-dal/src/main/native/GPU.cpp b/mllib-dal/src/main/native/GPU.cpp
index ecaa42121..872dc7dc9 100644
--- a/mllib-dal/src/main/native/GPU.cpp
+++ b/mllib-dal/src/main/native/GPU.cpp
@@ -115,7 +115,7 @@ sycl::queue getQueue(const ComputeDevice device) {
 preview::spmd::communicator<preview::spmd::device_memory_access::usm>
 createDalCommunicator(const jint executorNum, const jint rank,
-                      const ccl::string ccl_ip_port) {
+                      const ccl::string ccl_ip_port, std::string breakdown_name) {
     auto gpus = get_gpus();
 
     auto t1 = std::chrono::high_resolution_clock::now();
@@ -127,10 +127,7 @@ createDalCommunicator(const jint executorNum, const jint rank,
         (float)std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1)
             .count();
 
-    logger::println(logger::INFO, "OneCCL singleton init took %f secs",
-                    duration / 1000);
-
-    t1 = std::chrono::high_resolution_clock::now();
+    logger::Logger::getInstance(breakdown_name).printLogToFile("rankID was %d, OneCCL singleton init took %f secs.", rank, duration / 1000);
 
     auto kvs_attr = ccl::create_kvs_attr();
@@ -138,12 +135,6 @@ createDalCommunicator(const jint executorNum, const jint rank,
     ccl::shared_ptr_class<ccl::kvs> kvs = ccl::create_main_kvs(kvs_attr);
 
-    t2 = std::chrono::high_resolution_clock::now();
-    duration =
-        (float)std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1)
-            .count();
-    logger::println(logger::INFO, "OneCCL (native): create kvs took %f secs",
-                    duration / 1000);
     sycl::queue queue{gpus[0]};
     t1 = std::chrono::high_resolution_clock::now();
     auto comm = preview::spmd::make_communicator<preview::spmd::backend::ccl>(
         queue, executorNum, rank, kvs);
@@ -152,5 +143,6 @@ createDalCommunicator(const jint executorNum, const jint rank,
     duration =
         (float)std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1)
             .count();
+    logger::Logger::getInstance(breakdown_name).printLogToFile("rankID was %d, create communicator took %f secs.", rank, duration / 1000);
     return comm;
 }
diff --git a/mllib-dal/src/main/native/GPU.h b/mllib-dal/src/main/native/GPU.h
index 1056ef22a..b9832ed46 100644
--- a/mllib-dal/src/main/native/GPU.h
+++ b/mllib-dal/src/main/native/GPU.h
@@ -12,4 +12,4 @@ sycl::queue getAssignedGPU(const ComputeDevice device, jint *gpu_indices);
 sycl::queue getQueue(const ComputeDevice device);
 
 preview::spmd::communicator<preview::spmd::device_memory_access::usm>
-createDalCommunicator(jint executorNum, jint rank, ccl::string ccl_ip_port);
+createDalCommunicator(jint executorNum, jint rank, ccl::string ccl_ip_port, std::string breakdown_name);
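Taken together, the GPU.cpp hunks replace the console-only logger::println timing output with lines appended to a per-algorithm breakdown log selected by breakdown_name, one entry per rank and per phase. The patch does not show the logger::Logger implementation behind getInstance(...).printLogToFile(...), so the sketch below uses a stand-in file-backed logger purely to illustrate the pattern; BreakdownLogger, recordPhase, and the "<name>.log" path convention are invented for this example and are not the project's actual API.

    #include <chrono>
    #include <cstdarg>
    #include <cstdio>
    #include <string>

    // Stand-in for a per-name breakdown logger: each call appends one
    // printf-style line to "<name>.log". (Simplified: one shared instance,
    // no thread safety.)
    class BreakdownLogger {
    public:
        static BreakdownLogger &getInstance(const std::string &name) {
            static BreakdownLogger instance;
            instance.path_ = name + ".log";
            return instance;
        }
        void printLogToFile(const char *fmt, ...) {
            std::FILE *f = std::fopen(path_.c_str(), "a");
            if (f == nullptr) return;
            va_list args;
            va_start(args, fmt);
            std::vfprintf(f, fmt, args);
            va_end(args);
            std::fputc('\n', f);
            std::fclose(f);
        }
    private:
        std::string path_;
    };

    // The timing pattern used in the hunks above: measure a phase with
    // std::chrono and record it under the caller-supplied breakdown name.
    void recordPhase(const std::string &breakdown_name, int rank,
                     const char *phase) {
        auto t1 = std::chrono::high_resolution_clock::now();
        // ... phase under measurement (OneCCL init, communicator creation, ...) ...
        auto t2 = std::chrono::high_resolution_clock::now();
        float duration =
            (float)std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1)
                .count();
        BreakdownLogger::getInstance(breakdown_name)
            .printLogToFile("rankID was %d, %s took %f secs.", rank, phase,
                            duration / 1000);
    }

With the Scala-side names introduced in the earlier hunks, each run would record its native timings under a breakdown name such as Kmeans_training_breakdown_2 when executorNum is 2.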